Diffstat (limited to 'net/core')
34 files changed, 4845 insertions, 2628 deletions
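Note: the net/core/dev.c hunks below replace dev_kfree_skb_irq()/dev_kfree_skb_any() with reason-aware __dev_kfree_skb_irq()/__dev_kfree_skb_any(), so net_tx_action() can emit consume_skb rather than kfree_skb tracepoints for packets that were transmitted successfully. A minimal sketch of how a driver completion path might use the new export follows; example_tx_complete() is a hypothetical helper, and SKB_REASON_DROPPED comes from the companion include/linux/netdevice.h change, which falls outside this net/core-limited diffstat.

/* Illustrative sketch only (not part of this commit): a hypothetical
 * driver TX-completion hook using the reason-aware free helpers that
 * the dev.c hunks below introduce.  SKB_REASON_CONSUMED is referenced
 * in net_tx_action() below; SKB_REASON_DROPPED is its counterpart from
 * the matching include/linux/netdevice.h change (not shown here).
 */
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static void example_tx_complete(struct sk_buff *skb, bool tx_ok)
{
	/* Safe from any context: __dev_kfree_skb_any() defers the free
	 * to the softirq completion queue when called in hard IRQ
	 * context, and the recorded reason picks the tracepoint.
	 */
	__dev_kfree_skb_any(skb, tx_ok ? SKB_REASON_CONSUMED
				       : SKB_REASON_DROPPED);
}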
diff --git a/net/core/Makefile b/net/core/Makefile index b33b996f5dd..71093d94ad2 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o  obj-y		     += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \  			neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ -			sock_diag.o dev_ioctl.o +			sock_diag.o dev_ioctl.o tso.o  obj-$(CONFIG_XFRM) += flow.o  obj-y += net-sysfs.o @@ -21,4 +21,6 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o  obj-$(CONFIG_TRACEPOINTS) += net-traces.o  obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o  obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o -obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o +obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o +obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o +obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o diff --git a/net/core/datagram.c b/net/core/datagram.c index af814e76420..488dd1a825c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -577,7 +577,7 @@ EXPORT_SYMBOL(skb_copy_datagram_from_iovec);  /**   *	zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec   *	@skb: buffer to copy - *	@from: io vector to copy to + *	@from: io vector to copy from   *	@offset: offset in the io vector to start copying from   *	@count: amount of vectors to copy to buffer from   * @@ -740,17 +740,37 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)  	sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));  	if (likely(!sum)) { -		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) +		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && +		    !skb->csum_complete_sw)  			netdev_rx_csum_fault(skb->dev); -		skb->ip_summed = CHECKSUM_UNNECESSARY;  	} +	skb->csum_valid = !sum;  	return sum;  }  EXPORT_SYMBOL(__skb_checksum_complete_head);  __sum16 __skb_checksum_complete(struct sk_buff *skb)  { -	return __skb_checksum_complete_head(skb, skb->len); +	__wsum csum; +	__sum16 sum; + +	csum = skb_checksum(skb, 0, skb->len, 0); + +	/* skb->csum holds pseudo checksum */ +	sum = csum_fold(csum_add(skb->csum, csum)); +	if (likely(!sum)) { +		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && +		    !skb->csum_complete_sw) +			netdev_rx_csum_fault(skb->dev); +	} + +	/* Save full packet checksum */ +	skb->csum = csum; +	skb->ip_summed = CHECKSUM_COMPLETE; +	skb->csum_complete_sw = 1; +	skb->csum_valid = !sum; + +	return sum;  }  EXPORT_SYMBOL(__skb_checksum_complete); diff --git a/net/core/dev.c b/net/core/dev.c index 5c713f2239c..367a586d0c8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -131,6 +131,7 @@  #include <linux/static_key.h>  #include <linux/hashtable.h>  #include <linux/vmalloc.h> +#include <linux/if_macvlan.h>  #include "net-sysfs.h" @@ -146,6 +147,11 @@ struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;  struct list_head ptype_all __read_mostly;	/* Taps */  static struct list_head offload_base __read_mostly; +static int netif_rx_internal(struct sk_buff *skb); +static int call_netdevice_notifiers_info(unsigned long val, +					 struct net_device *dev, +					 struct netdev_notifier_info *info); +  /*   * The @dev_base_head list is protected by @dev_base_lock and the rtnl   * semaphore. @@ -479,7 +485,7 @@ EXPORT_SYMBOL(dev_add_offload);   *	and must not be freed until after all the CPU's have gone   *	through a quiescent state.   
*/ -void __dev_remove_offload(struct packet_offload *po) +static void __dev_remove_offload(struct packet_offload *po)  {  	struct list_head *head = &offload_base;  	struct packet_offload *po1; @@ -497,7 +503,6 @@ void __dev_remove_offload(struct packet_offload *po)  out:  	spin_unlock(&offload_lock);  } -EXPORT_SYMBOL(__dev_remove_offload);  /**   *	dev_remove_offload	 - remove packet offload handler @@ -1117,6 +1122,8 @@ rollback:  	write_seqcount_end(&devnet_rename_seq); +	netdev_adjacent_rename_links(dev, oldname); +  	write_lock_bh(&dev_base_lock);  	hlist_del_rcu(&dev->name_hlist);  	write_unlock_bh(&dev_base_lock); @@ -1136,6 +1143,7 @@ rollback:  			err = ret;  			write_seqcount_begin(&devnet_rename_seq);  			memcpy(dev->name, oldname, IFNAMSIZ); +			memcpy(oldname, newname, IFNAMSIZ);  			goto rollback;  		} else {  			pr_err("%s: name change rollback failed: %d\n", @@ -1202,8 +1210,12 @@ EXPORT_SYMBOL(netdev_features_change);  void netdev_state_change(struct net_device *dev)  {  	if (dev->flags & IFF_UP) { -		call_netdevice_notifiers(NETDEV_CHANGE, dev); -		rtmsg_ifinfo(RTM_NEWLINK, dev, 0); +		struct netdev_notifier_change_info change_info; + +		change_info.flags_changed = 0; +		call_netdevice_notifiers_info(NETDEV_CHANGE, dev, +					      &change_info.info); +		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);  	}  }  EXPORT_SYMBOL(netdev_state_change); @@ -1240,7 +1252,7 @@ static int __dev_open(struct net_device *dev)  	 * If we don't do this there is a chance ndo_poll_controller  	 * or ndo_poll may be running while we open the device  	 */ -	netpoll_rx_disable(dev); +	netpoll_poll_disable(dev);  	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);  	ret = notifier_to_errno(ret); @@ -1255,7 +1267,7 @@ static int __dev_open(struct net_device *dev)  	if (!ret && ops->ndo_open)  		ret = ops->ndo_open(dev); -	netpoll_rx_enable(dev); +	netpoll_poll_enable(dev);  	if (ret)  		clear_bit(__LINK_STATE_START, &dev->state); @@ -1293,7 +1305,7 @@ int dev_open(struct net_device *dev)  	if (ret < 0)  		return ret; -	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); +	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);  	call_netdevice_notifiers(NETDEV_UP, dev);  	return ret; @@ -1307,7 +1319,10 @@ static int __dev_close_many(struct list_head *head)  	ASSERT_RTNL();  	might_sleep(); -	list_for_each_entry(dev, head, unreg_list) { +	list_for_each_entry(dev, head, close_list) { +		/* Temporarily disable netpoll until the interface is down */ +		netpoll_poll_disable(dev); +  		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);  		clear_bit(__LINK_STATE_START, &dev->state); @@ -1318,12 +1333,12 @@ static int __dev_close_many(struct list_head *head)  		 * dev->stop() will invoke napi_disable() on all of it's  		 * napi_struct instances on this device.  		 */ -		smp_mb__after_clear_bit(); /* Commit netif_running(). */ +		smp_mb__after_atomic(); /* Commit netif_running(). 
*/  	}  	dev_deactivate_many(head); -	list_for_each_entry(dev, head, unreg_list) { +	list_for_each_entry(dev, head, close_list) {  		const struct net_device_ops *ops = dev->netdev_ops;  		/* @@ -1338,6 +1353,7 @@ static int __dev_close_many(struct list_head *head)  		dev->flags &= ~IFF_UP;  		net_dmaengine_put(); +		netpoll_poll_enable(dev);  	}  	return 0; @@ -1348,35 +1364,30 @@ static int __dev_close(struct net_device *dev)  	int retval;  	LIST_HEAD(single); -	/* Temporarily disable netpoll until the interface is down */ -	netpoll_rx_disable(dev); - -	list_add(&dev->unreg_list, &single); +	list_add(&dev->close_list, &single);  	retval = __dev_close_many(&single);  	list_del(&single); -	netpoll_rx_enable(dev);  	return retval;  }  static int dev_close_many(struct list_head *head)  {  	struct net_device *dev, *tmp; -	LIST_HEAD(tmp_list); -	list_for_each_entry_safe(dev, tmp, head, unreg_list) +	/* Remove the devices that don't need to be closed */ +	list_for_each_entry_safe(dev, tmp, head, close_list)  		if (!(dev->flags & IFF_UP)) -			list_move(&dev->unreg_list, &tmp_list); +			list_del_init(&dev->close_list);  	__dev_close_many(head); -	list_for_each_entry(dev, head, unreg_list) { -		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); +	list_for_each_entry_safe(dev, tmp, head, close_list) { +		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);  		call_netdevice_notifiers(NETDEV_DOWN, dev); +		list_del_init(&dev->close_list);  	} -	/* rollback_registered_many needs the complete original list */ -	list_splice(&tmp_list, head);  	return 0;  } @@ -1394,14 +1405,9 @@ int dev_close(struct net_device *dev)  	if (dev->flags & IFF_UP) {  		LIST_HEAD(single); -		/* Block netpoll rx while the interface is going down */ -		netpoll_rx_disable(dev); - -		list_add(&dev->unreg_list, &single); +		list_add(&dev->close_list, &single);  		dev_close_many(&single);  		list_del(&single); - -		netpoll_rx_enable(dev);  	}  	return 0;  } @@ -1425,6 +1431,10 @@ void dev_disable_lro(struct net_device *dev)  	if (is_vlan_dev(dev))  		dev = vlan_dev_real_dev(dev); +	/* the same for macvlan devices */ +	if (netif_is_macvlan(dev)) +		dev = macvlan_dev_real_dev(dev); +  	dev->wanted_features &= ~NETIF_F_LRO;  	netdev_update_features(dev); @@ -1562,14 +1572,14 @@ EXPORT_SYMBOL(unregister_netdevice_notifier);   *	are as for raw_notifier_call_chain().   
*/ -int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev, -				  struct netdev_notifier_info *info) +static int call_netdevice_notifiers_info(unsigned long val, +					 struct net_device *dev, +					 struct netdev_notifier_info *info)  {  	ASSERT_RTNL();  	netdev_notifier_info_init(info, dev);  	return raw_notifier_call_chain(&netdev_chain, val, info);  } -EXPORT_SYMBOL(call_netdevice_notifiers_info);  /**   *	call_netdevice_notifiers - call all network notifier blocks @@ -1637,8 +1647,7 @@ static inline void net_timestamp_set(struct sk_buff *skb)  			__net_timestamp(SKB);		\  	}						\ -static inline bool is_skb_forwardable(struct net_device *dev, -				      struct sk_buff *skb) +bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)  {  	unsigned int len; @@ -1657,6 +1666,30 @@ static inline bool is_skb_forwardable(struct net_device *dev,  	return false;  } +EXPORT_SYMBOL_GPL(is_skb_forwardable); + +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ +	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { +		if (skb_copy_ubufs(skb, GFP_ATOMIC)) { +			atomic_long_inc(&dev->rx_dropped); +			kfree_skb(skb); +			return NET_RX_DROP; +		} +	} + +	if (unlikely(!is_skb_forwardable(dev, skb))) { +		atomic_long_inc(&dev->rx_dropped); +		kfree_skb(skb); +		return NET_RX_DROP; +	} + +	skb_scrub_packet(skb, true); +	skb->protocol = eth_type_trans(skb, dev); + +	return 0; +} +EXPORT_SYMBOL_GPL(__dev_forward_skb);  /**   * dev_forward_skb - loopback an skb to another netif @@ -1678,28 +1711,7 @@ static inline bool is_skb_forwardable(struct net_device *dev,   */  int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)  { -	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { -		if (skb_copy_ubufs(skb, GFP_ATOMIC)) { -			atomic_long_inc(&dev->rx_dropped); -			kfree_skb(skb); -			return NET_RX_DROP; -		} -	} - -	if (unlikely(!is_skb_forwardable(dev, skb))) { -		atomic_long_inc(&dev->rx_dropped); -		kfree_skb(skb); -		return NET_RX_DROP; -	} -	skb->protocol = eth_type_trans(skb, dev); - -	/* eth_type_trans() can set pkt_type. -	 * call skb_scrub_packet() after it to clear pkt_type _after_ calling -	 * eth_type_trans(). 
-	 */ -	skb_scrub_packet(skb, true); - -	return netif_rx(skb); +	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);  }  EXPORT_SYMBOL_GPL(dev_forward_skb); @@ -1917,7 +1929,8 @@ static struct xps_map *expand_xps_map(struct xps_map *map,  	return new_map;  } -int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index) +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, +			u16 index)  {  	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;  	struct xps_map *map, *new_map; @@ -2078,7 +2091,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)  }  EXPORT_SYMBOL(netif_set_real_num_tx_queues); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS  /**   *	netif_set_real_num_rx_queues - set actual number of RX queues used   *	@dev: Network device @@ -2144,30 +2157,42 @@ void __netif_schedule(struct Qdisc *q)  }  EXPORT_SYMBOL(__netif_schedule); -void dev_kfree_skb_irq(struct sk_buff *skb) +struct dev_kfree_skb_cb { +	enum skb_free_reason reason; +}; + +static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)  { -	if (atomic_dec_and_test(&skb->users)) { -		struct softnet_data *sd; -		unsigned long flags; +	return (struct dev_kfree_skb_cb *)skb->cb; +} + +void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) +{ +	unsigned long flags; -		local_irq_save(flags); -		sd = &__get_cpu_var(softnet_data); -		skb->next = sd->completion_queue; -		sd->completion_queue = skb; -		raise_softirq_irqoff(NET_TX_SOFTIRQ); -		local_irq_restore(flags); +	if (likely(atomic_read(&skb->users) == 1)) { +		smp_rmb(); +		atomic_set(&skb->users, 0); +	} else if (likely(!atomic_dec_and_test(&skb->users))) { +		return;  	} +	get_kfree_skb_cb(skb)->reason = reason; +	local_irq_save(flags); +	skb->next = __this_cpu_read(softnet_data.completion_queue); +	__this_cpu_write(softnet_data.completion_queue, skb); +	raise_softirq_irqoff(NET_TX_SOFTIRQ); +	local_irq_restore(flags);  } -EXPORT_SYMBOL(dev_kfree_skb_irq); +EXPORT_SYMBOL(__dev_kfree_skb_irq); -void dev_kfree_skb_any(struct sk_buff *skb) +void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)  {  	if (in_irq() || irqs_disabled()) -		dev_kfree_skb_irq(skb); +		__dev_kfree_skb_irq(skb, reason);  	else  		dev_kfree_skb(skb);  } -EXPORT_SYMBOL(dev_kfree_skb_any); +EXPORT_SYMBOL(__dev_kfree_skb_any);  /** @@ -2269,10 +2294,10 @@ out:  }  EXPORT_SYMBOL(skb_checksum_help); -__be16 skb_network_protocol(struct sk_buff *skb) +__be16 skb_network_protocol(struct sk_buff *skb, int *depth)  { +	unsigned int vlan_depth = skb->mac_len;  	__be16 type = skb->protocol; -	int vlan_depth = ETH_HLEN;  	/* Tunnel gso handlers can set protocol to ethernet. 
*/  	if (type == htons(ETH_P_TEB)) { @@ -2285,17 +2310,34 @@ __be16 skb_network_protocol(struct sk_buff *skb)  		type = eth->h_proto;  	} -	while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { -		struct vlan_hdr *vh; +	/* if skb->protocol is 802.1Q/AD then the header should already be +	 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at +	 * ETH_HLEN otherwise +	 */ +	if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { +		if (vlan_depth) { +			if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN))) +				return 0; +			vlan_depth -= VLAN_HLEN; +		} else { +			vlan_depth = ETH_HLEN; +		} +		do { +			struct vlan_hdr *vh; -		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) -			return 0; +			if (unlikely(!pskb_may_pull(skb, +						    vlan_depth + VLAN_HLEN))) +				return 0; -		vh = (struct vlan_hdr *)(skb->data + vlan_depth); -		type = vh->h_vlan_encapsulated_proto; -		vlan_depth += VLAN_HLEN; +			vh = (struct vlan_hdr *)(skb->data + vlan_depth); +			type = vh->h_vlan_encapsulated_proto; +			vlan_depth += VLAN_HLEN; +		} while (type == htons(ETH_P_8021Q) || +			 type == htons(ETH_P_8021AD));  	} +	*depth = vlan_depth; +  	return type;  } @@ -2309,12 +2351,13 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,  {  	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);  	struct packet_offload *ptype; -	__be16 type = skb_network_protocol(skb); +	int vlan_depth = skb->mac_len; +	__be16 type = skb_network_protocol(skb, &vlan_depth);  	if (unlikely(!type))  		return ERR_PTR(-EINVAL); -	__skb_pull(skb, skb->mac_len); +	__skb_pull(skb, vlan_depth);  	rcu_read_lock();  	list_for_each_entry_rcu(ptype, &offload_base, list) { @@ -2377,6 +2420,8 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,  	}  	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); +	SKB_GSO_CB(skb)->encap_level = 0; +  	skb_reset_mac_header(skb);  	skb_reset_mac_len(skb); @@ -2439,13 +2484,8 @@ static void dev_gso_skb_destructor(struct sk_buff *skb)  {  	struct dev_gso_cb *cb; -	do { -		struct sk_buff *nskb = skb->next; - -		skb->next = nskb->next; -		nskb->next = NULL; -		kfree_skb(nskb); -	} while (skb->next); +	kfree_skb_list(skb->next); +	skb->next = NULL;  	cb = DEV_GSO_CB(skb);  	if (cb->destructor) @@ -2480,11 +2520,39 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)  	return 0;  } +/* If MPLS offload request, verify we are testing hardware MPLS features + * instead of standard features for the netdev. 
+ */ +#ifdef CONFIG_NET_MPLS_GSO +static netdev_features_t net_mpls_features(struct sk_buff *skb, +					   netdev_features_t features, +					   __be16 type) +{ +	if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC)) +		features &= skb->dev->mpls_features; + +	return features; +} +#else +static netdev_features_t net_mpls_features(struct sk_buff *skb, +					   netdev_features_t features, +					   __be16 type) +{ +	return features; +} +#endif +  static netdev_features_t harmonize_features(struct sk_buff *skb,  	netdev_features_t features)  { +	int tmp; +	__be16 type; + +	type = skb_network_protocol(skb, &tmp); +	features = net_mpls_features(skb, features, type); +  	if (skb->ip_summed != CHECKSUM_NONE && -	    !can_checksum_protocol(features, skb_network_protocol(skb))) { +	    !can_checksum_protocol(features, type)) {  		features &= ~NETIF_F_ALL_CSUM;  	} else if (illegal_highdma(skb->dev, skb)) {  		features &= ~NETIF_F_SG; @@ -2520,21 +2588,6 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)  }  EXPORT_SYMBOL(netif_skb_features); -/* - * Returns true if either: - *	1. skb has frag_list and the device doesn't support FRAGLIST, or - *	2. skb is fragmented and the device does not support SG. - */ -static inline int skb_needs_linearize(struct sk_buff *skb, -				      netdev_features_t features) -{ -	return skb_is_nonlinear(skb) && -			((skb_has_frag_list(skb) && -				!(features & NETIF_F_FRAGLIST)) || -			(skb_shinfo(skb)->nr_frags && -				!(features & NETIF_F_SG))); -} -  int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,  			struct netdev_queue *txq)  { @@ -2602,6 +2655,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,  			dev_queue_xmit_nit(skb, dev);  		skb_len = skb->len; +		trace_net_dev_start_xmit(skb, dev);  		rc = ops->ndo_start_xmit(skb, dev);  		trace_net_dev_xmit(skb, rc, dev, skb_len);  		if (rc == NETDEV_TX_OK) @@ -2620,6 +2674,7 @@ gso:  			dev_queue_xmit_nit(nskb, dev);  		skb_len = nskb->len; +		trace_net_dev_start_xmit(nskb, dev);  		rc = ops->ndo_start_xmit(nskb, dev);  		trace_net_dev_xmit(nskb, rc, dev, skb_len);  		if (unlikely(rc != NETDEV_TX_OK)) { @@ -2645,6 +2700,7 @@ out_kfree_skb:  out:  	return rc;  } +EXPORT_SYMBOL_GPL(dev_hard_start_xmit);  static void qdisc_pkt_len_init(struct sk_buff *skb)  { @@ -2739,7 +2795,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,  	return rc;  } -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)  static void skb_update_prio(struct sk_buff *skb)  {  	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); @@ -2776,8 +2832,9 @@ int dev_loopback_xmit(struct sk_buff *skb)  EXPORT_SYMBOL(dev_loopback_xmit);  /** - *	dev_queue_xmit - transmit a buffer + *	__dev_queue_xmit - transmit a buffer   *	@skb: buffer to transmit + *	@accel_priv: private data used for L2 forwarding offload   *   *	Queue a buffer for transmission to a network device. The caller must   *	have set the device and priority and built the buffer before calling @@ -2800,7 +2857,7 @@ EXPORT_SYMBOL(dev_loopback_xmit);   *      the BH enable code must have IRQs enabled so that it will not deadlock.   
*          --BLG   */ -int dev_queue_xmit(struct sk_buff *skb) +static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)  {  	struct net_device *dev = skb->dev;  	struct netdev_queue *txq; @@ -2816,7 +2873,7 @@ int dev_queue_xmit(struct sk_buff *skb)  	skb_update_prio(skb); -	txq = netdev_pick_tx(dev, skb); +	txq = netdev_pick_tx(dev, skb, accel_priv);  	q = rcu_dereference_bh(txq->qdisc);  #ifdef CONFIG_NET_CLS_ACT @@ -2875,14 +2932,26 @@ recursion_alert:  	rc = -ENETDOWN;  	rcu_read_unlock_bh(); +	atomic_long_inc(&dev->tx_dropped);  	kfree_skb(skb);  	return rc;  out:  	rcu_read_unlock_bh();  	return rc;  } + +int dev_queue_xmit(struct sk_buff *skb) +{ +	return __dev_queue_xmit(skb, NULL); +}  EXPORT_SYMBOL(dev_queue_xmit); +int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) +{ +	return __dev_queue_xmit(skb, accel_priv); +} +EXPORT_SYMBOL(dev_queue_xmit_accel); +  /*=======================================================================  			Receiver routines @@ -2936,7 +3005,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,  		flow_table = rcu_dereference(rxqueue->rps_flow_table);  		if (!flow_table)  			goto out; -		flow_id = skb->rxhash & flow_table->mask; +		flow_id = skb_get_hash(skb) & flow_table->mask;  		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,  							rxq_index, flow_id);  		if (rc < 0) @@ -2970,6 +3039,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,  	struct rps_sock_flow_table *sock_flow_table;  	int cpu = -1;  	u16 tcpu; +	u32 hash;  	if (skb_rx_queue_recorded(skb)) {  		u16 index = skb_get_rx_queue(skb); @@ -2998,7 +3068,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,  	}  	skb_reset_network_header(skb); -	if (!skb_get_rxhash(skb)) +	hash = skb_get_hash(skb); +	if (!hash)  		goto done;  	flow_table = rcu_dereference(rxqueue->rps_flow_table); @@ -3007,11 +3078,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,  		u16 next_cpu;  		struct rps_dev_flow *rflow; -		rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; +		rflow = &flow_table->flows[hash & flow_table->mask];  		tcpu = rflow->cpu; -		next_cpu = sock_flow_table->ents[skb->rxhash & -		    sock_flow_table->mask]; +		next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];  		/*  		 * If the desired CPU (where last recvmsg was done) is @@ -3040,7 +3110,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,  	}  	if (map) { -		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; +		tcpu = map->cpus[((u64) hash * map->len) >> 32];  		if (cpu_online(tcpu)) {  			cpu = tcpu; @@ -3143,7 +3213,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)  	rcu_read_lock();  	fl = rcu_dereference(sd->flow_limit);  	if (fl) { -		new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1); +		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);  		old_flow = fl->history[fl->history_head];  		fl->history[fl->history_head] = new_flow; @@ -3211,29 +3281,10 @@ enqueue:  	return NET_RX_DROP;  } -/** - *	netif_rx	-	post buffer to the network code - *	@skb: buffer to post - * - *	This function receives a packet from a device driver and queues it for - *	the upper (protocol) levels to process.  It always succeeds. The buffer - *	may be dropped during processing for congestion control or by the - *	protocol layers. 
- * - *	return values: - *	NET_RX_SUCCESS	(no congestion) - *	NET_RX_DROP     (packet was dropped) - * - */ - -int netif_rx(struct sk_buff *skb) +static int netif_rx_internal(struct sk_buff *skb)  {  	int ret; -	/* if netpoll wants it, pretend we never saw it */ -	if (netpoll_rx(skb)) -		return NET_RX_DROP; -  	net_timestamp_check(netdev_tstamp_prequeue, skb);  	trace_netif_rx(skb); @@ -3262,14 +3313,38 @@ int netif_rx(struct sk_buff *skb)  	}  	return ret;  } + +/** + *	netif_rx	-	post buffer to the network code + *	@skb: buffer to post + * + *	This function receives a packet from a device driver and queues it for + *	the upper (protocol) levels to process.  It always succeeds. The buffer + *	may be dropped during processing for congestion control or by the + *	protocol layers. + * + *	return values: + *	NET_RX_SUCCESS	(no congestion) + *	NET_RX_DROP     (packet was dropped) + * + */ + +int netif_rx(struct sk_buff *skb) +{ +	trace_netif_rx_entry(skb); + +	return netif_rx_internal(skb); +}  EXPORT_SYMBOL(netif_rx);  int netif_rx_ni(struct sk_buff *skb)  {  	int err; +	trace_netif_rx_ni_entry(skb); +  	preempt_disable(); -	err = netif_rx(skb); +	err = netif_rx_internal(skb);  	if (local_softirq_pending())  		do_softirq();  	preempt_enable(); @@ -3295,7 +3370,10 @@ static void net_tx_action(struct softirq_action *h)  			clist = clist->next;  			WARN_ON(atomic_read(&skb->users)); -			trace_kfree_skb(skb, net_tx_action); +			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) +				trace_consume_skb(skb); +			else +				trace_kfree_skb(skb, net_tx_action);  			__kfree_skb(skb);  		}  	} @@ -3317,7 +3395,7 @@ static void net_tx_action(struct softirq_action *h)  			root_lock = qdisc_lock(q);  			if (spin_trylock(root_lock)) { -				smp_mb__before_clear_bit(); +				smp_mb__before_atomic();  				clear_bit(__QDISC_STATE_SCHED,  					  &q->state);  				qdisc_run(q); @@ -3327,7 +3405,7 @@ static void net_tx_action(struct softirq_action *h)  					      &q->state)) {  					__netif_reschedule(q);  				} else { -					smp_mb__before_clear_bit(); +					smp_mb__before_atomic();  					clear_bit(__QDISC_STATE_SCHED,  						  &q->state);  				} @@ -3413,7 +3491,7 @@ out:   *	@rx_handler: receive handler to register   *	@rx_handler_data: data pointer that is used by rx handler   * - *	Register a receive hander for a device. This handler will then be + *	Register a receive handler for a device. This handler will then be   *	called from __netif_receive_skb. A negative errno code is returned   *	on a failure.   
* @@ -3467,11 +3545,11 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);  static bool skb_pfmemalloc_protocol(struct sk_buff *skb)  {  	switch (skb->protocol) { -	case __constant_htons(ETH_P_ARP): -	case __constant_htons(ETH_P_IP): -	case __constant_htons(ETH_P_IPV6): -	case __constant_htons(ETH_P_8021Q): -	case __constant_htons(ETH_P_8021AD): +	case htons(ETH_P_ARP): +	case htons(ETH_P_IP): +	case htons(ETH_P_IPV6): +	case htons(ETH_P_8021Q): +	case htons(ETH_P_8021AD):  		return true;  	default:  		return false; @@ -3492,10 +3570,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)  	trace_netif_receive_skb(skb); -	/* if we've gotten here through NAPI, check netpoll */ -	if (netpoll_receive_skb(skb)) -		goto out; -  	orig_dev = skb->dev;  	skb_reset_network_header(skb); @@ -3622,7 +3696,6 @@ drop:  unlock:  	rcu_read_unlock(); -out:  	return ret;  } @@ -3651,22 +3724,7 @@ static int __netif_receive_skb(struct sk_buff *skb)  	return ret;  } -/** - *	netif_receive_skb - process receive buffer from network - *	@skb: buffer to process - * - *	netif_receive_skb() is the main receive data processing function. - *	It always succeeds. The buffer may be dropped during processing - *	for congestion control or by the protocol layers. - * - *	This function may only be called from softirq context and interrupts - *	should be enabled. - * - *	Return values (usually ignored): - *	NET_RX_SUCCESS: no congestion - *	NET_RX_DROP: packet was dropped - */ -int netif_receive_skb(struct sk_buff *skb) +static int netif_receive_skb_internal(struct sk_buff *skb)  {  	net_timestamp_check(netdev_tstamp_prequeue, skb); @@ -3692,6 +3750,28 @@ int netif_receive_skb(struct sk_buff *skb)  #endif  	return __netif_receive_skb(skb);  } + +/** + *	netif_receive_skb - process receive buffer from network + *	@skb: buffer to process + * + *	netif_receive_skb() is the main receive data processing function. + *	It always succeeds. The buffer may be dropped during processing + *	for congestion control or by the protocol layers. + * + *	This function may only be called from softirq context and interrupts + *	should be enabled. + * + *	Return values (usually ignored): + *	NET_RX_SUCCESS: no congestion + *	NET_RX_DROP: packet was dropped + */ +int netif_receive_skb(struct sk_buff *skb) +{ +	trace_netif_receive_skb_entry(skb); + +	return netif_receive_skb_internal(skb); +}  EXPORT_SYMBOL(netif_receive_skb);  /* Network device is going away, flush any packets still pending @@ -3741,7 +3821,7 @@ static int napi_gro_complete(struct sk_buff *skb)  		if (ptype->type != type || !ptype->callbacks.gro_complete)  			continue; -		err = ptype->callbacks.gro_complete(skb); +		err = ptype->callbacks.gro_complete(skb, 0);  		break;  	}  	rcu_read_unlock(); @@ -3753,7 +3833,7 @@ static int napi_gro_complete(struct sk_buff *skb)  	}  out: -	return netif_receive_skb(skb); +	return netif_receive_skb_internal(skb);  }  /* napi->gro_list contains packets ordered by age. 
@@ -3789,21 +3869,66 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)  {  	struct sk_buff *p;  	unsigned int maclen = skb->dev->hard_header_len; +	u32 hash = skb_get_hash_raw(skb);  	for (p = napi->gro_list; p; p = p->next) {  		unsigned long diffs; +		NAPI_GRO_CB(p)->flush = 0; + +		if (hash != skb_get_hash_raw(p)) { +			NAPI_GRO_CB(p)->same_flow = 0; +			continue; +		} +  		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;  		diffs |= p->vlan_tci ^ skb->vlan_tci;  		if (maclen == ETH_HLEN)  			diffs |= compare_ether_header(skb_mac_header(p), -						      skb_gro_mac_header(skb)); +						      skb_mac_header(skb));  		else if (!diffs)  			diffs = memcmp(skb_mac_header(p), -				       skb_gro_mac_header(skb), +				       skb_mac_header(skb),  				       maclen);  		NAPI_GRO_CB(p)->same_flow = !diffs; -		NAPI_GRO_CB(p)->flush = 0; +	} +} + +static void skb_gro_reset_offset(struct sk_buff *skb) +{ +	const struct skb_shared_info *pinfo = skb_shinfo(skb); +	const skb_frag_t *frag0 = &pinfo->frags[0]; + +	NAPI_GRO_CB(skb)->data_offset = 0; +	NAPI_GRO_CB(skb)->frag0 = NULL; +	NAPI_GRO_CB(skb)->frag0_len = 0; + +	if (skb_mac_header(skb) == skb_tail_pointer(skb) && +	    pinfo->nr_frags && +	    !PageHighMem(skb_frag_page(frag0))) { +		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); +		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); +	} +} + +static void gro_pull_from_frag0(struct sk_buff *skb, int grow) +{ +	struct skb_shared_info *pinfo = skb_shinfo(skb); + +	BUG_ON(skb->end - skb->tail < grow); + +	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + +	skb->data_len -= grow; +	skb->tail += grow; + +	pinfo->frags[0].page_offset += grow; +	skb_frag_size_sub(&pinfo->frags[0], grow); + +	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { +		skb_frag_unref(skb, 0); +		memmove(pinfo->frags, pinfo->frags + 1, +			--pinfo->nr_frags * sizeof(pinfo->frags[0]));  	}  } @@ -3815,14 +3940,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff  	struct list_head *head = &offload_base;  	int same_flow;  	enum gro_result ret; +	int grow; -	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) +	if (!(skb->dev->features & NETIF_F_GRO))  		goto normal;  	if (skb_is_gso(skb) || skb_has_frag_list(skb))  		goto normal;  	gro_list_prepare(napi, skb); +	NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */  	rcu_read_lock();  	list_for_each_entry_rcu(ptype, head, list) { @@ -3834,6 +3961,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff  		NAPI_GRO_CB(skb)->same_flow = 0;  		NAPI_GRO_CB(skb)->flush = 0;  		NAPI_GRO_CB(skb)->free = 0; +		NAPI_GRO_CB(skb)->udp_mark = 0;  		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);  		break; @@ -3858,39 +3986,35 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff  	if (same_flow)  		goto ok; -	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) +	if (NAPI_GRO_CB(skb)->flush)  		goto normal; -	napi->gro_count++; +	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { +		struct sk_buff *nskb = napi->gro_list; + +		/* locate the end of the list to select the 'oldest' flow */ +		while (nskb->next) { +			pp = &nskb->next; +			nskb = *pp; +		} +		*pp = NULL; +		nskb->next = NULL; +		napi_gro_complete(nskb); +	} else { +		napi->gro_count++; +	}  	NAPI_GRO_CB(skb)->count = 1;  	NAPI_GRO_CB(skb)->age = jiffies; +	NAPI_GRO_CB(skb)->last = skb;  	skb_shinfo(skb)->gso_size = skb_gro_len(skb);  	
skb->next = napi->gro_list;  	napi->gro_list = skb;  	ret = GRO_HELD;  pull: -	if (skb_headlen(skb) < skb_gro_offset(skb)) { -		int grow = skb_gro_offset(skb) - skb_headlen(skb); - -		BUG_ON(skb->end - skb->tail < grow); - -		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); - -		skb->tail += grow; -		skb->data_len -= grow; - -		skb_shinfo(skb)->frags[0].page_offset += grow; -		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow); - -		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) { -			skb_frag_unref(skb, 0); -			memmove(skb_shinfo(skb)->frags, -				skb_shinfo(skb)->frags + 1, -				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); -		} -	} - +	grow = skb_gro_offset(skb) - skb_headlen(skb); +	if (grow > 0) +		gro_pull_from_frag0(skb, grow);  ok:  	return ret; @@ -3899,12 +4023,39 @@ normal:  	goto pull;  } +struct packet_offload *gro_find_receive_by_type(__be16 type) +{ +	struct list_head *offload_head = &offload_base; +	struct packet_offload *ptype; + +	list_for_each_entry_rcu(ptype, offload_head, list) { +		if (ptype->type != type || !ptype->callbacks.gro_receive) +			continue; +		return ptype; +	} +	return NULL; +} +EXPORT_SYMBOL(gro_find_receive_by_type); + +struct packet_offload *gro_find_complete_by_type(__be16 type) +{ +	struct list_head *offload_head = &offload_base; +	struct packet_offload *ptype; + +	list_for_each_entry_rcu(ptype, offload_head, list) { +		if (ptype->type != type || !ptype->callbacks.gro_complete) +			continue; +		return ptype; +	} +	return NULL; +} +EXPORT_SYMBOL(gro_find_complete_by_type);  static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)  {  	switch (ret) {  	case GRO_NORMAL: -		if (netif_receive_skb(skb)) +		if (netif_receive_skb_internal(skb))  			ret = GRO_DROP;  		break; @@ -3927,25 +4078,10 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)  	return ret;  } -static void skb_gro_reset_offset(struct sk_buff *skb) -{ -	const struct skb_shared_info *pinfo = skb_shinfo(skb); -	const skb_frag_t *frag0 = &pinfo->frags[0]; - -	NAPI_GRO_CB(skb)->data_offset = 0; -	NAPI_GRO_CB(skb)->frag0 = NULL; -	NAPI_GRO_CB(skb)->frag0_len = 0; - -	if (skb_mac_header(skb) == skb_tail_pointer(skb) && -	    pinfo->nr_frags && -	    !PageHighMem(skb_frag_page(frag0))) { -		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); -		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); -	} -} -  gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)  { +	trace_napi_gro_receive_entry(skb); +  	skb_gro_reset_offset(skb);  	return napi_skb_finish(dev_gro_receive(napi, skb), skb); @@ -3960,6 +4096,9 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)  	skb->vlan_tci = 0;  	skb->dev = napi->dev;  	skb->skb_iif = 0; +	skb->encapsulation = 0; +	skb_shinfo(skb)->gso_type = 0; +	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));  	napi->skb = skb;  } @@ -3970,24 +4109,22 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)  	if (!skb) {  		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); -		if (skb) -			napi->skb = skb; +		napi->skb = skb;  	}  	return skb;  }  EXPORT_SYMBOL(napi_get_frags); -static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, -			       gro_result_t ret) +static gro_result_t napi_frags_finish(struct napi_struct *napi, +				      struct sk_buff *skb, +				      gro_result_t ret)  {  	switch (ret) {  	case GRO_NORMAL:  	case GRO_HELD: +		__skb_push(skb, ETH_HLEN);  		skb->protocol = eth_type_trans(skb, skb->dev); - 
-		if (ret == GRO_HELD) -			skb_gro_pull(skb, -ETH_HLEN); -		else if (netif_receive_skb(skb)) +		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))  			ret = GRO_DROP;  		break; @@ -4003,39 +4140,42 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *  	return ret;  } +/* Upper GRO stack assumes network header starts at gro_offset=0 + * Drivers could call both napi_gro_frags() and napi_gro_receive() + * We copy ethernet header into skb->data to have a common layout. + */  static struct sk_buff *napi_frags_skb(struct napi_struct *napi)  {  	struct sk_buff *skb = napi->skb; -	struct ethhdr *eth; -	unsigned int hlen; -	unsigned int off; +	const struct ethhdr *eth; +	unsigned int hlen = sizeof(*eth);  	napi->skb = NULL;  	skb_reset_mac_header(skb);  	skb_gro_reset_offset(skb); -	off = skb_gro_offset(skb); -	hlen = off + sizeof(*eth); -	eth = skb_gro_header_fast(skb, off); -	if (skb_gro_header_hard(skb, hlen)) { -		eth = skb_gro_header_slow(skb, hlen, off); +	eth = skb_gro_header_fast(skb, 0); +	if (unlikely(skb_gro_header_hard(skb, hlen))) { +		eth = skb_gro_header_slow(skb, hlen, 0);  		if (unlikely(!eth)) {  			napi_reuse_skb(napi, skb); -			skb = NULL; -			goto out; +			return NULL;  		} +	} else { +		gro_pull_from_frag0(skb, hlen); +		NAPI_GRO_CB(skb)->frag0 += hlen; +		NAPI_GRO_CB(skb)->frag0_len -= hlen;  	} - -	skb_gro_pull(skb, sizeof(*eth)); +	__skb_pull(skb, hlen);  	/*  	 * This works because the only protocols we care about don't require -	 * special handling.  We'll fix it up properly at the end. +	 * special handling. +	 * We'll fix it up properly in napi_frags_finish()  	 */  	skb->protocol = eth->h_proto; -out:  	return skb;  } @@ -4046,12 +4186,14 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)  	if (!skb)  		return GRO_DROP; +	trace_napi_gro_frags_entry(skb); +  	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));  }  EXPORT_SYMBOL(napi_gro_frags);  /* - * net_rps_action sends any pending IPI's for rps. + * net_rps_action_and_irq_enable sends any pending IPI's for rps.   * Note: called with local irq disabled, but exits with local irq enabled.   */  static void net_rps_action_and_irq_enable(struct softnet_data *sd) @@ -4069,8 +4211,8 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)  			struct softnet_data *next = remsd->rps_ipi_next;  			if (cpu_online(remsd->cpu)) -				__smp_call_function_single(remsd->cpu, -							   &remsd->csd, 0); +				smp_call_function_single_async(remsd->cpu, +							   &remsd->csd);  			remsd = next;  		}  	} else @@ -4094,9 +4236,8 @@ static int process_backlog(struct napi_struct *napi, int quota)  #endif  	napi->weight = weight_p;  	local_irq_disable(); -	while (work < quota) { +	while (1) {  		struct sk_buff *skb; -		unsigned int qlen;  		while ((skb = __skb_dequeue(&sd->process_queue))) {  			local_irq_enable(); @@ -4110,24 +4251,24 @@ static int process_backlog(struct napi_struct *napi, int quota)  		}  		rps_lock(sd); -		qlen = skb_queue_len(&sd->input_pkt_queue); -		if (qlen) -			skb_queue_splice_tail_init(&sd->input_pkt_queue, -						   &sd->process_queue); - -		if (qlen < quota - work) { +		if (skb_queue_empty(&sd->input_pkt_queue)) {  			/*  			 * Inline a custom version of __napi_complete().  			 * only current cpu owns and manipulates this napi, -			 * and NAPI_STATE_SCHED is the only possible flag set on backlog. -			 * we can use a plain write instead of clear_bit(), +			 * and NAPI_STATE_SCHED is the only possible flag set +			 * on backlog. 
+			 * We can use a plain write instead of clear_bit(),  			 * and we dont need an smp_mb() memory barrier.  			 */  			list_del(&napi->poll_list);  			napi->state = 0; +			rps_unlock(sd); -			quota = work + qlen; +			break;  		} + +		skb_queue_splice_tail_init(&sd->input_pkt_queue, +					   &sd->process_queue);  		rps_unlock(sd);  	}  	local_irq_enable(); @@ -4157,7 +4298,7 @@ void __napi_complete(struct napi_struct *n)  	BUG_ON(n->gro_list);  	list_del(&n->poll_list); -	smp_mb__before_clear_bit(); +	smp_mb__before_atomic();  	clear_bit(NAPI_STATE_SCHED, &n->state);  }  EXPORT_SYMBOL(__napi_complete); @@ -4256,17 +4397,10 @@ EXPORT_SYMBOL(netif_napi_add);  void netif_napi_del(struct napi_struct *napi)  { -	struct sk_buff *skb, *next; -  	list_del_init(&napi->dev_list);  	napi_free_frags(napi); -	for (skb = napi->gro_list; skb; skb = next) { -		next = skb->next; -		skb->next = NULL; -		kfree_skb(skb); -	} - +	kfree_skb_list(napi->gro_list);  	napi->gro_list = NULL;  	napi->gro_count = 0;  } @@ -4373,44 +4507,29 @@ struct netdev_adjacent {  	/* upper master flag, there can only be one master device per list */  	bool master; -	/* indicates that this dev is our first-level lower/upper device */ -	bool neighbour; -  	/* counter for the number of times this device was added to us */  	u16 ref_nr; +	/* private field for the users */ +	void *private; +  	struct list_head list;  	struct rcu_head rcu;  };  static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,  						 struct net_device *adj_dev, -						 bool upper) +						 struct list_head *adj_list)  {  	struct netdev_adjacent *adj; -	struct list_head *dev_list; - -	dev_list = upper ? &dev->upper_dev_list : &dev->lower_dev_list; -	list_for_each_entry(adj, dev_list, list) { +	list_for_each_entry(adj, adj_list, list) {  		if (adj->dev == adj_dev)  			return adj;  	}  	return NULL;  } -static inline struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, -							  struct net_device *udev) -{ -	return __netdev_find_adj(dev, udev, true); -} - -static inline struct netdev_adjacent *__netdev_find_lower(struct net_device *dev, -							  struct net_device *ldev) -{ -	return __netdev_find_adj(dev, ldev, false); -} -  /**   * netdev_has_upper_dev - Check if device is linked to an upper device   * @dev: device @@ -4425,7 +4544,7 @@ bool netdev_has_upper_dev(struct net_device *dev,  {  	ASSERT_RTNL(); -	return __netdev_find_upper(dev, upper_dev); +	return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);  }  EXPORT_SYMBOL(netdev_has_upper_dev); @@ -4436,13 +4555,12 @@ EXPORT_SYMBOL(netdev_has_upper_dev);   * Find out if a device is linked to an upper device and return true in case   * it is. The caller must hold the RTNL lock.   
*/ -bool netdev_has_any_upper_dev(struct net_device *dev) +static bool netdev_has_any_upper_dev(struct net_device *dev)  {  	ASSERT_RTNL(); -	return !list_empty(&dev->upper_dev_list); +	return !list_empty(&dev->all_adj_list.upper);  } -EXPORT_SYMBOL(netdev_has_any_upper_dev);  /**   * netdev_master_upper_dev_get - Get master upper device @@ -4457,10 +4575,10 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)  	ASSERT_RTNL(); -	if (list_empty(&dev->upper_dev_list)) +	if (list_empty(&dev->adj_list.upper))  		return NULL; -	upper = list_first_entry(&dev->upper_dev_list, +	upper = list_first_entry(&dev->adj_list.upper,  				 struct netdev_adjacent, list);  	if (likely(upper->master))  		return upper->dev; @@ -4468,7 +4586,18 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)  }  EXPORT_SYMBOL(netdev_master_upper_dev_get); -/* netdev_upper_get_next_dev_rcu - Get the next dev from upper list +void *netdev_adjacent_get_private(struct list_head *adj_list) +{ +	struct netdev_adjacent *adj; + +	adj = list_entry(adj_list, struct netdev_adjacent, list); + +	return adj->private; +} +EXPORT_SYMBOL(netdev_adjacent_get_private); + +/** + * netdev_upper_get_next_dev_rcu - Get the next dev from upper list   * @dev: device   * @iter: list_head ** of the current position   * @@ -4480,11 +4609,11 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,  {  	struct netdev_adjacent *upper; -	WARN_ON_ONCE(!rcu_read_lock_held()); +	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());  	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); -	if (&upper->list == &dev->upper_dev_list) +	if (&upper->list == &dev->adj_list.upper)  		return NULL;  	*iter = &upper->list; @@ -4494,6 +4623,134 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,  EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);  /** + * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. + */ +struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, +						     struct list_head **iter) +{ +	struct netdev_adjacent *upper; + +	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); + +	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + +	if (&upper->list == &dev->all_adj_list.upper) +		return NULL; + +	*iter = &upper->list; + +	return upper->dev; +} +EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); + +/** + * netdev_lower_get_next_private - Get the next ->private from the + *				   lower neighbour list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold either hold the + * RTNL lock or its own locking that guarantees that the neighbour lower + * list will remain unchainged. 
+ */ +void *netdev_lower_get_next_private(struct net_device *dev, +				    struct list_head **iter) +{ +	struct netdev_adjacent *lower; + +	lower = list_entry(*iter, struct netdev_adjacent, list); + +	if (&lower->list == &dev->adj_list.lower) +		return NULL; + +	*iter = lower->list.next; + +	return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private); + +/** + * netdev_lower_get_next_private_rcu - Get the next ->private from the + *				       lower neighbour list, RCU + *				       variant + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold RCU read lock. + */ +void *netdev_lower_get_next_private_rcu(struct net_device *dev, +					struct list_head **iter) +{ +	struct netdev_adjacent *lower; + +	WARN_ON_ONCE(!rcu_read_lock_held()); + +	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + +	if (&lower->list == &dev->adj_list.lower) +		return NULL; + +	*iter = &lower->list; + +	return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); + +/** + * netdev_lower_get_next - Get the next device from the lower neighbour + *                         list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's lower neighbour + * list, starting from iter position. The caller must hold RTNL lock or + * its own locking that guarantees that the neighbour lower + * list will remain unchainged. + */ +void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) +{ +	struct netdev_adjacent *lower; + +	lower = list_entry((*iter)->next, struct netdev_adjacent, list); + +	if (&lower->list == &dev->adj_list.lower) +		return NULL; + +	*iter = &lower->list; + +	return lower->dev; +} +EXPORT_SYMBOL(netdev_lower_get_next); + +/** + * netdev_lower_get_first_private_rcu - Get the first ->private from the + *				       lower neighbour list, RCU + *				       variant + * @dev: device + * + * Gets the first netdev_adjacent->private from the dev's lower neighbour + * list. The caller must hold RCU read lock. + */ +void *netdev_lower_get_first_private_rcu(struct net_device *dev) +{ +	struct netdev_adjacent *lower; + +	lower = list_first_or_null_rcu(&dev->adj_list.lower, +			struct netdev_adjacent, list); +	if (lower) +		return lower->private; +	return NULL; +} +EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); + +/**   * netdev_master_upper_dev_get_rcu - Get master upper device   * @dev: device   * @@ -4504,7 +4761,7 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)  {  	struct netdev_adjacent *upper; -	upper = list_first_or_null_rcu(&dev->upper_dev_list, +	upper = list_first_or_null_rcu(&dev->adj_list.upper,  				       struct netdev_adjacent, list);  	if (upper && likely(upper->master))  		return upper->dev; @@ -4512,17 +4769,41 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)  }  EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); +static int netdev_adjacent_sysfs_add(struct net_device *dev, +			      struct net_device *adj_dev, +			      struct list_head *dev_list) +{ +	char linkname[IFNAMSIZ+7]; +	sprintf(linkname, dev_list == &dev->adj_list.upper ? 
+		"upper_%s" : "lower_%s", adj_dev->name); +	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), +				 linkname); +} +static void netdev_adjacent_sysfs_del(struct net_device *dev, +			       char *name, +			       struct list_head *dev_list) +{ +	char linkname[IFNAMSIZ+7]; +	sprintf(linkname, dev_list == &dev->adj_list.upper ? +		"upper_%s" : "lower_%s", name); +	sysfs_remove_link(&(dev->dev.kobj), linkname); +} + +#define netdev_adjacent_is_neigh_list(dev, dev_list) \ +		(dev_list == &dev->adj_list.upper || \ +		 dev_list == &dev->adj_list.lower) +  static int __netdev_adjacent_dev_insert(struct net_device *dev,  					struct net_device *adj_dev, -					bool neighbour, bool master, -					bool upper) +					struct list_head *dev_list, +					void *private, bool master)  {  	struct netdev_adjacent *adj; +	int ret; -	adj = __netdev_find_adj(dev, adj_dev, upper); +	adj = __netdev_find_adj(dev, adj_dev, dev_list);  	if (adj) { -		BUG_ON(neighbour);  		adj->ref_nr++;  		return 0;  	} @@ -4533,124 +4814,159 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,  	adj->dev = adj_dev;  	adj->master = master; -	adj->neighbour = neighbour;  	adj->ref_nr = 1; - +	adj->private = private;  	dev_hold(adj_dev); -	pr_debug("dev_hold for %s, because of %s link added from %s to %s\n", -		 adj_dev->name, upper ? "upper" : "lower", dev->name, -		 adj_dev->name); -	if (!upper) { -		list_add_tail_rcu(&adj->list, &dev->lower_dev_list); -		return 0; +	pr_debug("dev_hold for %s, because of link added from %s to %s\n", +		 adj_dev->name, dev->name, adj_dev->name); + +	if (netdev_adjacent_is_neigh_list(dev, dev_list)) { +		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); +		if (ret) +			goto free_adj;  	} -	/* Ensure that master upper link is always the first item in list. */ -	if (master) -		list_add_rcu(&adj->list, &dev->upper_dev_list); -	else -		list_add_tail_rcu(&adj->list, &dev->upper_dev_list); +	/* Ensure that master link is always the first item in list. 
*/ +	if (master) { +		ret = sysfs_create_link(&(dev->dev.kobj), +					&(adj_dev->dev.kobj), "master"); +		if (ret) +			goto remove_symlinks; + +		list_add_rcu(&adj->list, dev_list); +	} else { +		list_add_tail_rcu(&adj->list, dev_list); +	}  	return 0; -} -static inline int __netdev_upper_dev_insert(struct net_device *dev, -					    struct net_device *udev, -					    bool master, bool neighbour) -{ -	return __netdev_adjacent_dev_insert(dev, udev, neighbour, master, -					    true); -} +remove_symlinks: +	if (netdev_adjacent_is_neigh_list(dev, dev_list)) +		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); +free_adj: +	kfree(adj); +	dev_put(adj_dev); -static inline int __netdev_lower_dev_insert(struct net_device *dev, -					    struct net_device *ldev, -					    bool neighbour) -{ -	return __netdev_adjacent_dev_insert(dev, ldev, neighbour, false, -					    false); +	return ret;  } -void __netdev_adjacent_dev_remove(struct net_device *dev, -				  struct net_device *adj_dev, bool upper) +static void __netdev_adjacent_dev_remove(struct net_device *dev, +					 struct net_device *adj_dev, +					 struct list_head *dev_list)  {  	struct netdev_adjacent *adj; -	if (upper) -		adj = __netdev_find_upper(dev, adj_dev); -	else -		adj = __netdev_find_lower(dev, adj_dev); +	adj = __netdev_find_adj(dev, adj_dev, dev_list); -	if (!adj) +	if (!adj) { +		pr_err("tried to remove device %s from %s\n", +		       dev->name, adj_dev->name);  		BUG(); +	}  	if (adj->ref_nr > 1) { +		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, +			 adj->ref_nr-1);  		adj->ref_nr--;  		return;  	} +	if (adj->master) +		sysfs_remove_link(&(dev->dev.kobj), "master"); + +	if (netdev_adjacent_is_neigh_list(dev, dev_list)) +		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); +  	list_del_rcu(&adj->list); -	pr_debug("dev_put for %s, because of %s link removed from %s to %s\n", -		 adj_dev->name, upper ? 
"upper" : "lower", dev->name, -		 adj_dev->name); +	pr_debug("dev_put for %s, because link removed from %s to %s\n", +		 adj_dev->name, dev->name, adj_dev->name);  	dev_put(adj_dev);  	kfree_rcu(adj, rcu);  } -static inline void __netdev_upper_dev_remove(struct net_device *dev, -					     struct net_device *udev) -{ -	return __netdev_adjacent_dev_remove(dev, udev, true); -} - -static inline void __netdev_lower_dev_remove(struct net_device *dev, -					     struct net_device *ldev) -{ -	return __netdev_adjacent_dev_remove(dev, ldev, false); -} - -int __netdev_adjacent_dev_insert_link(struct net_device *dev, -				      struct net_device *upper_dev, -				      bool master, bool neighbour) +static int __netdev_adjacent_dev_link_lists(struct net_device *dev, +					    struct net_device *upper_dev, +					    struct list_head *up_list, +					    struct list_head *down_list, +					    void *private, bool master)  {  	int ret; -	ret = __netdev_upper_dev_insert(dev, upper_dev, master, neighbour); +	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, +					   master);  	if (ret)  		return ret; -	ret = __netdev_lower_dev_insert(upper_dev, dev, neighbour); +	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, +					   false);  	if (ret) { -		__netdev_upper_dev_remove(dev, upper_dev); +		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);  		return ret;  	}  	return 0;  } -static inline int __netdev_adjacent_dev_link(struct net_device *dev, -					     struct net_device *udev) +static int __netdev_adjacent_dev_link(struct net_device *dev, +				      struct net_device *upper_dev)  { -	return __netdev_adjacent_dev_insert_link(dev, udev, false, false); +	return __netdev_adjacent_dev_link_lists(dev, upper_dev, +						&dev->all_adj_list.upper, +						&upper_dev->all_adj_list.lower, +						NULL, false);  } -static inline int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, -						       struct net_device *udev, -						       bool master) +static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, +					       struct net_device *upper_dev, +					       struct list_head *up_list, +					       struct list_head *down_list)  { -	return __netdev_adjacent_dev_insert_link(dev, udev, master, true); +	__netdev_adjacent_dev_remove(dev, upper_dev, up_list); +	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);  } -void __netdev_adjacent_dev_unlink(struct net_device *dev, -				  struct net_device *upper_dev) +static void __netdev_adjacent_dev_unlink(struct net_device *dev, +					 struct net_device *upper_dev)  { -	__netdev_upper_dev_remove(dev, upper_dev); -	__netdev_lower_dev_remove(upper_dev, dev); +	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, +					   &dev->all_adj_list.upper, +					   &upper_dev->all_adj_list.lower);  } +static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, +						struct net_device *upper_dev, +						void *private, bool master) +{ +	int ret = __netdev_adjacent_dev_link(dev, upper_dev); + +	if (ret) +		return ret; + +	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, +					       &dev->adj_list.upper, +					       &upper_dev->adj_list.lower, +					       private, master); +	if (ret) { +		__netdev_adjacent_dev_unlink(dev, upper_dev); +		return ret; +	} + +	return 0; +} + +static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, +						   struct net_device *upper_dev) +{ +	__netdev_adjacent_dev_unlink(dev, upper_dev); +	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, +					   
&dev->adj_list.upper, +					   &upper_dev->adj_list.lower); +}  static int __netdev_upper_dev_link(struct net_device *dev, -				   struct net_device *upper_dev, bool master) +				   struct net_device *upper_dev, bool master, +				   void *private)  {  	struct netdev_adjacent *i, *j, *to_i, *to_j;  	int ret = 0; @@ -4661,26 +4977,29 @@ static int __netdev_upper_dev_link(struct net_device *dev,  		return -EBUSY;  	/* To prevent loops, check if dev is not upper device to upper_dev. */ -	if (__netdev_find_upper(upper_dev, dev)) +	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))  		return -EBUSY; -	if (__netdev_find_upper(dev, upper_dev)) +	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))  		return -EEXIST;  	if (master && netdev_master_upper_dev_get(dev))  		return -EBUSY; -	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, master); +	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, +						   master);  	if (ret)  		return ret;  	/* Now that we linked these devs, make all the upper_dev's -	 * upper_dev_list visible to every dev's lower_dev_list and vice +	 * all_adj_list.upper visible to every dev's all_adj_list.lower an  	 * versa, and don't forget the devices itself. All of these  	 * links are non-neighbours.  	 */ -	list_for_each_entry(i, &dev->lower_dev_list, list) { -		list_for_each_entry(j, &upper_dev->upper_dev_list, list) { +	list_for_each_entry(i, &dev->all_adj_list.lower, list) { +		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { +			pr_debug("Interlinking %s with %s, non-neighbour\n", +				 i->dev->name, j->dev->name);  			ret = __netdev_adjacent_dev_link(i->dev, j->dev);  			if (ret)  				goto rollback_mesh; @@ -4688,14 +5007,18 @@ static int __netdev_upper_dev_link(struct net_device *dev,  	}  	/* add dev to every upper_dev's upper device */ -	list_for_each_entry(i, &upper_dev->upper_dev_list, list) { +	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { +		pr_debug("linking %s's upper device %s with %s\n", +			 upper_dev->name, i->dev->name, dev->name);  		ret = __netdev_adjacent_dev_link(dev, i->dev);  		if (ret)  			goto rollback_upper_mesh;  	}  	/* add upper_dev to every dev's lower device */ -	list_for_each_entry(i, &dev->lower_dev_list, list) { +	list_for_each_entry(i, &dev->all_adj_list.lower, list) { +		pr_debug("linking %s's lower device %s with %s\n", dev->name, +			 i->dev->name, upper_dev->name);  		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);  		if (ret)  			goto rollback_lower_mesh; @@ -4706,7 +5029,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,  rollback_lower_mesh:  	to_i = i; -	list_for_each_entry(i, &dev->lower_dev_list, list) { +	list_for_each_entry(i, &dev->all_adj_list.lower, list) {  		if (i == to_i)  			break;  		__netdev_adjacent_dev_unlink(i->dev, upper_dev); @@ -4716,7 +5039,7 @@ rollback_lower_mesh:  rollback_upper_mesh:  	to_i = i; -	list_for_each_entry(i, &upper_dev->upper_dev_list, list) { +	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {  		if (i == to_i)  			break;  		__netdev_adjacent_dev_unlink(dev, i->dev); @@ -4727,8 +5050,8 @@ rollback_upper_mesh:  rollback_mesh:  	to_i = i;  	to_j = j; -	list_for_each_entry(i, &dev->lower_dev_list, list) { -		list_for_each_entry(j, &upper_dev->upper_dev_list, list) { +	list_for_each_entry(i, &dev->all_adj_list.lower, list) { +		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {  			if (i == to_i && j == to_j)  				break;  			__netdev_adjacent_dev_unlink(i->dev, 
j->dev); @@ -4737,7 +5060,7 @@ rollback_mesh:  			break;  	} -	__netdev_adjacent_dev_unlink(dev, upper_dev); +	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);  	return ret;  } @@ -4755,7 +5078,7 @@ rollback_mesh:  int netdev_upper_dev_link(struct net_device *dev,  			  struct net_device *upper_dev)  { -	return __netdev_upper_dev_link(dev, upper_dev, false); +	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);  }  EXPORT_SYMBOL(netdev_upper_dev_link); @@ -4773,10 +5096,18 @@ EXPORT_SYMBOL(netdev_upper_dev_link);  int netdev_master_upper_dev_link(struct net_device *dev,  				 struct net_device *upper_dev)  { -	return __netdev_upper_dev_link(dev, upper_dev, true); +	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);  }  EXPORT_SYMBOL(netdev_master_upper_dev_link); +int netdev_master_upper_dev_link_private(struct net_device *dev, +					 struct net_device *upper_dev, +					 void *private) +{ +	return __netdev_upper_dev_link(dev, upper_dev, true, private); +} +EXPORT_SYMBOL(netdev_master_upper_dev_link_private); +  /**   * netdev_upper_dev_unlink - Removes a link to upper device   * @dev: device @@ -4791,38 +5122,96 @@ void netdev_upper_dev_unlink(struct net_device *dev,  	struct netdev_adjacent *i, *j;  	ASSERT_RTNL(); -	__netdev_adjacent_dev_unlink(dev, upper_dev); +	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);  	/* Here is the tricky part. We must remove all dev's lower  	 * devices from all upper_dev's upper devices and vice  	 * versa, to maintain the graph relationship.  	 */ -	list_for_each_entry(i, &dev->lower_dev_list, list) -		list_for_each_entry(j, &upper_dev->upper_dev_list, list) +	list_for_each_entry(i, &dev->all_adj_list.lower, list) +		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)  			__netdev_adjacent_dev_unlink(i->dev, j->dev);  	/* remove also the devices itself from lower/upper device  	 * list  	 */ -	list_for_each_entry(i, &dev->lower_dev_list, list) +	list_for_each_entry(i, &dev->all_adj_list.lower, list)  		__netdev_adjacent_dev_unlink(i->dev, upper_dev); -	list_for_each_entry(i, &upper_dev->upper_dev_list, list) +	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)  		__netdev_adjacent_dev_unlink(dev, i->dev);  	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);  }  EXPORT_SYMBOL(netdev_upper_dev_unlink); +void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) +{ +	struct netdev_adjacent *iter; + +	list_for_each_entry(iter, &dev->adj_list.upper, list) { +		netdev_adjacent_sysfs_del(iter->dev, oldname, +					  &iter->dev->adj_list.lower); +		netdev_adjacent_sysfs_add(iter->dev, dev, +					  &iter->dev->adj_list.lower); +	} + +	list_for_each_entry(iter, &dev->adj_list.lower, list) { +		netdev_adjacent_sysfs_del(iter->dev, oldname, +					  &iter->dev->adj_list.upper); +		netdev_adjacent_sysfs_add(iter->dev, dev, +					  &iter->dev->adj_list.upper); +	} +} + +void *netdev_lower_dev_get_private(struct net_device *dev, +				   struct net_device *lower_dev) +{ +	struct netdev_adjacent *lower; + +	if (!lower_dev) +		return NULL; +	lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower); +	if (!lower) +		return NULL; + +	return lower->private; +} +EXPORT_SYMBOL(netdev_lower_dev_get_private); + + +int dev_get_nest_level(struct net_device *dev, +		       bool (*type_check)(struct net_device *dev)) +{ +	struct net_device *lower = NULL; +	struct list_head *iter; +	int max_nest = -1; +	int nest; + +	ASSERT_RTNL(); + +	netdev_for_each_lower_dev(dev, lower, iter) { +		nest = dev_get_nest_level(lower, 
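The new netdev_master_upper_dev_link_private()/netdev_lower_dev_get_private() pair above lets a master device attach per-link private data to the adjacency entry and look it up later. A minimal sketch of how such a driver might use it; the myteam_* names are hypothetical and not part of this patch, and rtnl_lock() is assumed to be held by the caller.

/* Illustrative sketch only; myteam_* names are hypothetical. */
#include <linux/netdevice.h>
#include <linux/slab.h>

struct myteam_port {
	struct net_device *dev;
	int id;
};

static int myteam_enslave(struct net_device *master,
			  struct net_device *port_dev)
{
	struct myteam_port *port;
	int err;

	port = kzalloc(sizeof(*port), GFP_KERNEL);
	if (!port)
		return -ENOMEM;
	port->dev = port_dev;

	/* Link port_dev (lower) under master (upper) and hang the
	 * per-link private data off the adjacency entry. */
	err = netdev_master_upper_dev_link_private(port_dev, master, port);
	if (err) {
		kfree(port);
		return err;
	}
	return 0;
}

static struct myteam_port *myteam_port_get(struct net_device *master,
					   struct net_device *port_dev)
{
	/* Looks only at master's direct adj_list.lower neighbours. */
	return netdev_lower_dev_get_private(master, port_dev);
}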
type_check); +		if (max_nest < nest) +			max_nest = nest; +	} + +	if (type_check(dev)) +		max_nest++; + +	return max_nest; +} +EXPORT_SYMBOL(dev_get_nest_level); +  static void dev_change_rx_flags(struct net_device *dev, int flags)  {  	const struct net_device_ops *ops = dev->netdev_ops; -	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags) +	if (ops->ndo_change_rx_flags)  		ops->ndo_change_rx_flags(dev, flags);  } -static int __dev_set_promiscuity(struct net_device *dev, int inc) +static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)  {  	unsigned int old_flags = dev->flags;  	kuid_t uid; @@ -4865,6 +5254,8 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc)  		dev_change_rx_flags(dev, IFF_PROMISC);  	} +	if (notify) +		__dev_notify_flags(dev, old_flags, IFF_PROMISC);  	return 0;  } @@ -4884,7 +5275,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc)  	unsigned int old_flags = dev->flags;  	int err; -	err = __dev_set_promiscuity(dev, inc); +	err = __dev_set_promiscuity(dev, inc, true);  	if (err < 0)  		return err;  	if (dev->flags != old_flags) @@ -4893,22 +5284,9 @@ int dev_set_promiscuity(struct net_device *dev, int inc)  }  EXPORT_SYMBOL(dev_set_promiscuity); -/** - *	dev_set_allmulti	- update allmulti count on a device - *	@dev: device - *	@inc: modifier - * - *	Add or remove reception of all multicast frames to a device. While the - *	count in the device remains above zero the interface remains listening - *	to all interfaces. Once it hits zero the device reverts back to normal - *	filtering operation. A negative @inc value is used to drop the counter - *	when releasing a resource needing all multicasts. - *	Return 0 if successful or a negative errno code on error. - */ - -int dev_set_allmulti(struct net_device *dev, int inc) +static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)  { -	unsigned int old_flags = dev->flags; +	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;  	ASSERT_RTNL(); @@ -4931,9 +5309,30 @@ int dev_set_allmulti(struct net_device *dev, int inc)  	if (dev->flags ^ old_flags) {  		dev_change_rx_flags(dev, IFF_ALLMULTI);  		dev_set_rx_mode(dev); +		if (notify) +			__dev_notify_flags(dev, old_flags, +					   dev->gflags ^ old_gflags);  	}  	return 0;  } + +/** + *	dev_set_allmulti	- update allmulti count on a device + *	@dev: device + *	@inc: modifier + * + *	Add or remove reception of all multicast frames to a device. While the + *	count in the device remains above zero the interface remains listening + *	to all interfaces. Once it hits zero the device reverts back to normal + *	filtering operation. A negative @inc value is used to drop the counter + *	when releasing a resource needing all multicasts. + *	Return 0 if successful or a negative errno code on error. + */ + +int dev_set_allmulti(struct net_device *dev, int inc) +{ +	return __dev_set_allmulti(dev, inc, true); +}  EXPORT_SYMBOL(dev_set_allmulti);  /* @@ -4958,10 +5357,10 @@ void __dev_set_rx_mode(struct net_device *dev)  		 * therefore calling __dev_set_promiscuity here is safe.  		 
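The kernel-doc moved above spells out the reference-count semantics of dev_set_allmulti(); the usual caller pattern is to take one reference while a stacked device is up and drop it on stop. A hedged sketch with hypothetical mymac_* names, assuming both callbacks run under rtnl_lock() as ndo_open/ndo_stop do.

/* Illustrative sketch only; mymac_* names are hypothetical. */
#include <linux/netdevice.h>

struct mymac_priv {
	struct net_device *lowerdev;
};

static int mymac_open(struct net_device *dev)
{
	struct mymac_priv *priv = netdev_priv(dev);

	/* +1: keep the lower device receiving all multicast frames
	 * for as long as we are up. */
	return dev_set_allmulti(priv->lowerdev, 1);
}

static int mymac_stop(struct net_device *dev)
{
	struct mymac_priv *priv = netdev_priv(dev);

	/* -1: once the count drops to zero the lower device reverts
	 * to normal multicast filtering. */
	dev_set_allmulti(priv->lowerdev, -1);
	return 0;
}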
*/  		if (!netdev_uc_empty(dev) && !dev->uc_promisc) { -			__dev_set_promiscuity(dev, 1); +			__dev_set_promiscuity(dev, 1, false);  			dev->uc_promisc = true;  		} else if (netdev_uc_empty(dev) && dev->uc_promisc) { -			__dev_set_promiscuity(dev, -1); +			__dev_set_promiscuity(dev, -1, false);  			dev->uc_promisc = false;  		}  	} @@ -5050,9 +5449,13 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)  	if ((flags ^ dev->gflags) & IFF_PROMISC) {  		int inc = (flags & IFF_PROMISC) ? 1 : -1; +		unsigned int old_flags = dev->flags;  		dev->gflags ^= IFF_PROMISC; -		dev_set_promiscuity(dev, inc); + +		if (__dev_set_promiscuity(dev, inc, false) >= 0) +			if (dev->flags != old_flags) +				dev_set_rx_mode(dev);  	}  	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI @@ -5063,16 +5466,20 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)  		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;  		dev->gflags ^= IFF_ALLMULTI; -		dev_set_allmulti(dev, inc); +		__dev_set_allmulti(dev, inc, false);  	}  	return ret;  } -void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, +			unsigned int gchanges)  {  	unsigned int changes = dev->flags ^ old_flags; +	if (gchanges) +		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); +  	if (changes & IFF_UP) {  		if (dev->flags & IFF_UP)  			call_netdevice_notifiers(NETDEV_UP, dev); @@ -5101,21 +5508,29 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)  int dev_change_flags(struct net_device *dev, unsigned int flags)  {  	int ret; -	unsigned int changes, old_flags = dev->flags; +	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;  	ret = __dev_change_flags(dev, flags);  	if (ret < 0)  		return ret; -	changes = old_flags ^ dev->flags; -	if (changes) -		rtmsg_ifinfo(RTM_NEWLINK, dev, changes); - -	__dev_notify_flags(dev, old_flags); +	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); +	__dev_notify_flags(dev, old_flags, changes);  	return ret;  }  EXPORT_SYMBOL(dev_change_flags); +static int __dev_set_mtu(struct net_device *dev, int new_mtu) +{ +	const struct net_device_ops *ops = dev->netdev_ops; + +	if (ops->ndo_change_mtu) +		return ops->ndo_change_mtu(dev, new_mtu); + +	dev->mtu = new_mtu; +	return 0; +} +  /**   *	dev_set_mtu - Change maximum transfer unit   *	@dev: device @@ -5125,8 +5540,7 @@ EXPORT_SYMBOL(dev_change_flags);   */  int dev_set_mtu(struct net_device *dev, int new_mtu)  { -	const struct net_device_ops *ops = dev->netdev_ops; -	int err; +	int err, orig_mtu;  	if (new_mtu == dev->mtu)  		return 0; @@ -5138,14 +5552,25 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)  	if (!netif_device_present(dev))  		return -ENODEV; -	err = 0; -	if (ops->ndo_change_mtu) -		err = ops->ndo_change_mtu(dev, new_mtu); -	else -		dev->mtu = new_mtu; +	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); +	err = notifier_to_errno(err); +	if (err) +		return err; -	if (!err) -		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); +	orig_mtu = dev->mtu; +	err = __dev_set_mtu(dev, new_mtu); + +	if (!err) { +		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); +		err = notifier_to_errno(err); +		if (err) { +			/* setting mtu back and notifying everyone again, +			 * so that they have a chance to revert changes. 
+			 */ +			__dev_set_mtu(dev, orig_mtu); +			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); +		} +	}  	return err;  }  EXPORT_SYMBOL(dev_set_mtu); @@ -5247,15 +5672,18 @@ static int dev_new_index(struct net *net)  /* Delayed registration/unregisteration */  static LIST_HEAD(net_todo_list); +DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);  static void net_set_todo(struct net_device *dev)  {  	list_add_tail(&dev->todo_list, &net_todo_list); +	dev_net(dev)->dev_unreg_count++;  }  static void rollback_registered_many(struct list_head *head)  {  	struct net_device *dev, *tmp; +	LIST_HEAD(close_head);  	BUG_ON(dev_boot_phase);  	ASSERT_RTNL(); @@ -5278,7 +5706,9 @@ static void rollback_registered_many(struct list_head *head)  	}  	/* If device is running, close it first. */ -	dev_close_many(head); +	list_for_each_entry(dev, head, unreg_list) +		list_add_tail(&dev->close_list, &close_head); +	dev_close_many(&close_head);  	list_for_each_entry(dev, head, unreg_list) {  		/* And unlink it from device chain. */ @@ -5299,10 +5729,6 @@ static void rollback_registered_many(struct list_head *head)  		*/  		call_netdevice_notifiers(NETDEV_UNREGISTER, dev); -		if (!dev->rtnl_link_ops || -		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED) -			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); -  		/*  		 *	Flush the unicast and multicast chains  		 */ @@ -5312,6 +5738,10 @@ static void rollback_registered_many(struct list_head *head)  		if (dev->netdev_ops->ndo_uninit)  			dev->netdev_ops->ndo_uninit(dev); +		if (!dev->rtnl_link_ops || +		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED) +			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); +  		/* Notifier chain MUST detach us all upper devices. */  		WARN_ON(netdev_has_any_upper_dev(dev)); @@ -5395,6 +5825,13 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,  		}  	} +#ifdef CONFIG_NET_RX_BUSY_POLL +	if (dev->netdev_ops->ndo_busy_poll) +		features |= NETIF_F_BUSY_POLL; +	else +#endif +		features &= ~NETIF_F_BUSY_POLL; +  	return features;  } @@ -5494,7 +5931,7 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,  }  EXPORT_SYMBOL(netif_stacked_transfer_operstate); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS  static int netif_alloc_rx_queues(struct net_device *dev)  {  	unsigned int i, count = dev->num_rx_queues; @@ -5530,10 +5967,7 @@ static void netdev_init_one_queue(struct net_device *dev,  static void netif_free_tx_queues(struct net_device *dev)  { -	if (is_vmalloc_addr(dev->_tx)) -		vfree(dev->_tx); -	else -		kfree(dev->_tx); +	kvfree(dev->_tx);  }  static int netif_alloc_netdev_queues(struct net_device *dev) @@ -5633,13 +6067,8 @@ int register_netdevice(struct net_device *dev)  	dev->features |= NETIF_F_SOFT_FEATURES;  	dev->wanted_features = dev->features & dev->hw_features; -	/* Turn on no cache copy if HW is doing checksum */  	if (!(dev->flags & IFF_LOOPBACK)) {  		dev->hw_features |= NETIF_F_NOCACHE_COPY; -		if (dev->features & NETIF_F_ALL_CSUM) { -			dev->wanted_features |= NETIF_F_NOCACHE_COPY; -			dev->features |= NETIF_F_NOCACHE_COPY; -		}  	}  	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices. 
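With the hunk above, dev_set_mtu() announces NETDEV_PRECHANGEMTU before anything changes and rolls the MTU back if a NETDEV_CHANGEMTU listener returns an error. A hedged sketch of a listener relying on that rollback; the myproto_* names and the 9000-byte limit are hypothetical.

/* Illustrative sketch only; myproto_* names and the limit are hypothetical. */
#include <linux/netdevice.h>
#include <linux/notifier.h>

#define MYPROTO_MAX_MTU	9000

static int myproto_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_PRECHANGEMTU:
		/* Runs before anything changes; dev->mtu still holds
		 * the old value, and an error here aborts early. */
		break;
	case NETDEV_CHANGEMTU:
		/* dev->mtu already holds the new value; returning an
		 * error makes the core restore the old MTU and notify
		 * everyone again via the rollback path above. */
		if (dev->mtu > MYPROTO_MAX_MTU)
			return notifier_from_errno(-EINVAL);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block myproto_netdev_nb = {
	.notifier_call = myproto_netdev_event,
};
/* register_netdevice_notifier(&myproto_netdev_nb) at init time. */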
@@ -5700,7 +6129,7 @@ int register_netdevice(struct net_device *dev)  	 */  	if (!dev->rtnl_link_ops ||  	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED) -		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); +		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);  out:  	return ret; @@ -5918,6 +6347,12 @@ void netdev_run_todo(void)  		if (dev->destructor)  			dev->destructor(dev); +		/* Report a network device has been unregistered */ +		rtnl_lock(); +		dev_net(dev)->dev_unreg_count--; +		__rtnl_unlock(); +		wake_up(&netdev_unregistering_wq); +  		/* Free network device */  		kobject_put(&dev->dev.kobj);  	} @@ -5969,6 +6404,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,  		netdev_stats_to_stats64(storage, &dev->stats);  	}  	storage->rx_dropped += atomic_long_read(&dev->rx_dropped); +	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);  	return storage;  }  EXPORT_SYMBOL(dev_get_stats); @@ -6001,6 +6437,13 @@ void netdev_set_default_ethtool_ops(struct net_device *dev,  }  EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); +void netdev_freemem(struct net_device *dev) +{ +	char *addr = (char *)dev - dev->padded; + +	kvfree(addr); +} +  /**   *	alloc_netdev_mqs - allocate network device   *	@sizeof_priv:	size of private data to allocate space for @@ -6010,7 +6453,7 @@ EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);   *	@rxqs:		the number of RX subqueues to allocate   *   *	Allocates a struct net_device with private data area for driver use - *	and performs basic initialization.  Also allocates subquue structs + *	and performs basic initialization.  Also allocates subqueue structs   *	for each queue on the device.   */  struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, @@ -6028,7 +6471,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,  		return NULL;  	} -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS  	if (rxqs < 1) {  		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");  		return NULL; @@ -6044,7 +6487,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,  	/* ensure 32-byte alignment of whole construct */  	alloc_size += NETDEV_ALIGN - 1; -	p = kzalloc(alloc_size, GFP_KERNEL); +	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); +	if (!p) +		p = vzalloc(alloc_size);  	if (!p)  		return NULL; @@ -6053,7 +6498,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,  	dev->pcpu_refcnt = alloc_percpu(int);  	if (!dev->pcpu_refcnt) -		goto free_p; +		goto free_dev;  	if (dev_addr_init(dev))  		goto free_pcpu; @@ -6068,9 +6513,12 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,  	INIT_LIST_HEAD(&dev->napi_list);  	INIT_LIST_HEAD(&dev->unreg_list); +	INIT_LIST_HEAD(&dev->close_list);  	INIT_LIST_HEAD(&dev->link_watch_list); -	INIT_LIST_HEAD(&dev->upper_dev_list); -	INIT_LIST_HEAD(&dev->lower_dev_list); +	INIT_LIST_HEAD(&dev->adj_list.upper); +	INIT_LIST_HEAD(&dev->adj_list.lower); +	INIT_LIST_HEAD(&dev->all_adj_list.upper); +	INIT_LIST_HEAD(&dev->all_adj_list.lower);  	dev->priv_flags = IFF_XMIT_DST_RELEASE;  	setup(dev); @@ -6079,7 +6527,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,  	if (netif_alloc_netdev_queues(dev))  		goto free_all; -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS  	dev->num_rx_queues = rxqs;  	dev->real_num_rx_queues = rxqs;  	if (netif_alloc_rx_queues(dev)) @@ -6098,13 +6546,8 @@ free_all:  free_pcpu:  	free_percpu(dev->pcpu_refcnt); -	netif_free_tx_queues(dev); -#ifdef CONFIG_RPS -	
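alloc_netdev_mqs() above now tries a quiet, physically contiguous kzalloc() first and falls back to vzalloc(), with netdev_freemem()/kvfree() able to release either. The same idiom in isolation, with hypothetical mybuf_* names:

/* Illustrative sketch only; mybuf_* names are hypothetical. */
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *mybuf_alloc(size_t size)
{
	void *p;

	/* __GFP_NOWARN keeps a contiguous-allocation failure quiet,
	 * since the vmalloc fallback below can still succeed. */
	p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!p)
		p = vzalloc(size);
	return p;
}

static void mybuf_free(void *p)
{
	kvfree(p);	/* handles kmalloc'ed and vmalloc'ed pointers alike */
}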
kfree(dev->_rx); -#endif - -free_p: -	kfree(p); +free_dev: +	netdev_freemem(dev);  	return NULL;  }  EXPORT_SYMBOL(alloc_netdev_mqs); @@ -6124,7 +6567,7 @@ void free_netdev(struct net_device *dev)  	release_net(dev_net(dev));  	netif_free_tx_queues(dev); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS  	kfree(dev->_rx);  #endif @@ -6141,7 +6584,7 @@ void free_netdev(struct net_device *dev)  	/*  Compatibility with error handling in drivers */  	if (dev->reg_state == NETREG_UNINITIALIZED) { -		kfree((char *)dev - dev->padded); +		netdev_freemem(dev);  		return;  	} @@ -6199,6 +6642,9 @@ EXPORT_SYMBOL(unregister_netdevice_queue);  /**   *	unregister_netdevice_many - unregister many devices   *	@head: list of devices + * + *  Note: As most callers use a stack allocated list_head, + *  we force a list_del() to make sure stack wont be corrupted later.   */  void unregister_netdevice_many(struct list_head *head)  { @@ -6208,6 +6654,7 @@ void unregister_netdevice_many(struct list_head *head)  		rollback_registered_many(head);  		list_for_each_entry(dev, head, unreg_list)  			net_set_todo(dev); +		list_del(head);  	}  }  EXPORT_SYMBOL(unregister_netdevice_many); @@ -6303,7 +6750,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);  	rcu_barrier();  	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); -	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); +	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);  	/*  	 *	Flush the unicast and multicast chains @@ -6342,7 +6789,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  	 *	Prevent userspace races by waiting until the network  	 *	device is fully setup before sending notifications.  	 */ -	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); +	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);  	synchronize_net();  	err = 0; @@ -6394,11 +6841,11 @@ static int dev_cpu_callback(struct notifier_block *nfb,  	/* Process offline CPU's input_pkt_queue */  	while ((skb = __skb_dequeue(&oldsd->process_queue))) { -		netif_rx(skb); +		netif_rx_internal(skb);  		input_queue_head_incr(oldsd);  	}  	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { -		netif_rx(skb); +		netif_rx_internal(skb);  		input_queue_head_incr(oldsd);  	} @@ -6603,6 +7050,34 @@ static void __net_exit default_device_exit(struct net *net)  	rtnl_unlock();  } +static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) +{ +	/* Return with the rtnl_lock held when there are no network +	 * devices unregistering in any network namespace in net_list. 
+	 */ +	struct net *net; +	bool unregistering; +	DEFINE_WAIT(wait); + +	for (;;) { +		prepare_to_wait(&netdev_unregistering_wq, &wait, +				TASK_UNINTERRUPTIBLE); +		unregistering = false; +		rtnl_lock(); +		list_for_each_entry(net, net_list, exit_list) { +			if (net->dev_unreg_count > 0) { +				unregistering = true; +				break; +			} +		} +		if (!unregistering) +			break; +		__rtnl_unlock(); +		schedule(); +	} +	finish_wait(&netdev_unregistering_wq, &wait); +} +  static void __net_exit default_device_exit_batch(struct list_head *net_list)  {  	/* At exit all network devices most be removed from a network @@ -6614,7 +7089,18 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)  	struct net *net;  	LIST_HEAD(dev_kill_list); -	rtnl_lock(); +	/* To prevent network device cleanup code from dereferencing +	 * loopback devices or network devices that have been freed +	 * wait here for all pending unregistrations to complete, +	 * before unregistring the loopback device and allowing the +	 * network namespace be freed. +	 * +	 * The netdev todo list containing all network devices +	 * unregistrations that happen in default_device_exit_batch +	 * will run in the rtnl_unlock() at the end of +	 * default_device_exit_batch. +	 */ +	rtnl_lock_unregistering(net_list);  	list_for_each_entry(net, net_list, exit_list) {  		for_each_netdev_reverse(net, dev) {  			if (dev->rtnl_link_ops) @@ -6624,7 +7110,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)  		}  	}  	unregister_netdevice_many(&dev_kill_list); -	list_del(&dev_kill_list);  	rtnl_unlock();  } @@ -6672,28 +7157,18 @@ static int __init net_dev_init(void)  	for_each_possible_cpu(i) {  		struct softnet_data *sd = &per_cpu(softnet_data, i); -		memset(sd, 0, sizeof(*sd));  		skb_queue_head_init(&sd->input_pkt_queue);  		skb_queue_head_init(&sd->process_queue); -		sd->completion_queue = NULL;  		INIT_LIST_HEAD(&sd->poll_list); -		sd->output_queue = NULL;  		sd->output_queue_tailp = &sd->output_queue;  #ifdef CONFIG_RPS  		sd->csd.func = rps_trigger_softirq;  		sd->csd.info = sd; -		sd->csd.flags = 0;  		sd->cpu = i;  #endif  		sd->backlog.poll = process_backlog;  		sd->backlog.weight = weight_p; -		sd->backlog.gro_list = NULL; -		sd->backlog.gro_count = 0; - -#ifdef CONFIG_NET_FLOW_LIMIT -		sd->flow_limit = NULL; -#endif  	}  	dev_boot_phase = 0; diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 6cda4e2c213..b6b230600b9 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -38,7 +38,7 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,  	ha->type = addr_type;  	ha->refcount = 1;  	ha->global_use = global; -	ha->synced = sync; +	ha->synced = sync ? 
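rtnl_lock_unregistering() above open-codes the classic prepare_to_wait()/schedule()/finish_wait() loop because its condition has to be re-evaluated with the RTNL held. The bare idiom looks like this; the mymod_* names are hypothetical.

/* Illustrative sketch only; mymod_* names are hypothetical. */
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(mymod_wq);
static DEFINE_SPINLOCK(mymod_lock);
static int mymod_busy_count;

static void mymod_wait_until_idle(void)
{
	DEFINE_WAIT(wait);
	bool busy;

	for (;;) {
		/* Queue ourselves *before* checking the condition so a
		 * concurrent wake_up() cannot be missed. */
		prepare_to_wait(&mymod_wq, &wait, TASK_UNINTERRUPTIBLE);
		spin_lock(&mymod_lock);
		busy = mymod_busy_count > 0;
		spin_unlock(&mymod_lock);
		if (!busy)
			break;
		schedule();
	}
	finish_wait(&mymod_wq, &wait);
}
/* Whoever drops mymod_busy_count to zero calls wake_up(&mymod_wq). */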
1 : 0;  	ha->sync_cnt = 0;  	list_add_tail_rcu(&ha->list, &list->list);  	list->count++; @@ -48,7 +48,8 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,  static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,  			    const unsigned char *addr, int addr_len, -			    unsigned char addr_type, bool global, bool sync) +			    unsigned char addr_type, bool global, bool sync, +			    int sync_count)  {  	struct netdev_hw_addr *ha; @@ -66,10 +67,10 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,  					ha->global_use = true;  			}  			if (sync) { -				if (ha->synced) +				if (ha->synced && sync_count)  					return -EEXIST;  				else -					ha->synced = true; +					ha->synced++;  			}  			ha->refcount++;  			return 0; @@ -84,7 +85,8 @@ static int __hw_addr_add(struct netdev_hw_addr_list *list,  			 const unsigned char *addr, int addr_len,  			 unsigned char addr_type)  { -	return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false); +	return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false, +				0);  }  static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, @@ -101,7 +103,7 @@ static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,  		ha->global_use = false;  	if (sync) -		ha->synced = false; +		ha->synced--;  	if (--ha->refcount)  		return 0; @@ -139,7 +141,7 @@ static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list,  	int err;  	err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type, -			       false, true); +			       false, true, ha->sync_cnt);  	if (err && err != -EEXIST)  		return err; @@ -186,47 +188,6 @@ static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,  	return err;  } -int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, -			   struct netdev_hw_addr_list *from_list, -			   int addr_len, unsigned char addr_type) -{ -	int err; -	struct netdev_hw_addr *ha, *ha2; -	unsigned char type; - -	list_for_each_entry(ha, &from_list->list, list) { -		type = addr_type ? addr_type : ha->type; -		err = __hw_addr_add(to_list, ha->addr, addr_len, type); -		if (err) -			goto unroll; -	} -	return 0; - -unroll: -	list_for_each_entry(ha2, &from_list->list, list) { -		if (ha2 == ha) -			break; -		type = addr_type ? addr_type : ha2->type; -		__hw_addr_del(to_list, ha2->addr, addr_len, type); -	} -	return err; -} -EXPORT_SYMBOL(__hw_addr_add_multiple); - -void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, -			    struct netdev_hw_addr_list *from_list, -			    int addr_len, unsigned char addr_type) -{ -	struct netdev_hw_addr *ha; -	unsigned char type; - -	list_for_each_entry(ha, &from_list->list, list) { -		type = addr_type ? addr_type : ha->type; -		__hw_addr_del(to_list, ha->addr, addr_len, type); -	} -} -EXPORT_SYMBOL(__hw_addr_del_multiple); -  /* This function only works where there is a strict 1-1 relationship   * between source and destionation of they synch. 
If you ever need to   * sync addresses to more then 1 destination, you need to use @@ -264,7 +225,92 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,  }  EXPORT_SYMBOL(__hw_addr_unsync); -void __hw_addr_flush(struct netdev_hw_addr_list *list) +/** + *  __hw_addr_sync_dev - Synchonize device's multicast list + *  @list: address list to syncronize + *  @dev:  device to sync + *  @sync: function to call if address should be added + *  @unsync: function to call if address should be removed + * + *  This funciton is intended to be called from the ndo_set_rx_mode + *  function of devices that require explicit address add/remove + *  notifications.  The unsync function may be NULL in which case + *  the addresses requiring removal will simply be removed without + *  any notification to the device. + **/ +int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, +		       struct net_device *dev, +		       int (*sync)(struct net_device *, const unsigned char *), +		       int (*unsync)(struct net_device *, +				     const unsigned char *)) +{ +	struct netdev_hw_addr *ha, *tmp; +	int err; + +	/* first go through and flush out any stale entries */ +	list_for_each_entry_safe(ha, tmp, &list->list, list) { +		if (!ha->sync_cnt || ha->refcount != 1) +			continue; + +		/* if unsync is defined and fails defer unsyncing address */ +		if (unsync && unsync(dev, ha->addr)) +			continue; + +		ha->sync_cnt--; +		__hw_addr_del_entry(list, ha, false, false); +	} + +	/* go through and sync new entries to the list */ +	list_for_each_entry_safe(ha, tmp, &list->list, list) { +		if (ha->sync_cnt) +			continue; + +		err = sync(dev, ha->addr); +		if (err) +			return err; + +		ha->sync_cnt++; +		ha->refcount++; +	} + +	return 0; +} +EXPORT_SYMBOL(__hw_addr_sync_dev); + +/** + *  __hw_addr_unsync_dev - Remove synchonized addresses from device + *  @list: address list to remove syncronized addresses from + *  @dev:  device to sync + *  @unsync: function to call if address should be removed + * + *  Remove all addresses that were added to the device by __hw_addr_sync_dev(). + *  This function is intended to be called from the ndo_stop or ndo_open + *  functions on devices that require explicit address add/remove + *  notifications.  If the unsync function pointer is NULL then this function + *  can be used to just reset the sync_cnt for the addresses in the list. 
+ **/ +void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, +			  struct net_device *dev, +			  int (*unsync)(struct net_device *, +					const unsigned char *)) +{ +	struct netdev_hw_addr *ha, *tmp; + +	list_for_each_entry_safe(ha, tmp, &list->list, list) { +		if (!ha->sync_cnt) +			continue; + +		/* if unsync is defined and fails defer unsyncing address */ +		if (unsync && unsync(dev, ha->addr)) +			continue; + +		ha->sync_cnt--; +		__hw_addr_del_entry(list, ha, false, false); +	} +} +EXPORT_SYMBOL(__hw_addr_unsync_dev); + +static void __hw_addr_flush(struct netdev_hw_addr_list *list)  {  	struct netdev_hw_addr *ha, *tmp; @@ -274,7 +320,6 @@ void __hw_addr_flush(struct netdev_hw_addr_list *list)  	}  	list->count = 0;  } -EXPORT_SYMBOL(__hw_addr_flush);  void __hw_addr_init(struct netdev_hw_addr_list *list)  { @@ -400,59 +445,6 @@ int dev_addr_del(struct net_device *dev, const unsigned char *addr,  }  EXPORT_SYMBOL(dev_addr_del); -/** - *	dev_addr_add_multiple - Add device addresses from another device - *	@to_dev: device to which addresses will be added - *	@from_dev: device from which addresses will be added - *	@addr_type: address type - 0 means type will be used from from_dev - * - *	Add device addresses of the one device to another. - ** - *	The caller must hold the rtnl_mutex. - */ -int dev_addr_add_multiple(struct net_device *to_dev, -			  struct net_device *from_dev, -			  unsigned char addr_type) -{ -	int err; - -	ASSERT_RTNL(); - -	if (from_dev->addr_len != to_dev->addr_len) -		return -EINVAL; -	err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, -				     to_dev->addr_len, addr_type); -	if (!err) -		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); -	return err; -} -EXPORT_SYMBOL(dev_addr_add_multiple); - -/** - *	dev_addr_del_multiple - Delete device addresses by another device - *	@to_dev: device where the addresses will be deleted - *	@from_dev: device supplying the addresses to be deleted - *	@addr_type: address type - 0 means type will be used from from_dev - * - *	Deletes addresses in to device by the list of addresses in from device. - * - *	The caller must hold the rtnl_mutex. 
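__hw_addr_sync_dev()/__hw_addr_unsync_dev() added above target drivers whose hardware wants one callback per added or removed filter entry. A hedged sketch of such a driver; the mynic_* names are hypothetical, the hardware writes are stubbed, and the core is assumed to hold the address-list lock when it invokes ndo_set_rx_mode.

/* Illustrative sketch only; mynic_* names are hypothetical and the
 * hardware accesses are stubbed out. */
#include <linux/netdevice.h>

static int mynic_mc_add(struct net_device *dev, const unsigned char *addr)
{
	/* would program one multicast filter entry in hardware */
	return 0;
}

static int mynic_mc_del(struct net_device *dev, const unsigned char *addr)
{
	/* would clear one multicast filter entry in hardware */
	return 0;
}

static void mynic_set_rx_mode(struct net_device *dev)
{
	/* Called by the core with the address list locked: adds what is
	 * new in dev->mc, removes what has disappeared from it. */
	__hw_addr_sync_dev(&dev->mc, dev, mynic_mc_add, mynic_mc_del);
}

static int mynic_stop(struct net_device *dev)
{
	/* Drop every filter entry we synced while the device was up. */
	netif_addr_lock_bh(dev);
	__hw_addr_unsync_dev(&dev->mc, dev, mynic_mc_del);
	netif_addr_unlock_bh(dev);
	return 0;
}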
- */ -int dev_addr_del_multiple(struct net_device *to_dev, -			  struct net_device *from_dev, -			  unsigned char addr_type) -{ -	ASSERT_RTNL(); - -	if (from_dev->addr_len != to_dev->addr_len) -		return -EINVAL; -	__hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, -			       to_dev->addr_len, addr_type); -	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); -	return 0; -} -EXPORT_SYMBOL(dev_addr_del_multiple); -  /*   * Unicast list handling functions   */ @@ -676,7 +668,7 @@ static int __dev_mc_add(struct net_device *dev, const unsigned char *addr,  	netif_addr_lock_bh(dev);  	err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, -			       NETDEV_HW_ADDR_T_MULTICAST, global, false); +			       NETDEV_HW_ADDR_T_MULTICAST, global, false, 0);  	if (!err)  		__dev_set_rx_mode(dev);  	netif_addr_unlock_bh(dev); @@ -752,7 +744,7 @@ int dev_mc_del_global(struct net_device *dev, const unsigned char *addr)  EXPORT_SYMBOL(dev_mc_del_global);  /** - *	dev_mc_sync - Synchronize device's unicast list to another device + *	dev_mc_sync - Synchronize device's multicast list to another device   *	@to: destination device   *	@from: source device   * @@ -780,7 +772,7 @@ int dev_mc_sync(struct net_device *to, struct net_device *from)  EXPORT_SYMBOL(dev_mc_sync);  /** - *	dev_mc_sync_multiple - Synchronize device's unicast list to another + *	dev_mc_sync_multiple - Synchronize device's multicast list to another   *	device, but allow for multiple calls to sync to multiple devices.   *	@to: destination device   *	@from: source device diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 5b7d0e1d066..cf999e09bcd 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -327,6 +327,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)  		    cmd == SIOCBRADDIF ||  		    cmd == SIOCBRDELIF ||  		    cmd == SIOCSHWTSTAMP || +		    cmd == SIOCGHWTSTAMP ||  		    cmd == SIOCWANDEV) {  			err = -EOPNOTSUPP;  			if (ops->ndo_do_ioctl) { @@ -546,6 +547,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)  	 */  	default:  		if (cmd == SIOCWANDEV || +		    cmd == SIOCGHWTSTAMP ||  		    (cmd >= SIOCDEVPRIVATE &&  		     cmd <= SIOCDEVPRIVATE + 15)) {  			dev_load(net, ifr.ifr_name); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 5e78d44333b..e70301eb7a4 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -64,7 +64,6 @@ static struct genl_family net_drop_monitor_family = {  	.hdrsize        = 0,  	.name           = "NET_DM",  	.version        = 2, -	.maxattr        = NET_DM_CMD_MAX,  };  static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); @@ -106,6 +105,10 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)  	return skb;  } +static struct genl_multicast_group dropmon_mcgrps[] = { +	{ .name = "events", }, +}; +  static void send_dm_alert(struct work_struct *work)  {  	struct sk_buff *skb; @@ -116,7 +119,8 @@ static void send_dm_alert(struct work_struct *work)  	skb = reset_per_cpu_data(data);  	if (skb) -		genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL); +		genlmsg_multicast(&net_drop_monitor_family, skb, 0, +				  0, GFP_KERNEL);  }  /* @@ -333,7 +337,7 @@ out:  	return NOTIFY_DONE;  } -static struct genl_ops dropmon_ops[] = { +static const struct genl_ops dropmon_ops[] = {  	{  		.cmd = NET_DM_CMD_CONFIG,  		.doit = net_dm_cmd_config, @@ -364,13 +368,13 @@ static int __init init_net_drop_monitor(void)  		return -ENOSPC;  	} -	rc = 
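dev_ifsioc()/dev_ioctl() above now route the read-only SIOCGHWTSTAMP to the driver the same way as SIOCSHWTSTAMP. A small userspace query, assuming a kernel and uapi headers that already carry this ioctl:

/* Illustrative sketch only; assumes headers defining SIOCGHWTSTAMP. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/net_tstamp.h>
#include <linux/sockios.h>

int main(int argc, char **argv)
{
	struct hwtstamp_config cfg;
	struct ifreq ifr;
	int fd;

	memset(&cfg, 0, sizeof(cfg));
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, argc > 1 ? argv[1] : "eth0", IFNAMSIZ - 1);
	ifr.ifr_data = (void *)&cfg;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0 || ioctl(fd, SIOCGHWTSTAMP, &ifr) < 0) {
		perror("SIOCGHWTSTAMP");
		return 1;
	}
	printf("%s: tx_type=%d rx_filter=%d\n",
	       ifr.ifr_name, cfg.tx_type, cfg.rx_filter);
	close(fd);
	return 0;
}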
genl_register_family_with_ops(&net_drop_monitor_family, -					   dropmon_ops, -					   ARRAY_SIZE(dropmon_ops)); +	rc = genl_register_family_with_ops_groups(&net_drop_monitor_family, +						  dropmon_ops, dropmon_mcgrps);  	if (rc) {  		pr_err("Could not create drop monitor netlink family\n");  		return rc;  	} +	WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);  	rc = register_netdevice_notifier(&dropmon_net_notifier);  	if (rc < 0) { diff --git a/net/core/dst.c b/net/core/dst.c index ca4231ec734..a028409ee43 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -142,12 +142,12 @@ loop:  	mutex_unlock(&dst_gc_mutex);  } -int dst_discard(struct sk_buff *skb) +int dst_discard_sk(struct sock *sk, struct sk_buff *skb)  {  	kfree_skb(skb);  	return 0;  } -EXPORT_SYMBOL(dst_discard); +EXPORT_SYMBOL(dst_discard_sk);  const u32 dst_default_metrics[RTAX_MAX + 1] = {  	/* This initializer is needed to force linker to place this variable @@ -184,7 +184,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,  	dst->xfrm = NULL;  #endif  	dst->input = dst_discard; -	dst->output = dst_discard; +	dst->output = dst_discard_sk;  	dst->error = 0;  	dst->obsolete = initial_obsolete;  	dst->header_len = 0; @@ -209,8 +209,10 @@ static void ___dst_free(struct dst_entry *dst)  	/* The first case (dev==NULL) is required, when  	   protocol module is unloaded.  	 */ -	if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) -		dst->input = dst->output = dst_discard; +	if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { +		dst->input = dst_discard; +		dst->output = dst_discard_sk; +	}  	dst->obsolete = DST_OBSOLETE_DEAD;  } @@ -267,6 +269,15 @@ again:  }  EXPORT_SYMBOL(dst_destroy); +static void dst_destroy_rcu(struct rcu_head *head) +{ +	struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + +	dst = dst_destroy(dst); +	if (dst) +		__dst_free(dst); +} +  void dst_release(struct dst_entry *dst)  {  	if (dst) { @@ -274,11 +285,8 @@ void dst_release(struct dst_entry *dst)  		newrefcnt = atomic_dec_return(&dst->__refcnt);  		WARN_ON(newrefcnt < 0); -		if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { -			dst = dst_destroy(dst); -			if (dst) -				__dst_free(dst); -		} +		if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) +			call_rcu(&dst->rcu_head, dst_destroy_rcu);  	}  }  EXPORT_SYMBOL(dst_release); @@ -361,7 +369,8 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,  		return;  	if (!unregister) { -		dst->input = dst->output = dst_discard; +		dst->input = dst_discard; +		dst->output = dst_discard_sk;  	} else {  		dst->dev = dev_net(dst->dev)->loopback_dev;  		dev_hold(dst->dev); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 78e9d9223e4..17cb912793f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -81,6 +81,8 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]  	[NETIF_F_TSO6_BIT] =             "tx-tcp6-segmentation",  	[NETIF_F_FSO_BIT] =              "tx-fcoe-segmentation",  	[NETIF_F_GSO_GRE_BIT] =		 "tx-gre-segmentation", +	[NETIF_F_GSO_IPIP_BIT] =	 "tx-ipip-segmentation", +	[NETIF_F_GSO_SIT_BIT] =		 "tx-sit-segmentation",  	[NETIF_F_GSO_UDP_TUNNEL_BIT] =	 "tx-udp_tnl-segmentation",  	[NETIF_F_GSO_MPLS_BIT] =	 "tx-mpls-segmentation", @@ -94,6 +96,8 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]  	[NETIF_F_LOOPBACK_BIT] =         "loopback",  	[NETIF_F_RXFCS_BIT] =            "rx-fcs",  	[NETIF_F_RXALL_BIT] =            "rx-all", +	
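dst_release() above stops destroying the entry synchronously and defers the free through call_rcu(), so RCU readers that still hold the pointer stay safe. The underlying pattern, reduced to a hypothetical refcounted object (myobj_* names are not from this patch):

/* Illustrative sketch only; myobj_* names are hypothetical. */
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct myobj {
	atomic_t refcnt;
	int value;
	struct rcu_head rcu_head;
};

static void myobj_free_rcu(struct rcu_head *head)
{
	struct myobj *obj = container_of(head, struct myobj, rcu_head);

	kfree(obj);
}

static void myobj_put(struct myobj *obj)
{
	/* Readers that looked obj up under rcu_read_lock() may still be
	 * using it; free only after a grace period has elapsed. */
	if (atomic_dec_and_test(&obj->refcnt))
		call_rcu(&obj->rcu_head, myobj_free_rcu);
}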
[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", +	[NETIF_F_BUSY_POLL_BIT] =        "busy-poll",  };  static int ethtool_get_features(struct net_device *dev, void __user *useraddr) @@ -553,6 +557,23 @@ err_out:  	return ret;  } +static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, +					struct ethtool_rxnfc *rx_rings, +					u32 size) +{ +	int i; + +	if (copy_from_user(indir, useraddr, size * sizeof(indir[0]))) +		return -EFAULT; + +	/* Validate ring indices */ +	for (i = 0; i < size; i++) +		if (indir[i] >= rx_rings->data) +			return -EINVAL; + +	return 0; +} +  static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,  						     void __user *useraddr)  { @@ -561,7 +582,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,  	int ret;  	if (!dev->ethtool_ops->get_rxfh_indir_size || -	    !dev->ethtool_ops->get_rxfh_indir) +	    !dev->ethtool_ops->get_rxfh)  		return -EOPNOTSUPP;  	dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);  	if (dev_size == 0) @@ -587,7 +608,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,  	if (!indir)  		return -ENOMEM; -	ret = dev->ethtool_ops->get_rxfh_indir(dev, indir); +	ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL);  	if (ret)  		goto out; @@ -609,8 +630,9 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,  	u32 *indir;  	const struct ethtool_ops *ops = dev->ethtool_ops;  	int ret; +	u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]); -	if (!ops->get_rxfh_indir_size || !ops->set_rxfh_indir || +	if (!ops->get_rxfh_indir_size || !ops->set_rxfh ||  	    !ops->get_rxnfc)  		return -EOPNOTSUPP; @@ -639,28 +661,184 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,  		for (i = 0; i < dev_size; i++)  			indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);  	} else { -		if (copy_from_user(indir, -				  useraddr + -				  offsetof(struct ethtool_rxfh_indir, -					   ring_index[0]), -				  dev_size * sizeof(indir[0]))) { +		ret = ethtool_copy_validate_indir(indir, +						  useraddr + ringidx_offset, +						  &rx_rings, +						  dev_size); +		if (ret) +			goto out; +	} + +	ret = ops->set_rxfh(dev, indir, NULL); + +out: +	kfree(indir); +	return ret; +} + +static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, +					       void __user *useraddr) +{ +	int ret; +	const struct ethtool_ops *ops = dev->ethtool_ops; +	u32 user_indir_size, user_key_size; +	u32 dev_indir_size = 0, dev_key_size = 0; +	struct ethtool_rxfh rxfh; +	u32 total_size; +	u32 indir_bytes; +	u32 *indir = NULL; +	u8 *hkey = NULL; +	u8 *rss_config; + +	if (!(dev->ethtool_ops->get_rxfh_indir_size || +	      dev->ethtool_ops->get_rxfh_key_size) || +	      !dev->ethtool_ops->get_rxfh) +		return -EOPNOTSUPP; + +	if (ops->get_rxfh_indir_size) +		dev_indir_size = ops->get_rxfh_indir_size(dev); +	if (ops->get_rxfh_key_size) +		dev_key_size = ops->get_rxfh_key_size(dev); + +	if ((dev_key_size + dev_indir_size) == 0) +		return -EOPNOTSUPP; + +	if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) +		return -EFAULT; +	user_indir_size = rxfh.indir_size; +	user_key_size = rxfh.key_size; + +	/* Check that reserved fields are 0 for now */ +	if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) +		return -EINVAL; + +	rxfh.indir_size = dev_indir_size; +	rxfh.key_size = dev_key_size; +	if (copy_to_user(useraddr, &rxfh, sizeof(rxfh))) +		return -EFAULT; + +	/* If the user buffer size is 0, this is just a query for the +	 * 
device table size and key size.  Otherwise, if the User size is +	 * not equal to device table size or key size it's an error. +	 */ +	if (!user_indir_size && !user_key_size) +		return 0; + +	if ((user_indir_size && (user_indir_size != dev_indir_size)) || +	    (user_key_size && (user_key_size != dev_key_size))) +		return -EINVAL; + +	indir_bytes = user_indir_size * sizeof(indir[0]); +	total_size = indir_bytes + user_key_size; +	rss_config = kzalloc(total_size, GFP_USER); +	if (!rss_config) +		return -ENOMEM; + +	if (user_indir_size) +		indir = (u32 *)rss_config; + +	if (user_key_size) +		hkey = rss_config + indir_bytes; + +	ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey); +	if (!ret) { +		if (copy_to_user(useraddr + +				 offsetof(struct ethtool_rxfh, rss_config[0]), +				 rss_config, total_size))  			ret = -EFAULT; +	} + +	kfree(rss_config); + +	return ret; +} + +static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, +					       void __user *useraddr) +{ +	int ret; +	const struct ethtool_ops *ops = dev->ethtool_ops; +	struct ethtool_rxnfc rx_rings; +	struct ethtool_rxfh rxfh; +	u32 dev_indir_size = 0, dev_key_size = 0, i; +	u32 *indir = NULL, indir_bytes = 0; +	u8 *hkey = NULL; +	u8 *rss_config; +	u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); + +	if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) || +	    !ops->get_rxnfc || !ops->set_rxfh) +		return -EOPNOTSUPP; + +	if (ops->get_rxfh_indir_size) +		dev_indir_size = ops->get_rxfh_indir_size(dev); +	if (ops->get_rxfh_key_size) +		dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev); +	if ((dev_key_size + dev_indir_size) == 0) +		return -EOPNOTSUPP; + +	if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) +		return -EFAULT; + +	/* Check that reserved fields are 0 for now */ +	if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) +		return -EINVAL; + +	/* If either indir or hash key is valid, proceed further. +	 * It is not valid to request that both be unchanged. +	 */ +	if ((rxfh.indir_size && +	     rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && +	     rxfh.indir_size != dev_indir_size) || +	    (rxfh.key_size && (rxfh.key_size != dev_key_size)) || +	    (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && +	     rxfh.key_size == 0)) +		return -EINVAL; + +	if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) +		indir_bytes = dev_indir_size * sizeof(indir[0]); + +	rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER); +	if (!rss_config) +		return -ENOMEM; + +	rx_rings.cmd = ETHTOOL_GRXRINGS; +	ret = ops->get_rxnfc(dev, &rx_rings, NULL); +	if (ret) +		goto out; + +	/* rxfh.indir_size == 0 means reset the indir table to default. +	 * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged. 
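The ETHTOOL_GRSSH handler above is built for a two-pass query: a first call with both sizes zero reports indir_size and key_size, a second call with a matching buffer returns the table and key back to back in rss_config[]. A userspace sketch of that sequence (error handling trimmed), assuming uapi headers that define struct ethtool_rxfh:

/* Illustrative sketch only; assumes headers with ETHTOOL_GRSSH support. */
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

static struct ethtool_rxfh *grssh_query(int fd, const char *ifname)
{
	struct ethtool_rxfh sizes;
	struct ethtool_rxfh *rss;
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);

	/* Pass 1: indir_size == key_size == 0 only reports the sizes. */
	memset(&sizes, 0, sizeof(sizes));
	sizes.cmd = ETHTOOL_GRSSH;
	ifr.ifr_data = (void *)&sizes;
	if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
		return NULL;

	/* Pass 2: fetch the full indirection table plus hash key. */
	rss = calloc(1, sizeof(*rss) +
		     sizes.indir_size * sizeof(__u32) + sizes.key_size);
	if (!rss)
		return NULL;
	rss->cmd = ETHTOOL_GRSSH;
	rss->indir_size = sizes.indir_size;
	rss->key_size = sizes.key_size;
	ifr.ifr_data = (void *)rss;
	if (ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
		free(rss);
		return NULL;
	}
	return rss;	/* rss_config[]: indirection table, then key */
}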
+	 */ +	if (rxfh.indir_size && +	    rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) { +		indir = (u32 *)rss_config; +		ret = ethtool_copy_validate_indir(indir, +						  useraddr + rss_cfg_offset, +						  &rx_rings, +						  rxfh.indir_size); +		if (ret)  			goto out; -		} +	} else if (rxfh.indir_size == 0) { +		indir = (u32 *)rss_config; +		for (i = 0; i < dev_indir_size; i++) +			indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); +	} -		/* Validate ring indices */ -		for (i = 0; i < dev_size; i++) { -			if (indir[i] >= rx_rings.data) { -				ret = -EINVAL; -				goto out; -			} +	if (rxfh.key_size) { +		hkey = rss_config + indir_bytes; +		if (copy_from_user(hkey, +				   useraddr + rss_cfg_offset + indir_bytes, +				   rxfh.key_size)) { +			ret = -EFAULT; +			goto out;  		}  	} -	ret = ops->set_rxfh_indir(dev, indir); +	ret = ops->set_rxfh(dev, indir, hkey);  out: -	kfree(indir); +	kfree(rss_config);  	return ret;  } @@ -1487,6 +1665,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)  	case ETHTOOL_GRXCLSRULE:  	case ETHTOOL_GRXCLSRLALL:  	case ETHTOOL_GRXFHINDIR: +	case ETHTOOL_GRSSH:  	case ETHTOOL_GFEATURES:  	case ETHTOOL_GCHANNELS:  	case ETHTOOL_GET_TS_INFO: @@ -1624,6 +1803,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)  	case ETHTOOL_SRXFHINDIR:  		rc = ethtool_set_rxfh_indir(dev, useraddr);  		break; +	case ETHTOOL_GRSSH: +		rc = ethtool_get_rxfh(dev, useraddr); +		break; +	case ETHTOOL_SRSSH: +		rc = ethtool_set_rxfh(dev, useraddr); +		break;  	case ETHTOOL_GFEATURES:  		rc = ethtool_get_features(dev, useraddr);  		break; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 2e654138433..185c341fafb 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -460,7 +460,8 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)  		if (frh->action && (frh->action != rule->action))  			continue; -		if (frh->table && (frh_get_table(frh, tb) != rule->table)) +		if (frh_get_table(frh, tb) && +		    (frh_get_table(frh, tb) != rule->table))  			continue;  		if (tb[FRA_PRIORITY] && @@ -744,6 +745,13 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event,  			attach_rules(&ops->rules_list, dev);  		break; +	case NETDEV_CHANGENAME: +		list_for_each_entry(ops, &net->rules_ops, list) { +			detach_rules(&ops->rules_list, dev); +			attach_rules(&ops->rules_list, dev); +		} +		break; +  	case NETDEV_UNREGISTER:  		list_for_each_entry(ops, &net->rules_ops, list)  			detach_rules(&ops->rules_list, dev); diff --git a/net/core/filter.c b/net/core/filter.c index 6438f29ff26..1dbf6462f76 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1,11 +1,16 @@  /*   * Linux Socket Filter - Kernel level socket filtering   * - * Author: - *     Jay Schulist <jschlst@samba.org> + * Based on the design of the Berkeley Packet Filter. 
The new + * internal format has been designed by PLUMgrid:   * - * Based on the design of: - *     - The Berkeley Packet Filter + *	Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com + * + * Authors: + * + *	Jay Schulist <jschlst@samba.org> + *	Alexei Starovoitov <ast@plumgrid.com> + *	Daniel Borkmann <dborkman@redhat.com>   *   * This program is free software; you can redistribute it and/or   * modify it under the terms of the GNU General Public License @@ -36,11 +41,31 @@  #include <asm/uaccess.h>  #include <asm/unaligned.h>  #include <linux/filter.h> -#include <linux/reciprocal_div.h>  #include <linux/ratelimit.h>  #include <linux/seccomp.h>  #include <linux/if_vlan.h> +/* Registers */ +#define BPF_R0	regs[BPF_REG_0] +#define BPF_R1	regs[BPF_REG_1] +#define BPF_R2	regs[BPF_REG_2] +#define BPF_R3	regs[BPF_REG_3] +#define BPF_R4	regs[BPF_REG_4] +#define BPF_R5	regs[BPF_REG_5] +#define BPF_R6	regs[BPF_REG_6] +#define BPF_R7	regs[BPF_REG_7] +#define BPF_R8	regs[BPF_REG_8] +#define BPF_R9	regs[BPF_REG_9] +#define BPF_R10	regs[BPF_REG_10] + +/* Named registers */ +#define DST	regs[insn->dst_reg] +#define SRC	regs[insn->src_reg] +#define FP	regs[BPF_REG_FP] +#define ARG1	regs[BPF_REG_ARG1] +#define CTX	regs[BPF_REG_CTX] +#define IMM	insn->imm +  /* No hurry in this branch   *   * Exported for the bpf jit load helper. @@ -53,9 +78,9 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns  		ptr = skb_network_header(skb) + k - SKF_NET_OFF;  	else if (k >= SKF_LL_OFF)  		ptr = skb_mac_header(skb) + k - SKF_LL_OFF; -  	if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))  		return ptr; +  	return NULL;  } @@ -64,6 +89,7 @@ static inline void *load_pointer(const struct sk_buff *skb, int k,  {  	if (k >= 0)  		return skb_header_pointer(skb, k, size, buffer); +  	return bpf_internal_load_pointer_neg_helper(skb, k, size);  } @@ -109,304 +135,960 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)  }  EXPORT_SYMBOL(sk_filter); +/* Base function for offset calculation. Needs to go into .text section, + * therefore keeping it non-static as well; will also be used by JITs + * anyway later on, so do not let the compiler omit it. + */ +noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ +	return 0; +} +  /** - *	sk_run_filter - run a filter on a socket - *	@skb: buffer to run the filter on - *	@fentry: filter to apply + *	__sk_run_filter - run a filter on a given context + *	@ctx: buffer to run the filter on + *	@insn: filter to apply   * - * Decode and apply filter instructions to the skb->data. - * Return length to keep, 0 for none. @skb is the data we are - * filtering, @filter is the array of filter instructions. - * Because all jumps are guaranteed to be before last instruction, - * and last instruction guaranteed to be a RET, we dont need to check - * flen. (We used to pass to this function the length of filter) + * Decode and apply filter instructions to the skb->data. Return length to + * keep, 0 for none. @ctx is the data we are operating on, @insn is the + * array of filter instructions.   */ -unsigned int sk_run_filter(const struct sk_buff *skb, -			   const struct sock_filter *fentry) +static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)  { +	u64 stack[MAX_BPF_STACK / sizeof(u64)]; +	u64 regs[MAX_BPF_REG], tmp; +	static const void *jumptable[256] = { +		[0 ... 255] = &&default_label, +		/* Now overwrite non-defaults ... 
*/ +		/* 32 bit ALU operations */ +		[BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, +		[BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, +		[BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, +		[BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, +		[BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, +		[BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, +		[BPF_ALU | BPF_OR | BPF_X]  = &&ALU_OR_X, +		[BPF_ALU | BPF_OR | BPF_K]  = &&ALU_OR_K, +		[BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, +		[BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, +		[BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, +		[BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, +		[BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, +		[BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, +		[BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, +		[BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, +		[BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, +		[BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, +		[BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, +		[BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, +		[BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, +		[BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, +		[BPF_ALU | BPF_NEG] = &&ALU_NEG, +		[BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, +		[BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, +		/* 64 bit ALU operations */ +		[BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, +		[BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, +		[BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, +		[BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, +		[BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, +		[BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, +		[BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, +		[BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, +		[BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, +		[BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, +		[BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, +		[BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, +		[BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, +		[BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, +		[BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, +		[BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, +		[BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, +		[BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, +		[BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, +		[BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, +		[BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, +		[BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, +		[BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, +		[BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, +		[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, +		/* Call instruction */ +		[BPF_JMP | BPF_CALL] = &&JMP_CALL, +		/* Jumps */ +		[BPF_JMP | BPF_JA] = &&JMP_JA, +		[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, +		[BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, +		[BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, +		[BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, +		[BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, +		[BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, +		[BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, +		[BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, +		[BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, +		[BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, +		[BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, +		[BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, +		[BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, +		[BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, +		/* Program return */ +		[BPF_JMP | BPF_EXIT] = &&JMP_EXIT, +		/* Store instructions */ +		[BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, +		[BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, +		[BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, +		[BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, +		[BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, +		
[BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, +		[BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, +		[BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, +		[BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, +		[BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, +		/* Load instructions */ +		[BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, +		[BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, +		[BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, +		[BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, +		[BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, +		[BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, +		[BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, +		[BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, +		[BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, +		[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, +	};  	void *ptr; -	u32 A = 0;			/* Accumulator */ -	u32 X = 0;			/* Index Register */ -	u32 mem[BPF_MEMWORDS];		/* Scratch Memory Store */ -	u32 tmp; -	int k; - -	/* -	 * Process array of filter instructions. -	 */ -	for (;; fentry++) { -#if defined(CONFIG_X86_32) -#define	K (fentry->k) -#else -		const u32 K = fentry->k; -#endif - -		switch (fentry->code) { -		case BPF_S_ALU_ADD_X: -			A += X; -			continue; -		case BPF_S_ALU_ADD_K: -			A += K; -			continue; -		case BPF_S_ALU_SUB_X: -			A -= X; -			continue; -		case BPF_S_ALU_SUB_K: -			A -= K; -			continue; -		case BPF_S_ALU_MUL_X: -			A *= X; -			continue; -		case BPF_S_ALU_MUL_K: -			A *= K; -			continue; -		case BPF_S_ALU_DIV_X: -			if (X == 0) -				return 0; -			A /= X; -			continue; -		case BPF_S_ALU_DIV_K: -			A = reciprocal_divide(A, K); -			continue; -		case BPF_S_ALU_MOD_X: -			if (X == 0) -				return 0; -			A %= X; -			continue; -		case BPF_S_ALU_MOD_K: -			A %= K; -			continue; -		case BPF_S_ALU_AND_X: -			A &= X; -			continue; -		case BPF_S_ALU_AND_K: -			A &= K; -			continue; -		case BPF_S_ALU_OR_X: -			A |= X; -			continue; -		case BPF_S_ALU_OR_K: -			A |= K; -			continue; -		case BPF_S_ANC_ALU_XOR_X: -		case BPF_S_ALU_XOR_X: -			A ^= X; -			continue; -		case BPF_S_ALU_XOR_K: -			A ^= K; -			continue; -		case BPF_S_ALU_LSH_X: -			A <<= X; -			continue; -		case BPF_S_ALU_LSH_K: -			A <<= K; -			continue; -		case BPF_S_ALU_RSH_X: -			A >>= X; -			continue; -		case BPF_S_ALU_RSH_K: -			A >>= K; -			continue; -		case BPF_S_ALU_NEG: -			A = -A; -			continue; -		case BPF_S_JMP_JA: -			fentry += K; -			continue; -		case BPF_S_JMP_JGT_K: -			fentry += (A > K) ? fentry->jt : fentry->jf; -			continue; -		case BPF_S_JMP_JGE_K: -			fentry += (A >= K) ? fentry->jt : fentry->jf; -			continue; -		case BPF_S_JMP_JEQ_K: -			fentry += (A == K) ? fentry->jt : fentry->jf; -			continue; -		case BPF_S_JMP_JSET_K: -			fentry += (A & K) ? fentry->jt : fentry->jf; -			continue; -		case BPF_S_JMP_JGT_X: -			fentry += (A > X) ? fentry->jt : fentry->jf; -			continue; -		case BPF_S_JMP_JGE_X: -			fentry += (A >= X) ? fentry->jt : fentry->jf; -			continue; -		case BPF_S_JMP_JEQ_X: -			fentry += (A == X) ? fentry->jt : fentry->jf; -			continue; -		case BPF_S_JMP_JSET_X: -			fentry += (A & X) ? fentry->jt : fentry->jf; -			continue; -		case BPF_S_LD_W_ABS: -			k = K; -load_w: -			ptr = load_pointer(skb, k, 4, &tmp); -			if (ptr != NULL) { -				A = get_unaligned_be32(ptr); -				continue; -			} +	int off; + +#define CONT	 ({ insn++; goto select_insn; }) +#define CONT_JMP ({ insn++; goto select_insn; }) + +	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; +	ARG1 = (u64) (unsigned long) ctx; + +	/* Registers used in classic BPF programs need to be reset first. 
*/ +	regs[BPF_REG_A] = 0; +	regs[BPF_REG_X] = 0; + +select_insn: +	goto *jumptable[insn->code]; + +	/* ALU */ +#define ALU(OPCODE, OP)			\ +	ALU64_##OPCODE##_X:		\ +		DST = DST OP SRC;	\ +		CONT;			\ +	ALU_##OPCODE##_X:		\ +		DST = (u32) DST OP (u32) SRC;	\ +		CONT;			\ +	ALU64_##OPCODE##_K:		\ +		DST = DST OP IMM;		\ +		CONT;			\ +	ALU_##OPCODE##_K:		\ +		DST = (u32) DST OP (u32) IMM;	\ +		CONT; + +	ALU(ADD,  +) +	ALU(SUB,  -) +	ALU(AND,  &) +	ALU(OR,   |) +	ALU(LSH, <<) +	ALU(RSH, >>) +	ALU(XOR,  ^) +	ALU(MUL,  *) +#undef ALU +	ALU_NEG: +		DST = (u32) -DST; +		CONT; +	ALU64_NEG: +		DST = -DST; +		CONT; +	ALU_MOV_X: +		DST = (u32) SRC; +		CONT; +	ALU_MOV_K: +		DST = (u32) IMM; +		CONT; +	ALU64_MOV_X: +		DST = SRC; +		CONT; +	ALU64_MOV_K: +		DST = IMM; +		CONT; +	ALU64_ARSH_X: +		(*(s64 *) &DST) >>= SRC; +		CONT; +	ALU64_ARSH_K: +		(*(s64 *) &DST) >>= IMM; +		CONT; +	ALU64_MOD_X: +		if (unlikely(SRC == 0))  			return 0; -		case BPF_S_LD_H_ABS: -			k = K; -load_h: -			ptr = load_pointer(skb, k, 2, &tmp); -			if (ptr != NULL) { -				A = get_unaligned_be16(ptr); -				continue; -			} +		tmp = DST; +		DST = do_div(tmp, SRC); +		CONT; +	ALU_MOD_X: +		if (unlikely(SRC == 0))  			return 0; -		case BPF_S_LD_B_ABS: -			k = K; -load_b: -			ptr = load_pointer(skb, k, 1, &tmp); -			if (ptr != NULL) { -				A = *(u8 *)ptr; -				continue; -			} +		tmp = (u32) DST; +		DST = do_div(tmp, (u32) SRC); +		CONT; +	ALU64_MOD_K: +		tmp = DST; +		DST = do_div(tmp, IMM); +		CONT; +	ALU_MOD_K: +		tmp = (u32) DST; +		DST = do_div(tmp, (u32) IMM); +		CONT; +	ALU64_DIV_X: +		if (unlikely(SRC == 0))  			return 0; -		case BPF_S_LD_W_LEN: -			A = skb->len; -			continue; -		case BPF_S_LDX_W_LEN: -			X = skb->len; -			continue; -		case BPF_S_LD_W_IND: -			k = X + K; -			goto load_w; -		case BPF_S_LD_H_IND: -			k = X + K; -			goto load_h; -		case BPF_S_LD_B_IND: -			k = X + K; -			goto load_b; -		case BPF_S_LDX_B_MSH: -			ptr = load_pointer(skb, K, 1, &tmp); -			if (ptr != NULL) { -				X = (*(u8 *)ptr & 0xf) << 2; -				continue; -			} +		do_div(DST, SRC); +		CONT; +	ALU_DIV_X: +		if (unlikely(SRC == 0))  			return 0; -		case BPF_S_LD_IMM: -			A = K; -			continue; -		case BPF_S_LDX_IMM: -			X = K; -			continue; -		case BPF_S_LD_MEM: -			A = mem[K]; -			continue; -		case BPF_S_LDX_MEM: -			X = mem[K]; -			continue; -		case BPF_S_MISC_TAX: -			X = A; -			continue; -		case BPF_S_MISC_TXA: -			A = X; -			continue; -		case BPF_S_RET_K: -			return K; -		case BPF_S_RET_A: -			return A; -		case BPF_S_ST: -			mem[K] = A; -			continue; -		case BPF_S_STX: -			mem[K] = X; -			continue; -		case BPF_S_ANC_PROTOCOL: -			A = ntohs(skb->protocol); -			continue; -		case BPF_S_ANC_PKTTYPE: -			A = skb->pkt_type; -			continue; -		case BPF_S_ANC_IFINDEX: -			if (!skb->dev) -				return 0; -			A = skb->dev->ifindex; -			continue; -		case BPF_S_ANC_MARK: -			A = skb->mark; -			continue; -		case BPF_S_ANC_QUEUE: -			A = skb->queue_mapping; -			continue; -		case BPF_S_ANC_HATYPE: -			if (!skb->dev) -				return 0; -			A = skb->dev->type; -			continue; -		case BPF_S_ANC_RXHASH: -			A = skb->rxhash; -			continue; -		case BPF_S_ANC_CPU: -			A = raw_smp_processor_id(); -			continue; -		case BPF_S_ANC_VLAN_TAG: -			A = vlan_tx_tag_get(skb); -			continue; -		case BPF_S_ANC_VLAN_TAG_PRESENT: -			A = !!vlan_tx_tag_present(skb); -			continue; -		case BPF_S_ANC_PAY_OFFSET: -			A = __skb_get_poff(skb); -			continue; -		case BPF_S_ANC_NLATTR: { -			struct nlattr *nla; - -			if (skb_is_nonlinear(skb)) -				return 0; -			if (A > skb->len - sizeof(struct nlattr)) -				
return 0; - -			nla = nla_find((struct nlattr *)&skb->data[A], -				       skb->len - A, X); -			if (nla) -				A = (void *)nla - (void *)skb->data; -			else -				A = 0; -			continue; +		tmp = (u32) DST; +		do_div(tmp, (u32) SRC); +		DST = (u32) tmp; +		CONT; +	ALU64_DIV_K: +		do_div(DST, IMM); +		CONT; +	ALU_DIV_K: +		tmp = (u32) DST; +		do_div(tmp, (u32) IMM); +		DST = (u32) tmp; +		CONT; +	ALU_END_TO_BE: +		switch (IMM) { +		case 16: +			DST = (__force u16) cpu_to_be16(DST); +			break; +		case 32: +			DST = (__force u32) cpu_to_be32(DST); +			break; +		case 64: +			DST = (__force u64) cpu_to_be64(DST); +			break; +		} +		CONT; +	ALU_END_TO_LE: +		switch (IMM) { +		case 16: +			DST = (__force u16) cpu_to_le16(DST); +			break; +		case 32: +			DST = (__force u32) cpu_to_le32(DST); +			break; +		case 64: +			DST = (__force u64) cpu_to_le64(DST); +			break; +		} +		CONT; + +	/* CALL */ +	JMP_CALL: +		/* Function call scratches BPF_R1-BPF_R5 registers, +		 * preserves BPF_R6-BPF_R9, and stores return value +		 * into BPF_R0. +		 */ +		BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, +						       BPF_R4, BPF_R5); +		CONT; + +	/* JMP */ +	JMP_JA: +		insn += insn->off; +		CONT; +	JMP_JEQ_X: +		if (DST == SRC) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JEQ_K: +		if (DST == IMM) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JNE_X: +		if (DST != SRC) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JNE_K: +		if (DST != IMM) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JGT_X: +		if (DST > SRC) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JGT_K: +		if (DST > IMM) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JGE_X: +		if (DST >= SRC) { +			insn += insn->off; +			CONT_JMP;  		} -		case BPF_S_ANC_NLATTR_NEST: { -			struct nlattr *nla; - -			if (skb_is_nonlinear(skb)) -				return 0; -			if (A > skb->len - sizeof(struct nlattr)) -				return 0; - -			nla = (struct nlattr *)&skb->data[A]; -			if (nla->nla_len > A - skb->len) -				return 0; - -			nla = nla_find_nested(nla, X); -			if (nla) -				A = (void *)nla - (void *)skb->data; -			else -				A = 0; -			continue; +		CONT; +	JMP_JGE_K: +		if (DST >= IMM) { +			insn += insn->off; +			CONT_JMP;  		} -#ifdef CONFIG_SECCOMP_FILTER -		case BPF_S_ANC_SECCOMP_LD_W: -			A = seccomp_bpf_load(fentry->k); -			continue; +		CONT; +	JMP_JSGT_X: +		if (((s64) DST) > ((s64) SRC)) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JSGT_K: +		if (((s64) DST) > ((s64) IMM)) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JSGE_X: +		if (((s64) DST) >= ((s64) SRC)) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JSGE_K: +		if (((s64) DST) >= ((s64) IMM)) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JSET_X: +		if (DST & SRC) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JSET_K: +		if (DST & IMM) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_EXIT: +		return BPF_R0; + +	/* STX and ST and LDX*/ +#define LDST(SIZEOP, SIZE)						\ +	STX_MEM_##SIZEOP:						\ +		*(SIZE *)(unsigned long) (DST + insn->off) = SRC;	\ +		CONT;							\ +	ST_MEM_##SIZEOP:						\ +		*(SIZE *)(unsigned long) (DST + insn->off) = IMM;	\ +		CONT;							\ +	LDX_MEM_##SIZEOP:						\ +		DST = *(SIZE *)(unsigned long) (SRC + insn->off);	\ +		CONT; + +	LDST(B,   u8) +	LDST(H,  u16) +	LDST(W,  u32) +	LDST(DW, u64) +#undef LDST +	STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ +		atomic_add((u32) SRC, (atomic_t *)(unsigned long) +			   (DST + 
insn->off)); +		CONT; +	STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ +		atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) +			     (DST + insn->off)); +		CONT; +	LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ +		off = IMM; +load_word: +		/* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are +		 * only appearing in the programs where ctx == +		 * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] +		 * == BPF_R6, sk_convert_filter() saves it in BPF_R6, +		 * internal BPF verifier will check that BPF_R6 == +		 * ctx. +		 * +		 * BPF_ABS and BPF_IND are wrappers of function calls, +		 * so they scratch BPF_R1-BPF_R5 registers, preserve +		 * BPF_R6-BPF_R9, and store return value into BPF_R0. +		 * +		 * Implicit input: +		 *   ctx == skb == BPF_R6 == CTX +		 * +		 * Explicit input: +		 *   SRC == any register +		 *   IMM == 32-bit immediate +		 * +		 * Output: +		 *   BPF_R0 - 8/16/32-bit skb data converted to cpu endianness +		 */ + +		ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); +		if (likely(ptr != NULL)) { +			BPF_R0 = get_unaligned_be32(ptr); +			CONT; +		} + +		return 0; +	LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ +		off = IMM; +load_half: +		ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); +		if (likely(ptr != NULL)) { +			BPF_R0 = get_unaligned_be16(ptr); +			CONT; +		} + +		return 0; +	LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ +		off = IMM; +load_byte: +		ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); +		if (likely(ptr != NULL)) { +			BPF_R0 = *(u8 *)ptr; +			CONT; +		} + +		return 0; +	LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ +		off = IMM + SRC; +		goto load_word; +	LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ +		off = IMM + SRC; +		goto load_half; +	LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ +		off = IMM + SRC; +		goto load_byte; + +	default_label: +		/* If we ever reach this, we have a bug somewhere. */ +		WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); +		return 0; +} + +/* Helper to find the offset of pkt_type in sk_buff structure. We want + * to make sure its still a 3bit field starting at a byte boundary; + * taken from arch/x86/net/bpf_jit_comp.c. 
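
The interpreter above dispatches with GNU C computed gotos: jumptable[] maps each insn->code to a label address and CONT re-dispatches on the next instruction, with no central switch. A minimal userspace sketch of the same dispatch shape, using a made-up three-opcode instruction set (the opcode names and semantics here are invented for illustration and are not part of the patch):

#include <stdint.h>
#include <stdio.h>

enum { OP_IMM, OP_ADD, OP_EXIT };

struct insn {
	uint8_t	code;
	int32_t	imm;
};

static int64_t run(const struct insn *insn)
{
	static const void *jumptable[] = {
		[OP_IMM]  = &&do_imm,
		[OP_ADD]  = &&do_add,
		[OP_EXIT] = &&do_exit,
	};
	int64_t acc = 0;

#define CONT do { insn++; goto *jumptable[insn->code]; } while (0)
	goto *jumptable[insn->code];
do_imm:
	acc = insn->imm;	/* load immediate into the accumulator */
	CONT;
do_add:
	acc += insn->imm;	/* add immediate to the accumulator */
	CONT;
do_exit:
	return acc;
#undef CONT
}

int main(void)
{
	const struct insn prog[] = {
		{ OP_IMM, 40 }, { OP_ADD, 2 }, { OP_EXIT, 0 },
	};

	printf("%lld\n", (long long)run(prog));	/* prints 42 */
	return 0;
}

The &&label and goto * forms are GNU C extensions, accepted by both gcc and clang.
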
+ */ +#ifdef __BIG_ENDIAN_BITFIELD +#define PKT_TYPE_MAX	(7 << 5) +#else +#define PKT_TYPE_MAX	7 +#endif +static unsigned int pkt_type_offset(void) +{ +	struct sk_buff skb_probe = { .pkt_type = ~0, }; +	u8 *ct = (u8 *) &skb_probe; +	unsigned int off; + +	for (off = 0; off < sizeof(struct sk_buff); off++) { +		if (ct[off] == PKT_TYPE_MAX) +			return off; +	} + +	pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__); +	return -1; +} + +static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ +	return __skb_get_poff((struct sk_buff *)(unsigned long) ctx); +} + +static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ +	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; +	struct nlattr *nla; + +	if (skb_is_nonlinear(skb)) +		return 0; + +	if (skb->len < sizeof(struct nlattr)) +		return 0; + +	if (a > skb->len - sizeof(struct nlattr)) +		return 0; + +	nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x); +	if (nla) +		return (void *) nla - (void *) skb->data; + +	return 0; +} + +static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ +	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; +	struct nlattr *nla; + +	if (skb_is_nonlinear(skb)) +		return 0; + +	if (skb->len < sizeof(struct nlattr)) +		return 0; + +	if (a > skb->len - sizeof(struct nlattr)) +		return 0; + +	nla = (struct nlattr *) &skb->data[a]; +	if (nla->nla_len > skb->len - a) +		return 0; + +	nla = nla_find_nested(nla, x); +	if (nla) +		return (void *) nla - (void *) skb->data; + +	return 0; +} + +static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ +	return raw_smp_processor_id(); +} + +/* note that this only generates 32-bit random numbers */ +static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ +	return prandom_u32(); +} + +static bool convert_bpf_extensions(struct sock_filter *fp, +				   struct sock_filter_int **insnp) +{ +	struct sock_filter_int *insn = *insnp; + +	switch (fp->k) { +	case SKF_AD_OFF + SKF_AD_PROTOCOL: +		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); + +		/* A = *(u16 *) (CTX + offsetof(protocol)) */ +		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, +				      offsetof(struct sk_buff, protocol)); +		/* A = ntohs(A) [emitting a nop or swap16] */ +		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); +		break; + +	case SKF_AD_OFF + SKF_AD_PKTTYPE: +		*insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, +				    pkt_type_offset()); +		if (insn->off < 0) +			return false; +		insn++; +		*insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD +		insn++; +                *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 5);  #endif +		break; + +	case SKF_AD_OFF + SKF_AD_IFINDEX: +	case SKF_AD_OFF + SKF_AD_HATYPE: +		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); +		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); +		BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0); + +		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), +				      BPF_REG_TMP, BPF_REG_CTX, +				      offsetof(struct sk_buff, dev)); +		/* if (tmp != 0) goto pc + 1 */ +		*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); +		*insn++ = BPF_EXIT_INSN(); +		if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) +			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, +					    offsetof(struct net_device, ifindex)); +		else +			*insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, +					    offsetof(struct net_device, type)); +		break; + +	
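
convert_bpf_extensions() above rewrites the classic "ancillary" loads (BPF_LD | BPF_ABS with k >= SKF_AD_OFF) into plain internal-BPF loads and helper calls. For reference, this is roughly what such a program looks like from user space before conversion; a hedged sketch that attaches an "IPv4 only" classic filter to a socket, with error handling reduced to the setsockopt() return value:

#include <linux/filter.h>
#include <linux/if_ether.h>
#include <sys/socket.h>

int attach_ipv4_only_filter(int fd)
{
	struct sock_filter insns[] = {
		/* A = skb->protocol, delivered in host byte order */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 SKF_AD_OFF + SKF_AD_PROTOCOL),
		/* if (A == ETH_P_IP) accept the packet, else drop it */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_IP, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
		BPF_STMT(BPF_RET | BPF_K, 0),
	};
	struct sock_fprog prog = {
		.len	= sizeof(insns) / sizeof(insns[0]),
		.filter	= insns,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}

When such a filter is attached, the SKF_AD_PROTOCOL load is exactly what the SKF_AD_PROTOCOL case above turns into the BPF_LDX_MEM plus BPF_ENDIAN pair.
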
case SKF_AD_OFF + SKF_AD_MARK: +		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + +		*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, +				    offsetof(struct sk_buff, mark)); +		break; + +	case SKF_AD_OFF + SKF_AD_RXHASH: +		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); + +		*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, +				    offsetof(struct sk_buff, hash)); +		break; + +	case SKF_AD_OFF + SKF_AD_QUEUE: +		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); + +		*insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, +				    offsetof(struct sk_buff, queue_mapping)); +		break; + +	case SKF_AD_OFF + SKF_AD_VLAN_TAG: +	case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: +		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); +		BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + +		/* A = *(u16 *) (CTX + offsetof(vlan_tci)) */ +		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, +				      offsetof(struct sk_buff, vlan_tci)); +		if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) { +			*insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, +					      ~VLAN_TAG_PRESENT); +		} else { +			/* A >>= 12 */ +			*insn++ = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12); +			/* A &= 1 */ +			*insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 1); +		} +		break; + +	case SKF_AD_OFF + SKF_AD_PAY_OFFSET: +	case SKF_AD_OFF + SKF_AD_NLATTR: +	case SKF_AD_OFF + SKF_AD_NLATTR_NEST: +	case SKF_AD_OFF + SKF_AD_CPU: +	case SKF_AD_OFF + SKF_AD_RANDOM: +		/* arg1 = CTX */ +		*insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); +		/* arg2 = A */ +		*insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); +		/* arg3 = X */ +		*insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); +		/* Emit call(arg1=CTX, arg2=A, arg3=X) */ +		switch (fp->k) { +		case SKF_AD_OFF + SKF_AD_PAY_OFFSET: +			*insn = BPF_EMIT_CALL(__skb_get_pay_offset); +			break; +		case SKF_AD_OFF + SKF_AD_NLATTR: +			*insn = BPF_EMIT_CALL(__skb_get_nlattr); +			break; +		case SKF_AD_OFF + SKF_AD_NLATTR_NEST: +			*insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); +			break; +		case SKF_AD_OFF + SKF_AD_CPU: +			*insn = BPF_EMIT_CALL(__get_raw_cpu_id); +			break; +		case SKF_AD_OFF + SKF_AD_RANDOM: +			*insn = BPF_EMIT_CALL(__get_random_u32); +			break; +		} +		break; + +	case SKF_AD_OFF + SKF_AD_ALU_XOR_X: +		/* A ^= X */ +		*insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X); +		break; + +	default: +		/* This is just a dummy call to avoid letting the compiler +		 * evict __bpf_call_base() as an optimization. Placed here +		 * where no-one bothers. +		 */ +		BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0); +		return false; +	} + +	*insnp = insn; +	return true; +} + +/** + *	sk_convert_filter - convert filter program + *	@prog: the user passed filter program + *	@len: the length of the user passed filter program + *	@new_prog: buffer where converted program will be stored + *	@new_len: pointer to store length of converted program + * + * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style. 
+ * Conversion workflow: + * + * 1) First pass for calculating the new program length: + *   sk_convert_filter(old_prog, old_len, NULL, &new_len) + * + * 2) 2nd pass to remap in two passes: 1st pass finds new + *    jump offsets, 2nd pass remapping: + *   new_prog = kmalloc(sizeof(struct sock_filter_int) * new_len); + *   sk_convert_filter(old_prog, old_len, new_prog, &new_len); + * + * User BPF's register A is mapped to our BPF register 6, user BPF + * register X is mapped to BPF register 7; frame pointer is always + * register 10; Context 'void *ctx' is stored in register 1, that is, + * for socket filters: ctx == 'struct sk_buff *', for seccomp: + * ctx == 'struct seccomp_data *'. + */ +int sk_convert_filter(struct sock_filter *prog, int len, +		      struct sock_filter_int *new_prog, int *new_len) +{ +	int new_flen = 0, pass = 0, target, i; +	struct sock_filter_int *new_insn; +	struct sock_filter *fp; +	int *addrs = NULL; +	u8 bpf_src; + +	BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); +	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); + +	if (len <= 0 || len > BPF_MAXINSNS) +		return -EINVAL; + +	if (new_prog) { +		addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL); +		if (!addrs) +			return -ENOMEM; +	} + +do_pass: +	new_insn = new_prog; +	fp = prog; + +	if (new_insn) +		*new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); +	new_insn++; + +	for (i = 0; i < len; fp++, i++) { +		struct sock_filter_int tmp_insns[6] = { }; +		struct sock_filter_int *insn = tmp_insns; + +		if (addrs) +			addrs[i] = new_insn - new_prog; + +		switch (fp->code) { +		/* All arithmetic insns and skb loads map as-is. */ +		case BPF_ALU | BPF_ADD | BPF_X: +		case BPF_ALU | BPF_ADD | BPF_K: +		case BPF_ALU | BPF_SUB | BPF_X: +		case BPF_ALU | BPF_SUB | BPF_K: +		case BPF_ALU | BPF_AND | BPF_X: +		case BPF_ALU | BPF_AND | BPF_K: +		case BPF_ALU | BPF_OR | BPF_X: +		case BPF_ALU | BPF_OR | BPF_K: +		case BPF_ALU | BPF_LSH | BPF_X: +		case BPF_ALU | BPF_LSH | BPF_K: +		case BPF_ALU | BPF_RSH | BPF_X: +		case BPF_ALU | BPF_RSH | BPF_K: +		case BPF_ALU | BPF_XOR | BPF_X: +		case BPF_ALU | BPF_XOR | BPF_K: +		case BPF_ALU | BPF_MUL | BPF_X: +		case BPF_ALU | BPF_MUL | BPF_K: +		case BPF_ALU | BPF_DIV | BPF_X: +		case BPF_ALU | BPF_DIV | BPF_K: +		case BPF_ALU | BPF_MOD | BPF_X: +		case BPF_ALU | BPF_MOD | BPF_K: +		case BPF_ALU | BPF_NEG: +		case BPF_LD | BPF_ABS | BPF_W: +		case BPF_LD | BPF_ABS | BPF_H: +		case BPF_LD | BPF_ABS | BPF_B: +		case BPF_LD | BPF_IND | BPF_W: +		case BPF_LD | BPF_IND | BPF_H: +		case BPF_LD | BPF_IND | BPF_B: +			/* Check for overloaded BPF extension and +			 * directly convert it if found, otherwise +			 * just move on with mapping. +			 */ +			if (BPF_CLASS(fp->code) == BPF_LD && +			    BPF_MODE(fp->code) == BPF_ABS && +			    convert_bpf_extensions(fp, &insn)) +				break; + +			*insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); +			break; + +		/* Jump transformation cannot use BPF block macros +		 * everywhere as offset calculation and target updates +		 * require a bit more work than the rest, i.e. jump +		 * opcodes map as-is, but offsets need adjustment. +		 */ + +#define BPF_EMIT_JMP							\ +	do {								\ +		if (target >= len || target < 0)			\ +			goto err;					\ +		insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0;	\ +		/* Adjust pc relative offset for 2nd or 3rd insn. 
*/	\ +		insn->off -= insn - tmp_insns;				\ +	} while (0) + +		case BPF_JMP | BPF_JA: +			target = i + fp->k + 1; +			insn->code = fp->code; +			BPF_EMIT_JMP; +			break; + +		case BPF_JMP | BPF_JEQ | BPF_K: +		case BPF_JMP | BPF_JEQ | BPF_X: +		case BPF_JMP | BPF_JSET | BPF_K: +		case BPF_JMP | BPF_JSET | BPF_X: +		case BPF_JMP | BPF_JGT | BPF_K: +		case BPF_JMP | BPF_JGT | BPF_X: +		case BPF_JMP | BPF_JGE | BPF_K: +		case BPF_JMP | BPF_JGE | BPF_X: +			if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { +				/* BPF immediates are signed, zero extend +				 * immediate into tmp register and use it +				 * in compare insn. +				 */ +				*insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); + +				insn->dst_reg = BPF_REG_A; +				insn->src_reg = BPF_REG_TMP; +				bpf_src = BPF_X; +			} else { +				insn->dst_reg = BPF_REG_A; +				insn->src_reg = BPF_REG_X; +				insn->imm = fp->k; +				bpf_src = BPF_SRC(fp->code); +			} + +			/* Common case where 'jump_false' is next insn. */ +			if (fp->jf == 0) { +				insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; +				target = i + fp->jt + 1; +				BPF_EMIT_JMP; +				break; +			} + +			/* Convert JEQ into JNE when 'jump_true' is next insn. */ +			if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) { +				insn->code = BPF_JMP | BPF_JNE | bpf_src; +				target = i + fp->jf + 1; +				BPF_EMIT_JMP; +				break; +			} + +			/* Other jumps are mapped into two insns: Jxx and JA. */ +			target = i + fp->jt + 1; +			insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; +			BPF_EMIT_JMP; +			insn++; + +			insn->code = BPF_JMP | BPF_JA; +			target = i + fp->jf + 1; +			BPF_EMIT_JMP; +			break; + +		/* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ +		case BPF_LDX | BPF_MSH | BPF_B: +			/* tmp = A */ +			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); +			/* A = BPF_R0 = *(u8 *) (skb->data + K) */ +			*insn++ = BPF_LD_ABS(BPF_B, fp->k); +			/* A &= 0xf */ +			*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); +			/* A <<= 2 */ +			*insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); +			/* X = A */ +			*insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); +			/* A = tmp */ +			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); +			break; + +		/* RET_K, RET_A are remaped into 2 insns. */ +		case BPF_RET | BPF_A: +		case BPF_RET | BPF_K: +			*insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ? +						BPF_K : BPF_X, BPF_REG_0, +						BPF_REG_A, fp->k); +			*insn = BPF_EXIT_INSN(); +			break; + +		/* Store to stack. */ +		case BPF_ST: +		case BPF_STX: +			*insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == +					    BPF_ST ? BPF_REG_A : BPF_REG_X, +					    -(BPF_MEMWORDS - fp->k) * 4); +			break; + +		/* Load from stack. */ +		case BPF_LD | BPF_MEM: +		case BPF_LDX | BPF_MEM: +			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD  ? +					    BPF_REG_A : BPF_REG_X, BPF_REG_FP, +					    -(BPF_MEMWORDS - fp->k) * 4); +			break; + +		/* A = K or X = K */ +		case BPF_LD | BPF_IMM: +		case BPF_LDX | BPF_IMM: +			*insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? +					      BPF_REG_A : BPF_REG_X, fp->k); +			break; + +		/* X = A */ +		case BPF_MISC | BPF_TAX: +			*insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); +			break; + +		/* A = X */ +		case BPF_MISC | BPF_TXA: +			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); +			break; + +		/* A = skb->len or X = skb->len */ +		case BPF_LD | BPF_W | BPF_LEN: +		case BPF_LDX | BPF_W | BPF_LEN: +			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? 
+					    BPF_REG_A : BPF_REG_X, BPF_REG_CTX, +					    offsetof(struct sk_buff, len)); +			break; + +		/* Access seccomp_data fields. */ +		case BPF_LDX | BPF_ABS | BPF_W: +			/* A = *(u32 *) (ctx + K) */ +			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); +			break; + +		/* Unkown instruction. */  		default: -			WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n", -				       fentry->code, fentry->jt, -				       fentry->jf, fentry->k); -			return 0; +			goto err;  		} + +		insn++; +		if (new_prog) +			memcpy(new_insn, tmp_insns, +			       sizeof(*insn) * (insn - tmp_insns)); +		new_insn += insn - tmp_insns;  	} +	if (!new_prog) { +		/* Only calculating new length. */ +		*new_len = new_insn - new_prog; +		return 0; +	} + +	pass++; +	if (new_flen != new_insn - new_prog) { +		new_flen = new_insn - new_prog; +		if (pass > 2) +			goto err; +		goto do_pass; +	} + +	kfree(addrs); +	BUG_ON(*new_len != new_flen);  	return 0; +err: +	kfree(addrs); +	return -EINVAL;  } -EXPORT_SYMBOL(sk_run_filter); -/* - * Security : +/* Security: + *   * A BPF program is able to use 16 cells of memory to store intermediate - * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()) + * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()). + *   * As we dont want to clear mem[] array for each packet going through   * sk_run_filter(), we check that filter loaded by user never try to read   * a cell if not previously written, and we check all branches to be sure @@ -414,44 +1096,46 @@ EXPORT_SYMBOL(sk_run_filter);   */  static int check_load_and_stores(struct sock_filter *filter, int flen)  { -	u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */ +	u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */  	int pc, ret = 0;  	BUILD_BUG_ON(BPF_MEMWORDS > 16); -	masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL); + +	masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);  	if (!masks)  		return -ENOMEM; +  	memset(masks, 0xff, flen * sizeof(*masks));  	for (pc = 0; pc < flen; pc++) {  		memvalid &= masks[pc];  		switch (filter[pc].code) { -		case BPF_S_ST: -		case BPF_S_STX: +		case BPF_ST: +		case BPF_STX:  			memvalid |= (1 << filter[pc].k);  			break; -		case BPF_S_LD_MEM: -		case BPF_S_LDX_MEM: +		case BPF_LD | BPF_MEM: +		case BPF_LDX | BPF_MEM:  			if (!(memvalid & (1 << filter[pc].k))) {  				ret = -EINVAL;  				goto error;  			}  			break; -		case BPF_S_JMP_JA: -			/* a jump must set masks on target */ +		case BPF_JMP | BPF_JA: +			/* A jump must set masks on target */  			masks[pc + 1 + filter[pc].k] &= memvalid;  			memvalid = ~0;  			break; -		case BPF_S_JMP_JEQ_K: -		case BPF_S_JMP_JEQ_X: -		case BPF_S_JMP_JGE_K: -		case BPF_S_JMP_JGE_X: -		case BPF_S_JMP_JGT_K: -		case BPF_S_JMP_JGT_X: -		case BPF_S_JMP_JSET_X: -		case BPF_S_JMP_JSET_K: -			/* a jump must set masks on targets */ +		case BPF_JMP | BPF_JEQ | BPF_K: +		case BPF_JMP | BPF_JEQ | BPF_X: +		case BPF_JMP | BPF_JGE | BPF_K: +		case BPF_JMP | BPF_JGE | BPF_X: +		case BPF_JMP | BPF_JGT | BPF_K: +		case BPF_JMP | BPF_JGT | BPF_X: +		case BPF_JMP | BPF_JSET | BPF_K: +		case BPF_JMP | BPF_JSET | BPF_X: +			/* A jump must set masks on targets */  			masks[pc + 1 + filter[pc].jt] &= memvalid;  			masks[pc + 1 + filter[pc].jf] &= memvalid;  			memvalid = ~0; @@ -463,6 +1147,72 @@ error:  	return ret;  } +static bool chk_code_allowed(u16 code_to_probe) +{ +	static const bool codes[] = { +		/* 32 bit ALU operations */ +		[BPF_ALU | BPF_ADD | BPF_K] = true, +		[BPF_ALU | BPF_ADD | BPF_X] = true, +		[BPF_ALU | BPF_SUB | BPF_K] = 
true, +		[BPF_ALU | BPF_SUB | BPF_X] = true, +		[BPF_ALU | BPF_MUL | BPF_K] = true, +		[BPF_ALU | BPF_MUL | BPF_X] = true, +		[BPF_ALU | BPF_DIV | BPF_K] = true, +		[BPF_ALU | BPF_DIV | BPF_X] = true, +		[BPF_ALU | BPF_MOD | BPF_K] = true, +		[BPF_ALU | BPF_MOD | BPF_X] = true, +		[BPF_ALU | BPF_AND | BPF_K] = true, +		[BPF_ALU | BPF_AND | BPF_X] = true, +		[BPF_ALU | BPF_OR | BPF_K] = true, +		[BPF_ALU | BPF_OR | BPF_X] = true, +		[BPF_ALU | BPF_XOR | BPF_K] = true, +		[BPF_ALU | BPF_XOR | BPF_X] = true, +		[BPF_ALU | BPF_LSH | BPF_K] = true, +		[BPF_ALU | BPF_LSH | BPF_X] = true, +		[BPF_ALU | BPF_RSH | BPF_K] = true, +		[BPF_ALU | BPF_RSH | BPF_X] = true, +		[BPF_ALU | BPF_NEG] = true, +		/* Load instructions */ +		[BPF_LD | BPF_W | BPF_ABS] = true, +		[BPF_LD | BPF_H | BPF_ABS] = true, +		[BPF_LD | BPF_B | BPF_ABS] = true, +		[BPF_LD | BPF_W | BPF_LEN] = true, +		[BPF_LD | BPF_W | BPF_IND] = true, +		[BPF_LD | BPF_H | BPF_IND] = true, +		[BPF_LD | BPF_B | BPF_IND] = true, +		[BPF_LD | BPF_IMM] = true, +		[BPF_LD | BPF_MEM] = true, +		[BPF_LDX | BPF_W | BPF_LEN] = true, +		[BPF_LDX | BPF_B | BPF_MSH] = true, +		[BPF_LDX | BPF_IMM] = true, +		[BPF_LDX | BPF_MEM] = true, +		/* Store instructions */ +		[BPF_ST] = true, +		[BPF_STX] = true, +		/* Misc instructions */ +		[BPF_MISC | BPF_TAX] = true, +		[BPF_MISC | BPF_TXA] = true, +		/* Return instructions */ +		[BPF_RET | BPF_K] = true, +		[BPF_RET | BPF_A] = true, +		/* Jump instructions */ +		[BPF_JMP | BPF_JA] = true, +		[BPF_JMP | BPF_JEQ | BPF_K] = true, +		[BPF_JMP | BPF_JEQ | BPF_X] = true, +		[BPF_JMP | BPF_JGE | BPF_K] = true, +		[BPF_JMP | BPF_JGE | BPF_X] = true, +		[BPF_JMP | BPF_JGT | BPF_K] = true, +		[BPF_JMP | BPF_JGT | BPF_X] = true, +		[BPF_JMP | BPF_JSET | BPF_K] = true, +		[BPF_JMP | BPF_JSET | BPF_X] = true, +	}; + +	if (code_to_probe >= ARRAY_SIZE(codes)) +		return false; + +	return codes[code_to_probe]; +} +  /**   *	sk_chk_filter - verify socket filter code   *	@filter: filter to verify @@ -479,193 +1229,303 @@ error:   */  int sk_chk_filter(struct sock_filter *filter, unsigned int flen)  { -	/* -	 * Valid instructions are initialized to non-0. -	 * Invalid instructions are initialized to 0. 
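
chk_code_allowed() can be a flat table lookup because a classic BPF opcode is a small bit-packed u16: the class sits in the low three bits, with the operation and the K/X source selector packed above it. A rough userspace sketch that decomposes a couple of codes with the uapi helper macros from <linux/filter.h> (the output formatting is illustrative only):

#include <linux/filter.h>
#include <stdint.h>
#include <stdio.h>

static void describe(uint16_t code)
{
	printf("code %#06x: class %#x, op %#x, src %s\n",
	       code, BPF_CLASS(code), BPF_OP(code),
	       BPF_SRC(code) == BPF_K ? "K (immediate)" : "X (index register)");
}

int main(void)
{
	describe(BPF_ALU | BPF_ADD | BPF_K);
	describe(BPF_JMP | BPF_JEQ | BPF_X);
	return 0;
}
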
-	 */ -	static const u8 codes[] = { -		[BPF_ALU|BPF_ADD|BPF_K]  = BPF_S_ALU_ADD_K, -		[BPF_ALU|BPF_ADD|BPF_X]  = BPF_S_ALU_ADD_X, -		[BPF_ALU|BPF_SUB|BPF_K]  = BPF_S_ALU_SUB_K, -		[BPF_ALU|BPF_SUB|BPF_X]  = BPF_S_ALU_SUB_X, -		[BPF_ALU|BPF_MUL|BPF_K]  = BPF_S_ALU_MUL_K, -		[BPF_ALU|BPF_MUL|BPF_X]  = BPF_S_ALU_MUL_X, -		[BPF_ALU|BPF_DIV|BPF_X]  = BPF_S_ALU_DIV_X, -		[BPF_ALU|BPF_MOD|BPF_K]  = BPF_S_ALU_MOD_K, -		[BPF_ALU|BPF_MOD|BPF_X]  = BPF_S_ALU_MOD_X, -		[BPF_ALU|BPF_AND|BPF_K]  = BPF_S_ALU_AND_K, -		[BPF_ALU|BPF_AND|BPF_X]  = BPF_S_ALU_AND_X, -		[BPF_ALU|BPF_OR|BPF_K]   = BPF_S_ALU_OR_K, -		[BPF_ALU|BPF_OR|BPF_X]   = BPF_S_ALU_OR_X, -		[BPF_ALU|BPF_XOR|BPF_K]  = BPF_S_ALU_XOR_K, -		[BPF_ALU|BPF_XOR|BPF_X]  = BPF_S_ALU_XOR_X, -		[BPF_ALU|BPF_LSH|BPF_K]  = BPF_S_ALU_LSH_K, -		[BPF_ALU|BPF_LSH|BPF_X]  = BPF_S_ALU_LSH_X, -		[BPF_ALU|BPF_RSH|BPF_K]  = BPF_S_ALU_RSH_K, -		[BPF_ALU|BPF_RSH|BPF_X]  = BPF_S_ALU_RSH_X, -		[BPF_ALU|BPF_NEG]        = BPF_S_ALU_NEG, -		[BPF_LD|BPF_W|BPF_ABS]   = BPF_S_LD_W_ABS, -		[BPF_LD|BPF_H|BPF_ABS]   = BPF_S_LD_H_ABS, -		[BPF_LD|BPF_B|BPF_ABS]   = BPF_S_LD_B_ABS, -		[BPF_LD|BPF_W|BPF_LEN]   = BPF_S_LD_W_LEN, -		[BPF_LD|BPF_W|BPF_IND]   = BPF_S_LD_W_IND, -		[BPF_LD|BPF_H|BPF_IND]   = BPF_S_LD_H_IND, -		[BPF_LD|BPF_B|BPF_IND]   = BPF_S_LD_B_IND, -		[BPF_LD|BPF_IMM]         = BPF_S_LD_IMM, -		[BPF_LDX|BPF_W|BPF_LEN]  = BPF_S_LDX_W_LEN, -		[BPF_LDX|BPF_B|BPF_MSH]  = BPF_S_LDX_B_MSH, -		[BPF_LDX|BPF_IMM]        = BPF_S_LDX_IMM, -		[BPF_MISC|BPF_TAX]       = BPF_S_MISC_TAX, -		[BPF_MISC|BPF_TXA]       = BPF_S_MISC_TXA, -		[BPF_RET|BPF_K]          = BPF_S_RET_K, -		[BPF_RET|BPF_A]          = BPF_S_RET_A, -		[BPF_ALU|BPF_DIV|BPF_K]  = BPF_S_ALU_DIV_K, -		[BPF_LD|BPF_MEM]         = BPF_S_LD_MEM, -		[BPF_LDX|BPF_MEM]        = BPF_S_LDX_MEM, -		[BPF_ST]                 = BPF_S_ST, -		[BPF_STX]                = BPF_S_STX, -		[BPF_JMP|BPF_JA]         = BPF_S_JMP_JA, -		[BPF_JMP|BPF_JEQ|BPF_K]  = BPF_S_JMP_JEQ_K, -		[BPF_JMP|BPF_JEQ|BPF_X]  = BPF_S_JMP_JEQ_X, -		[BPF_JMP|BPF_JGE|BPF_K]  = BPF_S_JMP_JGE_K, -		[BPF_JMP|BPF_JGE|BPF_X]  = BPF_S_JMP_JGE_X, -		[BPF_JMP|BPF_JGT|BPF_K]  = BPF_S_JMP_JGT_K, -		[BPF_JMP|BPF_JGT|BPF_X]  = BPF_S_JMP_JGT_X, -		[BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K, -		[BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X, -	}; -	int pc;  	bool anc_found; +	int pc;  	if (flen == 0 || flen > BPF_MAXINSNS)  		return -EINVAL; -	/* check the filter code now */ +	/* Check the filter code now */  	for (pc = 0; pc < flen; pc++) {  		struct sock_filter *ftest = &filter[pc]; -		u16 code = ftest->code; -		if (code >= ARRAY_SIZE(codes)) -			return -EINVAL; -		code = codes[code]; -		if (!code) +		/* May we actually operate on this code? 
*/ +		if (!chk_code_allowed(ftest->code))  			return -EINVAL; +  		/* Some instructions need special checks */ -		switch (code) { -		case BPF_S_ALU_DIV_K: -			/* check for division by zero */ +		switch (ftest->code) { +		case BPF_ALU | BPF_DIV | BPF_K: +		case BPF_ALU | BPF_MOD | BPF_K: +			/* Check for division by zero */  			if (ftest->k == 0)  				return -EINVAL; -			ftest->k = reciprocal_value(ftest->k);  			break; -		case BPF_S_ALU_MOD_K: -			/* check for division by zero */ -			if (ftest->k == 0) -				return -EINVAL; -			break; -		case BPF_S_LD_MEM: -		case BPF_S_LDX_MEM: -		case BPF_S_ST: -		case BPF_S_STX: -			/* check for invalid memory addresses */ +		case BPF_LD | BPF_MEM: +		case BPF_LDX | BPF_MEM: +		case BPF_ST: +		case BPF_STX: +			/* Check for invalid memory addresses */  			if (ftest->k >= BPF_MEMWORDS)  				return -EINVAL;  			break; -		case BPF_S_JMP_JA: -			/* -			 * Note, the large ftest->k might cause loops. +		case BPF_JMP | BPF_JA: +			/* Note, the large ftest->k might cause loops.  			 * Compare this with conditional jumps below,  			 * where offsets are limited. --ANK (981016)  			 */ -			if (ftest->k >= (unsigned int)(flen-pc-1)) +			if (ftest->k >= (unsigned int)(flen - pc - 1))  				return -EINVAL;  			break; -		case BPF_S_JMP_JEQ_K: -		case BPF_S_JMP_JEQ_X: -		case BPF_S_JMP_JGE_K: -		case BPF_S_JMP_JGE_X: -		case BPF_S_JMP_JGT_K: -		case BPF_S_JMP_JGT_X: -		case BPF_S_JMP_JSET_X: -		case BPF_S_JMP_JSET_K: -			/* for conditionals both must be safe */ +		case BPF_JMP | BPF_JEQ | BPF_K: +		case BPF_JMP | BPF_JEQ | BPF_X: +		case BPF_JMP | BPF_JGE | BPF_K: +		case BPF_JMP | BPF_JGE | BPF_X: +		case BPF_JMP | BPF_JGT | BPF_K: +		case BPF_JMP | BPF_JGT | BPF_X: +		case BPF_JMP | BPF_JSET | BPF_K: +		case BPF_JMP | BPF_JSET | BPF_X: +			/* Both conditionals must be safe */  			if (pc + ftest->jt + 1 >= flen ||  			    pc + ftest->jf + 1 >= flen)  				return -EINVAL;  			break; -		case BPF_S_LD_W_ABS: -		case BPF_S_LD_H_ABS: -		case BPF_S_LD_B_ABS: +		case BPF_LD | BPF_W | BPF_ABS: +		case BPF_LD | BPF_H | BPF_ABS: +		case BPF_LD | BPF_B | BPF_ABS:  			anc_found = false; -#define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE:	\ -				code = BPF_S_ANC_##CODE;	\ -				anc_found = true;		\ -				break -			switch (ftest->k) { -			ANCILLARY(PROTOCOL); -			ANCILLARY(PKTTYPE); -			ANCILLARY(IFINDEX); -			ANCILLARY(NLATTR); -			ANCILLARY(NLATTR_NEST); -			ANCILLARY(MARK); -			ANCILLARY(QUEUE); -			ANCILLARY(HATYPE); -			ANCILLARY(RXHASH); -			ANCILLARY(CPU); -			ANCILLARY(ALU_XOR_X); -			ANCILLARY(VLAN_TAG); -			ANCILLARY(VLAN_TAG_PRESENT); -			ANCILLARY(PAY_OFFSET); -			} - -			/* ancillary operation unknown or unsupported */ +			if (bpf_anc_helper(ftest) & BPF_ANC) +				anc_found = true; +			/* Ancillary operation unknown or unsupported */  			if (anc_found == false && ftest->k >= SKF_AD_OFF)  				return -EINVAL;  		} -		ftest->code = code;  	} -	/* last instruction must be a RET code */ +	/* Last instruction must be a RET code */  	switch (filter[flen - 1].code) { -	case BPF_S_RET_K: -	case BPF_S_RET_A: +	case BPF_RET | BPF_K: +	case BPF_RET | BPF_A:  		return check_load_and_stores(filter, flen);  	} +  	return -EINVAL;  }  EXPORT_SYMBOL(sk_chk_filter); +static int sk_store_orig_filter(struct sk_filter *fp, +				const struct sock_fprog *fprog) +{ +	unsigned int fsize = sk_filter_proglen(fprog); +	struct sock_fprog_kern *fkprog; + +	fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); +	if (!fp->orig_prog) +		return -ENOMEM; + +	fkprog = fp->orig_prog; +	fkprog->len = 
fprog->len; +	fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL); +	if (!fkprog->filter) { +		kfree(fp->orig_prog); +		return -ENOMEM; +	} + +	return 0; +} + +static void sk_release_orig_filter(struct sk_filter *fp) +{ +	struct sock_fprog_kern *fprog = fp->orig_prog; + +	if (fprog) { +		kfree(fprog->filter); +		kfree(fprog); +	} +} +  /**   * 	sk_filter_release_rcu - Release a socket filter by rcu_head   *	@rcu: rcu_head that contains the sk_filter to free   */ -void sk_filter_release_rcu(struct rcu_head *rcu) +static void sk_filter_release_rcu(struct rcu_head *rcu)  {  	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); +	sk_release_orig_filter(fp); +	sk_filter_free(fp); +} + +/** + *	sk_filter_release - release a socket filter + *	@fp: filter to remove + * + *	Remove a filter from a socket and release its resources. + */ +static void sk_filter_release(struct sk_filter *fp) +{ +	if (atomic_dec_and_test(&fp->refcnt)) +		call_rcu(&fp->rcu, sk_filter_release_rcu); +} + +void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) +{ +	atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc); +	sk_filter_release(fp); +} + +void sk_filter_charge(struct sock *sk, struct sk_filter *fp) +{ +	atomic_inc(&fp->refcnt); +	atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc); +} + +static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp, +					      struct sock *sk, +					      unsigned int len) +{ +	struct sk_filter *fp_new; + +	if (sk == NULL) +		return krealloc(fp, len, GFP_KERNEL); + +	fp_new = sock_kmalloc(sk, len, GFP_KERNEL); +	if (fp_new) { +		*fp_new = *fp; +		/* As we're keeping orig_prog in fp_new along, +		 * we need to make sure we're not evicting it +		 * from the old fp. +		 */ +		fp->orig_prog = NULL; +		sk_filter_uncharge(sk, fp); +	} + +	return fp_new; +} + +static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, +					     struct sock *sk) +{ +	struct sock_filter *old_prog; +	struct sk_filter *old_fp; +	int err, new_len, old_len = fp->len; + +	/* We are free to overwrite insns et al right here as it +	 * won't be used at this point in time anymore internally +	 * after the migration to the internal BPF instruction +	 * representation. +	 */ +	BUILD_BUG_ON(sizeof(struct sock_filter) != +		     sizeof(struct sock_filter_int)); + +	/* Conversion cannot happen on overlapping memory areas, +	 * so we need to keep the user BPF around until the 2nd +	 * pass. At this time, the user BPF is stored in fp->insns. +	 */ +	old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), +			   GFP_KERNEL); +	if (!old_prog) { +		err = -ENOMEM; +		goto out_err; +	} + +	/* 1st pass: calculate the new program length. */ +	err = sk_convert_filter(old_prog, old_len, NULL, &new_len); +	if (err) +		goto out_err_free; + +	/* Expand fp for appending the new filter representation. */ +	old_fp = fp; +	fp = __sk_migrate_realloc(old_fp, sk, sk_filter_size(new_len)); +	if (!fp) { +		/* The old_fp is still around in case we couldn't +		 * allocate new memory, so uncharge on that one. +		 */ +		fp = old_fp; +		err = -ENOMEM; +		goto out_err_free; +	} + +	fp->len = new_len; + +	/* 2nd pass: remap sock_filter insns into sock_filter_int insns. */ +	err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len); +	if (err) +		/* 2nd sk_convert_filter() can fail only if it fails +		 * to allocate memory, remapping must succeed. Note, +		 * that at this time old_fp has already been released +		 * by __sk_migrate_realloc(). 
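
__sk_migrate_filter() above calls sk_convert_filter() twice: first with a NULL output buffer purely to learn new_len, then again to emit the converted program into the reallocated buffer. This measure-then-fill shape is a common two-pass API pattern; a small userspace analogue using snprintf(), purely for illustration:

#include <stdio.h>
#include <stdlib.h>

static char *format_owned(const char *name, int value)
{
	int len = snprintf(NULL, 0, "%s=%d", name, value);	/* pass 1: size only */
	char *buf;

	if (len < 0)
		return NULL;
	buf = malloc(len + 1);
	if (!buf)
		return NULL;
	snprintf(buf, len + 1, "%s=%d", name, value);		/* pass 2: fill */
	return buf;
}

int main(void)
{
	char *s = format_owned("mtu", 1500);

	if (s) {
		puts(s);
		free(s);
	}
	return 0;
}

The kernel variant adds one more safety net: if the second pass somehow produces a different length, sk_convert_filter() retries via its pass counter and bails out after pass 2.
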
+		 */ +		goto out_err_free; + +	sk_filter_select_runtime(fp); + +	kfree(old_prog); +	return fp; + +out_err_free: +	kfree(old_prog); +out_err: +	/* Rollback filter setup. */ +	if (sk != NULL) +		sk_filter_uncharge(sk, fp); +	else +		kfree(fp); +	return ERR_PTR(err); +} + +void __weak bpf_int_jit_compile(struct sk_filter *prog) +{ +} + +/** + *	sk_filter_select_runtime - select execution runtime for BPF program + *	@fp: sk_filter populated with internal BPF program + * + * try to JIT internal BPF program, if JIT is not available select interpreter + * BPF program will be executed via SK_RUN_FILTER() macro + */ +void sk_filter_select_runtime(struct sk_filter *fp) +{ +	fp->bpf_func = (void *) __sk_run_filter; + +	/* Probe if internal BPF can be JITed */ +	bpf_int_jit_compile(fp); +} +EXPORT_SYMBOL_GPL(sk_filter_select_runtime); + +/* free internal BPF program */ +void sk_filter_free(struct sk_filter *fp) +{  	bpf_jit_free(fp); -	kfree(fp);  } -EXPORT_SYMBOL(sk_filter_release_rcu); +EXPORT_SYMBOL_GPL(sk_filter_free); -static int __sk_prepare_filter(struct sk_filter *fp) +static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, +					     struct sock *sk)  {  	int err; -	fp->bpf_func = sk_run_filter; +	fp->bpf_func = NULL; +	fp->jited = 0;  	err = sk_chk_filter(fp->insns, fp->len); -	if (err) -		return err; +	if (err) { +		if (sk != NULL) +			sk_filter_uncharge(sk, fp); +		else +			kfree(fp); +		return ERR_PTR(err); +	} +	/* Probe if we can JIT compile the filter and if so, do +	 * the compilation of the filter. +	 */  	bpf_jit_compile(fp); -	return 0; + +	/* JIT compiler couldn't process this filter, so do the +	 * internal BPF translation for the optimized interpreter. +	 */ +	if (!fp->jited) +		fp = __sk_migrate_filter(fp, sk); + +	return fp;  }  /**   *	sk_unattached_filter_create - create an unattached filter - *	@fprog: the filter program   *	@pfp: the unattached filter that is created + *	@fprog: the filter program   *   * Create a filter independent of any socket. We first run some   * sanity checks on it to make sure it does not explode on us later. @@ -673,33 +1533,38 @@ static int __sk_prepare_filter(struct sk_filter *fp)   * a negative errno code is returned. On success the return is zero.   */  int sk_unattached_filter_create(struct sk_filter **pfp, -				struct sock_fprog *fprog) +				struct sock_fprog_kern *fprog)  { +	unsigned int fsize = sk_filter_proglen(fprog);  	struct sk_filter *fp; -	unsigned int fsize = sizeof(struct sock_filter) * fprog->len; -	int err;  	/* Make sure new filter is there and in the right amounts. */  	if (fprog->filter == NULL)  		return -EINVAL; -	fp = kmalloc(fsize + sizeof(*fp), GFP_KERNEL); +	fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL);  	if (!fp)  		return -ENOMEM; +  	memcpy(fp->insns, fprog->filter, fsize);  	atomic_set(&fp->refcnt, 1);  	fp->len = fprog->len; +	/* Since unattached filters are not copied back to user +	 * space through sk_get_filter(), we do not need to hold +	 * a copy here, and can spare us the work. +	 */ +	fp->orig_prog = NULL; -	err = __sk_prepare_filter(fp); -	if (err) -		goto free_mem; +	/* __sk_prepare_filter() already takes care of uncharging +	 * memory in case something goes wrong. 
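
sk_unattached_filter_create() now takes a sock_fprog_kern and hands back a ready-to-run sk_filter, JITed when possible and otherwise migrated to the internal interpreter. A hedged, kernel-side sketch of a caller follows; it is not standalone-buildable and simply assumes the definitions introduced or adjusted by this patch:

static struct sock_filter demo_insns[] = {
	/* ret #0xffff: accept the first 64KB of every packet */
	BPF_STMT(BPF_RET | BPF_K, 0xffff),
};

static struct sk_filter *demo_filter;

static int demo_setup(void)
{
	struct sock_fprog_kern fprog = {
		.len	= ARRAY_SIZE(demo_insns),
		.filter	= demo_insns,
	};

	return sk_unattached_filter_create(&demo_filter, &fprog);
}

/* The resulting filter would then be run as SK_RUN_FILTER(demo_filter, skb)
 * and torn down with sk_unattached_filter_destroy(demo_filter).
 */
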
+	 */ +	fp = __sk_prepare_filter(fp, NULL); +	if (IS_ERR(fp)) +		return PTR_ERR(fp);  	*pfp = fp;  	return 0; -free_mem: -	kfree(fp); -	return err;  }  EXPORT_SYMBOL_GPL(sk_unattached_filter_create); @@ -722,7 +1587,8 @@ EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy);  int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)  {  	struct sk_filter *fp, *old_fp; -	unsigned int fsize = sizeof(struct sock_filter) * fprog->len; +	unsigned int fsize = sk_filter_proglen(fprog); +	unsigned int sk_fsize = sk_filter_size(fprog->len);  	int err;  	if (sock_flag(sk, SOCK_FILTER_LOCKED)) @@ -732,29 +1598,38 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)  	if (fprog->filter == NULL)  		return -EINVAL; -	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); +	fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL);  	if (!fp)  		return -ENOMEM; +  	if (copy_from_user(fp->insns, fprog->filter, fsize)) { -		sock_kfree_s(sk, fp, fsize+sizeof(*fp)); +		sock_kfree_s(sk, fp, sk_fsize);  		return -EFAULT;  	}  	atomic_set(&fp->refcnt, 1);  	fp->len = fprog->len; -	err = __sk_prepare_filter(fp); +	err = sk_store_orig_filter(fp, fprog);  	if (err) {  		sk_filter_uncharge(sk, fp); -		return err; +		return -ENOMEM;  	} +	/* __sk_prepare_filter() already takes care of uncharging +	 * memory in case something goes wrong. +	 */ +	fp = __sk_prepare_filter(fp, sk); +	if (IS_ERR(fp)) +		return PTR_ERR(fp); +  	old_fp = rcu_dereference_protected(sk->sk_filter,  					   sock_owned_by_user(sk));  	rcu_assign_pointer(sk->sk_filter, fp);  	if (old_fp)  		sk_filter_uncharge(sk, old_fp); +  	return 0;  }  EXPORT_SYMBOL_GPL(sk_attach_filter); @@ -774,136 +1649,46 @@ int sk_detach_filter(struct sock *sk)  		sk_filter_uncharge(sk, filter);  		ret = 0;  	} +  	return ret;  }  EXPORT_SYMBOL_GPL(sk_detach_filter); -void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to) -{ -	static const u16 decodes[] = { -		[BPF_S_ALU_ADD_K]	= BPF_ALU|BPF_ADD|BPF_K, -		[BPF_S_ALU_ADD_X]	= BPF_ALU|BPF_ADD|BPF_X, -		[BPF_S_ALU_SUB_K]	= BPF_ALU|BPF_SUB|BPF_K, -		[BPF_S_ALU_SUB_X]	= BPF_ALU|BPF_SUB|BPF_X, -		[BPF_S_ALU_MUL_K]	= BPF_ALU|BPF_MUL|BPF_K, -		[BPF_S_ALU_MUL_X]	= BPF_ALU|BPF_MUL|BPF_X, -		[BPF_S_ALU_DIV_X]	= BPF_ALU|BPF_DIV|BPF_X, -		[BPF_S_ALU_MOD_K]	= BPF_ALU|BPF_MOD|BPF_K, -		[BPF_S_ALU_MOD_X]	= BPF_ALU|BPF_MOD|BPF_X, -		[BPF_S_ALU_AND_K]	= BPF_ALU|BPF_AND|BPF_K, -		[BPF_S_ALU_AND_X]	= BPF_ALU|BPF_AND|BPF_X, -		[BPF_S_ALU_OR_K]	= BPF_ALU|BPF_OR|BPF_K, -		[BPF_S_ALU_OR_X]	= BPF_ALU|BPF_OR|BPF_X, -		[BPF_S_ALU_XOR_K]	= BPF_ALU|BPF_XOR|BPF_K, -		[BPF_S_ALU_XOR_X]	= BPF_ALU|BPF_XOR|BPF_X, -		[BPF_S_ALU_LSH_K]	= BPF_ALU|BPF_LSH|BPF_K, -		[BPF_S_ALU_LSH_X]	= BPF_ALU|BPF_LSH|BPF_X, -		[BPF_S_ALU_RSH_K]	= BPF_ALU|BPF_RSH|BPF_K, -		[BPF_S_ALU_RSH_X]	= BPF_ALU|BPF_RSH|BPF_X, -		[BPF_S_ALU_NEG]		= BPF_ALU|BPF_NEG, -		[BPF_S_LD_W_ABS]	= BPF_LD|BPF_W|BPF_ABS, -		[BPF_S_LD_H_ABS]	= BPF_LD|BPF_H|BPF_ABS, -		[BPF_S_LD_B_ABS]	= BPF_LD|BPF_B|BPF_ABS, -		[BPF_S_ANC_PROTOCOL]	= BPF_LD|BPF_B|BPF_ABS, -		[BPF_S_ANC_PKTTYPE]	= BPF_LD|BPF_B|BPF_ABS, -		[BPF_S_ANC_IFINDEX]	= BPF_LD|BPF_B|BPF_ABS, -		[BPF_S_ANC_NLATTR]	= BPF_LD|BPF_B|BPF_ABS, -		[BPF_S_ANC_NLATTR_NEST]	= BPF_LD|BPF_B|BPF_ABS, -		[BPF_S_ANC_MARK]	= BPF_LD|BPF_B|BPF_ABS, -		[BPF_S_ANC_QUEUE]	= BPF_LD|BPF_B|BPF_ABS, -		[BPF_S_ANC_HATYPE]	= BPF_LD|BPF_B|BPF_ABS, -		[BPF_S_ANC_RXHASH]	= BPF_LD|BPF_B|BPF_ABS, -		[BPF_S_ANC_CPU]		= BPF_LD|BPF_B|BPF_ABS, -		[BPF_S_ANC_ALU_XOR_X]	= BPF_LD|BPF_B|BPF_ABS, -		[BPF_S_ANC_SECCOMP_LD_W] = BPF_LD|BPF_B|BPF_ABS, -		
[BPF_S_ANC_VLAN_TAG]	= BPF_LD|BPF_B|BPF_ABS, -		[BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS, -		[BPF_S_ANC_PAY_OFFSET]	= BPF_LD|BPF_B|BPF_ABS, -		[BPF_S_LD_W_LEN]	= BPF_LD|BPF_W|BPF_LEN, -		[BPF_S_LD_W_IND]	= BPF_LD|BPF_W|BPF_IND, -		[BPF_S_LD_H_IND]	= BPF_LD|BPF_H|BPF_IND, -		[BPF_S_LD_B_IND]	= BPF_LD|BPF_B|BPF_IND, -		[BPF_S_LD_IMM]		= BPF_LD|BPF_IMM, -		[BPF_S_LDX_W_LEN]	= BPF_LDX|BPF_W|BPF_LEN, -		[BPF_S_LDX_B_MSH]	= BPF_LDX|BPF_B|BPF_MSH, -		[BPF_S_LDX_IMM]		= BPF_LDX|BPF_IMM, -		[BPF_S_MISC_TAX]	= BPF_MISC|BPF_TAX, -		[BPF_S_MISC_TXA]	= BPF_MISC|BPF_TXA, -		[BPF_S_RET_K]		= BPF_RET|BPF_K, -		[BPF_S_RET_A]		= BPF_RET|BPF_A, -		[BPF_S_ALU_DIV_K]	= BPF_ALU|BPF_DIV|BPF_K, -		[BPF_S_LD_MEM]		= BPF_LD|BPF_MEM, -		[BPF_S_LDX_MEM]		= BPF_LDX|BPF_MEM, -		[BPF_S_ST]		= BPF_ST, -		[BPF_S_STX]		= BPF_STX, -		[BPF_S_JMP_JA]		= BPF_JMP|BPF_JA, -		[BPF_S_JMP_JEQ_K]	= BPF_JMP|BPF_JEQ|BPF_K, -		[BPF_S_JMP_JEQ_X]	= BPF_JMP|BPF_JEQ|BPF_X, -		[BPF_S_JMP_JGE_K]	= BPF_JMP|BPF_JGE|BPF_K, -		[BPF_S_JMP_JGE_X]	= BPF_JMP|BPF_JGE|BPF_X, -		[BPF_S_JMP_JGT_K]	= BPF_JMP|BPF_JGT|BPF_K, -		[BPF_S_JMP_JGT_X]	= BPF_JMP|BPF_JGT|BPF_X, -		[BPF_S_JMP_JSET_K]	= BPF_JMP|BPF_JSET|BPF_K, -		[BPF_S_JMP_JSET_X]	= BPF_JMP|BPF_JSET|BPF_X, -	}; -	u16 code; - -	code = filt->code; - -	to->code = decodes[code]; -	to->jt = filt->jt; -	to->jf = filt->jf; - -	if (code == BPF_S_ALU_DIV_K) { -		/* -		 * When loaded this rule user gave us X, which was -		 * translated into R = r(X). Now we calculate the -		 * RR = r(R) and report it back. If next time this -		 * value is loaded and RRR = r(RR) is calculated -		 * then the R == RRR will be true. -		 * -		 * One exception. X == 1 translates into R == 0 and -		 * we can't calculate RR out of it with r(). -		 */ - -		if (filt->k == 0) -			to->k = 1; -		else -			to->k = reciprocal_value(filt->k); - -		BUG_ON(reciprocal_value(to->k) != filt->k); -	} else -		to->k = filt->k; -} - -int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len) +int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, +		  unsigned int len)  { +	struct sock_fprog_kern *fprog;  	struct sk_filter *filter; -	int i, ret; +	int ret = 0;  	lock_sock(sk);  	filter = rcu_dereference_protected(sk->sk_filter, -			sock_owned_by_user(sk)); -	ret = 0; +					   sock_owned_by_user(sk));  	if (!filter)  		goto out; -	ret = filter->len; + +	/* We're copying the filter that has been originally attached, +	 * so no conversion/decode needed anymore. +	 */ +	fprog = filter->orig_prog; + +	ret = fprog->len;  	if (!len) +		/* User space only enquires number of filter blocks. */  		goto out; +  	ret = -EINVAL; -	if (len < filter->len) +	if (len < fprog->len)  		goto out;  	ret = -EFAULT; -	for (i = 0; i < filter->len; i++) { -		struct sock_filter fb; - -		sk_decode_filter(&filter->insns[i], &fb); -		if (copy_to_user(&ubuf[i], &fb, sizeof(fb))) -			goto out; -	} +	if (copy_to_user(ubuf, fprog->filter, sk_filter_proglen(fprog))) +		goto out; -	ret = filter->len; +	/* Instead of bytes, the API requests to return the number +	 * of filter blocks. 
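
The reworked sk_get_filter() above copies back the original, unconverted program, and both the supplied and the returned lengths are counted in sock_filter blocks rather than bytes. A hedged userspace sketch of the resulting two-step getsockopt() dance, using the SO_GET_FILTER option as exposed by the Linux uapi headers (error handling trimmed):

#include <linux/filter.h>
#include <sys/socket.h>
#include <stdlib.h>

static struct sock_filter *dump_filter(int fd, unsigned int *nblocks)
{
	socklen_t cnt = 0;
	struct sock_filter *insns;

	/* Step 1: a zero length only asks how many blocks are attached. */
	if (getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, NULL, &cnt) < 0)
		return NULL;

	insns = calloc(cnt, sizeof(*insns));
	if (!insns)
		return NULL;

	/* Step 2: cnt is in sock_filter blocks, not bytes. */
	if (getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, insns, &cnt) < 0) {
		free(insns);
		return NULL;
	}

	*nblocks = cnt;
	return insns;
}
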
+	 */ +	ret = fprog->len;  out:  	release_sock(sk);  	return ret; diff --git a/net/core/flow.c b/net/core/flow.c index dfa602ceb8c..a0348fde1fd 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -24,6 +24,7 @@  #include <net/flow.h>  #include <linux/atomic.h>  #include <linux/security.h> +#include <net/net_namespace.h>  struct flow_cache_entry {  	union { @@ -38,37 +39,14 @@ struct flow_cache_entry {  	struct flow_cache_object	*object;  }; -struct flow_cache_percpu { -	struct hlist_head		*hash_table; -	int				hash_count; -	u32				hash_rnd; -	int				hash_rnd_recalc; -	struct tasklet_struct		flush_tasklet; -}; -  struct flow_flush_info {  	struct flow_cache		*cache;  	atomic_t			cpuleft;  	struct completion		completion;  }; -struct flow_cache { -	u32				hash_shift; -	struct flow_cache_percpu __percpu *percpu; -	struct notifier_block		hotcpu_notifier; -	int				low_watermark; -	int				high_watermark; -	struct timer_list		rnd_timer; -}; - -atomic_t flow_cache_genid = ATOMIC_INIT(0); -EXPORT_SYMBOL(flow_cache_genid); -static struct flow_cache flow_cache_global;  static struct kmem_cache *flow_cachep __read_mostly; -static DEFINE_SPINLOCK(flow_cache_gc_lock); -static LIST_HEAD(flow_cache_gc_list); -  #define flow_cache_hash_size(cache)	(1 << (cache)->hash_shift)  #define FLOW_HASH_RND_PERIOD		(10 * 60 * HZ) @@ -84,16 +62,18 @@ static void flow_cache_new_hashrnd(unsigned long arg)  	add_timer(&fc->rnd_timer);  } -static int flow_entry_valid(struct flow_cache_entry *fle) +static int flow_entry_valid(struct flow_cache_entry *fle, +				struct netns_xfrm *xfrm)  { -	if (atomic_read(&flow_cache_genid) != fle->genid) +	if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)  		return 0;  	if (fle->object && !fle->object->ops->check(fle->object))  		return 0;  	return 1;  } -static void flow_entry_kill(struct flow_cache_entry *fle) +static void flow_entry_kill(struct flow_cache_entry *fle, +				struct netns_xfrm *xfrm)  {  	if (fle->object)  		fle->object->ops->delete(fle->object); @@ -104,26 +84,28 @@ static void flow_cache_gc_task(struct work_struct *work)  {  	struct list_head gc_list;  	struct flow_cache_entry *fce, *n; +	struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm, +						flow_cache_gc_work);  	INIT_LIST_HEAD(&gc_list); -	spin_lock_bh(&flow_cache_gc_lock); -	list_splice_tail_init(&flow_cache_gc_list, &gc_list); -	spin_unlock_bh(&flow_cache_gc_lock); +	spin_lock_bh(&xfrm->flow_cache_gc_lock); +	list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list); +	spin_unlock_bh(&xfrm->flow_cache_gc_lock);  	list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) -		flow_entry_kill(fce); +		flow_entry_kill(fce, xfrm);  } -static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);  static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, -				     int deleted, struct list_head *gc_list) +				     int deleted, struct list_head *gc_list, +				     struct netns_xfrm *xfrm)  {  	if (deleted) {  		fcp->hash_count -= deleted; -		spin_lock_bh(&flow_cache_gc_lock); -		list_splice_tail(gc_list, &flow_cache_gc_list); -		spin_unlock_bh(&flow_cache_gc_lock); -		schedule_work(&flow_cache_gc_work); +		spin_lock_bh(&xfrm->flow_cache_gc_lock); +		list_splice_tail(gc_list, &xfrm->flow_cache_gc_list); +		spin_unlock_bh(&xfrm->flow_cache_gc_lock); +		schedule_work(&xfrm->flow_cache_gc_work);  	}  } @@ -135,6 +117,8 @@ static void __flow_cache_shrink(struct flow_cache *fc,  	struct hlist_node *tmp;  	LIST_HEAD(gc_list);  	int i, deleted = 0; +	struct netns_xfrm *xfrm = container_of(fc, struct 
netns_xfrm, +						flow_cache_global);  	for (i = 0; i < flow_cache_hash_size(fc); i++) {  		int saved = 0; @@ -142,7 +126,7 @@ static void __flow_cache_shrink(struct flow_cache *fc,  		hlist_for_each_entry_safe(fle, tmp,  					  &fcp->hash_table[i], u.hlist) {  			if (saved < shrink_to && -			    flow_entry_valid(fle)) { +			    flow_entry_valid(fle, xfrm)) {  				saved++;  			} else {  				deleted++; @@ -152,7 +136,7 @@ static void __flow_cache_shrink(struct flow_cache *fc,  		}  	} -	flow_cache_queue_garbage(fcp, deleted, &gc_list); +	flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);  }  static void flow_cache_shrink(struct flow_cache *fc, @@ -208,7 +192,7 @@ struct flow_cache_object *  flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,  		  flow_resolve_t resolver, void *ctx)  { -	struct flow_cache *fc = &flow_cache_global; +	struct flow_cache *fc = &net->xfrm.flow_cache_global;  	struct flow_cache_percpu *fcp;  	struct flow_cache_entry *fle, *tfle;  	struct flow_cache_object *flo; @@ -258,7 +242,7 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,  			hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);  			fcp->hash_count++;  		} -	} else if (likely(fle->genid == atomic_read(&flow_cache_genid))) { +	} else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {  		flo = fle->object;  		if (!flo)  			goto ret_object; @@ -279,7 +263,7 @@ nocache:  	}  	flo = resolver(net, key, family, dir, flo, ctx);  	if (fle) { -		fle->genid = atomic_read(&flow_cache_genid); +		fle->genid = atomic_read(&net->xfrm.flow_cache_genid);  		if (!IS_ERR(flo))  			fle->object = flo;  		else @@ -303,12 +287,14 @@ static void flow_cache_flush_tasklet(unsigned long data)  	struct hlist_node *tmp;  	LIST_HEAD(gc_list);  	int i, deleted = 0; +	struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm, +						flow_cache_global);  	fcp = this_cpu_ptr(fc->percpu);  	for (i = 0; i < flow_cache_hash_size(fc); i++) {  		hlist_for_each_entry_safe(fle, tmp,  					  &fcp->hash_table[i], u.hlist) { -			if (flow_entry_valid(fle)) +			if (flow_entry_valid(fle, xfrm))  				continue;  			deleted++; @@ -317,7 +303,7 @@ static void flow_cache_flush_tasklet(unsigned long data)  		}  	} -	flow_cache_queue_garbage(fcp, deleted, &gc_list); +	flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);  	if (atomic_dec_and_test(&info->cpuleft))  		complete(&info->completion); @@ -351,10 +337,9 @@ static void flow_cache_flush_per_cpu(void *data)  	tasklet_schedule(tasklet);  } -void flow_cache_flush(void) +void flow_cache_flush(struct net *net)  {  	struct flow_flush_info info; -	static DEFINE_MUTEX(flow_flush_sem);  	cpumask_var_t mask;  	int i, self; @@ -365,8 +350,8 @@ void flow_cache_flush(void)  	/* Don't want cpus going down or up during this. 
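
flow_cache_flush() above fans the flush out to every online CPU via per-cpu tasklets and then sleeps on a completion that the last CPU to finish signals through the atomic cpuleft countdown. A rough userspace analogue of that rendezvous, using pthreads and C11 atomics (build with -pthread; the worker count is arbitrary):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NWORKERS 4

struct flush_info {
	atomic_int	left;
	pthread_mutex_t	lock;
	pthread_cond_t	done;
};

static void *worker(void *arg)
{
	struct flush_info *info = arg;

	/* ... per-CPU flush work would happen here ... */
	if (atomic_fetch_sub(&info->left, 1) == 1) {	/* last one signals */
		pthread_mutex_lock(&info->lock);
		pthread_cond_signal(&info->done);
		pthread_mutex_unlock(&info->lock);
	}
	return NULL;
}

int main(void)
{
	struct flush_info info = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.done = PTHREAD_COND_INITIALIZER,
	};
	pthread_t tid[NWORKERS];
	int i;

	atomic_init(&info.left, NWORKERS);
	for (i = 0; i < NWORKERS; i++)
		pthread_create(&tid[i], NULL, worker, &info);

	pthread_mutex_lock(&info.lock);
	while (atomic_load(&info.left) > 0)
		pthread_cond_wait(&info.done, &info.lock);
	pthread_mutex_unlock(&info.lock);

	for (i = 0; i < NWORKERS; i++)
		pthread_join(tid[i], NULL);
	puts("flush complete");
	return 0;
}
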
*/  	get_online_cpus(); -	mutex_lock(&flow_flush_sem); -	info.cache = &flow_cache_global; +	mutex_lock(&net->xfrm.flow_flush_sem); +	info.cache = &net->xfrm.flow_cache_global;  	for_each_online_cpu(i)  		if (!flow_cache_percpu_empty(info.cache, i))  			cpumask_set_cpu(i, mask); @@ -386,21 +371,23 @@ void flow_cache_flush(void)  	wait_for_completion(&info.completion);  done: -	mutex_unlock(&flow_flush_sem); +	mutex_unlock(&net->xfrm.flow_flush_sem);  	put_online_cpus();  	free_cpumask_var(mask);  }  static void flow_cache_flush_task(struct work_struct *work)  { -	flow_cache_flush(); -} +	struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm, +						flow_cache_gc_work); +	struct net *net = container_of(xfrm, struct net, xfrm); -static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task); +	flow_cache_flush(net); +} -void flow_cache_flush_deferred(void) +void flow_cache_flush_deferred(struct net *net)  { -	schedule_work(&flow_cache_flush_work); +	schedule_work(&net->xfrm.flow_cache_flush_work);  }  static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) @@ -425,7 +412,8 @@ static int flow_cache_cpu(struct notifier_block *nfb,  			  unsigned long action,  			  void *hcpu)  { -	struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier); +	struct flow_cache *fc = container_of(nfb, struct flow_cache, +						hotcpu_notifier);  	int res, cpu = (unsigned long) hcpu;  	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); @@ -444,9 +432,20 @@ static int flow_cache_cpu(struct notifier_block *nfb,  	return NOTIFY_OK;  } -static int __init flow_cache_init(struct flow_cache *fc) +int flow_cache_init(struct net *net)  {  	int i; +	struct flow_cache *fc = &net->xfrm.flow_cache_global; + +	if (!flow_cachep) +		flow_cachep = kmem_cache_create("flow_cache", +						sizeof(struct flow_cache_entry), +						0, SLAB_PANIC, NULL); +	spin_lock_init(&net->xfrm.flow_cache_gc_lock); +	INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list); +	INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task); +	INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task); +	mutex_init(&net->xfrm.flow_flush_sem);  	fc->hash_shift = 10;  	fc->low_watermark = 2 * flow_cache_hash_size(fc); @@ -456,6 +455,8 @@ static int __init flow_cache_init(struct flow_cache *fc)  	if (!fc->percpu)  		return -ENOMEM; +	cpu_notifier_register_begin(); +  	for_each_online_cpu(i) {  		if (flow_cache_cpu_prepare(fc, i))  			goto err; @@ -463,7 +464,9 @@ static int __init flow_cache_init(struct flow_cache *fc)  	fc->hotcpu_notifier = (struct notifier_block){  		.notifier_call = flow_cache_cpu,  	}; -	register_hotcpu_notifier(&fc->hotcpu_notifier); +	__register_hotcpu_notifier(&fc->hotcpu_notifier); + +	cpu_notifier_register_done();  	setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,  		    (unsigned long) fc); @@ -479,19 +482,30 @@ err:  		fcp->hash_table = NULL;  	} +	cpu_notifier_register_done(); +  	free_percpu(fc->percpu);  	fc->percpu = NULL;  	return -ENOMEM;  } +EXPORT_SYMBOL(flow_cache_init); -static int __init flow_cache_init_global(void) +void flow_cache_fini(struct net *net)  { -	flow_cachep = kmem_cache_create("flow_cache", -					sizeof(struct flow_cache_entry), -					0, SLAB_PANIC, NULL); +	int i; +	struct flow_cache *fc = &net->xfrm.flow_cache_global; -	return flow_cache_init(&flow_cache_global); -} +	del_timer_sync(&fc->rnd_timer); +	unregister_hotcpu_notifier(&fc->hotcpu_notifier); + +	for_each_possible_cpu(i) { +		struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i); +		
kfree(fcp->hash_table); +		fcp->hash_table = NULL; +	} -module_init(flow_cache_init_global); +	free_percpu(fc->percpu); +	fc->percpu = NULL; +} +EXPORT_SYMBOL(flow_cache_fini); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 1929af87b26..107ed12a532 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -25,9 +25,35 @@ static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *i  	memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst));  } +/** + * skb_flow_get_ports - extract the upper layer ports and return them + * @skb: buffer to extract the ports from + * @thoff: transport header offset + * @ip_proto: protocol for which to get port offset + * + * The function will try to retrieve the ports at offset thoff + poff where poff + * is the protocol port offset returned from proto_ports_offset + */ +__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) +{ +	int poff = proto_ports_offset(ip_proto); + +	if (poff >= 0) { +		__be32 *ports, _ports; + +		ports = skb_header_pointer(skb, thoff + poff, +					   sizeof(_ports), &_ports); +		if (ports) +			return *ports; +	} + +	return 0; +} +EXPORT_SYMBOL(skb_flow_get_ports); +  bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow)  { -	int poff, nhoff = skb_network_offset(skb); +	int nhoff = skb_network_offset(skb);  	u8 ip_proto;  	__be16 proto = skb->protocol; @@ -35,23 +61,23 @@ bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow)  again:  	switch (proto) { -	case __constant_htons(ETH_P_IP): { +	case htons(ETH_P_IP): {  		const struct iphdr *iph;  		struct iphdr _iph;  ip:  		iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); -		if (!iph) +		if (!iph || iph->ihl < 5)  			return false; +		nhoff += iph->ihl * 4; +		ip_proto = iph->protocol;  		if (ip_is_fragment(iph))  			ip_proto = 0; -		else -			ip_proto = iph->protocol; +  		iph_to_flow_copy_addrs(flow, iph); -		nhoff += iph->ihl * 4;  		break;  	} -	case __constant_htons(ETH_P_IPV6): { +	case htons(ETH_P_IPV6): {  		const struct ipv6hdr *iph;  		struct ipv6hdr _iph;  ipv6: @@ -65,8 +91,8 @@ ipv6:  		nhoff += sizeof(struct ipv6hdr);  		break;  	} -	case __constant_htons(ETH_P_8021AD): -	case __constant_htons(ETH_P_8021Q): { +	case htons(ETH_P_8021AD): +	case htons(ETH_P_8021Q): {  		const struct vlan_hdr *vlan;  		struct vlan_hdr _vlan; @@ -78,7 +104,7 @@ ipv6:  		nhoff += sizeof(*vlan);  		goto again;  	} -	case __constant_htons(ETH_P_PPP_SES): { +	case htons(ETH_P_PPP_SES): {  		struct {  			struct pppoe_hdr hdr;  			__be16 proto; @@ -89,9 +115,9 @@ ipv6:  		proto = hdr->proto;  		nhoff += PPPOE_SES_HLEN;  		switch (proto) { -		case __constant_htons(PPP_IP): +		case htons(PPP_IP):  			goto ip; -		case __constant_htons(PPP_IPV6): +		case htons(PPP_IPV6):  			goto ipv6;  		default:  			return false; @@ -150,16 +176,7 @@ ipv6:  	}  	flow->ip_proto = ip_proto; -	poff = proto_ports_offset(ip_proto); -	if (poff >= 0) { -		__be32 *ports, _ports; - -		nhoff += poff; -		ports = skb_header_pointer(skb, nhoff, sizeof(_ports), &_ports); -		if (ports) -			flow->ports = *ports; -	} - +	flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto);  	flow->thoff = (u16) nhoff;  	return true; @@ -167,14 +184,30 @@ ipv6:  EXPORT_SYMBOL(skb_flow_dissect);  static u32 hashrnd __read_mostly; +static __always_inline void __flow_hash_secret_init(void) +{ +	net_get_random_once(&hashrnd, sizeof(hashrnd)); +} + +static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c) 
+{ +	__flow_hash_secret_init(); +	return jhash_3words(a, b, c, hashrnd); +} + +static __always_inline u32 __flow_hash_1word(u32 a) +{ +	__flow_hash_secret_init(); +	return jhash_1word(a, hashrnd); +}  /* - * __skb_get_rxhash: calculate a flow hash based on src/dst addresses - * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value - * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb + * __skb_get_hash: calculate a flow hash based on src/dst addresses + * and src/dst port numbers.  Sets hash in skb to non-zero hash value + * on success, zero indicates no valid hash.  Also, sets l4_hash in skb   * if hash is a canonical 4-tuple hash over transport ports.   */ -void __skb_get_rxhash(struct sk_buff *skb) +void __skb_get_hash(struct sk_buff *skb)  {  	struct flow_keys keys;  	u32 hash; @@ -183,7 +216,7 @@ void __skb_get_rxhash(struct sk_buff *skb)  		return;  	if (keys.ports) -		skb->l4_rxhash = 1; +		skb->l4_hash = 1;  	/* get a consistent hash (same value on both flow directions) */  	if (((__force u32)keys.dst < (__force u32)keys.src) || @@ -193,15 +226,15 @@ void __skb_get_rxhash(struct sk_buff *skb)  		swap(keys.port16[0], keys.port16[1]);  	} -	hash = jhash_3words((__force u32)keys.dst, -			    (__force u32)keys.src, -			    (__force u32)keys.ports, hashrnd); +	hash = __flow_hash_3words((__force u32)keys.dst, +				  (__force u32)keys.src, +				  (__force u32)keys.ports);  	if (!hash)  		hash = 1; -	skb->rxhash = hash; +	skb->hash = hash;  } -EXPORT_SYMBOL(__skb_get_rxhash); +EXPORT_SYMBOL(__skb_get_hash);  /*   * Returns a Tx hash based on the given packet descriptor a Tx queues' number @@ -231,7 +264,7 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,  		hash = skb->sk->sk_hash;  	else  		hash = (__force u16) skb->protocol; -	hash = jhash_1word(hash, hashrnd); +	hash = __flow_hash_1word(hash);  	return (u16) (((u64) hash * qcount) >> 32) + qoffset;  } @@ -290,17 +323,6 @@ u32 __skb_get_poff(const struct sk_buff *skb)  	return poff;  } -static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) -{ -	if (unlikely(queue_index >= dev->real_num_tx_queues)) { -		net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", -				     dev->name, queue_index, -				     dev->real_num_tx_queues); -		return 0; -	} -	return queue_index; -} -  static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)  {  #ifdef CONFIG_XPS @@ -322,8 +344,8 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)  					hash = skb->sk->sk_hash;  				else  					hash = (__force u16) skb->protocol ^ -					    skb->rxhash; -				hash = jhash_1word(hash, hashrnd); +					    skb->hash; +				hash = __flow_hash_1word(hash);  				queue_index = map->queues[  				    ((u64)hash * map->len) >> 32];  			} @@ -339,7 +361,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)  #endif  } -u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)  {  	struct sock *sk = skb->sk;  	int queue_index = sk_tx_queue_get(sk); @@ -359,30 +381,25 @@ u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)  	return queue_index;  } -EXPORT_SYMBOL(__netdev_pick_tx);  struct netdev_queue *netdev_pick_tx(struct net_device *dev, -				    struct sk_buff *skb) +				    struct sk_buff *skb, +				    void *accel_priv)  {  	int queue_index = 0;  	if (dev->real_num_tx_queues != 1) {  		const struct 
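/*
 * netdev_pick_tx() now hands drivers both the accel_priv cookie (used e.g.
 * by the macvlan L2 forwarding offload) and a fallback selector, so a
 * driver's ndo_select_queue() can defer to the core's choice.  A
 * hypothetical driver-side sketch, assuming the select_queue_fallback_t
 * prototype from netdevice.h (illustrative only):
 */
#if 0
static u16 example_ndo_select_queue(struct net_device *dev,
				    struct sk_buff *skb,
				    void *accel_priv,
				    select_queue_fallback_t fallback)
{
	/* pin control traffic to queue 0, let the core pick for the rest */
	if (skb->priority == TC_PRIO_CONTROL)
		return 0;
	return fallback(dev, skb);
}
#endif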
net_device_ops *ops = dev->netdev_ops;  		if (ops->ndo_select_queue) -			queue_index = ops->ndo_select_queue(dev, skb); +			queue_index = ops->ndo_select_queue(dev, skb, accel_priv, +							    __netdev_pick_tx);  		else  			queue_index = __netdev_pick_tx(dev, skb); -		queue_index = dev_cap_txqueue(dev, queue_index); + +		if (!accel_priv) +			queue_index = netdev_cap_txqueue(dev, queue_index);  	}  	skb_set_queue_mapping(skb, queue_index);  	return netdev_get_tx_queue(dev, queue_index);  } - -static int __init initialize_hashrnd(void) -{ -	get_random_bytes(&hashrnd, sizeof(hashrnd)); -	return 0; -} - -late_initcall_sync(initialize_hashrnd); diff --git a/net/core/iovec.c b/net/core/iovec.c index b77eeecc001..e1ec45ab1e6 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -39,7 +39,7 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a  {  	int size, ct, err; -	if (m->msg_namelen) { +	if (m->msg_name && m->msg_namelen) {  		if (mode == VERIFY_READ) {  			void __user *namep;  			namep = (void __user __force *) m->msg_name; @@ -51,6 +51,7 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a  		m->msg_name = address;  	} else {  		m->msg_name = NULL; +		m->msg_namelen = 0;  	}  	size = m->msg_iovlen * sizeof(struct iovec); @@ -74,61 +75,6 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a  }  /* - *	Copy kernel to iovec. Returns -EFAULT on error. - */ - -int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, -		      int offset, int len) -{ -	int copy; -	for (; len > 0; ++iov) { -		/* Skip over the finished iovecs */ -		if (unlikely(offset >= iov->iov_len)) { -			offset -= iov->iov_len; -			continue; -		} -		copy = min_t(unsigned int, iov->iov_len - offset, len); -		if (copy_to_user(iov->iov_base + offset, kdata, copy)) -			return -EFAULT; -		offset = 0; -		kdata += copy; -		len -= copy; -	} - -	return 0; -} -EXPORT_SYMBOL(memcpy_toiovecend); - -/* - *	Copy iovec from kernel. Returns -EFAULT on error. - */ - -int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, -			int offset, int len) -{ -	/* Skip over the finished iovecs */ -	while (offset >= iov->iov_len) { -		offset -= iov->iov_len; -		iov++; -	} - -	while (len > 0) { -		u8 __user *base = iov->iov_base + offset; -		int copy = min_t(unsigned int, len, iov->iov_len - offset); - -		offset = 0; -		if (copy_from_user(kdata, base, copy)) -			return -EFAULT; -		len -= copy; -		kdata += copy; -		iov++; -	} - -	return 0; -} -EXPORT_SYMBOL(memcpy_fromiovecend); - -/*   *	And now for the all-in-one: copy and checksum from a user iovec   *	directly to a datagram   *	Calls to csum_partial but the last must be in 32 bit chunks diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 9c3a839322b..bd0767e6b2b 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -147,7 +147,7 @@ static void linkwatch_do_dev(struct net_device *dev)  	 * Make sure the above read is complete since it can be  	 * rewritten as soon as we clear the bit below.  	 
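 *
 * (smp_mb__before_clear_bit() below is renamed to smp_mb__before_atomic():
 * the kernel consolidated the bitop/atomic barriers into a single
 * smp_mb__before_atomic()/smp_mb__after_atomic() pair with unchanged
 * semantics, which is why the same substitution appears at other call
 * sites in this series.)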
*/ -	smp_mb__before_clear_bit(); +	smp_mb__before_atomic();  	/* We are about to handle this device,  	 * so new events can be accepted diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6072610a867..ef31fef25e5 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -38,6 +38,8 @@  #include <linux/random.h>  #include <linux/string.h>  #include <linux/log2.h> +#include <linux/inetdevice.h> +#include <net/addrconf.h>  #define DEBUG  #define NEIGH_DEBUG 1 @@ -115,7 +117,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)  unsigned long neigh_rand_reach_time(unsigned long base)  { -	return base ? (net_random() % base) + (base >> 1) : 0; +	return base ? (prandom_u32() % base) + (base >> 1) : 0;  }  EXPORT_SYMBOL(neigh_rand_reach_time); @@ -497,7 +499,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,  		goto out_neigh_release;  	} -	n->confirmed = jiffies - (n->parms->base_reachable_time << 1); +	n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1);  	write_lock_bh(&tbl->lock);  	nht = rcu_dereference_protected(tbl->nht, @@ -764,9 +766,6 @@ static void neigh_periodic_work(struct work_struct *work)  	nht = rcu_dereference_protected(tbl->nht,  					lockdep_is_held(&tbl->lock)); -	if (atomic_read(&tbl->entries) < tbl->gc_thresh1) -		goto out; -  	/*  	 *	periodically recompute ReachableTime from random function  	 */ @@ -776,9 +775,12 @@ static void neigh_periodic_work(struct work_struct *work)  		tbl->last_rand = jiffies;  		for (p = &tbl->parms; p; p = p->next)  			p->reachable_time = -				neigh_rand_reach_time(p->base_reachable_time); +				neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));  	} +	if (atomic_read(&tbl->entries) < tbl->gc_thresh1) +		goto out; +  	for (i = 0 ; i < (1 << nht->hash_shift); i++) {  		np = &nht->hash_buckets[i]; @@ -799,7 +801,7 @@ static void neigh_periodic_work(struct work_struct *work)  			if (atomic_read(&n->refcnt) == 1 &&  			    (state == NUD_FAILED || -			     time_after(jiffies, n->used + n->parms->gc_staletime))) { +			     time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {  				*np = n->next;  				n->dead = 1;  				write_unlock(&n->lock); @@ -822,21 +824,22 @@ next_elt:  						lockdep_is_held(&tbl->lock));  	}  out: -	/* Cycle through all hash buckets every base_reachable_time/2 ticks. -	 * ARP entry timeouts range from 1/2 base_reachable_time to 3/2 -	 * base_reachable_time. +	/* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks. +	 * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2 +	 * BASE_REACHABLE_TIME.  	 */ -	schedule_delayed_work(&tbl->gc_work, -			      tbl->parms.base_reachable_time >> 1); +	queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, +			      NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1);  	write_unlock_bh(&tbl->lock);  }  static __inline__ int neigh_max_probes(struct neighbour *n)  {  	struct neigh_parms *p = n->parms; -	return (n->nud_state & NUD_PROBE) ? 
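/*
 * From here on the per-neigh_parms scalars (ucast_probes, retrans_time,
 * gc_staletime, ...) are accessed through NEIGH_VAR()/NEIGH_VAR_SET().
 * Roughly, NEIGH_VAR(p, UCAST_PROBES) reads p->data[NEIGH_VAR_UCAST_PROBES]
 * and NEIGH_VAR_SET() writes the same slot; the companion p->data_state
 * bitmap (see neigh_proc_update() and neigh_copy_dflt_parms() further down)
 * tracks which attributes have been explicitly configured so that updated
 * defaults do not clobber per-device overrides.
 */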
-		p->ucast_probes : -		p->ucast_probes + p->app_probes + p->mcast_probes; +	int max_probes = NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES); +	if (!(n->nud_state & NUD_PROBE)) +		max_probes += NEIGH_VAR(p, MCAST_PROBES); +	return max_probes;  }  static void neigh_invalidate(struct neighbour *neigh) @@ -867,7 +870,7 @@ static void neigh_invalidate(struct neighbour *neigh)  static void neigh_probe(struct neighbour *neigh)  	__releases(neigh->lock)  { -	struct sk_buff *skb = skb_peek(&neigh->arp_queue); +	struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue);  	/* keep skb alive even if arp_queue overflows */  	if (skb)  		skb = skb_copy(skb, GFP_ATOMIC); @@ -901,12 +904,13 @@ static void neigh_timer_handler(unsigned long arg)  			neigh_dbg(2, "neigh %p is still alive\n", neigh);  			next = neigh->confirmed + neigh->parms->reachable_time;  		} else if (time_before_eq(now, -					  neigh->used + neigh->parms->delay_probe_time)) { +					  neigh->used + +					  NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {  			neigh_dbg(2, "neigh %p is delayed\n", neigh);  			neigh->nud_state = NUD_DELAY;  			neigh->updated = jiffies;  			neigh_suspect(neigh); -			next = now + neigh->parms->delay_probe_time; +			next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME);  		} else {  			neigh_dbg(2, "neigh %p is suspected\n", neigh);  			neigh->nud_state = NUD_STALE; @@ -916,7 +920,8 @@ static void neigh_timer_handler(unsigned long arg)  		}  	} else if (state & NUD_DELAY) {  		if (time_before_eq(now, -				   neigh->confirmed + neigh->parms->delay_probe_time)) { +				   neigh->confirmed + +				   NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {  			neigh_dbg(2, "neigh %p is now reachable\n", neigh);  			neigh->nud_state = NUD_REACHABLE;  			neigh->updated = jiffies; @@ -928,11 +933,11 @@ static void neigh_timer_handler(unsigned long arg)  			neigh->nud_state = NUD_PROBE;  			neigh->updated = jiffies;  			atomic_set(&neigh->probes, 0); -			next = now + neigh->parms->retrans_time; +			next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);  		}  	} else {  		/* NUD_PROBE|NUD_INCOMPLETE */ -		next = now + neigh->parms->retrans_time; +		next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);  	}  	if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && @@ -940,6 +945,7 @@ static void neigh_timer_handler(unsigned long arg)  		neigh->nud_state = NUD_FAILED;  		notify = 1;  		neigh_invalidate(neigh); +		goto out;  	}  	if (neigh->nud_state & NUD_IN_TIMER) { @@ -973,13 +979,16 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)  		goto out_unlock_bh;  	if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { -		if (neigh->parms->mcast_probes + neigh->parms->app_probes) { +		if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + +		    NEIGH_VAR(neigh->parms, APP_PROBES)) {  			unsigned long next, now = jiffies; -			atomic_set(&neigh->probes, neigh->parms->ucast_probes); +			atomic_set(&neigh->probes, +				   NEIGH_VAR(neigh->parms, UCAST_PROBES));  			neigh->nud_state     = NUD_INCOMPLETE;  			neigh->updated = now; -			next = now + max(neigh->parms->retrans_time, HZ/2); +			next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), +					 HZ/2);  			neigh_add_timer(neigh, next);  			immediate_probe = true;  		} else { @@ -994,14 +1003,14 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)  		neigh_dbg(2, "neigh %p is delayed\n", neigh);  		neigh->nud_state = NUD_DELAY;  		neigh->updated = jiffies; -		neigh_add_timer(neigh, -				jiffies + neigh->parms->delay_probe_time); +		neigh_add_timer(neigh, 
jiffies + +				NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));  	}  	if (neigh->nud_state == NUD_INCOMPLETE) {  		if (skb) {  			while (neigh->arp_queue_len_bytes + skb->truesize > -			       neigh->parms->queue_len_bytes) { +			       NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) {  				struct sk_buff *buff;  				buff = __skb_dequeue(&neigh->arp_queue); @@ -1161,6 +1170,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,  						 neigh->parms->reachable_time :  						 0)));  		neigh->nud_state = new; +		notify = 1;  	}  	if (lladdr != neigh->ha) { @@ -1170,7 +1180,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,  		neigh_update_hhs(neigh);  		if (!(new & NUD_CONNECTED))  			neigh->confirmed = jiffies - -				      (neigh->parms->base_reachable_time << 1); +				      (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1);  		notify = 1;  	}  	if (new == old) @@ -1230,6 +1240,21 @@ out:  }  EXPORT_SYMBOL(neigh_update); +/* Update the neigh to listen temporarily for probe responses, even if it is + * in a NUD_FAILED state. The caller has to hold neigh->lock for writing. + */ +void __neigh_set_probe_once(struct neighbour *neigh) +{ +	neigh->updated = jiffies; +	if (!(neigh->nud_state & NUD_FAILED)) +		return; +	neigh->nud_state = NUD_INCOMPLETE; +	atomic_set(&neigh->probes, neigh_max_probes(neigh)); +	neigh_add_timer(neigh, +			jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME)); +} +EXPORT_SYMBOL(__neigh_set_probe_once); +  struct neighbour *neigh_event_ns(struct neigh_table *tbl,  				 u8 *lladdr, void *saddr,  				 struct net_device *dev) @@ -1274,7 +1299,7 @@ int neigh_compat_output(struct neighbour *neigh, struct sk_buff *skb)  	if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL,  			    skb->len) < 0 && -	    dev->header_ops->rebuild(skb)) +	    dev_rebuild_header(skb))  		return 0;  	return dev_queue_xmit(skb); @@ -1391,9 +1416,11 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,  		    struct sk_buff *skb)  {  	unsigned long now = jiffies; -	unsigned long sched_next = now + (net_random() % p->proxy_delay); -	if (tbl->proxy_queue.qlen > p->proxy_qlen) { +	unsigned long sched_next = now + (prandom_u32() % +					  NEIGH_VAR(p, PROXY_DELAY)); + +	if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) {  		kfree_skb(skb);  		return;  	} @@ -1440,7 +1467,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,  		p->tbl		  = tbl;  		atomic_set(&p->refcnt, 1);  		p->reachable_time = -				neigh_rand_reach_time(p->base_reachable_time); +				neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));  		dev_hold(dev);  		p->dev = dev;  		write_pnet(&p->net, hold_net(net)); @@ -1457,6 +1484,8 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,  		p->next		= tbl->parms.next;  		tbl->parms.next = p;  		write_unlock_bh(&tbl->lock); + +		neigh_parms_data_state_cleanall(p);  	}  	return p;  } @@ -1509,7 +1538,7 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl)  	write_pnet(&tbl->parms.net, &init_net);  	atomic_set(&tbl->parms.refcnt, 1);  	tbl->parms.reachable_time = -			  neigh_rand_reach_time(tbl->parms.base_reachable_time); +			  neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME));  	tbl->stats = alloc_percpu(struct neigh_statistics);  	if (!tbl->stats) @@ -1537,7 +1566,8 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl)  	rwlock_init(&tbl->lock);  	INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); -	schedule_delayed_work(&tbl->gc_work, 
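/*
 * Like the periodic gc rescheduling earlier in this file, the delayed work
 * here moves from schedule_delayed_work() to
 * queue_delayed_work(system_power_efficient_wq, ...).  On kernels with
 * power-efficient workqueues enabled this lets the work run on whichever
 * CPU is already awake instead of forcing the local one; otherwise it
 * behaves like the ordinary system workqueue.
 */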
tbl->parms.reachable_time); +	queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, +			tbl->parms.reachable_time);  	setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl);  	skb_queue_head_init_class(&tbl->proxy_queue,  			&neigh_table_proxy_queue_class); @@ -1777,24 +1807,32 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)  	if ((parms->dev &&  	     nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) ||  	    nla_put_u32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)) || -	    nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes) || +	    nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, +			NEIGH_VAR(parms, QUEUE_LEN_BYTES)) ||  	    /* approximative value for deprecated QUEUE_LEN (in packets) */  	    nla_put_u32(skb, NDTPA_QUEUE_LEN, -			parms->queue_len_bytes / SKB_TRUESIZE(ETH_FRAME_LEN)) || -	    nla_put_u32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen) || -	    nla_put_u32(skb, NDTPA_APP_PROBES, parms->app_probes) || -	    nla_put_u32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes) || -	    nla_put_u32(skb, NDTPA_MCAST_PROBES, parms->mcast_probes) || +			NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) || +	    nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) || +	    nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) || +	    nla_put_u32(skb, NDTPA_UCAST_PROBES, +			NEIGH_VAR(parms, UCAST_PROBES)) || +	    nla_put_u32(skb, NDTPA_MCAST_PROBES, +			NEIGH_VAR(parms, MCAST_PROBES)) ||  	    nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time) ||  	    nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, -			  parms->base_reachable_time) || -	    nla_put_msecs(skb, NDTPA_GC_STALETIME, parms->gc_staletime) || +			  NEIGH_VAR(parms, BASE_REACHABLE_TIME)) || +	    nla_put_msecs(skb, NDTPA_GC_STALETIME, +			  NEIGH_VAR(parms, GC_STALETIME)) ||  	    nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME, -			  parms->delay_probe_time) || -	    nla_put_msecs(skb, NDTPA_RETRANS_TIME, parms->retrans_time) || -	    nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, parms->anycast_delay) || -	    nla_put_msecs(skb, NDTPA_PROXY_DELAY, parms->proxy_delay) || -	    nla_put_msecs(skb, NDTPA_LOCKTIME, parms->locktime)) +			  NEIGH_VAR(parms, DELAY_PROBE_TIME)) || +	    nla_put_msecs(skb, NDTPA_RETRANS_TIME, +			  NEIGH_VAR(parms, RETRANS_TIME)) || +	    nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, +			  NEIGH_VAR(parms, ANYCAST_DELAY)) || +	    nla_put_msecs(skb, NDTPA_PROXY_DELAY, +			  NEIGH_VAR(parms, PROXY_DELAY)) || +	    nla_put_msecs(skb, NDTPA_LOCKTIME, +			  NEIGH_VAR(parms, LOCKTIME)))  		goto nla_put_failure;  	return nla_nest_end(skb, nest); @@ -2010,44 +2048,57 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)  			switch (i) {  			case NDTPA_QUEUE_LEN: -				p->queue_len_bytes = nla_get_u32(tbp[i]) * -						     SKB_TRUESIZE(ETH_FRAME_LEN); +				NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, +					      nla_get_u32(tbp[i]) * +					      SKB_TRUESIZE(ETH_FRAME_LEN));  				break;  			case NDTPA_QUEUE_LENBYTES: -				p->queue_len_bytes = nla_get_u32(tbp[i]); +				NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, +					      nla_get_u32(tbp[i]));  				break;  			case NDTPA_PROXY_QLEN: -				p->proxy_qlen = nla_get_u32(tbp[i]); +				NEIGH_VAR_SET(p, PROXY_QLEN, +					      nla_get_u32(tbp[i]));  				break;  			case NDTPA_APP_PROBES: -				p->app_probes = nla_get_u32(tbp[i]); +				NEIGH_VAR_SET(p, APP_PROBES, +					      nla_get_u32(tbp[i]));  				break;  			case NDTPA_UCAST_PROBES: -				p->ucast_probes = nla_get_u32(tbp[i]); +				
NEIGH_VAR_SET(p, UCAST_PROBES, +					      nla_get_u32(tbp[i]));  				break;  			case NDTPA_MCAST_PROBES: -				p->mcast_probes = nla_get_u32(tbp[i]); +				NEIGH_VAR_SET(p, MCAST_PROBES, +					      nla_get_u32(tbp[i]));  				break;  			case NDTPA_BASE_REACHABLE_TIME: -				p->base_reachable_time = nla_get_msecs(tbp[i]); +				NEIGH_VAR_SET(p, BASE_REACHABLE_TIME, +					      nla_get_msecs(tbp[i]));  				break;  			case NDTPA_GC_STALETIME: -				p->gc_staletime = nla_get_msecs(tbp[i]); +				NEIGH_VAR_SET(p, GC_STALETIME, +					      nla_get_msecs(tbp[i]));  				break;  			case NDTPA_DELAY_PROBE_TIME: -				p->delay_probe_time = nla_get_msecs(tbp[i]); +				NEIGH_VAR_SET(p, DELAY_PROBE_TIME, +					      nla_get_msecs(tbp[i]));  				break;  			case NDTPA_RETRANS_TIME: -				p->retrans_time = nla_get_msecs(tbp[i]); +				NEIGH_VAR_SET(p, RETRANS_TIME, +					      nla_get_msecs(tbp[i]));  				break;  			case NDTPA_ANYCAST_DELAY: -				p->anycast_delay = nla_get_msecs(tbp[i]); +				NEIGH_VAR_SET(p, ANYCAST_DELAY, +					      nla_get_msecs(tbp[i]));  				break;  			case NDTPA_PROXY_DELAY: -				p->proxy_delay = nla_get_msecs(tbp[i]); +				NEIGH_VAR_SET(p, PROXY_DELAY, +					      nla_get_msecs(tbp[i]));  				break;  			case NDTPA_LOCKTIME: -				p->locktime = nla_get_msecs(tbp[i]); +				NEIGH_VAR_SET(p, LOCKTIME, +					      nla_get_msecs(tbp[i]));  				break;  			}  		} @@ -2198,7 +2249,7 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,  	ndm->ndm_pad1    = 0;  	ndm->ndm_pad2    = 0;  	ndm->ndm_flags	 = pn->flags | NTF_PROXY; -	ndm->ndm_type	 = NDA_DST; +	ndm->ndm_type	 = RTN_UNICAST;  	ndm->ndm_ifindex = pn->dev->ifindex;  	ndm->ndm_state	 = NUD_NONE; @@ -2788,133 +2839,167 @@ static int proc_unres_qlen(struct ctl_table *ctl, int write,  	return ret;  } -enum { -	NEIGH_VAR_MCAST_PROBE, -	NEIGH_VAR_UCAST_PROBE, -	NEIGH_VAR_APP_PROBE, -	NEIGH_VAR_RETRANS_TIME, -	NEIGH_VAR_BASE_REACHABLE_TIME, -	NEIGH_VAR_DELAY_PROBE_TIME, -	NEIGH_VAR_GC_STALETIME, -	NEIGH_VAR_QUEUE_LEN, -	NEIGH_VAR_QUEUE_LEN_BYTES, -	NEIGH_VAR_PROXY_QLEN, -	NEIGH_VAR_ANYCAST_DELAY, -	NEIGH_VAR_PROXY_DELAY, -	NEIGH_VAR_LOCKTIME, -	NEIGH_VAR_RETRANS_TIME_MS, -	NEIGH_VAR_BASE_REACHABLE_TIME_MS, -	NEIGH_VAR_GC_INTERVAL, -	NEIGH_VAR_GC_THRESH1, -	NEIGH_VAR_GC_THRESH2, -	NEIGH_VAR_GC_THRESH3, -	NEIGH_VAR_MAX -}; +static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, +						   int family) +{ +	switch (family) { +	case AF_INET: +		return __in_dev_arp_parms_get_rcu(dev); +	case AF_INET6: +		return __in6_dev_nd_parms_get_rcu(dev); +	} +	return NULL; +} + +static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, +				  int index) +{ +	struct net_device *dev; +	int family = neigh_parms_family(p); + +	rcu_read_lock(); +	for_each_netdev_rcu(net, dev) { +		struct neigh_parms *dst_p = +				neigh_get_dev_parms_rcu(dev, family); + +		if (dst_p && !test_bit(index, dst_p->data_state)) +			dst_p->data[index] = p->data[index]; +	} +	rcu_read_unlock(); +} + +static void neigh_proc_update(struct ctl_table *ctl, int write) +{ +	struct net_device *dev = ctl->extra1; +	struct neigh_parms *p = ctl->extra2; +	struct net *net = neigh_parms_net(p); +	int index = (int *) ctl->data - p->data; + +	if (!write) +		return; + +	set_bit(index, p->data_state); +	if (!dev) /* NULL dev means this is default value */ +		neigh_copy_dflt_parms(net, p, index); +} + +static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, +					   void __user *buffer, +					   size_t *lenp, loff_t *ppos) +{ +	struct 
ctl_table tmp = *ctl; +	int ret; + +	tmp.extra1 = &zero; +	tmp.extra2 = &int_max; + +	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); +	neigh_proc_update(ctl, write); +	return ret; +} + +int neigh_proc_dointvec(struct ctl_table *ctl, int write, +			void __user *buffer, size_t *lenp, loff_t *ppos) +{ +	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + +	neigh_proc_update(ctl, write); +	return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec); + +int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, +				void __user *buffer, +				size_t *lenp, loff_t *ppos) +{ +	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); + +	neigh_proc_update(ctl, write); +	return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); + +static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, +					      void __user *buffer, +					      size_t *lenp, loff_t *ppos) +{ +	int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); + +	neigh_proc_update(ctl, write); +	return ret; +} + +int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, +				   void __user *buffer, +				   size_t *lenp, loff_t *ppos) +{ +	int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); + +	neigh_proc_update(ctl, write); +	return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); + +static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, +					  void __user *buffer, +					  size_t *lenp, loff_t *ppos) +{ +	int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); + +	neigh_proc_update(ctl, write); +	return ret; +} + +#define NEIGH_PARMS_DATA_OFFSET(index)	\ +	(&((struct neigh_parms *) 0)->data[index]) + +#define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \ +	[NEIGH_VAR_ ## attr] = { \ +		.procname	= name, \ +		.data		= NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \ +		.maxlen		= sizeof(int), \ +		.mode		= mval, \ +		.proc_handler	= proc, \ +	} + +#define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \ +	NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax) + +#define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \ +	NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies) + +#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \ +	NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies) + +#define NEIGH_SYSCTL_MS_JIFFIES_ENTRY(attr, name) \ +	NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies) + +#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \ +	NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies) + +#define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \ +	NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen)  static struct neigh_sysctl_table {  	struct ctl_table_header *sysctl_header;  	struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];  } neigh_sysctl_template __read_mostly = {  	.neigh_vars = { -		[NEIGH_VAR_MCAST_PROBE] = { -			.procname	= "mcast_solicit", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.extra1 	= &zero, -			.extra2		= &int_max, -			.proc_handler	= proc_dointvec_minmax, -		}, -		[NEIGH_VAR_UCAST_PROBE] = { -			.procname	= "ucast_solicit", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.extra1 	= &zero, -			.extra2		= &int_max, -			.proc_handler	= proc_dointvec_minmax, -		}, -		[NEIGH_VAR_APP_PROBE] = { -			.procname	= "app_solicit", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.extra1 	= &zero, -			.extra2		= 
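/*
 * The template entries above do not point at real storage: via
 * NEIGH_PARMS_DATA_OFFSET() each .data field holds only the offset of
 * data[index] within struct neigh_parms (an offsetof computed from a NULL
 * base).  neigh_sysctl_register() below fixes them up by adding (long)p,
 * so a single static template can be reused for every neigh_parms instance
 * and address family.
 */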
&int_max, -			.proc_handler	= proc_dointvec_minmax, -		}, -		[NEIGH_VAR_RETRANS_TIME] = { -			.procname	= "retrans_time", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_userhz_jiffies, -		}, -		[NEIGH_VAR_BASE_REACHABLE_TIME] = { -			.procname	= "base_reachable_time", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_jiffies, -		}, -		[NEIGH_VAR_DELAY_PROBE_TIME] = { -			.procname	= "delay_first_probe_time", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_jiffies, -		}, -		[NEIGH_VAR_GC_STALETIME] = { -			.procname	= "gc_stale_time", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_jiffies, -		}, -		[NEIGH_VAR_QUEUE_LEN] = { -			.procname	= "unres_qlen", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_unres_qlen, -		}, -		[NEIGH_VAR_QUEUE_LEN_BYTES] = { -			.procname	= "unres_qlen_bytes", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.extra1		= &zero, -			.proc_handler   = proc_dointvec_minmax, -		}, -		[NEIGH_VAR_PROXY_QLEN] = { -			.procname	= "proxy_qlen", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.extra1 	= &zero, -			.extra2		= &int_max, -			.proc_handler	= proc_dointvec_minmax, -		}, -		[NEIGH_VAR_ANYCAST_DELAY] = { -			.procname	= "anycast_delay", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_userhz_jiffies, -		}, -		[NEIGH_VAR_PROXY_DELAY] = { -			.procname	= "proxy_delay", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_userhz_jiffies, -		}, -		[NEIGH_VAR_LOCKTIME] = { -			.procname	= "locktime", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_userhz_jiffies, -		}, -		[NEIGH_VAR_RETRANS_TIME_MS] = { -			.procname	= "retrans_time_ms", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_ms_jiffies, -		}, -		[NEIGH_VAR_BASE_REACHABLE_TIME_MS] = { -			.procname	= "base_reachable_time_ms", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_ms_jiffies, -		}, +		NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), +		NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"), +		NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"), +		NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), +		NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), +		NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), +		NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"), +		NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"), +		NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"), +		NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"), +		NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"), +		NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"), +		NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"), +		NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"), +		NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"),  		[NEIGH_VAR_GC_INTERVAL] = {  			.procname	= "gc_interval",  			.maxlen		= sizeof(int), @@ -2950,31 +3035,23 @@ static struct neigh_sysctl_table {  };  int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, -			  char *p_name, proc_handler *handler) +			  proc_handler *handler)  { +	int i;  	struct neigh_sysctl_table *t; -	
const char *dev_name_source = NULL; +	const char *dev_name_source;  	char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; +	char *p_name;  	t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL);  	if (!t)  		goto err; -	t->neigh_vars[NEIGH_VAR_MCAST_PROBE].data  = &p->mcast_probes; -	t->neigh_vars[NEIGH_VAR_UCAST_PROBE].data  = &p->ucast_probes; -	t->neigh_vars[NEIGH_VAR_APP_PROBE].data  = &p->app_probes; -	t->neigh_vars[NEIGH_VAR_RETRANS_TIME].data  = &p->retrans_time; -	t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].data  = &p->base_reachable_time; -	t->neigh_vars[NEIGH_VAR_DELAY_PROBE_TIME].data  = &p->delay_probe_time; -	t->neigh_vars[NEIGH_VAR_GC_STALETIME].data  = &p->gc_staletime; -	t->neigh_vars[NEIGH_VAR_QUEUE_LEN].data  = &p->queue_len_bytes; -	t->neigh_vars[NEIGH_VAR_QUEUE_LEN_BYTES].data  = &p->queue_len_bytes; -	t->neigh_vars[NEIGH_VAR_PROXY_QLEN].data  = &p->proxy_qlen; -	t->neigh_vars[NEIGH_VAR_ANYCAST_DELAY].data  = &p->anycast_delay; -	t->neigh_vars[NEIGH_VAR_PROXY_DELAY].data = &p->proxy_delay; -	t->neigh_vars[NEIGH_VAR_LOCKTIME].data = &p->locktime; -	t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].data  = &p->retrans_time; -	t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].data  = &p->base_reachable_time; +	for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) { +		t->neigh_vars[i].data += (long) p; +		t->neigh_vars[i].extra1 = dev; +		t->neigh_vars[i].extra2 = p; +	}  	if (dev) {  		dev_name_source = dev->name; @@ -2982,33 +3059,40 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,  		memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,  		       sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));  	} else { +		struct neigh_table *tbl = p->tbl;  		dev_name_source = "default"; -		t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1); -		t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1; -		t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2; -		t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3; +		t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval; +		t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1; +		t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2; +		t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3;  	} -  	if (handler) {  		/* RetransTime */  		t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler; -		t->neigh_vars[NEIGH_VAR_RETRANS_TIME].extra1 = dev;  		/* ReachableTime */  		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler; -		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].extra1 = dev;  		/* RetransTime (in milliseconds)*/  		t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler; -		t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].extra1 = dev;  		/* ReachableTime (in milliseconds) */  		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler; -		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev;  	}  	/* Don't export sysctls to unprivileged users */  	if (neigh_parms_net(p)->user_ns != &init_user_ns)  		t->neigh_vars[0].procname = NULL; +	switch (neigh_parms_family(p)) { +	case AF_INET: +	      p_name = "ipv4"; +	      break; +	case AF_INET6: +	      p_name = "ipv6"; +	      break; +	default: +	      BUG(); +	} +  	snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s",  		p_name, dev_name_source);  	t->sysctl_header = diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index d954b56b4e4..1cac29ebb05 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -104,6 +104,7 @@ static ssize_t 
netdev_store(struct device *dev, struct device_attribute *attr,  }  NETDEVICE_SHOW_RO(dev_id, fmt_hex); +NETDEVICE_SHOW_RO(dev_port, fmt_dec);  NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec);  NETDEVICE_SHOW_RO(addr_len, fmt_dec);  NETDEVICE_SHOW_RO(iflink, fmt_dec); @@ -252,6 +253,16 @@ static ssize_t operstate_show(struct device *dev,  }  static DEVICE_ATTR_RO(operstate); +static ssize_t carrier_changes_show(struct device *dev, +				    struct device_attribute *attr, +				    char *buf) +{ +	struct net_device *netdev = to_net_dev(dev); +	return sprintf(buf, fmt_dec, +		       atomic_read(&netdev->carrier_changes)); +} +static DEVICE_ATTR_RO(carrier_changes); +  /* read-write attributes */  static int change_mtu(struct net_device *net, unsigned long new_mtu) @@ -373,6 +384,7 @@ static struct attribute *net_class_attrs[] = {  	&dev_attr_netdev_group.attr,  	&dev_attr_type.attr,  	&dev_attr_dev_id.attr, +	&dev_attr_dev_port.attr,  	&dev_attr_iflink.attr,  	&dev_attr_ifindex.attr,  	&dev_attr_addr_assign_type.attr, @@ -384,6 +396,7 @@ static struct attribute *net_class_attrs[] = {  	&dev_attr_duplex.attr,  	&dev_attr_dormant.attr,  	&dev_attr_operstate.attr, +	&dev_attr_carrier_changes.attr,  	&dev_attr_ifalias.attr,  	&dev_attr_carrier.attr,  	&dev_attr_mtu.attr, @@ -498,17 +511,7 @@ static struct attribute_group wireless_group = {  #define net_class_groups	NULL  #endif /* CONFIG_SYSFS */ -#ifdef CONFIG_RPS -/* - * RX queue sysfs structures and functions. - */ -struct rx_queue_attribute { -	struct attribute attr; -	ssize_t (*show)(struct netdev_rx_queue *queue, -	    struct rx_queue_attribute *attr, char *buf); -	ssize_t (*store)(struct netdev_rx_queue *queue, -	    struct rx_queue_attribute *attr, const char *buf, size_t len); -}; +#ifdef CONFIG_SYSFS  #define to_rx_queue_attr(_attr) container_of(_attr,		\      struct rx_queue_attribute, attr) @@ -543,6 +546,7 @@ static const struct sysfs_ops rx_queue_sysfs_ops = {  	.store = rx_queue_attr_store,  }; +#ifdef CONFIG_RPS  static ssize_t show_rps_map(struct netdev_rx_queue *queue,  			    struct rx_queue_attribute *attribute, char *buf)  { @@ -676,8 +680,8 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,  		while ((mask | (mask >> 1)) != mask)  			mask |= (mask >> 1);  		/* On 64 bit arches, must check mask fits in table->mask (u32), -		 * and on 32bit arches, must check RPS_DEV_FLOW_TABLE_SIZE(mask + 1) -		 * doesnt overflow. +		 * and on 32bit arches, must check +		 * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow.  		 
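 * (The loop just above rounds 'mask' up to the next 2^k - 1 value by
 * smearing its highest set bit downwards, e.g. 10 (0b1010) becomes
 * 15 (0b1111), so mask can be used directly as the hash mask and
 * mask + 1 as the number of flow table entries.)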
*/  #if BITS_PER_LONG > 32  		if (mask > (unsigned long)(u32)mask) @@ -718,16 +722,20 @@ static struct rx_queue_attribute rps_cpus_attribute =  static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =  	__ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,  	    show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); +#endif /* CONFIG_RPS */  static struct attribute *rx_queue_default_attrs[] = { +#ifdef CONFIG_RPS  	&rps_cpus_attribute.attr,  	&rps_dev_flow_table_cnt_attribute.attr, +#endif  	NULL  };  static void rx_queue_release(struct kobject *kobj)  {  	struct netdev_rx_queue *queue = to_rx_queue(kobj); +#ifdef CONFIG_RPS  	struct rps_map *map;  	struct rps_dev_flow_table *flow_table; @@ -743,15 +751,29 @@ static void rx_queue_release(struct kobject *kobj)  		RCU_INIT_POINTER(queue->rps_flow_table, NULL);  		call_rcu(&flow_table->rcu, rps_dev_flow_table_release);  	} +#endif  	memset(kobj, 0, sizeof(*kobj));  	dev_put(queue->dev);  } +static const void *rx_queue_namespace(struct kobject *kobj) +{ +	struct netdev_rx_queue *queue = to_rx_queue(kobj); +	struct device *dev = &queue->dev->dev; +	const void *ns = NULL; + +	if (dev->class && dev->class->ns_type) +		ns = dev->class->namespace(dev); + +	return ns; +} +  static struct kobj_type rx_queue_ktype = {  	.sysfs_ops = &rx_queue_sysfs_ops,  	.release = rx_queue_release,  	.default_attrs = rx_queue_default_attrs, +	.namespace = rx_queue_namespace  };  static int rx_queue_add_kobject(struct net_device *net, int index) @@ -763,25 +785,36 @@ static int rx_queue_add_kobject(struct net_device *net, int index)  	kobj->kset = net->queues_kset;  	error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,  	    "rx-%u", index); -	if (error) { -		kobject_put(kobj); -		return error; +	if (error) +		goto exit; + +	if (net->sysfs_rx_queue_group) { +		error = sysfs_create_group(kobj, net->sysfs_rx_queue_group); +		if (error) +			goto exit;  	}  	kobject_uevent(kobj, KOBJ_ADD);  	dev_hold(queue->dev);  	return error; +exit: +	kobject_put(kobj); +	return error;  } -#endif /* CONFIG_RPS */ +#endif /* CONFIG_SYSFS */  int  net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)  { -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS  	int i;  	int error = 0; +#ifndef CONFIG_RPS +	if (!net->sysfs_rx_queue_group) +		return 0; +#endif  	for (i = old_num; i < new_num; i++) {  		error = rx_queue_add_kobject(net, i);  		if (error) { @@ -790,8 +823,12 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)  		}  	} -	while (--i >= new_num) +	while (--i >= new_num) { +		if (net->sysfs_rx_queue_group) +			sysfs_remove_group(&net->_rx[i].kobj, +					   net->sysfs_rx_queue_group);  		kobject_put(&net->_rx[i].kobj); +	}  	return error;  #else @@ -972,15 +1009,12 @@ static struct attribute_group dql_group = {  #endif /* CONFIG_BQL */  #ifdef CONFIG_XPS -static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) +static unsigned int get_netdev_queue_index(struct netdev_queue *queue)  {  	struct net_device *dev = queue->dev; -	int i; - -	for (i = 0; i < dev->num_tx_queues; i++) -		if (queue == &dev->_tx[i]) -			break; +	unsigned int i; +	i = queue - dev->_tx;  	BUG_ON(i >= dev->num_tx_queues);  	return i; @@ -1082,10 +1116,23 @@ static void netdev_queue_release(struct kobject *kobj)  	dev_put(queue->dev);  } +static const void *netdev_queue_namespace(struct kobject *kobj) +{ +	struct netdev_queue *queue = to_netdev_queue(kobj); +	struct device *dev = &queue->dev->dev; +	const void *ns = NULL; + +	if (dev->class && 
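/* The ->namespace callback added here (and its twin for tx queues below)
 * reports the network namespace of the owning device, so the per-queue
 * sysfs directories are tagged and filtered the same way as their parent
 * netdev when sysfs is viewed from inside a container. */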
dev->class->ns_type) +		ns = dev->class->namespace(dev); + +	return ns; +} +  static struct kobj_type netdev_queue_ktype = {  	.sysfs_ops = &netdev_queue_sysfs_ops,  	.release = netdev_queue_release,  	.default_attrs = netdev_queue_default_attrs, +	.namespace = netdev_queue_namespace,  };  static int netdev_queue_add_kobject(struct net_device *net, int index) @@ -1155,9 +1202,6 @@ static int register_queue_kobjects(struct net_device *net)  	    NULL, &net->dev.kobj);  	if (!net->queues_kset)  		return -ENOMEM; -#endif - -#ifdef CONFIG_RPS  	real_rx = net->real_num_rx_queues;  #endif  	real_tx = net->real_num_tx_queues; @@ -1184,7 +1228,7 @@ static void remove_queue_kobjects(struct net_device *net)  {  	int real_rx = 0, real_tx = 0; -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS  	real_rx = net->real_num_rx_queues;  #endif  	real_tx = net->real_num_tx_queues; @@ -1263,7 +1307,7 @@ static void netdev_release(struct device *d)  	BUG_ON(dev->reg_state != NETREG_RELEASED);  	kfree(dev->ifalias); -	kfree((char *)dev - dev->padded); +	netdev_freemem(dev);  }  static const void *net_namespace(struct device *d) @@ -1344,19 +1388,21 @@ int netdev_register_kobject(struct net_device *net)  	return error;  } -int netdev_class_create_file(struct class_attribute *class_attr) +int netdev_class_create_file_ns(struct class_attribute *class_attr, +				const void *ns)  { -	return class_create_file(&net_class, class_attr); +	return class_create_file_ns(&net_class, class_attr, ns);  } -EXPORT_SYMBOL(netdev_class_create_file); +EXPORT_SYMBOL(netdev_class_create_file_ns); -void netdev_class_remove_file(struct class_attribute *class_attr) +void netdev_class_remove_file_ns(struct class_attribute *class_attr, +				 const void *ns)  { -	class_remove_file(&net_class, class_attr); +	class_remove_file_ns(&net_class, class_attr, ns);  } -EXPORT_SYMBOL(netdev_class_remove_file); +EXPORT_SYMBOL(netdev_class_remove_file_ns); -int netdev_kobject_init(void) +int __init netdev_kobject_init(void)  {  	kobj_ns_type_register(&net_ns_type_operations);  	return class_register(&net_class); diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h index bd7751ec1c4..2745a1b51e0 100644 --- a/net/core/net-sysfs.h +++ b/net/core/net-sysfs.h @@ -1,7 +1,7 @@  #ifndef __NET_SYSFS_H__  #define __NET_SYSFS_H__ -int netdev_kobject_init(void); +int __init netdev_kobject_init(void);  int netdev_register_kobject(struct net_device *);  void netdev_unregister_kobject(struct net_device *);  int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 81d3a9a0845..85b62691f4f 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -24,7 +24,7 @@  static LIST_HEAD(pernet_list);  static struct list_head *first_device = &pernet_list; -static DEFINE_MUTEX(net_mutex); +DEFINE_MUTEX(net_mutex);  LIST_HEAD(net_namespace_list);  EXPORT_SYMBOL_GPL(net_namespace_list); @@ -273,7 +273,7 @@ static void cleanup_net(struct work_struct *work)  {  	const struct pernet_operations *ops;  	struct net *net, *tmp; -	LIST_HEAD(net_kill_list); +	struct list_head net_kill_list;  	LIST_HEAD(net_exit_list);  	/* Atomically snapshot the list of namespaces to cleanup */ diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c new file mode 100644 index 00000000000..30d903b19c6 --- /dev/null +++ b/net/core/netclassid_cgroup.c @@ -0,0 +1,111 @@ +/* + * net/core/netclassid_cgroup.c	Classid Cgroupfs Handling + * + *		This program is free software; you can redistribute 
it and/or + *		modify it under the terms of the GNU General Public License + *		as published by the Free Software Foundation; either version + *		2 of the License, or (at your option) any later version. + * + * Authors:	Thomas Graf <tgraf@suug.ch> + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/cgroup.h> +#include <linux/fdtable.h> +#include <net/cls_cgroup.h> +#include <net/sock.h> + +static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css) +{ +	return css ? container_of(css, struct cgroup_cls_state, css) : NULL; +} + +struct cgroup_cls_state *task_cls_state(struct task_struct *p) +{ +	return css_cls_state(task_css(p, net_cls_cgrp_id)); +} +EXPORT_SYMBOL_GPL(task_cls_state); + +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) +{ +	struct cgroup_cls_state *cs; + +	cs = kzalloc(sizeof(*cs), GFP_KERNEL); +	if (!cs) +		return ERR_PTR(-ENOMEM); + +	return &cs->css; +} + +static int cgrp_css_online(struct cgroup_subsys_state *css) +{ +	struct cgroup_cls_state *cs = css_cls_state(css); +	struct cgroup_cls_state *parent = css_cls_state(css->parent); + +	if (parent) +		cs->classid = parent->classid; + +	return 0; +} + +static void cgrp_css_free(struct cgroup_subsys_state *css) +{ +	kfree(css_cls_state(css)); +} + +static int update_classid(const void *v, struct file *file, unsigned n) +{ +	int err; +	struct socket *sock = sock_from_file(file, &err); + +	if (sock) +		sock->sk->sk_classid = (u32)(unsigned long)v; + +	return 0; +} + +static void cgrp_attach(struct cgroup_subsys_state *css, +			struct cgroup_taskset *tset) +{ +	struct cgroup_cls_state *cs = css_cls_state(css); +	void *v = (void *)(unsigned long)cs->classid; +	struct task_struct *p; + +	cgroup_taskset_for_each(p, tset) { +		task_lock(p); +		iterate_fd(p->files, 0, update_classid, v); +		task_unlock(p); +	} +} + +static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) +{ +	return css_cls_state(css)->classid; +} + +static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, +			 u64 value) +{ +	css_cls_state(css)->classid = (u32) value; + +	return 0; +} + +static struct cftype ss_files[] = { +	{ +		.name		= "classid", +		.read_u64	= read_classid, +		.write_u64	= write_classid, +	}, +	{ }	/* terminate */ +}; + +struct cgroup_subsys net_cls_cgrp_subsys = { +	.css_alloc		= cgrp_css_alloc, +	.css_online		= cgrp_css_online, +	.css_free		= cgrp_css_free, +	.attach			= cgrp_attach, +	.base_cftypes		= ss_files, +}; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 2c637e9a0b2..e33937fb32a 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -46,13 +46,9 @@  static struct sk_buff_head skb_pool; -static atomic_t trapped; -  DEFINE_STATIC_SRCU(netpoll_srcu);  #define USEC_PER_POLL	50 -#define NETPOLL_RX_ENABLED  1 -#define NETPOLL_RX_DROP     2  #define MAX_SKB_SIZE							\  	(sizeof(struct ethhdr) +					\ @@ -61,7 +57,6 @@ DEFINE_STATIC_SRCU(netpoll_srcu);  	 MAX_UDP_CHUNK)  static void zap_completion_queue(void); -static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo);  static void netpoll_async_cleanup(struct work_struct *work);  static unsigned int carrier_timeout = 4; @@ -74,6 +69,37 @@ module_param(carrier_timeout, uint, 0644);  #define np_notice(np, fmt, ...)				
\  	pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) +static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, +			      struct netdev_queue *txq) +{ +	const struct net_device_ops *ops = dev->netdev_ops; +	int status = NETDEV_TX_OK; +	netdev_features_t features; + +	features = netif_skb_features(skb); + +	if (vlan_tx_tag_present(skb) && +	    !vlan_hw_offload_capable(features, skb->vlan_proto)) { +		skb = __vlan_put_tag(skb, skb->vlan_proto, +				     vlan_tx_tag_get(skb)); +		if (unlikely(!skb)) { +			/* This is actually a packet drop, but we +			 * don't want the code that calls this +			 * function to try and operate on a NULL skb. +			 */ +			goto out; +		} +		skb->vlan_tci = 0; +	} + +	status = ops->ndo_start_xmit(skb, dev); +	if (status == NETDEV_TX_OK) +		txq_trans_update(txq); + +out: +	return status; +} +  static void queue_process(struct work_struct *work)  {  	struct netpoll_info *npinfo = @@ -83,51 +109,31 @@ static void queue_process(struct work_struct *work)  	while ((skb = skb_dequeue(&npinfo->txq))) {  		struct net_device *dev = skb->dev; -		const struct net_device_ops *ops = dev->netdev_ops;  		struct netdev_queue *txq;  		if (!netif_device_present(dev) || !netif_running(dev)) { -			__kfree_skb(skb); +			kfree_skb(skb);  			continue;  		}  		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));  		local_irq_save(flags); -		__netif_tx_lock(txq, smp_processor_id()); +		HARD_TX_LOCK(dev, txq, smp_processor_id());  		if (netif_xmit_frozen_or_stopped(txq) || -		    ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { +		    netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {  			skb_queue_head(&npinfo->txq, skb); -			__netif_tx_unlock(txq); +			HARD_TX_UNLOCK(dev, txq);  			local_irq_restore(flags);  			schedule_delayed_work(&npinfo->tx_work, HZ/10);  			return;  		} -		__netif_tx_unlock(txq); +		HARD_TX_UNLOCK(dev, txq);  		local_irq_restore(flags);  	}  } -static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh, -			    unsigned short ulen, __be32 saddr, __be32 daddr) -{ -	__wsum psum; - -	if (uh->check == 0 || skb_csum_unnecessary(skb)) -		return 0; - -	psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); - -	if (skb->ip_summed == CHECKSUM_COMPLETE && -	    !csum_fold(csum_add(psum, skb->csum))) -		return 0; - -	skb->csum = psum; - -	return __skb_checksum_complete(skb); -} -  /*   * Check whether delayed processing was scheduled for our NIC. If so,   * we attempt to grab the poll lock and use ->poll() to pump the card. @@ -138,14 +144,8 @@ static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh,   * trylock here and interrupts are already disabled in the softirq   * case. Further, we test the poll_owner to avoid recursion on UP   * systems where the lock doesn't exist. - * - * In cases where there is bi-directional communications, reading only - * one message at a time can lead to packets being dropped by the - * network adapter, forcing superfluous retries and possibly timeouts. - * Thus, we set our budget to greater than 1.   
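 * (This paragraph goes away because the patch removes netpoll's rx
 * trapping altogether: netpoll no longer consumes incoming packets, so
 * netpoll_poll_dev() below calls poll_napi() with a budget of 0 and the
 * drivers' poll routines are only expected to reap tx completions.)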
*/ -static int poll_one_napi(struct netpoll_info *npinfo, -			 struct napi_struct *napi, int budget) +static int poll_one_napi(struct napi_struct *napi, int budget)  {  	int work; @@ -156,52 +156,35 @@ static int poll_one_napi(struct netpoll_info *npinfo,  	if (!test_bit(NAPI_STATE_SCHED, &napi->state))  		return budget; -	npinfo->rx_flags |= NETPOLL_RX_DROP; -	atomic_inc(&trapped);  	set_bit(NAPI_STATE_NPSVC, &napi->state);  	work = napi->poll(napi, budget); +	WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll);  	trace_napi_poll(napi);  	clear_bit(NAPI_STATE_NPSVC, &napi->state); -	atomic_dec(&trapped); -	npinfo->rx_flags &= ~NETPOLL_RX_DROP;  	return budget - work;  } -static void poll_napi(struct net_device *dev) +static void poll_napi(struct net_device *dev, int budget)  {  	struct napi_struct *napi; -	int budget = 16;  	list_for_each_entry(napi, &dev->napi_list, dev_list) {  		if (napi->poll_owner != smp_processor_id() &&  		    spin_trylock(&napi->poll_lock)) { -			budget = poll_one_napi(rcu_dereference_bh(dev->npinfo), -					       napi, budget); +			budget = poll_one_napi(napi, budget);  			spin_unlock(&napi->poll_lock); - -			if (!budget) -				break;  		}  	}  } -static void service_neigh_queue(struct netpoll_info *npi) -{ -	if (npi) { -		struct sk_buff *skb; - -		while ((skb = skb_dequeue(&npi->neigh_tx))) -			netpoll_neigh_reply(skb, npi); -	} -} -  static void netpoll_poll_dev(struct net_device *dev)  {  	const struct net_device_ops *ops;  	struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); +	int budget = 0;  	/* Don't do any rx activity if the dev_lock mutex is held  	 * the dev_open/close paths use this to block netpoll activity @@ -224,31 +207,14 @@ static void netpoll_poll_dev(struct net_device *dev)  	/* Process pending work on NIC */  	ops->ndo_poll_controller(dev); -	poll_napi(dev); +	poll_napi(dev, budget);  	up(&ni->dev_lock); -	if (dev->flags & IFF_SLAVE) { -		if (ni) { -			struct net_device *bond_dev; -			struct sk_buff *skb; -			struct netpoll_info *bond_ni; - -			bond_dev = netdev_master_upper_dev_get_rcu(dev); -			bond_ni = rcu_dereference_bh(bond_dev->npinfo); -			while ((skb = skb_dequeue(&ni->neigh_tx))) { -				skb->dev = bond_dev; -				skb_queue_tail(&bond_ni->neigh_tx, skb); -			} -		} -	} - -	service_neigh_queue(ni); -  	zap_completion_queue();  } -void netpoll_rx_disable(struct net_device *dev) +void netpoll_poll_disable(struct net_device *dev)  {  	struct netpoll_info *ni;  	int idx; @@ -259,9 +225,9 @@ void netpoll_rx_disable(struct net_device *dev)  		down(&ni->dev_lock);  	srcu_read_unlock(&netpoll_srcu, idx);  } -EXPORT_SYMBOL(netpoll_rx_disable); +EXPORT_SYMBOL(netpoll_poll_disable); -void netpoll_rx_enable(struct net_device *dev) +void netpoll_poll_enable(struct net_device *dev)  {  	struct netpoll_info *ni;  	rcu_read_lock(); @@ -270,7 +236,7 @@ void netpoll_rx_enable(struct net_device *dev)  		up(&ni->dev_lock);  	rcu_read_unlock();  } -EXPORT_SYMBOL(netpoll_rx_enable); +EXPORT_SYMBOL(netpoll_poll_enable);  static void refill_skbs(void)  { @@ -304,7 +270,7 @@ static void zap_completion_queue(void)  		while (clist != NULL) {  			struct sk_buff *skb = clist;  			clist = clist->next; -			if (skb->destructor) { +			if (!skb_irq_freeable(skb)) {  				atomic_inc(&skb->users);  				dev_kfree_skb_any(skb); /* put this one back */  			} else { @@ -359,7 +325,6 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,  {  	int status = NETDEV_TX_BUSY;  	unsigned long tries; -	const struct net_device_ops *ops = 
dev->netdev_ops;  	/* It is up to the caller to keep npinfo alive. */  	struct netpoll_info *npinfo; @@ -367,7 +332,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,  	npinfo = rcu_dereference_bh(np->dev->npinfo);  	if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { -		__kfree_skb(skb); +		dev_kfree_skb_irq(skb);  		return;  	} @@ -375,27 +340,16 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,  	if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {  		struct netdev_queue *txq; -		txq = netdev_pick_tx(dev, skb); +		txq = netdev_pick_tx(dev, skb, NULL);  		/* try until next clock tick */  		for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;  		     tries > 0; --tries) { -			if (__netif_tx_trylock(txq)) { -				if (!netif_xmit_stopped(txq)) { -					if (vlan_tx_tag_present(skb) && -					    !vlan_hw_offload_capable(netif_skb_features(skb), -								     skb->vlan_proto)) { -						skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb)); -						if (unlikely(!skb)) -							break; -						skb->vlan_tci = 0; -					} - -					status = ops->ndo_start_xmit(skb, dev); -					if (status == NETDEV_TX_OK) -						txq_trans_update(txq); -				} -				__netif_tx_unlock(txq); +			if (HARD_TX_TRYLOCK(dev, txq)) { +				if (!netif_xmit_stopped(txq)) +					status = netpoll_start_xmit(skb, dev, txq); + +				HARD_TX_UNLOCK(dev, txq);  				if (status == NETDEV_TX_OK)  					break; @@ -410,7 +364,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,  		WARN_ONCE(!irqs_disabled(),  			"netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n", -			dev->name, ops->ndo_start_xmit); +			dev->name, dev->netdev_ops->ndo_start_xmit);  	} @@ -513,8 +467,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)  		skb->protocol = eth->h_proto = htons(ETH_P_IP);  	} -	memcpy(eth->h_source, np->dev->dev_addr, ETH_ALEN); -	memcpy(eth->h_dest, np->remote_mac, ETH_ALEN); +	ether_addr_copy(eth->h_source, np->dev->dev_addr); +	ether_addr_copy(eth->h_dest, np->remote_mac);  	skb->dev = np->dev; @@ -522,379 +476,6 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)  }  EXPORT_SYMBOL(netpoll_send_udp); -static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo) -{ -	int size, type = ARPOP_REPLY; -	__be32 sip, tip; -	unsigned char *sha; -	struct sk_buff *send_skb; -	struct netpoll *np, *tmp; -	unsigned long flags; -	int hlen, tlen; -	int hits = 0, proto; - -	if (list_empty(&npinfo->rx_np)) -		return; - -	/* Before checking the packet, we do some early -	   inspection whether this is interesting at all */ -	spin_lock_irqsave(&npinfo->rx_lock, flags); -	list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { -		if (np->dev == skb->dev) -			hits++; -	} -	spin_unlock_irqrestore(&npinfo->rx_lock, flags); - -	/* No netpoll struct is using this dev */ -	if (!hits) -		return; - -	proto = ntohs(eth_hdr(skb)->h_proto); -	if (proto == ETH_P_IP) { -		struct arphdr *arp; -		unsigned char *arp_ptr; -		/* No arp on this interface */ -		if (skb->dev->flags & IFF_NOARP) -			return; - -		if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) -			return; - -		skb_reset_network_header(skb); -		skb_reset_transport_header(skb); -		arp = arp_hdr(skb); - -		if ((arp->ar_hrd != htons(ARPHRD_ETHER) && -		     arp->ar_hrd != htons(ARPHRD_IEEE802)) || -		    arp->ar_pro != htons(ETH_P_IP) || -		    arp->ar_op != htons(ARPOP_REQUEST)) -			return; - -		arp_ptr = (unsigned char *)(arp+1); -		/* save the 
location of the src hw addr */ -		sha = arp_ptr; -		arp_ptr += skb->dev->addr_len; -		memcpy(&sip, arp_ptr, 4); -		arp_ptr += 4; -		/* If we actually cared about dst hw addr, -		   it would get copied here */ -		arp_ptr += skb->dev->addr_len; -		memcpy(&tip, arp_ptr, 4); - -		/* Should we ignore arp? */ -		if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) -			return; - -		size = arp_hdr_len(skb->dev); - -		spin_lock_irqsave(&npinfo->rx_lock, flags); -		list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { -			if (tip != np->local_ip.ip) -				continue; - -			hlen = LL_RESERVED_SPACE(np->dev); -			tlen = np->dev->needed_tailroom; -			send_skb = find_skb(np, size + hlen + tlen, hlen); -			if (!send_skb) -				continue; - -			skb_reset_network_header(send_skb); -			arp = (struct arphdr *) skb_put(send_skb, size); -			send_skb->dev = skb->dev; -			send_skb->protocol = htons(ETH_P_ARP); - -			/* Fill the device header for the ARP frame */ -			if (dev_hard_header(send_skb, skb->dev, ETH_P_ARP, -					    sha, np->dev->dev_addr, -					    send_skb->len) < 0) { -				kfree_skb(send_skb); -				continue; -			} - -			/* -			 * Fill out the arp protocol part. -			 * -			 * we only support ethernet device type, -			 * which (according to RFC 1390) should -			 * always equal 1 (Ethernet). -			 */ - -			arp->ar_hrd = htons(np->dev->type); -			arp->ar_pro = htons(ETH_P_IP); -			arp->ar_hln = np->dev->addr_len; -			arp->ar_pln = 4; -			arp->ar_op = htons(type); - -			arp_ptr = (unsigned char *)(arp + 1); -			memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); -			arp_ptr += np->dev->addr_len; -			memcpy(arp_ptr, &tip, 4); -			arp_ptr += 4; -			memcpy(arp_ptr, sha, np->dev->addr_len); -			arp_ptr += np->dev->addr_len; -			memcpy(arp_ptr, &sip, 4); - -			netpoll_send_skb(np, send_skb); - -			/* If there are several rx_hooks for the same address, -			   we're fine by sending a single reply */ -			break; -		} -		spin_unlock_irqrestore(&npinfo->rx_lock, flags); -	} else if( proto == ETH_P_IPV6) { -#if IS_ENABLED(CONFIG_IPV6) -		struct nd_msg *msg; -		u8 *lladdr = NULL; -		struct ipv6hdr *hdr; -		struct icmp6hdr *icmp6h; -		const struct in6_addr *saddr; -		const struct in6_addr *daddr; -		struct inet6_dev *in6_dev = NULL; -		struct in6_addr *target; - -		in6_dev = in6_dev_get(skb->dev); -		if (!in6_dev || !in6_dev->cnf.accept_ra) -			return; - -		if (!pskb_may_pull(skb, skb->len)) -			return; - -		msg = (struct nd_msg *)skb_transport_header(skb); - -		__skb_push(skb, skb->data - skb_transport_header(skb)); - -		if (ipv6_hdr(skb)->hop_limit != 255) -			return; -		if (msg->icmph.icmp6_code != 0) -			return; -		if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) -			return; - -		saddr = &ipv6_hdr(skb)->saddr; -		daddr = &ipv6_hdr(skb)->daddr; - -		size = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); - -		spin_lock_irqsave(&npinfo->rx_lock, flags); -		list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { -			if (!ipv6_addr_equal(daddr, &np->local_ip.in6)) -				continue; - -			hlen = LL_RESERVED_SPACE(np->dev); -			tlen = np->dev->needed_tailroom; -			send_skb = find_skb(np, size + hlen + tlen, hlen); -			if (!send_skb) -				continue; - -			send_skb->protocol = htons(ETH_P_IPV6); -			send_skb->dev = skb->dev; - -			skb_reset_network_header(send_skb); -			hdr = (struct ipv6hdr *) skb_put(send_skb, sizeof(struct ipv6hdr)); -			*(__be32*)hdr = htonl(0x60000000); -			hdr->payload_len = htons(size); -			hdr->nexthdr = IPPROTO_ICMPV6; -			hdr->hop_limit = 255; -			hdr->saddr = *saddr; -			hdr->daddr = 
*daddr; - -			icmp6h = (struct icmp6hdr *) skb_put(send_skb, sizeof(struct icmp6hdr)); -			icmp6h->icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; -			icmp6h->icmp6_router = 0; -			icmp6h->icmp6_solicited = 1; - -			target = (struct in6_addr *) skb_put(send_skb, sizeof(struct in6_addr)); -			*target = msg->target; -			icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, size, -							      IPPROTO_ICMPV6, -							      csum_partial(icmp6h, -									   size, 0)); - -			if (dev_hard_header(send_skb, skb->dev, ETH_P_IPV6, -					    lladdr, np->dev->dev_addr, -					    send_skb->len) < 0) { -				kfree_skb(send_skb); -				continue; -			} - -			netpoll_send_skb(np, send_skb); - -			/* If there are several rx_hooks for the same address, -			   we're fine by sending a single reply */ -			break; -		} -		spin_unlock_irqrestore(&npinfo->rx_lock, flags); -#endif -	} -} - -static bool pkt_is_ns(struct sk_buff *skb) -{ -	struct nd_msg *msg; -	struct ipv6hdr *hdr; - -	if (skb->protocol != htons(ETH_P_ARP)) -		return false; -	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg))) -		return false; - -	msg = (struct nd_msg *)skb_transport_header(skb); -	__skb_push(skb, skb->data - skb_transport_header(skb)); -	hdr = ipv6_hdr(skb); - -	if (hdr->nexthdr != IPPROTO_ICMPV6) -		return false; -	if (hdr->hop_limit != 255) -		return false; -	if (msg->icmph.icmp6_code != 0) -		return false; -	if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) -		return false; - -	return true; -} - -int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) -{ -	int proto, len, ulen; -	int hits = 0; -	const struct iphdr *iph; -	struct udphdr *uh; -	struct netpoll *np, *tmp; - -	if (list_empty(&npinfo->rx_np)) -		goto out; - -	if (skb->dev->type != ARPHRD_ETHER) -		goto out; - -	/* check if netpoll clients need ARP */ -	if (skb->protocol == htons(ETH_P_ARP) && atomic_read(&trapped)) { -		skb_queue_tail(&npinfo->neigh_tx, skb); -		return 1; -	} else if (pkt_is_ns(skb) && atomic_read(&trapped)) { -		skb_queue_tail(&npinfo->neigh_tx, skb); -		return 1; -	} - -	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { -		skb = vlan_untag(skb); -		if (unlikely(!skb)) -			goto out; -	} - -	proto = ntohs(eth_hdr(skb)->h_proto); -	if (proto != ETH_P_IP && proto != ETH_P_IPV6) -		goto out; -	if (skb->pkt_type == PACKET_OTHERHOST) -		goto out; -	if (skb_shared(skb)) -		goto out; - -	if (proto == ETH_P_IP) { -		if (!pskb_may_pull(skb, sizeof(struct iphdr))) -			goto out; -		iph = (struct iphdr *)skb->data; -		if (iph->ihl < 5 || iph->version != 4) -			goto out; -		if (!pskb_may_pull(skb, iph->ihl*4)) -			goto out; -		iph = (struct iphdr *)skb->data; -		if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) -			goto out; - -		len = ntohs(iph->tot_len); -		if (skb->len < len || len < iph->ihl*4) -			goto out; - -		/* -		 * Our transport medium may have padded the buffer out. -		 * Now We trim to the true length of the frame. 
-		 */ -		if (pskb_trim_rcsum(skb, len)) -			goto out; - -		iph = (struct iphdr *)skb->data; -		if (iph->protocol != IPPROTO_UDP) -			goto out; - -		len -= iph->ihl*4; -		uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); -		ulen = ntohs(uh->len); - -		if (ulen != len) -			goto out; -		if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) -			goto out; -		list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { -			if (np->local_ip.ip && np->local_ip.ip != iph->daddr) -				continue; -			if (np->remote_ip.ip && np->remote_ip.ip != iph->saddr) -				continue; -			if (np->local_port && np->local_port != ntohs(uh->dest)) -				continue; - -			np->rx_hook(np, ntohs(uh->source), -				       (char *)(uh+1), -				       ulen - sizeof(struct udphdr)); -			hits++; -		} -	} else { -#if IS_ENABLED(CONFIG_IPV6) -		const struct ipv6hdr *ip6h; - -		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) -			goto out; -		ip6h = (struct ipv6hdr *)skb->data; -		if (ip6h->version != 6) -			goto out; -		len = ntohs(ip6h->payload_len); -		if (!len) -			goto out; -		if (len + sizeof(struct ipv6hdr) > skb->len) -			goto out; -		if (pskb_trim_rcsum(skb, len + sizeof(struct ipv6hdr))) -			goto out; -		ip6h = ipv6_hdr(skb); -		if (!pskb_may_pull(skb, sizeof(struct udphdr))) -			goto out; -		uh = udp_hdr(skb); -		ulen = ntohs(uh->len); -		if (ulen != skb->len) -			goto out; -		if (udp6_csum_init(skb, uh, IPPROTO_UDP)) -			goto out; -		list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { -			if (!ipv6_addr_equal(&np->local_ip.in6, &ip6h->daddr)) -				continue; -			if (!ipv6_addr_equal(&np->remote_ip.in6, &ip6h->saddr)) -				continue; -			if (np->local_port && np->local_port != ntohs(uh->dest)) -				continue; - -			np->rx_hook(np, ntohs(uh->source), -				       (char *)(uh+1), -				       ulen - sizeof(struct udphdr)); -			hits++; -		} -#endif -	} - -	if (!hits) -		goto out; - -	kfree_skb(skb); -	return 1; - -out: -	if (atomic_read(&trapped)) { -		kfree_skb(skb); -		return 1; -	} - -	return 0; -} -  void netpoll_print_options(struct netpoll *np)  {  	np_info(np, "local port %d\n", np->local_port); @@ -936,6 +517,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)  {  	char *cur=opt, *delim;  	int ipv6; +	bool ipversion_set = false;  	if (*cur != '@') {  		if ((delim = strchr(cur, '@')) == NULL) @@ -948,6 +530,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)  	cur++;  	if (*cur != '/') { +		ipversion_set = true;  		if ((delim = strchr(cur, '/')) == NULL)  			goto parse_failed;  		*delim = 0; @@ -990,7 +573,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)  	ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip);  	if (ipv6 < 0)  		goto parse_failed; -	else if (np->ipv6 != (bool)ipv6) +	else if (ipversion_set && np->ipv6 != (bool)ipv6)  		goto parse_failed;  	else  		np->ipv6 = (bool)ipv6; @@ -1012,11 +595,10 @@ int netpoll_parse_options(struct netpoll *np, char *opt)  }  EXPORT_SYMBOL(netpoll_parse_options); -int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) +int __netpoll_setup(struct netpoll *np, struct net_device *ndev)  {  	struct netpoll_info *npinfo;  	const struct net_device_ops *ops; -	unsigned long flags;  	int err;  	np->dev = ndev; @@ -1032,18 +614,13 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)  	}  	if (!ndev->npinfo) { -		npinfo = kmalloc(sizeof(*npinfo), gfp); +		npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);  		if (!npinfo) {  			err = -ENOMEM;  			goto out;  		} -		npinfo->rx_flags = 0; -		
INIT_LIST_HEAD(&npinfo->rx_np); - -		spin_lock_init(&npinfo->rx_lock);  		sema_init(&npinfo->dev_lock, 1); -		skb_queue_head_init(&npinfo->neigh_tx);  		skb_queue_head_init(&npinfo->txq);  		INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); @@ -1051,7 +628,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)  		ops = np->dev->netdev_ops;  		if (ops->ndo_netpoll_setup) { -			err = ops->ndo_netpoll_setup(ndev, npinfo, gfp); +			err = ops->ndo_netpoll_setup(ndev, npinfo);  			if (err)  				goto free_npinfo;  		} @@ -1062,13 +639,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)  	npinfo->netpoll = np; -	if (np->rx_hook) { -		spin_lock_irqsave(&npinfo->rx_lock, flags); -		npinfo->rx_flags |= NETPOLL_RX_ENABLED; -		list_add_tail(&np->rx, &npinfo->rx_np); -		spin_unlock_irqrestore(&npinfo->rx_lock, flags); -	} -  	/* last thing to do is link it to the net device structure */  	rcu_assign_pointer(ndev->npinfo, npinfo); @@ -1190,7 +760,7 @@ int netpoll_setup(struct netpoll *np)  	/* fill up the skb queue */  	refill_skbs(); -	err = __netpoll_setup(np, ndev, GFP_KERNEL); +	err = __netpoll_setup(np, ndev);  	if (err)  		goto put; @@ -1217,7 +787,6 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)  	struct netpoll_info *npinfo =  			container_of(rcu_head, struct netpoll_info, rcu); -	skb_queue_purge(&npinfo->neigh_tx);  	skb_queue_purge(&npinfo->txq);  	/* we can't call cancel_delayed_work_sync here, as we are in softirq */ @@ -1233,7 +802,6 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)  void __netpoll_cleanup(struct netpoll *np)  {  	struct netpoll_info *npinfo; -	unsigned long flags;  	/* rtnl_dereference would be preferable here but  	 * rcu_cleanup_netpoll path can put us in here safely without @@ -1243,14 +811,6 @@ void __netpoll_cleanup(struct netpoll *np)  	if (!npinfo)  		return; -	if (!list_empty(&npinfo->rx_np)) { -		spin_lock_irqsave(&npinfo->rx_lock, flags); -		list_del(&np->rx); -		if (list_empty(&npinfo->rx_np)) -			npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; -		spin_unlock_irqrestore(&npinfo->rx_lock, flags); -	} -  	synchronize_srcu(&netpoll_srcu);  	if (atomic_dec_and_test(&npinfo->refcnt)) { @@ -1260,7 +820,7 @@ void __netpoll_cleanup(struct netpoll *np)  		if (ops->ndo_netpoll_cleanup)  			ops->ndo_netpoll_cleanup(np->dev); -		rcu_assign_pointer(np->dev->npinfo, NULL); +		RCU_INIT_POINTER(np->dev->npinfo, NULL);  		call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info);  	}  } @@ -1284,29 +844,13 @@ EXPORT_SYMBOL_GPL(__netpoll_free_async);  void netpoll_cleanup(struct netpoll *np)  { -	if (!np->dev) -		return; -  	rtnl_lock(); +	if (!np->dev) +		goto out;  	__netpoll_cleanup(np); -	rtnl_unlock(); -  	dev_put(np->dev);  	np->dev = NULL; +out: +	rtnl_unlock();  }  EXPORT_SYMBOL(netpoll_cleanup); - -int netpoll_trap(void) -{ -	return atomic_read(&trapped); -} -EXPORT_SYMBOL(netpoll_trap); - -void netpoll_set_trap(int trap) -{ -	if (trap) -		atomic_inc(&trapped); -	else -		atomic_dec(&trapped); -} -EXPORT_SYMBOL(netpoll_set_trap); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index d9cd627e6a1..2f385b9bccc 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -30,7 +30,7 @@  #define PRIOMAP_MIN_SZ		128  /* - * Extend @dev->priomap so that it's large enough to accomodate + * Extend @dev->priomap so that it's large enough to accommodate   * @target_idx.  @dev->priomap.priomap_len > @target_idx after successful   * return.  Must be called under rtnl lock.   
*/ @@ -140,7 +140,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css)  static int cgrp_css_online(struct cgroup_subsys_state *css)  { -	struct cgroup_subsys_state *parent_css = css_parent(css); +	struct cgroup_subsys_state *parent_css = css->parent;  	struct net_device *dev;  	int ret = 0; @@ -173,27 +173,27 @@ static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)  	return css->cgroup->id;  } -static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft, -			struct cgroup_map_cb *cb) +static int read_priomap(struct seq_file *sf, void *v)  {  	struct net_device *dev;  	rcu_read_lock();  	for_each_netdev_rcu(&init_net, dev) -		cb->fill(cb, dev->name, netprio_prio(css, dev)); +		seq_printf(sf, "%s %u\n", dev->name, +			   netprio_prio(seq_css(sf), dev));  	rcu_read_unlock();  	return 0;  } -static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, -			 const char *buffer) +static ssize_t write_priomap(struct kernfs_open_file *of, +			     char *buf, size_t nbytes, loff_t off)  {  	char devname[IFNAMSIZ + 1];  	struct net_device *dev;  	u32 prio;  	int ret; -	if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) +	if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)  		return -EINVAL;  	dev = dev_get_by_name(&init_net, devname); @@ -202,11 +202,11 @@ static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft,  	rtnl_lock(); -	ret = netprio_set_prio(css, dev, prio); +	ret = netprio_set_prio(of_css(of), dev, prio);  	rtnl_unlock();  	dev_put(dev); -	return ret; +	return ret ?: nbytes;  }  static int update_netprio(const void *v, struct file *file, unsigned n) @@ -222,11 +222,10 @@ static void net_prio_attach(struct cgroup_subsys_state *css,  			    struct cgroup_taskset *tset)  {  	struct task_struct *p; -	void *v; +	void *v = (void *)(unsigned long)css->cgroup->id; -	cgroup_taskset_for_each(p, css, tset) { +	cgroup_taskset_for_each(p, tset) {  		task_lock(p); -		v = (void *)(unsigned long)task_netprioidx(p);  		iterate_fd(p->files, 0, update_netprio, v);  		task_unlock(p);  	} @@ -239,21 +238,18 @@ static struct cftype ss_files[] = {  	},  	{  		.name = "ifpriomap", -		.read_map = read_priomap, -		.write_string = write_priomap, +		.seq_show = read_priomap, +		.write = write_priomap,  	},  	{ }	/* terminate */  }; -struct cgroup_subsys net_prio_subsys = { -	.name		= "net_prio", +struct cgroup_subsys net_prio_cgrp_subsys = {  	.css_alloc	= cgrp_css_alloc,  	.css_online	= cgrp_css_online,  	.css_free	= cgrp_css_free,  	.attach		= net_prio_attach, -	.subsys_id	= net_prio_subsys_id,  	.base_cftypes	= ss_files, -	.module		= THIS_MODULE,  };  static int netprio_device_event(struct notifier_block *unused, @@ -284,37 +280,9 @@ static struct notifier_block netprio_device_notifier = {  static int __init init_cgroup_netprio(void)  { -	int ret; - -	ret = cgroup_load_subsys(&net_prio_subsys); -	if (ret) -		goto out; -  	register_netdevice_notifier(&netprio_device_notifier); - -out: -	return ret; -} - -static void __exit exit_cgroup_netprio(void) -{ -	struct netprio_map *old; -	struct net_device *dev; - -	unregister_netdevice_notifier(&netprio_device_notifier); - -	cgroup_unload_subsys(&net_prio_subsys); - -	rtnl_lock(); -	for_each_netdev(&init_net, dev) { -		old = rtnl_dereference(dev->priomap); -		RCU_INIT_POINTER(dev->priomap, NULL); -		if (old) -			kfree_rcu(old, rcu); -	} -	rtnl_unlock(); +	return 0;  } -module_init(init_cgroup_netprio); -module_exit(exit_cgroup_netprio); 
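For context on the interface served by the seq_file/kernfs conversion of read_priomap()/write_priomap() above: userspace still writes "<ifname> <prio>" pairs into net_prio.ifpriomap and reads back one such line per device. A minimal userspace sketch, assuming the net_prio controller is mounted at /sys/fs/cgroup/net_prio (the mount point and "eth0" are illustrative, not part of this patch):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Each write() is parsed by write_priomap() as "%<IFNAMSIZ>s %u" */
	const char *entry = "eth0 5\n";
	int fd = open("/sys/fs/cgroup/net_prio/net_prio.ifpriomap", O_WRONLY);

	if (fd < 0) {
		perror("open net_prio.ifpriomap");
		return 1;
	}
	if (write(fd, entry, strlen(entry)) < 0)
		perror("write");
	close(fd);
	return 0;
}

On success the new write_priomap() returns nbytes (the "ret ?: nbytes" above), so a short write is never reported to the caller.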
+subsys_initcall(init_cgroup_netprio);  MODULE_LICENSE("GPL v2"); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 261357a6630..fc17a9d309a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -389,6 +389,9 @@ struct pktgen_dev {  #ifdef CONFIG_XFRM  	__u8	ipsmode;		/* IPSEC mode (config) */  	__u8	ipsproto;		/* IPSEC type (config) */ +	__u32	spi; +	struct dst_entry dst; +	struct dst_ops dstops;  #endif  	char result[512];  }; @@ -473,23 +476,22 @@ static int pgctrl_show(struct seq_file *seq, void *v)  static ssize_t pgctrl_write(struct file *file, const char __user *buf,  			    size_t count, loff_t *ppos)  { -	int err = 0;  	char data[128];  	struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id); -	if (!capable(CAP_NET_ADMIN)) { -		err = -EPERM; -		goto out; -	} +	if (!capable(CAP_NET_ADMIN)) +		return -EPERM; + +	if (count == 0) +		return -EINVAL;  	if (count > sizeof(data))  		count = sizeof(data); -	if (copy_from_user(data, buf, count)) { -		err = -EFAULT; -		goto out; -	} -	data[count - 1] = 0;	/* Make string */ +	if (copy_from_user(data, buf, count)) +		return -EFAULT; + +	data[count - 1] = 0;	/* Strip trailing '\n' and terminate string */  	if (!strcmp(data, "stop"))  		pktgen_stop_all_threads_ifs(pn); @@ -503,10 +505,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,  	else  		pr_warning("Unknown command: %s\n", data); -	err = count; - -out: -	return err; +	return count;  }  static int pgctrl_open(struct inode *inode, struct file *file) @@ -574,7 +573,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)  		   is_zero_ether_addr(pkt_dev->src_mac) ?  			     pkt_dev->odev->dev_addr : pkt_dev->src_mac); -	seq_printf(seq, "dst_mac: "); +	seq_puts(seq, "dst_mac: ");  	seq_printf(seq, "%pM\n", pkt_dev->dst_mac);  	seq_printf(seq, @@ -589,7 +588,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)  	if (pkt_dev->nr_labels) {  		unsigned int i; -		seq_printf(seq, "     mpls: "); +		seq_puts(seq, "     mpls: ");  		for (i = 0; i < pkt_dev->nr_labels; i++)  			seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]),  				   i == pkt_dev->nr_labels-1 ? 
"\n" : ", "); @@ -614,64 +613,67 @@ static int pktgen_if_show(struct seq_file *seq, void *v)  	if (pkt_dev->node >= 0)  		seq_printf(seq, "     node: %d\n", pkt_dev->node); -	seq_printf(seq, "     Flags: "); +	seq_puts(seq, "     Flags: ");  	if (pkt_dev->flags & F_IPV6) -		seq_printf(seq, "IPV6  "); +		seq_puts(seq, "IPV6  ");  	if (pkt_dev->flags & F_IPSRC_RND) -		seq_printf(seq, "IPSRC_RND  "); +		seq_puts(seq, "IPSRC_RND  ");  	if (pkt_dev->flags & F_IPDST_RND) -		seq_printf(seq, "IPDST_RND  "); +		seq_puts(seq, "IPDST_RND  ");  	if (pkt_dev->flags & F_TXSIZE_RND) -		seq_printf(seq, "TXSIZE_RND  "); +		seq_puts(seq, "TXSIZE_RND  ");  	if (pkt_dev->flags & F_UDPSRC_RND) -		seq_printf(seq, "UDPSRC_RND  "); +		seq_puts(seq, "UDPSRC_RND  ");  	if (pkt_dev->flags & F_UDPDST_RND) -		seq_printf(seq, "UDPDST_RND  "); +		seq_puts(seq, "UDPDST_RND  ");  	if (pkt_dev->flags & F_UDPCSUM) -		seq_printf(seq, "UDPCSUM  "); +		seq_puts(seq, "UDPCSUM  ");  	if (pkt_dev->flags & F_MPLS_RND) -		seq_printf(seq,  "MPLS_RND  "); +		seq_puts(seq,  "MPLS_RND  ");  	if (pkt_dev->flags & F_QUEUE_MAP_RND) -		seq_printf(seq,  "QUEUE_MAP_RND  "); +		seq_puts(seq,  "QUEUE_MAP_RND  ");  	if (pkt_dev->flags & F_QUEUE_MAP_CPU) -		seq_printf(seq,  "QUEUE_MAP_CPU  "); +		seq_puts(seq,  "QUEUE_MAP_CPU  ");  	if (pkt_dev->cflows) {  		if (pkt_dev->flags & F_FLOW_SEQ) -			seq_printf(seq,  "FLOW_SEQ  "); /*in sequence flows*/ +			seq_puts(seq,  "FLOW_SEQ  "); /*in sequence flows*/  		else -			seq_printf(seq,  "FLOW_RND  "); +			seq_puts(seq,  "FLOW_RND  ");  	}  #ifdef CONFIG_XFRM -	if (pkt_dev->flags & F_IPSEC_ON) -		seq_printf(seq,  "IPSEC  "); +	if (pkt_dev->flags & F_IPSEC_ON) { +		seq_puts(seq,  "IPSEC  "); +		if (pkt_dev->spi) +			seq_printf(seq, "spi:%u", pkt_dev->spi); +	}  #endif  	if (pkt_dev->flags & F_MACSRC_RND) -		seq_printf(seq, "MACSRC_RND  "); +		seq_puts(seq, "MACSRC_RND  ");  	if (pkt_dev->flags & F_MACDST_RND) -		seq_printf(seq, "MACDST_RND  "); +		seq_puts(seq, "MACDST_RND  ");  	if (pkt_dev->flags & F_VID_RND) -		seq_printf(seq, "VID_RND  "); +		seq_puts(seq, "VID_RND  ");  	if (pkt_dev->flags & F_SVID_RND) -		seq_printf(seq, "SVID_RND  "); +		seq_puts(seq, "SVID_RND  ");  	if (pkt_dev->flags & F_NODE) -		seq_printf(seq, "NODE_ALLOC  "); +		seq_puts(seq, "NODE_ALLOC  ");  	seq_puts(seq, "\n"); @@ -714,7 +716,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)  	if (pkt_dev->result[0])  		seq_printf(seq, "Result: %s\n", pkt_dev->result);  	else -		seq_printf(seq, "Result: Idle\n"); +		seq_puts(seq, "Result: Idle\n");  	return 0;  } @@ -1245,7 +1247,13 @@ static ssize_t pktgen_if_write(struct file *file,  				"Flag -:%s:- unknown\nAvailable flags, (prepend ! 
to un-set flag):\n%s",  				f,  				"IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, " -				"MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC, NODE_ALLOC\n"); +				"MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, " +				"MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, " +				"QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, " +#ifdef CONFIG_XFRM +				"IPSEC, " +#endif +				"NODE_ALLOC\n");  			return count;  		}  		sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); @@ -1434,7 +1442,7 @@ static ssize_t pktgen_if_write(struct file *file,  		if (!mac_pton(valstr, pkt_dev->dst_mac))  			return -EINVAL;  		/* Set up Dest MAC */ -		memcpy(&pkt_dev->hh[0], pkt_dev->dst_mac, ETH_ALEN); +		ether_addr_copy(&pkt_dev->hh[0], pkt_dev->dst_mac);  		sprintf(pg_result, "OK: dstmac %pM", pkt_dev->dst_mac);  		return count; @@ -1451,7 +1459,7 @@ static ssize_t pktgen_if_write(struct file *file,  		if (!mac_pton(valstr, pkt_dev->src_mac))  			return -EINVAL;  		/* Set up Src MAC */ -		memcpy(&pkt_dev->hh[6], pkt_dev->src_mac, ETH_ALEN); +		ether_addr_copy(&pkt_dev->hh[6], pkt_dev->src_mac);  		sprintf(pg_result, "OK: srcmac %pM", pkt_dev->src_mac);  		return count; @@ -1476,7 +1484,18 @@ static ssize_t pktgen_if_write(struct file *file,  		sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows);  		return count;  	} +#ifdef CONFIG_XFRM +	if (!strcmp(name, "spi")) { +		len = num_arg(&user_buffer[i], 10, &value); +		if (len < 0) +			return len; +		i += len; +		pkt_dev->spi = value; +		sprintf(pg_result, "OK: spi=%u", pkt_dev->spi); +		return count; +	} +#endif  	if (!strcmp(name, "flowlen")) {  		len = num_arg(&user_buffer[i], 10, &value);  		if (len < 0) @@ -1716,14 +1735,14 @@ static int pktgen_thread_show(struct seq_file *seq, void *v)  	BUG_ON(!t); -	seq_printf(seq, "Running: "); +	seq_puts(seq, "Running: ");  	if_lock(t);  	list_for_each_entry(pkt_dev, &t->if_list, list)  		if (pkt_dev->running)  			seq_printf(seq, "%s ", pkt_dev->odevname); -	seq_printf(seq, "\nStopped: "); +	seq_puts(seq, "\nStopped: ");  	list_for_each_entry(pkt_dev, &t->if_list, list)  		if (!pkt_dev->running) @@ -1732,7 +1751,7 @@ static int pktgen_thread_show(struct seq_file *seq, void *v)  	if (t->result[0])  		seq_printf(seq, "\nResult: %s\n", t->result);  	else -		seq_printf(seq, "\nResult: NA\n"); +		seq_puts(seq, "\nResult: NA\n");  	if_unlock(t); @@ -2043,10 +2062,10 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)  	/* Default to the interface's mac if not explicitly set. */  	if (is_zero_ether_addr(pkt_dev->src_mac)) -		memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, ETH_ALEN); +		ether_addr_copy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr);  	/* Set up Dest MAC */ -	memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN); +	ether_addr_copy(&(pkt_dev->hh[0]), pkt_dev->dst_mac);  	if (pkt_dev->flags & F_IPV6) {  		int i, set = 0, err = 1; @@ -2233,13 +2252,21 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)  	struct xfrm_state *x = pkt_dev->flows[flow].x;  	struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id);  	if (!x) { -		/*slow path: we dont already have xfrm_state*/ -		x = xfrm_stateonly_find(pn->net, DUMMY_MARK, -					(xfrm_address_t *)&pkt_dev->cur_daddr, -					(xfrm_address_t *)&pkt_dev->cur_saddr, -					AF_INET, -					pkt_dev->ipsmode, -					pkt_dev->ipsproto, 0); + +		if (pkt_dev->spi) { +			/* We need as quick as possible to find the right SA +			 * Searching with minimum criteria to archieve this. 
+			 */ +			x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET); +		} else { +			/* slow path: we dont already have xfrm_state */ +			x = xfrm_stateonly_find(pn->net, DUMMY_MARK, +						(xfrm_address_t *)&pkt_dev->cur_daddr, +						(xfrm_address_t *)&pkt_dev->cur_saddr, +						AF_INET, +						pkt_dev->ipsmode, +						pkt_dev->ipsproto, 0); +		}  		if (x) {  			pkt_dev->flows[flow].x = x;  			set_pkt_overhead(pkt_dev); @@ -2475,31 +2502,47 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)  #ifdef CONFIG_XFRM +static u32 pktgen_dst_metrics[RTAX_MAX + 1] = { + +	[RTAX_HOPLIMIT] = 0x5, /* Set a static hoplimit */ +}; +  static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)  {  	struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;  	int err = 0; +	struct net *net = dev_net(pkt_dev->odev);  	if (!x)  		return 0;  	/* XXX: we dont support tunnel mode for now until  	 * we resolve the dst issue */ -	if (x->props.mode != XFRM_MODE_TRANSPORT) +	if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0))  		return 0; -	spin_lock(&x->lock); +	/* But when user specify an valid SPI, transformation +	 * supports both transport/tunnel mode + ESP/AH type. +	 */ +	if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0)) +		skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF; +	rcu_read_lock_bh();  	err = x->outer_mode->output(x, skb); -	if (err) +	rcu_read_unlock_bh(); +	if (err) { +		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR);  		goto error; +	}  	err = x->type->output(x, skb); -	if (err) +	if (err) { +		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEPROTOERROR);  		goto error; - +	} +	spin_lock_bh(&x->lock);  	x->curlft.bytes += skb->len;  	x->curlft.packets++; +	spin_unlock_bh(&x->lock);  error: -	spin_unlock(&x->lock);  	return err;  } @@ -2527,6 +2570,8 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,  		if (x) {  			int ret;  			__u8 *eth; +			struct iphdr *iph; +  			nhead = x->props.header_len - skb_headroom(skb);  			if (nhead > 0) {  				ret = pskb_expand_head(skb, nhead, 0, GFP_ATOMIC); @@ -2548,6 +2593,11 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,  			eth = (__u8 *) skb_push(skb, ETH_HLEN);  			memcpy(eth, pkt_dev->hh, 12);  			*(u16 *) ð[12] = protocol; + +			/* Update IPv4 header len as well as checksum value */ +			iph = ip_hdr(skb); +			iph->tot_len = htons(skb->len - ETH_HLEN); +			ip_send_check(iph);  		}  	}  	return 1; @@ -3288,9 +3338,11 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)  	queue_map = skb_get_queue_mapping(pkt_dev->skb);  	txq = netdev_get_tx_queue(odev, queue_map); -	__netif_tx_lock_bh(txq); +	local_bh_disable(); + +	HARD_TX_LOCK(odev, txq, smp_processor_id()); -	if (unlikely(netif_xmit_frozen_or_stopped(txq))) { +	if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) {  		ret = NETDEV_TX_BUSY;  		pkt_dev->last_ok = 0;  		goto unlock; @@ -3324,7 +3376,9 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)  		pkt_dev->last_ok = 0;  	}  unlock: -	__netif_tx_unlock_bh(txq); +	HARD_TX_UNLOCK(odev, txq); + +	local_bh_enable();  	/* If pkt_dev->count is zero, then run forever */  	if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { @@ -3535,6 +3589,17 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)  #ifdef CONFIG_XFRM  	pkt_dev->ipsmode = XFRM_MODE_TRANSPORT;  	pkt_dev->ipsproto = IPPROTO_ESP; + +	/* xfrm tunnel mode needs additional dst to extract outter +	 * ip header protocol/ttl/id field, here creat a phony one. 
+			 */ +			x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET); +		} else { +			/* slow path: we dont already have xfrm_state */ +			x = xfrm_stateonly_find(pn->net, DUMMY_MARK, +						(xfrm_address_t *)&pkt_dev->cur_daddr, +						(xfrm_address_t *)&pkt_dev->cur_saddr, +						AF_INET, +						pkt_dev->ipsmode, +						pkt_dev->ipsproto, 0); +		}  		if (x) {  			pkt_dev->flows[flow].x = x;  			set_pkt_overhead(pkt_dev); @@ -2475,31 +2502,47 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)  #ifdef CONFIG_XFRM +static u32 pktgen_dst_metrics[RTAX_MAX + 1] = { + +	[RTAX_HOPLIMIT] = 0x5, /* Set a static hoplimit */ +}; +  static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)  {  	struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;  	int err = 0; +	struct net *net = dev_net(pkt_dev->odev);  	if (!x)  		return 0;  	/* XXX: we dont support tunnel mode for now until  	 * we resolve the dst issue */ -	if (x->props.mode != XFRM_MODE_TRANSPORT) +	if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0))  		return 0; -	spin_lock(&x->lock); +	/* But when user specify an valid SPI, transformation +	 * supports both transport/tunnel mode + ESP/AH type. +	 */ +	if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0)) +		skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF; +	rcu_read_lock_bh();  	err = x->outer_mode->output(x, skb); -	if (err) +	rcu_read_unlock_bh(); +	if (err) { +		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR);  		goto error; +	}  	err = x->type->output(x, skb); -	if (err) +	if (err) { +		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEPROTOERROR);  		goto error; - +	} +	spin_lock_bh(&x->lock);  	x->curlft.bytes += skb->len;  	x->curlft.packets++; +	spin_unlock_bh(&x->lock);  error: -	spin_unlock(&x->lock);  	return err;  } @@ -2527,6 +2570,8 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,  		if (x) {  			int ret;  			__u8 *eth; +			struct iphdr *iph; +  			nhead = x->props.header_len - skb_headroom(skb);  			if (nhead > 0) {  				ret = pskb_expand_head(skb, nhead, 0, GFP_ATOMIC); @@ -2548,6 +2593,11 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,  			eth = (__u8 *) skb_push(skb, ETH_HLEN);  			memcpy(eth, pkt_dev->hh, 12);  			*(u16 *) &eth[12] = protocol; + +			/* Update IPv4 header len as well as checksum value */ +			iph = ip_hdr(skb); +			iph->tot_len = htons(skb->len - ETH_HLEN); +			ip_send_check(iph); +		}  	}  	return 1; @@ -3288,9 +3338,11 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)  	queue_map = skb_get_queue_mapping(pkt_dev->skb);  	txq = netdev_get_tx_queue(odev, queue_map); -	__netif_tx_lock_bh(txq); +	local_bh_disable(); + +	HARD_TX_LOCK(odev, txq, smp_processor_id()); -	if (unlikely(netif_xmit_frozen_or_stopped(txq))) { +	if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) {  		ret = NETDEV_TX_BUSY;  		pkt_dev->last_ok = 0;  		goto unlock; @@ -3324,7 +3376,9 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)  		pkt_dev->last_ok = 0;  	}  unlock: -	__netif_tx_unlock_bh(txq); +	HARD_TX_UNLOCK(odev, txq); + +	local_bh_enable();  	/* If pkt_dev->count is zero, then run forever */  	if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { @@ -3535,6 +3589,17 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)  #ifdef CONFIG_XFRM  	pkt_dev->ipsmode = XFRM_MODE_TRANSPORT;  	pkt_dev->ipsproto = IPPROTO_ESP; + +	/* xfrm tunnel mode needs additional dst to extract outter +	 * ip header protocol/ttl/id field, here creat a phony one.
+	 * instead of looking for a valid rt, which definitely hurting +	 * performance under such circumstance. +	 */ +	pkt_dev->dstops.family = AF_INET; +	pkt_dev->dst.dev = pkt_dev->odev; +	dst_init_metrics(&pkt_dev->dst, pktgen_dst_metrics, false); +	pkt_dev->dst.child = &pkt_dev->dst; +	pkt_dev->dst.ops = &pkt_dev->dstops;  #endif  	return add_dev_to_thread(t, pkt_dev); diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c new file mode 100644 index 00000000000..d3027a73fd4 --- /dev/null +++ b/net/core/ptp_classifier.c @@ -0,0 +1,141 @@ +/* PTP classifier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +/* The below program is the bpf_asm (tools/net/) representation of + * the opcode array in the ptp_filter structure. + * + * For convenience, this can easily be altered and reviewed with + * bpf_asm and bpf_dbg, e.g. `./bpf_asm -c prog` where prog is a + * simple file containing the below program: + * + * ldh [12]                        ; load ethertype + * + * ; PTP over UDP over IPv4 over Ethernet + * test_ipv4: + *   jneq #0x800, test_ipv6        ; ETH_P_IP ? + *   ldb [23]                      ; load proto + *   jneq #17, drop_ipv4           ; IPPROTO_UDP ? + *   ldh [20]                      ; load frag offset field + *   jset #0x1fff, drop_ipv4       ; don't allow fragments + *   ldxb 4*([14]&0xf)             ; load IP header len + *   ldh [x + 16]                  ; load UDP dst port + *   jneq #319, drop_ipv4          ; is port PTP_EV_PORT ? + *   ldh [x + 22]                  ; load payload + *   and #0xf                      ; mask PTP_CLASS_VMASK + *   or #0x10                      ; PTP_CLASS_IPV4 + *   ret a                         ; return PTP class + *   drop_ipv4: ret #0x0           ; PTP_CLASS_NONE + * + * ; PTP over UDP over IPv6 over Ethernet + * test_ipv6: + *   jneq #0x86dd, test_8021q      ; ETH_P_IPV6 ? + *   ldb [20]                      ; load proto + *   jneq #17, drop_ipv6           ; IPPROTO_UDP ? + *   ldh [56]                      ; load UDP dst port + *   jneq #319, drop_ipv6          ; is port PTP_EV_PORT ? + *   ldh [62]                      ; load payload + *   and #0xf                      ; mask PTP_CLASS_VMASK + *   or #0x20                      ; PTP_CLASS_IPV6 + *   ret a                         ; return PTP class + *   drop_ipv6: ret #0x0           ; PTP_CLASS_NONE + * + * ; PTP over 802.1Q over Ethernet + * test_8021q: + *   jneq #0x8100, test_ieee1588   ; ETH_P_8021Q ? + *   ldh [16]                      ; load inner type + *   jneq #0x88f7, drop_ieee1588   ; ETH_P_1588 ? + *   ldb [18]                      ; load payload + *   and #0x8                      ; as we don't have ports here, test + *   jneq #0x0, drop_ieee1588      ; for PTP_GEN_BIT and drop these + *   ldh [18]                      ; reload payload + *   and #0xf                      ; mask PTP_CLASS_VMASK + *   or #0x40                      ; PTP_CLASS_V2_VLAN + *   ret a                         ; return PTP class + * + * ; PTP over Ethernet + * test_ieee1588: + *   jneq #0x88f7, drop_ieee1588   ; ETH_P_1588 ? 
+ *   ldb [14]                      ; load payload + *   and #0x8                      ; as we don't have ports here, test + *   jneq #0x0, drop_ieee1588      ; for PTP_GEN_BIT and drop these + *   ldh [14]                      ; reload payload + *   and #0xf                      ; mask PTP_CLASS_VMASK + *   or #0x30                      ; PTP_CLASS_L2 + *   ret a                         ; return PTP class + *   drop_ieee1588: ret #0x0       ; PTP_CLASS_NONE + */ + +#include <linux/skbuff.h> +#include <linux/filter.h> +#include <linux/ptp_classify.h> + +static struct sk_filter *ptp_insns __read_mostly; + +unsigned int ptp_classify_raw(const struct sk_buff *skb) +{ +	return SK_RUN_FILTER(ptp_insns, skb); +} +EXPORT_SYMBOL_GPL(ptp_classify_raw); + +void __init ptp_classifier_init(void) +{ +	static struct sock_filter ptp_filter[] __initdata = { +		{ 0x28,  0,  0, 0x0000000c }, +		{ 0x15,  0, 12, 0x00000800 }, +		{ 0x30,  0,  0, 0x00000017 }, +		{ 0x15,  0,  9, 0x00000011 }, +		{ 0x28,  0,  0, 0x00000014 }, +		{ 0x45,  7,  0, 0x00001fff }, +		{ 0xb1,  0,  0, 0x0000000e }, +		{ 0x48,  0,  0, 0x00000010 }, +		{ 0x15,  0,  4, 0x0000013f }, +		{ 0x48,  0,  0, 0x00000016 }, +		{ 0x54,  0,  0, 0x0000000f }, +		{ 0x44,  0,  0, 0x00000010 }, +		{ 0x16,  0,  0, 0x00000000 }, +		{ 0x06,  0,  0, 0x00000000 }, +		{ 0x15,  0,  9, 0x000086dd }, +		{ 0x30,  0,  0, 0x00000014 }, +		{ 0x15,  0,  6, 0x00000011 }, +		{ 0x28,  0,  0, 0x00000038 }, +		{ 0x15,  0,  4, 0x0000013f }, +		{ 0x28,  0,  0, 0x0000003e }, +		{ 0x54,  0,  0, 0x0000000f }, +		{ 0x44,  0,  0, 0x00000020 }, +		{ 0x16,  0,  0, 0x00000000 }, +		{ 0x06,  0,  0, 0x00000000 }, +		{ 0x15,  0,  9, 0x00008100 }, +		{ 0x28,  0,  0, 0x00000010 }, +		{ 0x15,  0, 15, 0x000088f7 }, +		{ 0x30,  0,  0, 0x00000012 }, +		{ 0x54,  0,  0, 0x00000008 }, +		{ 0x15,  0, 12, 0x00000000 }, +		{ 0x28,  0,  0, 0x00000012 }, +		{ 0x54,  0,  0, 0x0000000f }, +		{ 0x44,  0,  0, 0x00000040 }, +		{ 0x16,  0,  0, 0x00000000 }, +		{ 0x15,  0,  7, 0x000088f7 }, +		{ 0x30,  0,  0, 0x0000000e }, +		{ 0x54,  0,  0, 0x00000008 }, +		{ 0x15,  0,  4, 0x00000000 }, +		{ 0x28,  0,  0, 0x0000000e }, +		{ 0x54,  0,  0, 0x0000000f }, +		{ 0x44,  0,  0, 0x00000030 }, +		{ 0x16,  0,  0, 0x00000000 }, +		{ 0x06,  0,  0, 0x00000000 }, +	}; +	struct sock_fprog_kern ptp_prog = { +		.len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter, +	}; + +	BUG_ON(sk_unattached_filter_create(&ptp_insns, &ptp_prog)); +} diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 4425148d2b5..467f326126e 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -221,5 +221,4 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,  out:  	spin_unlock_bh(&fastopenq->lock);  	sock_put(lsk); -	return;  } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2a0e21de306..1063996f831 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -353,18 +353,65 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops)  }  EXPORT_SYMBOL_GPL(__rtnl_link_unregister); +/* Return with the rtnl_lock held when there are no network + * devices unregistering in any network namespace. 
+ */ +static void rtnl_lock_unregistering_all(void) +{ +	struct net *net; +	bool unregistering; +	DEFINE_WAIT(wait); + +	for (;;) { +		prepare_to_wait(&netdev_unregistering_wq, &wait, +				TASK_UNINTERRUPTIBLE); +		unregistering = false; +		rtnl_lock(); +		for_each_net(net) { +			if (net->dev_unreg_count > 0) { +				unregistering = true; +				break; +			} +		} +		if (!unregistering) +			break; +		__rtnl_unlock(); +		schedule(); +	} +	finish_wait(&netdev_unregistering_wq, &wait); +} +  /**   * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.   * @ops: struct rtnl_link_ops * to unregister   */  void rtnl_link_unregister(struct rtnl_link_ops *ops)  { -	rtnl_lock(); +	/* Close the race with cleanup_net() */ +	mutex_lock(&net_mutex); +	rtnl_lock_unregistering_all();  	__rtnl_link_unregister(ops);  	rtnl_unlock(); +	mutex_unlock(&net_mutex);  }  EXPORT_SYMBOL_GPL(rtnl_link_unregister); +static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev) +{ +	struct net_device *master_dev; +	const struct rtnl_link_ops *ops; + +	master_dev = netdev_master_upper_dev_get((struct net_device *) dev); +	if (!master_dev) +		return 0; +	ops = master_dev->rtnl_link_ops; +	if (!ops || !ops->get_slave_size) +		return 0; +	/* IFLA_INFO_SLAVE_DATA + nested data */ +	return nla_total_size(sizeof(struct nlattr)) + +	       ops->get_slave_size(master_dev, dev); +} +  static size_t rtnl_link_get_size(const struct net_device *dev)  {  	const struct rtnl_link_ops *ops = dev->rtnl_link_ops; @@ -385,6 +432,8 @@ static size_t rtnl_link_get_size(const struct net_device *dev)  		/* IFLA_INFO_XSTATS */  		size += nla_total_size(ops->get_xstats_size(dev)); +	size += rtnl_link_get_slave_info_data_size(dev); +  	return size;  } @@ -403,34 +452,16 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family)  }  /** - * __rtnl_af_register - Register rtnl_af_ops with rtnetlink. - * @ops: struct rtnl_af_ops * to register - * - * The caller must hold the rtnl_mutex. - * - * Returns 0 on success or a negative error code. - */ -int __rtnl_af_register(struct rtnl_af_ops *ops) -{ -	list_add_tail(&ops->list, &rtnl_af_ops); -	return 0; -} -EXPORT_SYMBOL_GPL(__rtnl_af_register); - -/**   * rtnl_af_register - Register rtnl_af_ops with rtnetlink.   * @ops: struct rtnl_af_ops * to register   *   * Returns 0 on success or a negative error code.   
*/ -int rtnl_af_register(struct rtnl_af_ops *ops) +void rtnl_af_register(struct rtnl_af_ops *ops)  { -	int err; -  	rtnl_lock(); -	err = __rtnl_af_register(ops); +	list_add_tail(&ops->list, &rtnl_af_ops);  	rtnl_unlock(); -	return err;  }  EXPORT_SYMBOL_GPL(rtnl_af_register); @@ -477,40 +508,100 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev)  	return size;  } -static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) +static bool rtnl_have_link_slave_info(const struct net_device *dev)  { -	const struct rtnl_link_ops *ops = dev->rtnl_link_ops; -	struct nlattr *linkinfo, *data; -	int err = -EMSGSIZE; +	struct net_device *master_dev; -	linkinfo = nla_nest_start(skb, IFLA_LINKINFO); -	if (linkinfo == NULL) -		goto out; +	master_dev = netdev_master_upper_dev_get((struct net_device *) dev); +	if (master_dev && master_dev->rtnl_link_ops) +		return true; +	return false; +} + +static int rtnl_link_slave_info_fill(struct sk_buff *skb, +				     const struct net_device *dev) +{ +	struct net_device *master_dev; +	const struct rtnl_link_ops *ops; +	struct nlattr *slave_data; +	int err; + +	master_dev = netdev_master_upper_dev_get((struct net_device *) dev); +	if (!master_dev) +		return 0; +	ops = master_dev->rtnl_link_ops; +	if (!ops) +		return 0; +	if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0) +		return -EMSGSIZE; +	if (ops->fill_slave_info) { +		slave_data = nla_nest_start(skb, IFLA_INFO_SLAVE_DATA); +		if (!slave_data) +			return -EMSGSIZE; +		err = ops->fill_slave_info(skb, master_dev, dev); +		if (err < 0) +			goto err_cancel_slave_data; +		nla_nest_end(skb, slave_data); +	} +	return 0; + +err_cancel_slave_data: +	nla_nest_cancel(skb, slave_data); +	return err; +} + +static int rtnl_link_info_fill(struct sk_buff *skb, +			       const struct net_device *dev) +{ +	const struct rtnl_link_ops *ops = dev->rtnl_link_ops; +	struct nlattr *data; +	int err; +	if (!ops) +		return 0;  	if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0) -		goto err_cancel_link; +		return -EMSGSIZE;  	if (ops->fill_xstats) {  		err = ops->fill_xstats(skb, dev);  		if (err < 0) -			goto err_cancel_link; +			return err;  	}  	if (ops->fill_info) {  		data = nla_nest_start(skb, IFLA_INFO_DATA); -		if (data == NULL) { -			err = -EMSGSIZE; -			goto err_cancel_link; -		} +		if (data == NULL) +			return -EMSGSIZE;  		err = ops->fill_info(skb, dev);  		if (err < 0)  			goto err_cancel_data;  		nla_nest_end(skb, data);  	} - -	nla_nest_end(skb, linkinfo);  	return 0;  err_cancel_data:  	nla_nest_cancel(skb, data); +	return err; +} + +static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) +{ +	struct nlattr *linkinfo; +	int err = -EMSGSIZE; + +	linkinfo = nla_nest_start(skb, IFLA_LINKINFO); +	if (linkinfo == NULL) +		goto out; + +	err = rtnl_link_info_fill(skb, dev); +	if (err < 0) +		goto err_cancel_link; + +	err = rtnl_link_slave_info_fill(skb, dev); +	if (err < 0) +		goto err_cancel_link; + +	nla_nest_end(skb, linkinfo); +	return 0; +  err_cancel_link:  	nla_nest_cancel(skb, linkinfo);  out: @@ -707,14 +798,15 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,  		size += num_vfs *  			(nla_total_size(sizeof(struct ifla_vf_mac)) +  			 nla_total_size(sizeof(struct ifla_vf_vlan)) + -			 nla_total_size(sizeof(struct ifla_vf_tx_rate)) + -			 nla_total_size(sizeof(struct ifla_vf_spoofchk))); +			 nla_total_size(sizeof(struct ifla_vf_spoofchk)) + +			 nla_total_size(sizeof(struct ifla_vf_rate)));  		return size;  	} else  		return 0;  } 
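The IFLA_INFO_SLAVE_KIND/IFLA_INFO_SLAVE_DATA plumbing added above lets the master device's rtnl_link_ops describe each enslaved device inside IFLA_LINKINFO. A minimal sketch of how a master driver might wire up the new fill-side callbacks; the "foo" names and the single-attribute layout are illustrative assumptions, not part of this patch:

#include <net/rtnetlink.h>

enum {
	IFLA_FOO_SLAVE_UNSPEC,
	IFLA_FOO_SLAVE_STATE,		/* u8: illustrative per-slave attribute */
	__IFLA_FOO_SLAVE_MAX
};
#define IFLA_FOO_SLAVE_MAX (__IFLA_FOO_SLAVE_MAX - 1)

/* Called as ops->get_slave_size(master_dev, dev) when sizing the message */
static size_t foo_get_slave_size(const struct net_device *master,
				 const struct net_device *slave)
{
	return nla_total_size(sizeof(u8));	/* IFLA_FOO_SLAVE_STATE */
}

/* Called inside the IFLA_INFO_SLAVE_DATA nest by rtnl_link_slave_info_fill() */
static int foo_fill_slave_info(struct sk_buff *skb,
			       const struct net_device *master,
			       const struct net_device *slave)
{
	if (nla_put_u8(skb, IFLA_FOO_SLAVE_STATE, 1))
		return -EMSGSIZE;
	return 0;
}

static struct rtnl_link_ops foo_link_ops __read_mostly = {
	.kind			= "foo",
	.slave_maxtype		= IFLA_FOO_SLAVE_MAX,
	.get_slave_size		= foo_get_slave_size,
	.fill_slave_info	= foo_fill_slave_info,
};

A real driver would normally also provide slave_policy, slave_validate and slave_changelink, which the rtnl_newlink() changes further down in this patch parse from IFLA_INFO_SLAVE_DATA on the configuration path.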
-static size_t rtnl_port_size(const struct net_device *dev) +static size_t rtnl_port_size(const struct net_device *dev, +			     u32 ext_filter_mask)  {  	size_t port_size = nla_total_size(4)		/* PORT_VF */  		+ nla_total_size(PORT_PROFILE_MAX)	/* PORT_PROFILE */ @@ -730,7 +822,8 @@ static size_t rtnl_port_size(const struct net_device *dev)  	size_t port_self_size = nla_total_size(sizeof(struct nlattr))  		+ port_size; -	if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) +	if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || +	    !(ext_filter_mask & RTEXT_FILTER_VF))  		return 0;  	if (dev_num_vf(dev->dev.parent))  		return port_self_size + vf_ports_size + @@ -762,10 +855,11 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,  	       + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */  	       + nla_total_size(1) /* IFLA_OPERSTATE */  	       + nla_total_size(1) /* IFLA_LINKMODE */ +	       + nla_total_size(4) /* IFLA_CARRIER_CHANGES */  	       + nla_total_size(ext_filter_mask  			        & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */  	       + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ -	       + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ +	       + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */  	       + rtnl_link_get_size(dev) /* IFLA_LINKINFO */  	       + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */  	       + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */ @@ -827,11 +921,13 @@ static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev)  	return 0;  } -static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) +static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, +			  u32 ext_filter_mask)  {  	int err; -	if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) +	if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || +	    !(ext_filter_mask & RTEXT_FILTER_VF))  		return 0;  	err = rtnl_port_self_fill(skb, dev); @@ -910,7 +1006,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,  	    (dev->qdisc &&  	     nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||  	    (dev->ifalias && -	     nla_put_string(skb, IFLA_IFALIAS, dev->ifalias))) +	     nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) || +	    nla_put_u32(skb, IFLA_CARRIER_CHANGES, +			atomic_read(&dev->carrier_changes)))  		goto nla_put_failure;  	if (1) { @@ -967,6 +1065,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,  			struct ifla_vf_info ivi;  			struct ifla_vf_mac vf_mac;  			struct ifla_vf_vlan vf_vlan; +			struct ifla_vf_rate vf_rate;  			struct ifla_vf_tx_rate vf_tx_rate;  			struct ifla_vf_spoofchk vf_spoofchk;  			struct ifla_vf_link_state vf_linkstate; @@ -987,6 +1086,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,  				break;  			vf_mac.vf =  				vf_vlan.vf = +				vf_rate.vf =  				vf_tx_rate.vf =  				vf_spoofchk.vf =  				vf_linkstate.vf = ivi.vf; @@ -994,7 +1094,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,  			memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));  			vf_vlan.vlan = ivi.vlan;  			vf_vlan.qos = ivi.qos; -			vf_tx_rate.rate = ivi.tx_rate; +			vf_tx_rate.rate = ivi.max_tx_rate; +			vf_rate.min_tx_rate = ivi.min_tx_rate; +			vf_rate.max_tx_rate = ivi.max_tx_rate;  			vf_spoofchk.setting = ivi.spoofchk;  			vf_linkstate.link_state = ivi.linkstate;  			vf = nla_nest_start(skb, IFLA_VF_INFO); @@ -1004,6 +1106,8 @@ static int 
rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,  			}  			if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||  			    nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || +			    nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), +				    &vf_rate) ||  			    nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate),  				    &vf_tx_rate) ||  			    nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), @@ -1016,10 +1120,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,  		nla_nest_end(skb, vfinfo);  	} -	if (rtnl_port_fill(skb, dev)) +	if (rtnl_port_fill(skb, dev, ext_filter_mask))  		goto nla_put_failure; -	if (dev->rtnl_link_ops) { +	if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) {  		if (rtnl_link_fill(skb, dev) < 0)  			goto nla_put_failure;  	} @@ -1061,56 +1165,7 @@ nla_put_failure:  	return -EMSGSIZE;  } -static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) -{ -	struct net *net = sock_net(skb->sk); -	int h, s_h; -	int idx = 0, s_idx; -	struct net_device *dev; -	struct hlist_head *head; -	struct nlattr *tb[IFLA_MAX+1]; -	u32 ext_filter_mask = 0; - -	s_h = cb->args[0]; -	s_idx = cb->args[1]; - -	rcu_read_lock(); -	cb->seq = net->dev_base_seq; - -	if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, -			ifla_policy) >= 0) { - -		if (tb[IFLA_EXT_MASK]) -			ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); -	} - -	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { -		idx = 0; -		head = &net->dev_index_head[h]; -		hlist_for_each_entry_rcu(dev, head, index_hlist) { -			if (idx < s_idx) -				goto cont; -			if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, -					     NETLINK_CB(cb->skb).portid, -					     cb->nlh->nlmsg_seq, 0, -					     NLM_F_MULTI, -					     ext_filter_mask) <= 0) -				goto out; - -			nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: -			idx++; -		} -	} -out: -	rcu_read_unlock(); -	cb->args[1] = idx; -	cb->args[0] = h; - -	return skb->len; -} - -const struct nla_policy ifla_policy[IFLA_MAX+1] = { +static const struct nla_policy ifla_policy[IFLA_MAX+1] = {  	[IFLA_IFNAME]		= { .type = NLA_STRING, .len = IFNAMSIZ-1 },  	[IFLA_ADDRESS]		= { .type = NLA_BINARY, .len = MAX_ADDR_LEN },  	[IFLA_BROADCAST]	= { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, @@ -1136,12 +1191,14 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {  	[IFLA_NUM_TX_QUEUES]	= { .type = NLA_U32 },  	[IFLA_NUM_RX_QUEUES]	= { .type = NLA_U32 },  	[IFLA_PHYS_PORT_ID]	= { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN }, +	[IFLA_CARRIER_CHANGES]	= { .type = NLA_U32 },  /* ignored */  }; -EXPORT_SYMBOL(ifla_policy);  static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {  	[IFLA_INFO_KIND]	= { .type = NLA_STRING },  	[IFLA_INFO_DATA]	= { .type = NLA_NESTED }, +	[IFLA_INFO_SLAVE_KIND]	= { .type = NLA_STRING }, +	[IFLA_INFO_SLAVE_DATA]	= { .type = NLA_NESTED },  };  static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = { @@ -1157,6 +1214,10 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {  				    .len = sizeof(struct ifla_vf_tx_rate) },  	[IFLA_VF_SPOOFCHK]	= { .type = NLA_BINARY,  				    .len = sizeof(struct ifla_vf_spoofchk) }, +	[IFLA_VF_RATE]		= { .type = NLA_BINARY, +				    .len = sizeof(struct ifla_vf_rate) }, +	[IFLA_VF_LINK_STATE]	= { .type = NLA_BINARY, +				    .len = sizeof(struct ifla_vf_link_state) },  };  static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { @@ -1173,6 +1234,78 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] 
= {  	[IFLA_PORT_RESPONSE]	= { .type = NLA_U16, },  }; +static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ +	struct net *net = sock_net(skb->sk); +	int h, s_h; +	int idx = 0, s_idx; +	struct net_device *dev; +	struct hlist_head *head; +	struct nlattr *tb[IFLA_MAX+1]; +	u32 ext_filter_mask = 0; +	int err; +	int hdrlen; + +	s_h = cb->args[0]; +	s_idx = cb->args[1]; + +	rcu_read_lock(); +	cb->seq = net->dev_base_seq; + +	/* A hack to preserve kernel<->userspace interface. +	 * The correct header is ifinfomsg. It is consistent with rtnl_getlink. +	 * However, before Linux v3.9 the code here assumed rtgenmsg and that's +	 * what iproute2 < v3.9.0 used. +	 * We can detect the old iproute2. Even including the IFLA_EXT_MASK +	 * attribute, its netlink message is shorter than struct ifinfomsg. +	 */ +	hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ? +		 sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); + +	if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) { + +		if (tb[IFLA_EXT_MASK]) +			ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); +	} + +	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { +		idx = 0; +		head = &net->dev_index_head[h]; +		hlist_for_each_entry_rcu(dev, head, index_hlist) { +			if (idx < s_idx) +				goto cont; +			err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, +					       NETLINK_CB(cb->skb).portid, +					       cb->nlh->nlmsg_seq, 0, +					       NLM_F_MULTI, +					       ext_filter_mask); +			/* If we ran out of room on the first message, +			 * we're in trouble +			 */ +			WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); + +			if (err <= 0) +				goto out; + +			nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: +			idx++; +		} +	} +out: +	rcu_read_unlock(); +	cb->args[1] = idx; +	cb->args[0] = h; + +	return skb->len; +} + +int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len) +{ +	return nla_parse(tb, IFLA_MAX, head, len, ifla_policy); +} +EXPORT_SYMBOL(rtnl_nla_parse_ifla); +  struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])  {  	struct net *net; @@ -1254,11 +1387,29 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)  		}  		case IFLA_VF_TX_RATE: {  			struct ifla_vf_tx_rate *ivt; +			struct ifla_vf_info ivf; +			ivt = nla_data(vf); +			err = -EOPNOTSUPP; +			if (ops->ndo_get_vf_config) +				err = ops->ndo_get_vf_config(dev, ivt->vf, +							     &ivf); +			if (err) +				break; +			err = -EOPNOTSUPP; +			if (ops->ndo_set_vf_rate) +				err = ops->ndo_set_vf_rate(dev, ivt->vf, +							   ivf.min_tx_rate, +							   ivt->rate); +			break; +		} +		case IFLA_VF_RATE: { +			struct ifla_vf_rate *ivt;  			ivt = nla_data(vf);  			err = -EOPNOTSUPP; -			if (ops->ndo_set_vf_tx_rate) -				err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, -							      ivt->rate); +			if (ops->ndo_set_vf_rate) +				err = ops->ndo_set_vf_rate(dev, ivt->vf, +							   ivt->min_tx_rate, +							   ivt->max_tx_rate);  			break;  		}  		case IFLA_VF_SPOOFCHK: { @@ -1324,7 +1475,8 @@ static int do_set_master(struct net_device *dev, int ifindex)  	return 0;  } -static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, +static int do_setlink(const struct sk_buff *skb, +		      struct net_device *dev, struct ifinfomsg *ifm,  		      struct nlattr **tb, char *ifname, int modified)  {  	const struct net_device_ops *ops = dev->netdev_ops; @@ -1336,7 +1488,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,  			err = PTR_ERR(net);  			goto errout;  		} -		if 
(!ns_capable(net->user_ns, CAP_NET_ADMIN)) { +		if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {  			err = -EPERM;  			goto errout;  		} @@ -1590,7 +1742,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh)  	if (err < 0)  		goto errout; -	err = do_setlink(dev, ifm, tb, ifname, 0); +	err = do_setlink(skb, dev, ifm, tb, ifname, 0);  errout:  	return err;  } @@ -1630,7 +1782,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)  	ops->dellink(dev, &list_kill);  	unregister_netdevice_many(&list_kill); -	list_del(&list_kill);  	return 0;  } @@ -1647,9 +1798,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)  	}  	dev->rtnl_link_state = RTNL_LINK_INITIALIZED; -	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); -	__dev_notify_flags(dev, old_flags); +	__dev_notify_flags(dev, old_flags, ~0U);  	return 0;  }  EXPORT_SYMBOL(rtnl_configure_link); @@ -1708,7 +1858,8 @@ err:  }  EXPORT_SYMBOL(rtnl_create_link); -static int rtnl_group_changelink(struct net *net, int group, +static int rtnl_group_changelink(const struct sk_buff *skb, +		struct net *net, int group,  		struct ifinfomsg *ifm,  		struct nlattr **tb)  { @@ -1717,7 +1868,7 @@ static int rtnl_group_changelink(struct net *net, int group,  	for_each_netdev(net, dev) {  		if (dev->group == group) { -			err = do_setlink(dev, ifm, tb, NULL, 0); +			err = do_setlink(skb, dev, ifm, tb, NULL, 0);  			if (err < 0)  				return err;  		} @@ -1730,7 +1881,9 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh)  {  	struct net *net = sock_net(skb->sk);  	const struct rtnl_link_ops *ops; +	const struct rtnl_link_ops *m_ops = NULL;  	struct net_device *dev; +	struct net_device *master_dev = NULL;  	struct ifinfomsg *ifm;  	char kind[MODULE_NAME_LEN];  	char ifname[IFNAMSIZ]; @@ -1760,6 +1913,12 @@ replay:  			dev = NULL;  	} +	if (dev) { +		master_dev = netdev_master_upper_dev_get(dev); +		if (master_dev) +			m_ops = master_dev->rtnl_link_ops; +	} +  	err = validate_linkmsg(dev, tb);  	if (err < 0)  		return err; @@ -1781,7 +1940,10 @@ replay:  	}  	if (1) { -		struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL; +		struct nlattr *attr[ops ? ops->maxtype + 1 : 0]; +		struct nlattr *slave_attr[m_ops ? 
m_ops->slave_maxtype + 1 : 0]; +		struct nlattr **data = NULL; +		struct nlattr **slave_data = NULL;  		struct net *dest_net;  		if (ops) { @@ -1800,6 +1962,24 @@ replay:  			}  		} +		if (m_ops) { +			if (m_ops->slave_maxtype && +			    linkinfo[IFLA_INFO_SLAVE_DATA]) { +				err = nla_parse_nested(slave_attr, +						       m_ops->slave_maxtype, +						       linkinfo[IFLA_INFO_SLAVE_DATA], +						       m_ops->slave_policy); +				if (err < 0) +					return err; +				slave_data = slave_attr; +			} +			if (m_ops->slave_validate) { +				err = m_ops->slave_validate(tb, slave_data); +				if (err < 0) +					return err; +			} +		} +  		if (dev) {  			int modified = 0; @@ -1819,12 +1999,23 @@ replay:  				modified = 1;  			} -			return do_setlink(dev, ifm, tb, ifname, modified); +			if (linkinfo[IFLA_INFO_SLAVE_DATA]) { +				if (!m_ops || !m_ops->slave_changelink) +					return -EOPNOTSUPP; + +				err = m_ops->slave_changelink(master_dev, dev, +							      tb, slave_data); +				if (err < 0) +					return err; +				modified = 1; +			} + +			return do_setlink(skb, dev, ifm, tb, ifname, modified);  		}  		if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {  			if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) -				return rtnl_group_changelink(net, +				return rtnl_group_changelink(skb, net,  						nla_get_u32(tb[IFLA_GROUP]),  						ifm, tb);  			return -ENODEV; @@ -1862,16 +2053,25 @@ replay:  		dev->ifindex = ifm->ifi_index; -		if (ops->newlink) +		if (ops->newlink) {  			err = ops->newlink(net, dev, tb, data); -		else +			/* Drivers should call free_netdev() in ->destructor +			 * and unregister it on failure after registration +			 * so that device could be finally freed in rtnl_unlock. +			 */ +			if (err < 0) { +				/* If device is not registered at all, free it now */ +				if (dev->reg_state == NETREG_UNINITIALIZED) +					free_netdev(dev); +				goto out; +			} +		} else {  			err = register_netdevice(dev); - -		if (err < 0) { -			free_netdev(dev); -			goto out; +			if (err < 0) { +				free_netdev(dev); +				goto out; +			}  		} -  		err = rtnl_configure_link(dev, ifm);  		if (err < 0)  			unregister_netdevice(dev); @@ -1936,9 +2136,13 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)  	struct nlattr *tb[IFLA_MAX+1];  	u32 ext_filter_mask = 0;  	u16 min_ifinfo_dump_size = 0; +	int hdrlen; + +	/* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */ +	hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? 
+		 sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); -	if (nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, -			ifla_policy) >= 0) { +	if (nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) {  		if (tb[IFLA_EXT_MASK])  			ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);  	} @@ -1985,14 +2189,15 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)  	return skb->len;  } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change) +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, +		  gfp_t flags)  {  	struct net *net = dev_net(dev);  	struct sk_buff *skb;  	int err = -ENOBUFS;  	size_t if_info_size; -	skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), GFP_KERNEL); +	skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), flags);  	if (skb == NULL)  		goto errout; @@ -2003,7 +2208,7 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change)  		kfree_skb(skb);  		goto errout;  	} -	rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); +	rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);  	return;  errout:  	if (err < 0) @@ -2014,12 +2219,13 @@ EXPORT_SYMBOL(rtmsg_ifinfo);  static int nlmsg_populate_fdb_fill(struct sk_buff *skb,  				   struct net_device *dev,  				   u8 *addr, u32 pid, u32 seq, -				   int type, unsigned int flags) +				   int type, unsigned int flags, +				   int nlflags)  {  	struct nlmsghdr *nlh;  	struct ndmsg *ndm; -	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), NLM_F_MULTI); +	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags);  	if (!nlh)  		return -EMSGSIZE; @@ -2057,7 +2263,7 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, int type)  	if (!skb)  		goto errout; -	err = nlmsg_populate_fdb_fill(skb, dev, addr, 0, 0, type, NTF_SELF); +	err = nlmsg_populate_fdb_fill(skb, dev, addr, 0, 0, type, NTF_SELF, 0);  	if (err < 0) {  		kfree_skb(skb);  		goto errout; @@ -2204,7 +2410,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)  	int err = -EINVAL;  	__u8 *addr; -	if (!capable(CAP_NET_ADMIN)) +	if (!netlink_capable(skb, CAP_NET_ADMIN))  		return -EPERM;  	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); @@ -2282,7 +2488,8 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,  		err = nlmsg_populate_fdb_fill(skb, dev, ha->addr,  					      portid, seq, -					      RTM_NEWNEIGH, NTF_SELF); +					      RTM_NEWNEIGH, NTF_SELF, +					      NLM_F_MULTI);  		if (err < 0)  			return err;  skip: @@ -2655,7 +2862,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  	sz_idx = type>>2;  	kind = type&3; -	if (kind != 2 && !ns_capable(net->user_ns, CAP_NET_ADMIN)) +	if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))  		return -EPERM;  	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { @@ -2717,7 +2924,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi  	case NETDEV_JOIN:  		break;  	default: -		rtmsg_ifinfo(RTM_NEWLINK, dev, 0); +		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);  		break;  	}  	return NOTIFY_DONE; diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 6a2f13cee86..ba71212f025 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -7,15 +7,20 @@  #include <linux/hrtimer.h>  #include <linux/ktime.h>  #include <linux/string.h> +#include <linux/net.h>  #include <net/secure_seq.h> -static u32 net_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; +#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET) +#define NET_SECRET_SIZE 
(MD5_MESSAGE_BYTES / 4) -void net_secret_init(void) +static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; + +static __always_inline void net_secret_init(void)  { -	get_random_bytes(net_secret, sizeof(net_secret)); +	net_get_random_once(net_secret, sizeof(net_secret));  } +#endif  #ifdef CONFIG_INET  static u32 seq_scale(u32 seq) @@ -42,6 +47,7 @@ __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,  	u32 hash[MD5_DIGEST_WORDS];  	u32 i; +	net_secret_init();  	memcpy(hash, saddr, 16);  	for (i = 0; i < 4; i++)  		secret[i] = net_secret[i] + (__force u32)daddr[i]; @@ -63,6 +69,7 @@ u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,  	u32 hash[MD5_DIGEST_WORDS];  	u32 i; +	net_secret_init();  	memcpy(hash, saddr, 16);  	for (i = 0; i < 4; i++)  		secret[i] = net_secret[i] + (__force u32) daddr[i]; @@ -78,35 +85,13 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);  #endif  #ifdef CONFIG_INET -__u32 secure_ip_id(__be32 daddr) -{ -	u32 hash[MD5_DIGEST_WORDS]; - -	hash[0] = (__force __u32) daddr; -	hash[1] = net_secret[13]; -	hash[2] = net_secret[14]; -	hash[3] = net_secret[15]; - -	md5_transform(hash, net_secret); - -	return hash[0]; -} - -__u32 secure_ipv6_id(const __be32 daddr[4]) -{ -	__u32 hash[4]; - -	memcpy(hash, daddr, 16); -	md5_transform(hash, net_secret); - -	return hash[0]; -}  __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,  				 __be16 sport, __be16 dport)  {  	u32 hash[MD5_DIGEST_WORDS]; +	net_secret_init();  	hash[0] = (__force u32)saddr;  	hash[1] = (__force u32)daddr;  	hash[2] = ((__force u16)sport << 16) + (__force u16)dport; @@ -121,6 +106,7 @@ u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)  {  	u32 hash[MD5_DIGEST_WORDS]; +	net_secret_init();  	hash[0] = (__force u32)saddr;  	hash[1] = (__force u32)daddr;  	hash[2] = (__force u32)dport ^ net_secret[14]; @@ -140,6 +126,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,  	u32 hash[MD5_DIGEST_WORDS];  	u64 seq; +	net_secret_init();  	hash[0] = (__force u32)saddr;  	hash[1] = (__force u32)daddr;  	hash[2] = ((__force u16)sport << 16) + (__force u16)dport; @@ -164,6 +151,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,  	u64 seq;  	u32 i; +	net_secret_init();  	memcpy(hash, saddr, 16);  	for (i = 0; i < 4; i++)  		secret[i] = net_secret[i] + daddr[i]; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d81cff119f7..c1a33033cbe 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -47,6 +47,8 @@  #include <linux/in.h>  #include <linux/inet.h>  #include <linux/slab.h> +#include <linux/tcp.h> +#include <linux/udp.h>  #include <linux/netdevice.h>  #ifdef CONFIG_NET_CLS_ACT  #include <net/pkt_sched.h> @@ -65,6 +67,7 @@  #include <net/dst.h>  #include <net/sock.h>  #include <net/checksum.h> +#include <net/ip6_checksum.h>  #include <net/xfrm.h>  #include <asm/uaccess.h> @@ -74,36 +77,6 @@  struct kmem_cache *skbuff_head_cache __read_mostly;  static struct kmem_cache *skbuff_fclone_cache __read_mostly; -static void sock_pipe_buf_release(struct pipe_inode_info *pipe, -				  struct pipe_buffer *buf) -{ -	put_page(buf->page); -} - -static void sock_pipe_buf_get(struct pipe_inode_info *pipe, -				struct pipe_buffer *buf) -{ -	get_page(buf->page); -} - -static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, -			       struct pipe_buffer *buf) -{ -	return 1; -} - - -/* Pipe buffer operations for a socket. 
*/ -static const struct pipe_buf_operations sock_pipe_buf_ops = { -	.can_merge = 0, -	.map = generic_pipe_buf_map, -	.unmap = generic_pipe_buf_unmap, -	.confirm = generic_pipe_buf_confirm, -	.release = sock_pipe_buf_release, -	.steal = sock_pipe_buf_steal, -	.get = sock_pipe_buf_get, -}; -  /**   *	skb_panic - private function for out-of-line support   *	@skb:	buffer @@ -476,6 +449,18 @@ void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,  }  EXPORT_SYMBOL(skb_add_rx_frag); +void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, +			  unsigned int truesize) +{ +	skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + +	skb_frag_size_add(frag, size); +	skb->len += size; +	skb->data_len += size; +	skb->truesize += truesize; +} +EXPORT_SYMBOL(skb_coalesce_rx_frag); +  static void skb_drop_list(struct sk_buff **listp)  {  	kfree_skb_list(*listp); @@ -580,9 +565,6 @@ static void skb_release_head_state(struct sk_buff *skb)  #if IS_ENABLED(CONFIG_NF_CONNTRACK)  	nf_conntrack_put(skb->nfct);  #endif -#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED -	nf_conntrack_put_reasm(skb->nfct_reasm); -#endif  #ifdef CONFIG_BRIDGE_NETFILTER  	nf_bridge_put(skb->nf_bridge);  #endif @@ -703,17 +685,19 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)  	new->inner_network_header = old->inner_network_header;  	new->inner_mac_header = old->inner_mac_header;  	skb_dst_copy(new, old); -	new->rxhash		= old->rxhash; +	skb_copy_hash(new, old);  	new->ooo_okay		= old->ooo_okay; -	new->l4_rxhash		= old->l4_rxhash;  	new->no_fcs		= old->no_fcs;  	new->encapsulation	= old->encapsulation; +	new->encap_hdr_csum	= old->encap_hdr_csum; +	new->csum_valid		= old->csum_valid; +	new->csum_complete_sw	= old->csum_complete_sw;  #ifdef CONFIG_XFRM  	new->sp			= secpath_get(old->sp);  #endif  	memcpy(new->cb, old->cb, sizeof(old->cb));  	new->csum		= old->csum; -	new->local_df		= old->local_df; +	new->ignore_df		= old->ignore_df;  	new->pkt_type		= old->pkt_type;  	new->ip_summed		= old->ip_summed;  	skb_copy_queue_mapping(new, old); @@ -726,9 +710,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)  	new->mark		= old->mark;  	new->skb_iif		= old->skb_iif;  	__nf_copy(new, old); -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) -	new->nf_trace		= old->nf_trace; -#endif  #ifdef CONFIG_NET_SCHED  	new->tc_index		= old->tc_index;  #ifdef CONFIG_NET_CLS_ACT @@ -903,6 +884,9 @@ EXPORT_SYMBOL(skb_clone);  static void skb_headers_offset_update(struct sk_buff *skb, int off)  { +	/* Only adjust this if it actually is csum_start rather than csum */ +	if (skb->ip_summed == CHECKSUM_PARTIAL) +		skb->csum_start += off;  	/* {transport,network,mac}_header and tail are relative to skb->head */  	skb->transport_header += off;  	skb->network_header   += off; @@ -970,10 +954,13 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)  EXPORT_SYMBOL(skb_copy);  /** - *	__pskb_copy	-	create copy of an sk_buff with private head. + *	__pskb_copy_fclone	-  create copy of an sk_buff with private head.   *	@skb: buffer to copy   *	@headroom: headroom of new skb   *	@gfp_mask: allocation priority + *	@fclone: if true allocate the copy of the skb from the fclone + *	cache instead of the head cache; it is recommended to set this + *	to true for the cases where the copy will likely be cloned   *   *	Make a copy of both an &sk_buff and part of its data, located   *	in header. Fragmented data remain shared. 
This is used when @@ -983,11 +970,12 @@ EXPORT_SYMBOL(skb_copy);   *	The returned buffer has a reference count of 1.   */ -struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) +struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, +				   gfp_t gfp_mask, bool fclone)  {  	unsigned int size = skb_headlen(skb) + headroom; -	struct sk_buff *n = __alloc_skb(size, gfp_mask, -					skb_alloc_rx_flag(skb), NUMA_NO_NODE); +	int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); +	struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);  	if (!n)  		goto out; @@ -1027,7 +1015,7 @@ struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)  out:  	return n;  } -EXPORT_SYMBOL(__pskb_copy); +EXPORT_SYMBOL(__pskb_copy_fclone);  /**   *	pskb_expand_head - reallocate header of &sk_buff @@ -1036,8 +1024,8 @@ EXPORT_SYMBOL(__pskb_copy);   *	@ntail: room to add at tail   *	@gfp_mask: allocation priority   * - *	Expands (or creates identical copy, if &nhead and &ntail are zero) - *	header of skb. &sk_buff itself is not changed. &sk_buff MUST have + *	Expands (or creates identical copy, if @nhead and @ntail are zero) + *	header of @skb. &sk_buff itself is not changed. &sk_buff MUST have   *	reference count of 1. Returns zero in the case of success or error,   *	if expansion failed. In the last case, &sk_buff is not changed.   * @@ -1109,9 +1097,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,  #endif  	skb->tail	      += off;  	skb_headers_offset_update(skb, nhead); -	/* Only adjust this if it actually is csum_start rather than csum */ -	if (skb->ip_summed == CHECKSUM_PARTIAL) -		skb->csum_start += nhead;  	skb->cloned   = 0;  	skb->hdr_len  = 0;  	skb->nohdr    = 0; @@ -1176,7 +1161,6 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,  					NUMA_NO_NODE);  	int oldheadroom = skb_headroom(skb);  	int head_copy_len, head_copy_off; -	int off;  	if (!n)  		return NULL; @@ -1200,11 +1184,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,  	copy_skb_header(n, skb); -	off                  = newheadroom - oldheadroom; -	if (n->ip_summed == CHECKSUM_PARTIAL) -		n->csum_start += off; - -	skb_headers_offset_update(n, off); +	skb_headers_offset_update(n, newheadroom - oldheadroom);  	return n;  } @@ -1257,6 +1237,29 @@ free_skb:  EXPORT_SYMBOL(skb_pad);  /** + *	pskb_put - add data to the tail of a potentially fragmented buffer + *	@skb: start of the buffer to use + *	@tail: tail fragment of the buffer to use + *	@len: amount of data to add + * + *	This function extends the used data area of the potentially + *	fragmented buffer. @tail must be the last fragment of @skb -- or + *	@skb itself. If this would exceed the total buffer size the kernel + *	will panic. A pointer to the first byte of the extra data is + *	returned. 
+ */ + +unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) +{ +	if (tail != skb) { +		skb->data_len += len; +		skb->len += len; +	} +	return skb_put(tail, len); +} +EXPORT_SYMBOL_GPL(pskb_put); + +/**   *	skb_put - add data to a buffer   *	@skb: buffer to use   *	@len: amount of data to add @@ -1803,7 +1806,7 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,  		.partial = partial,  		.nr_pages_max = MAX_SKB_FRAGS,  		.flags = flags, -		.ops = &sock_pipe_buf_ops, +		.ops = &nosteal_pipe_buf_ops,  		.spd_release = sock_spd_release,  	};  	struct sk_buff *frag_iter; @@ -1933,9 +1936,8 @@ fault:  EXPORT_SYMBOL(skb_store_bits);  /* Checksum skb data. */ - -__wsum skb_checksum(const struct sk_buff *skb, int offset, -			  int len, __wsum csum) +__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, +		      __wsum csum, const struct skb_checksum_ops *ops)  {  	int start = skb_headlen(skb);  	int i, copy = start - offset; @@ -1946,7 +1948,7 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,  	if (copy > 0) {  		if (copy > len)  			copy = len; -		csum = csum_partial(skb->data + offset, copy, csum); +		csum = ops->update(skb->data + offset, copy, csum);  		if ((len -= copy) == 0)  			return csum;  		offset += copy; @@ -1967,10 +1969,10 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,  			if (copy > len)  				copy = len;  			vaddr = kmap_atomic(skb_frag_page(frag)); -			csum2 = csum_partial(vaddr + frag->page_offset + -					     offset - start, copy, 0); +			csum2 = ops->update(vaddr + frag->page_offset + +					    offset - start, copy, 0);  			kunmap_atomic(vaddr); -			csum = csum_block_add(csum, csum2, pos); +			csum = ops->combine(csum, csum2, pos, copy);  			if (!(len -= copy))  				return csum;  			offset += copy; @@ -1989,9 +1991,9 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,  			__wsum csum2;  			if (copy > len)  				copy = len; -			csum2 = skb_checksum(frag_iter, offset - start, -					     copy, 0); -			csum = csum_block_add(csum, csum2, pos); +			csum2 = __skb_checksum(frag_iter, offset - start, +					       copy, 0, ops); +			csum = ops->combine(csum, csum2, pos, copy);  			if ((len -= copy) == 0)  				return csum;  			offset += copy; @@ -2003,6 +2005,18 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,  	return csum;  } +EXPORT_SYMBOL(__skb_checksum); + +__wsum skb_checksum(const struct sk_buff *skb, int offset, +		    int len, __wsum csum) +{ +	const struct skb_checksum_ops ops = { +		.update  = csum_partial_ext, +		.combine = csum_block_add_ext, +	}; + +	return __skb_checksum(skb, offset, len, csum, &ops); +}  EXPORT_SYMBOL(skb_checksum);  /* Both of above in one bottle. */ @@ -2084,6 +2098,104 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,  }  EXPORT_SYMBOL(skb_copy_and_csum_bits); + /** + *	skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() + *	@from: source buffer + * + *	Calculates the amount of linear headroom needed in the 'to' skb passed + *	into skb_zerocopy(). 
+ */ +unsigned int +skb_zerocopy_headlen(const struct sk_buff *from) +{ +	unsigned int hlen = 0; + +	if (!from->head_frag || +	    skb_headlen(from) < L1_CACHE_BYTES || +	    skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) +		hlen = skb_headlen(from); + +	if (skb_has_frag_list(from)) +		hlen = from->len; + +	return hlen; +} +EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); + +/** + *	skb_zerocopy - Zero copy skb to skb + *	@to: destination buffer + *	@from: source buffer + *	@len: number of bytes to copy from source buffer + *	@hlen: size of linear headroom in destination buffer + * + *	Copies up to `len` bytes from `from` to `to` by creating references + *	to the frags in the source buffer. + * + *	The `hlen` as calculated by skb_zerocopy_headlen() specifies the + *	headroom in the `to` buffer. + * + *	Return value: + *	0: everything is OK + *	-ENOMEM: couldn't orphan frags of @from due to lack of memory + *	-EFAULT: skb_copy_bits() found some problem with skb geometry + */ +int +skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) +{ +	int i, j = 0; +	int plen = 0; /* length of skb->head fragment */ +	int ret; +	struct page *page; +	unsigned int offset; + +	BUG_ON(!from->head_frag && !hlen); + +	/* dont bother with small payloads */ +	if (len <= skb_tailroom(to)) +		return skb_copy_bits(from, 0, skb_put(to, len), len); + +	if (hlen) { +		ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); +		if (unlikely(ret)) +			return ret; +		len -= hlen; +	} else { +		plen = min_t(int, skb_headlen(from), len); +		if (plen) { +			page = virt_to_head_page(from->head); +			offset = from->data - (unsigned char *)page_address(page); +			__skb_fill_page_desc(to, 0, page, offset, plen); +			get_page(page); +			j = 1; +			len -= plen; +		} +	} + +	to->truesize += len + plen; +	to->len += len + plen; +	to->data_len += len + plen; + +	if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { +		skb_tx_error(from); +		return -ENOMEM; +	} + +	for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { +		if (!len) +			break; +		skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; +		skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len); +		len -= skb_shinfo(to)->frags[j].size; +		skb_frag_ref(to, j); +		j++; +	} +	skb_shinfo(to)->nr_frags = j; + +	return 0; +} +EXPORT_SYMBOL_GPL(skb_zerocopy); +  void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)  {  	__wsum csum; @@ -2522,14 +2634,14 @@ EXPORT_SYMBOL(skb_prepare_seq_read);   * @data: destination pointer for data to be returned   * @st: state variable   * - * Reads a block of skb data at &consumed relative to the + * Reads a block of skb data at @consumed relative to the   * lower offset specified to skb_prepare_seq_read(). Assigns - * the head of the data block to &data and returns the length + * the head of the data block to @data and returns the length   * of the block or 0 if the end of the skb data or the upper   * offset has been reached.   *   * The caller is not required to consume all of the data - * returned, i.e. &consumed is typically set to the number + * returned, i.e. @consumed is typically set to the number   * of bytes already consumed and the next call to   * skb_seq_read() will return the remaining part of the block.   * @@ -2746,67 +2858,96 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum);  /**   *	skb_segment - Perform protocol segmentation on skb. 
- *	@skb: buffer to segment + *	@head_skb: buffer to segment   *	@features: features for the output path (see dev->features)   *   *	This function performs segmentation on the given skb.  It returns   *	a pointer to the first in a list of new skbs for the segments.   *	In case of error it returns ERR_PTR(err).   */ -struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) +struct sk_buff *skb_segment(struct sk_buff *head_skb, +			    netdev_features_t features)  {  	struct sk_buff *segs = NULL;  	struct sk_buff *tail = NULL; -	struct sk_buff *fskb = skb_shinfo(skb)->frag_list; -	unsigned int mss = skb_shinfo(skb)->gso_size; -	unsigned int doffset = skb->data - skb_mac_header(skb); +	struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; +	skb_frag_t *frag = skb_shinfo(head_skb)->frags; +	unsigned int mss = skb_shinfo(head_skb)->gso_size; +	unsigned int doffset = head_skb->data - skb_mac_header(head_skb); +	struct sk_buff *frag_skb = head_skb;  	unsigned int offset = doffset; -	unsigned int tnl_hlen = skb_tnl_header_len(skb); +	unsigned int tnl_hlen = skb_tnl_header_len(head_skb);  	unsigned int headroom;  	unsigned int len;  	__be16 proto;  	bool csum;  	int sg = !!(features & NETIF_F_SG); -	int nfrags = skb_shinfo(skb)->nr_frags; +	int nfrags = skb_shinfo(head_skb)->nr_frags;  	int err = -ENOMEM;  	int i = 0;  	int pos; +	int dummy; -	proto = skb_network_protocol(skb); +	__skb_push(head_skb, doffset); +	proto = skb_network_protocol(head_skb, &dummy);  	if (unlikely(!proto))  		return ERR_PTR(-EINVAL); -	csum = !!can_checksum_protocol(features, proto); -	__skb_push(skb, doffset); -	headroom = skb_headroom(skb); -	pos = skb_headlen(skb); +	csum = !head_skb->encap_hdr_csum && +	    !!can_checksum_protocol(features, proto); + +	headroom = skb_headroom(head_skb); +	pos = skb_headlen(head_skb);  	do {  		struct sk_buff *nskb; -		skb_frag_t *frag; +		skb_frag_t *nskb_frag;  		int hsize;  		int size; -		len = skb->len - offset; +		len = head_skb->len - offset;  		if (len > mss)  			len = mss; -		hsize = skb_headlen(skb) - offset; +		hsize = skb_headlen(head_skb) - offset;  		if (hsize < 0)  			hsize = 0;  		if (hsize > len || !sg)  			hsize = len; -		if (!hsize && i >= nfrags) { -			BUG_ON(fskb->len != len); +		if (!hsize && i >= nfrags && skb_headlen(list_skb) && +		    (skb_headlen(list_skb) == len || sg)) { +			BUG_ON(skb_headlen(list_skb) > len); + +			i = 0; +			nfrags = skb_shinfo(list_skb)->nr_frags; +			frag = skb_shinfo(list_skb)->frags; +			frag_skb = list_skb; +			pos += skb_headlen(list_skb); + +			while (pos < offset + len) { +				BUG_ON(i >= nfrags); -			pos += len; -			nskb = skb_clone(fskb, GFP_ATOMIC); -			fskb = fskb->next; +				size = skb_frag_size(frag); +				if (pos + size > offset + len) +					break; + +				i++; +				pos += size; +				frag++; +			} + +			nskb = skb_clone(list_skb, GFP_ATOMIC); +			list_skb = list_skb->next;  			if (unlikely(!nskb))  				goto err; +			if (unlikely(pskb_trim(nskb, len))) { +				kfree_skb(nskb); +				goto err; +			} +  			hsize = skb_end_offset(nskb);  			if (skb_cow_head(nskb, doffset + headroom)) {  				kfree_skb(nskb); @@ -2818,7 +2959,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)  			__skb_push(nskb, doffset);  		} else {  			nskb = __alloc_skb(hsize + doffset + headroom, -					   GFP_ATOMIC, skb_alloc_rx_flag(skb), +					   GFP_ATOMIC, skb_alloc_rx_flag(head_skb),  					   NUMA_NO_NODE);  			if (unlikely(!nskb)) @@ -2834,80 +2975,82 @@ struct sk_buff *skb_segment(struct sk_buff 
*skb, netdev_features_t features)  			segs = nskb;  		tail = nskb; -		__copy_skb_header(nskb, skb); -		nskb->mac_len = skb->mac_len; +		__copy_skb_header(nskb, head_skb); +		nskb->mac_len = head_skb->mac_len; -		/* nskb and skb might have different headroom */ -		if (nskb->ip_summed == CHECKSUM_PARTIAL) -			nskb->csum_start += skb_headroom(nskb) - headroom; +		skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); -		skb_reset_mac_header(nskb); -		skb_set_network_header(nskb, skb->mac_len); -		nskb->transport_header = (nskb->network_header + -					  skb_network_header_len(skb)); - -		skb_copy_from_linear_data_offset(skb, -tnl_hlen, +		skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,  						 nskb->data - tnl_hlen,  						 doffset + tnl_hlen); -		if (fskb != skb_shinfo(skb)->frag_list) +		if (nskb->len == len + doffset)  			goto perform_csum_check;  		if (!sg) {  			nskb->ip_summed = CHECKSUM_NONE; -			nskb->csum = skb_copy_and_csum_bits(skb, offset, +			nskb->csum = skb_copy_and_csum_bits(head_skb, offset,  							    skb_put(nskb, len),  							    len, 0); +			SKB_GSO_CB(nskb)->csum_start = +			    skb_headroom(nskb) + doffset;  			continue;  		} -		frag = skb_shinfo(nskb)->frags; +		nskb_frag = skb_shinfo(nskb)->frags; -		skb_copy_from_linear_data_offset(skb, offset, +		skb_copy_from_linear_data_offset(head_skb, offset,  						 skb_put(nskb, hsize), hsize); -		skb_shinfo(nskb)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; +		skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags & +			SKBTX_SHARED_FRAG; + +		while (pos < offset + len) { +			if (i >= nfrags) { +				BUG_ON(skb_headlen(list_skb)); -		while (pos < offset + len && i < nfrags) { -			*frag = skb_shinfo(skb)->frags[i]; -			__skb_frag_ref(frag); -			size = skb_frag_size(frag); +				i = 0; +				nfrags = skb_shinfo(list_skb)->nr_frags; +				frag = skb_shinfo(list_skb)->frags; +				frag_skb = list_skb; + +				BUG_ON(!nfrags); + +				list_skb = list_skb->next; +			} + +			if (unlikely(skb_shinfo(nskb)->nr_frags >= +				     MAX_SKB_FRAGS)) { +				net_warn_ratelimited( +					"skb_segment: too many frags: %u %u\n", +					pos, mss); +				goto err; +			} + +			if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC))) +				goto err; + +			*nskb_frag = *frag; +			__skb_frag_ref(nskb_frag); +			size = skb_frag_size(nskb_frag);  			if (pos < offset) { -				frag->page_offset += offset - pos; -				skb_frag_size_sub(frag, offset - pos); +				nskb_frag->page_offset += offset - pos; +				skb_frag_size_sub(nskb_frag, offset - pos);  			}  			skb_shinfo(nskb)->nr_frags++;  			if (pos + size <= offset + len) {  				i++; +				frag++;  				pos += size;  			} else { -				skb_frag_size_sub(frag, pos + size - (offset + len)); +				skb_frag_size_sub(nskb_frag, pos + size - (offset + len));  				goto skip_fraglist;  			} -			frag++; -		} - -		if (pos < offset + len) { -			struct sk_buff *fskb2 = fskb; - -			BUG_ON(pos + fskb->len != offset + len); - -			pos += fskb->len; -			fskb = fskb->next; - -			if (fskb2->next) { -				fskb2 = skb_clone(fskb2, GFP_ATOMIC); -				if (!fskb2) -					goto err; -			} else -				skb_get(fskb2); - -			SKB_FRAG_ASSERT(nskb); -			skb_shinfo(nskb)->frag_list = fskb2; +			nskb_frag++;  		}  skip_fraglist: @@ -2920,48 +3063,45 @@ perform_csum_check:  			nskb->csum = skb_checksum(nskb, doffset,  						  nskb->len - doffset, 0);  			nskb->ip_summed = CHECKSUM_NONE; +			SKB_GSO_CB(nskb)->csum_start = +			    skb_headroom(nskb) + doffset;  		} -	} while ((offset += len) < skb->len); +	} while ((offset += len) 
< head_skb->len);  	return segs;  err: -	while ((skb = segs)) { -		segs = skb->next; -		kfree_skb(skb); -	} +	kfree_skb_list(segs);  	return ERR_PTR(err);  }  EXPORT_SYMBOL_GPL(skb_segment);  int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)  { -	struct sk_buff *p = *head; -	struct sk_buff *nskb; -	struct skb_shared_info *skbinfo = skb_shinfo(skb); -	struct skb_shared_info *pinfo = skb_shinfo(p); -	unsigned int headroom; -	unsigned int len = skb_gro_len(skb); +	struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);  	unsigned int offset = skb_gro_offset(skb);  	unsigned int headlen = skb_headlen(skb); +	struct sk_buff *nskb, *lp, *p = *head; +	unsigned int len = skb_gro_len(skb);  	unsigned int delta_truesize; +	unsigned int headroom; -	if (p->len + len >= 65536) +	if (unlikely(p->len + len >= 65536))  		return -E2BIG; -	if (pinfo->frag_list) -		goto merge; -	else if (headlen <= offset) { +	lp = NAPI_GRO_CB(p)->last; +	pinfo = skb_shinfo(lp); + +	if (headlen <= offset) {  		skb_frag_t *frag;  		skb_frag_t *frag2;  		int i = skbinfo->nr_frags;  		int nr_frags = pinfo->nr_frags + i; -		offset -= headlen; -  		if (nr_frags > MAX_SKB_FRAGS) -			return -E2BIG; +			goto merge; +		offset -= headlen;  		pinfo->nr_frags = nr_frags;  		skbinfo->nr_frags = 0; @@ -2992,7 +3132,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)  		unsigned int first_offset;  		if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) -			return -E2BIG; +			goto merge;  		first_offset = skb->data -  			       (unsigned char *)page_address(page) + @@ -3010,7 +3150,10 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)  		delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));  		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;  		goto done; -	} else if (skb_gro_len(p) != pinfo->gso_size) +	} +	if (pinfo->frag_list) +		goto merge; +	if (skb_gro_len(p) != pinfo->gso_size)  		return -E2BIG;  	headroom = skb_headroom(p); @@ -3062,16 +3205,24 @@ merge:  	__skb_pull(skb, offset); -	NAPI_GRO_CB(p)->last->next = skb; +	if (NAPI_GRO_CB(p)->last == p) +		skb_shinfo(p)->frag_list = skb; +	else +		NAPI_GRO_CB(p)->last->next = skb;  	NAPI_GRO_CB(p)->last = skb;  	skb_header_release(skb); +	lp = p;  done:  	NAPI_GRO_CB(p)->count++;  	p->data_len += len;  	p->truesize += delta_truesize;  	p->len += len; - +	if (lp != p) { +		lp->data_len += len; +		lp->truesize += delta_truesize; +		lp->len += len; +	}  	NAPI_GRO_CB(skb)->same_flow = 1;  	return 0;  } @@ -3162,6 +3313,32 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)  	return elt;  } +/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given + * sglist without mark the sg which contain last skb data as the end. + * So the caller can mannipulate sg list as will when padding new data after + * the first call without calling sg_unmark_end to expend sg list. + * + * Scenario to use skb_to_sgvec_nomark: + * 1. sg_init_table + * 2. skb_to_sgvec_nomark(payload1) + * 3. skb_to_sgvec_nomark(payload2) + * + * This is equivalent to: + * 1. sg_init_table + * 2. skb_to_sgvec(payload1) + * 3. sg_unmark_end + * 4. skb_to_sgvec(payload2) + * + * When mapping mutilple payload conditionally, skb_to_sgvec_nomark + * is more preferable. 
+ */ +int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, +			int offset, int len) +{ +	return __skb_to_sgvec(skb, sg, offset, len); +} +EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); +  int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)  {  	int nsg = __skb_to_sgvec(skb, sg, offset, len); @@ -3294,8 +3471,6 @@ static void sock_rmem_free(struct sk_buff *skb)   */  int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)  { -	int len = skb->len; -  	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=  	    (unsigned int)sk->sk_rcvbuf)  		return -ENOMEM; @@ -3310,7 +3485,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)  	skb_queue_tail(&sk->sk_error_queue, skb);  	if (!sock_flag(sk, SOCK_DEAD)) -		sk->sk_data_ready(sk, len); +		sk->sk_data_ready(sk);  	return 0;  }  EXPORT_SYMBOL(sock_queue_err_skb); @@ -3403,6 +3578,238 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)  }  EXPORT_SYMBOL_GPL(skb_partial_csum_set); +static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, +			       unsigned int max) +{ +	if (skb_headlen(skb) >= len) +		return 0; + +	/* If we need to pullup then pullup to the max, so we +	 * won't need to do it again. +	 */ +	if (max > skb->len) +		max = skb->len; + +	if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) +		return -ENOMEM; + +	if (skb_headlen(skb) < len) +		return -EPROTO; + +	return 0; +} + +#define MAX_TCP_HDR_LEN (15 * 4) + +static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, +				      typeof(IPPROTO_IP) proto, +				      unsigned int off) +{ +	switch (proto) { +		int err; + +	case IPPROTO_TCP: +		err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), +					  off + MAX_TCP_HDR_LEN); +		if (!err && !skb_partial_csum_set(skb, off, +						  offsetof(struct tcphdr, +							   check))) +			err = -EPROTO; +		return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; + +	case IPPROTO_UDP: +		err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), +					  off + sizeof(struct udphdr)); +		if (!err && !skb_partial_csum_set(skb, off, +						  offsetof(struct udphdr, +							   check))) +			err = -EPROTO; +		return err ? ERR_PTR(err) : &udp_hdr(skb)->check; +	} + +	return ERR_PTR(-EPROTO); +} + +/* This value should be large enough to cover a tagged ethernet header plus + * maximally sized IP and TCP or UDP headers. + */ +#define MAX_IP_HDR_LEN 128 + +static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) +{ +	unsigned int off; +	bool fragment; +	__sum16 *csum; +	int err; + +	fragment = false; + +	err = skb_maybe_pull_tail(skb, +				  sizeof(struct iphdr), +				  MAX_IP_HDR_LEN); +	if (err < 0) +		goto out; + +	if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF)) +		fragment = true; + +	off = ip_hdrlen(skb); + +	err = -EPROTO; + +	if (fragment) +		goto out; + +	csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); +	if (IS_ERR(csum)) +		return PTR_ERR(csum); + +	if (recalculate) +		*csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, +					   ip_hdr(skb)->daddr, +					   skb->len - off, +					   ip_hdr(skb)->protocol, 0); +	err = 0; + +out: +	return err; +} + +/* This value should be large enough to cover a tagged ethernet header plus + * an IPv6 header, all options, and a maximal TCP or UDP header. 
+ */ +#define MAX_IPV6_HDR_LEN 256 + +#define OPT_HDR(type, skb, off) \ +	(type *)(skb_network_header(skb) + (off)) + +static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) +{ +	int err; +	u8 nexthdr; +	unsigned int off; +	unsigned int len; +	bool fragment; +	bool done; +	__sum16 *csum; + +	fragment = false; +	done = false; + +	off = sizeof(struct ipv6hdr); + +	err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); +	if (err < 0) +		goto out; + +	nexthdr = ipv6_hdr(skb)->nexthdr; + +	len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); +	while (off <= len && !done) { +		switch (nexthdr) { +		case IPPROTO_DSTOPTS: +		case IPPROTO_HOPOPTS: +		case IPPROTO_ROUTING: { +			struct ipv6_opt_hdr *hp; + +			err = skb_maybe_pull_tail(skb, +						  off + +						  sizeof(struct ipv6_opt_hdr), +						  MAX_IPV6_HDR_LEN); +			if (err < 0) +				goto out; + +			hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); +			nexthdr = hp->nexthdr; +			off += ipv6_optlen(hp); +			break; +		} +		case IPPROTO_AH: { +			struct ip_auth_hdr *hp; + +			err = skb_maybe_pull_tail(skb, +						  off + +						  sizeof(struct ip_auth_hdr), +						  MAX_IPV6_HDR_LEN); +			if (err < 0) +				goto out; + +			hp = OPT_HDR(struct ip_auth_hdr, skb, off); +			nexthdr = hp->nexthdr; +			off += ipv6_authlen(hp); +			break; +		} +		case IPPROTO_FRAGMENT: { +			struct frag_hdr *hp; + +			err = skb_maybe_pull_tail(skb, +						  off + +						  sizeof(struct frag_hdr), +						  MAX_IPV6_HDR_LEN); +			if (err < 0) +				goto out; + +			hp = OPT_HDR(struct frag_hdr, skb, off); + +			if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) +				fragment = true; + +			nexthdr = hp->nexthdr; +			off += sizeof(struct frag_hdr); +			break; +		} +		default: +			done = true; +			break; +		} +	} + +	err = -EPROTO; + +	if (!done || fragment) +		goto out; + +	csum = skb_checksum_setup_ip(skb, nexthdr, off); +	if (IS_ERR(csum)) +		return PTR_ERR(csum); + +	if (recalculate) +		*csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, +					 &ipv6_hdr(skb)->daddr, +					 skb->len - off, nexthdr, 0); +	err = 0; + +out: +	return err; +} + +/** + * skb_checksum_setup - set up partial checksum offset + * @skb: the skb to set up + * @recalculate: if true the pseudo-header checksum will be recalculated + */ +int skb_checksum_setup(struct sk_buff *skb, bool recalculate) +{ +	int err; + +	switch (skb->protocol) { +	case htons(ETH_P_IP): +		err = skb_checksum_setup_ipv4(skb, recalculate); +		break; + +	case htons(ETH_P_IPV6): +		err = skb_checksum_setup_ipv6(skb, recalculate); +		break; + +	default: +		err = -EPROTO; +		break; +	} + +	return err; +} +EXPORT_SYMBOL(skb_checksum_setup); +  void __skb_warn_lro_forwarding(const struct sk_buff *skb)  {  	net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", @@ -3519,6 +3926,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)  	skb->tstamp.tv64 = 0;  	skb->pkt_type = PACKET_HOST;  	skb->skb_iif = 0; +	skb->ignore_df = 0;  	skb_dst_drop(skb);  	skb->mark = 0;  	secpath_reset(skb); @@ -3526,3 +3934,28 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)  	nf_reset_trace(skb);  }  EXPORT_SYMBOL_GPL(skb_scrub_packet); + +/** + * skb_gso_transport_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_transport_seglen is used to determine the real size of the + * individual segments, including Layer4 headers (TCP/UDP). + * + * The MAC/L2 or network (IP, IPv6) headers are not accounted for. 
+ */ +unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) +{ +	const struct skb_shared_info *shinfo = skb_shinfo(skb); + +	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) +		return tcp_hdrlen(skb) + shinfo->gso_size; + +	/* UFO sets gso_size to the size of the fragmentation +	 * payload, i.e. the size of the L4 (UDP) header is already +	 * accounted for. +	 */ +	return shinfo->gso_size; +} +EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); diff --git a/net/core/sock.c b/net/core/sock.c index 5b6beba494a..026e01f7027 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -145,6 +145,55 @@  static DEFINE_MUTEX(proto_list_mutex);  static LIST_HEAD(proto_list); +/** + * sk_ns_capable - General socket capability test + * @sk: Socket to use a capability on or through + * @user_ns: The user namespace of the capability to use + * @cap: The capability to use + * + * Test to see if the opener of the socket had when the socket was + * created and the current process has the capability @cap in the user + * namespace @user_ns. + */ +bool sk_ns_capable(const struct sock *sk, +		   struct user_namespace *user_ns, int cap) +{ +	return file_ns_capable(sk->sk_socket->file, user_ns, cap) && +		ns_capable(user_ns, cap); +} +EXPORT_SYMBOL(sk_ns_capable); + +/** + * sk_capable - Socket global capability test + * @sk: Socket to use a capability on or through + * @cap: The global capbility to use + * + * Test to see if the opener of the socket had when the socket was + * created and the current process has the capability @cap in all user + * namespaces. + */ +bool sk_capable(const struct sock *sk, int cap) +{ +	return sk_ns_capable(sk, &init_user_ns, cap); +} +EXPORT_SYMBOL(sk_capable); + +/** + * sk_net_capable - Network namespace socket capability test + * @sk: Socket to use a capability on or through + * @cap: The capability to use + * + * Test to see if the opener of the socket had when the socke was created + * and the current process has the capability @cap over the network namespace + * the socket is a member of. 
+ */ +bool sk_net_capable(const struct sock *sk, int cap) +{ +	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); +} +EXPORT_SYMBOL(sk_net_capable); + +  #ifdef CONFIG_MEMCG_KMEM  int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)  { @@ -428,7 +477,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)  	spin_unlock_irqrestore(&list->lock, flags);  	if (!sock_flag(sk, SOCK_DEAD)) -		sk->sk_data_ready(sk, skb_len); +		sk->sk_data_ready(sk);  	return 0;  }  EXPORT_SYMBOL(sock_queue_rcv_skb); @@ -475,12 +524,6 @@ discard_and_relse:  }  EXPORT_SYMBOL(sk_receive_skb); -void sk_reset_txq(struct sock *sk) -{ -	sk_tx_queue_clear(sk); -} -EXPORT_SYMBOL(sk_reset_txq); -  struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)  {  	struct dst_entry *dst = __sk_dst_get(sk); @@ -741,7 +784,7 @@ set_rcvbuf:  		break;  	case SO_NO_CHECK: -		sk->sk_no_check = valbool; +		sk->sk_no_check_tx = valbool;  		break;  	case SO_PRIORITY: @@ -888,7 +931,7 @@ set_rcvbuf:  	case SO_PEEK_OFF:  		if (sock->ops->set_peek_off) -			sock->ops->set_peek_off(sk, val); +			ret = sock->ops->set_peek_off(sk, val);  		else  			ret = -EOPNOTSUPP;  		break; @@ -914,6 +957,13 @@ set_rcvbuf:  		}  		break;  #endif + +	case SO_MAX_PACING_RATE: +		sk->sk_max_pacing_rate = val; +		sk->sk_pacing_rate = min(sk->sk_pacing_rate, +					 sk->sk_max_pacing_rate); +		break; +  	default:  		ret = -ENOPROTOOPT;  		break; @@ -924,8 +974,8 @@ set_rcvbuf:  EXPORT_SYMBOL(sock_setsockopt); -void cred_to_ucred(struct pid *pid, const struct cred *cred, -		   struct ucred *ucred) +static void cred_to_ucred(struct pid *pid, const struct cred *cred, +			  struct ucred *ucred)  {  	ucred->pid = pid_vnr(pid);  	ucred->uid = ucred->gid = -1; @@ -936,7 +986,6 @@ void cred_to_ucred(struct pid *pid, const struct cred *cred,  		ucred->gid = from_kgid_munged(current_ns, cred->egid);  	}  } -EXPORT_SYMBOL_GPL(cred_to_ucred);  int sock_getsockopt(struct socket *sock, int level, int optname,  		    char __user *optval, int __user *optlen) @@ -1015,7 +1064,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,  		break;  	case SO_NO_CHECK: -		v.val = sk->sk_no_check; +		v.val = sk->sk_no_check_tx;  		break;  	case SO_PRIORITY: @@ -1167,6 +1216,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,  		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);  		break; +	case SO_BPF_EXTENSIONS: +		v.val = bpf_tell_extensions(); +		break; +  	case SO_SELECT_ERR_QUEUE:  		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);  		break; @@ -1177,6 +1230,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,  		break;  #endif +	case SO_MAX_PACING_RATE: +		v.val = sk->sk_max_pacing_rate; +		break; +  	default:  		return -ENOPROTOOPT;  	} @@ -1303,19 +1360,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)  	module_put(owner);  } -#if IS_ENABLED(CONFIG_NET_CLS_CGROUP) -void sock_update_classid(struct sock *sk) -{ -	u32 classid; - -	classid = task_cls_classid(current); -	if (classid != sk->sk_classid) -		sk->sk_classid = classid; -} -EXPORT_SYMBOL(sock_update_classid); -#endif - -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)  void sock_update_netprioidx(struct sock *sk)  {  	if (in_interrupt()) @@ -1661,22 +1706,6 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,  EXPORT_SYMBOL(sock_wmalloc);  /* - * Allocate a skb from the socket's receive buffer. 
- */ -struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, -			     gfp_t priority) -{ -	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { -		struct sk_buff *skb = alloc_skb(size, priority); -		if (skb) { -			skb_set_owner_r(skb, sk); -			return skb; -		} -	} -	return NULL; -} - -/*   * Allocate a memory block from the socket's option memory buffer.   */  void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) @@ -1795,7 +1824,9 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,  			while (order) {  				if (npages >= 1 << order) {  					page = alloc_pages(sk->sk_allocation | -							   __GFP_COMP | __GFP_NOWARN, +							   __GFP_COMP | +							   __GFP_NOWARN | +							   __GFP_NORETRY,  							   order);  					if (page)  						goto fill_page; @@ -1836,7 +1867,17 @@ EXPORT_SYMBOL(sock_alloc_send_skb);  /* On 32bit arches, an skb frag is limited to 2^15 */  #define SKB_FRAG_PAGE_ORDER	get_order(32768) -bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +/** + * skb_page_frag_refill - check that a page_frag contains enough room + * @sz: minimum size of the fragment we want to get + * @pfrag: pointer to page_frag + * @prio: priority for memory allocation + * + * Note: While this allocator tries to use high order pages, there is + * no guarantee that allocations succeed. Therefore, @sz MUST be + * less or equal than PAGE_SIZE. + */ +bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)  {  	int order; @@ -1845,19 +1886,17 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)  			pfrag->offset = 0;  			return true;  		} -		if (pfrag->offset < pfrag->size) +		if (pfrag->offset + sz <= pfrag->size)  			return true;  		put_page(pfrag->page);  	} -	/* We restrict high order allocations to users that can afford to wait */ -	order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0; - +	order = SKB_FRAG_PAGE_ORDER;  	do { -		gfp_t gfp = sk->sk_allocation; +		gfp_t gfp = prio;  		if (order) -			gfp |= __GFP_COMP | __GFP_NOWARN; +			gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;  		pfrag->page = alloc_pages(gfp, order);  		if (likely(pfrag->page)) {  			pfrag->offset = 0; @@ -1866,6 +1905,15 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)  		}  	} while (--order >= 0); +	return false; +} +EXPORT_SYMBOL(skb_page_frag_refill); + +bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +{ +	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) +		return true; +  	sk_enter_memory_pressure(sk);  	sk_stream_moderate_sndbuf(sk);  	return false; @@ -2197,7 +2245,7 @@ static void sock_def_error_report(struct sock *sk)  	rcu_read_unlock();  } -static void sock_def_readable(struct sock *sk, int len) +static void sock_def_readable(struct sock *sk)  {  	struct socket_wq *wq; @@ -2319,6 +2367,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)  	sk->sk_ll_usec		=	sysctl_net_busy_read;  #endif +	sk->sk_max_pacing_rate = ~0U; +	sk->sk_pacing_rate = ~0U;  	/*  	 * Before updating sk_refcnt, we must commit prior changes to memory  	 * (Documentation/RCU/rculist_nulls.txt for details) @@ -2356,10 +2406,13 @@ void release_sock(struct sock *sk)  	if (sk->sk_backlog.tail)  		__release_sock(sk); +	/* Warning : release_cb() might need to release sk ownership, +	 * ie call sock_release_ownership(sk) before us. 
+	 */  	if (sk->sk_prot->release_cb)  		sk->sk_prot->release_cb(sk); -	sk->sk_lock.owned = 0; +	sock_release_ownership(sk);  	if (waitqueue_active(&sk->sk_lock.wq))  		wake_up(&sk->sk_lock.wq);  	spin_unlock_bh(&sk->sk_lock.slock); diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index a0e9cf6379d..a4216a4c957 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -49,38 +49,35 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype)  }  EXPORT_SYMBOL_GPL(sock_diag_put_meminfo); -int sock_diag_put_filterinfo(struct user_namespace *user_ns, struct sock *sk, +int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,  			     struct sk_buff *skb, int attrtype)  { -	struct nlattr *attr; +	struct sock_fprog_kern *fprog;  	struct sk_filter *filter; -	unsigned int len; +	struct nlattr *attr; +	unsigned int flen;  	int err = 0; -	if (!ns_capable(user_ns, CAP_NET_ADMIN)) { +	if (!may_report_filterinfo) {  		nla_reserve(skb, attrtype, 0);  		return 0;  	}  	rcu_read_lock(); -  	filter = rcu_dereference(sk->sk_filter); -	len = filter ? filter->len * sizeof(struct sock_filter) : 0; +	if (!filter) +		goto out; -	attr = nla_reserve(skb, attrtype, len); +	fprog = filter->orig_prog; +	flen = sk_filter_proglen(fprog); + +	attr = nla_reserve(skb, attrtype, flen);  	if (attr == NULL) {  		err = -EMSGSIZE;  		goto out;  	} -	if (filter) { -		struct sock_filter *fb = (struct sock_filter *)nla_data(attr); -		int i; - -		for (i = 0; i < filter->len; i++, fb++) -			sk_decode_filter(&filter->insns[i], fb); -	} - +	memcpy(nla_data(attr), fprog->filter, flen);  out:  	rcu_read_unlock();  	return err; diff --git a/net/core/stream.c b/net/core/stream.c index 512f0a24269..301c05f2606 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -122,7 +122,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)  	DEFINE_WAIT(wait);  	if (sk_stream_memory_free(sk)) -		current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2; +		current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;  	while (1) {  		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cca44419090..cf9cd13509a 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -122,7 +122,8 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,  				synchronize_rcu();  				kfree(cur);  			} else if (!cur && cpumask_test_cpu(i, mask)) { -				cur = kzalloc(len, GFP_KERNEL); +				cur = kzalloc_node(len, GFP_KERNEL, +						   cpu_to_node(i));  				if (!cur) {  					/* not unwinding previous changes */  					ret = -ENOMEM; diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 661b5a40ec1..6521dfd8b7c 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -23,16 +23,11 @@  #include <linux/skbuff.h>  #include <linux/export.h> -static struct sock_filter ptp_filter[] = { -	PTP_FILTER -}; -  static unsigned int classify(const struct sk_buff *skb)  { -	if (likely(skb->dev && -		   skb->dev->phydev && +	if (likely(skb->dev && skb->dev->phydev &&  		   skb->dev->phydev->drv)) -		return sk_run_filter(skb, ptp_filter); +		return ptp_classify_raw(skb);  	else  		return PTP_CLASS_NONE;  } @@ -60,11 +55,13 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)  		if (likely(phydev->drv->txtstamp)) {  			if (!atomic_inc_not_zero(&sk->sk_refcnt))  				return; +  			clone = skb_clone(skb, GFP_ATOMIC);  			if (!clone) {  				sock_put(sk);  				return;  			} +  			clone->sk = sk;  			
phydev->drv->txtstamp(phydev, clone, type);  		} @@ -89,12 +86,15 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,  	}  	*skb_hwtstamps(skb) = *hwtstamps; +  	serr = SKB_EXT_ERR(skb);  	memset(serr, 0, sizeof(*serr));  	serr->ee.ee_errno = ENOMSG;  	serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;  	skb->sk = NULL; +  	err = sock_queue_err_skb(sk, skb); +  	sock_put(sk);  	if (err)  		kfree_skb(skb); @@ -132,8 +132,3 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)  	return false;  }  EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp); - -void __init skb_timestamping_init(void) -{ -	BUG_ON(sk_chk_filter(ptp_filter, ARRAY_SIZE(ptp_filter))); -} diff --git a/net/core/tso.c b/net/core/tso.c new file mode 100644 index 00000000000..8c3203c585b --- /dev/null +++ b/net/core/tso.c @@ -0,0 +1,77 @@ +#include <linux/export.h> +#include <net/ip.h> +#include <net/tso.h> + +/* Calculate expected number of TX descriptors */ +int tso_count_descs(struct sk_buff *skb) +{ +	/* The Marvell Way */ +	return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags; +} +EXPORT_SYMBOL(tso_count_descs); + +void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso, +		   int size, bool is_last) +{ +	struct iphdr *iph; +	struct tcphdr *tcph; +	int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); +	int mac_hdr_len = skb_network_offset(skb); + +	memcpy(hdr, skb->data, hdr_len); +	iph = (struct iphdr *)(hdr + mac_hdr_len); +	iph->id = htons(tso->ip_id); +	iph->tot_len = htons(size + hdr_len - mac_hdr_len); +	tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb)); +	tcph->seq = htonl(tso->tcp_seq); +	tso->ip_id++; + +	if (!is_last) { +		/* Clear all special flags for not last packet */ +		tcph->psh = 0; +		tcph->fin = 0; +		tcph->rst = 0; +	} +} +EXPORT_SYMBOL(tso_build_hdr); + +void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size) +{ +	tso->tcp_seq += size; +	tso->size -= size; +	tso->data += size; + +	if ((tso->size == 0) && +	    (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) { +		skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; + +		/* Move to next segment */ +		tso->size = frag->size; +		tso->data = page_address(frag->page.p) + frag->page_offset; +		tso->next_frag_idx++; +	} +} +EXPORT_SYMBOL(tso_build_data); + +void tso_start(struct sk_buff *skb, struct tso_t *tso) +{ +	int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + +	tso->ip_id = ntohs(ip_hdr(skb)->id); +	tso->tcp_seq = ntohl(tcp_hdr(skb)->seq); +	tso->next_frag_idx = 0; + +	/* Build first data */ +	tso->size = skb_headlen(skb) - hdr_len; +	tso->data = skb->data + hdr_len; +	if ((tso->size == 0) && +	    (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) { +		skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; + +		/* Move to next segment */ +		tso->size = frag->size; +		tso->data = page_address(frag->page.p) + frag->page_offset; +		tso->next_frag_idx++; +	} +} +EXPORT_SYMBOL(tso_start); diff --git a/net/core/utils.c b/net/core/utils.c index aa88e23fc87..eed34338736 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -338,3 +338,52 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,  				  csum_unfold(*sum)));  }  EXPORT_SYMBOL(inet_proto_csum_replace16); + +struct __net_random_once_work { +	struct work_struct work; +	struct static_key *key; +}; + +static void __net_random_once_deferred(struct work_struct *w) +{ +	struct __net_random_once_work *work = +		container_of(w, struct __net_random_once_work, work); +	BUG_ON(!static_key_enabled(work->key)); +	
static_key_slow_dec(work->key); +	kfree(work); +} + +static void __net_random_once_disable_jump(struct static_key *key) +{ +	struct __net_random_once_work *w; + +	w = kmalloc(sizeof(*w), GFP_ATOMIC); +	if (!w) +		return; + +	INIT_WORK(&w->work, __net_random_once_deferred); +	w->key = key; +	schedule_work(&w->work); +} + +bool __net_get_random_once(void *buf, int nbytes, bool *done, +			   struct static_key *once_key) +{ +	static DEFINE_SPINLOCK(lock); +	unsigned long flags; + +	spin_lock_irqsave(&lock, flags); +	if (*done) { +		spin_unlock_irqrestore(&lock, flags); +		return false; +	} + +	get_random_bytes(buf, nbytes); +	*done = true; +	spin_unlock_irqrestore(&lock, flags); + +	__net_random_once_disable_jump(once_key); + +	return true; +} +EXPORT_SYMBOL(__net_get_random_once);  | 
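
The rtnl_newlink()/do_setlink() changes in net/core/rtnetlink.c above route IFLA_INFO_SLAVE_DATA through the master device's rtnl_link_ops (slave_maxtype, slave_policy, slave_validate, slave_changelink). A minimal sketch of how a master driver would opt in follows; the "foo" kind, the attribute enum and the handler body are illustrative placeholders, not part of the patch.

#include <linux/netdevice.h>
#include <net/netlink.h>
#include <net/rtnetlink.h>

/* Hypothetical per-slave attribute space for an imaginary "foo" master. */
enum {
	IFLA_FOO_SLAVE_UNSPEC,
	IFLA_FOO_SLAVE_PRIO,
	__IFLA_FOO_SLAVE_MAX
};
#define IFLA_FOO_SLAVE_MAX (__IFLA_FOO_SLAVE_MAX - 1)

static const struct nla_policy foo_slave_policy[IFLA_FOO_SLAVE_MAX + 1] = {
	[IFLA_FOO_SLAVE_PRIO] = { .type = NLA_U16 },
};

static int foo_slave_changelink(struct net_device *master,
				struct net_device *slave,
				struct nlattr *tb[], struct nlattr *data[])
{
	if (!data || !data[IFLA_FOO_SLAVE_PRIO])
		return 0;

	/* A real driver would store this in its per-slave state. */
	netdev_dbg(master, "slave %s prio %u\n", slave->name,
		   nla_get_u16(data[IFLA_FOO_SLAVE_PRIO]));
	return 0;
}

static struct rtnl_link_ops foo_link_ops __read_mostly = {
	.kind			= "foo",
	.slave_maxtype		= IFLA_FOO_SLAVE_MAX,
	.slave_policy		= foo_slave_policy,
	.slave_changelink	= foo_slave_changelink,
	/* .setup, .newlink, .priv_size, ... as for any rtnl_link driver */
};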
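
net/core/secure_seq.c now seeds net_secret lazily with net_get_random_once() on first use instead of an explicit boot-time net_secret_init() call, backed by __net_get_random_once() in net/core/utils.c. The same pattern works for any late-seeded secret; a small sketch, where foo_hash() and foo_hash_secret are hypothetical:

#include <linux/jhash.h>
#include <linux/net.h>

static u32 foo_hash_secret;	/* hypothetical per-subsystem secret */

static u32 foo_hash(const void *key, u32 len)
{
	/* The first caller pays for get_random_bytes(); afterwards the
	 * static key is patched out by the deferred work item queued in
	 * __net_random_once_disable_jump(), so later calls are cheap.
	 */
	net_get_random_once(&foo_hash_secret, sizeof(foo_hash_secret));

	return jhash(key, len, foo_hash_secret);
}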
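
skb_checksum() in net/core/skbuff.c is now a wrapper around __skb_checksum(), which takes a struct skb_checksum_ops so a caller can substitute its own update/combine primitives (a CRC-based protocol checksum, for instance). A sketch of such a caller; foo_csum_update()/foo_csum_combine() are stand-ins that simply reuse the default folding helpers:

#include <linux/skbuff.h>
#include <net/checksum.h>

static __wsum foo_csum_update(const void *mem, int len, __wsum sum)
{
	return csum_partial(mem, len, sum);	/* stand-in for a real CRC step */
}

static __wsum foo_csum_combine(__wsum csum, __wsum csum2, int offset, int len)
{
	return csum_block_add(csum, csum2, offset);
}

static __wsum foo_skb_checksum(const struct sk_buff *skb, int offset, int len)
{
	const struct skb_checksum_ops ops = {
		.update  = foo_csum_update,
		.combine = foo_csum_combine,
	};

	/* Walks the linear area, frags and frag_list exactly like
	 * skb_checksum(), but through the caller-supplied callbacks.
	 */
	return __skb_checksum(skb, offset, len, 0, &ops);
}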
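
skb_zerocopy() and skb_zerocopy_headlen() let a caller build a new skb that references the source's frags instead of copying the payload (netlink upcall paths are the intended users). A minimal sketch of the calling convention, assuming a freshly allocated destination and omitting any extra protocol headroom a real caller would reserve:

#include <linux/skbuff.h>

static struct sk_buff *foo_zerocopy_clone(struct sk_buff *from, int len)
{
	unsigned int hlen = skb_zerocopy_headlen(from);
	struct sk_buff *to;

	/* hlen bytes of linear space is what skb_zerocopy() will copy;
	 * the rest of @len is attached by taking references on the frags.
	 */
	to = alloc_skb(hlen, GFP_ATOMIC);
	if (!to)
		return NULL;

	if (skb_zerocopy(to, from, len, hlen)) {	/* -ENOMEM or -EFAULT */
		kfree_skb(to);
		return NULL;
	}
	return to;
}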
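
skb_to_sgvec_nomark() follows the scenario spelled out in its comment: map several payloads into one scatterlist without terminating the table after each call. A sketch under the assumption that the caller has sized the table generously enough for both skbs and that both skbs are non-empty:

#include <linux/scatterlist.h>
#include <linux/skbuff.h>

static int foo_map_two_payloads(struct sk_buff *skb1, struct sk_buff *skb2,
				struct scatterlist *sg, int sg_entries)
{
	int n1, n2;

	sg_init_table(sg, sg_entries);

	/* Neither call marks its last entry, so the table can keep growing. */
	n1 = skb_to_sgvec_nomark(skb1, sg, 0, skb1->len);
	n2 = skb_to_sgvec_nomark(skb2, sg + n1, 0, skb2->len);

	/* Terminate the list once, after the final payload. */
	if (n1 + n2)
		sg_mark_end(&sg[n1 + n2 - 1]);

	return n1 + n2;
}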
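
skb_checksum_setup() centralises what virtual drivers used to open-code: locate the L4 checksum field of an IPv4/IPv6 packet, set up the CHECKSUM_PARTIAL offsets, and optionally recompute the pseudo-header sum. A sketch of a receive completion path in a hypothetical virtual NIC, where the "checksum left blank" and "data validated" flags arrive out of band:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

static void foo_rx_complete(struct sk_buff *skb, bool csum_blank,
			    bool data_validated)
{
	if (csum_blank) {
		/* Peer left the checksum blank: finish the offload setup and
		 * recalculate the pseudo-header checksum before passing up.
		 */
		if (skb_checksum_setup(skb, true)) {
			kfree_skb(skb);		/* malformed or unsupported */
			return;
		}
	} else if (data_validated) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}

	netif_receive_skb(skb);
}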
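
The new sk_ns_capable()/sk_net_capable()/netlink_net_capable() helpers test the capabilities the socket's opener held when the socket was created in addition to those of the current task, which is what the rtnetlink and fdb call sites above switch to. A sketch of the intended pattern in a netlink request handler; foo_set_something() is illustrative only:

#include <linux/capability.h>
#include <linux/netlink.h>

static int foo_set_something(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	/* State-changing request: both the socket opener (at creation time)
	 * and the current task must hold CAP_NET_ADMIN in the socket's
	 * network namespace.
	 */
	if (!netlink_net_capable(skb, CAP_NET_ADMIN))
		return -EPERM;

	/* ... apply the change ... */
	return 0;
}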
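
skb_page_frag_refill() factors the page_frag refill logic out of sk_page_frag_refill() so callers without a socket can use it with their own gfp mask. A sketch of carving fixed-size buffers out of a reusable page_frag; foo_frag_alloc() and the omitted per-buffer get_page() accounting are simplifications:

#include <linux/mm.h>
#include <linux/skbuff.h>

/* Hand out @sz bytes from @pfrag, refilling it when it runs out.
 * @sz must be <= PAGE_SIZE (see the kerneldoc above); a real user
 * would also get_page(pfrag->page) for each buffer it keeps.
 */
static void *foo_frag_alloc(struct page_frag *pfrag, unsigned int sz,
			    gfp_t gfp)
{
	void *va;

	if (!skb_page_frag_refill(sz, pfrag, gfp))
		return NULL;

	/* Success guarantees pfrag->offset + sz <= pfrag->size. */
	va = page_address(pfrag->page) + pfrag->offset;
	pfrag->offset += sz;
	return va;
}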
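
The new net/core/tso.c helpers give drivers that segment in software (mvneta-style) a common way to slice a GSO skb into per-segment headers and payload chunks. A sketch of the transmit loop a driver would wrap around them; the descriptor-queueing steps are left as comments because they are hardware specific, and hdr_buf is assumed to be per-segment scratch space of at least the full header length:

#include <linux/skbuff.h>
#include <linux/tcp.h>
#include <net/tso.h>

static int foo_xmit_tso(struct sk_buff *skb, char *hdr_buf)
{
	int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
	int total_len = skb->len - hdr_len;
	int mss = skb_shinfo(skb)->gso_size;
	int descs = tso_count_descs(skb);	/* reserve this many descriptors */
	struct tso_t tso;

	tso_start(skb, &tso);

	while (total_len > 0) {
		int data_left = min(total_len, mss);

		total_len -= data_left;

		/* Build the IP/TCP header for this segment into hdr_buf and
		 * queue a descriptor pointing at it.
		 */
		tso_build_hdr(skb, hdr_buf, &tso, data_left, total_len == 0);

		while (data_left > 0) {
			int size = min_t(int, tso.size, data_left);

			/* Queue a payload descriptor for tso.data / size. */
			tso_build_data(skb, &tso, size);
			data_left -= size;
		}
	}

	return descs;
}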
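
skb_gso_transport_seglen(), added at the end of net/core/skbuff.c, reports the L4 header plus per-segment payload of a GSO skb. Adding the network header length on top gives the IP-level size of each segment, which is the quantity forwarding-path MTU checks care about; a short sketch (foo_segment_network_len is an illustrative name):

#include <linux/skbuff.h>

static unsigned int foo_segment_network_len(const struct sk_buff *skb)
{
	if (!skb_is_gso(skb))
		return skb->len;

	/* IP header + (TCP/UDP header + gso_size bytes of payload);
	 * the MAC/L2 header is deliberately not included.
	 */
	return skb_network_header_len(skb) + skb_gso_transport_seglen(skb);
}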
