Diffstat (limited to 'net/core')
42 files changed, 14825 insertions, 7103 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index 8a04dd22cf7..71093d94ad2 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -3,15 +3,17 @@  #  obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ -	 gen_stats.o gen_estimator.o net_namespace.o +	 gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o  obj-$(CONFIG_SYSCTL) += sysctl_net_core.o  obj-y		     += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ -			neighbour.o rtnetlink.o utils.o link_watch.o filter.o +			neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ +			sock_diag.o dev_ioctl.o tso.o  obj-$(CONFIG_XFRM) += flow.o  obj-y += net-sysfs.o +obj-$(CONFIG_PROC_FS) += net-procfs.o  obj-$(CONFIG_NET_PKTGEN) += pktgen.o  obj-$(CONFIG_NETPOLL) += netpoll.o  obj-$(CONFIG_NET_DMA) += user_dma.o @@ -19,3 +21,6 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o  obj-$(CONFIG_TRACEPOINTS) += net-traces.o  obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o  obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o +obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o +obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o +obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o diff --git a/net/core/datagram.c b/net/core/datagram.c index 18ac112ea7a..488dd1a825c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -37,7 +37,6 @@  #include <linux/types.h>  #include <linux/kernel.h>  #include <asm/uaccess.h> -#include <asm/system.h>  #include <linux/mm.h>  #include <linux/interrupt.h>  #include <linux/errno.h> @@ -49,6 +48,7 @@  #include <linux/highmem.h>  #include <linux/spinlock.h>  #include <linux/slab.h> +#include <linux/pagemap.h>  #include <net/protocol.h>  #include <linux/skbuff.h> @@ -57,6 +57,7 @@  #include <net/sock.h>  #include <net/tcp_states.h>  #include <trace/events/skb.h> +#include <net/busy_poll.h>  /*   *	Is a socket 'connection oriented' ? @@ -66,7 +67,7 @@ static inline int connection_based(struct sock *sk)  	return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;  } -static int receiver_wake_function(wait_queue_t *wait, unsigned mode, int sync, +static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync,  				  void *key)  {  	unsigned long bits = (unsigned long)key; @@ -79,9 +80,10 @@ static int receiver_wake_function(wait_queue_t *wait, unsigned mode, int sync,  	return autoremove_wake_function(wait, mode, sync, key);  }  /* - * Wait for a packet.. + * Wait for the last received packet to be different from skb   */ -static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) +static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, +				 const struct sk_buff *skb)  {  	int error;  	DEFINE_WAIT_FUNC(wait, receiver_wake_function); @@ -93,7 +95,7 @@ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)  	if (error)  		goto out_err; -	if (!skb_queue_empty(&sk->sk_receive_queue)) +	if (sk->sk_receive_queue.prev != skb)  		goto out;  	/* Socket shut down? */ @@ -133,6 +135,8 @@ out_noerr:   *	@sk: socket   *	@flags: MSG_ flags   *	@peeked: returns non-zero if this packet has been seen before + *	@off: an offset in bytes to peek skb from. Returns an offset + *	      within an skb where data actually starts   *	@err: error code returned   *   *	Get a datagram skbuff, understands the peeking, nonblocking wakeups @@ -157,10 +161,10 @@ out_noerr:   *	quite explicitly by POSIX 1003.1g, don't change them without having   *	the standard around please.   
*/ -struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, -				    int *peeked, int *err) +struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, +				    int *peeked, int *off, int *err)  { -	struct sk_buff *skb; +	struct sk_buff *skb, *last;  	long timeo;  	/*  	 * Caller is allowed not to check sk->sk_err before skb_recv_datagram() @@ -180,28 +184,41 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,  		 * However, this function was correct in any case. 8)  		 */  		unsigned long cpu_flags; +		struct sk_buff_head *queue = &sk->sk_receive_queue; +		int _off = *off; -		spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); -		skb = skb_peek(&sk->sk_receive_queue); -		if (skb) { +		last = (struct sk_buff *)queue; +		spin_lock_irqsave(&queue->lock, cpu_flags); +		skb_queue_walk(queue, skb) { +			last = skb;  			*peeked = skb->peeked;  			if (flags & MSG_PEEK) { +				if (_off >= skb->len && (skb->len || _off || +							 skb->peeked)) { +					_off -= skb->len; +					continue; +				}  				skb->peeked = 1;  				atomic_inc(&skb->users);  			} else -				__skb_unlink(skb, &sk->sk_receive_queue); -		} -		spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); +				__skb_unlink(skb, queue); -		if (skb) +			spin_unlock_irqrestore(&queue->lock, cpu_flags); +			*off = _off;  			return skb; +		} +		spin_unlock_irqrestore(&queue->lock, cpu_flags); + +		if (sk_can_busy_loop(sk) && +		    sk_busy_loop(sk, flags & MSG_DONTWAIT)) +			continue;  		/* User doesn't want to wait */  		error = -EAGAIN;  		if (!timeo)  			goto no_packet; -	} while (!wait_for_packet(sk, err, &timeo)); +	} while (!wait_for_more_packets(sk, err, &timeo, last));  	return NULL; @@ -211,13 +228,13 @@ no_packet:  }  EXPORT_SYMBOL(__skb_recv_datagram); -struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, +struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,  				  int noblock, int *err)  { -	int peeked; +	int peeked, off = 0;  	return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), -				   &peeked, err); +				   &peeked, &off, err);  }  EXPORT_SYMBOL(skb_recv_datagram); @@ -243,7 +260,6 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)  	unlock_sock_fast(sk, slow);  	/* skb is now orphaned, can be freed outside of locked section */ -	trace_kfree_skb(skb, skb_free_datagram_locked);  	__kfree_skb(skb);  }  EXPORT_SYMBOL(skb_free_datagram_locked); @@ -324,15 +340,15 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,  	/* Copy paged appendix. Hmm... why does this look so complicated? */  	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {  		int end; +		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];  		WARN_ON(start > offset + len); -		end = start + skb_shinfo(skb)->frags[i].size; +		end = start + skb_frag_size(frag);  		if ((copy = end - offset) > 0) {  			int err;  			u8  *vaddr; -			skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; -			struct page *page = frag->page; +			struct page *page = skb_frag_page(frag);  			if (copy > len)  				copy = len; @@ -410,15 +426,15 @@ int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,  	/* Copy paged appendix. Hmm... why does this look so complicated? 
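The __skb_recv_datagram() hunk above replaces the single skb_peek() with a walk of the whole receive queue, so MSG_PEEK callers can resume peeking at a byte offset (*off) past datagrams they have already seen. A minimal userspace sketch of that walk, assuming a plain singly linked packet list (struct pkt and peek_at_offset are illustrative names, not kernel API); the queue lock and the skb->peeked bookkeeping are elided:

#include <stddef.h>

struct pkt {
	struct pkt *next;
	size_t len;			/* payload omitted */
};

/*
 * Return the packet the caller should peek at, after skipping whole
 * packets covered by *off.  On return, *off is the offset inside the
 * returned packet where unread data starts.
 */
static struct pkt *peek_at_offset(struct pkt *head, size_t *off)
{
	size_t rem = *off;
	struct pkt *p;

	for (p = head; p; p = p->next) {
		if (rem < p->len || (p->len == 0 && rem == 0))
			break;		/* this one (zero-length datagrams count) */
		rem -= p->len;		/* already peeked past this packet */
	}
	*off = rem;
	return p;			/* NULL: offset runs past the queue */
}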
*/  	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {  		int end; +		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];  		WARN_ON(start > offset + len); -		end = start + skb_shinfo(skb)->frags[i].size; +		end = start + skb_frag_size(frag);  		if ((copy = end - offset) > 0) {  			int err;  			u8  *vaddr; -			skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; -			struct page *page = frag->page; +			struct page *page = skb_frag_page(frag);  			if (copy > len)  				copy = len; @@ -500,15 +516,15 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,  	/* Copy paged appendix. Hmm... why does this look so complicated? */  	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {  		int end; +		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];  		WARN_ON(start > offset + len); -		end = start + skb_shinfo(skb)->frags[i].size; +		end = start + skb_frag_size(frag);  		if ((copy = end - offset) > 0) {  			int err;  			u8  *vaddr; -			skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; -			struct page *page = frag->page; +			struct page *page = skb_frag_page(frag);  			if (copy > len)  				copy = len; @@ -558,6 +574,77 @@ fault:  }  EXPORT_SYMBOL(skb_copy_datagram_from_iovec); +/** + *	zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec + *	@skb: buffer to copy + *	@from: io vector to copy from + *	@offset: offset in the io vector to start copying from + *	@count: amount of vectors to copy to buffer from + * + *	The function will first copy up to headlen, and then pin the userspace + *	pages and build frags through them. + * + *	Returns 0, -EFAULT or -EMSGSIZE. + *	Note: the iovec is not modified during the copy + */ +int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, +				  int offset, size_t count) +{ +	int len = iov_length(from, count) - offset; +	int copy = min_t(int, skb_headlen(skb), len); +	int size; +	int i = 0; + +	/* copy up to skb headlen */ +	if (skb_copy_datagram_from_iovec(skb, 0, from, offset, copy)) +		return -EFAULT; + +	if (len == copy) +		return 0; + +	offset += copy; +	while (count--) { +		struct page *page[MAX_SKB_FRAGS]; +		int num_pages; +		unsigned long base; +		unsigned long truesize; + +		/* Skip over from offset and copied */ +		if (offset >= from->iov_len) { +			offset -= from->iov_len; +			++from; +			continue; +		} +		len = from->iov_len - offset; +		base = (unsigned long)from->iov_base + offset; +		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; +		if (i + size > MAX_SKB_FRAGS) +			return -EMSGSIZE; +		num_pages = get_user_pages_fast(base, size, 0, &page[i]); +		if (num_pages != size) { +			release_pages(&page[i], num_pages, 0); +			return -EFAULT; +		} +		truesize = size * PAGE_SIZE; +		skb->data_len += len; +		skb->len += len; +		skb->truesize += truesize; +		atomic_add(truesize, &skb->sk->sk_wmem_alloc); +		while (len) { +			int off = base & ~PAGE_MASK; +			int size = min_t(int, len, PAGE_SIZE - off); +			skb_fill_page_desc(skb, i, page[i], off, size); +			base += size; +			len -= size; +			i++; +		} +		offset = 0; +		++from; +	} +	return 0; +} +EXPORT_SYMBOL(zerocopy_sg_from_iovec); +  static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,  				      u8 __user *to, int len,  				      __wsum *csump) @@ -585,16 +672,16 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,  	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {  		int end; +		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];  		WARN_ON(start > offset + len); -		end = start + 
skb_shinfo(skb)->frags[i].size; +		end = start + skb_frag_size(frag);  		if ((copy = end - offset) > 0) {  			__wsum csum2;  			int err = 0;  			u8  *vaddr; -			skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; -			struct page *page = frag->page; +			struct page *page = skb_frag_page(frag);  			if (copy > len)  				copy = len; @@ -653,17 +740,37 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)  	sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));  	if (likely(!sum)) { -		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) +		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && +		    !skb->csum_complete_sw)  			netdev_rx_csum_fault(skb->dev); -		skb->ip_summed = CHECKSUM_UNNECESSARY;  	} +	skb->csum_valid = !sum;  	return sum;  }  EXPORT_SYMBOL(__skb_checksum_complete_head);  __sum16 __skb_checksum_complete(struct sk_buff *skb)  { -	return __skb_checksum_complete_head(skb, skb->len); +	__wsum csum; +	__sum16 sum; + +	csum = skb_checksum(skb, 0, skb->len, 0); + +	/* skb->csum holds pseudo checksum */ +	sum = csum_fold(csum_add(skb->csum, csum)); +	if (likely(!sum)) { +		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && +		    !skb->csum_complete_sw) +			netdev_rx_csum_fault(skb->dev); +	} + +	/* Save full packet checksum */ +	skb->csum = csum; +	skb->ip_summed = CHECKSUM_COMPLETE; +	skb->csum_complete_sw = 1; +	skb->csum_valid = !sum; + +	return sum;  }  EXPORT_SYMBOL(__skb_checksum_complete); @@ -745,7 +852,9 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,  	/* exceptional events? */  	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) -		mask |= POLLERR; +		mask |= POLLERR | +			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0); +  	if (sk->sk_shutdown & RCV_SHUTDOWN)  		mask |= POLLRDHUP | POLLIN | POLLRDNORM;  	if (sk->sk_shutdown == SHUTDOWN_MASK) diff --git a/net/core/dev.c b/net/core/dev.c index d28b3a023bb..367a586d0c8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -73,7 +73,6 @@   */  #include <asm/uaccess.h> -#include <asm/system.h>  #include <linux/bitops.h>  #include <linux/capability.h>  #include <linux/cpu.h> @@ -98,8 +97,6 @@  #include <net/net_namespace.h>  #include <net/sock.h>  #include <linux/rtnetlink.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h>  #include <linux/stat.h>  #include <net/dst.h>  #include <net/pkt_sched.h> @@ -107,12 +104,10 @@  #include <net/xfrm.h>  #include <linux/highmem.h>  #include <linux/init.h> -#include <linux/kmod.h>  #include <linux/module.h>  #include <linux/netpoll.h>  #include <linux/rcupdate.h>  #include <linux/delay.h> -#include <net/wext.h>  #include <net/iw_handler.h>  #include <asm/current.h>  #include <linux/audit.h> @@ -132,6 +127,11 @@  #include <trace/events/skb.h>  #include <linux/pci.h>  #include <linux/inetdevice.h> +#include <linux/cpu_rmap.h> +#include <linux/static_key.h> +#include <linux/hashtable.h> +#include <linux/vmalloc.h> +#include <linux/if_macvlan.h>  #include "net-sysfs.h" @@ -141,40 +141,16 @@  /* This should be increased if a protocol with a bigger head is added. */  #define GRO_MAX_HEAD (MAX_HEADER + 128) -/* - *	The list of packet types we will receive (as opposed to discard) - *	and the routines to invoke. - * - *	Why 16. Because with 16 the only overlap we get on a hash of the - *	low nibble of the protocol value is RARP/SNAP/X.25. - * - *      NOTE:  That is no longer true with the addition of VLAN tags.  Not - *             sure which should go first, but I bet it won't make much - *             difference if we are running VLANs.  
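The reworked __skb_checksum_complete() above computes the packet checksum in software, folds it together with the pseudo-header sum held in skb->csum, and caches the result via csum_complete_sw/csum_valid. The underlying arithmetic is ordinary 16-bit one's-complement addition and folding; a self-contained sketch using plain stdint types instead of the kernel's __wsum/__sum16 wrappers (csum_add32, csum_fold16 and checksum_ok are illustrative names):

#include <stdint.h>

/* Add two 32-bit partial one's-complement sums with end-around carry. */
static uint32_t csum_add32(uint32_t a, uint32_t b)
{
	uint32_t s = a + b;

	return s + (s < a);
}

/* Fold a 32-bit partial sum down to the final 16-bit Internet checksum. */
static uint16_t csum_fold16(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

/*
 * A packet verifies when folding (pseudo-header sum + payload sum) gives
 * zero -- the check the hunk above performs before caching skb->csum.
 */
static int checksum_ok(uint32_t pseudo_sum, uint32_t payload_sum)
{
	return csum_fold16(csum_add32(pseudo_sum, payload_sum)) == 0;
}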
The good news is that - *             this protocol won't be in the list unless compiled in, so - *             the average user (w/out VLANs) will not be adversely affected. - *             --BLG - * - *		0800	IP - *		8100    802.1Q VLAN - *		0001	802.3 - *		0002	AX.25 - *		0004	802.2 - *		8035	RARP - *		0005	SNAP - *		0805	X.25 - *		0806	ARP - *		8137	IPX - *		0009	Localtalk - *		86DD	IPv6 - */ - -#define PTYPE_HASH_SIZE	(16) -#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1) -  static DEFINE_SPINLOCK(ptype_lock); -static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; -static struct list_head ptype_all __read_mostly;	/* Taps */ +static DEFINE_SPINLOCK(offload_lock); +struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; +struct list_head ptype_all __read_mostly;	/* Taps */ +static struct list_head offload_base __read_mostly; + +static int netif_rx_internal(struct sk_buff *skb); +static int call_netdevice_notifiers_info(unsigned long val, +					 struct net_device *dev, +					 struct netdev_notifier_info *info);  /*   * The @dev_base_head list is protected by @dev_base_lock and the rtnl @@ -198,9 +174,23 @@ static struct list_head ptype_all __read_mostly;	/* Taps */  DEFINE_RWLOCK(dev_base_lock);  EXPORT_SYMBOL(dev_base_lock); +/* protects napi_hash addition/deletion and napi_gen_id */ +static DEFINE_SPINLOCK(napi_hash_lock); + +static unsigned int napi_gen_id; +static DEFINE_HASHTABLE(napi_hash, 8); + +static seqcount_t devnet_rename_seq; + +static inline void dev_base_seq_inc(struct net *net) +{ +	while (++net->dev_base_seq == 0); +} +  static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)  { -	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); +	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); +  	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];  } @@ -224,7 +214,7 @@ static inline void rps_unlock(struct softnet_data *sd)  }  /* Device list insertion */ -static int list_netdevice(struct net_device *dev) +static void list_netdevice(struct net_device *dev)  {  	struct net *net = dev_net(dev); @@ -236,7 +226,8 @@ static int list_netdevice(struct net_device *dev)  	hlist_add_head_rcu(&dev->index_hlist,  			   dev_index_hash(net, dev->ifindex));  	write_unlock_bh(&dev_base_lock); -	return 0; + +	dev_base_seq_inc(net);  }  /* Device list removal @@ -252,6 +243,8 @@ static void unlist_netdevice(struct net_device *dev)  	hlist_del_rcu(&dev->name_hlist);  	hlist_del_rcu(&dev->index_hlist);  	write_unlock_bh(&dev_base_lock); + +	dev_base_seq_inc(dev_net(dev));  }  /* @@ -286,10 +279,9 @@ static const unsigned short netdev_lock_type[] =  	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,  	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,  	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, -	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211, -	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, -	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, -	 ARPHRD_VOID, ARPHRD_NONE}; +	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, +	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, +	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};  static const char *const netdev_lock_name[] =  	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", @@ -304,10 +296,9 @@ static const char *const netdev_lock_name[] =  	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",  	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",  	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", 
"_xmit_FCPL", -	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211", -	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", -	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154", -	 "_xmit_VOID", "_xmit_NONE"}; +	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", +	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", +	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};  static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];  static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; @@ -432,7 +423,7 @@ void __dev_remove_pack(struct packet_type *pt)  		}  	} -	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); +	pr_warn("dev_remove_pack: %p not found\n", pt);  out:  	spin_unlock(&ptype_lock);  } @@ -458,6 +449,81 @@ void dev_remove_pack(struct packet_type *pt)  }  EXPORT_SYMBOL(dev_remove_pack); + +/** + *	dev_add_offload - register offload handlers + *	@po: protocol offload declaration + * + *	Add protocol offload handlers to the networking stack. The passed + *	&proto_offload is linked into kernel lists and may not be freed until + *	it has been removed from the kernel lists. + * + *	This call does not sleep therefore it can not + *	guarantee all CPU's that are in middle of receiving packets + *	will see the new offload handlers (until the next received packet). + */ +void dev_add_offload(struct packet_offload *po) +{ +	struct list_head *head = &offload_base; + +	spin_lock(&offload_lock); +	list_add_rcu(&po->list, head); +	spin_unlock(&offload_lock); +} +EXPORT_SYMBOL(dev_add_offload); + +/** + *	__dev_remove_offload	 - remove offload handler + *	@po: packet offload declaration + * + *	Remove a protocol offload handler that was previously added to the + *	kernel offload handlers by dev_add_offload(). The passed &offload_type + *	is removed from the kernel lists and can be freed or reused once this + *	function returns. + * + *      The packet type might still be in use by receivers + *	and must not be freed until after all the CPU's have gone + *	through a quiescent state. + */ +static void __dev_remove_offload(struct packet_offload *po) +{ +	struct list_head *head = &offload_base; +	struct packet_offload *po1; + +	spin_lock(&offload_lock); + +	list_for_each_entry(po1, head, list) { +		if (po == po1) { +			list_del_rcu(&po->list); +			goto out; +		} +	} + +	pr_warn("dev_remove_offload: %p not found\n", po); +out: +	spin_unlock(&offload_lock); +} + +/** + *	dev_remove_offload	 - remove packet offload handler + *	@po: packet offload declaration + * + *	Remove a packet offload handler that was previously added to the kernel + *	offload handlers by dev_add_offload(). The passed &offload_type is + *	removed from the kernel lists and can be freed or reused once this + *	function returns. + * + *	This call sleeps to guarantee that no CPU is looking at the packet + *	type after return. 
+ */ +void dev_remove_offload(struct packet_offload *po) +{ +	__dev_remove_offload(po); + +	synchronize_net(); +} +EXPORT_SYMBOL(dev_remove_offload); +  /******************************************************************************  		      Device Boot-time Settings Routines @@ -603,11 +669,10 @@ __setup("netdev=", netdev_boot_setup);  struct net_device *__dev_get_by_name(struct net *net, const char *name)  { -	struct hlist_node *p;  	struct net_device *dev;  	struct hlist_head *head = dev_name_hash(net, name); -	hlist_for_each_entry(dev, p, head, name_hlist) +	hlist_for_each_entry(dev, head, name_hlist)  		if (!strncmp(dev->name, name, IFNAMSIZ))  			return dev; @@ -629,11 +694,10 @@ EXPORT_SYMBOL(__dev_get_by_name);  struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)  { -	struct hlist_node *p;  	struct net_device *dev;  	struct hlist_head *head = dev_name_hash(net, name); -	hlist_for_each_entry_rcu(dev, p, head, name_hlist) +	hlist_for_each_entry_rcu(dev, head, name_hlist)  		if (!strncmp(dev->name, name, IFNAMSIZ))  			return dev; @@ -680,11 +744,10 @@ EXPORT_SYMBOL(dev_get_by_name);  struct net_device *__dev_get_by_index(struct net *net, int ifindex)  { -	struct hlist_node *p;  	struct net_device *dev;  	struct hlist_head *head = dev_index_hash(net, ifindex); -	hlist_for_each_entry(dev, p, head, index_hlist) +	hlist_for_each_entry(dev, head, index_hlist)  		if (dev->ifindex == ifindex)  			return dev; @@ -705,11 +768,10 @@ EXPORT_SYMBOL(__dev_get_by_index);  struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)  { -	struct hlist_node *p;  	struct net_device *dev;  	struct hlist_head *head = dev_index_hash(net, ifindex); -	hlist_for_each_entry_rcu(dev, p, head, index_hlist) +	hlist_for_each_entry_rcu(dev, head, index_hlist)  		if (dev->ifindex == ifindex)  			return dev; @@ -743,13 +805,48 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex)  EXPORT_SYMBOL(dev_get_by_index);  /** + *	netdev_get_name - get a netdevice name, knowing its ifindex. + *	@net: network namespace + *	@name: a pointer to the buffer where the name will be stored. + *	@ifindex: the ifindex of the interface to get the name from. + * + *	The use of raw_seqcount_begin() and cond_resched() before + *	retrying is required as we want to give the writers a chance + *	to complete when CONFIG_PREEMPT is not set. + */ +int netdev_get_name(struct net *net, char *name, int ifindex) +{ +	struct net_device *dev; +	unsigned int seq; + +retry: +	seq = raw_seqcount_begin(&devnet_rename_seq); +	rcu_read_lock(); +	dev = dev_get_by_index_rcu(net, ifindex); +	if (!dev) { +		rcu_read_unlock(); +		return -ENODEV; +	} + +	strcpy(name, dev->name); +	rcu_read_unlock(); +	if (read_seqcount_retry(&devnet_rename_seq, seq)) { +		cond_resched(); +		goto retry; +	} + +	return 0; +} + +/**   *	dev_getbyhwaddr_rcu - find a device by its hardware address   *	@net: the applicable net namespace   *	@type: media type of device   *	@ha: hardware address   *   *	Search for an interface by MAC address. Returns NULL if the device - *	is not found or a pointer to the device. The caller must hold RCU + *	is not found or a pointer to the device. + *	The caller must hold RCU or RTNL.   *	The returned device has not had its ref count increased   *	and the caller must therefore be careful about locking   * @@ -833,21 +930,21 @@ EXPORT_SYMBOL(dev_get_by_flags_rcu);   *	to allow sysfs to work.  We also disallow any kind of   *	whitespace.   
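netdev_get_name() above uses the devnet_rename_seq sequence counter so a reader can copy dev->name without holding RTNL and simply retry if a rename raced with the copy. A simplified userspace model of that retry protocol with C11 atomics (seq_name, seq_read_name and seq_write_name are illustrative; a faithful seqlock also needs the data accesses themselves to be made torn-read safe, which this sketch does not attempt):

#include <stdatomic.h>
#include <string.h>

struct seq_name {
	atomic_uint seq;	/* odd while a rename is in progress */
	char name[16];
};

static void seq_write_name(struct seq_name *s, const char *newname)
{
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release);	/* -> odd */
	strncpy(s->name, newname, sizeof(s->name) - 1);
	s->name[sizeof(s->name) - 1] = '\0';
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release);	/* -> even */
}

static void seq_read_name(struct seq_name *s, char *out, size_t outlen)
{
	unsigned int start;

	do {
		start = atomic_load_explicit(&s->seq, memory_order_acquire);
		strncpy(out, s->name, outlen - 1);
		out[outlen - 1] = '\0';
	} while ((start & 1) ||
		 start != atomic_load_explicit(&s->seq, memory_order_acquire));
}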
*/ -int dev_valid_name(const char *name) +bool dev_valid_name(const char *name)  {  	if (*name == '\0') -		return 0; +		return false;  	if (strlen(name) >= IFNAMSIZ) -		return 0; +		return false;  	if (!strcmp(name, ".") || !strcmp(name, "..")) -		return 0; +		return false;  	while (*name) {  		if (*name == '/' || isspace(*name)) -			return 0; +			return false;  		name++;  	} -	return 1; +	return true;  }  EXPORT_SYMBOL(dev_valid_name); @@ -946,18 +1043,30 @@ int dev_alloc_name(struct net_device *dev, const char *name)  }  EXPORT_SYMBOL(dev_alloc_name); -static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt) +static int dev_alloc_name_ns(struct net *net, +			     struct net_device *dev, +			     const char *name)  { -	struct net *net; +	char buf[IFNAMSIZ]; +	int ret; -	BUG_ON(!dev_net(dev)); -	net = dev_net(dev); +	ret = __dev_alloc_name(net, name, buf); +	if (ret >= 0) +		strlcpy(dev->name, buf, IFNAMSIZ); +	return ret; +} + +static int dev_get_valid_name(struct net *net, +			      struct net_device *dev, +			      const char *name) +{ +	BUG_ON(!net);  	if (!dev_valid_name(name))  		return -EINVAL; -	if (fmt && strchr(name, '%')) -		return dev_alloc_name(dev, name); +	if (strchr(name, '%')) +		return dev_alloc_name_ns(net, dev, name);  	else if (__dev_get_by_name(net, name))  		return -EEXIST;  	else if (dev->name != name) @@ -988,24 +1097,35 @@ int dev_change_name(struct net_device *dev, const char *newname)  	if (dev->flags & IFF_UP)  		return -EBUSY; -	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) +	write_seqcount_begin(&devnet_rename_seq); + +	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { +		write_seqcount_end(&devnet_rename_seq);  		return 0; +	}  	memcpy(oldname, dev->name, IFNAMSIZ); -	err = dev_get_valid_name(dev, newname, 1); -	if (err < 0) +	err = dev_get_valid_name(net, dev, newname); +	if (err < 0) { +		write_seqcount_end(&devnet_rename_seq);  		return err; +	}  rollback:  	ret = device_rename(&dev->dev, dev->name);  	if (ret) {  		memcpy(dev->name, oldname, IFNAMSIZ); +		write_seqcount_end(&devnet_rename_seq);  		return ret;  	} +	write_seqcount_end(&devnet_rename_seq); + +	netdev_adjacent_rename_links(dev, oldname); +  	write_lock_bh(&dev_base_lock); -	hlist_del(&dev->name_hlist); +	hlist_del_rcu(&dev->name_hlist);  	write_unlock_bh(&dev_base_lock);  	synchronize_rcu(); @@ -1021,11 +1141,12 @@ rollback:  		/* err >= 0 after dev_alloc_name() or stores the first errno */  		if (err >= 0) {  			err = ret; +			write_seqcount_begin(&devnet_rename_seq);  			memcpy(dev->name, oldname, IFNAMSIZ); +			memcpy(oldname, newname, IFNAMSIZ);  			goto rollback;  		} else { -			printk(KERN_ERR -			       "%s: name change rollback failed: %d.\n", +			pr_err("%s: name change rollback failed: %d\n",  			       dev->name, ret);  		}  	} @@ -1043,22 +1164,23 @@ rollback:   */  int dev_set_alias(struct net_device *dev, const char *alias, size_t len)  { +	char *new_ifalias; +  	ASSERT_RTNL();  	if (len >= IFALIASZ)  		return -EINVAL;  	if (!len) { -		if (dev->ifalias) { -			kfree(dev->ifalias); -			dev->ifalias = NULL; -		} +		kfree(dev->ifalias); +		dev->ifalias = NULL;  		return 0;  	} -	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); -	if (!dev->ifalias) +	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); +	if (!new_ifalias)  		return -ENOMEM; +	dev->ifalias = new_ifalias;  	strlcpy(dev->ifalias, alias, len+1);  	return len; @@ -1088,40 +1210,33 @@ EXPORT_SYMBOL(netdev_features_change);  void netdev_state_change(struct net_device *dev)  {  	
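dev_valid_name(), converted to return bool in the hunk above, enforces that an interface name is non-empty, shorter than IFNAMSIZ, not "." or "..", and contains no '/' or whitespace, so the name is safe to expose through sysfs. The same rules restated as a self-contained userspace check (if_name_valid is an illustrative name, not a kernel helper):

#include <stdbool.h>
#include <ctype.h>
#include <string.h>

#define IFNAMSIZ 16	/* kernel interface-name buffer size */

static bool if_name_valid(const char *name)
{
	if (*name == '\0')
		return false;
	if (strlen(name) >= IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;
	for (; *name; name++) {
		if (*name == '/' || isspace((unsigned char)*name))
			return false;
	}
	return true;
}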
if (dev->flags & IFF_UP) { -		call_netdevice_notifiers(NETDEV_CHANGE, dev); -		rtmsg_ifinfo(RTM_NEWLINK, dev, 0); +		struct netdev_notifier_change_info change_info; + +		change_info.flags_changed = 0; +		call_netdevice_notifiers_info(NETDEV_CHANGE, dev, +					      &change_info.info); +		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);  	}  }  EXPORT_SYMBOL(netdev_state_change); -int netdev_bonding_change(struct net_device *dev, unsigned long event) -{ -	return call_netdevice_notifiers(event, dev); -} -EXPORT_SYMBOL(netdev_bonding_change); -  /** - *	dev_load 	- load a network module - *	@net: the applicable net namespace - *	@name: name of interface + * 	netdev_notify_peers - notify network peers about existence of @dev + * 	@dev: network device   * - *	If a network interface is not present and the process has suitable - *	privileges this function loads the module. If module loading is not - *	available in this kernel then it becomes a nop. + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration.   */ - -void dev_load(struct net *net, const char *name) +void netdev_notify_peers(struct net_device *dev)  { -	struct net_device *dev; - -	rcu_read_lock(); -	dev = dev_get_by_name_rcu(net, name); -	rcu_read_unlock(); - -	if (!dev && capable(CAP_NET_ADMIN)) -		request_module("%s", name); +	rtnl_lock(); +	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); +	rtnl_unlock();  } -EXPORT_SYMBOL(dev_load); +EXPORT_SYMBOL(netdev_notify_peers);  static int __dev_open(struct net_device *dev)  { @@ -1130,20 +1245,20 @@ static int __dev_open(struct net_device *dev)  	ASSERT_RTNL(); -	/* -	 *	Is it even present? -	 */  	if (!netif_device_present(dev))  		return -ENODEV; +	/* Block netpoll from trying to do any rx path servicing. +	 * If we don't do this there is a chance ndo_poll_controller +	 * or ndo_poll may be running while we open the device +	 */ +	netpoll_poll_disable(dev); +  	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);  	ret = notifier_to_errno(ret);  	if (ret)  		return ret; -	/* -	 *	Call device private open method -	 */  	set_bit(__LINK_STATE_START, &dev->state);  	if (ops->ndo_validate_addr) @@ -1152,32 +1267,16 @@ static int __dev_open(struct net_device *dev)  	if (!ret && ops->ndo_open)  		ret = ops->ndo_open(dev); -	/* -	 *	If it went open OK then: -	 */ +	netpoll_poll_enable(dev);  	if (ret)  		clear_bit(__LINK_STATE_START, &dev->state);  	else { -		/* -		 *	Set the flags. -		 */  		dev->flags |= IFF_UP; - -		/* -		 *	Enable NET_DMA -		 */  		net_dmaengine_get(); - -		/* -		 *	Initialize multicasting status -		 */  		dev_set_rx_mode(dev); - -		/* -		 *	Wakeup transmit queue engine -		 */  		dev_activate(dev); +		add_device_randomness(dev->dev_addr, dev->addr_len);  	}  	return ret; @@ -1199,74 +1298,95 @@ int dev_open(struct net_device *dev)  {  	int ret; -	/* -	 *	Is it already up? -	 */  	if (dev->flags & IFF_UP)  		return 0; -	/* -	 *	Open device -	 */  	ret = __dev_open(dev);  	if (ret < 0)  		return ret; -	/* -	 *	... and announce new interface. 
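The dev_set_alias() hunk above stops assigning krealloc()'s return value straight back to dev->ifalias, because a failed reallocation would otherwise lose the old buffer. The same discipline in plain C on an illustrative struct (set_alias is not a kernel helper; len is assumed to be the length of the alias string):

#include <stdlib.h>
#include <string.h>
#include <errno.h>

struct labelled {
	char *alias;
};

static int set_alias(struct labelled *obj, const char *alias, size_t len)
{
	char *new_alias;

	if (!len) {			/* empty alias: drop the old one */
		free(obj->alias);
		obj->alias = NULL;
		return 0;
	}
	new_alias = realloc(obj->alias, len + 1);
	if (!new_alias)
		return -ENOMEM;		/* old buffer is still valid */
	obj->alias = new_alias;
	memcpy(obj->alias, alias, len);
	obj->alias[len] = '\0';
	return (int)len;
}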
-	 */ -	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); +	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);  	call_netdevice_notifiers(NETDEV_UP, dev);  	return ret;  }  EXPORT_SYMBOL(dev_open); -static int __dev_close(struct net_device *dev) +static int __dev_close_many(struct list_head *head)  { -	const struct net_device_ops *ops = dev->netdev_ops; +	struct net_device *dev;  	ASSERT_RTNL();  	might_sleep(); -	/* -	 *	Tell people we are going down, so that they can -	 *	prepare to death, when device is still operating. -	 */ -	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); +	list_for_each_entry(dev, head, close_list) { +		/* Temporarily disable netpoll until the interface is down */ +		netpoll_poll_disable(dev); -	clear_bit(__LINK_STATE_START, &dev->state); +		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); -	/* Synchronize to scheduled poll. We cannot touch poll list, -	 * it can be even on different cpu. So just clear netif_running(). -	 * -	 * dev->stop() will invoke napi_disable() on all of it's -	 * napi_struct instances on this device. -	 */ -	smp_mb__after_clear_bit(); /* Commit netif_running(). */ +		clear_bit(__LINK_STATE_START, &dev->state); -	dev_deactivate(dev); +		/* Synchronize to scheduled poll. We cannot touch poll list, it +		 * can be even on different cpu. So just clear netif_running(). +		 * +		 * dev->stop() will invoke napi_disable() on all of it's +		 * napi_struct instances on this device. +		 */ +		smp_mb__after_atomic(); /* Commit netif_running(). */ +	} -	/* -	 *	Call the device specific close. This cannot fail. -	 *	Only if device is UP -	 * -	 *	We allow it to be called even after a DETACH hot-plug -	 *	event. -	 */ -	if (ops->ndo_stop) -		ops->ndo_stop(dev); +	dev_deactivate_many(head); -	/* -	 *	Device is now down. -	 */ +	list_for_each_entry(dev, head, close_list) { +		const struct net_device_ops *ops = dev->netdev_ops; -	dev->flags &= ~IFF_UP; +		/* +		 *	Call the device specific close. This cannot fail. +		 *	Only if device is UP +		 * +		 *	We allow it to be called even after a DETACH hot-plug +		 *	event. 
+		 */ +		if (ops->ndo_stop) +			ops->ndo_stop(dev); -	/* -	 *	Shutdown NET_DMA -	 */ -	net_dmaengine_put(); +		dev->flags &= ~IFF_UP; +		net_dmaengine_put(); +		netpoll_poll_enable(dev); +	} + +	return 0; +} + +static int __dev_close(struct net_device *dev) +{ +	int retval; +	LIST_HEAD(single); + +	list_add(&dev->close_list, &single); +	retval = __dev_close_many(&single); +	list_del(&single); + +	return retval; +} + +static int dev_close_many(struct list_head *head) +{ +	struct net_device *dev, *tmp; + +	/* Remove the devices that don't need to be closed */ +	list_for_each_entry_safe(dev, tmp, head, close_list) +		if (!(dev->flags & IFF_UP)) +			list_del_init(&dev->close_list); + +	__dev_close_many(head); + +	list_for_each_entry_safe(dev, tmp, head, close_list) { +		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); +		call_netdevice_notifiers(NETDEV_DOWN, dev); +		list_del_init(&dev->close_list); +	}  	return 0;  } @@ -1282,17 +1402,13 @@ static int __dev_close(struct net_device *dev)   */  int dev_close(struct net_device *dev)  { -	if (!(dev->flags & IFF_UP)) -		return 0; - -	__dev_close(dev); - -	/* -	 * Tell people we are down -	 */ -	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); -	call_netdevice_notifiers(NETDEV_DOWN, dev); +	if (dev->flags & IFF_UP) { +		LIST_HEAD(single); +		list_add(&dev->close_list, &single); +		dev_close_many(&single); +		list_del(&single); +	}  	return 0;  }  EXPORT_SYMBOL(dev_close); @@ -1308,25 +1424,35 @@ EXPORT_SYMBOL(dev_close);   */  void dev_disable_lro(struct net_device *dev)  { -	if (dev->ethtool_ops && dev->ethtool_ops->get_flags && -	    dev->ethtool_ops->set_flags) { -		u32 flags = dev->ethtool_ops->get_flags(dev); -		if (flags & ETH_FLAG_LRO) { -			flags &= ~ETH_FLAG_LRO; -			dev->ethtool_ops->set_flags(dev, flags); -		} -	} -	WARN_ON(dev->features & NETIF_F_LRO); +	/* +	 * If we're trying to disable lro on a vlan device +	 * use the underlying physical device instead +	 */ +	if (is_vlan_dev(dev)) +		dev = vlan_dev_real_dev(dev); + +	/* the same for macvlan devices */ +	if (netif_is_macvlan(dev)) +		dev = macvlan_dev_real_dev(dev); + +	dev->wanted_features &= ~NETIF_F_LRO; +	netdev_update_features(dev); + +	if (unlikely(dev->features & NETIF_F_LRO)) +		netdev_WARN(dev, "failed to disable LRO!\n");  }  EXPORT_SYMBOL(dev_disable_lro); +static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, +				   struct net_device *dev) +{ +	struct netdev_notifier_info info; -static int dev_boot_phase = 1; +	netdev_notifier_info_init(&info, dev); +	return nb->notifier_call(nb, val, &info); +} -/* - *	Device change register/unregister. These are not inline or static - *	as we export them to the world. 
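The __dev_close_many()/dev_close_many() rework above turns single-device shutdown into three passes over a list: drop entries that are not up, quiesce every remaining device, then run the per-device teardown and notifications. A skeletal userspace rendering of that shape (mini_dev, close_many and the callbacks are hypothetical; the netpoll and notifier details are omitted):

#include <stdbool.h>
#include <stddef.h>

struct mini_dev {
	struct mini_dev *next;
	bool up;
};

static void close_many(struct mini_dev **head,
		       void (*quiesce)(struct mini_dev *),
		       void (*teardown)(struct mini_dev *))
{
	struct mini_dev **pp, *d;

	/* Pass 1: unlink devices that are already down. */
	for (pp = head; (d = *pp) != NULL; ) {
		if (!d->up)
			*pp = d->next;
		else
			pp = &d->next;
	}
	/* Pass 2: stop new activity on every remaining device. */
	for (d = *head; d; d = d->next)
		quiesce(d);
	/* Pass 3: per-device teardown and notification. */
	for (d = *head; d; d = d->next) {
		teardown(d);
		d->up = false;
	}
}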
- */ +static int dev_boot_phase = 1;  /**   *	register_netdevice_notifier - register a network notifier block @@ -1357,7 +1483,7 @@ int register_netdevice_notifier(struct notifier_block *nb)  		goto unlock;  	for_each_net(net) {  		for_each_netdev(net, dev) { -			err = nb->notifier_call(nb, NETDEV_REGISTER, dev); +			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);  			err = notifier_to_errno(err);  			if (err)  				goto rollback; @@ -1365,7 +1491,7 @@ int register_netdevice_notifier(struct notifier_block *nb)  			if (!(dev->flags & IFF_UP))  				continue; -			nb->notifier_call(nb, NETDEV_UP, dev); +			call_netdevice_notifier(nb, NETDEV_UP, dev);  		}  	} @@ -1378,17 +1504,18 @@ rollback:  	for_each_net(net) {  		for_each_netdev(net, dev) {  			if (dev == last) -				break; +				goto outroll;  			if (dev->flags & IFF_UP) { -				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); -				nb->notifier_call(nb, NETDEV_DOWN, dev); +				call_netdevice_notifier(nb, NETDEV_GOING_DOWN, +							dev); +				call_netdevice_notifier(nb, NETDEV_DOWN, dev);  			} -			nb->notifier_call(nb, NETDEV_UNREGISTER, dev); -			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev); +			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);  		}  	} +outroll:  	raw_notifier_chain_unregister(&netdev_chain, nb);  	goto unlock;  } @@ -1402,20 +1529,59 @@ EXPORT_SYMBOL(register_netdevice_notifier);   *	register_netdevice_notifier(). The notifier is unlinked into the   *	kernel structures and may then be reused. A negative errno code   *	is returned on a failure. + * + * 	After unregistering unregister and down device events are synthesized + *	for all devices on the device list to the removed notifier to remove + *	the need for special case cleanup code.   */  int unregister_netdevice_notifier(struct notifier_block *nb)  { +	struct net_device *dev; +	struct net *net;  	int err;  	rtnl_lock();  	err = raw_notifier_chain_unregister(&netdev_chain, nb); +	if (err) +		goto unlock; + +	for_each_net(net) { +		for_each_netdev(net, dev) { +			if (dev->flags & IFF_UP) { +				call_netdevice_notifier(nb, NETDEV_GOING_DOWN, +							dev); +				call_netdevice_notifier(nb, NETDEV_DOWN, dev); +			} +			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); +		} +	} +unlock:  	rtnl_unlock();  	return err;  }  EXPORT_SYMBOL(unregister_netdevice_notifier);  /** + *	call_netdevice_notifiers_info - call all network notifier blocks + *	@val: value passed unmodified to notifier function + *	@dev: net_device pointer passed unmodified to notifier function + *	@info: notifier information data + * + *	Call all network notifier blocks.  Parameters and return value + *	are as for raw_notifier_call_chain(). 
+ */ + +static int call_netdevice_notifiers_info(unsigned long val, +					 struct net_device *dev, +					 struct netdev_notifier_info *info) +{ +	ASSERT_RTNL(); +	netdev_notifier_info_init(info, dev); +	return raw_notifier_call_chain(&netdev_chain, val, info); +} + +/**   *	call_netdevice_notifiers - call all network notifier blocks   *      @val: value passed unmodified to notifier function   *      @dev: net_device pointer passed unmodified to notifier function @@ -1426,38 +1592,104 @@ EXPORT_SYMBOL(unregister_netdevice_notifier);  int call_netdevice_notifiers(unsigned long val, struct net_device *dev)  { -	ASSERT_RTNL(); -	return raw_notifier_call_chain(&netdev_chain, val, dev); +	struct netdev_notifier_info info; + +	return call_netdevice_notifiers_info(val, dev, &info);  } +EXPORT_SYMBOL(call_netdevice_notifiers); -/* When > 0 there are consumers of rx skb time stamps */ -static atomic_t netstamp_needed = ATOMIC_INIT(0); +static struct static_key netstamp_needed __read_mostly; +#ifdef HAVE_JUMP_LABEL +/* We are not allowed to call static_key_slow_dec() from irq context + * If net_disable_timestamp() is called from irq context, defer the + * static_key_slow_dec() calls. + */ +static atomic_t netstamp_needed_deferred; +#endif  void net_enable_timestamp(void)  { -	atomic_inc(&netstamp_needed); +#ifdef HAVE_JUMP_LABEL +	int deferred = atomic_xchg(&netstamp_needed_deferred, 0); + +	if (deferred) { +		while (--deferred) +			static_key_slow_dec(&netstamp_needed); +		return; +	} +#endif +	static_key_slow_inc(&netstamp_needed);  }  EXPORT_SYMBOL(net_enable_timestamp);  void net_disable_timestamp(void)  { -	atomic_dec(&netstamp_needed); +#ifdef HAVE_JUMP_LABEL +	if (in_interrupt()) { +		atomic_inc(&netstamp_needed_deferred); +		return; +	} +#endif +	static_key_slow_dec(&netstamp_needed);  }  EXPORT_SYMBOL(net_disable_timestamp);  static inline void net_timestamp_set(struct sk_buff *skb)  { -	if (atomic_read(&netstamp_needed)) +	skb->tstamp.tv64 = 0; +	if (static_key_false(&netstamp_needed))  		__net_timestamp(skb); -	else -		skb->tstamp.tv64 = 0;  } -static inline void net_timestamp_check(struct sk_buff *skb) +#define net_timestamp_check(COND, SKB)			\ +	if (static_key_false(&netstamp_needed)) {		\ +		if ((COND) && !(SKB)->tstamp.tv64)	\ +			__net_timestamp(SKB);		\ +	}						\ + +bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)  { -	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed)) -		__net_timestamp(skb); +	unsigned int len; + +	if (!(dev->flags & IFF_UP)) +		return false; + +	len = dev->mtu + dev->hard_header_len + VLAN_HLEN; +	if (skb->len <= len) +		return true; + +	/* if TSO is enabled, we don't care about the length as the packet +	 * could be forwarded without being segmented before +	 */ +	if (skb_is_gso(skb)) +		return true; + +	return false; +} +EXPORT_SYMBOL_GPL(is_skb_forwardable); + +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ +	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { +		if (skb_copy_ubufs(skb, GFP_ATOMIC)) { +			atomic_long_inc(&dev->rx_dropped); +			kfree_skb(skb); +			return NET_RX_DROP; +		} +	} + +	if (unlikely(!is_skb_forwardable(dev, skb))) { +		atomic_long_inc(&dev->rx_dropped); +		kfree_skb(skb); +		return NET_RX_DROP; +	} + +	skb_scrub_packet(skb, true); +	skb->protocol = eth_type_trans(skb, dev); + +	return 0;  } +EXPORT_SYMBOL_GPL(__dev_forward_skb);  /**   * dev_forward_skb - loopback an skb to another netif @@ -1479,23 +1711,33 @@ static inline void net_timestamp_check(struct sk_buff *skb)   */  int 
dev_forward_skb(struct net_device *dev, struct sk_buff *skb)  { -	skb_orphan(skb); -	nf_reset(skb); - -	if (unlikely(!(dev->flags & IFF_UP) || -		     (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) { -		atomic_long_inc(&dev->rx_dropped); -		kfree_skb(skb); -		return NET_RX_DROP; -	} -	skb_set_dev(skb, dev); -	skb->tstamp.tv64 = 0; -	skb->pkt_type = PACKET_HOST; -	skb->protocol = eth_type_trans(skb, dev); -	return netif_rx(skb); +	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);  }  EXPORT_SYMBOL_GPL(dev_forward_skb); +static inline int deliver_skb(struct sk_buff *skb, +			      struct packet_type *pt_prev, +			      struct net_device *orig_dev) +{ +	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) +		return -ENOMEM; +	atomic_inc(&skb->users); +	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); +} + +static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) +{ +	if (!ptype->af_packet_priv || !skb->sk) +		return false; + +	if (ptype->id_match) +		return ptype->id_match(ptype, skb->sk); +	else if ((struct sock *)ptype->af_packet_priv == skb->sk) +		return true; + +	return false; +} +  /*   *	Support routine. Sends outgoing frames to any network   *	taps currently in use. @@ -1504,13 +1746,8 @@ EXPORT_SYMBOL_GPL(dev_forward_skb);  static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)  {  	struct packet_type *ptype; - -#ifdef CONFIG_NET_CLS_ACT -	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS))) -		net_timestamp_set(skb); -#else -	net_timestamp_set(skb); -#endif +	struct sk_buff *skb2 = NULL; +	struct packet_type *pt_prev = NULL;  	rcu_read_lock();  	list_for_each_entry_rcu(ptype, &ptype_all, list) { @@ -1518,12 +1755,19 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)  		 * they originated from - MvS (miquels@drinkel.ow.org)  		 */  		if ((ptype->dev == dev || !ptype->dev) && -		    (ptype->af_packet_priv == NULL || -		     (struct sock *)ptype->af_packet_priv != skb->sk)) { -			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); +		    (!skb_loop_sk(ptype, skb))) { +			if (pt_prev) { +				deliver_skb(skb2, pt_prev, skb->dev); +				pt_prev = ptype; +				continue; +			} + +			skb2 = skb_clone(skb, GFP_ATOMIC);  			if (!skb2)  				break; +			net_timestamp_set(skb2); +  			/* skb->nh should be correctly  			   set by sender, so that the second statement is  			   just protection against buggy protocols. 
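is_skb_forwardable(), added a few hunks above, lets __dev_forward_skb() accept a frame when the target interface is up and the frame either fits within MTU plus link-layer headers (plus one VLAN tag) or is GSO and will be segmented later. A compact restatement of that test (fwd_dev and fwd_pkt are illustrative stand-ins for net_device and sk_buff):

#include <stdbool.h>

#define VLAN_HLEN 4

struct fwd_dev { bool up; unsigned int mtu, hard_header_len; };
struct fwd_pkt { unsigned int len; bool is_gso; };

static bool pkt_forwardable(const struct fwd_dev *dev, const struct fwd_pkt *pkt)
{
	if (!dev->up)
		return false;
	if (pkt->len <= dev->mtu + dev->hard_header_len + VLAN_HLEN)
		return true;
	return pkt->is_gso;	/* oversized, but will be segmented before xmit */
}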
@@ -1531,23 +1775,286 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)  			skb_reset_mac_header(skb2);  			if (skb_network_header(skb2) < skb2->data || -			    skb2->network_header > skb2->tail) { -				if (net_ratelimit()) -					printk(KERN_CRIT "protocol %04x is " -					       "buggy, dev %s\n", -					       ntohs(skb2->protocol), -					       dev->name); +			    skb_network_header(skb2) > skb_tail_pointer(skb2)) { +				net_crit_ratelimited("protocol %04x is buggy, dev %s\n", +						     ntohs(skb2->protocol), +						     dev->name);  				skb_reset_network_header(skb2);  			}  			skb2->transport_header = skb2->network_header;  			skb2->pkt_type = PACKET_OUTGOING; -			ptype->func(skb2, skb->dev, ptype, skb->dev); +			pt_prev = ptype;  		}  	} +	if (pt_prev) +		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);  	rcu_read_unlock();  } +/** + * netif_setup_tc - Handle tc mappings on real_num_tx_queues change + * @dev: Network device + * @txq: number of queues available + * + * If real_num_tx_queues is changed the tc mappings may no longer be + * valid. To resolve this verify the tc mapping remains valid and if + * not NULL the mapping. With no priorities mapping to this + * offset/count pair it will no longer be used. In the worst case TC0 + * is invalid nothing can be done so disable priority mappings. If is + * expected that drivers will fix this mapping if they can before + * calling netif_set_real_num_tx_queues. + */ +static void netif_setup_tc(struct net_device *dev, unsigned int txq) +{ +	int i; +	struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; + +	/* If TC0 is invalidated disable TC mapping */ +	if (tc->offset + tc->count > txq) { +		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); +		dev->num_tc = 0; +		return; +	} + +	/* Invalidated prio to tc mappings set to TC0 */ +	for (i = 1; i < TC_BITMASK + 1; i++) { +		int q = netdev_get_prio_tc_map(dev, i); + +		tc = &dev->tc_to_txq[q]; +		if (tc->offset + tc->count > txq) { +			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. 
Setting map to 0\n", +				i, q); +			netdev_set_prio_tc_map(dev, i, 0); +		} +	} +} + +#ifdef CONFIG_XPS +static DEFINE_MUTEX(xps_map_mutex); +#define xmap_dereference(P)		\ +	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) + +static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps, +					int cpu, u16 index) +{ +	struct xps_map *map = NULL; +	int pos; + +	if (dev_maps) +		map = xmap_dereference(dev_maps->cpu_map[cpu]); + +	for (pos = 0; map && pos < map->len; pos++) { +		if (map->queues[pos] == index) { +			if (map->len > 1) { +				map->queues[pos] = map->queues[--map->len]; +			} else { +				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL); +				kfree_rcu(map, rcu); +				map = NULL; +			} +			break; +		} +	} + +	return map; +} + +static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) +{ +	struct xps_dev_maps *dev_maps; +	int cpu, i; +	bool active = false; + +	mutex_lock(&xps_map_mutex); +	dev_maps = xmap_dereference(dev->xps_maps); + +	if (!dev_maps) +		goto out_no_maps; + +	for_each_possible_cpu(cpu) { +		for (i = index; i < dev->num_tx_queues; i++) { +			if (!remove_xps_queue(dev_maps, cpu, i)) +				break; +		} +		if (i == dev->num_tx_queues) +			active = true; +	} + +	if (!active) { +		RCU_INIT_POINTER(dev->xps_maps, NULL); +		kfree_rcu(dev_maps, rcu); +	} + +	for (i = index; i < dev->num_tx_queues; i++) +		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), +					     NUMA_NO_NODE); + +out_no_maps: +	mutex_unlock(&xps_map_mutex); +} + +static struct xps_map *expand_xps_map(struct xps_map *map, +				      int cpu, u16 index) +{ +	struct xps_map *new_map; +	int alloc_len = XPS_MIN_MAP_ALLOC; +	int i, pos; + +	for (pos = 0; map && pos < map->len; pos++) { +		if (map->queues[pos] != index) +			continue; +		return map; +	} + +	/* Need to add queue to this CPU's existing map */ +	if (map) { +		if (pos < map->alloc_len) +			return map; + +		alloc_len = map->alloc_len * 2; +	} + +	/* Need to allocate new map to store queue on this CPU's map */ +	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, +			       cpu_to_node(cpu)); +	if (!new_map) +		return NULL; + +	for (i = 0; i < pos; i++) +		new_map->queues[i] = map->queues[i]; +	new_map->alloc_len = alloc_len; +	new_map->len = pos; + +	return new_map; +} + +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, +			u16 index) +{ +	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; +	struct xps_map *map, *new_map; +	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES); +	int cpu, numa_node_id = -2; +	bool active = false; + +	mutex_lock(&xps_map_mutex); + +	dev_maps = xmap_dereference(dev->xps_maps); + +	/* allocate memory for queue storage */ +	for_each_online_cpu(cpu) { +		if (!cpumask_test_cpu(cpu, mask)) +			continue; + +		if (!new_dev_maps) +			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); +		if (!new_dev_maps) { +			mutex_unlock(&xps_map_mutex); +			return -ENOMEM; +		} + +		map = dev_maps ? 
xmap_dereference(dev_maps->cpu_map[cpu]) : +				 NULL; + +		map = expand_xps_map(map, cpu, index); +		if (!map) +			goto error; + +		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); +	} + +	if (!new_dev_maps) +		goto out_no_new_maps; + +	for_each_possible_cpu(cpu) { +		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { +			/* add queue to CPU maps */ +			int pos = 0; + +			map = xmap_dereference(new_dev_maps->cpu_map[cpu]); +			while ((pos < map->len) && (map->queues[pos] != index)) +				pos++; + +			if (pos == map->len) +				map->queues[map->len++] = index; +#ifdef CONFIG_NUMA +			if (numa_node_id == -2) +				numa_node_id = cpu_to_node(cpu); +			else if (numa_node_id != cpu_to_node(cpu)) +				numa_node_id = -1; +#endif +		} else if (dev_maps) { +			/* fill in the new device map from the old device map */ +			map = xmap_dereference(dev_maps->cpu_map[cpu]); +			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); +		} + +	} + +	rcu_assign_pointer(dev->xps_maps, new_dev_maps); + +	/* Cleanup old maps */ +	if (dev_maps) { +		for_each_possible_cpu(cpu) { +			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); +			map = xmap_dereference(dev_maps->cpu_map[cpu]); +			if (map && map != new_map) +				kfree_rcu(map, rcu); +		} + +		kfree_rcu(dev_maps, rcu); +	} + +	dev_maps = new_dev_maps; +	active = true; + +out_no_new_maps: +	/* update Tx queue numa node */ +	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), +				     (numa_node_id >= 0) ? numa_node_id : +				     NUMA_NO_NODE); + +	if (!dev_maps) +		goto out_no_maps; + +	/* removes queue from unused CPUs */ +	for_each_possible_cpu(cpu) { +		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) +			continue; + +		if (remove_xps_queue(dev_maps, cpu, index)) +			active = true; +	} + +	/* free map if not active */ +	if (!active) { +		RCU_INIT_POINTER(dev->xps_maps, NULL); +		kfree_rcu(dev_maps, rcu); +	} + +out_no_maps: +	mutex_unlock(&xps_map_mutex); + +	return 0; +error: +	/* remove any maps that we added */ +	for_each_possible_cpu(cpu) { +		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); +		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : +				 NULL; +		if (new_map && new_map != map) +			kfree(new_map); +	} + +	mutex_unlock(&xps_map_mutex); + +	kfree(new_dev_maps); +	return -ENOMEM; +} +EXPORT_SYMBOL(netif_set_xps_queue); + +#endif  /*   * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues   * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. 
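The XPS helpers above keep a per-CPU array of transmit-queue indices; remove_xps_queue() deletes an entry by swapping in the last element and frees the map once it is empty, deferring the actual free through kfree_rcu(). A userspace sketch of the same array bookkeeping without the RCU deferral (queue_map and map_remove_queue are illustrative names):

#include <stdlib.h>

struct queue_map {
	unsigned int len;
	unsigned int alloc_len;
	unsigned short queues[];
};

/* Returns the (possibly freed-and-NULLed) map after removing `index`. */
static struct queue_map *map_remove_queue(struct queue_map *map,
					  unsigned short index)
{
	unsigned int pos;

	if (!map)
		return NULL;
	for (pos = 0; pos < map->len; pos++) {
		if (map->queues[pos] != index)
			continue;
		if (map->len > 1) {
			map->queues[pos] = map->queues[--map->len];	/* swap with last */
		} else {
			free(map);					/* last entry gone */
			map = NULL;
		}
		break;
	}
	return map;
}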
@@ -1559,7 +2066,8 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)  	if (txq < 1 || txq > dev->num_tx_queues)  		return -EINVAL; -	if (dev->reg_state == NETREG_REGISTERED) { +	if (dev->reg_state == NETREG_REGISTERED || +	    dev->reg_state == NETREG_UNREGISTERING) {  		ASSERT_RTNL();  		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, @@ -1567,8 +2075,15 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)  		if (rc)  			return rc; -		if (txq < dev->real_num_tx_queues) +		if (dev->num_tc) +			netif_setup_tc(dev, txq); + +		if (txq < dev->real_num_tx_queues) {  			qdisc_reset_all_tx_gt(dev, txq); +#ifdef CONFIG_XPS +			netif_reset_xps_queues_gt(dev, txq); +#endif +		}  	}  	dev->real_num_tx_queues = txq; @@ -1576,7 +2091,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)  }  EXPORT_SYMBOL(netif_set_real_num_tx_queues); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS  /**   *	netif_set_real_num_rx_queues - set actual number of RX queues used   *	@dev: Network device @@ -1609,6 +2124,18 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)  EXPORT_SYMBOL(netif_set_real_num_rx_queues);  #endif +/** + * netif_get_num_default_rss_queues - default number of RSS queues + * + * This routine should set an upper limit on the number of RSS queues + * used by default by multiqueue devices. + */ +int netif_get_num_default_rss_queues(void) +{ +	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); +} +EXPORT_SYMBOL(netif_get_num_default_rss_queues); +  static inline void __netif_reschedule(struct Qdisc *q)  {  	struct softnet_data *sd; @@ -1630,30 +2157,42 @@ void __netif_schedule(struct Qdisc *q)  }  EXPORT_SYMBOL(__netif_schedule); -void dev_kfree_skb_irq(struct sk_buff *skb) +struct dev_kfree_skb_cb { +	enum skb_free_reason reason; +}; + +static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)  { -	if (atomic_dec_and_test(&skb->users)) { -		struct softnet_data *sd; -		unsigned long flags; +	return (struct dev_kfree_skb_cb *)skb->cb; +} + +void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) +{ +	unsigned long flags; -		local_irq_save(flags); -		sd = &__get_cpu_var(softnet_data); -		skb->next = sd->completion_queue; -		sd->completion_queue = skb; -		raise_softirq_irqoff(NET_TX_SOFTIRQ); -		local_irq_restore(flags); +	if (likely(atomic_read(&skb->users) == 1)) { +		smp_rmb(); +		atomic_set(&skb->users, 0); +	} else if (likely(!atomic_dec_and_test(&skb->users))) { +		return;  	} +	get_kfree_skb_cb(skb)->reason = reason; +	local_irq_save(flags); +	skb->next = __this_cpu_read(softnet_data.completion_queue); +	__this_cpu_write(softnet_data.completion_queue, skb); +	raise_softirq_irqoff(NET_TX_SOFTIRQ); +	local_irq_restore(flags);  } -EXPORT_SYMBOL(dev_kfree_skb_irq); +EXPORT_SYMBOL(__dev_kfree_skb_irq); -void dev_kfree_skb_any(struct sk_buff *skb) +void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)  {  	if (in_irq() || irqs_disabled()) -		dev_kfree_skb_irq(skb); +		__dev_kfree_skb_irq(skb, reason);  	else  		dev_kfree_skb(skb);  } -EXPORT_SYMBOL(dev_kfree_skb_any); +EXPORT_SYMBOL(__dev_kfree_skb_any);  /** @@ -1687,62 +2226,25 @@ void netif_device_attach(struct net_device *dev)  }  EXPORT_SYMBOL(netif_device_attach); -static bool can_checksum_protocol(unsigned long features, __be16 protocol) -{ -	return ((features & NETIF_F_NO_CSUM) || -		((features & NETIF_F_V4_CSUM) && -		 protocol == htons(ETH_P_IP)) || -		
((features & NETIF_F_V6_CSUM) && -		 protocol == htons(ETH_P_IPV6)) || -		((features & NETIF_F_FCOE_CRC) && -		 protocol == htons(ETH_P_FCOE))); -} - -static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) +static void skb_warn_bad_offload(const struct sk_buff *skb)  { -	__be16 protocol = skb->protocol; -	int features = dev->features; +	static const netdev_features_t null_features = 0; +	struct net_device *dev = skb->dev; +	const char *driver = ""; -	if (vlan_tx_tag_present(skb)) { -		features &= dev->vlan_features; -	} else if (protocol == htons(ETH_P_8021Q)) { -		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; -		protocol = veh->h_vlan_encapsulated_proto; -		features &= dev->vlan_features; -	} +	if (!net_ratelimit()) +		return; -	return can_checksum_protocol(features, protocol); -} +	if (dev && dev->dev.parent) +		driver = dev_driver_string(dev->dev.parent); -/** - * skb_dev_set -- assign a new device to a buffer - * @skb: buffer for the new device - * @dev: network device - * - * If an skb is owned by a device already, we have to reset - * all data private to the namespace a device belongs to - * before assigning it a new device. - */ -#ifdef CONFIG_NET_NS -void skb_set_dev(struct sk_buff *skb, struct net_device *dev) -{ -	skb_dst_drop(skb); -	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) { -		secpath_reset(skb); -		nf_reset(skb); -		skb_init_secmark(skb); -		skb->mark = 0; -		skb->priority = 0; -		skb->nf_trace = 0; -		skb->ipvs_property = 0; -#ifdef CONFIG_NET_SCHED -		skb->tc_index = 0; -#endif -	} -	skb->dev = dev; +	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " +	     "gso_type=%d ip_summed=%d\n", +	     driver, dev ? &dev->features : &null_features, +	     skb->sk ? &skb->sk->sk_route_caps : &null_features, +	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size, +	     skb_shinfo(skb)->gso_type, skb->ip_summed);  } -EXPORT_SYMBOL(skb_set_dev); -#endif /* CONFIG_NET_NS */  /*   * Invalidate hardware checksum when packet is to be mangled, and @@ -1757,11 +2259,20 @@ int skb_checksum_help(struct sk_buff *skb)  		goto out_set_summed;  	if (unlikely(skb_shinfo(skb)->gso_size)) { -		/* Let GSO fix up the checksum. */ -		goto out_set_summed; +		skb_warn_bad_offload(skb); +		return -EINVAL;  	} -	offset = skb->csum_start - skb_headroom(skb); +	/* Before computing a checksum, we should make sure no frag could +	 * be modified by an external entity : checksum could be wrong. +	 */ +	if (skb_has_shared_frag(skb)) { +		ret = __skb_linearize(skb); +		if (ret) +			goto out; +	} + +	offset = skb_checksum_start_offset(skb);  	BUG_ON(offset >= skb_headlen(skb));  	csum = skb_checksum(skb, offset, skb->len - offset, 0); @@ -1783,69 +2294,85 @@ out:  }  EXPORT_SYMBOL(skb_checksum_help); -/** - *	skb_gso_segment - Perform segmentation on skb. - *	@skb: buffer to segment - *	@features: features for the output path (see dev->features) - * - *	This function segments the given skb and returns a list of segments. - * - *	It may return NULL if the skb requires no segmentation.  This is - *	only possible when GSO is used for verifying header integrity. 
- */ -struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) +__be16 skb_network_protocol(struct sk_buff *skb, int *depth)  { -	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); -	struct packet_type *ptype; +	unsigned int vlan_depth = skb->mac_len;  	__be16 type = skb->protocol; -	int vlan_depth = ETH_HLEN; -	int err; -	while (type == htons(ETH_P_8021Q)) { -		struct vlan_hdr *vh; +	/* Tunnel gso handlers can set protocol to ethernet. */ +	if (type == htons(ETH_P_TEB)) { +		struct ethhdr *eth; -		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) -			return ERR_PTR(-EINVAL); +		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) +			return 0; -		vh = (struct vlan_hdr *)(skb->data + vlan_depth); -		type = vh->h_vlan_encapsulated_proto; -		vlan_depth += VLAN_HLEN; +		eth = (struct ethhdr *)skb_mac_header(skb); +		type = eth->h_proto;  	} -	skb_reset_mac_header(skb); -	skb->mac_len = skb->network_header - skb->mac_header; -	__skb_pull(skb, skb->mac_len); +	/* if skb->protocol is 802.1Q/AD then the header should already be +	 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at +	 * ETH_HLEN otherwise +	 */ +	if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { +		if (vlan_depth) { +			if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN))) +				return 0; +			vlan_depth -= VLAN_HLEN; +		} else { +			vlan_depth = ETH_HLEN; +		} +		do { +			struct vlan_hdr *vh; -	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { -		struct net_device *dev = skb->dev; -		struct ethtool_drvinfo info = {}; +			if (unlikely(!pskb_may_pull(skb, +						    vlan_depth + VLAN_HLEN))) +				return 0; -		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo) -			dev->ethtool_ops->get_drvinfo(dev, &info); +			vh = (struct vlan_hdr *)(skb->data + vlan_depth); +			type = vh->h_vlan_encapsulated_proto; +			vlan_depth += VLAN_HLEN; +		} while (type == htons(ETH_P_8021Q) || +			 type == htons(ETH_P_8021AD)); +	} -		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n", -		     info.driver, dev ? dev->features : 0L, -		     skb->sk ? skb->sk->sk_route_caps : 0L, -		     skb->len, skb->data_len, skb->ip_summed); +	*depth = vlan_depth; -		if (skb_header_cloned(skb) && -		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) -			return ERR_PTR(err); -	} +	return type; +} + +/** + *	skb_mac_gso_segment - mac layer segmentation handler. 
+ *	@skb: buffer to segment + *	@features: features for the output path (see dev->features) + */ +struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, +				    netdev_features_t features) +{ +	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); +	struct packet_offload *ptype; +	int vlan_depth = skb->mac_len; +	__be16 type = skb_network_protocol(skb, &vlan_depth); + +	if (unlikely(!type)) +		return ERR_PTR(-EINVAL); + +	__skb_pull(skb, vlan_depth);  	rcu_read_lock(); -	list_for_each_entry_rcu(ptype, -			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { -		if (ptype->type == type && !ptype->dev && ptype->gso_segment) { +	list_for_each_entry_rcu(ptype, &offload_base, list) { +		if (ptype->type == type && ptype->callbacks.gso_segment) {  			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { -				err = ptype->gso_send_check(skb); +				int err; + +				err = ptype->callbacks.gso_send_check(skb);  				segs = ERR_PTR(err);  				if (err || skb_gso_ok(skb, features))  					break;  				__skb_push(skb, (skb->data -  						 skb_network_header(skb)));  			} -			segs = ptype->gso_segment(skb, features); +			segs = ptype->callbacks.gso_segment(skb, features);  			break;  		}  	} @@ -1855,15 +2382,59 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)  	return segs;  } -EXPORT_SYMBOL(skb_gso_segment); +EXPORT_SYMBOL(skb_mac_gso_segment); + + +/* openvswitch calls this on rx path, so we need a different check. + */ +static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) +{ +	if (tx_path) +		return skb->ip_summed != CHECKSUM_PARTIAL; +	else +		return skb->ip_summed == CHECKSUM_NONE; +} + +/** + *	__skb_gso_segment - Perform segmentation on skb. + *	@skb: buffer to segment + *	@features: features for the output path (see dev->features) + *	@tx_path: whether it is called in TX path + * + *	This function segments the given skb and returns a list of segments. + * + *	It may return NULL if the skb requires no segmentation.  This is + *	only possible when GSO is used for verifying header integrity. + */ +struct sk_buff *__skb_gso_segment(struct sk_buff *skb, +				  netdev_features_t features, bool tx_path) +{ +	if (unlikely(skb_needs_check(skb, tx_path))) { +		int err; + +		skb_warn_bad_offload(skb); + +		if (skb_header_cloned(skb) && +		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) +			return ERR_PTR(err); +	} + +	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); +	SKB_GSO_CB(skb)->encap_level = 0; + +	skb_reset_mac_header(skb); +	skb_reset_mac_len(skb); + +	return skb_mac_gso_segment(skb, features); +} +EXPORT_SYMBOL(__skb_gso_segment);  /* Take action when hardware reception checksum errors are detected. */  #ifdef CONFIG_BUG  void netdev_rx_csum_fault(struct net_device *dev)  {  	if (net_ratelimit()) { -		printk(KERN_ERR "%s: hw csum failure.\n", -			dev ? dev->name : "<unknown>"); +		pr_err("%s: hw csum failure\n", dev ? 
dev->name : "<unknown>");  		dump_stack();  	}  } @@ -1880,9 +2451,11 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)  #ifdef CONFIG_HIGHMEM  	int i;  	if (!(dev->features & NETIF_F_HIGHDMA)) { -		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) -			if (PageHighMem(skb_shinfo(skb)->frags[i].page)) +		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { +			skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; +			if (PageHighMem(skb_frag_page(frag)))  				return 1; +		}  	}  	if (PCI_DMA_BUS_IS_PHYS) { @@ -1891,7 +2464,8 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)  		if (!pdev)  			return 0;  		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { -			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page); +			skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; +			dma_addr_t addr = page_to_phys(skb_frag_page(frag));  			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)  				return 1;  		} @@ -1910,13 +2484,8 @@ static void dev_gso_skb_destructor(struct sk_buff *skb)  {  	struct dev_gso_cb *cb; -	do { -		struct sk_buff *nskb = skb->next; - -		skb->next = nskb->next; -		nskb->next = NULL; -		kfree_skb(nskb); -	} while (skb->next); +	kfree_skb_list(skb->next); +	skb->next = NULL;  	cb = DEV_GSO_CB(skb);  	if (cb->destructor) @@ -1926,16 +2495,14 @@ static void dev_gso_skb_destructor(struct sk_buff *skb)  /**   *	dev_gso_segment - Perform emulated hardware segmentation on skb.   *	@skb: buffer to segment + *	@features: device features as applicable to this skb   *   *	This function segments the given skb and stores the list of segments   *	in skb->next.   */ -static int dev_gso_segment(struct sk_buff *skb) +static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)  { -	struct net_device *dev = skb->dev;  	struct sk_buff *segs; -	int features = dev->features & ~(illegal_highdma(dev, skb) ? -					 NETIF_F_SG : 0);  	segs = skb_gso_segment(skb, features); @@ -1953,103 +2520,117 @@ static int dev_gso_segment(struct sk_buff *skb)  	return 0;  } -/* - * Try to orphan skb early, right before transmission by the device. - * We cannot orphan skb if tx timestamp is requested or the sk-reference - * is needed on driver level for other reasons, e.g. see net/can/raw.c +/* If MPLS offload request, verify we are testing hardware MPLS features + * instead of standard features for the netdev.   */ -static inline void skb_orphan_try(struct sk_buff *skb) +#ifdef CONFIG_NET_MPLS_GSO +static netdev_features_t net_mpls_features(struct sk_buff *skb, +					   netdev_features_t features, +					   __be16 type)  { -	struct sock *sk = skb->sk; +	if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC)) +		features &= skb->dev->mpls_features; -	if (sk && !skb_shinfo(skb)->tx_flags) { -		/* skb_tx_hash() wont be able to get sk. 
-		 * We copy sk_hash into skb->rxhash -		 */ -		if (!skb->rxhash) -			skb->rxhash = sk->sk_hash; -		skb_orphan(skb); -	} +	return features; +} +#else +static netdev_features_t net_mpls_features(struct sk_buff *skb, +					   netdev_features_t features, +					   __be16 type) +{ +	return features;  } +#endif -int netif_get_vlan_features(struct sk_buff *skb, struct net_device *dev) +static netdev_features_t harmonize_features(struct sk_buff *skb, +	netdev_features_t features)  { -	__be16 protocol = skb->protocol; +	int tmp; +	__be16 type; -	if (protocol == htons(ETH_P_8021Q)) { -		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; -		protocol = veh->h_vlan_encapsulated_proto; -	} else if (!skb->vlan_tci) -		return dev->features; +	type = skb_network_protocol(skb, &tmp); +	features = net_mpls_features(skb, features, type); -	if (protocol != htons(ETH_P_8021Q)) -		return dev->features & dev->vlan_features; -	else -		return 0; +	if (skb->ip_summed != CHECKSUM_NONE && +	    !can_checksum_protocol(features, type)) { +		features &= ~NETIF_F_ALL_CSUM; +	} else if (illegal_highdma(skb->dev, skb)) { +		features &= ~NETIF_F_SG; +	} + +	return features;  } -EXPORT_SYMBOL(netif_get_vlan_features); -/* - * Returns true if either: - *	1. skb has frag_list and the device doesn't support FRAGLIST, or - *	2. skb is fragmented and the device does not support SG, or if - *	   at least one of fragments is in highmem and device does not - *	   support DMA from it. - */ -static inline int skb_needs_linearize(struct sk_buff *skb, -				      struct net_device *dev) +netdev_features_t netif_skb_features(struct sk_buff *skb)  { -	if (skb_is_nonlinear(skb)) { -		int features = dev->features; +	__be16 protocol = skb->protocol; +	netdev_features_t features = skb->dev->features; -		if (vlan_tx_tag_present(skb)) -			features &= dev->vlan_features; +	if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) +		features &= ~NETIF_F_GSO_MASK; -		return (skb_has_frag_list(skb) && -			!(features & NETIF_F_FRAGLIST)) || -			(skb_shinfo(skb)->nr_frags && -			(!(features & NETIF_F_SG) || -			illegal_highdma(dev, skb))); +	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { +		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; +		protocol = veh->h_vlan_encapsulated_proto; +	} else if (!vlan_tx_tag_present(skb)) { +		return harmonize_features(skb, features);  	} -	return 0; +	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | +					       NETIF_F_HW_VLAN_STAG_TX); + +	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) +		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | +				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | +				NETIF_F_HW_VLAN_STAG_TX; + +	return harmonize_features(skb, features);  } +EXPORT_SYMBOL(netif_skb_features);  int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,  			struct netdev_queue *txq)  {  	const struct net_device_ops *ops = dev->netdev_ops;  	int rc = NETDEV_TX_OK; +	unsigned int skb_len;  	if (likely(!skb->next)) { +		netdev_features_t features; +  		/* -		 * If device doesnt need skb->dst, release it right now while +		 * If device doesn't need skb->dst, release it right now while  		 * its hot in this cpu cache  		 */  		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)  			skb_dst_drop(skb); -		if (!list_empty(&ptype_all)) -			dev_queue_xmit_nit(skb, dev); - -		skb_orphan_try(skb); +		features = netif_skb_features(skb);  		if (vlan_tx_tag_present(skb) && -		    !(dev->features & NETIF_F_HW_VLAN_TX)) { -			skb 
= __vlan_put_tag(skb, vlan_tx_tag_get(skb)); +		    !vlan_hw_offload_capable(features, skb->vlan_proto)) { +			skb = __vlan_put_tag(skb, skb->vlan_proto, +					     vlan_tx_tag_get(skb));  			if (unlikely(!skb))  				goto out;  			skb->vlan_tci = 0;  		} -		if (netif_needs_gso(dev, skb)) { -			if (unlikely(dev_gso_segment(skb))) +		/* If encapsulation offload request, verify we are testing +		 * hardware encapsulation features instead of standard +		 * features for the netdev +		 */ +		if (skb->encapsulation) +			features &= dev->hw_enc_features; + +		if (netif_needs_gso(skb, features)) { +			if (unlikely(dev_gso_segment(skb, features)))  				goto out_kfree_skb;  			if (skb->next)  				goto gso;  		} else { -			if (skb_needs_linearize(skb, dev) && +			if (skb_needs_linearize(skb, features) &&  			    __skb_linearize(skb))  				goto out_kfree_skb; @@ -2058,16 +2639,25 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,  			 * checksumming here.  			 */  			if (skb->ip_summed == CHECKSUM_PARTIAL) { -				skb_set_transport_header(skb, skb->csum_start - -					      skb_headroom(skb)); -				if (!dev_can_checksum(dev, skb) && +				if (skb->encapsulation) +					skb_set_inner_transport_header(skb, +						skb_checksum_start_offset(skb)); +				else +					skb_set_transport_header(skb, +						skb_checksum_start_offset(skb)); +				if (!(features & NETIF_F_ALL_CSUM) &&  				     skb_checksum_help(skb))  					goto out_kfree_skb;  			}  		} +		if (!list_empty(&ptype_all)) +			dev_queue_xmit_nit(skb, dev); + +		skb_len = skb->len; +		trace_net_dev_start_xmit(skb, dev);  		rc = ops->ndo_start_xmit(skb, dev); -		trace_net_dev_xmit(skb, rc); +		trace_net_dev_xmit(skb, rc, dev, skb_len);  		if (rc == NETDEV_TX_OK)  			txq_trans_update(txq);  		return rc; @@ -2080,15 +2670,13 @@ gso:  		skb->next = nskb->next;  		nskb->next = NULL; -		/* -		 * If device doesnt need nskb->dst, release it right now while -		 * its hot in this cpu cache -		 */ -		if (dev->priv_flags & IFF_XMIT_DST_RELEASE) -			skb_dst_drop(nskb); +		if (!list_empty(&ptype_all)) +			dev_queue_xmit_nit(nskb, dev); +		skb_len = nskb->len; +		trace_net_dev_start_xmit(nskb, dev);  		rc = ops->ndo_start_xmit(nskb, dev); -		trace_net_dev_xmit(nskb, rc); +		trace_net_dev_xmit(nskb, rc, dev, skb_len);  		if (unlikely(rc != NETDEV_TX_OK)) {  			if (rc & ~NETDEV_TX_MASK)  				goto out_kfree_gso_skb; @@ -2097,128 +2685,51 @@ gso:  			return rc;  		}  		txq_trans_update(txq); -		if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) +		if (unlikely(netif_xmit_stopped(txq) && skb->next))  			return NETDEV_TX_BUSY;  	} while (skb->next);  out_kfree_gso_skb: -	if (likely(skb->next == NULL)) +	if (likely(skb->next == NULL)) {  		skb->destructor = DEV_GSO_CB(skb)->destructor; +		consume_skb(skb); +		return rc; +	}  out_kfree_skb:  	kfree_skb(skb);  out:  	return rc;  } +EXPORT_SYMBOL_GPL(dev_hard_start_xmit); -static u32 hashrnd __read_mostly; - -u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) +static void qdisc_pkt_len_init(struct sk_buff *skb)  { -	u32 hash; - -	if (skb_rx_queue_recorded(skb)) { -		hash = skb_get_rx_queue(skb); -		while (unlikely(hash >= dev->real_num_tx_queues)) -			hash -= dev->real_num_tx_queues; -		return hash; -	} +	const struct skb_shared_info *shinfo = skb_shinfo(skb); -	if (skb->sk && skb->sk->sk_hash) -		hash = skb->sk->sk_hash; -	else -		hash = (__force u16) skb->protocol ^ skb->rxhash; -	hash = jhash_1word(hash, hashrnd); +	qdisc_skb_cb(skb)->pkt_len = skb->len; -	return (u16) (((u64) hash * 
dev->real_num_tx_queues) >> 32); -} -EXPORT_SYMBOL(skb_tx_hash); - -static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) -{ -	if (unlikely(queue_index >= dev->real_num_tx_queues)) { -		if (net_ratelimit()) { -			pr_warning("%s selects TX queue %d, but " -				"real number of TX queues is %d\n", -				dev->name, queue_index, dev->real_num_tx_queues); -		} -		return 0; -	} -	return queue_index; -} - -static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) -{ -#ifdef CONFIG_XPS -	struct xps_dev_maps *dev_maps; -	struct xps_map *map; -	int queue_index = -1; - -	rcu_read_lock(); -	dev_maps = rcu_dereference(dev->xps_maps); -	if (dev_maps) { -		map = rcu_dereference( -		    dev_maps->cpu_map[raw_smp_processor_id()]); -		if (map) { -			if (map->len == 1) -				queue_index = map->queues[0]; -			else { -				u32 hash; -				if (skb->sk && skb->sk->sk_hash) -					hash = skb->sk->sk_hash; -				else -					hash = (__force u16) skb->protocol ^ -					    skb->rxhash; -				hash = jhash_1word(hash, hashrnd); -				queue_index = map->queues[ -				    ((u64)hash * map->len) >> 32]; -			} -			if (unlikely(queue_index >= dev->real_num_tx_queues)) -				queue_index = -1; -		} -	} -	rcu_read_unlock(); - -	return queue_index; -#else -	return -1; -#endif -} - -static struct netdev_queue *dev_pick_tx(struct net_device *dev, -					struct sk_buff *skb) -{ -	int queue_index; -	const struct net_device_ops *ops = dev->netdev_ops; - -	if (dev->real_num_tx_queues == 1) -		queue_index = 0; -	else if (ops->ndo_select_queue) { -		queue_index = ops->ndo_select_queue(dev, skb); -		queue_index = dev_cap_txqueue(dev, queue_index); -	} else { -		struct sock *sk = skb->sk; -		queue_index = sk_tx_queue_get(sk); +	/* To get more precise estimation of bytes sent on wire, +	 * we add to pkt_len the headers size of all segments +	 */ +	if (shinfo->gso_size)  { +		unsigned int hdr_len; +		u16 gso_segs = shinfo->gso_segs; -		if (queue_index < 0 || skb->ooo_okay || -		    queue_index >= dev->real_num_tx_queues) { -			int old_index = queue_index; +		/* mac layer + network layer */ +		hdr_len = skb_transport_header(skb) - skb_mac_header(skb); -			queue_index = get_xps_queue(dev, skb); -			if (queue_index < 0) -				queue_index = skb_tx_hash(dev, skb); +		/* + transport layer */ +		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) +			hdr_len += tcp_hdrlen(skb); +		else +			hdr_len += sizeof(struct udphdr); -			if (queue_index != old_index && sk) { -				struct dst_entry *dst = -				    rcu_dereference_check(sk->sk_dst_cache, 1); +		if (shinfo->gso_type & SKB_GSO_DODGY) +			gso_segs = DIV_ROUND_UP(skb->len - hdr_len, +						shinfo->gso_size); -				if (dst && skb_dst(skb) == dst) -					sk_tx_queue_set(sk, queue_index); -			} -		} +		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;  	} - -	skb_set_queue_mapping(skb, queue_index); -	return netdev_get_tx_queue(dev, queue_index);  }  static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, @@ -2226,15 +2737,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,  				 struct netdev_queue *txq)  {  	spinlock_t *root_lock = qdisc_lock(q); -	bool contended = qdisc_is_running(q); +	bool contended;  	int rc; +	qdisc_pkt_len_init(skb); +	qdisc_calculate_pkt_len(skb, q);  	/*  	 * Heuristic to force contended enqueues to serialize on a  	 * separate lock before trying to get qdisc main lock.  	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often  	 * and dequeue packets faster.  	 
*/ +	contended = qdisc_is_running(q);  	if (unlikely(contended))  		spin_lock(&q->busylock); @@ -2251,7 +2765,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,  		 */  		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))  			skb_dst_force(skb); -		__qdisc_update_bstats(q, skb->len); + +		qdisc_bstats_update(q, skb); +  		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {  			if (unlikely(contended)) {  				spin_unlock(&q->busylock); @@ -2264,7 +2780,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,  		rc = NET_XMIT_SUCCESS;  	} else {  		skb_dst_force(skb); -		rc = qdisc_enqueue_root(skb, q); +		rc = q->enqueue(skb, q) & NET_XMIT_MASK;  		if (qdisc_run_begin(q)) {  			if (unlikely(contended)) {  				spin_unlock(&q->busylock); @@ -2279,12 +2795,46 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,  	return rc;  } +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) +static void skb_update_prio(struct sk_buff *skb) +{ +	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); + +	if (!skb->priority && skb->sk && map) { +		unsigned int prioidx = skb->sk->sk_cgrp_prioidx; + +		if (prioidx < map->priomap_len) +			skb->priority = map->priomap[prioidx]; +	} +} +#else +#define skb_update_prio(skb) +#endif +  static DEFINE_PER_CPU(int, xmit_recursion);  #define RECURSION_LIMIT 10  /** - *	dev_queue_xmit - transmit a buffer + *	dev_loopback_xmit - loop back @skb   *	@skb: buffer to transmit + */ +int dev_loopback_xmit(struct sk_buff *skb) +{ +	skb_reset_mac_header(skb); +	__skb_pull(skb, skb_network_offset(skb)); +	skb->pkt_type = PACKET_LOOPBACK; +	skb->ip_summed = CHECKSUM_UNNECESSARY; +	WARN_ON(!skb_dst(skb)); +	skb_dst_force(skb); +	netif_rx_ni(skb); +	return 0; +} +EXPORT_SYMBOL(dev_loopback_xmit); + +/** + *	__dev_queue_xmit - transmit a buffer + *	@skb: buffer to transmit + *	@accel_priv: private data used for L2 forwarding offload   *   *	Queue a buffer for transmission to a network device. The caller must   *	have set the device and priority and built the buffer before calling @@ -2307,19 +2857,23 @@ static DEFINE_PER_CPU(int, xmit_recursion);   *      the BH enable code must have IRQs enabled so that it will not deadlock.   *          --BLG   */ -int dev_queue_xmit(struct sk_buff *skb) +static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)  {  	struct net_device *dev = skb->dev;  	struct netdev_queue *txq;  	struct Qdisc *q;  	int rc = -ENOMEM; +	skb_reset_mac_header(skb); +  	/* Disable soft irqs for various locks below. Also  	 * stops preemption for RCU.  	 */  	rcu_read_lock_bh(); -	txq = dev_pick_tx(dev, skb); +	skb_update_prio(skb); + +	txq = netdev_pick_tx(dev, skb, accel_priv);  	q = rcu_dereference_bh(txq->qdisc);  #ifdef CONFIG_NET_CLS_ACT @@ -2353,7 +2907,7 @@ int dev_queue_xmit(struct sk_buff *skb)  			HARD_TX_LOCK(dev, txq, cpu); -			if (!netif_tx_queue_stopped(txq)) { +			if (!netif_xmit_stopped(txq)) {  				__this_cpu_inc(xmit_recursion);  				rc = dev_hard_start_xmit(skb, dev, txq);  				__this_cpu_dec(xmit_recursion); @@ -2363,37 +2917,49 @@ int dev_queue_xmit(struct sk_buff *skb)  				}  			}  			HARD_TX_UNLOCK(dev, txq); -			if (net_ratelimit()) -				printk(KERN_CRIT "Virtual device %s asks to " -				       "queue packet!\n", dev->name); +			net_crit_ratelimited("Virtual device %s asks to queue packet!\n", +					     dev->name);  		} else {  			/* Recursion is detected! 
It is possible,  			 * unfortunately  			 */  recursion_alert: -			if (net_ratelimit()) -				printk(KERN_CRIT "Dead loop on virtual device " -				       "%s, fix it urgently!\n", dev->name); +			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", +					     dev->name);  		}  	}  	rc = -ENETDOWN;  	rcu_read_unlock_bh(); +	atomic_long_inc(&dev->tx_dropped);  	kfree_skb(skb);  	return rc;  out:  	rcu_read_unlock_bh();  	return rc;  } + +int dev_queue_xmit(struct sk_buff *skb) +{ +	return __dev_queue_xmit(skb, NULL); +}  EXPORT_SYMBOL(dev_queue_xmit); +int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) +{ +	return __dev_queue_xmit(skb, accel_priv); +} +EXPORT_SYMBOL(dev_queue_xmit_accel); +  /*=======================================================================  			Receiver routines    =======================================================================*/  int netdev_max_backlog __read_mostly = 1000; +EXPORT_SYMBOL(netdev_max_backlog); +  int netdev_tstamp_prequeue __read_mostly = 1;  int netdev_budget __read_mostly = 300;  int weight_p __read_mostly = 64;            /* old backlog weight */ @@ -2406,82 +2972,58 @@ static inline void ____napi_schedule(struct softnet_data *sd,  	__raise_softirq_irqoff(NET_RX_SOFTIRQ);  } -/* - * __skb_get_rxhash: calculate a flow hash based on src/dst addresses - * and src/dst port numbers. Returns a non-zero hash number on success - * and 0 on failure. - */ -__u32 __skb_get_rxhash(struct sk_buff *skb) -{ -	int nhoff, hash = 0, poff; -	struct ipv6hdr *ip6; -	struct iphdr *ip; -	u8 ip_proto; -	u32 addr1, addr2, ihl; -	union { -		u32 v32; -		u16 v16[2]; -	} ports; +#ifdef CONFIG_RPS -	nhoff = skb_network_offset(skb); +/* One global table that all flow-based protocols share. */ +struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; +EXPORT_SYMBOL(rps_sock_flow_table); -	switch (skb->protocol) { -	case __constant_htons(ETH_P_IP): -		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) -			goto done; +struct static_key rps_needed __read_mostly; -		ip = (struct iphdr *) (skb->data + nhoff); -		if (ip->frag_off & htons(IP_MF | IP_OFFSET)) -			ip_proto = 0; -		else -			ip_proto = ip->protocol; -		addr1 = (__force u32) ip->saddr; -		addr2 = (__force u32) ip->daddr; -		ihl = ip->ihl; -		break; -	case __constant_htons(ETH_P_IPV6): -		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) -			goto done; +static struct rps_dev_flow * +set_rps_cpu(struct net_device *dev, struct sk_buff *skb, +	    struct rps_dev_flow *rflow, u16 next_cpu) +{ +	if (next_cpu != RPS_NO_CPU) { +#ifdef CONFIG_RFS_ACCEL +		struct netdev_rx_queue *rxqueue; +		struct rps_dev_flow_table *flow_table; +		struct rps_dev_flow *old_rflow; +		u32 flow_id; +		u16 rxq_index; +		int rc; -		ip6 = (struct ipv6hdr *) (skb->data + nhoff); -		ip_proto = ip6->nexthdr; -		addr1 = (__force u32) ip6->saddr.s6_addr32[3]; -		addr2 = (__force u32) ip6->daddr.s6_addr32[3]; -		ihl = (40 >> 2); -		break; -	default: -		goto done; -	} +		/* Should we steer this flow to a different hardware queue? 
*/ +		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || +		    !(dev->features & NETIF_F_NTUPLE)) +			goto out; +		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); +		if (rxq_index == skb_get_rx_queue(skb)) +			goto out; -	ports.v32 = 0; -	poff = proto_ports_offset(ip_proto); -	if (poff >= 0) { -		nhoff += ihl * 4 + poff; -		if (pskb_may_pull(skb, nhoff + 4)) { -			ports.v32 = * (__force u32 *) (skb->data + nhoff); -			if (ports.v16[1] < ports.v16[0]) -				swap(ports.v16[0], ports.v16[1]); -		} +		rxqueue = dev->_rx + rxq_index; +		flow_table = rcu_dereference(rxqueue->rps_flow_table); +		if (!flow_table) +			goto out; +		flow_id = skb_get_hash(skb) & flow_table->mask; +		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, +							rxq_index, flow_id); +		if (rc < 0) +			goto out; +		old_rflow = rflow; +		rflow = &flow_table->flows[flow_id]; +		rflow->filter = rc; +		if (old_rflow->filter == rflow->filter) +			old_rflow->filter = RPS_NO_FILTER; +	out: +#endif +		rflow->last_qtail = +			per_cpu(softnet_data, next_cpu).input_queue_head;  	} -	/* get a consistent hash (same value on both flow directions) */ -	if (addr2 < addr1) -		swap(addr1, addr2); - -	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd); -	if (!hash) -		hash = 1; - -done: -	return hash; +	rflow->cpu = next_cpu; +	return rflow;  } -EXPORT_SYMBOL(__skb_get_rxhash); - -#ifdef CONFIG_RPS - -/* One global table that all flow-based protocols share. */ -struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; -EXPORT_SYMBOL(rps_sock_flow_table);  /*   * get_rps_cpu is called from netif_receive_skb and returns the target @@ -2497,6 +3039,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,  	struct rps_sock_flow_table *sock_flow_table;  	int cpu = -1;  	u16 tcpu; +	u32 hash;  	if (skb_rx_queue_recorded(skb)) {  		u16 index = skb_get_rx_queue(skb); @@ -2513,18 +3056,20 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,  	map = rcu_dereference(rxqueue->rps_map);  	if (map) { -		if (map->len == 1) { +		if (map->len == 1 && +		    !rcu_access_pointer(rxqueue->rps_flow_table)) {  			tcpu = map->cpus[0];  			if (cpu_online(tcpu))  				cpu = tcpu;  			goto done;  		} -	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) { +	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {  		goto done;  	}  	skb_reset_network_header(skb); -	if (!skb_get_rxhash(skb)) +	hash = skb_get_hash(skb); +	if (!hash)  		goto done;  	flow_table = rcu_dereference(rxqueue->rps_flow_table); @@ -2533,11 +3078,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,  		u16 next_cpu;  		struct rps_dev_flow *rflow; -		rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; +		rflow = &flow_table->flows[hash & flow_table->mask];  		tcpu = rflow->cpu; -		next_cpu = sock_flow_table->ents[skb->rxhash & -		    sock_flow_table->mask]; +		next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];  		/*  		 * If the desired CPU (where last recvmsg was done) is @@ -2554,11 +3098,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,  		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||  		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -  		      rflow->last_qtail)) >= 0)) { -			tcpu = rflow->cpu = next_cpu; -			if (tcpu != RPS_NO_CPU) -				rflow->last_qtail = per_cpu(softnet_data, -				    tcpu).input_queue_head; +			tcpu = next_cpu; +			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);  		} +  		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {  			
*rflowp = rflow;  			cpu = tcpu; @@ -2567,7 +3110,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,  	}  	if (map) { -		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; +		tcpu = map->cpus[((u64) hash * map->len) >> 32];  		if (cpu_online(tcpu)) {  			cpu = tcpu; @@ -2579,6 +3122,46 @@ done:  	return cpu;  } +#ifdef CONFIG_RFS_ACCEL + +/** + * rps_may_expire_flow - check whether an RFS hardware filter may be removed + * @dev: Device on which the filter was set + * @rxq_index: RX queue index + * @flow_id: Flow ID passed to ndo_rx_flow_steer() + * @filter_id: Filter ID returned by ndo_rx_flow_steer() + * + * Drivers that implement ndo_rx_flow_steer() should periodically call + * this function for each installed filter and remove the filters for + * which it returns %true. + */ +bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, +			 u32 flow_id, u16 filter_id) +{ +	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; +	struct rps_dev_flow_table *flow_table; +	struct rps_dev_flow *rflow; +	bool expire = true; +	int cpu; + +	rcu_read_lock(); +	flow_table = rcu_dereference(rxqueue->rps_flow_table); +	if (flow_table && flow_id <= flow_table->mask) { +		rflow = &flow_table->flows[flow_id]; +		cpu = ACCESS_ONCE(rflow->cpu); +		if (rflow->filter == filter_id && cpu != RPS_NO_CPU && +		    ((int)(per_cpu(softnet_data, cpu).input_queue_head - +			   rflow->last_qtail) < +		     (int)(10 * flow_table->mask))) +			expire = false; +	} +	rcu_read_unlock(); +	return expire; +} +EXPORT_SYMBOL(rps_may_expire_flow); + +#endif /* CONFIG_RFS_ACCEL */ +  /* Called from hardirq (IPI) context */  static void rps_trigger_softirq(void *data)  { @@ -2611,6 +3194,46 @@ static int rps_ipi_queued(struct softnet_data *sd)  	return 0;  } +#ifdef CONFIG_NET_FLOW_LIMIT +int netdev_flow_limit_table_len __read_mostly = (1 << 12); +#endif + +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) +{ +#ifdef CONFIG_NET_FLOW_LIMIT +	struct sd_flow_limit *fl; +	struct softnet_data *sd; +	unsigned int old_flow, new_flow; + +	if (qlen < (netdev_max_backlog >> 1)) +		return false; + +	sd = &__get_cpu_var(softnet_data); + +	rcu_read_lock(); +	fl = rcu_dereference(sd->flow_limit); +	if (fl) { +		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); +		old_flow = fl->history[fl->history_head]; +		fl->history[fl->history_head] = new_flow; + +		fl->history_head++; +		fl->history_head &= FLOW_LIMIT_HISTORY - 1; + +		if (likely(fl->buckets[old_flow])) +			fl->buckets[old_flow]--; + +		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { +			fl->count++; +			rcu_read_unlock(); +			return true; +		} +	} +	rcu_read_unlock(); +#endif +	return false; +} +  /*   * enqueue_to_backlog is called to queue an skb to a per CPU backlog   * queue (may be a remote CPU queue). 
@@ -2620,13 +3243,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,  {  	struct softnet_data *sd;  	unsigned long flags; +	unsigned int qlen;  	sd = &per_cpu(softnet_data, cpu);  	local_irq_save(flags);  	rps_lock(sd); -	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { +	qlen = skb_queue_len(&sd->input_pkt_queue); +	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {  		if (skb_queue_len(&sd->input_pkt_queue)) {  enqueue:  			__skb_queue_tail(&sd->input_pkt_queue, skb); @@ -2656,35 +3281,15 @@ enqueue:  	return NET_RX_DROP;  } -/** - *	netif_rx	-	post buffer to the network code - *	@skb: buffer to post - * - *	This function receives a packet from a device driver and queues it for - *	the upper (protocol) levels to process.  It always succeeds. The buffer - *	may be dropped during processing for congestion control or by the - *	protocol layers. - * - *	return values: - *	NET_RX_SUCCESS	(no congestion) - *	NET_RX_DROP     (packet was dropped) - * - */ - -int netif_rx(struct sk_buff *skb) +static int netif_rx_internal(struct sk_buff *skb)  {  	int ret; -	/* if netpoll wants it, pretend we never saw it */ -	if (netpoll_rx(skb)) -		return NET_RX_DROP; - -	if (netdev_tstamp_prequeue) -		net_timestamp_check(skb); +	net_timestamp_check(netdev_tstamp_prequeue, skb);  	trace_netif_rx(skb);  #ifdef CONFIG_RPS -	{ +	if (static_key_false(&rps_needed)) {  		struct rps_dev_flow voidflow, *rflow = &voidflow;  		int cpu; @@ -2699,24 +3304,47 @@ int netif_rx(struct sk_buff *skb)  		rcu_read_unlock();  		preempt_enable(); -	} -#else +	} else +#endif  	{  		unsigned int qtail;  		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);  		put_cpu();  	} -#endif  	return ret;  } + +/** + *	netif_rx	-	post buffer to the network code + *	@skb: buffer to post + * + *	This function receives a packet from a device driver and queues it for + *	the upper (protocol) levels to process.  It always succeeds. The buffer + *	may be dropped during processing for congestion control or by the + *	protocol layers. 
+ * + *	return values: + *	NET_RX_SUCCESS	(no congestion) + *	NET_RX_DROP     (packet was dropped) + * + */ + +int netif_rx(struct sk_buff *skb) +{ +	trace_netif_rx_entry(skb); + +	return netif_rx_internal(skb); +}  EXPORT_SYMBOL(netif_rx);  int netif_rx_ni(struct sk_buff *skb)  {  	int err; +	trace_netif_rx_ni_entry(skb); +  	preempt_disable(); -	err = netif_rx(skb); +	err = netif_rx_internal(skb);  	if (local_softirq_pending())  		do_softirq();  	preempt_enable(); @@ -2742,7 +3370,10 @@ static void net_tx_action(struct softirq_action *h)  			clist = clist->next;  			WARN_ON(atomic_read(&skb->users)); -			trace_kfree_skb(skb, net_tx_action); +			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) +				trace_consume_skb(skb); +			else +				trace_kfree_skb(skb, net_tx_action);  			__kfree_skb(skb);  		}  	} @@ -2764,7 +3395,7 @@ static void net_tx_action(struct softirq_action *h)  			root_lock = qdisc_lock(q);  			if (spin_trylock(root_lock)) { -				smp_mb__before_clear_bit(); +				smp_mb__before_atomic();  				clear_bit(__QDISC_STATE_SCHED,  					  &q->state);  				qdisc_run(q); @@ -2774,7 +3405,7 @@ static void net_tx_action(struct softirq_action *h)  					      &q->state)) {  					__netif_reschedule(q);  				} else { -					smp_mb__before_clear_bit(); +					smp_mb__before_atomic();  					clear_bit(__QDISC_STATE_SCHED,  						  &q->state);  				} @@ -2783,14 +3414,6 @@ static void net_tx_action(struct softirq_action *h)  	}  } -static inline int deliver_skb(struct sk_buff *skb, -			      struct packet_type *pt_prev, -			      struct net_device *orig_dev) -{ -	atomic_inc(&skb->users); -	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); -} -  #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \      (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))  /* This hook is defined here for ATM LANE */ @@ -2804,8 +3427,8 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);   * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions   * a compare and 2 stores extra right now if we dont have it on   * but have CONFIG_NET_CLS_ACT - * NOTE: This doesnt stop any functionality; if you dont have - * the ingress scheduler, you just cant add policies on ingress. + * NOTE: This doesn't stop any functionality; if you dont have + * the ingress scheduler, you just can't add policies on ingress.   *   */  static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) @@ -2816,9 +3439,8 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)  	struct Qdisc *q;  	if (unlikely(MAX_RED_LOOP < ttl++)) { -		if (net_ratelimit()) -			pr_warning( "Redir loop detected Dropping packet (%d->%d)\n", -			       skb->skb_iif, dev->ifindex); +		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n", +				     skb->skb_iif, dev->ifindex);  		return TC_ACT_SHOT;  	} @@ -2869,11 +3491,13 @@ out:   *	@rx_handler: receive handler to register   *	@rx_handler_data: data pointer that is used by rx handler   * - *	Register a receive hander for a device. This handler will then be + *	Register a receive handler for a device. This handler will then be   *	called from __netif_receive_skb. A negative errno code is returned   *	on a failure.   *   *	The caller must hold the rtnl_mutex. + * + *	For a general description of rx_handler, see enum rx_handler_result.   
*/  int netdev_rx_handler_register(struct net_device *dev,  			       rx_handler_func_t *rx_handler, @@ -2884,6 +3508,7 @@ int netdev_rx_handler_register(struct net_device *dev,  	if (dev->rx_handler)  		return -EBUSY; +	/* Note: rx_handler_data must be set before rx_handler */  	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);  	rcu_assign_pointer(dev->rx_handler, rx_handler); @@ -2895,7 +3520,7 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_register);   *	netdev_rx_handler_unregister - unregister receive handler   *	@dev: device to unregister a handler from   * - *	Unregister a receive hander from a device. + *	Unregister a receive handler from a device.   *   *	The caller must hold the rtnl_mutex.   */ @@ -2903,113 +3528,71 @@ void netdev_rx_handler_unregister(struct net_device *dev)  {  	ASSERT_RTNL(); -	rcu_assign_pointer(dev->rx_handler, NULL); -	rcu_assign_pointer(dev->rx_handler_data, NULL); +	RCU_INIT_POINTER(dev->rx_handler, NULL); +	/* a reader seeing a non NULL rx_handler in a rcu_read_lock() +	 * section has a guarantee to see a non NULL rx_handler_data +	 * as well. +	 */ +	synchronize_net(); +	RCU_INIT_POINTER(dev->rx_handler_data, NULL);  }  EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); -static inline void skb_bond_set_mac_by_master(struct sk_buff *skb, -					      struct net_device *master) -{ -	if (skb->pkt_type == PACKET_HOST) { -		u16 *dest = (u16 *) eth_hdr(skb)->h_dest; - -		memcpy(dest, master->dev_addr, ETH_ALEN); -	} -} - -/* On bonding slaves other than the currently active slave, suppress - * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and - * ARP on active-backup slaves with arp_validate enabled. +/* + * Limit the use of PFMEMALLOC reserves to those protocols that implement + * the special handling of PFMEMALLOC skbs.   */ -int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master) +static bool skb_pfmemalloc_protocol(struct sk_buff *skb)  { -	struct net_device *dev = skb->dev; - -	if (master->priv_flags & IFF_MASTER_ARPMON) -		dev->last_rx = jiffies; - -	if ((master->priv_flags & IFF_MASTER_ALB) && -	    (master->priv_flags & IFF_BRIDGE_PORT)) { -		/* Do address unmangle. The local destination address -		 * will be always the one master has. Provides the right -		 * functionality in a bridge. 
-		 */ -		skb_bond_set_mac_by_master(skb, master); -	} - -	if (dev->priv_flags & IFF_SLAVE_INACTIVE) { -		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && -		    skb->protocol == __cpu_to_be16(ETH_P_ARP)) -			return 0; - -		if (master->priv_flags & IFF_MASTER_ALB) { -			if (skb->pkt_type != PACKET_BROADCAST && -			    skb->pkt_type != PACKET_MULTICAST) -				return 0; -		} -		if (master->priv_flags & IFF_MASTER_8023AD && -		    skb->protocol == __cpu_to_be16(ETH_P_SLOW)) -			return 0; - -		return 1; +	switch (skb->protocol) { +	case htons(ETH_P_ARP): +	case htons(ETH_P_IP): +	case htons(ETH_P_IPV6): +	case htons(ETH_P_8021Q): +	case htons(ETH_P_8021AD): +		return true; +	default: +		return false;  	} -	return 0;  } -EXPORT_SYMBOL(__skb_bond_should_drop); -static int __netif_receive_skb(struct sk_buff *skb) +static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)  {  	struct packet_type *ptype, *pt_prev;  	rx_handler_func_t *rx_handler;  	struct net_device *orig_dev; -	struct net_device *master; -	struct net_device *null_or_orig; -	struct net_device *orig_or_bond; +	struct net_device *null_or_dev; +	bool deliver_exact = false;  	int ret = NET_RX_DROP;  	__be16 type; -	if (!netdev_tstamp_prequeue) -		net_timestamp_check(skb); +	net_timestamp_check(!netdev_tstamp_prequeue, skb);  	trace_netif_receive_skb(skb); -	/* if we've gotten here through NAPI, check netpoll */ -	if (netpoll_receive_skb(skb)) -		return NET_RX_DROP; - -	if (!skb->skb_iif) -		skb->skb_iif = skb->dev->ifindex; - -	/* -	 * bonding note: skbs received on inactive slaves should only -	 * be delivered to pkt handlers that are exact matches.  Also -	 * the deliver_no_wcard flag will be set.  If packet handlers -	 * are sensitive to duplicate packets these skbs will need to -	 * be dropped at the handler. 
-	 */ -	null_or_orig = NULL;  	orig_dev = skb->dev; -	master = ACCESS_ONCE(orig_dev->master); -	if (skb->deliver_no_wcard) -		null_or_orig = orig_dev; -	else if (master) { -		if (skb_bond_should_drop(skb, master)) { -			skb->deliver_no_wcard = 1; -			null_or_orig = orig_dev; /* deliver only exact match */ -		} else -			skb->dev = master; -	} -	__this_cpu_inc(softnet_data.processed);  	skb_reset_network_header(skb); -	skb_reset_transport_header(skb); -	skb->mac_len = skb->network_header - skb->mac_header; +	if (!skb_transport_header_was_set(skb)) +		skb_reset_transport_header(skb); +	skb_reset_mac_len(skb);  	pt_prev = NULL;  	rcu_read_lock(); +another_round: +	skb->skb_iif = skb->dev->ifindex; + +	__this_cpu_inc(softnet_data.processed); + +	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || +	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) { +		skb = vlan_untag(skb); +		if (unlikely(!skb)) +			goto unlock; +	} +  #ifdef CONFIG_NET_CLS_ACT  	if (skb->tc_verd & TC_NCLS) {  		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); @@ -3017,64 +3600,79 @@ static int __netif_receive_skb(struct sk_buff *skb)  	}  #endif +	if (pfmemalloc) +		goto skip_taps; +  	list_for_each_entry_rcu(ptype, &ptype_all, list) { -		if (ptype->dev == null_or_orig || ptype->dev == skb->dev || -		    ptype->dev == orig_dev) { +		if (!ptype->dev || ptype->dev == skb->dev) {  			if (pt_prev)  				ret = deliver_skb(skb, pt_prev, orig_dev);  			pt_prev = ptype;  		}  	} +skip_taps:  #ifdef CONFIG_NET_CLS_ACT  	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);  	if (!skb) -		goto out; +		goto unlock;  ncls:  #endif -	/* Handle special case of bridge or macvlan */ -	rx_handler = rcu_dereference(skb->dev->rx_handler); -	if (rx_handler) { +	if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) +		goto drop; + +	if (vlan_tx_tag_present(skb)) {  		if (pt_prev) {  			ret = deliver_skb(skb, pt_prev, orig_dev);  			pt_prev = NULL;  		} -		skb = rx_handler(skb); -		if (!skb) -			goto out; +		if (vlan_do_receive(&skb)) +			goto another_round; +		else if (unlikely(!skb)) +			goto unlock;  	} -	if (vlan_tx_tag_present(skb)) { +	rx_handler = rcu_dereference(skb->dev->rx_handler); +	if (rx_handler) {  		if (pt_prev) {  			ret = deliver_skb(skb, pt_prev, orig_dev);  			pt_prev = NULL;  		} -		if (vlan_hwaccel_do_receive(&skb)) { -			ret = __netif_receive_skb(skb); -			goto out; -		} else if (unlikely(!skb)) -			goto out; +		switch (rx_handler(&skb)) { +		case RX_HANDLER_CONSUMED: +			ret = NET_RX_SUCCESS; +			goto unlock; +		case RX_HANDLER_ANOTHER: +			goto another_round; +		case RX_HANDLER_EXACT: +			deliver_exact = true; +		case RX_HANDLER_PASS: +			break; +		default: +			BUG(); +		}  	} -	/* -	 * Make sure frames received on VLAN interfaces stacked on -	 * bonding interfaces still make their way to any base bonding -	 * device that may have registered for a specific ptype.  The -	 * handler may have to adjust skb->dev and orig_dev. -	 */ -	orig_or_bond = orig_dev; -	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) && -	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) { -		orig_or_bond = vlan_dev_real_dev(skb->dev); +	if (unlikely(vlan_tx_tag_present(skb))) { +		if (vlan_tx_tag_get_id(skb)) +			skb->pkt_type = PACKET_OTHERHOST; +		/* Note: we might in the future use prio bits +		 * and set skb->priority like in vlan_do_receive() +		 * For the time being, just ignore Priority Code Point +		 */ +		skb->vlan_tci = 0;  	} +	/* deliver only exact match when indicated */ +	null_or_dev = deliver_exact ? 
skb->dev : NULL; +  	type = skb->protocol;  	list_for_each_entry_rcu(ptype,  			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { -		if (ptype->type == type && (ptype->dev == null_or_orig || -		     ptype->dev == skb->dev || ptype->dev == orig_dev || -		     ptype->dev == orig_or_bond)) { +		if (ptype->type == type && +		    (ptype->dev == null_or_dev || ptype->dev == skb->dev || +		     ptype->dev == orig_dev)) {  			if (pt_prev)  				ret = deliver_skb(skb, pt_prev, orig_dev);  			pt_prev = ptype; @@ -3082,8 +3680,12 @@ ncls:  	}  	if (pt_prev) { -		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); +		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) +			goto drop; +		else +			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);  	} else { +drop:  		atomic_long_inc(&skb->dev->rx_dropped);  		kfree_skb(skb);  		/* Jamal, now you will not able to escape explaining @@ -3092,36 +3694,45 @@ ncls:  		ret = NET_RX_DROP;  	} -out: +unlock:  	rcu_read_unlock();  	return ret;  } -/** - *	netif_receive_skb - process receive buffer from network - *	@skb: buffer to process - * - *	netif_receive_skb() is the main receive data processing function. - *	It always succeeds. The buffer may be dropped during processing - *	for congestion control or by the protocol layers. - * - *	This function may only be called from softirq context and interrupts - *	should be enabled. - * - *	Return values (usually ignored): - *	NET_RX_SUCCESS: no congestion - *	NET_RX_DROP: packet was dropped - */ -int netif_receive_skb(struct sk_buff *skb) +static int __netif_receive_skb(struct sk_buff *skb)  { -	if (netdev_tstamp_prequeue) -		net_timestamp_check(skb); +	int ret; + +	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { +		unsigned long pflags = current->flags; + +		/* +		 * PFMEMALLOC skbs are special, they should +		 * - be delivered to SOCK_MEMALLOC sockets only +		 * - stay away from userspace +		 * - have bounded memory usage +		 * +		 * Use PF_MEMALLOC as this saves us from propagating the allocation +		 * context down to all allocation sites. +		 */ +		current->flags |= PF_MEMALLOC; +		ret = __netif_receive_skb_core(skb, true); +		tsk_restore_flags(current, pflags, PF_MEMALLOC); +	} else +		ret = __netif_receive_skb_core(skb, false); + +	return ret; +} + +static int netif_receive_skb_internal(struct sk_buff *skb) +{ +	net_timestamp_check(netdev_tstamp_prequeue, skb);  	if (skb_defer_rx_timestamp(skb))  		return NET_RX_SUCCESS;  #ifdef CONFIG_RPS -	{ +	if (static_key_false(&rps_needed)) {  		struct rps_dev_flow voidflow, *rflow = &voidflow;  		int cpu, ret; @@ -3132,16 +3743,34 @@ int netif_receive_skb(struct sk_buff *skb)  		if (cpu >= 0) {  			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);  			rcu_read_unlock(); -		} else { -			rcu_read_unlock(); -			ret = __netif_receive_skb(skb); +			return ret;  		} - -		return ret; +		rcu_read_unlock();  	} -#else -	return __netif_receive_skb(skb);  #endif +	return __netif_receive_skb(skb); +} + +/** + *	netif_receive_skb - process receive buffer from network + *	@skb: buffer to process + * + *	netif_receive_skb() is the main receive data processing function. + *	It always succeeds. The buffer may be dropped during processing + *	for congestion control or by the protocol layers. + * + *	This function may only be called from softirq context and interrupts + *	should be enabled. 
+ * + *	Return values (usually ignored): + *	NET_RX_SUCCESS: no congestion + *	NET_RX_DROP: packet was dropped + */ +int netif_receive_skb(struct sk_buff *skb) +{ +	trace_netif_receive_skb_entry(skb); + +	return netif_receive_skb_internal(skb);  }  EXPORT_SYMBOL(netif_receive_skb); @@ -3175,11 +3804,13 @@ static void flush_backlog(void *arg)  static int napi_gro_complete(struct sk_buff *skb)  { -	struct packet_type *ptype; +	struct packet_offload *ptype;  	__be16 type = skb->protocol; -	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; +	struct list_head *head = &offload_base;  	int err = -ENOENT; +	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); +  	if (NAPI_GRO_CB(skb)->count == 1) {  		skb_shinfo(skb)->gso_size = 0;  		goto out; @@ -3187,10 +3818,10 @@ static int napi_gro_complete(struct sk_buff *skb)  	rcu_read_lock();  	list_for_each_entry_rcu(ptype, head, list) { -		if (ptype->type != type || ptype->dev || !ptype->gro_complete) +		if (ptype->type != type || !ptype->callbacks.gro_complete)  			continue; -		err = ptype->gro_complete(skb); +		err = ptype->callbacks.gro_complete(skb, 0);  		break;  	}  	rcu_read_unlock(); @@ -3202,53 +3833,137 @@ static int napi_gro_complete(struct sk_buff *skb)  	}  out: -	return netif_receive_skb(skb); +	return netif_receive_skb_internal(skb);  } -inline void napi_gro_flush(struct napi_struct *napi) +/* napi->gro_list contains packets ordered by age. + * youngest packets at the head of it. + * Complete skbs in reverse order to reduce latencies. + */ +void napi_gro_flush(struct napi_struct *napi, bool flush_old)  { -	struct sk_buff *skb, *next; +	struct sk_buff *skb, *prev = NULL; -	for (skb = napi->gro_list; skb; skb = next) { -		next = skb->next; +	/* scan list and build reverse chain */ +	for (skb = napi->gro_list; skb != NULL; skb = skb->next) { +		skb->prev = prev; +		prev = skb; +	} + +	for (skb = prev; skb; skb = prev) {  		skb->next = NULL; + +		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) +			return; + +		prev = skb->prev;  		napi_gro_complete(skb); +		napi->gro_count--;  	} -	napi->gro_count = 0;  	napi->gro_list = NULL;  }  EXPORT_SYMBOL(napi_gro_flush); -enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) +{ +	struct sk_buff *p; +	unsigned int maclen = skb->dev->hard_header_len; +	u32 hash = skb_get_hash_raw(skb); + +	for (p = napi->gro_list; p; p = p->next) { +		unsigned long diffs; + +		NAPI_GRO_CB(p)->flush = 0; + +		if (hash != skb_get_hash_raw(p)) { +			NAPI_GRO_CB(p)->same_flow = 0; +			continue; +		} + +		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; +		diffs |= p->vlan_tci ^ skb->vlan_tci; +		if (maclen == ETH_HLEN) +			diffs |= compare_ether_header(skb_mac_header(p), +						      skb_mac_header(skb)); +		else if (!diffs) +			diffs = memcmp(skb_mac_header(p), +				       skb_mac_header(skb), +				       maclen); +		NAPI_GRO_CB(p)->same_flow = !diffs; +	} +} + +static void skb_gro_reset_offset(struct sk_buff *skb) +{ +	const struct skb_shared_info *pinfo = skb_shinfo(skb); +	const skb_frag_t *frag0 = &pinfo->frags[0]; + +	NAPI_GRO_CB(skb)->data_offset = 0; +	NAPI_GRO_CB(skb)->frag0 = NULL; +	NAPI_GRO_CB(skb)->frag0_len = 0; + +	if (skb_mac_header(skb) == skb_tail_pointer(skb) && +	    pinfo->nr_frags && +	    !PageHighMem(skb_frag_page(frag0))) { +		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); +		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); +	} +} + +static void 
gro_pull_from_frag0(struct sk_buff *skb, int grow) +{ +	struct skb_shared_info *pinfo = skb_shinfo(skb); + +	BUG_ON(skb->end - skb->tail < grow); + +	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + +	skb->data_len -= grow; +	skb->tail += grow; + +	pinfo->frags[0].page_offset += grow; +	skb_frag_size_sub(&pinfo->frags[0], grow); + +	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { +		skb_frag_unref(skb, 0); +		memmove(pinfo->frags, pinfo->frags + 1, +			--pinfo->nr_frags * sizeof(pinfo->frags[0])); +	} +} + +static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)  {  	struct sk_buff **pp = NULL; -	struct packet_type *ptype; +	struct packet_offload *ptype;  	__be16 type = skb->protocol; -	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; +	struct list_head *head = &offload_base;  	int same_flow; -	int mac_len;  	enum gro_result ret; +	int grow; -	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) +	if (!(skb->dev->features & NETIF_F_GRO))  		goto normal;  	if (skb_is_gso(skb) || skb_has_frag_list(skb))  		goto normal; +	gro_list_prepare(napi, skb); +	NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */ +  	rcu_read_lock();  	list_for_each_entry_rcu(ptype, head, list) { -		if (ptype->type != type || ptype->dev || !ptype->gro_receive) +		if (ptype->type != type || !ptype->callbacks.gro_receive)  			continue;  		skb_set_network_header(skb, skb_gro_offset(skb)); -		mac_len = skb->network_header - skb->mac_header; -		skb->mac_len = mac_len; +		skb_reset_mac_len(skb);  		NAPI_GRO_CB(skb)->same_flow = 0;  		NAPI_GRO_CB(skb)->flush = 0;  		NAPI_GRO_CB(skb)->free = 0; +		NAPI_GRO_CB(skb)->udp_mark = 0; -		pp = ptype->gro_receive(&napi->gro_list, skb); +		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);  		break;  	}  	rcu_read_unlock(); @@ -3271,38 +3986,35 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)  	if (same_flow)  		goto ok; -	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) +	if (NAPI_GRO_CB(skb)->flush)  		goto normal; -	napi->gro_count++; +	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { +		struct sk_buff *nskb = napi->gro_list; + +		/* locate the end of the list to select the 'oldest' flow */ +		while (nskb->next) { +			pp = &nskb->next; +			nskb = *pp; +		} +		*pp = NULL; +		nskb->next = NULL; +		napi_gro_complete(nskb); +	} else { +		napi->gro_count++; +	}  	NAPI_GRO_CB(skb)->count = 1; +	NAPI_GRO_CB(skb)->age = jiffies; +	NAPI_GRO_CB(skb)->last = skb;  	skb_shinfo(skb)->gso_size = skb_gro_len(skb);  	skb->next = napi->gro_list;  	napi->gro_list = skb;  	ret = GRO_HELD;  pull: -	if (skb_headlen(skb) < skb_gro_offset(skb)) { -		int grow = skb_gro_offset(skb) - skb_headlen(skb); - -		BUG_ON(skb->end - skb->tail < grow); - -		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); - -		skb->tail += grow; -		skb->data_len -= grow; - -		skb_shinfo(skb)->frags[0].page_offset += grow; -		skb_shinfo(skb)->frags[0].size -= grow; - -		if (unlikely(!skb_shinfo(skb)->frags[0].size)) { -			put_page(skb_shinfo(skb)->frags[0].page); -			memmove(skb_shinfo(skb)->frags, -				skb_shinfo(skb)->frags + 1, -				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); -		} -	} - +	grow = skb_gro_offset(skb) - skb_headlen(skb); +	if (grow > 0) +		gro_pull_from_frag0(skb, grow);  ok:  	return ret; @@ -3310,40 +4022,54 @@ normal:  	ret = GRO_NORMAL;  	goto pull;  } -EXPORT_SYMBOL(dev_gro_receive); -static inline gro_result_t -__napi_gro_receive(struct 
napi_struct *napi, struct sk_buff *skb) +struct packet_offload *gro_find_receive_by_type(__be16 type)  { -	struct sk_buff *p; - -	for (p = napi->gro_list; p; p = p->next) { -		unsigned long diffs; +	struct list_head *offload_head = &offload_base; +	struct packet_offload *ptype; -		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; -		diffs |= p->vlan_tci ^ skb->vlan_tci; -		diffs |= compare_ether_header(skb_mac_header(p), -					      skb_gro_mac_header(skb)); -		NAPI_GRO_CB(p)->same_flow = !diffs; -		NAPI_GRO_CB(p)->flush = 0; +	list_for_each_entry_rcu(ptype, offload_head, list) { +		if (ptype->type != type || !ptype->callbacks.gro_receive) +			continue; +		return ptype;  	} +	return NULL; +} +EXPORT_SYMBOL(gro_find_receive_by_type); -	return dev_gro_receive(napi, skb); +struct packet_offload *gro_find_complete_by_type(__be16 type) +{ +	struct list_head *offload_head = &offload_base; +	struct packet_offload *ptype; + +	list_for_each_entry_rcu(ptype, offload_head, list) { +		if (ptype->type != type || !ptype->callbacks.gro_complete) +			continue; +		return ptype; +	} +	return NULL;  } +EXPORT_SYMBOL(gro_find_complete_by_type); -gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) +static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)  {  	switch (ret) {  	case GRO_NORMAL: -		if (netif_receive_skb(skb)) +		if (netif_receive_skb_internal(skb))  			ret = GRO_DROP;  		break;  	case GRO_DROP: -	case GRO_MERGED_FREE:  		kfree_skb(skb);  		break; +	case GRO_MERGED_FREE: +		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) +			kmem_cache_free(skbuff_head_cache, skb); +		else +			__kfree_skb(skb); +		break; +  	case GRO_HELD:  	case GRO_MERGED:  		break; @@ -3351,37 +4077,28 @@ gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)  	return ret;  } -EXPORT_SYMBOL(napi_skb_finish); - -void skb_gro_reset_offset(struct sk_buff *skb) -{ -	NAPI_GRO_CB(skb)->data_offset = 0; -	NAPI_GRO_CB(skb)->frag0 = NULL; -	NAPI_GRO_CB(skb)->frag0_len = 0; - -	if (skb->mac_header == skb->tail && -	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) { -		NAPI_GRO_CB(skb)->frag0 = -			page_address(skb_shinfo(skb)->frags[0].page) + -			skb_shinfo(skb)->frags[0].page_offset; -		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size; -	} -} -EXPORT_SYMBOL(skb_gro_reset_offset);  gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)  { +	trace_napi_gro_receive_entry(skb); +  	skb_gro_reset_offset(skb); -	return napi_skb_finish(__napi_gro_receive(napi, skb), skb); +	return napi_skb_finish(dev_gro_receive(napi, skb), skb);  }  EXPORT_SYMBOL(napi_gro_receive);  static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)  {  	__skb_pull(skb, skb_headlen(skb)); -	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); +	/* restore the reserve we had after netdev_alloc_skb_ip_align() */ +	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));  	skb->vlan_tci = 0; +	skb->dev = napi->dev; +	skb->skb_iif = 0; +	skb->encapsulation = 0; +	skb_shinfo(skb)->gso_type = 0; +	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));  	napi->skb = skb;  } @@ -3392,24 +4109,22 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)  	if (!skb) {  		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); -		if (skb) -			napi->skb = skb; +		napi->skb = skb;  	}  	return skb;  }  EXPORT_SYMBOL(napi_get_frags); -gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, -			       gro_result_t ret) +static gro_result_t 
napi_frags_finish(struct napi_struct *napi, +				      struct sk_buff *skb, +				      gro_result_t ret)  {  	switch (ret) {  	case GRO_NORMAL:  	case GRO_HELD: +		__skb_push(skb, ETH_HLEN);  		skb->protocol = eth_type_trans(skb, skb->dev); - -		if (ret == GRO_HELD) -			skb_gro_pull(skb, -ETH_HLEN); -		else if (netif_receive_skb(skb)) +		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))  			ret = GRO_DROP;  		break; @@ -3424,44 +4139,45 @@ gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,  	return ret;  } -EXPORT_SYMBOL(napi_frags_finish); -struct sk_buff *napi_frags_skb(struct napi_struct *napi) +/* Upper GRO stack assumes network header starts at gro_offset=0 + * Drivers could call both napi_gro_frags() and napi_gro_receive() + * We copy ethernet header into skb->data to have a common layout. + */ +static struct sk_buff *napi_frags_skb(struct napi_struct *napi)  {  	struct sk_buff *skb = napi->skb; -	struct ethhdr *eth; -	unsigned int hlen; -	unsigned int off; +	const struct ethhdr *eth; +	unsigned int hlen = sizeof(*eth);  	napi->skb = NULL;  	skb_reset_mac_header(skb);  	skb_gro_reset_offset(skb); -	off = skb_gro_offset(skb); -	hlen = off + sizeof(*eth); -	eth = skb_gro_header_fast(skb, off); -	if (skb_gro_header_hard(skb, hlen)) { -		eth = skb_gro_header_slow(skb, hlen, off); +	eth = skb_gro_header_fast(skb, 0); +	if (unlikely(skb_gro_header_hard(skb, hlen))) { +		eth = skb_gro_header_slow(skb, hlen, 0);  		if (unlikely(!eth)) {  			napi_reuse_skb(napi, skb); -			skb = NULL; -			goto out; +			return NULL;  		} +	} else { +		gro_pull_from_frag0(skb, hlen); +		NAPI_GRO_CB(skb)->frag0 += hlen; +		NAPI_GRO_CB(skb)->frag0_len -= hlen;  	} - -	skb_gro_pull(skb, sizeof(*eth)); +	__skb_pull(skb, hlen);  	/*  	 * This works because the only protocols we care about don't require -	 * special handling.  We'll fix it up properly at the end. +	 * special handling. +	 * We'll fix it up properly in napi_frags_finish()  	 */  	skb->protocol = eth->h_proto; -out:  	return skb;  } -EXPORT_SYMBOL(napi_frags_skb);  gro_result_t napi_gro_frags(struct napi_struct *napi)  { @@ -3470,12 +4186,14 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)  	if (!skb)  		return GRO_DROP; -	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb)); +	trace_napi_gro_frags_entry(skb); + +	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));  }  EXPORT_SYMBOL(napi_gro_frags);  /* - * net_rps_action sends any pending IPI's for rps. + * net_rps_action_and_irq_enable sends any pending IPI's for rps.   * Note: called with local irq disabled, but exits with local irq enabled.   
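/*
 * Illustrative sketch, not from this patch: the driver-side usage pattern
 * that napi_get_frags()/napi_gro_frags() above are written for.  A driver
 * receiving into page fragments asks the stack for a shell skb, attaches
 * the fragment, and hands the skb back; napi_frags_skb() then pulls the
 * Ethernet header out of frag0.  The function name and single-fragment
 * layout below are hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static int my_napi_rx_one(struct napi_struct *napi, struct page *page,
			  unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (!skb)
		return -ENOMEM;		/* try again on the next poll */

	skb_fill_page_desc(skb, 0, page, offset, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += PAGE_SIZE;

	napi_gro_frags(napi);		/* skb now belongs to the stack */
	return 0;
}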
*/  static void net_rps_action_and_irq_enable(struct softnet_data *sd) @@ -3493,8 +4211,8 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)  			struct softnet_data *next = remsd->rps_ipi_next;  			if (cpu_online(remsd->cpu)) -				__smp_call_function_single(remsd->cpu, -							   &remsd->csd, 0); +				smp_call_function_single_async(remsd->cpu, +							   &remsd->csd);  			remsd = next;  		}  	} else @@ -3518,9 +4236,8 @@ static int process_backlog(struct napi_struct *napi, int quota)  #endif  	napi->weight = weight_p;  	local_irq_disable(); -	while (work < quota) { +	while (1) {  		struct sk_buff *skb; -		unsigned int qlen;  		while ((skb = __skb_dequeue(&sd->process_queue))) {  			local_irq_enable(); @@ -3534,24 +4251,24 @@ static int process_backlog(struct napi_struct *napi, int quota)  		}  		rps_lock(sd); -		qlen = skb_queue_len(&sd->input_pkt_queue); -		if (qlen) -			skb_queue_splice_tail_init(&sd->input_pkt_queue, -						   &sd->process_queue); - -		if (qlen < quota - work) { +		if (skb_queue_empty(&sd->input_pkt_queue)) {  			/*  			 * Inline a custom version of __napi_complete().  			 * only current cpu owns and manipulates this napi, -			 * and NAPI_STATE_SCHED is the only possible flag set on backlog. -			 * we can use a plain write instead of clear_bit(), +			 * and NAPI_STATE_SCHED is the only possible flag set +			 * on backlog. +			 * We can use a plain write instead of clear_bit(),  			 * and we dont need an smp_mb() memory barrier.  			 */  			list_del(&napi->poll_list);  			napi->state = 0; +			rps_unlock(sd); -			quota = work + qlen; +			break;  		} + +		skb_queue_splice_tail_init(&sd->input_pkt_queue, +					   &sd->process_queue);  		rps_unlock(sd);  	}  	local_irq_enable(); @@ -3581,7 +4298,7 @@ void __napi_complete(struct napi_struct *n)  	BUG_ON(n->gro_list);  	list_del(&n->poll_list); -	smp_mb__before_clear_bit(); +	smp_mb__before_atomic();  	clear_bit(NAPI_STATE_SCHED, &n->state);  }  EXPORT_SYMBOL(__napi_complete); @@ -3597,13 +4314,65 @@ void napi_complete(struct napi_struct *n)  	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))  		return; -	napi_gro_flush(n); +	napi_gro_flush(n, false);  	local_irq_save(flags);  	__napi_complete(n);  	local_irq_restore(flags);  }  EXPORT_SYMBOL(napi_complete); +/* must be called under rcu_read_lock(), as we dont take a reference */ +struct napi_struct *napi_by_id(unsigned int napi_id) +{ +	unsigned int hash = napi_id % HASH_SIZE(napi_hash); +	struct napi_struct *napi; + +	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) +		if (napi->napi_id == napi_id) +			return napi; + +	return NULL; +} +EXPORT_SYMBOL_GPL(napi_by_id); + +void napi_hash_add(struct napi_struct *napi) +{ +	if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { + +		spin_lock(&napi_hash_lock); + +		/* 0 is not a valid id, we also skip an id that is taken +		 * we expect both events to be extremely rare +		 */ +		napi->napi_id = 0; +		while (!napi->napi_id) { +			napi->napi_id = ++napi_gen_id; +			if (napi_by_id(napi->napi_id)) +				napi->napi_id = 0; +		} + +		hlist_add_head_rcu(&napi->napi_hash_node, +			&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + +		spin_unlock(&napi_hash_lock); +	} +} +EXPORT_SYMBOL_GPL(napi_hash_add); + +/* Warning : caller is responsible to make sure rcu grace period + * is respected before freeing memory containing @napi + */ +void napi_hash_del(struct napi_struct *napi) +{ +	spin_lock(&napi_hash_lock); + +	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) +		
hlist_del_rcu(&napi->napi_hash_node); + +	spin_unlock(&napi_hash_lock); +} +EXPORT_SYMBOL_GPL(napi_hash_del); +  void netif_napi_add(struct net_device *dev, struct napi_struct *napi,  		    int (*poll)(struct napi_struct *, int), int weight)  { @@ -3612,6 +4381,9 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,  	napi->gro_list = NULL;  	napi->skb = NULL;  	napi->poll = poll; +	if (weight > NAPI_POLL_WEIGHT) +		pr_err_once("netif_napi_add() called with weight %d on device %s\n", +			    weight, dev->name);  	napi->weight = weight;  	list_add(&napi->dev_list, &dev->napi_list);  	napi->dev = dev; @@ -3625,17 +4397,10 @@ EXPORT_SYMBOL(netif_napi_add);  void netif_napi_del(struct napi_struct *napi)  { -	struct sk_buff *skb, *next; -  	list_del_init(&napi->dev_list);  	napi_free_frags(napi); -	for (skb = napi->gro_list; skb; skb = next) { -		next = skb->next; -		skb->next = NULL; -		kfree_skb(skb); -	} - +	kfree_skb_list(napi->gro_list);  	napi->gro_list = NULL;  	napi->gro_count = 0;  } @@ -3658,7 +4423,7 @@ static void net_rx_action(struct softirq_action *h)  		 * Allow this to run for 2 jiffies since which will allow  		 * an average latency of 1.5/HZ.  		 */ -		if (unlikely(budget <= 0 || time_after(jiffies, time_limit))) +		if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))  			goto softnet_break;  		local_irq_enable(); @@ -3678,7 +4443,7 @@ static void net_rx_action(struct softirq_action *h)  		 * with netpoll's poll_napi().  Only the entity which  		 * obtains the lock and sees NAPI_STATE_SCHED set will  		 * actually make the ->poll() call.  Therefore we avoid -		 * accidently calling ->poll() when NAPI is not scheduled. +		 * accidentally calling ->poll() when NAPI is not scheduled.  		 */  		work = 0;  		if (test_bit(NAPI_STATE_SCHED, &n->state)) { @@ -3702,8 +4467,17 @@ static void net_rx_action(struct softirq_action *h)  				local_irq_enable();  				napi_complete(n);  				local_irq_disable(); -			} else +			} else { +				if (n->gro_list) { +					/* flush too old packets +					 * If HZ < 1000, flush all packets. +					 */ +					local_irq_enable(); +					napi_gro_flush(n, HZ >= 1000); +					local_irq_disable(); +				}  				list_move_tail(&n->poll_list, &sd->poll_list); +			}  		}  		netpoll_poll_unlock(have); @@ -3727,488 +4501,721 @@ softnet_break:  	goto out;  } -static gifconf_func_t *gifconf_list[NPROTO]; +struct netdev_adjacent { +	struct net_device *dev; + +	/* upper master flag, there can only be one master device per list */ +	bool master; + +	/* counter for the number of times this device was added to us */ +	u16 ref_nr; + +	/* private field for the users */ +	void *private; + +	struct list_head list; +	struct rcu_head rcu; +}; + +static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, +						 struct net_device *adj_dev, +						 struct list_head *adj_list) +{ +	struct netdev_adjacent *adj; + +	list_for_each_entry(adj, adj_list, list) { +		if (adj->dev == adj_dev) +			return adj; +	} +	return NULL; +}  /** - *	register_gifconf	-	register a SIOCGIF handler - *	@family: Address family - *	@gifconf: Function handler + * netdev_has_upper_dev - Check if device is linked to an upper device + * @dev: device + * @upper_dev: upper device to check   * - *	Register protocol dependent address dumping routines. The handler - *	that is passed must not be freed or reused until it has been replaced - *	by another handler. + * Find out if a device is linked to specified upper device and return true + * in case it is. 
Note that this checks only immediate upper device, + * not through a complete stack of devices. The caller must hold the RTNL lock.   */ -int register_gifconf(unsigned int family, gifconf_func_t *gifconf) +bool netdev_has_upper_dev(struct net_device *dev, +			  struct net_device *upper_dev)  { -	if (family >= NPROTO) -		return -EINVAL; -	gifconf_list[family] = gifconf; -	return 0; -} -EXPORT_SYMBOL(register_gifconf); +	ASSERT_RTNL(); +	return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper); +} +EXPORT_SYMBOL(netdev_has_upper_dev); -/* - *	Map an interface index to its name (SIOCGIFNAME) +/** + * netdev_has_any_upper_dev - Check if device is linked to some device + * @dev: device + * + * Find out if a device is linked to an upper device and return true in case + * it is. The caller must hold the RTNL lock.   */ +static bool netdev_has_any_upper_dev(struct net_device *dev) +{ +	ASSERT_RTNL(); -/* - *	We need this ioctl for efficient implementation of the - *	if_indextoname() function required by the IPv6 API.  Without - *	it, we would have to search all the interfaces to find a - *	match.  --pb - */ +	return !list_empty(&dev->all_adj_list.upper); +} -static int dev_ifname(struct net *net, struct ifreq __user *arg) +/** + * netdev_master_upper_dev_get - Get master upper device + * @dev: device + * + * Find a master upper device and return pointer to it or NULL in case + * it's not there. The caller must hold the RTNL lock. + */ +struct net_device *netdev_master_upper_dev_get(struct net_device *dev)  { -	struct net_device *dev; -	struct ifreq ifr; +	struct netdev_adjacent *upper; -	/* -	 *	Fetch the caller's info block. -	 */ +	ASSERT_RTNL(); -	if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) -		return -EFAULT; +	if (list_empty(&dev->adj_list.upper)) +		return NULL; -	rcu_read_lock(); -	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); -	if (!dev) { -		rcu_read_unlock(); -		return -ENODEV; -	} +	upper = list_first_entry(&dev->adj_list.upper, +				 struct netdev_adjacent, list); +	if (likely(upper->master)) +		return upper->dev; +	return NULL; +} +EXPORT_SYMBOL(netdev_master_upper_dev_get); -	strcpy(ifr.ifr_name, dev->name); -	rcu_read_unlock(); +void *netdev_adjacent_get_private(struct list_head *adj_list) +{ +	struct netdev_adjacent *adj; -	if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) -		return -EFAULT; -	return 0; +	adj = list_entry(adj_list, struct netdev_adjacent, list); + +	return adj->private;  } +EXPORT_SYMBOL(netdev_adjacent_get_private); -/* - *	Perform a SIOCGIFCONF call. This structure will change - *	size eventually, and there is nothing I can do about it. - *	Thus we will need a 'compatibility mode'. +/** + * netdev_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock.   */ - -static int dev_ifconf(struct net *net, char __user *arg) +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, +						 struct list_head **iter)  { -	struct ifconf ifc; -	struct net_device *dev; -	char __user *pos; -	int len; -	int total; -	int i; - -	/* -	 *	Fetch the caller's info block. 
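/*
 * Illustrative sketch, not from this patch: walking the immediate upper
 * devices with the iterator added in this patch.  Only the RCU read lock
 * is needed; the counting helper itself is hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rcupdate.h>

static int my_count_upper_devs(struct net_device *dev)
{
	struct list_head *iter = &dev->adj_list.upper;
	struct net_device *upper;
	int n = 0;

	rcu_read_lock();
	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
		n++;
	rcu_read_unlock();

	return n;
}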
-	 */ +	struct netdev_adjacent *upper; -	if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) -		return -EFAULT; +	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); -	pos = ifc.ifc_buf; -	len = ifc.ifc_len; +	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); -	/* -	 *	Loop over the interfaces, and write an info block for each. -	 */ - -	total = 0; -	for_each_netdev(net, dev) { -		for (i = 0; i < NPROTO; i++) { -			if (gifconf_list[i]) { -				int done; -				if (!pos) -					done = gifconf_list[i](dev, NULL, 0); -				else -					done = gifconf_list[i](dev, pos + total, -							       len - total); -				if (done < 0) -					return -EFAULT; -				total += done; -			} -		} -	} +	if (&upper->list == &dev->adj_list.upper) +		return NULL; -	/* -	 *	All done.  Write the updated control block back to the caller. -	 */ -	ifc.ifc_len = total; +	*iter = &upper->list; -	/* -	 * 	Both BSD and Solaris return 0 here, so we do too. -	 */ -	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; +	return upper->dev;  } +EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); -#ifdef CONFIG_PROC_FS -/* - *	This is invoked by the /proc filesystem handler to display a device - *	in detail. +/** + * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock.   */ -void *dev_seq_start(struct seq_file *seq, loff_t *pos) -	__acquires(RCU) +struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, +						     struct list_head **iter)  { -	struct net *net = seq_file_net(seq); -	loff_t off; -	struct net_device *dev; +	struct netdev_adjacent *upper; -	rcu_read_lock(); -	if (!*pos) -		return SEQ_START_TOKEN; +	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); -	off = 1; -	for_each_netdev_rcu(net, dev) -		if (off++ == *pos) -			return dev; +	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); -	return NULL; -} +	if (&upper->list == &dev->all_adj_list.upper) +		return NULL; -void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ -	struct net_device *dev = (v == SEQ_START_TOKEN) ? -				  first_net_device(seq_file_net(seq)) : -				  next_net_device((struct net_device *)v); +	*iter = &upper->list; -	++*pos; -	return rcu_dereference(dev); +	return upper->dev;  } +EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); -void dev_seq_stop(struct seq_file *seq, void *v) -	__releases(RCU) +/** + * netdev_lower_get_next_private - Get the next ->private from the + *				   lower neighbour list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold either hold the + * RTNL lock or its own locking that guarantees that the neighbour lower + * list will remain unchainged. 
+ */ +void *netdev_lower_get_next_private(struct net_device *dev, +				    struct list_head **iter)  { -	rcu_read_unlock(); +	struct netdev_adjacent *lower; + +	lower = list_entry(*iter, struct netdev_adjacent, list); + +	if (&lower->list == &dev->adj_list.lower) +		return NULL; + +	*iter = lower->list.next; + +	return lower->private;  } +EXPORT_SYMBOL(netdev_lower_get_next_private); -static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) +/** + * netdev_lower_get_next_private_rcu - Get the next ->private from the + *				       lower neighbour list, RCU + *				       variant + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold RCU read lock. + */ +void *netdev_lower_get_next_private_rcu(struct net_device *dev, +					struct list_head **iter)  { -	struct rtnl_link_stats64 temp; -	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); +	struct netdev_adjacent *lower; + +	WARN_ON_ONCE(!rcu_read_lock_held()); + +	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + +	if (&lower->list == &dev->adj_list.lower) +		return NULL; -	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " -		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", -		   dev->name, stats->rx_bytes, stats->rx_packets, -		   stats->rx_errors, -		   stats->rx_dropped + stats->rx_missed_errors, -		   stats->rx_fifo_errors, -		   stats->rx_length_errors + stats->rx_over_errors + -		    stats->rx_crc_errors + stats->rx_frame_errors, -		   stats->rx_compressed, stats->multicast, -		   stats->tx_bytes, stats->tx_packets, -		   stats->tx_errors, stats->tx_dropped, -		   stats->tx_fifo_errors, stats->collisions, -		   stats->tx_carrier_errors + -		    stats->tx_aborted_errors + -		    stats->tx_window_errors + -		    stats->tx_heartbeat_errors, -		   stats->tx_compressed); +	*iter = &lower->list; + +	return lower->private;  } +EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); -/* - *	Called from the PROCfs module. This now uses the new arbitrary sized - *	/proc/net interface to create /proc/net/dev +/** + * netdev_lower_get_next - Get the next device from the lower neighbour + *                         list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's lower neighbour + * list, starting from iter position. The caller must hold RTNL lock or + * its own locking that guarantees that the neighbour lower + * list will remain unchainged.   
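/*
 * Illustrative sketch, not from this patch: the non-RCU lower-device walk
 * (netdev_lower_get_next() below), as an upper driver might use it under
 * RTNL.  Computing the minimum lower MTU is only an example policy.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static unsigned int my_min_lower_mtu(struct net_device *dev)
{
	struct list_head *iter = &dev->adj_list.lower;
	struct net_device *lower;
	unsigned int mtu = ETH_DATA_LEN;

	ASSERT_RTNL();

	while ((lower = netdev_lower_get_next(dev, &iter)) != NULL)
		mtu = min_t(unsigned int, mtu, lower->mtu);

	return mtu;
}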
*/ -static int dev_seq_show(struct seq_file *seq, void *v) -{ -	if (v == SEQ_START_TOKEN) -		seq_puts(seq, "Inter-|   Receive                            " -			      "                    |  Transmit\n" -			      " face |bytes    packets errs drop fifo frame " -			      "compressed multicast|bytes    packets errs " -			      "drop fifo colls carrier compressed\n"); -	else -		dev_seq_printf_stats(seq, v); -	return 0; +void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) +{ +	struct netdev_adjacent *lower; + +	lower = list_entry((*iter)->next, struct netdev_adjacent, list); + +	if (&lower->list == &dev->adj_list.lower) +		return NULL; + +	*iter = &lower->list; + +	return lower->dev;  } +EXPORT_SYMBOL(netdev_lower_get_next); -static struct softnet_data *softnet_get_online(loff_t *pos) +/** + * netdev_lower_get_first_private_rcu - Get the first ->private from the + *				       lower neighbour list, RCU + *				       variant + * @dev: device + * + * Gets the first netdev_adjacent->private from the dev's lower neighbour + * list. The caller must hold RCU read lock. + */ +void *netdev_lower_get_first_private_rcu(struct net_device *dev)  { -	struct softnet_data *sd = NULL; +	struct netdev_adjacent *lower; -	while (*pos < nr_cpu_ids) -		if (cpu_online(*pos)) { -			sd = &per_cpu(softnet_data, *pos); -			break; -		} else -			++*pos; -	return sd; +	lower = list_first_or_null_rcu(&dev->adj_list.lower, +			struct netdev_adjacent, list); +	if (lower) +		return lower->private; +	return NULL;  } +EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); -static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) +/** + * netdev_master_upper_dev_get_rcu - Get master upper device + * @dev: device + * + * Find a master upper device and return pointer to it or NULL in case + * it's not there. The caller must hold the RCU read lock. + */ +struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)  { -	return softnet_get_online(pos); +	struct netdev_adjacent *upper; + +	upper = list_first_or_null_rcu(&dev->adj_list.upper, +				       struct netdev_adjacent, list); +	if (upper && likely(upper->master)) +		return upper->dev; +	return NULL;  } +EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); -static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +static int netdev_adjacent_sysfs_add(struct net_device *dev, +			      struct net_device *adj_dev, +			      struct list_head *dev_list)  { -	++*pos; -	return softnet_get_online(pos); +	char linkname[IFNAMSIZ+7]; +	sprintf(linkname, dev_list == &dev->adj_list.upper ? +		"upper_%s" : "lower_%s", adj_dev->name); +	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), +				 linkname);  } - -static void softnet_seq_stop(struct seq_file *seq, void *v) +static void netdev_adjacent_sysfs_del(struct net_device *dev, +			       char *name, +			       struct list_head *dev_list)  { +	char linkname[IFNAMSIZ+7]; +	sprintf(linkname, dev_list == &dev->adj_list.upper ? 
+		"upper_%s" : "lower_%s", name); +	sysfs_remove_link(&(dev->dev.kobj), linkname);  } -static int softnet_seq_show(struct seq_file *seq, void *v) +#define netdev_adjacent_is_neigh_list(dev, dev_list) \ +		(dev_list == &dev->adj_list.upper || \ +		 dev_list == &dev->adj_list.lower) + +static int __netdev_adjacent_dev_insert(struct net_device *dev, +					struct net_device *adj_dev, +					struct list_head *dev_list, +					void *private, bool master)  { -	struct softnet_data *sd = v; +	struct netdev_adjacent *adj; +	int ret; + +	adj = __netdev_find_adj(dev, adj_dev, dev_list); + +	if (adj) { +		adj->ref_nr++; +		return 0; +	} + +	adj = kmalloc(sizeof(*adj), GFP_KERNEL); +	if (!adj) +		return -ENOMEM; + +	adj->dev = adj_dev; +	adj->master = master; +	adj->ref_nr = 1; +	adj->private = private; +	dev_hold(adj_dev); + +	pr_debug("dev_hold for %s, because of link added from %s to %s\n", +		 adj_dev->name, dev->name, adj_dev->name); + +	if (netdev_adjacent_is_neigh_list(dev, dev_list)) { +		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); +		if (ret) +			goto free_adj; +	} + +	/* Ensure that master link is always the first item in list. */ +	if (master) { +		ret = sysfs_create_link(&(dev->dev.kobj), +					&(adj_dev->dev.kobj), "master"); +		if (ret) +			goto remove_symlinks; + +		list_add_rcu(&adj->list, dev_list); +	} else { +		list_add_tail_rcu(&adj->list, dev_list); +	} -	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", -		   sd->processed, sd->dropped, sd->time_squeeze, 0, -		   0, 0, 0, 0, /* was fastroute */ -		   sd->cpu_collision, sd->received_rps);  	return 0; -} -static const struct seq_operations dev_seq_ops = { -	.start = dev_seq_start, -	.next  = dev_seq_next, -	.stop  = dev_seq_stop, -	.show  = dev_seq_show, -}; +remove_symlinks: +	if (netdev_adjacent_is_neigh_list(dev, dev_list)) +		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); +free_adj: +	kfree(adj); +	dev_put(adj_dev); -static int dev_seq_open(struct inode *inode, struct file *file) -{ -	return seq_open_net(inode, file, &dev_seq_ops, -			    sizeof(struct seq_net_private)); +	return ret;  } -static const struct file_operations dev_seq_fops = { -	.owner	 = THIS_MODULE, -	.open    = dev_seq_open, -	.read    = seq_read, -	.llseek  = seq_lseek, -	.release = seq_release_net, -}; +static void __netdev_adjacent_dev_remove(struct net_device *dev, +					 struct net_device *adj_dev, +					 struct list_head *dev_list) +{ +	struct netdev_adjacent *adj; -static const struct seq_operations softnet_seq_ops = { -	.start = softnet_seq_start, -	.next  = softnet_seq_next, -	.stop  = softnet_seq_stop, -	.show  = softnet_seq_show, -}; +	adj = __netdev_find_adj(dev, adj_dev, dev_list); -static int softnet_seq_open(struct inode *inode, struct file *file) -{ -	return seq_open(file, &softnet_seq_ops); -} +	if (!adj) { +		pr_err("tried to remove device %s from %s\n", +		       dev->name, adj_dev->name); +		BUG(); +	} -static const struct file_operations softnet_seq_fops = { -	.owner	 = THIS_MODULE, -	.open    = softnet_seq_open, -	.read    = seq_read, -	.llseek  = seq_lseek, -	.release = seq_release, -}; +	if (adj->ref_nr > 1) { +		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, +			 adj->ref_nr-1); +		adj->ref_nr--; +		return; +	} + +	if (adj->master) +		sysfs_remove_link(&(dev->dev.kobj), "master"); -static void *ptype_get_idx(loff_t pos) +	if (netdev_adjacent_is_neigh_list(dev, dev_list)) +		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); + +	list_del_rcu(&adj->list); +	pr_debug("dev_put 
for %s, because link removed from %s to %s\n", +		 adj_dev->name, dev->name, adj_dev->name); +	dev_put(adj_dev); +	kfree_rcu(adj, rcu); +} + +static int __netdev_adjacent_dev_link_lists(struct net_device *dev, +					    struct net_device *upper_dev, +					    struct list_head *up_list, +					    struct list_head *down_list, +					    void *private, bool master)  { -	struct packet_type *pt = NULL; -	loff_t i = 0; -	int t; +	int ret; -	list_for_each_entry_rcu(pt, &ptype_all, list) { -		if (i == pos) -			return pt; -		++i; -	} +	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, +					   master); +	if (ret) +		return ret; -	for (t = 0; t < PTYPE_HASH_SIZE; t++) { -		list_for_each_entry_rcu(pt, &ptype_base[t], list) { -			if (i == pos) -				return pt; -			++i; -		} +	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, +					   false); +	if (ret) { +		__netdev_adjacent_dev_remove(dev, upper_dev, up_list); +		return ret;  	} -	return NULL; + +	return 0;  } -static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) -	__acquires(RCU) +static int __netdev_adjacent_dev_link(struct net_device *dev, +				      struct net_device *upper_dev)  { -	rcu_read_lock(); -	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; +	return __netdev_adjacent_dev_link_lists(dev, upper_dev, +						&dev->all_adj_list.upper, +						&upper_dev->all_adj_list.lower, +						NULL, false);  } -static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, +					       struct net_device *upper_dev, +					       struct list_head *up_list, +					       struct list_head *down_list)  { -	struct packet_type *pt; -	struct list_head *nxt; -	int hash; +	__netdev_adjacent_dev_remove(dev, upper_dev, up_list); +	__netdev_adjacent_dev_remove(upper_dev, dev, down_list); +} -	++*pos; -	if (v == SEQ_START_TOKEN) -		return ptype_get_idx(0); +static void __netdev_adjacent_dev_unlink(struct net_device *dev, +					 struct net_device *upper_dev) +{ +	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, +					   &dev->all_adj_list.upper, +					   &upper_dev->all_adj_list.lower); +} -	pt = v; -	nxt = pt->list.next; -	if (pt->type == htons(ETH_P_ALL)) { -		if (nxt != &ptype_all) -			goto found; -		hash = 0; -		nxt = ptype_base[0].next; -	} else -		hash = ntohs(pt->type) & PTYPE_HASH_MASK; +static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, +						struct net_device *upper_dev, +						void *private, bool master) +{ +	int ret = __netdev_adjacent_dev_link(dev, upper_dev); -	while (nxt == &ptype_base[hash]) { -		if (++hash >= PTYPE_HASH_SIZE) -			return NULL; -		nxt = ptype_base[hash].next; +	if (ret) +		return ret; + +	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, +					       &dev->adj_list.upper, +					       &upper_dev->adj_list.lower, +					       private, master); +	if (ret) { +		__netdev_adjacent_dev_unlink(dev, upper_dev); +		return ret;  	} -found: -	return list_entry(nxt, struct packet_type, list); + +	return 0;  } -static void ptype_seq_stop(struct seq_file *seq, void *v) -	__releases(RCU) +static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, +						   struct net_device *upper_dev)  { -	rcu_read_unlock(); +	__netdev_adjacent_dev_unlink(dev, upper_dev); +	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, +					   &dev->adj_list.upper, +					   &upper_dev->adj_list.lower);  } -static int ptype_seq_show(struct seq_file *seq, void *v) +static int 
__netdev_upper_dev_link(struct net_device *dev, +				   struct net_device *upper_dev, bool master, +				   void *private)  { -	struct packet_type *pt = v; +	struct netdev_adjacent *i, *j, *to_i, *to_j; +	int ret = 0; -	if (v == SEQ_START_TOKEN) -		seq_puts(seq, "Type Device      Function\n"); -	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { -		if (pt->type == htons(ETH_P_ALL)) -			seq_puts(seq, "ALL "); -		else -			seq_printf(seq, "%04x", ntohs(pt->type)); +	ASSERT_RTNL(); -		seq_printf(seq, " %-8s %pF\n", -			   pt->dev ? pt->dev->name : "", pt->func); +	if (dev == upper_dev) +		return -EBUSY; + +	/* To prevent loops, check if dev is not upper device to upper_dev. */ +	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper)) +		return -EBUSY; + +	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper)) +		return -EEXIST; + +	if (master && netdev_master_upper_dev_get(dev)) +		return -EBUSY; + +	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, +						   master); +	if (ret) +		return ret; + +	/* Now that we linked these devs, make all the upper_dev's +	 * all_adj_list.upper visible to every dev's all_adj_list.lower an +	 * versa, and don't forget the devices itself. All of these +	 * links are non-neighbours. +	 */ +	list_for_each_entry(i, &dev->all_adj_list.lower, list) { +		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { +			pr_debug("Interlinking %s with %s, non-neighbour\n", +				 i->dev->name, j->dev->name); +			ret = __netdev_adjacent_dev_link(i->dev, j->dev); +			if (ret) +				goto rollback_mesh; +		} +	} + +	/* add dev to every upper_dev's upper device */ +	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { +		pr_debug("linking %s's upper device %s with %s\n", +			 upper_dev->name, i->dev->name, dev->name); +		ret = __netdev_adjacent_dev_link(dev, i->dev); +		if (ret) +			goto rollback_upper_mesh;  	} +	/* add upper_dev to every dev's lower device */ +	list_for_each_entry(i, &dev->all_adj_list.lower, list) { +		pr_debug("linking %s's lower device %s with %s\n", dev->name, +			 i->dev->name, upper_dev->name); +		ret = __netdev_adjacent_dev_link(i->dev, upper_dev); +		if (ret) +			goto rollback_lower_mesh; +	} + +	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);  	return 0; -} -static const struct seq_operations ptype_seq_ops = { -	.start = ptype_seq_start, -	.next  = ptype_seq_next, -	.stop  = ptype_seq_stop, -	.show  = ptype_seq_show, -}; +rollback_lower_mesh: +	to_i = i; +	list_for_each_entry(i, &dev->all_adj_list.lower, list) { +		if (i == to_i) +			break; +		__netdev_adjacent_dev_unlink(i->dev, upper_dev); +	} + +	i = NULL; -static int ptype_seq_open(struct inode *inode, struct file *file) +rollback_upper_mesh: +	to_i = i; +	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { +		if (i == to_i) +			break; +		__netdev_adjacent_dev_unlink(dev, i->dev); +	} + +	i = j = NULL; + +rollback_mesh: +	to_i = i; +	to_j = j; +	list_for_each_entry(i, &dev->all_adj_list.lower, list) { +		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { +			if (i == to_i && j == to_j) +				break; +			__netdev_adjacent_dev_unlink(i->dev, j->dev); +		} +		if (i == to_i) +			break; +	} + +	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); + +	return ret; +} + +/** + * netdev_upper_dev_link - Add a link to the upper device + * @dev: device + * @upper_dev: new upper device + * + * Adds a link to device which is upper to this one. The caller must hold + * the RTNL lock. 
On a failure a negative errno code is returned. + * On success the reference counts are adjusted and the function + * returns zero. + */ +int netdev_upper_dev_link(struct net_device *dev, +			  struct net_device *upper_dev)  { -	return seq_open_net(inode, file, &ptype_seq_ops, -			sizeof(struct seq_net_private)); +	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);  } +EXPORT_SYMBOL(netdev_upper_dev_link); -static const struct file_operations ptype_seq_fops = { -	.owner	 = THIS_MODULE, -	.open    = ptype_seq_open, -	.read    = seq_read, -	.llseek  = seq_lseek, -	.release = seq_release_net, -}; +/** + * netdev_master_upper_dev_link - Add a master link to the upper device + * @dev: device + * @upper_dev: new upper device + * + * Adds a link to device which is upper to this one. In this case, only + * one master upper device can be linked, although other non-master devices + * might be linked as well. The caller must hold the RTNL lock. + * On a failure a negative errno code is returned. On success the reference + * counts are adjusted and the function returns zero. + */ +int netdev_master_upper_dev_link(struct net_device *dev, +				 struct net_device *upper_dev) +{ +	return __netdev_upper_dev_link(dev, upper_dev, true, NULL); +} +EXPORT_SYMBOL(netdev_master_upper_dev_link); +int netdev_master_upper_dev_link_private(struct net_device *dev, +					 struct net_device *upper_dev, +					 void *private) +{ +	return __netdev_upper_dev_link(dev, upper_dev, true, private); +} +EXPORT_SYMBOL(netdev_master_upper_dev_link_private); -static int __net_init dev_proc_net_init(struct net *net) +/** + * netdev_upper_dev_unlink - Removes a link to upper device + * @dev: device + * @upper_dev: new upper device + * + * Removes a link to device which is upper to this one. The caller must hold + * the RTNL lock. + */ +void netdev_upper_dev_unlink(struct net_device *dev, +			     struct net_device *upper_dev)  { -	int rc = -ENOMEM; +	struct netdev_adjacent *i, *j; +	ASSERT_RTNL(); -	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops)) -		goto out; -	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops)) -		goto out_dev; -	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops)) -		goto out_softnet; +	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); -	if (wext_proc_init(net)) -		goto out_ptype; -	rc = 0; -out: -	return rc; -out_ptype: -	proc_net_remove(net, "ptype"); -out_softnet: -	proc_net_remove(net, "softnet_stat"); -out_dev: -	proc_net_remove(net, "dev"); -	goto out; +	/* Here is the tricky part. We must remove all dev's lower +	 * devices from all upper_dev's upper devices and vice +	 * versa, to maintain the graph relationship. 
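/*
 * Illustrative sketch, not from this patch: how a master-type driver
 * (bonding/team style) is expected to use the adjacency API above when
 * adding and removing a port.  The wrapper names are hypothetical and
 * per-port private state is omitted.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int my_master_add_port(struct net_device *master,
			      struct net_device *port)
{
	ASSERT_RTNL();
	/* fails with -EBUSY if the port already has a master */
	return netdev_master_upper_dev_link(port, master);
}

static void my_master_del_port(struct net_device *master,
			       struct net_device *port)
{
	ASSERT_RTNL();
	netdev_upper_dev_unlink(port, master);
}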
+	 */ +	list_for_each_entry(i, &dev->all_adj_list.lower, list) +		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) +			__netdev_adjacent_dev_unlink(i->dev, j->dev); + +	/* remove also the devices itself from lower/upper device +	 * list +	 */ +	list_for_each_entry(i, &dev->all_adj_list.lower, list) +		__netdev_adjacent_dev_unlink(i->dev, upper_dev); + +	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) +		__netdev_adjacent_dev_unlink(dev, i->dev); + +	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);  } +EXPORT_SYMBOL(netdev_upper_dev_unlink); -static void __net_exit dev_proc_net_exit(struct net *net) +void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)  { -	wext_proc_exit(net); +	struct netdev_adjacent *iter; -	proc_net_remove(net, "ptype"); -	proc_net_remove(net, "softnet_stat"); -	proc_net_remove(net, "dev"); -} +	list_for_each_entry(iter, &dev->adj_list.upper, list) { +		netdev_adjacent_sysfs_del(iter->dev, oldname, +					  &iter->dev->adj_list.lower); +		netdev_adjacent_sysfs_add(iter->dev, dev, +					  &iter->dev->adj_list.lower); +	} -static struct pernet_operations __net_initdata dev_proc_ops = { -	.init = dev_proc_net_init, -	.exit = dev_proc_net_exit, -}; +	list_for_each_entry(iter, &dev->adj_list.lower, list) { +		netdev_adjacent_sysfs_del(iter->dev, oldname, +					  &iter->dev->adj_list.upper); +		netdev_adjacent_sysfs_add(iter->dev, dev, +					  &iter->dev->adj_list.upper); +	} +} -static int __init dev_proc_init(void) +void *netdev_lower_dev_get_private(struct net_device *dev, +				   struct net_device *lower_dev)  { -	return register_pernet_subsys(&dev_proc_ops); +	struct netdev_adjacent *lower; + +	if (!lower_dev) +		return NULL; +	lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower); +	if (!lower) +		return NULL; + +	return lower->private;  } -#else -#define dev_proc_init() 0 -#endif	/* CONFIG_PROC_FS */ +EXPORT_SYMBOL(netdev_lower_dev_get_private); -/** - *	netdev_set_master	-	set up master/slave pair - *	@slave: slave device - *	@master: new master device - * - *	Changes the master device of the slave. Pass %NULL to break the - *	bonding. The caller must hold the RTNL semaphore. On a failure - *	a negative errno code is returned. On success the reference counts - *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the - *	function returns zero. 
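/*
 * Illustrative sketch, not from this patch: dev_get_nest_level() (added
 * just below) derives a lockdep nesting depth from the adjacency graph.
 * A stacking driver passes a predicate for its own device type; the
 * predicate and helper here are hypothetical, with 802.1q VLAN used only
 * as an example.
 */
#include <linux/netdevice.h>

static bool is_my_stacked_dev(struct net_device *dev)
{
	return dev->priv_flags & IFF_802_1Q_VLAN;	/* example predicate */
}

static void my_set_lock_subclass(struct net_device *dev)
{
	int nest = dev_get_nest_level(dev, is_my_stacked_dev);

	/* e.g. lockdep_set_subclass(&dev->addr_list_lock, nest); */
	netdev_dbg(dev, "nest level %d\n", nest);
}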
- */ -int netdev_set_master(struct net_device *slave, struct net_device *master) +int dev_get_nest_level(struct net_device *dev, +		       bool (*type_check)(struct net_device *dev))  { -	struct net_device *old = slave->master; +	struct net_device *lower = NULL; +	struct list_head *iter; +	int max_nest = -1; +	int nest;  	ASSERT_RTNL(); -	if (master) { -		if (old) -			return -EBUSY; -		dev_hold(master); +	netdev_for_each_lower_dev(dev, lower, iter) { +		nest = dev_get_nest_level(lower, type_check); +		if (max_nest < nest) +			max_nest = nest;  	} -	slave->master = master; +	if (type_check(dev)) +		max_nest++; -	if (old) { -		synchronize_net(); -		dev_put(old); -	} -	if (master) -		slave->flags |= IFF_SLAVE; -	else -		slave->flags &= ~IFF_SLAVE; - -	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); -	return 0; +	return max_nest;  } -EXPORT_SYMBOL(netdev_set_master); +EXPORT_SYMBOL(dev_get_nest_level);  static void dev_change_rx_flags(struct net_device *dev, int flags)  {  	const struct net_device_ops *ops = dev->netdev_ops; -	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags) +	if (ops->ndo_change_rx_flags)  		ops->ndo_change_rx_flags(dev, flags);  } -static int __dev_set_promiscuity(struct net_device *dev, int inc) +static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)  { -	unsigned short old_flags = dev->flags; -	uid_t uid; -	gid_t gid; +	unsigned int old_flags = dev->flags; +	kuid_t uid; +	kgid_t gid;  	ASSERT_RTNL(); @@ -4223,16 +5230,15 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc)  			dev->flags &= ~IFF_PROMISC;  		else {  			dev->promiscuity -= inc; -			printk(KERN_WARNING "%s: promiscuity touches roof, " -				"set promiscuity failed, promiscuity feature " -				"of device might be broken.\n", dev->name); +			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n", +				dev->name);  			return -EOVERFLOW;  		}  	}  	if (dev->flags != old_flags) { -		printk(KERN_INFO "device %s %s promiscuous mode\n", -		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" : -							       "left"); +		pr_info("device %s %s promiscuous mode\n", +			dev->name, +			dev->flags & IFF_PROMISC ? 
"entered" : "left");  		if (audit_enabled) {  			current_uid_gid(&uid, &gid);  			audit_log(current->audit_context, GFP_ATOMIC, @@ -4240,13 +5246,16 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc)  				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",  				dev->name, (dev->flags & IFF_PROMISC),  				(old_flags & IFF_PROMISC), -				audit_get_loginuid(current), -				uid, gid, +				from_kuid(&init_user_ns, audit_get_loginuid(current)), +				from_kuid(&init_user_ns, uid), +				from_kgid(&init_user_ns, gid),  				audit_get_sessionid(current));  		}  		dev_change_rx_flags(dev, IFF_PROMISC);  	} +	if (notify) +		__dev_notify_flags(dev, old_flags, IFF_PROMISC);  	return 0;  } @@ -4263,10 +5272,10 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc)   */  int dev_set_promiscuity(struct net_device *dev, int inc)  { -	unsigned short old_flags = dev->flags; +	unsigned int old_flags = dev->flags;  	int err; -	err = __dev_set_promiscuity(dev, inc); +	err = __dev_set_promiscuity(dev, inc, true);  	if (err < 0)  		return err;  	if (dev->flags != old_flags) @@ -4275,22 +5284,9 @@ int dev_set_promiscuity(struct net_device *dev, int inc)  }  EXPORT_SYMBOL(dev_set_promiscuity); -/** - *	dev_set_allmulti	- update allmulti count on a device - *	@dev: device - *	@inc: modifier - * - *	Add or remove reception of all multicast frames to a device. While the - *	count in the device remains above zero the interface remains listening - *	to all interfaces. Once it hits zero the device reverts back to normal - *	filtering operation. A negative @inc value is used to drop the counter - *	when releasing a resource needing all multicasts. - *	Return 0 if successful or a negative errno code on error. - */ - -int dev_set_allmulti(struct net_device *dev, int inc) +static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)  { -	unsigned short old_flags = dev->flags; +	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;  	ASSERT_RTNL(); @@ -4305,18 +5301,38 @@ int dev_set_allmulti(struct net_device *dev, int inc)  			dev->flags &= ~IFF_ALLMULTI;  		else {  			dev->allmulti -= inc; -			printk(KERN_WARNING "%s: allmulti touches roof, " -				"set allmulti failed, allmulti feature of " -				"device might be broken.\n", dev->name); +			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n", +				dev->name);  			return -EOVERFLOW;  		}  	}  	if (dev->flags ^ old_flags) {  		dev_change_rx_flags(dev, IFF_ALLMULTI);  		dev_set_rx_mode(dev); +		if (notify) +			__dev_notify_flags(dev, old_flags, +					   dev->gflags ^ old_gflags);  	}  	return 0;  } + +/** + *	dev_set_allmulti	- update allmulti count on a device + *	@dev: device + *	@inc: modifier + * + *	Add or remove reception of all multicast frames to a device. While the + *	count in the device remains above zero the interface remains listening + *	to all interfaces. Once it hits zero the device reverts back to normal + *	filtering operation. A negative @inc value is used to drop the counter + *	when releasing a resource needing all multicasts. + *	Return 0 if successful or a negative errno code on error. 
+ */ + +int dev_set_allmulti(struct net_device *dev, int inc) +{ +	return __dev_set_allmulti(dev, inc, true); +}  EXPORT_SYMBOL(dev_set_allmulti);  /* @@ -4336,23 +5352,21 @@ void __dev_set_rx_mode(struct net_device *dev)  	if (!netif_device_present(dev))  		return; -	if (ops->ndo_set_rx_mode) -		ops->ndo_set_rx_mode(dev); -	else { +	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {  		/* Unicast addresses changes may only happen under the rtnl,  		 * therefore calling __dev_set_promiscuity here is safe.  		 */  		if (!netdev_uc_empty(dev) && !dev->uc_promisc) { -			__dev_set_promiscuity(dev, 1); -			dev->uc_promisc = 1; +			__dev_set_promiscuity(dev, 1, false); +			dev->uc_promisc = true;  		} else if (netdev_uc_empty(dev) && dev->uc_promisc) { -			__dev_set_promiscuity(dev, -1); -			dev->uc_promisc = 0; +			__dev_set_promiscuity(dev, -1, false); +			dev->uc_promisc = false;  		} - -		if (ops->ndo_set_multicast_list) -			ops->ndo_set_multicast_list(dev);  	} + +	if (ops->ndo_set_rx_mode) +		ops->ndo_set_rx_mode(dev);  }  void dev_set_rx_mode(struct net_device *dev) @@ -4368,9 +5382,9 @@ void dev_set_rx_mode(struct net_device *dev)   *   *	Get the combination of flag bits exported through APIs to userspace.   */ -unsigned dev_get_flags(const struct net_device *dev) +unsigned int dev_get_flags(const struct net_device *dev)  { -	unsigned flags; +	unsigned int flags;  	flags = (dev->flags & ~(IFF_PROMISC |  				IFF_ALLMULTI | @@ -4395,7 +5409,7 @@ EXPORT_SYMBOL(dev_get_flags);  int __dev_change_flags(struct net_device *dev, unsigned int flags)  { -	int old_flags = dev->flags; +	unsigned int old_flags = dev->flags;  	int ret;  	ASSERT_RTNL(); @@ -4435,9 +5449,13 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)  	if ((flags ^ dev->gflags) & IFF_PROMISC) {  		int inc = (flags & IFF_PROMISC) ? 1 : -1; +		unsigned int old_flags = dev->flags;  		dev->gflags ^= IFF_PROMISC; -		dev_set_promiscuity(dev, inc); + +		if (__dev_set_promiscuity(dev, inc, false) >= 0) +			if (dev->flags != old_flags) +				dev_set_rx_mode(dev);  	}  	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI @@ -4448,16 +5466,20 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)  		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;  		dev->gflags ^= IFF_ALLMULTI; -		dev_set_allmulti(dev, inc); +		__dev_set_allmulti(dev, inc, false);  	}  	return ret;  } -void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, +			unsigned int gchanges)  {  	unsigned int changes = dev->flags ^ old_flags; +	if (gchanges) +		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); +  	if (changes & IFF_UP) {  		if (dev->flags & IFF_UP)  			call_netdevice_notifiers(NETDEV_UP, dev); @@ -4466,8 +5488,13 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)  	}  	if (dev->flags & IFF_UP && -	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) -		call_netdevice_notifiers(NETDEV_CHANGE, dev); +	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { +		struct netdev_notifier_change_info change_info; + +		change_info.flags_changed = changes; +		call_netdevice_notifiers_info(NETDEV_CHANGE, dev, +					      &change_info.info); +	}  }  /** @@ -4478,24 +5505,32 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)   *	Change settings on device based state flags. The flags are   *	in the userspace exported format.   
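/*
 * Illustrative sketch, not from this patch: dev_change_flags() (below)
 * takes the userspace-visible flag format, so in-kernel callers normally
 * start from dev_get_flags() and flip bits while holding RTNL.  The
 * helper name is hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int my_set_iface_up(struct net_device *dev, bool up)
{
	unsigned int flags = dev_get_flags(dev);

	ASSERT_RTNL();

	if (up)
		flags |= IFF_UP;
	else
		flags &= ~IFF_UP;

	return dev_change_flags(dev, flags);
}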
*/ -int dev_change_flags(struct net_device *dev, unsigned flags) +int dev_change_flags(struct net_device *dev, unsigned int flags)  { -	int ret, changes; -	int old_flags = dev->flags; +	int ret; +	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;  	ret = __dev_change_flags(dev, flags);  	if (ret < 0)  		return ret; -	changes = old_flags ^ dev->flags; -	if (changes) -		rtmsg_ifinfo(RTM_NEWLINK, dev, changes); - -	__dev_notify_flags(dev, old_flags); +	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); +	__dev_notify_flags(dev, old_flags, changes);  	return ret;  }  EXPORT_SYMBOL(dev_change_flags); +static int __dev_set_mtu(struct net_device *dev, int new_mtu) +{ +	const struct net_device_ops *ops = dev->netdev_ops; + +	if (ops->ndo_change_mtu) +		return ops->ndo_change_mtu(dev, new_mtu); + +	dev->mtu = new_mtu; +	return 0; +} +  /**   *	dev_set_mtu - Change maximum transfer unit   *	@dev: device @@ -4505,8 +5540,7 @@ EXPORT_SYMBOL(dev_change_flags);   */  int dev_set_mtu(struct net_device *dev, int new_mtu)  { -	const struct net_device_ops *ops = dev->netdev_ops; -	int err; +	int err, orig_mtu;  	if (new_mtu == dev->mtu)  		return 0; @@ -4518,19 +5552,41 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)  	if (!netif_device_present(dev))  		return -ENODEV; -	err = 0; -	if (ops->ndo_change_mtu) -		err = ops->ndo_change_mtu(dev, new_mtu); -	else -		dev->mtu = new_mtu; +	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); +	err = notifier_to_errno(err); +	if (err) +		return err; + +	orig_mtu = dev->mtu; +	err = __dev_set_mtu(dev, new_mtu); -	if (!err && dev->flags & IFF_UP) -		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); +	if (!err) { +		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); +		err = notifier_to_errno(err); +		if (err) { +			/* setting mtu back and notifying everyone again, +			 * so that they have a chance to revert changes. 
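/*
 * Illustrative sketch, not from this patch: with the NETDEV_PRECHANGEMTU /
 * NETDEV_CHANGEMTU pair used above, a subscriber can reject an MTU change
 * and rely on dev_set_mtu() to restore the old value.  The 1280-byte
 * floor is only an example policy; the notifier is hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/notifier.h>

static int my_mtu_notifier(struct notifier_block *nb,
			   unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_PRECHANGEMTU:
		/* dev->mtu still holds the old value at this point */
		break;
	case NETDEV_CHANGEMTU:
		/* dev->mtu is the new value; an error here makes
		 * dev_set_mtu() roll back and re-notify.
		 */
		if (dev->mtu < 1280)
			return notifier_from_errno(-EINVAL);
		break;
	}

	return NOTIFY_DONE;
}

static struct notifier_block my_mtu_nb = {
	.notifier_call = my_mtu_notifier,
};
/* registered once with register_netdevice_notifier(&my_mtu_nb) */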
+			 */ +			__dev_set_mtu(dev, orig_mtu); +			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); +		} +	}  	return err;  }  EXPORT_SYMBOL(dev_set_mtu);  /** + *	dev_set_group - Change group this device belongs to + *	@dev: device + *	@new_group: group this device should belong to + */ +void dev_set_group(struct net_device *dev, int new_group) +{ +	dev->group = new_group; +} +EXPORT_SYMBOL(dev_set_group); + +/**   *	dev_set_mac_address - Change Media Access Control Address   *	@dev: device   *	@sa: new address @@ -4549,365 +5605,51 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)  	if (!netif_device_present(dev))  		return -ENODEV;  	err = ops->ndo_set_mac_address(dev, sa); -	if (!err) -		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); -	return err; +	if (err) +		return err; +	dev->addr_assign_type = NET_ADDR_SET; +	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); +	add_device_randomness(dev->dev_addr, dev->addr_len); +	return 0;  }  EXPORT_SYMBOL(dev_set_mac_address); -/* - *	Perform the SIOCxIFxxx calls, inside rcu_read_lock() - */ -static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) -{ -	int err; -	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); - -	if (!dev) -		return -ENODEV; - -	switch (cmd) { -	case SIOCGIFFLAGS:	/* Get interface flags */ -		ifr->ifr_flags = (short) dev_get_flags(dev); -		return 0; - -	case SIOCGIFMETRIC:	/* Get the metric on the interface -				   (currently unused) */ -		ifr->ifr_metric = 0; -		return 0; - -	case SIOCGIFMTU:	/* Get the MTU of a device */ -		ifr->ifr_mtu = dev->mtu; -		return 0; - -	case SIOCGIFHWADDR: -		if (!dev->addr_len) -			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); -		else -			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, -			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); -		ifr->ifr_hwaddr.sa_family = dev->type; -		return 0; - -	case SIOCGIFSLAVE: -		err = -EINVAL; -		break; - -	case SIOCGIFMAP: -		ifr->ifr_map.mem_start = dev->mem_start; -		ifr->ifr_map.mem_end   = dev->mem_end; -		ifr->ifr_map.base_addr = dev->base_addr; -		ifr->ifr_map.irq       = dev->irq; -		ifr->ifr_map.dma       = dev->dma; -		ifr->ifr_map.port      = dev->if_port; -		return 0; - -	case SIOCGIFINDEX: -		ifr->ifr_ifindex = dev->ifindex; -		return 0; - -	case SIOCGIFTXQLEN: -		ifr->ifr_qlen = dev->tx_queue_len; -		return 0; - -	default: -		/* dev_ioctl() should ensure this case -		 * is never reached -		 */ -		WARN_ON(1); -		err = -EINVAL; -		break; - -	} -	return err; -} - -/* - *	Perform the SIOCxIFxxx calls, inside rtnl_lock() +/** + *	dev_change_carrier - Change device carrier + *	@dev: device + *	@new_carrier: new value + * + *	Change device carrier   */ -static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) +int dev_change_carrier(struct net_device *dev, bool new_carrier)  { -	int err; -	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); -	const struct net_device_ops *ops; - -	if (!dev) -		return -ENODEV; - -	ops = dev->netdev_ops; - -	switch (cmd) { -	case SIOCSIFFLAGS:	/* Set interface flags */ -		return dev_change_flags(dev, ifr->ifr_flags); - -	case SIOCSIFMETRIC:	/* Set the metric on the interface -				   (currently unused) */ -		return -EOPNOTSUPP; - -	case SIOCSIFMTU:	/* Set the MTU of a device */ -		return dev_set_mtu(dev, ifr->ifr_mtu); - -	case SIOCSIFHWADDR: -		return dev_set_mac_address(dev, &ifr->ifr_hwaddr); - -	case SIOCSIFHWBROADCAST: -		if (ifr->ifr_hwaddr.sa_family != dev->type) -			return 
-EINVAL; -		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, -		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); -		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); -		return 0; +	const struct net_device_ops *ops = dev->netdev_ops; -	case SIOCSIFMAP: -		if (ops->ndo_set_config) { -			if (!netif_device_present(dev)) -				return -ENODEV; -			return ops->ndo_set_config(dev, &ifr->ifr_map); -		} +	if (!ops->ndo_change_carrier)  		return -EOPNOTSUPP; - -	case SIOCADDMULTI: -		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || -		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC) -			return -EINVAL; -		if (!netif_device_present(dev)) -			return -ENODEV; -		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); - -	case SIOCDELMULTI: -		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || -		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC) -			return -EINVAL; -		if (!netif_device_present(dev)) -			return -ENODEV; -		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); - -	case SIOCSIFTXQLEN: -		if (ifr->ifr_qlen < 0) -			return -EINVAL; -		dev->tx_queue_len = ifr->ifr_qlen; -		return 0; - -	case SIOCSIFNAME: -		ifr->ifr_newname[IFNAMSIZ-1] = '\0'; -		return dev_change_name(dev, ifr->ifr_newname); - -	/* -	 *	Unknown or private ioctl -	 */ -	default: -		if ((cmd >= SIOCDEVPRIVATE && -		    cmd <= SIOCDEVPRIVATE + 15) || -		    cmd == SIOCBONDENSLAVE || -		    cmd == SIOCBONDRELEASE || -		    cmd == SIOCBONDSETHWADDR || -		    cmd == SIOCBONDSLAVEINFOQUERY || -		    cmd == SIOCBONDINFOQUERY || -		    cmd == SIOCBONDCHANGEACTIVE || -		    cmd == SIOCGMIIPHY || -		    cmd == SIOCGMIIREG || -		    cmd == SIOCSMIIREG || -		    cmd == SIOCBRADDIF || -		    cmd == SIOCBRDELIF || -		    cmd == SIOCSHWTSTAMP || -		    cmd == SIOCWANDEV) { -			err = -EOPNOTSUPP; -			if (ops->ndo_do_ioctl) { -				if (netif_device_present(dev)) -					err = ops->ndo_do_ioctl(dev, ifr, cmd); -				else -					err = -ENODEV; -			} -		} else -			err = -EINVAL; - -	} -	return err; +	if (!netif_device_present(dev)) +		return -ENODEV; +	return ops->ndo_change_carrier(dev, new_carrier);  } - -/* - *	This function handles all "interface"-type I/O control requests. The actual - *	'doing' part of this is dev_ifsioc above. - */ +EXPORT_SYMBOL(dev_change_carrier);  /** - *	dev_ioctl	-	network device ioctl - *	@net: the applicable net namespace - *	@cmd: command to issue - *	@arg: pointer to a struct ifreq in user space + *	dev_get_phys_port_id - Get device physical port ID + *	@dev: device + *	@ppid: port ID   * - *	Issue ioctl functions to devices. This is normally called by the - *	user space syscall interfaces but can sometimes be useful for - *	other purposes. The return value is the return from the syscall if - *	positive or a negative errno code on error. + *	Get device physical port ID   */ - -int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) +int dev_get_phys_port_id(struct net_device *dev, +			 struct netdev_phys_port_id *ppid)  { -	struct ifreq ifr; -	int ret; -	char *colon; - -	/* One special case: SIOCGIFCONF takes ifconf argument -	   and requires shared lock, because it sleeps writing -	   to user space. 
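/*
 * Illustrative sketch, not from this patch: dev_change_carrier() above
 * only forwards to ndo_change_carrier; a virtual driver makes that useful
 * by mapping the request onto netif_carrier_on/off.  The callback name is
 * hypothetical.
 */
#include <linux/netdevice.h>

static int my_ndo_change_carrier(struct net_device *dev, bool new_carrier)
{
	if (new_carrier)
		netif_carrier_on(dev);
	else
		netif_carrier_off(dev);

	return 0;
}
/* wired up via .ndo_change_carrier = my_ndo_change_carrier in the ops */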
-	 */ - -	if (cmd == SIOCGIFCONF) { -		rtnl_lock(); -		ret = dev_ifconf(net, (char __user *) arg); -		rtnl_unlock(); -		return ret; -	} -	if (cmd == SIOCGIFNAME) -		return dev_ifname(net, (struct ifreq __user *)arg); - -	if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) -		return -EFAULT; - -	ifr.ifr_name[IFNAMSIZ-1] = 0; - -	colon = strchr(ifr.ifr_name, ':'); -	if (colon) -		*colon = 0; - -	/* -	 *	See which interface the caller is talking about. -	 */ - -	switch (cmd) { -	/* -	 *	These ioctl calls: -	 *	- can be done by all. -	 *	- atomic and do not require locking. -	 *	- return a value -	 */ -	case SIOCGIFFLAGS: -	case SIOCGIFMETRIC: -	case SIOCGIFMTU: -	case SIOCGIFHWADDR: -	case SIOCGIFSLAVE: -	case SIOCGIFMAP: -	case SIOCGIFINDEX: -	case SIOCGIFTXQLEN: -		dev_load(net, ifr.ifr_name); -		rcu_read_lock(); -		ret = dev_ifsioc_locked(net, &ifr, cmd); -		rcu_read_unlock(); -		if (!ret) { -			if (colon) -				*colon = ':'; -			if (copy_to_user(arg, &ifr, -					 sizeof(struct ifreq))) -				ret = -EFAULT; -		} -		return ret; - -	case SIOCETHTOOL: -		dev_load(net, ifr.ifr_name); -		rtnl_lock(); -		ret = dev_ethtool(net, &ifr); -		rtnl_unlock(); -		if (!ret) { -			if (colon) -				*colon = ':'; -			if (copy_to_user(arg, &ifr, -					 sizeof(struct ifreq))) -				ret = -EFAULT; -		} -		return ret; - -	/* -	 *	These ioctl calls: -	 *	- require superuser power. -	 *	- require strict serialization. -	 *	- return a value -	 */ -	case SIOCGMIIPHY: -	case SIOCGMIIREG: -	case SIOCSIFNAME: -		if (!capable(CAP_NET_ADMIN)) -			return -EPERM; -		dev_load(net, ifr.ifr_name); -		rtnl_lock(); -		ret = dev_ifsioc(net, &ifr, cmd); -		rtnl_unlock(); -		if (!ret) { -			if (colon) -				*colon = ':'; -			if (copy_to_user(arg, &ifr, -					 sizeof(struct ifreq))) -				ret = -EFAULT; -		} -		return ret; - -	/* -	 *	These ioctl calls: -	 *	- require superuser power. -	 *	- require strict serialization. -	 *	- do not return a value -	 */ -	case SIOCSIFFLAGS: -	case SIOCSIFMETRIC: -	case SIOCSIFMTU: -	case SIOCSIFMAP: -	case SIOCSIFHWADDR: -	case SIOCSIFSLAVE: -	case SIOCADDMULTI: -	case SIOCDELMULTI: -	case SIOCSIFHWBROADCAST: -	case SIOCSIFTXQLEN: -	case SIOCSMIIREG: -	case SIOCBONDENSLAVE: -	case SIOCBONDRELEASE: -	case SIOCBONDSETHWADDR: -	case SIOCBONDCHANGEACTIVE: -	case SIOCBRADDIF: -	case SIOCBRDELIF: -	case SIOCSHWTSTAMP: -		if (!capable(CAP_NET_ADMIN)) -			return -EPERM; -		/* fall through */ -	case SIOCBONDSLAVEINFOQUERY: -	case SIOCBONDINFOQUERY: -		dev_load(net, ifr.ifr_name); -		rtnl_lock(); -		ret = dev_ifsioc(net, &ifr, cmd); -		rtnl_unlock(); -		return ret; - -	case SIOCGIFMEM: -		/* Get the per device memory space. We can add this but -		 * currently do not support it */ -	case SIOCSIFMEM: -		/* Set the per device memory buffer space. -		 * Not applicable in our case */ -	case SIOCSIFLINK: -		return -EINVAL; +	const struct net_device_ops *ops = dev->netdev_ops; -	/* -	 *	Unknown or private ioctl. 
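/*
 * Illustrative sketch, not from this patch: the userspace side of the
 * SIOCGIFxxx requests removed from dev.c here (the handlers live on in
 * net/core/dev_ioctl.c) is unchanged.  Plain C, error handling trimmed.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

static int print_mtu(const char *ifname)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);

	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
		printf("%s: mtu %d\n", ifname, ifr.ifr_mtu);

	close(fd);
	return 0;
}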
-	 */ -	default: -		if (cmd == SIOCWANDEV || -		    (cmd >= SIOCDEVPRIVATE && -		     cmd <= SIOCDEVPRIVATE + 15)) { -			dev_load(net, ifr.ifr_name); -			rtnl_lock(); -			ret = dev_ifsioc(net, &ifr, cmd); -			rtnl_unlock(); -			if (!ret && copy_to_user(arg, &ifr, -						 sizeof(struct ifreq))) -				ret = -EFAULT; -			return ret; -		} -		/* Take care of Wireless Extensions */ -		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) -			return wext_handle_ioctl(net, &ifr, cmd, arg); -		return -EINVAL; -	} +	if (!ops->ndo_get_phys_port_id) +		return -EOPNOTSUPP; +	return ops->ndo_get_phys_port_id(dev, ppid);  } - +EXPORT_SYMBOL(dev_get_phys_port_id);  /**   *	dev_new_index	-	allocate an ifindex @@ -4919,26 +5661,29 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)   */  static int dev_new_index(struct net *net)  { -	static int ifindex; +	int ifindex = net->ifindex;  	for (;;) {  		if (++ifindex <= 0)  			ifindex = 1;  		if (!__dev_get_by_index(net, ifindex)) -			return ifindex; +			return net->ifindex = ifindex;  	}  }  /* Delayed registration/unregisteration */  static LIST_HEAD(net_todo_list); +DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);  static void net_set_todo(struct net_device *dev)  {  	list_add_tail(&dev->todo_list, &net_todo_list); +	dev_net(dev)->dev_unreg_count++;  }  static void rollback_registered_many(struct list_head *head)  {  	struct net_device *dev, *tmp; +	LIST_HEAD(close_head);  	BUG_ON(dev_boot_phase);  	ASSERT_RTNL(); @@ -4949,19 +5694,23 @@ static void rollback_registered_many(struct list_head *head)  		 * devices and proceed with the remaining.  		 */  		if (dev->reg_state == NETREG_UNINITIALIZED) { -			pr_debug("unregister_netdevice: device %s/%p never " -				 "was registered\n", dev->name, dev); +			pr_debug("unregister_netdevice: device %s/%p never was registered\n", +				 dev->name, dev);  			WARN_ON(1);  			list_del(&dev->unreg_list);  			continue;  		} - +		dev->dismantle = true;  		BUG_ON(dev->reg_state != NETREG_REGISTERED); +	} -		/* If device is running, close it first. */ -		dev_close(dev); +	/* If device is running, close it first. */ +	list_for_each_entry(dev, head, unreg_list) +		list_add_tail(&dev->close_list, &close_head); +	dev_close_many(&close_head); +	list_for_each_entry(dev, head, unreg_list) {  		/* And unlink it from device chain. */  		unlist_netdevice(dev); @@ -4980,10 +5729,6 @@ static void rollback_registered_many(struct list_head *head)  		*/  		call_netdevice_notifiers(NETDEV_UNREGISTER, dev); -		if (!dev->rtnl_link_ops || -		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED) -			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); -  		/*  		 *	Flush the unicast and multicast chains  		 */ @@ -4993,18 +5738,22 @@ static void rollback_registered_many(struct list_head *head)  		if (dev->netdev_ops->ndo_uninit)  			dev->netdev_ops->ndo_uninit(dev); -		/* Notifier chain MUST detach us from master device. */ -		WARN_ON(dev->master); +		if (!dev->rtnl_link_ops || +		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED) +			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + +		/* Notifier chain MUST detach us all upper devices. 
*/ +		WARN_ON(netdev_has_any_upper_dev(dev));  		/* Remove entries from kobject tree */  		netdev_unregister_kobject(dev); +#ifdef CONFIG_XPS +		/* Remove XPS queueing entries */ +		netif_reset_xps_queues_gt(dev, 0); +#endif  	} -	/* Process any work delayed until the end of the batch */ -	dev = list_first_entry(head, struct net_device, unreg_list); -	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); - -	rcu_barrier(); +	synchronize_net();  	list_for_each_entry(dev, head, unreg_list)  		dev_put(dev); @@ -5016,50 +5765,144 @@ static void rollback_registered(struct net_device *dev)  	list_add(&dev->unreg_list, &single);  	rollback_registered_many(&single); +	list_del(&single);  } -unsigned long netdev_fix_features(unsigned long features, const char *name) +static netdev_features_t netdev_fix_features(struct net_device *dev, +	netdev_features_t features)  { -	/* Fix illegal SG+CSUM combinations. */ -	if ((features & NETIF_F_SG) && -	    !(features & NETIF_F_ALL_CSUM)) { -		if (name) -			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no " -			       "checksum feature.\n", name); -		features &= ~NETIF_F_SG; +	/* Fix illegal checksum combinations */ +	if ((features & NETIF_F_HW_CSUM) && +	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { +		netdev_warn(dev, "mixed HW and IP checksum settings.\n"); +		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);  	}  	/* TSO requires that SG is present as well. */ -	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) { -		if (name) -			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no " -			       "SG feature.\n", name); +	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { +		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); +		features &= ~NETIF_F_ALL_TSO; +	} + +	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && +					!(features & NETIF_F_IP_CSUM)) { +		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");  		features &= ~NETIF_F_TSO; +		features &= ~NETIF_F_TSO_ECN; +	} + +	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && +					 !(features & NETIF_F_IPV6_CSUM)) { +		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); +		features &= ~NETIF_F_TSO6;  	} +	/* TSO ECN requires that TSO is present as well. */ +	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) +		features &= ~NETIF_F_TSO_ECN; + +	/* Software GSO depends on SG. */ +	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { +		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); +		features &= ~NETIF_F_GSO; +	} + +	/* UFO needs SG and checksumming */  	if (features & NETIF_F_UFO) {  		/* maybe split UFO into V4 and V6? 
*/  		if (!((features & NETIF_F_GEN_CSUM) ||  		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))  			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { -			if (name) -				printk(KERN_ERR "%s: Dropping NETIF_F_UFO " -				       "since no checksum offload features.\n", -				       name); +			netdev_dbg(dev, +				"Dropping NETIF_F_UFO since no checksum offload features.\n");  			features &= ~NETIF_F_UFO;  		}  		if (!(features & NETIF_F_SG)) { -			if (name) -				printk(KERN_ERR "%s: Dropping NETIF_F_UFO " -				       "since no NETIF_F_SG feature.\n", name); +			netdev_dbg(dev, +				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");  			features &= ~NETIF_F_UFO;  		}  	} +#ifdef CONFIG_NET_RX_BUSY_POLL +	if (dev->netdev_ops->ndo_busy_poll) +		features |= NETIF_F_BUSY_POLL; +	else +#endif +		features &= ~NETIF_F_BUSY_POLL; +  	return features;  } -EXPORT_SYMBOL(netdev_fix_features); + +int __netdev_update_features(struct net_device *dev) +{ +	netdev_features_t features; +	int err = 0; + +	ASSERT_RTNL(); + +	features = netdev_get_wanted_features(dev); + +	if (dev->netdev_ops->ndo_fix_features) +		features = dev->netdev_ops->ndo_fix_features(dev, features); + +	/* driver might be less strict about feature dependencies */ +	features = netdev_fix_features(dev, features); + +	if (dev->features == features) +		return 0; + +	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", +		&dev->features, &features); + +	if (dev->netdev_ops->ndo_set_features) +		err = dev->netdev_ops->ndo_set_features(dev, features); + +	if (unlikely(err < 0)) { +		netdev_err(dev, +			"set_features() failed (%d); wanted %pNF, left %pNF\n", +			err, &features, &dev->features); +		return -1; +	} + +	if (!err) +		dev->features = features; + +	return 1; +} + +/** + *	netdev_update_features - recalculate device features + *	@dev: the device to check + * + *	Recalculate dev->features set and send notifications if it + *	has changed. Should be called after driver or hardware dependent + *	conditions might have changed that influence the features. + */ +void netdev_update_features(struct net_device *dev) +{ +	if (__netdev_update_features(dev)) +		netdev_features_change(dev); +} +EXPORT_SYMBOL(netdev_update_features); + +/** + *	netdev_change_features - recalculate device features + *	@dev: the device to check + * + *	Recalculate dev->features set and send notifications even + *	if they have not changed. Should be called instead of + *	netdev_update_features() if also dev->vlan_features might + *	have changed to allow the changes to be propagated to stacked + *	VLAN devices. 
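/*
 * Illustrative sketch (hypothetical foo_* driver code, not from this patch):
 * the intended calling pattern for netdev_update_features().  A driver that
 * learns an offload is no longer usable clears the bit in dev->hw_features
 * and lets the core re-run the wanted/ndo_fix_features/ndo_set_features
 * pipeline.  The rx_csum_ok parameter is likewise hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static void foo_update_rx_csum(struct net_device *dev, bool rx_csum_ok)
{
	ASSERT_RTNL();		/* __netdev_update_features() requires RTNL */

	if (rx_csum_ok)
		dev->hw_features |= NETIF_F_RXCSUM;
	else
		dev->hw_features &= ~NETIF_F_RXCSUM;

	/* Recomputes dev->features and notifies only if something changed */
	netdev_update_features(dev);
}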
+ */ +void netdev_change_features(struct net_device *dev) +{ +	__netdev_update_features(dev); +	netdev_features_change(dev); +} +EXPORT_SYMBOL(netdev_change_features);  /**   *	netif_stacked_transfer_operstate -	transfer operstate @@ -5088,7 +5931,7 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,  }  EXPORT_SYMBOL(netif_stacked_transfer_operstate); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS  static int netif_alloc_rx_queues(struct net_device *dev)  {  	unsigned int i, count = dev->num_rx_queues; @@ -5097,10 +5940,9 @@ static int netif_alloc_rx_queues(struct net_device *dev)  	BUG_ON(count < 1);  	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); -	if (!rx) { -		pr_err("netdev: Unable to allocate %u rx queues.\n", count); +	if (!rx)  		return -ENOMEM; -	} +  	dev->_rx = rx;  	for (i = 0; i < count; i++) @@ -5116,22 +5958,31 @@ static void netdev_init_one_queue(struct net_device *dev,  	spin_lock_init(&queue->_xmit_lock);  	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);  	queue->xmit_lock_owner = -1; -	netdev_queue_numa_node_write(queue, -1); +	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);  	queue->dev = dev; +#ifdef CONFIG_BQL +	dql_init(&queue->dql, HZ); +#endif +} + +static void netif_free_tx_queues(struct net_device *dev) +{ +	kvfree(dev->_tx);  }  static int netif_alloc_netdev_queues(struct net_device *dev)  {  	unsigned int count = dev->num_tx_queues;  	struct netdev_queue *tx; +	size_t sz = count * sizeof(*tx); -	BUG_ON(count < 1); +	BUG_ON(count < 1 || count > 0xffff); -	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); +	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);  	if (!tx) { -		pr_err("netdev: Unable to allocate %u tx queues.\n", -		       count); -		return -ENOMEM; +		tx = vzalloc(sz); +		if (!tx) +			return -ENOMEM;  	}  	dev->_tx = tx; @@ -5177,6 +6028,10 @@ int register_netdevice(struct net_device *dev)  	dev->iflink = -1; +	ret = dev_get_valid_name(net, dev, dev->name); +	if (ret < 0) +		goto out; +  	/* Init, if this function is available */  	if (dev->netdev_ops->ndo_init) {  		ret = dev->netdev_ops->ndo_init(dev); @@ -5187,40 +6042,46 @@ int register_netdevice(struct net_device *dev)  		}  	} -	ret = dev_get_valid_name(dev, dev->name, 0); -	if (ret) +	if (((dev->hw_features | dev->features) & +	     NETIF_F_HW_VLAN_CTAG_FILTER) && +	    (!dev->netdev_ops->ndo_vlan_rx_add_vid || +	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { +		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); +		ret = -EINVAL; +		goto err_uninit; +	} + +	ret = -EBUSY; +	if (!dev->ifindex) +		dev->ifindex = dev_new_index(net); +	else if (__dev_get_by_index(net, dev->ifindex))  		goto err_uninit; -	dev->ifindex = dev_new_index(net);  	if (dev->iflink == -1)  		dev->iflink = dev->ifindex; -	/* Fix illegal checksum combinations */ -	if ((dev->features & NETIF_F_HW_CSUM) && -	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { -		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n", -		       dev->name); -		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); -	} +	/* Transfer changeable features to wanted_features and enable +	 * software offloads (GSO and GRO). 
+	 */ +	dev->hw_features |= NETIF_F_SOFT_FEATURES; +	dev->features |= NETIF_F_SOFT_FEATURES; +	dev->wanted_features = dev->features & dev->hw_features; -	if ((dev->features & NETIF_F_NO_CSUM) && -	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { -		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n", -		       dev->name); -		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); +	if (!(dev->flags & IFF_LOOPBACK)) { +		dev->hw_features |= NETIF_F_NOCACHE_COPY;  	} -	dev->features = netdev_fix_features(dev->features, dev->name); +	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices. +	 */ +	dev->vlan_features |= NETIF_F_HIGHDMA; -	/* Enable software GSO if SG is supported. */ -	if (dev->features & NETIF_F_SG) -		dev->features |= NETIF_F_GSO; +	/* Make NETIF_F_SG inheritable to tunnel devices. +	 */ +	dev->hw_enc_features |= NETIF_F_SG; -	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default, -	 * vlan_dev_init() will do the dev->features check, so these features -	 * are enabled only if supported by underlying device. +	/* Make NETIF_F_SG inheritable to MPLS.  	 */ -	dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA); +	dev->mpls_features |= NETIF_F_SG;  	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);  	ret = notifier_to_errno(ret); @@ -5232,6 +6093,8 @@ int register_netdevice(struct net_device *dev)  		goto err_uninit;  	dev->reg_state = NETREG_REGISTERED; +	__netdev_update_features(dev); +  	/*  	 *	Default initial state at registry is that the  	 *	device is present. @@ -5239,9 +6102,19 @@ int register_netdevice(struct net_device *dev)  	set_bit(__LINK_STATE_PRESENT, &dev->state); +	linkwatch_init_dev(dev); +  	dev_init_scheduler(dev);  	dev_hold(dev);  	list_netdevice(dev); +	add_device_randomness(dev->dev_addr, dev->addr_len); + +	/* If the device has permanent device address, driver should +	 * set dev_addr and also addr_assign_type should be set to +	 * NET_ADDR_PERM (default value). +	 */ +	if (dev->addr_assign_type == NET_ADDR_PERM) +		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);  	/* Notify protocols, that a new device appeared. */  	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); @@ -5256,7 +6129,7 @@ int register_netdevice(struct net_device *dev)  	 */  	if (!dev->rtnl_link_ops ||  	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED) -		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); +		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);  out:  	return ret; @@ -5327,19 +6200,7 @@ int register_netdev(struct net_device *dev)  	int err;  	rtnl_lock(); - -	/* -	 * If the name is a format string the caller wants us to do a -	 * name allocation. -	 */ -	if (strchr(dev->name, '%')) { -		err = dev_alloc_name(dev, dev->name); -		if (err < 0) -			goto out; -	} -  	err = register_netdevice(dev); -out:  	rtnl_unlock();  	return err;  } @@ -5355,8 +6216,9 @@ int netdev_refcnt_read(const struct net_device *dev)  }  EXPORT_SYMBOL(netdev_refcnt_read); -/* +/**   * netdev_wait_allrefs - wait until all references are gone. + * @dev: target net_device   *   * This is called when unregistering network devices.   
* @@ -5382,9 +6244,12 @@ static void netdev_wait_allrefs(struct net_device *dev)  			/* Rebroadcast unregister notification */  			call_netdevice_notifiers(NETDEV_UNREGISTER, dev); -			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users -			 * should have already handle it the first time */ +			__rtnl_unlock(); +			rcu_barrier(); +			rtnl_lock(); + +			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);  			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,  				     &dev->state)) {  				/* We must not have linkwatch events @@ -5406,10 +6271,8 @@ static void netdev_wait_allrefs(struct net_device *dev)  		refcnt = netdev_refcnt_read(dev);  		if (time_after(jiffies, warning_time + 10 * HZ)) { -			printk(KERN_EMERG "unregister_netdevice: " -			       "waiting for %s to become free. Usage " -			       "count = %d\n", -			       dev->name, refcnt); +			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", +				 dev->name, refcnt);  			warning_time = jiffies;  		}  	} @@ -5448,13 +6311,22 @@ void netdev_run_todo(void)  	__rtnl_unlock(); + +	/* Wait for rcu callbacks to finish before next phase */ +	if (!list_empty(&list)) +		rcu_barrier(); +  	while (!list_empty(&list)) {  		struct net_device *dev  			= list_first_entry(&list, struct net_device, todo_list);  		list_del(&dev->todo_list); +		rtnl_lock(); +		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); +		__rtnl_unlock(); +  		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { -			printk(KERN_ERR "network todo '%s' but state %d\n", +			pr_err("network todo '%s' but state %d\n",  			       dev->name, dev->reg_state);  			dump_stack();  			continue; @@ -5468,55 +6340,33 @@ void netdev_run_todo(void)  		/* paranoia */  		BUG_ON(netdev_refcnt_read(dev)); -		WARN_ON(rcu_dereference_raw(dev->ip_ptr)); -		WARN_ON(rcu_dereference_raw(dev->ip6_ptr)); +		WARN_ON(rcu_access_pointer(dev->ip_ptr)); +		WARN_ON(rcu_access_pointer(dev->ip6_ptr));  		WARN_ON(dev->dn_ptr);  		if (dev->destructor)  			dev->destructor(dev); +		/* Report a network device has been unregistered */ +		rtnl_lock(); +		dev_net(dev)->dev_unreg_count--; +		__rtnl_unlock(); +		wake_up(&netdev_unregistering_wq); +  		/* Free network device */  		kobject_put(&dev->dev.kobj);  	}  } -/** - *	dev_txq_stats_fold - fold tx_queues stats - *	@dev: device to get statistics from - *	@stats: struct rtnl_link_stats64 to hold results - */ -void dev_txq_stats_fold(const struct net_device *dev, -			struct rtnl_link_stats64 *stats) -{ -	u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0; -	unsigned int i; -	struct netdev_queue *txq; - -	for (i = 0; i < dev->num_tx_queues; i++) { -		txq = netdev_get_tx_queue(dev, i); -		spin_lock_bh(&txq->_xmit_lock); -		tx_bytes   += txq->tx_bytes; -		tx_packets += txq->tx_packets; -		tx_dropped += txq->tx_dropped; -		spin_unlock_bh(&txq->_xmit_lock); -	} -	if (tx_bytes || tx_packets || tx_dropped) { -		stats->tx_bytes   = tx_bytes; -		stats->tx_packets = tx_packets; -		stats->tx_dropped = tx_dropped; -	} -} -EXPORT_SYMBOL(dev_txq_stats_fold); -  /* Convert net_device_stats to rtnl_link_stats64.  They have the same   * fields in the same order, with only the type differing.   
*/ -static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, -				    const struct net_device_stats *netdev_stats) +void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, +			     const struct net_device_stats *netdev_stats)  {  #if BITS_PER_LONG == 64 -        BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); -        memcpy(stats64, netdev_stats, sizeof(*stats64)); +	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); +	memcpy(stats64, netdev_stats, sizeof(*stats64));  #else  	size_t i, n = sizeof(*stats64) / sizeof(u64);  	const unsigned long *src = (const unsigned long *)netdev_stats; @@ -5528,6 +6378,7 @@ static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,  		dst[i] = src[i];  #endif  } +EXPORT_SYMBOL(netdev_stats_to_stats64);  /**   *	dev_get_stats	- get network device statistics @@ -5551,9 +6402,9 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,  		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));  	} else {  		netdev_stats_to_stats64(storage, &dev->stats); -		dev_txq_stats_fold(dev, storage);  	}  	storage->rx_dropped += atomic_long_read(&dev->rx_dropped); +	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);  	return storage;  }  EXPORT_SYMBOL(dev_get_stats); @@ -5576,19 +6427,38 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)  	return queue;  } +static const struct ethtool_ops default_ethtool_ops; + +void netdev_set_default_ethtool_ops(struct net_device *dev, +				    const struct ethtool_ops *ops) +{ +	if (dev->ethtool_ops == &default_ethtool_ops) +		dev->ethtool_ops = ops; +} +EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); + +void netdev_freemem(struct net_device *dev) +{ +	char *addr = (char *)dev - dev->padded; + +	kvfree(addr); +} +  /** - *	alloc_netdev_mq - allocate network device + *	alloc_netdev_mqs - allocate network device   *	@sizeof_priv:	size of private data to allocate space for   *	@name:		device name format string   *	@setup:		callback to initialize device - *	@queue_count:	the number of subqueues to allocate + *	@txqs:		the number of TX subqueues to allocate + *	@rxqs:		the number of RX subqueues to allocate   *   *	Allocates a struct net_device with private data area for driver use - *	and performs basic initialization.  Also allocates subquue structs - *	for each queue on the device at the end of the netdevice. + *	and performs basic initialization.  Also allocates subqueue structs + *	for each queue on the device.   
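/*
 * Illustrative sketch (hypothetical foo_* driver, not from this patch):
 * allocating a multiqueue device with the alloc_netdev_mqs() signature
 * introduced here, which takes separate TX and RX queue counts.
 */
#include <linux/netdevice.h>

static struct net_device *foo_create(unsigned int ntxq, unsigned int nrxq)
{
	struct net_device *dev;

	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
			       foo_setup, ntxq, nrxq);
	if (!dev)
		return NULL;

	/* fill in dev->netdev_ops, features, etc., then register_netdev(dev) */
	return dev;
}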
*/ -struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, -		void (*setup)(struct net_device *), unsigned int queue_count) +struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, +		void (*setup)(struct net_device *), +		unsigned int txqs, unsigned int rxqs)  {  	struct net_device *dev;  	size_t alloc_size; @@ -5596,12 +6466,18 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,  	BUG_ON(strlen(name) >= sizeof(dev->name)); -	if (queue_count < 1) { -		pr_err("alloc_netdev: Unable to allocate device " -		       "with zero queues.\n"); +	if (txqs < 1) { +		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");  		return NULL;  	} +#ifdef CONFIG_SYSFS +	if (rxqs < 1) { +		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); +		return NULL; +	} +#endif +  	alloc_size = sizeof(struct net_device);  	if (sizeof_priv) {  		/* ensure 32-byte alignment of private area */ @@ -5611,18 +6487,18 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,  	/* ensure 32-byte alignment of whole construct */  	alloc_size += NETDEV_ALIGN - 1; -	p = kzalloc(alloc_size, GFP_KERNEL); -	if (!p) { -		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n"); +	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); +	if (!p) +		p = vzalloc(alloc_size); +	if (!p)  		return NULL; -	}  	dev = PTR_ALIGN(p, NETDEV_ALIGN);  	dev->padded = (char *)dev - (char *)p;  	dev->pcpu_refcnt = alloc_percpu(int);  	if (!dev->pcpu_refcnt) -		goto free_p; +		goto free_dev;  	if (dev_addr_init(dev))  		goto free_pcpu; @@ -5632,42 +6508,49 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,  	dev_net_set(dev, &init_net); -	dev->num_tx_queues = queue_count; -	dev->real_num_tx_queues = queue_count; -	if (netif_alloc_netdev_queues(dev)) -		goto free_pcpu; - -#ifdef CONFIG_RPS -	dev->num_rx_queues = queue_count; -	dev->real_num_rx_queues = queue_count; -	if (netif_alloc_rx_queues(dev)) -		goto free_pcpu; -#endif -  	dev->gso_max_size = GSO_MAX_SIZE; +	dev->gso_max_segs = GSO_MAX_SEGS; -	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list); -	dev->ethtool_ntuple_list.count = 0;  	INIT_LIST_HEAD(&dev->napi_list);  	INIT_LIST_HEAD(&dev->unreg_list); +	INIT_LIST_HEAD(&dev->close_list);  	INIT_LIST_HEAD(&dev->link_watch_list); +	INIT_LIST_HEAD(&dev->adj_list.upper); +	INIT_LIST_HEAD(&dev->adj_list.lower); +	INIT_LIST_HEAD(&dev->all_adj_list.upper); +	INIT_LIST_HEAD(&dev->all_adj_list.lower);  	dev->priv_flags = IFF_XMIT_DST_RELEASE;  	setup(dev); + +	dev->num_tx_queues = txqs; +	dev->real_num_tx_queues = txqs; +	if (netif_alloc_netdev_queues(dev)) +		goto free_all; + +#ifdef CONFIG_SYSFS +	dev->num_rx_queues = rxqs; +	dev->real_num_rx_queues = rxqs; +	if (netif_alloc_rx_queues(dev)) +		goto free_all; +#endif +  	strcpy(dev->name, name); +	dev->group = INIT_NETDEV_GROUP; +	if (!dev->ethtool_ops) +		dev->ethtool_ops = &default_ethtool_ops;  	return dev; +free_all: +	free_netdev(dev); +	return NULL; +  free_pcpu:  	free_percpu(dev->pcpu_refcnt); -	kfree(dev->_tx); -#ifdef CONFIG_RPS -	kfree(dev->_rx); -#endif - -free_p: -	kfree(p); +free_dev: +	netdev_freemem(dev);  	return NULL;  } -EXPORT_SYMBOL(alloc_netdev_mq); +EXPORT_SYMBOL(alloc_netdev_mqs);  /**   *	free_netdev - free network device @@ -5683,19 +6566,16 @@ void free_netdev(struct net_device *dev)  	release_net(dev_net(dev)); -	kfree(dev->_tx); -#ifdef CONFIG_RPS +	netif_free_tx_queues(dev); +#ifdef CONFIG_SYSFS  	kfree(dev->_rx);  #endif -	
kfree(rcu_dereference_raw(dev->ingress_queue)); +	kfree(rcu_dereference_protected(dev->ingress_queue, 1));  	/* Flush device addresses */  	dev_addr_flush(dev); -	/* Clear ethtool n-tuple list */ -	ethtool_ntuple_flush(dev); -  	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)  		netif_napi_del(p); @@ -5704,7 +6584,7 @@ void free_netdev(struct net_device *dev)  	/*  Compatibility with error handling in drivers */  	if (dev->reg_state == NETREG_UNINITIALIZED) { -		kfree((char *)dev - dev->padded); +		netdev_freemem(dev);  		return;  	} @@ -5725,7 +6605,10 @@ EXPORT_SYMBOL(free_netdev);  void synchronize_net(void)  {  	might_sleep(); -	synchronize_rcu(); +	if (rtnl_is_locked()) +		synchronize_rcu_expedited(); +	else +		synchronize_rcu();  }  EXPORT_SYMBOL(synchronize_net); @@ -5759,6 +6642,9 @@ EXPORT_SYMBOL(unregister_netdevice_queue);  /**   *	unregister_netdevice_many - unregister many devices   *	@head: list of devices + * + *  Note: As most callers use a stack allocated list_head, + *  we force a list_del() to make sure stack wont be corrupted later.   */  void unregister_netdevice_many(struct list_head *head)  { @@ -5768,6 +6654,7 @@ void unregister_netdevice_many(struct list_head *head)  		rollback_registered_many(head);  		list_for_each_entry(dev, head, unreg_list)  			net_set_todo(dev); +		list_del(head);  	}  }  EXPORT_SYMBOL(unregister_netdevice_many); @@ -5817,7 +6704,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  		goto out;  	/* Ensure the device has been registrered */ -	err = -EINVAL;  	if (dev->reg_state != NETREG_REGISTERED)  		goto out; @@ -5834,7 +6720,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  		/* We get here if we can't use the current device name */  		if (!pat)  			goto out; -		if (dev_get_valid_name(dev, pat, 1)) +		if (dev_get_valid_name(net, dev, pat) < 0)  			goto out;  	} @@ -5862,7 +6748,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  	   the device is just moving and can keep their slaves up.  	*/  	call_netdevice_notifiers(NETDEV_UNREGISTER, dev); -	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); +	rcu_barrier(); +	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); +	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);  	/*  	 *	Flush the unicast and multicast chains @@ -5870,6 +6758,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  	dev_uc_flush(dev);  	dev_mc_flush(dev); +	/* Send a netdev-removed uevent to the old namespace */ +	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); +  	/* Actually switch the network namespace */  	dev_net_set(dev, net); @@ -5881,6 +6772,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  			dev->iflink = dev->ifindex;  	} +	/* Send a netdev-add uevent to the new namespace */ +	kobject_uevent(&dev->dev.kobj, KOBJ_ADD); +  	/* Fixup kobjects */  	err = device_rename(&dev->dev, dev->name);  	WARN_ON(err); @@ -5895,7 +6789,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char  	 *	Prevent userspace races by waiting until the network  	 *	device is fully setup before sending notifications.  	 
*/ -	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); +	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);  	synchronize_net();  	err = 0; @@ -5936,17 +6830,22 @@ static int dev_cpu_callback(struct notifier_block *nfb,  		oldsd->output_queue = NULL;  		oldsd->output_queue_tailp = &oldsd->output_queue;  	} +	/* Append NAPI poll list from offline CPU. */ +	if (!list_empty(&oldsd->poll_list)) { +		list_splice_init(&oldsd->poll_list, &sd->poll_list); +		raise_softirq_irqoff(NET_RX_SOFTIRQ); +	}  	raise_softirq_irqoff(NET_TX_SOFTIRQ);  	local_irq_enable();  	/* Process offline CPU's input_pkt_queue */  	while ((skb = __skb_dequeue(&oldsd->process_queue))) { -		netif_rx(skb); +		netif_rx_internal(skb);  		input_queue_head_incr(oldsd);  	}  	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { -		netif_rx(skb); +		netif_rx_internal(skb);  		input_queue_head_incr(oldsd);  	} @@ -5964,38 +6863,25 @@ static int dev_cpu_callback(struct notifier_block *nfb,   *	@one to the master device with current feature set @all.  Will not   *	enable anything that is off in @mask. Returns the new feature set.   */ -unsigned long netdev_increment_features(unsigned long all, unsigned long one, -					unsigned long mask) -{ -	/* If device needs checksumming, downgrade to it. */ -	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) -		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM); -	else if (mask & NETIF_F_ALL_CSUM) { -		/* If one device supports v4/v6 checksumming, set for all. */ -		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) && -		    !(all & NETIF_F_GEN_CSUM)) { -			all &= ~NETIF_F_ALL_CSUM; -			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); -		} - -		/* If one device supports hw checksumming, set for all. */ -		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) { -			all &= ~NETIF_F_ALL_CSUM; -			all |= NETIF_F_HW_CSUM; -		} -	} +netdev_features_t netdev_increment_features(netdev_features_t all, +	netdev_features_t one, netdev_features_t mask) +{ +	if (mask & NETIF_F_GEN_CSUM) +		mask |= NETIF_F_ALL_CSUM; +	mask |= NETIF_F_VLAN_CHALLENGED; -	one |= NETIF_F_ALL_CSUM; +	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; +	all &= one | ~NETIF_F_ALL_FOR_ALL; -	one |= all & NETIF_F_ONE_FOR_ALL; -	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO; -	all |= one & mask & NETIF_F_ONE_FOR_ALL; +	/* If one device supports hw checksumming, set for all. */ +	if (all & NETIF_F_GEN_CSUM) +		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);  	return all;  }  EXPORT_SYMBOL(netdev_increment_features); -static struct hlist_head *netdev_create_hash(void) +static struct hlist_head * __net_init netdev_create_hash(void)  {  	int i;  	struct hlist_head *hash; @@ -6011,7 +6897,8 @@ static struct hlist_head *netdev_create_hash(void)  /* Initialize per network namespace state */  static int __net_init netdev_init(struct net *net)  { -	INIT_LIST_HEAD(&net->dev_base_head); +	if (net != &init_net) +		INIT_LIST_HEAD(&net->dev_base_head);  	net->dev_name_head = netdev_create_hash();  	if (net->dev_name_head == NULL) @@ -6032,29 +6919,23 @@ err_name:  /**   *	netdev_drivername - network driver for the device   *	@dev: network device - *	@buffer: buffer for resulting name - *	@len: size of buffer   *   *	Determine network driver for device.   
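/*
 * Illustrative sketch (hypothetical foo_* helper, not from this patch):
 * with the new prototype, netdev_drivername() returns a const string
 * instead of filling a caller-supplied buffer, so it can be used inline.
 */
#include <linux/netdevice.h>

static void foo_report_tx_timeout(struct net_device *dev, unsigned int txq)
{
	netdev_err(dev, "transmit queue %u timed out (driver: %s)\n",
		   txq, netdev_drivername(dev));
}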
*/ -char *netdev_drivername(const struct net_device *dev, char *buffer, int len) +const char *netdev_drivername(const struct net_device *dev)  {  	const struct device_driver *driver;  	const struct device *parent; - -	if (len <= 0 || !buffer) -		return buffer; -	buffer[0] = 0; +	const char *empty = "";  	parent = dev->dev.parent; -  	if (!parent) -		return buffer; +		return empty;  	driver = parent->driver;  	if (driver && driver->name) -		strlcpy(buffer, driver->name, len); -	return buffer; +		return driver->name; +	return empty;  }  static int __netdev_printk(const char *level, const struct net_device *dev, @@ -6062,13 +6943,18 @@ static int __netdev_printk(const char *level, const struct net_device *dev,  {  	int r; -	if (dev && dev->dev.parent) -		r = dev_printk(level, dev->dev.parent, "%s: %pV", -			       netdev_name(dev), vaf); -	else if (dev) +	if (dev && dev->dev.parent) { +		r = dev_printk_emit(level[1] - '0', +				    dev->dev.parent, +				    "%s %s %s: %pV", +				    dev_driver_string(dev->dev.parent), +				    dev_name(dev->dev.parent), +				    netdev_name(dev), vaf); +	} else if (dev) {  		r = printk("%s%s: %pV", level, netdev_name(dev), vaf); -	else +	} else {  		r = printk("%s(NULL net_device): %pV", level, vaf); +	}  	return r;  } @@ -6086,6 +6972,7 @@ int netdev_printk(const char *level, const struct net_device *dev,  	vaf.va = &args;  	r = __netdev_printk(level, dev, &vaf); +  	va_end(args);  	return r; @@ -6105,6 +6992,7 @@ int func(const struct net_device *dev, const char *fmt, ...)	\  	vaf.va = &args;						\  								\  	r = __netdev_printk(level, dev, &vaf);			\ +								\  	va_end(args);						\  								\  	return r;						\ @@ -6150,22 +7038,50 @@ static void __net_exit default_device_exit(struct net *net)  		if (dev->rtnl_link_ops)  			continue; -		/* Push remaing network devices to init_net */ +		/* Push remaining network devices to init_net */  		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);  		err = dev_change_net_namespace(dev, &init_net, fb_name);  		if (err) { -			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n", -				__func__, dev->name, err); +			pr_emerg("%s: failed to move %s to init_net: %d\n", +				 __func__, dev->name, err);  			BUG();  		}  	}  	rtnl_unlock();  } +static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) +{ +	/* Return with the rtnl_lock held when there are no network +	 * devices unregistering in any network namespace in net_list. +	 */ +	struct net *net; +	bool unregistering; +	DEFINE_WAIT(wait); + +	for (;;) { +		prepare_to_wait(&netdev_unregistering_wq, &wait, +				TASK_UNINTERRUPTIBLE); +		unregistering = false; +		rtnl_lock(); +		list_for_each_entry(net, net_list, exit_list) { +			if (net->dev_unreg_count > 0) { +				unregistering = true; +				break; +			} +		} +		if (!unregistering) +			break; +		__rtnl_unlock(); +		schedule(); +	} +	finish_wait(&netdev_unregistering_wq, &wait); +} +  static void __net_exit default_device_exit_batch(struct list_head *net_list)  {  	/* At exit all network devices most be removed from a network -	 * namespace.  Do this in the reverse order of registeration. +	 * namespace.  Do this in the reverse order of registration.  	 * Do this across as many network namespaces as possible to  	 * improve batching efficiency.  	 
*/ @@ -6173,7 +7089,18 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)  	struct net *net;  	LIST_HEAD(dev_kill_list); -	rtnl_lock(); +	/* To prevent network device cleanup code from dereferencing +	 * loopback devices or network devices that have been freed +	 * wait here for all pending unregistrations to complete, +	 * before unregistring the loopback device and allowing the +	 * network namespace be freed. +	 * +	 * The netdev todo list containing all network devices +	 * unregistrations that happen in default_device_exit_batch +	 * will run in the rtnl_unlock() at the end of +	 * default_device_exit_batch. +	 */ +	rtnl_lock_unregistering(net_list);  	list_for_each_entry(net, net_list, exit_list) {  		for_each_netdev_reverse(net, dev) {  			if (dev->rtnl_link_ops) @@ -6218,6 +7145,8 @@ static int __init net_dev_init(void)  	for (i = 0; i < PTYPE_HASH_SIZE; i++)  		INIT_LIST_HEAD(&ptype_base[i]); +	INIT_LIST_HEAD(&offload_base); +  	if (register_pernet_subsys(&netdev_net_ops))  		goto out; @@ -6228,24 +7157,18 @@ static int __init net_dev_init(void)  	for_each_possible_cpu(i) {  		struct softnet_data *sd = &per_cpu(softnet_data, i); -		memset(sd, 0, sizeof(*sd));  		skb_queue_head_init(&sd->input_pkt_queue);  		skb_queue_head_init(&sd->process_queue); -		sd->completion_queue = NULL;  		INIT_LIST_HEAD(&sd->poll_list); -		sd->output_queue = NULL;  		sd->output_queue_tailp = &sd->output_queue;  #ifdef CONFIG_RPS  		sd->csd.func = rps_trigger_softirq;  		sd->csd.info = sd; -		sd->csd.flags = 0;  		sd->cpu = i;  #endif  		sd->backlog.poll = process_backlog;  		sd->backlog.weight = weight_p; -		sd->backlog.gro_list = NULL; -		sd->backlog.gro_count = 0;  	}  	dev_boot_phase = 0; @@ -6270,19 +7193,9 @@ static int __init net_dev_init(void)  	hotcpu_notifier(dev_cpu_callback, 0);  	dst_init(); -	dev_mcast_init();  	rc = 0;  out:  	return rc;  }  subsys_initcall(net_dev_init); - -static int __init initialize_hashrnd(void) -{ -	get_random_bytes(&hashrnd, sizeof(hashrnd)); -	return 0; -} - -late_initcall_sync(initialize_hashrnd); - diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 508f9c18992..b6b230600b9 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -13,20 +13,46 @@  #include <linux/netdevice.h>  #include <linux/rtnetlink.h> +#include <linux/export.h>  #include <linux/list.h> -#include <linux/proc_fs.h>  /*   * General list handling functions   */ -static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, -			    unsigned char *addr, int addr_len, -			    unsigned char addr_type, bool global) +static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, +			       const unsigned char *addr, int addr_len, +			       unsigned char addr_type, bool global, +			       bool sync)  {  	struct netdev_hw_addr *ha;  	int alloc_size; +	alloc_size = sizeof(*ha); +	if (alloc_size < L1_CACHE_BYTES) +		alloc_size = L1_CACHE_BYTES; +	ha = kmalloc(alloc_size, GFP_ATOMIC); +	if (!ha) +		return -ENOMEM; +	memcpy(ha->addr, addr, addr_len); +	ha->type = addr_type; +	ha->refcount = 1; +	ha->global_use = global; +	ha->synced = sync ? 
1 : 0; +	ha->sync_cnt = 0; +	list_add_tail_rcu(&ha->list, &list->list); +	list->count++; + +	return 0; +} + +static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, +			    const unsigned char *addr, int addr_len, +			    unsigned char addr_type, bool global, bool sync, +			    int sync_count) +{ +	struct netdev_hw_addr *ha; +  	if (addr_len > MAX_ADDR_LEN)  		return -EINVAL; @@ -40,115 +66,133 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,  				else  					ha->global_use = true;  			} +			if (sync) { +				if (ha->synced && sync_count) +					return -EEXIST; +				else +					ha->synced++; +			}  			ha->refcount++;  			return 0;  		}  	} - -	alloc_size = sizeof(*ha); -	if (alloc_size < L1_CACHE_BYTES) -		alloc_size = L1_CACHE_BYTES; -	ha = kmalloc(alloc_size, GFP_ATOMIC); -	if (!ha) -		return -ENOMEM; -	memcpy(ha->addr, addr, addr_len); -	ha->type = addr_type; -	ha->refcount = 1; -	ha->global_use = global; -	ha->synced = false; -	list_add_tail_rcu(&ha->list, &list->list); -	list->count++; -	return 0; +	return __hw_addr_create_ex(list, addr, addr_len, addr_type, global, +				   sync);  } -static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr, -			 int addr_len, unsigned char addr_type) +static int __hw_addr_add(struct netdev_hw_addr_list *list, +			 const unsigned char *addr, int addr_len, +			 unsigned char addr_type)  { -	return __hw_addr_add_ex(list, addr, addr_len, addr_type, false); +	return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false, +				0);  } -static void ha_rcu_free(struct rcu_head *head) +static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, +			       struct netdev_hw_addr *ha, bool global, +			       bool sync)  { -	struct netdev_hw_addr *ha; +	if (global && !ha->global_use) +		return -ENOENT; + +	if (sync && !ha->synced) +		return -ENOENT; -	ha = container_of(head, struct netdev_hw_addr, rcu_head); -	kfree(ha); +	if (global) +		ha->global_use = false; + +	if (sync) +		ha->synced--; + +	if (--ha->refcount) +		return 0; +	list_del_rcu(&ha->list); +	kfree_rcu(ha, rcu_head); +	list->count--; +	return 0;  }  static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, -			    unsigned char *addr, int addr_len, -			    unsigned char addr_type, bool global) +			    const unsigned char *addr, int addr_len, +			    unsigned char addr_type, bool global, bool sync)  {  	struct netdev_hw_addr *ha;  	list_for_each_entry(ha, &list->list, list) {  		if (!memcmp(ha->addr, addr, addr_len) && -		    (ha->type == addr_type || !addr_type)) { -			if (global) { -				if (!ha->global_use) -					break; -				else -					ha->global_use = false; -			} -			if (--ha->refcount) -				return 0; -			list_del_rcu(&ha->list); -			call_rcu(&ha->rcu_head, ha_rcu_free); -			list->count--; -			return 0; -		} +		    (ha->type == addr_type || !addr_type)) +			return __hw_addr_del_entry(list, ha, global, sync);  	}  	return -ENOENT;  } -static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr, -			 int addr_len, unsigned char addr_type) +static int __hw_addr_del(struct netdev_hw_addr_list *list, +			 const unsigned char *addr, int addr_len, +			 unsigned char addr_type)  { -	return __hw_addr_del_ex(list, addr, addr_len, addr_type, false); +	return __hw_addr_del_ex(list, addr, addr_len, addr_type, false, false);  } -int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, -			   struct netdev_hw_addr_list *from_list, -			   int addr_len, unsigned char addr_type) +static int __hw_addr_sync_one(struct 
netdev_hw_addr_list *to_list, +			       struct netdev_hw_addr *ha, +			       int addr_len)  {  	int err; -	struct netdev_hw_addr *ha, *ha2; -	unsigned char type; -	list_for_each_entry(ha, &from_list->list, list) { -		type = addr_type ? addr_type : ha->type; -		err = __hw_addr_add(to_list, ha->addr, addr_len, type); -		if (err) -			goto unroll; +	err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type, +			       false, true, ha->sync_cnt); +	if (err && err != -EEXIST) +		return err; + +	if (!err) { +		ha->sync_cnt++; +		ha->refcount++;  	} +  	return 0; +} -unroll: -	list_for_each_entry(ha2, &from_list->list, list) { -		if (ha2 == ha) -			break; -		type = addr_type ? addr_type : ha2->type; -		__hw_addr_del(to_list, ha2->addr, addr_len, type); -	} -	return err; +static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list, +				 struct netdev_hw_addr_list *from_list, +				 struct netdev_hw_addr *ha, +				 int addr_len) +{ +	int err; + +	err = __hw_addr_del_ex(to_list, ha->addr, addr_len, ha->type, +			       false, true); +	if (err) +		return; +	ha->sync_cnt--; +	/* address on from list is not marked synced */ +	__hw_addr_del_entry(from_list, ha, false, false);  } -EXPORT_SYMBOL(__hw_addr_add_multiple); -void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, -			    struct netdev_hw_addr_list *from_list, -			    int addr_len, unsigned char addr_type) +static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, +				   struct netdev_hw_addr_list *from_list, +				   int addr_len)  { -	struct netdev_hw_addr *ha; -	unsigned char type; +	int err = 0; +	struct netdev_hw_addr *ha, *tmp; -	list_for_each_entry(ha, &from_list->list, list) { -		type = addr_type ? addr_type : ha->type; -		__hw_addr_del(to_list, ha->addr, addr_len, addr_type); +	list_for_each_entry_safe(ha, tmp, &from_list->list, list) { +		if (ha->sync_cnt == ha->refcount) { +			__hw_addr_unsync_one(to_list, from_list, ha, addr_len); +		} else { +			err = __hw_addr_sync_one(to_list, ha, addr_len); +			if (err) +				break; +		}  	} +	return err;  } -EXPORT_SYMBOL(__hw_addr_del_multiple); +/* This function only works where there is a strict 1-1 relationship + * between source and destionation of they synch. If you ever need to + * sync addresses to more then 1 destination, you need to use + * __hw_addr_sync_multiple(). 
+ */  int __hw_addr_sync(struct netdev_hw_addr_list *to_list,  		   struct netdev_hw_addr_list *from_list,  		   int addr_len) @@ -157,17 +201,12 @@ int __hw_addr_sync(struct netdev_hw_addr_list *to_list,  	struct netdev_hw_addr *ha, *tmp;  	list_for_each_entry_safe(ha, tmp, &from_list->list, list) { -		if (!ha->synced) { -			err = __hw_addr_add(to_list, ha->addr, -					    addr_len, ha->type); +		if (!ha->sync_cnt) { +			err = __hw_addr_sync_one(to_list, ha, addr_len);  			if (err)  				break; -			ha->synced = true; -			ha->refcount++; -		} else if (ha->refcount == 1) { -			__hw_addr_del(to_list, ha->addr, addr_len, ha->type); -			__hw_addr_del(from_list, ha->addr, addr_len, ha->type); -		} +		} else if (ha->refcount == 1) +			__hw_addr_unsync_one(to_list, from_list, ha, addr_len);  	}  	return err;  } @@ -180,28 +219,107 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,  	struct netdev_hw_addr *ha, *tmp;  	list_for_each_entry_safe(ha, tmp, &from_list->list, list) { -		if (ha->synced) { -			__hw_addr_del(to_list, ha->addr, -				      addr_len, ha->type); -			ha->synced = false; -			__hw_addr_del(from_list, ha->addr, -				      addr_len, ha->type); -		} +		if (ha->sync_cnt) +			__hw_addr_unsync_one(to_list, from_list, ha, addr_len);  	}  }  EXPORT_SYMBOL(__hw_addr_unsync); -void __hw_addr_flush(struct netdev_hw_addr_list *list) +/** + *  __hw_addr_sync_dev - Synchonize device's multicast list + *  @list: address list to syncronize + *  @dev:  device to sync + *  @sync: function to call if address should be added + *  @unsync: function to call if address should be removed + * + *  This funciton is intended to be called from the ndo_set_rx_mode + *  function of devices that require explicit address add/remove + *  notifications.  The unsync function may be NULL in which case + *  the addresses requiring removal will simply be removed without + *  any notification to the device. + **/ +int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, +		       struct net_device *dev, +		       int (*sync)(struct net_device *, const unsigned char *), +		       int (*unsync)(struct net_device *, +				     const unsigned char *)) +{ +	struct netdev_hw_addr *ha, *tmp; +	int err; + +	/* first go through and flush out any stale entries */ +	list_for_each_entry_safe(ha, tmp, &list->list, list) { +		if (!ha->sync_cnt || ha->refcount != 1) +			continue; + +		/* if unsync is defined and fails defer unsyncing address */ +		if (unsync && unsync(dev, ha->addr)) +			continue; + +		ha->sync_cnt--; +		__hw_addr_del_entry(list, ha, false, false); +	} + +	/* go through and sync new entries to the list */ +	list_for_each_entry_safe(ha, tmp, &list->list, list) { +		if (ha->sync_cnt) +			continue; + +		err = sync(dev, ha->addr); +		if (err) +			return err; + +		ha->sync_cnt++; +		ha->refcount++; +	} + +	return 0; +} +EXPORT_SYMBOL(__hw_addr_sync_dev); + +/** + *  __hw_addr_unsync_dev - Remove synchonized addresses from device + *  @list: address list to remove syncronized addresses from + *  @dev:  device to sync + *  @unsync: function to call if address should be removed + * + *  Remove all addresses that were added to the device by __hw_addr_sync_dev(). + *  This function is intended to be called from the ndo_stop or ndo_open + *  functions on devices that require explicit address add/remove + *  notifications.  If the unsync function pointer is NULL then this function + *  can be used to just reset the sync_cnt for the addresses in the list. 
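/*
 * Illustrative driver usage of __hw_addr_sync_dev()/__hw_addr_unsync_dev()
 * (hypothetical foo_* driver and foo_hw_set_mc_filter() helper, not from
 * this patch): the device's multicast list is pushed to hardware filters
 * from ndo_set_rx_mode and torn down again on stop.
 */
#include <linux/netdevice.h>

static int foo_mc_add(struct net_device *dev, const unsigned char *addr)
{
	return foo_hw_set_mc_filter(netdev_priv(dev), addr, true);  /* hypothetical */
}

static int foo_mc_del(struct net_device *dev, const unsigned char *addr)
{
	return foo_hw_set_mc_filter(netdev_priv(dev), addr, false); /* hypothetical */
}

static void foo_set_rx_mode(struct net_device *dev)	/* .ndo_set_rx_mode */
{
	__hw_addr_sync_dev(&dev->mc, dev, foo_mc_add, foo_mc_del);
}

static int foo_stop(struct net_device *dev)		/* .ndo_stop */
{
	__hw_addr_unsync_dev(&dev->mc, dev, foo_mc_del);
	return 0;
}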
+ **/ +void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, +			  struct net_device *dev, +			  int (*unsync)(struct net_device *, +					const unsigned char *)) +{ +	struct netdev_hw_addr *ha, *tmp; + +	list_for_each_entry_safe(ha, tmp, &list->list, list) { +		if (!ha->sync_cnt) +			continue; + +		/* if unsync is defined and fails defer unsyncing address */ +		if (unsync && unsync(dev, ha->addr)) +			continue; + +		ha->sync_cnt--; +		__hw_addr_del_entry(list, ha, false, false); +	} +} +EXPORT_SYMBOL(__hw_addr_unsync_dev); + +static void __hw_addr_flush(struct netdev_hw_addr_list *list)  {  	struct netdev_hw_addr *ha, *tmp;  	list_for_each_entry_safe(ha, tmp, &list->list, list) {  		list_del_rcu(&ha->list); -		call_rcu(&ha->rcu_head, ha_rcu_free); +		kfree_rcu(ha, rcu_head);  	}  	list->count = 0;  } -EXPORT_SYMBOL(__hw_addr_flush);  void __hw_addr_init(struct netdev_hw_addr_list *list)  { @@ -276,7 +394,7 @@ EXPORT_SYMBOL(dev_addr_init);   *   *	The caller must hold the rtnl_mutex.   */ -int dev_addr_add(struct net_device *dev, unsigned char *addr, +int dev_addr_add(struct net_device *dev, const unsigned char *addr,  		 unsigned char addr_type)  {  	int err; @@ -301,7 +419,7 @@ EXPORT_SYMBOL(dev_addr_add);   *   *	The caller must hold the rtnl_mutex.   */ -int dev_addr_del(struct net_device *dev, unsigned char *addr, +int dev_addr_del(struct net_device *dev, const unsigned char *addr,  		 unsigned char addr_type)  {  	int err; @@ -315,7 +433,8 @@ int dev_addr_del(struct net_device *dev, unsigned char *addr,  	 */  	ha = list_first_entry(&dev->dev_addrs.list,  			      struct netdev_hw_addr, list); -	if (ha->addr == dev->dev_addr && ha->refcount == 1) +	if (!memcmp(ha->addr, addr, dev->addr_len) && +	    ha->type == addr_type && ha->refcount == 1)  		return -ENOENT;  	err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, @@ -326,62 +445,37 @@ int dev_addr_del(struct net_device *dev, unsigned char *addr,  }  EXPORT_SYMBOL(dev_addr_del); +/* + * Unicast list handling functions + */ +  /** - *	dev_addr_add_multiple - Add device addresses from another device - *	@to_dev: device to which addresses will be added - *	@from_dev: device from which addresses will be added - *	@addr_type: address type - 0 means type will be used from from_dev - * - *	Add device addresses of the one device to another. - ** - *	The caller must hold the rtnl_mutex. 
+ *	dev_uc_add_excl - Add a global secondary unicast address + *	@dev: device + *	@addr: address to add   */ -int dev_addr_add_multiple(struct net_device *to_dev, -			  struct net_device *from_dev, -			  unsigned char addr_type) +int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)  { +	struct netdev_hw_addr *ha;  	int err; -	ASSERT_RTNL(); - -	if (from_dev->addr_len != to_dev->addr_len) -		return -EINVAL; -	err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, -				     to_dev->addr_len, addr_type); +	netif_addr_lock_bh(dev); +	list_for_each_entry(ha, &dev->uc.list, list) { +		if (!memcmp(ha->addr, addr, dev->addr_len) && +		    ha->type == NETDEV_HW_ADDR_T_UNICAST) { +			err = -EEXIST; +			goto out; +		} +	} +	err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len, +				  NETDEV_HW_ADDR_T_UNICAST, true, false);  	if (!err) -		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); +		__dev_set_rx_mode(dev); +out: +	netif_addr_unlock_bh(dev);  	return err;  } -EXPORT_SYMBOL(dev_addr_add_multiple); - -/** - *	dev_addr_del_multiple - Delete device addresses by another device - *	@to_dev: device where the addresses will be deleted - *	@from_dev: device by which addresses the addresses will be deleted - *	@addr_type: address type - 0 means type will used from from_dev - * - *	Deletes addresses in to device by the list of addresses in from device. - * - *	The caller must hold the rtnl_mutex. - */ -int dev_addr_del_multiple(struct net_device *to_dev, -			  struct net_device *from_dev, -			  unsigned char addr_type) -{ -	ASSERT_RTNL(); - -	if (from_dev->addr_len != to_dev->addr_len) -		return -EINVAL; -	__hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, -			       to_dev->addr_len, addr_type); -	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); -	return 0; -} -EXPORT_SYMBOL(dev_addr_del_multiple); - -/* - * Unicast list handling functions - */ +EXPORT_SYMBOL(dev_uc_add_excl);  /**   *	dev_uc_add - Add a secondary unicast address @@ -391,7 +485,7 @@ EXPORT_SYMBOL(dev_addr_del_multiple);   *	Add a secondary unicast address to the device or increase   *	the reference count if it already exists.   */ -int dev_uc_add(struct net_device *dev, unsigned char *addr) +int dev_uc_add(struct net_device *dev, const unsigned char *addr)  {  	int err; @@ -413,7 +507,7 @@ EXPORT_SYMBOL(dev_uc_add);   *	Release reference to a secondary unicast address and remove it   *	from the device if the reference count drops to zero.   */ -int dev_uc_del(struct net_device *dev, unsigned char *addr) +int dev_uc_del(struct net_device *dev, const unsigned char *addr)  {  	int err; @@ -434,10 +528,11 @@ EXPORT_SYMBOL(dev_uc_del);   *   *	Add newly added addresses to the destination device and release   *	addresses that have no users left. The source device must be - *	locked by netif_tx_lock_bh. + *	locked by netif_addr_lock_bh.   *   *	This function is intended to be called from the dev->set_rx_mode - *	function of layered software devices. + *	function of layered software devices.  This function assumes that + *	addresses will only ever be synced to the @to devices and no other.   
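/*
 * Illustrative sketch (hypothetical foo_* upper device, not from this
 * patch) of the "layered software device" pattern described above: an
 * upper device replays its own address lists onto the single lower
 * device it is stacked on, from its ndo_set_rx_mode.
 */
#include <linux/netdevice.h>

static void foo_upper_set_rx_mode(struct net_device *dev)
{
	struct foo_upper_priv *priv = netdev_priv(dev);	/* hypothetical */

	dev_uc_sync(priv->lowerdev, dev);	/* to = lower, from = upper */
	dev_mc_sync(priv->lowerdev, dev);
}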
*/  int dev_uc_sync(struct net_device *to, struct net_device *from)  { @@ -446,16 +541,46 @@ int dev_uc_sync(struct net_device *to, struct net_device *from)  	if (to->addr_len != from->addr_len)  		return -EINVAL; -	netif_addr_lock_bh(to); +	netif_addr_lock_nested(to);  	err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);  	if (!err)  		__dev_set_rx_mode(to); -	netif_addr_unlock_bh(to); +	netif_addr_unlock(to);  	return err;  }  EXPORT_SYMBOL(dev_uc_sync);  /** + *	dev_uc_sync_multiple - Synchronize device's unicast list to another + *	device, but allow for multiple calls to sync to multiple devices. + *	@to: destination device + *	@from: source device + * + *	Add newly added addresses to the destination device and release + *	addresses that have been deleted from the source. The source device + *	must be locked by netif_addr_lock_bh. + * + *	This function is intended to be called from the dev->set_rx_mode + *	function of layered software devices.  It allows for a single source + *	device to be synced to multiple destination devices. + */ +int dev_uc_sync_multiple(struct net_device *to, struct net_device *from) +{ +	int err = 0; + +	if (to->addr_len != from->addr_len) +		return -EINVAL; + +	netif_addr_lock_nested(to); +	err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len); +	if (!err) +		__dev_set_rx_mode(to); +	netif_addr_unlock(to); +	return err; +} +EXPORT_SYMBOL(dev_uc_sync_multiple); + +/**   *	dev_uc_unsync - Remove synchronized addresses from the destination device   *	@to: destination device   *	@from: source device @@ -470,7 +595,7 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from)  		return;  	netif_addr_lock_bh(from); -	netif_addr_lock(to); +	netif_addr_lock_nested(to);  	__hw_addr_unsync(&to->uc, &from->uc, to->addr_len);  	__dev_set_rx_mode(to);  	netif_addr_unlock(to); @@ -508,14 +633,42 @@ EXPORT_SYMBOL(dev_uc_init);   * Multicast list handling functions   */ -static int __dev_mc_add(struct net_device *dev, unsigned char *addr, +/** + *	dev_mc_add_excl - Add a global secondary multicast address + *	@dev: device + *	@addr: address to add + */ +int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr) +{ +	struct netdev_hw_addr *ha; +	int err; + +	netif_addr_lock_bh(dev); +	list_for_each_entry(ha, &dev->mc.list, list) { +		if (!memcmp(ha->addr, addr, dev->addr_len) && +		    ha->type == NETDEV_HW_ADDR_T_MULTICAST) { +			err = -EEXIST; +			goto out; +		} +	} +	err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len, +				  NETDEV_HW_ADDR_T_MULTICAST, true, false); +	if (!err) +		__dev_set_rx_mode(dev); +out: +	netif_addr_unlock_bh(dev); +	return err; +} +EXPORT_SYMBOL(dev_mc_add_excl); + +static int __dev_mc_add(struct net_device *dev, const unsigned char *addr,  			bool global)  {  	int err;  	netif_addr_lock_bh(dev);  	err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, -			       NETDEV_HW_ADDR_T_MULTICAST, global); +			       NETDEV_HW_ADDR_T_MULTICAST, global, false, 0);  	if (!err)  		__dev_set_rx_mode(dev);  	netif_addr_unlock_bh(dev); @@ -529,7 +682,7 @@ static int __dev_mc_add(struct net_device *dev, unsigned char *addr,   *	Add a multicast address to the device or increase   *	the reference count if it already exists.   */ -int dev_mc_add(struct net_device *dev, unsigned char *addr) +int dev_mc_add(struct net_device *dev, const unsigned char *addr)  {  	return __dev_mc_add(dev, addr, false);  } @@ -542,20 +695,20 @@ EXPORT_SYMBOL(dev_mc_add);   *   *	Add a global multicast address to the device.   
*/ -int dev_mc_add_global(struct net_device *dev, unsigned char *addr) +int dev_mc_add_global(struct net_device *dev, const unsigned char *addr)  {  	return __dev_mc_add(dev, addr, true);  }  EXPORT_SYMBOL(dev_mc_add_global); -static int __dev_mc_del(struct net_device *dev, unsigned char *addr, +static int __dev_mc_del(struct net_device *dev, const unsigned char *addr,  			bool global)  {  	int err;  	netif_addr_lock_bh(dev);  	err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len, -			       NETDEV_HW_ADDR_T_MULTICAST, global); +			       NETDEV_HW_ADDR_T_MULTICAST, global, false);  	if (!err)  		__dev_set_rx_mode(dev);  	netif_addr_unlock_bh(dev); @@ -570,7 +723,7 @@ static int __dev_mc_del(struct net_device *dev, unsigned char *addr,   *	Release reference to a multicast address and remove it   *	from the device if the reference count drops to zero.   */ -int dev_mc_del(struct net_device *dev, unsigned char *addr) +int dev_mc_del(struct net_device *dev, const unsigned char *addr)  {  	return __dev_mc_del(dev, addr, false);  } @@ -584,23 +737,23 @@ EXPORT_SYMBOL(dev_mc_del);   *	Release reference to a multicast address and remove it   *	from the device if the reference count drops to zero.   */ -int dev_mc_del_global(struct net_device *dev, unsigned char *addr) +int dev_mc_del_global(struct net_device *dev, const unsigned char *addr)  {  	return __dev_mc_del(dev, addr, true);  }  EXPORT_SYMBOL(dev_mc_del_global);  /** - *	dev_mc_sync - Synchronize device's unicast list to another device + *	dev_mc_sync - Synchronize device's multicast list to another device   *	@to: destination device   *	@from: source device   *   *	Add newly added addresses to the destination device and release   *	addresses that have no users left. The source device must be - *	locked by netif_tx_lock_bh. + *	locked by netif_addr_lock_bh.   * - *	This function is intended to be called from the dev->set_multicast_list - *	or dev->set_rx_mode function of layered software devices. + *	This function is intended to be called from the ndo_set_rx_mode + *	function of layered software devices.   */  int dev_mc_sync(struct net_device *to, struct net_device *from)  { @@ -609,16 +762,46 @@ int dev_mc_sync(struct net_device *to, struct net_device *from)  	if (to->addr_len != from->addr_len)  		return -EINVAL; -	netif_addr_lock_bh(to); +	netif_addr_lock_nested(to);  	err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len);  	if (!err)  		__dev_set_rx_mode(to); -	netif_addr_unlock_bh(to); +	netif_addr_unlock(to);  	return err;  }  EXPORT_SYMBOL(dev_mc_sync);  /** + *	dev_mc_sync_multiple - Synchronize device's multicast list to another + *	device, but allow for multiple calls to sync to multiple devices. + *	@to: destination device + *	@from: source device + * + *	Add newly added addresses to the destination device and release + *	addresses that have no users left. The source device must be + *	locked by netif_addr_lock_bh. + * + *	This function is intended to be called from the ndo_set_rx_mode + *	function of layered software devices.  It allows for a single + *	source device to be synced to multiple destination devices. 
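/*
 * Illustrative sketch (hypothetical foo_* aggregating device, not from
 * this patch) of the one-source/many-destinations case the new
 * dev_uc_sync_multiple()/dev_mc_sync_multiple() helpers cover: a master
 * device replays its address lists onto every enslaved port.
 */
#include <linux/list.h>
#include <linux/netdevice.h>

static void foo_master_set_rx_mode(struct net_device *master)
{
	struct foo_master_priv *priv = netdev_priv(master);	/* hypothetical */
	struct foo_port *port;					/* hypothetical */

	list_for_each_entry(port, &priv->port_list, list) {
		dev_uc_sync_multiple(port->dev, master);
		dev_mc_sync_multiple(port->dev, master);
	}
}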
+ */ +int dev_mc_sync_multiple(struct net_device *to, struct net_device *from) +{ +	int err = 0; + +	if (to->addr_len != from->addr_len) +		return -EINVAL; + +	netif_addr_lock_nested(to); +	err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len); +	if (!err) +		__dev_set_rx_mode(to); +	netif_addr_unlock(to); +	return err; +} +EXPORT_SYMBOL(dev_mc_sync_multiple); + +/**   *	dev_mc_unsync - Remove synchronized addresses from the destination device   *	@to: destination device   *	@from: source device @@ -633,7 +816,7 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from)  		return;  	netif_addr_lock_bh(from); -	netif_addr_lock(to); +	netif_addr_lock_nested(to);  	__hw_addr_unsync(&to->mc, &from->mc, to->addr_len);  	__dev_set_rx_mode(to);  	netif_addr_unlock(to); @@ -666,76 +849,3 @@ void dev_mc_init(struct net_device *dev)  	__hw_addr_init(&dev->mc);  }  EXPORT_SYMBOL(dev_mc_init); - -#ifdef CONFIG_PROC_FS -#include <linux/seq_file.h> - -static int dev_mc_seq_show(struct seq_file *seq, void *v) -{ -	struct netdev_hw_addr *ha; -	struct net_device *dev = v; - -	if (v == SEQ_START_TOKEN) -		return 0; - -	netif_addr_lock_bh(dev); -	netdev_for_each_mc_addr(ha, dev) { -		int i; - -		seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, -			   dev->name, ha->refcount, ha->global_use); - -		for (i = 0; i < dev->addr_len; i++) -			seq_printf(seq, "%02x", ha->addr[i]); - -		seq_putc(seq, '\n'); -	} -	netif_addr_unlock_bh(dev); -	return 0; -} - -static const struct seq_operations dev_mc_seq_ops = { -	.start = dev_seq_start, -	.next  = dev_seq_next, -	.stop  = dev_seq_stop, -	.show  = dev_mc_seq_show, -}; - -static int dev_mc_seq_open(struct inode *inode, struct file *file) -{ -	return seq_open_net(inode, file, &dev_mc_seq_ops, -			    sizeof(struct seq_net_private)); -} - -static const struct file_operations dev_mc_seq_fops = { -	.owner	 = THIS_MODULE, -	.open    = dev_mc_seq_open, -	.read    = seq_read, -	.llseek  = seq_lseek, -	.release = seq_release_net, -}; - -#endif - -static int __net_init dev_mc_net_init(struct net *net) -{ -	if (!proc_net_fops_create(net, "dev_mcast", 0, &dev_mc_seq_fops)) -		return -ENOMEM; -	return 0; -} - -static void __net_exit dev_mc_net_exit(struct net *net) -{ -	proc_net_remove(net, "dev_mcast"); -} - -static struct pernet_operations __net_initdata dev_mc_net_ops = { -	.init = dev_mc_net_init, -	.exit = dev_mc_net_exit, -}; - -void __init dev_mcast_init(void) -{ -	register_pernet_subsys(&dev_mc_net_ops); -} - diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c new file mode 100644 index 00000000000..cf999e09bcd --- /dev/null +++ b/net/core/dev_ioctl.c @@ -0,0 +1,567 @@ +#include <linux/kmod.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/rtnetlink.h> +#include <linux/net_tstamp.h> +#include <linux/wireless.h> +#include <net/wext.h> + +/* + *	Map an interface index to its name (SIOCGIFNAME) + */ + +/* + *	We need this ioctl for efficient implementation of the + *	if_indextoname() function required by the IPv6 API.  Without + *	it, we would have to search all the interfaces to find a + *	match.  --pb + */ + +static int dev_ifname(struct net *net, struct ifreq __user *arg) +{ +	struct ifreq ifr; +	int error; + +	/* +	 *	Fetch the caller's info block. 
+	 */ + +	if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) +		return -EFAULT; + +	error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex); +	if (error) +		return error; + +	if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) +		return -EFAULT; +	return 0; +} + +static gifconf_func_t *gifconf_list[NPROTO]; + +/** + *	register_gifconf	-	register a SIOCGIF handler + *	@family: Address family + *	@gifconf: Function handler + * + *	Register protocol dependent address dumping routines. The handler + *	that is passed must not be freed or reused until it has been replaced + *	by another handler. + */ +int register_gifconf(unsigned int family, gifconf_func_t *gifconf) +{ +	if (family >= NPROTO) +		return -EINVAL; +	gifconf_list[family] = gifconf; +	return 0; +} +EXPORT_SYMBOL(register_gifconf); + +/* + *	Perform a SIOCGIFCONF call. This structure will change + *	size eventually, and there is nothing I can do about it. + *	Thus we will need a 'compatibility mode'. + */ + +static int dev_ifconf(struct net *net, char __user *arg) +{ +	struct ifconf ifc; +	struct net_device *dev; +	char __user *pos; +	int len; +	int total; +	int i; + +	/* +	 *	Fetch the caller's info block. +	 */ + +	if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) +		return -EFAULT; + +	pos = ifc.ifc_buf; +	len = ifc.ifc_len; + +	/* +	 *	Loop over the interfaces, and write an info block for each. +	 */ + +	total = 0; +	for_each_netdev(net, dev) { +		for (i = 0; i < NPROTO; i++) { +			if (gifconf_list[i]) { +				int done; +				if (!pos) +					done = gifconf_list[i](dev, NULL, 0); +				else +					done = gifconf_list[i](dev, pos + total, +							       len - total); +				if (done < 0) +					return -EFAULT; +				total += done; +			} +		} +	} + +	/* +	 *	All done.  Write the updated control block back to the caller. +	 */ +	ifc.ifc_len = total; + +	/* +	 * 	Both BSD and Solaris return 0 here, so we do too. +	 */ +	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? 
-EFAULT : 0; +} + +/* + *	Perform the SIOCxIFxxx calls, inside rcu_read_lock() + */ +static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) +{ +	int err; +	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); + +	if (!dev) +		return -ENODEV; + +	switch (cmd) { +	case SIOCGIFFLAGS:	/* Get interface flags */ +		ifr->ifr_flags = (short) dev_get_flags(dev); +		return 0; + +	case SIOCGIFMETRIC:	/* Get the metric on the interface +				   (currently unused) */ +		ifr->ifr_metric = 0; +		return 0; + +	case SIOCGIFMTU:	/* Get the MTU of a device */ +		ifr->ifr_mtu = dev->mtu; +		return 0; + +	case SIOCGIFHWADDR: +		if (!dev->addr_len) +			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); +		else +			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, +			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); +		ifr->ifr_hwaddr.sa_family = dev->type; +		return 0; + +	case SIOCGIFSLAVE: +		err = -EINVAL; +		break; + +	case SIOCGIFMAP: +		ifr->ifr_map.mem_start = dev->mem_start; +		ifr->ifr_map.mem_end   = dev->mem_end; +		ifr->ifr_map.base_addr = dev->base_addr; +		ifr->ifr_map.irq       = dev->irq; +		ifr->ifr_map.dma       = dev->dma; +		ifr->ifr_map.port      = dev->if_port; +		return 0; + +	case SIOCGIFINDEX: +		ifr->ifr_ifindex = dev->ifindex; +		return 0; + +	case SIOCGIFTXQLEN: +		ifr->ifr_qlen = dev->tx_queue_len; +		return 0; + +	default: +		/* dev_ioctl() should ensure this case +		 * is never reached +		 */ +		WARN_ON(1); +		err = -ENOTTY; +		break; + +	} +	return err; +} + +static int net_hwtstamp_validate(struct ifreq *ifr) +{ +	struct hwtstamp_config cfg; +	enum hwtstamp_tx_types tx_type; +	enum hwtstamp_rx_filters rx_filter; +	int tx_type_valid = 0; +	int rx_filter_valid = 0; + +	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) +		return -EFAULT; + +	if (cfg.flags) /* reserved for future extensions */ +		return -EINVAL; + +	tx_type = cfg.tx_type; +	rx_filter = cfg.rx_filter; + +	switch (tx_type) { +	case HWTSTAMP_TX_OFF: +	case HWTSTAMP_TX_ON: +	case HWTSTAMP_TX_ONESTEP_SYNC: +		tx_type_valid = 1; +		break; +	} + +	switch (rx_filter) { +	case HWTSTAMP_FILTER_NONE: +	case HWTSTAMP_FILTER_ALL: +	case HWTSTAMP_FILTER_SOME: +	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: +	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: +	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: +	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: +	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: +	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: +	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: +	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: +	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: +	case HWTSTAMP_FILTER_PTP_V2_EVENT: +	case HWTSTAMP_FILTER_PTP_V2_SYNC: +	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: +		rx_filter_valid = 1; +		break; +	} + +	if (!tx_type_valid || !rx_filter_valid) +		return -ERANGE; + +	return 0; +} + +/* + *	Perform the SIOCxIFxxx calls, inside rtnl_lock() + */ +static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) +{ +	int err; +	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); +	const struct net_device_ops *ops; + +	if (!dev) +		return -ENODEV; + +	ops = dev->netdev_ops; + +	switch (cmd) { +	case SIOCSIFFLAGS:	/* Set interface flags */ +		return dev_change_flags(dev, ifr->ifr_flags); + +	case SIOCSIFMETRIC:	/* Set the metric on the interface +				   (currently unused) */ +		return -EOPNOTSUPP; + +	case SIOCSIFMTU:	/* Set the MTU of a device */ +		return dev_set_mtu(dev, ifr->ifr_mtu); + +	case SIOCSIFHWADDR: +		return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + +	
case SIOCSIFHWBROADCAST: +		if (ifr->ifr_hwaddr.sa_family != dev->type) +			return -EINVAL; +		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, +		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); +		return 0; + +	case SIOCSIFMAP: +		if (ops->ndo_set_config) { +			if (!netif_device_present(dev)) +				return -ENODEV; +			return ops->ndo_set_config(dev, &ifr->ifr_map); +		} +		return -EOPNOTSUPP; + +	case SIOCADDMULTI: +		if (!ops->ndo_set_rx_mode || +		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC) +			return -EINVAL; +		if (!netif_device_present(dev)) +			return -ENODEV; +		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); + +	case SIOCDELMULTI: +		if (!ops->ndo_set_rx_mode || +		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC) +			return -EINVAL; +		if (!netif_device_present(dev)) +			return -ENODEV; +		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); + +	case SIOCSIFTXQLEN: +		if (ifr->ifr_qlen < 0) +			return -EINVAL; +		dev->tx_queue_len = ifr->ifr_qlen; +		return 0; + +	case SIOCSIFNAME: +		ifr->ifr_newname[IFNAMSIZ-1] = '\0'; +		return dev_change_name(dev, ifr->ifr_newname); + +	case SIOCSHWTSTAMP: +		err = net_hwtstamp_validate(ifr); +		if (err) +			return err; +		/* fall through */ + +	/* +	 *	Unknown or private ioctl +	 */ +	default: +		if ((cmd >= SIOCDEVPRIVATE && +		    cmd <= SIOCDEVPRIVATE + 15) || +		    cmd == SIOCBONDENSLAVE || +		    cmd == SIOCBONDRELEASE || +		    cmd == SIOCBONDSETHWADDR || +		    cmd == SIOCBONDSLAVEINFOQUERY || +		    cmd == SIOCBONDINFOQUERY || +		    cmd == SIOCBONDCHANGEACTIVE || +		    cmd == SIOCGMIIPHY || +		    cmd == SIOCGMIIREG || +		    cmd == SIOCSMIIREG || +		    cmd == SIOCBRADDIF || +		    cmd == SIOCBRDELIF || +		    cmd == SIOCSHWTSTAMP || +		    cmd == SIOCGHWTSTAMP || +		    cmd == SIOCWANDEV) { +			err = -EOPNOTSUPP; +			if (ops->ndo_do_ioctl) { +				if (netif_device_present(dev)) +					err = ops->ndo_do_ioctl(dev, ifr, cmd); +				else +					err = -ENODEV; +			} +		} else +			err = -EINVAL; + +	} +	return err; +} + +/** + *	dev_load 	- load a network module + *	@net: the applicable net namespace + *	@name: name of interface + * + *	If a network interface is not present and the process has suitable + *	privileges this function loads the module. If module loading is not + *	available in this kernel then it becomes a nop. + */ + +void dev_load(struct net *net, const char *name) +{ +	struct net_device *dev; +	int no_module; + +	rcu_read_lock(); +	dev = dev_get_by_name_rcu(net, name); +	rcu_read_unlock(); + +	no_module = !dev; +	if (no_module && capable(CAP_NET_ADMIN)) +		no_module = request_module("netdev-%s", name); +	if (no_module && capable(CAP_SYS_MODULE)) { +		if (!request_module("%s", name)) +			pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n", +				name); +	} +} +EXPORT_SYMBOL(dev_load); + +/* + *	This function handles all "interface"-type I/O control requests. The actual + *	'doing' part of this is dev_ifsioc above. + */ + +/** + *	dev_ioctl	-	network device ioctl + *	@net: the applicable net namespace + *	@cmd: command to issue + *	@arg: pointer to a struct ifreq in user space + * + *	Issue ioctl functions to devices. This is normally called by the + *	user space syscall interfaces but can sometimes be useful for + *	other purposes. The return value is the return from the syscall if + *	positive or a negative errno code on error. 
+ */ + +int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) +{ +	struct ifreq ifr; +	int ret; +	char *colon; + +	/* One special case: SIOCGIFCONF takes ifconf argument +	   and requires shared lock, because it sleeps writing +	   to user space. +	 */ + +	if (cmd == SIOCGIFCONF) { +		rtnl_lock(); +		ret = dev_ifconf(net, (char __user *) arg); +		rtnl_unlock(); +		return ret; +	} +	if (cmd == SIOCGIFNAME) +		return dev_ifname(net, (struct ifreq __user *)arg); + +	if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) +		return -EFAULT; + +	ifr.ifr_name[IFNAMSIZ-1] = 0; + +	colon = strchr(ifr.ifr_name, ':'); +	if (colon) +		*colon = 0; + +	/* +	 *	See which interface the caller is talking about. +	 */ + +	switch (cmd) { +	/* +	 *	These ioctl calls: +	 *	- can be done by all. +	 *	- atomic and do not require locking. +	 *	- return a value +	 */ +	case SIOCGIFFLAGS: +	case SIOCGIFMETRIC: +	case SIOCGIFMTU: +	case SIOCGIFHWADDR: +	case SIOCGIFSLAVE: +	case SIOCGIFMAP: +	case SIOCGIFINDEX: +	case SIOCGIFTXQLEN: +		dev_load(net, ifr.ifr_name); +		rcu_read_lock(); +		ret = dev_ifsioc_locked(net, &ifr, cmd); +		rcu_read_unlock(); +		if (!ret) { +			if (colon) +				*colon = ':'; +			if (copy_to_user(arg, &ifr, +					 sizeof(struct ifreq))) +				ret = -EFAULT; +		} +		return ret; + +	case SIOCETHTOOL: +		dev_load(net, ifr.ifr_name); +		rtnl_lock(); +		ret = dev_ethtool(net, &ifr); +		rtnl_unlock(); +		if (!ret) { +			if (colon) +				*colon = ':'; +			if (copy_to_user(arg, &ifr, +					 sizeof(struct ifreq))) +				ret = -EFAULT; +		} +		return ret; + +	/* +	 *	These ioctl calls: +	 *	- require superuser power. +	 *	- require strict serialization. +	 *	- return a value +	 */ +	case SIOCGMIIPHY: +	case SIOCGMIIREG: +	case SIOCSIFNAME: +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) +			return -EPERM; +		dev_load(net, ifr.ifr_name); +		rtnl_lock(); +		ret = dev_ifsioc(net, &ifr, cmd); +		rtnl_unlock(); +		if (!ret) { +			if (colon) +				*colon = ':'; +			if (copy_to_user(arg, &ifr, +					 sizeof(struct ifreq))) +				ret = -EFAULT; +		} +		return ret; + +	/* +	 *	These ioctl calls: +	 *	- require superuser power. +	 *	- require strict serialization. +	 *	- do not return a value +	 */ +	case SIOCSIFMAP: +	case SIOCSIFTXQLEN: +		if (!capable(CAP_NET_ADMIN)) +			return -EPERM; +		/* fall through */ +	/* +	 *	These ioctl calls: +	 *	- require local superuser power. +	 *	- require strict serialization. +	 *	- do not return a value +	 */ +	case SIOCSIFFLAGS: +	case SIOCSIFMETRIC: +	case SIOCSIFMTU: +	case SIOCSIFHWADDR: +	case SIOCSIFSLAVE: +	case SIOCADDMULTI: +	case SIOCDELMULTI: +	case SIOCSIFHWBROADCAST: +	case SIOCSMIIREG: +	case SIOCBONDENSLAVE: +	case SIOCBONDRELEASE: +	case SIOCBONDSETHWADDR: +	case SIOCBONDCHANGEACTIVE: +	case SIOCBRADDIF: +	case SIOCBRDELIF: +	case SIOCSHWTSTAMP: +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) +			return -EPERM; +		/* fall through */ +	case SIOCBONDSLAVEINFOQUERY: +	case SIOCBONDINFOQUERY: +		dev_load(net, ifr.ifr_name); +		rtnl_lock(); +		ret = dev_ifsioc(net, &ifr, cmd); +		rtnl_unlock(); +		return ret; + +	case SIOCGIFMEM: +		/* Get the per device memory space. We can add this but +		 * currently do not support it */ +	case SIOCSIFMEM: +		/* Set the per device memory buffer space. +		 * Not applicable in our case */ +	case SIOCSIFLINK: +		return -ENOTTY; + +	/* +	 *	Unknown or private ioctl. 
+	 */ +	default: +		if (cmd == SIOCWANDEV || +		    cmd == SIOCGHWTSTAMP || +		    (cmd >= SIOCDEVPRIVATE && +		     cmd <= SIOCDEVPRIVATE + 15)) { +			dev_load(net, ifr.ifr_name); +			rtnl_lock(); +			ret = dev_ifsioc(net, &ifr, cmd); +			rtnl_unlock(); +			if (!ret && copy_to_user(arg, &ifr, +						 sizeof(struct ifreq))) +				ret = -EFAULT; +			return ret; +		} +		/* Take care of Wireless Extensions */ +		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) +			return wext_handle_ioctl(net, &ifr, cmd, arg); +		return -ENOTTY; +	} +} diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 36e603c78ce..e70301eb7a4 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -4,6 +4,8 @@   * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>   */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/netdevice.h>  #include <linux/etherdevice.h>  #include <linux/string.h> @@ -22,6 +24,7 @@  #include <linux/timer.h>  #include <linux/bitops.h>  #include <linux/slab.h> +#include <linux/module.h>  #include <net/genetlink.h>  #include <net/netevent.h> @@ -33,22 +36,19 @@  #define TRACE_ON 1  #define TRACE_OFF 0 -static void send_dm_alert(struct work_struct *unused); - -  /*   * Globals, our netlink socket pointer   * and the work handle that will send up   * netlink alerts   */  static int trace_state = TRACE_OFF; -static DEFINE_SPINLOCK(trace_state_lock); +static DEFINE_MUTEX(trace_state_mutex);  struct per_cpu_dm_data { -	struct work_struct dm_alert_work; -	struct sk_buff *skb; -	atomic_t dm_hit_count; -	struct timer_list send_timer; +	spinlock_t		lock; +	struct sk_buff		*skb; +	struct work_struct	dm_alert_work; +	struct timer_list	send_timer;  };  struct dm_hw_stat_delta { @@ -64,7 +64,6 @@ static struct genl_family net_drop_monitor_family = {  	.hdrsize        = 0,  	.name           = "NET_DM",  	.version        = 2, -	.maxattr        = NET_DM_CMD_MAX,  };  static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); @@ -74,56 +73,64 @@ static int dm_delay = 1;  static unsigned long dm_hw_check_delta = 2*HZ;  static LIST_HEAD(hw_stats_list); -static void reset_per_cpu_data(struct per_cpu_dm_data *data) +static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)  {  	size_t al;  	struct net_dm_alert_msg *msg;  	struct nlattr *nla; +	struct sk_buff *skb; +	unsigned long flags;  	al = sizeof(struct net_dm_alert_msg);  	al += dm_hit_limit * sizeof(struct net_dm_drop_point);  	al += sizeof(struct nlattr); -	data->skb = genlmsg_new(al, GFP_KERNEL); -	genlmsg_put(data->skb, 0, 0, &net_drop_monitor_family, -			0, NET_DM_CMD_ALERT); -	nla = nla_reserve(data->skb, NLA_UNSPEC, sizeof(struct net_dm_alert_msg)); -	msg = nla_data(nla); -	memset(msg, 0, al); -	atomic_set(&data->dm_hit_count, dm_hit_limit); +	skb = genlmsg_new(al, GFP_KERNEL); + +	if (skb) { +		genlmsg_put(skb, 0, 0, &net_drop_monitor_family, +				0, NET_DM_CMD_ALERT); +		nla = nla_reserve(skb, NLA_UNSPEC, +				  sizeof(struct net_dm_alert_msg)); +		msg = nla_data(nla); +		memset(msg, 0, al); +	} else { +		mod_timer(&data->send_timer, jiffies + HZ / 10); +	} + +	spin_lock_irqsave(&data->lock, flags); +	swap(data->skb, skb); +	spin_unlock_irqrestore(&data->lock, flags); + +	return skb;  } -static void send_dm_alert(struct work_struct *unused) +static struct genl_multicast_group dropmon_mcgrps[] = { +	{ .name = "events", }, +}; + +static void send_dm_alert(struct work_struct *work)  {  	struct sk_buff *skb; -	struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); - -	/* -	 * Grab the skb we're about 
to send -	 */ -	skb = data->skb; +	struct per_cpu_dm_data *data; -	/* -	 * Replace it with a new one -	 */ -	reset_per_cpu_data(data); +	data = container_of(work, struct per_cpu_dm_data, dm_alert_work); -	/* -	 * Ship it! -	 */ -	genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL); +	skb = reset_per_cpu_data(data); +	if (skb) +		genlmsg_multicast(&net_drop_monitor_family, skb, 0, +				  0, GFP_KERNEL);  }  /*   * This is the timer function to delay the sending of an alert   * in the event that more drops will arrive during the - * hysteresis period.  Note that it operates under the timer interrupt - * so we don't need to disable preemption here + * hysteresis period.   */ -static void sched_send_work(unsigned long unused) +static void sched_send_work(unsigned long _data)  { -	struct per_cpu_dm_data *data =  &__get_cpu_var(dm_cpu_data); +	struct per_cpu_dm_data *data = (struct per_cpu_dm_data *)_data;  	schedule_work(&data->dm_alert_work);  } @@ -134,17 +141,19 @@ static void trace_drop_common(struct sk_buff *skb, void *location)  	struct nlmsghdr *nlh;  	struct nlattr *nla;  	int i; -	struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); +	struct sk_buff *dskb; +	struct per_cpu_dm_data *data; +	unsigned long flags; +	local_irq_save(flags); +	data = &__get_cpu_var(dm_cpu_data); +	spin_lock(&data->lock); +	dskb = data->skb; -	if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) { -		/* -		 * we're already at zero, discard this hit -		 */ +	if (!dskb)  		goto out; -	} -	nlh = (struct nlmsghdr *)data->skb->data; +	nlh = (struct nlmsghdr *)dskb->data;  	nla = genlmsg_data(nlmsg_data(nlh));  	msg = nla_data(nla);  	for (i = 0; i < msg->entries; i++) { @@ -153,11 +162,12 @@ static void trace_drop_common(struct sk_buff *skb, void *location)  			goto out;  		}  	} - +	if (msg->entries == dm_hit_limit) +		goto out;  	/*  	 * We need to create a new entry  	 */ -	__nla_reserve_nohdr(data->skb, sizeof(struct net_dm_drop_point)); +	__nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));  	nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));  	memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));  	msg->points[msg->entries].count = 1; @@ -165,11 +175,11 @@ static void trace_drop_common(struct sk_buff *skb, void *location)  	if (!timer_pending(&data->send_timer)) {  		data->send_timer.expires = jiffies + dm_delay * HZ; -		add_timer_on(&data->send_timer, smp_processor_id()); +		add_timer(&data->send_timer);  	}  out: -	return; +	spin_unlock_irqrestore(&data->lock, flags);  }  static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location) @@ -207,21 +217,13 @@ static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)  	rcu_read_unlock();  } - -static void free_dm_hw_stat(struct rcu_head *head) -{ -	struct dm_hw_stat_delta *n; -	n = container_of(head, struct dm_hw_stat_delta, rcu); -	kfree(n); -} -  static int set_all_monitor_traces(int state)  {  	int rc = 0;  	struct dm_hw_stat_delta *new_stat = NULL;  	struct dm_hw_stat_delta *temp; -	spin_lock(&trace_state_lock); +	mutex_lock(&trace_state_mutex);  	if (state == trace_state) {  		rc = -EAGAIN; @@ -230,9 +232,15 @@ static int set_all_monitor_traces(int state)  	switch (state) {  	case TRACE_ON: +		if (!try_module_get(THIS_MODULE)) { +			rc = -ENODEV; +			break; +		} +  		rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);  		rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);  		break; +  	case TRACE_OFF:  		rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);  		rc 
|= unregister_trace_napi_poll(trace_napi_poll_hit, NULL); @@ -245,9 +253,12 @@ static int set_all_monitor_traces(int state)  		list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {  			if (new_stat->dev == NULL) {  				list_del_rcu(&new_stat->list); -				call_rcu(&new_stat->rcu, free_dm_hw_stat); +				kfree_rcu(new_stat, rcu);  			}  		} + +		module_put(THIS_MODULE); +  		break;  	default:  		rc = 1; @@ -260,7 +271,7 @@ static int set_all_monitor_traces(int state)  		rc = -EINPROGRESS;  out_unlock: -	spin_unlock(&trace_state_lock); +	mutex_unlock(&trace_state_mutex);  	return rc;  } @@ -288,9 +299,9 @@ static int net_dm_cmd_trace(struct sk_buff *skb,  }  static int dropmon_net_event(struct notifier_block *ev_block, -			unsigned long event, void *ptr) +			     unsigned long event, void *ptr)  { -	struct net_device *dev = ptr; +	struct net_device *dev = netdev_notifier_info_to_dev(ptr);  	struct dm_hw_stat_delta *new_stat = NULL;  	struct dm_hw_stat_delta *tmp; @@ -303,30 +314,30 @@ static int dropmon_net_event(struct notifier_block *ev_block,  		new_stat->dev = dev;  		new_stat->last_rx = jiffies; -		spin_lock(&trace_state_lock); +		mutex_lock(&trace_state_mutex);  		list_add_rcu(&new_stat->list, &hw_stats_list); -		spin_unlock(&trace_state_lock); +		mutex_unlock(&trace_state_mutex);  		break;  	case NETDEV_UNREGISTER: -		spin_lock(&trace_state_lock); +		mutex_lock(&trace_state_mutex);  		list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {  			if (new_stat->dev == dev) {  				new_stat->dev = NULL;  				if (trace_state == TRACE_OFF) {  					list_del_rcu(&new_stat->list); -					call_rcu(&new_stat->rcu, free_dm_hw_stat); +					kfree_rcu(new_stat, rcu);  					break;  				}  			}  		} -		spin_unlock(&trace_state_lock); +		mutex_unlock(&trace_state_mutex);  		break;  	}  out:  	return NOTIFY_DONE;  } -static struct genl_ops dropmon_ops[] = { +static const struct genl_ops dropmon_ops[] = {  	{  		.cmd = NET_DM_CMD_CONFIG,  		.doit = net_dm_cmd_config, @@ -350,38 +361,40 @@ static int __init init_net_drop_monitor(void)  	struct per_cpu_dm_data *data;  	int cpu, rc; -	printk(KERN_INFO "Initalizing network drop monitor service\n"); +	pr_info("Initializing network drop monitor service\n");  	if (sizeof(void *) > 8) { -		printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n"); +		pr_err("Unable to store program counters on this arch, Drop monitor failed\n");  		return -ENOSPC;  	} -	rc = genl_register_family_with_ops(&net_drop_monitor_family, -					   dropmon_ops, -					   ARRAY_SIZE(dropmon_ops)); +	rc = genl_register_family_with_ops_groups(&net_drop_monitor_family, +						  dropmon_ops, dropmon_mcgrps);  	if (rc) { -		printk(KERN_ERR "Could not create drop monitor netlink family\n"); +		pr_err("Could not create drop monitor netlink family\n");  		return rc;  	} +	WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);  	rc = register_netdevice_notifier(&dropmon_net_notifier);  	if (rc < 0) { -		printk(KERN_CRIT "Failed to register netdevice notifier\n"); +		pr_crit("Failed to register netdevice notifier\n");  		goto out_unreg;  	}  	rc = 0; -	for_each_present_cpu(cpu) { +	for_each_possible_cpu(cpu) {  		data = &per_cpu(dm_cpu_data, cpu); -		reset_per_cpu_data(data);  		INIT_WORK(&data->dm_alert_work, send_dm_alert);  		init_timer(&data->send_timer); -		data->send_timer.data = cpu; +		data->send_timer.data = (unsigned long)data;  		data->send_timer.function = sched_send_work; +		spin_lock_init(&data->lock); +		reset_per_cpu_data(data);  
	} +  	goto out;  out_unreg: @@ -390,4 +403,37 @@ out:  	return rc;  } -late_initcall(init_net_drop_monitor); +static void exit_net_drop_monitor(void) +{ +	struct per_cpu_dm_data *data; +	int cpu; + +	BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); + +	/* +	 * Because of the module_get/put we do in the trace state change path +	 * we are guarnateed not to have any current users when we get here +	 * all we need to do is make sure that we don't have any running timers +	 * or pending schedule calls +	 */ + +	for_each_possible_cpu(cpu) { +		data = &per_cpu(dm_cpu_data, cpu); +		del_timer_sync(&data->send_timer); +		cancel_work_sync(&data->dm_alert_work); +		/* +		 * At this point, we should have exclusive access +		 * to this struct and can free the skb inside it +		 */ +		kfree_skb(data->skb); +	} + +	BUG_ON(genl_unregister_family(&net_drop_monitor_family)); +} + +module_init(init_net_drop_monitor); +module_exit(exit_net_drop_monitor); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>"); +MODULE_ALIAS_GENL_FAMILY("NET_DM"); diff --git a/net/core/dst.c b/net/core/dst.c index b99c7c7ffce..a028409ee43 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -19,6 +19,7 @@  #include <linux/types.h>  #include <net/net_namespace.h>  #include <linux/sched.h> +#include <linux/prefetch.h>  #include <net/dst.h> @@ -33,9 +34,6 @@   * 3) This list is guarded by a mutex,   *    so that the gc_task and dst_dev_event() can be synchronized.   */ -#if RT_CACHE_DEBUG >= 2 -static atomic_t			 dst_total = ATOMIC_INIT(0); -#endif  /*   * We want to keep lock & list close together @@ -69,10 +67,6 @@ static void dst_gc_task(struct work_struct *work)  	unsigned long expires = ~0L;  	struct dst_entry *dst, *next, head;  	struct dst_entry *last = &head; -#if RT_CACHE_DEBUG >= 2 -	ktime_t time_start = ktime_get(); -	struct timespec elapsed; -#endif  	mutex_lock(&dst_gc_mutex);  	next = dst_busy_list; @@ -100,7 +94,7 @@ loop:  			 * But we do not have state "obsoleted, but  			 * referenced by parent", so it is right.  			 */ -			if (dst->obsolete > 1) +			if (dst->obsolete > 0)  				continue;  			___dst_free(dst); @@ -146,25 +140,27 @@ loop:  	spin_unlock_bh(&dst_garbage.lock);  	mutex_unlock(&dst_gc_mutex); -#if RT_CACHE_DEBUG >= 2 -	elapsed = ktime_to_timespec(ktime_sub(ktime_get(), time_start)); -	printk(KERN_DEBUG "dst_total: %d delayed: %d work_perf: %d" -		" expires: %lu elapsed: %lu us\n", -		atomic_read(&dst_total), delayed, work_performed, -		expires, -		elapsed.tv_sec * USEC_PER_SEC + -		  elapsed.tv_nsec / NSEC_PER_USEC); -#endif  } -int dst_discard(struct sk_buff *skb) +int dst_discard_sk(struct sock *sk, struct sk_buff *skb)  {  	kfree_skb(skb);  	return 0;  } -EXPORT_SYMBOL(dst_discard); +EXPORT_SYMBOL(dst_discard_sk); -void *dst_alloc(struct dst_ops *ops) +const u32 dst_default_metrics[RTAX_MAX + 1] = { +	/* This initializer is needed to force linker to place this variable +	 * into const section. Otherwise it might end into bss section. +	 * We really want to avoid false sharing on this variable, and catch +	 * any writes on it. 
+	 */ +	[RTAX_MAX] = 0xdeadbeef, +}; + + +void *dst_alloc(struct dst_ops *ops, struct net_device *dev, +		int initial_ref, int initial_obsolete, unsigned short flags)  {  	struct dst_entry *dst; @@ -172,18 +168,38 @@ void *dst_alloc(struct dst_ops *ops)  		if (ops->gc(ops))  			return NULL;  	} -	dst = kmem_cache_zalloc(ops->kmem_cachep, GFP_ATOMIC); +	dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);  	if (!dst)  		return NULL; -	atomic_set(&dst->__refcnt, 0); +	dst->child = NULL; +	dst->dev = dev; +	if (dev) +		dev_hold(dev);  	dst->ops = ops; -	dst->lastuse = jiffies; +	dst_init_metrics(dst, dst_default_metrics, true); +	dst->expires = 0UL;  	dst->path = dst; -	dst->input = dst->output = dst_discard; -#if RT_CACHE_DEBUG >= 2 -	atomic_inc(&dst_total); +	dst->from = NULL; +#ifdef CONFIG_XFRM +	dst->xfrm = NULL; +#endif +	dst->input = dst_discard; +	dst->output = dst_discard_sk; +	dst->error = 0; +	dst->obsolete = initial_obsolete; +	dst->header_len = 0; +	dst->trailer_len = 0; +#ifdef CONFIG_IP_ROUTE_CLASSID +	dst->tclassid = 0;  #endif -	dst_entries_add(ops, 1); +	atomic_set(&dst->__refcnt, initial_ref); +	dst->__use = 0; +	dst->lastuse = jiffies; +	dst->flags = flags; +	dst->pending_confirm = 0; +	dst->next = NULL; +	if (!(flags & DST_NOCOUNT)) +		dst_entries_add(ops, 1);  	return dst;  }  EXPORT_SYMBOL(dst_alloc); @@ -193,9 +209,11 @@ static void ___dst_free(struct dst_entry *dst)  	/* The first case (dev==NULL) is required, when  	   protocol module is unloaded.  	 */ -	if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) -		dst->input = dst->output = dst_discard; -	dst->obsolete = 2; +	if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { +		dst->input = dst_discard; +		dst->output = dst_discard_sk; +	} +	dst->obsolete = DST_OBSOLETE_DEAD;  }  void __dst_free(struct dst_entry *dst) @@ -207,8 +225,8 @@ void __dst_free(struct dst_entry *dst)  	if (dst_garbage.timer_inc > DST_GC_INC) {  		dst_garbage.timer_inc = DST_GC_INC;  		dst_garbage.timer_expires = DST_GC_MIN; -		cancel_delayed_work(&dst_gc_work); -		schedule_delayed_work(&dst_gc_work, dst_garbage.timer_expires); +		mod_delayed_work(system_wq, &dst_gc_work, +				 dst_garbage.timer_expires);  	}  	spin_unlock_bh(&dst_garbage.lock);  } @@ -217,34 +235,19 @@ EXPORT_SYMBOL(__dst_free);  struct dst_entry *dst_destroy(struct dst_entry * dst)  {  	struct dst_entry *child; -	struct neighbour *neigh; -	struct hh_cache *hh;  	smp_rmb();  again: -	neigh = dst->neighbour; -	hh = dst->hh;  	child = dst->child; -	dst->hh = NULL; -	if (hh) -		hh_cache_put(hh); - -	if (neigh) { -		dst->neighbour = NULL; -		neigh_release(neigh); -	} - -	dst_entries_add(dst->ops, -1); +	if (!(dst->flags & DST_NOCOUNT)) +		dst_entries_add(dst->ops, -1);  	if (dst->ops->destroy)  		dst->ops->destroy(dst);  	if (dst->dev)  		dev_put(dst->dev); -#if RT_CACHE_DEBUG >= 2 -	atomic_dec(&dst_total); -#endif  	kmem_cache_free(dst->ops->kmem_cachep, dst);  	dst = child; @@ -266,6 +269,15 @@ again:  }  EXPORT_SYMBOL(dst_destroy); +static void dst_destroy_rcu(struct rcu_head *head) +{ +	struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + +	dst = dst_destroy(dst); +	if (dst) +		__dst_free(dst); +} +  void dst_release(struct dst_entry *dst)  {  	if (dst) { @@ -273,37 +285,71 @@ void dst_release(struct dst_entry *dst)  		newrefcnt = atomic_dec_return(&dst->__refcnt);  		WARN_ON(newrefcnt < 0); -		if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { -			dst = dst_destroy(dst); -			if (dst) -				__dst_free(dst); -		} +		if (unlikely(dst->flags & 
DST_NOCACHE) && !newrefcnt) +			call_rcu(&dst->rcu_head, dst_destroy_rcu);  	}  }  EXPORT_SYMBOL(dst_release); +u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) +{ +	u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); + +	if (p) { +		u32 *old_p = __DST_METRICS_PTR(old); +		unsigned long prev, new; + +		memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + +		new = (unsigned long) p; +		prev = cmpxchg(&dst->_metrics, old, new); + +		if (prev != old) { +			kfree(p); +			p = __DST_METRICS_PTR(prev); +			if (prev & DST_METRICS_READ_ONLY) +				p = NULL; +		} +	} +	return p; +} +EXPORT_SYMBOL(dst_cow_metrics_generic); + +/* Caller asserts that dst_metrics_read_only(dst) is false.  */ +void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) +{ +	unsigned long prev, new; + +	new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY; +	prev = cmpxchg(&dst->_metrics, old, new); +	if (prev == old) +		kfree(__DST_METRICS_PTR(old)); +} +EXPORT_SYMBOL(__dst_destroy_metrics_generic); +  /** - * skb_dst_set_noref - sets skb dst, without a reference + * __skb_dst_set_noref - sets skb dst, without a reference   * @skb: buffer   * @dst: dst entry + * @force: if force is set, use noref version even for DST_NOCACHE entries   *   * Sets skb dst, assuming a reference was not taken on dst   * skb_dst_drop() should not dst_release() this dst   */ -void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) +void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force)  {  	WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());  	/* If dst not in cache, we must take a reference, because  	 * dst_release() will destroy dst as soon as its refcount becomes zero  	 */ -	if (unlikely(dst->flags & DST_NOCACHE)) { +	if (unlikely((dst->flags & DST_NOCACHE) && !force)) {  		dst_hold(dst);  		skb_dst_set(skb, dst);  	} else {  		skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;  	}  } -EXPORT_SYMBOL(skb_dst_set_noref); +EXPORT_SYMBOL(__skb_dst_set_noref);  /* Dirty hack. We did it in 2.2 (in __dst_free),   * we have _very_ good reasons not to repeat @@ -323,27 +369,23 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,  		return;  	if (!unregister) { -		dst->input = dst->output = dst_discard; +		dst->input = dst_discard; +		dst->output = dst_discard_sk;  	} else {  		dst->dev = dev_net(dst->dev)->loopback_dev;  		dev_hold(dst->dev);  		dev_put(dev); -		if (dst->neighbour && dst->neighbour->dev == dev) { -			dst->neighbour->dev = dst->dev; -			dev_hold(dst->dev); -			dev_put(dev); -		}  	}  }  static int dst_dev_event(struct notifier_block *this, unsigned long event,  			 void *ptr)  { -	struct net_device *dev = ptr; +	struct net_device *dev = netdev_notifier_info_to_dev(ptr);  	struct dst_entry *dst, *last = NULL;  	switch (event) { -	case NETDEV_UNREGISTER: +	case NETDEV_UNREGISTER_FINAL:  	case NETDEV_DOWN:  		mutex_lock(&dst_gc_mutex);  		for (dst = dst_busy_list; dst; dst = dst->next) { diff --git a/net/core/ethtool.c b/net/core/ethtool.c index d5bc2881888..17cb912793f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -17,10 +17,14 @@  #include <linux/errno.h>  #include <linux/ethtool.h>  #include <linux/netdevice.h> +#include <linux/net_tstamp.h> +#include <linux/phy.h>  #include <linux/bitops.h>  #include <linux/uaccess.h>  #include <linux/vmalloc.h>  #include <linux/slab.h> +#include <linux/rtnetlink.h> +#include <linux/sched.h>  /*   * Some useful ethtool_ops methods that're device independent. 
@@ -34,152 +38,315 @@ u32 ethtool_op_get_link(struct net_device *dev)  }  EXPORT_SYMBOL(ethtool_op_get_link); -u32 ethtool_op_get_rx_csum(struct net_device *dev) +int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info)  { -	return (dev->features & NETIF_F_ALL_CSUM) != 0; +	info->so_timestamping = +		SOF_TIMESTAMPING_TX_SOFTWARE | +		SOF_TIMESTAMPING_RX_SOFTWARE | +		SOF_TIMESTAMPING_SOFTWARE; +	info->phc_index = -1; +	return 0;  } -EXPORT_SYMBOL(ethtool_op_get_rx_csum); +EXPORT_SYMBOL(ethtool_op_get_ts_info); -u32 ethtool_op_get_tx_csum(struct net_device *dev) -{ -	return (dev->features & NETIF_F_ALL_CSUM) != 0; -} -EXPORT_SYMBOL(ethtool_op_get_tx_csum); +/* Handlers for each ethtool command */ -int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) +#define ETHTOOL_DEV_FEATURE_WORDS	((NETDEV_FEATURE_COUNT + 31) / 32) + +static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { +	[NETIF_F_SG_BIT] =               "tx-scatter-gather", +	[NETIF_F_IP_CSUM_BIT] =          "tx-checksum-ipv4", +	[NETIF_F_HW_CSUM_BIT] =          "tx-checksum-ip-generic", +	[NETIF_F_IPV6_CSUM_BIT] =        "tx-checksum-ipv6", +	[NETIF_F_HIGHDMA_BIT] =          "highdma", +	[NETIF_F_FRAGLIST_BIT] =         "tx-scatter-gather-fraglist", +	[NETIF_F_HW_VLAN_CTAG_TX_BIT] =  "tx-vlan-hw-insert", + +	[NETIF_F_HW_VLAN_CTAG_RX_BIT] =  "rx-vlan-hw-parse", +	[NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter", +	[NETIF_F_HW_VLAN_STAG_TX_BIT] =  "tx-vlan-stag-hw-insert", +	[NETIF_F_HW_VLAN_STAG_RX_BIT] =  "rx-vlan-stag-hw-parse", +	[NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter", +	[NETIF_F_VLAN_CHALLENGED_BIT] =  "vlan-challenged", +	[NETIF_F_GSO_BIT] =              "tx-generic-segmentation", +	[NETIF_F_LLTX_BIT] =             "tx-lockless", +	[NETIF_F_NETNS_LOCAL_BIT] =      "netns-local", +	[NETIF_F_GRO_BIT] =              "rx-gro", +	[NETIF_F_LRO_BIT] =              "rx-lro", + +	[NETIF_F_TSO_BIT] =              "tx-tcp-segmentation", +	[NETIF_F_UFO_BIT] =              "tx-udp-fragmentation", +	[NETIF_F_GSO_ROBUST_BIT] =       "tx-gso-robust", +	[NETIF_F_TSO_ECN_BIT] =          "tx-tcp-ecn-segmentation", +	[NETIF_F_TSO6_BIT] =             "tx-tcp6-segmentation", +	[NETIF_F_FSO_BIT] =              "tx-fcoe-segmentation", +	[NETIF_F_GSO_GRE_BIT] =		 "tx-gre-segmentation", +	[NETIF_F_GSO_IPIP_BIT] =	 "tx-ipip-segmentation", +	[NETIF_F_GSO_SIT_BIT] =		 "tx-sit-segmentation", +	[NETIF_F_GSO_UDP_TUNNEL_BIT] =	 "tx-udp_tnl-segmentation", +	[NETIF_F_GSO_MPLS_BIT] =	 "tx-mpls-segmentation", + +	[NETIF_F_FCOE_CRC_BIT] =         "tx-checksum-fcoe-crc", +	[NETIF_F_SCTP_CSUM_BIT] =        "tx-checksum-sctp", +	[NETIF_F_FCOE_MTU_BIT] =         "fcoe-mtu", +	[NETIF_F_NTUPLE_BIT] =           "rx-ntuple-filter", +	[NETIF_F_RXHASH_BIT] =           "rx-hashing", +	[NETIF_F_RXCSUM_BIT] =           "rx-checksum", +	[NETIF_F_NOCACHE_COPY_BIT] =     "tx-nocache-copy", +	[NETIF_F_LOOPBACK_BIT] =         "loopback", +	[NETIF_F_RXFCS_BIT] =            "rx-fcs", +	[NETIF_F_RXALL_BIT] =            "rx-all", +	[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", +	[NETIF_F_BUSY_POLL_BIT] =        "busy-poll", +}; + +static int ethtool_get_features(struct net_device *dev, void __user *useraddr)  { -	if (data) -		dev->features |= NETIF_F_IP_CSUM; -	else -		dev->features &= ~NETIF_F_IP_CSUM; +	struct ethtool_gfeatures cmd = { +		.cmd = ETHTOOL_GFEATURES, +		.size = ETHTOOL_DEV_FEATURE_WORDS, +	}; +	struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; +	u32 __user 
*sizeaddr; +	u32 copy_size; +	int i; + +	/* in case feature bits run out again */ +	BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t)); + +	for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { +		features[i].available = (u32)(dev->hw_features >> (32 * i)); +		features[i].requested = (u32)(dev->wanted_features >> (32 * i)); +		features[i].active = (u32)(dev->features >> (32 * i)); +		features[i].never_changed = +			(u32)(NETIF_F_NEVER_CHANGE >> (32 * i)); +	} -	return 0; -} +	sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size); +	if (get_user(copy_size, sizeaddr)) +		return -EFAULT; -int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data) -{ -	if (data) -		dev->features |= NETIF_F_HW_CSUM; -	else -		dev->features &= ~NETIF_F_HW_CSUM; +	if (copy_size > ETHTOOL_DEV_FEATURE_WORDS) +		copy_size = ETHTOOL_DEV_FEATURE_WORDS; + +	if (copy_to_user(useraddr, &cmd, sizeof(cmd))) +		return -EFAULT; +	useraddr += sizeof(cmd); +	if (copy_to_user(useraddr, features, copy_size * sizeof(*features))) +		return -EFAULT;  	return 0;  } -EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum); -int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data) +static int ethtool_set_features(struct net_device *dev, void __user *useraddr)  { -	if (data) -		dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; -	else -		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); +	struct ethtool_sfeatures cmd; +	struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; +	netdev_features_t wanted = 0, valid = 0; +	int i, ret = 0; -	return 0; +	if (copy_from_user(&cmd, useraddr, sizeof(cmd))) +		return -EFAULT; +	useraddr += sizeof(cmd); + +	if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS) +		return -EINVAL; + +	if (copy_from_user(features, useraddr, sizeof(features))) +		return -EFAULT; + +	for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { +		valid |= (netdev_features_t)features[i].valid << (32 * i); +		wanted |= (netdev_features_t)features[i].requested << (32 * i); +	} + +	if (valid & ~NETIF_F_ETHTOOL_BITS) +		return -EINVAL; + +	if (valid & ~dev->hw_features) { +		valid &= dev->hw_features; +		ret |= ETHTOOL_F_UNSUPPORTED; +	} + +	dev->wanted_features &= ~valid; +	dev->wanted_features |= wanted & valid; +	__netdev_update_features(dev); + +	if ((dev->wanted_features ^ dev->features) & valid) +		ret |= ETHTOOL_F_WISH; + +	return ret;  } -EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum); -u32 ethtool_op_get_sg(struct net_device *dev) +static int __ethtool_get_sset_count(struct net_device *dev, int sset)  { -	return (dev->features & NETIF_F_SG) != 0; +	const struct ethtool_ops *ops = dev->ethtool_ops; + +	if (sset == ETH_SS_FEATURES) +		return ARRAY_SIZE(netdev_features_strings); + +	if (ops->get_sset_count && ops->get_strings) +		return ops->get_sset_count(dev, sset); +	else +		return -EOPNOTSUPP;  } -EXPORT_SYMBOL(ethtool_op_get_sg); -int ethtool_op_set_sg(struct net_device *dev, u32 data) +static void __ethtool_get_strings(struct net_device *dev, +	u32 stringset, u8 *data)  { -	if (data) -		dev->features |= NETIF_F_SG; -	else -		dev->features &= ~NETIF_F_SG; +	const struct ethtool_ops *ops = dev->ethtool_ops; -	return 0; +	if (stringset == ETH_SS_FEATURES) +		memcpy(data, netdev_features_strings, +			sizeof(netdev_features_strings)); +	else +		/* ops->get_strings is valid because checked earlier */ +		ops->get_strings(dev, stringset, data);  } -EXPORT_SYMBOL(ethtool_op_set_sg); -u32 ethtool_op_get_tso(struct net_device *dev) +static netdev_features_t ethtool_get_feature_mask(u32 
eth_cmd)  { -	return (dev->features & NETIF_F_TSO) != 0; +	/* feature masks of legacy discrete ethtool ops */ + +	switch (eth_cmd) { +	case ETHTOOL_GTXCSUM: +	case ETHTOOL_STXCSUM: +		return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM; +	case ETHTOOL_GRXCSUM: +	case ETHTOOL_SRXCSUM: +		return NETIF_F_RXCSUM; +	case ETHTOOL_GSG: +	case ETHTOOL_SSG: +		return NETIF_F_SG; +	case ETHTOOL_GTSO: +	case ETHTOOL_STSO: +		return NETIF_F_ALL_TSO; +	case ETHTOOL_GUFO: +	case ETHTOOL_SUFO: +		return NETIF_F_UFO; +	case ETHTOOL_GGSO: +	case ETHTOOL_SGSO: +		return NETIF_F_GSO; +	case ETHTOOL_GGRO: +	case ETHTOOL_SGRO: +		return NETIF_F_GRO; +	default: +		BUG(); +	}  } -EXPORT_SYMBOL(ethtool_op_get_tso); -int ethtool_op_set_tso(struct net_device *dev, u32 data) +static int ethtool_get_one_feature(struct net_device *dev, +	char __user *useraddr, u32 ethcmd)  { -	if (data) -		dev->features |= NETIF_F_TSO; -	else -		dev->features &= ~NETIF_F_TSO; +	netdev_features_t mask = ethtool_get_feature_mask(ethcmd); +	struct ethtool_value edata = { +		.cmd = ethcmd, +		.data = !!(dev->features & mask), +	}; +	if (copy_to_user(useraddr, &edata, sizeof(edata))) +		return -EFAULT;  	return 0;  } -EXPORT_SYMBOL(ethtool_op_set_tso); -u32 ethtool_op_get_ufo(struct net_device *dev) +static int ethtool_set_one_feature(struct net_device *dev, +	void __user *useraddr, u32 ethcmd)  { -	return (dev->features & NETIF_F_UFO) != 0; -} -EXPORT_SYMBOL(ethtool_op_get_ufo); +	struct ethtool_value edata; +	netdev_features_t mask; -int ethtool_op_set_ufo(struct net_device *dev, u32 data) -{ -	if (data) -		dev->features |= NETIF_F_UFO; +	if (copy_from_user(&edata, useraddr, sizeof(edata))) +		return -EFAULT; + +	mask = ethtool_get_feature_mask(ethcmd); +	mask &= dev->hw_features; +	if (!mask) +		return -EOPNOTSUPP; + +	if (edata.data) +		dev->wanted_features |= mask;  	else -		dev->features &= ~NETIF_F_UFO; +		dev->wanted_features &= ~mask; + +	__netdev_update_features(dev); +  	return 0;  } -EXPORT_SYMBOL(ethtool_op_set_ufo); -/* the following list of flags are the same as their associated - * NETIF_F_xxx values in include/linux/netdevice.h - */ -static const u32 flags_dup_features = -	(ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE | -	 ETH_FLAG_RXHASH); +#define ETH_ALL_FLAGS    (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ +			  ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) +#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \ +			  NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \ +			  NETIF_F_RXHASH) -u32 ethtool_op_get_flags(struct net_device *dev) +static u32 __ethtool_get_flags(struct net_device *dev)  { -	/* in the future, this function will probably contain additional -	 * handling for flags which are not so easily handled -	 * by a simple masking operation -	 */ - -	return dev->features & flags_dup_features; +	u32 flags = 0; + +	if (dev->features & NETIF_F_LRO) +		flags |= ETH_FLAG_LRO; +	if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) +		flags |= ETH_FLAG_RXVLAN; +	if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) +		flags |= ETH_FLAG_TXVLAN; +	if (dev->features & NETIF_F_NTUPLE) +		flags |= ETH_FLAG_NTUPLE; +	if (dev->features & NETIF_F_RXHASH) +		flags |= ETH_FLAG_RXHASH; + +	return flags;  } -EXPORT_SYMBOL(ethtool_op_get_flags); -int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported) +static int __ethtool_set_flags(struct net_device *dev, u32 data)  { -	if (data & ~supported) +	netdev_features_t features = 0, changed; + +	if (data & ~ETH_ALL_FLAGS)  		return -EINVAL; -	dev->features = 
((dev->features & ~flags_dup_features) | -			 (data & flags_dup_features)); +	if (data & ETH_FLAG_LRO) +		features |= NETIF_F_LRO; +	if (data & ETH_FLAG_RXVLAN) +		features |= NETIF_F_HW_VLAN_CTAG_RX; +	if (data & ETH_FLAG_TXVLAN) +		features |= NETIF_F_HW_VLAN_CTAG_TX; +	if (data & ETH_FLAG_NTUPLE) +		features |= NETIF_F_NTUPLE; +	if (data & ETH_FLAG_RXHASH) +		features |= NETIF_F_RXHASH; + +	/* allow changing only bits set in hw_features */ +	changed = (features ^ dev->features) & ETH_ALL_FEATURES; +	if (changed & ~dev->hw_features) +		return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP; + +	dev->wanted_features = +		(dev->wanted_features & ~changed) | (features & changed); + +	__netdev_update_features(dev); +  	return 0;  } -EXPORT_SYMBOL(ethtool_op_set_flags); -void ethtool_ntuple_flush(struct net_device *dev) +int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)  { -	struct ethtool_rx_ntuple_flow_spec_container *fsc, *f; +	ASSERT_RTNL(); -	list_for_each_entry_safe(fsc, f, &dev->ethtool_ntuple_list.list, list) { -		list_del(&fsc->list); -		kfree(fsc); -	} -	dev->ethtool_ntuple_list.count = 0; -} -EXPORT_SYMBOL(ethtool_ntuple_flush); +	if (!dev->ethtool_ops->get_settings) +		return -EOPNOTSUPP; -/* Handlers for each ethtool command */ +	memset(cmd, 0, sizeof(struct ethtool_cmd)); +	cmd->cmd = ETHTOOL_GSET; +	return dev->ethtool_ops->get_settings(dev, cmd); +} +EXPORT_SYMBOL(__ethtool_get_settings);  static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)  { -	struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET };  	int err; +	struct ethtool_cmd cmd; -	if (!dev->ethtool_ops->get_settings) -		return -EOPNOTSUPP; - -	err = dev->ethtool_ops->get_settings(dev, &cmd); +	err = __ethtool_get_settings(dev, &cmd);  	if (err < 0)  		return err; @@ -209,7 +376,7 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,  	memset(&info, 0, sizeof(info));  	info.cmd = ETHTOOL_GDRVINFO; -	if (ops && ops->get_drvinfo) { +	if (ops->get_drvinfo) {  		ops->get_drvinfo(dev, &info);  	} else if (dev->dev.parent && dev->dev.parent->driver) {  		strlcpy(info.bus_info, dev_name(dev->dev.parent), @@ -224,7 +391,7 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,  	 * this method of obtaining string set info is deprecated;  	 * Use ETHTOOL_GSSET_INFO instead.  	 
*/ -	if (ops && ops->get_sset_count) { +	if (ops->get_sset_count) {  		int rc;  		rc = ops->get_sset_count(dev, ETH_SS_TEST); @@ -237,9 +404,9 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,  		if (rc >= 0)  			info.n_priv_flags = rc;  	} -	if (ops && ops->get_regs_len) +	if (ops->get_regs_len)  		info.regdump_len = ops->get_regs_len(dev); -	if (ops && ops->get_eeprom_len) +	if (ops->get_eeprom_len)  		info.eedump_len = ops->get_eeprom_len(dev);  	if (copy_to_user(useraddr, &info, sizeof(info))) @@ -251,14 +418,10 @@ static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,  						    void __user *useraddr)  {  	struct ethtool_sset_info info; -	const struct ethtool_ops *ops = dev->ethtool_ops;  	u64 sset_mask;  	int i, idx = 0, n_bits = 0, ret, rc;  	u32 *info_buf = NULL; -	if (!ops->get_sset_count) -		return -EOPNOTSUPP; -  	if (copy_from_user(&info, useraddr, sizeof(info)))  		return -EFAULT; @@ -285,7 +448,7 @@ static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,  		if (!(sset_mask & (1ULL << i)))  			continue; -		rc = ops->get_sset_count(dev, i); +		rc = __ethtool_get_sset_count(dev, i);  		if (rc >= 0) {  			info.sset_mask |= (1ULL << i);  			info_buf[idx++] = rc; @@ -312,6 +475,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,  {  	struct ethtool_rxnfc info;  	size_t info_size = sizeof(info); +	int rc;  	if (!dev->ethtool_ops->set_rxnfc)  		return -EOPNOTSUPP; @@ -327,7 +491,15 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,  	if (copy_from_user(&info, useraddr, info_size))  		return -EFAULT; -	return dev->ethtool_ops->set_rxnfc(dev, &info); +	rc = dev->ethtool_ops->set_rxnfc(dev, &info); +	if (rc) +		return rc; + +	if (cmd == ETHTOOL_SRXCLSRLINS && +	    copy_to_user(useraddr, &info, info_size)) +		return -EFAULT; + +	return 0;  }  static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, @@ -385,37 +557,64 @@ err_out:  	return ret;  } +static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, +					struct ethtool_rxnfc *rx_rings, +					u32 size) +{ +	int i; + +	if (copy_from_user(indir, useraddr, size * sizeof(indir[0]))) +		return -EFAULT; + +	/* Validate ring indices */ +	for (i = 0; i < size; i++) +		if (indir[i] >= rx_rings->data) +			return -EINVAL; + +	return 0; +} +  static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,  						     void __user *useraddr)  { -	struct ethtool_rxfh_indir *indir; -	u32 table_size; -	size_t full_size; +	u32 user_size, dev_size; +	u32 *indir;  	int ret; -	if (!dev->ethtool_ops->get_rxfh_indir) +	if (!dev->ethtool_ops->get_rxfh_indir_size || +	    !dev->ethtool_ops->get_rxfh) +		return -EOPNOTSUPP; +	dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); +	if (dev_size == 0)  		return -EOPNOTSUPP; -	if (copy_from_user(&table_size, +	if (copy_from_user(&user_size,  			   useraddr + offsetof(struct ethtool_rxfh_indir, size), -			   sizeof(table_size))) +			   sizeof(user_size)))  		return -EFAULT; -	if (table_size > -	    (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index)) -		return -ENOMEM; -	full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size; -	indir = kzalloc(full_size, GFP_USER); +	if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size), +			 &dev_size, sizeof(dev_size))) +		return -EFAULT; + +	/* If the user buffer size is 0, this is just a query for the +	 * device table size.  
Otherwise, if it's smaller than the +	 * device table size it's an error. +	 */ +	if (user_size < dev_size) +		return user_size == 0 ? 0 : -EINVAL; + +	indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);  	if (!indir)  		return -ENOMEM; -	indir->cmd = ETHTOOL_GRXFHINDIR; -	indir->size = table_size; -	ret = dev->ethtool_ops->get_rxfh_indir(dev, indir); +	ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL);  	if (ret)  		goto out; -	if (copy_to_user(useraddr, indir, full_size)) +	if (copy_to_user(useraddr + +			 offsetof(struct ethtool_rxfh_indir, ring_index[0]), +			 indir, dev_size * sizeof(indir[0])))  		ret = -EFAULT;  out: @@ -426,377 +625,220 @@ out:  static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,  						     void __user *useraddr)  { -	struct ethtool_rxfh_indir *indir; -	u32 table_size; -	size_t full_size; +	struct ethtool_rxnfc rx_rings; +	u32 user_size, dev_size, i; +	u32 *indir; +	const struct ethtool_ops *ops = dev->ethtool_ops;  	int ret; +	u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]); + +	if (!ops->get_rxfh_indir_size || !ops->set_rxfh || +	    !ops->get_rxnfc) +		return -EOPNOTSUPP; -	if (!dev->ethtool_ops->set_rxfh_indir) +	dev_size = ops->get_rxfh_indir_size(dev); +	if (dev_size == 0)  		return -EOPNOTSUPP; -	if (copy_from_user(&table_size, +	if (copy_from_user(&user_size,  			   useraddr + offsetof(struct ethtool_rxfh_indir, size), -			   sizeof(table_size))) +			   sizeof(user_size)))  		return -EFAULT; -	if (table_size > -	    (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index)) -		return -ENOMEM; -	full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size; -	indir = kmalloc(full_size, GFP_USER); +	if (user_size != 0 && user_size != dev_size) +		return -EINVAL; + +	indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);  	if (!indir)  		return -ENOMEM; -	if (copy_from_user(indir, useraddr, full_size)) { -		ret = -EFAULT; +	rx_rings.cmd = ETHTOOL_GRXRINGS; +	ret = ops->get_rxnfc(dev, &rx_rings, NULL); +	if (ret)  		goto out; + +	if (user_size == 0) { +		for (i = 0; i < dev_size; i++) +			indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); +	} else { +		ret = ethtool_copy_validate_indir(indir, +						  useraddr + ringidx_offset, +						  &rx_rings, +						  dev_size); +		if (ret) +			goto out;  	} -	ret = dev->ethtool_ops->set_rxfh_indir(dev, indir); +	ret = ops->set_rxfh(dev, indir, NULL);  out:  	kfree(indir);  	return ret;  } -static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list, -			struct ethtool_rx_ntuple_flow_spec *spec, -			struct ethtool_rx_ntuple_flow_spec_container *fsc) +static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, +					       void __user *useraddr)  { +	int ret; +	const struct ethtool_ops *ops = dev->ethtool_ops; +	u32 user_indir_size, user_key_size; +	u32 dev_indir_size = 0, dev_key_size = 0; +	struct ethtool_rxfh rxfh; +	u32 total_size; +	u32 indir_bytes; +	u32 *indir = NULL; +	u8 *hkey = NULL; +	u8 *rss_config; + +	if (!(dev->ethtool_ops->get_rxfh_indir_size || +	      dev->ethtool_ops->get_rxfh_key_size) || +	      !dev->ethtool_ops->get_rxfh) +		return -EOPNOTSUPP; -	/* don't add filters forever */ -	if (list->count >= ETHTOOL_MAX_NTUPLE_LIST_ENTRY) { -		/* free the container */ -		kfree(fsc); -		return; -	} +	if (ops->get_rxfh_indir_size) +		dev_indir_size = ops->get_rxfh_indir_size(dev); +	if (ops->get_rxfh_key_size) +		dev_key_size = ops->get_rxfh_key_size(dev); -	/* Copy the whole filter over */ -	fsc->fs.flow_type = 
spec->flow_type; -	memcpy(&fsc->fs.h_u, &spec->h_u, sizeof(spec->h_u)); -	memcpy(&fsc->fs.m_u, &spec->m_u, sizeof(spec->m_u)); +	if ((dev_key_size + dev_indir_size) == 0) +		return -EOPNOTSUPP; -	fsc->fs.vlan_tag = spec->vlan_tag; -	fsc->fs.vlan_tag_mask = spec->vlan_tag_mask; -	fsc->fs.data = spec->data; -	fsc->fs.data_mask = spec->data_mask; -	fsc->fs.action = spec->action; +	if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) +		return -EFAULT; +	user_indir_size = rxfh.indir_size; +	user_key_size = rxfh.key_size; -	/* add to the list */ -	list_add_tail_rcu(&fsc->list, &list->list); -	list->count++; -} +	/* Check that reserved fields are 0 for now */ +	if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) +		return -EINVAL; -/* - * ethtool does not (or did not) set masks for flow parameters that are - * not specified, so if both value and mask are 0 then this must be - * treated as equivalent to a mask with all bits set.  Implement that - * here rather than in drivers. - */ -static void rx_ntuple_fix_masks(struct ethtool_rx_ntuple_flow_spec *fs) -{ -	struct ethtool_tcpip4_spec *entry = &fs->h_u.tcp_ip4_spec; -	struct ethtool_tcpip4_spec *mask = &fs->m_u.tcp_ip4_spec; - -	if (fs->flow_type != TCP_V4_FLOW && -	    fs->flow_type != UDP_V4_FLOW && -	    fs->flow_type != SCTP_V4_FLOW) -		return; - -	if (!(entry->ip4src | mask->ip4src)) -		mask->ip4src = htonl(0xffffffff); -	if (!(entry->ip4dst | mask->ip4dst)) -		mask->ip4dst = htonl(0xffffffff); -	if (!(entry->psrc | mask->psrc)) -		mask->psrc = htons(0xffff); -	if (!(entry->pdst | mask->pdst)) -		mask->pdst = htons(0xffff); -	if (!(entry->tos | mask->tos)) -		mask->tos = 0xff; -	if (!(fs->vlan_tag | fs->vlan_tag_mask)) -		fs->vlan_tag_mask = 0xffff; -	if (!(fs->data | fs->data_mask)) -		fs->data_mask = 0xffffffffffffffffULL; -} +	rxfh.indir_size = dev_indir_size; +	rxfh.key_size = dev_key_size; +	if (copy_to_user(useraddr, &rxfh, sizeof(rxfh))) +		return -EFAULT; -static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev, -						    void __user *useraddr) -{ -	struct ethtool_rx_ntuple cmd; -	const struct ethtool_ops *ops = dev->ethtool_ops; -	struct ethtool_rx_ntuple_flow_spec_container *fsc = NULL; -	int ret; +	/* If the user buffer size is 0, this is just a query for the +	 * device table size and key size.  Otherwise, if the User size is +	 * not equal to device table size or key size it's an error. +	 */ +	if (!user_indir_size && !user_key_size) +		return 0; -	if (!(dev->features & NETIF_F_NTUPLE)) +	if ((user_indir_size && (user_indir_size != dev_indir_size)) || +	    (user_key_size && (user_key_size != dev_key_size)))  		return -EINVAL; -	if (copy_from_user(&cmd, useraddr, sizeof(cmd))) -		return -EFAULT; +	indir_bytes = user_indir_size * sizeof(indir[0]); +	total_size = indir_bytes + user_key_size; +	rss_config = kzalloc(total_size, GFP_USER); +	if (!rss_config) +		return -ENOMEM; -	rx_ntuple_fix_masks(&cmd.fs); +	if (user_indir_size) +		indir = (u32 *)rss_config; -	/* -	 * Cache filter in dev struct for GET operation only if -	 * the underlying driver doesn't have its own GET operation, and -	 * only if the filter was added successfully.  First make sure we -	 * can allocate the filter, then continue if successful. 
-	 */ -	if (!ops->get_rx_ntuple) { -		fsc = kmalloc(sizeof(*fsc), GFP_ATOMIC); -		if (!fsc) -			return -ENOMEM; -	} +	if (user_key_size) +		hkey = rss_config + indir_bytes; -	ret = ops->set_rx_ntuple(dev, &cmd); -	if (ret) { -		kfree(fsc); -		return ret; +	ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey); +	if (!ret) { +		if (copy_to_user(useraddr + +				 offsetof(struct ethtool_rxfh, rss_config[0]), +				 rss_config, total_size)) +			ret = -EFAULT;  	} -	if (!ops->get_rx_ntuple) -		__rx_ntuple_filter_add(&dev->ethtool_ntuple_list, &cmd.fs, fsc); +	kfree(rss_config);  	return ret;  } -static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, +					       void __user *useraddr)  { -	struct ethtool_gstrings gstrings; +	int ret;  	const struct ethtool_ops *ops = dev->ethtool_ops; -	struct ethtool_rx_ntuple_flow_spec_container *fsc; -	u8 *data; -	char *p; -	int ret, i, num_strings = 0; +	struct ethtool_rxnfc rx_rings; +	struct ethtool_rxfh rxfh; +	u32 dev_indir_size = 0, dev_key_size = 0, i; +	u32 *indir = NULL, indir_bytes = 0; +	u8 *hkey = NULL; +	u8 *rss_config; +	u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); + +	if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) || +	    !ops->get_rxnfc || !ops->set_rxfh) +		return -EOPNOTSUPP; -	if (!ops->get_sset_count) +	if (ops->get_rxfh_indir_size) +		dev_indir_size = ops->get_rxfh_indir_size(dev); +	if (ops->get_rxfh_key_size) +		dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev); +	if ((dev_key_size + dev_indir_size) == 0)  		return -EOPNOTSUPP; -	if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) +	if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))  		return -EFAULT; -	ret = ops->get_sset_count(dev, gstrings.string_set); -	if (ret < 0) -		return ret; +	/* Check that reserved fields are 0 for now */ +	if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) +		return -EINVAL; -	gstrings.len = ret; +	/* If either indir or hash key is valid, proceed further. +	 * It is not valid to request that both be unchanged. 
+	 */ +	if ((rxfh.indir_size && +	     rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && +	     rxfh.indir_size != dev_indir_size) || +	    (rxfh.key_size && (rxfh.key_size != dev_key_size)) || +	    (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && +	     rxfh.key_size == 0)) +		return -EINVAL; -	data = kzalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); -	if (!data) +	if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) +		indir_bytes = dev_indir_size * sizeof(indir[0]); + +	rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER); +	if (!rss_config)  		return -ENOMEM; -	if (ops->get_rx_ntuple) { -		/* driver-specific filter grab */ -		ret = ops->get_rx_ntuple(dev, gstrings.string_set, data); -		goto copy; -	} +	rx_rings.cmd = ETHTOOL_GRXRINGS; +	ret = ops->get_rxnfc(dev, &rx_rings, NULL); +	if (ret) +		goto out; -	/* default ethtool filter grab */ -	i = 0; -	p = (char *)data; -	list_for_each_entry(fsc, &dev->ethtool_ntuple_list.list, list) { -		sprintf(p, "Filter %d:\n", i); -		p += ETH_GSTRING_LEN; -		num_strings++; - -		switch (fsc->fs.flow_type) { -		case TCP_V4_FLOW: -			sprintf(p, "\tFlow Type: TCP\n"); -			p += ETH_GSTRING_LEN; -			num_strings++; -			break; -		case UDP_V4_FLOW: -			sprintf(p, "\tFlow Type: UDP\n"); -			p += ETH_GSTRING_LEN; -			num_strings++; -			break; -		case SCTP_V4_FLOW: -			sprintf(p, "\tFlow Type: SCTP\n"); -			p += ETH_GSTRING_LEN; -			num_strings++; -			break; -		case AH_ESP_V4_FLOW: -			sprintf(p, "\tFlow Type: AH ESP\n"); -			p += ETH_GSTRING_LEN; -			num_strings++; -			break; -		case ESP_V4_FLOW: -			sprintf(p, "\tFlow Type: ESP\n"); -			p += ETH_GSTRING_LEN; -			num_strings++; -			break; -		case IP_USER_FLOW: -			sprintf(p, "\tFlow Type: Raw IP\n"); -			p += ETH_GSTRING_LEN; -			num_strings++; -			break; -		case IPV4_FLOW: -			sprintf(p, "\tFlow Type: IPv4\n"); -			p += ETH_GSTRING_LEN; -			num_strings++; -			break; -		default: -			sprintf(p, "\tFlow Type: Unknown\n"); -			p += ETH_GSTRING_LEN; -			num_strings++; -			goto unknown_filter; -		} +	/* rxfh.indir_size == 0 means reset the indir table to default. +	 * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged. 
+	 */ +	if (rxfh.indir_size && +	    rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) { +		indir = (u32 *)rss_config; +		ret = ethtool_copy_validate_indir(indir, +						  useraddr + rss_cfg_offset, +						  &rx_rings, +						  rxfh.indir_size); +		if (ret) +			goto out; +	} else if (rxfh.indir_size == 0) { +		indir = (u32 *)rss_config; +		for (i = 0; i < dev_indir_size; i++) +			indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); +	} -		/* now the rest of the filters */ -		switch (fsc->fs.flow_type) { -		case TCP_V4_FLOW: -		case UDP_V4_FLOW: -		case SCTP_V4_FLOW: -			sprintf(p, "\tSrc IP addr: 0x%x\n", -				fsc->fs.h_u.tcp_ip4_spec.ip4src); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tSrc IP mask: 0x%x\n", -				fsc->fs.m_u.tcp_ip4_spec.ip4src); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tDest IP addr: 0x%x\n", -				fsc->fs.h_u.tcp_ip4_spec.ip4dst); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tDest IP mask: 0x%x\n", -				fsc->fs.m_u.tcp_ip4_spec.ip4dst); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tSrc Port: %d, mask: 0x%x\n", -				fsc->fs.h_u.tcp_ip4_spec.psrc, -				fsc->fs.m_u.tcp_ip4_spec.psrc); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tDest Port: %d, mask: 0x%x\n", -				fsc->fs.h_u.tcp_ip4_spec.pdst, -				fsc->fs.m_u.tcp_ip4_spec.pdst); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tTOS: %d, mask: 0x%x\n", -				fsc->fs.h_u.tcp_ip4_spec.tos, -				fsc->fs.m_u.tcp_ip4_spec.tos); -			p += ETH_GSTRING_LEN; -			num_strings++; -			break; -		case AH_ESP_V4_FLOW: -		case ESP_V4_FLOW: -			sprintf(p, "\tSrc IP addr: 0x%x\n", -				fsc->fs.h_u.ah_ip4_spec.ip4src); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tSrc IP mask: 0x%x\n", -				fsc->fs.m_u.ah_ip4_spec.ip4src); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tDest IP addr: 0x%x\n", -				fsc->fs.h_u.ah_ip4_spec.ip4dst); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tDest IP mask: 0x%x\n", -				fsc->fs.m_u.ah_ip4_spec.ip4dst); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tSPI: %d, mask: 0x%x\n", -				fsc->fs.h_u.ah_ip4_spec.spi, -				fsc->fs.m_u.ah_ip4_spec.spi); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tTOS: %d, mask: 0x%x\n", -				fsc->fs.h_u.ah_ip4_spec.tos, -				fsc->fs.m_u.ah_ip4_spec.tos); -			p += ETH_GSTRING_LEN; -			num_strings++; -			break; -		case IP_USER_FLOW: -			sprintf(p, "\tSrc IP addr: 0x%x\n", -				fsc->fs.h_u.usr_ip4_spec.ip4src); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tSrc IP mask: 0x%x\n", -				fsc->fs.m_u.usr_ip4_spec.ip4src); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tDest IP addr: 0x%x\n", -				fsc->fs.h_u.usr_ip4_spec.ip4dst); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tDest IP mask: 0x%x\n", -				fsc->fs.m_u.usr_ip4_spec.ip4dst); -			p += ETH_GSTRING_LEN; -			num_strings++; -			break; -		case IPV4_FLOW: -			sprintf(p, "\tSrc IP addr: 0x%x\n", -				fsc->fs.h_u.usr_ip4_spec.ip4src); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tSrc IP mask: 0x%x\n", -				fsc->fs.m_u.usr_ip4_spec.ip4src); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tDest IP addr: 0x%x\n", -				fsc->fs.h_u.usr_ip4_spec.ip4dst); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tDest IP mask: 0x%x\n", -				fsc->fs.m_u.usr_ip4_spec.ip4dst); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tL4 bytes: 0x%x, mask: 0x%x\n", -				
fsc->fs.h_u.usr_ip4_spec.l4_4_bytes, -				fsc->fs.m_u.usr_ip4_spec.l4_4_bytes); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tTOS: %d, mask: 0x%x\n", -				fsc->fs.h_u.usr_ip4_spec.tos, -				fsc->fs.m_u.usr_ip4_spec.tos); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tIP Version: %d, mask: 0x%x\n", -				fsc->fs.h_u.usr_ip4_spec.ip_ver, -				fsc->fs.m_u.usr_ip4_spec.ip_ver); -			p += ETH_GSTRING_LEN; -			num_strings++; -			sprintf(p, "\tProtocol: %d, mask: 0x%x\n", -				fsc->fs.h_u.usr_ip4_spec.proto, -				fsc->fs.m_u.usr_ip4_spec.proto); -			p += ETH_GSTRING_LEN; -			num_strings++; -			break; +	if (rxfh.key_size) { +		hkey = rss_config + indir_bytes; +		if (copy_from_user(hkey, +				   useraddr + rss_cfg_offset + indir_bytes, +				   rxfh.key_size)) { +			ret = -EFAULT; +			goto out;  		} -		sprintf(p, "\tVLAN: %d, mask: 0x%x\n", -			fsc->fs.vlan_tag, fsc->fs.vlan_tag_mask); -		p += ETH_GSTRING_LEN; -		num_strings++; -		sprintf(p, "\tUser-defined: 0x%Lx\n", fsc->fs.data); -		p += ETH_GSTRING_LEN; -		num_strings++; -		sprintf(p, "\tUser-defined mask: 0x%Lx\n", fsc->fs.data_mask); -		p += ETH_GSTRING_LEN; -		num_strings++; -		if (fsc->fs.action == ETHTOOL_RXNTUPLE_ACTION_DROP) -			sprintf(p, "\tAction: Drop\n"); -		else -			sprintf(p, "\tAction: Direct to queue %d\n", -				fsc->fs.action); -		p += ETH_GSTRING_LEN; -		num_strings++; -unknown_filter: -		i++;  	} -copy: -	/* indicate to userspace how many strings we actually have */ -	gstrings.len = num_strings; -	ret = -EFAULT; -	if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) -		goto out; -	useraddr += sizeof(gstrings); -	if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) -		goto out; -	ret = 0; + +	ret = ops->set_rxfh(dev, indir, hkey);  out: -	kfree(data); +	kfree(rss_config);  	return ret;  } @@ -817,8 +859,8 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)  	if (regs.len > reglen)  		regs.len = reglen; -	regbuf = vmalloc(reglen); -	if (!regbuf) +	regbuf = vzalloc(reglen); +	if (reglen && !regbuf)  		return -ENOMEM;  	ops->get_regs(dev, ®s, regbuf); @@ -827,7 +869,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)  	if (copy_to_user(useraddr, ®s, sizeof(regs)))  		goto out;  	useraddr += offsetof(struct ethtool_regs, data); -	if (copy_to_user(useraddr, regbuf, regs.len)) +	if (regbuf && copy_to_user(useraddr, regbuf, regs.len))  		goto out;  	ret = 0; @@ -883,6 +925,40 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr)  	return dev->ethtool_ops->set_wol(dev, &wol);  } +static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) +{ +	struct ethtool_eee edata; +	int rc; + +	if (!dev->ethtool_ops->get_eee) +		return -EOPNOTSUPP; + +	memset(&edata, 0, sizeof(struct ethtool_eee)); +	edata.cmd = ETHTOOL_GEEE; +	rc = dev->ethtool_ops->get_eee(dev, &edata); + +	if (rc) +		return rc; + +	if (copy_to_user(useraddr, &edata, sizeof(edata))) +		return -EFAULT; + +	return 0; +} + +static int ethtool_set_eee(struct net_device *dev, char __user *useraddr) +{ +	struct ethtool_eee edata; + +	if (!dev->ethtool_ops->set_eee) +		return -EOPNOTSUPP; + +	if (copy_from_user(&edata, useraddr, sizeof(edata))) +		return -EFAULT; + +	return dev->ethtool_ops->set_eee(dev, &edata); +} +  static int ethtool_nway_reset(struct net_device *dev)  {  	if (!dev->ethtool_ops->nway_reset) @@ -891,18 +967,31 @@ static int ethtool_nway_reset(struct net_device *dev)  	return dev->ethtool_ops->nway_reset(dev);  } -static int 
ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) +static int ethtool_get_link(struct net_device *dev, char __user *useraddr) +{ +	struct ethtool_value edata = { .cmd = ETHTOOL_GLINK }; + +	if (!dev->ethtool_ops->get_link) +		return -EOPNOTSUPP; + +	edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev); + +	if (copy_to_user(useraddr, &edata, sizeof(edata))) +		return -EFAULT; +	return 0; +} + +static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr, +				  int (*getter)(struct net_device *, +						struct ethtool_eeprom *, u8 *), +				  u32 total_len)  {  	struct ethtool_eeprom eeprom; -	const struct ethtool_ops *ops = dev->ethtool_ops;  	void __user *userbuf = useraddr + sizeof(eeprom);  	u32 bytes_remaining;  	u8 *data;  	int ret = 0; -	if (!ops->get_eeprom || !ops->get_eeprom_len) -		return -EOPNOTSUPP; -  	if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))  		return -EFAULT; @@ -911,7 +1000,7 @@ static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)  		return -EINVAL;  	/* Check for exceeding total eeprom len */ -	if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) +	if (eeprom.offset + eeprom.len > total_len)  		return -EINVAL;  	data = kmalloc(PAGE_SIZE, GFP_USER); @@ -922,7 +1011,7 @@ static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)  	while (bytes_remaining > 0) {  		eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); -		ret = ops->get_eeprom(dev, &eeprom, data); +		ret = getter(dev, &eeprom, data);  		if (ret)  			break;  		if (copy_to_user(userbuf, data, eeprom.len)) { @@ -943,6 +1032,17 @@ static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)  	return ret;  } +static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) +{ +	const struct ethtool_ops *ops = dev->ethtool_ops; + +	if (!ops->get_eeprom || !ops->get_eeprom_len) +		return -EOPNOTSUPP; + +	return ethtool_get_any_eeprom(dev, useraddr, ops->get_eeprom, +				      ops->get_eeprom_len(dev)); +} +  static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)  {  	struct ethtool_eeprom eeprom; @@ -1046,190 +1146,60 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)  	return dev->ethtool_ops->set_ringparam(dev, &ringparam);  } -static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr) +static noinline_for_stack int ethtool_get_channels(struct net_device *dev, +						   void __user *useraddr)  { -	struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM }; +	struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; -	if (!dev->ethtool_ops->get_pauseparam) +	if (!dev->ethtool_ops->get_channels)  		return -EOPNOTSUPP; -	dev->ethtool_ops->get_pauseparam(dev, &pauseparam); +	dev->ethtool_ops->get_channels(dev, &channels); -	if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam))) +	if (copy_to_user(useraddr, &channels, sizeof(channels)))  		return -EFAULT;  	return 0;  } -static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) -{ -	struct ethtool_pauseparam pauseparam; - -	if (!dev->ethtool_ops->set_pauseparam) -		return -EOPNOTSUPP; - -	if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam))) -		return -EFAULT; - -	return dev->ethtool_ops->set_pauseparam(dev, &pauseparam); -} - -static int __ethtool_set_sg(struct net_device *dev, u32 data) -{ -	int err; - -	if (!data && dev->ethtool_ops->set_tso) { -		err = dev->ethtool_ops->set_tso(dev, 0); -		if (err) -			return 
err; -	} - -	if (!data && dev->ethtool_ops->set_ufo) { -		err = dev->ethtool_ops->set_ufo(dev, 0); -		if (err) -			return err; -	} -	return dev->ethtool_ops->set_sg(dev, data); -} - -static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr) -{ -	struct ethtool_value edata; -	int err; - -	if (!dev->ethtool_ops->set_tx_csum) -		return -EOPNOTSUPP; - -	if (copy_from_user(&edata, useraddr, sizeof(edata))) -		return -EFAULT; - -	if (!edata.data && dev->ethtool_ops->set_sg) { -		err = __ethtool_set_sg(dev, 0); -		if (err) -			return err; -	} - -	return dev->ethtool_ops->set_tx_csum(dev, edata.data); -} -EXPORT_SYMBOL(ethtool_op_set_tx_csum); - -static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr) -{ -	struct ethtool_value edata; - -	if (!dev->ethtool_ops->set_rx_csum) -		return -EOPNOTSUPP; - -	if (copy_from_user(&edata, useraddr, sizeof(edata))) -		return -EFAULT; - -	if (!edata.data && dev->ethtool_ops->set_sg) -		dev->features &= ~NETIF_F_GRO; - -	return dev->ethtool_ops->set_rx_csum(dev, edata.data); -} - -static int ethtool_set_sg(struct net_device *dev, char __user *useraddr) -{ -	struct ethtool_value edata; - -	if (!dev->ethtool_ops->set_sg) -		return -EOPNOTSUPP; - -	if (copy_from_user(&edata, useraddr, sizeof(edata))) -		return -EFAULT; - -	if (edata.data && -	    !(dev->features & NETIF_F_ALL_CSUM)) -		return -EINVAL; - -	return __ethtool_set_sg(dev, edata.data); -} - -static int ethtool_set_tso(struct net_device *dev, char __user *useraddr) +static noinline_for_stack int ethtool_set_channels(struct net_device *dev, +						   void __user *useraddr)  { -	struct ethtool_value edata; +	struct ethtool_channels channels; -	if (!dev->ethtool_ops->set_tso) +	if (!dev->ethtool_ops->set_channels)  		return -EOPNOTSUPP; -	if (copy_from_user(&edata, useraddr, sizeof(edata))) +	if (copy_from_user(&channels, useraddr, sizeof(channels)))  		return -EFAULT; -	if (edata.data && !(dev->features & NETIF_F_SG)) -		return -EINVAL; - -	return dev->ethtool_ops->set_tso(dev, edata.data); +	return dev->ethtool_ops->set_channels(dev, &channels);  } -static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr) +static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)  { -	struct ethtool_value edata; +	struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM }; -	if (!dev->ethtool_ops->set_ufo) +	if (!dev->ethtool_ops->get_pauseparam)  		return -EOPNOTSUPP; -	if (copy_from_user(&edata, useraddr, sizeof(edata))) -		return -EFAULT; -	if (edata.data && !(dev->features & NETIF_F_SG)) -		return -EINVAL; -	if (edata.data && !((dev->features & NETIF_F_GEN_CSUM) || -		(dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) -			== (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) -		return -EINVAL; -	return dev->ethtool_ops->set_ufo(dev, edata.data); -} -static int ethtool_get_gso(struct net_device *dev, char __user *useraddr) -{ -	struct ethtool_value edata = { ETHTOOL_GGSO }; - -	edata.data = dev->features & NETIF_F_GSO; -	if (copy_to_user(useraddr, &edata, sizeof(edata))) -		return -EFAULT; -	return 0; -} - -static int ethtool_set_gso(struct net_device *dev, char __user *useraddr) -{ -	struct ethtool_value edata; +	dev->ethtool_ops->get_pauseparam(dev, &pauseparam); -	if (copy_from_user(&edata, useraddr, sizeof(edata))) +	if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam)))  		return -EFAULT; -	if (edata.data) -		dev->features |= NETIF_F_GSO; -	else -		dev->features &= ~NETIF_F_GSO;  	return 0;  } -static int ethtool_get_gro(struct 
net_device *dev, char __user *useraddr) +static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)  { -	struct ethtool_value edata = { ETHTOOL_GGRO }; - -	edata.data = dev->features & NETIF_F_GRO; -	if (copy_to_user(useraddr, &edata, sizeof(edata))) -		return -EFAULT; -	return 0; -} +	struct ethtool_pauseparam pauseparam; -static int ethtool_set_gro(struct net_device *dev, char __user *useraddr) -{ -	struct ethtool_value edata; +	if (!dev->ethtool_ops->set_pauseparam) +		return -EOPNOTSUPP; -	if (copy_from_user(&edata, useraddr, sizeof(edata))) +	if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam)))  		return -EFAULT; -	if (edata.data) { -		u32 rxcsum = dev->ethtool_ops->get_rx_csum ? -				dev->ethtool_ops->get_rx_csum(dev) : -				ethtool_op_get_rx_csum(dev); - -		if (!rxcsum) -			return -EINVAL; -		dev->features |= NETIF_F_GRO; -	} else -		dev->features &= ~NETIF_F_GRO; - -	return 0; +	return dev->ethtool_ops->set_pauseparam(dev, &pauseparam);  }  static int ethtool_self_test(struct net_device *dev, char __user *useraddr) @@ -1273,17 +1243,13 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr)  static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)  {  	struct ethtool_gstrings gstrings; -	const struct ethtool_ops *ops = dev->ethtool_ops;  	u8 *data;  	int ret; -	if (!ops->get_strings || !ops->get_sset_count) -		return -EOPNOTSUPP; -  	if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))  		return -EFAULT; -	ret = ops->get_sset_count(dev, gstrings.string_set); +	ret = __ethtool_get_sset_count(dev, gstrings.string_set);  	if (ret < 0)  		return ret; @@ -1293,7 +1259,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)  	if (!data)  		return -ENOMEM; -	ops->get_strings(dev, gstrings.string_set, data); +	__ethtool_get_strings(dev, gstrings.string_set, data);  	ret = -EFAULT;  	if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) @@ -1303,7 +1269,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)  		goto out;  	ret = 0; - out: +out:  	kfree(data);  	return ret;  } @@ -1311,14 +1277,61 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)  static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)  {  	struct ethtool_value id; +	static bool busy; +	const struct ethtool_ops *ops = dev->ethtool_ops; +	int rc; -	if (!dev->ethtool_ops->phys_id) +	if (!ops->set_phys_id)  		return -EOPNOTSUPP; +	if (busy) +		return -EBUSY; +  	if (copy_from_user(&id, useraddr, sizeof(id)))  		return -EFAULT; -	return dev->ethtool_ops->phys_id(dev, id.data); +	rc = ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE); +	if (rc < 0) +		return rc; + +	/* Drop the RTNL lock while waiting, but prevent reentry or +	 * removal of the device. +	 */ +	busy = true; +	dev_hold(dev); +	rtnl_unlock(); + +	if (rc == 0) { +		/* Driver will handle this itself */ +		schedule_timeout_interruptible( +			id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT); +	} else { +		/* Driver expects to be called at twice the frequency in rc */ +		int n = rc * 2, i, interval = HZ / n; + +		/* Count down seconds */ +		do { +			/* Count down iterations per second */ +			i = n; +			do { +				rtnl_lock(); +				rc = ops->set_phys_id(dev, +				    (i & 1) ? 
ETHTOOL_ID_OFF : ETHTOOL_ID_ON); +				rtnl_unlock(); +				if (rc) +					break; +				schedule_timeout_interruptible(interval); +			} while (!signal_pending(current) && --i != 0); +		} while (!signal_pending(current) && +			 (id.data == 0 || --id.data != 0)); +	} + +	rtnl_lock(); +	dev_put(dev); +	busy = false; + +	(void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE); +	return rc;  }  static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) @@ -1433,10 +1446,182 @@ static noinline_for_stack int ethtool_flash_device(struct net_device *dev,  	if (!dev->ethtool_ops->flash_device)  		return -EOPNOTSUPP; +	efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; +  	return dev->ethtool_ops->flash_device(dev, &efl);  } -/* The main entry point in this file.  Called from net/core/dev.c */ +static int ethtool_set_dump(struct net_device *dev, +			void __user *useraddr) +{ +	struct ethtool_dump dump; + +	if (!dev->ethtool_ops->set_dump) +		return -EOPNOTSUPP; + +	if (copy_from_user(&dump, useraddr, sizeof(dump))) +		return -EFAULT; + +	return dev->ethtool_ops->set_dump(dev, &dump); +} + +static int ethtool_get_dump_flag(struct net_device *dev, +				void __user *useraddr) +{ +	int ret; +	struct ethtool_dump dump; +	const struct ethtool_ops *ops = dev->ethtool_ops; + +	if (!ops->get_dump_flag) +		return -EOPNOTSUPP; + +	if (copy_from_user(&dump, useraddr, sizeof(dump))) +		return -EFAULT; + +	ret = ops->get_dump_flag(dev, &dump); +	if (ret) +		return ret; + +	if (copy_to_user(useraddr, &dump, sizeof(dump))) +		return -EFAULT; +	return 0; +} + +static int ethtool_get_dump_data(struct net_device *dev, +				void __user *useraddr) +{ +	int ret; +	__u32 len; +	struct ethtool_dump dump, tmp; +	const struct ethtool_ops *ops = dev->ethtool_ops; +	void *data = NULL; + +	if (!ops->get_dump_data || !ops->get_dump_flag) +		return -EOPNOTSUPP; + +	if (copy_from_user(&dump, useraddr, sizeof(dump))) +		return -EFAULT; + +	memset(&tmp, 0, sizeof(tmp)); +	tmp.cmd = ETHTOOL_GET_DUMP_FLAG; +	ret = ops->get_dump_flag(dev, &tmp); +	if (ret) +		return ret; + +	len = min(tmp.len, dump.len); +	if (!len) +		return -EFAULT; + +	/* Don't ever let the driver think there's more space available +	 * than it requested with .get_dump_flag(). +	 */ +	dump.len = len; + +	/* Always allocate enough space to hold the whole thing so that the +	 * driver does not need to check the length and bother with partial +	 * dumping. +	 */ +	data = vzalloc(tmp.len); +	if (!data) +		return -ENOMEM; +	ret = ops->get_dump_data(dev, &dump, data); +	if (ret) +		goto out; + +	/* There are two sane possibilities: +	 * 1. The driver's .get_dump_data() does not touch dump.len. +	 * 2. Or it may set dump.len to how much it really writes, which +	 *    should be tmp.len (or len if it can do a partial dump). +	 * In any case respond to userspace with the actual length of data +	 * it's receiving. 
+	 */ +	WARN_ON(dump.len != len && dump.len != tmp.len); +	dump.len = len; + +	if (copy_to_user(useraddr, &dump, sizeof(dump))) { +		ret = -EFAULT; +		goto out; +	} +	useraddr += offsetof(struct ethtool_dump, data); +	if (copy_to_user(useraddr, data, len)) +		ret = -EFAULT; +out: +	vfree(data); +	return ret; +} + +static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr) +{ +	int err = 0; +	struct ethtool_ts_info info; +	const struct ethtool_ops *ops = dev->ethtool_ops; +	struct phy_device *phydev = dev->phydev; + +	memset(&info, 0, sizeof(info)); +	info.cmd = ETHTOOL_GET_TS_INFO; + +	if (phydev && phydev->drv && phydev->drv->ts_info) { +		err = phydev->drv->ts_info(phydev, &info); +	} else if (ops->get_ts_info) { +		err = ops->get_ts_info(dev, &info); +	} else { +		info.so_timestamping = +			SOF_TIMESTAMPING_RX_SOFTWARE | +			SOF_TIMESTAMPING_SOFTWARE; +		info.phc_index = -1; +	} + +	if (err) +		return err; + +	if (copy_to_user(useraddr, &info, sizeof(info))) +		err = -EFAULT; + +	return err; +} + +static int ethtool_get_module_info(struct net_device *dev, +				   void __user *useraddr) +{ +	int ret; +	struct ethtool_modinfo modinfo; +	const struct ethtool_ops *ops = dev->ethtool_ops; + +	if (!ops->get_module_info) +		return -EOPNOTSUPP; + +	if (copy_from_user(&modinfo, useraddr, sizeof(modinfo))) +		return -EFAULT; + +	ret = ops->get_module_info(dev, &modinfo); +	if (ret) +		return ret; + +	if (copy_to_user(useraddr, &modinfo, sizeof(modinfo))) +		return -EFAULT; + +	return 0; +} + +static int ethtool_get_module_eeprom(struct net_device *dev, +				     void __user *useraddr) +{ +	int ret; +	struct ethtool_modinfo modinfo; +	const struct ethtool_ops *ops = dev->ethtool_ops; + +	if (!ops->get_module_info || !ops->get_module_eeprom) +		return -EOPNOTSUPP; + +	ret = ops->get_module_info(dev, &modinfo); +	if (ret) +		return ret; + +	return ethtool_get_any_eeprom(dev, useraddr, ops->get_module_eeprom, +				      modinfo.eeprom_len); +} + +/* The main entry point in this file.  Called from net/core/dev_ioctl.c */  int dev_ethtool(struct net *net, struct ifreq *ifr)  { @@ -1444,7 +1629,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)  	void __user *useraddr = ifr->ifr_data;  	u32 ethcmd;  	int rc; -	unsigned long old_features; +	netdev_features_t old_features;  	if (!dev || !netif_device_present(dev))  		return -ENODEV; @@ -1452,28 +1637,21 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)  	if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd)))  		return -EFAULT; -	if (!dev->ethtool_ops) { -		/* ETHTOOL_GDRVINFO does not require any driver support. -		 * It is also unprivileged and does not change anything, -		 * so we can take a shortcut to it. 
*/ -		if (ethcmd == ETHTOOL_GDRVINFO) -			return ethtool_get_drvinfo(dev, useraddr); -		else -			return -EOPNOTSUPP; -	} -  	/* Allow some commands to be done by anyone */  	switch (ethcmd) {  	case ETHTOOL_GSET:  	case ETHTOOL_GDRVINFO:  	case ETHTOOL_GMSGLVL: +	case ETHTOOL_GLINK:  	case ETHTOOL_GCOALESCE:  	case ETHTOOL_GRINGPARAM:  	case ETHTOOL_GPAUSEPARAM:  	case ETHTOOL_GRXCSUM:  	case ETHTOOL_GTXCSUM:  	case ETHTOOL_GSG: +	case ETHTOOL_GSSET_INFO:  	case ETHTOOL_GSTRINGS: +	case ETHTOOL_GSTATS:  	case ETHTOOL_GTSO:  	case ETHTOOL_GPERMADDR:  	case ETHTOOL_GUFO: @@ -1486,9 +1664,15 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)  	case ETHTOOL_GRXCLSRLCNT:  	case ETHTOOL_GRXCLSRULE:  	case ETHTOOL_GRXCLSRLALL: +	case ETHTOOL_GRXFHINDIR: +	case ETHTOOL_GRSSH: +	case ETHTOOL_GFEATURES: +	case ETHTOOL_GCHANNELS: +	case ETHTOOL_GET_TS_INFO: +	case ETHTOOL_GEEE:  		break;  	default: -		if (!capable(CAP_NET_ADMIN)) +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))  			return -EPERM;  	} @@ -1526,12 +1710,17 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)  		rc = ethtool_set_value_void(dev, useraddr,  				       dev->ethtool_ops->set_msglevel);  		break; +	case ETHTOOL_GEEE: +		rc = ethtool_get_eee(dev, useraddr); +		break; +	case ETHTOOL_SEEE: +		rc = ethtool_set_eee(dev, useraddr); +		break;  	case ETHTOOL_NWAY_RST:  		rc = ethtool_nway_reset(dev);  		break;  	case ETHTOOL_GLINK: -		rc = ethtool_get_value(dev, useraddr, ethcmd, -				       dev->ethtool_ops->get_link); +		rc = ethtool_get_link(dev, useraddr);  		break;  	case ETHTOOL_GEEPROM:  		rc = ethtool_get_eeprom(dev, useraddr); @@ -1557,42 +1746,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)  	case ETHTOOL_SPAUSEPARAM:  		rc = ethtool_set_pauseparam(dev, useraddr);  		break; -	case ETHTOOL_GRXCSUM: -		rc = ethtool_get_value(dev, useraddr, ethcmd, -				       (dev->ethtool_ops->get_rx_csum ? -					dev->ethtool_ops->get_rx_csum : -					ethtool_op_get_rx_csum)); -		break; -	case ETHTOOL_SRXCSUM: -		rc = ethtool_set_rx_csum(dev, useraddr); -		break; -	case ETHTOOL_GTXCSUM: -		rc = ethtool_get_value(dev, useraddr, ethcmd, -				       (dev->ethtool_ops->get_tx_csum ? -					dev->ethtool_ops->get_tx_csum : -					ethtool_op_get_tx_csum)); -		break; -	case ETHTOOL_STXCSUM: -		rc = ethtool_set_tx_csum(dev, useraddr); -		break; -	case ETHTOOL_GSG: -		rc = ethtool_get_value(dev, useraddr, ethcmd, -				       (dev->ethtool_ops->get_sg ? -					dev->ethtool_ops->get_sg : -					ethtool_op_get_sg)); -		break; -	case ETHTOOL_SSG: -		rc = ethtool_set_sg(dev, useraddr); -		break; -	case ETHTOOL_GTSO: -		rc = ethtool_get_value(dev, useraddr, ethcmd, -				       (dev->ethtool_ops->get_tso ? -					dev->ethtool_ops->get_tso : -					ethtool_op_get_tso)); -		break; -	case ETHTOOL_STSO: -		rc = ethtool_set_tso(dev, useraddr); -		break;  	case ETHTOOL_TEST:  		rc = ethtool_self_test(dev, useraddr);  		break; @@ -1608,30 +1761,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)  	case ETHTOOL_GPERMADDR:  		rc = ethtool_get_perm_addr(dev, useraddr);  		break; -	case ETHTOOL_GUFO: -		rc = ethtool_get_value(dev, useraddr, ethcmd, -				       (dev->ethtool_ops->get_ufo ? 
-					dev->ethtool_ops->get_ufo : -					ethtool_op_get_ufo)); -		break; -	case ETHTOOL_SUFO: -		rc = ethtool_set_ufo(dev, useraddr); -		break; -	case ETHTOOL_GGSO: -		rc = ethtool_get_gso(dev, useraddr); -		break; -	case ETHTOOL_SGSO: -		rc = ethtool_set_gso(dev, useraddr); -		break;  	case ETHTOOL_GFLAGS:  		rc = ethtool_get_value(dev, useraddr, ethcmd, -				       (dev->ethtool_ops->get_flags ? -					dev->ethtool_ops->get_flags : -					ethtool_op_get_flags)); +					__ethtool_get_flags);  		break;  	case ETHTOOL_SFLAGS: -		rc = ethtool_set_value(dev, useraddr, -				       dev->ethtool_ops->set_flags); +		rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags);  		break;  	case ETHTOOL_GPFLAGS:  		rc = ethtool_get_value(dev, useraddr, ethcmd, @@ -1653,24 +1788,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)  	case ETHTOOL_SRXCLSRLINS:  		rc = ethtool_set_rxnfc(dev, ethcmd, useraddr);  		break; -	case ETHTOOL_GGRO: -		rc = ethtool_get_gro(dev, useraddr); -		break; -	case ETHTOOL_SGRO: -		rc = ethtool_set_gro(dev, useraddr); -		break;  	case ETHTOOL_FLASHDEV:  		rc = ethtool_flash_device(dev, useraddr);  		break;  	case ETHTOOL_RESET:  		rc = ethtool_reset(dev, useraddr);  		break; -	case ETHTOOL_SRXNTUPLE: -		rc = ethtool_set_rx_ntuple(dev, useraddr); -		break; -	case ETHTOOL_GRXNTUPLE: -		rc = ethtool_get_rx_ntuple(dev, useraddr); -		break;  	case ETHTOOL_GSSET_INFO:  		rc = ethtool_get_sset_info(dev, useraddr);  		break; @@ -1680,6 +1803,60 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)  	case ETHTOOL_SRXFHINDIR:  		rc = ethtool_set_rxfh_indir(dev, useraddr);  		break; +	case ETHTOOL_GRSSH: +		rc = ethtool_get_rxfh(dev, useraddr); +		break; +	case ETHTOOL_SRSSH: +		rc = ethtool_set_rxfh(dev, useraddr); +		break; +	case ETHTOOL_GFEATURES: +		rc = ethtool_get_features(dev, useraddr); +		break; +	case ETHTOOL_SFEATURES: +		rc = ethtool_set_features(dev, useraddr); +		break; +	case ETHTOOL_GTXCSUM: +	case ETHTOOL_GRXCSUM: +	case ETHTOOL_GSG: +	case ETHTOOL_GTSO: +	case ETHTOOL_GUFO: +	case ETHTOOL_GGSO: +	case ETHTOOL_GGRO: +		rc = ethtool_get_one_feature(dev, useraddr, ethcmd); +		break; +	case ETHTOOL_STXCSUM: +	case ETHTOOL_SRXCSUM: +	case ETHTOOL_SSG: +	case ETHTOOL_STSO: +	case ETHTOOL_SUFO: +	case ETHTOOL_SGSO: +	case ETHTOOL_SGRO: +		rc = ethtool_set_one_feature(dev, useraddr, ethcmd); +		break; +	case ETHTOOL_GCHANNELS: +		rc = ethtool_get_channels(dev, useraddr); +		break; +	case ETHTOOL_SCHANNELS: +		rc = ethtool_set_channels(dev, useraddr); +		break; +	case ETHTOOL_SET_DUMP: +		rc = ethtool_set_dump(dev, useraddr); +		break; +	case ETHTOOL_GET_DUMP_FLAG: +		rc = ethtool_get_dump_flag(dev, useraddr); +		break; +	case ETHTOOL_GET_DUMP_DATA: +		rc = ethtool_get_dump_data(dev, useraddr); +		break; +	case ETHTOOL_GET_TS_INFO: +		rc = ethtool_get_ts_info(dev, useraddr); +		break; +	case ETHTOOL_GMODULEINFO: +		rc = ethtool_get_module_info(dev, useraddr); +		break; +	case ETHTOOL_GMODULEEEPROM: +		rc = ethtool_get_module_eeprom(dev, useraddr); +		break;  	default:  		rc = -EOPNOTSUPP;  	} diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 82a4369ae15..185c341fafb 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -12,6 +12,7 @@  #include <linux/kernel.h>  #include <linux/slab.h>  #include <linux/list.h> +#include <linux/module.h>  #include <net/net_namespace.h>  #include <net/sock.h>  #include <net/fib_rules.h> @@ -32,6 +33,9 @@ int fib_default_rule_add(struct fib_rules_ops *ops,  	r->flags = flags;  	r->fr_net = hold_net(ops->fro_net); +	
r->suppress_prefixlen = -1; +	r->suppress_ifgroup = -1; +  	/* The lock is not required here, the list in unreacheable  	 * at the moment this function is called */  	list_add_tail(&r->list, &ops->rules_list); @@ -150,6 +154,8 @@ static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)  	list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) {  		list_del_rcu(&rule->list); +		if (ops->delete) +			ops->delete(rule);  		fib_rule_put(rule);  	}  } @@ -181,14 +187,13 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,  {  	int ret = 0; -	if (rule->iifindex && (rule->iifindex != fl->iif) && -	    !(fl->flags & FLOWI_FLAG_MATCH_ANY_IIF)) +	if (rule->iifindex && (rule->iifindex != fl->flowi_iif))  		goto out; -	if (rule->oifindex && (rule->oifindex != fl->oif)) +	if (rule->oifindex && (rule->oifindex != fl->flowi_oif))  		goto out; -	if ((rule->mark ^ fl->mark) & rule->mark_mask) +	if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)  		goto out;  	ret = ops->match(rule, fl, flags); @@ -224,6 +229,9 @@ jumped:  		else  			err = ops->action(rule, fl, flags, arg); +		if (!err && ops->suppress && ops->suppress(rule, arg)) +			continue; +  		if (err != -EAGAIN) {  			if ((arg->flags & FIB_LOOKUP_NOREF) ||  			    likely(atomic_inc_not_zero(&rule->refcnt))) { @@ -264,7 +272,7 @@ errout:  	return err;  } -static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)  {  	struct net *net = sock_net(skb->sk);  	struct fib_rule_hdr *frh = nlmsg_data(nlh); @@ -335,6 +343,15 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)  	rule->action = frh->action;  	rule->flags = frh->flags;  	rule->table = frh_get_table(frh, tb); +	if (tb[FRA_SUPPRESS_PREFIXLEN]) +		rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); +	else +		rule->suppress_prefixlen = -1; + +	if (tb[FRA_SUPPRESS_IFGROUP]) +		rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); +	else +		rule->suppress_ifgroup = -1;  	if (!tb[FRA_PRIORITY] && ops->default_pref)  		rule->pref = ops->default_pref(ops); @@ -385,8 +402,8 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)  		 */  		list_for_each_entry(r, &ops->rules_list, list) {  			if (r->action == FR_ACT_GOTO && -			    r->target == rule->pref) { -				BUG_ON(rtnl_dereference(r->ctarget) != NULL); +			    r->target == rule->pref && +			    rtnl_dereference(r->ctarget) == NULL) {  				rcu_assign_pointer(r->ctarget, rule);  				if (--ops->unresolved_rules == 0)  					break; @@ -400,7 +417,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)  	if (unresolved)  		ops->unresolved_rules++; -	notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid); +	notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);  	flush_route_cache(ops);  	rules_ops_put(ops);  	return 0; @@ -413,7 +430,7 @@ errout:  	return err;  } -static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)  {  	struct net *net = sock_net(skb->sk);  	struct fib_rule_hdr *frh = nlmsg_data(nlh); @@ -443,7 +460,8 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)  		if (frh->action && (frh->action != rule->action))  			continue; -		if (frh->table && (frh_get_table(frh, tb) != rule->table)) +		if (frh_get_table(frh, tb) && +		    (frh_get_table(frh, tb) != 
rule->table))  			continue;  		if (tb[FRA_PRIORITY] && @@ -476,8 +494,11 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)  		list_del_rcu(&rule->list); -		if (rule->action == FR_ACT_GOTO) +		if (rule->action == FR_ACT_GOTO) {  			ops->nr_goto_rules--; +			if (rtnl_dereference(rule->ctarget) == NULL) +				ops->unresolved_rules--; +		}  		/*  		 * Check if this rule is a target to any of them. If so, @@ -488,14 +509,16 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)  		if (ops->nr_goto_rules > 0) {  			list_for_each_entry(tmp, &ops->rules_list, list) {  				if (rtnl_dereference(tmp->ctarget) == rule) { -					rcu_assign_pointer(tmp->ctarget, NULL); +					RCU_INIT_POINTER(tmp->ctarget, NULL);  					ops->unresolved_rules++;  				}  			}  		}  		notify_rule_change(RTM_DELRULE, rule, ops, nlh, -				   NETLINK_CB(skb).pid); +				   NETLINK_CB(skb).portid); +		if (ops->delete) +			ops->delete(rule);  		fib_rule_put(rule);  		flush_route_cache(ops);  		rules_ops_put(ops); @@ -516,6 +539,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,  			 + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */  			 + nla_total_size(4) /* FRA_PRIORITY */  			 + nla_total_size(4) /* FRA_TABLE */ +			 + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ +			 + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */  			 + nla_total_size(4) /* FRA_FWMARK */  			 + nla_total_size(4); /* FRA_FWMASK */ @@ -539,41 +564,47 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,  	frh = nlmsg_data(nlh);  	frh->family = ops->family;  	frh->table = rule->table; -	NLA_PUT_U32(skb, FRA_TABLE, rule->table); +	if (nla_put_u32(skb, FRA_TABLE, rule->table)) +		goto nla_put_failure; +	if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) +		goto nla_put_failure;  	frh->res1 = 0;  	frh->res2 = 0;  	frh->action = rule->action;  	frh->flags = rule->flags;  	if (rule->action == FR_ACT_GOTO && -	    rcu_dereference_raw(rule->ctarget) == NULL) +	    rcu_access_pointer(rule->ctarget) == NULL)  		frh->flags |= FIB_RULE_UNRESOLVED;  	if (rule->iifname[0]) { -		NLA_PUT_STRING(skb, FRA_IIFNAME, rule->iifname); - +		if (nla_put_string(skb, FRA_IIFNAME, rule->iifname)) +			goto nla_put_failure;  		if (rule->iifindex == -1)  			frh->flags |= FIB_RULE_IIF_DETACHED;  	}  	if (rule->oifname[0]) { -		NLA_PUT_STRING(skb, FRA_OIFNAME, rule->oifname); - +		if (nla_put_string(skb, FRA_OIFNAME, rule->oifname)) +			goto nla_put_failure;  		if (rule->oifindex == -1)  			frh->flags |= FIB_RULE_OIF_DETACHED;  	} -	if (rule->pref) -		NLA_PUT_U32(skb, FRA_PRIORITY, rule->pref); - -	if (rule->mark) -		NLA_PUT_U32(skb, FRA_FWMARK, rule->mark); - -	if (rule->mark_mask || rule->mark) -		NLA_PUT_U32(skb, FRA_FWMASK, rule->mark_mask); +	if ((rule->pref && +	     nla_put_u32(skb, FRA_PRIORITY, rule->pref)) || +	    (rule->mark && +	     nla_put_u32(skb, FRA_FWMARK, rule->mark)) || +	    ((rule->mark_mask || rule->mark) && +	     nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) || +	    (rule->target && +	     nla_put_u32(skb, FRA_GOTO, rule->target))) +		goto nla_put_failure; -	if (rule->target) -		NLA_PUT_U32(skb, FRA_GOTO, rule->target); +	if (rule->suppress_ifgroup != -1) { +		if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup)) +			goto nla_put_failure; +	}  	if (ops->fill(rule, skb, frh) < 0)  		goto nla_put_failure; @@ -591,17 +622,19 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,  	int idx = 0;  	struct fib_rule *rule; 
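For context, the hunk below moves rule dumping under rcu_read_lock(). A minimal sketch of the reader/updater contract that makes such a lockless walk safe is shown here; struct demo_rule and both helpers are illustrative names for this note only, not part of the patch:

#include <linux/types.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_rule {
	struct list_head list;
	struct rcu_head rcu;
	u32 pref;
};

/* Reader side: no lock is taken; entries cannot be freed underneath us
 * because the updater defers the free until a grace period has elapsed.
 * The reader must not sleep between rcu_read_lock()/rcu_read_unlock().
 */
static u32 first_pref_rcu(struct list_head *rules)
{
	struct demo_rule *r;
	u32 pref = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(r, rules, list) {
		pref = r->pref;
		break;
	}
	rcu_read_unlock();

	return pref;
}

/* Updater side: unlink with list_del_rcu() and free only after a grace
 * period, here via kfree_rcu() on the embedded rcu_head.
 */
static void demo_rule_del(struct demo_rule *r)
{
	list_del_rcu(&r->list);
	kfree_rcu(r, rcu);
}

This mirrors the pattern used in this file: rules are unlinked with list_del_rcu() in the delete path above, so the RCU-protected dump loop below can walk the rules list without taking the writer's lock.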
-	list_for_each_entry(rule, &ops->rules_list, list) { +	rcu_read_lock(); +	list_for_each_entry_rcu(rule, &ops->rules_list, list) {  		if (idx < cb->args[1])  			goto skip; -		if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).pid, +		if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid,  				     cb->nlh->nlmsg_seq, RTM_NEWRULE,  				     NLM_F_MULTI, ops) < 0)  			break;  skip:  		idx++;  	} +	rcu_read_unlock();  	cb->args[1] = idx;  	rules_ops_put(ops); @@ -698,9 +731,9 @@ static void detach_rules(struct list_head *rules, struct net_device *dev)  static int fib_rules_event(struct notifier_block *this, unsigned long event, -			    void *ptr) +			   void *ptr)  { -	struct net_device *dev = ptr; +	struct net_device *dev = netdev_notifier_info_to_dev(ptr);  	struct net *net = dev_net(dev);  	struct fib_rules_ops *ops; @@ -712,6 +745,13 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event,  			attach_rules(&ops->rules_list, dev);  		break; +	case NETDEV_CHANGENAME: +		list_for_each_entry(ops, &net->rules_ops, list) { +			detach_rules(&ops->rules_list, dev); +			attach_rules(&ops->rules_list, dev); +		} +		break; +  	case NETDEV_UNREGISTER:  		list_for_each_entry(ops, &net->rules_ops, list)  			detach_rules(&ops->rules_list, dev); @@ -739,9 +779,9 @@ static struct pernet_operations fib_rules_net_ops = {  static int __init fib_rules_init(void)  {  	int err; -	rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL); -	rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL); -	rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule); +	rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, NULL); +	rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, NULL); +	rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, NULL);  	err = register_pernet_subsys(&fib_rules_net_ops);  	if (err < 0) diff --git a/net/core/filter.c b/net/core/filter.c index 25500f16a18..1dbf6462f76 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1,11 +1,16 @@  /*   * Linux Socket Filter - Kernel level socket filtering   * - * Author: - *     Jay Schulist <jschlst@samba.org> + * Based on the design of the Berkeley Packet Filter. 
The new + * internal format has been designed by PLUMgrid:   * - * Based on the design of: - *     - The Berkeley Packet Filter + *	Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com + * + * Authors: + * + *	Jay Schulist <jschlst@samba.org> + *	Alexei Starovoitov <ast@plumgrid.com> + *	Daniel Borkmann <dborkman@redhat.com>   *   * This program is free software; you can redistribute it and/or   * modify it under the terms of the GNU General Public License @@ -33,62 +38,39 @@  #include <net/sock.h>  #include <linux/errno.h>  #include <linux/timer.h> -#include <asm/system.h>  #include <asm/uaccess.h>  #include <asm/unaligned.h>  #include <linux/filter.h> -#include <linux/reciprocal_div.h> - -enum { -	BPF_S_RET_K = 1, -	BPF_S_RET_A, -	BPF_S_ALU_ADD_K, -	BPF_S_ALU_ADD_X, -	BPF_S_ALU_SUB_K, -	BPF_S_ALU_SUB_X, -	BPF_S_ALU_MUL_K, -	BPF_S_ALU_MUL_X, -	BPF_S_ALU_DIV_X, -	BPF_S_ALU_AND_K, -	BPF_S_ALU_AND_X, -	BPF_S_ALU_OR_K, -	BPF_S_ALU_OR_X, -	BPF_S_ALU_LSH_K, -	BPF_S_ALU_LSH_X, -	BPF_S_ALU_RSH_K, -	BPF_S_ALU_RSH_X, -	BPF_S_ALU_NEG, -	BPF_S_LD_W_ABS, -	BPF_S_LD_H_ABS, -	BPF_S_LD_B_ABS, -	BPF_S_LD_W_LEN, -	BPF_S_LD_W_IND, -	BPF_S_LD_H_IND, -	BPF_S_LD_B_IND, -	BPF_S_LD_IMM, -	BPF_S_LDX_W_LEN, -	BPF_S_LDX_B_MSH, -	BPF_S_LDX_IMM, -	BPF_S_MISC_TAX, -	BPF_S_MISC_TXA, -	BPF_S_ALU_DIV_K, -	BPF_S_LD_MEM, -	BPF_S_LDX_MEM, -	BPF_S_ST, -	BPF_S_STX, -	BPF_S_JMP_JA, -	BPF_S_JMP_JEQ_K, -	BPF_S_JMP_JEQ_X, -	BPF_S_JMP_JGE_K, -	BPF_S_JMP_JGE_X, -	BPF_S_JMP_JGT_K, -	BPF_S_JMP_JGT_X, -	BPF_S_JMP_JSET_K, -	BPF_S_JMP_JSET_X, -}; - -/* No hurry in this branch */ -static void *__load_pointer(const struct sk_buff *skb, int k) +#include <linux/ratelimit.h> +#include <linux/seccomp.h> +#include <linux/if_vlan.h> + +/* Registers */ +#define BPF_R0	regs[BPF_REG_0] +#define BPF_R1	regs[BPF_REG_1] +#define BPF_R2	regs[BPF_REG_2] +#define BPF_R3	regs[BPF_REG_3] +#define BPF_R4	regs[BPF_REG_4] +#define BPF_R5	regs[BPF_REG_5] +#define BPF_R6	regs[BPF_REG_6] +#define BPF_R7	regs[BPF_REG_7] +#define BPF_R8	regs[BPF_REG_8] +#define BPF_R9	regs[BPF_REG_9] +#define BPF_R10	regs[BPF_REG_10] + +/* Named registers */ +#define DST	regs[insn->dst_reg] +#define SRC	regs[insn->src_reg] +#define FP	regs[BPF_REG_FP] +#define ARG1	regs[BPF_REG_ARG1] +#define CTX	regs[BPF_REG_CTX] +#define IMM	insn->imm + +/* No hurry in this branch + * + * Exported for the bpf jit load helper. 
+ */ +void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)  {  	u8 *ptr = NULL; @@ -96,9 +78,9 @@ static void *__load_pointer(const struct sk_buff *skb, int k)  		ptr = skb_network_header(skb) + k - SKF_NET_OFF;  	else if (k >= SKF_LL_OFF)  		ptr = skb_mac_header(skb) + k - SKF_LL_OFF; - -	if (ptr >= skb->head && ptr < skb_tail_pointer(skb)) +	if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))  		return ptr; +  	return NULL;  } @@ -107,11 +89,8 @@ static inline void *load_pointer(const struct sk_buff *skb, int k,  {  	if (k >= 0)  		return skb_header_pointer(skb, k, size, buffer); -	else { -		if (k >= SKF_AD_OFF) -			return NULL; -		return __load_pointer(skb, k); -	} + +	return bpf_internal_load_pointer_neg_helper(skb, k, size);  }  /** @@ -131,344 +110,1032 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)  	int err;  	struct sk_filter *filter; +	/* +	 * If the skb was allocated from pfmemalloc reserves, only +	 * allow SOCK_MEMALLOC sockets to use it as this socket is +	 * helping free memory +	 */ +	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) +		return -ENOMEM; +  	err = security_sock_rcv_skb(sk, skb);  	if (err)  		return err; -	rcu_read_lock_bh(); -	filter = rcu_dereference_bh(sk->sk_filter); +	rcu_read_lock(); +	filter = rcu_dereference(sk->sk_filter);  	if (filter) { -		unsigned int pkt_len = sk_run_filter(skb, filter->insns); +		unsigned int pkt_len = SK_RUN_FILTER(filter, skb);  		err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;  	} -	rcu_read_unlock_bh(); +	rcu_read_unlock();  	return err;  }  EXPORT_SYMBOL(sk_filter); +/* Base function for offset calculation. Needs to go into .text section, + * therefore keeping it non-static as well; will also be used by JITs + * anyway later on, so do not let the compiler omit it. + */ +noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ +	return 0; +} +  /** - *	sk_run_filter - run a filter on a socket - *	@skb: buffer to run the filter on - *	@filter: filter to apply + *	__sk_run_filter - run a filter on a given context + *	@ctx: buffer to run the filter on + *	@insn: filter to apply   * - * Decode and apply filter instructions to the skb->data. - * Return length to keep, 0 for none. @skb is the data we are - * filtering, @filter is the array of filter instructions. - * Because all jumps are guaranteed to be before last instruction, - * and last instruction guaranteed to be a RET, we dont need to check - * flen. (We used to pass to this function the length of filter) + * Decode and apply filter instructions to the skb->data. Return length to + * keep, 0 for none. @ctx is the data we are operating on, @insn is the + * array of filter instructions.   */ -unsigned int sk_run_filter(const struct sk_buff *skb, -			   const struct sock_filter *fentry) +static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)  { +	u64 stack[MAX_BPF_STACK / sizeof(u64)]; +	u64 regs[MAX_BPF_REG], tmp; +	static const void *jumptable[256] = { +		[0 ... 255] = &&default_label, +		/* Now overwrite non-defaults ... 
*/ +		/* 32 bit ALU operations */ +		[BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, +		[BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, +		[BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, +		[BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, +		[BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, +		[BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, +		[BPF_ALU | BPF_OR | BPF_X]  = &&ALU_OR_X, +		[BPF_ALU | BPF_OR | BPF_K]  = &&ALU_OR_K, +		[BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, +		[BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, +		[BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, +		[BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, +		[BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, +		[BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, +		[BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, +		[BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, +		[BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, +		[BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, +		[BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, +		[BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, +		[BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, +		[BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, +		[BPF_ALU | BPF_NEG] = &&ALU_NEG, +		[BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, +		[BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, +		/* 64 bit ALU operations */ +		[BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, +		[BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, +		[BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, +		[BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, +		[BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, +		[BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, +		[BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, +		[BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, +		[BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, +		[BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, +		[BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, +		[BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, +		[BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, +		[BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, +		[BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, +		[BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, +		[BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, +		[BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, +		[BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, +		[BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, +		[BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, +		[BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, +		[BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, +		[BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, +		[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, +		/* Call instruction */ +		[BPF_JMP | BPF_CALL] = &&JMP_CALL, +		/* Jumps */ +		[BPF_JMP | BPF_JA] = &&JMP_JA, +		[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, +		[BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, +		[BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, +		[BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, +		[BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, +		[BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, +		[BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, +		[BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, +		[BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, +		[BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, +		[BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, +		[BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, +		[BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, +		[BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, +		/* Program return */ +		[BPF_JMP | BPF_EXIT] = &&JMP_EXIT, +		/* Store instructions */ +		[BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, +		[BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, +		[BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, +		[BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, +		[BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, +		
[BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, +		[BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, +		[BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, +		[BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, +		[BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, +		/* Load instructions */ +		[BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, +		[BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, +		[BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, +		[BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, +		[BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, +		[BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, +		[BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, +		[BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, +		[BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, +		[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, +	};  	void *ptr; -	u32 A = 0;			/* Accumulator */ -	u32 X = 0;			/* Index Register */ -	u32 mem[BPF_MEMWORDS];		/* Scratch Memory Store */ -	u32 tmp; -	int k; +	int off; -	/* -	 * Process array of filter instructions. -	 */ -	for (;; fentry++) { -#if defined(CONFIG_X86_32) -#define	K (fentry->k) -#else -		const u32 K = fentry->k; -#endif +#define CONT	 ({ insn++; goto select_insn; }) +#define CONT_JMP ({ insn++; goto select_insn; }) -		switch (fentry->code) { -		case BPF_S_ALU_ADD_X: -			A += X; -			continue; -		case BPF_S_ALU_ADD_K: -			A += K; -			continue; -		case BPF_S_ALU_SUB_X: -			A -= X; -			continue; -		case BPF_S_ALU_SUB_K: -			A -= K; -			continue; -		case BPF_S_ALU_MUL_X: -			A *= X; -			continue; -		case BPF_S_ALU_MUL_K: -			A *= K; -			continue; -		case BPF_S_ALU_DIV_X: -			if (X == 0) -				return 0; -			A /= X; -			continue; -		case BPF_S_ALU_DIV_K: -			A = reciprocal_divide(A, K); -			continue; -		case BPF_S_ALU_AND_X: -			A &= X; -			continue; -		case BPF_S_ALU_AND_K: -			A &= K; -			continue; -		case BPF_S_ALU_OR_X: -			A |= X; -			continue; -		case BPF_S_ALU_OR_K: -			A |= K; -			continue; -		case BPF_S_ALU_LSH_X: -			A <<= X; -			continue; -		case BPF_S_ALU_LSH_K: -			A <<= K; -			continue; -		case BPF_S_ALU_RSH_X: -			A >>= X; -			continue; -		case BPF_S_ALU_RSH_K: -			A >>= K; -			continue; -		case BPF_S_ALU_NEG: -			A = -A; -			continue; -		case BPF_S_JMP_JA: -			fentry += K; -			continue; -		case BPF_S_JMP_JGT_K: -			fentry += (A > K) ? fentry->jt : fentry->jf; -			continue; -		case BPF_S_JMP_JGE_K: -			fentry += (A >= K) ? fentry->jt : fentry->jf; -			continue; -		case BPF_S_JMP_JEQ_K: -			fentry += (A == K) ? fentry->jt : fentry->jf; -			continue; -		case BPF_S_JMP_JSET_K: -			fentry += (A & K) ? fentry->jt : fentry->jf; -			continue; -		case BPF_S_JMP_JGT_X: -			fentry += (A > X) ? fentry->jt : fentry->jf; -			continue; -		case BPF_S_JMP_JGE_X: -			fentry += (A >= X) ? fentry->jt : fentry->jf; -			continue; -		case BPF_S_JMP_JEQ_X: -			fentry += (A == X) ? fentry->jt : fentry->jf; -			continue; -		case BPF_S_JMP_JSET_X: -			fentry += (A & X) ? fentry->jt : fentry->jf; -			continue; -		case BPF_S_LD_W_ABS: -			k = K; -load_w: -			ptr = load_pointer(skb, k, 4, &tmp); -			if (ptr != NULL) { -				A = get_unaligned_be32(ptr); -				continue; -			} +	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; +	ARG1 = (u64) (unsigned long) ctx; + +	/* Registers used in classic BPF programs need to be reset first. 
*/ +	regs[BPF_REG_A] = 0; +	regs[BPF_REG_X] = 0; + +select_insn: +	goto *jumptable[insn->code]; + +	/* ALU */ +#define ALU(OPCODE, OP)			\ +	ALU64_##OPCODE##_X:		\ +		DST = DST OP SRC;	\ +		CONT;			\ +	ALU_##OPCODE##_X:		\ +		DST = (u32) DST OP (u32) SRC;	\ +		CONT;			\ +	ALU64_##OPCODE##_K:		\ +		DST = DST OP IMM;		\ +		CONT;			\ +	ALU_##OPCODE##_K:		\ +		DST = (u32) DST OP (u32) IMM;	\ +		CONT; + +	ALU(ADD,  +) +	ALU(SUB,  -) +	ALU(AND,  &) +	ALU(OR,   |) +	ALU(LSH, <<) +	ALU(RSH, >>) +	ALU(XOR,  ^) +	ALU(MUL,  *) +#undef ALU +	ALU_NEG: +		DST = (u32) -DST; +		CONT; +	ALU64_NEG: +		DST = -DST; +		CONT; +	ALU_MOV_X: +		DST = (u32) SRC; +		CONT; +	ALU_MOV_K: +		DST = (u32) IMM; +		CONT; +	ALU64_MOV_X: +		DST = SRC; +		CONT; +	ALU64_MOV_K: +		DST = IMM; +		CONT; +	ALU64_ARSH_X: +		(*(s64 *) &DST) >>= SRC; +		CONT; +	ALU64_ARSH_K: +		(*(s64 *) &DST) >>= IMM; +		CONT; +	ALU64_MOD_X: +		if (unlikely(SRC == 0)) +			return 0; +		tmp = DST; +		DST = do_div(tmp, SRC); +		CONT; +	ALU_MOD_X: +		if (unlikely(SRC == 0)) +			return 0; +		tmp = (u32) DST; +		DST = do_div(tmp, (u32) SRC); +		CONT; +	ALU64_MOD_K: +		tmp = DST; +		DST = do_div(tmp, IMM); +		CONT; +	ALU_MOD_K: +		tmp = (u32) DST; +		DST = do_div(tmp, (u32) IMM); +		CONT; +	ALU64_DIV_X: +		if (unlikely(SRC == 0)) +			return 0; +		do_div(DST, SRC); +		CONT; +	ALU_DIV_X: +		if (unlikely(SRC == 0)) +			return 0; +		tmp = (u32) DST; +		do_div(tmp, (u32) SRC); +		DST = (u32) tmp; +		CONT; +	ALU64_DIV_K: +		do_div(DST, IMM); +		CONT; +	ALU_DIV_K: +		tmp = (u32) DST; +		do_div(tmp, (u32) IMM); +		DST = (u32) tmp; +		CONT; +	ALU_END_TO_BE: +		switch (IMM) { +		case 16: +			DST = (__force u16) cpu_to_be16(DST);  			break; -		case BPF_S_LD_H_ABS: -			k = K; -load_h: -			ptr = load_pointer(skb, k, 2, &tmp); -			if (ptr != NULL) { -				A = get_unaligned_be16(ptr); -				continue; -			} +		case 32: +			DST = (__force u32) cpu_to_be32(DST);  			break; -		case BPF_S_LD_B_ABS: -			k = K; -load_b: -			ptr = load_pointer(skb, k, 1, &tmp); -			if (ptr != NULL) { -				A = *(u8 *)ptr; -				continue; -			} +		case 64: +			DST = (__force u64) cpu_to_be64(DST); +			break; +		} +		CONT; +	ALU_END_TO_LE: +		switch (IMM) { +		case 16: +			DST = (__force u16) cpu_to_le16(DST); +			break; +		case 32: +			DST = (__force u32) cpu_to_le32(DST); +			break; +		case 64: +			DST = (__force u64) cpu_to_le64(DST);  			break; -		case BPF_S_LD_W_LEN: -			A = skb->len; -			continue; -		case BPF_S_LDX_W_LEN: -			X = skb->len; -			continue; -		case BPF_S_LD_W_IND: -			k = X + K; -			goto load_w; -		case BPF_S_LD_H_IND: -			k = X + K; -			goto load_h; -		case BPF_S_LD_B_IND: -			k = X + K; -			goto load_b; -		case BPF_S_LDX_B_MSH: -			ptr = load_pointer(skb, K, 1, &tmp); -			if (ptr != NULL) { -				X = (*(u8 *)ptr & 0xf) << 2; -				continue; -			} -			return 0; -		case BPF_S_LD_IMM: -			A = K; -			continue; -		case BPF_S_LDX_IMM: -			X = K; -			continue; -		case BPF_S_LD_MEM: -			A = mem[K]; -			continue; -		case BPF_S_LDX_MEM: -			X = mem[K]; -			continue; -		case BPF_S_MISC_TAX: -			X = A; -			continue; -		case BPF_S_MISC_TXA: -			A = X; -			continue; -		case BPF_S_RET_K: -			return K; -		case BPF_S_RET_A: -			return A; -		case BPF_S_ST: -			mem[K] = A; -			continue; -		case BPF_S_STX: -			mem[K] = X; -			continue; -		default: -			WARN_ON(1); -			return 0;  		} +		CONT; -		/* -		 * Handle ancillary data, which are impossible -		 * (or very difficult) to get parsing packet contents. 
+	/* CALL */ +	JMP_CALL: +		/* Function call scratches BPF_R1-BPF_R5 registers, +		 * preserves BPF_R6-BPF_R9, and stores return value +		 * into BPF_R0.  		 */ -		switch (k-SKF_AD_OFF) { -		case SKF_AD_PROTOCOL: -			A = ntohs(skb->protocol); -			continue; -		case SKF_AD_PKTTYPE: -			A = skb->pkt_type; -			continue; -		case SKF_AD_IFINDEX: -			if (!skb->dev) -				return 0; -			A = skb->dev->ifindex; -			continue; -		case SKF_AD_MARK: -			A = skb->mark; -			continue; -		case SKF_AD_QUEUE: -			A = skb->queue_mapping; -			continue; -		case SKF_AD_HATYPE: -			if (!skb->dev) -				return 0; -			A = skb->dev->type; -			continue; -		case SKF_AD_RXHASH: -			A = skb->rxhash; -			continue; -		case SKF_AD_CPU: -			A = raw_smp_processor_id(); -			continue; -		case SKF_AD_NLATTR: { -			struct nlattr *nla; - -			if (skb_is_nonlinear(skb)) -				return 0; -			if (A > skb->len - sizeof(struct nlattr)) -				return 0; - -			nla = nla_find((struct nlattr *)&skb->data[A], -				       skb->len - A, X); -			if (nla) -				A = (void *)nla - (void *)skb->data; -			else -				A = 0; -			continue; +		BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, +						       BPF_R4, BPF_R5); +		CONT; + +	/* JMP */ +	JMP_JA: +		insn += insn->off; +		CONT; +	JMP_JEQ_X: +		if (DST == SRC) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JEQ_K: +		if (DST == IMM) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JNE_X: +		if (DST != SRC) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JNE_K: +		if (DST != IMM) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JGT_X: +		if (DST > SRC) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JGT_K: +		if (DST > IMM) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JGE_X: +		if (DST >= SRC) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JGE_K: +		if (DST >= IMM) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JSGT_X: +		if (((s64) DST) > ((s64) SRC)) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JSGT_K: +		if (((s64) DST) > ((s64) IMM)) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JSGE_X: +		if (((s64) DST) >= ((s64) SRC)) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JSGE_K: +		if (((s64) DST) >= ((s64) IMM)) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JSET_X: +		if (DST & SRC) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JSET_K: +		if (DST & IMM) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_EXIT: +		return BPF_R0; + +	/* STX and ST and LDX*/ +#define LDST(SIZEOP, SIZE)						\ +	STX_MEM_##SIZEOP:						\ +		*(SIZE *)(unsigned long) (DST + insn->off) = SRC;	\ +		CONT;							\ +	ST_MEM_##SIZEOP:						\ +		*(SIZE *)(unsigned long) (DST + insn->off) = IMM;	\ +		CONT;							\ +	LDX_MEM_##SIZEOP:						\ +		DST = *(SIZE *)(unsigned long) (SRC + insn->off);	\ +		CONT; + +	LDST(B,   u8) +	LDST(H,  u16) +	LDST(W,  u32) +	LDST(DW, u64) +#undef LDST +	STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ +		atomic_add((u32) SRC, (atomic_t *)(unsigned long) +			   (DST + insn->off)); +		CONT; +	STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ +		atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) +			     (DST + insn->off)); +		CONT; +	LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ +		off = IMM; +load_word: +		/* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are +		 * only appearing in the programs where ctx == +		 * skb. 
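As with the ALU() macro earlier, the LDST() macro above stamps out the three memory handlers (register store, immediate store, load) for each access size in one go; LDST(W, u32), for instance, expands to roughly:

	STX_MEM_W:
		*(u32 *)(unsigned long) (DST + insn->off) = SRC;
		CONT;
	ST_MEM_W:
		*(u32 *)(unsigned long) (DST + insn->off) = IMM;
		CONT;
	LDX_MEM_W:
		DST = *(u32 *)(unsigned long) (SRC + insn->off);
		CONT;

Generating the byte/half/word/double-word variants from one macro keeps them from drifting apart.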
All programs keep 'ctx' in regs[BPF_REG_CTX] +		 * == BPF_R6, sk_convert_filter() saves it in BPF_R6, +		 * internal BPF verifier will check that BPF_R6 == +		 * ctx. +		 * +		 * BPF_ABS and BPF_IND are wrappers of function calls, +		 * so they scratch BPF_R1-BPF_R5 registers, preserve +		 * BPF_R6-BPF_R9, and store return value into BPF_R0. +		 * +		 * Implicit input: +		 *   ctx == skb == BPF_R6 == CTX +		 * +		 * Explicit input: +		 *   SRC == any register +		 *   IMM == 32-bit immediate +		 * +		 * Output: +		 *   BPF_R0 - 8/16/32-bit skb data converted to cpu endianness +		 */ + +		ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); +		if (likely(ptr != NULL)) { +			BPF_R0 = get_unaligned_be32(ptr); +			CONT; +		} + +		return 0; +	LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ +		off = IMM; +load_half: +		ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); +		if (likely(ptr != NULL)) { +			BPF_R0 = get_unaligned_be16(ptr); +			CONT; +		} + +		return 0; +	LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ +		off = IMM; +load_byte: +		ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); +		if (likely(ptr != NULL)) { +			BPF_R0 = *(u8 *)ptr; +			CONT; +		} + +		return 0; +	LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ +		off = IMM + SRC; +		goto load_word; +	LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ +		off = IMM + SRC; +		goto load_half; +	LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ +		off = IMM + SRC; +		goto load_byte; + +	default_label: +		/* If we ever reach this, we have a bug somewhere. */ +		WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); +		return 0; +} + +/* Helper to find the offset of pkt_type in sk_buff structure. We want + * to make sure its still a 3bit field starting at a byte boundary; + * taken from arch/x86/net/bpf_jit_comp.c. 
+ */ +#ifdef __BIG_ENDIAN_BITFIELD +#define PKT_TYPE_MAX	(7 << 5) +#else +#define PKT_TYPE_MAX	7 +#endif +static unsigned int pkt_type_offset(void) +{ +	struct sk_buff skb_probe = { .pkt_type = ~0, }; +	u8 *ct = (u8 *) &skb_probe; +	unsigned int off; + +	for (off = 0; off < sizeof(struct sk_buff); off++) { +		if (ct[off] == PKT_TYPE_MAX) +			return off; +	} + +	pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__); +	return -1; +} + +static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ +	return __skb_get_poff((struct sk_buff *)(unsigned long) ctx); +} + +static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ +	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; +	struct nlattr *nla; + +	if (skb_is_nonlinear(skb)) +		return 0; + +	if (skb->len < sizeof(struct nlattr)) +		return 0; + +	if (a > skb->len - sizeof(struct nlattr)) +		return 0; + +	nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x); +	if (nla) +		return (void *) nla - (void *) skb->data; + +	return 0; +} + +static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ +	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; +	struct nlattr *nla; + +	if (skb_is_nonlinear(skb)) +		return 0; + +	if (skb->len < sizeof(struct nlattr)) +		return 0; + +	if (a > skb->len - sizeof(struct nlattr)) +		return 0; + +	nla = (struct nlattr *) &skb->data[a]; +	if (nla->nla_len > skb->len - a) +		return 0; + +	nla = nla_find_nested(nla, x); +	if (nla) +		return (void *) nla - (void *) skb->data; + +	return 0; +} + +static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ +	return raw_smp_processor_id(); +} + +/* note that this only generates 32-bit random numbers */ +static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +{ +	return prandom_u32(); +} + +static bool convert_bpf_extensions(struct sock_filter *fp, +				   struct sock_filter_int **insnp) +{ +	struct sock_filter_int *insn = *insnp; + +	switch (fp->k) { +	case SKF_AD_OFF + SKF_AD_PROTOCOL: +		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); + +		/* A = *(u16 *) (CTX + offsetof(protocol)) */ +		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, +				      offsetof(struct sk_buff, protocol)); +		/* A = ntohs(A) [emitting a nop or swap16] */ +		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); +		break; + +	case SKF_AD_OFF + SKF_AD_PKTTYPE: +		*insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, +				    pkt_type_offset()); +		if (insn->off < 0) +			return false; +		insn++; +		*insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD +		insn++; +                *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 5); +#endif +		break; + +	case SKF_AD_OFF + SKF_AD_IFINDEX: +	case SKF_AD_OFF + SKF_AD_HATYPE: +		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); +		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); +		BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0); + +		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), +				      BPF_REG_TMP, BPF_REG_CTX, +				      offsetof(struct sk_buff, dev)); +		/* if (tmp != 0) goto pc + 1 */ +		*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); +		*insn++ = BPF_EXIT_INSN(); +		if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) +			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, +					    offsetof(struct net_device, ifindex)); +		else +			*insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, +					    offsetof(struct net_device, type)); +		break; + +	
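For context on what convert_bpf_extensions() is intercepting here: in classic BPF an ancillary load is just an absolute packet load whose offset happens to be one of the SKF_AD_OFF magic values, so a (hypothetical) filter asking for the incoming interface index would contain an instruction like the sketch below, which this switch rewrites into a short internal-BPF sequence instead of a real packet access.

	/* Hypothetical classic BPF ancillary load of skb->dev->ifindex */
	struct sock_filter anc_load =
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SKF_AD_OFF + SKF_AD_IFINDEX);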
case SKF_AD_OFF + SKF_AD_MARK: +		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + +		*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, +				    offsetof(struct sk_buff, mark)); +		break; + +	case SKF_AD_OFF + SKF_AD_RXHASH: +		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); + +		*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, +				    offsetof(struct sk_buff, hash)); +		break; + +	case SKF_AD_OFF + SKF_AD_QUEUE: +		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); + +		*insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, +				    offsetof(struct sk_buff, queue_mapping)); +		break; + +	case SKF_AD_OFF + SKF_AD_VLAN_TAG: +	case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: +		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); +		BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + +		/* A = *(u16 *) (CTX + offsetof(vlan_tci)) */ +		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, +				      offsetof(struct sk_buff, vlan_tci)); +		if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) { +			*insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, +					      ~VLAN_TAG_PRESENT); +		} else { +			/* A >>= 12 */ +			*insn++ = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12); +			/* A &= 1 */ +			*insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 1);  		} -		case SKF_AD_NLATTR_NEST: { -			struct nlattr *nla; - -			if (skb_is_nonlinear(skb)) -				return 0; -			if (A > skb->len - sizeof(struct nlattr)) -				return 0; - -			nla = (struct nlattr *)&skb->data[A]; -			if (nla->nla_len > A - skb->len) -				return 0; - -			nla = nla_find_nested(nla, X); -			if (nla) -				A = (void *)nla - (void *)skb->data; -			else -				A = 0; -			continue; +		break; + +	case SKF_AD_OFF + SKF_AD_PAY_OFFSET: +	case SKF_AD_OFF + SKF_AD_NLATTR: +	case SKF_AD_OFF + SKF_AD_NLATTR_NEST: +	case SKF_AD_OFF + SKF_AD_CPU: +	case SKF_AD_OFF + SKF_AD_RANDOM: +		/* arg1 = CTX */ +		*insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); +		/* arg2 = A */ +		*insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); +		/* arg3 = X */ +		*insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); +		/* Emit call(arg1=CTX, arg2=A, arg3=X) */ +		switch (fp->k) { +		case SKF_AD_OFF + SKF_AD_PAY_OFFSET: +			*insn = BPF_EMIT_CALL(__skb_get_pay_offset); +			break; +		case SKF_AD_OFF + SKF_AD_NLATTR: +			*insn = BPF_EMIT_CALL(__skb_get_nlattr); +			break; +		case SKF_AD_OFF + SKF_AD_NLATTR_NEST: +			*insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); +			break; +		case SKF_AD_OFF + SKF_AD_CPU: +			*insn = BPF_EMIT_CALL(__get_raw_cpu_id); +			break; +		case SKF_AD_OFF + SKF_AD_RANDOM: +			*insn = BPF_EMIT_CALL(__get_random_u32); +			break;  		} +		break; + +	case SKF_AD_OFF + SKF_AD_ALU_XOR_X: +		/* A ^= X */ +		*insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X); +		break; + +	default: +		/* This is just a dummy call to avoid letting the compiler +		 * evict __bpf_call_base() as an optimization. Placed here +		 * where no-one bothers. +		 */ +		BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0); +		return false; +	} + +	*insnp = insn; +	return true; +} + +/** + *	sk_convert_filter - convert filter program + *	@prog: the user passed filter program + *	@len: the length of the user passed filter program + *	@new_prog: buffer where converted program will be stored + *	@new_len: pointer to store length of converted program + * + * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style. 
+ * Conversion workflow: + * + * 1) First pass for calculating the new program length: + *   sk_convert_filter(old_prog, old_len, NULL, &new_len) + * + * 2) 2nd pass to remap in two passes: 1st pass finds new + *    jump offsets, 2nd pass remapping: + *   new_prog = kmalloc(sizeof(struct sock_filter_int) * new_len); + *   sk_convert_filter(old_prog, old_len, new_prog, &new_len); + * + * User BPF's register A is mapped to our BPF register 6, user BPF + * register X is mapped to BPF register 7; frame pointer is always + * register 10; Context 'void *ctx' is stored in register 1, that is, + * for socket filters: ctx == 'struct sk_buff *', for seccomp: + * ctx == 'struct seccomp_data *'. + */ +int sk_convert_filter(struct sock_filter *prog, int len, +		      struct sock_filter_int *new_prog, int *new_len) +{ +	int new_flen = 0, pass = 0, target, i; +	struct sock_filter_int *new_insn; +	struct sock_filter *fp; +	int *addrs = NULL; +	u8 bpf_src; + +	BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); +	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); + +	if (len <= 0 || len > BPF_MAXINSNS) +		return -EINVAL; + +	if (new_prog) { +		addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL); +		if (!addrs) +			return -ENOMEM; +	} + +do_pass: +	new_insn = new_prog; +	fp = prog; + +	if (new_insn) +		*new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); +	new_insn++; + +	for (i = 0; i < len; fp++, i++) { +		struct sock_filter_int tmp_insns[6] = { }; +		struct sock_filter_int *insn = tmp_insns; + +		if (addrs) +			addrs[i] = new_insn - new_prog; + +		switch (fp->code) { +		/* All arithmetic insns and skb loads map as-is. */ +		case BPF_ALU | BPF_ADD | BPF_X: +		case BPF_ALU | BPF_ADD | BPF_K: +		case BPF_ALU | BPF_SUB | BPF_X: +		case BPF_ALU | BPF_SUB | BPF_K: +		case BPF_ALU | BPF_AND | BPF_X: +		case BPF_ALU | BPF_AND | BPF_K: +		case BPF_ALU | BPF_OR | BPF_X: +		case BPF_ALU | BPF_OR | BPF_K: +		case BPF_ALU | BPF_LSH | BPF_X: +		case BPF_ALU | BPF_LSH | BPF_K: +		case BPF_ALU | BPF_RSH | BPF_X: +		case BPF_ALU | BPF_RSH | BPF_K: +		case BPF_ALU | BPF_XOR | BPF_X: +		case BPF_ALU | BPF_XOR | BPF_K: +		case BPF_ALU | BPF_MUL | BPF_X: +		case BPF_ALU | BPF_MUL | BPF_K: +		case BPF_ALU | BPF_DIV | BPF_X: +		case BPF_ALU | BPF_DIV | BPF_K: +		case BPF_ALU | BPF_MOD | BPF_X: +		case BPF_ALU | BPF_MOD | BPF_K: +		case BPF_ALU | BPF_NEG: +		case BPF_LD | BPF_ABS | BPF_W: +		case BPF_LD | BPF_ABS | BPF_H: +		case BPF_LD | BPF_ABS | BPF_B: +		case BPF_LD | BPF_IND | BPF_W: +		case BPF_LD | BPF_IND | BPF_H: +		case BPF_LD | BPF_IND | BPF_B: +			/* Check for overloaded BPF extension and +			 * directly convert it if found, otherwise +			 * just move on with mapping. +			 */ +			if (BPF_CLASS(fp->code) == BPF_LD && +			    BPF_MODE(fp->code) == BPF_ABS && +			    convert_bpf_extensions(fp, &insn)) +				break; + +			*insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); +			break; + +		/* Jump transformation cannot use BPF block macros +		 * everywhere as offset calculation and target updates +		 * require a bit more work than the rest, i.e. jump +		 * opcodes map as-is, but offsets need adjustment. +		 */ + +#define BPF_EMIT_JMP							\ +	do {								\ +		if (target >= len || target < 0)			\ +			goto err;					\ +		insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0;	\ +		/* Adjust pc relative offset for 2nd or 3rd insn. 
*/	\ +		insn->off -= insn - tmp_insns;				\ +	} while (0) + +		case BPF_JMP | BPF_JA: +			target = i + fp->k + 1; +			insn->code = fp->code; +			BPF_EMIT_JMP; +			break; + +		case BPF_JMP | BPF_JEQ | BPF_K: +		case BPF_JMP | BPF_JEQ | BPF_X: +		case BPF_JMP | BPF_JSET | BPF_K: +		case BPF_JMP | BPF_JSET | BPF_X: +		case BPF_JMP | BPF_JGT | BPF_K: +		case BPF_JMP | BPF_JGT | BPF_X: +		case BPF_JMP | BPF_JGE | BPF_K: +		case BPF_JMP | BPF_JGE | BPF_X: +			if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { +				/* BPF immediates are signed, zero extend +				 * immediate into tmp register and use it +				 * in compare insn. +				 */ +				*insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); + +				insn->dst_reg = BPF_REG_A; +				insn->src_reg = BPF_REG_TMP; +				bpf_src = BPF_X; +			} else { +				insn->dst_reg = BPF_REG_A; +				insn->src_reg = BPF_REG_X; +				insn->imm = fp->k; +				bpf_src = BPF_SRC(fp->code); +			} + +			/* Common case where 'jump_false' is next insn. */ +			if (fp->jf == 0) { +				insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; +				target = i + fp->jt + 1; +				BPF_EMIT_JMP; +				break; +			} + +			/* Convert JEQ into JNE when 'jump_true' is next insn. */ +			if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) { +				insn->code = BPF_JMP | BPF_JNE | bpf_src; +				target = i + fp->jf + 1; +				BPF_EMIT_JMP; +				break; +			} + +			/* Other jumps are mapped into two insns: Jxx and JA. */ +			target = i + fp->jt + 1; +			insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; +			BPF_EMIT_JMP; +			insn++; + +			insn->code = BPF_JMP | BPF_JA; +			target = i + fp->jf + 1; +			BPF_EMIT_JMP; +			break; + +		/* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ +		case BPF_LDX | BPF_MSH | BPF_B: +			/* tmp = A */ +			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); +			/* A = BPF_R0 = *(u8 *) (skb->data + K) */ +			*insn++ = BPF_LD_ABS(BPF_B, fp->k); +			/* A &= 0xf */ +			*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); +			/* A <<= 2 */ +			*insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); +			/* X = A */ +			*insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); +			/* A = tmp */ +			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); +			break; + +		/* RET_K, RET_A are remaped into 2 insns. */ +		case BPF_RET | BPF_A: +		case BPF_RET | BPF_K: +			*insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ? +						BPF_K : BPF_X, BPF_REG_0, +						BPF_REG_A, fp->k); +			*insn = BPF_EXIT_INSN(); +			break; + +		/* Store to stack. */ +		case BPF_ST: +		case BPF_STX: +			*insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == +					    BPF_ST ? BPF_REG_A : BPF_REG_X, +					    -(BPF_MEMWORDS - fp->k) * 4); +			break; + +		/* Load from stack. */ +		case BPF_LD | BPF_MEM: +		case BPF_LDX | BPF_MEM: +			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD  ? +					    BPF_REG_A : BPF_REG_X, BPF_REG_FP, +					    -(BPF_MEMWORDS - fp->k) * 4); +			break; + +		/* A = K or X = K */ +		case BPF_LD | BPF_IMM: +		case BPF_LDX | BPF_IMM: +			*insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? +					      BPF_REG_A : BPF_REG_X, fp->k); +			break; + +		/* X = A */ +		case BPF_MISC | BPF_TAX: +			*insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); +			break; + +		/* A = X */ +		case BPF_MISC | BPF_TXA: +			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); +			break; + +		/* A = skb->len or X = skb->len */ +		case BPF_LD | BPF_W | BPF_LEN: +		case BPF_LDX | BPF_W | BPF_LEN: +			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? 
+					    BPF_REG_A : BPF_REG_X, BPF_REG_CTX, +					    offsetof(struct sk_buff, len)); +			break; + +		/* Access seccomp_data fields. */ +		case BPF_LDX | BPF_ABS | BPF_W: +			/* A = *(u32 *) (ctx + K) */ +			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); +			break; + +		/* Unkown instruction. */  		default: -			return 0; +			goto err;  		} + +		insn++; +		if (new_prog) +			memcpy(new_insn, tmp_insns, +			       sizeof(*insn) * (insn - tmp_insns)); +		new_insn += insn - tmp_insns; +	} + +	if (!new_prog) { +		/* Only calculating new length. */ +		*new_len = new_insn - new_prog; +		return 0; +	} + +	pass++; +	if (new_flen != new_insn - new_prog) { +		new_flen = new_insn - new_prog; +		if (pass > 2) +			goto err; +		goto do_pass;  	} +	kfree(addrs); +	BUG_ON(*new_len != new_flen);  	return 0; +err: +	kfree(addrs); +	return -EINVAL;  } -EXPORT_SYMBOL(sk_run_filter); -/* - * Security : +/* Security: + *   * A BPF program is able to use 16 cells of memory to store intermediate - * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()) + * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()). + *   * As we dont want to clear mem[] array for each packet going through   * sk_run_filter(), we check that filter loaded by user never try to read   * a cell if not previously written, and we check all branches to be sure - * a malicious user doesnt try to abuse us. + * a malicious user doesn't try to abuse us.   */  static int check_load_and_stores(struct sock_filter *filter, int flen)  { -	u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */ +	u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */  	int pc, ret = 0;  	BUILD_BUG_ON(BPF_MEMWORDS > 16); -	masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL); + +	masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);  	if (!masks)  		return -ENOMEM; +  	memset(masks, 0xff, flen * sizeof(*masks));  	for (pc = 0; pc < flen; pc++) {  		memvalid &= masks[pc];  		switch (filter[pc].code) { -		case BPF_S_ST: -		case BPF_S_STX: +		case BPF_ST: +		case BPF_STX:  			memvalid |= (1 << filter[pc].k);  			break; -		case BPF_S_LD_MEM: -		case BPF_S_LDX_MEM: +		case BPF_LD | BPF_MEM: +		case BPF_LDX | BPF_MEM:  			if (!(memvalid & (1 << filter[pc].k))) {  				ret = -EINVAL;  				goto error;  			}  			break; -		case BPF_S_JMP_JA: -			/* a jump must set masks on target */ +		case BPF_JMP | BPF_JA: +			/* A jump must set masks on target */  			masks[pc + 1 + filter[pc].k] &= memvalid;  			memvalid = ~0;  			break; -		case BPF_S_JMP_JEQ_K: -		case BPF_S_JMP_JEQ_X: -		case BPF_S_JMP_JGE_K: -		case BPF_S_JMP_JGE_X: -		case BPF_S_JMP_JGT_K: -		case BPF_S_JMP_JGT_X: -		case BPF_S_JMP_JSET_X: -		case BPF_S_JMP_JSET_K: -			/* a jump must set masks on targets */ +		case BPF_JMP | BPF_JEQ | BPF_K: +		case BPF_JMP | BPF_JEQ | BPF_X: +		case BPF_JMP | BPF_JGE | BPF_K: +		case BPF_JMP | BPF_JGE | BPF_X: +		case BPF_JMP | BPF_JGT | BPF_K: +		case BPF_JMP | BPF_JGT | BPF_X: +		case BPF_JMP | BPF_JSET | BPF_K: +		case BPF_JMP | BPF_JSET | BPF_X: +			/* A jump must set masks on targets */  			masks[pc + 1 + filter[pc].jt] &= memvalid;  			masks[pc + 1 + filter[pc].jf] &= memvalid;  			memvalid = ~0; @@ -480,6 +1147,72 @@ error:  	return ret;  } +static bool chk_code_allowed(u16 code_to_probe) +{ +	static const bool codes[] = { +		/* 32 bit ALU operations */ +		[BPF_ALU | BPF_ADD | BPF_K] = true, +		[BPF_ALU | BPF_ADD | BPF_X] = true, +		[BPF_ALU | BPF_SUB | BPF_K] = true, +		[BPF_ALU | BPF_SUB | BPF_X] = true, +		[BPF_ALU | BPF_MUL | BPF_K] = true, +		[BPF_ALU | 
BPF_MUL | BPF_X] = true, +		[BPF_ALU | BPF_DIV | BPF_K] = true, +		[BPF_ALU | BPF_DIV | BPF_X] = true, +		[BPF_ALU | BPF_MOD | BPF_K] = true, +		[BPF_ALU | BPF_MOD | BPF_X] = true, +		[BPF_ALU | BPF_AND | BPF_K] = true, +		[BPF_ALU | BPF_AND | BPF_X] = true, +		[BPF_ALU | BPF_OR | BPF_K] = true, +		[BPF_ALU | BPF_OR | BPF_X] = true, +		[BPF_ALU | BPF_XOR | BPF_K] = true, +		[BPF_ALU | BPF_XOR | BPF_X] = true, +		[BPF_ALU | BPF_LSH | BPF_K] = true, +		[BPF_ALU | BPF_LSH | BPF_X] = true, +		[BPF_ALU | BPF_RSH | BPF_K] = true, +		[BPF_ALU | BPF_RSH | BPF_X] = true, +		[BPF_ALU | BPF_NEG] = true, +		/* Load instructions */ +		[BPF_LD | BPF_W | BPF_ABS] = true, +		[BPF_LD | BPF_H | BPF_ABS] = true, +		[BPF_LD | BPF_B | BPF_ABS] = true, +		[BPF_LD | BPF_W | BPF_LEN] = true, +		[BPF_LD | BPF_W | BPF_IND] = true, +		[BPF_LD | BPF_H | BPF_IND] = true, +		[BPF_LD | BPF_B | BPF_IND] = true, +		[BPF_LD | BPF_IMM] = true, +		[BPF_LD | BPF_MEM] = true, +		[BPF_LDX | BPF_W | BPF_LEN] = true, +		[BPF_LDX | BPF_B | BPF_MSH] = true, +		[BPF_LDX | BPF_IMM] = true, +		[BPF_LDX | BPF_MEM] = true, +		/* Store instructions */ +		[BPF_ST] = true, +		[BPF_STX] = true, +		/* Misc instructions */ +		[BPF_MISC | BPF_TAX] = true, +		[BPF_MISC | BPF_TXA] = true, +		/* Return instructions */ +		[BPF_RET | BPF_K] = true, +		[BPF_RET | BPF_A] = true, +		/* Jump instructions */ +		[BPF_JMP | BPF_JA] = true, +		[BPF_JMP | BPF_JEQ | BPF_K] = true, +		[BPF_JMP | BPF_JEQ | BPF_X] = true, +		[BPF_JMP | BPF_JGE | BPF_K] = true, +		[BPF_JMP | BPF_JGE | BPF_X] = true, +		[BPF_JMP | BPF_JGT | BPF_K] = true, +		[BPF_JMP | BPF_JGT | BPF_X] = true, +		[BPF_JMP | BPF_JSET | BPF_K] = true, +		[BPF_JMP | BPF_JSET | BPF_X] = true, +	}; + +	if (code_to_probe >= ARRAY_SIZE(codes)) +		return false; + +	return codes[code_to_probe]; +} +  /**   *	sk_chk_filter - verify socket filter code   *	@filter: filter to verify @@ -494,146 +1227,354 @@ error:   *   * Returns 0 if the rule set is legal or -EINVAL if not.   */ -int sk_chk_filter(struct sock_filter *filter, int flen) +int sk_chk_filter(struct sock_filter *filter, unsigned int flen)  { -	/* -	 * Valid instructions are initialized to non-0. -	 * Invalid instructions are initialized to 0. 
-	 */ -	static const u8 codes[] = { -		[BPF_ALU|BPF_ADD|BPF_K]  = BPF_S_ALU_ADD_K, -		[BPF_ALU|BPF_ADD|BPF_X]  = BPF_S_ALU_ADD_X, -		[BPF_ALU|BPF_SUB|BPF_K]  = BPF_S_ALU_SUB_K, -		[BPF_ALU|BPF_SUB|BPF_X]  = BPF_S_ALU_SUB_X, -		[BPF_ALU|BPF_MUL|BPF_K]  = BPF_S_ALU_MUL_K, -		[BPF_ALU|BPF_MUL|BPF_X]  = BPF_S_ALU_MUL_X, -		[BPF_ALU|BPF_DIV|BPF_X]  = BPF_S_ALU_DIV_X, -		[BPF_ALU|BPF_AND|BPF_K]  = BPF_S_ALU_AND_K, -		[BPF_ALU|BPF_AND|BPF_X]  = BPF_S_ALU_AND_X, -		[BPF_ALU|BPF_OR|BPF_K]   = BPF_S_ALU_OR_K, -		[BPF_ALU|BPF_OR|BPF_X]   = BPF_S_ALU_OR_X, -		[BPF_ALU|BPF_LSH|BPF_K]  = BPF_S_ALU_LSH_K, -		[BPF_ALU|BPF_LSH|BPF_X]  = BPF_S_ALU_LSH_X, -		[BPF_ALU|BPF_RSH|BPF_K]  = BPF_S_ALU_RSH_K, -		[BPF_ALU|BPF_RSH|BPF_X]  = BPF_S_ALU_RSH_X, -		[BPF_ALU|BPF_NEG]        = BPF_S_ALU_NEG, -		[BPF_LD|BPF_W|BPF_ABS]   = BPF_S_LD_W_ABS, -		[BPF_LD|BPF_H|BPF_ABS]   = BPF_S_LD_H_ABS, -		[BPF_LD|BPF_B|BPF_ABS]   = BPF_S_LD_B_ABS, -		[BPF_LD|BPF_W|BPF_LEN]   = BPF_S_LD_W_LEN, -		[BPF_LD|BPF_W|BPF_IND]   = BPF_S_LD_W_IND, -		[BPF_LD|BPF_H|BPF_IND]   = BPF_S_LD_H_IND, -		[BPF_LD|BPF_B|BPF_IND]   = BPF_S_LD_B_IND, -		[BPF_LD|BPF_IMM]         = BPF_S_LD_IMM, -		[BPF_LDX|BPF_W|BPF_LEN]  = BPF_S_LDX_W_LEN, -		[BPF_LDX|BPF_B|BPF_MSH]  = BPF_S_LDX_B_MSH, -		[BPF_LDX|BPF_IMM]        = BPF_S_LDX_IMM, -		[BPF_MISC|BPF_TAX]       = BPF_S_MISC_TAX, -		[BPF_MISC|BPF_TXA]       = BPF_S_MISC_TXA, -		[BPF_RET|BPF_K]          = BPF_S_RET_K, -		[BPF_RET|BPF_A]          = BPF_S_RET_A, -		[BPF_ALU|BPF_DIV|BPF_K]  = BPF_S_ALU_DIV_K, -		[BPF_LD|BPF_MEM]         = BPF_S_LD_MEM, -		[BPF_LDX|BPF_MEM]        = BPF_S_LDX_MEM, -		[BPF_ST]                 = BPF_S_ST, -		[BPF_STX]                = BPF_S_STX, -		[BPF_JMP|BPF_JA]         = BPF_S_JMP_JA, -		[BPF_JMP|BPF_JEQ|BPF_K]  = BPF_S_JMP_JEQ_K, -		[BPF_JMP|BPF_JEQ|BPF_X]  = BPF_S_JMP_JEQ_X, -		[BPF_JMP|BPF_JGE|BPF_K]  = BPF_S_JMP_JGE_K, -		[BPF_JMP|BPF_JGE|BPF_X]  = BPF_S_JMP_JGE_X, -		[BPF_JMP|BPF_JGT|BPF_K]  = BPF_S_JMP_JGT_K, -		[BPF_JMP|BPF_JGT|BPF_X]  = BPF_S_JMP_JGT_X, -		[BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K, -		[BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X, -	}; +	bool anc_found;  	int pc;  	if (flen == 0 || flen > BPF_MAXINSNS)  		return -EINVAL; -	/* check the filter code now */ +	/* Check the filter code now */  	for (pc = 0; pc < flen; pc++) {  		struct sock_filter *ftest = &filter[pc]; -		u16 code = ftest->code; -		if (code >= ARRAY_SIZE(codes)) -			return -EINVAL; -		code = codes[code]; -		if (!code) +		/* May we actually operate on this code? */ +		if (!chk_code_allowed(ftest->code))  			return -EINVAL; +  		/* Some instructions need special checks */ -		switch (code) { -		case BPF_S_ALU_DIV_K: -			/* check for division by zero */ +		switch (ftest->code) { +		case BPF_ALU | BPF_DIV | BPF_K: +		case BPF_ALU | BPF_MOD | BPF_K: +			/* Check for division by zero */  			if (ftest->k == 0)  				return -EINVAL; -			ftest->k = reciprocal_value(ftest->k);  			break; -		case BPF_S_LD_MEM: -		case BPF_S_LDX_MEM: -		case BPF_S_ST: -		case BPF_S_STX: -			/* check for invalid memory addresses */ +		case BPF_LD | BPF_MEM: +		case BPF_LDX | BPF_MEM: +		case BPF_ST: +		case BPF_STX: +			/* Check for invalid memory addresses */  			if (ftest->k >= BPF_MEMWORDS)  				return -EINVAL;  			break; -		case BPF_S_JMP_JA: -			/* -			 * Note, the large ftest->k might cause loops. +		case BPF_JMP | BPF_JA: +			/* Note, the large ftest->k might cause loops.  			 * Compare this with conditional jumps below,  			 * where offsets are limited. 
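To make the combined effect of these checks concrete: a hypothetical classic program such as the one below passes chk_code_allowed() and the per-instruction checks, yet is still rejected with -EINVAL, because check_load_and_stores() sees a scratch cell being read before any instruction has written it.

	/* Hypothetical example, rejected by sk_chk_filter() */
	struct sock_filter bad_prog[] = {
		BPF_STMT(BPF_LD | BPF_MEM, 0),	/* A = mem[0], never stored to */
		BPF_STMT(BPF_RET | BPF_K, 0),	/* return 0: drop the packet */
	};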
--ANK (981016)  			 */ -			if (ftest->k >= (unsigned)(flen-pc-1)) +			if (ftest->k >= (unsigned int)(flen - pc - 1))  				return -EINVAL;  			break; -		case BPF_S_JMP_JEQ_K: -		case BPF_S_JMP_JEQ_X: -		case BPF_S_JMP_JGE_K: -		case BPF_S_JMP_JGE_X: -		case BPF_S_JMP_JGT_K: -		case BPF_S_JMP_JGT_X: -		case BPF_S_JMP_JSET_X: -		case BPF_S_JMP_JSET_K: -			/* for conditionals both must be safe */ +		case BPF_JMP | BPF_JEQ | BPF_K: +		case BPF_JMP | BPF_JEQ | BPF_X: +		case BPF_JMP | BPF_JGE | BPF_K: +		case BPF_JMP | BPF_JGE | BPF_X: +		case BPF_JMP | BPF_JGT | BPF_K: +		case BPF_JMP | BPF_JGT | BPF_X: +		case BPF_JMP | BPF_JSET | BPF_K: +		case BPF_JMP | BPF_JSET | BPF_X: +			/* Both conditionals must be safe */  			if (pc + ftest->jt + 1 >= flen ||  			    pc + ftest->jf + 1 >= flen)  				return -EINVAL;  			break; +		case BPF_LD | BPF_W | BPF_ABS: +		case BPF_LD | BPF_H | BPF_ABS: +		case BPF_LD | BPF_B | BPF_ABS: +			anc_found = false; +			if (bpf_anc_helper(ftest) & BPF_ANC) +				anc_found = true; +			/* Ancillary operation unknown or unsupported */ +			if (anc_found == false && ftest->k >= SKF_AD_OFF) +				return -EINVAL;  		} -		ftest->code = code;  	} -	/* last instruction must be a RET code */ +	/* Last instruction must be a RET code */  	switch (filter[flen - 1].code) { -	case BPF_S_RET_K: -	case BPF_S_RET_A: +	case BPF_RET | BPF_K: +	case BPF_RET | BPF_A:  		return check_load_and_stores(filter, flen);  	} +  	return -EINVAL;  }  EXPORT_SYMBOL(sk_chk_filter); +static int sk_store_orig_filter(struct sk_filter *fp, +				const struct sock_fprog *fprog) +{ +	unsigned int fsize = sk_filter_proglen(fprog); +	struct sock_fprog_kern *fkprog; + +	fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); +	if (!fp->orig_prog) +		return -ENOMEM; + +	fkprog = fp->orig_prog; +	fkprog->len = fprog->len; +	fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL); +	if (!fkprog->filter) { +		kfree(fp->orig_prog); +		return -ENOMEM; +	} + +	return 0; +} + +static void sk_release_orig_filter(struct sk_filter *fp) +{ +	struct sock_fprog_kern *fprog = fp->orig_prog; + +	if (fprog) { +		kfree(fprog->filter); +		kfree(fprog); +	} +} +  /** - * 	sk_filter_rcu_release - Release a socket filter by rcu_head + * 	sk_filter_release_rcu - Release a socket filter by rcu_head   *	@rcu: rcu_head that contains the sk_filter to free   */ -static void sk_filter_rcu_release(struct rcu_head *rcu) +static void sk_filter_release_rcu(struct rcu_head *rcu)  {  	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); +	sk_release_orig_filter(fp); +	sk_filter_free(fp); +} + +/** + *	sk_filter_release - release a socket filter + *	@fp: filter to remove + * + *	Remove a filter from a socket and release its resources. 
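The reason the teardown path goes through call_rcu() rather than freeing directly is that the packet path runs the filter under rcu_read_lock() alone, without taking a reference; the reader side (sk_filter(), earlier in this file) looks roughly like the sketch below, and the old program's memory must stay valid until all such readers have finished.

	struct sk_filter *filter;
	int err = 0;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter) {
		unsigned int pkt_len = SK_RUN_FILTER(filter, skb);

		err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
	}
	rcu_read_unlock();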
+ */ +static void sk_filter_release(struct sk_filter *fp) +{ +	if (atomic_dec_and_test(&fp->refcnt)) +		call_rcu(&fp->rcu, sk_filter_release_rcu); +} + +void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) +{ +	atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc);  	sk_filter_release(fp);  } -static void sk_filter_delayed_uncharge(struct sock *sk, struct sk_filter *fp) +void sk_filter_charge(struct sock *sk, struct sk_filter *fp) +{ +	atomic_inc(&fp->refcnt); +	atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc); +} + +static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp, +					      struct sock *sk, +					      unsigned int len) +{ +	struct sk_filter *fp_new; + +	if (sk == NULL) +		return krealloc(fp, len, GFP_KERNEL); + +	fp_new = sock_kmalloc(sk, len, GFP_KERNEL); +	if (fp_new) { +		*fp_new = *fp; +		/* As we're keeping orig_prog in fp_new along, +		 * we need to make sure we're not evicting it +		 * from the old fp. +		 */ +		fp->orig_prog = NULL; +		sk_filter_uncharge(sk, fp); +	} + +	return fp_new; +} + +static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, +					     struct sock *sk) +{ +	struct sock_filter *old_prog; +	struct sk_filter *old_fp; +	int err, new_len, old_len = fp->len; + +	/* We are free to overwrite insns et al right here as it +	 * won't be used at this point in time anymore internally +	 * after the migration to the internal BPF instruction +	 * representation. +	 */ +	BUILD_BUG_ON(sizeof(struct sock_filter) != +		     sizeof(struct sock_filter_int)); + +	/* Conversion cannot happen on overlapping memory areas, +	 * so we need to keep the user BPF around until the 2nd +	 * pass. At this time, the user BPF is stored in fp->insns. +	 */ +	old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), +			   GFP_KERNEL); +	if (!old_prog) { +		err = -ENOMEM; +		goto out_err; +	} + +	/* 1st pass: calculate the new program length. */ +	err = sk_convert_filter(old_prog, old_len, NULL, &new_len); +	if (err) +		goto out_err_free; + +	/* Expand fp for appending the new filter representation. */ +	old_fp = fp; +	fp = __sk_migrate_realloc(old_fp, sk, sk_filter_size(new_len)); +	if (!fp) { +		/* The old_fp is still around in case we couldn't +		 * allocate new memory, so uncharge on that one. +		 */ +		fp = old_fp; +		err = -ENOMEM; +		goto out_err_free; +	} + +	fp->len = new_len; + +	/* 2nd pass: remap sock_filter insns into sock_filter_int insns. */ +	err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len); +	if (err) +		/* 2nd sk_convert_filter() can fail only if it fails +		 * to allocate memory, remapping must succeed. Note, +		 * that at this time old_fp has already been released +		 * by __sk_migrate_realloc(). +		 */ +		goto out_err_free; + +	sk_filter_select_runtime(fp); + +	kfree(old_prog); +	return fp; + +out_err_free: +	kfree(old_prog); +out_err: +	/* Rollback filter setup. 
*/ +	if (sk != NULL) +		sk_filter_uncharge(sk, fp); +	else +		kfree(fp); +	return ERR_PTR(err); +} + +void __weak bpf_int_jit_compile(struct sk_filter *prog) +{ +} + +/** + *	sk_filter_select_runtime - select execution runtime for BPF program + *	@fp: sk_filter populated with internal BPF program + * + * try to JIT internal BPF program, if JIT is not available select interpreter + * BPF program will be executed via SK_RUN_FILTER() macro + */ +void sk_filter_select_runtime(struct sk_filter *fp) +{ +	fp->bpf_func = (void *) __sk_run_filter; + +	/* Probe if internal BPF can be JITed */ +	bpf_int_jit_compile(fp); +} +EXPORT_SYMBOL_GPL(sk_filter_select_runtime); + +/* free internal BPF program */ +void sk_filter_free(struct sk_filter *fp)  { -	unsigned int size = sk_filter_len(fp); +	bpf_jit_free(fp); +} +EXPORT_SYMBOL_GPL(sk_filter_free); + +static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, +					     struct sock *sk) +{ +	int err; + +	fp->bpf_func = NULL; +	fp->jited = 0; + +	err = sk_chk_filter(fp->insns, fp->len); +	if (err) { +		if (sk != NULL) +			sk_filter_uncharge(sk, fp); +		else +			kfree(fp); +		return ERR_PTR(err); +	} + +	/* Probe if we can JIT compile the filter and if so, do +	 * the compilation of the filter. +	 */ +	bpf_jit_compile(fp); -	atomic_sub(size, &sk->sk_omem_alloc); -	call_rcu_bh(&fp->rcu, sk_filter_rcu_release); +	/* JIT compiler couldn't process this filter, so do the +	 * internal BPF translation for the optimized interpreter. +	 */ +	if (!fp->jited) +		fp = __sk_migrate_filter(fp, sk); + +	return fp;  }  /** + *	sk_unattached_filter_create - create an unattached filter + *	@pfp: the unattached filter that is created + *	@fprog: the filter program + * + * Create a filter independent of any socket. We first run some + * sanity checks on it to make sure it does not explode on us later. + * If an error occurs or there is insufficient memory for the filter + * a negative errno code is returned. On success the return is zero. + */ +int sk_unattached_filter_create(struct sk_filter **pfp, +				struct sock_fprog_kern *fprog) +{ +	unsigned int fsize = sk_filter_proglen(fprog); +	struct sk_filter *fp; + +	/* Make sure new filter is there and in the right amounts. */ +	if (fprog->filter == NULL) +		return -EINVAL; + +	fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL); +	if (!fp) +		return -ENOMEM; + +	memcpy(fp->insns, fprog->filter, fsize); + +	atomic_set(&fp->refcnt, 1); +	fp->len = fprog->len; +	/* Since unattached filters are not copied back to user +	 * space through sk_get_filter(), we do not need to hold +	 * a copy here, and can spare us the work. +	 */ +	fp->orig_prog = NULL; + +	/* __sk_prepare_filter() already takes care of uncharging +	 * memory in case something goes wrong. 
+	 */ +	fp = __sk_prepare_filter(fp, NULL); +	if (IS_ERR(fp)) +		return PTR_ERR(fp); + +	*pfp = fp; +	return 0; +} +EXPORT_SYMBOL_GPL(sk_unattached_filter_create); + +void sk_unattached_filter_destroy(struct sk_filter *fp) +{ +	sk_filter_release(fp); +} +EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy); + +/**   *	sk_attach_filter - attach a socket filter   *	@fprog: the filter program   *	@sk: the socket to use @@ -646,36 +1587,49 @@ static void sk_filter_delayed_uncharge(struct sock *sk, struct sk_filter *fp)  int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)  {  	struct sk_filter *fp, *old_fp; -	unsigned int fsize = sizeof(struct sock_filter) * fprog->len; +	unsigned int fsize = sk_filter_proglen(fprog); +	unsigned int sk_fsize = sk_filter_size(fprog->len);  	int err; +	if (sock_flag(sk, SOCK_FILTER_LOCKED)) +		return -EPERM; +  	/* Make sure new filter is there and in the right amounts. */  	if (fprog->filter == NULL)  		return -EINVAL; -	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); +	fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL);  	if (!fp)  		return -ENOMEM; +  	if (copy_from_user(fp->insns, fprog->filter, fsize)) { -		sock_kfree_s(sk, fp, fsize+sizeof(*fp)); +		sock_kfree_s(sk, fp, sk_fsize);  		return -EFAULT;  	}  	atomic_set(&fp->refcnt, 1);  	fp->len = fprog->len; -	err = sk_chk_filter(fp->insns, fp->len); +	err = sk_store_orig_filter(fp, fprog);  	if (err) {  		sk_filter_uncharge(sk, fp); -		return err; +		return -ENOMEM;  	} +	/* __sk_prepare_filter() already takes care of uncharging +	 * memory in case something goes wrong. +	 */ +	fp = __sk_prepare_filter(fp, sk); +	if (IS_ERR(fp)) +		return PTR_ERR(fp); +  	old_fp = rcu_dereference_protected(sk->sk_filter,  					   sock_owned_by_user(sk));  	rcu_assign_pointer(sk->sk_filter, fp);  	if (old_fp) -		sk_filter_delayed_uncharge(sk, old_fp); +		sk_filter_uncharge(sk, old_fp); +  	return 0;  }  EXPORT_SYMBOL_GPL(sk_attach_filter); @@ -685,13 +1639,57 @@ int sk_detach_filter(struct sock *sk)  	int ret = -ENOENT;  	struct sk_filter *filter; +	if (sock_flag(sk, SOCK_FILTER_LOCKED)) +		return -EPERM; +  	filter = rcu_dereference_protected(sk->sk_filter,  					   sock_owned_by_user(sk));  	if (filter) { -		rcu_assign_pointer(sk->sk_filter, NULL); -		sk_filter_delayed_uncharge(sk, filter); +		RCU_INIT_POINTER(sk->sk_filter, NULL); +		sk_filter_uncharge(sk, filter);  		ret = 0;  	} +  	return ret;  }  EXPORT_SYMBOL_GPL(sk_detach_filter); + +int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, +		  unsigned int len) +{ +	struct sock_fprog_kern *fprog; +	struct sk_filter *filter; +	int ret = 0; + +	lock_sock(sk); +	filter = rcu_dereference_protected(sk->sk_filter, +					   sock_owned_by_user(sk)); +	if (!filter) +		goto out; + +	/* We're copying the filter that has been originally attached, +	 * so no conversion/decode needed anymore. +	 */ +	fprog = filter->orig_prog; + +	ret = fprog->len; +	if (!len) +		/* User space only enquires number of filter blocks. */ +		goto out; + +	ret = -EINVAL; +	if (len < fprog->len) +		goto out; + +	ret = -EFAULT; +	if (copy_to_user(ubuf, fprog->filter, sk_filter_proglen(fprog))) +		goto out; + +	/* Instead of bytes, the API requests to return the number +	 * of filter blocks. 
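Stepping back to the unattached-filter interface introduced above (sk_unattached_filter_create() / sk_unattached_filter_destroy()): an in-kernel user that wants a standalone classic filter would use it roughly as in this sketch, where the accept-all program and the surrounding error handling are made up for illustration.

	struct sock_filter accept_all[] = {
		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),	/* accept the whole packet */
	};
	struct sock_fprog_kern fprog = {
		.len	= ARRAY_SIZE(accept_all),
		.filter	= accept_all,
	};
	struct sk_filter *fp;
	int err;

	err = sk_unattached_filter_create(&fp, &fprog);
	if (err)
		return err;

	/* ... run it against skbs via SK_RUN_FILTER(fp, skb) ... */

	sk_unattached_filter_destroy(fp);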
+	 */ +	ret = fprog->len; +out: +	release_sock(sk); +	return ret; +} diff --git a/net/core/flow.c b/net/core/flow.c index 127c8a7ffd6..a0348fde1fd 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -22,14 +22,16 @@  #include <linux/cpumask.h>  #include <linux/mutex.h>  #include <net/flow.h> -#include <asm/atomic.h> +#include <linux/atomic.h>  #include <linux/security.h> +#include <net/net_namespace.h>  struct flow_cache_entry {  	union {  		struct hlist_node	hlist;  		struct list_head	gc_list;  	} u; +	struct net			*net;  	u16				family;  	u8				dir;  	u32				genid; @@ -37,37 +39,14 @@ struct flow_cache_entry {  	struct flow_cache_object	*object;  }; -struct flow_cache_percpu { -	struct hlist_head		*hash_table; -	int				hash_count; -	u32				hash_rnd; -	int				hash_rnd_recalc; -	struct tasklet_struct		flush_tasklet; -}; -  struct flow_flush_info {  	struct flow_cache		*cache;  	atomic_t			cpuleft;  	struct completion		completion;  }; -struct flow_cache { -	u32				hash_shift; -	struct flow_cache_percpu __percpu *percpu; -	struct notifier_block		hotcpu_notifier; -	int				low_watermark; -	int				high_watermark; -	struct timer_list		rnd_timer; -}; - -atomic_t flow_cache_genid = ATOMIC_INIT(0); -EXPORT_SYMBOL(flow_cache_genid); -static struct flow_cache flow_cache_global;  static struct kmem_cache *flow_cachep __read_mostly; -static DEFINE_SPINLOCK(flow_cache_gc_lock); -static LIST_HEAD(flow_cache_gc_list); -  #define flow_cache_hash_size(cache)	(1 << (cache)->hash_shift)  #define FLOW_HASH_RND_PERIOD		(10 * 60 * HZ) @@ -83,16 +62,18 @@ static void flow_cache_new_hashrnd(unsigned long arg)  	add_timer(&fc->rnd_timer);  } -static int flow_entry_valid(struct flow_cache_entry *fle) +static int flow_entry_valid(struct flow_cache_entry *fle, +				struct netns_xfrm *xfrm)  { -	if (atomic_read(&flow_cache_genid) != fle->genid) +	if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)  		return 0;  	if (fle->object && !fle->object->ops->check(fle->object))  		return 0;  	return 1;  } -static void flow_entry_kill(struct flow_cache_entry *fle) +static void flow_entry_kill(struct flow_cache_entry *fle, +				struct netns_xfrm *xfrm)  {  	if (fle->object)  		fle->object->ops->delete(fle->object); @@ -103,26 +84,28 @@ static void flow_cache_gc_task(struct work_struct *work)  {  	struct list_head gc_list;  	struct flow_cache_entry *fce, *n; +	struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm, +						flow_cache_gc_work);  	INIT_LIST_HEAD(&gc_list); -	spin_lock_bh(&flow_cache_gc_lock); -	list_splice_tail_init(&flow_cache_gc_list, &gc_list); -	spin_unlock_bh(&flow_cache_gc_lock); +	spin_lock_bh(&xfrm->flow_cache_gc_lock); +	list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list); +	spin_unlock_bh(&xfrm->flow_cache_gc_lock);  	list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) -		flow_entry_kill(fce); +		flow_entry_kill(fce, xfrm);  } -static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);  static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, -				     int deleted, struct list_head *gc_list) +				     int deleted, struct list_head *gc_list, +				     struct netns_xfrm *xfrm)  {  	if (deleted) {  		fcp->hash_count -= deleted; -		spin_lock_bh(&flow_cache_gc_lock); -		list_splice_tail(gc_list, &flow_cache_gc_list); -		spin_unlock_bh(&flow_cache_gc_lock); -		schedule_work(&flow_cache_gc_work); +		spin_lock_bh(&xfrm->flow_cache_gc_lock); +		list_splice_tail(gc_list, &xfrm->flow_cache_gc_list); +		spin_unlock_bh(&xfrm->flow_cache_gc_lock); +		
schedule_work(&xfrm->flow_cache_gc_work);  	}  } @@ -131,17 +114,19 @@ static void __flow_cache_shrink(struct flow_cache *fc,  				int shrink_to)  {  	struct flow_cache_entry *fle; -	struct hlist_node *entry, *tmp; +	struct hlist_node *tmp;  	LIST_HEAD(gc_list);  	int i, deleted = 0; +	struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm, +						flow_cache_global);  	for (i = 0; i < flow_cache_hash_size(fc); i++) {  		int saved = 0; -		hlist_for_each_entry_safe(fle, entry, tmp, +		hlist_for_each_entry_safe(fle, tmp,  					  &fcp->hash_table[i], u.hlist) {  			if (saved < shrink_to && -			    flow_entry_valid(fle)) { +			    flow_entry_valid(fle, xfrm)) {  				saved++;  			} else {  				deleted++; @@ -151,7 +136,7 @@ static void __flow_cache_shrink(struct flow_cache *fc,  		}  	} -	flow_cache_queue_garbage(fcp, deleted, &gc_list); +	flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);  }  static void flow_cache_shrink(struct flow_cache *fc, @@ -172,31 +157,28 @@ static void flow_new_hash_rnd(struct flow_cache *fc,  static u32 flow_hash_code(struct flow_cache *fc,  			  struct flow_cache_percpu *fcp, -			  struct flowi *key) +			  const struct flowi *key, +			  size_t keysize)  { -	u32 *k = (u32 *) key; +	const u32 *k = (const u32 *) key; +	const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32); -	return jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd) +	return jhash2(k, length, fcp->hash_rnd)  		& (flow_cache_hash_size(fc) - 1);  } -typedef unsigned long flow_compare_t; -  /* I hear what you're saying, use memcmp.  But memcmp cannot make - * important assumptions that we can here, such as alignment and - * constant size. + * important assumptions that we can here, such as alignment.   */ -static int flow_key_compare(struct flowi *key1, struct flowi *key2) +static int flow_key_compare(const struct flowi *key1, const struct flowi *key2, +			    size_t keysize)  { -	flow_compare_t *k1, *k1_lim, *k2; -	const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t); +	const flow_compare_t *k1, *k1_lim, *k2; -	BUILD_BUG_ON(sizeof(struct flowi) % sizeof(flow_compare_t)); +	k1 = (const flow_compare_t *) key1; +	k1_lim = k1 + keysize; -	k1 = (flow_compare_t *) key1; -	k1_lim = k1 + n_elem; - -	k2 = (flow_compare_t *) key2; +	k2 = (const flow_compare_t *) key2;  	do {  		if (*k1++ != *k2++) @@ -207,14 +189,14 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)  }  struct flow_cache_object * -flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, +flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,  		  flow_resolve_t resolver, void *ctx)  { -	struct flow_cache *fc = &flow_cache_global; +	struct flow_cache *fc = &net->xfrm.flow_cache_global;  	struct flow_cache_percpu *fcp;  	struct flow_cache_entry *fle, *tfle; -	struct hlist_node *entry;  	struct flow_cache_object *flo; +	size_t keysize;  	unsigned int hash;  	local_bh_disable(); @@ -222,6 +204,11 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,  	fle = NULL;  	flo = NULL; + +	keysize = flow_key_size(family); +	if (!keysize) +		goto nocache; +  	/* Packet really early in init?  Making flow_cache_init a  	 * pre-smp initcall would solve this.  
--RR */  	if (!fcp->hash_table) @@ -230,11 +217,12 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,  	if (fcp->hash_rnd_recalc)  		flow_new_hash_rnd(fc, fcp); -	hash = flow_hash_code(fc, fcp, key); -	hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) { -		if (tfle->family == family && +	hash = flow_hash_code(fc, fcp, key, keysize); +	hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) { +		if (tfle->net == net && +		    tfle->family == family &&  		    tfle->dir == dir && -		    flow_key_compare(key, &tfle->key) == 0) { +		    flow_key_compare(key, &tfle->key, keysize) == 0) {  			fle = tfle;  			break;  		} @@ -246,14 +234,15 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,  		fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);  		if (fle) { +			fle->net = net;  			fle->family = family;  			fle->dir = dir; -			memcpy(&fle->key, key, sizeof(*key)); +			memcpy(&fle->key, key, keysize * sizeof(flow_compare_t));  			fle->object = NULL;  			hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);  			fcp->hash_count++;  		} -	} else if (likely(fle->genid == atomic_read(&flow_cache_genid))) { +	} else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {  		flo = fle->object;  		if (!flo)  			goto ret_object; @@ -274,13 +263,13 @@ nocache:  	}  	flo = resolver(net, key, family, dir, flo, ctx);  	if (fle) { -		fle->genid = atomic_read(&flow_cache_genid); +		fle->genid = atomic_read(&net->xfrm.flow_cache_genid);  		if (!IS_ERR(flo))  			fle->object = flo;  		else  			fle->genid--;  	} else { -		if (flo && !IS_ERR(flo)) +		if (!IS_ERR_OR_NULL(flo))  			flo->ops->delete(flo);  	}  ret_object: @@ -295,15 +284,17 @@ static void flow_cache_flush_tasklet(unsigned long data)  	struct flow_cache *fc = info->cache;  	struct flow_cache_percpu *fcp;  	struct flow_cache_entry *fle; -	struct hlist_node *entry, *tmp; +	struct hlist_node *tmp;  	LIST_HEAD(gc_list);  	int i, deleted = 0; +	struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm, +						flow_cache_global);  	fcp = this_cpu_ptr(fc->percpu);  	for (i = 0; i < flow_cache_hash_size(fc); i++) { -		hlist_for_each_entry_safe(fle, entry, tmp, +		hlist_for_each_entry_safe(fle, tmp,  					  &fcp->hash_table[i], u.hlist) { -			if (flow_entry_valid(fle)) +			if (flow_entry_valid(fle, xfrm))  				continue;  			deleted++; @@ -312,47 +303,94 @@ static void flow_cache_flush_tasklet(unsigned long data)  		}  	} -	flow_cache_queue_garbage(fcp, deleted, &gc_list); +	flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);  	if (atomic_dec_and_test(&info->cpuleft))  		complete(&info->completion);  } +/* + * Return whether a cpu needs flushing.  Conservatively, we assume + * the presence of any entries means the core may require flushing, + * since the flow_cache_ops.check() function may assume it's running + * on the same core as the per-cpu cache component. 
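Flushing is not the only way entries are retired: flow_entry_valid() above also compares each entry's genid against the per-netns generation counter, so a hypothetical caller that has just invalidated its policy state can simply bump the counter and let stale entries be reclaimed lazily, optionally kicking the deferred flush added just below.

	/* Hypothetical invalidation path (sketch) */
	atomic_inc(&net->xfrm.flow_cache_genid);	/* marks all cached entries stale */
	flow_cache_flush_deferred(net);			/* optional: reclaim them in the background */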
+ */ +static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu) +{ +	struct flow_cache_percpu *fcp; +	int i; + +	fcp = per_cpu_ptr(fc->percpu, cpu); +	for (i = 0; i < flow_cache_hash_size(fc); i++) +		if (!hlist_empty(&fcp->hash_table[i])) +			return 0; +	return 1; +} +  static void flow_cache_flush_per_cpu(void *data)  {  	struct flow_flush_info *info = data; -	int cpu;  	struct tasklet_struct *tasklet; -	cpu = smp_processor_id(); -	tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet; +	tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet;  	tasklet->data = (unsigned long)info;  	tasklet_schedule(tasklet);  } -void flow_cache_flush(void) +void flow_cache_flush(struct net *net)  {  	struct flow_flush_info info; -	static DEFINE_MUTEX(flow_flush_sem); +	cpumask_var_t mask; +	int i, self; + +	/* Track which cpus need flushing to avoid disturbing all cores. */ +	if (!alloc_cpumask_var(&mask, GFP_KERNEL)) +		return; +	cpumask_clear(mask);  	/* Don't want cpus going down or up during this. */  	get_online_cpus(); -	mutex_lock(&flow_flush_sem); -	info.cache = &flow_cache_global; -	atomic_set(&info.cpuleft, num_online_cpus()); +	mutex_lock(&net->xfrm.flow_flush_sem); +	info.cache = &net->xfrm.flow_cache_global; +	for_each_online_cpu(i) +		if (!flow_cache_percpu_empty(info.cache, i)) +			cpumask_set_cpu(i, mask); +	atomic_set(&info.cpuleft, cpumask_weight(mask)); +	if (atomic_read(&info.cpuleft) == 0) +		goto done; +  	init_completion(&info.completion);  	local_bh_disable(); -	smp_call_function(flow_cache_flush_per_cpu, &info, 0); -	flow_cache_flush_tasklet((unsigned long)&info); +	self = cpumask_test_and_clear_cpu(smp_processor_id(), mask); +	on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0); +	if (self) +		flow_cache_flush_tasklet((unsigned long)&info);  	local_bh_enable();  	wait_for_completion(&info.completion); -	mutex_unlock(&flow_flush_sem); + +done: +	mutex_unlock(&net->xfrm.flow_flush_sem);  	put_online_cpus(); +	free_cpumask_var(mask);  } -static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) +static void flow_cache_flush_task(struct work_struct *work) +{ +	struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm, +						flow_cache_gc_work); +	struct net *net = container_of(xfrm, struct net, xfrm); + +	flow_cache_flush(net); +} + +void flow_cache_flush_deferred(struct net *net) +{ +	schedule_work(&net->xfrm.flow_cache_flush_work); +} + +static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)  {  	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);  	size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc); @@ -370,11 +408,12 @@ static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)  	return 0;  } -static int __cpuinit flow_cache_cpu(struct notifier_block *nfb, +static int flow_cache_cpu(struct notifier_block *nfb,  			  unsigned long action,  			  void *hcpu)  { -	struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier); +	struct flow_cache *fc = container_of(nfb, struct flow_cache, +						hotcpu_notifier);  	int res, cpu = (unsigned long) hcpu;  	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); @@ -393,9 +432,20 @@ static int __cpuinit flow_cache_cpu(struct notifier_block *nfb,  	return NOTIFY_OK;  } -static int __init flow_cache_init(struct flow_cache *fc) +int flow_cache_init(struct net *net)  {  	int i; +	struct flow_cache *fc = &net->xfrm.flow_cache_global; + +	if (!flow_cachep) +		flow_cachep = kmem_cache_create("flow_cache", +						
sizeof(struct flow_cache_entry), +						0, SLAB_PANIC, NULL); +	spin_lock_init(&net->xfrm.flow_cache_gc_lock); +	INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list); +	INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task); +	INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task); +	mutex_init(&net->xfrm.flow_flush_sem);  	fc->hash_shift = 10;  	fc->low_watermark = 2 * flow_cache_hash_size(fc); @@ -405,14 +455,18 @@ static int __init flow_cache_init(struct flow_cache *fc)  	if (!fc->percpu)  		return -ENOMEM; +	cpu_notifier_register_begin(); +  	for_each_online_cpu(i) {  		if (flow_cache_cpu_prepare(fc, i)) -			return -ENOMEM; +			goto err;  	}  	fc->hotcpu_notifier = (struct notifier_block){  		.notifier_call = flow_cache_cpu,  	}; -	register_hotcpu_notifier(&fc->hotcpu_notifier); +	__register_hotcpu_notifier(&fc->hotcpu_notifier); + +	cpu_notifier_register_done();  	setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,  		    (unsigned long) fc); @@ -420,15 +474,38 @@ static int __init flow_cache_init(struct flow_cache *fc)  	add_timer(&fc->rnd_timer);  	return 0; + +err: +	for_each_possible_cpu(i) { +		struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i); +		kfree(fcp->hash_table); +		fcp->hash_table = NULL; +	} + +	cpu_notifier_register_done(); + +	free_percpu(fc->percpu); +	fc->percpu = NULL; + +	return -ENOMEM;  } +EXPORT_SYMBOL(flow_cache_init); -static int __init flow_cache_init_global(void) +void flow_cache_fini(struct net *net)  { -	flow_cachep = kmem_cache_create("flow_cache", -					sizeof(struct flow_cache_entry), -					0, SLAB_PANIC, NULL); +	int i; +	struct flow_cache *fc = &net->xfrm.flow_cache_global; -	return flow_cache_init(&flow_cache_global); -} +	del_timer_sync(&fc->rnd_timer); +	unregister_hotcpu_notifier(&fc->hotcpu_notifier); -module_init(flow_cache_init_global); +	for_each_possible_cpu(i) { +		struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i); +		kfree(fcp->hash_table); +		fcp->hash_table = NULL; +	} + +	free_percpu(fc->percpu); +	fc->percpu = NULL; +} +EXPORT_SYMBOL(flow_cache_fini); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c new file mode 100644 index 00000000000..107ed12a532 --- /dev/null +++ b/net/core/flow_dissector.c @@ -0,0 +1,405 @@ +#include <linux/skbuff.h> +#include <linux/export.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/if_vlan.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <linux/igmp.h> +#include <linux/icmp.h> +#include <linux/sctp.h> +#include <linux/dccp.h> +#include <linux/if_tunnel.h> +#include <linux/if_pppox.h> +#include <linux/ppp_defs.h> +#include <net/flow_keys.h> + +/* copy saddr & daddr, possibly using 64bit load/store + * Equivalent to :	flow->src = iph->saddr; + *			flow->dst = iph->daddr; + */ +static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph) +{ +	BUILD_BUG_ON(offsetof(typeof(*flow), dst) != +		     offsetof(typeof(*flow), src) + sizeof(flow->src)); +	memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst)); +} + +/** + * skb_flow_get_ports - extract the upper layer ports and return them + * @skb: buffer to extract the ports from + * @thoff: transport header offset + * @ip_proto: protocol for which to get port offset + * + * The function will try to retrieve the ports at offset thoff + poff where poff + * is the protocol port offset returned from proto_ports_offset + */ +__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) +{ +	int poff = proto_ports_offset(ip_proto); + +	if (poff 
>= 0) { +		__be32 *ports, _ports; + +		ports = skb_header_pointer(skb, thoff + poff, +					   sizeof(_ports), &_ports); +		if (ports) +			return *ports; +	} + +	return 0; +} +EXPORT_SYMBOL(skb_flow_get_ports); + +bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) +{ +	int nhoff = skb_network_offset(skb); +	u8 ip_proto; +	__be16 proto = skb->protocol; + +	memset(flow, 0, sizeof(*flow)); + +again: +	switch (proto) { +	case htons(ETH_P_IP): { +		const struct iphdr *iph; +		struct iphdr _iph; +ip: +		iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); +		if (!iph || iph->ihl < 5) +			return false; +		nhoff += iph->ihl * 4; + +		ip_proto = iph->protocol; +		if (ip_is_fragment(iph)) +			ip_proto = 0; + +		iph_to_flow_copy_addrs(flow, iph); +		break; +	} +	case htons(ETH_P_IPV6): { +		const struct ipv6hdr *iph; +		struct ipv6hdr _iph; +ipv6: +		iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); +		if (!iph) +			return false; + +		ip_proto = iph->nexthdr; +		flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); +		flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); +		nhoff += sizeof(struct ipv6hdr); +		break; +	} +	case htons(ETH_P_8021AD): +	case htons(ETH_P_8021Q): { +		const struct vlan_hdr *vlan; +		struct vlan_hdr _vlan; + +		vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan); +		if (!vlan) +			return false; + +		proto = vlan->h_vlan_encapsulated_proto; +		nhoff += sizeof(*vlan); +		goto again; +	} +	case htons(ETH_P_PPP_SES): { +		struct { +			struct pppoe_hdr hdr; +			__be16 proto; +		} *hdr, _hdr; +		hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); +		if (!hdr) +			return false; +		proto = hdr->proto; +		nhoff += PPPOE_SES_HLEN; +		switch (proto) { +		case htons(PPP_IP): +			goto ip; +		case htons(PPP_IPV6): +			goto ipv6; +		default: +			return false; +		} +	} +	default: +		return false; +	} + +	switch (ip_proto) { +	case IPPROTO_GRE: { +		struct gre_hdr { +			__be16 flags; +			__be16 proto; +		} *hdr, _hdr; + +		hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); +		if (!hdr) +			return false; +		/* +		 * Only look inside GRE if version zero and no +		 * routing +		 */ +		if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) { +			proto = hdr->proto; +			nhoff += 4; +			if (hdr->flags & GRE_CSUM) +				nhoff += 4; +			if (hdr->flags & GRE_KEY) +				nhoff += 4; +			if (hdr->flags & GRE_SEQ) +				nhoff += 4; +			if (proto == htons(ETH_P_TEB)) { +				const struct ethhdr *eth; +				struct ethhdr _eth; + +				eth = skb_header_pointer(skb, nhoff, +							 sizeof(_eth), &_eth); +				if (!eth) +					return false; +				proto = eth->h_proto; +				nhoff += sizeof(*eth); +			} +			goto again; +		} +		break; +	} +	case IPPROTO_IPIP: +		proto = htons(ETH_P_IP); +		goto ip; +	case IPPROTO_IPV6: +		proto = htons(ETH_P_IPV6); +		goto ipv6; +	default: +		break; +	} + +	flow->ip_proto = ip_proto; +	flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto); +	flow->thoff = (u16) nhoff; + +	return true; +} +EXPORT_SYMBOL(skb_flow_dissect); + +static u32 hashrnd __read_mostly; +static __always_inline void __flow_hash_secret_init(void) +{ +	net_get_random_once(&hashrnd, sizeof(hashrnd)); +} + +static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c) +{ +	__flow_hash_secret_init(); +	return jhash_3words(a, b, c, hashrnd); +} + +static __always_inline u32 __flow_hash_1word(u32 a) +{ +	__flow_hash_secret_init(); +	return jhash_1word(a, hashrnd); +} + +/* + * __skb_get_hash: calculate a flow hash based on src/dst addresses + * and src/dst port 
numbers.  Sets hash in skb to non-zero hash value + * on success, zero indicates no valid hash.  Also, sets l4_hash in skb + * if hash is a canonical 4-tuple hash over transport ports. + */ +void __skb_get_hash(struct sk_buff *skb) +{ +	struct flow_keys keys; +	u32 hash; + +	if (!skb_flow_dissect(skb, &keys)) +		return; + +	if (keys.ports) +		skb->l4_hash = 1; + +	/* get a consistent hash (same value on both flow directions) */ +	if (((__force u32)keys.dst < (__force u32)keys.src) || +	    (((__force u32)keys.dst == (__force u32)keys.src) && +	     ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) { +		swap(keys.dst, keys.src); +		swap(keys.port16[0], keys.port16[1]); +	} + +	hash = __flow_hash_3words((__force u32)keys.dst, +				  (__force u32)keys.src, +				  (__force u32)keys.ports); +	if (!hash) +		hash = 1; + +	skb->hash = hash; +} +EXPORT_SYMBOL(__skb_get_hash); + +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. + */ +u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, +		  unsigned int num_tx_queues) +{ +	u32 hash; +	u16 qoffset = 0; +	u16 qcount = num_tx_queues; + +	if (skb_rx_queue_recorded(skb)) { +		hash = skb_get_rx_queue(skb); +		while (unlikely(hash >= num_tx_queues)) +			hash -= num_tx_queues; +		return hash; +	} + +	if (dev->num_tc) { +		u8 tc = netdev_get_prio_tc_map(dev, skb->priority); +		qoffset = dev->tc_to_txq[tc].offset; +		qcount = dev->tc_to_txq[tc].count; +	} + +	if (skb->sk && skb->sk->sk_hash) +		hash = skb->sk->sk_hash; +	else +		hash = (__force u16) skb->protocol; +	hash = __flow_hash_1word(hash); + +	return (u16) (((u64) hash * qcount) >> 32) + qoffset; +} +EXPORT_SYMBOL(__skb_tx_hash); + +/* __skb_get_poff() returns the offset to the payload as far as it could + * be dissected. The main user is currently BPF, so that we can dynamically + * truncate packets without needing to push actual payload to the user + * space and can analyze headers only, instead. + */ +u32 __skb_get_poff(const struct sk_buff *skb) +{ +	struct flow_keys keys; +	u32 poff = 0; + +	if (!skb_flow_dissect(skb, &keys)) +		return 0; + +	poff += keys.thoff; +	switch (keys.ip_proto) { +	case IPPROTO_TCP: { +		const struct tcphdr *tcph; +		struct tcphdr _tcph; + +		tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph); +		if (!tcph) +			return poff; + +		poff += max_t(u32, sizeof(struct tcphdr), tcph->doff * 4); +		break; +	} +	case IPPROTO_UDP: +	case IPPROTO_UDPLITE: +		poff += sizeof(struct udphdr); +		break; +	/* For the rest, we do not really care about header +	 * extensions at this point for now. 
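
Side note on the hashing code above: __skb_get_hash() orders (src, dst, ports) canonically before hashing, so both directions of a flow yield the same value, and __skb_tx_hash() maps a 32-bit hash onto qcount queues with ((u64)hash * qcount) >> 32 rather than a modulo. The userspace sketch below illustrates both ideas; mix3() is only a stand-in for jhash_3words(), and every name in it is illustrative rather than kernel API.

#include <stdint.h>
#include <stdio.h>

/* Stand-in mixer for the demo; the kernel uses jhash_3words() with a
 * boot-time random seed instead. */
static uint32_t mix3(uint32_t a, uint32_t b, uint32_t c)
{
	uint32_t h = a * 0x9e3779b1u;

	h ^= b; h *= 0x85ebca6bu;
	h ^= c; h *= 0xc2b2ae35u;
	return h ^ (h >> 16);
}

/* Canonical ordering: the same flow hashes identically in both directions. */
static uint32_t flow_hash(uint32_t src, uint32_t dst,
			  uint16_t sport, uint16_t dport)
{
	uint32_t h;

	if (dst < src || (dst == src && dport < sport)) {
		uint32_t taddr = src;   src = dst;     dst = taddr;
		uint16_t tport = sport; sport = dport; dport = tport;
	}
	h = mix3(dst, src, ((uint32_t)dport << 16) | sport);
	return h ? h : 1;	/* zero is reserved for "no valid hash" */
}

/* Map a 32-bit hash onto [0, nqueues) without a division. */
static uint16_t pick_queue(uint32_t hash, uint16_t nqueues)
{
	return (uint16_t)(((uint64_t)hash * nqueues) >> 32);
}

int main(void)
{
	uint32_t fwd = flow_hash(0x0a000001, 0x0a000002, 1234, 80);
	uint32_t rev = flow_hash(0x0a000002, 0x0a000001, 80, 1234);

	printf("fwd=%08x rev=%08x queue=%u\n", fwd, rev, pick_queue(fwd, 16));
	return 0;
}
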
+	 */ +	case IPPROTO_ICMP: +		poff += sizeof(struct icmphdr); +		break; +	case IPPROTO_ICMPV6: +		poff += sizeof(struct icmp6hdr); +		break; +	case IPPROTO_IGMP: +		poff += sizeof(struct igmphdr); +		break; +	case IPPROTO_DCCP: +		poff += sizeof(struct dccp_hdr); +		break; +	case IPPROTO_SCTP: +		poff += sizeof(struct sctphdr); +		break; +	} + +	return poff; +} + +static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_XPS +	struct xps_dev_maps *dev_maps; +	struct xps_map *map; +	int queue_index = -1; + +	rcu_read_lock(); +	dev_maps = rcu_dereference(dev->xps_maps); +	if (dev_maps) { +		map = rcu_dereference( +		    dev_maps->cpu_map[raw_smp_processor_id()]); +		if (map) { +			if (map->len == 1) +				queue_index = map->queues[0]; +			else { +				u32 hash; +				if (skb->sk && skb->sk->sk_hash) +					hash = skb->sk->sk_hash; +				else +					hash = (__force u16) skb->protocol ^ +					    skb->hash; +				hash = __flow_hash_1word(hash); +				queue_index = map->queues[ +				    ((u64)hash * map->len) >> 32]; +			} +			if (unlikely(queue_index >= dev->real_num_tx_queues)) +				queue_index = -1; +		} +	} +	rcu_read_unlock(); + +	return queue_index; +#else +	return -1; +#endif +} + +static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +{ +	struct sock *sk = skb->sk; +	int queue_index = sk_tx_queue_get(sk); + +	if (queue_index < 0 || skb->ooo_okay || +	    queue_index >= dev->real_num_tx_queues) { +		int new_index = get_xps_queue(dev, skb); +		if (new_index < 0) +			new_index = skb_tx_hash(dev, skb); + +		if (queue_index != new_index && sk && +		    rcu_access_pointer(sk->sk_dst_cache)) +			sk_tx_queue_set(sk, new_index); + +		queue_index = new_index; +	} + +	return queue_index; +} + +struct netdev_queue *netdev_pick_tx(struct net_device *dev, +				    struct sk_buff *skb, +				    void *accel_priv) +{ +	int queue_index = 0; + +	if (dev->real_num_tx_queues != 1) { +		const struct net_device_ops *ops = dev->netdev_ops; +		if (ops->ndo_select_queue) +			queue_index = ops->ndo_select_queue(dev, skb, accel_priv, +							    __netdev_pick_tx); +		else +			queue_index = __netdev_pick_tx(dev, skb); + +		if (!accel_priv) +			queue_index = netdev_cap_txqueue(dev, queue_index); +	} + +	skb_set_queue_mapping(skb, queue_index); +	return netdev_get_tx_queue(dev, queue_index); +} diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 7c2373321b7..6b5b6e7013c 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -14,7 +14,6 @@   */  #include <asm/uaccess.h> -#include <asm/system.h>  #include <linux/bitops.h>  #include <linux/module.h>  #include <linux/types.h> @@ -83,7 +82,7 @@ struct gen_estimator  {  	struct list_head	list;  	struct gnet_stats_basic_packed	*bstats; -	struct gnet_stats_rate_est	*rate_est; +	struct gnet_stats_rate_est64	*rate_est;  	spinlock_t		*stats_lock;  	int			ewma_log;  	u64			last_bytes; @@ -168,7 +167,7 @@ static void gen_add_node(struct gen_estimator *est)  static  struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats, -				    const struct gnet_stats_rate_est *rate_est) +				    const struct gnet_stats_rate_est64 *rate_est)  {  	struct rb_node *p = est_root.rb_node; @@ -204,7 +203,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats   *   */  int gen_new_estimator(struct gnet_stats_basic_packed *bstats, -		      struct gnet_stats_rate_est *rate_est, +		      struct gnet_stats_rate_est64 *rate_est,  		      spinlock_t *stats_lock,  		     
 struct nlattr *opt)  { @@ -249,13 +248,6 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,  }  EXPORT_SYMBOL(gen_new_estimator); -static void __gen_kill_estimator(struct rcu_head *head) -{ -	struct gen_estimator *e = container_of(head, -					struct gen_estimator, e_rcu); -	kfree(e); -} -  /**   * gen_kill_estimator - remove a rate estimator   * @bstats: basic statistics @@ -266,7 +258,7 @@ static void __gen_kill_estimator(struct rcu_head *head)   * Note : Caller should respect an RCU grace period before freeing stats_lock   */  void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, -			struct gnet_stats_rate_est *rate_est) +			struct gnet_stats_rate_est64 *rate_est)  {  	struct gen_estimator *e; @@ -279,7 +271,7 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,  		write_unlock(&est_lock);  		list_del_rcu(&e->list); -		call_rcu(&e->e_rcu, __gen_kill_estimator); +		kfree_rcu(e, e_rcu);  	}  	spin_unlock_bh(&est_tree_lock);  } @@ -298,7 +290,7 @@ EXPORT_SYMBOL(gen_kill_estimator);   * Returns 0 on success or a negative error code.   */  int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, -			  struct gnet_stats_rate_est *rate_est, +			  struct gnet_stats_rate_est64 *rate_est,  			  spinlock_t *stats_lock, struct nlattr *opt)  {  	gen_kill_estimator(bstats, rate_est); @@ -314,7 +306,7 @@ EXPORT_SYMBOL(gen_replace_estimator);   * Returns true if estimator is active, and false if not.   */  bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, -			  const struct gnet_stats_rate_est *rate_est) +			  const struct gnet_stats_rate_est64 *rate_est)  {  	bool res; diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 0452eb27a27..9d3d9e78397 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -27,7 +27,8 @@  static inline int  gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size)  { -	NLA_PUT(d->skb, type, size, buf); +	if (nla_put(d->skb, type, size, buf)) +		goto nla_put_failure;  	return 0;  nla_put_failure: @@ -142,18 +143,30 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);  int  gnet_stats_copy_rate_est(struct gnet_dump *d,  			 const struct gnet_stats_basic_packed *b, -			 struct gnet_stats_rate_est *r) +			 struct gnet_stats_rate_est64 *r)  { +	struct gnet_stats_rate_est est; +	int res; +  	if (b && !gen_estimator_active(b, r))  		return 0; +	est.bps = min_t(u64, UINT_MAX, r->bps); +	/* we have some time before reaching 2^32 packets per second */ +	est.pps = r->pps; +  	if (d->compat_tc_stats) { -		d->tc_stats.bps = r->bps; -		d->tc_stats.pps = r->pps; +		d->tc_stats.bps = est.bps; +		d->tc_stats.pps = est.pps;  	} -	if (d->tail) -		return gnet_stats_copy(d, TCA_STATS_RATE_EST, r, sizeof(*r)); +	if (d->tail) { +		res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est)); +		if (res < 0 || est.bps == r->bps) +			return res; +		/* emit 64bit stats only if needed */ +		return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r)); +	}  	return 0;  } diff --git a/net/core/iovec.c b/net/core/iovec.c index c40f27e7d20..e1ec45ab1e6 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -35,11 +35,11 @@   *	in any case.   
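
Side note on the gnet_stats_copy_rate_est() change above: the legacy 32-bit TCA_STATS_RATE_EST attribute is still emitted for old userspace, with bps clamped to UINT_MAX, and the new 64-bit TCA_STATS_RATE_EST64 record is added only when that clamp actually lost information. A standalone sketch of the same compat pattern follows; the struct names and emit() are hypothetical, not the netlink API.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical legacy (32-bit) and extended (64-bit) wire records. */
struct rate_est32 { uint32_t bps; uint32_t pps; };
struct rate_est64 { uint64_t bps; uint64_t pps; };

static void emit(const char *attr, const void *data, size_t len)
{
	(void)data;
	printf("emit %s (%zu bytes)\n", attr, len);
}

/* Always emit the legacy record; add the 64-bit one only when clamping
 * dropped bits, i.e. when bps no longer fits in 32 bits. */
static void copy_rate_est(const struct rate_est64 *r)
{
	struct rate_est32 est = {
		.bps = r->bps > UINT32_MAX ? UINT32_MAX : (uint32_t)r->bps,
		.pps = (uint32_t)r->pps,	/* pps is assumed to still fit */
	};

	emit("RATE_EST", &est, sizeof(est));
	if (est.bps != r->bps)
		emit("RATE_EST64", r, sizeof(*r));
}

int main(void)
{
	struct rate_est64 slow = { .bps = 1000000, .pps = 100 };
	struct rate_est64 fast = { .bps = 6ULL * 1000 * 1000 * 1000, .pps = 500000 };

	copy_rate_est(&slow);	/* legacy attribute only */
	copy_rate_est(&fast);	/* legacy (clamped) plus 64-bit attribute */
	return 0;
}
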
*/ -int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode) +int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode)  {  	int size, ct, err; -	if (m->msg_namelen) { +	if (m->msg_name && m->msg_namelen) {  		if (mode == VERIFY_READ) {  			void __user *namep;  			namep = (void __user __force *) m->msg_name; @@ -51,6 +51,7 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address,  		m->msg_name = address;  	} else {  		m->msg_name = NULL; +		m->msg_namelen = 0;  	}  	size = m->msg_iovlen * sizeof(struct iovec); @@ -74,111 +75,6 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address,  }  /* - *	Copy kernel to iovec. Returns -EFAULT on error. - * - *	Note: this modifies the original iovec. - */ - -int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) -{ -	while (len > 0) { -		if (iov->iov_len) { -			int copy = min_t(unsigned int, iov->iov_len, len); -			if (copy_to_user(iov->iov_base, kdata, copy)) -				return -EFAULT; -			kdata += copy; -			len -= copy; -			iov->iov_len -= copy; -			iov->iov_base += copy; -		} -		iov++; -	} - -	return 0; -} -EXPORT_SYMBOL(memcpy_toiovec); - -/* - *	Copy kernel to iovec. Returns -EFAULT on error. - */ - -int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, -		      int offset, int len) -{ -	int copy; -	for (; len > 0; ++iov) { -		/* Skip over the finished iovecs */ -		if (unlikely(offset >= iov->iov_len)) { -			offset -= iov->iov_len; -			continue; -		} -		copy = min_t(unsigned int, iov->iov_len - offset, len); -		if (copy_to_user(iov->iov_base + offset, kdata, copy)) -			return -EFAULT; -		offset = 0; -		kdata += copy; -		len -= copy; -	} - -	return 0; -} -EXPORT_SYMBOL(memcpy_toiovecend); - -/* - *	Copy iovec to kernel. Returns -EFAULT on error. - * - *	Note: this modifies the original iovec. - */ - -int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) -{ -	while (len > 0) { -		if (iov->iov_len) { -			int copy = min_t(unsigned int, len, iov->iov_len); -			if (copy_from_user(kdata, iov->iov_base, copy)) -				return -EFAULT; -			len -= copy; -			kdata += copy; -			iov->iov_base += copy; -			iov->iov_len -= copy; -		} -		iov++; -	} - -	return 0; -} -EXPORT_SYMBOL(memcpy_fromiovec); - -/* - *	Copy iovec from kernel. Returns -EFAULT on error. 
- */ - -int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, -			int offset, int len) -{ -	/* Skip over the finished iovecs */ -	while (offset >= iov->iov_len) { -		offset -= iov->iov_len; -		iov++; -	} - -	while (len > 0) { -		u8 __user *base = iov->iov_base + offset; -		int copy = min_t(unsigned int, len, iov->iov_len - offset); - -		offset = 0; -		if (copy_from_user(kdata, base, copy)) -			return -EFAULT; -		len -= copy; -		kdata += copy; -		iov++; -	} - -	return 0; -} -EXPORT_SYMBOL(memcpy_fromiovecend); - -/*   *	And now for the all-in-one: copy and checksum from a user iovec   *	directly to a datagram   *	Calls to csum_partial but the last must be in 32 bit chunks @@ -262,3 +158,27 @@ out_fault:  	goto out;  }  EXPORT_SYMBOL(csum_partial_copy_fromiovecend); + +unsigned long iov_pages(const struct iovec *iov, int offset, +			unsigned long nr_segs) +{ +	unsigned long seg, base; +	int pages = 0, len, size; + +	while (nr_segs && (offset >= iov->iov_len)) { +		offset -= iov->iov_len; +		++iov; +		--nr_segs; +	} + +	for (seg = 0; seg < nr_segs; seg++) { +		base = (unsigned long)iov[seg].iov_base + offset; +		len = iov[seg].iov_len - offset; +		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; +		pages += size; +		offset = 0; +	} + +	return pages; +} +EXPORT_SYMBOL(iov_pages); diff --git a/net/core/kmap_skb.h b/net/core/kmap_skb.h deleted file mode 100644 index 283c2b993fb..00000000000 --- a/net/core/kmap_skb.h +++ /dev/null @@ -1,19 +0,0 @@ -#include <linux/highmem.h> - -static inline void *kmap_skb_frag(const skb_frag_t *frag) -{ -#ifdef CONFIG_HIGHMEM -	BUG_ON(in_irq()); - -	local_bh_disable(); -#endif -	return kmap_atomic(frag->page, KM_SKB_DATA_SOFTIRQ); -} - -static inline void kunmap_skb_frag(void *vaddr) -{ -	kunmap_atomic(vaddr, KM_SKB_DATA_SOFTIRQ); -#ifdef CONFIG_HIGHMEM -	local_bh_enable(); -#endif -} diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 01a1101b593..bd0767e6b2b 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -76,10 +76,26 @@ static void rfc2863_policy(struct net_device *dev)  } +void linkwatch_init_dev(struct net_device *dev) +{ +	/* Handle pre-registration link state changes */ +	if (!netif_carrier_ok(dev) || netif_dormant(dev)) +		rfc2863_policy(dev); +} + +  static bool linkwatch_urgent_event(struct net_device *dev)  { -	return netif_running(dev) && netif_carrier_ok(dev) && -		qdisc_tx_changing(dev); +	if (!netif_running(dev)) +		return false; + +	if (dev->ifindex != dev->iflink) +		return true; + +	if (dev->priv_flags & IFF_TEAM_PORT) +		return true; + +	return netif_carrier_ok(dev) &&	qdisc_tx_changing(dev);  } @@ -115,22 +131,13 @@ static void linkwatch_schedule_work(int urgent)  		delay = 0;  	/* -	 * This is true if we've scheduled it immeditately or if we don't -	 * need an immediate execution and it's already pending. +	 * If urgent, schedule immediate execution; otherwise, don't +	 * override the existing timer.  	 */ -	if (schedule_delayed_work(&linkwatch_work, delay) == !delay) -		return; - -	/* Don't bother if there is nothing urgent. */ -	if (!test_bit(LW_URGENT, &linkwatch_flags)) -		return; - -	/* It's already running which is good enough. */ -	if (!cancel_delayed_work(&linkwatch_work)) -		return; - -	/* Otherwise we reschedule it again for immediate exection. 
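
Side note on iov_pages() above: the number of pages a segment touches depends on where its base address sits inside the first page, so the in-page offset is added to the length before rounding up; a short buffer that straddles a page boundary still needs two pages. The sketch below redoes the same arithmetic in userspace, assuming 4 KiB pages.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* Pages spanned by len bytes starting at (virtual) address base. */
static unsigned long pages_spanned(unsigned long base, unsigned long len)
{
	/* (offset inside the first page + len), rounded up to whole pages */
	return ((base & ~PAGE_MASK) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
	/* 100 bytes fully inside one page */
	printf("%lu\n", pages_spanned(0x1000, 100));	/* prints 1 */
	/* 100 bytes starting 50 bytes before a page boundary */
	printf("%lu\n", pages_spanned(0x1fce, 100));	/* prints 2 */
	return 0;
}
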
*/ -	schedule_delayed_work(&linkwatch_work, 0); +	if (test_bit(LW_URGENT, &linkwatch_flags)) +		mod_delayed_work(system_wq, &linkwatch_work, 0); +	else +		schedule_delayed_work(&linkwatch_work, delay);  } @@ -140,7 +147,7 @@ static void linkwatch_do_dev(struct net_device *dev)  	 * Make sure the above read is complete since it can be  	 * rewritten as soon as we clear the bit below.  	 */ -	smp_mb__before_clear_bit(); +	smp_mb__before_atomic();  	/* We are about to handle this device,  	 * so new events can be accepted diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8cc8f9a79db..ef31fef25e5 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -15,6 +15,8 @@   *	Harald Welte		Add neighbour cache statistics like rtstat   */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/slab.h>  #include <linux/types.h>  #include <linux/kernel.h> @@ -36,23 +38,16 @@  #include <linux/random.h>  #include <linux/string.h>  #include <linux/log2.h> +#include <linux/inetdevice.h> +#include <net/addrconf.h> +#define DEBUG  #define NEIGH_DEBUG 1 - -#define NEIGH_PRINTK(x...) printk(x) -#define NEIGH_NOPRINTK(x...) do { ; } while(0) -#define NEIGH_PRINTK0 NEIGH_PRINTK -#define NEIGH_PRINTK1 NEIGH_NOPRINTK -#define NEIGH_PRINTK2 NEIGH_NOPRINTK - -#if NEIGH_DEBUG >= 1 -#undef NEIGH_PRINTK1 -#define NEIGH_PRINTK1 NEIGH_PRINTK -#endif -#if NEIGH_DEBUG >= 2 -#undef NEIGH_PRINTK2 -#define NEIGH_PRINTK2 NEIGH_PRINTK -#endif +#define neigh_dbg(level, fmt, ...)		\ +do {						\ +	if (level <= NEIGH_DEBUG)		\ +		pr_debug(fmt, ##__VA_ARGS__);	\ +} while (0)  #define PNEIGH_HASHMASK		0xF @@ -99,7 +94,7 @@ static const struct file_operations neigh_stat_seq_fops;  static DEFINE_RWLOCK(neigh_tbl_lock); -static int neigh_blackhole(struct sk_buff *skb) +static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)  {  	kfree_skb(skb);  	return -ENETDOWN; @@ -122,7 +117,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)  unsigned long neigh_rand_reach_time(unsigned long base)  { -	return base ? (net_random() % base) + (base >> 1) : 0; +	return base ? (prandom_u32() % base) + (base >> 1) : 0;  }  EXPORT_SYMBOL(neigh_rand_reach_time); @@ -138,7 +133,7 @@ static int neigh_forced_gc(struct neigh_table *tbl)  	write_lock_bh(&tbl->lock);  	nht = rcu_dereference_protected(tbl->nht,  					lockdep_is_held(&tbl->lock)); -	for (i = 0; i <= nht->hash_mask; i++) { +	for (i = 0; i < (1 << nht->hash_shift); i++) {  		struct neighbour *n;  		struct neighbour __rcu **np; @@ -211,7 +206,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)  	nht = rcu_dereference_protected(tbl->nht,  					lockdep_is_held(&tbl->lock)); -	for (i = 0; i <= nht->hash_mask; i++) { +	for (i = 0; i < (1 << nht->hash_shift); i++) {  		struct neighbour *n;  		struct neighbour __rcu **np = &nht->hash_buckets[i]; @@ -238,13 +233,14 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)  				   we must kill timers etc. and move  				   it to safe state.  				 
*/ -				skb_queue_purge(&n->arp_queue); +				__skb_queue_purge(&n->arp_queue); +				n->arp_queue_len_bytes = 0;  				n->output = neigh_blackhole;  				if (n->nud_state & NUD_VALID)  					n->nud_state = NUD_NOARP;  				else  					n->nud_state = NUD_NONE; -				NEIGH_PRINTK2("neigh %p is stray.\n", n); +				neigh_dbg(2, "neigh %p is stray\n", n);  			}  			write_unlock(&n->lock);  			neigh_cleanup_and_release(n); @@ -273,7 +269,7 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)  }  EXPORT_SYMBOL(neigh_ifdown); -static struct neighbour *neigh_alloc(struct neigh_table *tbl) +static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)  {  	struct neighbour *n = NULL;  	unsigned long now = jiffies; @@ -288,16 +284,17 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)  			goto out_entries;  	} -	n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC); +	n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);  	if (!n)  		goto out_entries; -	skb_queue_head_init(&n->arp_queue); +	__skb_queue_head_init(&n->arp_queue);  	rwlock_init(&n->lock);  	seqlock_init(&n->ha_lock);  	n->updated	  = n->used = now;  	n->nud_state	  = NUD_NONE;  	n->output	  = neigh_blackhole; +	seqlock_init(&n->hh.hh_lock);  	n->parms	  = neigh_parms_clone(&tbl->parms);  	setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n); @@ -313,11 +310,18 @@ out_entries:  	goto out;  } -static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries) +static void neigh_get_hash_rnd(u32 *x)  { -	size_t size = entries * sizeof(struct neighbour *); +	get_random_bytes(x, sizeof(*x)); +	*x |= 1; +} + +static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) +{ +	size_t size = (1 << shift) * sizeof(struct neighbour *);  	struct neigh_hash_table *ret; -	struct neighbour **buckets; +	struct neighbour __rcu **buckets; +	int i;  	ret = kmalloc(sizeof(*ret), GFP_ATOMIC);  	if (!ret) @@ -325,16 +329,17 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries)  	if (size <= PAGE_SIZE)  		buckets = kzalloc(size, GFP_ATOMIC);  	else -		buckets = (struct neighbour **) +		buckets = (struct neighbour __rcu **)  			  __get_free_pages(GFP_ATOMIC | __GFP_ZERO,  					   get_order(size));  	if (!buckets) {  		kfree(ret);  		return NULL;  	} -	rcu_assign_pointer(ret->hash_buckets, buckets); -	ret->hash_mask = entries - 1; -	get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd)); +	ret->hash_buckets = buckets; +	ret->hash_shift = shift; +	for (i = 0; i < NEIGH_NUM_HASH_RND; i++) +		neigh_get_hash_rnd(&ret->hash_rnd[i]);  	return ret;  } @@ -343,8 +348,8 @@ static void neigh_hash_free_rcu(struct rcu_head *head)  	struct neigh_hash_table *nht = container_of(head,  						    struct neigh_hash_table,  						    rcu); -	size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *); -	struct neighbour **buckets = nht->hash_buckets; +	size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *); +	struct neighbour __rcu **buckets = nht->hash_buckets;  	if (size <= PAGE_SIZE)  		kfree(buckets); @@ -354,21 +359,20 @@ static void neigh_hash_free_rcu(struct rcu_head *head)  }  static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, -						unsigned long new_entries) +						unsigned long new_shift)  {  	unsigned int i, hash;  	struct neigh_hash_table *new_nht, *old_nht;  	NEIGH_CACHE_STAT_INC(tbl, hash_grows); -	BUG_ON(!is_power_of_2(new_entries));  	old_nht = rcu_dereference_protected(tbl->nht,  					    lockdep_is_held(&tbl->lock)); -	new_nht = 
neigh_hash_alloc(new_entries); +	new_nht = neigh_hash_alloc(new_shift);  	if (!new_nht)  		return old_nht; -	for (i = 0; i <= old_nht->hash_mask; i++) { +	for (i = 0; i < (1 << old_nht->hash_shift); i++) {  		struct neighbour *n, *next;  		for (n = rcu_dereference_protected(old_nht->hash_buckets[i], @@ -378,7 +382,7 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,  			hash = tbl->hash(n->primary_key, n->dev,  					 new_nht->hash_rnd); -			hash &= new_nht->hash_mask; +			hash >>= (32 - new_nht->hash_shift);  			next = rcu_dereference_protected(n->next,  						lockdep_is_held(&tbl->lock)); @@ -407,7 +411,7 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,  	rcu_read_lock_bh();  	nht = rcu_dereference_bh(tbl->nht); -	hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask; +	hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);  	for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);  	     n != NULL; @@ -437,7 +441,7 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,  	rcu_read_lock_bh();  	nht = rcu_dereference_bh(tbl->nht); -	hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) & nht->hash_mask; +	hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) >> (32 - nht->hash_shift);  	for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);  	     n != NULL; @@ -456,13 +460,13 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,  }  EXPORT_SYMBOL(neigh_lookup_nodev); -struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, -			       struct net_device *dev) +struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, +				 struct net_device *dev, bool want_ref)  {  	u32 hash_val;  	int key_len = tbl->key_len;  	int error; -	struct neighbour *n1, *rc, *n = neigh_alloc(tbl); +	struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);  	struct neigh_hash_table *nht;  	if (!n) { @@ -480,6 +484,14 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,  		goto out_neigh_release;  	} +	if (dev->netdev_ops->ndo_neigh_construct) { +		error = dev->netdev_ops->ndo_neigh_construct(n); +		if (error < 0) { +			rc = ERR_PTR(error); +			goto out_neigh_release; +		} +	} +  	/* Device specific setup. 
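
Side note on the hash table changes above: the neighbour table now stores a shift instead of a mask and indexes buckets with the top bits of the 32-bit hash, hash >> (32 - shift); growing the table means allocating 2^(shift+1) buckets and rehashing. The sketch below shows just the indexing, with toy_hash() standing in for tbl->hash(); the kernel also keeps every hash_rnd value odd, as neigh_get_hash_rnd() above does.

#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for the table's hash function, not the kernel's. */
static uint32_t toy_hash(uint32_t key, uint32_t rnd)
{
	uint32_t h = (key ^ rnd) * 0x9e3779b1u;

	return h ^ (h >> 15);
}

/* Bucket index: the top 'shift' bits of the hash. */
static uint32_t bucket(uint32_t hash, unsigned int shift)
{
	return hash >> (32 - shift);
}

int main(void)
{
	uint32_t rnd = 0x12345678u | 1;		/* hash_rnd is kept odd */
	uint32_t h = toy_hash(0x0a000001, rnd);
	unsigned int shift;

	/* Same hash, two table sizes: only the number of top bits used changes. */
	for (shift = 3; shift <= 4; shift++)
		printf("shift=%u buckets=%u index=%u\n",
		       shift, 1u << shift, bucket(h, shift));
	return 0;
}
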
*/  	if (n->parms->neigh_setup &&  	    (error = n->parms->neigh_setup(n)) < 0) { @@ -487,16 +499,16 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,  		goto out_neigh_release;  	} -	n->confirmed = jiffies - (n->parms->base_reachable_time << 1); +	n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1);  	write_lock_bh(&tbl->lock);  	nht = rcu_dereference_protected(tbl->nht,  					lockdep_is_held(&tbl->lock)); -	if (atomic_read(&tbl->entries) > (nht->hash_mask + 1)) -		nht = neigh_hash_grow(tbl, (nht->hash_mask + 1) << 1); +	if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) +		nht = neigh_hash_grow(tbl, nht->hash_shift + 1); -	hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask; +	hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);  	if (n->parms->dead) {  		rc = ERR_PTR(-EINVAL); @@ -509,20 +521,22 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,  	     n1 = rcu_dereference_protected(n1->next,  			lockdep_is_held(&tbl->lock))) {  		if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { -			neigh_hold(n1); +			if (want_ref) +				neigh_hold(n1);  			rc = n1;  			goto out_tbl_unlock;  		}  	}  	n->dead = 0; -	neigh_hold(n); +	if (want_ref) +		neigh_hold(n);  	rcu_assign_pointer(n->next,  			   rcu_dereference_protected(nht->hash_buckets[hash_val],  						     lockdep_is_held(&tbl->lock)));  	rcu_assign_pointer(nht->hash_buckets[hash_val], n);  	write_unlock_bh(&tbl->lock); -	NEIGH_PRINTK2("neigh %p is created.\n", n); +	neigh_dbg(2, "neigh %p is created\n", n);  	rc = n;  out:  	return rc; @@ -532,7 +546,7 @@ out_neigh_release:  	neigh_release(n);  	goto out;  } -EXPORT_SYMBOL(neigh_create); +EXPORT_SYMBOL(__neigh_create);  static u32 pneigh_hash(const void *pkey, int key_len)  { @@ -677,51 +691,40 @@ static inline void neigh_parms_put(struct neigh_parms *parms)  		neigh_parms_destroy(parms);  } -static void neigh_destroy_rcu(struct rcu_head *head) -{ -	struct neighbour *neigh = container_of(head, struct neighbour, rcu); - -	kmem_cache_free(neigh->tbl->kmem_cachep, neigh); -}  /*   *	neighbour must already be out of the table;   *   */  void neigh_destroy(struct neighbour *neigh)  { -	struct hh_cache *hh; +	struct net_device *dev = neigh->dev;  	NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);  	if (!neigh->dead) { -		printk(KERN_WARNING -		       "Destroying alive neighbour %p\n", neigh); +		pr_warn("Destroying alive neighbour %p\n", neigh);  		dump_stack();  		return;  	}  	if (neigh_del_timer(neigh)) -		printk(KERN_WARNING "Impossible event.\n"); +		pr_warn("Impossible event\n"); -	while ((hh = neigh->hh) != NULL) { -		neigh->hh = hh->hh_next; -		hh->hh_next = NULL; - -		write_seqlock_bh(&hh->hh_lock); -		hh->hh_output = neigh_blackhole; -		write_sequnlock_bh(&hh->hh_lock); -		hh_cache_put(hh); -	} +	write_lock_bh(&neigh->lock); +	__skb_queue_purge(&neigh->arp_queue); +	write_unlock_bh(&neigh->lock); +	neigh->arp_queue_len_bytes = 0; -	skb_queue_purge(&neigh->arp_queue); +	if (dev->netdev_ops->ndo_neigh_destroy) +		dev->netdev_ops->ndo_neigh_destroy(neigh); -	dev_put(neigh->dev); +	dev_put(dev);  	neigh_parms_put(neigh->parms); -	NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); +	neigh_dbg(2, "neigh %p is destroyed\n", neigh);  	atomic_dec(&neigh->tbl->entries); -	call_rcu(&neigh->rcu, neigh_destroy_rcu); +	kfree_rcu(neigh, rcu);  }  EXPORT_SYMBOL(neigh_destroy); @@ -732,14 +735,9 @@ EXPORT_SYMBOL(neigh_destroy);   */  static void neigh_suspect(struct neighbour *neigh)  
{ -	struct hh_cache *hh; - -	NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); +	neigh_dbg(2, "neigh %p is suspected\n", neigh);  	neigh->output = neigh->ops->output; - -	for (hh = neigh->hh; hh; hh = hh->hh_next) -		hh->hh_output = neigh->ops->output;  }  /* Neighbour state is OK; @@ -749,14 +747,9 @@ static void neigh_suspect(struct neighbour *neigh)   */  static void neigh_connect(struct neighbour *neigh)  { -	struct hh_cache *hh; - -	NEIGH_PRINTK2("neigh %p is connected.\n", neigh); +	neigh_dbg(2, "neigh %p is connected\n", neigh);  	neigh->output = neigh->ops->connected_output; - -	for (hh = neigh->hh; hh; hh = hh->hh_next) -		hh->hh_output = neigh->ops->hh_output;  }  static void neigh_periodic_work(struct work_struct *work) @@ -782,10 +775,13 @@ static void neigh_periodic_work(struct work_struct *work)  		tbl->last_rand = jiffies;  		for (p = &tbl->parms; p; p = p->next)  			p->reachable_time = -				neigh_rand_reach_time(p->base_reachable_time); +				neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));  	} -	for (i = 0 ; i <= nht->hash_mask; i++) { +	if (atomic_read(&tbl->entries) < tbl->gc_thresh1) +		goto out; + +	for (i = 0 ; i < (1 << nht->hash_shift); i++) {  		np = &nht->hash_buckets[i];  		while ((n = rcu_dereference_protected(*np, @@ -805,7 +801,7 @@ static void neigh_periodic_work(struct work_struct *work)  			if (atomic_read(&n->refcnt) == 1 &&  			    (state == NUD_FAILED || -			     time_after(jiffies, n->used + n->parms->gc_staletime))) { +			     time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {  				*np = n->next;  				n->dead = 1;  				write_unlock(&n->lock); @@ -824,22 +820,26 @@ next_elt:  		write_unlock_bh(&tbl->lock);  		cond_resched();  		write_lock_bh(&tbl->lock); +		nht = rcu_dereference_protected(tbl->nht, +						lockdep_is_held(&tbl->lock));  	} -	/* Cycle through all hash buckets every base_reachable_time/2 ticks. -	 * ARP entry timeouts range from 1/2 base_reachable_time to 3/2 -	 * base_reachable_time. +out: +	/* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks. +	 * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2 +	 * BASE_REACHABLE_TIME.  	 */ -	schedule_delayed_work(&tbl->gc_work, -			      tbl->parms.base_reachable_time >> 1); +	queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, +			      NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1);  	write_unlock_bh(&tbl->lock);  }  static __inline__ int neigh_max_probes(struct neighbour *n)  {  	struct neigh_parms *p = n->parms; -	return (n->nud_state & NUD_PROBE) ? -		p->ucast_probes : -		p->ucast_probes + p->app_probes + p->mcast_probes; +	int max_probes = NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES); +	if (!(n->nud_state & NUD_PROBE)) +		max_probes += NEIGH_VAR(p, MCAST_PROBES); +	return max_probes;  }  static void neigh_invalidate(struct neighbour *neigh) @@ -849,7 +849,7 @@ static void neigh_invalidate(struct neighbour *neigh)  	struct sk_buff *skb;  	NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); -	NEIGH_PRINTK2("neigh %p is failed.\n", neigh); +	neigh_dbg(2, "neigh %p is failed\n", neigh);  	neigh->updated = jiffies;  	/* It is very thin place. 
report_unreachable is very complicated @@ -863,7 +863,21 @@ static void neigh_invalidate(struct neighbour *neigh)  		neigh->ops->error_report(neigh, skb);  		write_lock(&neigh->lock);  	} -	skb_queue_purge(&neigh->arp_queue); +	__skb_queue_purge(&neigh->arp_queue); +	neigh->arp_queue_len_bytes = 0; +} + +static void neigh_probe(struct neighbour *neigh) +	__releases(neigh->lock) +{ +	struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); +	/* keep skb alive even if arp_queue overflows */ +	if (skb) +		skb = skb_copy(skb, GFP_ATOMIC); +	write_unlock(&neigh->lock); +	neigh->ops->solicit(neigh, skb); +	atomic_inc(&neigh->probes); +	kfree_skb(skb);  }  /* Called when a timer expires for a neighbour entry. */ @@ -872,7 +886,7 @@ static void neigh_timer_handler(unsigned long arg)  {  	unsigned long now, next;  	struct neighbour *neigh = (struct neighbour *)arg; -	unsigned state; +	unsigned int state;  	int notify = 0;  	write_lock(&neigh->lock); @@ -881,27 +895,24 @@ static void neigh_timer_handler(unsigned long arg)  	now = jiffies;  	next = now + HZ; -	if (!(state & NUD_IN_TIMER)) { -#ifndef CONFIG_SMP -		printk(KERN_WARNING "neigh: timer & !nud_in_timer\n"); -#endif +	if (!(state & NUD_IN_TIMER))  		goto out; -	}  	if (state & NUD_REACHABLE) {  		if (time_before_eq(now,  				   neigh->confirmed + neigh->parms->reachable_time)) { -			NEIGH_PRINTK2("neigh %p is still alive.\n", neigh); +			neigh_dbg(2, "neigh %p is still alive\n", neigh);  			next = neigh->confirmed + neigh->parms->reachable_time;  		} else if (time_before_eq(now, -					  neigh->used + neigh->parms->delay_probe_time)) { -			NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); +					  neigh->used + +					  NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { +			neigh_dbg(2, "neigh %p is delayed\n", neigh);  			neigh->nud_state = NUD_DELAY;  			neigh->updated = jiffies;  			neigh_suspect(neigh); -			next = now + neigh->parms->delay_probe_time; +			next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME);  		} else { -			NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); +			neigh_dbg(2, "neigh %p is suspected\n", neigh);  			neigh->nud_state = NUD_STALE;  			neigh->updated = jiffies;  			neigh_suspect(neigh); @@ -909,23 +920,24 @@ static void neigh_timer_handler(unsigned long arg)  		}  	} else if (state & NUD_DELAY) {  		if (time_before_eq(now, -				   neigh->confirmed + neigh->parms->delay_probe_time)) { -			NEIGH_PRINTK2("neigh %p is now reachable.\n", neigh); +				   neigh->confirmed + +				   NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { +			neigh_dbg(2, "neigh %p is now reachable\n", neigh);  			neigh->nud_state = NUD_REACHABLE;  			neigh->updated = jiffies;  			neigh_connect(neigh);  			notify = 1;  			next = neigh->confirmed + neigh->parms->reachable_time;  		} else { -			NEIGH_PRINTK2("neigh %p is probed.\n", neigh); +			neigh_dbg(2, "neigh %p is probed\n", neigh);  			neigh->nud_state = NUD_PROBE;  			neigh->updated = jiffies;  			atomic_set(&neigh->probes, 0); -			next = now + neigh->parms->retrans_time; +			next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);  		}  	} else {  		/* NUD_PROBE|NUD_INCOMPLETE */ -		next = now + neigh->parms->retrans_time; +		next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);  	}  	if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && @@ -933,6 +945,7 @@ static void neigh_timer_handler(unsigned long arg)  		neigh->nud_state = NUD_FAILED;  		notify = 1;  		neigh_invalidate(neigh); +		goto out;  	}  	if (neigh->nud_state & NUD_IN_TIMER) { @@ -942,14 +955,7 @@ static void neigh_timer_handler(unsigned 
long arg)  			neigh_hold(neigh);  	}  	if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { -		struct sk_buff *skb = skb_peek(&neigh->arp_queue); -		/* keep skb alive even if arp_queue overflows */ -		if (skb) -			skb = skb_copy(skb, GFP_ATOMIC); -		write_unlock(&neigh->lock); -		neigh->ops->solicit(neigh, skb); -		atomic_inc(&neigh->probes); -		kfree_skb(skb); +		neigh_probe(neigh);  	} else {  out:  		write_unlock(&neigh->lock); @@ -964,7 +970,7 @@ out:  int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)  {  	int rc; -	unsigned long now; +	bool immediate_probe = false;  	write_lock_bh(&neigh->lock); @@ -972,14 +978,19 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)  	if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))  		goto out_unlock_bh; -	now = jiffies; -  	if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { -		if (neigh->parms->mcast_probes + neigh->parms->app_probes) { -			atomic_set(&neigh->probes, neigh->parms->ucast_probes); +		if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + +		    NEIGH_VAR(neigh->parms, APP_PROBES)) { +			unsigned long next, now = jiffies; + +			atomic_set(&neigh->probes, +				   NEIGH_VAR(neigh->parms, UCAST_PROBES));  			neigh->nud_state     = NUD_INCOMPLETE; -			neigh->updated = jiffies; -			neigh_add_timer(neigh, now + 1); +			neigh->updated = now; +			next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), +					 HZ/2); +			neigh_add_timer(neigh, next); +			immediate_probe = true;  		} else {  			neigh->nud_state = NUD_FAILED;  			neigh->updated = jiffies; @@ -989,34 +1000,43 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)  			return 1;  		}  	} else if (neigh->nud_state & NUD_STALE) { -		NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); +		neigh_dbg(2, "neigh %p is delayed\n", neigh);  		neigh->nud_state = NUD_DELAY;  		neigh->updated = jiffies; -		neigh_add_timer(neigh, -				jiffies + neigh->parms->delay_probe_time); +		neigh_add_timer(neigh, jiffies + +				NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));  	}  	if (neigh->nud_state == NUD_INCOMPLETE) {  		if (skb) { -			if (skb_queue_len(&neigh->arp_queue) >= -			    neigh->parms->queue_len) { +			while (neigh->arp_queue_len_bytes + skb->truesize > +			       NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) {  				struct sk_buff *buff; +  				buff = __skb_dequeue(&neigh->arp_queue); +				if (!buff) +					break; +				neigh->arp_queue_len_bytes -= buff->truesize;  				kfree_skb(buff);  				NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);  			}  			skb_dst_force(skb);  			__skb_queue_tail(&neigh->arp_queue, skb); +			neigh->arp_queue_len_bytes += skb->truesize;  		}  		rc = 1;  	}  out_unlock_bh: -	write_unlock_bh(&neigh->lock); +	if (immediate_probe) +		neigh_probe(neigh); +	else +		write_unlock(&neigh->lock); +	local_bh_enable();  	return rc;  }  EXPORT_SYMBOL(__neigh_event_send); -static void neigh_update_hhs(const struct neighbour *neigh) +static void neigh_update_hhs(struct neighbour *neigh)  {  	struct hh_cache *hh;  	void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) @@ -1026,7 +1046,8 @@ static void neigh_update_hhs(const struct neighbour *neigh)  		update = neigh->dev->header_ops->cache_update;  	if (update) { -		for (hh = neigh->hh; hh; hh = hh->hh_next) { +		hh = &neigh->hh; +		if (hh->hh_len) {  			write_seqlock_bh(&hh->hh_lock);  			update(hh, neigh->dev, neigh->ha);  			write_sequnlock_bh(&hh->hh_lock); @@ -1149,6 +1170,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,  		
				 neigh->parms->reachable_time :  						 0)));  		neigh->nud_state = new; +		notify = 1;  	}  	if (lladdr != neigh->ha) { @@ -1158,7 +1180,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,  		neigh_update_hhs(neigh);  		if (!(new & NUD_CONNECTED))  			neigh->confirmed = jiffies - -				      (neigh->parms->base_reachable_time << 1); +				      (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1);  		notify = 1;  	}  	if (new == old) @@ -1174,15 +1196,34 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,  		while (neigh->nud_state & NUD_VALID &&  		       (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { -			struct neighbour *n1 = neigh; +			struct dst_entry *dst = skb_dst(skb); +			struct neighbour *n2, *n1 = neigh;  			write_unlock_bh(&neigh->lock); -			/* On shaper/eql skb->dst->neighbour != neigh :( */ -			if (skb_dst(skb) && skb_dst(skb)->neighbour) -				n1 = skb_dst(skb)->neighbour; -			n1->output(skb); + +			rcu_read_lock(); + +			/* Why not just use 'neigh' as-is?  The problem is that +			 * things such as shaper, eql, and sch_teql can end up +			 * using alternative, different, neigh objects to output +			 * the packet in the output path.  So what we need to do +			 * here is re-lookup the top-level neigh in the path so +			 * we can reinject the packet there. +			 */ +			n2 = NULL; +			if (dst) { +				n2 = dst_neigh_lookup_skb(dst, skb); +				if (n2) +					n1 = n2; +			} +			n1->output(n1, skb); +			if (n2) +				neigh_release(n2); +			rcu_read_unlock(); +  			write_lock_bh(&neigh->lock);  		} -		skb_queue_purge(&neigh->arp_queue); +		__skb_queue_purge(&neigh->arp_queue); +		neigh->arp_queue_len_bytes = 0;  	}  out:  	if (update_isrouter) { @@ -1199,6 +1240,21 @@ out:  }  EXPORT_SYMBOL(neigh_update); +/* Update the neigh to listen temporarily for probe responses, even if it is + * in a NUD_FAILED state. The caller has to hold neigh->lock for writing. 
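
Side note on the __neigh_event_send() change earlier in this file: the queue of packets waiting for address resolution is now limited by a byte budget (QUEUE_LEN_BYTES, accounted with skb->truesize) instead of a packet count, and the oldest entries are dropped from the head until the new packet fits. A minimal userspace sketch of that drop-oldest byte budget follows; all types and names here are illustrative only.

#include <stddef.h>
#include <stdio.h>

struct pkt {
	struct pkt *next;
	size_t truesize;	/* memory actually consumed, not just payload */
};

struct budget_queue {
	struct pkt *head, *tail;
	size_t bytes, limit;
};

/* Drop oldest packets until 'incoming' fits, then append it. */
static void bq_enqueue(struct budget_queue *q, struct pkt *incoming)
{
	while (q->head && q->bytes + incoming->truesize > q->limit) {
		struct pkt *old = q->head;

		q->head = old->next;
		if (!q->head)
			q->tail = NULL;
		q->bytes -= old->truesize;
		printf("dropped %zu-byte packet\n", old->truesize);
	}

	incoming->next = NULL;
	if (q->tail)
		q->tail->next = incoming;
	else
		q->head = incoming;
	q->tail = incoming;
	q->bytes += incoming->truesize;
}

int main(void)
{
	struct budget_queue q = { .limit = 4096 };
	struct pkt a = { .truesize = 2048 };
	struct pkt b = { .truesize = 2048 };
	struct pkt c = { .truesize = 1024 };

	bq_enqueue(&q, &a);
	bq_enqueue(&q, &b);
	bq_enqueue(&q, &c);	/* over budget: the oldest packet is dropped */
	printf("queued bytes: %zu\n", q.bytes);
	return 0;
}
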
+ */ +void __neigh_set_probe_once(struct neighbour *neigh) +{ +	neigh->updated = jiffies; +	if (!(neigh->nud_state & NUD_FAILED)) +		return; +	neigh->nud_state = NUD_INCOMPLETE; +	atomic_set(&neigh->probes, neigh_max_probes(neigh)); +	neigh_add_timer(neigh, +			jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME)); +} +EXPORT_SYMBOL(__neigh_set_probe_once); +  struct neighbour *neigh_event_ns(struct neigh_table *tbl,  				 u8 *lladdr, void *saddr,  				 struct net_device *dev) @@ -1212,67 +1268,21 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl,  }  EXPORT_SYMBOL(neigh_event_ns); -static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst, -				   __be16 protocol) -{ -	struct hh_cache *hh; - -	smp_rmb(); /* paired with smp_wmb() in neigh_hh_init() */ -	for (hh = n->hh; hh; hh = hh->hh_next) { -		if (hh->hh_type == protocol) { -			atomic_inc(&hh->hh_refcnt); -			if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL)) -				hh_cache_put(hh); -			return true; -		} -	} -	return false; -} -  /* called with read_lock_bh(&n->lock); */ -static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, -			  __be16 protocol) +static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst)  { -	struct hh_cache	*hh;  	struct net_device *dev = dst->dev; - -	if (likely(neigh_hh_lookup(n, dst, protocol))) -		return; - -	/* slow path */ -	hh = kzalloc(sizeof(*hh), GFP_ATOMIC); -	if (!hh) -		return; - -	seqlock_init(&hh->hh_lock); -	hh->hh_type = protocol; -	atomic_set(&hh->hh_refcnt, 2); - -	if (dev->header_ops->cache(n, hh)) { -		kfree(hh); -		return; -	} +	__be16 prot = dst->ops->protocol; +	struct hh_cache	*hh = &n->hh;  	write_lock_bh(&n->lock); -	/* must check if another thread already did the insert */ -	if (neigh_hh_lookup(n, dst, protocol)) { -		kfree(hh); -		goto end; -	} - -	if (n->nud_state & NUD_CONNECTED) -		hh->hh_output = n->ops->hh_output; -	else -		hh->hh_output = n->ops->output; - -	hh->hh_next = n->hh; -	smp_wmb(); /* paired with smp_rmb() in neigh_hh_lookup() */ -	n->hh	    = hh; +	/* Only one thread can come in here and initialize the +	 * hh_cache entry. +	 */ +	if (!hh->hh_len) +		dev->header_ops->cache(n, hh, prot); -	if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL)) -		hh_cache_put(hh); -end:  	write_unlock_bh(&n->lock);  } @@ -1281,7 +1291,7 @@ end:   * but resolution is not made yet.   */ -int neigh_compat_output(struct sk_buff *skb) +int neigh_compat_output(struct neighbour *neigh, struct sk_buff *skb)  {  	struct net_device *dev = skb->dev; @@ -1289,7 +1299,7 @@ int neigh_compat_output(struct sk_buff *skb)  	if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL,  			    skb->len) < 0 && -	    dev->header_ops->rebuild(skb)) +	    dev_rebuild_header(skb))  		return 0;  	return dev_queue_xmit(skb); @@ -1298,43 +1308,38 @@ EXPORT_SYMBOL(neigh_compat_output);  /* Slow and careful. 
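
Side note on neigh_hh_init() above: the per-protocol list of hh_cache entries is gone; each neighbour now embeds a single hh_cache that is filled in at most once, with hh_len doubling as the "already initialized" flag, and the check is repeated under the write lock because callers test hh_len without it first. The sketch below shows that check, lock, re-check pattern with a pthread mutex standing in for neigh->lock; in strictly conforming userspace C the unlocked fast-path read should really be an atomic.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct hh_cache {
	pthread_mutex_t lock;		/* stands in for write_lock_bh(&n->lock) */
	unsigned int hh_len;		/* 0 means "not built yet" */
	unsigned char hh_data[16];	/* cached link-layer header */
};

static void hh_init_once(struct hh_cache *hh,
			 const unsigned char *hdr, unsigned int len)
{
	if (hh->hh_len)			/* unlocked fast path, as in the patch */
		return;

	pthread_mutex_lock(&hh->lock);
	/* Re-check under the lock: another thread may have built it already. */
	if (!hh->hh_len) {
		memcpy(hh->hh_data, hdr, len);
		hh->hh_len = len;	/* publish the "initialized" flag last */
	}
	pthread_mutex_unlock(&hh->lock);
}

int main(void)
{
	static struct hh_cache hh = { .lock = PTHREAD_MUTEX_INITIALIZER };
	const unsigned char eth_hdr[14] = { 0xff, 0xff };

	hh_init_once(&hh, eth_hdr, sizeof(eth_hdr));
	hh_init_once(&hh, eth_hdr, sizeof(eth_hdr));	/* no-op the second time */
	printf("hh_len=%u\n", hh.hh_len);
	return 0;
}
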
*/ -int neigh_resolve_output(struct sk_buff *skb) +int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)  {  	struct dst_entry *dst = skb_dst(skb); -	struct neighbour *neigh;  	int rc = 0; -	if (!dst || !(neigh = dst->neighbour)) +	if (!dst)  		goto discard; -	__skb_pull(skb, skb_network_offset(skb)); -  	if (!neigh_event_send(neigh, skb)) {  		int err;  		struct net_device *dev = neigh->dev;  		unsigned int seq; -		if (dev->header_ops->cache && -		    !dst->hh && -		    !(dst->flags & DST_NOCACHE)) -			neigh_hh_init(neigh, dst, dst->ops->protocol); +		if (dev->header_ops->cache && !neigh->hh.hh_len) +			neigh_hh_init(neigh, dst);  		do { +			__skb_pull(skb, skb_network_offset(skb));  			seq = read_seqbegin(&neigh->ha_lock);  			err = dev_hard_header(skb, dev, ntohs(skb->protocol),  					      neigh->ha, NULL, skb->len);  		} while (read_seqretry(&neigh->ha_lock, seq));  		if (err >= 0) -			rc = neigh->ops->queue_xmit(skb); +			rc = dev_queue_xmit(skb);  		else  			goto out_kfree_skb;  	}  out:  	return rc;  discard: -	NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", -		      dst, dst ? dst->neighbour : NULL); +	neigh_dbg(1, "%s: dst=%p neigh=%p\n", __func__, dst, neigh);  out_kfree_skb:  	rc = -EINVAL;  	kfree_skb(skb); @@ -1344,24 +1349,21 @@ EXPORT_SYMBOL(neigh_resolve_output);  /* As fast as possible without hh cache */ -int neigh_connected_output(struct sk_buff *skb) +int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb)  { -	int err; -	struct dst_entry *dst = skb_dst(skb); -	struct neighbour *neigh = dst->neighbour;  	struct net_device *dev = neigh->dev;  	unsigned int seq; - -	__skb_pull(skb, skb_network_offset(skb)); +	int err;  	do { +		__skb_pull(skb, skb_network_offset(skb));  		seq = read_seqbegin(&neigh->ha_lock);  		err = dev_hard_header(skb, dev, ntohs(skb->protocol),  				      neigh->ha, NULL, skb->len);  	} while (read_seqretry(&neigh->ha_lock, seq));  	if (err >= 0) -		err = neigh->ops->queue_xmit(skb); +		err = dev_queue_xmit(skb);  	else {  		err = -EINVAL;  		kfree_skb(skb); @@ -1370,6 +1372,12 @@ int neigh_connected_output(struct sk_buff *skb)  }  EXPORT_SYMBOL(neigh_connected_output); +int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb) +{ +	return dev_queue_xmit(skb); +} +EXPORT_SYMBOL(neigh_direct_output); +  static void neigh_proxy_process(unsigned long arg)  {  	struct neigh_table *tbl = (struct neigh_table *)arg; @@ -1384,11 +1392,15 @@ static void neigh_proxy_process(unsigned long arg)  		if (tdif <= 0) {  			struct net_device *dev = skb->dev; +  			__skb_unlink(skb, &tbl->proxy_queue); -			if (tbl->proxy_redo && netif_running(dev)) +			if (tbl->proxy_redo && netif_running(dev)) { +				rcu_read_lock();  				tbl->proxy_redo(skb); -			else +				rcu_read_unlock(); +			} else {  				kfree_skb(skb); +			}  			dev_put(dev);  		} else if (!sched_next || tdif < sched_next) @@ -1404,9 +1416,11 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,  		    struct sk_buff *skb)  {  	unsigned long now = jiffies; -	unsigned long sched_next = now + (net_random() % p->proxy_delay); -	if (tbl->proxy_queue.qlen > p->proxy_qlen) { +	unsigned long sched_next = now + (prandom_u32() % +					  NEIGH_VAR(p, PROXY_DELAY)); + +	if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) {  		kfree_skb(skb);  		return;  	} @@ -1434,7 +1448,7 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl,  	for (p = &tbl->parms; p; p = p->next) {  		if ((p->dev && p->dev->ifindex == ifindex && 
net_eq(neigh_parms_net(p), net)) || -		    (!p->dev && !ifindex)) +		    (!p->dev && !ifindex && net_eq(net, &init_net)))  			return p;  	} @@ -1444,34 +1458,34 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl,  struct neigh_parms *neigh_parms_alloc(struct net_device *dev,  				      struct neigh_table *tbl)  { -	struct neigh_parms *p, *ref; +	struct neigh_parms *p;  	struct net *net = dev_net(dev);  	const struct net_device_ops *ops = dev->netdev_ops; -	ref = lookup_neigh_parms(tbl, net, 0); -	if (!ref) -		return NULL; - -	p = kmemdup(ref, sizeof(*p), GFP_KERNEL); +	p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL);  	if (p) {  		p->tbl		  = tbl;  		atomic_set(&p->refcnt, 1);  		p->reachable_time = -				neigh_rand_reach_time(p->base_reachable_time); +				neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); +		dev_hold(dev); +		p->dev = dev; +		write_pnet(&p->net, hold_net(net)); +		p->sysctl_table = NULL;  		if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { +			release_net(net); +			dev_put(dev);  			kfree(p);  			return NULL;  		} -		dev_hold(dev); -		p->dev = dev; -		write_pnet(&p->net, hold_net(net)); -		p->sysctl_table = NULL;  		write_lock_bh(&tbl->lock);  		p->next		= tbl->parms.next;  		tbl->parms.next = p;  		write_unlock_bh(&tbl->lock); + +		neigh_parms_data_state_cleanall(p);  	}  	return p;  } @@ -1504,7 +1518,7 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)  		}  	}  	write_unlock_bh(&tbl->lock); -	NEIGH_PRINTK1("neigh_parms_release: not found\n"); +	neigh_dbg(1, "%s: not found\n", __func__);  }  EXPORT_SYMBOL(neigh_parms_release); @@ -1516,7 +1530,7 @@ static void neigh_parms_destroy(struct neigh_parms *parms)  static struct lock_class_key neigh_table_proxy_queue_class; -void neigh_table_init_no_netlink(struct neigh_table *tbl) +static void neigh_table_init_no_netlink(struct neigh_table *tbl)  {  	unsigned long now = jiffies;  	unsigned long phsize; @@ -1524,13 +1538,8 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)  	write_pnet(&tbl->parms.net, &init_net);  	atomic_set(&tbl->parms.refcnt, 1);  	tbl->parms.reachable_time = -			  neigh_rand_reach_time(tbl->parms.base_reachable_time); +			  neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME)); -	if (!tbl->kmem_cachep) -		tbl->kmem_cachep = -			kmem_cache_create(tbl->id, tbl->entry_size, 0, -					  SLAB_HWCACHE_ALIGN|SLAB_PANIC, -					  NULL);  	tbl->stats = alloc_percpu(struct neigh_statistics);  	if (!tbl->stats)  		panic("cannot create neighbour cache statistics"); @@ -1541,7 +1550,7 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)  		panic("cannot create neighbour proc dir entry");  #endif -	tbl->nht = neigh_hash_alloc(8); +	RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3));  	phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);  	tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); @@ -1549,9 +1558,16 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)  	if (!tbl->nht || !tbl->phash_buckets)  		panic("cannot allocate neighbour cache hashes"); +	if (!tbl->entry_size) +		tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) + +					tbl->key_len, NEIGH_PRIV_ALIGN); +	else +		WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); +  	rwlock_init(&tbl->lock); -	INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work); -	schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time); +	INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); +	queue_delayed_work(system_power_efficient_wq, 
&tbl->gc_work, +			tbl->parms.reachable_time);  	setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl);  	skb_queue_head_init_class(&tbl->proxy_queue,  			&neigh_table_proxy_queue_class); @@ -1559,7 +1575,6 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)  	tbl->last_flush = now;  	tbl->last_rand	= now + tbl->parms.reachable_time * 20;  } -EXPORT_SYMBOL(neigh_table_init_no_netlink);  void neigh_table_init(struct neigh_table *tbl)  { @@ -1576,8 +1591,8 @@ void neigh_table_init(struct neigh_table *tbl)  	write_unlock(&neigh_tbl_lock);  	if (unlikely(tmp)) { -		printk(KERN_ERR "NEIGH: Registering multiple tables for " -		       "family %d\n", tbl->family); +		pr_err("Registering multiple tables for family %d\n", +		       tbl->family);  		dump_stack();  	}  } @@ -1593,7 +1608,7 @@ int neigh_table_clear(struct neigh_table *tbl)  	pneigh_queue_purge(&tbl->proxy_queue);  	neigh_ifdown(tbl, NULL);  	if (atomic_read(&tbl->entries)) -		printk(KERN_CRIT "neighbour leakage\n"); +		pr_crit("neighbour leakage\n");  	write_lock(&neigh_tbl_lock);  	for (tp = &neigh_tables; *tp; tp = &(*tp)->next) {  		if (*tp == tbl) { @@ -1603,7 +1618,8 @@ int neigh_table_clear(struct neigh_table *tbl)  	}  	write_unlock(&neigh_tbl_lock); -	call_rcu(&tbl->nht->rcu, neigh_hash_free_rcu); +	call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, +		 neigh_hash_free_rcu);  	tbl->nht = NULL;  	kfree(tbl->phash_buckets); @@ -1614,14 +1630,11 @@ int neigh_table_clear(struct neigh_table *tbl)  	free_percpu(tbl->stats);  	tbl->stats = NULL; -	kmem_cache_destroy(tbl->kmem_cachep); -	tbl->kmem_cachep = NULL; -  	return 0;  }  EXPORT_SYMBOL(neigh_table_clear); -static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh)  {  	struct net *net = sock_net(skb->sk);  	struct ndmsg *ndm; @@ -1685,7 +1698,7 @@ out:  	return err;  } -static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh)  {  	struct net *net = sock_net(skb->sk);  	struct ndmsg *ndm; @@ -1791,25 +1804,36 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)  	if (nest == NULL)  		return -ENOBUFS; -	if (parms->dev) -		NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex); - -	NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)); -	NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len); -	NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen); -	NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes); -	NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes); -	NLA_PUT_U32(skb, NDTPA_MCAST_PROBES, parms->mcast_probes); -	NLA_PUT_MSECS(skb, NDTPA_REACHABLE_TIME, parms->reachable_time); -	NLA_PUT_MSECS(skb, NDTPA_BASE_REACHABLE_TIME, -		      parms->base_reachable_time); -	NLA_PUT_MSECS(skb, NDTPA_GC_STALETIME, parms->gc_staletime); -	NLA_PUT_MSECS(skb, NDTPA_DELAY_PROBE_TIME, parms->delay_probe_time); -	NLA_PUT_MSECS(skb, NDTPA_RETRANS_TIME, parms->retrans_time); -	NLA_PUT_MSECS(skb, NDTPA_ANYCAST_DELAY, parms->anycast_delay); -	NLA_PUT_MSECS(skb, NDTPA_PROXY_DELAY, parms->proxy_delay); -	NLA_PUT_MSECS(skb, NDTPA_LOCKTIME, parms->locktime); - +	if ((parms->dev && +	     nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) || +	    nla_put_u32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)) || +	    nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, +			NEIGH_VAR(parms, QUEUE_LEN_BYTES)) || +	    /* approximative value for deprecated QUEUE_LEN (in packets) */ +	 
   nla_put_u32(skb, NDTPA_QUEUE_LEN, +			NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) || +	    nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) || +	    nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) || +	    nla_put_u32(skb, NDTPA_UCAST_PROBES, +			NEIGH_VAR(parms, UCAST_PROBES)) || +	    nla_put_u32(skb, NDTPA_MCAST_PROBES, +			NEIGH_VAR(parms, MCAST_PROBES)) || +	    nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time) || +	    nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, +			  NEIGH_VAR(parms, BASE_REACHABLE_TIME)) || +	    nla_put_msecs(skb, NDTPA_GC_STALETIME, +			  NEIGH_VAR(parms, GC_STALETIME)) || +	    nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME, +			  NEIGH_VAR(parms, DELAY_PROBE_TIME)) || +	    nla_put_msecs(skb, NDTPA_RETRANS_TIME, +			  NEIGH_VAR(parms, RETRANS_TIME)) || +	    nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, +			  NEIGH_VAR(parms, ANYCAST_DELAY)) || +	    nla_put_msecs(skb, NDTPA_PROXY_DELAY, +			  NEIGH_VAR(parms, PROXY_DELAY)) || +	    nla_put_msecs(skb, NDTPA_LOCKTIME, +			  NEIGH_VAR(parms, LOCKTIME))) +		goto nla_put_failure;  	return nla_nest_end(skb, nest);  nla_put_failure: @@ -1834,12 +1858,12 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,  	ndtmsg->ndtm_pad1   = 0;  	ndtmsg->ndtm_pad2   = 0; -	NLA_PUT_STRING(skb, NDTA_NAME, tbl->id); -	NLA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval); -	NLA_PUT_U32(skb, NDTA_THRESH1, tbl->gc_thresh1); -	NLA_PUT_U32(skb, NDTA_THRESH2, tbl->gc_thresh2); -	NLA_PUT_U32(skb, NDTA_THRESH3, tbl->gc_thresh3); - +	if (nla_put_string(skb, NDTA_NAME, tbl->id) || +	    nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval) || +	    nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) || +	    nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) || +	    nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3)) +		goto nla_put_failure;  	{  		unsigned long now = jiffies;  		unsigned int flush_delta = now - tbl->last_flush; @@ -1856,11 +1880,12 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,  		rcu_read_lock_bh();  		nht = rcu_dereference_bh(tbl->nht); -		ndc.ndtc_hash_rnd = nht->hash_rnd; -		ndc.ndtc_hash_mask = nht->hash_mask; +		ndc.ndtc_hash_rnd = nht->hash_rnd[0]; +		ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1);  		rcu_read_unlock_bh(); -		NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc); +		if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc)) +			goto nla_put_failure;  	}  	{ @@ -1885,7 +1910,8 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,  			ndst.ndts_forced_gc_runs	+= st->forced_gc_runs;  		} -		NLA_PUT(skb, NDTA_STATS, sizeof(ndst), &ndst); +		if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst)) +			goto nla_put_failure;  	}  	BUG_ON(tbl->parms.dev); @@ -1958,7 +1984,7 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {  	[NDTPA_LOCKTIME]		= { .type = NLA_U64 },  }; -static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)  {  	struct net *net = sock_net(skb->sk);  	struct neigh_table *tbl; @@ -2022,45 +2048,68 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)  			switch (i) {  			case NDTPA_QUEUE_LEN: -				p->queue_len = nla_get_u32(tbp[i]); +				NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, +					      nla_get_u32(tbp[i]) * +					      SKB_TRUESIZE(ETH_FRAME_LEN)); +				break; +			case NDTPA_QUEUE_LENBYTES: +				NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, +					      
nla_get_u32(tbp[i]));  				break;  			case NDTPA_PROXY_QLEN: -				p->proxy_qlen = nla_get_u32(tbp[i]); +				NEIGH_VAR_SET(p, PROXY_QLEN, +					      nla_get_u32(tbp[i]));  				break;  			case NDTPA_APP_PROBES: -				p->app_probes = nla_get_u32(tbp[i]); +				NEIGH_VAR_SET(p, APP_PROBES, +					      nla_get_u32(tbp[i]));  				break;  			case NDTPA_UCAST_PROBES: -				p->ucast_probes = nla_get_u32(tbp[i]); +				NEIGH_VAR_SET(p, UCAST_PROBES, +					      nla_get_u32(tbp[i]));  				break;  			case NDTPA_MCAST_PROBES: -				p->mcast_probes = nla_get_u32(tbp[i]); +				NEIGH_VAR_SET(p, MCAST_PROBES, +					      nla_get_u32(tbp[i]));  				break;  			case NDTPA_BASE_REACHABLE_TIME: -				p->base_reachable_time = nla_get_msecs(tbp[i]); +				NEIGH_VAR_SET(p, BASE_REACHABLE_TIME, +					      nla_get_msecs(tbp[i]));  				break;  			case NDTPA_GC_STALETIME: -				p->gc_staletime = nla_get_msecs(tbp[i]); +				NEIGH_VAR_SET(p, GC_STALETIME, +					      nla_get_msecs(tbp[i]));  				break;  			case NDTPA_DELAY_PROBE_TIME: -				p->delay_probe_time = nla_get_msecs(tbp[i]); +				NEIGH_VAR_SET(p, DELAY_PROBE_TIME, +					      nla_get_msecs(tbp[i]));  				break;  			case NDTPA_RETRANS_TIME: -				p->retrans_time = nla_get_msecs(tbp[i]); +				NEIGH_VAR_SET(p, RETRANS_TIME, +					      nla_get_msecs(tbp[i]));  				break;  			case NDTPA_ANYCAST_DELAY: -				p->anycast_delay = nla_get_msecs(tbp[i]); +				NEIGH_VAR_SET(p, ANYCAST_DELAY, +					      nla_get_msecs(tbp[i]));  				break;  			case NDTPA_PROXY_DELAY: -				p->proxy_delay = nla_get_msecs(tbp[i]); +				NEIGH_VAR_SET(p, PROXY_DELAY, +					      nla_get_msecs(tbp[i]));  				break;  			case NDTPA_LOCKTIME: -				p->locktime = nla_get_msecs(tbp[i]); +				NEIGH_VAR_SET(p, LOCKTIME, +					      nla_get_msecs(tbp[i]));  				break;  			}  		}  	} +	err = -ENOENT; +	if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] || +	     tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) && +	    !net_eq(net, &init_net)) +		goto errout_tbl_lock; +  	if (tb[NDTA_THRESH1])  		tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]); @@ -2100,7 +2149,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)  		if (tidx < tbl_skip || (family && tbl->family != family))  			continue; -		if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).pid, +		if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid,  				       cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL,  				       NLM_F_MULTI) <= 0)  			break; @@ -2113,7 +2162,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)  				goto next;  			if (neightbl_fill_param_info(skb, tbl, p, -						     NETLINK_CB(cb->skb).pid, +						     NETLINK_CB(cb->skb).portid,  						     cb->nlh->nlmsg_seq,  						     RTM_NEWNEIGHTBL,  						     NLM_F_MULTI) <= 0) @@ -2152,7 +2201,8 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,  	ndm->ndm_type	 = neigh->type;  	ndm->ndm_ifindex = neigh->dev->ifindex; -	NLA_PUT(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key); +	if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key)) +		goto nla_put_failure;  	read_lock_bh(&neigh->lock);  	ndm->ndm_state	 = neigh->nud_state; @@ -2172,8 +2222,39 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,  	ci.ndm_refcnt	 = atomic_read(&neigh->refcnt) - 1;  	read_unlock_bh(&neigh->lock); -	NLA_PUT_U32(skb, NDA_PROBES, atomic_read(&neigh->probes)); -	NLA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci); +	if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) || +	    nla_put(skb, NDA_CACHEINFO, 
sizeof(ci), &ci)) +		goto nla_put_failure; + +	return nlmsg_end(skb, nlh); + +nla_put_failure: +	nlmsg_cancel(skb, nlh); +	return -EMSGSIZE; +} + +static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, +			    u32 pid, u32 seq, int type, unsigned int flags, +			    struct neigh_table *tbl) +{ +	struct nlmsghdr *nlh; +	struct ndmsg *ndm; + +	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); +	if (nlh == NULL) +		return -EMSGSIZE; + +	ndm = nlmsg_data(nlh); +	ndm->ndm_family	 = tbl->family; +	ndm->ndm_pad1    = 0; +	ndm->ndm_pad2    = 0; +	ndm->ndm_flags	 = pn->flags | NTF_PROXY; +	ndm->ndm_type	 = RTN_UNICAST; +	ndm->ndm_ifindex = pn->dev->ifindex; +	ndm->ndm_state	 = NUD_NONE; + +	if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) +		goto nla_put_failure;  	return nlmsg_end(skb, nlh); @@ -2200,9 +2281,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,  	rcu_read_lock_bh();  	nht = rcu_dereference_bh(tbl->nht); -	for (h = 0; h <= nht->hash_mask; h++) { -		if (h < s_h) -			continue; +	for (h = s_h; h < (1 << nht->hash_shift); h++) {  		if (h > s_h)  			s_idx = 0;  		for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0; @@ -2212,7 +2291,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,  				continue;  			if (idx < s_idx)  				goto next; -			if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid, +			if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,  					    cb->nlh->nlmsg_seq,  					    RTM_NEWNEIGH,  					    NLM_F_MULTI) <= 0) { @@ -2231,22 +2310,77 @@ out:  	return rc;  } +static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, +			     struct netlink_callback *cb) +{ +	struct pneigh_entry *n; +	struct net *net = sock_net(skb->sk); +	int rc, h, s_h = cb->args[3]; +	int idx, s_idx = idx = cb->args[4]; + +	read_lock_bh(&tbl->lock); + +	for (h = s_h; h <= PNEIGH_HASHMASK; h++) { +		if (h > s_h) +			s_idx = 0; +		for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { +			if (dev_net(n->dev) != net) +				continue; +			if (idx < s_idx) +				goto next; +			if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, +					    cb->nlh->nlmsg_seq, +					    RTM_NEWNEIGH, +					    NLM_F_MULTI, tbl) <= 0) { +				read_unlock_bh(&tbl->lock); +				rc = -1; +				goto out; +			} +		next: +			idx++; +		} +	} + +	read_unlock_bh(&tbl->lock); +	rc = skb->len; +out: +	cb->args[3] = h; +	cb->args[4] = idx; +	return rc; + +} +  static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)  {  	struct neigh_table *tbl;  	int t, family, s_t; +	int proxy = 0; +	int err;  	read_lock(&neigh_tbl_lock);  	family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; + +	/* check for full ndmsg structure presence, family member is +	 * the same for both structures +	 */ +	if (nlmsg_len(cb->nlh) >= sizeof(struct ndmsg) && +	    ((struct ndmsg *) nlmsg_data(cb->nlh))->ndm_flags == NTF_PROXY) +		proxy = 1; +  	s_t = cb->args[0]; -	for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) { +	for (tbl = neigh_tables, t = 0; tbl; +	     tbl = tbl->next, t++) {  		if (t < s_t || (family && tbl->family != family))  			continue;  		if (t > s_t)  			memset(&cb->args[1], 0, sizeof(cb->args) -  						sizeof(cb->args[0])); -		if (neigh_dump_table(tbl, skb, cb) < 0) +		if (proxy) +			err = pneigh_dump_table(tbl, skb, cb); +		else +			err = neigh_dump_table(tbl, skb, cb); +		if (err < 0)  			break;  	}  	read_unlock(&neigh_tbl_lock); @@ -2264,7 +2398,7 @@ void neigh_for_each(struct neigh_table *tbl, 
void (*cb)(struct neighbour *, void  	nht = rcu_dereference_bh(tbl->nht);  	read_lock(&tbl->lock); /* avoid resizes */ -	for (chain = 0; chain <= nht->hash_mask; chain++) { +	for (chain = 0; chain < (1 << nht->hash_shift); chain++) {  		struct neighbour *n;  		for (n = rcu_dereference_bh(nht->hash_buckets[chain]); @@ -2286,7 +2420,7 @@ void __neigh_for_each_release(struct neigh_table *tbl,  	nht = rcu_dereference_protected(tbl->nht,  					lockdep_is_held(&tbl->lock)); -	for (chain = 0; chain <= nht->hash_mask; chain++) { +	for (chain = 0; chain < (1 << nht->hash_shift); chain++) {  		struct neighbour *n;  		struct neighbour __rcu **np; @@ -2323,7 +2457,7 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)  	int bucket = state->bucket;  	state->flags &= ~NEIGH_SEQ_IS_PNEIGH; -	for (bucket = 0; bucket <= nht->hash_mask; bucket++) { +	for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) {  		n = rcu_dereference_bh(nht->hash_buckets[bucket]);  		while (n) { @@ -2390,7 +2524,7 @@ next:  		if (n)  			break; -		if (++state->bucket > nht->hash_mask) +		if (++state->bucket >= (1 << nht->hash_shift))  			break;  		n = rcu_dereference_bh(nht->hash_buckets[state->bucket]); @@ -2445,7 +2579,10 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,  	struct net *net = seq_file_net(seq);  	struct neigh_table *tbl = state->tbl; -	pn = pn->next; +	do { +		pn = pn->next; +	} while (pn && !net_eq(pneigh_net(pn), net)); +  	while (!pn) {  		if (++state->bucket > PNEIGH_HASHMASK)  			break; @@ -2625,7 +2762,7 @@ static int neigh_stat_seq_open(struct inode *inode, struct file *file)  	if (!ret) {  		struct seq_file *sf = file->private_data; -		sf->private = PDE(inode)->data; +		sf->private = PDE_DATA(inode);  	}  	return ret;  }; @@ -2673,219 +2810,299 @@ errout:  		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);  } -#ifdef CONFIG_ARPD  void neigh_app_ns(struct neighbour *n)  {  	__neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST);  }  EXPORT_SYMBOL(neigh_app_ns); -#endif /* CONFIG_ARPD */  #ifdef CONFIG_SYSCTL +static int zero; +static int int_max = INT_MAX; +static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); + +static int proc_unres_qlen(struct ctl_table *ctl, int write, +			   void __user *buffer, size_t *lenp, loff_t *ppos) +{ +	int size, ret; +	struct ctl_table tmp = *ctl; -#define NEIGH_VARS_MAX 19 +	tmp.extra1 = &zero; +	tmp.extra2 = &unres_qlen_max; +	tmp.data = &size; + +	size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN); +	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + +	if (write && !ret) +		*(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN); +	return ret; +} + +static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, +						   int family) +{ +	switch (family) { +	case AF_INET: +		return __in_dev_arp_parms_get_rcu(dev); +	case AF_INET6: +		return __in6_dev_nd_parms_get_rcu(dev); +	} +	return NULL; +} + +static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, +				  int index) +{ +	struct net_device *dev; +	int family = neigh_parms_family(p); + +	rcu_read_lock(); +	for_each_netdev_rcu(net, dev) { +		struct neigh_parms *dst_p = +				neigh_get_dev_parms_rcu(dev, family); + +		if (dst_p && !test_bit(index, dst_p->data_state)) +			dst_p->data[index] = p->data[index]; +	} +	rcu_read_unlock(); +} + +static void neigh_proc_update(struct ctl_table *ctl, int write) +{ +	struct net_device *dev = ctl->extra1; +	struct neigh_parms *p = ctl->extra2; +	struct net *net = neigh_parms_net(p); +	int index = (int *) 
ctl->data - p->data; + +	if (!write) +		return; + +	set_bit(index, p->data_state); +	if (!dev) /* NULL dev means this is default value */ +		neigh_copy_dflt_parms(net, p, index); +} + +static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, +					   void __user *buffer, +					   size_t *lenp, loff_t *ppos) +{ +	struct ctl_table tmp = *ctl; +	int ret; + +	tmp.extra1 = &zero; +	tmp.extra2 = &int_max; + +	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); +	neigh_proc_update(ctl, write); +	return ret; +} + +int neigh_proc_dointvec(struct ctl_table *ctl, int write, +			void __user *buffer, size_t *lenp, loff_t *ppos) +{ +	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + +	neigh_proc_update(ctl, write); +	return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec); + +int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, +				void __user *buffer, +				size_t *lenp, loff_t *ppos) +{ +	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); + +	neigh_proc_update(ctl, write); +	return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); + +static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, +					      void __user *buffer, +					      size_t *lenp, loff_t *ppos) +{ +	int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); + +	neigh_proc_update(ctl, write); +	return ret; +} + +int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, +				   void __user *buffer, +				   size_t *lenp, loff_t *ppos) +{ +	int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); + +	neigh_proc_update(ctl, write); +	return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); + +static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, +					  void __user *buffer, +					  size_t *lenp, loff_t *ppos) +{ +	int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); + +	neigh_proc_update(ctl, write); +	return ret; +} + +#define NEIGH_PARMS_DATA_OFFSET(index)	\ +	(&((struct neigh_parms *) 0)->data[index]) + +#define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \ +	[NEIGH_VAR_ ## attr] = { \ +		.procname	= name, \ +		.data		= NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \ +		.maxlen		= sizeof(int), \ +		.mode		= mval, \ +		.proc_handler	= proc, \ +	} + +#define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \ +	NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax) + +#define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \ +	NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies) + +#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \ +	NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies) + +#define NEIGH_SYSCTL_MS_JIFFIES_ENTRY(attr, name) \ +	NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies) + +#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \ +	NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies) + +#define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \ +	NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen)  static struct neigh_sysctl_table {  	struct ctl_table_header *sysctl_header; -	struct ctl_table neigh_vars[NEIGH_VARS_MAX]; -	char *dev_name; +	struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];  } neigh_sysctl_template __read_mostly = {  	.neigh_vars = { -		{ -			.procname	= "mcast_solicit", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec, -		}, -		{ -			.procname	= 
"ucast_solicit", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec, -		}, -		{ -			.procname	= "app_solicit", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec, -		}, -		{ -			.procname	= "retrans_time", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_userhz_jiffies, -		}, -		{ -			.procname	= "base_reachable_time", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_jiffies, -		}, -		{ -			.procname	= "delay_first_probe_time", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_jiffies, -		}, -		{ -			.procname	= "gc_stale_time", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_jiffies, -		}, -		{ -			.procname	= "unres_qlen", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec, -		}, -		{ -			.procname	= "proxy_qlen", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec, -		}, -		{ -			.procname	= "anycast_delay", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_userhz_jiffies, -		}, -		{ -			.procname	= "proxy_delay", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_userhz_jiffies, -		}, -		{ -			.procname	= "locktime", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_userhz_jiffies, -		}, -		{ -			.procname	= "retrans_time_ms", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_ms_jiffies, -		}, -		{ -			.procname	= "base_reachable_time_ms", -			.maxlen		= sizeof(int), -			.mode		= 0644, -			.proc_handler	= proc_dointvec_ms_jiffies, -		}, -		{ +		NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), +		NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"), +		NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"), +		NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), +		NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), +		NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), +		NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"), +		NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"), +		NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"), +		NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"), +		NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"), +		NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"), +		NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"), +		NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"), +		NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"), +		[NEIGH_VAR_GC_INTERVAL] = {  			.procname	= "gc_interval",  			.maxlen		= sizeof(int),  			.mode		= 0644,  			.proc_handler	= proc_dointvec_jiffies,  		}, -		{ +		[NEIGH_VAR_GC_THRESH1] = {  			.procname	= "gc_thresh1",  			.maxlen		= sizeof(int),  			.mode		= 0644, -			.proc_handler	= proc_dointvec, +			.extra1 	= &zero, +			.extra2		= &int_max, +			.proc_handler	= proc_dointvec_minmax,  		}, -		{ +		[NEIGH_VAR_GC_THRESH2] = {  			.procname	= "gc_thresh2",  			.maxlen		= sizeof(int),  			.mode		= 0644, -			.proc_handler	= proc_dointvec, +			.extra1 	= &zero, +			.extra2		= &int_max, +			.proc_handler	= proc_dointvec_minmax,  		}, -		{ +		[NEIGH_VAR_GC_THRESH3] = {  			.procname	= "gc_thresh3",  			.maxlen		= 
sizeof(int),  			.mode		= 0644, -			.proc_handler	= proc_dointvec, +			.extra1 	= &zero, +			.extra2		= &int_max, +			.proc_handler	= proc_dointvec_minmax,  		},  		{},  	},  };  int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, -			  char *p_name, proc_handler *handler) +			  proc_handler *handler)  { +	int i;  	struct neigh_sysctl_table *t; -	const char *dev_name_source = NULL; - -#define NEIGH_CTL_PATH_ROOT	0 -#define NEIGH_CTL_PATH_PROTO	1 -#define NEIGH_CTL_PATH_NEIGH	2 -#define NEIGH_CTL_PATH_DEV	3 - -	struct ctl_path neigh_path[] = { -		{ .procname = "net",	 }, -		{ .procname = "proto",	 }, -		{ .procname = "neigh",	 }, -		{ .procname = "default", }, -		{ }, -	}; +	const char *dev_name_source; +	char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; +	char *p_name;  	t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL);  	if (!t)  		goto err; -	t->neigh_vars[0].data  = &p->mcast_probes; -	t->neigh_vars[1].data  = &p->ucast_probes; -	t->neigh_vars[2].data  = &p->app_probes; -	t->neigh_vars[3].data  = &p->retrans_time; -	t->neigh_vars[4].data  = &p->base_reachable_time; -	t->neigh_vars[5].data  = &p->delay_probe_time; -	t->neigh_vars[6].data  = &p->gc_staletime; -	t->neigh_vars[7].data  = &p->queue_len; -	t->neigh_vars[8].data  = &p->proxy_qlen; -	t->neigh_vars[9].data  = &p->anycast_delay; -	t->neigh_vars[10].data = &p->proxy_delay; -	t->neigh_vars[11].data = &p->locktime; -	t->neigh_vars[12].data  = &p->retrans_time; -	t->neigh_vars[13].data  = &p->base_reachable_time; +	for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) { +		t->neigh_vars[i].data += (long) p; +		t->neigh_vars[i].extra1 = dev; +		t->neigh_vars[i].extra2 = p; +	}  	if (dev) {  		dev_name_source = dev->name;  		/* Terminate the table early */ -		memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14])); +		memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, +		       sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));  	} else { -		dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname; -		t->neigh_vars[14].data = (int *)(p + 1); -		t->neigh_vars[15].data = (int *)(p + 1) + 1; -		t->neigh_vars[16].data = (int *)(p + 1) + 2; -		t->neigh_vars[17].data = (int *)(p + 1) + 3; +		struct neigh_table *tbl = p->tbl; +		dev_name_source = "default"; +		t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval; +		t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1; +		t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2; +		t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3;  	} -  	if (handler) {  		/* RetransTime */ -		t->neigh_vars[3].proc_handler = handler; -		t->neigh_vars[3].extra1 = dev; +		t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler;  		/* ReachableTime */ -		t->neigh_vars[4].proc_handler = handler; -		t->neigh_vars[4].extra1 = dev; +		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler;  		/* RetransTime (in milliseconds)*/ -		t->neigh_vars[12].proc_handler = handler; -		t->neigh_vars[12].extra1 = dev; +		t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler;  		/* ReachableTime (in milliseconds) */ -		t->neigh_vars[13].proc_handler = handler; -		t->neigh_vars[13].extra1 = dev; +		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler;  	} -	t->dev_name = kstrdup(dev_name_source, GFP_KERNEL); -	if (!t->dev_name) -		goto free; +	/* Don't export sysctls to unprivileged users */ +	if (neigh_parms_net(p)->user_ns != &init_user_ns) +		t->neigh_vars[0].procname = NULL; -	neigh_path[NEIGH_CTL_PATH_DEV].procname = 
t->dev_name; -	neigh_path[NEIGH_CTL_PATH_PROTO].procname = p_name; +	switch (neigh_parms_family(p)) { +	case AF_INET: +	      p_name = "ipv4"; +	      break; +	case AF_INET6: +	      p_name = "ipv6"; +	      break; +	default: +	      BUG(); +	} +	snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s", +		p_name, dev_name_source);  	t->sysctl_header = -		register_net_sysctl_table(neigh_parms_net(p), neigh_path, t->neigh_vars); +		register_net_sysctl(neigh_parms_net(p), neigh_path, t->neigh_vars);  	if (!t->sysctl_header) -		goto free_procname; +		goto free;  	p->sysctl_table = t;  	return 0; -free_procname: -	kfree(t->dev_name);  free:  	kfree(t);  err: @@ -2898,8 +3115,7 @@ void neigh_sysctl_unregister(struct neigh_parms *p)  	if (p->sysctl_table) {  		struct neigh_sysctl_table *t = p->sysctl_table;  		p->sysctl_table = NULL; -		unregister_sysctl_table(t->sysctl_header); -		kfree(t->dev_name); +		unregister_net_sysctl_table(t->sysctl_header);  		kfree(t);  	}  } @@ -2909,12 +3125,13 @@ EXPORT_SYMBOL(neigh_sysctl_unregister);  static int __init neigh_init(void)  { -	rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL); -	rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL); -	rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info); +	rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, NULL); +	rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, NULL); +	rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, NULL); -	rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info); -	rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL); +	rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, +		      NULL); +	rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, NULL);  	return 0;  } diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c new file mode 100644 index 00000000000..2bf83299600 --- /dev/null +++ b/net/core/net-procfs.c @@ -0,0 +1,423 @@ +#include <linux/netdevice.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <net/wext.h> + +#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) + +#define get_bucket(x) ((x) >> BUCKET_SPACE) +#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) +#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) + +extern struct list_head ptype_all __read_mostly; +extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; + +static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) +{ +	struct net *net = seq_file_net(seq); +	struct net_device *dev; +	struct hlist_head *h; +	unsigned int count = 0, offset = get_offset(*pos); + +	h = &net->dev_name_head[get_bucket(*pos)]; +	hlist_for_each_entry_rcu(dev, h, name_hlist) { +		if (++count == offset) +			return dev; +	} + +	return NULL; +} + +static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos) +{ +	struct net_device *dev; +	unsigned int bucket; + +	do { +		dev = dev_from_same_bucket(seq, pos); +		if (dev) +			return dev; + +		bucket = get_bucket(*pos) + 1; +		*pos = set_bucket_offset(bucket, 1); +	} while (bucket < NETDEV_HASHENTRIES); + +	return NULL; +} + +/* + *	This is invoked by the /proc filesystem handler to display a device + *	in detail. 
+ */ +static void *dev_seq_start(struct seq_file *seq, loff_t *pos) +	__acquires(RCU) +{ +	rcu_read_lock(); +	if (!*pos) +		return SEQ_START_TOKEN; + +	if (get_bucket(*pos) >= NETDEV_HASHENTRIES) +		return NULL; + +	return dev_from_bucket(seq, pos); +} + +static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ +	++*pos; +	return dev_from_bucket(seq, pos); +} + +static void dev_seq_stop(struct seq_file *seq, void *v) +	__releases(RCU) +{ +	rcu_read_unlock(); +} + +static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) +{ +	struct rtnl_link_stats64 temp; +	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); + +	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " +		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", +		   dev->name, stats->rx_bytes, stats->rx_packets, +		   stats->rx_errors, +		   stats->rx_dropped + stats->rx_missed_errors, +		   stats->rx_fifo_errors, +		   stats->rx_length_errors + stats->rx_over_errors + +		    stats->rx_crc_errors + stats->rx_frame_errors, +		   stats->rx_compressed, stats->multicast, +		   stats->tx_bytes, stats->tx_packets, +		   stats->tx_errors, stats->tx_dropped, +		   stats->tx_fifo_errors, stats->collisions, +		   stats->tx_carrier_errors + +		    stats->tx_aborted_errors + +		    stats->tx_window_errors + +		    stats->tx_heartbeat_errors, +		   stats->tx_compressed); +} + +/* + *	Called from the PROCfs module. This now uses the new arbitrary sized + *	/proc/net interface to create /proc/net/dev + */ +static int dev_seq_show(struct seq_file *seq, void *v) +{ +	if (v == SEQ_START_TOKEN) +		seq_puts(seq, "Inter-|   Receive                            " +			      "                    |  Transmit\n" +			      " face |bytes    packets errs drop fifo frame " +			      "compressed multicast|bytes    packets errs " +			      "drop fifo colls carrier compressed\n"); +	else +		dev_seq_printf_stats(seq, v); +	return 0; +} + +static struct softnet_data *softnet_get_online(loff_t *pos) +{ +	struct softnet_data *sd = NULL; + +	while (*pos < nr_cpu_ids) +		if (cpu_online(*pos)) { +			sd = &per_cpu(softnet_data, *pos); +			break; +		} else +			++*pos; +	return sd; +} + +static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) +{ +	return softnet_get_online(pos); +} + +static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ +	++*pos; +	return softnet_get_online(pos); +} + +static void softnet_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int softnet_seq_show(struct seq_file *seq, void *v) +{ +	struct softnet_data *sd = v; +	unsigned int flow_limit_count = 0; + +#ifdef CONFIG_NET_FLOW_LIMIT +	struct sd_flow_limit *fl; + +	rcu_read_lock(); +	fl = rcu_dereference(sd->flow_limit); +	if (fl) +		flow_limit_count = fl->count; +	rcu_read_unlock(); +#endif + +	seq_printf(seq, +		   "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", +		   sd->processed, sd->dropped, sd->time_squeeze, 0, +		   0, 0, 0, 0, /* was fastroute */ +		   sd->cpu_collision, sd->received_rps, flow_limit_count); +	return 0; +} + +static const struct seq_operations dev_seq_ops = { +	.start = dev_seq_start, +	.next  = dev_seq_next, +	.stop  = dev_seq_stop, +	.show  = dev_seq_show, +}; + +static int dev_seq_open(struct inode *inode, struct file *file) +{ +	return seq_open_net(inode, file, &dev_seq_ops, +			    sizeof(struct seq_net_private)); +} + +static const struct file_operations dev_seq_fops = { +	.owner	 = THIS_MODULE, +	.open    = dev_seq_open, +	.read    = seq_read, 
+	.llseek  = seq_lseek, +	.release = seq_release_net, +}; + +static const struct seq_operations softnet_seq_ops = { +	.start = softnet_seq_start, +	.next  = softnet_seq_next, +	.stop  = softnet_seq_stop, +	.show  = softnet_seq_show, +}; + +static int softnet_seq_open(struct inode *inode, struct file *file) +{ +	return seq_open(file, &softnet_seq_ops); +} + +static const struct file_operations softnet_seq_fops = { +	.owner	 = THIS_MODULE, +	.open    = softnet_seq_open, +	.read    = seq_read, +	.llseek  = seq_lseek, +	.release = seq_release, +}; + +static void *ptype_get_idx(loff_t pos) +{ +	struct packet_type *pt = NULL; +	loff_t i = 0; +	int t; + +	list_for_each_entry_rcu(pt, &ptype_all, list) { +		if (i == pos) +			return pt; +		++i; +	} + +	for (t = 0; t < PTYPE_HASH_SIZE; t++) { +		list_for_each_entry_rcu(pt, &ptype_base[t], list) { +			if (i == pos) +				return pt; +			++i; +		} +	} +	return NULL; +} + +static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) +	__acquires(RCU) +{ +	rcu_read_lock(); +	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ +	struct packet_type *pt; +	struct list_head *nxt; +	int hash; + +	++*pos; +	if (v == SEQ_START_TOKEN) +		return ptype_get_idx(0); + +	pt = v; +	nxt = pt->list.next; +	if (pt->type == htons(ETH_P_ALL)) { +		if (nxt != &ptype_all) +			goto found; +		hash = 0; +		nxt = ptype_base[0].next; +	} else +		hash = ntohs(pt->type) & PTYPE_HASH_MASK; + +	while (nxt == &ptype_base[hash]) { +		if (++hash >= PTYPE_HASH_SIZE) +			return NULL; +		nxt = ptype_base[hash].next; +	} +found: +	return list_entry(nxt, struct packet_type, list); +} + +static void ptype_seq_stop(struct seq_file *seq, void *v) +	__releases(RCU) +{ +	rcu_read_unlock(); +} + +static int ptype_seq_show(struct seq_file *seq, void *v) +{ +	struct packet_type *pt = v; + +	if (v == SEQ_START_TOKEN) +		seq_puts(seq, "Type Device      Function\n"); +	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { +		if (pt->type == htons(ETH_P_ALL)) +			seq_puts(seq, "ALL "); +		else +			seq_printf(seq, "%04x", ntohs(pt->type)); + +		seq_printf(seq, " %-8s %pf\n", +			   pt->dev ? 
pt->dev->name : "", pt->func); +	} + +	return 0; +} + +static const struct seq_operations ptype_seq_ops = { +	.start = ptype_seq_start, +	.next  = ptype_seq_next, +	.stop  = ptype_seq_stop, +	.show  = ptype_seq_show, +}; + +static int ptype_seq_open(struct inode *inode, struct file *file) +{ +	return seq_open_net(inode, file, &ptype_seq_ops, +			sizeof(struct seq_net_private)); +} + +static const struct file_operations ptype_seq_fops = { +	.owner	 = THIS_MODULE, +	.open    = ptype_seq_open, +	.read    = seq_read, +	.llseek  = seq_lseek, +	.release = seq_release_net, +}; + + +static int __net_init dev_proc_net_init(struct net *net) +{ +	int rc = -ENOMEM; + +	if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops)) +		goto out; +	if (!proc_create("softnet_stat", S_IRUGO, net->proc_net, +			 &softnet_seq_fops)) +		goto out_dev; +	if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops)) +		goto out_softnet; + +	if (wext_proc_init(net)) +		goto out_ptype; +	rc = 0; +out: +	return rc; +out_ptype: +	remove_proc_entry("ptype", net->proc_net); +out_softnet: +	remove_proc_entry("softnet_stat", net->proc_net); +out_dev: +	remove_proc_entry("dev", net->proc_net); +	goto out; +} + +static void __net_exit dev_proc_net_exit(struct net *net) +{ +	wext_proc_exit(net); + +	remove_proc_entry("ptype", net->proc_net); +	remove_proc_entry("softnet_stat", net->proc_net); +	remove_proc_entry("dev", net->proc_net); +} + +static struct pernet_operations __net_initdata dev_proc_ops = { +	.init = dev_proc_net_init, +	.exit = dev_proc_net_exit, +}; + +static int dev_mc_seq_show(struct seq_file *seq, void *v) +{ +	struct netdev_hw_addr *ha; +	struct net_device *dev = v; + +	if (v == SEQ_START_TOKEN) +		return 0; + +	netif_addr_lock_bh(dev); +	netdev_for_each_mc_addr(ha, dev) { +		int i; + +		seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, +			   dev->name, ha->refcount, ha->global_use); + +		for (i = 0; i < dev->addr_len; i++) +			seq_printf(seq, "%02x", ha->addr[i]); + +		seq_putc(seq, '\n'); +	} +	netif_addr_unlock_bh(dev); +	return 0; +} + +static const struct seq_operations dev_mc_seq_ops = { +	.start = dev_seq_start, +	.next  = dev_seq_next, +	.stop  = dev_seq_stop, +	.show  = dev_mc_seq_show, +}; + +static int dev_mc_seq_open(struct inode *inode, struct file *file) +{ +	return seq_open_net(inode, file, &dev_mc_seq_ops, +			    sizeof(struct seq_net_private)); +} + +static const struct file_operations dev_mc_seq_fops = { +	.owner	 = THIS_MODULE, +	.open    = dev_mc_seq_open, +	.read    = seq_read, +	.llseek  = seq_lseek, +	.release = seq_release_net, +}; + +static int __net_init dev_mc_net_init(struct net *net) +{ +	if (!proc_create("dev_mcast", 0, net->proc_net, &dev_mc_seq_fops)) +		return -ENOMEM; +	return 0; +} + +static void __net_exit dev_mc_net_exit(struct net *net) +{ +	remove_proc_entry("dev_mcast", net->proc_net); +} + +static struct pernet_operations __net_initdata dev_mc_net_ops = { +	.init = dev_mc_net_init, +	.exit = dev_mc_net_exit, +}; + +int __init dev_proc_init(void) +{ +	int ret = register_pernet_subsys(&dev_proc_ops); +	if (!ret) +		return register_pernet_subsys(&dev_mc_net_ops); +	return ret; +} diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 85e8b5326dd..1cac29ebb05 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -18,9 +18,10 @@  #include <net/sock.h>  #include <net/net_namespace.h>  #include <linux/rtnetlink.h> -#include <linux/wireless.h>  #include <linux/vmalloc.h> -#include <net/wext.h> +#include <linux/export.h> +#include 
<linux/jiffies.h> +#include <linux/pm_runtime.h>  #include "net-sysfs.h" @@ -28,6 +29,7 @@  static const char fmt_hex[] = "%#x\n";  static const char fmt_long_hex[] = "%#lx\n";  static const char fmt_dec[] = "%d\n"; +static const char fmt_udec[] = "%u\n";  static const char fmt_ulong[] = "%lu\n";  static const char fmt_u64[] = "%llu\n"; @@ -58,35 +60,42 @@ static ssize_t format_##field(const struct net_device *net, char *buf)	\  {									\  	return sprintf(buf, format_string, net->field);			\  }									\ -static ssize_t show_##field(struct device *dev,				\ +static ssize_t field##_show(struct device *dev,				\  			    struct device_attribute *attr, char *buf)	\  {									\  	return netdev_show(dev, attr, buf, format_##field);		\ -} +}									\ + +#define NETDEVICE_SHOW_RO(field, format_string)				\ +NETDEVICE_SHOW(field, format_string);					\ +static DEVICE_ATTR_RO(field) +#define NETDEVICE_SHOW_RW(field, format_string)				\ +NETDEVICE_SHOW(field, format_string);					\ +static DEVICE_ATTR_RW(field)  /* use same locking and permission rules as SIF* ioctl's */  static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,  			    const char *buf, size_t len,  			    int (*set)(struct net_device *, unsigned long))  { -	struct net_device *net = to_net_dev(dev); -	char *endp; +	struct net_device *netdev = to_net_dev(dev); +	struct net *net = dev_net(netdev);  	unsigned long new;  	int ret = -EINVAL; -	if (!capable(CAP_NET_ADMIN)) +	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))  		return -EPERM; -	new = simple_strtoul(buf, &endp, 0); -	if (endp == buf) +	ret = kstrtoul(buf, 0, &new); +	if (ret)  		goto err;  	if (!rtnl_trylock())  		return restart_syscall(); -	if (dev_isalive(net)) { -		if ((ret = (*set)(net, new)) == 0) +	if (dev_isalive(netdev)) { +		if ((ret = (*set)(netdev, new)) == 0)  			ret = len;  	}  	rtnl_unlock(); @@ -94,17 +103,17 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,  	return ret;  } -NETDEVICE_SHOW(dev_id, fmt_hex); -NETDEVICE_SHOW(addr_assign_type, fmt_dec); -NETDEVICE_SHOW(addr_len, fmt_dec); -NETDEVICE_SHOW(iflink, fmt_dec); -NETDEVICE_SHOW(ifindex, fmt_dec); -NETDEVICE_SHOW(features, fmt_long_hex); -NETDEVICE_SHOW(type, fmt_dec); -NETDEVICE_SHOW(link_mode, fmt_dec); +NETDEVICE_SHOW_RO(dev_id, fmt_hex); +NETDEVICE_SHOW_RO(dev_port, fmt_dec); +NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec); +NETDEVICE_SHOW_RO(addr_len, fmt_dec); +NETDEVICE_SHOW_RO(iflink, fmt_dec); +NETDEVICE_SHOW_RO(ifindex, fmt_dec); +NETDEVICE_SHOW_RO(type, fmt_dec); +NETDEVICE_SHOW_RO(link_mode, fmt_dec);  /* use same locking rules as GIFHWADDR ioctl's */ -static ssize_t show_address(struct device *dev, struct device_attribute *attr, +static ssize_t address_show(struct device *dev, struct device_attribute *attr,  			    char *buf)  {  	struct net_device *net = to_net_dev(dev); @@ -116,17 +125,32 @@ static ssize_t show_address(struct device *dev, struct device_attribute *attr,  	read_unlock(&dev_base_lock);  	return ret;  } +static DEVICE_ATTR_RO(address); -static ssize_t show_broadcast(struct device *dev, -			    struct device_attribute *attr, char *buf) +static ssize_t broadcast_show(struct device *dev, +			      struct device_attribute *attr, char *buf)  {  	struct net_device *net = to_net_dev(dev);  	if (dev_isalive(net))  		return sysfs_format_mac(buf, net->broadcast, net->addr_len);  	return -EINVAL;  } +static DEVICE_ATTR_RO(broadcast); + +static int change_carrier(struct net_device *net, unsigned long new_carrier) +{ +	if 
(!netif_running(net)) +		return -EINVAL; +	return dev_change_carrier(net, (bool) new_carrier); +} -static ssize_t show_carrier(struct device *dev, +static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, +			     const char *buf, size_t len) +{ +	return netdev_store(dev, attr, buf, len, change_carrier); +} + +static ssize_t carrier_show(struct device *dev,  			    struct device_attribute *attr, char *buf)  {  	struct net_device *netdev = to_net_dev(dev); @@ -135,8 +159,9 @@ static ssize_t show_carrier(struct device *dev,  	}  	return -EINVAL;  } +static DEVICE_ATTR_RW(carrier); -static ssize_t show_speed(struct device *dev, +static ssize_t speed_show(struct device *dev,  			  struct device_attribute *attr, char *buf)  {  	struct net_device *netdev = to_net_dev(dev); @@ -145,19 +170,17 @@ static ssize_t show_speed(struct device *dev,  	if (!rtnl_trylock())  		return restart_syscall(); -	if (netif_running(netdev) && -	    netdev->ethtool_ops && -	    netdev->ethtool_ops->get_settings) { -		struct ethtool_cmd cmd = { ETHTOOL_GSET }; - -		if (!netdev->ethtool_ops->get_settings(netdev, &cmd)) -			ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd)); +	if (netif_running(netdev)) { +		struct ethtool_cmd cmd; +		if (!__ethtool_get_settings(netdev, &cmd)) +			ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd));  	}  	rtnl_unlock();  	return ret;  } +static DEVICE_ATTR_RO(speed); -static ssize_t show_duplex(struct device *dev, +static ssize_t duplex_show(struct device *dev,  			   struct device_attribute *attr, char *buf)  {  	struct net_device *netdev = to_net_dev(dev); @@ -166,19 +189,30 @@ static ssize_t show_duplex(struct device *dev,  	if (!rtnl_trylock())  		return restart_syscall(); -	if (netif_running(netdev) && -	    netdev->ethtool_ops && -	    netdev->ethtool_ops->get_settings) { -		struct ethtool_cmd cmd = { ETHTOOL_GSET }; - -		if (!netdev->ethtool_ops->get_settings(netdev, &cmd)) -			ret = sprintf(buf, "%s\n", cmd.duplex ? 
"full" : "half"); +	if (netif_running(netdev)) { +		struct ethtool_cmd cmd; +		if (!__ethtool_get_settings(netdev, &cmd)) { +			const char *duplex; +			switch (cmd.duplex) { +			case DUPLEX_HALF: +				duplex = "half"; +				break; +			case DUPLEX_FULL: +				duplex = "full"; +				break; +			default: +				duplex = "unknown"; +				break; +			} +			ret = sprintf(buf, "%s\n", duplex); +		}  	}  	rtnl_unlock();  	return ret;  } +static DEVICE_ATTR_RO(duplex); -static ssize_t show_dormant(struct device *dev, +static ssize_t dormant_show(struct device *dev,  			    struct device_attribute *attr, char *buf)  {  	struct net_device *netdev = to_net_dev(dev); @@ -188,6 +222,7 @@ static ssize_t show_dormant(struct device *dev,  	return -EINVAL;  } +static DEVICE_ATTR_RO(dormant);  static const char *const operstates[] = {  	"unknown", @@ -199,7 +234,7 @@ static const char *const operstates[] = {  	"up"  }; -static ssize_t show_operstate(struct device *dev, +static ssize_t operstate_show(struct device *dev,  			      struct device_attribute *attr, char *buf)  {  	const struct net_device *netdev = to_net_dev(dev); @@ -216,35 +251,43 @@ static ssize_t show_operstate(struct device *dev,  	return sprintf(buf, "%s\n", operstates[operstate]);  } +static DEVICE_ATTR_RO(operstate); + +static ssize_t carrier_changes_show(struct device *dev, +				    struct device_attribute *attr, +				    char *buf) +{ +	struct net_device *netdev = to_net_dev(dev); +	return sprintf(buf, fmt_dec, +		       atomic_read(&netdev->carrier_changes)); +} +static DEVICE_ATTR_RO(carrier_changes);  /* read-write attributes */ -NETDEVICE_SHOW(mtu, fmt_dec);  static int change_mtu(struct net_device *net, unsigned long new_mtu)  {  	return dev_set_mtu(net, (int) new_mtu);  } -static ssize_t store_mtu(struct device *dev, struct device_attribute *attr, +static ssize_t mtu_store(struct device *dev, struct device_attribute *attr,  			 const char *buf, size_t len)  {  	return netdev_store(dev, attr, buf, len, change_mtu);  } - -NETDEVICE_SHOW(flags, fmt_hex); +NETDEVICE_SHOW_RW(mtu, fmt_dec);  static int change_flags(struct net_device *net, unsigned long new_flags)  { -	return dev_change_flags(net, (unsigned) new_flags); +	return dev_change_flags(net, (unsigned int) new_flags);  } -static ssize_t store_flags(struct device *dev, struct device_attribute *attr, +static ssize_t flags_store(struct device *dev, struct device_attribute *attr,  			   const char *buf, size_t len)  {  	return netdev_store(dev, attr, buf, len, change_flags);  } - -NETDEVICE_SHOW(tx_queue_len, fmt_ulong); +NETDEVICE_SHOW_RW(flags, fmt_hex);  static int change_tx_queue_len(struct net_device *net, unsigned long new_len)  { @@ -252,21 +295,26 @@ static int change_tx_queue_len(struct net_device *net, unsigned long new_len)  	return 0;  } -static ssize_t store_tx_queue_len(struct device *dev, +static ssize_t tx_queue_len_store(struct device *dev,  				  struct device_attribute *attr,  				  const char *buf, size_t len)  { +	if (!capable(CAP_NET_ADMIN)) +		return -EPERM; +  	return netdev_store(dev, attr, buf, len, change_tx_queue_len);  } +NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong); -static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr, +static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,  			     const char *buf, size_t len)  {  	struct net_device *netdev = to_net_dev(dev); +	struct net *net = dev_net(netdev);  	size_t count = len;  	ssize_t ret; -	if (!capable(CAP_NET_ADMIN)) +	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))  		
return -EPERM;  	/* ignore trailing newline */ @@ -281,7 +329,7 @@ static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,  	return ret < 0 ? ret : len;  } -static ssize_t show_ifalias(struct device *dev, +static ssize_t ifalias_show(struct device *dev,  			    struct device_attribute *attr, char *buf)  {  	const struct net_device *netdev = to_net_dev(dev); @@ -294,30 +342,70 @@ static ssize_t show_ifalias(struct device *dev,  	rtnl_unlock();  	return ret;  } +static DEVICE_ATTR_RW(ifalias); -static struct device_attribute net_class_attributes[] = { -	__ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL), -	__ATTR(addr_len, S_IRUGO, show_addr_len, NULL), -	__ATTR(dev_id, S_IRUGO, show_dev_id, NULL), -	__ATTR(ifalias, S_IRUGO | S_IWUSR, show_ifalias, store_ifalias), -	__ATTR(iflink, S_IRUGO, show_iflink, NULL), -	__ATTR(ifindex, S_IRUGO, show_ifindex, NULL), -	__ATTR(features, S_IRUGO, show_features, NULL), -	__ATTR(type, S_IRUGO, show_type, NULL), -	__ATTR(link_mode, S_IRUGO, show_link_mode, NULL), -	__ATTR(address, S_IRUGO, show_address, NULL), -	__ATTR(broadcast, S_IRUGO, show_broadcast, NULL), -	__ATTR(carrier, S_IRUGO, show_carrier, NULL), -	__ATTR(speed, S_IRUGO, show_speed, NULL), -	__ATTR(duplex, S_IRUGO, show_duplex, NULL), -	__ATTR(dormant, S_IRUGO, show_dormant, NULL), -	__ATTR(operstate, S_IRUGO, show_operstate, NULL), -	__ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu), -	__ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags), -	__ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len, -	       store_tx_queue_len), -	{} +static int change_group(struct net_device *net, unsigned long new_group) +{ +	dev_set_group(net, (int) new_group); +	return 0; +} + +static ssize_t group_store(struct device *dev, struct device_attribute *attr, +			   const char *buf, size_t len) +{ +	return netdev_store(dev, attr, buf, len, change_group); +} +NETDEVICE_SHOW(group, fmt_dec); +static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store); + +static ssize_t phys_port_id_show(struct device *dev, +				 struct device_attribute *attr, char *buf) +{ +	struct net_device *netdev = to_net_dev(dev); +	ssize_t ret = -EINVAL; + +	if (!rtnl_trylock()) +		return restart_syscall(); + +	if (dev_isalive(netdev)) { +		struct netdev_phys_port_id ppid; + +		ret = dev_get_phys_port_id(netdev, &ppid); +		if (!ret) +			ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); +	} +	rtnl_unlock(); + +	return ret; +} +static DEVICE_ATTR_RO(phys_port_id); + +static struct attribute *net_class_attrs[] = { +	&dev_attr_netdev_group.attr, +	&dev_attr_type.attr, +	&dev_attr_dev_id.attr, +	&dev_attr_dev_port.attr, +	&dev_attr_iflink.attr, +	&dev_attr_ifindex.attr, +	&dev_attr_addr_assign_type.attr, +	&dev_attr_addr_len.attr, +	&dev_attr_link_mode.attr, +	&dev_attr_address.attr, +	&dev_attr_broadcast.attr, +	&dev_attr_speed.attr, +	&dev_attr_duplex.attr, +	&dev_attr_dormant.attr, +	&dev_attr_operstate.attr, +	&dev_attr_carrier_changes.attr, +	&dev_attr_ifalias.attr, +	&dev_attr_carrier.attr, +	&dev_attr_mtu.attr, +	&dev_attr_flags.attr, +	&dev_attr_tx_queue_len.attr, +	&dev_attr_phys_port_id.attr, +	NULL,  }; +ATTRIBUTE_GROUPS(net_class);  /* Show a given an attribute in the statistics group */  static ssize_t netstat_show(const struct device *d, @@ -343,13 +431,13 @@ static ssize_t netstat_show(const struct device *d,  /* generate a read-only statistics attribute */  #define NETSTAT_ENTRY(name)						\ -static ssize_t show_##name(struct device *d,				\ +static ssize_t 
name##_show(struct device *d,				\  			   struct device_attribute *attr, char *buf) 	\  {									\  	return netstat_show(d, attr, buf,				\  			    offsetof(struct rtnl_link_stats64, name));	\  }									\ -static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) +static DEVICE_ATTR_RO(name)  NETSTAT_ENTRY(rx_packets);  NETSTAT_ENTRY(tx_packets); @@ -408,63 +496,8 @@ static struct attribute_group netstat_group = {  	.attrs  = netstat_attrs,  }; -#ifdef CONFIG_WIRELESS_EXT_SYSFS -/* helper function that does all the locking etc for wireless stats */ -static ssize_t wireless_show(struct device *d, char *buf, -			     ssize_t (*format)(const struct iw_statistics *, -					       char *)) -{ -	struct net_device *dev = to_net_dev(d); -	const struct iw_statistics *iw; -	ssize_t ret = -EINVAL; - -	if (!rtnl_trylock()) -		return restart_syscall(); -	if (dev_isalive(dev)) { -		iw = get_wireless_stats(dev); -		if (iw) -			ret = (*format)(iw, buf); -	} -	rtnl_unlock(); - -	return ret; -} - -/* show function template for wireless fields */ -#define WIRELESS_SHOW(name, field, format_string)			\ -static ssize_t format_iw_##name(const struct iw_statistics *iw, char *buf) \ -{									\ -	return sprintf(buf, format_string, iw->field);			\ -}									\ -static ssize_t show_iw_##name(struct device *d,				\ -			      struct device_attribute *attr, char *buf)	\ -{									\ -	return wireless_show(d, buf, format_iw_##name);			\ -}									\ -static DEVICE_ATTR(name, S_IRUGO, show_iw_##name, NULL) - -WIRELESS_SHOW(status, status, fmt_hex); -WIRELESS_SHOW(link, qual.qual, fmt_dec); -WIRELESS_SHOW(level, qual.level, fmt_dec); -WIRELESS_SHOW(noise, qual.noise, fmt_dec); -WIRELESS_SHOW(nwid, discard.nwid, fmt_dec); -WIRELESS_SHOW(crypt, discard.code, fmt_dec); -WIRELESS_SHOW(fragment, discard.fragment, fmt_dec); -WIRELESS_SHOW(misc, discard.misc, fmt_dec); -WIRELESS_SHOW(retries, discard.retries, fmt_dec); -WIRELESS_SHOW(beacon, miss.beacon, fmt_dec); - +#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)  static struct attribute *wireless_attrs[] = { -	&dev_attr_status.attr, -	&dev_attr_link.attr, -	&dev_attr_level.attr, -	&dev_attr_noise.attr, -	&dev_attr_nwid.attr, -	&dev_attr_crypt.attr, -	&dev_attr_fragment.attr, -	&dev_attr_retries.attr, -	&dev_attr_misc.attr, -	&dev_attr_beacon.attr,  	NULL  }; @@ -473,19 +506,12 @@ static struct attribute_group wireless_group = {  	.attrs = wireless_attrs,  };  #endif + +#else /* CONFIG_SYSFS */ +#define net_class_groups	NULL  #endif /* CONFIG_SYSFS */ -#ifdef CONFIG_RPS -/* - * RX queue sysfs structures and functions. 
- */ -struct rx_queue_attribute { -	struct attribute attr; -	ssize_t (*show)(struct netdev_rx_queue *queue, -	    struct rx_queue_attribute *attr, char *buf); -	ssize_t (*store)(struct netdev_rx_queue *queue, -	    struct rx_queue_attribute *attr, const char *buf, size_t len); -}; +#ifdef CONFIG_SYSFS  #define to_rx_queue_attr(_attr) container_of(_attr,		\      struct rx_queue_attribute, attr) @@ -520,6 +546,7 @@ static const struct sysfs_ops rx_queue_sysfs_ops = {  	.store = rx_queue_attr_store,  }; +#ifdef CONFIG_RPS  static ssize_t show_rps_map(struct netdev_rx_queue *queue,  			    struct rx_queue_attribute *attribute, char *buf)  { @@ -550,13 +577,6 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue,  	return len;  } -static void rps_map_release(struct rcu_head *rcu) -{ -	struct rps_map *map = container_of(rcu, struct rps_map, rcu); - -	kfree(map); -} -  static ssize_t store_rps_map(struct netdev_rx_queue *queue,  		      struct rx_queue_attribute *attribute,  		      const char *buf, size_t len) @@ -578,7 +598,7 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,  		return err;  	} -	map = kzalloc(max_t(unsigned, +	map = kzalloc(max_t(unsigned int,  	    RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),  	    GFP_KERNEL);  	if (!map) { @@ -603,9 +623,12 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,  	rcu_assign_pointer(queue->rps_map, map);  	spin_unlock(&rps_map_lock); -	if (old_map) -		call_rcu(&old_map->rcu, rps_map_release); - +	if (map) +		static_key_slow_inc(&rps_needed); +	if (old_map) { +		kfree_rcu(old_map, rcu); +		static_key_slow_dec(&rps_needed); +	}  	free_cpumask_var(mask);  	return len;  } @@ -615,65 +638,68 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,  					   char *buf)  {  	struct rps_dev_flow_table *flow_table; -	unsigned int val = 0; +	unsigned long val = 0;  	rcu_read_lock();  	flow_table = rcu_dereference(queue->rps_flow_table);  	if (flow_table) -		val = flow_table->mask + 1; +		val = (unsigned long)flow_table->mask + 1;  	rcu_read_unlock(); -	return sprintf(buf, "%u\n", val); -} - -static void rps_dev_flow_table_release_work(struct work_struct *work) -{ -	struct rps_dev_flow_table *table = container_of(work, -	    struct rps_dev_flow_table, free_work); - -	vfree(table); +	return sprintf(buf, "%lu\n", val);  }  static void rps_dev_flow_table_release(struct rcu_head *rcu)  {  	struct rps_dev_flow_table *table = container_of(rcu,  	    struct rps_dev_flow_table, rcu); - -	INIT_WORK(&table->free_work, rps_dev_flow_table_release_work); -	schedule_work(&table->free_work); +	vfree(table);  }  static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,  				     struct rx_queue_attribute *attr,  				     const char *buf, size_t len)  { -	unsigned int count; -	char *endp; +	unsigned long mask, count;  	struct rps_dev_flow_table *table, *old_table;  	static DEFINE_SPINLOCK(rps_dev_flow_lock); +	int rc;  	if (!capable(CAP_NET_ADMIN))  		return -EPERM; -	count = simple_strtoul(buf, &endp, 0); -	if (endp == buf) -		return -EINVAL; +	rc = kstrtoul(buf, 0, &count); +	if (rc < 0) +		return rc;  	if (count) { -		int i; - -		if (count > 1<<30) { +		mask = count - 1; +		/* mask = roundup_pow_of_two(count) - 1; +		 * without overflows... +		 */ +		while ((mask | (mask >> 1)) != mask) +			mask |= (mask >> 1); +		/* On 64 bit arches, must check mask fits in table->mask (u32), +		 * and on 32bit arches, must check +		 * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow. 
+		 */ +#if BITS_PER_LONG > 32 +		if (mask > (unsigned long)(u32)mask) +			return -EINVAL; +#else +		if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1)) +				/ sizeof(struct rps_dev_flow)) {  			/* Enforce a limit to prevent overflow */  			return -EINVAL;  		} -		count = roundup_pow_of_two(count); -		table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count)); +#endif +		table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1));  		if (!table)  			return -ENOMEM; -		table->mask = count - 1; -		for (i = 0; i < count; i++) -			table->flows[i].cpu = RPS_NO_CPU; +		table->mask = mask; +		for (count = 0; count <= mask; count++) +			table->flows[count].cpu = RPS_NO_CPU;  	} else  		table = NULL; @@ -696,40 +722,58 @@ static struct rx_queue_attribute rps_cpus_attribute =  static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =  	__ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,  	    show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); +#endif /* CONFIG_RPS */  static struct attribute *rx_queue_default_attrs[] = { +#ifdef CONFIG_RPS  	&rps_cpus_attribute.attr,  	&rps_dev_flow_table_cnt_attribute.attr, +#endif  	NULL  };  static void rx_queue_release(struct kobject *kobj)  {  	struct netdev_rx_queue *queue = to_rx_queue(kobj); +#ifdef CONFIG_RPS  	struct rps_map *map;  	struct rps_dev_flow_table *flow_table; -	map = rcu_dereference_raw(queue->rps_map); +	map = rcu_dereference_protected(queue->rps_map, 1);  	if (map) {  		RCU_INIT_POINTER(queue->rps_map, NULL); -		call_rcu(&map->rcu, rps_map_release); +		kfree_rcu(map, rcu);  	} -	flow_table = rcu_dereference_raw(queue->rps_flow_table); +	flow_table = rcu_dereference_protected(queue->rps_flow_table, 1);  	if (flow_table) {  		RCU_INIT_POINTER(queue->rps_flow_table, NULL);  		call_rcu(&flow_table->rcu, rps_dev_flow_table_release);  	} +#endif  	memset(kobj, 0, sizeof(*kobj));  	dev_put(queue->dev);  } +static const void *rx_queue_namespace(struct kobject *kobj) +{ +	struct netdev_rx_queue *queue = to_rx_queue(kobj); +	struct device *dev = &queue->dev->dev; +	const void *ns = NULL; + +	if (dev->class && dev->class->ns_type) +		ns = dev->class->namespace(dev); + +	return ns; +} +  static struct kobj_type rx_queue_ktype = {  	.sysfs_ops = &rx_queue_sysfs_ops,  	.release = rx_queue_release,  	.default_attrs = rx_queue_default_attrs, +	.namespace = rx_queue_namespace  };  static int rx_queue_add_kobject(struct net_device *net, int index) @@ -741,25 +785,36 @@ static int rx_queue_add_kobject(struct net_device *net, int index)  	kobj->kset = net->queues_kset;  	error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,  	    "rx-%u", index); -	if (error) { -		kobject_put(kobj); -		return error; +	if (error) +		goto exit; + +	if (net->sysfs_rx_queue_group) { +		error = sysfs_create_group(kobj, net->sysfs_rx_queue_group); +		if (error) +			goto exit;  	}  	kobject_uevent(kobj, KOBJ_ADD);  	dev_hold(queue->dev);  	return error; +exit: +	kobject_put(kobj); +	return error;  } -#endif /* CONFIG_RPS */ +#endif /* CONFIG_SYSFS */  int  net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)  { -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS  	int i;  	int error = 0; +#ifndef CONFIG_RPS +	if (!net->sysfs_rx_queue_group) +		return 0; +#endif  	for (i = old_num; i < new_num; i++) {  		error = rx_queue_add_kobject(net, i);  		if (error) { @@ -768,8 +823,12 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)  		}  	} -	while (--i >= new_num) +	while (--i >= new_num) { +		if (net->sysfs_rx_queue_group) +			
sysfs_remove_group(&net->_rx[i].kobj, +					   net->sysfs_rx_queue_group);  		kobject_put(&net->_rx[i].kobj); +	}  	return error;  #else @@ -777,7 +836,7 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)  #endif  } -#ifdef CONFIG_XPS +#ifdef CONFIG_SYSFS  /*   * netdev_queue sysfs structures and functions.   */ @@ -823,15 +882,139 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = {  	.store = netdev_queue_attr_store,  }; -static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) +static ssize_t show_trans_timeout(struct netdev_queue *queue, +				  struct netdev_queue_attribute *attribute, +				  char *buf)  { -	struct net_device *dev = queue->dev; -	int i; +	unsigned long trans_timeout; -	for (i = 0; i < dev->num_tx_queues; i++) -		if (queue == &dev->_tx[i]) -			break; +	spin_lock_irq(&queue->_xmit_lock); +	trans_timeout = queue->trans_timeout; +	spin_unlock_irq(&queue->_xmit_lock); + +	return sprintf(buf, "%lu", trans_timeout); +} + +static struct netdev_queue_attribute queue_trans_timeout = +	__ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL); + +#ifdef CONFIG_BQL +/* + * Byte queue limits sysfs structures and functions. + */ +static ssize_t bql_show(char *buf, unsigned int value) +{ +	return sprintf(buf, "%u\n", value); +} + +static ssize_t bql_set(const char *buf, const size_t count, +		       unsigned int *pvalue) +{ +	unsigned int value; +	int err; +	if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) +		value = DQL_MAX_LIMIT; +	else { +		err = kstrtouint(buf, 10, &value); +		if (err < 0) +			return err; +		if (value > DQL_MAX_LIMIT) +			return -EINVAL; +	} + +	*pvalue = value; + +	return count; +} + +static ssize_t bql_show_hold_time(struct netdev_queue *queue, +				  struct netdev_queue_attribute *attr, +				  char *buf) +{ +	struct dql *dql = &queue->dql; + +	return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); +} + +static ssize_t bql_set_hold_time(struct netdev_queue *queue, +				 struct netdev_queue_attribute *attribute, +				 const char *buf, size_t len) +{ +	struct dql *dql = &queue->dql; +	unsigned int value; +	int err; + +	err = kstrtouint(buf, 10, &value); +	if (err < 0) +		return err; + +	dql->slack_hold_time = msecs_to_jiffies(value); + +	return len; +} + +static struct netdev_queue_attribute bql_hold_time_attribute = +	__ATTR(hold_time, S_IRUGO | S_IWUSR, bql_show_hold_time, +	    bql_set_hold_time); + +static ssize_t bql_show_inflight(struct netdev_queue *queue, +				 struct netdev_queue_attribute *attr, +				 char *buf) +{ +	struct dql *dql = &queue->dql; + +	return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed); +} + +static struct netdev_queue_attribute bql_inflight_attribute = +	__ATTR(inflight, S_IRUGO, bql_show_inflight, NULL); + +#define BQL_ATTR(NAME, FIELD)						\ +static ssize_t bql_show_ ## NAME(struct netdev_queue *queue,		\ +				 struct netdev_queue_attribute *attr,	\ +				 char *buf)				\ +{									\ +	return bql_show(buf, queue->dql.FIELD);				\ +}									\ +									\ +static ssize_t bql_set_ ## NAME(struct netdev_queue *queue,		\ +				struct netdev_queue_attribute *attr,	\ +				const char *buf, size_t len)		\ +{									\ +	return bql_set(buf, len, &queue->dql.FIELD);			\ +}									\ +									\ +static struct netdev_queue_attribute bql_ ## NAME ## _attribute =	\ +	__ATTR(NAME, S_IRUGO | S_IWUSR, bql_show_ ## NAME,		\ +	    bql_set_ ## NAME); + +BQL_ATTR(limit, limit) +BQL_ATTR(limit_max, max_limit) +BQL_ATTR(limit_min, min_limit) + +static struct attribute 
*dql_attrs[] = { +	&bql_limit_attribute.attr, +	&bql_limit_max_attribute.attr, +	&bql_limit_min_attribute.attr, +	&bql_hold_time_attribute.attr, +	&bql_inflight_attribute.attr, +	NULL +}; + +static struct attribute_group dql_group = { +	.name  = "byte_queue_limits", +	.attrs  = dql_attrs, +}; +#endif /* CONFIG_BQL */ + +#ifdef CONFIG_XPS +static unsigned int get_netdev_queue_index(struct netdev_queue *queue) +{ +	struct net_device *dev = queue->dev; +	unsigned int i; + +	i = queue - dev->_tx;  	BUG_ON(i >= dev->num_tx_queues);  	return i; @@ -883,37 +1066,14 @@ static ssize_t show_xps_map(struct netdev_queue *queue,  	return len;  } -static void xps_map_release(struct rcu_head *rcu) -{ -	struct xps_map *map = container_of(rcu, struct xps_map, rcu); - -	kfree(map); -} - -static void xps_dev_maps_release(struct rcu_head *rcu) -{ -	struct xps_dev_maps *dev_maps = -	    container_of(rcu, struct xps_dev_maps, rcu); - -	kfree(dev_maps); -} - -static DEFINE_MUTEX(xps_map_mutex); -#define xmap_dereference(P)		\ -	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) -  static ssize_t store_xps_map(struct netdev_queue *queue,  		      struct netdev_queue_attribute *attribute,  		      const char *buf, size_t len)  {  	struct net_device *dev = queue->dev; -	cpumask_var_t mask; -	int err, i, cpu, pos, map_len, alloc_len, need_set;  	unsigned long index; -	struct xps_map *map, *new_map; -	struct xps_dev_maps *dev_maps, *new_dev_maps; -	int nonempty = 0; -	int numa_node = -2; +	cpumask_var_t mask; +	int err;  	if (!capable(CAP_NET_ADMIN))  		return -EPERM; @@ -929,169 +1089,50 @@ static ssize_t store_xps_map(struct netdev_queue *queue,  		return err;  	} -	new_dev_maps = kzalloc(max_t(unsigned, -	    XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES), GFP_KERNEL); -	if (!new_dev_maps) { -		free_cpumask_var(mask); -		return -ENOMEM; -	} - -	mutex_lock(&xps_map_mutex); - -	dev_maps = xmap_dereference(dev->xps_maps); - -	for_each_possible_cpu(cpu) { -		map = dev_maps ? -			xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; -		new_map = map; -		if (map) { -			for (pos = 0; pos < map->len; pos++) -				if (map->queues[pos] == index) -					break; -			map_len = map->len; -			alloc_len = map->alloc_len; -		} else -			pos = map_len = alloc_len = 0; - -		need_set = cpu_isset(cpu, *mask) && cpu_online(cpu); -#ifdef CONFIG_NUMA -		if (need_set) { -			if (numa_node == -2) -				numa_node = cpu_to_node(cpu); -			else if (numa_node != cpu_to_node(cpu)) -				numa_node = -1; -		} -#endif -		if (need_set && pos >= map_len) { -			/* Need to add queue to this CPU's map */ -			if (map_len >= alloc_len) { -				alloc_len = alloc_len ? -				    2 * alloc_len : XPS_MIN_MAP_ALLOC; -				new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), -						       GFP_KERNEL, -						       cpu_to_node(cpu)); -				if (!new_map) -					goto error; -				new_map->alloc_len = alloc_len; -				for (i = 0; i < map_len; i++) -					new_map->queues[i] = map->queues[i]; -				new_map->len = map_len; -			} -			new_map->queues[new_map->len++] = index; -		} else if (!need_set && pos < map_len) { -			/* Need to remove queue from this CPU's map */ -			if (map_len > 1) -				new_map->queues[pos] = -				    new_map->queues[--new_map->len]; -			else -				new_map = NULL; -		} -		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], new_map); -	} - -	/* Cleanup old maps */ -	for_each_possible_cpu(cpu) { -		map = dev_maps ? 
-			xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; -		if (map && xmap_dereference(new_dev_maps->cpu_map[cpu]) != map) -			call_rcu(&map->rcu, xps_map_release); -		if (new_dev_maps->cpu_map[cpu]) -			nonempty = 1; -	} - -	if (nonempty) -		rcu_assign_pointer(dev->xps_maps, new_dev_maps); -	else { -		kfree(new_dev_maps); -		rcu_assign_pointer(dev->xps_maps, NULL); -	} - -	if (dev_maps) -		call_rcu(&dev_maps->rcu, xps_dev_maps_release); - -	netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node : -1); - -	mutex_unlock(&xps_map_mutex); +	err = netif_set_xps_queue(dev, mask, index);  	free_cpumask_var(mask); -	return len; -error: -	mutex_unlock(&xps_map_mutex); - -	if (new_dev_maps) -		for_each_possible_cpu(i) -			kfree(rcu_dereference_protected( -				new_dev_maps->cpu_map[i], -				1)); -	kfree(new_dev_maps); -	free_cpumask_var(mask); -	return -ENOMEM; +	return err ? : len;  }  static struct netdev_queue_attribute xps_cpus_attribute =      __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map); +#endif /* CONFIG_XPS */  static struct attribute *netdev_queue_default_attrs[] = { +	&queue_trans_timeout.attr, +#ifdef CONFIG_XPS  	&xps_cpus_attribute.attr, +#endif  	NULL  };  static void netdev_queue_release(struct kobject *kobj)  {  	struct netdev_queue *queue = to_netdev_queue(kobj); -	struct net_device *dev = queue->dev; -	struct xps_dev_maps *dev_maps; -	struct xps_map *map; -	unsigned long index; -	int i, pos, nonempty = 0; - -	index = get_netdev_queue_index(queue); -	mutex_lock(&xps_map_mutex); -	dev_maps = xmap_dereference(dev->xps_maps); - -	if (dev_maps) { -		for_each_possible_cpu(i) { -			map = xmap_dereference(dev_maps->cpu_map[i]); -			if (!map) -				continue; - -			for (pos = 0; pos < map->len; pos++) -				if (map->queues[pos] == index) -					break; - -			if (pos < map->len) { -				if (map->len > 1) -					map->queues[pos] = -					    map->queues[--map->len]; -				else { -					RCU_INIT_POINTER(dev_maps->cpu_map[i], -					    NULL); -					call_rcu(&map->rcu, xps_map_release); -					map = NULL; -				} -			} -			if (map) -				nonempty = 1; -		} +	memset(kobj, 0, sizeof(*kobj)); +	dev_put(queue->dev); +} -		if (!nonempty) { -			RCU_INIT_POINTER(dev->xps_maps, NULL); -			call_rcu(&dev_maps->rcu, xps_dev_maps_release); -		} -	} +static const void *netdev_queue_namespace(struct kobject *kobj) +{ +	struct netdev_queue *queue = to_netdev_queue(kobj); +	struct device *dev = &queue->dev->dev; +	const void *ns = NULL; -	mutex_unlock(&xps_map_mutex); +	if (dev->class && dev->class->ns_type) +		ns = dev->class->namespace(dev); -	memset(kobj, 0, sizeof(*kobj)); -	dev_put(queue->dev); +	return ns;  }  static struct kobj_type netdev_queue_ktype = {  	.sysfs_ops = &netdev_queue_sysfs_ops,  	.release = netdev_queue_release,  	.default_attrs = netdev_queue_default_attrs, +	.namespace = netdev_queue_namespace,  };  static int netdev_queue_add_kobject(struct net_device *net, int index) @@ -1103,22 +1144,29 @@ static int netdev_queue_add_kobject(struct net_device *net, int index)  	kobj->kset = net->queues_kset;  	error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,  	    "tx-%u", index); -	if (error) { -		kobject_put(kobj); -		return error; -	} +	if (error) +		goto exit; + +#ifdef CONFIG_BQL +	error = sysfs_create_group(kobj, &dql_group); +	if (error) +		goto exit; +#endif  	kobject_uevent(kobj, KOBJ_ADD);  	dev_hold(queue->dev); +	return 0; +exit: +	kobject_put(kobj);  	return error;  } -#endif /* CONFIG_XPS */ +#endif /* CONFIG_SYSFS */  int  netdev_queue_update_kobjects(struct 
net_device *net, int old_num, int new_num)  { -#ifdef CONFIG_XPS +#ifdef CONFIG_SYSFS  	int i;  	int error = 0; @@ -1130,27 +1178,30 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)  		}  	} -	while (--i >= new_num) -		kobject_put(&net->_tx[i].kobj); +	while (--i >= new_num) { +		struct netdev_queue *queue = net->_tx + i; + +#ifdef CONFIG_BQL +		sysfs_remove_group(&queue->kobj, &dql_group); +#endif +		kobject_put(&queue->kobj); +	}  	return error;  #else  	return 0; -#endif +#endif /* CONFIG_SYSFS */  }  static int register_queue_kobjects(struct net_device *net)  {  	int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; -#if defined(CONFIG_RPS) || defined(CONFIG_XPS) +#ifdef CONFIG_SYSFS  	net->queues_kset = kset_create_and_add("queues",  	    NULL, &net->dev.kobj);  	if (!net->queues_kset)  		return -ENOMEM; -#endif - -#ifdef CONFIG_RPS  	real_rx = net->real_num_rx_queues;  #endif  	real_tx = net->real_num_tx_queues; @@ -1177,21 +1228,33 @@ static void remove_queue_kobjects(struct net_device *net)  {  	int real_rx = 0, real_tx = 0; -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS  	real_rx = net->real_num_rx_queues;  #endif  	real_tx = net->real_num_tx_queues;  	net_rx_queue_update_kobjects(net, real_rx, 0);  	netdev_queue_update_kobjects(net, real_tx, 0); -#if defined(CONFIG_RPS) || defined(CONFIG_XPS) +#ifdef CONFIG_SYSFS  	kset_unregister(net->queues_kset);  #endif  } -static const void *net_current_ns(void) +static bool net_current_may_mount(void) +{ +	struct net *net = current->nsproxy->net_ns; + +	return ns_capable(net->user_ns, CAP_SYS_ADMIN); +} + +static void *net_grab_current_ns(void)  { -	return current->nsproxy->net_ns; +	struct net *ns = current->nsproxy->net_ns; +#ifdef CONFIG_NET_NS +	if (ns) +		atomic_inc(&ns->passive); +#endif +	return ns;  }  static const void *net_initial_ns(void) @@ -1206,23 +1269,14 @@ static const void *net_netlink_ns(struct sock *sk)  struct kobj_ns_type_operations net_ns_type_operations = {  	.type = KOBJ_NS_TYPE_NET, -	.current_ns = net_current_ns, +	.current_may_mount = net_current_may_mount, +	.grab_current_ns = net_grab_current_ns,  	.netlink_ns = net_netlink_ns,  	.initial_ns = net_initial_ns, +	.drop_ns = net_drop_ns,  };  EXPORT_SYMBOL_GPL(net_ns_type_operations); -static void net_kobj_ns_exit(struct net *net) -{ -	kobj_ns_exit(KOBJ_NS_TYPE_NET, net); -} - -static struct pernet_operations kobj_net_ops = { -	.exit = net_kobj_ns_exit, -}; - - -#ifdef CONFIG_HOTPLUG  static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)  {  	struct net_device *dev = to_net_dev(d); @@ -1241,7 +1295,6 @@ static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)  exit:  	return retval;  } -#endif  /*   *	netdev_release -- destroy and free a dead device. 
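A minimal sketch, not part of the patch itself, of how a driver could use netif_set_xps_queue() — the helper the rewritten store_xps_map() above now delegates to instead of open-coding the per-CPU map surgery. The wrapper name example_pin_tx_queue() and its arguments are invented for illustration; only netif_set_xps_queue() and the cpumask helpers are real interfaces here.

#include <linux/cpumask.h>
#include <linux/netdevice.h>

/* hypothetical helper: steer TX queue "qid" of "dev" to a single CPU */
static int example_pin_tx_queue(struct net_device *dev, u16 qid, int cpu)
{
	cpumask_var_t mask;
	int err;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_set_cpu(cpu, mask);
	/* same call the sysfs store_xps_map() handler now makes */
	err = netif_set_xps_queue(dev, mask, qid);
	free_cpumask_var(mask);
	return err;
}
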
@@ -1254,7 +1307,7 @@ static void netdev_release(struct device *d)  	BUG_ON(dev->reg_state != NETREG_RELEASED);  	kfree(dev->ifalias); -	kfree((char *)dev - dev->padded); +	netdev_freemem(dev);  }  static const void *net_namespace(struct device *d) @@ -1267,12 +1320,8 @@ static const void *net_namespace(struct device *d)  static struct class net_class = {  	.name = "net",  	.dev_release = netdev_release, -#ifdef CONFIG_SYSFS -	.dev_attrs = net_class_attributes, -#endif /* CONFIG_SYSFS */ -#ifdef CONFIG_HOTPLUG +	.dev_groups = net_class_groups,  	.dev_uevent = netdev_uevent, -#endif  	.ns_type = &net_ns_type_operations,  	.namespace = net_namespace,  }; @@ -1288,6 +1337,8 @@ void netdev_unregister_kobject(struct net_device * net)  	remove_queue_kobjects(net); +	pm_runtime_set_memalloc_noio(dev, false); +  	device_del(dev);  } @@ -1311,10 +1362,11 @@ int netdev_register_kobject(struct net_device *net)  		groups++;  	*groups++ = &netstat_group; -#ifdef CONFIG_WIRELESS_EXT_SYSFS + +#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)  	if (net->ieee80211_ptr)  		*groups++ = &wireless_group; -#ifdef CONFIG_WIRELESS_EXT +#if IS_ENABLED(CONFIG_WIRELESS_EXT)  	else if (net->wireless_handlers)  		*groups++ = &wireless_group;  #endif @@ -1331,24 +1383,27 @@ int netdev_register_kobject(struct net_device *net)  		return error;  	} +	pm_runtime_set_memalloc_noio(dev, true); +  	return error;  } -int netdev_class_create_file(struct class_attribute *class_attr) +int netdev_class_create_file_ns(struct class_attribute *class_attr, +				const void *ns)  { -	return class_create_file(&net_class, class_attr); +	return class_create_file_ns(&net_class, class_attr, ns);  } -EXPORT_SYMBOL(netdev_class_create_file); +EXPORT_SYMBOL(netdev_class_create_file_ns); -void netdev_class_remove_file(struct class_attribute *class_attr) +void netdev_class_remove_file_ns(struct class_attribute *class_attr, +				 const void *ns)  { -	class_remove_file(&net_class, class_attr); +	class_remove_file_ns(&net_class, class_attr, ns);  } -EXPORT_SYMBOL(netdev_class_remove_file); +EXPORT_SYMBOL(netdev_class_remove_file_ns); -int netdev_kobject_init(void) +int __init netdev_kobject_init(void)  {  	kobj_ns_type_register(&net_ns_type_operations); -	register_pernet_subsys(&kobj_net_ops);  	return class_register(&net_class);  } diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h index bd7751ec1c4..2745a1b51e0 100644 --- a/net/core/net-sysfs.h +++ b/net/core/net-sysfs.h @@ -1,7 +1,7 @@  #ifndef __NET_SYSFS_H__  #define __NET_SYSFS_H__ -int netdev_kobject_init(void); +int __init netdev_kobject_init(void);  int netdev_register_kobject(struct net_device *);  void netdev_unregister_kobject(struct net_device *);  int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num); diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 7f1bb2aba03..ba3c0120786 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -11,6 +11,7 @@  #include <linux/inetdevice.h>  #include <linux/inet.h>  #include <linux/interrupt.h> +#include <linux/export.h>  #include <linux/netpoll.h>  #include <linux/sched.h>  #include <linux/delay.h> @@ -28,6 +29,8 @@  #include <trace/events/skb.h>  #include <trace/events/net.h>  #include <trace/events/napi.h> +#include <trace/events/sock.h> +#include <trace/events/udp.h>  EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 3f860261c5e..85b62691f4f 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ 
-1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/workqueue.h>  #include <linux/rtnetlink.h>  #include <linux/cache.h> @@ -8,6 +10,11 @@  #include <linux/idr.h>  #include <linux/rculist.h>  #include <linux/nsproxy.h> +#include <linux/fs.h> +#include <linux/proc_ns.h> +#include <linux/file.h> +#include <linux/export.h> +#include <linux/user_namespace.h>  #include <net/net_namespace.h>  #include <net/netns/generic.h> @@ -17,22 +24,30 @@  static LIST_HEAD(pernet_list);  static struct list_head *first_device = &pernet_list; -static DEFINE_MUTEX(net_mutex); +DEFINE_MUTEX(net_mutex);  LIST_HEAD(net_namespace_list);  EXPORT_SYMBOL_GPL(net_namespace_list); -struct net init_net; +struct net init_net = { +	.dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), +};  EXPORT_SYMBOL(init_net);  #define INITIAL_NET_GEN_PTRS	13 /* +1 for len +2 for rcu_head */ -static void net_generic_release(struct rcu_head *rcu) +static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; + +static struct net_generic *net_alloc_generic(void)  {  	struct net_generic *ng; +	size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); -	ng = container_of(rcu, struct net_generic, rcu); -	kfree(ng); +	ng = kzalloc(generic_size, GFP_KERNEL); +	if (ng) +		ng->len = max_gen_ptrs; + +	return ng;  }  static int net_assign_generic(struct net *net, int id, void *data) @@ -48,8 +63,7 @@ static int net_assign_generic(struct net *net, int id, void *data)  	if (old_ng->len >= id)  		goto assign; -	ng = kzalloc(sizeof(struct net_generic) + -			id * sizeof(void *), GFP_KERNEL); +	ng = net_alloc_generic();  	if (ng == NULL)  		return -ENOMEM; @@ -64,11 +78,10 @@ static int net_assign_generic(struct net *net, int id, void *data)  	 * the old copy for kfree after a grace period.  	 */ -	ng->len = id;  	memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));  	rcu_assign_pointer(net->gen, ng); -	call_rcu(&old_ng->rcu, net_generic_release); +	kfree_rcu(old_ng, rcu);  assign:  	ng->ptr[id - 1] = data;  	return 0; @@ -76,21 +89,29 @@ assign:  static int ops_init(const struct pernet_operations *ops, struct net *net)  { -	int err; +	int err = -ENOMEM; +	void *data = NULL; +  	if (ops->id && ops->size) { -		void *data = kzalloc(ops->size, GFP_KERNEL); +		data = kzalloc(ops->size, GFP_KERNEL);  		if (!data) -			return -ENOMEM; +			goto out;  		err = net_assign_generic(net, *ops->id, data); -		if (err) { -			kfree(data); -			return err; -		} +		if (err) +			goto cleanup;  	} +	err = 0;  	if (ops->init) -		return ops->init(net); -	return 0; +		err = ops->init(net); +	if (!err) +		return 0; + +cleanup: +	kfree(data); + +out: +	return err;  }  static void ops_free(const struct pernet_operations *ops, struct net *net) @@ -126,7 +147,7 @@ static void ops_free_list(const struct pernet_operations *ops,  /*   * setup_net runs the initializers for the network namespace object.   
*/ -static __net_init int setup_net(struct net *net) +static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)  {  	/* Must be called with net_mutex held */  	const struct pernet_operations *ops, *saved_ops; @@ -134,6 +155,9 @@ static __net_init int setup_net(struct net *net)  	LIST_HEAD(net_exit_list);  	atomic_set(&net->count, 1); +	atomic_set(&net->passive, 1); +	net->dev_base_seq = 1; +	net->user_ns = user_ns;  #ifdef NETNS_REFCNT_DEBUG  	atomic_set(&net->use_count, 0); @@ -164,18 +188,6 @@ out_undo:  	goto out;  } -static struct net_generic *net_alloc_generic(void) -{ -	struct net_generic *ng; -	size_t generic_size = sizeof(struct net_generic) + -		INITIAL_NET_GEN_PTRS * sizeof(void *); - -	ng = kzalloc(generic_size, GFP_KERNEL); -	if (ng) -		ng->len = INITIAL_NET_GEN_PTRS; - -	return ng; -}  #ifdef CONFIG_NET_NS  static struct kmem_cache *net_cachep; @@ -207,8 +219,8 @@ static void net_free(struct net *net)  {  #ifdef NETNS_REFCNT_DEBUG  	if (unlikely(atomic_read(&net->use_count) != 0)) { -		printk(KERN_EMERG "network namespace not free! Usage: %d\n", -			atomic_read(&net->use_count)); +		pr_emerg("network namespace not free! Usage: %d\n", +			 atomic_read(&net->use_count));  		return;  	}  #endif @@ -216,16 +228,30 @@ static void net_free(struct net *net)  	kmem_cache_free(net_cachep, net);  } -static struct net *net_create(void) +void net_drop_ns(void *p) +{ +	struct net *ns = p; +	if (ns && atomic_dec_and_test(&ns->passive)) +		net_free(ns); +} + +struct net *copy_net_ns(unsigned long flags, +			struct user_namespace *user_ns, struct net *old_net)  {  	struct net *net;  	int rv; +	if (!(flags & CLONE_NEWNET)) +		return get_net(old_net); +  	net = net_alloc();  	if (!net)  		return ERR_PTR(-ENOMEM); + +	get_user_ns(user_ns); +  	mutex_lock(&net_mutex); -	rv = setup_net(net); +	rv = setup_net(net, user_ns);  	if (rv == 0) {  		rtnl_lock();  		list_add_tail_rcu(&net->list, &net_namespace_list); @@ -233,19 +259,13 @@ static struct net *net_create(void)  	}  	mutex_unlock(&net_mutex);  	if (rv < 0) { -		net_free(net); +		put_user_ns(user_ns); +		net_drop_ns(net);  		return ERR_PTR(rv);  	}  	return net;  } -struct net *copy_net_ns(unsigned long flags, struct net *old_net) -{ -	if (!(flags & CLONE_NEWNET)) -		return get_net(old_net); -	return net_create(); -} -  static DEFINE_SPINLOCK(cleanup_list_lock);  static LIST_HEAD(cleanup_list);  /* Must hold cleanup_list_lock to touch */ @@ -253,7 +273,7 @@ static void cleanup_net(struct work_struct *work)  {  	const struct pernet_operations *ops;  	struct net *net, *tmp; -	LIST_HEAD(net_kill_list); +	struct list_head net_kill_list;  	LIST_HEAD(net_exit_list);  	/* Atomically snapshot the list of namespaces to cleanup */ @@ -296,7 +316,8 @@ static void cleanup_net(struct work_struct *work)  	/* Finally it is safe to free my network namespace structure */  	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {  		list_del_init(&net->exit_list); -		net_free(net); +		put_user_ns(net->user_ns); +		net_drop_ns(net);  	}  }  static DECLARE_WORK(net_cleanup_work, cleanup_net); @@ -314,12 +335,30 @@ void __put_net(struct net *net)  }  EXPORT_SYMBOL_GPL(__put_net); +struct net *get_net_ns_by_fd(int fd) +{ +	struct proc_ns *ei; +	struct file *file; +	struct net *net; + +	file = proc_ns_fget(fd); +	if (IS_ERR(file)) +		return ERR_CAST(file); + +	ei = get_proc_ns(file_inode(file)); +	if (ei->ns_ops == &netns_operations) +		net = get_net(ei->ns); +	else +		net = ERR_PTR(-EINVAL); + +	fput(file); +	return net; +} +  #else 
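A rough illustration, assumed rather than taken from the patch, of the caller pattern the new get_net_ns_by_fd() above expects: the returned namespace carries a reference on success, errors come back as ERR_PTR(), and put_net() releases it. The function example_use_net_ns() is a made-up caller for this sketch.

#include <linux/err.h>
#include <net/net_namespace.h>

static int example_use_net_ns(int fd)
{
	struct net *net = get_net_ns_by_fd(fd);

	if (IS_ERR(net))
		return PTR_ERR(net);	/* e.g. -EINVAL when fd is not a netns file */

	/* ... work within "net" ... */

	put_net(net);			/* drop the reference taken on success */
	return 0;
}
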
-struct net *copy_net_ns(unsigned long flags, struct net *old_net) +struct net *get_net_ns_by_fd(int fd)  { -	if (flags & CLONE_NEWNET) -		return ERR_PTR(-EINVAL); -	return old_net; +	return ERR_PTR(-EINVAL);  }  #endif @@ -343,6 +382,21 @@ struct net *get_net_ns_by_pid(pid_t pid)  }  EXPORT_SYMBOL_GPL(get_net_ns_by_pid); +static __net_init int net_ns_net_init(struct net *net) +{ +	return proc_alloc_inum(&net->proc_inum); +} + +static __net_exit void net_ns_net_exit(struct net *net) +{ +	proc_free_inum(net->proc_inum); +} + +static struct pernet_operations __net_initdata net_ns_ops = { +	.init = net_ns_net_init, +	.exit = net_ns_net_exit, +}; +  static int __init net_ns_init(void)  {  	struct net_generic *ng; @@ -365,7 +419,7 @@ static int __init net_ns_init(void)  	rcu_assign_pointer(init_net.gen, ng);  	mutex_lock(&net_mutex); -	if (setup_net(&init_net)) +	if (setup_net(&init_net, &init_user_ns))  		panic("Could not setup the initial network namespace");  	rtnl_lock(); @@ -374,6 +428,8 @@ static int __init net_ns_init(void)  	mutex_unlock(&net_mutex); +	register_pernet_subsys(&net_ns_ops); +  	return 0;  } @@ -423,12 +479,7 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)  static int __register_pernet_operations(struct list_head *list,  					struct pernet_operations *ops)  { -	int err = 0; -	err = ops_init(ops, &init_net); -	if (err) -		ops_free(ops, &init_net); -	return err; -	 +	return ops_init(ops, &init_net);  }  static void __unregister_pernet_operations(struct pernet_operations *ops) @@ -458,6 +509,7 @@ again:  			}  			return error;  		} +		max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id);  	}  	error = __register_pernet_operations(list, ops);  	if (error) { @@ -573,3 +625,52 @@ void unregister_pernet_device(struct pernet_operations *ops)  	mutex_unlock(&net_mutex);  }  EXPORT_SYMBOL_GPL(unregister_pernet_device); + +#ifdef CONFIG_NET_NS +static void *netns_get(struct task_struct *task) +{ +	struct net *net = NULL; +	struct nsproxy *nsproxy; + +	rcu_read_lock(); +	nsproxy = task_nsproxy(task); +	if (nsproxy) +		net = get_net(nsproxy->net_ns); +	rcu_read_unlock(); + +	return net; +} + +static void netns_put(void *ns) +{ +	put_net(ns); +} + +static int netns_install(struct nsproxy *nsproxy, void *ns) +{ +	struct net *net = ns; + +	if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || +	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) +		return -EPERM; + +	put_net(nsproxy->net_ns); +	nsproxy->net_ns = get_net(net); +	return 0; +} + +static unsigned int netns_inum(void *ns) +{ +	struct net *net = ns; +	return net->proc_inum; +} + +const struct proc_ns_operations netns_operations = { +	.name		= "net", +	.type		= CLONE_NEWNET, +	.get		= netns_get, +	.put		= netns_put, +	.install	= netns_install, +	.inum		= netns_inum, +}; +#endif diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c new file mode 100644 index 00000000000..30d903b19c6 --- /dev/null +++ b/net/core/netclassid_cgroup.c @@ -0,0 +1,111 @@ +/* + * net/core/netclassid_cgroup.c	Classid Cgroupfs Handling + * + *		This program is free software; you can redistribute it and/or + *		modify it under the terms of the GNU General Public License + *		as published by the Free Software Foundation; either version + *		2 of the License, or (at your option) any later version. 
+ * + * Authors:	Thomas Graf <tgraf@suug.ch> + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/cgroup.h> +#include <linux/fdtable.h> +#include <net/cls_cgroup.h> +#include <net/sock.h> + +static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css) +{ +	return css ? container_of(css, struct cgroup_cls_state, css) : NULL; +} + +struct cgroup_cls_state *task_cls_state(struct task_struct *p) +{ +	return css_cls_state(task_css(p, net_cls_cgrp_id)); +} +EXPORT_SYMBOL_GPL(task_cls_state); + +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) +{ +	struct cgroup_cls_state *cs; + +	cs = kzalloc(sizeof(*cs), GFP_KERNEL); +	if (!cs) +		return ERR_PTR(-ENOMEM); + +	return &cs->css; +} + +static int cgrp_css_online(struct cgroup_subsys_state *css) +{ +	struct cgroup_cls_state *cs = css_cls_state(css); +	struct cgroup_cls_state *parent = css_cls_state(css->parent); + +	if (parent) +		cs->classid = parent->classid; + +	return 0; +} + +static void cgrp_css_free(struct cgroup_subsys_state *css) +{ +	kfree(css_cls_state(css)); +} + +static int update_classid(const void *v, struct file *file, unsigned n) +{ +	int err; +	struct socket *sock = sock_from_file(file, &err); + +	if (sock) +		sock->sk->sk_classid = (u32)(unsigned long)v; + +	return 0; +} + +static void cgrp_attach(struct cgroup_subsys_state *css, +			struct cgroup_taskset *tset) +{ +	struct cgroup_cls_state *cs = css_cls_state(css); +	void *v = (void *)(unsigned long)cs->classid; +	struct task_struct *p; + +	cgroup_taskset_for_each(p, tset) { +		task_lock(p); +		iterate_fd(p->files, 0, update_classid, v); +		task_unlock(p); +	} +} + +static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) +{ +	return css_cls_state(css)->classid; +} + +static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, +			 u64 value) +{ +	css_cls_state(css)->classid = (u32) value; + +	return 0; +} + +static struct cftype ss_files[] = { +	{ +		.name		= "classid", +		.read_u64	= read_classid, +		.write_u64	= write_classid, +	}, +	{ }	/* terminate */ +}; + +struct cgroup_subsys net_cls_cgrp_subsys = { +	.css_alloc		= cgrp_css_alloc, +	.css_online		= cgrp_css_online, +	.css_free		= cgrp_css_free, +	.attach			= cgrp_attach, +	.base_cftypes		= ss_files, +}; diff --git a/net/core/netevent.c b/net/core/netevent.c index 865f0ceb81f..f17ccd291d3 100644 --- a/net/core/netevent.c +++ b/net/core/netevent.c @@ -15,6 +15,7 @@  #include <linux/rtnetlink.h>  #include <linux/notifier.h> +#include <linux/export.h>  #include <net/netevent.h>  static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index ee38acb6d46..e33937fb32a 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -9,7 +9,10 @@   * Copyright (C) 2002  Red Hat, Inc.   
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/moduleparam.h> +#include <linux/kernel.h>  #include <linux/netdevice.h>  #include <linux/etherdevice.h>  #include <linux/string.h> @@ -23,8 +26,13 @@  #include <linux/rcupdate.h>  #include <linux/workqueue.h>  #include <linux/slab.h> +#include <linux/export.h> +#include <linux/if_vlan.h>  #include <net/tcp.h>  #include <net/udp.h> +#include <net/addrconf.h> +#include <net/ndisc.h> +#include <net/ip6_checksum.h>  #include <asm/unaligned.h>  #include <trace/events/napi.h> @@ -35,26 +43,63 @@  #define MAX_UDP_CHUNK 1460  #define MAX_SKBS 32 -#define MAX_QUEUE_DEPTH (MAX_SKBS / 2)  static struct sk_buff_head skb_pool; -static atomic_t trapped; +DEFINE_STATIC_SRCU(netpoll_srcu);  #define USEC_PER_POLL	50 -#define NETPOLL_RX_ENABLED  1 -#define NETPOLL_RX_DROP     2 -#define MAX_SKB_SIZE \ -		(MAX_UDP_CHUNK + sizeof(struct udphdr) + \ -				sizeof(struct iphdr) + sizeof(struct ethhdr)) +#define MAX_SKB_SIZE							\ +	(sizeof(struct ethhdr) +					\ +	 sizeof(struct iphdr) +						\ +	 sizeof(struct udphdr) +					\ +	 MAX_UDP_CHUNK)  static void zap_completion_queue(void); -static void arp_reply(struct sk_buff *skb); +static void netpoll_async_cleanup(struct work_struct *work);  static unsigned int carrier_timeout = 4;  module_param(carrier_timeout, uint, 0644); +#define np_info(np, fmt, ...)				\ +	pr_info("%s: " fmt, np->name, ##__VA_ARGS__) +#define np_err(np, fmt, ...)				\ +	pr_err("%s: " fmt, np->name, ##__VA_ARGS__) +#define np_notice(np, fmt, ...)				\ +	pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) + +static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, +			      struct netdev_queue *txq) +{ +	const struct net_device_ops *ops = dev->netdev_ops; +	int status = NETDEV_TX_OK; +	netdev_features_t features; + +	features = netif_skb_features(skb); + +	if (vlan_tx_tag_present(skb) && +	    !vlan_hw_offload_capable(features, skb->vlan_proto)) { +		skb = __vlan_put_tag(skb, skb->vlan_proto, +				     vlan_tx_tag_get(skb)); +		if (unlikely(!skb)) { +			/* This is actually a packet drop, but we +			 * don't want the code that calls this +			 * function to try and operate on a NULL skb. 
+			 */ +			goto out; +		} +		skb->vlan_tci = 0; +	} + +	status = ops->ndo_start_xmit(skb, dev); +	if (status == NETDEV_TX_OK) +		txq_trans_update(txq); + +out: +	return status; +} +  static void queue_process(struct work_struct *work)  {  	struct netpoll_info *npinfo = @@ -64,51 +109,31 @@ static void queue_process(struct work_struct *work)  	while ((skb = skb_dequeue(&npinfo->txq))) {  		struct net_device *dev = skb->dev; -		const struct net_device_ops *ops = dev->netdev_ops;  		struct netdev_queue *txq;  		if (!netif_device_present(dev) || !netif_running(dev)) { -			__kfree_skb(skb); +			kfree_skb(skb);  			continue;  		}  		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));  		local_irq_save(flags); -		__netif_tx_lock(txq, smp_processor_id()); -		if (netif_tx_queue_frozen_or_stopped(txq) || -		    ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { +		HARD_TX_LOCK(dev, txq, smp_processor_id()); +		if (netif_xmit_frozen_or_stopped(txq) || +		    netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {  			skb_queue_head(&npinfo->txq, skb); -			__netif_tx_unlock(txq); +			HARD_TX_UNLOCK(dev, txq);  			local_irq_restore(flags);  			schedule_delayed_work(&npinfo->tx_work, HZ/10);  			return;  		} -		__netif_tx_unlock(txq); +		HARD_TX_UNLOCK(dev, txq);  		local_irq_restore(flags);  	}  } -static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh, -			    unsigned short ulen, __be32 saddr, __be32 daddr) -{ -	__wsum psum; - -	if (uh->check == 0 || skb_csum_unnecessary(skb)) -		return 0; - -	psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); - -	if (skb->ip_summed == CHECKSUM_COMPLETE && -	    !csum_fold(csum_add(psum, skb->csum))) -		return 0; - -	skb->csum = psum; - -	return __skb_checksum_complete(skb); -} -  /*   * Check whether delayed processing was scheduled for our NIC. If so,   * we attempt to grab the poll lock and use ->poll() to pump the card. @@ -119,14 +144,8 @@ static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh,   * trylock here and interrupts are already disabled in the softirq   * case. Further, we test the poll_owner to avoid recursion on UP   * systems where the lock doesn't exist. - * - * In cases where there is bi-directional communications, reading only - * one message at a time can lead to packets being dropped by the - * network adapter, forcing superfluous retries and possibly timeouts. - * Thus, we set our budget to greater than 1.   
*/ -static int poll_one_napi(struct netpoll_info *npinfo, -			 struct napi_struct *napi, int budget) +static int poll_one_napi(struct napi_struct *napi, int budget)  {  	int work; @@ -137,74 +156,87 @@ static int poll_one_napi(struct netpoll_info *npinfo,  	if (!test_bit(NAPI_STATE_SCHED, &napi->state))  		return budget; -	npinfo->rx_flags |= NETPOLL_RX_DROP; -	atomic_inc(&trapped);  	set_bit(NAPI_STATE_NPSVC, &napi->state);  	work = napi->poll(napi, budget); +	WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll);  	trace_napi_poll(napi);  	clear_bit(NAPI_STATE_NPSVC, &napi->state); -	atomic_dec(&trapped); -	npinfo->rx_flags &= ~NETPOLL_RX_DROP;  	return budget - work;  } -static void poll_napi(struct net_device *dev) +static void poll_napi(struct net_device *dev, int budget)  {  	struct napi_struct *napi; -	int budget = 16;  	list_for_each_entry(napi, &dev->napi_list, dev_list) {  		if (napi->poll_owner != smp_processor_id() &&  		    spin_trylock(&napi->poll_lock)) { -			budget = poll_one_napi(dev->npinfo, napi, budget); +			budget = poll_one_napi(napi, budget);  			spin_unlock(&napi->poll_lock); - -			if (!budget) -				break;  		}  	}  } -static void service_arp_queue(struct netpoll_info *npi) -{ -	if (npi) { -		struct sk_buff *skb; - -		while ((skb = skb_dequeue(&npi->arp_tx))) -			arp_reply(skb); -	} -} - -void netpoll_poll_dev(struct net_device *dev) +static void netpoll_poll_dev(struct net_device *dev)  {  	const struct net_device_ops *ops; +	struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); +	int budget = 0; -	if (!dev || !netif_running(dev)) +	/* Don't do any rx activity if the dev_lock mutex is held +	 * the dev_open/close paths use this to block netpoll activity +	 * while changing device state +	 */ +	if (down_trylock(&ni->dev_lock))  		return; +	if (!netif_running(dev)) { +		up(&ni->dev_lock); +		return; +	} +  	ops = dev->netdev_ops; -	if (!ops->ndo_poll_controller) +	if (!ops->ndo_poll_controller) { +		up(&ni->dev_lock);  		return; +	}  	/* Process pending work on NIC */  	ops->ndo_poll_controller(dev); -	poll_napi(dev); +	poll_napi(dev, budget); -	service_arp_queue(dev->npinfo); +	up(&ni->dev_lock);  	zap_completion_queue();  } -EXPORT_SYMBOL(netpoll_poll_dev); -void netpoll_poll(struct netpoll *np) +void netpoll_poll_disable(struct net_device *dev)  { -	netpoll_poll_dev(np->dev); +	struct netpoll_info *ni; +	int idx; +	might_sleep(); +	idx = srcu_read_lock(&netpoll_srcu); +	ni = srcu_dereference(dev->npinfo, &netpoll_srcu); +	if (ni) +		down(&ni->dev_lock); +	srcu_read_unlock(&netpoll_srcu, idx);  } -EXPORT_SYMBOL(netpoll_poll); +EXPORT_SYMBOL(netpoll_poll_disable); + +void netpoll_poll_enable(struct net_device *dev) +{ +	struct netpoll_info *ni; +	rcu_read_lock(); +	ni = rcu_dereference(dev->npinfo); +	if (ni) +		up(&ni->dev_lock); +	rcu_read_unlock(); +} +EXPORT_SYMBOL(netpoll_poll_enable);  static void refill_skbs(void)  { @@ -238,7 +270,7 @@ static void zap_completion_queue(void)  		while (clist != NULL) {  			struct sk_buff *skb = clist;  			clist = clist->next; -			if (skb->destructor) { +			if (!skb_irq_freeable(skb)) {  				atomic_inc(&skb->users);  				dev_kfree_skb_any(skb); /* put this one back */  			} else { @@ -265,7 +297,7 @@ repeat:  	if (!skb) {  		if (++count < 10) { -			netpoll_poll(np); +			netpoll_poll_dev(np->dev);  			goto repeat;  		}  		return NULL; @@ -287,40 +319,37 @@ static int netpoll_owner_active(struct net_device *dev)  	return 0;  } +/* call with IRQ disabled */  void netpoll_send_skb_on_dev(struct netpoll *np, 
struct sk_buff *skb,  			     struct net_device *dev)  {  	int status = NETDEV_TX_BUSY;  	unsigned long tries; -	const struct net_device_ops *ops = dev->netdev_ops;  	/* It is up to the caller to keep npinfo alive. */ -	struct netpoll_info *npinfo = np->dev->npinfo; +	struct netpoll_info *npinfo; + +	WARN_ON_ONCE(!irqs_disabled()); +	npinfo = rcu_dereference_bh(np->dev->npinfo);  	if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { -		__kfree_skb(skb); +		dev_kfree_skb_irq(skb);  		return;  	}  	/* don't get messages out of order, and no recursion */  	if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {  		struct netdev_queue *txq; -		unsigned long flags; -		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); +		txq = netdev_pick_tx(dev, skb, NULL); -		local_irq_save(flags);  		/* try until next clock tick */  		for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;  		     tries > 0; --tries) { -			if (__netif_tx_trylock(txq)) { -				if (!netif_tx_queue_stopped(txq)) { -					dev->priv_flags |= IFF_IN_NETPOLL; -					status = ops->ndo_start_xmit(skb, dev); -					dev->priv_flags &= ~IFF_IN_NETPOLL; -					if (status == NETDEV_TX_OK) -						txq_trans_update(txq); -				} -				__netif_tx_unlock(txq); +			if (HARD_TX_TRYLOCK(dev, txq)) { +				if (!netif_xmit_stopped(txq)) +					status = netpoll_start_xmit(skb, dev, txq); + +				HARD_TX_UNLOCK(dev, txq);  				if (status == NETDEV_TX_OK)  					break; @@ -328,16 +357,15 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,  			}  			/* tickle device maybe there is some cleanup */ -			netpoll_poll(np); +			netpoll_poll_dev(np->dev);  			udelay(USEC_PER_POLL);  		}  		WARN_ONCE(!irqs_disabled(), -			"netpoll_send_skb(): %s enabled interrupts in poll (%pF)\n", -			dev->name, ops->ndo_start_xmit); +			"netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n", +			dev->name, dev->netdev_ops->ndo_start_xmit); -		local_irq_restore(flags);  	}  	if (status != NETDEV_TX_OK) { @@ -349,22 +377,29 @@ EXPORT_SYMBOL(netpoll_send_skb_on_dev);  void netpoll_send_udp(struct netpoll *np, const char *msg, int len)  { -	int total_len, eth_len, ip_len, udp_len; +	int total_len, ip_len, udp_len;  	struct sk_buff *skb;  	struct udphdr *udph;  	struct iphdr *iph;  	struct ethhdr *eth; +	static atomic_t ip_ident; +	struct ipv6hdr *ip6h;  	udp_len = len + sizeof(*udph); -	ip_len = eth_len = udp_len + sizeof(*iph); -	total_len = eth_len + ETH_HLEN + NET_IP_ALIGN; +	if (np->ipv6) +		ip_len = udp_len + sizeof(*ip6h); +	else +		ip_len = udp_len + sizeof(*iph); + +	total_len = ip_len + LL_RESERVED_SPACE(np->dev); -	skb = find_skb(np, total_len, total_len - len); +	skb = find_skb(np, total_len + np->dev->needed_tailroom, +		       total_len - len);  	if (!skb)  		return;  	skb_copy_to_linear_data(skb, msg, len); -	skb->len += len; +	skb_put(skb, len);  	skb_push(skb, sizeof(*udph));  	skb_reset_transport_header(skb); @@ -372,36 +407,68 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)  	udph->source = htons(np->local_port);  	udph->dest = htons(np->remote_port);  	udph->len = htons(udp_len); -	udph->check = 0; -	udph->check = csum_tcpudp_magic(np->local_ip, -					np->remote_ip, -					udp_len, IPPROTO_UDP, -					csum_partial(udph, udp_len, 0)); -	if (udph->check == 0) -		udph->check = CSUM_MANGLED_0; - -	skb_push(skb, sizeof(*iph)); -	skb_reset_network_header(skb); -	iph = ip_hdr(skb); - -	/* iph->version = 4; iph->ihl = 5; */ -	put_unaligned(0x45, (unsigned char *)iph); -	iph->tos      = 0; -	
put_unaligned(htons(ip_len), &(iph->tot_len)); -	iph->id       = 0; -	iph->frag_off = 0; -	iph->ttl      = 64; -	iph->protocol = IPPROTO_UDP; -	iph->check    = 0; -	put_unaligned(np->local_ip, &(iph->saddr)); -	put_unaligned(np->remote_ip, &(iph->daddr)); -	iph->check    = ip_fast_csum((unsigned char *)iph, iph->ihl); - -	eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); -	skb_reset_mac_header(skb); -	skb->protocol = eth->h_proto = htons(ETH_P_IP); -	memcpy(eth->h_source, np->dev->dev_addr, ETH_ALEN); -	memcpy(eth->h_dest, np->remote_mac, ETH_ALEN); + +	if (np->ipv6) { +		udph->check = 0; +		udph->check = csum_ipv6_magic(&np->local_ip.in6, +					      &np->remote_ip.in6, +					      udp_len, IPPROTO_UDP, +					      csum_partial(udph, udp_len, 0)); +		if (udph->check == 0) +			udph->check = CSUM_MANGLED_0; + +		skb_push(skb, sizeof(*ip6h)); +		skb_reset_network_header(skb); +		ip6h = ipv6_hdr(skb); + +		/* ip6h->version = 6; ip6h->priority = 0; */ +		put_unaligned(0x60, (unsigned char *)ip6h); +		ip6h->flow_lbl[0] = 0; +		ip6h->flow_lbl[1] = 0; +		ip6h->flow_lbl[2] = 0; + +		ip6h->payload_len = htons(sizeof(struct udphdr) + len); +		ip6h->nexthdr = IPPROTO_UDP; +		ip6h->hop_limit = 32; +		ip6h->saddr = np->local_ip.in6; +		ip6h->daddr = np->remote_ip.in6; + +		eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); +		skb_reset_mac_header(skb); +		skb->protocol = eth->h_proto = htons(ETH_P_IPV6); +	} else { +		udph->check = 0; +		udph->check = csum_tcpudp_magic(np->local_ip.ip, +						np->remote_ip.ip, +						udp_len, IPPROTO_UDP, +						csum_partial(udph, udp_len, 0)); +		if (udph->check == 0) +			udph->check = CSUM_MANGLED_0; + +		skb_push(skb, sizeof(*iph)); +		skb_reset_network_header(skb); +		iph = ip_hdr(skb); + +		/* iph->version = 4; iph->ihl = 5; */ +		put_unaligned(0x45, (unsigned char *)iph); +		iph->tos      = 0; +		put_unaligned(htons(ip_len), &(iph->tot_len)); +		iph->id       = htons(atomic_inc_return(&ip_ident)); +		iph->frag_off = 0; +		iph->ttl      = 64; +		iph->protocol = IPPROTO_UDP; +		iph->check    = 0; +		put_unaligned(np->local_ip.ip, &(iph->saddr)); +		put_unaligned(np->remote_ip.ip, &(iph->daddr)); +		iph->check    = ip_fast_csum((unsigned char *)iph, iph->ihl); + +		eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); +		skb_reset_mac_header(skb); +		skb->protocol = eth->h_proto = htons(ETH_P_IP); +	} + +	ether_addr_copy(eth->h_source, np->dev->dev_addr); +	ether_addr_copy(eth->h_dest, np->remote_mac);  	skb->dev = np->dev; @@ -409,251 +476,69 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)  }  EXPORT_SYMBOL(netpoll_send_udp); -static void arp_reply(struct sk_buff *skb) +void netpoll_print_options(struct netpoll *np)  { -	struct netpoll_info *npinfo = skb->dev->npinfo; -	struct arphdr *arp; -	unsigned char *arp_ptr; -	int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; -	__be32 sip, tip; -	unsigned char *sha; -	struct sk_buff *send_skb; -	struct netpoll *np, *tmp; -	unsigned long flags; -	int hits = 0; - -	if (list_empty(&npinfo->rx_np)) -		return; - -	/* Before checking the packet, we do some early -	   inspection whether this is interesting at all */ -	spin_lock_irqsave(&npinfo->rx_lock, flags); -	list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { -		if (np->dev == skb->dev) -			hits++; -	} -	spin_unlock_irqrestore(&npinfo->rx_lock, flags); - -	/* No netpoll struct is using this dev */ -	if (!hits) -		return; - -	/* No arp on this interface */ -	if (skb->dev->flags & IFF_NOARP) -		return; - -	if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) -		
return; - -	skb_reset_network_header(skb); -	skb_reset_transport_header(skb); -	arp = arp_hdr(skb); - -	if ((arp->ar_hrd != htons(ARPHRD_ETHER) && -	     arp->ar_hrd != htons(ARPHRD_IEEE802)) || -	    arp->ar_pro != htons(ETH_P_IP) || -	    arp->ar_op != htons(ARPOP_REQUEST)) -		return; - -	arp_ptr = (unsigned char *)(arp+1); -	/* save the location of the src hw addr */ -	sha = arp_ptr; -	arp_ptr += skb->dev->addr_len; -	memcpy(&sip, arp_ptr, 4); -	arp_ptr += 4; -	/* If we actually cared about dst hw addr, -	   it would get copied here */ -	arp_ptr += skb->dev->addr_len; -	memcpy(&tip, arp_ptr, 4); - -	/* Should we ignore arp? */ -	if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) -		return; - -	size = arp_hdr_len(skb->dev); - -	spin_lock_irqsave(&npinfo->rx_lock, flags); -	list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { -		if (tip != np->local_ip) -			continue; - -		send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev), -				    LL_RESERVED_SPACE(np->dev)); -		if (!send_skb) -			continue; - -		skb_reset_network_header(send_skb); -		arp = (struct arphdr *) skb_put(send_skb, size); -		send_skb->dev = skb->dev; -		send_skb->protocol = htons(ETH_P_ARP); - -		/* Fill the device header for the ARP frame */ -		if (dev_hard_header(send_skb, skb->dev, ptype, -				    sha, np->dev->dev_addr, -				    send_skb->len) < 0) { -			kfree_skb(send_skb); -			continue; -		} - -		/* -		 * Fill out the arp protocol part. -		 * -		 * we only support ethernet device type, -		 * which (according to RFC 1390) should -		 * always equal 1 (Ethernet). -		 */ - -		arp->ar_hrd = htons(np->dev->type); -		arp->ar_pro = htons(ETH_P_IP); -		arp->ar_hln = np->dev->addr_len; -		arp->ar_pln = 4; -		arp->ar_op = htons(type); - -		arp_ptr = (unsigned char *)(arp + 1); -		memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); -		arp_ptr += np->dev->addr_len; -		memcpy(arp_ptr, &tip, 4); -		arp_ptr += 4; -		memcpy(arp_ptr, sha, np->dev->addr_len); -		arp_ptr += np->dev->addr_len; -		memcpy(arp_ptr, &sip, 4); - -		netpoll_send_skb(np, send_skb); - -		/* If there are several rx_hooks for the same address, -		   we're fine by sending a single reply */ -		break; -	} -	spin_unlock_irqrestore(&npinfo->rx_lock, flags); +	np_info(np, "local port %d\n", np->local_port); +	if (np->ipv6) +		np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6); +	else +		np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip); +	np_info(np, "interface '%s'\n", np->dev_name); +	np_info(np, "remote port %d\n", np->remote_port); +	if (np->ipv6) +		np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6); +	else +		np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip); +	np_info(np, "remote ethernet address %pM\n", np->remote_mac);  } +EXPORT_SYMBOL(netpoll_print_options); -int __netpoll_rx(struct sk_buff *skb) +static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr)  { -	int proto, len, ulen; -	int hits = 0; -	struct iphdr *iph; -	struct udphdr *uh; -	struct netpoll_info *npinfo = skb->dev->npinfo; -	struct netpoll *np, *tmp; +	const char *end; -	if (list_empty(&npinfo->rx_np)) -		goto out; - -	if (skb->dev->type != ARPHRD_ETHER) -		goto out; - -	/* check if netpoll clients need ARP */ -	if (skb->protocol == htons(ETH_P_ARP) && -	    atomic_read(&trapped)) { -		skb_queue_tail(&npinfo->arp_tx, skb); -		return 1; +	if (!strchr(str, ':') && +	    in4_pton(str, -1, (void *)addr, -1, &end) > 0) { +		if (!*end) +			return 0;  	} - -	proto = ntohs(eth_hdr(skb)->h_proto); -	if (proto != ETH_P_IP) -		goto out; -	
if (skb->pkt_type == PACKET_OTHERHOST) -		goto out; -	if (skb_shared(skb)) -		goto out; - -	iph = (struct iphdr *)skb->data; -	if (!pskb_may_pull(skb, sizeof(struct iphdr))) -		goto out; -	if (iph->ihl < 5 || iph->version != 4) -		goto out; -	if (!pskb_may_pull(skb, iph->ihl*4)) -		goto out; -	if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) -		goto out; - -	len = ntohs(iph->tot_len); -	if (skb->len < len || len < iph->ihl*4) -		goto out; - -	/* -	 * Our transport medium may have padded the buffer out. -	 * Now We trim to the true length of the frame. -	 */ -	if (pskb_trim_rcsum(skb, len)) -		goto out; - -	if (iph->protocol != IPPROTO_UDP) -		goto out; - -	len -= iph->ihl*4; -	uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); -	ulen = ntohs(uh->len); - -	if (ulen != len) -		goto out; -	if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) -		goto out; - -	list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { -		if (np->local_ip && np->local_ip != iph->daddr) -			continue; -		if (np->remote_ip && np->remote_ip != iph->saddr) -			continue; -		if (np->local_port && np->local_port != ntohs(uh->dest)) -			continue; - -		np->rx_hook(np, ntohs(uh->source), -			       (char *)(uh+1), -			       ulen - sizeof(struct udphdr)); -		hits++; -	} - -	if (!hits) -		goto out; - -	kfree_skb(skb); -	return 1; - -out: -	if (atomic_read(&trapped)) { -		kfree_skb(skb); -		return 1; +	if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) { +#if IS_ENABLED(CONFIG_IPV6) +		if (!*end) +			return 1; +#else +		return -1; +#endif  	} - -	return 0; -} - -void netpoll_print_options(struct netpoll *np) -{ -	printk(KERN_INFO "%s: local port %d\n", -			 np->name, np->local_port); -	printk(KERN_INFO "%s: local IP %pI4\n", -			 np->name, &np->local_ip); -	printk(KERN_INFO "%s: interface '%s'\n", -			 np->name, np->dev_name); -	printk(KERN_INFO "%s: remote port %d\n", -			 np->name, np->remote_port); -	printk(KERN_INFO "%s: remote IP %pI4\n", -			 np->name, &np->remote_ip); -	printk(KERN_INFO "%s: remote ethernet address %pM\n", -	                 np->name, np->remote_mac); +	return -1;  } -EXPORT_SYMBOL(netpoll_print_options);  int netpoll_parse_options(struct netpoll *np, char *opt)  {  	char *cur=opt, *delim; +	int ipv6; +	bool ipversion_set = false;  	if (*cur != '@') {  		if ((delim = strchr(cur, '@')) == NULL)  			goto parse_failed;  		*delim = 0; -		np->local_port = simple_strtol(cur, NULL, 10); +		if (kstrtou16(cur, 10, &np->local_port)) +			goto parse_failed;  		cur = delim;  	}  	cur++;  	if (*cur != '/') { +		ipversion_set = true;  		if ((delim = strchr(cur, '/')) == NULL)  			goto parse_failed;  		*delim = 0; -		np->local_ip = in_aton(cur); +		ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip); +		if (ipv6 < 0) +			goto parse_failed; +		else +			np->ipv6 = (bool)ipv6;  		cur = delim;  	}  	cur++; @@ -674,9 +559,9 @@ int netpoll_parse_options(struct netpoll *np, char *opt)  			goto parse_failed;  		*delim = 0;  		if (*cur == ' ' || *cur == '\t') -			printk(KERN_INFO "%s: warning: whitespace" -					"is not allowed\n", np->name); -		np->remote_port = simple_strtol(cur, NULL, 10); +			np_info(np, "warning: whitespace is not allowed\n"); +		if (kstrtou16(cur, 10, &np->remote_port)) +			goto parse_failed;  		cur = delim;  	}  	cur++; @@ -685,37 +570,19 @@ int netpoll_parse_options(struct netpoll *np, char *opt)  	if ((delim = strchr(cur, '/')) == NULL)  		goto parse_failed;  	*delim = 0; -	np->remote_ip = in_aton(cur); +	ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip); +	if (ipv6 < 0) +		goto parse_failed; +	else if 
(ipversion_set && np->ipv6 != (bool)ipv6) +		goto parse_failed; +	else +		np->ipv6 = (bool)ipv6;  	cur = delim + 1;  	if (*cur != 0) {  		/* MAC address */ -		if ((delim = strchr(cur, ':')) == NULL) +		if (!mac_pton(cur, np->remote_mac))  			goto parse_failed; -		*delim = 0; -		np->remote_mac[0] = simple_strtol(cur, NULL, 16); -		cur = delim + 1; -		if ((delim = strchr(cur, ':')) == NULL) -			goto parse_failed; -		*delim = 0; -		np->remote_mac[1] = simple_strtol(cur, NULL, 16); -		cur = delim + 1; -		if ((delim = strchr(cur, ':')) == NULL) -			goto parse_failed; -		*delim = 0; -		np->remote_mac[2] = simple_strtol(cur, NULL, 16); -		cur = delim + 1; -		if ((delim = strchr(cur, ':')) == NULL) -			goto parse_failed; -		*delim = 0; -		np->remote_mac[3] = simple_strtol(cur, NULL, 16); -		cur = delim + 1; -		if ((delim = strchr(cur, ':')) == NULL) -			goto parse_failed; -		*delim = 0; -		np->remote_mac[4] = simple_strtol(cur, NULL, 16); -		cur = delim + 1; -		np->remote_mac[5] = simple_strtol(cur, NULL, 16);  	}  	netpoll_print_options(np); @@ -723,24 +590,25 @@ int netpoll_parse_options(struct netpoll *np, char *opt)  	return 0;   parse_failed: -	printk(KERN_INFO "%s: couldn't parse config at '%s'!\n", -	       np->name, cur); +	np_info(np, "couldn't parse config at '%s'!\n", cur);  	return -1;  }  EXPORT_SYMBOL(netpoll_parse_options); -int __netpoll_setup(struct netpoll *np) +int __netpoll_setup(struct netpoll *np, struct net_device *ndev)  { -	struct net_device *ndev = np->dev;  	struct netpoll_info *npinfo;  	const struct net_device_ops *ops; -	unsigned long flags;  	int err; +	np->dev = ndev; +	strlcpy(np->dev_name, ndev->name, IFNAMSIZ); +	INIT_WORK(&np->cleanup_work, netpoll_async_cleanup); +  	if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||  	    !ndev->netdev_ops->ndo_poll_controller) { -		printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n", -		       np->name, np->dev_name); +		np_err(np, "%s doesn't support polling, aborting\n", +		       np->dev_name);  		err = -ENOTSUPP;  		goto out;  	} @@ -752,11 +620,7 @@ int __netpoll_setup(struct netpoll *np)  			goto out;  		} -		npinfo->rx_flags = 0; -		INIT_LIST_HEAD(&npinfo->rx_np); - -		spin_lock_init(&npinfo->rx_lock); -		skb_queue_head_init(&npinfo->arp_tx); +		sema_init(&npinfo->dev_lock, 1);  		skb_queue_head_init(&npinfo->txq);  		INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); @@ -769,19 +633,12 @@ int __netpoll_setup(struct netpoll *np)  				goto free_npinfo;  		}  	} else { -		npinfo = ndev->npinfo; +		npinfo = rtnl_dereference(ndev->npinfo);  		atomic_inc(&npinfo->refcnt);  	}  	npinfo->netpoll = np; -	if (np->rx_hook) { -		spin_lock_irqsave(&npinfo->rx_lock, flags); -		npinfo->rx_flags |= NETPOLL_RX_ENABLED; -		list_add_tail(&np->rx, &npinfo->rx_np); -		spin_unlock_irqrestore(&npinfo->rx_lock, flags); -	} -  	/* last thing to do is link it to the net device structure */  	rcu_assign_pointer(ndev->npinfo, npinfo); @@ -800,37 +657,42 @@ int netpoll_setup(struct netpoll *np)  	struct in_device *in_dev;  	int err; -	if (np->dev_name) -		ndev = dev_get_by_name(&init_net, np->dev_name); +	rtnl_lock(); +	if (np->dev_name) { +		struct net *net = current->nsproxy->net_ns; +		ndev = __dev_get_by_name(net, np->dev_name); +	}  	if (!ndev) { -		printk(KERN_ERR "%s: %s doesn't exist, aborting.\n", -		       np->name, np->dev_name); -		return -ENODEV; +		np_err(np, "%s doesn't exist, aborting\n", np->dev_name); +		err = -ENODEV; +		goto unlock; +	} +	dev_hold(ndev); + +	if (netdev_master_upper_dev_get(ndev)) { +		np_err(np, "%s is 
a slave device, aborting\n", np->dev_name); +		err = -EBUSY; +		goto put;  	}  	if (!netif_running(ndev)) {  		unsigned long atmost, atleast; -		printk(KERN_INFO "%s: device %s not up yet, forcing it\n", -		       np->name, np->dev_name); +		np_info(np, "device %s not up yet, forcing it\n", np->dev_name); -		rtnl_lock();  		err = dev_open(ndev); -		rtnl_unlock();  		if (err) { -			printk(KERN_ERR "%s: failed to open %s\n", -			       np->name, ndev->name); +			np_err(np, "failed to open %s\n", ndev->name);  			goto put;  		} +		rtnl_unlock();  		atleast = jiffies + HZ/10;  		atmost = jiffies + carrier_timeout * HZ;  		while (!netif_carrier_ok(ndev)) {  			if (time_after(jiffies, atmost)) { -				printk(KERN_NOTICE -				       "%s: timeout waiting for carrier\n", -				       np->name); +				np_notice(np, "timeout waiting for carrier\n");  				break;  			}  			msleep(1); @@ -842,46 +704,73 @@ int netpoll_setup(struct netpoll *np)  		 */  		if (time_before(jiffies, atleast)) { -			printk(KERN_NOTICE "%s: carrier detect appears" -			       " untrustworthy, waiting 4 seconds\n", -			       np->name); +			np_notice(np, "carrier detect appears untrustworthy, waiting 4 seconds\n");  			msleep(4000);  		} +		rtnl_lock();  	} -	if (!np->local_ip) { -		rcu_read_lock(); -		in_dev = __in_dev_get_rcu(ndev); +	if (!np->local_ip.ip) { +		if (!np->ipv6) { +			in_dev = __in_dev_get_rtnl(ndev); + +			if (!in_dev || !in_dev->ifa_list) { +				np_err(np, "no IP address for %s, aborting\n", +				       np->dev_name); +				err = -EDESTADDRREQ; +				goto put; +			} + +			np->local_ip.ip = in_dev->ifa_list->ifa_local; +			np_info(np, "local IP %pI4\n", &np->local_ip.ip); +		} else { +#if IS_ENABLED(CONFIG_IPV6) +			struct inet6_dev *idev; -		if (!in_dev || !in_dev->ifa_list) { -			rcu_read_unlock(); -			printk(KERN_ERR "%s: no IP address for %s, aborting\n", -			       np->name, np->dev_name);  			err = -EDESTADDRREQ; +			idev = __in6_dev_get(ndev); +			if (idev) { +				struct inet6_ifaddr *ifp; + +				read_lock_bh(&idev->lock); +				list_for_each_entry(ifp, &idev->addr_list, if_list) { +					if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) +						continue; +					np->local_ip.in6 = ifp->addr; +					err = 0; +					break; +				} +				read_unlock_bh(&idev->lock); +			} +			if (err) { +				np_err(np, "no IPv6 address for %s, aborting\n", +				       np->dev_name); +				goto put; +			} else +				np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); +#else +			np_err(np, "IPv6 is not supported %s, aborting\n", +			       np->dev_name); +			err = -EINVAL;  			goto put; +#endif  		} - -		np->local_ip = in_dev->ifa_list->ifa_local; -		rcu_read_unlock(); -		printk(KERN_INFO "%s: local IP %pI4\n", np->name, &np->local_ip);  	} -	np->dev = ndev; -  	/* fill up the skb queue */  	refill_skbs(); -	rtnl_lock(); -	err = __netpoll_setup(np); -	rtnl_unlock(); - +	err = __netpoll_setup(np, ndev);  	if (err)  		goto put; +	rtnl_unlock();  	return 0;  put:  	dev_put(ndev); +unlock: +	rtnl_unlock();  	return err;  }  EXPORT_SYMBOL(netpoll_setup); @@ -893,22 +782,36 @@ static int __init netpoll_init(void)  }  core_initcall(netpoll_init); +static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) +{ +	struct netpoll_info *npinfo = +			container_of(rcu_head, struct netpoll_info, rcu); + +	skb_queue_purge(&npinfo->txq); + +	/* we can't call cancel_delayed_work_sync here, as we are in softirq */ +	cancel_delayed_work(&npinfo->tx_work); + +	/* clean after last, unfinished work */ +	__skb_queue_purge(&npinfo->txq); +	/* now cancel it 
again */ +	cancel_delayed_work(&npinfo->tx_work); +	kfree(npinfo); +} +  void __netpoll_cleanup(struct netpoll *np)  {  	struct netpoll_info *npinfo; -	unsigned long flags; -	npinfo = np->dev->npinfo; +	/* rtnl_dereference would be preferable here but +	 * rcu_cleanup_netpoll path can put us in here safely without +	 * holding the rtnl, so plain rcu_dereference it is +	 */ +	npinfo = rtnl_dereference(np->dev->npinfo);  	if (!npinfo)  		return; -	if (!list_empty(&npinfo->rx_np)) { -		spin_lock_irqsave(&npinfo->rx_lock, flags); -		list_del(&np->rx); -		if (list_empty(&npinfo->rx_np)) -			npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; -		spin_unlock_irqrestore(&npinfo->rx_lock, flags); -	} +	synchronize_srcu(&netpoll_srcu);  	if (atomic_dec_and_test(&npinfo->refcnt)) {  		const struct net_device_ops *ops; @@ -917,47 +820,37 @@ void __netpoll_cleanup(struct netpoll *np)  		if (ops->ndo_netpoll_cleanup)  			ops->ndo_netpoll_cleanup(np->dev); -		rcu_assign_pointer(np->dev->npinfo, NULL); - -		/* avoid racing with NAPI reading npinfo */ -		synchronize_rcu_bh(); - -		skb_queue_purge(&npinfo->arp_tx); -		skb_queue_purge(&npinfo->txq); -		cancel_rearming_delayed_work(&npinfo->tx_work); - -		/* clean after last, unfinished work */ -		__skb_queue_purge(&npinfo->txq); -		kfree(npinfo); +		RCU_INIT_POINTER(np->dev->npinfo, NULL); +		call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info);  	}  }  EXPORT_SYMBOL_GPL(__netpoll_cleanup); -void netpoll_cleanup(struct netpoll *np) +static void netpoll_async_cleanup(struct work_struct *work)  { -	if (!np->dev) -		return; +	struct netpoll *np = container_of(work, struct netpoll, cleanup_work);  	rtnl_lock();  	__netpoll_cleanup(np);  	rtnl_unlock(); - -	dev_put(np->dev); -	np->dev = NULL; +	kfree(np);  } -EXPORT_SYMBOL(netpoll_cleanup); -int netpoll_trap(void) +void __netpoll_free_async(struct netpoll *np)  { -	return atomic_read(&trapped); +	schedule_work(&np->cleanup_work);  } -EXPORT_SYMBOL(netpoll_trap); +EXPORT_SYMBOL_GPL(__netpoll_free_async); -void netpoll_set_trap(int trap) +void netpoll_cleanup(struct netpoll *np)  { -	if (trap) -		atomic_inc(&trapped); -	else -		atomic_dec(&trapped); +	rtnl_lock(); +	if (!np->dev) +		goto out; +	__netpoll_cleanup(np); +	dev_put(np->dev); +	np->dev = NULL; +out: +	rtnl_unlock();  } -EXPORT_SYMBOL(netpoll_set_trap); +EXPORT_SYMBOL(netpoll_cleanup); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c new file mode 100644 index 00000000000..2f385b9bccc --- /dev/null +++ b/net/core/netprio_cgroup.c @@ -0,0 +1,288 @@ +/* + * net/core/netprio_cgroup.c	Priority Control Group + * + *		This program is free software; you can redistribute it and/or + *		modify it under the terms of the GNU General Public License + *		as published by the Free Software Foundation; either version + *		2 of the License, or (at your option) any later version. + * + * Authors:	Neil Horman <nhorman@tuxdriver.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/cgroup.h> +#include <linux/rcupdate.h> +#include <linux/atomic.h> +#include <net/rtnetlink.h> +#include <net/pkt_cls.h> +#include <net/sock.h> +#include <net/netprio_cgroup.h> + +#include <linux/fdtable.h> + +#define PRIOMAP_MIN_SZ		128 + +/* + * Extend @dev->priomap so that it's large enough to accommodate + * @target_idx.  @dev->priomap.priomap_len > @target_idx after successful + * return.  
Must be called under rtnl lock. + */ +static int extend_netdev_table(struct net_device *dev, u32 target_idx) +{ +	struct netprio_map *old, *new; +	size_t new_sz, new_len; + +	/* is the existing priomap large enough? */ +	old = rtnl_dereference(dev->priomap); +	if (old && old->priomap_len > target_idx) +		return 0; + +	/* +	 * Determine the new size.  Let's keep it power-of-two.  We start +	 * from PRIOMAP_MIN_SZ and double it until it's large enough to +	 * accommodate @target_idx. +	 */ +	new_sz = PRIOMAP_MIN_SZ; +	while (true) { +		new_len = (new_sz - offsetof(struct netprio_map, priomap)) / +			sizeof(new->priomap[0]); +		if (new_len > target_idx) +			break; +		new_sz *= 2; +		/* overflowed? */ +		if (WARN_ON(new_sz < PRIOMAP_MIN_SZ)) +			return -ENOSPC; +	} + +	/* allocate & copy */ +	new = kzalloc(new_sz, GFP_KERNEL); +	if (!new) +		return -ENOMEM; + +	if (old) +		memcpy(new->priomap, old->priomap, +		       old->priomap_len * sizeof(old->priomap[0])); + +	new->priomap_len = new_len; + +	/* install the new priomap */ +	rcu_assign_pointer(dev->priomap, new); +	if (old) +		kfree_rcu(old, rcu); +	return 0; +} + +/** + * netprio_prio - return the effective netprio of a cgroup-net_device pair + * @css: css part of the target pair + * @dev: net_device part of the target pair + * + * Should be called under RCU read or rtnl lock. + */ +static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) +{ +	struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); +	int id = css->cgroup->id; + +	if (map && id < map->priomap_len) +		return map->priomap[id]; +	return 0; +} + +/** + * netprio_set_prio - set netprio on a cgroup-net_device pair + * @css: css part of the target pair + * @dev: net_device part of the target pair + * @prio: prio to set + * + * Set netprio to @prio on @css-@dev pair.  Should be called under rtnl + * lock and may fail under memory pressure for non-zero @prio. + */ +static int netprio_set_prio(struct cgroup_subsys_state *css, +			    struct net_device *dev, u32 prio) +{ +	struct netprio_map *map; +	int id = css->cgroup->id; +	int ret; + +	/* avoid extending priomap for zero writes */ +	map = rtnl_dereference(dev->priomap); +	if (!prio && (!map || map->priomap_len <= id)) +		return 0; + +	ret = extend_netdev_table(dev, id); +	if (ret) +		return ret; + +	map = rtnl_dereference(dev->priomap); +	map->priomap[id] = prio; +	return 0; +} + +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) +{ +	struct cgroup_subsys_state *css; + +	css = kzalloc(sizeof(*css), GFP_KERNEL); +	if (!css) +		return ERR_PTR(-ENOMEM); + +	return css; +} + +static int cgrp_css_online(struct cgroup_subsys_state *css) +{ +	struct cgroup_subsys_state *parent_css = css->parent; +	struct net_device *dev; +	int ret = 0; + +	if (!parent_css) +		return 0; + +	rtnl_lock(); +	/* +	 * Inherit prios from the parent.  As all prios are set during +	 * onlining, there is no need to clear them on offline. 
+	 */ +	for_each_netdev(&init_net, dev) { +		u32 prio = netprio_prio(parent_css, dev); + +		ret = netprio_set_prio(css, dev, prio); +		if (ret) +			break; +	} +	rtnl_unlock(); +	return ret; +} + +static void cgrp_css_free(struct cgroup_subsys_state *css) +{ +	kfree(css); +} + +static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) +{ +	return css->cgroup->id; +} + +static int read_priomap(struct seq_file *sf, void *v) +{ +	struct net_device *dev; + +	rcu_read_lock(); +	for_each_netdev_rcu(&init_net, dev) +		seq_printf(sf, "%s %u\n", dev->name, +			   netprio_prio(seq_css(sf), dev)); +	rcu_read_unlock(); +	return 0; +} + +static ssize_t write_priomap(struct kernfs_open_file *of, +			     char *buf, size_t nbytes, loff_t off) +{ +	char devname[IFNAMSIZ + 1]; +	struct net_device *dev; +	u32 prio; +	int ret; + +	if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) +		return -EINVAL; + +	dev = dev_get_by_name(&init_net, devname); +	if (!dev) +		return -ENODEV; + +	rtnl_lock(); + +	ret = netprio_set_prio(of_css(of), dev, prio); + +	rtnl_unlock(); +	dev_put(dev); +	return ret ?: nbytes; +} + +static int update_netprio(const void *v, struct file *file, unsigned n) +{ +	int err; +	struct socket *sock = sock_from_file(file, &err); +	if (sock) +		sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v; +	return 0; +} + +static void net_prio_attach(struct cgroup_subsys_state *css, +			    struct cgroup_taskset *tset) +{ +	struct task_struct *p; +	void *v = (void *)(unsigned long)css->cgroup->id; + +	cgroup_taskset_for_each(p, tset) { +		task_lock(p); +		iterate_fd(p->files, 0, update_netprio, v); +		task_unlock(p); +	} +} + +static struct cftype ss_files[] = { +	{ +		.name = "prioidx", +		.read_u64 = read_prioidx, +	}, +	{ +		.name = "ifpriomap", +		.seq_show = read_priomap, +		.write = write_priomap, +	}, +	{ }	/* terminate */ +}; + +struct cgroup_subsys net_prio_cgrp_subsys = { +	.css_alloc	= cgrp_css_alloc, +	.css_online	= cgrp_css_online, +	.css_free	= cgrp_css_free, +	.attach		= net_prio_attach, +	.base_cftypes	= ss_files, +}; + +static int netprio_device_event(struct notifier_block *unused, +				unsigned long event, void *ptr) +{ +	struct net_device *dev = netdev_notifier_info_to_dev(ptr); +	struct netprio_map *old; + +	/* +	 * Note this is called with rtnl_lock held so we have update side +	 * protection on our rcu assignments +	 */ + +	switch (event) { +	case NETDEV_UNREGISTER: +		old = rtnl_dereference(dev->priomap); +		RCU_INIT_POINTER(dev->priomap, NULL); +		if (old) +			kfree_rcu(old, rcu); +		break; +	} +	return NOTIFY_DONE; +} + +static struct notifier_block netprio_device_notifier = { +	.notifier_call = netprio_device_event +}; + +static int __init init_cgroup_netprio(void) +{ +	register_netdevice_notifier(&netprio_device_notifier); +	return 0; +} + +subsys_initcall(init_cgroup_netprio); +MODULE_LICENSE("GPL v2"); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 2953b2abc97..fc17a9d309a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -156,13 +156,17 @@  #include <linux/wait.h>  #include <linux/etherdevice.h>  #include <linux/kthread.h> +#include <linux/prefetch.h>  #include <net/net_namespace.h>  #include <net/checksum.h>  #include <net/ipv6.h> +#include <net/udp.h> +#include <net/ip6_checksum.h>  #include <net/addrconf.h>  #ifdef CONFIG_XFRM  #include <net/xfrm.h>  #endif +#include <net/netns/generic.h>  #include <asm/byteorder.h>  #include <linux/rcupdate.h>  #include <linux/bitops.h> @@ -196,6 +200,7 @@  #define F_QUEUE_MAP_RND 
(1<<13)	/* queue map Random */  #define F_QUEUE_MAP_CPU (1<<14)	/* queue map mirrors smp_processor_id() */  #define F_NODE          (1<<15)	/* Node memory alloc*/ +#define F_UDPCSUM       (1<<16)	/* Include UDP checksum */  /* Thread control flag bits */  #define T_STOP        (1<<0)	/* Stop run */ @@ -211,7 +216,6 @@  #define PKTGEN_MAGIC 0xbe9be955  #define PG_PROC_DIR "pktgen"  #define PGCTRL	    "pgctrl" -static struct proc_dir_entry *pg_proc_dir;  #define MAX_CFLOWS  65536 @@ -247,10 +251,11 @@ struct pktgen_dev {  	int removal_mark;	/* non-zero => the device is marked for  				 * removal by worker thread */ -	int min_pkt_size;	/* = ETH_ZLEN; */ -	int max_pkt_size;	/* = ETH_ZLEN; */ +	int min_pkt_size; +	int max_pkt_size;  	int pkt_overhead;	/* overhead for MPLS, VLANs, IPSEC etc */  	int nfrags; +	struct page *page;  	u64 delay;		/* nano-seconds */  	__u64 count;		/* Default No packets to send */ @@ -318,7 +323,7 @@ struct pktgen_dev {  				(see RFC 3260, sec. 4) */  	/* MPLS */ -	unsigned nr_labels;	/* Depth of stack, 0 = no MPLS */ +	unsigned int nr_labels;	/* Depth of stack, 0 = no MPLS */  	__be32 labels[MAX_MPLS_LABELS];  	/* VLAN/SVLAN (802.1Q/Q-in-Q) */ @@ -371,10 +376,10 @@ struct pktgen_dev {  				  */  	char odevname[32];  	struct flow_state *flows; -	unsigned cflows;	/* Concurrent flows (config) */ -	unsigned lflow;		/* Flow length  (config) */ -	unsigned nflows;	/* accumulated flows (stats) */ -	unsigned curfl;		/* current sequenced flow (state)*/ +	unsigned int cflows;	/* Concurrent flows (config) */ +	unsigned int lflow;		/* Flow length  (config) */ +	unsigned int nflows;	/* accumulated flows (stats) */ +	unsigned int curfl;		/* current sequenced flow (state)*/  	u16 queue_map_min;  	u16 queue_map_max; @@ -384,6 +389,9 @@ struct pktgen_dev {  #ifdef CONFIG_XFRM  	__u8	ipsmode;		/* IPSEC mode (config) */  	__u8	ipsproto;		/* IPSEC type (config) */ +	__u32	spi; +	struct dst_entry dst; +	struct dst_ops dstops;  #endif  	char result[512];  }; @@ -395,7 +403,15 @@ struct pktgen_hdr {  	__be32 tv_usec;  }; -static bool pktgen_exiting __read_mostly; + +static int pg_net_id __read_mostly; + +struct pktgen_net { +	struct net		*net; +	struct proc_dir_entry	*proc_dir; +	struct list_head	pktgen_threads; +	bool			pktgen_exiting; +};  struct pktgen_thread {  	spinlock_t if_lock;		/* for list of devices */ @@ -412,25 +428,12 @@ struct pktgen_thread {  	wait_queue_head_t queue;  	struct completion start_done; +	struct pktgen_net *net;  };  #define REMOVE 1  #define FIND   0 -static inline ktime_t ktime_now(void) -{ -	struct timespec ts; -	ktime_get_ts(&ts); - -	return timespec_to_ktime(ts); -} - -/* This works even if 32 bit because of careful byte order choice */ -static inline int ktime_lt(const ktime_t cmp1, const ktime_t cmp2) -{ -	return cmp1.tv64 < cmp2.tv64; -} -  static const char version[] =  	"Packet Generator for packet performance testing. 
"  	"Version: " VERSION "\n"; @@ -440,16 +443,13 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname);  static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,  					  const char *ifname, bool exact);  static int pktgen_device_event(struct notifier_block *, unsigned long, void *); -static void pktgen_run_all_threads(void); -static void pktgen_reset_all_threads(void); -static void pktgen_stop_all_threads_ifs(void); +static void pktgen_run_all_threads(struct pktgen_net *pn); +static void pktgen_reset_all_threads(struct pktgen_net *pn); +static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn);  static void pktgen_stop(struct pktgen_thread *t);  static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); -static unsigned int scan_ip6(const char *s, char ip[16]); -static unsigned int fmt_ip6(char *s, const char ip[16]); -  /* Module parameters, defaults. */  static int pg_count_d __read_mostly = 1000;  static int pg_delay_d __read_mostly; @@ -457,7 +457,6 @@ static int pg_clone_skb_d  __read_mostly;  static int debug  __read_mostly;  static DEFINE_MUTEX(pktgen_thread_lock); -static LIST_HEAD(pktgen_threads);  static struct notifier_block pktgen_notifier_block = {  	.notifier_call = pktgen_device_event, @@ -477,44 +476,41 @@ static int pgctrl_show(struct seq_file *seq, void *v)  static ssize_t pgctrl_write(struct file *file, const char __user *buf,  			    size_t count, loff_t *ppos)  { -	int err = 0;  	char data[128]; +	struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id); -	if (!capable(CAP_NET_ADMIN)) { -		err = -EPERM; -		goto out; -	} +	if (!capable(CAP_NET_ADMIN)) +		return -EPERM; + +	if (count == 0) +		return -EINVAL;  	if (count > sizeof(data))  		count = sizeof(data); -	if (copy_from_user(data, buf, count)) { -		err = -EFAULT; -		goto out; -	} -	data[count - 1] = 0;	/* Make string */ +	if (copy_from_user(data, buf, count)) +		return -EFAULT; + +	data[count - 1] = 0;	/* Strip trailing '\n' and terminate string */  	if (!strcmp(data, "stop")) -		pktgen_stop_all_threads_ifs(); +		pktgen_stop_all_threads_ifs(pn);  	else if (!strcmp(data, "start")) -		pktgen_run_all_threads(); +		pktgen_run_all_threads(pn);  	else if (!strcmp(data, "reset")) -		pktgen_reset_all_threads(); +		pktgen_reset_all_threads(pn);  	else  		pr_warning("Unknown command: %s\n", data); -	err = count; - -out: -	return err; +	return count;  }  static int pgctrl_open(struct inode *inode, struct file *file)  { -	return single_open(file, pgctrl_show, PDE(inode)->data); +	return single_open(file, pgctrl_show, PDE_DATA(inode));  }  static const struct file_operations pktgen_fops = { @@ -555,21 +551,13 @@ static int pktgen_if_show(struct seq_file *seq, void *v)  			   pkt_dev->skb_priority);  	if (pkt_dev->flags & F_IPV6) { -		char b1[128], b2[128], b3[128]; -		fmt_ip6(b1, pkt_dev->in6_saddr.s6_addr); -		fmt_ip6(b2, pkt_dev->min_in6_saddr.s6_addr); -		fmt_ip6(b3, pkt_dev->max_in6_saddr.s6_addr); -		seq_printf(seq, -			   "     saddr: %s  min_saddr: %s  max_saddr: %s\n", b1, -			   b2, b3); - -		fmt_ip6(b1, pkt_dev->in6_daddr.s6_addr); -		fmt_ip6(b2, pkt_dev->min_in6_daddr.s6_addr); -		fmt_ip6(b3, pkt_dev->max_in6_daddr.s6_addr);  		seq_printf(seq, -			   "     daddr: %s  min_daddr: %s  max_daddr: %s\n", b1, -			   b2, b3); - +			   "     saddr: %pI6c  min_saddr: %pI6c  max_saddr: %pI6c\n" +			   "     daddr: %pI6c  min_daddr: %pI6c  max_daddr: %pI6c\n", +			   &pkt_dev->in6_saddr, +			   &pkt_dev->min_in6_saddr, &pkt_dev->max_in6_saddr, +			   &pkt_dev->in6_daddr, 
+			   &pkt_dev->min_in6_daddr, &pkt_dev->max_in6_daddr);  	} else {  		seq_printf(seq,  			   "     dst_min: %s  dst_max: %s\n", @@ -585,7 +573,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)  		   is_zero_ether_addr(pkt_dev->src_mac) ?  			     pkt_dev->odev->dev_addr : pkt_dev->src_mac); -	seq_printf(seq, "dst_mac: "); +	seq_puts(seq, "dst_mac: ");  	seq_printf(seq, "%pM\n", pkt_dev->dst_mac);  	seq_printf(seq, @@ -599,8 +587,8 @@ static int pktgen_if_show(struct seq_file *seq, void *v)  		   pkt_dev->src_mac_count, pkt_dev->dst_mac_count);  	if (pkt_dev->nr_labels) { -		unsigned i; -		seq_printf(seq, "     mpls: "); +		unsigned int i; +		seq_puts(seq, "     mpls: ");  		for (i = 0; i < pkt_dev->nr_labels; i++)  			seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]),  				   i == pkt_dev->nr_labels-1 ? "\n" : ", "); @@ -625,66 +613,72 @@ static int pktgen_if_show(struct seq_file *seq, void *v)  	if (pkt_dev->node >= 0)  		seq_printf(seq, "     node: %d\n", pkt_dev->node); -	seq_printf(seq, "     Flags: "); +	seq_puts(seq, "     Flags: ");  	if (pkt_dev->flags & F_IPV6) -		seq_printf(seq, "IPV6  "); +		seq_puts(seq, "IPV6  ");  	if (pkt_dev->flags & F_IPSRC_RND) -		seq_printf(seq, "IPSRC_RND  "); +		seq_puts(seq, "IPSRC_RND  ");  	if (pkt_dev->flags & F_IPDST_RND) -		seq_printf(seq, "IPDST_RND  "); +		seq_puts(seq, "IPDST_RND  ");  	if (pkt_dev->flags & F_TXSIZE_RND) -		seq_printf(seq, "TXSIZE_RND  "); +		seq_puts(seq, "TXSIZE_RND  ");  	if (pkt_dev->flags & F_UDPSRC_RND) -		seq_printf(seq, "UDPSRC_RND  "); +		seq_puts(seq, "UDPSRC_RND  ");  	if (pkt_dev->flags & F_UDPDST_RND) -		seq_printf(seq, "UDPDST_RND  "); +		seq_puts(seq, "UDPDST_RND  "); + +	if (pkt_dev->flags & F_UDPCSUM) +		seq_puts(seq, "UDPCSUM  ");  	if (pkt_dev->flags & F_MPLS_RND) -		seq_printf(seq,  "MPLS_RND  "); +		seq_puts(seq,  "MPLS_RND  ");  	if (pkt_dev->flags & F_QUEUE_MAP_RND) -		seq_printf(seq,  "QUEUE_MAP_RND  "); +		seq_puts(seq,  "QUEUE_MAP_RND  ");  	if (pkt_dev->flags & F_QUEUE_MAP_CPU) -		seq_printf(seq,  "QUEUE_MAP_CPU  "); +		seq_puts(seq,  "QUEUE_MAP_CPU  ");  	if (pkt_dev->cflows) {  		if (pkt_dev->flags & F_FLOW_SEQ) -			seq_printf(seq,  "FLOW_SEQ  "); /*in sequence flows*/ +			seq_puts(seq,  "FLOW_SEQ  "); /*in sequence flows*/  		else -			seq_printf(seq,  "FLOW_RND  "); +			seq_puts(seq,  "FLOW_RND  ");  	}  #ifdef CONFIG_XFRM -	if (pkt_dev->flags & F_IPSEC_ON) -		seq_printf(seq,  "IPSEC  "); +	if (pkt_dev->flags & F_IPSEC_ON) { +		seq_puts(seq,  "IPSEC  "); +		if (pkt_dev->spi) +			seq_printf(seq, "spi:%u", pkt_dev->spi); +	}  #endif  	if (pkt_dev->flags & F_MACSRC_RND) -		seq_printf(seq, "MACSRC_RND  "); +		seq_puts(seq, "MACSRC_RND  ");  	if (pkt_dev->flags & F_MACDST_RND) -		seq_printf(seq, "MACDST_RND  "); +		seq_puts(seq, "MACDST_RND  ");  	if (pkt_dev->flags & F_VID_RND) -		seq_printf(seq, "VID_RND  "); +		seq_puts(seq, "VID_RND  ");  	if (pkt_dev->flags & F_SVID_RND) -		seq_printf(seq, "SVID_RND  "); +		seq_puts(seq, "SVID_RND  ");  	if (pkt_dev->flags & F_NODE) -		seq_printf(seq, "NODE_ALLOC  "); +		seq_puts(seq, "NODE_ALLOC  ");  	seq_puts(seq, "\n");  	/* not really stopped, more like last-running-at */ -	stopped = pkt_dev->running ? ktime_now() : pkt_dev->stopped_at; +	stopped = pkt_dev->running ? 
ktime_get() : pkt_dev->stopped_at;  	idle = pkt_dev->idle_acc;  	do_div(idle, NSEC_PER_USEC); @@ -705,13 +699,12 @@ static int pktgen_if_show(struct seq_file *seq, void *v)  		   pkt_dev->cur_src_mac_offset);  	if (pkt_dev->flags & F_IPV6) { -		char b1[128], b2[128]; -		fmt_ip6(b1, pkt_dev->cur_in6_daddr.s6_addr); -		fmt_ip6(b2, pkt_dev->cur_in6_saddr.s6_addr); -		seq_printf(seq, "     cur_saddr: %s  cur_daddr: %s\n", b2, b1); +		seq_printf(seq, "     cur_saddr: %pI6c  cur_daddr: %pI6c\n", +				&pkt_dev->cur_in6_saddr, +				&pkt_dev->cur_in6_daddr);  	} else -		seq_printf(seq, "     cur_saddr: 0x%x  cur_daddr: 0x%x\n", -			   pkt_dev->cur_saddr, pkt_dev->cur_daddr); +		seq_printf(seq, "     cur_saddr: %pI4  cur_daddr: %pI4\n", +			   &pkt_dev->cur_saddr, &pkt_dev->cur_daddr);  	seq_printf(seq, "     cur_udp_dst: %d  cur_udp_src: %d\n",  		   pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src); @@ -723,7 +716,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)  	if (pkt_dev->result[0])  		seq_printf(seq, "Result: %s\n", pkt_dev->result);  	else -		seq_printf(seq, "Result: Idle\n"); +		seq_puts(seq, "Result: Idle\n");  	return 0;  } @@ -775,8 +768,8 @@ done:  	return i;  } -static unsigned long num_arg(const char __user * user_buffer, -			     unsigned long maxlen, unsigned long *num) +static long num_arg(const char __user *user_buffer, unsigned long maxlen, +				unsigned long *num)  {  	int i;  	*num = 0; @@ -820,7 +813,7 @@ done_str:  static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev)  { -	unsigned n = 0; +	unsigned int n = 0;  	char c;  	ssize_t i = 0;  	int len; @@ -899,8 +892,8 @@ static ssize_t pktgen_if_write(struct file *file,  		if (copy_from_user(tb, user_buffer, copy))  			return -EFAULT;  		tb[copy] = 0; -		printk(KERN_DEBUG "pktgen: %s,%lu  buffer -:%s:-\n", name, -		       (unsigned long)count, tb); +		pr_debug("%s,%lu  buffer -:%s:-\n", +			 name, (unsigned long)count, tb);  	}  	if (!strcmp(name, "min_pkt_size")) { @@ -1078,7 +1071,9 @@ static ssize_t pktgen_if_write(struct file *file,  		len = num_arg(&user_buffer[i], 10, &value);  		if (len < 0)  			return len; - +		if ((value > 0) && +		    (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) +			return -ENOTSUPP;  		i += len;  		pkt_dev->clone_skb = value; @@ -1134,6 +1129,10 @@ static ssize_t pktgen_if_write(struct file *file,  		if (node_possible(value)) {  			pkt_dev->node = value;  			sprintf(pg_result, "OK: node=%d", pkt_dev->node); +			if (pkt_dev->page) { +				put_page(pkt_dev->page); +				pkt_dev->page = NULL; +			}  		}  		else  			sprintf(pg_result, "ERROR: node not possible"); @@ -1237,12 +1236,24 @@ static ssize_t pktgen_if_write(struct file *file,  		else if (strcmp(f, "!NODE_ALLOC") == 0)  			pkt_dev->flags &= ~F_NODE; +		else if (strcmp(f, "UDPCSUM") == 0) +			pkt_dev->flags |= F_UDPCSUM; + +		else if (strcmp(f, "!UDPCSUM") == 0) +			pkt_dev->flags &= ~F_UDPCSUM; +  		else {  			sprintf(pg_result,  				"Flag -:%s:- unknown\nAvailable flags, (prepend ! 
to un-set flag):\n%s",  				f,  				"IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, " -				"MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC, NODE_ALLOC\n"); +				"MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, " +				"MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, " +				"QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, " +#ifdef CONFIG_XFRM +				"IPSEC, " +#endif +				"NODE_ALLOC\n");  			return count;  		}  		sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); @@ -1263,8 +1274,7 @@ static ssize_t pktgen_if_write(struct file *file,  			pkt_dev->cur_daddr = pkt_dev->daddr_min;  		}  		if (debug) -			printk(KERN_DEBUG "pktgen: dst_min set to: %s\n", -			       pkt_dev->dst_min); +			pr_debug("dst_min set to: %s\n", pkt_dev->dst_min);  		i += len;  		sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min);  		return count; @@ -1286,8 +1296,7 @@ static ssize_t pktgen_if_write(struct file *file,  			pkt_dev->cur_daddr = pkt_dev->daddr_max;  		}  		if (debug) -			printk(KERN_DEBUG "pktgen: dst_max set to: %s\n", -			       pkt_dev->dst_max); +			pr_debug("dst_max set to: %s\n", pkt_dev->dst_max);  		i += len;  		sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max);  		return count; @@ -1303,13 +1312,13 @@ static ssize_t pktgen_if_write(struct file *file,  			return -EFAULT;  		buf[len] = 0; -		scan_ip6(buf, pkt_dev->in6_daddr.s6_addr); -		fmt_ip6(buf, pkt_dev->in6_daddr.s6_addr); +		in6_pton(buf, -1, pkt_dev->in6_daddr.s6_addr, -1, NULL); +		snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr); -		ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr); +		pkt_dev->cur_in6_daddr = pkt_dev->in6_daddr;  		if (debug) -			printk(KERN_DEBUG "pktgen: dst6 set to: %s\n", buf); +			pr_debug("dst6 set to: %s\n", buf);  		i += len;  		sprintf(pg_result, "OK: dst6=%s", buf); @@ -1326,13 +1335,12 @@ static ssize_t pktgen_if_write(struct file *file,  			return -EFAULT;  		buf[len] = 0; -		scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); -		fmt_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); +		in6_pton(buf, -1, pkt_dev->min_in6_daddr.s6_addr, -1, NULL); +		snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr); -		ipv6_addr_copy(&pkt_dev->cur_in6_daddr, -			       &pkt_dev->min_in6_daddr); +		pkt_dev->cur_in6_daddr = pkt_dev->min_in6_daddr;  		if (debug) -			printk(KERN_DEBUG "pktgen: dst6_min set to: %s\n", buf); +			pr_debug("dst6_min set to: %s\n", buf);  		i += len;  		sprintf(pg_result, "OK: dst6_min=%s", buf); @@ -1349,11 +1357,11 @@ static ssize_t pktgen_if_write(struct file *file,  			return -EFAULT;  		buf[len] = 0; -		scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr); -		fmt_ip6(buf, pkt_dev->max_in6_daddr.s6_addr); +		in6_pton(buf, -1, pkt_dev->max_in6_daddr.s6_addr, -1, NULL); +		snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_daddr);  		if (debug) -			printk(KERN_DEBUG "pktgen: dst6_max set to: %s\n", buf); +			pr_debug("dst6_max set to: %s\n", buf);  		i += len;  		sprintf(pg_result, "OK: dst6_max=%s", buf); @@ -1370,13 +1378,13 @@ static ssize_t pktgen_if_write(struct file *file,  			return -EFAULT;  		buf[len] = 0; -		scan_ip6(buf, pkt_dev->in6_saddr.s6_addr); -		fmt_ip6(buf, pkt_dev->in6_saddr.s6_addr); +		in6_pton(buf, -1, pkt_dev->in6_saddr.s6_addr, -1, NULL); +		snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr); -		ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr); +		pkt_dev->cur_in6_saddr = pkt_dev->in6_saddr;  		if (debug) -			printk(KERN_DEBUG "pktgen: src6 set to: %s\n", buf); +			pr_debug("src6 set to: %s\n", buf);  		i += len;  		
sprintf(pg_result, "OK: src6=%s", buf); @@ -1397,8 +1405,7 @@ static ssize_t pktgen_if_write(struct file *file,  			pkt_dev->cur_saddr = pkt_dev->saddr_min;  		}  		if (debug) -			printk(KERN_DEBUG "pktgen: src_min set to: %s\n", -			       pkt_dev->src_min); +			pr_debug("src_min set to: %s\n", pkt_dev->src_min);  		i += len;  		sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min);  		return count; @@ -1418,18 +1425,12 @@ static ssize_t pktgen_if_write(struct file *file,  			pkt_dev->cur_saddr = pkt_dev->saddr_max;  		}  		if (debug) -			printk(KERN_DEBUG "pktgen: src_max set to: %s\n", -			       pkt_dev->src_max); +			pr_debug("src_max set to: %s\n", pkt_dev->src_max);  		i += len;  		sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max);  		return count;  	}  	if (!strcmp(name, "dst_mac")) { -		char *v = valstr; -		unsigned char old_dmac[ETH_ALEN]; -		unsigned char *m = pkt_dev->dst_mac; -		memcpy(old_dmac, pkt_dev->dst_mac, ETH_ALEN); -  		len = strn_len(&user_buffer[i], sizeof(valstr) - 1);  		if (len < 0)  			return len; @@ -1437,35 +1438,16 @@ static ssize_t pktgen_if_write(struct file *file,  		memset(valstr, 0, sizeof(valstr));  		if (copy_from_user(valstr, &user_buffer[i], len))  			return -EFAULT; -		i += len; - -		for (*m = 0; *v && m < pkt_dev->dst_mac + 6; v++) { -			int value; - -			value = hex_to_bin(*v); -			if (value >= 0) -				*m = *m * 16 + value; - -			if (*v == ':') { -				m++; -				*m = 0; -			} -		} +		if (!mac_pton(valstr, pkt_dev->dst_mac)) +			return -EINVAL;  		/* Set up Dest MAC */ -		if (compare_ether_addr(old_dmac, pkt_dev->dst_mac)) -			memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN); +		ether_addr_copy(&pkt_dev->hh[0], pkt_dev->dst_mac); -		sprintf(pg_result, "OK: dstmac"); +		sprintf(pg_result, "OK: dstmac %pM", pkt_dev->dst_mac);  		return count;  	}  	if (!strcmp(name, "src_mac")) { -		char *v = valstr; -		unsigned char old_smac[ETH_ALEN]; -		unsigned char *m = pkt_dev->src_mac; - -		memcpy(old_smac, pkt_dev->src_mac, ETH_ALEN); -  		len = strn_len(&user_buffer[i], sizeof(valstr) - 1);  		if (len < 0)  			return len; @@ -1473,26 +1455,13 @@ static ssize_t pktgen_if_write(struct file *file,  		memset(valstr, 0, sizeof(valstr));  		if (copy_from_user(valstr, &user_buffer[i], len))  			return -EFAULT; -		i += len; - -		for (*m = 0; *v && m < pkt_dev->src_mac + 6; v++) { -			int value; - -			value = hex_to_bin(*v); -			if (value >= 0) -				*m = *m * 16 + value; - -			if (*v == ':') { -				m++; -				*m = 0; -			} -		} +		if (!mac_pton(valstr, pkt_dev->src_mac)) +			return -EINVAL;  		/* Set up Src MAC */ -		if (compare_ether_addr(old_smac, pkt_dev->src_mac)) -			memcpy(&(pkt_dev->hh[6]), pkt_dev->src_mac, ETH_ALEN); +		ether_addr_copy(&pkt_dev->hh[6], pkt_dev->src_mac); -		sprintf(pg_result, "OK: srcmac"); +		sprintf(pg_result, "OK: srcmac %pM", pkt_dev->src_mac);  		return count;  	} @@ -1515,7 +1484,18 @@ static ssize_t pktgen_if_write(struct file *file,  		sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows);  		return count;  	} +#ifdef CONFIG_XFRM +	if (!strcmp(name, "spi")) { +		len = num_arg(&user_buffer[i], 10, &value); +		if (len < 0) +			return len; +		i += len; +		pkt_dev->spi = value; +		sprintf(pg_result, "OK: spi=%u", pkt_dev->spi); +		return count; +	} +#endif  	if (!strcmp(name, "flowlen")) {  		len = num_arg(&user_buffer[i], 10, &value);  		if (len < 0) @@ -1550,7 +1530,7 @@ static ssize_t pktgen_if_write(struct file *file,  	}  	if (!strcmp(name, "mpls")) { -		unsigned n, cnt; +		unsigned int n, cnt;  		len = 
get_labels(&user_buffer[i], pkt_dev);  		if (len < 0) @@ -1567,7 +1547,7 @@ static ssize_t pktgen_if_write(struct file *file,  			pkt_dev->svlan_id = 0xffff;  			if (debug) -				printk(KERN_DEBUG "pktgen: VLAN/SVLAN auto turned off\n"); +				pr_debug("VLAN/SVLAN auto turned off\n");  		}  		return count;  	} @@ -1582,10 +1562,10 @@ static ssize_t pktgen_if_write(struct file *file,  			pkt_dev->vlan_id = value;  /* turn on VLAN */  			if (debug) -				printk(KERN_DEBUG "pktgen: VLAN turned on\n"); +				pr_debug("VLAN turned on\n");  			if (debug && pkt_dev->nr_labels) -				printk(KERN_DEBUG "pktgen: MPLS auto turned off\n"); +				pr_debug("MPLS auto turned off\n");  			pkt_dev->nr_labels = 0;    /* turn off MPLS */  			sprintf(pg_result, "OK: vlan_id=%u", pkt_dev->vlan_id); @@ -1594,7 +1574,7 @@ static ssize_t pktgen_if_write(struct file *file,  			pkt_dev->svlan_id = 0xffff;  			if (debug) -				printk(KERN_DEBUG "pktgen: VLAN/SVLAN turned off\n"); +				pr_debug("VLAN/SVLAN turned off\n");  		}  		return count;  	} @@ -1639,10 +1619,10 @@ static ssize_t pktgen_if_write(struct file *file,  			pkt_dev->svlan_id = value;  /* turn on SVLAN */  			if (debug) -				printk(KERN_DEBUG "pktgen: SVLAN turned on\n"); +				pr_debug("SVLAN turned on\n");  			if (debug && pkt_dev->nr_labels) -				printk(KERN_DEBUG "pktgen: MPLS auto turned off\n"); +				pr_debug("MPLS auto turned off\n");  			pkt_dev->nr_labels = 0;    /* turn off MPLS */  			sprintf(pg_result, "OK: svlan_id=%u", pkt_dev->svlan_id); @@ -1651,7 +1631,7 @@ static ssize_t pktgen_if_write(struct file *file,  			pkt_dev->svlan_id = 0xffff;  			if (debug) -				printk(KERN_DEBUG "pktgen: VLAN/SVLAN turned off\n"); +				pr_debug("VLAN/SVLAN turned off\n");  		}  		return count;  	} @@ -1736,7 +1716,7 @@ static ssize_t pktgen_if_write(struct file *file,  static int pktgen_if_open(struct inode *inode, struct file *file)  { -	return single_open(file, pktgen_if_show, PDE(inode)->data); +	return single_open(file, pktgen_if_show, PDE_DATA(inode));  }  static const struct file_operations pktgen_if_fops = { @@ -1755,14 +1735,14 @@ static int pktgen_thread_show(struct seq_file *seq, void *v)  	BUG_ON(!t); -	seq_printf(seq, "Running: "); +	seq_puts(seq, "Running: ");  	if_lock(t);  	list_for_each_entry(pkt_dev, &t->if_list, list)  		if (pkt_dev->running)  			seq_printf(seq, "%s ", pkt_dev->odevname); -	seq_printf(seq, "\nStopped: "); +	seq_puts(seq, "\nStopped: ");  	list_for_each_entry(pkt_dev, &t->if_list, list)  		if (!pkt_dev->running) @@ -1771,7 +1751,7 @@ static int pktgen_thread_show(struct seq_file *seq, void *v)  	if (t->result[0])  		seq_printf(seq, "\nResult: %s\n", t->result);  	else -		seq_printf(seq, "\nResult: NA\n"); +		seq_puts(seq, "\nResult: NA\n");  	if_unlock(t); @@ -1819,8 +1799,7 @@ static ssize_t pktgen_thread_write(struct file *file,  	i += len;  	if (debug) -		printk(KERN_DEBUG "pktgen: t=%s, count=%lu\n", -		       name, (unsigned long)count); +		pr_debug("t=%s, count=%lu\n", name, (unsigned long)count);  	if (!t) {  		pr_err("ERROR: No thread\n"); @@ -1842,10 +1821,13 @@ static ssize_t pktgen_thread_write(struct file *file,  			return -EFAULT;  		i += len;  		mutex_lock(&pktgen_thread_lock); -		pktgen_add_device(t, f); +		ret = pktgen_add_device(t, f);  		mutex_unlock(&pktgen_thread_lock); -		ret = count; -		sprintf(pg_result, "OK: add_device=%s", f); +		if (!ret) { +			ret = count; +			sprintf(pg_result, "OK: add_device=%s", f); +		} else +			sprintf(pg_result, "ERROR: can not add device %s", f);  		goto out;  	} @@ -1872,7 
+1854,7 @@ out:  static int pktgen_thread_open(struct inode *inode, struct file *file)  { -	return single_open(file, pktgen_thread_show, PDE(inode)->data); +	return single_open(file, pktgen_thread_show, PDE_DATA(inode));  }  static const struct file_operations pktgen_thread_fops = { @@ -1885,13 +1867,14 @@ static const struct file_operations pktgen_thread_fops = {  };  /* Think find or remove for NN */ -static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove) +static struct pktgen_dev *__pktgen_NN_threads(const struct pktgen_net *pn, +					      const char *ifname, int remove)  {  	struct pktgen_thread *t;  	struct pktgen_dev *pkt_dev = NULL;  	bool exact = (remove == FIND); -	list_for_each_entry(t, &pktgen_threads, th_list) { +	list_for_each_entry(t, &pn->pktgen_threads, th_list) {  		pkt_dev = pktgen_find_dev(t, ifname, exact);  		if (pkt_dev) {  			if (remove) { @@ -1909,7 +1892,7 @@ static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove)  /*   * mark a device for removal   */ -static void pktgen_mark_device(const char *ifname) +static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname)  {  	struct pktgen_dev *pkt_dev = NULL;  	const int max_tries = 10, msec_per_try = 125; @@ -1920,7 +1903,7 @@ static void pktgen_mark_device(const char *ifname)  	while (1) { -		pkt_dev = __pktgen_NN_threads(ifname, REMOVE); +		pkt_dev = __pktgen_NN_threads(pn, ifname, REMOVE);  		if (pkt_dev == NULL)  			break;	/* success */ @@ -1941,21 +1924,21 @@ static void pktgen_mark_device(const char *ifname)  	mutex_unlock(&pktgen_thread_lock);  } -static void pktgen_change_name(struct net_device *dev) +static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *dev)  {  	struct pktgen_thread *t; -	list_for_each_entry(t, &pktgen_threads, th_list) { +	list_for_each_entry(t, &pn->pktgen_threads, th_list) {  		struct pktgen_dev *pkt_dev;  		list_for_each_entry(pkt_dev, &t->if_list, list) {  			if (pkt_dev->odev != dev)  				continue; -			remove_proc_entry(pkt_dev->entry->name, pg_proc_dir); +			proc_remove(pkt_dev->entry);  			pkt_dev->entry = proc_create_data(dev->name, 0600, -							  pg_proc_dir, +							  pn->proc_dir,  							  &pktgen_if_fops,  							  pkt_dev);  			if (!pkt_dev->entry) @@ -1969,9 +1952,10 @@ static void pktgen_change_name(struct net_device *dev)  static int pktgen_device_event(struct notifier_block *unused,  			       unsigned long event, void *ptr)  { -	struct net_device *dev = ptr; +	struct net_device *dev = netdev_notifier_info_to_dev(ptr); +	struct pktgen_net *pn = net_generic(dev_net(dev), pg_net_id); -	if (!net_eq(dev_net(dev), &init_net)) +	if (pn->pktgen_exiting)  		return NOTIFY_DONE;  	/* It is OK that we do not hold the group lock right now, @@ -1980,18 +1964,19 @@ static int pktgen_device_event(struct notifier_block *unused,  	switch (event) {  	case NETDEV_CHANGENAME: -		pktgen_change_name(dev); +		pktgen_change_name(pn, dev);  		break;  	case NETDEV_UNREGISTER: -		pktgen_mark_device(dev->name); +		pktgen_mark_device(pn, dev->name);  		break;  	}  	return NOTIFY_DONE;  } -static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev, +static struct net_device *pktgen_dev_get_by_name(const struct pktgen_net *pn, +						 struct pktgen_dev *pkt_dev,  						 const char *ifname)  {  	char b[IFNAMSIZ+5]; @@ -2005,13 +1990,14 @@ static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev,  	}  	b[i] = 0; -	return dev_get_by_name(&init_net, b); +	return 
dev_get_by_name(pn->net, b);  }  /* Associate pktgen_dev with a device. */ -static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname) +static int pktgen_setup_dev(const struct pktgen_net *pn, +			    struct pktgen_dev *pkt_dev, const char *ifname)  {  	struct net_device *odev;  	int err; @@ -2022,7 +2008,7 @@ static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname)  		pkt_dev->odev = NULL;  	} -	odev = pktgen_dev_get_by_name(pkt_dev, ifname); +	odev = pktgen_dev_get_by_name(pn, pkt_dev, ifname);  	if (!odev) {  		pr_err("no such netdevice: \"%s\"\n", ifname);  		return -ENODEV; @@ -2064,36 +2050,34 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)  		pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",  			   pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq,  			   pkt_dev->odevname); -		pkt_dev->queue_map_min = ntxq - 1; +		pkt_dev->queue_map_min = (ntxq ?: 1) - 1;  	}  	if (pkt_dev->queue_map_max >= ntxq) {  		pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",  			   pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq,  			   pkt_dev->odevname); -		pkt_dev->queue_map_max = ntxq - 1; +		pkt_dev->queue_map_max = (ntxq ?: 1) - 1;  	}  	/* Default to the interface's mac if not explicitly set. */  	if (is_zero_ether_addr(pkt_dev->src_mac)) -		memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, ETH_ALEN); +		ether_addr_copy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr);  	/* Set up Dest MAC */ -	memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN); - -	/* Set up pkt size */ -	pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size; +	ether_addr_copy(&(pkt_dev->hh[0]), pkt_dev->dst_mac);  	if (pkt_dev->flags & F_IPV6) { -		/* -		 * Skip this automatic address setting until locks or functions -		 * gets exported -		 */ - -#ifdef NOTNOW  		int i, set = 0, err = 1;  		struct inet6_dev *idev; +		if (pkt_dev->min_pkt_size == 0) { +			pkt_dev->min_pkt_size = 14 + sizeof(struct ipv6hdr) +						+ sizeof(struct udphdr) +						+ sizeof(struct pktgen_hdr) +						+ pkt_dev->pkt_overhead; +		} +  		for (i = 0; i < IN6_ADDR_HSIZE; i++)  			if (pkt_dev->cur_in6_saddr.s6_addr[i]) {  				set = 1; @@ -2114,13 +2098,10 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)  				struct inet6_ifaddr *ifp;  				read_lock_bh(&idev->lock); -				for (ifp = idev->addr_list; ifp; -				     ifp = ifp->if_next) { -					if (ifp->scope == IFA_LINK && +				list_for_each_entry(ifp, &idev->addr_list, if_list) { +					if ((ifp->scope & IFA_LINK) &&  					    !(ifp->flags & IFA_F_TENTATIVE)) { -						ipv6_addr_copy(&pkt_dev-> -							       cur_in6_saddr, -							       &ifp->addr); +						pkt_dev->cur_in6_saddr = ifp->addr;  						err = 0;  						break;  					} @@ -2131,8 +2112,14 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)  			if (err)  				pr_err("ERROR: IPv6 link address not available\n");  		} -#endif  	} else { +		if (pkt_dev->min_pkt_size == 0) { +			pkt_dev->min_pkt_size = 14 + sizeof(struct iphdr) +						+ sizeof(struct udphdr) +						+ sizeof(struct pktgen_hdr) +						+ pkt_dev->pkt_overhead; +		} +  		pkt_dev->saddr_min = 0;  		pkt_dev->saddr_max = 0;  		if (strlen(pkt_dev->src_min) == 0) { @@ -2158,6 +2145,10 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)  		pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);  	}  	/* Initialize current values. 
*/ +	pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size; +	if (pkt_dev->min_pkt_size > pkt_dev->max_pkt_size) +		pkt_dev->max_pkt_size = pkt_dev->min_pkt_size; +  	pkt_dev->cur_dst_mac_offset = 0;  	pkt_dev->cur_src_mac_offset = 0;  	pkt_dev->cur_saddr = pkt_dev->saddr_min; @@ -2183,10 +2174,13 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)  		return;  	} -	start_time = ktime_now(); -	if (remaining < 100000) -		ndelay(remaining);	/* really small just spin */ -	else { +	start_time = ktime_get(); +	if (remaining < 100000) { +		/* for small delays (<100us), just loop until limit is reached */ +		do { +			end_time = ktime_get(); +		} while (ktime_compare(end_time, spin_until) < 0); +	} else {  		/* see do_nanosleep */  		hrtimer_init_sleeper(&t, current);  		do { @@ -2201,8 +2195,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)  			hrtimer_cancel(&t.timer);  		} while (t.task && pkt_dev->running && !signal_pending(current));  		__set_current_state(TASK_RUNNING); +		end_time = ktime_get();  	} -	end_time = ktime_now();  	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time));  	pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); @@ -2235,7 +2229,7 @@ static inline int f_pick(struct pktgen_dev *pkt_dev)  				pkt_dev->curfl = 0; /*reset */  		}  	} else { -		flow = random32() % pkt_dev->cflows; +		flow = prandom_u32() % pkt_dev->cflows;  		pkt_dev->curfl = flow;  		if (pkt_dev->flows[flow].count > pkt_dev->lflow) { @@ -2256,14 +2250,23 @@ static inline int f_pick(struct pktgen_dev *pkt_dev)  static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)  {  	struct xfrm_state *x = pkt_dev->flows[flow].x; +	struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id);  	if (!x) { -		/*slow path: we dont already have xfrm_state*/ -		x = xfrm_stateonly_find(&init_net, DUMMY_MARK, -					(xfrm_address_t *)&pkt_dev->cur_daddr, -					(xfrm_address_t *)&pkt_dev->cur_saddr, -					AF_INET, -					pkt_dev->ipsmode, -					pkt_dev->ipsproto, 0); + +		if (pkt_dev->spi) { +			/* We need as quick as possible to find the right SA +			 * Searching with minimum criteria to archieve this. 
+			 */ +			x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET); +		} else { +			/* slow path: we dont already have xfrm_state */ +			x = xfrm_stateonly_find(pn->net, DUMMY_MARK, +						(xfrm_address_t *)&pkt_dev->cur_daddr, +						(xfrm_address_t *)&pkt_dev->cur_saddr, +						AF_INET, +						pkt_dev->ipsmode, +						pkt_dev->ipsproto, 0); +		}  		if (x) {  			pkt_dev->flows[flow].x = x;  			set_pkt_overhead(pkt_dev); @@ -2282,7 +2285,7 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev)  	else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) {  		__u16 t;  		if (pkt_dev->flags & F_QUEUE_MAP_RND) { -			t = random32() % +			t = prandom_u32() %  				(pkt_dev->queue_map_max -  				 pkt_dev->queue_map_min + 1)  				+ pkt_dev->queue_map_min; @@ -2314,7 +2317,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)  		__u32 tmp;  		if (pkt_dev->flags & F_MACSRC_RND) -			mc = random32() % pkt_dev->src_mac_count; +			mc = prandom_u32() % pkt_dev->src_mac_count;  		else {  			mc = pkt_dev->cur_src_mac_offset++;  			if (pkt_dev->cur_src_mac_offset >= @@ -2340,7 +2343,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)  		__u32 tmp;  		if (pkt_dev->flags & F_MACDST_RND) -			mc = random32() % pkt_dev->dst_mac_count; +			mc = prandom_u32() % pkt_dev->dst_mac_count;  		else {  			mc = pkt_dev->cur_dst_mac_offset++; @@ -2363,25 +2366,25 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)  	}  	if (pkt_dev->flags & F_MPLS_RND) { -		unsigned i; +		unsigned int i;  		for (i = 0; i < pkt_dev->nr_labels; i++)  			if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM)  				pkt_dev->labels[i] = MPLS_STACK_BOTTOM | -					     ((__force __be32)random32() & +					     ((__force __be32)prandom_u32() &  						      htonl(0x000fffff));  	}  	if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) { -		pkt_dev->vlan_id = random32() & (4096-1); +		pkt_dev->vlan_id = prandom_u32() & (4096 - 1);  	}  	if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) { -		pkt_dev->svlan_id = random32() & (4096 - 1); +		pkt_dev->svlan_id = prandom_u32() & (4096 - 1);  	}  	if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) {  		if (pkt_dev->flags & F_UDPSRC_RND) -			pkt_dev->cur_udp_src = random32() % +			pkt_dev->cur_udp_src = prandom_u32() %  				(pkt_dev->udp_src_max - pkt_dev->udp_src_min)  				+ pkt_dev->udp_src_min; @@ -2394,7 +2397,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)  	if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) {  		if (pkt_dev->flags & F_UDPDST_RND) { -			pkt_dev->cur_udp_dst = random32() % +			pkt_dev->cur_udp_dst = prandom_u32() %  				(pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)  				+ pkt_dev->udp_dst_min;  		} else { @@ -2411,7 +2414,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)  		if (imn < imx) {  			__u32 t;  			if (pkt_dev->flags & F_IPSRC_RND) -				t = random32() % (imx - imn) + imn; +				t = prandom_u32() % (imx - imn) + imn;  			else {  				t = ntohl(pkt_dev->cur_saddr);  				t++; @@ -2432,17 +2435,15 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)  				__be32 s;  				if (pkt_dev->flags & F_IPDST_RND) { -					t = random32() % (imx - imn) + imn; -					s = htonl(t); - -					while (ipv4_is_loopback(s) || -					       ipv4_is_multicast(s) || -					       ipv4_is_lbcast(s) || -					       ipv4_is_zeronet(s) || -					       ipv4_is_local_multicast(s)) { -						t = random32() % (imx - imn) + imn; +					do { +						t = prandom_u32() % +							(imx - imn) + imn;  						s = htonl(t); -					} +					} while 
(ipv4_is_loopback(s) || +						ipv4_is_multicast(s) || +						ipv4_is_lbcast(s) || +						ipv4_is_zeronet(s) || +						ipv4_is_local_multicast(s));  					pkt_dev->cur_daddr = s;  				} else {  					t = ntohl(pkt_dev->cur_daddr); @@ -2466,18 +2467,14 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)  		}  	} else {		/* IPV6 * */ -		if (pkt_dev->min_in6_daddr.s6_addr32[0] == 0 && -		    pkt_dev->min_in6_daddr.s6_addr32[1] == 0 && -		    pkt_dev->min_in6_daddr.s6_addr32[2] == 0 && -		    pkt_dev->min_in6_daddr.s6_addr32[3] == 0) ; -		else { +		if (!ipv6_addr_any(&pkt_dev->min_in6_daddr)) {  			int i;  			/* Only random destinations yet */  			for (i = 0; i < 4; i++) {  				pkt_dev->cur_in6_daddr.s6_addr32[i] = -				    (((__force __be32)random32() | +				    (((__force __be32)prandom_u32() |  				      pkt_dev->min_in6_daddr.s6_addr32[i]) &  				     pkt_dev->max_in6_daddr.s6_addr32[i]);  			} @@ -2487,7 +2484,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)  	if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) {  		__u32 t;  		if (pkt_dev->flags & F_TXSIZE_RND) { -			t = random32() % +			t = prandom_u32() %  				(pkt_dev->max_pkt_size - pkt_dev->min_pkt_size)  				+ pkt_dev->min_pkt_size;  		} else { @@ -2505,33 +2502,47 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)  #ifdef CONFIG_XFRM +static u32 pktgen_dst_metrics[RTAX_MAX + 1] = { + +	[RTAX_HOPLIMIT] = 0x5, /* Set a static hoplimit */ +}; +  static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)  {  	struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;  	int err = 0; -	struct iphdr *iph; +	struct net *net = dev_net(pkt_dev->odev);  	if (!x)  		return 0;  	/* XXX: we dont support tunnel mode for now until  	 * we resolve the dst issue */ -	if (x->props.mode != XFRM_MODE_TRANSPORT) +	if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0))  		return 0; -	spin_lock(&x->lock); -	iph = ip_hdr(skb); +	/* But when user specify an valid SPI, transformation +	 * supports both transport/tunnel mode + ESP/AH type. 
+	 */ +	if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0)) +		skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF; +	rcu_read_lock_bh();  	err = x->outer_mode->output(x, skb); -	if (err) +	rcu_read_unlock_bh(); +	if (err) { +		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR);  		goto error; +	}  	err = x->type->output(x, skb); -	if (err) +	if (err) { +		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEPROTOERROR);  		goto error; - +	} +	spin_lock_bh(&x->lock);  	x->curlft.bytes += skb->len;  	x->curlft.packets++; +	spin_unlock_bh(&x->lock);  error: -	spin_unlock(&x->lock);  	return err;  } @@ -2559,6 +2570,8 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,  		if (x) {  			int ret;  			__u8 *eth; +			struct iphdr *iph; +  			nhead = x->props.header_len - skb_headroom(skb);  			if (nhead > 0) {  				ret = pskb_expand_head(skb, nhead, 0, GFP_ATOMIC); @@ -2580,6 +2593,11 @@  			eth = (__u8 *) skb_push(skb, ETH_HLEN);  			memcpy(eth, pkt_dev->hh, 12);  			*(u16 *) &eth[12] = protocol; + +			/* Update IPv4 header len as well as checksum value */ +			iph = ip_hdr(skb); +			iph->tot_len = htons(skb->len - ETH_HLEN); +			ip_send_check(iph);  		}  	}  	return 1; @@ -2591,7 +2609,7 @@ err:  static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev)  { -	unsigned i; +	unsigned int i;  	for (i = 0; i < pkt_dev->nr_labels; i++)  		*mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM; @@ -2605,6 +2623,95 @@ static inline __be16 build_tci(unsigned int id, unsigned int cfi,  	return htons(id | (cfi << 12) | (prio << 13));  } +static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, +				int datalen) +{ +	struct timeval timestamp; +	struct pktgen_hdr *pgh; + +	pgh = (struct pktgen_hdr *)skb_put(skb, sizeof(*pgh)); +	datalen -= sizeof(*pgh); + +	if (pkt_dev->nfrags <= 0) { +		memset(skb_put(skb, datalen), 0, datalen); +	} else { +		int frags = pkt_dev->nfrags; +		int i, len; +		int frag_len; + + +		if (frags > MAX_SKB_FRAGS) +			frags = MAX_SKB_FRAGS; +		len = datalen - frags * PAGE_SIZE; +		if (len > 0) { +			memset(skb_put(skb, len), 0, len); +			datalen = frags * PAGE_SIZE; +		} + +		i = 0; +		frag_len = (datalen/frags) < PAGE_SIZE ? +			   (datalen/frags) : PAGE_SIZE; +		while (datalen > 0) { +			if (unlikely(!pkt_dev->page)) { +				int node = numa_node_id(); + +				if (pkt_dev->node >= 0 && (pkt_dev->flags & F_NODE)) +					node = pkt_dev->node; +				pkt_dev->page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); +				if (!pkt_dev->page) +					break; +			} +			get_page(pkt_dev->page); +			skb_frag_set_page(skb, i, pkt_dev->page); +			skb_shinfo(skb)->frags[i].page_offset = 0; +			/*last fragment, fill rest of data*/ +			if (i == (frags - 1)) +				skb_frag_size_set(&skb_shinfo(skb)->frags[i], +				    (datalen < PAGE_SIZE ? 
datalen : PAGE_SIZE)); +			else +				skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len); +			datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]); +			skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]); +			skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]); +			i++; +			skb_shinfo(skb)->nr_frags = i; +		} +	} + +	/* Stamp the time, and sequence number, +	 * convert them to network byte order +	 */ +	pgh->pgh_magic = htonl(PKTGEN_MAGIC); +	pgh->seq_num = htonl(pkt_dev->seq_num); + +	do_gettimeofday(&timestamp); +	pgh->tv_sec = htonl(timestamp.tv_sec); +	pgh->tv_usec = htonl(timestamp.tv_usec); +} + +static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, +					struct pktgen_dev *pkt_dev, +					unsigned int extralen) +{ +	struct sk_buff *skb = NULL; +	unsigned int size = pkt_dev->cur_pkt_size + 64 + extralen + +			    pkt_dev->pkt_overhead; + +	if (pkt_dev->flags & F_NODE) { +		int node = pkt_dev->node >= 0 ? pkt_dev->node : numa_node_id(); + +		skb = __alloc_skb(NET_SKB_PAD + size, GFP_NOWAIT, 0, node); +		if (likely(skb)) { +			skb_reserve(skb, NET_SKB_PAD); +			skb->dev = dev; +		} +	} else { +		 skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); +	} + +	return skb; +} +  static struct sk_buff *fill_packet_ipv4(struct net_device *odev,  					struct pktgen_dev *pkt_dev)  { @@ -2613,7 +2720,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,  	struct udphdr *udph;  	int datalen, iplen;  	struct iphdr *iph; -	struct pktgen_hdr *pgh = NULL;  	__be16 protocol = htons(ETH_P_IP);  	__be32 *mpls;  	__be16 *vlan_tci = NULL;                 /* Encapsulates priority and VLAN ID */ @@ -2636,31 +2742,13 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,  	datalen = (odev->hard_header_len + 16) & ~0xf; -	if (pkt_dev->flags & F_NODE) { -		int node; - -		if (pkt_dev->node >= 0) -			node = pkt_dev->node; -		else -			node =  numa_node_id(); - -		skb = __alloc_skb(NET_SKB_PAD + pkt_dev->cur_pkt_size + 64 -				  + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT, 0, node); -		if (likely(skb)) { -			skb_reserve(skb, NET_SKB_PAD); -			skb->dev = odev; -		} -	} -	else -	  skb = __netdev_alloc_skb(odev, -				   pkt_dev->cur_pkt_size + 64 -				   + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT); - +	skb = pktgen_alloc_skb(odev, pkt_dev, datalen);  	if (!skb) {  		sprintf(pkt_dev->result, "No memory");  		return NULL;  	} +	prefetchw(skb->data);  	skb_reserve(skb, datalen);  	/*  Reserve for ethernet and IP header  */ @@ -2686,28 +2774,28 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,  		*vlan_encapsulated_proto = htons(ETH_P_IP);  	} -	skb->network_header = skb->tail; -	skb->transport_header = skb->network_header + sizeof(struct iphdr); -	skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr)); +	skb_set_mac_header(skb, 0); +	skb_set_network_header(skb, skb->len); +	iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr)); + +	skb_set_transport_header(skb, skb->len); +	udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr));  	skb_set_queue_mapping(skb, queue_map);  	skb->priority = pkt_dev->skb_priority; -	iph = ip_hdr(skb); -	udph = udp_hdr(skb); -  	memcpy(eth, pkt_dev->hh, 12);  	*(__be16 *) & eth[12] = protocol;  	/* Eth + IPh + UDPh + mpls */  	datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 -  		  pkt_dev->pkt_overhead; -	if (datalen < sizeof(struct pktgen_hdr)) +	if (datalen < 0 || datalen < sizeof(struct pktgen_hdr))  		datalen = sizeof(struct pktgen_hdr);  	udph->source = htons(pkt_dev->cur_udp_src);  	udph->dest = 
htons(pkt_dev->cur_udp_dst);  	udph->len = htons(datalen + 8);	/* DATA + udphdr */ -	udph->check = 0;	/* No checksum */ +	udph->check = 0;  	iph->ihl = 5;  	iph->version = 4; @@ -2721,83 +2809,29 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,  	iph->frag_off = 0;  	iplen = 20 + 8 + datalen;  	iph->tot_len = htons(iplen); -	iph->check = 0; -	iph->check = ip_fast_csum((void *)iph, iph->ihl); +	ip_send_check(iph);  	skb->protocol = protocol; -	skb->mac_header = (skb->network_header - ETH_HLEN - -			   pkt_dev->pkt_overhead);  	skb->dev = odev;  	skb->pkt_type = PACKET_HOST; -	if (pkt_dev->nfrags <= 0) { -		pgh = (struct pktgen_hdr *)skb_put(skb, datalen); -		memset(pgh + 1, 0, datalen - sizeof(struct pktgen_hdr)); +	if (!(pkt_dev->flags & F_UDPCSUM)) { +		skb->ip_summed = CHECKSUM_NONE; +	} else if (odev->features & NETIF_F_V4_CSUM) { +		skb->ip_summed = CHECKSUM_PARTIAL; +		skb->csum = 0; +		udp4_hwcsum(skb, udph->source, udph->dest);  	} else { -		int frags = pkt_dev->nfrags; -		int i, len; - -		pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8); - -		if (frags > MAX_SKB_FRAGS) -			frags = MAX_SKB_FRAGS; -		if (datalen > frags * PAGE_SIZE) { -			len = datalen - frags * PAGE_SIZE; -			memset(skb_put(skb, len), 0, len); -			datalen = frags * PAGE_SIZE; -		} - -		i = 0; -		while (datalen > 0) { -			struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0); -			skb_shinfo(skb)->frags[i].page = page; -			skb_shinfo(skb)->frags[i].page_offset = 0; -			skb_shinfo(skb)->frags[i].size = -			    (datalen < PAGE_SIZE ? datalen : PAGE_SIZE); -			datalen -= skb_shinfo(skb)->frags[i].size; -			skb->len += skb_shinfo(skb)->frags[i].size; -			skb->data_len += skb_shinfo(skb)->frags[i].size; -			i++; -			skb_shinfo(skb)->nr_frags = i; -		} - -		while (i < frags) { -			int rem; - -			if (i == 0) -				break; - -			rem = skb_shinfo(skb)->frags[i - 1].size / 2; -			if (rem == 0) -				break; +		__wsum csum = udp_csum(skb); -			skb_shinfo(skb)->frags[i - 1].size -= rem; +		/* add protocol-dependent pseudo-header */ +		udph->check = csum_tcpudp_magic(udph->source, udph->dest, +						datalen + 8, IPPROTO_UDP, csum); -			skb_shinfo(skb)->frags[i] = -			    skb_shinfo(skb)->frags[i - 1]; -			get_page(skb_shinfo(skb)->frags[i].page); -			skb_shinfo(skb)->frags[i].page = -			    skb_shinfo(skb)->frags[i - 1].page; -			skb_shinfo(skb)->frags[i].page_offset += -			    skb_shinfo(skb)->frags[i - 1].size; -			skb_shinfo(skb)->frags[i].size = rem; -			i++; -			skb_shinfo(skb)->nr_frags = i; -		} +		if (udph->check == 0) +			udph->check = CSUM_MANGLED_0;  	} -	/* Stamp the time, and sequence number, -	 * convert them to network byte order -	 */ -	if (pgh) { -		struct timeval timestamp; - -		pgh->pgh_magic = htonl(PKTGEN_MAGIC); -		pgh->seq_num = htonl(pkt_dev->seq_num); - -		do_gettimeofday(&timestamp); -		pgh->tv_sec = htonl(timestamp.tv_sec); -		pgh->tv_usec = htonl(timestamp.tv_usec); -	} +	pktgen_finalize_skb(pkt_dev, skb, datalen);  #ifdef CONFIG_XFRM  	if (!process_ipsec(pkt_dev, skb, protocol)) @@ -2807,179 +2841,14 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,  	return skb;  } -/* - * scan_ip6, fmt_ip taken from dietlibc-0.21 - * Author Felix von Leitner <felix-dietlibc@fefe.de> - * - * Slightly modified for kernel. 
- * Should be candidate for net/ipv4/utils.c - * --ro - */ - -static unsigned int scan_ip6(const char *s, char ip[16]) -{ -	unsigned int i; -	unsigned int len = 0; -	unsigned long u; -	char suffix[16]; -	unsigned int prefixlen = 0; -	unsigned int suffixlen = 0; -	__be32 tmp; -	char *pos; - -	for (i = 0; i < 16; i++) -		ip[i] = 0; - -	for (;;) { -		if (*s == ':') { -			len++; -			if (s[1] == ':') {	/* Found "::", skip to part 2 */ -				s += 2; -				len++; -				break; -			} -			s++; -		} - -		u = simple_strtoul(s, &pos, 16); -		i = pos - s; -		if (!i) -			return 0; -		if (prefixlen == 12 && s[i] == '.') { - -			/* the last 4 bytes may be written as IPv4 address */ - -			tmp = in_aton(s); -			memcpy((struct in_addr *)(ip + 12), &tmp, sizeof(tmp)); -			return i + len; -		} -		ip[prefixlen++] = (u >> 8); -		ip[prefixlen++] = (u & 255); -		s += i; -		len += i; -		if (prefixlen == 16) -			return len; -	} - -/* part 2, after "::" */ -	for (;;) { -		if (*s == ':') { -			if (suffixlen == 0) -				break; -			s++; -			len++; -		} else if (suffixlen != 0) -			break; - -		u = simple_strtol(s, &pos, 16); -		i = pos - s; -		if (!i) { -			if (*s) -				len--; -			break; -		} -		if (suffixlen + prefixlen <= 12 && s[i] == '.') { -			tmp = in_aton(s); -			memcpy((struct in_addr *)(suffix + suffixlen), &tmp, -			       sizeof(tmp)); -			suffixlen += 4; -			len += strlen(s); -			break; -		} -		suffix[suffixlen++] = (u >> 8); -		suffix[suffixlen++] = (u & 255); -		s += i; -		len += i; -		if (prefixlen + suffixlen == 16) -			break; -	} -	for (i = 0; i < suffixlen; i++) -		ip[16 - suffixlen + i] = suffix[i]; -	return len; -} - -static char tohex(char hexdigit) -{ -	return hexdigit > 9 ? hexdigit + 'a' - 10 : hexdigit + '0'; -} - -static int fmt_xlong(char *s, unsigned int i) -{ -	char *bak = s; -	*s = tohex((i >> 12) & 0xf); -	if (s != bak || *s != '0') -		++s; -	*s = tohex((i >> 8) & 0xf); -	if (s != bak || *s != '0') -		++s; -	*s = tohex((i >> 4) & 0xf); -	if (s != bak || *s != '0') -		++s; -	*s = tohex(i & 0xf); -	return s - bak + 1; -} - -static unsigned int fmt_ip6(char *s, const char ip[16]) -{ -	unsigned int len; -	unsigned int i; -	unsigned int temp; -	unsigned int compressing; -	int j; - -	len = 0; -	compressing = 0; -	for (j = 0; j < 16; j += 2) { - -#ifdef V4MAPPEDPREFIX -		if (j == 12 && !memcmp(ip, V4mappedprefix, 12)) { -			inet_ntoa_r(*(struct in_addr *)(ip + 12), s); -			temp = strlen(s); -			return len + temp; -		} -#endif -		temp = ((unsigned long)(unsigned char)ip[j] << 8) + -		    (unsigned long)(unsigned char)ip[j + 1]; -		if (temp == 0) { -			if (!compressing) { -				compressing = 1; -				if (j == 0) { -					*s++ = ':'; -					++len; -				} -			} -		} else { -			if (compressing) { -				compressing = 0; -				*s++ = ':'; -				++len; -			} -			i = fmt_xlong(s, temp); -			len += i; -			s += i; -			if (j < 14) { -				*s++ = ':'; -				++len; -			} -		} -	} -	if (compressing) { -		*s++ = ':'; -		++len; -	} -	*s = 0; -	return len; -} -  static struct sk_buff *fill_packet_ipv6(struct net_device *odev,  					struct pktgen_dev *pkt_dev)  {  	struct sk_buff *skb = NULL;  	__u8 *eth;  	struct udphdr *udph; -	int datalen; +	int datalen, udplen;  	struct ipv6hdr *iph; -	struct pktgen_hdr *pgh = NULL;  	__be16 protocol = htons(ETH_P_IPV6);  	__be32 *mpls;  	__be16 *vlan_tci = NULL;                 /* Encapsulates priority and VLAN ID */ @@ -3000,14 +2869,13 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,  	mod_cur_headers(pkt_dev);  	queue_map = pkt_dev->cur_queue_map; -	skb = 
__netdev_alloc_skb(odev, -				 pkt_dev->cur_pkt_size + 64 -				 + 16 + pkt_dev->pkt_overhead, GFP_NOWAIT); +	skb = pktgen_alloc_skb(odev, pkt_dev, 16);  	if (!skb) {  		sprintf(pkt_dev->result, "No memory");  		return NULL;  	} +	prefetchw(skb->data);  	skb_reserve(skb, 16);  	/*  Reserve for ethernet and IP header  */ @@ -3033,13 +2901,14 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,  		*vlan_encapsulated_proto = htons(ETH_P_IPV6);  	} -	skb->network_header = skb->tail; -	skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); -	skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr)); +	skb_set_mac_header(skb, 0); +	skb_set_network_header(skb, skb->len); +	iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr)); + +	skb_set_transport_header(skb, skb->len); +	udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr));  	skb_set_queue_mapping(skb, queue_map);  	skb->priority = pkt_dev->skb_priority; -	iph = ipv6_hdr(skb); -	udph = udp_hdr(skb);  	memcpy(eth, pkt_dev->hh, 12);  	*(__be16 *) &eth[12] = protocol; @@ -3049,16 +2918,16 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,  		  sizeof(struct ipv6hdr) - sizeof(struct udphdr) -  		  pkt_dev->pkt_overhead; -	if (datalen < sizeof(struct pktgen_hdr)) { +	if (datalen < 0 || datalen < sizeof(struct pktgen_hdr)) {  		datalen = sizeof(struct pktgen_hdr); -		if (net_ratelimit()) -			pr_info("increased datalen to %d\n", datalen); +		net_info_ratelimited("increased datalen to %d\n", datalen);  	} +	udplen = datalen + sizeof(struct udphdr);  	udph->source = htons(pkt_dev->cur_udp_src);  	udph->dest = htons(pkt_dev->cur_udp_dst); -	udph->len = htons(datalen + sizeof(struct udphdr)); -	udph->check = 0;	/* No checksum */ +	udph->len = htons(udplen); +	udph->check = 0;  	*(__be32 *) iph = htonl(0x60000000);	/* Version + flow */ @@ -3069,87 +2938,34 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,  	iph->hop_limit = 32; -	iph->payload_len = htons(sizeof(struct udphdr) + datalen); +	iph->payload_len = htons(udplen);  	iph->nexthdr = IPPROTO_UDP; -	ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr); -	ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr); +	iph->daddr = pkt_dev->cur_in6_daddr; +	iph->saddr = pkt_dev->cur_in6_saddr; -	skb->mac_header = (skb->network_header - ETH_HLEN - -			   pkt_dev->pkt_overhead);  	skb->protocol = protocol;  	skb->dev = odev;  	skb->pkt_type = PACKET_HOST; -	if (pkt_dev->nfrags <= 0) -		pgh = (struct pktgen_hdr *)skb_put(skb, datalen); -	else { -		int frags = pkt_dev->nfrags; -		int i; - -		pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8); - -		if (frags > MAX_SKB_FRAGS) -			frags = MAX_SKB_FRAGS; -		if (datalen > frags * PAGE_SIZE) { -			skb_put(skb, datalen - frags * PAGE_SIZE); -			datalen = frags * PAGE_SIZE; -		} - -		i = 0; -		while (datalen > 0) { -			struct page *page = alloc_pages(GFP_KERNEL, 0); -			skb_shinfo(skb)->frags[i].page = page; -			skb_shinfo(skb)->frags[i].page_offset = 0; -			skb_shinfo(skb)->frags[i].size = -			    (datalen < PAGE_SIZE ? 
datalen : PAGE_SIZE); -			datalen -= skb_shinfo(skb)->frags[i].size; -			skb->len += skb_shinfo(skb)->frags[i].size; -			skb->data_len += skb_shinfo(skb)->frags[i].size; -			i++; -			skb_shinfo(skb)->nr_frags = i; -		} - -		while (i < frags) { -			int rem; - -			if (i == 0) -				break; - -			rem = skb_shinfo(skb)->frags[i - 1].size / 2; -			if (rem == 0) -				break; +	if (!(pkt_dev->flags & F_UDPCSUM)) { +		skb->ip_summed = CHECKSUM_NONE; +	} else if (odev->features & NETIF_F_V6_CSUM) { +		skb->ip_summed = CHECKSUM_PARTIAL; +		skb->csum_start = skb_transport_header(skb) - skb->head; +		skb->csum_offset = offsetof(struct udphdr, check); +		udph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, 0); +	} else { +		__wsum csum = udp_csum(skb); -			skb_shinfo(skb)->frags[i - 1].size -= rem; +		/* add protocol-dependent pseudo-header */ +		udph->check = csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, csum); -			skb_shinfo(skb)->frags[i] = -			    skb_shinfo(skb)->frags[i - 1]; -			get_page(skb_shinfo(skb)->frags[i].page); -			skb_shinfo(skb)->frags[i].page = -			    skb_shinfo(skb)->frags[i - 1].page; -			skb_shinfo(skb)->frags[i].page_offset += -			    skb_shinfo(skb)->frags[i - 1].size; -			skb_shinfo(skb)->frags[i].size = rem; -			i++; -			skb_shinfo(skb)->nr_frags = i; -		} +		if (udph->check == 0) +			udph->check = CSUM_MANGLED_0;  	} -	/* Stamp the time, and sequence number, -	 * convert them to network byte order -	 * should we update cloned packets too ? -	 */ -	if (pgh) { -		struct timeval timestamp; - -		pgh->pgh_magic = htonl(PKTGEN_MAGIC); -		pgh->seq_num = htonl(pkt_dev->seq_num); - -		do_gettimeofday(&timestamp); -		pgh->tv_sec = htonl(timestamp.tv_sec); -		pgh->tv_usec = htonl(timestamp.tv_usec); -	} -	/* pkt_dev->seq_num++; FF: you really mean this? */ +	pktgen_finalize_skb(pkt_dev, skb, datalen);  	return skb;  } @@ -3193,8 +3009,7 @@ static void pktgen_run(struct pktgen_thread *t)  			pktgen_clear_counters(pkt_dev);  			pkt_dev->running = 1;	/* Cranke yeself! 
*/  			pkt_dev->skb = NULL; -			pkt_dev->started_at = -				pkt_dev->next_tx = ktime_now(); +			pkt_dev->started_at = pkt_dev->next_tx = ktime_get();  			set_pkt_overhead(pkt_dev); @@ -3208,7 +3023,7 @@ static void pktgen_run(struct pktgen_thread *t)  		t->control &= ~(T_STOP);  } -static void pktgen_stop_all_threads_ifs(void) +static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn)  {  	struct pktgen_thread *t; @@ -3216,7 +3031,7 @@ static void pktgen_stop_all_threads_ifs(void)  	mutex_lock(&pktgen_thread_lock); -	list_for_each_entry(t, &pktgen_threads, th_list) +	list_for_each_entry(t, &pn->pktgen_threads, th_list)  		t->control |= T_STOP;  	mutex_unlock(&pktgen_thread_lock); @@ -3252,28 +3067,28 @@ signal:  	return 0;  } -static int pktgen_wait_all_threads_run(void) +static int pktgen_wait_all_threads_run(struct pktgen_net *pn)  {  	struct pktgen_thread *t;  	int sig = 1;  	mutex_lock(&pktgen_thread_lock); -	list_for_each_entry(t, &pktgen_threads, th_list) { +	list_for_each_entry(t, &pn->pktgen_threads, th_list) {  		sig = pktgen_wait_thread_run(t);  		if (sig == 0)  			break;  	}  	if (sig == 0) -		list_for_each_entry(t, &pktgen_threads, th_list) +		list_for_each_entry(t, &pn->pktgen_threads, th_list)  			t->control |= (T_STOP);  	mutex_unlock(&pktgen_thread_lock);  	return sig;  } -static void pktgen_run_all_threads(void) +static void pktgen_run_all_threads(struct pktgen_net *pn)  {  	struct pktgen_thread *t; @@ -3281,7 +3096,7 @@ static void pktgen_run_all_threads(void)  	mutex_lock(&pktgen_thread_lock); -	list_for_each_entry(t, &pktgen_threads, th_list) +	list_for_each_entry(t, &pn->pktgen_threads, th_list)  		t->control |= (T_RUN);  	mutex_unlock(&pktgen_thread_lock); @@ -3289,10 +3104,10 @@ static void pktgen_run_all_threads(void)  	/* Propagate thread->control  */  	schedule_timeout_interruptible(msecs_to_jiffies(125)); -	pktgen_wait_all_threads_run(); +	pktgen_wait_all_threads_run(pn);  } -static void pktgen_reset_all_threads(void) +static void pktgen_reset_all_threads(struct pktgen_net *pn)  {  	struct pktgen_thread *t; @@ -3300,7 +3115,7 @@ static void pktgen_reset_all_threads(void)  	mutex_lock(&pktgen_thread_lock); -	list_for_each_entry(t, &pktgen_threads, th_list) +	list_for_each_entry(t, &pn->pktgen_threads, th_list)  		t->control |= (T_REMDEVALL);  	mutex_unlock(&pktgen_thread_lock); @@ -3308,7 +3123,7 @@ static void pktgen_reset_all_threads(void)  	/* Propagate thread->control  */  	schedule_timeout_interruptible(msecs_to_jiffies(125)); -	pktgen_wait_all_threads_run(); +	pktgen_wait_all_threads_run(pn);  }  static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) @@ -3319,7 +3134,7 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)  				    pkt_dev->started_at);  	ktime_t idle = ns_to_ktime(pkt_dev->idle_acc); -	p += sprintf(p, "OK: %llu(c%llu+d%llu) nsec, %llu (%dbyte,%dfrags)\n", +	p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n",  		     (unsigned long long)ktime_to_us(elapsed),  		     (unsigned long long)ktime_to_us(ktime_sub(elapsed, idle)),  		     (unsigned long long)ktime_to_us(idle), @@ -3353,7 +3168,7 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev)  	kfree_skb(pkt_dev->skb);  	pkt_dev->skb = NULL; -	pkt_dev->stopped_at = ktime_now(); +	pkt_dev->stopped_at = ktime_get();  	pkt_dev->running = 0;  	show_results(pkt_dev, nr_frags); @@ -3372,7 +3187,7 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t)  			continue;  		if (best == NULL)  			best = pkt_dev; -		else if 
(ktime_lt(pkt_dev->next_tx, best->next_tx)) +		else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0)  			best = pkt_dev;  	}  	if_unlock(t); @@ -3450,21 +3265,19 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t)  static void pktgen_rem_thread(struct pktgen_thread *t)  {  	/* Remove from the thread list */ - -	remove_proc_entry(t->tsk->comm, pg_proc_dir); - +	remove_proc_entry(t->tsk->comm, t->net->proc_dir);  }  static void pktgen_resched(struct pktgen_dev *pkt_dev)  { -	ktime_t idle_start = ktime_now(); +	ktime_t idle_start = ktime_get();  	schedule(); -	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start)); +	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));  }  static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)  { -	ktime_t idle_start = ktime_now(); +	ktime_t idle_start = ktime_get();  	while (atomic_read(&(pkt_dev->skb->users)) != 1) {  		if (signal_pending(current)) @@ -3475,7 +3288,7 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)  		else  			cpu_relax();  	} -	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start)); +	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));  }  static void pktgen_xmit(struct pktgen_dev *pkt_dev) @@ -3497,7 +3310,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)  	 * "never transmit"  	 */  	if (unlikely(pkt_dev->delay == ULLONG_MAX)) { -		pkt_dev->next_tx = ktime_add_ns(ktime_now(), ULONG_MAX); +		pkt_dev->next_tx = ktime_add_ns(ktime_get(), ULONG_MAX);  		return;  	} @@ -3525,9 +3338,11 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)  	queue_map = skb_get_queue_mapping(pkt_dev->skb);  	txq = netdev_get_tx_queue(odev, queue_map); -	__netif_tx_lock_bh(txq); +	local_bh_disable(); -	if (unlikely(netif_tx_queue_frozen_or_stopped(txq))) { +	HARD_TX_LOCK(odev, txq, smp_processor_id()); + +	if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) {  		ret = NETDEV_TX_BUSY;  		pkt_dev->last_ok = 0;  		goto unlock; @@ -3550,9 +3365,8 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)  		pkt_dev->errors++;  		break;  	default: /* Drivers are not supposed to return other values! 
*/ -		if (net_ratelimit()) -			pr_info("pktgen: %s xmit error: %d\n", -				pkt_dev->odevname, ret); +		net_info_ratelimited("%s xmit error: %d\n", +				     pkt_dev->odevname, ret);  		pkt_dev->errors++;  		/* fallthru */  	case NETDEV_TX_LOCKED: @@ -3562,7 +3376,9 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)  		pkt_dev->last_ok = 0;  	}  unlock: -	__netif_tx_unlock_bh(txq); +	HARD_TX_UNLOCK(odev, txq); + +	local_bh_enable();  	/* If pkt_dev->count is zero, then run forever */  	if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { @@ -3599,7 +3415,7 @@ static int pktgen_thread_worker(void *arg)  		pkt_dev = next_to_run(t);  		if (unlikely(!pkt_dev && t->control == 0)) { -			if (pktgen_exiting) +			if (t->net->pktgen_exiting)  				break;  			wait_event_interruptible_timeout(t->queue,  							 t->control != 0, @@ -3721,7 +3537,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)  	/* We don't allow a device to be on several threads */ -	pkt_dev = __pktgen_NN_threads(ifname, FIND); +	pkt_dev = __pktgen_NN_threads(t->net, ifname, FIND);  	if (pkt_dev) {  		pr_err("ERROR: interface already used\n");  		return -EBUSY; @@ -3732,19 +3548,15 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)  		return -ENOMEM;  	strcpy(pkt_dev->odevname, ifname); -	pkt_dev->flows = vmalloc_node(MAX_CFLOWS * sizeof(struct flow_state), +	pkt_dev->flows = vzalloc_node(MAX_CFLOWS * sizeof(struct flow_state),  				      node);  	if (pkt_dev->flows == NULL) {  		kfree(pkt_dev);  		return -ENOMEM;  	} -	memset(pkt_dev->flows, 0, MAX_CFLOWS * sizeof(struct flow_state));  	pkt_dev->removal_mark = 0; -	pkt_dev->min_pkt_size = ETH_ZLEN; -	pkt_dev->max_pkt_size = ETH_ZLEN;  	pkt_dev->nfrags = 0; -	pkt_dev->clone_skb = pg_clone_skb_d;  	pkt_dev->delay = pg_delay_d;  	pkt_dev->count = pg_count_d;  	pkt_dev->sofar = 0; @@ -3752,7 +3564,6 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)  	pkt_dev->udp_src_max = 9;  	pkt_dev->udp_dst_min = 9;  	pkt_dev->udp_dst_max = 9; -  	pkt_dev->vlan_p = 0;  	pkt_dev->vlan_cfi = 0;  	pkt_dev->vlan_id = 0xffff; @@ -3761,11 +3572,13 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)  	pkt_dev->svlan_id = 0xffff;  	pkt_dev->node = -1; -	err = pktgen_setup_dev(pkt_dev, ifname); +	err = pktgen_setup_dev(t->net, pkt_dev, ifname);  	if (err)  		goto out1; +	if (pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING) +		pkt_dev->clone_skb = pg_clone_skb_d; -	pkt_dev->entry = proc_create_data(ifname, 0600, pg_proc_dir, +	pkt_dev->entry = proc_create_data(ifname, 0600, t->net->proc_dir,  					  &pktgen_if_fops, pkt_dev);  	if (!pkt_dev->entry) {  		pr_err("cannot create %s/%s procfs entry\n", @@ -3776,6 +3589,17 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)  #ifdef CONFIG_XFRM  	pkt_dev->ipsmode = XFRM_MODE_TRANSPORT;  	pkt_dev->ipsproto = IPPROTO_ESP; + +	/* xfrm tunnel mode needs additional dst to extract outter +	 * ip header protocol/ttl/id field, here creat a phony one. +	 * instead of looking for a valid rt, which definitely hurting +	 * performance under such circumstance. 
+	 */ +	pkt_dev->dstops.family = AF_INET; +	pkt_dev->dst.dev = pkt_dev->odev; +	dst_init_metrics(&pkt_dev->dst, pktgen_dst_metrics, false); +	pkt_dev->dst.child = &pkt_dev->dst; +	pkt_dev->dst.ops = &pkt_dev->dstops;  #endif  	return add_dev_to_thread(t, pkt_dev); @@ -3790,7 +3614,7 @@ out1:  	return err;  } -static int __init pktgen_create_thread(int cpu) +static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)  {  	struct pktgen_thread *t;  	struct proc_dir_entry *pe; @@ -3808,10 +3632,13 @@ static int __init pktgen_create_thread(int cpu)  	INIT_LIST_HEAD(&t->if_list); -	list_add_tail(&t->th_list, &pktgen_threads); +	list_add_tail(&t->th_list, &pn->pktgen_threads);  	init_completion(&t->start_done); -	p = kthread_create(pktgen_thread_worker, t, "kpktgend_%d", cpu); +	p = kthread_create_on_node(pktgen_thread_worker, +				   t, +				   cpu_to_node(cpu), +				   "kpktgend_%d", cpu);  	if (IS_ERR(p)) {  		pr_err("kernel_thread() failed for cpu %d\n", t->cpu);  		list_del(&t->th_list); @@ -3821,7 +3648,7 @@ static int __init pktgen_create_thread(int cpu)  	kthread_bind(p, cpu);  	t->tsk = p; -	pe = proc_create_data(t->tsk->comm, 0600, pg_proc_dir, +	pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir,  			      &pktgen_thread_fops, t);  	if (!pe) {  		pr_err("cannot create %s/%s procfs entry\n", @@ -3832,6 +3659,7 @@ static int __init pktgen_create_thread(int cpu)  		return -EINVAL;  	} +	t->net = pn;  	wake_up_process(p);  	wait_for_completion(&t->start_done); @@ -3857,7 +3685,6 @@ static void _rem_dev_from_if_list(struct pktgen_thread *t,  static int pktgen_remove_device(struct pktgen_thread *t,  				struct pktgen_dev *pkt_dev)  { -  	pr_debug("remove_device pkt_dev=%p\n", pkt_dev);  	if (pkt_dev->running) { @@ -3877,77 +3704,114 @@ static int pktgen_remove_device(struct pktgen_thread *t,  	_rem_dev_from_if_list(t, pkt_dev);  	if (pkt_dev->entry) -		remove_proc_entry(pkt_dev->entry->name, pg_proc_dir); +		proc_remove(pkt_dev->entry);  #ifdef CONFIG_XFRM  	free_SAs(pkt_dev);  #endif  	vfree(pkt_dev->flows); +	if (pkt_dev->page) +		put_page(pkt_dev->page);  	kfree(pkt_dev);  	return 0;  } -static int __init pg_init(void) +static int __net_init pg_net_init(struct net *net)  { -	int cpu; +	struct pktgen_net *pn = net_generic(net, pg_net_id);  	struct proc_dir_entry *pe; - -	pr_info("%s", version); - -	pg_proc_dir = proc_mkdir(PG_PROC_DIR, init_net.proc_net); -	if (!pg_proc_dir) +	int cpu, ret = 0; + +	pn->net = net; +	INIT_LIST_HEAD(&pn->pktgen_threads); +	pn->pktgen_exiting = false; +	pn->proc_dir = proc_mkdir(PG_PROC_DIR, pn->net->proc_net); +	if (!pn->proc_dir) { +		pr_warn("cannot create /proc/net/%s\n", PG_PROC_DIR);  		return -ENODEV; - -	pe = proc_create(PGCTRL, 0600, pg_proc_dir, &pktgen_fops); +	} +	pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_fops);  	if (pe == NULL) { -		pr_err("ERROR: cannot create %s procfs entry\n", PGCTRL); -		proc_net_remove(&init_net, PG_PROC_DIR); -		return -EINVAL; +		pr_err("cannot create %s procfs entry\n", PGCTRL); +		ret = -EINVAL; +		goto remove;  	} -	/* Register us to receive netdevice events */ -	register_netdevice_notifier(&pktgen_notifier_block); -  	for_each_online_cpu(cpu) {  		int err; -		err = pktgen_create_thread(cpu); +		err = pktgen_create_thread(cpu, pn);  		if (err) -			pr_warning("WARNING: Cannot create thread for cpu %d (%d)\n", +			pr_warn("Cannot create thread for cpu %d (%d)\n",  				   cpu, err);  	} -	if (list_empty(&pktgen_threads)) { -		pr_err("ERROR: Initialization failed for all threads\n"); -		
unregister_netdevice_notifier(&pktgen_notifier_block); -		remove_proc_entry(PGCTRL, pg_proc_dir); -		proc_net_remove(&init_net, PG_PROC_DIR); -		return -ENODEV; +	if (list_empty(&pn->pktgen_threads)) { +		pr_err("Initialization failed for all threads\n"); +		ret = -ENODEV; +		goto remove_entry;  	}  	return 0; + +remove_entry: +	remove_proc_entry(PGCTRL, pn->proc_dir); +remove: +	remove_proc_entry(PG_PROC_DIR, pn->net->proc_net); +	return ret;  } -static void __exit pg_cleanup(void) +static void __net_exit pg_net_exit(struct net *net)  { +	struct pktgen_net *pn = net_generic(net, pg_net_id);  	struct pktgen_thread *t;  	struct list_head *q, *n; +	LIST_HEAD(list);  	/* Stop all interfaces & threads */ -	pktgen_exiting = true; +	pn->pktgen_exiting = true; -	list_for_each_safe(q, n, &pktgen_threads) { +	mutex_lock(&pktgen_thread_lock); +	list_splice_init(&pn->pktgen_threads, &list); +	mutex_unlock(&pktgen_thread_lock); + +	list_for_each_safe(q, n, &list) {  		t = list_entry(q, struct pktgen_thread, th_list); +		list_del(&t->th_list);  		kthread_stop(t->tsk);  		kfree(t);  	} -	/* Un-register us from receiving netdevice events */ -	unregister_netdevice_notifier(&pktgen_notifier_block); +	remove_proc_entry(PGCTRL, pn->proc_dir); +	remove_proc_entry(PG_PROC_DIR, pn->net->proc_net); +} + +static struct pernet_operations pg_net_ops = { +	.init = pg_net_init, +	.exit = pg_net_exit, +	.id   = &pg_net_id, +	.size = sizeof(struct pktgen_net), +}; + +static int __init pg_init(void) +{ +	int ret = 0; + +	pr_info("%s", version); +	ret = register_pernet_subsys(&pg_net_ops); +	if (ret) +		return ret; +	ret = register_netdevice_notifier(&pktgen_notifier_block); +	if (ret) +		unregister_pernet_subsys(&pg_net_ops); -	/* Clean up proc file system */ -	remove_proc_entry(PGCTRL, pg_proc_dir); -	proc_net_remove(&init_net, PG_PROC_DIR); +	return ret; +} + +static void __exit pg_cleanup(void) +{ +	unregister_netdevice_notifier(&pktgen_notifier_block); +	unregister_pernet_subsys(&pg_net_ops);  }  module_init(pg_init); diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c new file mode 100644 index 00000000000..d3027a73fd4 --- /dev/null +++ b/net/core/ptp_classifier.c @@ -0,0 +1,141 @@ +/* PTP classifier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +/* The below program is the bpf_asm (tools/net/) representation of + * the opcode array in the ptp_filter structure. + * + * For convenience, this can easily be altered and reviewed with + * bpf_asm and bpf_dbg, e.g. `./bpf_asm -c prog` where prog is a + * simple file containing the below program: + * + * ldh [12]                        ; load ethertype + * + * ; PTP over UDP over IPv4 over Ethernet + * test_ipv4: + *   jneq #0x800, test_ipv6        ; ETH_P_IP ? + *   ldb [23]                      ; load proto + *   jneq #17, drop_ipv4           ; IPPROTO_UDP ? + *   ldh [20]                      ; load frag offset field + *   jset #0x1fff, drop_ipv4       ; don't allow fragments + *   ldxb 4*([14]&0xf)             ; load IP header len + *   ldh [x + 16]                  ; load UDP dst port + *   jneq #319, drop_ipv4          ; is port PTP_EV_PORT ? 
+ *   ldh [x + 22]                  ; load payload + *   and #0xf                      ; mask PTP_CLASS_VMASK + *   or #0x10                      ; PTP_CLASS_IPV4 + *   ret a                         ; return PTP class + *   drop_ipv4: ret #0x0           ; PTP_CLASS_NONE + * + * ; PTP over UDP over IPv6 over Ethernet + * test_ipv6: + *   jneq #0x86dd, test_8021q      ; ETH_P_IPV6 ? + *   ldb [20]                      ; load proto + *   jneq #17, drop_ipv6           ; IPPROTO_UDP ? + *   ldh [56]                      ; load UDP dst port + *   jneq #319, drop_ipv6          ; is port PTP_EV_PORT ? + *   ldh [62]                      ; load payload + *   and #0xf                      ; mask PTP_CLASS_VMASK + *   or #0x20                      ; PTP_CLASS_IPV6 + *   ret a                         ; return PTP class + *   drop_ipv6: ret #0x0           ; PTP_CLASS_NONE + * + * ; PTP over 802.1Q over Ethernet + * test_8021q: + *   jneq #0x8100, test_ieee1588   ; ETH_P_8021Q ? + *   ldh [16]                      ; load inner type + *   jneq #0x88f7, drop_ieee1588   ; ETH_P_1588 ? + *   ldb [18]                      ; load payload + *   and #0x8                      ; as we don't have ports here, test + *   jneq #0x0, drop_ieee1588      ; for PTP_GEN_BIT and drop these + *   ldh [18]                      ; reload payload + *   and #0xf                      ; mask PTP_CLASS_VMASK + *   or #0x40                      ; PTP_CLASS_V2_VLAN + *   ret a                         ; return PTP class + * + * ; PTP over Ethernet + * test_ieee1588: + *   jneq #0x88f7, drop_ieee1588   ; ETH_P_1588 ? + *   ldb [14]                      ; load payload + *   and #0x8                      ; as we don't have ports here, test + *   jneq #0x0, drop_ieee1588      ; for PTP_GEN_BIT and drop these + *   ldh [14]                      ; reload payload + *   and #0xf                      ; mask PTP_CLASS_VMASK + *   or #0x30                      ; PTP_CLASS_L2 + *   ret a                         ; return PTP class + *   drop_ieee1588: ret #0x0       ; PTP_CLASS_NONE + */ + +#include <linux/skbuff.h> +#include <linux/filter.h> +#include <linux/ptp_classify.h> + +static struct sk_filter *ptp_insns __read_mostly; + +unsigned int ptp_classify_raw(const struct sk_buff *skb) +{ +	return SK_RUN_FILTER(ptp_insns, skb); +} +EXPORT_SYMBOL_GPL(ptp_classify_raw); + +void __init ptp_classifier_init(void) +{ +	static struct sock_filter ptp_filter[] __initdata = { +		{ 0x28,  0,  0, 0x0000000c }, +		{ 0x15,  0, 12, 0x00000800 }, +		{ 0x30,  0,  0, 0x00000017 }, +		{ 0x15,  0,  9, 0x00000011 }, +		{ 0x28,  0,  0, 0x00000014 }, +		{ 0x45,  7,  0, 0x00001fff }, +		{ 0xb1,  0,  0, 0x0000000e }, +		{ 0x48,  0,  0, 0x00000010 }, +		{ 0x15,  0,  4, 0x0000013f }, +		{ 0x48,  0,  0, 0x00000016 }, +		{ 0x54,  0,  0, 0x0000000f }, +		{ 0x44,  0,  0, 0x00000010 }, +		{ 0x16,  0,  0, 0x00000000 }, +		{ 0x06,  0,  0, 0x00000000 }, +		{ 0x15,  0,  9, 0x000086dd }, +		{ 0x30,  0,  0, 0x00000014 }, +		{ 0x15,  0,  6, 0x00000011 }, +		{ 0x28,  0,  0, 0x00000038 }, +		{ 0x15,  0,  4, 0x0000013f }, +		{ 0x28,  0,  0, 0x0000003e }, +		{ 0x54,  0,  0, 0x0000000f }, +		{ 0x44,  0,  0, 0x00000020 }, +		{ 0x16,  0,  0, 0x00000000 }, +		{ 0x06,  0,  0, 0x00000000 }, +		{ 0x15,  0,  9, 0x00008100 }, +		{ 0x28,  0,  0, 0x00000010 }, +		{ 0x15,  0, 15, 0x000088f7 }, +		{ 0x30,  0,  0, 0x00000012 }, +		{ 0x54,  0,  0, 0x00000008 }, +		{ 0x15,  0, 12, 0x00000000 }, +		{ 0x28,  0,  0, 0x00000012 }, +		{ 0x54,  0,  0, 0x0000000f }, +		{ 0x44,  0,  0, 0x00000040 }, +		{ 0x16,  
0,  0, 0x00000000 }, +		{ 0x15,  0,  7, 0x000088f7 }, +		{ 0x30,  0,  0, 0x0000000e }, +		{ 0x54,  0,  0, 0x00000008 }, +		{ 0x15,  0,  4, 0x00000000 }, +		{ 0x28,  0,  0, 0x0000000e }, +		{ 0x54,  0,  0, 0x0000000f }, +		{ 0x44,  0,  0, 0x00000030 }, +		{ 0x16,  0,  0, 0x00000000 }, +		{ 0x06,  0,  0, 0x00000000 }, +	}; +	struct sock_fprog_kern ptp_prog = { +		.len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter, +	}; + +	BUG_ON(sk_unattached_filter_create(&ptp_insns, &ptp_prog)); +} diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 41d99435f62..467f326126e 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -15,6 +15,7 @@  #include <linux/random.h>  #include <linux/slab.h>  #include <linux/string.h> +#include <linux/tcp.h>  #include <linux/vmalloc.h>  #include <net/request_sock.h> @@ -26,10 +27,11 @@   * but then some measure against one socket starving all other sockets   * would be needed.   * - * It was 128 by default. Experiments with real servers show, that + * The minimum value of it is 128. Experiments with real servers show that   * it is absolutely not enough even at 100conn/sec. 256 cures most - * of problems. This value is adjusted to 128 for very small machines - * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb). + * of problems. + * This value is adjusted to 128 for low memory machines, + * and it will increase in proportion to the memory of machine.   * Note : Dont forget somaxconn that may limit backlog too.   */  int sysctl_max_syn_backlog = 256; @@ -46,9 +48,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,  	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);  	lopt_size += nr_table_entries * sizeof(struct request_sock *);  	if (lopt_size > PAGE_SIZE) -		lopt = __vmalloc(lopt_size, -			GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, -			PAGE_KERNEL); +		lopt = vzalloc(lopt_size);  	else  		lopt = kzalloc(lopt_size, GFP_KERNEL);  	if (lopt == NULL) @@ -131,3 +131,94 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)  		kfree(lopt);  } +/* + * This function is called to set a Fast Open socket's "fastopen_rsk" field + * to NULL when a TFO socket no longer needs to access the request_sock. + * This happens only after 3WHS has been either completed or aborted (e.g., + * RST is received). + * + * Before TFO, a child socket is created only after 3WHS is completed, + * hence it never needs to access the request_sock. things get a lot more + * complex with TFO. A child socket, accepted or not, has to access its + * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts, + * until 3WHS is either completed or aborted. Afterwards the req will stay + * until either the child socket is accepted, or in the rare case when the + * listener is closed before the child is accepted. + * + * In short, a request socket is only freed after BOTH 3WHS has completed + * (or aborted) and the child socket has been accepted (or listener closed). + * When a child socket is accepted, its corresponding req->sk is set to + * NULL since it's no longer needed. More importantly, "req->sk == NULL" + * will be used by the code below to determine if a child socket has been + * accepted or not, and the check is protected by the fastopenq->lock + * described below. + * + * Note that fastopen_rsk is only accessed from the child socket's context + * with its socket lock held. 
But a request_sock (req) can be accessed by + * both its child socket through fastopen_rsk, and a listener socket through + * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin + * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created. + * only in the rare case when both the listener and the child locks are held, + * e.g., in inet_csk_listen_stop() do we not need to acquire the lock. + * The lock also protects other fields such as fastopenq->qlen, which is + * decremented by this function when fastopen_rsk is no longer needed. + * + * Note that another solution was to simply use the existing socket lock + * from the listener. But first socket lock is difficult to use. It is not + * a simple spin lock - one must consider sock_owned_by_user() and arrange + * to use sk_add_backlog() stuff. But what really makes it infeasible is the + * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to + * acquire a child's lock while holding listener's socket lock. A corner + * case might also exist in tcp_v4_hnd_req() that will trigger this locking + * order. + * + * When a TFO req is created, it needs to sock_hold its listener to prevent + * the latter data structure from going away. + * + * This function also sets "treq->listener" to NULL and unreference listener + * socket. treq->listener is used by the listener so it is protected by the + * fastopenq->lock in this function. + */ +void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, +			   bool reset) +{ +	struct sock *lsk = tcp_rsk(req)->listener; +	struct fastopen_queue *fastopenq = +	    inet_csk(lsk)->icsk_accept_queue.fastopenq; + +	tcp_sk(sk)->fastopen_rsk = NULL; +	spin_lock_bh(&fastopenq->lock); +	fastopenq->qlen--; +	tcp_rsk(req)->listener = NULL; +	if (req->sk)	/* the child socket hasn't been accepted yet */ +		goto out; + +	if (!reset || lsk->sk_state != TCP_LISTEN) { +		/* If the listener has been closed don't bother with the +		 * special RST handling below. +		 */ +		spin_unlock_bh(&fastopenq->lock); +		sock_put(lsk); +		reqsk_free(req); +		return; +	} +	/* Wait for 60secs before removing a req that has triggered RST. +	 * This is a simple defense against TFO spoofing attack - by +	 * counting the req against fastopen.max_qlen, and disabling +	 * TFO when the qlen exceeds max_qlen. +	 * +	 * For more details see CoNext'11 "TCP Fast Open" paper. +	 */ +	req->expires = jiffies + 60*HZ; +	if (fastopenq->rskq_rst_head == NULL) +		fastopenq->rskq_rst_head = req; +	else +		fastopenq->rskq_rst_tail->dl_next = req; + +	req->dl_next = NULL; +	fastopenq->rskq_rst_tail = req; +	fastopenq->qlen++; +out: +	spin_unlock_bh(&fastopenq->lock); +	sock_put(lsk); +} diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 750db57f3bb..1063996f831 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -35,10 +35,11 @@  #include <linux/security.h>  #include <linux/mutex.h>  #include <linux/if_addr.h> +#include <linux/if_bridge.h>  #include <linux/pci.h> +#include <linux/etherdevice.h>  #include <asm/uaccess.h> -#include <asm/system.h>  #include <linux/inet.h>  #include <linux/netdevice.h> @@ -56,6 +57,7 @@  struct rtnl_link {  	rtnl_doit_func		doit;  	rtnl_dumpit_func	dumpit; +	rtnl_calcit_func 	calcit;  };  static DEFINE_MUTEX(rtnl_mutex); @@ -126,7 +128,7 @@ static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex)  	if (tab == NULL || tab[msgindex].doit == NULL)  		tab = rtnl_msg_handlers[PF_UNSPEC]; -	return tab ? 
tab[msgindex].doit : NULL; +	return tab[msgindex].doit;  }  static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex) @@ -141,7 +143,22 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)  	if (tab == NULL || tab[msgindex].dumpit == NULL)  		tab = rtnl_msg_handlers[PF_UNSPEC]; -	return tab ? tab[msgindex].dumpit : NULL; +	return tab[msgindex].dumpit; +} + +static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex) +{ +	struct rtnl_link *tab; + +	if (protocol <= RTNL_FAMILY_MAX) +		tab = rtnl_msg_handlers[protocol]; +	else +		tab = NULL; + +	if (tab == NULL || tab[msgindex].calcit == NULL) +		tab = rtnl_msg_handlers[PF_UNSPEC]; + +	return tab[msgindex].calcit;  }  /** @@ -150,6 +167,7 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)   * @msgtype: rtnetlink message type   * @doit: Function pointer called for each request message   * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @calcit: Function pointer to calc size of dump message   *   * Registers the specified function pointers (at least one of them has   * to be non-NULL) to be called whenever a request message for the @@ -162,7 +180,8 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)   * Returns 0 on success or a negative error code.   */  int __rtnl_register(int protocol, int msgtype, -		    rtnl_doit_func doit, rtnl_dumpit_func dumpit) +		    rtnl_doit_func doit, rtnl_dumpit_func dumpit, +		    rtnl_calcit_func calcit)  {  	struct rtnl_link *tab;  	int msgindex; @@ -185,6 +204,9 @@ int __rtnl_register(int protocol, int msgtype,  	if (dumpit)  		tab[msgindex].dumpit = dumpit; +	if (calcit) +		tab[msgindex].calcit = calcit; +  	return 0;  }  EXPORT_SYMBOL_GPL(__rtnl_register); @@ -196,12 +218,13 @@ EXPORT_SYMBOL_GPL(__rtnl_register);   * as failure of this function is very unlikely, it can only happen due   * to lack of memory when allocating the chain to store all message   * handlers for a protocol. Meant for use in init functions where lack - * of memory implies no sense in continueing. + * of memory implies no sense in continuing.   */  void rtnl_register(int protocol, int msgtype, -		   rtnl_doit_func doit, rtnl_dumpit_func dumpit) +		   rtnl_doit_func doit, rtnl_dumpit_func dumpit, +		   rtnl_calcit_func calcit)  { -	if (__rtnl_register(protocol, msgtype, doit, dumpit) < 0) +	if (__rtnl_register(protocol, msgtype, doit, dumpit, calcit) < 0)  		panic("Unable to register rtnetlink message handler, "  		      "protocol = %d, message type = %d\n",  		      protocol, msgtype); @@ -250,6 +273,17 @@ EXPORT_SYMBOL_GPL(rtnl_unregister_all);  static LIST_HEAD(link_ops); +static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) +{ +	const struct rtnl_link_ops *ops; + +	list_for_each_entry(ops, &link_ops, list) { +		if (!strcmp(ops->kind, kind)) +			return ops; +	} +	return NULL; +} +  /**   * __rtnl_link_register - Register rtnl_link_ops with rtnetlink.   * @ops: struct rtnl_link_ops * to register @@ -262,6 +296,9 @@ static LIST_HEAD(link_ops);   */  int __rtnl_link_register(struct rtnl_link_ops *ops)  { +	if (rtnl_link_ops_get(ops->kind)) +		return -EEXIST; +  	if (!ops->dellink)  		ops->dellink = unregister_netdevice_queue; @@ -316,27 +353,63 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops)  }  EXPORT_SYMBOL_GPL(__rtnl_link_unregister); +/* Return with the rtnl_lock held when there are no network + * devices unregistering in any network namespace. 
+ */ +static void rtnl_lock_unregistering_all(void) +{ +	struct net *net; +	bool unregistering; +	DEFINE_WAIT(wait); + +	for (;;) { +		prepare_to_wait(&netdev_unregistering_wq, &wait, +				TASK_UNINTERRUPTIBLE); +		unregistering = false; +		rtnl_lock(); +		for_each_net(net) { +			if (net->dev_unreg_count > 0) { +				unregistering = true; +				break; +			} +		} +		if (!unregistering) +			break; +		__rtnl_unlock(); +		schedule(); +	} +	finish_wait(&netdev_unregistering_wq, &wait); +} +  /**   * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.   * @ops: struct rtnl_link_ops * to unregister   */  void rtnl_link_unregister(struct rtnl_link_ops *ops)  { -	rtnl_lock(); +	/* Close the race with cleanup_net() */ +	mutex_lock(&net_mutex); +	rtnl_lock_unregistering_all();  	__rtnl_link_unregister(ops);  	rtnl_unlock(); +	mutex_unlock(&net_mutex);  }  EXPORT_SYMBOL_GPL(rtnl_link_unregister); -static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) +static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev)  { +	struct net_device *master_dev;  	const struct rtnl_link_ops *ops; -	list_for_each_entry(ops, &link_ops, list) { -		if (!strcmp(ops->kind, kind)) -			return ops; -	} -	return NULL; +	master_dev = netdev_master_upper_dev_get((struct net_device *) dev); +	if (!master_dev) +		return 0; +	ops = master_dev->rtnl_link_ops; +	if (!ops || !ops->get_slave_size) +		return 0; +	/* IFLA_INFO_SLAVE_DATA + nested data */ +	return nla_total_size(sizeof(struct nlattr)) + +	       ops->get_slave_size(master_dev, dev);  }  static size_t rtnl_link_get_size(const struct net_device *dev) @@ -359,6 +432,8 @@ static size_t rtnl_link_get_size(const struct net_device *dev)  		/* IFLA_INFO_XSTATS */  		size += nla_total_size(ops->get_xstats_size(dev)); +	size += rtnl_link_get_slave_info_data_size(dev); +  	return size;  } @@ -377,34 +452,16 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family)  }  /** - * __rtnl_af_register - Register rtnl_af_ops with rtnetlink. - * @ops: struct rtnl_af_ops * to register - * - * The caller must hold the rtnl_mutex. - * - * Returns 0 on success or a negative error code. - */ -int __rtnl_af_register(struct rtnl_af_ops *ops) -{ -	list_add_tail(&ops->list, &rtnl_af_ops); -	return 0; -} -EXPORT_SYMBOL_GPL(__rtnl_af_register); - -/**   * rtnl_af_register - Register rtnl_af_ops with rtnetlink.   * @ops: struct rtnl_af_ops * to register   *   * Returns 0 on success or a negative error code.   
*/ -int rtnl_af_register(struct rtnl_af_ops *ops) +void rtnl_af_register(struct rtnl_af_ops *ops)  { -	int err; -  	rtnl_lock(); -	err = __rtnl_af_register(ops); +	list_add_tail(&ops->list, &rtnl_af_ops);  	rtnl_unlock(); -	return err;  }  EXPORT_SYMBOL_GPL(rtnl_af_register); @@ -451,84 +508,107 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev)  	return size;  } -static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) +static bool rtnl_have_link_slave_info(const struct net_device *dev)  { -	const struct rtnl_link_ops *ops = dev->rtnl_link_ops; -	struct nlattr *linkinfo, *data; -	int err = -EMSGSIZE; +	struct net_device *master_dev; -	linkinfo = nla_nest_start(skb, IFLA_LINKINFO); -	if (linkinfo == NULL) -		goto out; +	master_dev = netdev_master_upper_dev_get((struct net_device *) dev); +	if (master_dev && master_dev->rtnl_link_ops) +		return true; +	return false; +} + +static int rtnl_link_slave_info_fill(struct sk_buff *skb, +				     const struct net_device *dev) +{ +	struct net_device *master_dev; +	const struct rtnl_link_ops *ops; +	struct nlattr *slave_data; +	int err; +	master_dev = netdev_master_upper_dev_get((struct net_device *) dev); +	if (!master_dev) +		return 0; +	ops = master_dev->rtnl_link_ops; +	if (!ops) +		return 0; +	if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0) +		return -EMSGSIZE; +	if (ops->fill_slave_info) { +		slave_data = nla_nest_start(skb, IFLA_INFO_SLAVE_DATA); +		if (!slave_data) +			return -EMSGSIZE; +		err = ops->fill_slave_info(skb, master_dev, dev); +		if (err < 0) +			goto err_cancel_slave_data; +		nla_nest_end(skb, slave_data); +	} +	return 0; + +err_cancel_slave_data: +	nla_nest_cancel(skb, slave_data); +	return err; +} + +static int rtnl_link_info_fill(struct sk_buff *skb, +			       const struct net_device *dev) +{ +	const struct rtnl_link_ops *ops = dev->rtnl_link_ops; +	struct nlattr *data; +	int err; + +	if (!ops) +		return 0;  	if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0) -		goto err_cancel_link; +		return -EMSGSIZE;  	if (ops->fill_xstats) {  		err = ops->fill_xstats(skb, dev);  		if (err < 0) -			goto err_cancel_link; +			return err;  	}  	if (ops->fill_info) {  		data = nla_nest_start(skb, IFLA_INFO_DATA);  		if (data == NULL) -			goto err_cancel_link; +			return -EMSGSIZE;  		err = ops->fill_info(skb, dev);  		if (err < 0)  			goto err_cancel_data;  		nla_nest_end(skb, data);  	} - -	nla_nest_end(skb, linkinfo);  	return 0;  err_cancel_data:  	nla_nest_cancel(skb, data); -err_cancel_link: -	nla_nest_cancel(skb, linkinfo); -out:  	return err;  } -static const int rtm_min[RTM_NR_FAMILIES] = +static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev)  { -	[RTM_FAM(RTM_NEWLINK)]      = NLMSG_LENGTH(sizeof(struct ifinfomsg)), -	[RTM_FAM(RTM_NEWADDR)]      = NLMSG_LENGTH(sizeof(struct ifaddrmsg)), -	[RTM_FAM(RTM_NEWROUTE)]     = NLMSG_LENGTH(sizeof(struct rtmsg)), -	[RTM_FAM(RTM_NEWRULE)]      = NLMSG_LENGTH(sizeof(struct fib_rule_hdr)), -	[RTM_FAM(RTM_NEWQDISC)]     = NLMSG_LENGTH(sizeof(struct tcmsg)), -	[RTM_FAM(RTM_NEWTCLASS)]    = NLMSG_LENGTH(sizeof(struct tcmsg)), -	[RTM_FAM(RTM_NEWTFILTER)]   = NLMSG_LENGTH(sizeof(struct tcmsg)), -	[RTM_FAM(RTM_NEWACTION)]    = NLMSG_LENGTH(sizeof(struct tcamsg)), -	[RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), -	[RTM_FAM(RTM_GETANYCAST)]   = NLMSG_LENGTH(sizeof(struct rtgenmsg)), -}; +	struct nlattr *linkinfo; +	int err = -EMSGSIZE; -static const int rta_max[RTM_NR_FAMILIES] = -{ -	
[RTM_FAM(RTM_NEWLINK)]      = IFLA_MAX, -	[RTM_FAM(RTM_NEWADDR)]      = IFA_MAX, -	[RTM_FAM(RTM_NEWROUTE)]     = RTA_MAX, -	[RTM_FAM(RTM_NEWRULE)]      = FRA_MAX, -	[RTM_FAM(RTM_NEWQDISC)]     = TCA_MAX, -	[RTM_FAM(RTM_NEWTCLASS)]    = TCA_MAX, -	[RTM_FAM(RTM_NEWTFILTER)]   = TCA_MAX, -	[RTM_FAM(RTM_NEWACTION)]    = TCAA_MAX, -}; +	linkinfo = nla_nest_start(skb, IFLA_LINKINFO); +	if (linkinfo == NULL) +		goto out; -void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) -{ -	struct rtattr *rta; -	int size = RTA_LENGTH(attrlen); +	err = rtnl_link_info_fill(skb, dev); +	if (err < 0) +		goto err_cancel_link; + +	err = rtnl_link_slave_info_fill(skb, dev); +	if (err < 0) +		goto err_cancel_link; + +	nla_nest_end(skb, linkinfo); +	return 0; -	rta = (struct rtattr *)skb_put(skb, RTA_ALIGN(size)); -	rta->rta_type = attrtype; -	rta->rta_len = size; -	memcpy(RTA_DATA(rta), data, attrlen); -	memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); +err_cancel_link: +	nla_nest_cancel(skb, linkinfo); +out: +	return err;  } -EXPORT_SYMBOL(__rta_fill); -int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo) +int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo)  {  	struct sock *rtnl = net->rtnl;  	int err = 0; @@ -583,7 +663,8 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)  	for (i = 0; i < RTAX_MAX; i++) {  		if (metrics[i]) {  			valid++; -			NLA_PUT_U32(skb, i+1, metrics[i]); +			if (nla_put_u32(skb, i+1, metrics[i])) +				goto nla_put_failure;  		}  	} @@ -601,21 +682,23 @@ nla_put_failure:  EXPORT_SYMBOL(rtnetlink_put_metrics);  int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, -		       u32 ts, u32 tsage, long expires, u32 error) +		       long expires, u32 error)  {  	struct rta_cacheinfo ci = { -		.rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse), +		.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse),  		.rta_used = dst->__use,  		.rta_clntref = atomic_read(&(dst->__refcnt)),  		.rta_error = error,  		.rta_id =  id, -		.rta_ts = ts, -		.rta_tsage = tsage,  	}; -	if (expires) -		ci.rta_expires = jiffies_to_clock_t(expires); +	if (expires) { +		unsigned long clock; +		clock = jiffies_to_clock_t(abs(expires)); +		clock = min_t(unsigned long, clock, INT_MAX); +		ci.rta_expires = (expires > 0) ? 
clock : -clock; +	}  	return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci);  }  EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); @@ -647,6 +730,12 @@ static void set_operstate(struct net_device *dev, unsigned char transition)  	}  } +static unsigned int rtnl_dev_get_flags(const struct net_device *dev) +{ +	return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) | +	       (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI)); +} +  static unsigned int rtnl_dev_combine_flags(const struct net_device *dev,  					   const struct ifinfomsg *ifm)  { @@ -655,7 +744,7 @@ static unsigned int rtnl_dev_combine_flags(const struct net_device *dev,  	/* bugwards compatibility: ifi_change == 0 is treated as ~0 */  	if (ifm->ifi_change)  		flags = (flags & ifm->ifi_change) | -			(dev->flags & ~ifm->ifi_change); +			(rtnl_dev_get_flags(dev) & ~ifm->ifi_change);  	return flags;  } @@ -698,23 +787,26 @@ static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)  }  /* All VF info */ -static inline int rtnl_vfinfo_size(const struct net_device *dev) +static inline int rtnl_vfinfo_size(const struct net_device *dev, +				   u32 ext_filter_mask)  { -	if (dev->dev.parent && dev_is_pci(dev->dev.parent)) { - +	if (dev->dev.parent && dev_is_pci(dev->dev.parent) && +	    (ext_filter_mask & RTEXT_FILTER_VF)) {  		int num_vfs = dev_num_vf(dev->dev.parent);  		size_t size = nla_total_size(sizeof(struct nlattr));  		size += nla_total_size(num_vfs * sizeof(struct nlattr));  		size += num_vfs *  			(nla_total_size(sizeof(struct ifla_vf_mac)) +  			 nla_total_size(sizeof(struct ifla_vf_vlan)) + -			 nla_total_size(sizeof(struct ifla_vf_tx_rate))); +			 nla_total_size(sizeof(struct ifla_vf_spoofchk)) + +			 nla_total_size(sizeof(struct ifla_vf_rate)));  		return size;  	} else  		return 0;  } -static size_t rtnl_port_size(const struct net_device *dev) +static size_t rtnl_port_size(const struct net_device *dev, +			     u32 ext_filter_mask)  {  	size_t port_size = nla_total_size(4)		/* PORT_VF */  		+ nla_total_size(PORT_PROFILE_MAX)	/* PORT_PROFILE */ @@ -730,7 +822,8 @@ static size_t rtnl_port_size(const struct net_device *dev)  	size_t port_self_size = nla_total_size(sizeof(struct nlattr))  		+ port_size; -	if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) +	if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || +	    !(ext_filter_mask & RTEXT_FILTER_VF))  		return 0;  	if (dev_num_vf(dev->dev.parent))  		return port_self_size + vf_ports_size + @@ -739,7 +832,8 @@ static size_t rtnl_port_size(const struct net_device *dev)  		return port_self_size;  } -static noinline size_t if_nlmsg_size(const struct net_device *dev) +static noinline size_t if_nlmsg_size(const struct net_device *dev, +				     u32 ext_filter_mask)  {  	return NLMSG_ALIGN(sizeof(struct ifinfomsg))  	       + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ @@ -755,13 +849,20 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev)  	       + nla_total_size(4) /* IFLA_MTU */  	       + nla_total_size(4) /* IFLA_LINK */  	       + nla_total_size(4) /* IFLA_MASTER */ +	       + nla_total_size(1) /* IFLA_CARRIER */ +	       + nla_total_size(4) /* IFLA_PROMISCUITY */ +	       + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ +	       + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */  	       + nla_total_size(1) /* IFLA_OPERSTATE */  	       + nla_total_size(1) /* IFLA_LINKMODE */ -	       + nla_total_size(4) /* IFLA_NUM_VF */ -	       + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */ -	       + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ +	      
 + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ +	       + nla_total_size(ext_filter_mask +			        & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ +	       + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ +	       + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */  	       + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ -	       + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */ +	       + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ +	       + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */  }  static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -779,7 +880,8 @@ static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)  		vf_port = nla_nest_start(skb, IFLA_VF_PORT);  		if (!vf_port)  			goto nla_put_failure; -		NLA_PUT_U32(skb, IFLA_PORT_VF, vf); +		if (nla_put_u32(skb, IFLA_PORT_VF, vf)) +			goto nla_put_failure;  		err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb);  		if (err == -EMSGSIZE)  			goto nla_put_failure; @@ -819,11 +921,13 @@ static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev)  	return 0;  } -static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) +static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, +			  u32 ext_filter_mask)  {  	int err; -	if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) +	if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || +	    !(ext_filter_mask & RTEXT_FILTER_VF))  		return 0;  	err = rtnl_port_self_fill(skb, dev); @@ -839,9 +943,27 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev)  	return 0;  } +static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) +{ +	int err; +	struct netdev_phys_port_id ppid; + +	err = dev_get_phys_port_id(dev, &ppid); +	if (err) { +		if (err == -EOPNOTSUPP) +			return 0; +		return err; +	} + +	if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id)) +		return -EMSGSIZE; + +	return 0; +} +  static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,  			    int type, u32 pid, u32 seq, u32 change, -			    unsigned int flags) +			    unsigned int flags, u32 ext_filter_mask)  {  	struct ifinfomsg *ifm;  	struct nlmsghdr *nlh; @@ -849,7 +971,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,  	const struct rtnl_link_stats64 *stats;  	struct nlattr *attr, *af_spec;  	struct rtnl_af_ops *af_ops; +	struct net_device *upper_dev = netdev_master_upper_dev_get(dev); +	ASSERT_RTNL();  	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);  	if (nlh == NULL)  		return -EMSGSIZE; @@ -862,24 +986,30 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,  	ifm->ifi_flags = dev_get_flags(dev);  	ifm->ifi_change = change; -	NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name); -	NLA_PUT_U32(skb, IFLA_TXQLEN, dev->tx_queue_len); -	NLA_PUT_U8(skb, IFLA_OPERSTATE, -		   netif_running(dev) ? 
dev->operstate : IF_OPER_DOWN); -	NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode); -	NLA_PUT_U32(skb, IFLA_MTU, dev->mtu); - -	if (dev->ifindex != dev->iflink) -		NLA_PUT_U32(skb, IFLA_LINK, dev->iflink); - -	if (dev->master) -		NLA_PUT_U32(skb, IFLA_MASTER, dev->master->ifindex); - -	if (dev->qdisc) -		NLA_PUT_STRING(skb, IFLA_QDISC, dev->qdisc->ops->id); - -	if (dev->ifalias) -		NLA_PUT_STRING(skb, IFLA_IFALIAS, dev->ifalias); +	if (nla_put_string(skb, IFLA_IFNAME, dev->name) || +	    nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || +	    nla_put_u8(skb, IFLA_OPERSTATE, +		       netif_running(dev) ? dev->operstate : IF_OPER_DOWN) || +	    nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) || +	    nla_put_u32(skb, IFLA_MTU, dev->mtu) || +	    nla_put_u32(skb, IFLA_GROUP, dev->group) || +	    nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || +	    nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || +#ifdef CONFIG_RPS +	    nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || +#endif +	    (dev->ifindex != dev->iflink && +	     nla_put_u32(skb, IFLA_LINK, dev->iflink)) || +	    (upper_dev && +	     nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex)) || +	    nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || +	    (dev->qdisc && +	     nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || +	    (dev->ifalias && +	     nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) || +	    nla_put_u32(skb, IFLA_CARRIER_CHANGES, +			atomic_read(&dev->carrier_changes))) +		goto nla_put_failure;  	if (1) {  		struct rtnl_link_ifmap map = { @@ -890,14 +1020,19 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,  			.dma         = dev->dma,  			.port        = dev->if_port,  		}; -		NLA_PUT(skb, IFLA_MAP, sizeof(map), &map); +		if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) +			goto nla_put_failure;  	}  	if (dev->addr_len) { -		NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); -		NLA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast); +		if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) || +		    nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast)) +			goto nla_put_failure;  	} +	if (rtnl_phys_port_id_fill(skb, dev)) +		goto nla_put_failure; +  	attr = nla_reserve(skb, IFLA_STATS,  			sizeof(struct rtnl_link_stats));  	if (attr == NULL) @@ -912,10 +1047,12 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,  		goto nla_put_failure;  	copy_rtnl_link_stats64(nla_data(attr), stats); -	if (dev->dev.parent) -		NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)); +	if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF) && +	    nla_put_u32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent))) +		goto nla_put_failure; -	if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) { +	if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent +	    && (ext_filter_mask & RTEXT_FILTER_VF)) {  		int i;  		struct nlattr *vfinfo, *vf; @@ -928,31 +1065,65 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,  			struct ifla_vf_info ivi;  			struct ifla_vf_mac vf_mac;  			struct ifla_vf_vlan vf_vlan; +			struct ifla_vf_rate vf_rate;  			struct ifla_vf_tx_rate vf_tx_rate; +			struct ifla_vf_spoofchk vf_spoofchk; +			struct ifla_vf_link_state vf_linkstate; + +			/* +			 * Not all SR-IOV capable drivers support the +			 * spoofcheck query.  Preset to -1 so the user +			 * space tool can detect that the driver didn't +			 * report anything. 
+			 */ +			ivi.spoofchk = -1; +			memset(ivi.mac, 0, sizeof(ivi.mac)); +			/* The default value for VF link state is "auto" +			 * IFLA_VF_LINK_STATE_AUTO which equals zero +			 */ +			ivi.linkstate = 0;  			if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi))  				break; -			vf_mac.vf = vf_vlan.vf = vf_tx_rate.vf = ivi.vf; +			vf_mac.vf = +				vf_vlan.vf = +				vf_rate.vf = +				vf_tx_rate.vf = +				vf_spoofchk.vf = +				vf_linkstate.vf = ivi.vf; +  			memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));  			vf_vlan.vlan = ivi.vlan;  			vf_vlan.qos = ivi.qos; -			vf_tx_rate.rate = ivi.tx_rate; +			vf_tx_rate.rate = ivi.max_tx_rate; +			vf_rate.min_tx_rate = ivi.min_tx_rate; +			vf_rate.max_tx_rate = ivi.max_tx_rate; +			vf_spoofchk.setting = ivi.spoofchk; +			vf_linkstate.link_state = ivi.linkstate;  			vf = nla_nest_start(skb, IFLA_VF_INFO);  			if (!vf) {  				nla_nest_cancel(skb, vfinfo);  				goto nla_put_failure;  			} -			NLA_PUT(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac); -			NLA_PUT(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan); -			NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate); +			if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || +			    nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || +			    nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), +				    &vf_rate) || +			    nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), +				    &vf_tx_rate) || +			    nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), +				    &vf_spoofchk) || +			    nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), +				    &vf_linkstate)) +				goto nla_put_failure;  			nla_nest_end(skb, vf);  		}  		nla_nest_end(skb, vfinfo);  	} -	if (rtnl_port_fill(skb, dev)) +	if (rtnl_port_fill(skb, dev, ext_filter_mask))  		goto nla_put_failure; -	if (dev->rtnl_link_ops) { +	if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) {  		if (rtnl_link_fill(skb, dev) < 0)  			goto nla_put_failure;  	} @@ -994,64 +1165,40 @@ nla_put_failure:  	return -EMSGSIZE;  } -static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) -{ -	struct net *net = sock_net(skb->sk); -	int h, s_h; -	int idx = 0, s_idx; -	struct net_device *dev; -	struct hlist_head *head; -	struct hlist_node *node; - -	s_h = cb->args[0]; -	s_idx = cb->args[1]; - -	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { -		idx = 0; -		head = &net->dev_index_head[h]; -		hlist_for_each_entry(dev, node, head, index_hlist) { -			if (idx < s_idx) -				goto cont; -			if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, -					     NETLINK_CB(cb->skb).pid, -					     cb->nlh->nlmsg_seq, 0, -					     NLM_F_MULTI) <= 0) -				goto out; -cont: -			idx++; -		} -	} -out: -	cb->args[1] = idx; -	cb->args[0] = h; - -	return skb->len; -} - -const struct nla_policy ifla_policy[IFLA_MAX+1] = { +static const struct nla_policy ifla_policy[IFLA_MAX+1] = {  	[IFLA_IFNAME]		= { .type = NLA_STRING, .len = IFNAMSIZ-1 },  	[IFLA_ADDRESS]		= { .type = NLA_BINARY, .len = MAX_ADDR_LEN },  	[IFLA_BROADCAST]	= { .type = NLA_BINARY, .len = MAX_ADDR_LEN },  	[IFLA_MAP]		= { .len = sizeof(struct rtnl_link_ifmap) },  	[IFLA_MTU]		= { .type = NLA_U32 },  	[IFLA_LINK]		= { .type = NLA_U32 }, +	[IFLA_MASTER]		= { .type = NLA_U32 }, +	[IFLA_CARRIER]		= { .type = NLA_U8 },  	[IFLA_TXQLEN]		= { .type = NLA_U32 },  	[IFLA_WEIGHT]		= { .type = NLA_U32 },  	[IFLA_OPERSTATE]	= { .type = NLA_U8 },  	[IFLA_LINKMODE]		= { .type = NLA_U8 },  	[IFLA_LINKINFO]		= { .type = NLA_NESTED },  	[IFLA_NET_NS_PID]	= { .type = NLA_U32 }, +	[IFLA_NET_NS_FD]	= { .type = NLA_U32 },  	
[IFLA_IFALIAS]	        = { .type = NLA_STRING, .len = IFALIASZ-1 },  	[IFLA_VFINFO_LIST]	= {. type = NLA_NESTED },  	[IFLA_VF_PORTS]		= { .type = NLA_NESTED },  	[IFLA_PORT_SELF]	= { .type = NLA_NESTED },  	[IFLA_AF_SPEC]		= { .type = NLA_NESTED }, +	[IFLA_EXT_MASK]		= { .type = NLA_U32 }, +	[IFLA_PROMISCUITY]	= { .type = NLA_U32 }, +	[IFLA_NUM_TX_QUEUES]	= { .type = NLA_U32 }, +	[IFLA_NUM_RX_QUEUES]	= { .type = NLA_U32 }, +	[IFLA_PHYS_PORT_ID]	= { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN }, +	[IFLA_CARRIER_CHANGES]	= { .type = NLA_U32 },  /* ignored */  }; -EXPORT_SYMBOL(ifla_policy);  static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {  	[IFLA_INFO_KIND]	= { .type = NLA_STRING },  	[IFLA_INFO_DATA]	= { .type = NLA_NESTED }, +	[IFLA_INFO_SLAVE_KIND]	= { .type = NLA_STRING }, +	[IFLA_INFO_SLAVE_DATA]	= { .type = NLA_NESTED },  };  static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = { @@ -1065,6 +1212,12 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {  				    .len = sizeof(struct ifla_vf_vlan) },  	[IFLA_VF_TX_RATE]	= { .type = NLA_BINARY,  				    .len = sizeof(struct ifla_vf_tx_rate) }, +	[IFLA_VF_SPOOFCHK]	= { .type = NLA_BINARY, +				    .len = sizeof(struct ifla_vf_spoofchk) }, +	[IFLA_VF_RATE]		= { .type = NLA_BINARY, +				    .len = sizeof(struct ifla_vf_rate) }, +	[IFLA_VF_LINK_STATE]	= { .type = NLA_BINARY, +				    .len = sizeof(struct ifla_vf_link_state) },  };  static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { @@ -1081,6 +1234,78 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {  	[IFLA_PORT_RESPONSE]	= { .type = NLA_U16, },  }; +static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ +	struct net *net = sock_net(skb->sk); +	int h, s_h; +	int idx = 0, s_idx; +	struct net_device *dev; +	struct hlist_head *head; +	struct nlattr *tb[IFLA_MAX+1]; +	u32 ext_filter_mask = 0; +	int err; +	int hdrlen; + +	s_h = cb->args[0]; +	s_idx = cb->args[1]; + +	rcu_read_lock(); +	cb->seq = net->dev_base_seq; + +	/* A hack to preserve kernel<->userspace interface. +	 * The correct header is ifinfomsg. It is consistent with rtnl_getlink. +	 * However, before Linux v3.9 the code here assumed rtgenmsg and that's +	 * what iproute2 < v3.9.0 used. +	 * We can detect the old iproute2. Even including the IFLA_EXT_MASK +	 * attribute, its netlink message is shorter than struct ifinfomsg. +	 */ +	hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ? 
+		 sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); + +	if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) { + +		if (tb[IFLA_EXT_MASK]) +			ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); +	} + +	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { +		idx = 0; +		head = &net->dev_index_head[h]; +		hlist_for_each_entry_rcu(dev, head, index_hlist) { +			if (idx < s_idx) +				goto cont; +			err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, +					       NETLINK_CB(cb->skb).portid, +					       cb->nlh->nlmsg_seq, 0, +					       NLM_F_MULTI, +					       ext_filter_mask); +			/* If we ran out of room on the first message, +			 * we're in trouble +			 */ +			WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); + +			if (err <= 0) +				goto out; + +			nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: +			idx++; +		} +	} +out: +	rcu_read_unlock(); +	cb->args[1] = idx; +	cb->args[0] = h; + +	return skb->len; +} + +int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len) +{ +	return nla_parse(tb, IFLA_MAX, head, len, ifla_policy); +} +EXPORT_SYMBOL(rtnl_nla_parse_ifla); +  struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])  {  	struct net *net; @@ -1089,6 +1314,8 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])  	 */  	if (tb[IFLA_NET_NS_PID])  		net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); +	else if (tb[IFLA_NET_NS_FD]) +		net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));  	else  		net = get_net(src_net);  	return net; @@ -1121,8 +1348,7 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])  				return -EOPNOTSUPP;  			if (af_ops->validate_link_af) { -				err = af_ops->validate_link_af(dev, -							tb[IFLA_AF_SPEC]); +				err = af_ops->validate_link_af(dev, af);  				if (err < 0)  					return err;  			} @@ -1161,11 +1387,47 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)  		}  		case IFLA_VF_TX_RATE: {  			struct ifla_vf_tx_rate *ivt; +			struct ifla_vf_info ivf; +			ivt = nla_data(vf); +			err = -EOPNOTSUPP; +			if (ops->ndo_get_vf_config) +				err = ops->ndo_get_vf_config(dev, ivt->vf, +							     &ivf); +			if (err) +				break; +			err = -EOPNOTSUPP; +			if (ops->ndo_set_vf_rate) +				err = ops->ndo_set_vf_rate(dev, ivt->vf, +							   ivf.min_tx_rate, +							   ivt->rate); +			break; +		} +		case IFLA_VF_RATE: { +			struct ifla_vf_rate *ivt;  			ivt = nla_data(vf);  			err = -EOPNOTSUPP; -			if (ops->ndo_set_vf_tx_rate) -				err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, -							      ivt->rate); +			if (ops->ndo_set_vf_rate) +				err = ops->ndo_set_vf_rate(dev, ivt->vf, +							   ivt->min_tx_rate, +							   ivt->max_tx_rate); +			break; +		} +		case IFLA_VF_SPOOFCHK: { +			struct ifla_vf_spoofchk *ivs; +			ivs = nla_data(vf); +			err = -EOPNOTSUPP; +			if (ops->ndo_set_vf_spoofchk) +				err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, +							       ivs->setting); +			break; +		} +		case IFLA_VF_LINK_STATE: { +			struct ifla_vf_link_state *ivl; +			ivl = nla_data(vf); +			err = -EOPNOTSUPP; +			if (ops->ndo_set_vf_link_state) +				err = ops->ndo_set_vf_link_state(dev, ivl->vf, +								 ivl->link_state);  			break;  		}  		default: @@ -1178,19 +1440,58 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)  	return err;  } -static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, +static int do_set_master(struct net_device *dev, int ifindex) +{ +	struct net_device *upper_dev = netdev_master_upper_dev_get(dev); +	
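
/*
 * Illustrative userspace sketch (not part of the kernel sources) of the
 * "new style" RTM_GETLINK dump request that the header-length heuristic in
 * rtnl_dump_ifinfo()/rtnl_calcit() is written to accept: a struct ifinfomsg
 * header rather than the legacy rtgenmsg, followed by an IFLA_EXT_MASK
 * attribute opting in to the extended (e.g. VF) information. The function
 * name is made up; 'fd' is assumed to be a bound NETLINK_ROUTE socket.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_link.h>

static int request_link_dump(int fd)
{
	struct {
		struct nlmsghdr nlh;
		struct ifinfomsg ifm;
		struct rtattr ext_req;
		__u32 ext_filter_mask;
	} req;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = sizeof(req);
	req.nlh.nlmsg_type = RTM_GETLINK;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	req.ifm.ifi_family = AF_UNSPEC;

	req.ext_req.rta_type = IFLA_EXT_MASK;
	req.ext_req.rta_len = RTA_LENGTH(sizeof(__u32));
	req.ext_filter_mask = RTEXT_FILTER_VF;	/* ask for IFLA_VFINFO_LIST */

	return send(fd, &req, sizeof(req), 0) == sizeof(req) ? 0 : -1;
}
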
const struct net_device_ops *ops; +	int err; + +	if (upper_dev) { +		if (upper_dev->ifindex == ifindex) +			return 0; +		ops = upper_dev->netdev_ops; +		if (ops->ndo_del_slave) { +			err = ops->ndo_del_slave(upper_dev, dev); +			if (err) +				return err; +		} else { +			return -EOPNOTSUPP; +		} +	} + +	if (ifindex) { +		upper_dev = __dev_get_by_index(dev_net(dev), ifindex); +		if (!upper_dev) +			return -EINVAL; +		ops = upper_dev->netdev_ops; +		if (ops->ndo_add_slave) { +			err = ops->ndo_add_slave(upper_dev, dev); +			if (err) +				return err; +		} else { +			return -EOPNOTSUPP; +		} +	} +	return 0; +} + +static int do_setlink(const struct sk_buff *skb, +		      struct net_device *dev, struct ifinfomsg *ifm,  		      struct nlattr **tb, char *ifname, int modified)  {  	const struct net_device_ops *ops = dev->netdev_ops; -	int send_addr_notify = 0;  	int err; -	if (tb[IFLA_NET_NS_PID]) { +	if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) {  		struct net *net = rtnl_link_get_net(dev_net(dev), tb);  		if (IS_ERR(net)) {  			err = PTR_ERR(net);  			goto errout;  		} +		if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { +			err = -EPERM; +			goto errout; +		}  		err = dev_change_net_namespace(dev, net, ifname);  		put_net(net);  		if (err) @@ -1231,16 +1532,6 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,  		struct sockaddr *sa;  		int len; -		if (!ops->ndo_set_mac_address) { -			err = -EOPNOTSUPP; -			goto errout; -		} - -		if (!netif_device_present(dev)) { -			err = -ENODEV; -			goto errout; -		} -  		len = sizeof(sa_family_t) + dev->addr_len;  		sa = kmalloc(len, GFP_KERNEL);  		if (!sa) { @@ -1250,11 +1541,10 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,  		sa->sa_family = dev->type;  		memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),  		       dev->addr_len); -		err = ops->ndo_set_mac_address(dev, sa); +		err = dev_set_mac_address(dev, sa);  		kfree(sa);  		if (err)  			goto errout; -		send_addr_notify = 1;  		modified = 1;  	} @@ -1265,6 +1555,11 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,  		modified = 1;  	} +	if (tb[IFLA_GROUP]) { +		dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); +		modified = 1; +	} +  	/*  	 * Interface selected by interface index but interface  	 * name provided implies that a name change has been @@ -1287,7 +1582,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,  	if (tb[IFLA_BROADCAST]) {  		nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len); -		send_addr_notify = 1; +		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);  	}  	if (ifm->ifi_flags || ifm->ifi_change) { @@ -1296,6 +1591,20 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,  			goto errout;  	} +	if (tb[IFLA_MASTER]) { +		err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); +		if (err) +			goto errout; +		modified = 1; +	} + +	if (tb[IFLA_CARRIER]) { +		err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); +		if (err) +			goto errout; +		modified = 1; +	} +  	if (tb[IFLA_TXQLEN])  		dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); @@ -1390,18 +1699,14 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,  	err = 0;  errout: -	if (err < 0 && modified && net_ratelimit()) -		printk(KERN_WARNING "A link change request failed with " -		       "some changes comitted already. 
Interface %s may " -		       "have been left with an inconsistent configuration, " -		       "please check.\n", dev->name); +	if (err < 0 && modified) +		net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", +				     dev->name); -	if (send_addr_notify) -		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);  	return err;  } -static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh)  {  	struct net *net = sock_net(skb->sk);  	struct ifinfomsg *ifm; @@ -1437,12 +1742,12 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)  	if (err < 0)  		goto errout; -	err = do_setlink(dev, ifm, tb, ifname, 0); +	err = do_setlink(skb, dev, ifm, tb, ifname, 0);  errout:  	return err;  } -static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)  {  	struct net *net = sock_net(skb->sk);  	const struct rtnl_link_ops *ops; @@ -1451,6 +1756,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)  	char ifname[IFNAMSIZ];  	struct nlattr *tb[IFLA_MAX+1];  	int err; +	LIST_HEAD(list_kill);  	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);  	if (err < 0) @@ -1474,7 +1780,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)  	if (!ops)  		return -EOPNOTSUPP; -	ops->dellink(dev, NULL); +	ops->dellink(dev, &list_kill); +	unregister_netdevice_many(&list_kill);  	return 0;  } @@ -1491,48 +1798,47 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)  	}  	dev->rtnl_link_state = RTNL_LINK_INITIALIZED; -	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); -	__dev_notify_flags(dev, old_flags); +	__dev_notify_flags(dev, old_flags, ~0U);  	return 0;  }  EXPORT_SYMBOL(rtnl_configure_link); -struct net_device *rtnl_create_link(struct net *src_net, struct net *net, +struct net_device *rtnl_create_link(struct net *net,  	char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[])  {  	int err;  	struct net_device *dev; -	unsigned int num_queues = 1; -	unsigned int real_num_queues = 1; +	unsigned int num_tx_queues = 1; +	unsigned int num_rx_queues = 1; + +	if (tb[IFLA_NUM_TX_QUEUES]) +		num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]); +	else if (ops->get_num_tx_queues) +		num_tx_queues = ops->get_num_tx_queues(); + +	if (tb[IFLA_NUM_RX_QUEUES]) +		num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]); +	else if (ops->get_num_rx_queues) +		num_rx_queues = ops->get_num_rx_queues(); -	if (ops->get_tx_queues) { -		err = ops->get_tx_queues(src_net, tb, &num_queues, -					 &real_num_queues); -		if (err) -			goto err; -	}  	err = -ENOMEM; -	dev = alloc_netdev_mq(ops->priv_size, ifname, ops->setup, num_queues); +	dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup, +			       num_tx_queues, num_rx_queues);  	if (!dev)  		goto err;  	dev_net_set(dev, net);  	dev->rtnl_link_ops = ops;  	dev->rtnl_link_state = RTNL_LINK_INITIALIZING; -	dev->real_num_tx_queues = real_num_queues; - -	if (strchr(dev->name, '%')) { -		err = dev_alloc_name(dev, dev->name); -		if (err < 0) -			goto err_free; -	}  	if (tb[IFLA_MTU])  		dev->mtu = nla_get_u32(tb[IFLA_MTU]); -	if (tb[IFLA_ADDRESS]) +	if (tb[IFLA_ADDRESS]) {  		memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]),  				nla_len(tb[IFLA_ADDRESS])); +		dev->addr_assign_type = NET_ADDR_SET; +	
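
/*
 * Hedged userspace sketch of a request that exercises the IFLA_MASTER
 * handling added above (do_setlink() -> do_set_master()): RTM_SETLINK on an
 * interface with IFLA_MASTER set to the master's ifindex enslaves it, and
 * IFLA_MASTER == 0 releases it. Attribute packing is done by hand to stay
 * self-contained; the function name is illustrative and 'fd' is assumed to
 * be a NETLINK_ROUTE socket.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static int set_master(int fd, int ifindex, int master_ifindex)
{
	struct {
		struct nlmsghdr nlh;
		struct ifinfomsg ifm;
		char attrbuf[RTA_SPACE(sizeof(int))];
	} req;
	struct rtattr *rta;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
	req.nlh.nlmsg_type = RTM_SETLINK;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	req.ifm.ifi_family = AF_UNSPEC;
	req.ifm.ifi_index = ifindex;

	/* append IFLA_MASTER; master_ifindex == 0 releases the slave */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = IFLA_MASTER;
	rta->rta_len = RTA_LENGTH(sizeof(int));
	memcpy(RTA_DATA(rta), &master_ifindex, sizeof(int));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_ALIGN(rta->rta_len);

	return send(fd, &req, req.nlh.nlmsg_len, 0) < 0 ? -1 : 0;
}
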
}  	if (tb[IFLA_BROADCAST])  		memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]),  				nla_len(tb[IFLA_BROADCAST])); @@ -1542,21 +1848,42 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net,  		set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));  	if (tb[IFLA_LINKMODE])  		dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); +	if (tb[IFLA_GROUP]) +		dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));  	return dev; -err_free: -	free_netdev(dev);  err:  	return ERR_PTR(err);  }  EXPORT_SYMBOL(rtnl_create_link); -static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int rtnl_group_changelink(const struct sk_buff *skb, +		struct net *net, int group, +		struct ifinfomsg *ifm, +		struct nlattr **tb) +{ +	struct net_device *dev; +	int err; + +	for_each_netdev(net, dev) { +		if (dev->group == group) { +			err = do_setlink(skb, dev, ifm, tb, NULL, 0); +			if (err < 0) +				return err; +		} +	} + +	return 0; +} + +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh)  {  	struct net *net = sock_net(skb->sk);  	const struct rtnl_link_ops *ops; +	const struct rtnl_link_ops *m_ops = NULL;  	struct net_device *dev; +	struct net_device *master_dev = NULL;  	struct ifinfomsg *ifm;  	char kind[MODULE_NAME_LEN];  	char ifname[IFNAMSIZ]; @@ -1579,10 +1906,18 @@ replay:  	ifm = nlmsg_data(nlh);  	if (ifm->ifi_index > 0)  		dev = __dev_get_by_index(net, ifm->ifi_index); -	else if (ifname[0]) -		dev = __dev_get_by_name(net, ifname); -	else -		dev = NULL; +	else { +		if (ifname[0]) +			dev = __dev_get_by_name(net, ifname); +		else +			dev = NULL; +	} + +	if (dev) { +		master_dev = netdev_master_upper_dev_get(dev); +		if (master_dev) +			m_ops = master_dev->rtnl_link_ops; +	}  	err = validate_linkmsg(dev, tb);  	if (err < 0) @@ -1605,7 +1940,10 @@ replay:  	}  	if (1) { -		struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL; +		struct nlattr *attr[ops ? ops->maxtype + 1 : 0]; +		struct nlattr *slave_attr[m_ops ? 
m_ops->slave_maxtype + 1 : 0]; +		struct nlattr **data = NULL; +		struct nlattr **slave_data = NULL;  		struct net *dest_net;  		if (ops) { @@ -1624,6 +1962,24 @@ replay:  			}  		} +		if (m_ops) { +			if (m_ops->slave_maxtype && +			    linkinfo[IFLA_INFO_SLAVE_DATA]) { +				err = nla_parse_nested(slave_attr, +						       m_ops->slave_maxtype, +						       linkinfo[IFLA_INFO_SLAVE_DATA], +						       m_ops->slave_policy); +				if (err < 0) +					return err; +				slave_data = slave_attr; +			} +			if (m_ops->slave_validate) { +				err = m_ops->slave_validate(tb, slave_data); +				if (err < 0) +					return err; +			} +		} +  		if (dev) {  			int modified = 0; @@ -1643,14 +1999,28 @@ replay:  				modified = 1;  			} -			return do_setlink(dev, ifm, tb, ifname, modified); +			if (linkinfo[IFLA_INFO_SLAVE_DATA]) { +				if (!m_ops || !m_ops->slave_changelink) +					return -EOPNOTSUPP; + +				err = m_ops->slave_changelink(master_dev, dev, +							      tb, slave_data); +				if (err < 0) +					return err; +				modified = 1; +			} + +			return do_setlink(skb, dev, ifm, tb, ifname, modified);  		} -		if (!(nlh->nlmsg_flags & NLM_F_CREATE)) +		if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { +			if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) +				return rtnl_group_changelink(skb, net, +						nla_get_u32(tb[IFLA_GROUP]), +						ifm, tb);  			return -ENODEV; +		} -		if (ifm->ifi_index) -			return -EOPNOTSUPP;  		if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO])  			return -EOPNOTSUPP; @@ -1672,20 +2042,36 @@ replay:  			snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);  		dest_net = rtnl_link_get_net(net, tb); -		dev = rtnl_create_link(net, dest_net, ifname, ops, tb); +		if (IS_ERR(dest_net)) +			return PTR_ERR(dest_net); -		if (IS_ERR(dev)) +		dev = rtnl_create_link(dest_net, ifname, ops, tb); +		if (IS_ERR(dev)) {  			err = PTR_ERR(dev); -		else if (ops->newlink) -			err = ops->newlink(net, dev, tb, data); -		else -			err = register_netdevice(dev); - -		if (err < 0 && !IS_ERR(dev)) -			free_netdev(dev); -		if (err < 0)  			goto out; +		} +		dev->ifindex = ifm->ifi_index; + +		if (ops->newlink) { +			err = ops->newlink(net, dev, tb, data); +			/* Drivers should call free_netdev() in ->destructor +			 * and unregister it on failure after registration +			 * so that device could be finally freed in rtnl_unlock. 
+			 */ +			if (err < 0) { +				/* If device is not registered at all, free it now */ +				if (dev->reg_state == NETREG_UNINITIALIZED) +					free_netdev(dev); +				goto out; +			} +		} else { +			err = register_netdevice(dev); +			if (err < 0) { +				free_netdev(dev); +				goto out; +			} +		}  		err = rtnl_configure_link(dev, ifm);  		if (err < 0)  			unregister_netdevice(dev); @@ -1695,7 +2081,7 @@ out:  	}  } -static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh)  {  	struct net *net = sock_net(skb->sk);  	struct ifinfomsg *ifm; @@ -1704,6 +2090,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)  	struct net_device *dev = NULL;  	struct sk_buff *nskb;  	int err; +	u32 ext_filter_mask = 0;  	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);  	if (err < 0) @@ -1712,6 +2099,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)  	if (tb[IFLA_IFNAME])  		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); +	if (tb[IFLA_EXT_MASK]) +		ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); +  	ifm = nlmsg_data(nlh);  	if (ifm->ifi_index > 0)  		dev = __dev_get_by_index(net, ifm->ifi_index); @@ -1723,22 +2113,55 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)  	if (dev == NULL)  		return -ENODEV; -	nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); +	nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL);  	if (nskb == NULL)  		return -ENOBUFS; -	err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid, -			       nlh->nlmsg_seq, 0, 0); +	err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, +			       nlh->nlmsg_seq, 0, 0, ext_filter_mask);  	if (err < 0) {  		/* -EMSGSIZE implies BUG in if_nlmsg_size */  		WARN_ON(err == -EMSGSIZE);  		kfree_skb(nskb);  	} else -		err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid); +		err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);  	return err;  } +static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) +{ +	struct net *net = sock_net(skb->sk); +	struct net_device *dev; +	struct nlattr *tb[IFLA_MAX+1]; +	u32 ext_filter_mask = 0; +	u16 min_ifinfo_dump_size = 0; +	int hdrlen; + +	/* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */ +	hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? +		 sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); + +	if (nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) { +		if (tb[IFLA_EXT_MASK]) +			ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); +	} + +	if (!ext_filter_mask) +		return NLMSG_GOODSIZE; +	/* +	 * traverse the list of net devices and compute the minimum +	 * buffer size based upon the filter mask. 
+	 */ +	list_for_each_entry(dev, &net->dev_base_head, dev_list) { +		min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size, +					     if_nlmsg_size(dev, +						           ext_filter_mask)); +	} + +	return min_ifinfo_dump_size; +} +  static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)  {  	int idx; @@ -1753,8 +2176,11 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)  		if (rtnl_msg_handlers[idx] == NULL ||  		    rtnl_msg_handlers[idx][type].dumpit == NULL)  			continue; -		if (idx > s_idx) +		if (idx > s_idx) {  			memset(&cb->args[0], 0, sizeof(cb->args)); +			cb->prev_seq = 0; +			cb->seq = 0; +		}  		if (rtnl_msg_handlers[idx][type].dumpit(skb, cb))  			break;  	} @@ -1763,33 +2189,653 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)  	return skb->len;  } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, +		  gfp_t flags)  {  	struct net *net = dev_net(dev);  	struct sk_buff *skb;  	int err = -ENOBUFS; +	size_t if_info_size; -	skb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); +	skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), flags);  	if (skb == NULL)  		goto errout; -	err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0); +	err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0);  	if (err < 0) {  		/* -EMSGSIZE implies BUG in if_nlmsg_size() */  		WARN_ON(err == -EMSGSIZE);  		kfree_skb(skb);  		goto errout;  	} -	rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); +	rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);  	return;  errout:  	if (err < 0)  		rtnl_set_sk_err(net, RTNLGRP_LINK, err);  } +EXPORT_SYMBOL(rtmsg_ifinfo); + +static int nlmsg_populate_fdb_fill(struct sk_buff *skb, +				   struct net_device *dev, +				   u8 *addr, u32 pid, u32 seq, +				   int type, unsigned int flags, +				   int nlflags) +{ +	struct nlmsghdr *nlh; +	struct ndmsg *ndm; + +	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags); +	if (!nlh) +		return -EMSGSIZE; + +	ndm = nlmsg_data(nlh); +	ndm->ndm_family  = AF_BRIDGE; +	ndm->ndm_pad1	 = 0; +	ndm->ndm_pad2    = 0; +	ndm->ndm_flags	 = flags; +	ndm->ndm_type	 = 0; +	ndm->ndm_ifindex = dev->ifindex; +	ndm->ndm_state   = NUD_PERMANENT; + +	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr)) +		goto nla_put_failure; + +	return nlmsg_end(skb, nlh); + +nla_put_failure: +	nlmsg_cancel(skb, nlh); +	return -EMSGSIZE; +} + +static inline size_t rtnl_fdb_nlmsg_size(void) +{ +	return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN); +} + +static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, int type) +{ +	struct net *net = dev_net(dev); +	struct sk_buff *skb; +	int err = -ENOBUFS; + +	skb = nlmsg_new(rtnl_fdb_nlmsg_size(), GFP_ATOMIC); +	if (!skb) +		goto errout; + +	err = nlmsg_populate_fdb_fill(skb, dev, addr, 0, 0, type, NTF_SELF, 0); +	if (err < 0) { +		kfree_skb(skb); +		goto errout; +	} + +	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); +	return; +errout: +	rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); +} + +/** + * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry + */ +int ndo_dflt_fdb_add(struct ndmsg *ndm, +		     struct nlattr *tb[], +		     struct net_device *dev, +		     const unsigned char *addr, +		     u16 flags) +{ +	int err = -EINVAL; + +	/* If aging addresses are supported device will need to +	 * implement its own handler for this. 
+	 */ +	if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { +		pr_info("%s: FDB only supports static addresses\n", dev->name); +		return err; +	} + +	if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) +		err = dev_uc_add_excl(dev, addr); +	else if (is_multicast_ether_addr(addr)) +		err = dev_mc_add_excl(dev, addr); + +	/* Only return duplicate errors if NLM_F_EXCL is set */ +	if (err == -EEXIST && !(flags & NLM_F_EXCL)) +		err = 0; + +	return err; +} +EXPORT_SYMBOL(ndo_dflt_fdb_add); + +static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) +{ +	struct net *net = sock_net(skb->sk); +	struct ndmsg *ndm; +	struct nlattr *tb[NDA_MAX+1]; +	struct net_device *dev; +	u8 *addr; +	int err; + +	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); +	if (err < 0) +		return err; + +	ndm = nlmsg_data(nlh); +	if (ndm->ndm_ifindex == 0) { +		pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ifindex\n"); +		return -EINVAL; +	} + +	dev = __dev_get_by_index(net, ndm->ndm_ifindex); +	if (dev == NULL) { +		pr_info("PF_BRIDGE: RTM_NEWNEIGH with unknown ifindex\n"); +		return -ENODEV; +	} + +	if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { +		pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid address\n"); +		return -EINVAL; +	} -/* Protected by RTNL sempahore.  */ -static struct rtattr **rta_buf; -static int rtattr_max; +	addr = nla_data(tb[NDA_LLADDR]); + +	err = -EOPNOTSUPP; + +	/* Support fdb on master device the net/bridge default case */ +	if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && +	    (dev->priv_flags & IFF_BRIDGE_PORT)) { +		struct net_device *br_dev = netdev_master_upper_dev_get(dev); +		const struct net_device_ops *ops = br_dev->netdev_ops; + +		err = ops->ndo_fdb_add(ndm, tb, dev, addr, nlh->nlmsg_flags); +		if (err) +			goto out; +		else +			ndm->ndm_flags &= ~NTF_MASTER; +	} + +	/* Embedded bridge, macvlan, and any other device support */ +	if ((ndm->ndm_flags & NTF_SELF)) { +		if (dev->netdev_ops->ndo_fdb_add) +			err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, +							   nlh->nlmsg_flags); +		else +			err = ndo_dflt_fdb_add(ndm, tb, dev, addr, +					       nlh->nlmsg_flags); + +		if (!err) { +			rtnl_fdb_notify(dev, addr, RTM_NEWNEIGH); +			ndm->ndm_flags &= ~NTF_SELF; +		} +	} +out: +	return err; +} + +/** + * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry + */ +int ndo_dflt_fdb_del(struct ndmsg *ndm, +		     struct nlattr *tb[], +		     struct net_device *dev, +		     const unsigned char *addr) +{ +	int err = -EOPNOTSUPP; + +	/* If aging addresses are supported device will need to +	 * implement its own handler for this. 
+	 */ +	if (!(ndm->ndm_state & NUD_PERMANENT)) { +		pr_info("%s: FDB only supports static addresses\n", dev->name); +		return -EINVAL; +	} + +	if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) +		err = dev_uc_del(dev, addr); +	else if (is_multicast_ether_addr(addr)) +		err = dev_mc_del(dev, addr); +	else +		err = -EINVAL; + +	return err; +} +EXPORT_SYMBOL(ndo_dflt_fdb_del); + +static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) +{ +	struct net *net = sock_net(skb->sk); +	struct ndmsg *ndm; +	struct nlattr *tb[NDA_MAX+1]; +	struct net_device *dev; +	int err = -EINVAL; +	__u8 *addr; + +	if (!netlink_capable(skb, CAP_NET_ADMIN)) +		return -EPERM; + +	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); +	if (err < 0) +		return err; + +	ndm = nlmsg_data(nlh); +	if (ndm->ndm_ifindex == 0) { +		pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid ifindex\n"); +		return -EINVAL; +	} + +	dev = __dev_get_by_index(net, ndm->ndm_ifindex); +	if (dev == NULL) { +		pr_info("PF_BRIDGE: RTM_DELNEIGH with unknown ifindex\n"); +		return -ENODEV; +	} + +	if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { +		pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid address\n"); +		return -EINVAL; +	} + +	addr = nla_data(tb[NDA_LLADDR]); + +	err = -EOPNOTSUPP; + +	/* Support fdb on master device the net/bridge default case */ +	if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && +	    (dev->priv_flags & IFF_BRIDGE_PORT)) { +		struct net_device *br_dev = netdev_master_upper_dev_get(dev); +		const struct net_device_ops *ops = br_dev->netdev_ops; + +		if (ops->ndo_fdb_del) +			err = ops->ndo_fdb_del(ndm, tb, dev, addr); + +		if (err) +			goto out; +		else +			ndm->ndm_flags &= ~NTF_MASTER; +	} + +	/* Embedded bridge, macvlan, and any other device support */ +	if (ndm->ndm_flags & NTF_SELF) { +		if (dev->netdev_ops->ndo_fdb_del) +			err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr); +		else +			err = ndo_dflt_fdb_del(ndm, tb, dev, addr); + +		if (!err) { +			rtnl_fdb_notify(dev, addr, RTM_DELNEIGH); +			ndm->ndm_flags &= ~NTF_SELF; +		} +	} +out: +	return err; +} + +static int nlmsg_populate_fdb(struct sk_buff *skb, +			      struct netlink_callback *cb, +			      struct net_device *dev, +			      int *idx, +			      struct netdev_hw_addr_list *list) +{ +	struct netdev_hw_addr *ha; +	int err; +	u32 portid, seq; + +	portid = NETLINK_CB(cb->skb).portid; +	seq = cb->nlh->nlmsg_seq; + +	list_for_each_entry(ha, &list->list, list) { +		if (*idx < cb->args[0]) +			goto skip; + +		err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, +					      portid, seq, +					      RTM_NEWNEIGH, NTF_SELF, +					      NLM_F_MULTI); +		if (err < 0) +			return err; +skip: +		*idx += 1; +	} +	return 0; +} + +/** + * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table. + * @nlh: netlink message header + * @dev: netdevice + * + * Default netdevice operation to dump the existing unicast address list. + * Returns number of addresses from list put in skb. 
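+ * Entries whose running index is below cb->args[0] are skipped, so the
+ * value handed back through @idx doubles as the resume point for
+ * multi-part dumps (rtnl_fdb_dump() stores it back into cb->args[0]).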
+ */ +int ndo_dflt_fdb_dump(struct sk_buff *skb, +		      struct netlink_callback *cb, +		      struct net_device *dev, +		      int idx) +{ +	int err; + +	netif_addr_lock_bh(dev); +	err = nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->uc); +	if (err) +		goto out; +	nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc); +out: +	netif_addr_unlock_bh(dev); +	return idx; +} +EXPORT_SYMBOL(ndo_dflt_fdb_dump); + +static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ +	int idx = 0; +	struct net *net = sock_net(skb->sk); +	struct net_device *dev; + +	rcu_read_lock(); +	for_each_netdev_rcu(net, dev) { +		if (dev->priv_flags & IFF_BRIDGE_PORT) { +			struct net_device *br_dev; +			const struct net_device_ops *ops; + +			br_dev = netdev_master_upper_dev_get(dev); +			ops = br_dev->netdev_ops; +			if (ops->ndo_fdb_dump) +				idx = ops->ndo_fdb_dump(skb, cb, dev, idx); +		} + +		if (dev->netdev_ops->ndo_fdb_dump) +			idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, idx); +		else +			idx = ndo_dflt_fdb_dump(skb, cb, dev, idx); +	} +	rcu_read_unlock(); + +	cb->args[0] = idx; +	return skb->len; +} + +int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, +			    struct net_device *dev, u16 mode) +{ +	struct nlmsghdr *nlh; +	struct ifinfomsg *ifm; +	struct nlattr *br_afspec; +	u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; +	struct net_device *br_dev = netdev_master_upper_dev_get(dev); + +	nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), NLM_F_MULTI); +	if (nlh == NULL) +		return -EMSGSIZE; + +	ifm = nlmsg_data(nlh); +	ifm->ifi_family = AF_BRIDGE; +	ifm->__ifi_pad = 0; +	ifm->ifi_type = dev->type; +	ifm->ifi_index = dev->ifindex; +	ifm->ifi_flags = dev_get_flags(dev); +	ifm->ifi_change = 0; + + +	if (nla_put_string(skb, IFLA_IFNAME, dev->name) || +	    nla_put_u32(skb, IFLA_MTU, dev->mtu) || +	    nla_put_u8(skb, IFLA_OPERSTATE, operstate) || +	    (br_dev && +	     nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) || +	    (dev->addr_len && +	     nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || +	    (dev->ifindex != dev->iflink && +	     nla_put_u32(skb, IFLA_LINK, dev->iflink))) +		goto nla_put_failure; + +	br_afspec = nla_nest_start(skb, IFLA_AF_SPEC); +	if (!br_afspec) +		goto nla_put_failure; + +	if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF) || +	    nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { +		nla_nest_cancel(skb, br_afspec); +		goto nla_put_failure; +	} +	nla_nest_end(skb, br_afspec); + +	return nlmsg_end(skb, nlh); +nla_put_failure: +	nlmsg_cancel(skb, nlh); +	return -EMSGSIZE; +} +EXPORT_SYMBOL(ndo_dflt_bridge_getlink); + +static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) +{ +	struct net *net = sock_net(skb->sk); +	struct net_device *dev; +	int idx = 0; +	u32 portid = NETLINK_CB(cb->skb).portid; +	u32 seq = cb->nlh->nlmsg_seq; +	struct nlattr *extfilt; +	u32 filter_mask = 0; + +	extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg), +				  IFLA_EXT_MASK); +	if (extfilt) +		filter_mask = nla_get_u32(extfilt); + +	rcu_read_lock(); +	for_each_netdev_rcu(net, dev) { +		const struct net_device_ops *ops = dev->netdev_ops; +		struct net_device *br_dev = netdev_master_upper_dev_get(dev); + +		if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { +			if (idx >= cb->args[0] && +			    br_dev->netdev_ops->ndo_bridge_getlink( +				    skb, portid, seq, dev, filter_mask) < 0) +				break; +			idx++; +		} + +		if (ops->ndo_bridge_getlink) { +			if (idx >= cb->args[0] && +			    
ops->ndo_bridge_getlink(skb, portid, seq, dev, +						    filter_mask) < 0) +				break; +			idx++; +		} +	} +	rcu_read_unlock(); +	cb->args[0] = idx; + +	return skb->len; +} + +static inline size_t bridge_nlmsg_size(void) +{ +	return NLMSG_ALIGN(sizeof(struct ifinfomsg)) +		+ nla_total_size(IFNAMSIZ)	/* IFLA_IFNAME */ +		+ nla_total_size(MAX_ADDR_LEN)	/* IFLA_ADDRESS */ +		+ nla_total_size(sizeof(u32))	/* IFLA_MASTER */ +		+ nla_total_size(sizeof(u32))	/* IFLA_MTU */ +		+ nla_total_size(sizeof(u32))	/* IFLA_LINK */ +		+ nla_total_size(sizeof(u32))	/* IFLA_OPERSTATE */ +		+ nla_total_size(sizeof(u8))	/* IFLA_PROTINFO */ +		+ nla_total_size(sizeof(struct nlattr))	/* IFLA_AF_SPEC */ +		+ nla_total_size(sizeof(u16))	/* IFLA_BRIDGE_FLAGS */ +		+ nla_total_size(sizeof(u16));	/* IFLA_BRIDGE_MODE */ +} + +static int rtnl_bridge_notify(struct net_device *dev, u16 flags) +{ +	struct net *net = dev_net(dev); +	struct net_device *br_dev = netdev_master_upper_dev_get(dev); +	struct sk_buff *skb; +	int err = -EOPNOTSUPP; + +	skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC); +	if (!skb) { +		err = -ENOMEM; +		goto errout; +	} + +	if ((!flags || (flags & BRIDGE_FLAGS_MASTER)) && +	    br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { +		err = br_dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0); +		if (err < 0) +			goto errout; +	} + +	if ((flags & BRIDGE_FLAGS_SELF) && +	    dev->netdev_ops->ndo_bridge_getlink) { +		err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0); +		if (err < 0) +			goto errout; +	} + +	rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); +	return 0; +errout: +	WARN_ON(err == -EMSGSIZE); +	kfree_skb(skb); +	rtnl_set_sk_err(net, RTNLGRP_LINK, err); +	return err; +} + +static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) +{ +	struct net *net = sock_net(skb->sk); +	struct ifinfomsg *ifm; +	struct net_device *dev; +	struct nlattr *br_spec, *attr = NULL; +	int rem, err = -EOPNOTSUPP; +	u16 oflags, flags = 0; +	bool have_flags = false; + +	if (nlmsg_len(nlh) < sizeof(*ifm)) +		return -EINVAL; + +	ifm = nlmsg_data(nlh); +	if (ifm->ifi_family != AF_BRIDGE) +		return -EPFNOSUPPORT; + +	dev = __dev_get_by_index(net, ifm->ifi_index); +	if (!dev) { +		pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n"); +		return -ENODEV; +	} + +	br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); +	if (br_spec) { +		nla_for_each_nested(attr, br_spec, rem) { +			if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { +				have_flags = true; +				flags = nla_get_u16(attr); +				break; +			} +		} +	} + +	oflags = flags; + +	if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { +		struct net_device *br_dev = netdev_master_upper_dev_get(dev); + +		if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) { +			err = -EOPNOTSUPP; +			goto out; +		} + +		err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh); +		if (err) +			goto out; + +		flags &= ~BRIDGE_FLAGS_MASTER; +	} + +	if ((flags & BRIDGE_FLAGS_SELF)) { +		if (!dev->netdev_ops->ndo_bridge_setlink) +			err = -EOPNOTSUPP; +		else +			err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh); + +		if (!err) +			flags &= ~BRIDGE_FLAGS_SELF; +	} + +	if (have_flags) +		memcpy(nla_data(attr), &flags, sizeof(flags)); +	/* Generate event to notify upper layer of bridge change */ +	if (!err) +		err = rtnl_bridge_notify(dev, oflags); +out: +	return err; +} + +static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) +{ +	struct net *net = sock_net(skb->sk); +	struct ifinfomsg *ifm; +	struct net_device *dev; +	
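
/*
 * Companion userspace sketch for rtnl_bridge_setlink() above: an AF_BRIDGE
 * RTM_SETLINK request whose IFLA_AF_SPEC nest carries IFLA_BRIDGE_FLAGS =
 * BRIDGE_FLAGS_SELF, i.e. "apply this to the port device itself rather than
 * to its bridge master". Packing is hand-rolled and minimal; the function
 * name is illustrative and 'fd' is assumed to be a NETLINK_ROUTE socket.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_bridge.h>

static int bridge_setlink_self(int fd, int port_ifindex)
{
	char buf[128];
	struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
	struct ifinfomsg *ifm;
	struct rtattr *afspec, *flags;
	__u16 val = BRIDGE_FLAGS_SELF;

	memset(buf, 0, sizeof(buf));
	nlh->nlmsg_len = NLMSG_LENGTH(sizeof(*ifm));
	nlh->nlmsg_type = RTM_SETLINK;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;

	ifm = NLMSG_DATA(nlh);
	ifm->ifi_family = AF_BRIDGE;
	ifm->ifi_index = port_ifindex;

	/* open the IFLA_AF_SPEC nest */
	afspec = (struct rtattr *)(buf + NLMSG_ALIGN(nlh->nlmsg_len));
	afspec->rta_type = IFLA_AF_SPEC;

	/* nested IFLA_BRIDGE_FLAGS, a u16 */
	flags = (struct rtattr *)((char *)afspec + RTA_LENGTH(0));
	flags->rta_type = IFLA_BRIDGE_FLAGS;
	flags->rta_len = RTA_LENGTH(sizeof(val));
	memcpy(RTA_DATA(flags), &val, sizeof(val));

	/* close the nest, then the message */
	afspec->rta_len = RTA_LENGTH(0) + RTA_ALIGN(flags->rta_len);
	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(afspec->rta_len);

	return send(fd, buf, nlh->nlmsg_len, 0) < 0 ? -1 : 0;
}
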
struct nlattr *br_spec, *attr = NULL; +	int rem, err = -EOPNOTSUPP; +	u16 oflags, flags = 0; +	bool have_flags = false; + +	if (nlmsg_len(nlh) < sizeof(*ifm)) +		return -EINVAL; + +	ifm = nlmsg_data(nlh); +	if (ifm->ifi_family != AF_BRIDGE) +		return -EPFNOSUPPORT; + +	dev = __dev_get_by_index(net, ifm->ifi_index); +	if (!dev) { +		pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n"); +		return -ENODEV; +	} + +	br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); +	if (br_spec) { +		nla_for_each_nested(attr, br_spec, rem) { +			if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { +				have_flags = true; +				flags = nla_get_u16(attr); +				break; +			} +		} +	} + +	oflags = flags; + +	if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { +		struct net_device *br_dev = netdev_master_upper_dev_get(dev); + +		if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) { +			err = -EOPNOTSUPP; +			goto out; +		} + +		err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh); +		if (err) +			goto out; + +		flags &= ~BRIDGE_FLAGS_MASTER; +	} + +	if ((flags & BRIDGE_FLAGS_SELF)) { +		if (!dev->netdev_ops->ndo_bridge_dellink) +			err = -EOPNOTSUPP; +		else +			err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh); + +		if (!err) +			flags &= ~BRIDGE_FLAGS_SELF; +	} + +	if (have_flags) +		memcpy(nla_data(attr), &flags, sizeof(flags)); +	/* Generate event to notify upper layer of bridge change */ +	if (!err) +		err = rtnl_bridge_notify(dev, oflags); +out: +	return err; +}  /* Process one rtnetlink message. */ @@ -1798,7 +2844,6 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  	struct net *net = sock_net(skb->sk);  	rtnl_doit_func doit;  	int sz_idx, kind; -	int min_len;  	int family;  	int type;  	int err; @@ -1810,57 +2855,47 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  	type -= RTM_BASE;  	/* All the messages must have at least 1 byte length */ -	if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) +	if (nlmsg_len(nlh) < sizeof(struct rtgenmsg))  		return 0; -	family = ((struct rtgenmsg *)NLMSG_DATA(nlh))->rtgen_family; +	family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;  	sz_idx = type>>2;  	kind = type&3; -	if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN)) +	if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))  		return -EPERM;  	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {  		struct sock *rtnl;  		rtnl_dumpit_func dumpit; +		rtnl_calcit_func calcit; +		u16 min_dump_alloc = 0;  		dumpit = rtnl_get_dumpit(family, type);  		if (dumpit == NULL)  			return -EOPNOTSUPP; +		calcit = rtnl_get_calcit(family, type); +		if (calcit) +			min_dump_alloc = calcit(skb, nlh);  		__rtnl_unlock();  		rtnl = net->rtnl; -		err = netlink_dump_start(rtnl, skb, nlh, dumpit, NULL); +		{ +			struct netlink_dump_control c = { +				.dump		= dumpit, +				.min_dump_alloc	= min_dump_alloc, +			}; +			err = netlink_dump_start(rtnl, skb, nlh, &c); +		}  		rtnl_lock();  		return err;  	} -	memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *))); - -	min_len = rtm_min[sz_idx]; -	if (nlh->nlmsg_len < min_len) -		return -EINVAL; - -	if (nlh->nlmsg_len > min_len) { -		int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); -		struct rtattr *attr = (void *)nlh + NLMSG_ALIGN(min_len); - -		while (RTA_OK(attr, attrlen)) { -			unsigned flavor = attr->rta_type; -			if (flavor) { -				if (flavor > rta_max[sz_idx]) -					return -EINVAL; -				rta_buf[flavor-1] = attr; -			} -			attr = RTA_NEXT(attr, attrlen); -		} -	} -  	doit = 
rtnl_get_doit(family, type);  	if (doit == NULL)  		return -EOPNOTSUPP; -	return doit(skb, nlh, (void *)&rta_buf[0]); +	return doit(skb, nlh);  }  static void rtnetlink_rcv(struct sk_buff *skb) @@ -1872,7 +2907,7 @@ static void rtnetlink_rcv(struct sk_buff *skb)  static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)  { -	struct net_device *dev = ptr; +	struct net_device *dev = netdev_notifier_info_to_dev(ptr);  	switch (event) {  	case NETDEV_UP: @@ -1884,10 +2919,12 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi  	case NETDEV_PRE_TYPE_CHANGE:  	case NETDEV_GOING_DOWN:  	case NETDEV_UNREGISTER: -	case NETDEV_UNREGISTER_BATCH: +	case NETDEV_UNREGISTER_FINAL: +	case NETDEV_RELEASE: +	case NETDEV_JOIN:  		break;  	default: -		rtmsg_ifinfo(RTM_NEWLINK, dev, 0); +		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);  		break;  	}  	return NOTIFY_DONE; @@ -1901,8 +2938,14 @@ static struct notifier_block rtnetlink_dev_notifier = {  static int __net_init rtnetlink_net_init(struct net *net)  {  	struct sock *sk; -	sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX, -				   rtnetlink_rcv, &rtnl_mutex, THIS_MODULE); +	struct netlink_kernel_cfg cfg = { +		.groups		= RTNLGRP_MAX, +		.input		= rtnetlink_rcv, +		.cb_mutex	= &rtnl_mutex, +		.flags		= NL_CFG_F_NONROOT_RECV, +	}; + +	sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg);  	if (!sk)  		return -ENOMEM;  	net->rtnl = sk; @@ -1922,28 +2965,26 @@ static struct pernet_operations rtnetlink_net_ops = {  void __init rtnetlink_init(void)  { -	int i; - -	rtattr_max = 0; -	for (i = 0; i < ARRAY_SIZE(rta_max); i++) -		if (rta_max[i] > rtattr_max) -			rtattr_max = rta_max[i]; -	rta_buf = kmalloc(rtattr_max * sizeof(struct rtattr *), GFP_KERNEL); -	if (!rta_buf) -		panic("rtnetlink_init: cannot allocate rta_buf\n"); -  	if (register_pernet_subsys(&rtnetlink_net_ops))  		panic("rtnetlink_init: cannot initialize rtnetlink\n"); -	netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV);  	register_netdevice_notifier(&rtnetlink_dev_notifier); -	rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, rtnl_dump_ifinfo); -	rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL); -	rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL); -	rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL); +	rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, +		      rtnl_dump_ifinfo, rtnl_calcit); +	rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, NULL); +	rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, NULL); +	rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, NULL); + +	rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, NULL); +	rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, NULL); + +	rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL); +	rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL); +	rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL); -	rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all); -	rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all); +	rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL); +	rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL); +	rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL);  } diff --git a/net/core/scm.c b/net/core/scm.c index bbe45445080..b442e7e25e6 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -24,11 +24,11 @@  #include <linux/interrupt.h>  #include <linux/netdevice.h>  #include 
<linux/security.h> +#include <linux/pid_namespace.h>  #include <linux/pid.h>  #include <linux/nsproxy.h>  #include <linux/slab.h> -#include <asm/system.h>  #include <asm/uaccess.h>  #include <net/protocol.h> @@ -36,6 +36,7 @@  #include <net/sock.h>  #include <net/compat.h>  #include <net/scm.h> +#include <net/cls_cgroup.h>  /* @@ -46,12 +47,18 @@  static __inline__ int scm_check_creds(struct ucred *creds)  {  	const struct cred *cred = current_cred(); +	kuid_t uid = make_kuid(cred->user_ns, creds->uid); +	kgid_t gid = make_kgid(cred->user_ns, creds->gid); -	if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) && -	    ((creds->uid == cred->uid   || creds->uid == cred->euid || -	      creds->uid == cred->suid) || capable(CAP_SETUID)) && -	    ((creds->gid == cred->gid   || creds->gid == cred->egid || -	      creds->gid == cred->sgid) || capable(CAP_SETGID))) { +	if (!uid_valid(uid) || !gid_valid(gid)) +		return -EINVAL; + +	if ((creds->pid == task_tgid_vnr(current) || +	     ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) && +	    ((uid_eq(uid, cred->uid)   || uid_eq(uid, cred->euid) || +	      uid_eq(uid, cred->suid)) || ns_capable(cred->user_ns, CAP_SETUID)) && +	    ((gid_eq(gid, cred->gid)   || gid_eq(gid, cred->egid) || +	      gid_eq(gid, cred->sgid)) || ns_capable(cred->user_ns, CAP_SETGID))) {  	       return 0;  	}  	return -EPERM; @@ -95,7 +102,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)  		int fd = fdp[i];  		struct file *file; -		if (fd < 0 || !(file = fget(fd))) +		if (fd < 0 || !(file = fget_raw(fd)))  			return -EBADF;  		*fpp++ = file;  		fpl->count++; @@ -110,25 +117,9 @@ void __scm_destroy(struct scm_cookie *scm)  	if (fpl) {  		scm->fp = NULL; -		if (current->scm_work_list) { -			list_add_tail(&fpl->list, current->scm_work_list); -		} else { -			LIST_HEAD(work_list); - -			current->scm_work_list = &work_list; - -			list_add(&fpl->list, &work_list); -			while (!list_empty(&work_list)) { -				fpl = list_first_entry(&work_list, struct scm_fp_list, list); - -				list_del(&fpl->list); -				for (i=fpl->count-1; i>=0; i--) -					fput(fpl->fp[i]); -				kfree(fpl); -			} - -			current->scm_work_list = NULL; -		} +		for (i=fpl->count-1; i>=0; i--) +			fput(fpl->fp[i]); +		kfree(fpl);  	}  }  EXPORT_SYMBOL(__scm_destroy); @@ -166,37 +157,38 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)  				goto error;  			break;  		case SCM_CREDENTIALS: +		{ +			struct ucred creds; +			kuid_t uid; +			kgid_t gid;  			if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred)))  				goto error; -			memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred)); -			err = scm_check_creds(&p->creds); +			memcpy(&creds, CMSG_DATA(cmsg), sizeof(struct ucred)); +			err = scm_check_creds(&creds);  			if (err)  				goto error; -			if (pid_vnr(p->pid) != p->creds.pid) { +			p->creds.pid = creds.pid; +			if (!p->pid || pid_vnr(p->pid) != creds.pid) {  				struct pid *pid;  				err = -ESRCH; -				pid = find_get_pid(p->creds.pid); +				pid = find_get_pid(creds.pid);  				if (!pid)  					goto error;  				put_pid(p->pid);  				p->pid = pid;  			} -			if ((p->cred->euid != p->creds.uid) || -				(p->cred->egid != p->creds.gid)) { -				struct cred *cred; -				err = -ENOMEM; -				cred = prepare_creds(); -				if (!cred) -					goto error; +			err = -EINVAL; +			uid = make_kuid(current_user_ns(), creds.uid); +			gid = make_kgid(current_user_ns(), creds.gid); +			if (!uid_valid(uid) || !gid_valid(gid)) +				goto error; -				cred->uid 
= cred->euid = p->creds.uid; -				cred->gid = cred->egid = p->creds.uid; -				put_cred(p->cred); -				p->cred = cred; -			} +			p->creds.uid = uid; +			p->creds.gid = gid;  			break; +		}  		default:  			goto error;  		} @@ -280,6 +272,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)  	for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;  	     i++, cmfptr++)  	{ +		struct socket *sock;  		int new_fd;  		err = security_file_receive(fp[i]);  		if (err) @@ -295,8 +288,12 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)  			break;  		}  		/* Bump the usage count and install the file. */ -		get_file(fp[i]); -		fd_install(new_fd, fp[i]); +		sock = sock_from_file(fp[i], &err); +		if (sock) { +			sock_update_netprioidx(sock->sk); +			sock_update_classid(sock->sk); +		} +		fd_install(new_fd, get_file(fp[i]));  	}  	if (i > 0) diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c new file mode 100644 index 00000000000..ba71212f025 --- /dev/null +++ b/net/core/secure_seq.c @@ -0,0 +1,173 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/cryptohash.h> +#include <linux/module.h> +#include <linux/cache.h> +#include <linux/random.h> +#include <linux/hrtimer.h> +#include <linux/ktime.h> +#include <linux/string.h> +#include <linux/net.h> + +#include <net/secure_seq.h> + +#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET) +#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4) + +static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; + +static __always_inline void net_secret_init(void) +{ +	net_get_random_once(net_secret, sizeof(net_secret)); +} +#endif + +#ifdef CONFIG_INET +static u32 seq_scale(u32 seq) +{ +	/* +	 *	As close as possible to RFC 793, which +	 *	suggests using a 250 kHz clock. +	 *	Further reading shows this assumes 2 Mb/s networks. +	 *	For 10 Mb/s Ethernet, a 1 MHz clock is appropriate. +	 *	For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but +	 *	we also need to limit the resolution so that the u32 seq +	 *	overlaps less than one time per MSL (2 minutes). +	 *	Choosing a clock of 64 ns period is OK. 
(period of 274 s) +	 */ +	return seq + (ktime_to_ns(ktime_get_real()) >> 6); +} +#endif + +#if IS_ENABLED(CONFIG_IPV6) +__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, +				   __be16 sport, __be16 dport) +{ +	u32 secret[MD5_MESSAGE_BYTES / 4]; +	u32 hash[MD5_DIGEST_WORDS]; +	u32 i; + +	net_secret_init(); +	memcpy(hash, saddr, 16); +	for (i = 0; i < 4; i++) +		secret[i] = net_secret[i] + (__force u32)daddr[i]; +	secret[4] = net_secret[4] + +		(((__force u16)sport << 16) + (__force u16)dport); +	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) +		secret[i] = net_secret[i]; + +	md5_transform(hash, secret); + +	return seq_scale(hash[0]); +} +EXPORT_SYMBOL(secure_tcpv6_sequence_number); + +u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, +			       __be16 dport) +{ +	u32 secret[MD5_MESSAGE_BYTES / 4]; +	u32 hash[MD5_DIGEST_WORDS]; +	u32 i; + +	net_secret_init(); +	memcpy(hash, saddr, 16); +	for (i = 0; i < 4; i++) +		secret[i] = net_secret[i] + (__force u32) daddr[i]; +	secret[4] = net_secret[4] + (__force u32)dport; +	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) +		secret[i] = net_secret[i]; + +	md5_transform(hash, secret); + +	return hash[0]; +} +EXPORT_SYMBOL(secure_ipv6_port_ephemeral); +#endif + +#ifdef CONFIG_INET + +__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, +				 __be16 sport, __be16 dport) +{ +	u32 hash[MD5_DIGEST_WORDS]; + +	net_secret_init(); +	hash[0] = (__force u32)saddr; +	hash[1] = (__force u32)daddr; +	hash[2] = ((__force u16)sport << 16) + (__force u16)dport; +	hash[3] = net_secret[15]; + +	md5_transform(hash, net_secret); + +	return seq_scale(hash[0]); +} + +u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) +{ +	u32 hash[MD5_DIGEST_WORDS]; + +	net_secret_init(); +	hash[0] = (__force u32)saddr; +	hash[1] = (__force u32)daddr; +	hash[2] = (__force u32)dport ^ net_secret[14]; +	hash[3] = net_secret[15]; + +	md5_transform(hash, net_secret); + +	return hash[0]; +} +EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); +#endif + +#if IS_ENABLED(CONFIG_IP_DCCP) +u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, +				__be16 sport, __be16 dport) +{ +	u32 hash[MD5_DIGEST_WORDS]; +	u64 seq; + +	net_secret_init(); +	hash[0] = (__force u32)saddr; +	hash[1] = (__force u32)daddr; +	hash[2] = ((__force u16)sport << 16) + (__force u16)dport; +	hash[3] = net_secret[15]; + +	md5_transform(hash, net_secret); + +	seq = hash[0] | (((u64)hash[1]) << 32); +	seq += ktime_to_ns(ktime_get_real()); +	seq &= (1ull << 48) - 1; + +	return seq; +} +EXPORT_SYMBOL(secure_dccp_sequence_number); + +#if IS_ENABLED(CONFIG_IPV6) +u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, +				  __be16 sport, __be16 dport) +{ +	u32 secret[MD5_MESSAGE_BYTES / 4]; +	u32 hash[MD5_DIGEST_WORDS]; +	u64 seq; +	u32 i; + +	net_secret_init(); +	memcpy(hash, saddr, 16); +	for (i = 0; i < 4; i++) +		secret[i] = net_secret[i] + daddr[i]; +	secret[4] = net_secret[4] + +		(((__force u16)sport << 16) + (__force u16)dport); +	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) +		secret[i] = net_secret[i]; + +	md5_transform(hash, secret); + +	seq = hash[0] | (((u64)hash[1]) << 32); +	seq += ktime_to_ns(ktime_get_real()); +	seq &= (1ull << 48) - 1; + +	return seq; +} +EXPORT_SYMBOL(secure_dccpv6_sequence_number); +#endif +#endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8814a9a52f4..c1a33033cbe 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -36,6 +36,8 @@   *	The functions in this file will not compile correctly with gcc 
2.4.x   */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/module.h>  #include <linux/types.h>  #include <linux/kernel.h> @@ -45,6 +47,8 @@  #include <linux/in.h>  #include <linux/inet.h>  #include <linux/slab.h> +#include <linux/tcp.h> +#include <linux/udp.h>  #include <linux/netdevice.h>  #ifdef CONFIG_NET_CLS_ACT  #include <net/pkt_sched.h> @@ -57,93 +61,89 @@  #include <linux/init.h>  #include <linux/scatterlist.h>  #include <linux/errqueue.h> +#include <linux/prefetch.h>  #include <net/protocol.h>  #include <net/dst.h>  #include <net/sock.h>  #include <net/checksum.h> +#include <net/ip6_checksum.h>  #include <net/xfrm.h>  #include <asm/uaccess.h> -#include <asm/system.h>  #include <trace/events/skb.h> +#include <linux/highmem.h> -#include "kmap_skb.h" - -static struct kmem_cache *skbuff_head_cache __read_mostly; +struct kmem_cache *skbuff_head_cache __read_mostly;  static struct kmem_cache *skbuff_fclone_cache __read_mostly; -static void sock_pipe_buf_release(struct pipe_inode_info *pipe, -				  struct pipe_buffer *buf) +/** + *	skb_panic - private function for out-of-line support + *	@skb:	buffer + *	@sz:	size + *	@addr:	address + *	@msg:	skb_over_panic or skb_under_panic + * + *	Out-of-line support for skb_put() and skb_push(). + *	Called via the wrapper skb_over_panic() or skb_under_panic(). + *	Keep out of line to prevent kernel bloat. + *	__builtin_return_address is not used because it is not always reliable. + */ +static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, +		      const char msg[])  { -	put_page(buf->page); +	pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n", +		 msg, addr, skb->len, sz, skb->head, skb->data, +		 (unsigned long)skb->tail, (unsigned long)skb->end, +		 skb->dev ? skb->dev->name : "<NULL>"); +	BUG();  } -static void sock_pipe_buf_get(struct pipe_inode_info *pipe, -				struct pipe_buffer *buf) +static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)  { -	get_page(buf->page); +	skb_panic(skb, sz, addr, __func__);  } -static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, -			       struct pipe_buffer *buf) +static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)  { -	return 1; +	skb_panic(skb, sz, addr, __func__);  } - -/* Pipe buffer operations for a socket. */ -static const struct pipe_buf_operations sock_pipe_buf_ops = { -	.can_merge = 0, -	.map = generic_pipe_buf_map, -	.unmap = generic_pipe_buf_unmap, -	.confirm = generic_pipe_buf_confirm, -	.release = sock_pipe_buf_release, -	.steal = sock_pipe_buf_steal, -	.get = sock_pipe_buf_get, -}; -  /* - *	Keep out-of-line to prevent kernel bloat. - *	__builtin_return_address is not used because it is not always - *	reliable. + * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells + * the caller if emergency pfmemalloc reserves are being used. If it is and + * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves + * may be used. Otherwise, the packet data may be discarded until enough + * memory is free   */ +#define kmalloc_reserve(size, gfp, node, pfmemalloc) \ +	 __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc) -/** - *	skb_over_panic	- 	private function - *	@skb: buffer - *	@sz: size - *	@here: address - * - *	Out of line support code for skb_put(). Not user callable. 
- */ -static void skb_over_panic(struct sk_buff *skb, int sz, void *here) +static void *__kmalloc_reserve(size_t size, gfp_t flags, int node, +			       unsigned long ip, bool *pfmemalloc)  { -	printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p " -			  "data:%p tail:%#lx end:%#lx dev:%s\n", -	       here, skb->len, sz, skb->head, skb->data, -	       (unsigned long)skb->tail, (unsigned long)skb->end, -	       skb->dev ? skb->dev->name : "<NULL>"); -	BUG(); -} +	void *obj; +	bool ret_pfmemalloc = false; -/** - *	skb_under_panic	- 	private function - *	@skb: buffer - *	@sz: size - *	@here: address - * - *	Out of line support code for skb_push(). Not user callable. - */ +	/* +	 * Try a regular allocation, when that fails and we're not entitled +	 * to the reserves, fail. +	 */ +	obj = kmalloc_node_track_caller(size, +					flags | __GFP_NOMEMALLOC | __GFP_NOWARN, +					node); +	if (obj || !(gfp_pfmemalloc_allowed(flags))) +		goto out; -static void skb_under_panic(struct sk_buff *skb, int sz, void *here) -{ -	printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p " -			  "data:%p tail:%#lx end:%#lx dev:%s\n", -	       here, skb->len, sz, skb->head, skb->data, -	       (unsigned long)skb->tail, (unsigned long)skb->end, -	       skb->dev ? skb->dev->name : "<NULL>"); -	BUG(); +	/* Try again but now we are using pfmemalloc reserves */ +	ret_pfmemalloc = true; +	obj = kmalloc_node_track_caller(size, flags, node); + +out: +	if (pfmemalloc) +		*pfmemalloc = ret_pfmemalloc; + +	return obj;  }  /* 	Allocate a new skbuff. We do this ourselves so we can fill in a few @@ -152,30 +152,62 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)   *   */ +struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) +{ +	struct sk_buff *skb; + +	/* Get the HEAD */ +	skb = kmem_cache_alloc_node(skbuff_head_cache, +				    gfp_mask & ~__GFP_DMA, node); +	if (!skb) +		goto out; + +	/* +	 * Only clear those fields we need to clear, not those that we will +	 * actually initialise below. Hence, don't put any more fields after +	 * the tail pointer in struct sk_buff! +	 */ +	memset(skb, 0, offsetof(struct sk_buff, tail)); +	skb->head = NULL; +	skb->truesize = sizeof(struct sk_buff); +	atomic_set(&skb->users, 1); + +	skb->mac_header = (typeof(skb->mac_header))~0U; +out: +	return skb; +} +  /**   *	__alloc_skb	-	allocate a network buffer   *	@size: size to allocate   *	@gfp_mask: allocation mask - *	@fclone: allocate from fclone cache instead of head cache - *		and allocate a cloned (child) skb + *	@flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache + *		instead of head cache and allocate a cloned (child) skb. + *		If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for + *		allocations in case the data is required for writeback   *	@node: numa node to allocate memory on   *   *	Allocate a new &sk_buff. The returned buffer has no headroom and a - *	tail room of size bytes. The object has a reference count of one. - *	The return is the buffer. On a failure the return is %NULL. + *	tail room of at least size bytes. The object has a reference count + *	of one. The return is the buffer. On a failure the return is %NULL.   *   *	Buffers may only be allocated from interrupts using a @gfp_mask of   *	%GFP_ATOMIC.   
*/  struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, -			    int fclone, int node) +			    int flags, int node)  {  	struct kmem_cache *cache;  	struct skb_shared_info *shinfo;  	struct sk_buff *skb;  	u8 *data; +	bool pfmemalloc; + +	cache = (flags & SKB_ALLOC_FCLONE) +		? skbuff_fclone_cache : skbuff_head_cache; -	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache; +	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) +		gfp_mask |= __GFP_MEMALLOC;  	/* Get the HEAD */  	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); @@ -183,11 +215,21 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,  		goto out;  	prefetchw(skb); +	/* We do our best to align skb_shared_info on a separate cache +	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives +	 * aligned memory blocks, unless SLUB/SLAB debug is enabled. +	 * Both skb->head and skb_shared_info are cache line aligned. +	 */  	size = SKB_DATA_ALIGN(size); -	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info), -			gfp_mask, node); +	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); +	data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);  	if (!data)  		goto nodata; +	/* kmalloc(size) might give us more room than requested. +	 * Put skb_shared_info exactly at the end of allocated zone, +	 * to allow max possible filling before reallocation. +	 */ +	size = SKB_WITH_OVERHEAD(ksize(data));  	prefetchw(data + size);  	/* @@ -196,22 +238,24 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,  	 * the tail pointer in struct sk_buff!  	 */  	memset(skb, 0, offsetof(struct sk_buff, tail)); -	skb->truesize = size + sizeof(struct sk_buff); +	/* Account for allocated memory : skb + skb->head */ +	skb->truesize = SKB_TRUESIZE(size); +	skb->pfmemalloc = pfmemalloc;  	atomic_set(&skb->users, 1);  	skb->head = data;  	skb->data = data;  	skb_reset_tail_pointer(skb);  	skb->end = skb->tail + size; -#ifdef NET_SKBUFF_DATA_USES_OFFSET -	skb->mac_header = ~0U; -#endif +	skb->mac_header = (typeof(skb->mac_header))~0U; +	skb->transport_header = (typeof(skb->transport_header))~0U;  	/* make sure we initialize shinfo sequentially */  	shinfo = skb_shinfo(skb);  	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));  	atomic_set(&shinfo->dataref, 1); +	kmemcheck_annotate_variable(shinfo->destructor_arg); -	if (fclone) { +	if (flags & SKB_ALLOC_FCLONE) {  		struct sk_buff *child = skb + 1;  		atomic_t *fclone_ref = (atomic_t *) (child + 1); @@ -221,6 +265,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,  		atomic_set(fclone_ref, 1);  		child->fclone = SKB_FCLONE_UNAVAILABLE; +		child->pfmemalloc = pfmemalloc;  	}  out:  	return skb; @@ -232,6 +277,124 @@ nodata:  EXPORT_SYMBOL(__alloc_skb);  /** + * build_skb - build a network buffer + * @data: data buffer provided by caller + * @frag_size: size of fragment, or 0 if head was kmalloced + * + * Allocate a new &sk_buff. Caller provides space holding head and + * skb_shared_info. @data must have been allocated by kmalloc() only if + * @frag_size is 0, otherwise data should come from the page allocator. + * The return is the new skb buffer. + * On a failure the return is %NULL, and @data is not freed. 
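[Editor's note: a hedged sketch of the buffer layout that a build_skb()-style caller is expected to hand over, per the notes that follow: one flat allocation with headroom reserved at the front and aligned space for a shared-info block at the very end, the usable payload being whatever is left in between. The constants and struct here are simplified stand-ins for NET_SKB_PAD, SKB_DATA_ALIGN() and struct skb_shared_info.]

#include <stdint.h>
#include <stddef.h>

#define CACHE_ALIGN(x)  (((x) + 63) & ~((size_t)63))   /* stand-in for SKB_DATA_ALIGN */
#define HEADROOM        64                             /* stand-in for NET_SKB_PAD    */

struct shared_info { uint32_t nr_frags; uint32_t dataref; };

struct rx_layout {
    unsigned char      *payload;      /* where the NIC may write the frame */
    size_t              payload_max;
    struct shared_info *shinfo;       /* lives at the tail of the buffer   */
};

static int carve_rx_buffer(unsigned char *buf, size_t bufsz, struct rx_layout *l)
{
    size_t tail = CACHE_ALIGN(sizeof(struct shared_info));

    if (bufsz < HEADROOM + tail)
        return -1;
    l->payload     = buf + HEADROOM;
    l->payload_max = bufsz - HEADROOM - tail;
    l->shinfo      = (struct shared_info *)(buf + bufsz - tail);
    return 0;
}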
+ * Notes : + *  Before IO, driver allocates only data buffer where NIC put incoming frame + *  Driver should add room at head (NET_SKB_PAD) and + *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) + *  After IO, driver calls build_skb(), to allocate sk_buff and populate it + *  before giving packet to stack. + *  RX rings only contains data buffers, not full skbs. + */ +struct sk_buff *build_skb(void *data, unsigned int frag_size) +{ +	struct skb_shared_info *shinfo; +	struct sk_buff *skb; +	unsigned int size = frag_size ? : ksize(data); + +	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); +	if (!skb) +		return NULL; + +	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + +	memset(skb, 0, offsetof(struct sk_buff, tail)); +	skb->truesize = SKB_TRUESIZE(size); +	skb->head_frag = frag_size != 0; +	atomic_set(&skb->users, 1); +	skb->head = data; +	skb->data = data; +	skb_reset_tail_pointer(skb); +	skb->end = skb->tail + size; +	skb->mac_header = (typeof(skb->mac_header))~0U; +	skb->transport_header = (typeof(skb->transport_header))~0U; + +	/* make sure we initialize shinfo sequentially */ +	shinfo = skb_shinfo(skb); +	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); +	atomic_set(&shinfo->dataref, 1); +	kmemcheck_annotate_variable(shinfo->destructor_arg); + +	return skb; +} +EXPORT_SYMBOL(build_skb); + +struct netdev_alloc_cache { +	struct page_frag	frag; +	/* we maintain a pagecount bias, so that we dont dirty cache line +	 * containing page->_count every time we allocate a fragment. +	 */ +	unsigned int		pagecnt_bias; +}; +static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); + +static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) +{ +	struct netdev_alloc_cache *nc; +	void *data = NULL; +	int order; +	unsigned long flags; + +	local_irq_save(flags); +	nc = &__get_cpu_var(netdev_alloc_cache); +	if (unlikely(!nc->frag.page)) { +refill: +		for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) { +			gfp_t gfp = gfp_mask; + +			if (order) +				gfp |= __GFP_COMP | __GFP_NOWARN; +			nc->frag.page = alloc_pages(gfp, order); +			if (likely(nc->frag.page)) +				break; +			if (--order < 0) +				goto end; +		} +		nc->frag.size = PAGE_SIZE << order; +recycle: +		atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS); +		nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS; +		nc->frag.offset = 0; +	} + +	if (nc->frag.offset + fragsz > nc->frag.size) { +		/* avoid unnecessary locked operations if possible */ +		if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) || +		    atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count)) +			goto recycle; +		goto refill; +	} + +	data = page_address(nc->frag.page) + nc->frag.offset; +	nc->frag.offset += fragsz; +	nc->pagecnt_bias--; +end: +	local_irq_restore(flags); +	return data; +} + +/** + * netdev_alloc_frag - allocate a page fragment + * @fragsz: fragment size + * + * Allocates a frag from a page for receive buffer. + * Uses GFP_ATOMIC allocations. + */ +void *netdev_alloc_frag(unsigned int fragsz) +{ +	return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); +} +EXPORT_SYMBOL(netdev_alloc_frag); + +/**   *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device   *	@dev: network device to receive on   *	@length: length to allocate @@ -245,11 +408,29 @@ EXPORT_SYMBOL(__alloc_skb);   *	%NULL is returned if there is no free memory.   
*/  struct sk_buff *__netdev_alloc_skb(struct net_device *dev, -		unsigned int length, gfp_t gfp_mask) +				   unsigned int length, gfp_t gfp_mask)  { -	struct sk_buff *skb; +	struct sk_buff *skb = NULL; +	unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) + +			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); -	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE); +	if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { +		void *data; + +		if (sk_memalloc_socks()) +			gfp_mask |= __GFP_MEMALLOC; + +		data = __netdev_alloc_frag(fragsz, gfp_mask); + +		if (likely(data)) { +			skb = build_skb(data, fragsz); +			if (unlikely(!skb)) +				put_page(virt_to_head_page(data)); +		} +	} else { +		skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, +				  SKB_ALLOC_RX, NUMA_NO_NODE); +	}  	if (likely(skb)) {  		skb_reserve(skb, NET_SKB_PAD);  		skb->dev = dev; @@ -259,48 +440,31 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,  EXPORT_SYMBOL(__netdev_alloc_skb);  void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, -		int size) +		     int size, unsigned int truesize)  {  	skb_fill_page_desc(skb, i, page, off, size);  	skb->len += size;  	skb->data_len += size; -	skb->truesize += size; +	skb->truesize += truesize;  }  EXPORT_SYMBOL(skb_add_rx_frag); -/** - *	dev_alloc_skb - allocate an skbuff for receiving - *	@length: length to allocate - * - *	Allocate a new &sk_buff and assign it a usage count of one. The - *	buffer has unspecified headroom built in. Users should allocate - *	the headroom they think they need without accounting for the - *	built in space. The built in space is used for optimisations. - * - *	%NULL is returned if there is no free memory. Although this function - *	allocates memory it can be called from an interrupt. 
- */ -struct sk_buff *dev_alloc_skb(unsigned int length) +void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, +			  unsigned int truesize)  { -	/* -	 * There is more code here than it seems: -	 * __dev_alloc_skb is an inline -	 */ -	return __dev_alloc_skb(length, GFP_ATOMIC); +	skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + +	skb_frag_size_add(frag, size); +	skb->len += size; +	skb->data_len += size; +	skb->truesize += truesize;  } -EXPORT_SYMBOL(dev_alloc_skb); +EXPORT_SYMBOL(skb_coalesce_rx_frag);  static void skb_drop_list(struct sk_buff **listp)  { -	struct sk_buff *list = *listp; - +	kfree_skb_list(*listp);  	*listp = NULL; - -	do { -		struct sk_buff *this = list; -		list = list->next; -		kfree_skb(this); -	} while (list);  }  static inline void skb_drop_fraglist(struct sk_buff *skb) @@ -316,6 +480,14 @@ static void skb_clone_fraglist(struct sk_buff *skb)  		skb_get(list);  } +static void skb_free_head(struct sk_buff *skb) +{ +	if (skb->head_frag) +		put_page(virt_to_head_page(skb->head)); +	else +		kfree(skb->head); +} +  static void skb_release_data(struct sk_buff *skb)  {  	if (!skb->cloned || @@ -324,13 +496,25 @@ static void skb_release_data(struct sk_buff *skb)  		if (skb_shinfo(skb)->nr_frags) {  			int i;  			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) -				put_page(skb_shinfo(skb)->frags[i].page); +				skb_frag_unref(skb, i); +		} + +		/* +		 * If skb buf is from userspace, we need to notify the caller +		 * the lower device DMA has done; +		 */ +		if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { +			struct ubuf_info *uarg; + +			uarg = skb_shinfo(skb)->destructor_arg; +			if (uarg->callback) +				uarg->callback(uarg, true);  		}  		if (skb_has_frag_list(skb))  			skb_drop_fraglist(skb); -		kfree(skb->head); +		skb_free_head(skb);  	}  } @@ -378,9 +562,8 @@ static void skb_release_head_state(struct sk_buff *skb)  		WARN_ON(in_irq());  		skb->destructor(skb);  	} -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK)  	nf_conntrack_put(skb->nfct); -	nf_conntrack_put_reasm(skb->nfct_reasm);  #endif  #ifdef CONFIG_BRIDGE_NETFILTER  	nf_bridge_put(skb->nf_bridge); @@ -398,7 +581,8 @@ static void skb_release_head_state(struct sk_buff *skb)  static void skb_release_all(struct sk_buff *skb)  {  	skb_release_head_state(skb); -	skb_release_data(skb); +	if (likely(skb->head)) +		skb_release_data(skb);  }  /** @@ -437,6 +621,37 @@ void kfree_skb(struct sk_buff *skb)  }  EXPORT_SYMBOL(kfree_skb); +void kfree_skb_list(struct sk_buff *segs) +{ +	while (segs) { +		struct sk_buff *next = segs->next; + +		kfree_skb(segs); +		segs = next; +	} +} +EXPORT_SYMBOL(kfree_skb_list); + +/** + *	skb_tx_error - report an sk_buff xmit error + *	@skb: buffer that triggered an error + * + *	Report xmit error if a device callback is tracking this skb. + *	skb must be freed afterwards. 
+ */ +void skb_tx_error(struct sk_buff *skb) +{ +	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { +		struct ubuf_info *uarg; + +		uarg = skb_shinfo(skb)->destructor_arg; +		if (uarg->callback) +			uarg->callback(uarg, false); +		skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; +	} +} +EXPORT_SYMBOL(skb_tx_error); +  /**   *	consume_skb - free an skbuff   *	@skb: buffer to free @@ -458,49 +673,6 @@ void consume_skb(struct sk_buff *skb)  }  EXPORT_SYMBOL(consume_skb); -/** - *	skb_recycle_check - check if skb can be reused for receive - *	@skb: buffer - *	@skb_size: minimum receive buffer size - * - *	Checks that the skb passed in is not shared or cloned, and - *	that it is linear and its head portion at least as large as - *	skb_size so that it can be recycled as a receive buffer. - *	If these conditions are met, this function does any necessary - *	reference count dropping and cleans up the skbuff as if it - *	just came from __alloc_skb(). - */ -bool skb_recycle_check(struct sk_buff *skb, int skb_size) -{ -	struct skb_shared_info *shinfo; - -	if (irqs_disabled()) -		return false; - -	if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE) -		return false; - -	skb_size = SKB_DATA_ALIGN(skb_size + NET_SKB_PAD); -	if (skb_end_pointer(skb) - skb->head < skb_size) -		return false; - -	if (skb_shared(skb) || skb_cloned(skb)) -		return false; - -	skb_release_head_state(skb); - -	shinfo = skb_shinfo(skb); -	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); -	atomic_set(&shinfo->dataref, 1); - -	memset(skb, 0, offsetof(struct sk_buff, tail)); -	skb->data = skb->head + NET_SKB_PAD; -	skb_reset_tail_pointer(skb); - -	return true; -} -EXPORT_SYMBOL(skb_recycle_check); -  static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)  {  	new->tstamp		= old->tstamp; @@ -508,39 +680,50 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)  	new->transport_header	= old->transport_header;  	new->network_header	= old->network_header;  	new->mac_header		= old->mac_header; +	new->inner_protocol	= old->inner_protocol; +	new->inner_transport_header = old->inner_transport_header; +	new->inner_network_header = old->inner_network_header; +	new->inner_mac_header = old->inner_mac_header;  	skb_dst_copy(new, old); -	new->rxhash		= old->rxhash; +	skb_copy_hash(new, old); +	new->ooo_okay		= old->ooo_okay; +	new->no_fcs		= old->no_fcs; +	new->encapsulation	= old->encapsulation; +	new->encap_hdr_csum	= old->encap_hdr_csum; +	new->csum_valid		= old->csum_valid; +	new->csum_complete_sw	= old->csum_complete_sw;  #ifdef CONFIG_XFRM  	new->sp			= secpath_get(old->sp);  #endif  	memcpy(new->cb, old->cb, sizeof(old->cb));  	new->csum		= old->csum; -	new->local_df		= old->local_df; +	new->ignore_df		= old->ignore_df;  	new->pkt_type		= old->pkt_type;  	new->ip_summed		= old->ip_summed;  	skb_copy_queue_mapping(new, old);  	new->priority		= old->priority; -	new->deliver_no_wcard	= old->deliver_no_wcard; -#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) +#if IS_ENABLED(CONFIG_IP_VS)  	new->ipvs_property	= old->ipvs_property;  #endif +	new->pfmemalloc		= old->pfmemalloc;  	new->protocol		= old->protocol;  	new->mark		= old->mark;  	new->skb_iif		= old->skb_iif;  	__nf_copy(new, old); -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ -    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) -	new->nf_trace		= old->nf_trace; -#endif  #ifdef CONFIG_NET_SCHED  	new->tc_index		= old->tc_index;  #ifdef CONFIG_NET_CLS_ACT  	new->tc_verd		= old->tc_verd;  
#endif  #endif +	new->vlan_proto		= old->vlan_proto;  	new->vlan_tci		= old->vlan_tci;  	skb_copy_secmark(new, old); + +#ifdef CONFIG_NET_RX_BUSY_POLL +	new->napi_id	= old->napi_id; +#endif  }  /* @@ -565,6 +748,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)  	C(tail);  	C(end);  	C(head); +	C(head_frag);  	C(data);  	C(truesize);  	atomic_set(&n->users, 1); @@ -594,6 +778,67 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)  EXPORT_SYMBOL_GPL(skb_morph);  /** + *	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel + *	@skb: the skb to modify + *	@gfp_mask: allocation priority + * + *	This must be called on SKBTX_DEV_ZEROCOPY skb. + *	It will copy all frags into kernel and drop the reference + *	to userspace pages. + * + *	If this function is called from an interrupt gfp_mask() must be + *	%GFP_ATOMIC. + * + *	Returns 0 on success or a negative error code on failure + *	to allocate kernel memory to copy to. + */ +int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) +{ +	int i; +	int num_frags = skb_shinfo(skb)->nr_frags; +	struct page *page, *head = NULL; +	struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg; + +	for (i = 0; i < num_frags; i++) { +		u8 *vaddr; +		skb_frag_t *f = &skb_shinfo(skb)->frags[i]; + +		page = alloc_page(gfp_mask); +		if (!page) { +			while (head) { +				struct page *next = (struct page *)page_private(head); +				put_page(head); +				head = next; +			} +			return -ENOMEM; +		} +		vaddr = kmap_atomic(skb_frag_page(f)); +		memcpy(page_address(page), +		       vaddr + f->page_offset, skb_frag_size(f)); +		kunmap_atomic(vaddr); +		set_page_private(page, (unsigned long)head); +		head = page; +	} + +	/* skb frags release userspace buffers */ +	for (i = 0; i < num_frags; i++) +		skb_frag_unref(skb, i); + +	uarg->callback(uarg, false); + +	/* skb frags point to kernel buffers */ +	for (i = num_frags - 1; i >= 0; i--) { +		__skb_fill_page_desc(skb, i, head, 0, +				     skb_shinfo(skb)->frags[i].size); +		head = (struct page *)page_private(head); +	} + +	skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; +	return 0; +} +EXPORT_SYMBOL_GPL(skb_copy_ubufs); + +/**   *	skb_clone	-	duplicate an sk_buff   *	@skb: buffer to clone   *	@gfp_mask: allocation priority @@ -611,6 +856,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)  {  	struct sk_buff *n; +	if (skb_orphan_frags(skb, gfp_mask)) +		return NULL; +  	n = skb + 1;  	if (skb->fclone == SKB_FCLONE_ORIG &&  	    n->fclone == SKB_FCLONE_UNAVAILABLE) { @@ -618,6 +866,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)  		n->fclone = SKB_FCLONE_CLONE;  		atomic_inc(fclone_ref);  	} else { +		if (skb_pfmemalloc(skb)) +			gfp_mask |= __GFP_MEMALLOC; +  		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);  		if (!n)  			return NULL; @@ -631,29 +882,37 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)  }  EXPORT_SYMBOL(skb_clone); -static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +static void skb_headers_offset_update(struct sk_buff *skb, int off)  { -#ifndef NET_SKBUFF_DATA_USES_OFFSET -	/* -	 *	Shift between the two data areas in bytes -	 */ -	unsigned long offset = new->data - old->data; -#endif +	/* Only adjust this if it actually is csum_start rather than csum */ +	if (skb->ip_summed == CHECKSUM_PARTIAL) +		skb->csum_start += off; +	/* {transport,network,mac}_header and tail are relative to skb->head */ +	skb->transport_header += off; +	skb->network_header   += off; +	if 
(skb_mac_header_was_set(skb)) +		skb->mac_header += off; +	skb->inner_transport_header += off; +	skb->inner_network_header += off; +	skb->inner_mac_header += off; +} +static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +{  	__copy_skb_header(new, old); -#ifndef NET_SKBUFF_DATA_USES_OFFSET -	/* {transport,network,mac}_header are relative to skb->head */ -	new->transport_header += offset; -	new->network_header   += offset; -	if (skb_mac_header_was_set(new)) -		new->mac_header	      += offset; -#endif  	skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;  	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;  	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;  } +static inline int skb_alloc_rx_flag(const struct sk_buff *skb) +{ +	if (skb_pfmemalloc(skb)) +		return SKB_ALLOC_RX; +	return 0; +} +  /**   *	skb_copy	-	create private copy of an sk_buff   *	@skb: buffer to copy @@ -674,8 +933,9 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)  struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)  {  	int headerlen = skb_headroom(skb); -	unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len; -	struct sk_buff *n = alloc_skb(size, gfp_mask); +	unsigned int size = skb_end_offset(skb) + skb->data_len; +	struct sk_buff *n = __alloc_skb(size, gfp_mask, +					skb_alloc_rx_flag(skb), NUMA_NO_NODE);  	if (!n)  		return NULL; @@ -694,9 +954,13 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)  EXPORT_SYMBOL(skb_copy);  /** - *	pskb_copy	-	create copy of an sk_buff with private head. + *	__pskb_copy_fclone	-  create copy of an sk_buff with private head.   *	@skb: buffer to copy + *	@headroom: headroom of new skb   *	@gfp_mask: allocation priority + *	@fclone: if true allocate the copy of the skb from the fclone + *	cache instead of the head cache; it is recommended to set this + *	to true for the cases where the copy will likely be cloned   *   *	Make a copy of both an &sk_buff and part of its data, located   *	in header. Fragmented data remain shared. This is used when @@ -706,16 +970,18 @@ EXPORT_SYMBOL(skb_copy);   *	The returned buffer has a reference count of 1.   */ -struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) +struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, +				   gfp_t gfp_mask, bool fclone)  { -	unsigned int size = skb_end_pointer(skb) - skb->head; -	struct sk_buff *n = alloc_skb(size, gfp_mask); +	unsigned int size = skb_headlen(skb) + headroom; +	int flags = skb_alloc_rx_flag(skb) | (fclone ? 
SKB_ALLOC_FCLONE : 0); +	struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);  	if (!n)  		goto out;  	/* Set the data pointer */ -	skb_reserve(n, skb_headroom(skb)); +	skb_reserve(n, headroom);  	/* Set the tail pointer and length */  	skb_put(n, skb_headlen(skb));  	/* Copy the bytes */ @@ -728,9 +994,14 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)  	if (skb_shinfo(skb)->nr_frags) {  		int i; +		if (skb_orphan_frags(skb, gfp_mask)) { +			kfree_skb(n); +			n = NULL; +			goto out; +		}  		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {  			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; -			get_page(skb_shinfo(n)->frags[i].page); +			skb_frag_ref(skb, i);  		}  		skb_shinfo(n)->nr_frags = i;  	} @@ -744,7 +1015,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)  out:  	return n;  } -EXPORT_SYMBOL(pskb_copy); +EXPORT_SYMBOL(__pskb_copy_fclone);  /**   *	pskb_expand_head - reallocate header of &sk_buff @@ -753,8 +1024,8 @@ EXPORT_SYMBOL(pskb_copy);   *	@ntail: room to add at tail   *	@gfp_mask: allocation priority   * - *	Expands (or creates identical copy, if &nhead and &ntail are zero) - *	header of skb. &sk_buff itself is not changed. &sk_buff MUST have + *	Expands (or creates identical copy, if @nhead and @ntail are zero) + *	header of @skb. &sk_buff itself is not changed. &sk_buff MUST have   *	reference count of 1. Returns zero in the case of success or error,   *	if expansion failed. In the last case, &sk_buff is not changed.   * @@ -767,9 +1038,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,  {  	int i;  	u8 *data; -	int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail; +	int size = nhead + skb_end_offset(skb) + ntail;  	long off; -	bool fastpath;  	BUG_ON(nhead < 0); @@ -778,31 +1048,13 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,  	size = SKB_DATA_ALIGN(size); -	/* Check if we can avoid taking references on fragments if we own -	 * the last reference on skb->head. (see skb_release_data()) -	 */ -	if (!skb->cloned) -		fastpath = true; -	else { -		int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1; - -		fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta; -	} - -	if (fastpath && -	    size + sizeof(struct skb_shared_info) <= ksize(skb->head)) { -		memmove(skb->head + size, skb_shinfo(skb), -			offsetof(struct skb_shared_info, -				 frags[skb_shinfo(skb)->nr_frags])); -		memmove(skb->head + nhead, skb->head, -			skb_tail_pointer(skb) - skb->head); -		off = nhead; -		goto adjust_others; -	} - -	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); +	if (skb_pfmemalloc(skb)) +		gfp_mask |= __GFP_MEMALLOC; +	data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), +			       gfp_mask, NUMA_NO_NODE, NULL);  	if (!data)  		goto nodata; +	size = SKB_WITH_OVERHEAD(ksize(data));  	/* Copy only real data... and, alas, header. This should be  	 * optimized for the cases when header is void. 
@@ -813,21 +1065,29 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,  	       skb_shinfo(skb),  	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); -	if (fastpath) { -		kfree(skb->head); -	} else { +	/* +	 * if shinfo is shared we must drop the old head gracefully, but if it +	 * is not we can just drop the old head and let the existing refcount +	 * be since all we did is relocate the values +	 */ +	if (skb_cloned(skb)) { +		/* copy this zero copy skb frags */ +		if (skb_orphan_frags(skb, gfp_mask)) +			goto nofrags;  		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) -			get_page(skb_shinfo(skb)->frags[i].page); +			skb_frag_ref(skb, i);  		if (skb_has_frag_list(skb))  			skb_clone_fraglist(skb);  		skb_release_data(skb); +	} else { +		skb_free_head(skb);  	}  	off = (data + nhead) - skb->head;  	skb->head     = data; -adjust_others: +	skb->head_frag = 0;  	skb->data    += off;  #ifdef NET_SKBUFF_DATA_USES_OFFSET  	skb->end      = size; @@ -835,21 +1095,16 @@ adjust_others:  #else  	skb->end      = skb->head + size;  #endif -	/* {transport,network,mac}_header and tail are relative to skb->head */  	skb->tail	      += off; -	skb->transport_header += off; -	skb->network_header   += off; -	if (skb_mac_header_was_set(skb)) -		skb->mac_header += off; -	/* Only adjust this if it actually is csum_start rather than csum */ -	if (skb->ip_summed == CHECKSUM_PARTIAL) -		skb->csum_start += nhead; +	skb_headers_offset_update(skb, nhead);  	skb->cloned   = 0;  	skb->hdr_len  = 0;  	skb->nohdr    = 0;  	atomic_set(&skb_shinfo(skb)->dataref, 1);  	return 0; +nofrags: +	kfree(data);  nodata:  	return -ENOMEM;  } @@ -901,11 +1156,11 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,  	/*  	 *	Allocate the copy buffer  	 */ -	struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, -				      gfp_mask); +	struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, +					gfp_mask, skb_alloc_rx_flag(skb), +					NUMA_NO_NODE);  	int oldheadroom = skb_headroom(skb);  	int head_copy_len, head_copy_off; -	int off;  	if (!n)  		return NULL; @@ -929,15 +1184,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,  	copy_skb_header(n, skb); -	off                  = newheadroom - oldheadroom; -	if (n->ip_summed == CHECKSUM_PARTIAL) -		n->csum_start += off; -#ifdef NET_SKBUFF_DATA_USES_OFFSET -	n->transport_header += off; -	n->network_header   += off; -	if (skb_mac_header_was_set(skb)) -		n->mac_header += off; -#endif +	skb_headers_offset_update(n, newheadroom - oldheadroom);  	return n;  } @@ -990,6 +1237,29 @@ free_skb:  EXPORT_SYMBOL(skb_pad);  /** + *	pskb_put - add data to the tail of a potentially fragmented buffer + *	@skb: start of the buffer to use + *	@tail: tail fragment of the buffer to use + *	@len: amount of data to add + * + *	This function extends the used data area of the potentially + *	fragmented buffer. @tail must be the last fragment of @skb -- or + *	@skb itself. If this would exceed the total buffer size the kernel + *	will panic. A pointer to the first byte of the extra data is + *	returned. 
+ */ + +unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) +{ +	if (tail != skb) { +		skb->data_len += len; +		skb->len += len; +	} +	return skb_put(tail, len); +} +EXPORT_SYMBOL_GPL(pskb_put); + +/**   *	skb_put - add data to a buffer   *	@skb: buffer to use   *	@len: amount of data to add @@ -1082,20 +1352,20 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len)  		goto drop_pages;  	for (; i < nfrags; i++) { -		int end = offset + skb_shinfo(skb)->frags[i].size; +		int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);  		if (end < len) {  			offset = end;  			continue;  		} -		skb_shinfo(skb)->frags[i++].size = len - offset; +		skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);  drop_pages:  		skb_shinfo(skb)->nr_frags = i;  		for (; i < nfrags; i++) -			put_page(skb_shinfo(skb)->frags[i].page); +			skb_frag_unref(skb, i);  		if (skb_has_frag_list(skb))  			skb_drop_fraglist(skb); @@ -1114,7 +1384,7 @@ drop_pages:  				return -ENOMEM;  			nfrag->next = frag->next; -			kfree_skb(frag); +			consume_skb(frag);  			frag = nfrag;  			*fragp = frag;  		} @@ -1198,9 +1468,11 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)  	/* Estimate size of pulled pages. */  	eat = delta;  	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { -		if (skb_shinfo(skb)->frags[i].size >= eat) +		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + +		if (size >= eat)  			goto pull_pages; -		eat -= skb_shinfo(skb)->frags[i].size; +		eat -= size;  	}  	/* If we need update frag list, we are in troubles. @@ -1263,14 +1535,16 @@ pull_pages:  	eat = delta;  	k = 0;  	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { -		if (skb_shinfo(skb)->frags[i].size <= eat) { -			put_page(skb_shinfo(skb)->frags[i].page); -			eat -= skb_shinfo(skb)->frags[i].size; +		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + +		if (size <= eat) { +			skb_frag_unref(skb, i); +			eat -= size;  		} else {  			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];  			if (eat) {  				skb_shinfo(skb)->frags[k].page_offset += eat; -				skb_shinfo(skb)->frags[k].size -= eat; +				skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);  				eat = 0;  			}  			k++; @@ -1285,8 +1559,21 @@ pull_pages:  }  EXPORT_SYMBOL(__pskb_pull_tail); -/* Copy some data bits from skb to kernel buffer. */ - +/** + *	skb_copy_bits - copy bits from skb to kernel buffer + *	@skb: source skb + *	@offset: offset in source + *	@to: destination buffer + *	@len: number of bytes to copy + * + *	Copy the specified number of bytes from the source skb to the + *	destination buffer. + * + *	CAUTION ! : + *		If its prototype is ever changed, + *		check arch/{*}/net/{*}.S files, + *		since it is called from BPF assembly code. 
+ */  int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)  {  	int start = skb_headlen(skb); @@ -1309,21 +1596,22 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)  	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {  		int end; +		skb_frag_t *f = &skb_shinfo(skb)->frags[i];  		WARN_ON(start > offset + len); -		end = start + skb_shinfo(skb)->frags[i].size; +		end = start + skb_frag_size(f);  		if ((copy = end - offset) > 0) {  			u8 *vaddr;  			if (copy > len)  				copy = len; -			vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); +			vaddr = kmap_atomic(skb_frag_page(f));  			memcpy(to, -			       vaddr + skb_shinfo(skb)->frags[i].page_offset+ -			       offset - start, copy); -			kunmap_skb_frag(vaddr); +			       vaddr + f->page_offset + offset - start, +			       copy); +			kunmap_atomic(vaddr);  			if ((len -= copy) == 0)  				return 0; @@ -1351,6 +1639,7 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)  		}  		start = end;  	} +  	if (!len)  		return 0; @@ -1368,140 +1657,122 @@ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)  	put_page(spd->pages[i]);  } -static inline struct page *linear_to_page(struct page *page, unsigned int *len, -					  unsigned int *offset, -					  struct sk_buff *skb, struct sock *sk) +static struct page *linear_to_page(struct page *page, unsigned int *len, +				   unsigned int *offset, +				   struct sock *sk)  { -	struct page *p = sk->sk_sndmsg_page; -	unsigned int off; - -	if (!p) { -new_page: -		p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0); -		if (!p) -			return NULL; +	struct page_frag *pfrag = sk_page_frag(sk); -		off = sk->sk_sndmsg_off = 0; -		/* hold one ref to this page until it's full */ -	} else { -		unsigned int mlen; +	if (!sk_page_frag_refill(sk, pfrag)) +		return NULL; -		off = sk->sk_sndmsg_off; -		mlen = PAGE_SIZE - off; -		if (mlen < 64 && mlen < *len) { -			put_page(p); -			goto new_page; -		} +	*len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); -		*len = min_t(unsigned int, *len, mlen); -	} +	memcpy(page_address(pfrag->page) + pfrag->offset, +	       page_address(page) + *offset, *len); +	*offset = pfrag->offset; +	pfrag->offset += *len; -	memcpy(page_address(p) + off, page_address(page) + *offset, *len); -	sk->sk_sndmsg_off += *len; -	*offset = off; -	get_page(p); +	return pfrag->page; +} -	return p; +static bool spd_can_coalesce(const struct splice_pipe_desc *spd, +			     struct page *page, +			     unsigned int offset) +{ +	return	spd->nr_pages && +		spd->pages[spd->nr_pages - 1] == page && +		(spd->partial[spd->nr_pages - 1].offset + +		 spd->partial[spd->nr_pages - 1].len == offset);  }  /*   * Fill page/offset/length into spd, if it can hold more pages.   
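[Editor's note: a hedged sketch of the gather-copy structure that skb_copy_bits() above follows: walk the linear area first, then each fragment, clamping the copy at every step and advancing the running offset until len bytes are produced. The data layout (one linear buffer plus an array of fragments) is a simplified stand-in for struct sk_buff; frag_list recursion and page mapping are omitted.]

#include <stddef.h>
#include <string.h>

struct frag { const unsigned char *data; size_t size; };

struct flatpkt {
    const unsigned char *linear;
    size_t               headlen;
    const struct frag   *frags;
    size_t               nr_frags;
};

static int flat_copy_bits(const struct flatpkt *p, size_t offset,
                          unsigned char *to, size_t len)
{
    size_t start, i;

    if (offset < p->headlen) {
        size_t copy = p->headlen - offset;

        if (copy > len)
            copy = len;
        memcpy(to, p->linear + offset, copy);
        to += copy; offset += copy; len -= copy;
    }
    start = p->headlen;

    for (i = 0; i < p->nr_frags && len; i++) {
        size_t end = start + p->frags[i].size;

        if (offset < end) {
            size_t copy = end - offset;

            if (copy > len)
                copy = len;
            memcpy(to, p->frags[i].data + (offset - start), copy);
            to += copy; offset += copy; len -= copy;
        }
        start = end;
    }
    return len ? -1 : 0;   /* -EFAULT analogue: asked for more than exists */
}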
*/ -static inline int spd_fill_page(struct splice_pipe_desc *spd, -				struct pipe_inode_info *pipe, struct page *page, -				unsigned int *len, unsigned int offset, -				struct sk_buff *skb, int linear, -				struct sock *sk) +static bool spd_fill_page(struct splice_pipe_desc *spd, +			  struct pipe_inode_info *pipe, struct page *page, +			  unsigned int *len, unsigned int offset, +			  bool linear, +			  struct sock *sk)  { -	if (unlikely(spd->nr_pages == pipe->buffers)) -		return 1; +	if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) +		return true;  	if (linear) { -		page = linear_to_page(page, len, &offset, skb, sk); +		page = linear_to_page(page, len, &offset, sk);  		if (!page) -			return 1; -	} else -		get_page(page); - +			return true; +	} +	if (spd_can_coalesce(spd, page, offset)) { +		spd->partial[spd->nr_pages - 1].len += *len; +		return false; +	} +	get_page(page);  	spd->pages[spd->nr_pages] = page;  	spd->partial[spd->nr_pages].len = *len;  	spd->partial[spd->nr_pages].offset = offset;  	spd->nr_pages++; -	return 0; +	return false;  } -static inline void __segment_seek(struct page **page, unsigned int *poff, -				  unsigned int *plen, unsigned int off) -{ -	unsigned long n; - -	*poff += off; -	n = *poff / PAGE_SIZE; -	if (n) -		*page = nth_page(*page, n); - -	*poff = *poff % PAGE_SIZE; -	*plen -= off; -} - -static inline int __splice_segment(struct page *page, unsigned int poff, -				   unsigned int plen, unsigned int *off, -				   unsigned int *len, struct sk_buff *skb, -				   struct splice_pipe_desc *spd, int linear, -				   struct sock *sk, -				   struct pipe_inode_info *pipe) +static bool __splice_segment(struct page *page, unsigned int poff, +			     unsigned int plen, unsigned int *off, +			     unsigned int *len, +			     struct splice_pipe_desc *spd, bool linear, +			     struct sock *sk, +			     struct pipe_inode_info *pipe)  {  	if (!*len) -		return 1; +		return true;  	/* skip this segment if already processed */  	if (*off >= plen) {  		*off -= plen; -		return 0; +		return false;  	}  	/* ignore any bits we already processed */ -	if (*off) { -		__segment_seek(&page, &poff, &plen, *off); -		*off = 0; -	} +	poff += *off; +	plen -= *off; +	*off = 0;  	do {  		unsigned int flen = min(*len, plen); -		/* the linear region may spread across several pages  */ -		flen = min_t(unsigned int, flen, PAGE_SIZE - poff); - -		if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk)) -			return 1; - -		__segment_seek(&page, &poff, &plen, flen); +		if (spd_fill_page(spd, pipe, page, &flen, poff, +				  linear, sk)) +			return true; +		poff += flen; +		plen -= flen;  		*len -= flen; -  	} while (*len && plen); -	return 0; +	return false;  }  /* - * Map linear and fragment data from the skb to spd. It reports failure if the + * Map linear and fragment data from the skb to spd. It reports true if the   * pipe is full or if we already spliced the requested length.   
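[Editor's note: a hedged sketch of the coalescing check that spd_can_coalesce()/spd_fill_page() above perform: before adding a new (page, offset, len) slot to the splice descriptor, see whether it simply continues the previous slot and, if so, extend that slot instead of consuming another entry. The descriptor table here is an illustrative fixed-size stand-in.]

#include <stdbool.h>
#include <stddef.h>

struct pdesc { const void *page; size_t offset, len; };

struct pvec {
    struct pdesc d[16];
    size_t       nr;
};

static bool pvec_add(struct pvec *v, const void *page, size_t offset, size_t len)
{
    if (v->nr &&
        v->d[v->nr - 1].page == page &&
        v->d[v->nr - 1].offset + v->d[v->nr - 1].len == offset) {
        v->d[v->nr - 1].len += len;   /* contiguous with the last slot: coalesce */
        return true;
    }
    if (v->nr == 16)
        return false;                 /* descriptor table full */
    v->d[v->nr++] = (struct pdesc){ page, offset, len };
    return true;
}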
*/ -static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, -			     unsigned int *offset, unsigned int *len, -			     struct splice_pipe_desc *spd, struct sock *sk) +static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, +			      unsigned int *offset, unsigned int *len, +			      struct splice_pipe_desc *spd, struct sock *sk)  {  	int seg; -	/* -	 * map the linear part +	/* map the linear part : +	 * If skb->head_frag is set, this 'linear' part is backed by a +	 * fragment, and if the head is not shared with any clones then +	 * we can avoid a copy since we own the head portion of this page.  	 */  	if (__splice_segment(virt_to_page(skb->data),  			     (unsigned long) skb->data & (PAGE_SIZE - 1),  			     skb_headlen(skb), -			     offset, len, skb, spd, 1, sk, pipe)) -		return 1; +			     offset, len, spd, +			     skb_head_is_locked(skb), +			     sk, pipe)) +		return true;  	/*  	 * then map the fragments @@ -1509,12 +1780,13 @@ static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,  	for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {  		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; -		if (__splice_segment(f->page, f->page_offset, f->size, -				     offset, len, skb, spd, 0, sk, pipe)) -			return 1; +		if (__splice_segment(skb_frag_page(f), +				     f->page_offset, skb_frag_size(f), +				     offset, len, spd, false, sk, pipe)) +			return true;  	} -	return 0; +	return false;  }  /* @@ -1527,22 +1799,20 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,  		    struct pipe_inode_info *pipe, unsigned int tlen,  		    unsigned int flags)  { -	struct partial_page partial[PIPE_DEF_BUFFERS]; -	struct page *pages[PIPE_DEF_BUFFERS]; +	struct partial_page partial[MAX_SKB_FRAGS]; +	struct page *pages[MAX_SKB_FRAGS];  	struct splice_pipe_desc spd = {  		.pages = pages,  		.partial = partial, +		.nr_pages_max = MAX_SKB_FRAGS,  		.flags = flags, -		.ops = &sock_pipe_buf_ops, +		.ops = &nosteal_pipe_buf_ops,  		.spd_release = sock_spd_release,  	};  	struct sk_buff *frag_iter;  	struct sock *sk = skb->sk;  	int ret = 0; -	if (splice_grow_spd(pipe, &spd)) -		return -ENOMEM; -  	/*  	 * __skb_splice_bits() only fails if the output has no room left,  	 * so no point in going over the frag_list for the error case. @@ -1578,7 +1848,6 @@ done:  		lock_sock(sk);  	} -	splice_shrink_spd(pipe, &spd);  	return ret;  } @@ -1619,17 +1888,17 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)  		WARN_ON(start > offset + len); -		end = start + frag->size; +		end = start + skb_frag_size(frag);  		if ((copy = end - offset) > 0) {  			u8 *vaddr;  			if (copy > len)  				copy = len; -			vaddr = kmap_skb_frag(frag); +			vaddr = kmap_atomic(skb_frag_page(frag));  			memcpy(vaddr + frag->page_offset + offset - start,  			       from, copy); -			kunmap_skb_frag(vaddr); +			kunmap_atomic(vaddr);  			if ((len -= copy) == 0)  				return 0; @@ -1667,9 +1936,8 @@ fault:  EXPORT_SYMBOL(skb_store_bits);  /* Checksum skb data. 
*/ - -__wsum skb_checksum(const struct sk_buff *skb, int offset, -			  int len, __wsum csum) +__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, +		      __wsum csum, const struct skb_checksum_ops *ops)  {  	int start = skb_headlen(skb);  	int i, copy = start - offset; @@ -1680,7 +1948,7 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,  	if (copy > 0) {  		if (copy > len)  			copy = len; -		csum = csum_partial(skb->data + offset, copy, csum); +		csum = ops->update(skb->data + offset, copy, csum);  		if ((len -= copy) == 0)  			return csum;  		offset += copy; @@ -1689,22 +1957,22 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,  	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {  		int end; +		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];  		WARN_ON(start > offset + len); -		end = start + skb_shinfo(skb)->frags[i].size; +		end = start + skb_frag_size(frag);  		if ((copy = end - offset) > 0) {  			__wsum csum2;  			u8 *vaddr; -			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];  			if (copy > len)  				copy = len; -			vaddr = kmap_skb_frag(frag); -			csum2 = csum_partial(vaddr + frag->page_offset + -					     offset - start, copy, 0); -			kunmap_skb_frag(vaddr); -			csum = csum_block_add(csum, csum2, pos); +			vaddr = kmap_atomic(skb_frag_page(frag)); +			csum2 = ops->update(vaddr + frag->page_offset + +					    offset - start, copy, 0); +			kunmap_atomic(vaddr); +			csum = ops->combine(csum, csum2, pos, copy);  			if (!(len -= copy))  				return csum;  			offset += copy; @@ -1723,9 +1991,9 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,  			__wsum csum2;  			if (copy > len)  				copy = len; -			csum2 = skb_checksum(frag_iter, offset - start, -					     copy, 0); -			csum = csum_block_add(csum, csum2, pos); +			csum2 = __skb_checksum(frag_iter, offset - start, +					       copy, 0, ops); +			csum = ops->combine(csum, csum2, pos, copy);  			if ((len -= copy) == 0)  				return csum;  			offset += copy; @@ -1737,6 +2005,18 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,  	return csum;  } +EXPORT_SYMBOL(__skb_checksum); + +__wsum skb_checksum(const struct sk_buff *skb, int offset, +		    int len, __wsum csum) +{ +	const struct skb_checksum_ops ops = { +		.update  = csum_partial_ext, +		.combine = csum_block_add_ext, +	}; + +	return __skb_checksum(skb, offset, len, csum, &ops); +}  EXPORT_SYMBOL(skb_checksum);  /* Both of above in one bottle. 
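[Editor's note: a hedged sketch of the ops-parameterised walker that the __skb_checksum()/skb_checksum_ops split above introduces: the buffer traversal is written once and the per-chunk arithmetic is supplied through a small ops table, so a plain Internet checksum and any other rolling digest can share the same walk. The arithmetic here is a simplified byte sum standing in for csum_partial()/csum_block_add(); names are illustrative.]

#include <stddef.h>
#include <stdint.h>

struct csum_ops {
    uint32_t (*update)(const uint8_t *p, size_t len, uint32_t csum);
    uint32_t (*combine)(uint32_t csum, uint32_t csum2, size_t offset, size_t len);
};

struct chunk { const uint8_t *data; size_t len; };

static uint32_t walk_checksum(const struct chunk *chunks, size_t n,
                              uint32_t csum, const struct csum_ops *ops)
{
    size_t i, pos = 0;

    for (i = 0; i < n; i++) {
        uint32_t csum2 = ops->update(chunks[i].data, chunks[i].len, 0);

        csum = ops->combine(csum, csum2, pos, chunks[i].len);
        pos += chunks[i].len;
    }
    return csum;
}

/* one concrete ops table; pass &simple_sum_ops to walk_checksum() */
static uint32_t sum_update(const uint8_t *p, size_t len, uint32_t csum)
{
    while (len--)
        csum += *p++;
    return csum;
}

static uint32_t sum_combine(uint32_t csum, uint32_t csum2, size_t offset, size_t len)
{
    (void)offset; (void)len;
    return csum + csum2;
}

static const struct csum_ops simple_sum_ops = {
    .update  = sum_update,
    .combine = sum_combine,
};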
*/ @@ -1767,7 +2047,7 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,  		WARN_ON(start > offset + len); -		end = start + skb_shinfo(skb)->frags[i].size; +		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);  		if ((copy = end - offset) > 0) {  			__wsum csum2;  			u8 *vaddr; @@ -1775,12 +2055,12 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,  			if (copy > len)  				copy = len; -			vaddr = kmap_skb_frag(frag); +			vaddr = kmap_atomic(skb_frag_page(frag));  			csum2 = csum_partial_copy_nocheck(vaddr +  							  frag->page_offset +  							  offset - start, to,  							  copy, 0); -			kunmap_skb_frag(vaddr); +			kunmap_atomic(vaddr);  			csum = csum_block_add(csum, csum2, pos);  			if (!(len -= copy))  				return csum; @@ -1818,13 +2098,111 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,  }  EXPORT_SYMBOL(skb_copy_and_csum_bits); + /** + *	skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() + *	@from: source buffer + * + *	Calculates the amount of linear headroom needed in the 'to' skb passed + *	into skb_zerocopy(). + */ +unsigned int +skb_zerocopy_headlen(const struct sk_buff *from) +{ +	unsigned int hlen = 0; + +	if (!from->head_frag || +	    skb_headlen(from) < L1_CACHE_BYTES || +	    skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) +		hlen = skb_headlen(from); + +	if (skb_has_frag_list(from)) +		hlen = from->len; + +	return hlen; +} +EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); + +/** + *	skb_zerocopy - Zero copy skb to skb + *	@to: destination buffer + *	@from: source buffer + *	@len: number of bytes to copy from source buffer + *	@hlen: size of linear headroom in destination buffer + * + *	Copies up to `len` bytes from `from` to `to` by creating references + *	to the frags in the source buffer. + * + *	The `hlen` as calculated by skb_zerocopy_headlen() specifies the + *	headroom in the `to` buffer. 
+ * + *	Return value: + *	0: everything is OK + *	-ENOMEM: couldn't orphan frags of @from due to lack of memory + *	-EFAULT: skb_copy_bits() found some problem with skb geometry + */ +int +skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) +{ +	int i, j = 0; +	int plen = 0; /* length of skb->head fragment */ +	int ret; +	struct page *page; +	unsigned int offset; + +	BUG_ON(!from->head_frag && !hlen); + +	/* dont bother with small payloads */ +	if (len <= skb_tailroom(to)) +		return skb_copy_bits(from, 0, skb_put(to, len), len); + +	if (hlen) { +		ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); +		if (unlikely(ret)) +			return ret; +		len -= hlen; +	} else { +		plen = min_t(int, skb_headlen(from), len); +		if (plen) { +			page = virt_to_head_page(from->head); +			offset = from->data - (unsigned char *)page_address(page); +			__skb_fill_page_desc(to, 0, page, offset, plen); +			get_page(page); +			j = 1; +			len -= plen; +		} +	} + +	to->truesize += len + plen; +	to->len += len + plen; +	to->data_len += len + plen; + +	if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { +		skb_tx_error(from); +		return -ENOMEM; +	} + +	for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { +		if (!len) +			break; +		skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; +		skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len); +		len -= skb_shinfo(to)->frags[j].size; +		skb_frag_ref(to, j); +		j++; +	} +	skb_shinfo(to)->nr_frags = j; + +	return 0; +} +EXPORT_SYMBOL_GPL(skb_zerocopy); +  void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)  {  	__wsum csum;  	long csstart;  	if (skb->ip_summed == CHECKSUM_PARTIAL) -		csstart = skb->csum_start - skb_headroom(skb); +		csstart = skb_checksum_start_offset(skb);  	else  		csstart = skb_headlen(skb); @@ -2040,7 +2418,7 @@ static inline void skb_split_no_header(struct sk_buff *skb,  	skb->data_len		  = len - pos;  	for (i = 0; i < nfrags; i++) { -		int size = skb_shinfo(skb)->frags[i].size; +		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);  		if (pos + size > len) {  			skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; @@ -2054,10 +2432,10 @@ static inline void skb_split_no_header(struct sk_buff *skb,  				 *    where splitting is expensive.  				 * 2. Split is accurately. We make this.  				 */ -				get_page(skb_shinfo(skb)->frags[i].page); +				skb_frag_ref(skb, i);  				skb_shinfo(skb1)->frags[0].page_offset += len - pos; -				skb_shinfo(skb1)->frags[0].size -= len - pos; -				skb_shinfo(skb)->frags[i].size	= len - pos; +				skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); +				skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);  				skb_shinfo(skb)->nr_frags++;  			}  			k++; @@ -2078,6 +2456,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)  {  	int pos = skb_headlen(skb); +	skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;  	if (len < pos)	/* Split line is inside header. */  		skb_split_inside_header(skb, skb1, len, pos);  	else		/* Second chunk has no header, nothing to copy. */ @@ -2101,7 +2480,7 @@ static int skb_prepare_for_shift(struct sk_buff *skb)   * @shiftlen: shift up to this many bytes   *   * Attempts to shift up to shiftlen worth of bytes, which may be less than - * the length of the skb, from tgt to skb. Returns number bytes shifted. + * the length of the skb, from skb to tgt. Returns number bytes shifted.   * It's up to caller to free skb if everything was shifted.   
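[Editor's note: a hedged sketch of the share-instead-of-copy step at the heart of skb_zerocopy() above: once the small linear headroom has been copied, the destination simply takes references to the source's fragments, clamping the last one so no more than len bytes are attached. The fragment type is a simplified stand-in for skb frags; page reference counting (skb_frag_ref) and orphaning are omitted.]

#include <stddef.h>

struct zc_frag { const void *page; size_t offset; size_t size; };

static size_t share_frags(struct zc_frag *to, size_t to_max,
                          const struct zc_frag *from, size_t nr_from, size_t len)
{
    size_t i, j = 0;

    for (i = 0; i < nr_from && j < to_max && len; i++) {
        to[j] = from[i];              /* a reference, not a data copy */
        if (to[j].size > len)
            to[j].size = len;         /* clamp the final fragment     */
        len -= to[j].size;
        j++;
    }
    return j;                         /* number of fragments shared   */
}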
*   * If @tgt runs out of frags, the whole operation is aborted. @@ -2129,12 +2508,13 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)  	 * commit all, so that we don't have to undo partial changes  	 */  	if (!to || -	    !skb_can_coalesce(tgt, to, fragfrom->page, fragfrom->page_offset)) { +	    !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), +			      fragfrom->page_offset)) {  		merge = -1;  	} else {  		merge = to - 1; -		todo -= fragfrom->size; +		todo -= skb_frag_size(fragfrom);  		if (todo < 0) {  			if (skb_prepare_for_shift(skb) ||  			    skb_prepare_for_shift(tgt)) @@ -2144,8 +2524,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)  			fragfrom = &skb_shinfo(skb)->frags[from];  			fragto = &skb_shinfo(tgt)->frags[merge]; -			fragto->size += shiftlen; -			fragfrom->size -= shiftlen; +			skb_frag_size_add(fragto, shiftlen); +			skb_frag_size_sub(fragfrom, shiftlen);  			fragfrom->page_offset += shiftlen;  			goto onlymerged; @@ -2169,20 +2549,20 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)  		fragfrom = &skb_shinfo(skb)->frags[from];  		fragto = &skb_shinfo(tgt)->frags[to]; -		if (todo >= fragfrom->size) { +		if (todo >= skb_frag_size(fragfrom)) {  			*fragto = *fragfrom; -			todo -= fragfrom->size; +			todo -= skb_frag_size(fragfrom);  			from++;  			to++;  		} else { -			get_page(fragfrom->page); +			__skb_frag_ref(fragfrom);  			fragto->page = fragfrom->page;  			fragto->page_offset = fragfrom->page_offset; -			fragto->size = todo; +			skb_frag_size_set(fragto, todo);  			fragfrom->page_offset += todo; -			fragfrom->size -= todo; +			skb_frag_size_sub(fragfrom, todo);  			todo = 0;  			to++; @@ -2197,8 +2577,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)  		fragfrom = &skb_shinfo(skb)->frags[0];  		fragto = &skb_shinfo(tgt)->frags[merge]; -		fragto->size += fragfrom->size; -		put_page(fragfrom->page); +		skb_frag_size_add(fragto, skb_frag_size(fragfrom)); +		__skb_frag_unref(fragfrom);  	}  	/* Reposition in the original skb */ @@ -2254,18 +2634,18 @@ EXPORT_SYMBOL(skb_prepare_seq_read);   * @data: destination pointer for data to be returned   * @st: state variable   * - * Reads a block of skb data at &consumed relative to the + * Reads a block of skb data at @consumed relative to the   * lower offset specified to skb_prepare_seq_read(). Assigns - * the head of the data block to &data and returns the length + * the head of the data block to @data and returns the length   * of the block or 0 if the end of the skb data or the upper   * offset has been reached.   *   * The caller is not required to consume all of the data - * returned, i.e. &consumed is typically set to the number + * returned, i.e. @consumed is typically set to the number   * of bytes already consumed and the next call to   * skb_seq_read() will return the remaining part of the block.   * - * Note 1: The size of each block of data returned can be arbitary, + * Note 1: The size of each block of data returned can be arbitrary,   *       this limitation is the cost for zerocopy seqeuental   *       reads of potentially non linear data.   
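[Editor's note: a hedged sketch of the sequential-read cursor that skb_seq_read() above maintains over non-linear data: the caller keeps a small state object and each call returns a pointer/length pair for the next readable block, whatever chunk it happens to live in. This stand-in uses a plain array of chunks and omits the kmap_atomic() mapping and frag_list handling.]

#include <stddef.h>

struct chunk { const unsigned char *data; size_t len; };

struct seq_state {
    const struct chunk *chunks;
    size_t              nr_chunks;
    size_t              idx;       /* current chunk                 */
    size_t              stepped;   /* bytes before the current one  */
};

/* returns the length of the block starting at `consumed`, or 0 at the end */
static size_t seq_read(size_t consumed, const unsigned char **data,
                       struct seq_state *st)
{
    while (st->idx < st->nr_chunks) {
        const struct chunk *c = &st->chunks[st->idx];
        size_t limit = st->stepped + c->len;

        if (consumed < limit) {
            *data = c->data + (consumed - st->stepped);
            return limit - consumed;
        }
        st->stepped = limit;
        st->idx++;
    }
    return 0;
}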
* @@ -2279,8 +2659,13 @@ unsigned int skb_seq_read(unsigned int consumed, const u8 **data,  	unsigned int block_limit, abs_offset = consumed + st->lower_offset;  	skb_frag_t *frag; -	if (unlikely(abs_offset >= st->upper_offset)) +	if (unlikely(abs_offset >= st->upper_offset)) { +		if (st->frag_data) { +			kunmap_atomic(st->frag_data); +			st->frag_data = NULL; +		}  		return 0; +	}  next_skb:  	block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; @@ -2295,11 +2680,11 @@ next_skb:  	while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {  		frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; -		block_limit = frag->size + st->stepped_offset; +		block_limit = skb_frag_size(frag) + st->stepped_offset;  		if (abs_offset < block_limit) {  			if (!st->frag_data) -				st->frag_data = kmap_skb_frag(frag); +				st->frag_data = kmap_atomic(skb_frag_page(frag));  			*data = (u8 *) st->frag_data + frag->page_offset +  				(abs_offset - st->stepped_offset); @@ -2308,16 +2693,16 @@ next_skb:  		}  		if (st->frag_data) { -			kunmap_skb_frag(st->frag_data); +			kunmap_atomic(st->frag_data);  			st->frag_data = NULL;  		}  		st->frag_idx++; -		st->stepped_offset += frag->size; +		st->stepped_offset += skb_frag_size(frag);  	}  	if (st->frag_data) { -		kunmap_skb_frag(st->frag_data); +		kunmap_atomic(st->frag_data);  		st->frag_data = NULL;  	} @@ -2345,7 +2730,7 @@ EXPORT_SYMBOL(skb_seq_read);  void skb_abort_seq_read(struct skb_seq_state *st)  {  	if (st->frag_data) -		kunmap_skb_frag(st->frag_data); +		kunmap_atomic(st->frag_data);  }  EXPORT_SYMBOL(skb_abort_seq_read); @@ -2393,7 +2778,7 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,  EXPORT_SYMBOL(skb_find_text);  /** - * skb_append_datato_frags: - append the user data to a skb + * skb_append_datato_frags - append the user data to a skb   * @sk: sock  structure   * @skb: skb structure to be appened with user data.   * @getfrag: call back function to be used for getting the user data @@ -2408,52 +2793,37 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,  					int len, int odd, struct sk_buff *skb),  			void *from, int length)  { -	int frg_cnt = 0; -	skb_frag_t *frag = NULL; -	struct page *page = NULL; -	int copy, left; +	int frg_cnt = skb_shinfo(skb)->nr_frags; +	int copy;  	int offset = 0;  	int ret; +	struct page_frag *pfrag = ¤t->task_frag;  	do {  		/* Return error if we don't have space for new frag */ -		frg_cnt = skb_shinfo(skb)->nr_frags;  		if (frg_cnt >= MAX_SKB_FRAGS) -			return -EFAULT; +			return -EMSGSIZE; -		/* allocate a new page for next frag */ -		page = alloc_pages(sk->sk_allocation, 0); - -		/* If alloc_page fails just return failure and caller will -		 * free previous allocated pages by doing kfree_skb() -		 */ -		if (page == NULL) +		if (!sk_page_frag_refill(sk, pfrag))  			return -ENOMEM; -		/* initialize the next frag */ -		sk->sk_sndmsg_page = page; -		sk->sk_sndmsg_off = 0; -		skb_fill_page_desc(skb, frg_cnt, page, 0, 0); -		skb->truesize += PAGE_SIZE; -		atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); - -		/* get the new initialized frag */ -		frg_cnt = skb_shinfo(skb)->nr_frags; -		frag = &skb_shinfo(skb)->frags[frg_cnt - 1]; -  		/* copy the user data to page */ -		left = PAGE_SIZE - frag->page_offset; -		copy = (length > left)? 
left : length; +		copy = min_t(int, length, pfrag->size - pfrag->offset); -		ret = getfrag(from, (page_address(frag->page) + -			    frag->page_offset + frag->size), -			    offset, copy, 0, skb); +		ret = getfrag(from, page_address(pfrag->page) + pfrag->offset, +			      offset, copy, 0, skb);  		if (ret < 0)  			return -EFAULT;  		/* copy was successful so update the size parameters */ -		sk->sk_sndmsg_off += copy; -		frag->size += copy; +		skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset, +				   copy); +		frg_cnt++; +		pfrag->offset += copy; +		get_page(pfrag->page); + +		skb->truesize += copy; +		atomic_add(copy, &sk->sk_wmem_alloc);  		skb->len += copy;  		skb->data_len += copy;  		offset += copy; @@ -2488,72 +2858,109 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum);  /**   *	skb_segment - Perform protocol segmentation on skb. - *	@skb: buffer to segment + *	@head_skb: buffer to segment   *	@features: features for the output path (see dev->features)   *   *	This function performs segmentation on the given skb.  It returns   *	a pointer to the first in a list of new skbs for the segments.   *	In case of error it returns ERR_PTR(err).   */ -struct sk_buff *skb_segment(struct sk_buff *skb, int features) +struct sk_buff *skb_segment(struct sk_buff *head_skb, +			    netdev_features_t features)  {  	struct sk_buff *segs = NULL;  	struct sk_buff *tail = NULL; -	struct sk_buff *fskb = skb_shinfo(skb)->frag_list; -	unsigned int mss = skb_shinfo(skb)->gso_size; -	unsigned int doffset = skb->data - skb_mac_header(skb); +	struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; +	skb_frag_t *frag = skb_shinfo(head_skb)->frags; +	unsigned int mss = skb_shinfo(head_skb)->gso_size; +	unsigned int doffset = head_skb->data - skb_mac_header(head_skb); +	struct sk_buff *frag_skb = head_skb;  	unsigned int offset = doffset; +	unsigned int tnl_hlen = skb_tnl_header_len(head_skb);  	unsigned int headroom;  	unsigned int len; -	int sg = features & NETIF_F_SG; -	int nfrags = skb_shinfo(skb)->nr_frags; +	__be16 proto; +	bool csum; +	int sg = !!(features & NETIF_F_SG); +	int nfrags = skb_shinfo(head_skb)->nr_frags;  	int err = -ENOMEM;  	int i = 0;  	int pos; +	int dummy; + +	__skb_push(head_skb, doffset); +	proto = skb_network_protocol(head_skb, &dummy); +	if (unlikely(!proto)) +		return ERR_PTR(-EINVAL); + +	csum = !head_skb->encap_hdr_csum && +	    !!can_checksum_protocol(features, proto); -	__skb_push(skb, doffset); -	headroom = skb_headroom(skb); -	pos = skb_headlen(skb); +	headroom = skb_headroom(head_skb); +	pos = skb_headlen(head_skb);  	do {  		struct sk_buff *nskb; -		skb_frag_t *frag; +		skb_frag_t *nskb_frag;  		int hsize;  		int size; -		len = skb->len - offset; +		len = head_skb->len - offset;  		if (len > mss)  			len = mss; -		hsize = skb_headlen(skb) - offset; +		hsize = skb_headlen(head_skb) - offset;  		if (hsize < 0)  			hsize = 0;  		if (hsize > len || !sg)  			hsize = len; -		if (!hsize && i >= nfrags) { -			BUG_ON(fskb->len != len); +		if (!hsize && i >= nfrags && skb_headlen(list_skb) && +		    (skb_headlen(list_skb) == len || sg)) { +			BUG_ON(skb_headlen(list_skb) > len); + +			i = 0; +			nfrags = skb_shinfo(list_skb)->nr_frags; +			frag = skb_shinfo(list_skb)->frags; +			frag_skb = list_skb; +			pos += skb_headlen(list_skb); + +			while (pos < offset + len) { +				BUG_ON(i >= nfrags); + +				size = skb_frag_size(frag); +				if (pos + size > offset + len) +					break; -			pos += len; -			nskb = skb_clone(fskb, GFP_ATOMIC); -			fskb = fskb->next; +				i++; +				pos += size; +				
frag++; +			} + +			nskb = skb_clone(list_skb, GFP_ATOMIC); +			list_skb = list_skb->next;  			if (unlikely(!nskb))  				goto err; -			hsize = skb_end_pointer(nskb) - nskb->head; +			if (unlikely(pskb_trim(nskb, len))) { +				kfree_skb(nskb); +				goto err; +			} + +			hsize = skb_end_offset(nskb);  			if (skb_cow_head(nskb, doffset + headroom)) {  				kfree_skb(nskb);  				goto err;  			} -			nskb->truesize += skb_end_pointer(nskb) - nskb->head - -					  hsize; +			nskb->truesize += skb_end_offset(nskb) - hsize;  			skb_release_head_state(nskb);  			__skb_push(nskb, doffset);  		} else { -			nskb = alloc_skb(hsize + doffset + headroom, -					 GFP_ATOMIC); +			nskb = __alloc_skb(hsize + doffset + headroom, +					   GFP_ATOMIC, skb_alloc_rx_flag(head_skb), +					   NUMA_NO_NODE);  			if (unlikely(!nskb))  				goto err; @@ -2568,121 +2975,133 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)  			segs = nskb;  		tail = nskb; -		__copy_skb_header(nskb, skb); -		nskb->mac_len = skb->mac_len; +		__copy_skb_header(nskb, head_skb); +		nskb->mac_len = head_skb->mac_len; -		/* nskb and skb might have different headroom */ -		if (nskb->ip_summed == CHECKSUM_PARTIAL) -			nskb->csum_start += skb_headroom(nskb) - headroom; +		skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); -		skb_reset_mac_header(nskb); -		skb_set_network_header(nskb, skb->mac_len); -		nskb->transport_header = (nskb->network_header + -					  skb_network_header_len(skb)); -		skb_copy_from_linear_data(skb, nskb->data, doffset); +		skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, +						 nskb->data - tnl_hlen, +						 doffset + tnl_hlen); -		if (fskb != skb_shinfo(skb)->frag_list) -			continue; +		if (nskb->len == len + doffset) +			goto perform_csum_check;  		if (!sg) {  			nskb->ip_summed = CHECKSUM_NONE; -			nskb->csum = skb_copy_and_csum_bits(skb, offset, +			nskb->csum = skb_copy_and_csum_bits(head_skb, offset,  							    skb_put(nskb, len),  							    len, 0); +			SKB_GSO_CB(nskb)->csum_start = +			    skb_headroom(nskb) + doffset;  			continue;  		} -		frag = skb_shinfo(nskb)->frags; +		nskb_frag = skb_shinfo(nskb)->frags; -		skb_copy_from_linear_data_offset(skb, offset, +		skb_copy_from_linear_data_offset(head_skb, offset,  						 skb_put(nskb, hsize), hsize); -		while (pos < offset + len && i < nfrags) { -			*frag = skb_shinfo(skb)->frags[i]; -			get_page(frag->page); -			size = frag->size; +		skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags & +			SKBTX_SHARED_FRAG; + +		while (pos < offset + len) { +			if (i >= nfrags) { +				BUG_ON(skb_headlen(list_skb)); + +				i = 0; +				nfrags = skb_shinfo(list_skb)->nr_frags; +				frag = skb_shinfo(list_skb)->frags; +				frag_skb = list_skb; + +				BUG_ON(!nfrags); + +				list_skb = list_skb->next; +			} + +			if (unlikely(skb_shinfo(nskb)->nr_frags >= +				     MAX_SKB_FRAGS)) { +				net_warn_ratelimited( +					"skb_segment: too many frags: %u %u\n", +					pos, mss); +				goto err; +			} + +			if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC))) +				goto err; + +			*nskb_frag = *frag; +			__skb_frag_ref(nskb_frag); +			size = skb_frag_size(nskb_frag);  			if (pos < offset) { -				frag->page_offset += offset - pos; -				frag->size -= offset - pos; +				nskb_frag->page_offset += offset - pos; +				skb_frag_size_sub(nskb_frag, offset - pos);  			}  			skb_shinfo(nskb)->nr_frags++;  			if (pos + size <= offset + len) {  				i++; +				frag++;  				pos += size;  			} else { -				frag->size -= pos + size - (offset + len); +				
skb_frag_size_sub(nskb_frag, pos + size - (offset + len));  				goto skip_fraglist;  			} -			frag++; -		} - -		if (pos < offset + len) { -			struct sk_buff *fskb2 = fskb; - -			BUG_ON(pos + fskb->len != offset + len); - -			pos += fskb->len; -			fskb = fskb->next; - -			if (fskb2->next) { -				fskb2 = skb_clone(fskb2, GFP_ATOMIC); -				if (!fskb2) -					goto err; -			} else -				skb_get(fskb2); - -			SKB_FRAG_ASSERT(nskb); -			skb_shinfo(nskb)->frag_list = fskb2; +			nskb_frag++;  		}  skip_fraglist:  		nskb->data_len = len - hsize;  		nskb->len += nskb->data_len;  		nskb->truesize += nskb->data_len; -	} while ((offset += len) < skb->len); + +perform_csum_check: +		if (!csum) { +			nskb->csum = skb_checksum(nskb, doffset, +						  nskb->len - doffset, 0); +			nskb->ip_summed = CHECKSUM_NONE; +			SKB_GSO_CB(nskb)->csum_start = +			    skb_headroom(nskb) + doffset; +		} +	} while ((offset += len) < head_skb->len);  	return segs;  err: -	while ((skb = segs)) { -		segs = skb->next; -		kfree_skb(skb); -	} +	kfree_skb_list(segs);  	return ERR_PTR(err);  }  EXPORT_SYMBOL_GPL(skb_segment);  int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)  { -	struct sk_buff *p = *head; -	struct sk_buff *nskb; -	struct skb_shared_info *skbinfo = skb_shinfo(skb); -	struct skb_shared_info *pinfo = skb_shinfo(p); -	unsigned int headroom; -	unsigned int len = skb_gro_len(skb); +	struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);  	unsigned int offset = skb_gro_offset(skb);  	unsigned int headlen = skb_headlen(skb); +	struct sk_buff *nskb, *lp, *p = *head; +	unsigned int len = skb_gro_len(skb); +	unsigned int delta_truesize; +	unsigned int headroom; -	if (p->len + len >= 65536) +	if (unlikely(p->len + len >= 65536))  		return -E2BIG; -	if (pinfo->frag_list) -		goto merge; -	else if (headlen <= offset) { +	lp = NAPI_GRO_CB(p)->last; +	pinfo = skb_shinfo(lp); + +	if (headlen <= offset) {  		skb_frag_t *frag;  		skb_frag_t *frag2;  		int i = skbinfo->nr_frags;  		int nr_frags = pinfo->nr_frags + i; -		offset -= headlen; -  		if (nr_frags > MAX_SKB_FRAGS) -			return -E2BIG; +			goto merge; +		offset -= headlen;  		pinfo->nr_frags = nr_frags;  		skbinfo->nr_frags = 0; @@ -2693,15 +3112,48 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)  		} while (--i);  		frag->page_offset += offset; -		frag->size -= offset; +		skb_frag_size_sub(frag, offset); + +		/* all fragments truesize : remove (head size + sk_buff) */ +		delta_truesize = skb->truesize - +				 SKB_TRUESIZE(skb_end_offset(skb));  		skb->truesize -= skb->data_len;  		skb->len -= skb->data_len;  		skb->data_len = 0; -		NAPI_GRO_CB(skb)->free = 1; +		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; +		goto done; +	} else if (skb->head_frag) { +		int nr_frags = pinfo->nr_frags; +		skb_frag_t *frag = pinfo->frags + nr_frags; +		struct page *page = virt_to_head_page(skb->head); +		unsigned int first_size = headlen - offset; +		unsigned int first_offset; + +		if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) +			goto merge; + +		first_offset = skb->data - +			       (unsigned char *)page_address(page) + +			       offset; + +		pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; + +		frag->page.p	  = page; +		frag->page_offset = first_offset; +		skb_frag_size_set(frag, first_size); + +		memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); +		/* We dont need to clear skbinfo->nr_frags here */ + +		delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); +		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;  	
	goto done; -	} else if (skb_gro_len(p) != pinfo->gso_size) +	} +	if (pinfo->frag_list) +		goto merge; +	if (skb_gro_len(p) != pinfo->gso_size)  		return -E2BIG;  	headroom = skb_headroom(p); @@ -2723,15 +3175,14 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)  	memcpy(skb_mac_header(nskb), skb_mac_header(p),  	       p->data - skb_mac_header(p)); -	*NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);  	skb_shinfo(nskb)->frag_list = p;  	skb_shinfo(nskb)->gso_size = pinfo->gso_size;  	pinfo->gso_size = 0;  	skb_header_release(p); -	nskb->prev = p; +	NAPI_GRO_CB(nskb)->last = p;  	nskb->data_len += p->len; -	nskb->truesize += p->len; +	nskb->truesize += p->truesize;  	nskb->len += p->len;  	*head = nskb; @@ -2741,24 +3192,37 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)  	p = nskb;  merge: +	delta_truesize = skb->truesize;  	if (offset > headlen) { -		skbinfo->frags[0].page_offset += offset - headlen; -		skbinfo->frags[0].size -= offset - headlen; +		unsigned int eat = offset - headlen; + +		skbinfo->frags[0].page_offset += eat; +		skb_frag_size_sub(&skbinfo->frags[0], eat); +		skb->data_len -= eat; +		skb->len -= eat;  		offset = headlen;  	}  	__skb_pull(skb, offset); -	p->prev->next = skb; -	p->prev = skb; +	if (NAPI_GRO_CB(p)->last == p) +		skb_shinfo(p)->frag_list = skb; +	else +		NAPI_GRO_CB(p)->last->next = skb; +	NAPI_GRO_CB(p)->last = skb;  	skb_header_release(skb); +	lp = p;  done:  	NAPI_GRO_CB(p)->count++;  	p->data_len += len; -	p->truesize += len; +	p->truesize += delta_truesize;  	p->len += len; - +	if (lp != p) { +		lp->data_len += len; +		lp->truesize += delta_truesize; +		lp->len += len; +	}  	NAPI_GRO_CB(skb)->same_flow = 1;  	return 0;  } @@ -2812,13 +3276,13 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)  		WARN_ON(start > offset + len); -		end = start + skb_shinfo(skb)->frags[i].size; +		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);  		if ((copy = end - offset) > 0) {  			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];  			if (copy > len)  				copy = len; -			sg_set_page(&sg[elt], frag->page, copy, +			sg_set_page(&sg[elt], skb_frag_page(frag), copy,  					frag->page_offset+offset-start);  			elt++;  			if (!(len -= copy)) @@ -2849,6 +3313,32 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)  	return elt;  } +/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given + * sglist without mark the sg which contain last skb data as the end. + * So the caller can mannipulate sg list as will when padding new data after + * the first call without calling sg_unmark_end to expend sg list. + * + * Scenario to use skb_to_sgvec_nomark: + * 1. sg_init_table + * 2. skb_to_sgvec_nomark(payload1) + * 3. skb_to_sgvec_nomark(payload2) + * + * This is equivalent to: + * 1. sg_init_table + * 2. skb_to_sgvec(payload1) + * 3. sg_unmark_end + * 4. skb_to_sgvec(payload2) + * + * When mapping mutilple payload conditionally, skb_to_sgvec_nomark + * is more preferable. 
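As an illustrative sketch (not from this patch), the two-payload scenario described just above might look like this in a hypothetical caller; my_map_two_payloads() and max_ents are invented names, and both skbs are assumed to carry data.

#include <linux/skbuff.h>
#include <linux/scatterlist.h>

static int my_map_two_payloads(struct sk_buff *skb1, struct sk_buff *skb2,
			       struct scatterlist *sg, unsigned int max_ents)
{
	int n1, n2;

	sg_init_table(sg, max_ents);

	/* Neither call marks its last entry as the end of the list. */
	n1 = skb_to_sgvec_nomark(skb1, sg, 0, skb1->len);
	n2 = skb_to_sgvec_nomark(skb2, sg + n1, 0, skb2->len);

	/* Terminate the list once, when no more data will be appended.
	 * With plain skb_to_sgvec() the caller would instead have to call
	 * sg_unmark_end() between the two mappings.
	 */
	sg_mark_end(&sg[n1 + n2 - 1]);

	return n1 + n2;
}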
+ */ +int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, +			int offset, int len) +{ +	return __skb_to_sgvec(skb, sg, offset, len); +} +EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); +  int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)  {  	int nsg = __skb_to_sgvec(skb, sg, offset, len); @@ -2982,7 +3472,7 @@ static void sock_rmem_free(struct sk_buff *skb)  int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)  {  	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= -	    (unsigned)sk->sk_rcvbuf) +	    (unsigned int)sk->sk_rcvbuf)  		return -ENOMEM;  	skb_orphan(skb); @@ -2990,9 +3480,12 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)  	skb->destructor = sock_rmem_free;  	atomic_add(skb->truesize, &sk->sk_rmem_alloc); +	/* before exiting rcu section, make sure dst is refcounted */ +	skb_dst_force(skb); +  	skb_queue_tail(&sk->sk_error_queue, skb);  	if (!sock_flag(sk, SOCK_DEAD)) -		sk->sk_data_ready(sk, skb->len); +		sk->sk_data_ready(sk);  	return 0;  }  EXPORT_SYMBOL(sock_queue_err_skb); @@ -3008,12 +3501,8 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,  	if (!sk)  		return; -	skb = skb_clone(orig_skb, GFP_ATOMIC); -	if (!skb) -		return; -  	if (hwtstamps) { -		*skb_hwtstamps(skb) = +		*skb_hwtstamps(orig_skb) =  			*hwtstamps;  	} else {  		/* @@ -3021,9 +3510,13 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,  		 * so keep the shared tx_flags and only  		 * store software time stamp  		 */ -		skb->tstamp = ktime_get_real(); +		orig_skb->tstamp = ktime_get_real();  	} +	skb = skb_clone(orig_skb, GFP_ATOMIC); +	if (!skb) +		return; +  	serr = SKB_EXT_ERR(skb);  	memset(serr, 0, sizeof(*serr));  	serr->ee.ee_errno = ENOMSG; @@ -3036,6 +3529,26 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,  }  EXPORT_SYMBOL_GPL(skb_tstamp_tx); +void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) +{ +	struct sock *sk = skb->sk; +	struct sock_exterr_skb *serr; +	int err; + +	skb->wifi_acked_valid = 1; +	skb->wifi_acked = acked; + +	serr = SKB_EXT_ERR(skb); +	memset(serr, 0, sizeof(*serr)); +	serr->ee.ee_errno = ENOMSG; +	serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; + +	err = sock_queue_err_skb(sk, skb); +	if (err) +		kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); +  /**   * skb_partial_csum_set - set up and verify partial csum values for packet @@ -3053,23 +3566,396 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)  {  	if (unlikely(start > skb_headlen(skb)) ||  	    unlikely((int)start + off > skb_headlen(skb) - 2)) { -		if (net_ratelimit()) -			printk(KERN_WARNING -			       "bad partial csum: csum=%u/%u len=%u\n", -			       start, off, skb_headlen(skb)); +		net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n", +				     start, off, skb_headlen(skb));  		return false;  	}  	skb->ip_summed = CHECKSUM_PARTIAL;  	skb->csum_start = skb_headroom(skb) + start;  	skb->csum_offset = off; +	skb_set_transport_header(skb, start);  	return true;  }  EXPORT_SYMBOL_GPL(skb_partial_csum_set); +static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, +			       unsigned int max) +{ +	if (skb_headlen(skb) >= len) +		return 0; + +	/* If we need to pullup then pullup to the max, so we +	 * won't need to do it again. 
+	 */ +	if (max > skb->len) +		max = skb->len; + +	if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) +		return -ENOMEM; + +	if (skb_headlen(skb) < len) +		return -EPROTO; + +	return 0; +} + +#define MAX_TCP_HDR_LEN (15 * 4) + +static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, +				      typeof(IPPROTO_IP) proto, +				      unsigned int off) +{ +	switch (proto) { +		int err; + +	case IPPROTO_TCP: +		err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), +					  off + MAX_TCP_HDR_LEN); +		if (!err && !skb_partial_csum_set(skb, off, +						  offsetof(struct tcphdr, +							   check))) +			err = -EPROTO; +		return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; + +	case IPPROTO_UDP: +		err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), +					  off + sizeof(struct udphdr)); +		if (!err && !skb_partial_csum_set(skb, off, +						  offsetof(struct udphdr, +							   check))) +			err = -EPROTO; +		return err ? ERR_PTR(err) : &udp_hdr(skb)->check; +	} + +	return ERR_PTR(-EPROTO); +} + +/* This value should be large enough to cover a tagged ethernet header plus + * maximally sized IP and TCP or UDP headers. + */ +#define MAX_IP_HDR_LEN 128 + +static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) +{ +	unsigned int off; +	bool fragment; +	__sum16 *csum; +	int err; + +	fragment = false; + +	err = skb_maybe_pull_tail(skb, +				  sizeof(struct iphdr), +				  MAX_IP_HDR_LEN); +	if (err < 0) +		goto out; + +	if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF)) +		fragment = true; + +	off = ip_hdrlen(skb); + +	err = -EPROTO; + +	if (fragment) +		goto out; + +	csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); +	if (IS_ERR(csum)) +		return PTR_ERR(csum); + +	if (recalculate) +		*csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, +					   ip_hdr(skb)->daddr, +					   skb->len - off, +					   ip_hdr(skb)->protocol, 0); +	err = 0; + +out: +	return err; +} + +/* This value should be large enough to cover a tagged ethernet header plus + * an IPv6 header, all options, and a maximal TCP or UDP header. 
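As a hedged sketch of how these checksum helpers are meant to be consumed (not taken from this patch): a driver receiving packets whose checksum must still be completed by the stack could call the skb_checksum_setup() entry point added further down in this hunk roughly as follows; my_rx_fixup() and needs_csum are invented names.

#include <linux/skbuff.h>

static int my_rx_fixup(struct sk_buff *skb, bool needs_csum)
{
	if (!needs_csum)
		return 0;

	/* Rebuilds csum_start/csum_offset for TCP or UDP over IPv4/IPv6;
	 * passing true also re-seeds the checksum field with the
	 * pseudo-header sum.  Fragmented or unsupported packets come
	 * back as -EPROTO.
	 */
	return skb_checksum_setup(skb, true);
}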
+ */ +#define MAX_IPV6_HDR_LEN 256 + +#define OPT_HDR(type, skb, off) \ +	(type *)(skb_network_header(skb) + (off)) + +static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) +{ +	int err; +	u8 nexthdr; +	unsigned int off; +	unsigned int len; +	bool fragment; +	bool done; +	__sum16 *csum; + +	fragment = false; +	done = false; + +	off = sizeof(struct ipv6hdr); + +	err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); +	if (err < 0) +		goto out; + +	nexthdr = ipv6_hdr(skb)->nexthdr; + +	len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); +	while (off <= len && !done) { +		switch (nexthdr) { +		case IPPROTO_DSTOPTS: +		case IPPROTO_HOPOPTS: +		case IPPROTO_ROUTING: { +			struct ipv6_opt_hdr *hp; + +			err = skb_maybe_pull_tail(skb, +						  off + +						  sizeof(struct ipv6_opt_hdr), +						  MAX_IPV6_HDR_LEN); +			if (err < 0) +				goto out; + +			hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); +			nexthdr = hp->nexthdr; +			off += ipv6_optlen(hp); +			break; +		} +		case IPPROTO_AH: { +			struct ip_auth_hdr *hp; + +			err = skb_maybe_pull_tail(skb, +						  off + +						  sizeof(struct ip_auth_hdr), +						  MAX_IPV6_HDR_LEN); +			if (err < 0) +				goto out; + +			hp = OPT_HDR(struct ip_auth_hdr, skb, off); +			nexthdr = hp->nexthdr; +			off += ipv6_authlen(hp); +			break; +		} +		case IPPROTO_FRAGMENT: { +			struct frag_hdr *hp; + +			err = skb_maybe_pull_tail(skb, +						  off + +						  sizeof(struct frag_hdr), +						  MAX_IPV6_HDR_LEN); +			if (err < 0) +				goto out; + +			hp = OPT_HDR(struct frag_hdr, skb, off); + +			if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) +				fragment = true; + +			nexthdr = hp->nexthdr; +			off += sizeof(struct frag_hdr); +			break; +		} +		default: +			done = true; +			break; +		} +	} + +	err = -EPROTO; + +	if (!done || fragment) +		goto out; + +	csum = skb_checksum_setup_ip(skb, nexthdr, off); +	if (IS_ERR(csum)) +		return PTR_ERR(csum); + +	if (recalculate) +		*csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, +					 &ipv6_hdr(skb)->daddr, +					 skb->len - off, nexthdr, 0); +	err = 0; + +out: +	return err; +} + +/** + * skb_checksum_setup - set up partial checksum offset + * @skb: the skb to set up + * @recalculate: if true the pseudo-header checksum will be recalculated + */ +int skb_checksum_setup(struct sk_buff *skb, bool recalculate) +{ +	int err; + +	switch (skb->protocol) { +	case htons(ETH_P_IP): +		err = skb_checksum_setup_ipv4(skb, recalculate); +		break; + +	case htons(ETH_P_IPV6): +		err = skb_checksum_setup_ipv6(skb, recalculate); +		break; + +	default: +		err = -EPROTO; +		break; +	} + +	return err; +} +EXPORT_SYMBOL(skb_checksum_setup); +  void __skb_warn_lro_forwarding(const struct sk_buff *skb)  { -	if (net_ratelimit()) -		pr_warning("%s: received packets cannot be forwarded" -			   " while LRO is enabled\n", skb->dev->name); +	net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", +			     skb->dev->name);  }  EXPORT_SYMBOL(__skb_warn_lro_forwarding); + +void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) +{ +	if (head_stolen) { +		skb_release_head_state(skb); +		kmem_cache_free(skbuff_head_cache, skb); +	} else { +		__kfree_skb(skb); +	} +} +EXPORT_SYMBOL(kfree_skb_partial); + +/** + * skb_try_coalesce - try to merge skb to prior one + * @to: prior buffer + * @from: buffer to add + * @fragstolen: pointer to boolean + * @delta_truesize: how much more was allocated than was requested + */ +bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, +		      bool 
*fragstolen, int *delta_truesize) +{ +	int i, delta, len = from->len; + +	*fragstolen = false; + +	if (skb_cloned(to)) +		return false; + +	if (len <= skb_tailroom(to)) { +		BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); +		*delta_truesize = 0; +		return true; +	} + +	if (skb_has_frag_list(to) || skb_has_frag_list(from)) +		return false; + +	if (skb_headlen(from) != 0) { +		struct page *page; +		unsigned int offset; + +		if (skb_shinfo(to)->nr_frags + +		    skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) +			return false; + +		if (skb_head_is_locked(from)) +			return false; + +		delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); + +		page = virt_to_head_page(from->head); +		offset = from->data - (unsigned char *)page_address(page); + +		skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, +				   page, offset, skb_headlen(from)); +		*fragstolen = true; +	} else { +		if (skb_shinfo(to)->nr_frags + +		    skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS) +			return false; + +		delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); +	} + +	WARN_ON_ONCE(delta < len); + +	memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags, +	       skb_shinfo(from)->frags, +	       skb_shinfo(from)->nr_frags * sizeof(skb_frag_t)); +	skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags; + +	if (!skb_cloned(from)) +		skb_shinfo(from)->nr_frags = 0; + +	/* if the skb is not cloned this does nothing +	 * since we set nr_frags to 0. +	 */ +	for (i = 0; i < skb_shinfo(from)->nr_frags; i++) +		skb_frag_ref(from, i); + +	to->truesize += delta; +	to->len += len; +	to->data_len += len; + +	*delta_truesize = delta; +	return true; +} +EXPORT_SYMBOL(skb_try_coalesce); + +/** + * skb_scrub_packet - scrub an skb + * + * @skb: buffer to clean + * @xnet: packet is crossing netns + * + * skb_scrub_packet can be used after encapsulating or decapsulting a packet + * into/from a tunnel. Some information have to be cleared during these + * operations. + * skb_scrub_packet can also be used to clean a skb before injecting it in + * another namespace (@xnet == true). We have to clear all information in the + * skb that could impact namespace isolation. + */ +void skb_scrub_packet(struct sk_buff *skb, bool xnet) +{ +	if (xnet) +		skb_orphan(skb); +	skb->tstamp.tv64 = 0; +	skb->pkt_type = PACKET_HOST; +	skb->skb_iif = 0; +	skb->ignore_df = 0; +	skb_dst_drop(skb); +	skb->mark = 0; +	secpath_reset(skb); +	nf_reset(skb); +	nf_reset_trace(skb); +} +EXPORT_SYMBOL_GPL(skb_scrub_packet); + +/** + * skb_gso_transport_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_transport_seglen is used to determine the real size of the + * individual segments, including Layer4 headers (TCP/UDP). + * + * The MAC/L2 or network (IP, IPv6) headers are not accounted for. + */ +unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) +{ +	const struct skb_shared_info *shinfo = skb_shinfo(skb); + +	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) +		return tcp_hdrlen(skb) + shinfo->gso_size; + +	/* UFO sets gso_size to the size of the fragmentation +	 * payload, i.e. the size of the L4 (UDP) header is already +	 * accounted for. +	 */ +	return shinfo->gso_size; +} +EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); diff --git a/net/core/sock.c b/net/core/sock.c index fb608011146..026e01f7027 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -89,8 +89,11 @@   *		2 of the License, or (at your option) any later version.   
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/capability.h>  #include <linux/errno.h> +#include <linux/errqueue.h>  #include <linux/types.h>  #include <linux/socket.h>  #include <linux/in.h> @@ -111,9 +114,11 @@  #include <linux/init.h>  #include <linux/highmem.h>  #include <linux/user_namespace.h> +#include <linux/static_key.h> +#include <linux/memcontrol.h> +#include <linux/prefetch.h>  #include <asm/uaccess.h> -#include <asm/system.h>  #include <linux/netdevice.h>  #include <net/protocol.h> @@ -125,13 +130,107 @@  #include <net/xfrm.h>  #include <linux/ipsec.h>  #include <net/cls_cgroup.h> +#include <net/netprio_cgroup.h>  #include <linux/filter.h> +#include <trace/events/sock.h> +  #ifdef CONFIG_INET  #include <net/tcp.h>  #endif +#include <net/busy_poll.h> + +static DEFINE_MUTEX(proto_list_mutex); +static LIST_HEAD(proto_list); + +/** + * sk_ns_capable - General socket capability test + * @sk: Socket to use a capability on or through + * @user_ns: The user namespace of the capability to use + * @cap: The capability to use + * + * Test to see if the opener of the socket had when the socket was + * created and the current process has the capability @cap in the user + * namespace @user_ns. + */ +bool sk_ns_capable(const struct sock *sk, +		   struct user_namespace *user_ns, int cap) +{ +	return file_ns_capable(sk->sk_socket->file, user_ns, cap) && +		ns_capable(user_ns, cap); +} +EXPORT_SYMBOL(sk_ns_capable); + +/** + * sk_capable - Socket global capability test + * @sk: Socket to use a capability on or through + * @cap: The global capbility to use + * + * Test to see if the opener of the socket had when the socket was + * created and the current process has the capability @cap in all user + * namespaces. + */ +bool sk_capable(const struct sock *sk, int cap) +{ +	return sk_ns_capable(sk, &init_user_ns, cap); +} +EXPORT_SYMBOL(sk_capable); + +/** + * sk_net_capable - Network namespace socket capability test + * @sk: Socket to use a capability on or through + * @cap: The capability to use + * + * Test to see if the opener of the socket had when the socke was created + * and the current process has the capability @cap over the network namespace + * the socket is a member of. 
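A small hypothetical illustration of the intended use of the capability helpers documented above (the function name is invented, not from this patch): guarding a privileged action requested over an existing socket instead of using a bare capable() test.

#include <net/sock.h>
#include <linux/capability.h>
#include <linux/errno.h>

static int my_check_admin_request(struct sock *sk)
{
	/* Both the credentials the socket was opened with and the current
	 * task must carry CAP_NET_ADMIN over the user namespace owning
	 * the socket's network namespace.
	 */
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;

	return 0;
}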
+ */ +bool sk_net_capable(const struct sock *sk, int cap) +{ +	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); +} +EXPORT_SYMBOL(sk_net_capable); + + +#ifdef CONFIG_MEMCG_KMEM +int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) +{ +	struct proto *proto; +	int ret = 0; + +	mutex_lock(&proto_list_mutex); +	list_for_each_entry(proto, &proto_list, node) { +		if (proto->init_cgroup) { +			ret = proto->init_cgroup(memcg, ss); +			if (ret) +				goto out; +		} +	} + +	mutex_unlock(&proto_list_mutex); +	return ret; +out: +	list_for_each_entry_continue_reverse(proto, &proto_list, node) +		if (proto->destroy_cgroup) +			proto->destroy_cgroup(memcg); +	mutex_unlock(&proto_list_mutex); +	return ret; +} + +void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg) +{ +	struct proto *proto; + +	mutex_lock(&proto_list_mutex); +	list_for_each_entry_reverse(proto, &proto_list, node) +		if (proto->destroy_cgroup) +			proto->destroy_cgroup(memcg); +	mutex_unlock(&proto_list_mutex); +} +#endif +  /*   * Each address family might have different locking rules, so we have   * one slock key per address family: @@ -139,6 +238,11 @@  static struct lock_class_key af_family_keys[AF_MAX];  static struct lock_class_key af_family_slock_keys[AF_MAX]; +#if defined(CONFIG_MEMCG_KMEM) +struct static_key memcg_socket_limit_enabled; +EXPORT_SYMBOL(memcg_socket_limit_enabled); +#endif +  /*   * Make lock validator output more readable. (we pre-construct these   * strings build-time, so that runtime initialization of socket @@ -157,8 +261,8 @@ static const char *const af_family_key_strings[AF_MAX+1] = {    "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,    "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,    "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   , -  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , -  "sk_lock-AF_MAX" +  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      , +  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"  };  static const char *const af_family_slock_key_strings[AF_MAX+1] = {    "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     , @@ -173,8 +277,8 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {    "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,    "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,    "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   , -  "slock-AF_IEEE802154", "slock-AF_CAIF" , -  "slock-AF_MAX" +  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      , +  "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"  };  static const char *const af_family_clock_key_strings[AF_MAX+1] = {    "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     , @@ -189,8 +293,8 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {    "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,    "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,    "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   , -  "clock-AF_IEEE802154", "clock-AF_CAIF" , -  "clock-AF_MAX" +  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      , +  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"  };  /* @@ -205,24 +309,76 @@ static struct lock_class_key af_callback_keys[AF_MAX];   * not depend upon such differences.   
*/  #define _SK_MEM_PACKETS		256 -#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256) +#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)  #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)  #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)  /* Run time adjustable parameters. */  __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; +EXPORT_SYMBOL(sysctl_wmem_max);  __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; +EXPORT_SYMBOL(sysctl_rmem_max);  __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;  __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; -/* Maximal space eaten by iovec or ancilliary data plus some space */ +/* Maximal space eaten by iovec or ancillary data plus some space */  int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);  EXPORT_SYMBOL(sysctl_optmem_max); -#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP) -int net_cls_subsys_id = -1; -EXPORT_SYMBOL_GPL(net_cls_subsys_id); -#endif +struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL_GPL(memalloc_socks); + +/** + * sk_set_memalloc - sets %SOCK_MEMALLOC + * @sk: socket to set it on + * + * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. + * It's the responsibility of the admin to adjust min_free_kbytes + * to meet the requirements + */ +void sk_set_memalloc(struct sock *sk) +{ +	sock_set_flag(sk, SOCK_MEMALLOC); +	sk->sk_allocation |= __GFP_MEMALLOC; +	static_key_slow_inc(&memalloc_socks); +} +EXPORT_SYMBOL_GPL(sk_set_memalloc); + +void sk_clear_memalloc(struct sock *sk) +{ +	sock_reset_flag(sk, SOCK_MEMALLOC); +	sk->sk_allocation &= ~__GFP_MEMALLOC; +	static_key_slow_dec(&memalloc_socks); + +	/* +	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward +	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while +	 * it has rmem allocations there is a risk that the user of the +	 * socket cannot make forward progress due to exceeding the rmem +	 * limits. By rights, sk_clear_memalloc() should only be called +	 * on sockets being torn down but warn and reset the accounting if +	 * that assumption breaks. 
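A minimal sketch of how a subsystem that must keep making progress under memory pressure (for example a swap-over-network transport) might pair the two helpers above; the function names are invented and not part of the patch.

#include <net/sock.h>

static void my_transport_start(struct sock *sk)
{
	/* Allow this socket's allocations to dip into the emergency
	 * reserves (SOCK_MEMALLOC, __GFP_MEMALLOC); the admin is expected
	 * to raise min_free_kbytes accordingly.
	 */
	sk_set_memalloc(sk);
}

static void my_transport_stop(struct sock *sk)
{
	/* Clear the flag only while tearing the socket down, per the
	 * comment above in sk_clear_memalloc().
	 */
	sk_clear_memalloc(sk);
}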
+	 */ +	if (WARN_ON(sk->sk_forward_alloc)) +		sk_mem_reclaim(sk); +} +EXPORT_SYMBOL_GPL(sk_clear_memalloc); + +int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ +	int ret; +	unsigned long pflags = current->flags; + +	/* these should have been dropped before queueing */ +	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); + +	current->flags |= PF_MEMALLOC; +	ret = sk->sk_backlog_rcv(sk, skb); +	tsk_restore_flags(current, pflags, PF_MEMALLOC); + +	return ret; +} +EXPORT_SYMBOL(__sk_backlog_rcv);  static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)  { @@ -241,9 +397,8 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)  		*timeo_p = 0;  		if (warned < 10 && net_ratelimit()) {  			warned++; -			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) " -			       "tries to set negative timeout\n", -				current->comm, task_pid_nr(current)); +			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", +				__func__, current->comm, task_pid_nr(current));  		}  		return 0;  	} @@ -261,20 +416,20 @@ static void sock_warn_obsolete_bsdism(const char *name)  	static char warncomm[TASK_COMM_LEN];  	if (strcmp(warncomm, current->comm) && warned < 5) {  		strcpy(warncomm,  current->comm); -		printk(KERN_WARNING "process `%s' is using obsolete " -		       "%s SO_BSDCOMPAT\n", warncomm, name); +		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n", +			warncomm, name);  		warned++;  	}  } -static void sock_disable_timestamp(struct sock *sk, int flag) +#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) + +static void sock_disable_timestamp(struct sock *sk, unsigned long flags)  { -	if (sock_flag(sk, flag)) { -		sock_reset_flag(sk, flag); -		if (!sock_flag(sk, SOCK_TIMESTAMP) && -		    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) { +	if (sk->sk_flags & flags) { +		sk->sk_flags &= ~flags; +		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))  			net_disable_timestamp(); -		}  	}  } @@ -286,12 +441,9 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)  	unsigned long flags;  	struct sk_buff_head *list = &sk->sk_receive_queue; -	/* Cast sk->rcvbuf to unsigned... 
It's pointless, but reduces -	   number of warnings when compiling with -W --ANK -	 */ -	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= -	    (unsigned)sk->sk_rcvbuf) { +	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {  		atomic_inc(&sk->sk_drops); +		trace_sock_rcvqueue_full(sk, skb);  		return -ENOMEM;  	} @@ -299,7 +451,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)  	if (err)  		return err; -	if (!sk_rmem_schedule(sk, skb->truesize)) { +	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {  		atomic_inc(&sk->sk_drops);  		return -ENOBUFS;  	} @@ -325,7 +477,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)  	spin_unlock_irqrestore(&list->lock, flags);  	if (!sock_flag(sk, SOCK_DEAD)) -		sk->sk_data_ready(sk, skb_len); +		sk->sk_data_ready(sk);  	return 0;  }  EXPORT_SYMBOL(sock_queue_rcv_skb); @@ -339,7 +491,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)  	skb->dev = NULL; -	if (sk_rcvqueues_full(sk, skb)) { +	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {  		atomic_inc(&sk->sk_drops);  		goto discard_and_relse;  	} @@ -356,7 +508,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)  		rc = sk_backlog_rcv(sk, skb);  		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); -	} else if (sk_add_backlog(sk, skb)) { +	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {  		bh_unlock_sock(sk);  		atomic_inc(&sk->sk_drops);  		goto discard_and_relse; @@ -372,19 +524,13 @@ discard_and_relse:  }  EXPORT_SYMBOL(sk_receive_skb); -void sk_reset_txq(struct sock *sk) -{ -	sk_tx_queue_clear(sk); -} -EXPORT_SYMBOL(sk_reset_txq); -  struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)  {  	struct dst_entry *dst = __sk_dst_get(sk);  	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {  		sk_tx_queue_clear(sk); -		rcu_assign_pointer(sk->sk_dst_cache, NULL); +		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);  		dst_release(dst);  		return NULL;  	} @@ -407,7 +553,8 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)  }  EXPORT_SYMBOL(sk_dst_check); -static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen) +static int sock_setbindtodevice(struct sock *sk, char __user *optval, +				int optlen)  {  	int ret = -ENOPROTOOPT;  #ifdef CONFIG_NETDEVICES @@ -417,7 +564,7 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)  	/* Sorry... 
*/  	ret = -EPERM; -	if (!capable(CAP_NET_RAW)) +	if (!ns_capable(net->user_ns, CAP_NET_RAW))  		goto out;  	ret = -EINVAL; @@ -464,6 +611,46 @@ out:  	return ret;  } +static int sock_getbindtodevice(struct sock *sk, char __user *optval, +				int __user *optlen, int len) +{ +	int ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES +	struct net *net = sock_net(sk); +	char devname[IFNAMSIZ]; + +	if (sk->sk_bound_dev_if == 0) { +		len = 0; +		goto zero; +	} + +	ret = -EINVAL; +	if (len < IFNAMSIZ) +		goto out; + +	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); +	if (ret) +		goto out; + +	len = strlen(devname) + 1; + +	ret = -EFAULT; +	if (copy_to_user(optval, devname, len)) +		goto out; + +zero: +	ret = -EFAULT; +	if (put_user(len, optlen)) +		goto out; + +	ret = 0; + +out: +#endif + +	return ret; +} +  static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)  {  	if (valbool) @@ -491,7 +678,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,  	 */  	if (optname == SO_BINDTODEVICE) -		return sock_bindtodevice(sk, optval, optlen); +		return sock_setbindtodevice(sk, optval, optlen);  	if (optlen < sizeof(int))  		return -EINVAL; @@ -511,7 +698,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,  			sock_valbool_flag(sk, SOCK_DBG, valbool);  		break;  	case SO_REUSEADDR: -		sk->sk_reuse = valbool; +		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); +		break; +	case SO_REUSEPORT: +		sk->sk_reuseport = valbool;  		break;  	case SO_TYPE:  	case SO_PROTOCOL: @@ -527,23 +717,15 @@ int sock_setsockopt(struct socket *sock, int level, int optname,  		break;  	case SO_SNDBUF:  		/* Don't error on this BSD doesn't and if you think -		   about it this is right. Otherwise apps have to -		   play 'guess the biggest size' games. RCVBUF/SNDBUF -		   are treated in BSD as hints */ - -		if (val > sysctl_wmem_max) -			val = sysctl_wmem_max; +		 * about it this is right. Otherwise apps have to +		 * play 'guess the biggest size' games. RCVBUF/SNDBUF +		 * are treated in BSD as hints +		 */ +		val = min_t(u32, val, sysctl_wmem_max);  set_sndbuf:  		sk->sk_userlocks |= SOCK_SNDBUF_LOCK; -		if ((val * 2) < SOCK_MIN_SNDBUF) -			sk->sk_sndbuf = SOCK_MIN_SNDBUF; -		else -			sk->sk_sndbuf = val * 2; - -		/* -		 *	Wake up sending tasks if we -		 *	upped the value. -		 */ +		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF); +		/* Wake up sending tasks if we upped the value. */  		sk->sk_write_space(sk);  		break; @@ -556,12 +738,11 @@ set_sndbuf:  	case SO_RCVBUF:  		/* Don't error on this BSD doesn't and if you think -		   about it this is right. Otherwise apps have to -		   play 'guess the biggest size' games. RCVBUF/SNDBUF -		   are treated in BSD as hints */ - -		if (val > sysctl_rmem_max) -			val = sysctl_rmem_max; +		 * about it this is right. Otherwise apps have to +		 * play 'guess the biggest size' games. RCVBUF/SNDBUF +		 * are treated in BSD as hints +		 */ +		val = min_t(u32, val, sysctl_rmem_max);  set_rcvbuf:  		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;  		/* @@ -579,10 +760,7 @@ set_rcvbuf:  		 * returning the value we actually used in getsockopt  		 * is the most desirable behavior.  		 
*/ -		if ((val * 2) < SOCK_MIN_RCVBUF) -			sk->sk_rcvbuf = SOCK_MIN_RCVBUF; -		else -			sk->sk_rcvbuf = val * 2; +		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);  		break;  	case SO_RCVBUFFORCE: @@ -594,7 +772,8 @@ set_rcvbuf:  	case SO_KEEPALIVE:  #ifdef CONFIG_INET -		if (sk->sk_protocol == IPPROTO_TCP) +		if (sk->sk_protocol == IPPROTO_TCP && +		    sk->sk_type == SOCK_STREAM)  			tcp_set_keepalive(sk, valbool);  #endif  		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); @@ -605,11 +784,12 @@ set_rcvbuf:  		break;  	case SO_NO_CHECK: -		sk->sk_no_check = valbool; +		sk->sk_no_check_tx = valbool;  		break;  	case SO_PRIORITY: -		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) +		if ((val >= 0 && val <= 6) || +		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))  			sk->sk_priority = val;  		else  			ret = -EPERM; @@ -679,7 +859,7 @@ set_rcvbuf:  					      SOCK_TIMESTAMPING_RX_SOFTWARE);  		else  			sock_disable_timestamp(sk, -					       SOCK_TIMESTAMPING_RX_SOFTWARE); +					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));  		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,  				  val & SOF_TIMESTAMPING_SOFTWARE);  		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE, @@ -719,6 +899,13 @@ set_rcvbuf:  		ret = sk_detach_filter(sk);  		break; +	case SO_LOCK_FILTER: +		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) +			ret = -EPERM; +		else +			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); +		break; +  	case SO_PASSSEC:  		if (valbool)  			set_bit(SOCK_PASSSEC, &sock->flags); @@ -726,7 +913,7 @@ set_rcvbuf:  			clear_bit(SOCK_PASSSEC, &sock->flags);  		break;  	case SO_MARK: -		if (!capable(CAP_NET_ADMIN)) +		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))  			ret = -EPERM;  		else  			sk->sk_mark = val; @@ -735,11 +922,48 @@ set_rcvbuf:  		/* We implement the SO_SNDLOWAT etc to  		   not be settable (1003.1g 5.3) */  	case SO_RXQ_OVFL: -		if (valbool) -			sock_set_flag(sk, SOCK_RXQ_OVFL); +		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); +		break; + +	case SO_WIFI_STATUS: +		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); +		break; + +	case SO_PEEK_OFF: +		if (sock->ops->set_peek_off) +			ret = sock->ops->set_peek_off(sk, val);  		else -			sock_reset_flag(sk, SOCK_RXQ_OVFL); +			ret = -EOPNOTSUPP;  		break; + +	case SO_NOFCS: +		sock_valbool_flag(sk, SOCK_NOFCS, valbool); +		break; + +	case SO_SELECT_ERR_QUEUE: +		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); +		break; + +#ifdef CONFIG_NET_RX_BUSY_POLL +	case SO_BUSY_POLL: +		/* allow unprivileged users to decrease the value */ +		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) +			ret = -EPERM; +		else { +			if (val < 0) +				ret = -EINVAL; +			else +				sk->sk_ll_usec = val; +		} +		break; +#endif + +	case SO_MAX_PACING_RATE: +		sk->sk_max_pacing_rate = val; +		sk->sk_pacing_rate = min(sk->sk_pacing_rate, +					 sk->sk_max_pacing_rate); +		break; +  	default:  		ret = -ENOPROTOOPT;  		break; @@ -750,19 +974,18 @@ set_rcvbuf:  EXPORT_SYMBOL(sock_setsockopt); -void cred_to_ucred(struct pid *pid, const struct cred *cred, -		   struct ucred *ucred) +static void cred_to_ucred(struct pid *pid, const struct cred *cred, +			  struct ucred *ucred)  {  	ucred->pid = pid_vnr(pid);  	ucred->uid = ucred->gid = -1;  	if (cred) {  		struct user_namespace *current_ns = current_user_ns(); -		ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid); -		ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid); +		ucred->uid = from_kuid_munged(current_ns, cred->euid); +		ucred->gid = 
from_kgid_munged(current_ns, cred->egid);  	}  } -EXPORT_SYMBOL_GPL(cred_to_ucred);  int sock_getsockopt(struct socket *sock, int level, int optname,  		    char __user *optval, int __user *optlen) @@ -795,7 +1018,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,  		break;  	case SO_BROADCAST: -		v.val = !!sock_flag(sk, SOCK_BROADCAST); +		v.val = sock_flag(sk, SOCK_BROADCAST);  		break;  	case SO_SNDBUF: @@ -810,8 +1033,12 @@ int sock_getsockopt(struct socket *sock, int level, int optname,  		v.val = sk->sk_reuse;  		break; +	case SO_REUSEPORT: +		v.val = sk->sk_reuseport; +		break; +  	case SO_KEEPALIVE: -		v.val = !!sock_flag(sk, SOCK_KEEPOPEN); +		v.val = sock_flag(sk, SOCK_KEEPOPEN);  		break;  	case SO_TYPE: @@ -833,11 +1060,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname,  		break;  	case SO_OOBINLINE: -		v.val = !!sock_flag(sk, SOCK_URGINLINE); +		v.val = sock_flag(sk, SOCK_URGINLINE);  		break;  	case SO_NO_CHECK: -		v.val = sk->sk_no_check; +		v.val = sk->sk_no_check_tx;  		break;  	case SO_PRIORITY: @@ -846,7 +1073,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,  	case SO_LINGER:  		lv		= sizeof(v.ling); -		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER); +		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);  		v.ling.l_linger	= sk->sk_lingertime / HZ;  		break; @@ -912,7 +1139,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,  		break;  	case SO_PASSCRED: -		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0; +		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);  		break;  	case SO_PEERCRED: @@ -947,7 +1174,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,  		break;  	case SO_PASSSEC: -		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0; +		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);  		break;  	case SO_PEERSEC: @@ -958,7 +1185,53 @@ int sock_getsockopt(struct socket *sock, int level, int optname,  		break;  	case SO_RXQ_OVFL: -		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL); +		v.val = sock_flag(sk, SOCK_RXQ_OVFL); +		break; + +	case SO_WIFI_STATUS: +		v.val = sock_flag(sk, SOCK_WIFI_STATUS); +		break; + +	case SO_PEEK_OFF: +		if (!sock->ops->set_peek_off) +			return -EOPNOTSUPP; + +		v.val = sk->sk_peek_off; +		break; +	case SO_NOFCS: +		v.val = sock_flag(sk, SOCK_NOFCS); +		break; + +	case SO_BINDTODEVICE: +		return sock_getbindtodevice(sk, optval, optlen, len); + +	case SO_GET_FILTER: +		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); +		if (len < 0) +			return len; + +		goto lenout; + +	case SO_LOCK_FILTER: +		v.val = sock_flag(sk, SOCK_FILTER_LOCKED); +		break; + +	case SO_BPF_EXTENSIONS: +		v.val = bpf_tell_extensions(); +		break; + +	case SO_SELECT_ERR_QUEUE: +		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); +		break; + +#ifdef CONFIG_NET_RX_BUSY_POLL +	case SO_BUSY_POLL: +		v.val = sk->sk_ll_usec; +		break; +#endif + +	case SO_MAX_PACING_RATE: +		v.val = sk->sk_max_pacing_rate;  		break;  	default: @@ -992,23 +1265,42 @@ static inline void sock_lock_init(struct sock *sk)  /*   * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,   * even temporarly, because of RCU lookups. sk_node should also be left as is. 
+ * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end   */  static void sock_copy(struct sock *nsk, const struct sock *osk)  {  #ifdef CONFIG_SECURITY_NETWORK  	void *sptr = nsk->sk_security;  #endif -	BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) != -		     sizeof(osk->sk_node) + sizeof(osk->sk_refcnt) + -		     sizeof(osk->sk_tx_queue_mapping)); -	memcpy(&nsk->sk_copy_start, &osk->sk_copy_start, -	       osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start)); +	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); + +	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, +	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); +  #ifdef CONFIG_SECURITY_NETWORK  	nsk->sk_security = sptr;  	security_sk_clone(osk, nsk);  #endif  } +void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) +{ +	unsigned long nulls1, nulls2; + +	nulls1 = offsetof(struct sock, __sk_common.skc_node.next); +	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next); +	if (nulls1 > nulls2) +		swap(nulls1, nulls2); + +	if (nulls1 != 0) +		memset((char *)sk, 0, nulls1); +	memset((char *)sk + nulls1 + sizeof(void *), 0, +	       nulls2 - nulls1 - sizeof(void *)); +	memset((char *)sk + nulls2 + sizeof(void *), 0, +	       size - nulls2 - sizeof(void *)); +} +EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls); +  static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,  		int family)  { @@ -1021,19 +1313,12 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,  		if (!sk)  			return sk;  		if (priority & __GFP_ZERO) { -			/* -			 * caches using SLAB_DESTROY_BY_RCU should let -			 * sk_node.next un-modified. Special care is taken -			 * when initializing object to zero. -			 */ -			if (offsetof(struct sock, sk_node.next) != 0) -				memset(sk, 0, offsetof(struct sock, sk_node.next)); -			memset(&sk->sk_node.pprev, 0, -			       prot->obj_size - offsetof(struct sock, -							 sk_node.pprev)); +			if (prot->clear_sk) +				prot->clear_sk(sk, prot->obj_size); +			else +				sk_prot_clear_nulls(sk, prot->obj_size);  		} -	} -	else +	} else  		sk = kmalloc(prot->obj_size, priority);  	if (sk != NULL) { @@ -1075,18 +1360,15 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)  	module_put(owner);  } -#ifdef CONFIG_CGROUPS -void sock_update_classid(struct sock *sk) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) +void sock_update_netprioidx(struct sock *sk)  { -	u32 classid; +	if (in_interrupt()) +		return; -	rcu_read_lock();  /* doing current task, which cannot vanish. 
*/ -	classid = task_cls_classid(current); -	rcu_read_unlock(); -	if (classid && classid != sk->sk_classid) -		sk->sk_classid = classid; +	sk->sk_cgrp_prioidx = task_netprioidx(current);  } -EXPORT_SYMBOL(sock_update_classid); +EXPORT_SYMBOL_GPL(sock_update_netprioidx);  #endif  /** @@ -1114,6 +1396,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,  		atomic_set(&sk->sk_wmem_alloc, 1);  		sock_update_classid(sk); +		sock_update_netprioidx(sk);  	}  	return sk; @@ -1131,15 +1414,14 @@ static void __sk_free(struct sock *sk)  				       atomic_read(&sk->sk_wmem_alloc) == 0);  	if (filter) {  		sk_filter_uncharge(sk, filter); -		rcu_assign_pointer(sk->sk_filter, NULL); +		RCU_INIT_POINTER(sk->sk_filter, NULL);  	} -	sock_disable_timestamp(sk, SOCK_TIMESTAMP); -	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); +	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);  	if (atomic_read(&sk->sk_omem_alloc)) -		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n", -		       __func__, atomic_read(&sk->sk_omem_alloc)); +		pr_debug("%s: optmem leakage (%d bytes) detected\n", +			 __func__, atomic_read(&sk->sk_omem_alloc));  	if (sk->sk_peer_cred)  		put_cred(sk->sk_peer_cred); @@ -1151,7 +1433,7 @@ static void __sk_free(struct sock *sk)  void sk_free(struct sock *sk)  {  	/* -	 * We substract one from sk_wmem_alloc and can know if +	 * We subtract one from sk_wmem_alloc and can know if  	 * some packets are still in some tx queue.  	 * If not null, sock_wfree() will call __sk_free(sk) later  	 */ @@ -1161,10 +1443,10 @@ void sk_free(struct sock *sk)  EXPORT_SYMBOL(sk_free);  /* - * Last sock_put should drop referrence to sk->sk_net. It has already - * been dropped in sk_change_net. Taking referrence to stopping namespace + * Last sock_put should drop reference to sk->sk_net. It has already + * been dropped in sk_change_net. Taking reference to stopping namespace   * is not an option. - * Take referrence to a socket to remove it from hash _alive_ and after that + * Take reference to a socket to remove it from hash _alive_ and after that   * destroy it in the context of init_net.   
*/  void sk_release_kernel(struct sock *sk) @@ -1180,7 +1462,20 @@ void sk_release_kernel(struct sock *sk)  }  EXPORT_SYMBOL(sk_release_kernel); -struct sock *sk_clone(const struct sock *sk, const gfp_t priority) +static void sk_update_clone(const struct sock *sk, struct sock *newsk) +{ +	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) +		sock_update_memcg(newsk); +} + +/** + *	sk_clone_lock - clone a socket, and lock its clone + *	@sk: the socket to clone + *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * + *	Caller must unlock socket even in error path (bh_unlock_sock(newsk)) + */ +struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)  {  	struct sock *newsk; @@ -1233,6 +1528,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)  			/* It is still raw copy of parent, so invalidate  			 * destructor and make plain sk_free() */  			newsk->sk_destruct = NULL; +			bh_unlock_sock(newsk);  			sk_free(newsk);  			newsk = NULL;  			goto out; @@ -1262,17 +1558,18 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)  		sk_set_socket(newsk, NULL);  		newsk->sk_wq = NULL; +		sk_update_clone(sk, newsk); +  		if (newsk->sk_prot->sockets_allocated) -			percpu_counter_inc(newsk->sk_prot->sockets_allocated); +			sk_sockets_allocated_inc(newsk); -		if (sock_flag(newsk, SOCK_TIMESTAMP) || -		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE)) +		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)  			net_enable_timestamp();  	}  out:  	return newsk;  } -EXPORT_SYMBOL_GPL(sk_clone); +EXPORT_SYMBOL_GPL(sk_clone_lock);  void sk_setup_caps(struct sock *sk, struct dst_entry *dst)  { @@ -1287,24 +1584,12 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)  		} else {  			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;  			sk->sk_gso_max_size = dst->dev->gso_max_size; +			sk->sk_gso_max_segs = dst->dev->gso_max_segs;  		}  	}  }  EXPORT_SYMBOL_GPL(sk_setup_caps); -void __init sk_init(void) -{ -	if (totalram_pages <= 4096) { -		sysctl_wmem_max = 32767; -		sysctl_rmem_max = 32767; -		sysctl_wmem_default = 32767; -		sysctl_rmem_default = 32767; -	} else if (totalram_pages >= 131072) { -		sysctl_wmem_max = 131071; -		sysctl_rmem_max = 131071; -	} -} -  /*   *	Simple resource managers for sockets.   */ @@ -1336,6 +1621,25 @@ void sock_wfree(struct sk_buff *skb)  }  EXPORT_SYMBOL(sock_wfree); +void skb_orphan_partial(struct sk_buff *skb) +{ +	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, +	 * so we do not completely orphan skb, but transfert all +	 * accounted bytes but one, to avoid unexpected reorders. +	 */ +	if (skb->destructor == sock_wfree +#ifdef CONFIG_INET +	    || skb->destructor == tcp_wfree +#endif +		) { +		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc); +		skb->truesize = 1; +	} else { +		skb_orphan(skb); +	} +} +EXPORT_SYMBOL(skb_orphan_partial); +  /*   * Read buffer destructor automatically called from kfree_skb.   */ @@ -1349,13 +1653,25 @@ void sock_rfree(struct sk_buff *skb)  }  EXPORT_SYMBOL(sock_rfree); +void sock_edemux(struct sk_buff *skb) +{ +	struct sock *sk = skb->sk; -int sock_i_uid(struct sock *sk) +#ifdef CONFIG_INET +	if (sk->sk_state == TCP_TIME_WAIT) +		inet_twsk_put(inet_twsk(sk)); +	else +#endif +		sock_put(sk); +} +EXPORT_SYMBOL(sock_edemux); + +kuid_t sock_i_uid(struct sock *sk)  { -	int uid; +	kuid_t uid;  	read_lock_bh(&sk->sk_callback_lock); -	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0; +	uid = sk->sk_socket ? 
SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;  	read_unlock_bh(&sk->sk_callback_lock);  	return uid;  } @@ -1390,27 +1706,11 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,  EXPORT_SYMBOL(sock_wmalloc);  /* - * Allocate a skb from the socket's receive buffer. - */ -struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, -			     gfp_t priority) -{ -	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { -		struct sk_buff *skb = alloc_skb(size, priority); -		if (skb) { -			skb_set_owner_r(skb, sk); -			return skb; -		} -	} -	return NULL; -} - -/*   * Allocate a memory block from the socket's option memory buffer.   */  void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)  { -	if ((unsigned)size <= sysctl_optmem_max && +	if ((unsigned int)size <= sysctl_optmem_max &&  	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {  		void *mem;  		/* First do the add, to avoid the race if kmalloc @@ -1470,19 +1770,23 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)  struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,  				     unsigned long data_len, int noblock, -				     int *errcode) +				     int *errcode, int max_page_order)  { -	struct sk_buff *skb; +	struct sk_buff *skb = NULL; +	unsigned long chunk;  	gfp_t gfp_mask;  	long timeo;  	int err; +	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; +	struct page *page; +	int i; -	gfp_mask = sk->sk_allocation; -	if (gfp_mask & __GFP_WAIT) -		gfp_mask |= __GFP_REPEAT; +	err = -EMSGSIZE; +	if (npages > MAX_SKB_FRAGS) +		goto failure;  	timeo = sock_sndtimeo(sk, noblock); -	while (1) { +	while (!skb) {  		err = sock_error(sk);  		if (err != 0)  			goto failure; @@ -1491,54 +1795,54 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,  		if (sk->sk_shutdown & SEND_SHUTDOWN)  			goto failure; -		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { -			skb = alloc_skb(header_len, gfp_mask); -			if (skb) { -				int npages; -				int i; - -				/* No pages, we're done... */ -				if (!data_len) -					break; - -				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; -				skb->truesize += data_len; -				skb_shinfo(skb)->nr_frags = npages; -				for (i = 0; i < npages; i++) { -					struct page *page; -					skb_frag_t *frag; - -					page = alloc_pages(sk->sk_allocation, 0); -					if (!page) { -						err = -ENOBUFS; -						skb_shinfo(skb)->nr_frags = i; -						kfree_skb(skb); -						goto failure; -					} - -					frag = &skb_shinfo(skb)->frags[i]; -					frag->page = page; -					frag->page_offset = 0; -					frag->size = (data_len >= PAGE_SIZE ? -						      PAGE_SIZE : -						      data_len); -					data_len -= PAGE_SIZE; -				} +		if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) { +			set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); +			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +			err = -EAGAIN; +			if (!timeo) +				goto failure; +			if (signal_pending(current)) +				goto interrupted; +			timeo = sock_wait_for_wmem(sk, timeo); +			continue; +		} -				/* Full success... 
*/ -				break; -			} -			err = -ENOBUFS; +		err = -ENOBUFS; +		gfp_mask = sk->sk_allocation; +		if (gfp_mask & __GFP_WAIT) +			gfp_mask |= __GFP_REPEAT; + +		skb = alloc_skb(header_len, gfp_mask); +		if (!skb)  			goto failure; + +		skb->truesize += data_len; + +		for (i = 0; npages > 0; i++) { +			int order = max_page_order; + +			while (order) { +				if (npages >= 1 << order) { +					page = alloc_pages(sk->sk_allocation | +							   __GFP_COMP | +							   __GFP_NOWARN | +							   __GFP_NORETRY, +							   order); +					if (page) +						goto fill_page; +				} +				order--; +			} +			page = alloc_page(sk->sk_allocation); +			if (!page) +				goto failure; +fill_page: +			chunk = min_t(unsigned long, data_len, +				      PAGE_SIZE << order); +			skb_fill_page_desc(skb, i, page, 0, chunk); +			data_len -= chunk; +			npages -= 1 << order;  		} -		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); -		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -		err = -EAGAIN; -		if (!timeo) -			goto failure; -		if (signal_pending(current)) -			goto interrupted; -		timeo = sock_wait_for_wmem(sk, timeo);  	}  	skb_set_owner_w(skb, sk); @@ -1547,6 +1851,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,  interrupted:  	err = sock_intr_errno(timeo);  failure: +	kfree_skb(skb);  	*errcode = err;  	return NULL;  } @@ -1555,10 +1860,66 @@ EXPORT_SYMBOL(sock_alloc_send_pskb);  struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,  				    int noblock, int *errcode)  { -	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); +	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);  }  EXPORT_SYMBOL(sock_alloc_send_skb); +/* On 32bit arches, an skb frag is limited to 2^15 */ +#define SKB_FRAG_PAGE_ORDER	get_order(32768) + +/** + * skb_page_frag_refill - check that a page_frag contains enough room + * @sz: minimum size of the fragment we want to get + * @pfrag: pointer to page_frag + * @prio: priority for memory allocation + * + * Note: While this allocator tries to use high order pages, there is + * no guarantee that allocations succeed. Therefore, @sz MUST be + * less or equal than PAGE_SIZE. 
+ */ +bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio) +{ +	int order; + +	if (pfrag->page) { +		if (atomic_read(&pfrag->page->_count) == 1) { +			pfrag->offset = 0; +			return true; +		} +		if (pfrag->offset + sz <= pfrag->size) +			return true; +		put_page(pfrag->page); +	} + +	order = SKB_FRAG_PAGE_ORDER; +	do { +		gfp_t gfp = prio; + +		if (order) +			gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY; +		pfrag->page = alloc_pages(gfp, order); +		if (likely(pfrag->page)) { +			pfrag->offset = 0; +			pfrag->size = PAGE_SIZE << order; +			return true; +		} +	} while (--order >= 0); + +	return false; +} +EXPORT_SYMBOL(skb_page_frag_refill); + +bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +{ +	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) +		return true; + +	sk_enter_memory_pressure(sk); +	sk_stream_moderate_sndbuf(sk); +	return false; +} +EXPORT_SYMBOL(sk_page_frag_refill); +  static void __lock_sock(struct sock *sk)  	__releases(&sk->sk_lock.slock)  	__acquires(&sk->sk_lock.slock) @@ -1590,6 +1951,7 @@ static void __release_sock(struct sock *sk)  		do {  			struct sk_buff *next = skb->next; +			prefetch(next);  			WARN_ON_ONCE(skb_dst_is_noref(skb));  			skb->next = NULL;  			sk_backlog_rcv(sk, skb); @@ -1654,30 +2016,34 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)  	struct proto *prot = sk->sk_prot;  	int amt = sk_mem_pages(size);  	long allocated; +	int parent_status = UNDER_LIMIT;  	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; -	allocated = atomic_long_add_return(amt, prot->memory_allocated); + +	allocated = sk_memory_allocated_add(sk, amt, &parent_status);  	/* Under limit. */ -	if (allocated <= prot->sysctl_mem[0]) { -		if (prot->memory_pressure && *prot->memory_pressure) -			*prot->memory_pressure = 0; +	if (parent_status == UNDER_LIMIT && +			allocated <= sk_prot_mem_limits(sk, 0)) { +		sk_leave_memory_pressure(sk);  		return 1;  	} -	/* Under pressure. */ -	if (allocated > prot->sysctl_mem[1]) -		if (prot->enter_memory_pressure) -			prot->enter_memory_pressure(sk); +	/* Under pressure. (we or our parents) */ +	if ((parent_status > SOFT_LIMIT) || +			allocated > sk_prot_mem_limits(sk, 1)) +		sk_enter_memory_pressure(sk); -	/* Over hard limit. */ -	if (allocated > prot->sysctl_mem[2]) +	/* Over hard limit (we or our parents) */ +	if ((parent_status == OVER_LIMIT) || +			(allocated > sk_prot_mem_limits(sk, 2)))  		goto suppress_allocation;  	/* guarantee minimum buffer size under pressure */  	if (kind == SK_MEM_RECV) {  		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])  			return 1; +  	} else { /* SK_MEM_SEND */  		if (sk->sk_type == SOCK_STREAM) {  			if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) @@ -1687,13 +2053,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)  				return 1;  	} -	if (prot->memory_pressure) { +	if (sk_has_memory_pressure(sk)) {  		int alloc; -		if (!*prot->memory_pressure) +		if (!sk_under_memory_pressure(sk))  			return 1; -		alloc = percpu_counter_read_positive(prot->sockets_allocated); -		if (prot->sysctl_mem[2] > alloc * +		alloc = sk_sockets_allocated_read_positive(sk); +		if (sk_prot_mem_limits(sk, 2) > alloc *  		    sk_mem_pages(sk->sk_wmem_queued +  				 atomic_read(&sk->sk_rmem_alloc) +  				 sk->sk_forward_alloc)) @@ -1712,9 +2078,13 @@ suppress_allocation:  			return 1;  	} +	trace_sock_exceed_buf_limit(sk, prot, allocated); +  	/* Alas. Undo changes. 
*/  	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; -	atomic_long_sub(amt, prot->memory_allocated); + +	sk_memory_allocated_sub(sk, amt); +  	return 0;  }  EXPORT_SYMBOL(__sk_mem_schedule); @@ -1725,15 +2095,13 @@ EXPORT_SYMBOL(__sk_mem_schedule);   */  void __sk_mem_reclaim(struct sock *sk)  { -	struct proto *prot = sk->sk_prot; - -	atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, -		   prot->memory_allocated); +	sk_memory_allocated_sub(sk, +				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);  	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; -	if (prot->memory_pressure && *prot->memory_pressure && -	    (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0])) -		*prot->memory_pressure = 0; +	if (sk_under_memory_pressure(sk) && +	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) +		sk_leave_memory_pressure(sk);  }  EXPORT_SYMBOL(__sk_mem_reclaim); @@ -1877,14 +2245,14 @@ static void sock_def_error_report(struct sock *sk)  	rcu_read_unlock();  } -static void sock_def_readable(struct sock *sk, int len) +static void sock_def_readable(struct sock *sk)  {  	struct socket_wq *wq;  	rcu_read_lock();  	wq = rcu_dereference(sk->sk_wq);  	if (wq_has_sleeper(wq)) -		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | +		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |  						POLLRDNORM | POLLRDBAND);  	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);  	rcu_read_unlock(); @@ -1936,7 +2304,7 @@ EXPORT_SYMBOL(sk_reset_timer);  void sk_stop_timer(struct sock *sk, struct timer_list* timer)  { -	if (timer_pending(timer) && del_timer(timer)) +	if (del_timer(timer))  		__sock_put(sk);  }  EXPORT_SYMBOL(sk_stop_timer); @@ -1981,8 +2349,9 @@ void sock_init_data(struct socket *sock, struct sock *sk)  	sk->sk_error_report	=	sock_def_error_report;  	sk->sk_destruct		=	sock_def_destruct; -	sk->sk_sndmsg_page	=	NULL; -	sk->sk_sndmsg_off	=	0; +	sk->sk_frag.page	=	NULL; +	sk->sk_frag.offset	=	0; +	sk->sk_peek_off		=	-1;  	sk->sk_peer_pid 	=	NULL;  	sk->sk_peer_cred	=	NULL; @@ -1993,6 +2362,13 @@ void sock_init_data(struct socket *sock, struct sock *sk)  	sk->sk_stamp = ktime_set(-1L, 0); +#ifdef CONFIG_NET_RX_BUSY_POLL +	sk->sk_napi_id		=	0; +	sk->sk_ll_usec		=	sysctl_net_busy_read; +#endif + +	sk->sk_max_pacing_rate = ~0U; +	sk->sk_pacing_rate = ~0U;  	/*  	 * Before updating sk_refcnt, we must commit prior changes to memory  	 * (Documentation/RCU/rculist_nulls.txt for details) @@ -2029,7 +2405,14 @@ void release_sock(struct sock *sk)  	spin_lock_bh(&sk->sk_lock.slock);  	if (sk->sk_backlog.tail)  		__release_sock(sk); -	sk->sk_lock.owned = 0; + +	/* Warning : release_cb() might need to release sk ownership, +	 * ie call sock_release_ownership(sk) before us. +	 */ +	if (sk->sk_prot->release_cb) +		sk->sk_prot->release_cb(sk); + +	sock_release_ownership(sk);  	if (waitqueue_active(&sk->sk_lock.wq))  		wake_up(&sk->sk_lock.wq);  	spin_unlock_bh(&sk->sk_lock.slock); @@ -2104,20 +2487,65 @@ EXPORT_SYMBOL(sock_get_timestampns);  void sock_enable_timestamp(struct sock *sk, int flag)  {  	if (!sock_flag(sk, flag)) { +		unsigned long previous_flags = sk->sk_flags; +  		sock_set_flag(sk, flag);  		/*  		 * we just set one of the two flags which require net  		 * time stamping, but time stamping might have been on  		 * already because of the other one  		 */ -		if (!sock_flag(sk, -				flag == SOCK_TIMESTAMP ? 
-				SOCK_TIMESTAMPING_RX_SOFTWARE : -				SOCK_TIMESTAMP)) +		if (!(previous_flags & SK_FLAGS_TIMESTAMP))  			net_enable_timestamp();  	}  } +int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, +		       int level, int type) +{ +	struct sock_exterr_skb *serr; +	struct sk_buff *skb, *skb2; +	int copied, err; + +	err = -EAGAIN; +	skb = skb_dequeue(&sk->sk_error_queue); +	if (skb == NULL) +		goto out; + +	copied = skb->len; +	if (copied > len) { +		msg->msg_flags |= MSG_TRUNC; +		copied = len; +	} +	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); +	if (err) +		goto out_free_skb; + +	sock_recv_timestamp(msg, sk, skb); + +	serr = SKB_EXT_ERR(skb); +	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); + +	msg->msg_flags |= MSG_ERRQUEUE; +	err = copied; + +	/* Reset and regenerate socket error */ +	spin_lock_bh(&sk->sk_error_queue.lock); +	sk->sk_err = 0; +	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { +		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; +		spin_unlock_bh(&sk->sk_error_queue.lock); +		sk->sk_error_report(sk); +	} else +		spin_unlock_bh(&sk->sk_error_queue.lock); + +out_free_skb: +	kfree_skb(skb); +out: +	return err; +} +EXPORT_SYMBOL(sock_recv_errqueue); +  /*   *	Get a socket option on an socket.   * @@ -2221,13 +2649,16 @@ void sk_common_release(struct sock *sk)  	xfrm_sk_free_policy(sk);  	sk_refcnt_debug_release(sk); + +	if (sk->sk_frag.page) { +		put_page(sk->sk_frag.page); +		sk->sk_frag.page = NULL; +	} +  	sock_put(sk);  }  EXPORT_SYMBOL(sk_common_release); -static DEFINE_RWLOCK(proto_list_lock); -static LIST_HEAD(proto_list); -  #ifdef CONFIG_PROC_FS  #define PROTO_INUSE_NR	64	/* should be enough for the first time */  struct prot_inuse { @@ -2307,7 +2738,7 @@ static void assign_proto_idx(struct proto *prot)  	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);  	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { -		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n"); +		pr_err("PROTO_INUSE_NR exhausted\n");  		return;  	} @@ -2337,8 +2768,8 @@ int proto_register(struct proto *prot, int alloc_slab)  					NULL);  		if (prot->slab == NULL) { -			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", -			       prot->name); +			pr_crit("%s: Can't create sock SLAB cache!\n", +				prot->name);  			goto out;  		} @@ -2352,8 +2783,8 @@ int proto_register(struct proto *prot, int alloc_slab)  								 SLAB_HWCACHE_ALIGN, NULL);  			if (prot->rsk_prot->slab == NULL) { -				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n", -				       prot->name); +				pr_crit("%s: Can't create request sock SLAB cache!\n", +					prot->name);  				goto out_free_request_sock_slab_name;  			}  		} @@ -2376,10 +2807,10 @@ int proto_register(struct proto *prot, int alloc_slab)  		}  	} -	write_lock(&proto_list_lock); +	mutex_lock(&proto_list_mutex);  	list_add(&prot->node, &proto_list);  	assign_proto_idx(prot); -	write_unlock(&proto_list_lock); +	mutex_unlock(&proto_list_mutex);  	return 0;  out_free_timewait_sock_slab_name: @@ -2402,10 +2833,10 @@ EXPORT_SYMBOL(proto_register);  void proto_unregister(struct proto *prot)  { -	write_lock(&proto_list_lock); +	mutex_lock(&proto_list_mutex);  	release_proto_idx(prot);  	list_del(&prot->node); -	write_unlock(&proto_list_lock); +	mutex_unlock(&proto_list_mutex);  	if (prot->slab != NULL) {  		kmem_cache_destroy(prot->slab); @@ -2428,9 +2859,9 @@ EXPORT_SYMBOL(proto_unregister);  #ifdef CONFIG_PROC_FS  static void *proto_seq_start(struct seq_file *seq, loff_t *pos) -	
__acquires(proto_list_lock) +	__acquires(proto_list_mutex)  { -	read_lock(&proto_list_lock); +	mutex_lock(&proto_list_mutex);  	return seq_list_start_head(&proto_list, *pos);  } @@ -2440,25 +2871,36 @@ static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)  }  static void proto_seq_stop(struct seq_file *seq, void *v) -	__releases(proto_list_lock) +	__releases(proto_list_mutex)  { -	read_unlock(&proto_list_lock); +	mutex_unlock(&proto_list_mutex);  }  static char proto_method_implemented(const void *method)  {  	return method == NULL ? 'n' : 'y';  } +static long sock_prot_memory_allocated(struct proto *proto) +{ +	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; +} + +static char *sock_prot_memory_pressure(struct proto *proto) +{ +	return proto->memory_pressure != NULL ? +	proto_memory_pressure(proto) ? "yes" : "no" : "NI"; +}  static void proto_seq_printf(struct seq_file *seq, struct proto *proto)  { +  	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "  			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",  		   proto->name,  		   proto->obj_size,  		   sock_prot_inuse_get(seq_file_net(seq), proto), -		   proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L, -		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", +		   sock_prot_memory_allocated(proto), +		   sock_prot_memory_pressure(proto),  		   proto->max_header,  		   proto->slab == NULL ? "no" : "yes",  		   module_name(proto->owner), @@ -2524,7 +2966,7 @@ static const struct file_operations proto_seq_fops = {  static __net_init int proto_init_net(struct net *net)  { -	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops)) +	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))  		return -ENOMEM;  	return 0; @@ -2532,7 +2974,7 @@ static __net_init int proto_init_net(struct net *net)  static __net_exit void proto_exit_net(struct net *net)  { -	proc_net_remove(net, "protocols"); +	remove_proc_entry("protocols", net->proc_net);  } diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c new file mode 100644 index 00000000000..a4216a4c957 --- /dev/null +++ b/net/core/sock_diag.c @@ -0,0 +1,231 @@ +#include <linux/mutex.h> +#include <linux/socket.h> +#include <linux/skbuff.h> +#include <net/netlink.h> +#include <net/net_namespace.h> +#include <linux/module.h> +#include <net/sock.h> + +#include <linux/inet_diag.h> +#include <linux/sock_diag.h> + +static const struct sock_diag_handler *sock_diag_handlers[AF_MAX]; +static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); +static DEFINE_MUTEX(sock_diag_table_mutex); + +int sock_diag_check_cookie(void *sk, __u32 *cookie) +{ +	if ((cookie[0] != INET_DIAG_NOCOOKIE || +	     cookie[1] != INET_DIAG_NOCOOKIE) && +	    ((u32)(unsigned long)sk != cookie[0] || +	     (u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1])) +		return -ESTALE; +	else +		return 0; +} +EXPORT_SYMBOL_GPL(sock_diag_check_cookie); + +void sock_diag_save_cookie(void *sk, __u32 *cookie) +{ +	cookie[0] = (u32)(unsigned long)sk; +	cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); +} +EXPORT_SYMBOL_GPL(sock_diag_save_cookie); + +int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) +{ +	u32 mem[SK_MEMINFO_VARS]; + +	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); +	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; +	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); +	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; +	
mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; +	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; +	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); +	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + +	return nla_put(skb, attrtype, sizeof(mem), &mem); +} +EXPORT_SYMBOL_GPL(sock_diag_put_meminfo); + +int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, +			     struct sk_buff *skb, int attrtype) +{ +	struct sock_fprog_kern *fprog; +	struct sk_filter *filter; +	struct nlattr *attr; +	unsigned int flen; +	int err = 0; + +	if (!may_report_filterinfo) { +		nla_reserve(skb, attrtype, 0); +		return 0; +	} + +	rcu_read_lock(); +	filter = rcu_dereference(sk->sk_filter); +	if (!filter) +		goto out; + +	fprog = filter->orig_prog; +	flen = sk_filter_proglen(fprog); + +	attr = nla_reserve(skb, attrtype, flen); +	if (attr == NULL) { +		err = -EMSGSIZE; +		goto out; +	} + +	memcpy(nla_data(attr), fprog->filter, flen); +out: +	rcu_read_unlock(); +	return err; +} +EXPORT_SYMBOL(sock_diag_put_filterinfo); + +void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +{ +	mutex_lock(&sock_diag_table_mutex); +	inet_rcv_compat = fn; +	mutex_unlock(&sock_diag_table_mutex); +} +EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat); + +void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +{ +	mutex_lock(&sock_diag_table_mutex); +	inet_rcv_compat = NULL; +	mutex_unlock(&sock_diag_table_mutex); +} +EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat); + +int sock_diag_register(const struct sock_diag_handler *hndl) +{ +	int err = 0; + +	if (hndl->family >= AF_MAX) +		return -EINVAL; + +	mutex_lock(&sock_diag_table_mutex); +	if (sock_diag_handlers[hndl->family]) +		err = -EBUSY; +	else +		sock_diag_handlers[hndl->family] = hndl; +	mutex_unlock(&sock_diag_table_mutex); + +	return err; +} +EXPORT_SYMBOL_GPL(sock_diag_register); + +void sock_diag_unregister(const struct sock_diag_handler *hnld) +{ +	int family = hnld->family; + +	if (family >= AF_MAX) +		return; + +	mutex_lock(&sock_diag_table_mutex); +	BUG_ON(sock_diag_handlers[family] != hnld); +	sock_diag_handlers[family] = NULL; +	mutex_unlock(&sock_diag_table_mutex); +} +EXPORT_SYMBOL_GPL(sock_diag_unregister); + +static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ +	int err; +	struct sock_diag_req *req = nlmsg_data(nlh); +	const struct sock_diag_handler *hndl; + +	if (nlmsg_len(nlh) < sizeof(*req)) +		return -EINVAL; + +	if (req->sdiag_family >= AF_MAX) +		return -EINVAL; + +	if (sock_diag_handlers[req->sdiag_family] == NULL) +		request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, +				NETLINK_SOCK_DIAG, req->sdiag_family); + +	mutex_lock(&sock_diag_table_mutex); +	hndl = sock_diag_handlers[req->sdiag_family]; +	if (hndl == NULL) +		err = -ENOENT; +	else +		err = hndl->dump(skb, nlh); +	mutex_unlock(&sock_diag_table_mutex); + +	return err; +} + +static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ +	int ret; + +	switch (nlh->nlmsg_type) { +	case TCPDIAG_GETSOCK: +	case DCCPDIAG_GETSOCK: +		if (inet_rcv_compat == NULL) +			request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, +					NETLINK_SOCK_DIAG, AF_INET); + +		mutex_lock(&sock_diag_table_mutex); +		if (inet_rcv_compat != NULL) +			ret = inet_rcv_compat(skb, nlh); +		else +			ret = -EOPNOTSUPP; +		mutex_unlock(&sock_diag_table_mutex); + +		return ret; +	case SOCK_DIAG_BY_FAMILY: +		return __sock_diag_rcv_msg(skb, nlh); +	default: +		return -EINVAL; +	} +} + 
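
For reference, the SOCK_DIAG_BY_FAMILY path above is driven from user space by an ordinary netlink request. A minimal sketch (assuming the struct inet_diag_req_v2 layout from <linux/inet_diag.h>, which is outside this diff, and omitting most error handling) could look like this:

/* Dump all IPv4 TCP sockets through NETLINK_SOCK_DIAG.
 * sock_diag_rcv_msg() sees nlmsg_type == SOCK_DIAG_BY_FAMILY and
 * __sock_diag_rcv_msg() routes the request to the AF_INET handler;
 * requesting INET_DIAG_SKMEMINFO makes the handler attach the
 * sock_diag_put_meminfo() data shown above to each reply.
 */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <linux/inet_diag.h>
#include <netinet/in.h>

int main(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
	struct {
		struct nlmsghdr nlh;
		struct inet_diag_req_v2 req;
	} msg;

	memset(&msg, 0, sizeof(msg));
	msg.nlh.nlmsg_len = sizeof(msg);
	msg.nlh.nlmsg_type = SOCK_DIAG_BY_FAMILY;
	msg.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	msg.req.sdiag_family = AF_INET;
	msg.req.sdiag_protocol = IPPROTO_TCP;
	msg.req.idiag_states = ~0U;			/* sockets in any TCP state */
	msg.req.idiag_ext = 1 << (INET_DIAG_SKMEMINFO - 1);

	if (fd < 0 || send(fd, &msg, sizeof(msg), 0) < 0)
		return 1;
	/* replies (one netlink message per socket) would be read with recv() here */
	close(fd);
	return 0;
}
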
+static DEFINE_MUTEX(sock_diag_mutex); + +static void sock_diag_rcv(struct sk_buff *skb) +{ +	mutex_lock(&sock_diag_mutex); +	netlink_rcv_skb(skb, &sock_diag_rcv_msg); +	mutex_unlock(&sock_diag_mutex); +} + +static int __net_init diag_net_init(struct net *net) +{ +	struct netlink_kernel_cfg cfg = { +		.input	= sock_diag_rcv, +	}; + +	net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg); +	return net->diag_nlsk == NULL ? -ENOMEM : 0; +} + +static void __net_exit diag_net_exit(struct net *net) +{ +	netlink_kernel_release(net->diag_nlsk); +	net->diag_nlsk = NULL; +} + +static struct pernet_operations diag_net_ops = { +	.init = diag_net_init, +	.exit = diag_net_exit, +}; + +static int __init sock_diag_init(void) +{ +	return register_pernet_subsys(&diag_net_ops); +} + +static void __exit sock_diag_exit(void) +{ +	unregister_pernet_subsys(&diag_net_ops); +} + +module_init(sock_diag_init); +module_exit(sock_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG); diff --git a/net/core/stream.c b/net/core/stream.c index f5df85dcd20..301c05f2606 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -30,7 +30,7 @@ void sk_stream_write_space(struct sock *sk)  	struct socket *sock = sk->sk_socket;  	struct socket_wq *wq; -	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { +	if (sk_stream_is_writeable(sk) && sock) {  		clear_bit(SOCK_NOSPACE, &sock->flags);  		rcu_read_lock(); @@ -122,7 +122,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)  	DEFINE_WAIT(wait);  	if (sk_stream_memory_free(sk)) -		current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2; +		current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;  	while (1) {  		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 385b6095fdc..cf9cd13509a 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -14,17 +14,25 @@  #include <linux/vmalloc.h>  #include <linux/init.h>  #include <linux/slab.h> +#include <linux/kmemleak.h>  #include <net/ip.h>  #include <net/sock.h> +#include <net/net_ratelimit.h> +#include <net/busy_poll.h> +#include <net/pkt_sched.h> + +static int zero = 0; +static int one = 1; +static int ushort_max = USHRT_MAX;  #ifdef CONFIG_RPS -static int rps_sock_flow_sysctl(ctl_table *table, int write, +static int rps_sock_flow_sysctl(struct ctl_table *table, int write,  				void __user *buffer, size_t *lenp, loff_t *ppos)  {  	unsigned int orig_size, size;  	int ret, i; -	ctl_table tmp = { +	struct ctl_table tmp = {  		.data = &size,  		.maxlen = sizeof(size),  		.mode = table->mode @@ -67,8 +75,13 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,  		if (sock_table != orig_sock_table) {  			rcu_assign_pointer(rps_sock_flow_table, sock_table); -			synchronize_rcu(); -			vfree(orig_sock_table); +			if (sock_table) +				static_key_slow_inc(&rps_needed); +			if (orig_sock_table) { +				static_key_slow_dec(&rps_needed); +				synchronize_rcu(); +				vfree(orig_sock_table); +			}  		}  	} @@ -78,6 +91,130 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,  }  #endif /* CONFIG_RPS */ +#ifdef CONFIG_NET_FLOW_LIMIT +static DEFINE_MUTEX(flow_limit_update_mutex); + +static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, +				 void __user *buffer, size_t *lenp, +				 loff_t *ppos) +{ +	struct sd_flow_limit *cur; +	struct softnet_data *sd; +	cpumask_var_t mask; +	int i, len, ret = 0; + +	if (!alloc_cpumask_var(&mask, GFP_KERNEL)) +	
	return -ENOMEM; + +	if (write) { +		ret = cpumask_parse_user(buffer, *lenp, mask); +		if (ret) +			goto done; + +		mutex_lock(&flow_limit_update_mutex); +		len = sizeof(*cur) + netdev_flow_limit_table_len; +		for_each_possible_cpu(i) { +			sd = &per_cpu(softnet_data, i); +			cur = rcu_dereference_protected(sd->flow_limit, +				     lockdep_is_held(&flow_limit_update_mutex)); +			if (cur && !cpumask_test_cpu(i, mask)) { +				RCU_INIT_POINTER(sd->flow_limit, NULL); +				synchronize_rcu(); +				kfree(cur); +			} else if (!cur && cpumask_test_cpu(i, mask)) { +				cur = kzalloc_node(len, GFP_KERNEL, +						   cpu_to_node(i)); +				if (!cur) { +					/* not unwinding previous changes */ +					ret = -ENOMEM; +					goto write_unlock; +				} +				cur->num_buckets = netdev_flow_limit_table_len; +				rcu_assign_pointer(sd->flow_limit, cur); +			} +		} +write_unlock: +		mutex_unlock(&flow_limit_update_mutex); +	} else { +		char kbuf[128]; + +		if (*ppos || !*lenp) { +			*lenp = 0; +			goto done; +		} + +		cpumask_clear(mask); +		rcu_read_lock(); +		for_each_possible_cpu(i) { +			sd = &per_cpu(softnet_data, i); +			if (rcu_dereference(sd->flow_limit)) +				cpumask_set_cpu(i, mask); +		} +		rcu_read_unlock(); + +		len = min(sizeof(kbuf) - 1, *lenp); +		len = cpumask_scnprintf(kbuf, len, mask); +		if (!len) { +			*lenp = 0; +			goto done; +		} +		if (len < *lenp) +			kbuf[len++] = '\n'; +		if (copy_to_user(buffer, kbuf, len)) { +			ret = -EFAULT; +			goto done; +		} +		*lenp = len; +		*ppos += len; +	} + +done: +	free_cpumask_var(mask); +	return ret; +} + +static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, +				       void __user *buffer, size_t *lenp, +				       loff_t *ppos) +{ +	unsigned int old, *ptr; +	int ret; + +	mutex_lock(&flow_limit_update_mutex); + +	ptr = table->data; +	old = *ptr; +	ret = proc_dointvec(table, write, buffer, lenp, ppos); +	if (!ret && write && !is_power_of_2(*ptr)) { +		*ptr = old; +		ret = -EINVAL; +	} + +	mutex_unlock(&flow_limit_update_mutex); +	return ret; +} +#endif /* CONFIG_NET_FLOW_LIMIT */ + +#ifdef CONFIG_NET_SCHED +static int set_default_qdisc(struct ctl_table *table, int write, +			     void __user *buffer, size_t *lenp, loff_t *ppos) +{ +	char id[IFNAMSIZ]; +	struct ctl_table tbl = { +		.data = id, +		.maxlen = IFNAMSIZ, +	}; +	int ret; + +	qdisc_get_default(id, IFNAMSIZ); + +	ret = proc_dostring(&tbl, write, buffer, lenp, ppos); +	if (write && ret == 0) +		ret = qdisc_set_default(id); +	return ret; +} +#endif +  static struct ctl_table net_core_table[] = {  #ifdef CONFIG_NET  	{ @@ -85,28 +222,32 @@ static struct ctl_table net_core_table[] = {  		.data		= &sysctl_wmem_max,  		.maxlen		= sizeof(int),  		.mode		= 0644, -		.proc_handler	= proc_dointvec +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &one,  	},  	{  		.procname	= "rmem_max",  		.data		= &sysctl_rmem_max,  		.maxlen		= sizeof(int),  		.mode		= 0644, -		.proc_handler	= proc_dointvec +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &one,  	},  	{  		.procname	= "wmem_default",  		.data		= &sysctl_wmem_default,  		.maxlen		= sizeof(int),  		.mode		= 0644, -		.proc_handler	= proc_dointvec +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &one,  	},  	{  		.procname	= "rmem_default",  		.data		= &sysctl_rmem_default,  		.maxlen		= sizeof(int),  		.mode		= 0644, -		.proc_handler	= proc_dointvec +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &one,  	},  	{  		.procname	= "dev_weight", @@ -122,6 +263,15 @@ static struct ctl_table net_core_table[] = {  		
.mode		= 0644,  		.proc_handler	= proc_dointvec  	}, +#ifdef CONFIG_BPF_JIT +	{ +		.procname	= "bpf_jit_enable", +		.data		= &bpf_jit_enable, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec +	}, +#endif  	{  		.procname	= "netdev_tstamp_prequeue",  		.data		= &netdev_tstamp_prequeue, @@ -158,6 +308,44 @@ static struct ctl_table net_core_table[] = {  		.proc_handler	= rps_sock_flow_sysctl  	},  #endif +#ifdef CONFIG_NET_FLOW_LIMIT +	{ +		.procname	= "flow_limit_cpu_bitmap", +		.mode		= 0644, +		.proc_handler	= flow_limit_cpu_sysctl +	}, +	{ +		.procname	= "flow_limit_table_len", +		.data		= &netdev_flow_limit_table_len, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= flow_limit_table_len_sysctl +	}, +#endif /* CONFIG_NET_FLOW_LIMIT */ +#ifdef CONFIG_NET_RX_BUSY_POLL +	{ +		.procname	= "busy_poll", +		.data		= &sysctl_net_busy_poll, +		.maxlen		= sizeof(unsigned int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec +	}, +	{ +		.procname	= "busy_read", +		.data		= &sysctl_net_busy_read, +		.maxlen		= sizeof(unsigned int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec +	}, +#endif +#ifdef CONFIG_NET_SCHED +	{ +		.procname	= "default_qdisc", +		.mode		= 0644, +		.maxlen		= IFNAMSIZ, +		.proc_handler	= set_default_qdisc +	}, +#endif  #endif /* CONFIG_NET */  	{  		.procname	= "netdev_budget", @@ -182,17 +370,13 @@ static struct ctl_table netns_core_table[] = {  		.data		= &init_net.core.sysctl_somaxconn,  		.maxlen		= sizeof(int),  		.mode		= 0644, -		.proc_handler	= proc_dointvec +		.extra1		= &zero, +		.extra2		= &ushort_max, +		.proc_handler	= proc_dointvec_minmax  	},  	{ }  }; -__net_initdata struct ctl_path net_core_path[] = { -	{ .procname = "net", }, -	{ .procname = "core", }, -	{ }, -}; -  static __net_init int sysctl_core_net_init(struct net *net)  {  	struct ctl_table *tbl; @@ -206,10 +390,14 @@ static __net_init int sysctl_core_net_init(struct net *net)  			goto err_dup;  		tbl[0].data = &net->core.sysctl_somaxconn; + +		/* Don't export any sysctls to unprivileged users */ +		if (net->user_ns != &init_user_ns) { +			tbl[0].procname = NULL; +		}  	} -	net->core.sysctl_hdr = register_net_sysctl_table(net, -			net_core_path, tbl); +	net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl);  	if (net->core.sysctl_hdr == NULL)  		goto err_reg; @@ -239,10 +427,7 @@ static __net_initdata struct pernet_operations sysctl_core_ops = {  static __init int sysctl_core_init(void)  { -	static struct ctl_table empty[1]; - -	register_sysctl_paths(net_core_path, empty); -	register_net_sysctl_rotable(net_core_path, net_core_table); +	register_net_sysctl(&init_net, "net/core", net_core_table);  	return register_pernet_subsys(&sysctl_core_ops);  } diff --git a/net/core/timestamping.c b/net/core/timestamping.c index b124d28ff1c..6521dfd8b7c 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -21,17 +21,13 @@  #include <linux/phy.h>  #include <linux/ptp_classify.h>  #include <linux/skbuff.h> - -static struct sock_filter ptp_filter[] = { -	PTP_FILTER -}; +#include <linux/export.h>  static unsigned int classify(const struct sk_buff *skb)  { -	if (likely(skb->dev && -		   skb->dev->phydev && +	if (likely(skb->dev && skb->dev->phydev &&  		   skb->dev->phydev->drv)) -		return sk_run_filter(skb, ptp_filter); +		return ptp_classify_raw(skb);  	else  		return PTP_CLASS_NONE;  } @@ -57,9 +53,15 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)  	case PTP_CLASS_V2_VLAN:  		phydev = skb->dev->phydev;  		if 
(likely(phydev->drv->txtstamp)) { +			if (!atomic_inc_not_zero(&sk->sk_refcnt)) +				return; +  			clone = skb_clone(skb, GFP_ATOMIC); -			if (!clone) +			if (!clone) { +				sock_put(sk);  				return; +			} +  			clone->sk = sk;  			phydev->drv->txtstamp(phydev, clone, type);  		} @@ -68,6 +70,7 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)  		break;  	}  } +EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);  void skb_complete_tx_timestamp(struct sk_buff *skb,  			       struct skb_shared_hwtstamps *hwtstamps) @@ -76,16 +79,23 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,  	struct sock_exterr_skb *serr;  	int err; -	if (!hwtstamps) +	if (!hwtstamps) { +		sock_put(sk); +		kfree_skb(skb);  		return; +	}  	*skb_hwtstamps(skb) = *hwtstamps; +  	serr = SKB_EXT_ERR(skb);  	memset(serr, 0, sizeof(*serr));  	serr->ee.ee_errno = ENOMSG;  	serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;  	skb->sk = NULL; +  	err = sock_queue_err_skb(sk, skb); + +	sock_put(sk);  	if (err)  		kfree_skb(skb);  } @@ -96,11 +106,13 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)  	struct phy_device *phydev;  	unsigned int type; -	skb_push(skb, ETH_HLEN); +	if (skb_headroom(skb) < ETH_HLEN) +		return false; +	__skb_push(skb, ETH_HLEN);  	type = classify(skb); -	skb_pull(skb, ETH_HLEN); +	__skb_pull(skb, ETH_HLEN);  	switch (type) {  	case PTP_CLASS_V1_IPV4: @@ -119,8 +131,4 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)  	return false;  } - -void __init skb_timestamping_init(void) -{ -	BUG_ON(sk_chk_filter(ptp_filter, ARRAY_SIZE(ptp_filter))); -} +EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp); diff --git a/net/core/tso.c b/net/core/tso.c new file mode 100644 index 00000000000..8c3203c585b --- /dev/null +++ b/net/core/tso.c @@ -0,0 +1,77 @@ +#include <linux/export.h> +#include <net/ip.h> +#include <net/tso.h> + +/* Calculate expected number of TX descriptors */ +int tso_count_descs(struct sk_buff *skb) +{ +	/* The Marvell Way */ +	return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags; +} +EXPORT_SYMBOL(tso_count_descs); + +void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso, +		   int size, bool is_last) +{ +	struct iphdr *iph; +	struct tcphdr *tcph; +	int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); +	int mac_hdr_len = skb_network_offset(skb); + +	memcpy(hdr, skb->data, hdr_len); +	iph = (struct iphdr *)(hdr + mac_hdr_len); +	iph->id = htons(tso->ip_id); +	iph->tot_len = htons(size + hdr_len - mac_hdr_len); +	tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb)); +	tcph->seq = htonl(tso->tcp_seq); +	tso->ip_id++; + +	if (!is_last) { +		/* Clear all special flags for not last packet */ +		tcph->psh = 0; +		tcph->fin = 0; +		tcph->rst = 0; +	} +} +EXPORT_SYMBOL(tso_build_hdr); + +void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size) +{ +	tso->tcp_seq += size; +	tso->size -= size; +	tso->data += size; + +	if ((tso->size == 0) && +	    (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) { +		skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; + +		/* Move to next segment */ +		tso->size = frag->size; +		tso->data = page_address(frag->page.p) + frag->page_offset; +		tso->next_frag_idx++; +	} +} +EXPORT_SYMBOL(tso_build_data); + +void tso_start(struct sk_buff *skb, struct tso_t *tso) +{ +	int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + +	tso->ip_id = ntohs(ip_hdr(skb)->id); +	tso->tcp_seq = ntohl(tcp_hdr(skb)->seq); +	tso->next_frag_idx = 0; + +	/* Build first data */ +	tso->size = skb_headlen(skb) - hdr_len; +	tso->data = skb->data + 
hdr_len; +	if ((tso->size == 0) && +	    (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) { +		skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; + +		/* Move to next segment */ +		tso->size = frag->size; +		tso->data = page_address(frag->page.p) + frag->page_offset; +		tso->next_frag_idx++; +	} +} +EXPORT_SYMBOL(tso_start); diff --git a/net/core/user_dma.c b/net/core/user_dma.c index 25d717ebc92..1b5fefdb819 100644 --- a/net/core/user_dma.c +++ b/net/core/user_dma.c @@ -27,6 +27,7 @@  #include <linux/dmaengine.h>  #include <linux/socket.h> +#include <linux/export.h>  #include <net/tcp.h>  #include <net/netdma.h> @@ -71,14 +72,14 @@ int dma_skb_copy_datagram_iovec(struct dma_chan *chan,  	/* Copy paged appendix. Hmm... why does this look so complicated? */  	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {  		int end; +		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];  		WARN_ON(start > offset + len); -		end = start + skb_shinfo(skb)->frags[i].size; +		end = start + skb_frag_size(frag);  		copy = end - offset;  		if (copy > 0) { -			skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; -			struct page *page = frag->page; +			struct page *page = skb_frag_page(frag);  			if (copy > len)  				copy = len; diff --git a/net/core/utils.c b/net/core/utils.c index 5fea0ab2190..eed34338736 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -17,6 +17,7 @@  #include <linux/module.h>  #include <linux/jiffies.h>  #include <linux/kernel.h> +#include <linux/ctype.h>  #include <linux/inet.h>  #include <linux/mm.h>  #include <linux/net.h> @@ -27,9 +28,9 @@  #include <linux/ratelimit.h>  #include <net/sock.h> +#include <net/net_ratelimit.h>  #include <asm/byteorder.h> -#include <asm/system.h>  #include <asm/uaccess.h>  int net_msg_warn __read_mostly = 1; @@ -58,14 +59,11 @@ __be32 in_aton(const char *str)  	int i;  	l = 0; -	for (i = 0; i < 4; i++) -	{ +	for (i = 0; i < 4; i++)	{  		l <<= 8; -		if (*str != '\0') -		{ +		if (*str != '\0') {  			val = 0; -			while (*str != '\0' && *str != '.' && *str != '\n') -			{ +			while (*str != '\0' && *str != '.' && *str != '\n') {  				val *= 10;  				val += *str - '0';  				str++; @@ -110,6 +108,18 @@ static inline int xdigit2bin(char c, int delim)  	return IN6PTON_UNKNOWN;  } +/** + * in4_pton - convert an IPv4 address from literal to binary representation + * @src: the start of the IPv4 address string + * @srclen: the length of the string, -1 means strlen(src) + * @dst: the binary (u8[4] array) representation of the IPv4 address + * @delim: the delimiter of the IPv4 address in @src, -1 means no delimiter + * @end: A pointer to the end of the parsed string will be placed here + * + * Return one on success, return zero when any error occurs + * and @end will point to the end of the parsed string. + * + */  int in4_pton(const char *src, int srclen,  	     u8 *dst,  	     int delim, const char **end) @@ -164,6 +174,18 @@ out:  }  EXPORT_SYMBOL(in4_pton); +/** + * in6_pton - convert an IPv6 address from literal to binary representation + * @src: the start of the IPv6 address string + * @srclen: the length of the string, -1 means strlen(src) + * @dst: the binary (u8[16] array) representation of the IPv6 address + * @delim: the delimiter of the IPv6 address in @src, -1 means no delimiter + * @end: A pointer to the end of the parsed string will be placed here + * + * Return one on success, return zero when any error occurs + * and @end will point to the end of the parsed string. 
+ * + */  int in6_pton(const char *src, int srclen,  	     u8 *dst,  	     int delim, const char **end) @@ -296,3 +318,72 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,  				csum_unfold(*sum)));  }  EXPORT_SYMBOL(inet_proto_csum_replace4); + +void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, +			       const __be32 *from, const __be32 *to, +			       int pseudohdr) +{ +	__be32 diff[] = { +		~from[0], ~from[1], ~from[2], ~from[3], +		to[0], to[1], to[2], to[3], +	}; +	if (skb->ip_summed != CHECKSUM_PARTIAL) { +		*sum = csum_fold(csum_partial(diff, sizeof(diff), +				 ~csum_unfold(*sum))); +		if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) +			skb->csum = ~csum_partial(diff, sizeof(diff), +						  ~skb->csum); +	} else if (pseudohdr) +		*sum = ~csum_fold(csum_partial(diff, sizeof(diff), +				  csum_unfold(*sum))); +} +EXPORT_SYMBOL(inet_proto_csum_replace16); + +struct __net_random_once_work { +	struct work_struct work; +	struct static_key *key; +}; + +static void __net_random_once_deferred(struct work_struct *w) +{ +	struct __net_random_once_work *work = +		container_of(w, struct __net_random_once_work, work); +	BUG_ON(!static_key_enabled(work->key)); +	static_key_slow_dec(work->key); +	kfree(work); +} + +static void __net_random_once_disable_jump(struct static_key *key) +{ +	struct __net_random_once_work *w; + +	w = kmalloc(sizeof(*w), GFP_ATOMIC); +	if (!w) +		return; + +	INIT_WORK(&w->work, __net_random_once_deferred); +	w->key = key; +	schedule_work(&w->work); +} + +bool __net_get_random_once(void *buf, int nbytes, bool *done, +			   struct static_key *once_key) +{ +	static DEFINE_SPINLOCK(lock); +	unsigned long flags; + +	spin_lock_irqsave(&lock, flags); +	if (*done) { +		spin_unlock_irqrestore(&lock, flags); +		return false; +	} + +	get_random_bytes(buf, nbytes); +	*done = true; +	spin_unlock_irqrestore(&lock, flags); + +	__net_random_once_disable_jump(once_key); + +	return true; +} +EXPORT_SYMBOL(__net_get_random_once);  | 
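
The __net_get_random_once() helper added above is normally reached through the net_get_random_once() wrapper macro (declared in include/linux/net.h, not part of this diff), which supplies the per-call-site done flag and static key. A hypothetical caller, lazily seeding a hash secret on first use, would look roughly like the sketch below; example_hashrnd and example_hash are illustrative names only.

/* Illustrative sketch: the secret is generated once per boot, the first
 * time the hash runs.  After __net_random_once_deferred() has done its
 * static_key_slow_dec(), the static-key branch inside the macro is
 * patched out and later calls skip the initialization entirely.
 */
#include <linux/types.h>
#include <linux/jhash.h>
#include <linux/net.h>

static u32 example_hashrnd __read_mostly;

static u32 example_hash(__be32 saddr, __be32 daddr)
{
	net_get_random_once(&example_hashrnd, sizeof(example_hashrnd));

	return jhash_2words((__force u32)saddr, (__force u32)daddr,
			    example_hashrnd);
}
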
