Diffstat (limited to 'drivers/net/tun.c')
-rw-r--r--	drivers/net/tun.c	192
1 file changed, 122 insertions(+), 70 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 7cb105c103f..98bad1fb1bf 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -69,6 +69,7 @@
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 #include <net/sock.h>
+#include <linux/seq_file.h>
 
 #include <asm/uaccess.h>
 
@@ -110,7 +111,7 @@ struct tap_filter {
 	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
 };
 
-/* DEFAULT_MAX_NUM_RSS_QUEUES were choosed to let the rx/tx queues allocated for
+/* DEFAULT_MAX_NUM_RSS_QUEUES were chosen to let the rx/tx queues allocated for
  * the netdevice to be fit in one page. So we can make sure the success of
  * memory allocation. TODO: increase the limit. */
 #define MAX_TAP_QUEUES DEFAULT_MAX_NUM_RSS_QUEUES
@@ -119,7 +120,7 @@ struct tap_filter {
 #define TUN_FLOW_EXPIRE (3 * HZ)
 
 /* A tun_file connects an open character device to a tuntap netdevice. It
- * also contains all socket related strctures (except sock_fprog and tap_filter)
+ * also contains all socket related structures (except sock_fprog and tap_filter)
  * to serve as one transmit queue for tuntap device. The sock_fprog and
  * tap_filter were kept in tun_struct since they were used for filtering for the
  * netdevice not for a specific queue (at least I didn't see the requirement for
@@ -152,6 +153,7 @@
 	struct tun_struct *tun;
 
 	u32 rxhash;
+	u32 rps_rxhash;
 	int queue_index;
 	unsigned long updated;
 };
@@ -220,6 +222,7 @@ static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun,
 			  rxhash, queue_index);
 		e->updated = jiffies;
 		e->rxhash = rxhash;
+		e->rps_rxhash = 0;
 		e->queue_index = queue_index;
 		e->tun = tun;
 		hlist_add_head_rcu(&e->hash_link, head);
@@ -232,6 +235,7 @@ static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
 {
 	tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
 		  e->rxhash, e->queue_index);
+	sock_rps_reset_flow_hash(e->rps_rxhash);
 	hlist_del_rcu(&e->hash_link);
 	kfree_rcu(e, rcu);
 	--tun->flow_count;
@@ -325,6 +329,7 @@ static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
 		/* TODO: keep queueing to old queue until it's empty? */
 		e->queue_index = queue_index;
 		e->updated = jiffies;
+		sock_rps_record_flow_hash(e->rps_rxhash);
 	} else {
 		spin_lock_bh(&tun->lock);
 		if (!tun_flow_find(head, rxhash) &&
@@ -341,14 +346,27 @@ unlock:
 	rcu_read_unlock();
 }
 
+/**
+ * Save the hash received in the stack receive path and update the
+ * flow_hash table accordingly.
+ */
+static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
+{
+	if (unlikely(e->rps_rxhash != hash)) {
+		sock_rps_reset_flow_hash(e->rps_rxhash);
+		e->rps_rxhash = hash;
+	}
+}
+
 /* We try to identify a flow through its rxhash first. The reason that
- * we do not check rxq no. is becuase some cards(e.g 82599), chooses
+ * we do not check rxq no. is because some cards(e.g 82599), chooses
  * the rxq based on the txq where the last packet of the flow comes. As
  * the userspace application move between processors, we may get a
  * different rxq no. here. If we could not get rxhash, then we would
  * hope the rxq no. may help here.
  */
-static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
+static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
+			    void *accel_priv, select_queue_fallback_t fallback)
 {
 	struct tun_struct *tun = netdev_priv(dev);
 	struct tun_flow_entry *e;
@@ -358,12 +376,13 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
 	rcu_read_lock();
 	numqueues = ACCESS_ONCE(tun->numqueues);
 
-	txq = skb_get_rxhash(skb);
+	txq = skb_get_hash(skb);
 	if (txq) {
 		e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
-		if (e)
+		if (e) {
+			tun_flow_save_rps_rxhash(e, txq);
 			txq = e->queue_index;
-		else
+		} else
 			/* use multiply and shift instead of expensive divide */
 			txq = ((u64)txq * numqueues) >> 32;
 	} else if (likely(skb_rx_queue_recorded(skb))) {
@@ -433,7 +452,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 
 		--tun->numqueues;
 		if (clean) {
-			rcu_assign_pointer(tfile->tun, NULL);
+			RCU_INIT_POINTER(tfile->tun, NULL);
 			sock_put(&tfile->sk);
 		} else
 			tun_disable_queue(tun, tfile);
@@ -479,13 +498,13 @@ static void tun_detach_all(struct net_device *dev)
 	for (i = 0; i < n; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
 		BUG_ON(!tfile);
-		wake_up_all(&tfile->wq.wait);
-		rcu_assign_pointer(tfile->tun, NULL);
+		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
+		RCU_INIT_POINTER(tfile->tun, NULL);
 		--tun->numqueues;
 	}
 	list_for_each_entry(tfile, &tun->disabled, next) {
-		wake_up_all(&tfile->wq.wait);
-		rcu_assign_pointer(tfile->tun, NULL);
+		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
+		RCU_INIT_POINTER(tfile->tun, NULL);
 	}
 	BUG_ON(tun->numqueues != 0);
 
@@ -531,7 +550,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
 
 	err = 0;
 
-	/* Re-attach the filter to presist device */
+	/* Re-attach the filter to persist device */
 	if (!skip_filter && (tun->filter_attached == true)) {
 		err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
 		if (!err)
@@ -720,14 +739,32 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct tun_struct *tun = netdev_priv(dev);
 	int txq = skb->queue_mapping;
 	struct tun_file *tfile;
+	u32 numqueues = 0;
 
 	rcu_read_lock();
 	tfile = rcu_dereference(tun->tfiles[txq]);
+	numqueues = ACCESS_ONCE(tun->numqueues);
 
 	/* Drop packet if interface is not attached */
-	if (txq >= tun->numqueues)
+	if (txq >= numqueues)
 		goto drop;
 
+	if (numqueues == 1) {
+		/* Select queue was not called for the skbuff, so we extract the
+		 * RPS hash and save it into the flow_table here.
+		 */
+		__u32 rxhash;
+
+		rxhash = skb_get_hash(skb);
+		if (rxhash) {
+			struct tun_flow_entry *e;
+			e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)],
+					rxhash);
+			if (e)
+				tun_flow_save_rps_rxhash(e, rxhash);
+		}
+	}
+
 	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
 
 	BUG_ON(!tfile);
@@ -745,8 +782,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Limit the number of packets queued by dividing txq length with the
 	 * number of queues.
 	 */
-	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
-			  >= dev->tx_queue_len / tun->numqueues)
+	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) * numqueues
+			  >= dev->tx_queue_len)
 		goto drop;
 
 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
@@ -770,8 +807,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Notify and wake up reader process */
 	if (tfile->flags & TUN_FASYNC)
 		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
-	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
-				   POLLRDNORM | POLLRDBAND);
+	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
 
 	rcu_read_unlock();
 	return NETDEV_TX_OK;
@@ -819,9 +855,9 @@ static void tun_poll_controller(struct net_device *dev)
 	 * Tun only receives frames when:
 	 * 1) the char device endpoint gets data from user space
 	 * 2) the tun socket gets a sendmsg call from user space
-	 * Since both of those are syncronous operations, we are guaranteed
+	 * Since both of those are synchronous operations, we are guaranteed
	 * never to have pending data when we poll for it
-	 * so theres nothing to do here but return.
+	 * so there is nothing to do here but return.
 	 * We need this though so netpoll recognizes us as an interface that
 	 * supports polling, which enables bridge devices in virt setups to
 	 * still use netconsole
@@ -928,7 +964,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
 
 	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
 
-	poll_wait(file, &tfile->wq.wait, wait);
+	poll_wait(file, sk_sleep(sk), wait);
 
 	if (!skb_queue_empty(&sk->sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
@@ -981,6 +1017,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	struct sk_buff *skb;
 	size_t len = total_len, align = NET_SKB_PAD, linear;
 	struct virtio_net_hdr gso = { 0 };
+	int good_linear;
 	int offset = 0;
 	int copylen;
 	bool zerocopy = false;
@@ -1021,12 +1058,16 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			return -EINVAL;
 	}
 
+	good_linear = SKB_MAX_HEAD(align);
+
 	if (msg_control) {
 		/* There are 256 bytes to be copied in skb, so there is
 		 * enough room for skb expand head in case it is used.
 		 * The rest of the buffer is mapped from userspace.
 		 */
 		copylen = gso.hdr_len ? gso.hdr_len : GOODCOPY_LEN;
+		if (copylen > good_linear)
+			copylen = good_linear;
 		linear = copylen;
 		if (iov_pages(iv, offset + copylen, count) <= MAX_SKB_FRAGS)
 			zerocopy = true;
@@ -1034,7 +1075,10 @@
 
 	if (!zerocopy) {
 		copylen = len;
-		linear = gso.hdr_len;
+		if (gso.hdr_len > good_linear)
+			linear = good_linear;
+		else
+			linear = gso.hdr_len;
 	}
 
 	skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
@@ -1138,7 +1182,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	skb_reset_network_header(skb);
 	skb_probe_transport_header(skb, 0);
 
-	rxhash = skb_get_rxhash(skb);
+	rxhash = skb_get_hash(skb);
 	netif_rx_ni(skb);
 
 	tun->dev->stats.rx_packets++;
@@ -1176,7 +1220,7 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 {
 	struct tun_pi pi = { 0, skb->protocol };
 	ssize_t total = 0;
-	int vlan_offset = 0;
+	int vlan_offset = 0, copied;
 
 	if (!(tun->flags & TUN_NO_PI)) {
 		if ((len -= sizeof(pi)) < 0)
@@ -1240,6 +1284,8 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 		total += tun->vnet_hdr_sz;
 	}
 
+	copied = total;
+	total += skb->len;
 	if (!vlan_tx_tag_present(skb)) {
 		len = min_t(int, skb->len, len);
 	} else {
@@ -1254,24 +1300,24 @@
 
 		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
 		len = min_t(int, skb->len + VLAN_HLEN, len);
+		total += VLAN_HLEN;
 
 		copy = min_t(int, vlan_offset, len);
-		ret = skb_copy_datagram_const_iovec(skb, 0, iv, total, copy);
+		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
 		len -= copy;
-		total += copy;
+		copied += copy;
 		if (ret || !len)
 			goto done;
 
 		copy = min_t(int, sizeof(veth), len);
-		ret = memcpy_toiovecend(iv, (void *)&veth, total, copy);
+		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
 		len -= copy;
-		total += copy;
+		copied += copy;
 		if (ret || !len)
 			goto done;
 	}
 
-	skb_copy_datagram_const_iovec(skb, vlan_offset, iv, total, len);
-	total += len;
+	skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
 
 done:
 	tun->dev->stats.tx_packets++;
@@ -1281,50 +1327,28 @@ done:
 }
 
 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
-			   struct kiocb *iocb, const struct iovec *iv,
-			   ssize_t len, int noblock)
+			   const struct iovec *iv, ssize_t len, int noblock)
 {
-	DECLARE_WAITQUEUE(wait, current);
 	struct sk_buff *skb;
 	ssize_t ret = 0;
+	int peeked, err, off = 0;
 
 	tun_debug(KERN_INFO, tun, "tun_do_read\n");
 
-	if (unlikely(!noblock))
-		add_wait_queue(&tfile->wq.wait, &wait);
-	while (len) {
-		if (unlikely(!noblock))
-			current->state = TASK_INTERRUPTIBLE;
+	if (!len)
+		return ret;
 
-		/* Read frames from the queue */
-		if (!(skb = skb_dequeue(&tfile->socket.sk->sk_receive_queue))) {
-			if (noblock) {
-				ret = -EAGAIN;
-				break;
-			}
-			if (signal_pending(current)) {
-				ret = -ERESTARTSYS;
-				break;
-			}
-			if (tun->dev->reg_state != NETREG_REGISTERED) {
-				ret = -EIO;
-				break;
-			}
-
-			/* Nothing to read, let's sleep */
-			schedule();
-			continue;
-		}
+	if (tun->dev->reg_state != NETREG_REGISTERED)
+		return -EIO;
 
+	/* Read frames from queue */
+	skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
+				  &peeked, &off, &err);
+	if (skb) {
 		ret = tun_put_user(tun, tfile, skb, iv, len);
 		kfree_skb(skb);
-		break;
-	}
-
-	if (unlikely(!noblock)) {
-		current->state = TASK_RUNNING;
-		remove_wait_queue(&tfile->wq.wait, &wait);
-	}
+	} else
+		ret = err;
 
 	return ret;
 }
@@ -1345,9 +1369,11 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
 		goto out;
 	}
 
-	ret = tun_do_read(tun, tfile, iocb, iv, len,
+	ret = tun_do_read(tun, tfile, iv, len,
 			  file->f_flags & O_NONBLOCK);
 	ret = min_t(ssize_t, ret, len);
+	if (ret > 0)
+		iocb->ki_pos = ret;
 out:
 	tun_put(tun);
 	return ret;
@@ -1444,7 +1470,7 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
 					 SOL_PACKET, TUN_TX_TIMESTAMP);
 		goto out;
 	}
-	ret = tun_do_read(tun, tfile, iocb, m->msg_iov, total_len,
+	ret = tun_do_read(tun, tfile, m->msg_iov, total_len,
 			  flags & MSG_DONTWAIT);
 	if (ret > total_len) {
 		m->msg_flags |= MSG_TRUNC;
@@ -1638,7 +1664,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 				   TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
 				   NETIF_F_HW_VLAN_STAG_TX;
 		dev->features = dev->hw_features;
-		dev->vlan_features = dev->features;
+		dev->vlan_features = dev->features &
+				     ~(NETIF_F_HW_VLAN_CTAG_TX |
+				       NETIF_F_HW_VLAN_STAG_TX);
 
 		INIT_LIST_HEAD(&tun->disabled);
 		err = tun_attach(tun, file, false);
@@ -2144,13 +2172,13 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 					    &tun_proto);
 	if (!tfile)
 		return -ENOMEM;
-	rcu_assign_pointer(tfile->tun, NULL);
+	RCU_INIT_POINTER(tfile->tun, NULL);
 	tfile->net = get_net(current->nsproxy->net_ns);
 	tfile->flags = 0;
 	tfile->ifindex = 0;
 
-	rcu_assign_pointer(tfile->socket.wq, &tfile->wq);
 	init_waitqueue_head(&tfile->wq.wait);
+	RCU_INIT_POINTER(tfile->socket.wq, &tfile->wq);
 
 	tfile->socket.file = file;
 	tfile->socket.ops = &tun_socket_ops;
@@ -2181,6 +2209,27 @@ static int tun_chr_close(struct inode *inode, struct file *file)
 	return 0;
 }
 
+#ifdef CONFIG_PROC_FS
+static int tun_chr_show_fdinfo(struct seq_file *m, struct file *f)
+{
+	struct tun_struct *tun;
+	struct ifreq ifr;
+
+	memset(&ifr, 0, sizeof(ifr));
+
+	rtnl_lock();
+	tun = tun_get(f);
+	if (tun)
+		tun_get_iff(current->nsproxy->net_ns, tun, &ifr);
+	rtnl_unlock();
+
+	if (tun)
+		tun_put(tun);
+
+	return seq_printf(m, "iff:\t%s\n", ifr.ifr_name);
+}
+#endif
+
 static const struct file_operations tun_fops = {
 	.owner	= THIS_MODULE,
 	.llseek = no_llseek,
@@ -2195,7 +2244,10 @@
 #endif
 	.open	= tun_chr_open,
 	.release = tun_chr_close,
-	.fasync = tun_chr_fasync
+	.fasync = tun_chr_fasync,
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo = tun_chr_show_fdinfo,
+#endif
 };
 
 static struct miscdevice tun_miscdev = {
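
The rps_rxhash bookkeeping added above caches, per flow entry, the last hash pushed into the RFS flow table, so sock_rps_reset_flow_hash() is only issued when the hash actually changes. The following user-space sketch mirrors that pattern under stated assumptions: it is illustrative only, sock_rps_reset_flow_hash() is a stub standing in for the kernel helper, and the hash values are arbitrary.

/* Sketch of the tun_flow_save_rps_rxhash() pattern: touch the global
 * RFS state only when the cached hash differs from the new one. */
#include <stdint.h>
#include <stdio.h>

static void sock_rps_reset_flow_hash(uint32_t hash)	/* stub */
{
	printf("reset RFS entry for hash 0x%08x\n", (unsigned)hash);
}

struct flow_entry {
	uint32_t rps_rxhash;	/* last hash recorded with RFS */
};

static void flow_save_rps_rxhash(struct flow_entry *e, uint32_t hash)
{
	if (e->rps_rxhash != hash) {	/* skip the common, unchanged case */
		sock_rps_reset_flow_hash(e->rps_rxhash);
		e->rps_rxhash = hash;
	}
}

int main(void)
{
	struct flow_entry e = { .rps_rxhash = 0 };

	flow_save_rps_rxhash(&e, 0xabcd1234);	/* resets stale hash 0, stores new */
	flow_save_rps_rxhash(&e, 0xabcd1234);	/* no-op: hash unchanged */
	return 0;
}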
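The tun_select_queue() hunk keeps the comment "use multiply and shift instead of expensive divide": ((u64)txq * numqueues) >> 32 treats the 32-bit hash as a fraction of 2^32 and scales it onto [0, numqueues), so no modulo is needed. A minimal user-space sketch of the arithmetic, with arbitrary example hashes:

/* Demonstrates that ((u64)hash * numqueues) >> 32 always lands in
 * [0, numqueues): even hash 0xffffffff maps to numqueues - 1. */
#include <stdint.h>
#include <stdio.h>

static uint16_t pick_queue(uint32_t hash, uint32_t numqueues)
{
	return (uint16_t)(((uint64_t)hash * numqueues) >> 32);
}

int main(void)
{
	uint32_t hashes[] = { 0x00000000u, 0x7fffffffu, 0xdeadbeefu, 0xffffffffu };
	uint32_t numqueues = 8;

	for (unsigned i = 0; i < sizeof(hashes) / sizeof(hashes[0]); i++)
		printf("hash 0x%08x -> queue %u\n",
		       (unsigned)hashes[i],
		       (unsigned)pick_queue(hashes[i], numqueues));
	return 0;
}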
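The tun_net_xmit() backlog check is rewritten from "qlen >= tx_queue_len / numqueues" to "qlen * numqueues >= tx_queue_len", dropping the divide from the hot path and using the cached numqueues instead of re-reading tun->numqueues. A sketch of the two forms with illustrative numbers; the widened multiply is an assumption to avoid 32-bit overflow, and nonzero numqueues is assumed for the old form:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool over_limit_old(uint32_t qlen, uint32_t txqlen, uint32_t nq)
{
	return qlen >= txqlen / nq;		/* divide, truncates */
}

static bool over_limit_new(uint32_t qlen, uint32_t txqlen, uint32_t nq)
{
	return (uint64_t)qlen * nq >= txqlen;	/* multiply only */
}

int main(void)
{
	/* tx_queue_len = 500, 8 queues: the old check drops at qlen 62
	 * (500 / 8 truncates to 62); the new one drops at 63 (63 * 8 = 504). */
	printf("qlen 62: old %d new %d\n",
	       over_limit_old(62, 500, 8), over_limit_new(62, 500, 8));
	return 0;
}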
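With the new show_fdinfo hook, /proc/<pid>/fdinfo/<fd> for a tun file descriptor gains an "iff:" line naming the attached interface. A user-space sketch of reading it back after TUNSETIFF; it needs CAP_NET_ADMIN, and the device name "tun-demo" is an arbitrary example, not anything from the patch:

#include <fcntl.h>
#include <linux/if.h>
#include <linux/if_tun.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct ifreq ifr;
	char path[64], line[128];
	FILE *f;
	int fd = open("/dev/net/tun", O_RDWR);

	if (fd < 0)
		return 1;

	/* Attach the fd to a tun device; this is what fills in the name
	 * that show_fdinfo later reports. */
	memset(&ifr, 0, sizeof(ifr));
	ifr.ifr_flags = IFF_TUN | IFF_NO_PI;
	strncpy(ifr.ifr_name, "tun-demo", IFNAMSIZ - 1);
	if (ioctl(fd, TUNSETIFF, &ifr) < 0)
		return 1;

	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
	f = fopen(path, "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "iff:", 4))
			fputs(line, stdout);	/* expected: "iff:\ttun-demo" */
	fclose(f);
	close(fd);
	return 0;
}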
