path: root/drivers/net/tun.c
author     Linus Torvalds <torvalds@linux-foundation.org>  2012-07-24 10:01:50 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-07-24 10:01:50 -0700
commit     3c4cfadef6a1665d9cd02a543782d03d3e6740c6 (patch)
tree       3df72faaacd494d5ac8c9668df4f529b1b5e4457 /drivers/net/tun.c
parent     e017507f37d5cb8b541df165a824958bc333bec3 (diff)
parent     320f5ea0cedc08ef65d67e056bcb9d181386ef2c (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking changes from David S Miller:

 1) Remove the ipv4 routing cache. Now lookups go directly into the FIB trie and use prebuilt routes cached there. No more garbage collection, no more rDOS attacks on the routing cache. Instead we now get predictable and consistent performance, no matter what the pattern of traffic we service.

    This has been almost 2 years in the making. Special thanks to Julian Anastasov, Eric Dumazet, Steffen Klassert, and others who have helped along the way. I'm sure that with a change of this magnitude there will be some kind of fallout, but such things ought to be simple to fix at this point. Luckily I'm not European so I'll be around all of August to fix things :-)

    The major stages of this work here are each fronted by a forced merge commit whose commit message contains a top-level description of the motivations and implementation issues.

 2) Pre-demux of established ipv4 TCP sockets, saves a route demux on input.

 3) TCP SYN/ACK performance tweaks from Eric Dumazet.

 4) Add namespace support for netfilter L4 conntrack helpers, from Gao Feng.

 5) Add config mechanism for Energy Efficient Ethernet to ethtool, from Yuval Mintz.

 6) Remove quadratic behavior from /proc/net/unix, from Eric Dumazet.

 7) Support for connection tracker helpers in userspace, from Pablo Neira Ayuso.

 8) Allow userspace driven TX load balancing functions in TEAM driver, from Jiri Pirko.

 9) Kill off NLMSG_PUT and RTA_PUT macros, more gross stuff with embedded gotos.

10) TCP Small Queues, essentially minimize the amount of TCP data queued up in the packet scheduler layer. Whereas the existing BQL (Byte Queue Limits) limits the pkt_sched --> netdevice queuing levels, this controls the TCP --> pkt_sched queueing levels. From Eric Dumazet.

11) Reduce the number of get_page/put_page ops done on SKB fragments, from Alexander Duyck.

12) Implement protection against blind resets in TCP (RFC 5961), from Eric Dumazet.

13) Support the client side of TCP Fast Open, basically the ability to send data in the SYN exchange, from Yuchung Cheng. Basically, the sender queues up data with a sendmsg() call using MSG_FASTOPEN, then they do the connect() which emits the queued up fastopen data. (A hedged userspace sketch of this call pattern follows this message.)

14) Avoid all the problems we get into in TCP when timers or PMTU events hit a locked socket. The TCP Small Queues changes added a tcp_release_cb() that allows us to queue work up to the release_sock() caller, and that's what we use here too. From Eric Dumazet.

15) Zero copy on TX support for TUN driver, from Michael S. Tsirkin.

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1870 commits)
  genetlink: define lockdep_genl_is_held() when CONFIG_LOCKDEP
  r8169: revert "add byte queue limit support".
  ipv4: Change rt->rt_iif encoding.
  net: Make skb->skb_iif always track skb->dev
  ipv4: Prepare for change of rt->rt_iif encoding.
  ipv4: Remove all RTCF_DIRECTSRC handliing.
  ipv4: Really ignore ICMP address requests/replies.
  decnet: Don't set RTCF_DIRECTSRC.
  net/ipv4/ip_vti.c: Fix __rcu warnings detected by sparse.
  ipv4: Remove redundant assignment
  rds: set correct msg_namelen
  openvswitch: potential NULL deref in sample()
  tcp: dont drop MTU reduction indications
  bnx2x: Add new 57840 device IDs
  tcp: avoid oops in tcp_metrics and reset tcpm_stamp
  niu: Change niu_rbr_fill() to use unlikely() to check niu_rbr_add_page() return value
  niu: Fix to check for dma mapping errors.
  net: Fix references to out-of-scope variables in put_cmsg_compat()
  net: ethernet: davinci_emac: add pm_runtime support
  net: ethernet: davinci_emac: Remove unnecessary #include
  ...
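A minimal userspace sketch of the TCP Fast Open client call pattern referred to in item 13 above. The destination address, port, and payload are placeholders, the MSG_FASTOPEN fallback define is only for headers that predate the flag, and error handling is kept to a bare minimum; treat it as an illustration of the flag rather than code from the merged series.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000		/* flag value used by the fast open patches */
#endif

int main(void)
{
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = htons(80),		/* placeholder port */
	};
	const char req[] = "GET / HTTP/1.0\r\n\r\n";	/* placeholder payload */
	int fd;

	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* placeholder address */

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/*
	 * No explicit connect() here: sendto() with MSG_FASTOPEN asks the
	 * kernel to perform the connect itself and, when a Fast Open cookie
	 * for the peer is already cached, to carry the payload in the SYN.
	 * Without a cookie it falls back to a normal three-way handshake
	 * followed by the data, so the call sequence stays valid either way.
	 */
	if (sendto(fd, req, sizeof(req) - 1, MSG_FASTOPEN,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto(MSG_FASTOPEN)");

	close(fd);
	return 0;
}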
Diffstat (limited to 'drivers/net/tun.c')
-rw-r--r--  drivers/net/tun.c  153
1 file changed, 140 insertions, 13 deletions
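One piece of arithmetic in the zerocopy_sg_from_iovec() helper added by the diff below is worth a worked example: the number of pages spanned by a user buffer is computed as ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT, i.e. the offset within the first page plus the length, rounded up to whole pages. A standalone sketch, assuming a 4 KiB page size purely for illustration (the kernel takes these constants from asm/page.h):

#include <stdio.h>

/* Illustrative values; not taken from the kernel headers. */
#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* Number of pages touched by 'len' bytes starting at user address 'base'. */
static unsigned long pages_spanned(unsigned long base, unsigned long len)
{
	return ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
}

int main(void)
{
	/* 100 bytes at offset 4000 cross one page boundary: prints 2. */
	printf("%lu\n", pages_spanned(4000, 100));
	/* 8192 bytes starting on a page boundary fill exactly two pages: prints 2. */
	printf("%lu\n", pages_spanned(3 * PAGE_SIZE, 8192));
	return 0;
}

This is the same rounding-up idiom the helper uses before calling get_user_pages_fast() and checking the result against MAX_SKB_FRAGS.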
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 987aeefbc77..c62163e272c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -22,7 +22,7 @@
* Add TUNSETLINK ioctl to set the link encapsulation
*
* Mark Smith <markzzzsmith@yahoo.com.au>
- * Use random_ether_addr() for tap MAC address.
+ * Use eth_random_addr() for tap MAC address.
*
* Harald Roelle <harald.roelle@ifi.lmu.de> 2004/04/20
* Fixes in packet dropping, queue length setting and queue wakeup.
@@ -100,6 +100,8 @@ do { \
} while (0)
#endif
+#define GOODCOPY_LEN 128
+
#define FLT_EXACT_COUNT 8
struct tap_filter {
unsigned int count; /* Number of addrs. Zero means disabled */
@@ -358,6 +360,8 @@ static void tun_free_netdev(struct net_device *dev)
{
struct tun_struct *tun = netdev_priv(dev);
+ BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED, &tun->socket.flags));
+
sk_release_kernel(tun->socket.sk);
}
@@ -414,6 +418,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
/* Orphan the skb - required as we might hang on to it
* for indefinite time. */
+ if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
+ goto drop;
skb_orphan(skb);
/* Enqueue packet */
@@ -600,19 +606,100 @@ static struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
return skb;
}
+/* set skb frags from iovec, this can move to core network code for reuse */
+static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
+ int offset, size_t count)
+{
+ int len = iov_length(from, count) - offset;
+ int copy = skb_headlen(skb);
+ int size, offset1 = 0;
+ int i = 0;
+
+ /* Skip over from offset */
+ while (count && (offset >= from->iov_len)) {
+ offset -= from->iov_len;
+ ++from;
+ --count;
+ }
+
+ /* copy up to skb headlen */
+ while (count && (copy > 0)) {
+ size = min_t(unsigned int, copy, from->iov_len - offset);
+ if (copy_from_user(skb->data + offset1, from->iov_base + offset,
+ size))
+ return -EFAULT;
+ if (copy > size) {
+ ++from;
+ --count;
+ offset = 0;
+ } else
+ offset += size;
+ copy -= size;
+ offset1 += size;
+ }
+
+ if (len == offset1)
+ return 0;
+
+ while (count--) {
+ struct page *page[MAX_SKB_FRAGS];
+ int num_pages;
+ unsigned long base;
+ unsigned long truesize;
+
+ len = from->iov_len - offset;
+ if (!len) {
+ offset = 0;
+ ++from;
+ continue;
+ }
+ base = (unsigned long)from->iov_base + offset;
+ size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+ if (i + size > MAX_SKB_FRAGS)
+ return -EMSGSIZE;
+ num_pages = get_user_pages_fast(base, size, 0, &page[i]);
+ if (num_pages != size) {
+ for (i = 0; i < num_pages; i++)
+ put_page(page[i]);
+ return -EFAULT;
+ }
+ truesize = size * PAGE_SIZE;
+ skb->data_len += len;
+ skb->len += len;
+ skb->truesize += truesize;
+ atomic_add(truesize, &skb->sk->sk_wmem_alloc);
+ while (len) {
+ int off = base & ~PAGE_MASK;
+ int size = min_t(int, len, PAGE_SIZE - off);
+ __skb_fill_page_desc(skb, i, page[i], off, size);
+ skb_shinfo(skb)->nr_frags++;
+ /* increase sk_wmem_alloc */
+ base += size;
+ len -= size;
+ i++;
+ }
+ offset = 0;
+ ++from;
+ }
+ return 0;
+}
+
/* Get packet from user space buffer */
-static ssize_t tun_get_user(struct tun_struct *tun,
- const struct iovec *iv, size_t count,
- int noblock)
+static ssize_t tun_get_user(struct tun_struct *tun, void *msg_control,
+ const struct iovec *iv, size_t total_len,
+ size_t count, int noblock)
{
struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
struct sk_buff *skb;
- size_t len = count, align = NET_SKB_PAD;
+ size_t len = total_len, align = NET_SKB_PAD;
struct virtio_net_hdr gso = { 0 };
int offset = 0;
+ int copylen;
+ bool zerocopy = false;
+ int err;
if (!(tun->flags & TUN_NO_PI)) {
- if ((len -= sizeof(pi)) > count)
+ if ((len -= sizeof(pi)) > total_len)
return -EINVAL;
if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi)))
@@ -621,7 +708,7 @@ static ssize_t tun_get_user(struct tun_struct *tun,
}
if (tun->flags & TUN_VNET_HDR) {
- if ((len -= tun->vnet_hdr_sz) > count)
+ if ((len -= tun->vnet_hdr_sz) > total_len)
return -EINVAL;
if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso)))
@@ -643,14 +730,46 @@ static ssize_t tun_get_user(struct tun_struct *tun,
return -EINVAL;
}
- skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock);
+ if (msg_control)
+ zerocopy = true;
+
+ if (zerocopy) {
+ /* Userspace may produce vectors with count greater than
+ * MAX_SKB_FRAGS, so we need to linearize parts of the skb
+ * to let the rest of data to be fit in the frags.
+ */
+ if (count > MAX_SKB_FRAGS) {
+ copylen = iov_length(iv, count - MAX_SKB_FRAGS);
+ if (copylen < offset)
+ copylen = 0;
+ else
+ copylen -= offset;
+ } else
+ copylen = 0;
+ /* There are 256 bytes to be copied in skb, so there is enough
+ * room for skb expand head in case it is used.
+ * The rest of the buffer is mapped from userspace.
+ */
+ if (copylen < gso.hdr_len)
+ copylen = gso.hdr_len;
+ if (!copylen)
+ copylen = GOODCOPY_LEN;
+ } else
+ copylen = len;
+
+ skb = tun_alloc_skb(tun, align, copylen, gso.hdr_len, noblock);
if (IS_ERR(skb)) {
if (PTR_ERR(skb) != -EAGAIN)
tun->dev->stats.rx_dropped++;
return PTR_ERR(skb);
}
- if (skb_copy_datagram_from_iovec(skb, 0, iv, offset, len)) {
+ if (zerocopy)
+ err = zerocopy_sg_from_iovec(skb, iv, offset, count);
+ else
+ err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len);
+
+ if (err) {
tun->dev->stats.rx_dropped++;
kfree_skb(skb);
return -EFAULT;
@@ -724,12 +843,18 @@ static ssize_t tun_get_user(struct tun_struct *tun,
skb_shinfo(skb)->gso_segs = 0;
}
+ /* copy skb_ubuf_info for callback when skb has no error */
+ if (zerocopy) {
+ skb_shinfo(skb)->destructor_arg = msg_control;
+ skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+ }
+
netif_rx_ni(skb);
tun->dev->stats.rx_packets++;
tun->dev->stats.rx_bytes += len;
- return count;
+ return total_len;
}
static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
@@ -744,7 +869,7 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count);
- result = tun_get_user(tun, iv, iov_length(iv, count),
+ result = tun_get_user(tun, NULL, iv, iov_length(iv, count), count,
file->f_flags & O_NONBLOCK);
tun_put(tun);
@@ -958,8 +1083,8 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *m, size_t total_len)
{
struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
- return tun_get_user(tun, m->msg_iov, total_len,
- m->msg_flags & MSG_DONTWAIT);
+ return tun_get_user(tun, m->msg_control, m->msg_iov, total_len,
+ m->msg_iovlen, m->msg_flags & MSG_DONTWAIT);
}
static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
@@ -1115,6 +1240,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
tun->flags = flags;
tun->txflt.count = 0;
tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
+ set_bit(SOCK_EXTERNALLY_ALLOCATED, &tun->socket.flags);
err = -ENOMEM;
sk = sk_alloc(&init_net, AF_UNSPEC, GFP_KERNEL, &tun_proto);
@@ -1128,6 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
sock_init_data(&tun->socket, sk);
sk->sk_write_space = tun_sock_write_space;
sk->sk_sndbuf = INT_MAX;
+ sock_set_flag(sk, SOCK_ZEROCOPY);
tun_sk(sk)->tun = tun;