/* Copyright (C) 2009 Red Hat, Inc.
* Author: Michael S. Tsirkin <mst@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2.
*
* virtio-net server in host kernel.
*/
#include <linux/compat.h>
#include <linux/eventfd.h>
#include <linux/vhost.h>
#include <linux/virtio_net.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/rcupdate.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/net.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/if_tun.h>
#include <linux/if_macvlan.h>
#include <linux/if_vlan.h>
#include <net/sock.h>
#include "vhost.h"
static int experimental_zcopytx = 1;
module_param(experimental_zcopytx, int, 0444);
MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
" 1 -Enable; 0 - Disable");
/* Max number of bytes transferred before requeueing the job.
* Using this limit prevents one virtqueue from starving others. */
#define VHOST_NET_WEIGHT 0x80000
/* MAX number of TX used buffers for outstanding zerocopy */
#define VHOST_MAX_PEND 128
#define VHOST_GOODCOPY_LEN 256
/*
* For transmit, used buffer len is unused; we override it to track buffer
* status internally; used for zerocopy tx only.
*/
/* Lower device DMA failed */
#define VHOST_DMA_FAILED_LEN 3
/* Lower device DMA done */
#define VHOST_DMA_DONE_LEN 2
/* Lower device DMA in progress */
#define VHOST_DMA_IN_PROGRESS 1
/* Buffer unused */
#define VHOST_DMA_CLEAR_LEN 0
#define VHOST_DMA_IS_DONE(len) ((len) >= VHOST_DMA_DONE_LEN)
enum {
VHOST_NET_VQ_RX = 0,
VHOST_NET_VQ_TX = 1,
VHOST_NET_VQ_MAX = 2,
};
enum vhost_net_poll_state {
VHOST_NET_POLL_DISABLED = 0,
VHOST_NET_POLL_STARTED = 1,
VHOST_NET_POLL_STOPPED = 2,
};
struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
/* Tells us whether we are polling a socket for TX.
* We only do this when socket buffer fills up.
* Protected by tx vq lock. */
enum vhost_net_poll_state tx_poll_state;
/* Number of TX recently submitted.
* Protected by tx vq lock. */
unsigned tx_packets;
/* Number of times zerocopy TX recently failed.
* Protected by tx vq lock. */
unsigned tx_zcopy_err;
/* Flush in progress. Protected by tx vq lock. */
bool tx_flush;
};
static void vhost_net_tx_packet(struct vhost_net *net)
{
++net->tx_packets;
if (net->tx_packets < 1024)
return;
net->tx_packets = 0;
net->tx_zcopy_err = 0;
}
static void vhost_net_tx_err(struct vhost_net *net)
{
++net->tx_zcopy_err;
}
static bool vhost_net_tx_select_zcopy(struct vhost_net *net)
{
/* TX flush waits for outstanding DMAs to be done.
* Don't start new DMAs.
*/
return !net->tx_flush &&
net->tx_packets / 64 >= net->tx_zcopy_err;
}
static bool vhost_sock_zcopy(struct socket *sock)
{
return unlikely(experimental_zcopytx) &&
sock_flag(sock->sk, SOCK_ZEROCOPY);
}
/* Pop first len bytes from iovec. Return number of segments used. */
static int move_iovec_hdr(struct iovec *from, struct iovec *to,
size_t len, int iov_count)
{
int seg = 0;
size_t size;
while (len && seg < iov_count) {
size = min(from->iov_len, len);
to->iov_base = from->iov_base;
to->iov_len = size;
from->iov_len -= size;
from->iov_base += size;
len -= size;
++from;
++to;
++seg;
}
return seg;
}
/* Copy iovec entries for len bytes from iovec. */
static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
size_t len, int iovcount)
{
int seg = 0;
size_t size;
while (len && seg < iovcount) {
size = min(from->iov_len, len);
to->iov_base = from->iov_base;
to->iov_len = size;
len -= size;
++from;
++to;
++seg;
}
}
/* Caller must have TX VQ lock */
static void tx_poll_stop(struct vhost_net *net)
{
if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
return;
vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
net->tx_poll_state = VHOST_NET_POLL_STOPPED;
}
/* Caller must have TX VQ lock */
static int tx_poll_start(struct vhost_net *net, struct socket *sock)
{
int ret;
if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
return 0;
ret = vhost_poll_start(net->poll + VHOST_NET_V