diff options
Diffstat (limited to 'tools/virtio')
34 files changed, 2381 insertions, 0 deletions
diff --git a/tools/virtio/.gitignore b/tools/virtio/.gitignore new file mode 100644 index 00000000000..1cfbb0157a4 --- /dev/null +++ b/tools/virtio/.gitignore @@ -0,0 +1,3 @@ +*.d +virtio_test +vringh_test diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile new file mode 100644 index 00000000000..9325f469382 --- /dev/null +++ b/tools/virtio/Makefile @@ -0,0 +1,14 @@ +all: test mod +test: virtio_test vringh_test +virtio_test: virtio_ring.o virtio_test.o +vringh_test: vringh_test.o vringh.o virtio_ring.o + +CFLAGS += -g -O2 -Wall -I. -I../include/ -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE +vpath %.c ../../drivers/virtio ../../drivers/vhost +mod: +	${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test +.PHONY: all test mod clean +clean: +	${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \ +              vhost_test/Module.symvers vhost_test/modules.order *.d +-include *.d diff --git a/tools/virtio/asm/barrier.h b/tools/virtio/asm/barrier.h new file mode 100644 index 00000000000..aff61e13306 --- /dev/null +++ b/tools/virtio/asm/barrier.h @@ -0,0 +1,14 @@ +#if defined(__i386__) || defined(__x86_64__) +#define barrier() asm volatile("" ::: "memory") +#define mb() __sync_synchronize() + +#define smp_mb()	mb() +# define smp_rmb()	barrier() +# define smp_wmb()	barrier() +/* Weak barriers should be used. 
If not - it's a bug */ +# define rmb()	abort() +# define wmb()	abort() +#else +#error Please fill in barrier macros +#endif + diff --git a/tools/virtio/linux/bug.h b/tools/virtio/linux/bug.h new file mode 100644 index 00000000000..fb94f0787c4 --- /dev/null +++ b/tools/virtio/linux/bug.h @@ -0,0 +1,10 @@ +#ifndef BUG_H +#define BUG_H + +#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond)) + +#define BUILD_BUG_ON(x) + +#define BUG() abort() + +#endif /* BUG_H */ diff --git a/tools/virtio/linux/device.h b/tools/virtio/linux/device.h new file mode 100644 index 00000000000..4ad7e1df0db --- /dev/null +++ b/tools/virtio/linux/device.h @@ -0,0 +1,2 @@ +#ifndef LINUX_DEVICE_H +#endif diff --git a/tools/virtio/linux/err.h b/tools/virtio/linux/err.h new file mode 100644 index 00000000000..e32eff8b2a1 --- /dev/null +++ b/tools/virtio/linux/err.h @@ -0,0 +1,26 @@ +#ifndef ERR_H +#define ERR_H +#define MAX_ERRNO	4095 + +#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO) + +static inline void * __must_check ERR_PTR(long error) +{ +	return (void *) error; +} + +static inline long __must_check PTR_ERR(const void *ptr) +{ +	return (long) ptr; +} + +static inline long __must_check IS_ERR(const void *ptr) +{ +	return IS_ERR_VALUE((unsigned long)ptr); +} + +static inline long __must_check IS_ERR_OR_NULL(const void *ptr) +{ +	return !ptr || IS_ERR_VALUE((unsigned long)ptr); +} +#endif /* ERR_H */ diff --git a/tools/virtio/linux/hrtimer.h b/tools/virtio/linux/hrtimer.h new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/tools/virtio/linux/hrtimer.h diff --git a/tools/virtio/linux/irqreturn.h b/tools/virtio/linux/irqreturn.h new file mode 100644 index 00000000000..a3c4e7be708 --- /dev/null +++ b/tools/virtio/linux/irqreturn.h @@ -0,0 +1 @@ +#include "../../../include/linux/irqreturn.h" diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h new file mode 100644 index 00000000000..1e8ce6979c1 --- /dev/null +++ 
b/tools/virtio/linux/kernel.h @@ -0,0 +1,105 @@ +#ifndef KERNEL_H +#define KERNEL_H +#include <stdbool.h> +#include <stdlib.h> +#include <stddef.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include <stdarg.h> + +#include <linux/types.h> +#include <linux/printk.h> +#include <linux/bug.h> +#include <errno.h> +#include <unistd.h> +#include <asm/barrier.h> + +#define CONFIG_SMP + +#define PAGE_SIZE getpagesize() +#define PAGE_MASK (~(PAGE_SIZE-1)) + +typedef unsigned long long dma_addr_t; +typedef size_t __kernel_size_t; + +struct page { +	unsigned long long dummy; +}; + +/* Physical == Virtual */ +#define virt_to_phys(p) ((unsigned long)p) +#define phys_to_virt(a) ((void *)(unsigned long)(a)) +/* Page address: Virtual / 4K */ +#define page_to_phys(p) ((dma_addr_t)(unsigned long)(p)) +#define virt_to_page(p) ((struct page *)((unsigned long)p & PAGE_MASK)) + +#define offset_in_page(p) (((unsigned long)p) % PAGE_SIZE) + +#define __printf(a,b) __attribute__((format(printf,a,b))) + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) + +extern void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end; +static inline void *kmalloc(size_t s, gfp_t gfp) +{ +	if (__kmalloc_fake) +		return __kmalloc_fake; +	return malloc(s); +} + +static inline void kfree(void *p) +{ +	if (p >= __kfree_ignore_start && p < __kfree_ignore_end) +		return; +	free(p); +} + +static inline void *krealloc(void *p, size_t s, gfp_t gfp) +{ +	return realloc(p, s); +} + + +static inline unsigned long __get_free_page(gfp_t gfp) +{ +	void *p; + +	posix_memalign(&p, PAGE_SIZE, PAGE_SIZE); +	return (unsigned long)p; +} + +static inline void free_page(unsigned long addr) +{ +	free((void *)addr); +} + +#define container_of(ptr, type, member) ({			\ +	const typeof( ((type *)0)->member ) *__mptr = (ptr);	\ +	(type *)( (char *)__mptr - offsetof(type,member) );}) + +#define uninitialized_var(x) x = x + +# ifndef likely +#  define likely(x)	(__builtin_expect(!!(x), 1)) +# endif +# ifndef 
unlikely +#  define unlikely(x)	(__builtin_expect(!!(x), 0)) +# endif + +#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__) +#ifdef DEBUG +#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__) +#else +#define pr_debug(format, ...) do {} while (0) +#endif +#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) +#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) + +#define min(x, y) ({				\ +	typeof(x) _min1 = (x);			\ +	typeof(y) _min2 = (y);			\ +	(void) (&_min1 == &_min2);		\ +	_min1 < _min2 ? _min1 : _min2; }) + +#endif /* KERNEL_H */ diff --git a/tools/virtio/linux/kmemleak.h b/tools/virtio/linux/kmemleak.h new file mode 100644 index 00000000000..c07072270e2 --- /dev/null +++ b/tools/virtio/linux/kmemleak.h @@ -0,0 +1,3 @@ +static inline void kmemleak_ignore(const void *ptr) +{ +} diff --git a/tools/virtio/linux/module.h b/tools/virtio/linux/module.h new file mode 100644 index 00000000000..28ce95a0599 --- /dev/null +++ b/tools/virtio/linux/module.h @@ -0,0 +1,6 @@ +#include <linux/export.h> + +#define MODULE_LICENSE(__MODULE_LICENSE_value) \ +	static __attribute__((unused)) const char *__MODULE_LICENSE_name = \ +		__MODULE_LICENSE_value + diff --git a/tools/virtio/linux/printk.h b/tools/virtio/linux/printk.h new file mode 100644 index 00000000000..9f2423bd89c --- /dev/null +++ b/tools/virtio/linux/printk.h @@ -0,0 +1,4 @@ +#include "../../../include/linux/kern_levels.h" + +#define printk printf +#define vprintk vprintf diff --git a/tools/virtio/linux/ratelimit.h b/tools/virtio/linux/ratelimit.h new file mode 100644 index 00000000000..dcce1725f90 --- /dev/null +++ b/tools/virtio/linux/ratelimit.h @@ -0,0 +1,4 @@ +#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init)	int name = 0 + +#define __ratelimit(x) (*(x)) + diff --git a/tools/virtio/linux/scatterlist.h b/tools/virtio/linux/scatterlist.h new file mode 100644 index 00000000000..68c9e2adc99 --- /dev/null +++ 
b/tools/virtio/linux/scatterlist.h @@ -0,0 +1,189 @@ +#ifndef SCATTERLIST_H +#define SCATTERLIST_H +#include <linux/kernel.h> + +struct scatterlist { +	unsigned long	page_link; +	unsigned int	offset; +	unsigned int	length; +	dma_addr_t	dma_address; +}; + +/* Scatterlist helpers, stolen from linux/scatterlist.h */ +#define sg_is_chain(sg)		((sg)->page_link & 0x01) +#define sg_is_last(sg)		((sg)->page_link & 0x02) +#define sg_chain_ptr(sg)	\ +	((struct scatterlist *) ((sg)->page_link & ~0x03)) + +/** + * sg_assign_page - Assign a given page to an SG entry + * @sg:		    SG entry + * @page:	    The page + * + * Description: + *   Assign page to sg entry. Also see sg_set_page(), the most commonly used + *   variant. + * + **/ +static inline void sg_assign_page(struct scatterlist *sg, struct page *page) +{ +	unsigned long page_link = sg->page_link & 0x3; + +	/* +	 * In order for the low bit stealing approach to work, pages +	 * must be aligned at a 32-bit boundary as a minimum. +	 */ +	BUG_ON((unsigned long) page & 0x03); +#ifdef CONFIG_DEBUG_SG +	BUG_ON(sg->sg_magic != SG_MAGIC); +	BUG_ON(sg_is_chain(sg)); +#endif +	sg->page_link = page_link | (unsigned long) page; +} + +/** + * sg_set_page - Set sg entry to point at given page + * @sg:		 SG entry + * @page:	 The page + * @len:	 Length of data + * @offset:	 Offset into page + * + * Description: + *   Use this function to set an sg entry pointing at a page, never assign + *   the page directly. We encode sg table information in the lower bits + *   of the page pointer. See sg_page() for looking up the page belonging + *   to an sg entry. 
+ * + **/ +static inline void sg_set_page(struct scatterlist *sg, struct page *page, +			       unsigned int len, unsigned int offset) +{ +	sg_assign_page(sg, page); +	sg->offset = offset; +	sg->length = len; +} + +static inline struct page *sg_page(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG +	BUG_ON(sg->sg_magic != SG_MAGIC); +	BUG_ON(sg_is_chain(sg)); +#endif +	return (struct page *)((sg)->page_link & ~0x3); +} + +/* + * Loop over each sg element, following the pointer to a new list if necessary + */ +#define for_each_sg(sglist, sg, nr, __i)	\ +	for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg)) + +/** + * sg_chain - Chain two sglists together + * @prv:	First scatterlist + * @prv_nents:	Number of entries in prv + * @sgl:	Second scatterlist + * + * Description: + *   Links @prv@ and @sgl@ together, to form a longer scatterlist. + * + **/ +static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, +			    struct scatterlist *sgl) +{ +	/* +	 * offset and length are unused for chain entry.  Clear them. +	 */ +	prv[prv_nents - 1].offset = 0; +	prv[prv_nents - 1].length = 0; + +	/* +	 * Set lowest bit to indicate a link pointer, and make sure to clear +	 * the termination bit if it happens to be set. +	 */ +	prv[prv_nents - 1].page_link = ((unsigned long) sgl | 0x01) & ~0x02; +} + +/** + * sg_mark_end - Mark the end of the scatterlist + * @sg:		 SG entryScatterlist + * + * Description: + *   Marks the passed in sg entry as the termination point for the sg + *   table. A call to sg_next() on this entry will return NULL. 
+ * + **/ +static inline void sg_mark_end(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG +	BUG_ON(sg->sg_magic != SG_MAGIC); +#endif +	/* +	 * Set termination bit, clear potential chain bit +	 */ +	sg->page_link |= 0x02; +	sg->page_link &= ~0x01; +} + +/** + * sg_unmark_end - Undo setting the end of the scatterlist + * @sg:		 SG entryScatterlist + * + * Description: + *   Removes the termination marker from the given entry of the scatterlist. + * + **/ +static inline void sg_unmark_end(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG +	BUG_ON(sg->sg_magic != SG_MAGIC); +#endif +	sg->page_link &= ~0x02; +} + +static inline struct scatterlist *sg_next(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG +	BUG_ON(sg->sg_magic != SG_MAGIC); +#endif +	if (sg_is_last(sg)) +		return NULL; + +	sg++; +	if (unlikely(sg_is_chain(sg))) +		sg = sg_chain_ptr(sg); + +	return sg; +} + +static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents) +{ +	memset(sgl, 0, sizeof(*sgl) * nents); +#ifdef CONFIG_DEBUG_SG +	{ +		unsigned int i; +		for (i = 0; i < nents; i++) +			sgl[i].sg_magic = SG_MAGIC; +	} +#endif +	sg_mark_end(&sgl[nents - 1]); +} + +static inline dma_addr_t sg_phys(struct scatterlist *sg) +{ +	return page_to_phys(sg_page(sg)) + sg->offset; +} + +static inline void sg_set_buf(struct scatterlist *sg, const void *buf, +			      unsigned int buflen) +{ +	sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); +} + +static inline void sg_init_one(struct scatterlist *sg, +			       const void *buf, unsigned int buflen) +{ +	sg_init_table(sg, 1); +	sg_set_buf(sg, buf, buflen); +} +#endif /* SCATTERLIST_H */ diff --git a/tools/virtio/linux/slab.h b/tools/virtio/linux/slab.h new file mode 100644 index 00000000000..81baeac8ae4 --- /dev/null +++ b/tools/virtio/linux/slab.h @@ -0,0 +1,2 @@ +#ifndef LINUX_SLAB_H +#endif diff --git a/tools/virtio/linux/uaccess.h b/tools/virtio/linux/uaccess.h new file mode 100644 index 00000000000..0a578fe1865 --- 
/dev/null +++ b/tools/virtio/linux/uaccess.h @@ -0,0 +1,50 @@ +#ifndef UACCESS_H +#define UACCESS_H +extern void *__user_addr_min, *__user_addr_max; + +#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) + +static inline void __chk_user_ptr(const volatile void *p, size_t size) +{ +	assert(p >= __user_addr_min && p + size <= __user_addr_max); +} + +#define put_user(x, ptr)					\ +({								\ +	typeof(ptr) __pu_ptr = (ptr);				\ +	__chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr));		\ +	ACCESS_ONCE(*(__pu_ptr)) = x;				\ +	0;							\ +}) + +#define get_user(x, ptr)					\ +({								\ +	typeof(ptr) __pu_ptr = (ptr);				\ +	__chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr));		\ +	x = ACCESS_ONCE(*(__pu_ptr));				\ +	0;							\ +}) + +static void volatile_memcpy(volatile char *to, const volatile char *from,  +			    unsigned long n) +{ +	while (n--) +		*(to++) = *(from++); +} + +static inline int copy_from_user(void *to, const void __user volatile *from, +				 unsigned long n) +{ +	__chk_user_ptr(from, n); +	volatile_memcpy(to, from, n); +	return 0; +} + +static inline int copy_to_user(void __user volatile *to, const void *from, +			       unsigned long n) +{ +	__chk_user_ptr(to, n); +	volatile_memcpy(to, from, n); +	return 0; +} +#endif /* UACCESS_H */ diff --git a/tools/virtio/linux/uio.h b/tools/virtio/linux/uio.h new file mode 100644 index 00000000000..cd20f0ba308 --- /dev/null +++ b/tools/virtio/linux/uio.h @@ -0,0 +1,3 @@ +#include <linux/kernel.h> + +#include "../../../include/linux/uio.h" diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h new file mode 100644 index 00000000000..5a2d1f0f6bc --- /dev/null +++ b/tools/virtio/linux/virtio.h @@ -0,0 +1,87 @@ +#ifndef LINUX_VIRTIO_H +#define LINUX_VIRTIO_H +#include <linux/scatterlist.h> +#include <linux/kernel.h> + +/* TODO: empty stubs for now. 
Broken but enough for virtio_ring.c */ +#define list_add_tail(a, b) do {} while (0) +#define list_del(a) do {} while (0) + +#define BIT_WORD(nr)		((nr) / BITS_PER_LONG) +#define BITS_PER_BYTE		8 +#define BITS_PER_LONG (sizeof(long) * BITS_PER_BYTE) +#define BIT_MASK(nr)		(1UL << ((nr) % BITS_PER_LONG)) + +/* TODO: Not atomic as it should be: + * we don't use this for anything important. */ +static inline void clear_bit(int nr, volatile unsigned long *addr) +{ +	unsigned long mask = BIT_MASK(nr); +	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + +	*p &= ~mask; +} + +static inline int test_bit(int nr, const volatile unsigned long *addr) +{ +        return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); +} +/* end of stubs */ + +struct virtio_device { +	void *dev; +	unsigned long features[1]; +}; + +struct virtqueue { +	/* TODO: commented as list macros are empty stubs for now. +	 * Broken but enough for virtio_ring.c +	 * struct list_head list; */ +	void (*callback)(struct virtqueue *vq); +	const char *name; +	struct virtio_device *vdev; +        unsigned int index; +        unsigned int num_free; +	void *priv; +}; + +/* Interfaces exported by virtio_ring. 
*/ +int virtqueue_add_sgs(struct virtqueue *vq, +		      struct scatterlist *sgs[], +		      unsigned int out_sgs, +		      unsigned int in_sgs, +		      void *data, +		      gfp_t gfp); + +int virtqueue_add_outbuf(struct virtqueue *vq, +			 struct scatterlist sg[], unsigned int num, +			 void *data, +			 gfp_t gfp); + +int virtqueue_add_inbuf(struct virtqueue *vq, +			struct scatterlist sg[], unsigned int num, +			void *data, +			gfp_t gfp); + +bool virtqueue_kick(struct virtqueue *vq); + +void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); + +void virtqueue_disable_cb(struct virtqueue *vq); + +bool virtqueue_enable_cb(struct virtqueue *vq); +bool virtqueue_enable_cb_delayed(struct virtqueue *vq); + +void *virtqueue_detach_unused_buf(struct virtqueue *vq); +struct virtqueue *vring_new_virtqueue(unsigned int index, +				      unsigned int num, +				      unsigned int vring_align, +				      struct virtio_device *vdev, +				      bool weak_barriers, +				      void *pages, +				      bool (*notify)(struct virtqueue *vq), +				      void (*callback)(struct virtqueue *vq), +				      const char *name); +void vring_del_virtqueue(struct virtqueue *vq); + +#endif diff --git a/tools/virtio/linux/virtio_config.h b/tools/virtio/linux/virtio_config.h new file mode 100644 index 00000000000..5049967f99f --- /dev/null +++ b/tools/virtio/linux/virtio_config.h @@ -0,0 +1,6 @@ +#define VIRTIO_TRANSPORT_F_START	28 +#define VIRTIO_TRANSPORT_F_END		32 + +#define virtio_has_feature(dev, feature) \ +	test_bit((feature), (dev)->features) + diff --git a/tools/virtio/linux/virtio_ring.h b/tools/virtio/linux/virtio_ring.h new file mode 100644 index 00000000000..8949c4e2772 --- /dev/null +++ b/tools/virtio/linux/virtio_ring.h @@ -0,0 +1 @@ +#include "../../../include/linux/virtio_ring.h" diff --git a/tools/virtio/linux/vringh.h b/tools/virtio/linux/vringh.h new file mode 100644 index 00000000000..9348957be56 --- /dev/null +++ b/tools/virtio/linux/vringh.h @@ -0,0 +1 @@ 
+#include "../../../include/linux/vringh.h" diff --git a/tools/virtio/uapi/linux/uio.h b/tools/virtio/uapi/linux/uio.h new file mode 100644 index 00000000000..7230e900220 --- /dev/null +++ b/tools/virtio/uapi/linux/uio.h @@ -0,0 +1 @@ +#include <sys/uio.h> diff --git a/tools/virtio/uapi/linux/virtio_config.h b/tools/virtio/uapi/linux/virtio_config.h new file mode 100644 index 00000000000..4c86675f015 --- /dev/null +++ b/tools/virtio/uapi/linux/virtio_config.h @@ -0,0 +1 @@ +#include "../../../../include/uapi/linux/virtio_config.h" diff --git a/tools/virtio/uapi/linux/virtio_ring.h b/tools/virtio/uapi/linux/virtio_ring.h new file mode 100644 index 00000000000..4d99c78234d --- /dev/null +++ b/tools/virtio/uapi/linux/virtio_ring.h @@ -0,0 +1,4 @@ +#ifndef VIRTIO_RING_H +#define VIRTIO_RING_H +#include "../../../../include/uapi/linux/virtio_ring.h" +#endif /* VIRTIO_RING_H */ diff --git a/tools/virtio/vhost_test/Makefile b/tools/virtio/vhost_test/Makefile new file mode 100644 index 00000000000..a1d35b81b31 --- /dev/null +++ b/tools/virtio/vhost_test/Makefile @@ -0,0 +1,2 @@ +obj-m += vhost_test.o +EXTRA_CFLAGS += -Idrivers/vhost diff --git a/tools/virtio/vhost_test/vhost_test.c b/tools/virtio/vhost_test/vhost_test.c new file mode 100644 index 00000000000..18735189e62 --- /dev/null +++ b/tools/virtio/vhost_test/vhost_test.c @@ -0,0 +1 @@ +#include "test.c" diff --git a/tools/virtio/virtio-trace/Makefile b/tools/virtio/virtio-trace/Makefile new file mode 100644 index 00000000000..0d238163347 --- /dev/null +++ b/tools/virtio/virtio-trace/Makefile @@ -0,0 +1,13 @@ +CC = gcc +CFLAGS = -O2 -Wall -pthread + +all: trace-agent + +.c.o: +	$(CC) $(CFLAGS) -c $^ -o $@ + +trace-agent: trace-agent.o trace-agent-ctl.o trace-agent-rw.o +	$(CC) $(CFLAGS) -o $@ $^ + +clean: +	rm -f *.o trace-agent diff --git a/tools/virtio/virtio-trace/README b/tools/virtio/virtio-trace/README new file mode 100644 index 00000000000..b64845b823a --- /dev/null +++ b/tools/virtio/virtio-trace/README @@ 
-0,0 +1,118 @@ +Trace Agent for virtio-trace +============================ + +Trace agent is a user tool for sending trace data of a guest to a Host with low +overhead. Trace agent has the following functions: + - splice a page of ring-buffer to read_pipe without memory copying + - splice the page from write_pipe to virtio-console without memory copying + - write trace data to stdout by using -o option + - controlled by start/stop orders from a Host + +The trace agent operates as follows: + 1) Initialize all structures. + 2) Create a read/write thread per CPU. Each thread is bound to a CPU. +    The read/write threads hold it. + 3) A controller thread does poll() for a start order from a host. + 4) After the controller of the trace agent receives a start order from a host, +    the controller wakes the read/write threads. + 5) The read/write threads start to read trace data from ring-buffers and +    write the data to virtio-serial. + 6) If the controller receives a stop order from a host, the read/write threads +    stop reading trace data. + + +Files +===== + +README: this file +Makefile: Makefile of trace agent for virtio-trace +trace-agent.c: includes main function, sets up for operating trace agent +trace-agent.h: includes all structures and some macros +trace-agent-ctl.c: includes controller function for read/write threads +trace-agent-rw.c: includes read/write threads function + + +Setup +===== + +To use this trace agent for virtio-trace, we need to prepare some virtio-serial +I/Fs. + +1) Make FIFO in a host + virtio-trace uses one virtio-serial pipe per CPU as a trace data path, plus one +control path, so FIFOs (named pipes) should be created as follows: +	# mkdir /tmp/virtio-trace/ +	# mkfifo /tmp/virtio-trace/trace-path-cpu{0,1,2,...,X}.{in,out} +	# mkfifo /tmp/virtio-trace/agent-ctl-path.{in,out} + +For example, if a guest uses three CPUs, the names are +	trace-path-cpu{0,1,2}.{in,out} +and +	agent-ctl-path.{in,out}. 
+ +2) Set up of virtio-serial pipe in a host + Add qemu options to use virtio-serial pipes. + + ##virtio-serial device## +     -device virtio-serial-pci,id=virtio-serial0\ + ##control path## +     -chardev pipe,id=charchannel0,path=/tmp/virtio-trace/agent-ctl-path\ +     -device virtserialport,bus=virtio-serial0.0,nr=1,chardev=charchannel0,\ +      id=channel0,name=agent-ctl-path\ + ##data path## +     -chardev pipe,id=charchannel1,path=/tmp/virtio-trace/trace-path-cpu0\ +     -device virtserialport,bus=virtio-serial0.0,nr=2,chardev=charchannel1,\ +      id=channel1,name=trace-path-cpu0\ +      ... + +If you manage guests with libvirt, add the following tags to domain XML files. +Then, libvirt passes the same command option to qemu. + +	<channel type='pipe'> +	   <source path='/tmp/virtio-trace/agent-ctl-path'/> +	   <target type='virtio' name='agent-ctl-path'/> +	   <address type='virtio-serial' controller='0' bus='0' port='0'/> +	</channel> +	<channel type='pipe'> +	   <source path='/tmp/virtio-trace/trace-path-cpu0'/> +	   <target type='virtio' name='trace-path-cpu0'/> +	   <address type='virtio-serial' controller='0' bus='0' port='1'/> +	</channel> +	... +Here, chardev names are restricted to trace-path-cpuX and agent-ctl-path. For +example, if a guest uses three CPUs, chardev names should be trace-path-cpu0, +trace-path-cpu1, trace-path-cpu2, and agent-ctl-path. + +3) Boot the guest + You can find the chardevs in /dev/virtio-ports/ in the guest. + + +Run +=== + +0) Build trace agent in a guest +	$ make + +1) Enable ftrace in the guest + <Example> +	# echo 1 > /sys/kernel/debug/tracing/events/sched/enable + +2) Run trace agent in the guest + This agent must be run as root. +	# ./trace-agent +read/write threads in the agent wait for start order from host. If you add -o +option, trace data are output via stdout in the guest. 
+ +3) Open FIFO in a host +	# cat /tmp/virtio-trace/trace-path-cpu0.out +If a host does not open these, trace data get stuck in buffers of virtio. Then, +the guest will stop by specification of chardev in QEMU. This blocking mode may +be solved in the future. + +4) Start to read trace data by ordering from a host + A host injects read start order to the guest via virtio-serial. +	# echo 1 > /tmp/virtio-trace/agent-ctl-path.in + +5) Stop to read trace data by ordering from a host + A host injects read stop order to the guest via virtio-serial. +	# echo 0 > /tmp/virtio-trace/agent-ctl-path.in diff --git a/tools/virtio/virtio-trace/trace-agent-ctl.c b/tools/virtio/virtio-trace/trace-agent-ctl.c new file mode 100644 index 00000000000..a2d0403c4f9 --- /dev/null +++ b/tools/virtio/virtio-trace/trace-agent-ctl.c @@ -0,0 +1,137 @@ +/* + * Controller of read/write threads for virtio-trace + * + * Copyright (C) 2012 Hitachi, Ltd. + * Created by Yoshihiro Yunomae <yoshihiro.yunomae.ez@hitachi.com> + *            Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> + * + * Licensed under GPL version 2 only. 
+ * + */ + +#define _GNU_SOURCE +#include <fcntl.h> +#include <poll.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include "trace-agent.h" + +#define HOST_MSG_SIZE		256 +#define EVENT_WAIT_MSEC		100 + +static volatile sig_atomic_t global_signal_val; +bool global_sig_receive;	/* default false */ +bool global_run_operation;	/* default false*/ + +/* Handle SIGTERM/SIGINT/SIGQUIT to exit */ +static void signal_handler(int sig) +{ +	global_signal_val = sig; +} + +int rw_ctl_init(const char *ctl_path) +{ +	int ctl_fd; + +	ctl_fd = open(ctl_path, O_RDONLY); +	if (ctl_fd == -1) { +		pr_err("Cannot open ctl_fd\n"); +		goto error; +	} + +	return ctl_fd; + +error: +	exit(EXIT_FAILURE); +} + +static int wait_order(int ctl_fd) +{ +	struct pollfd poll_fd; +	int ret = 0; + +	while (!global_sig_receive) { +		poll_fd.fd = ctl_fd; +		poll_fd.events = POLLIN; + +		ret = poll(&poll_fd, 1, EVENT_WAIT_MSEC); + +		if (global_signal_val) { +			global_sig_receive = true; +			pr_info("Receive interrupt %d\n", global_signal_val); + +			/* Wakes rw-threads when they are sleeping */ +			if (!global_run_operation) +				pthread_cond_broadcast(&cond_wakeup); + +			ret = -1; +			break; +		} + +		if (ret < 0) { +			pr_err("Polling error\n"); +			goto error; +		} + +		if (ret) +			break; +	}; + +	return ret; + +error: +	exit(EXIT_FAILURE); +} + +/* + * contol read/write threads by handling global_run_operation + */ +void *rw_ctl_loop(int ctl_fd) +{ +	ssize_t rlen; +	char buf[HOST_MSG_SIZE]; +	int ret; + +	/* Setup signal handlers */ +	signal(SIGTERM, signal_handler); +	signal(SIGINT, signal_handler); +	signal(SIGQUIT, signal_handler); + +	while (!global_sig_receive) { + +		ret = wait_order(ctl_fd); +		if (ret < 0) +			break; + +		rlen = read(ctl_fd, buf, sizeof(buf)); +		if (rlen < 0) { +			pr_err("read data error in ctl thread\n"); +			goto error; +		} + +		if (rlen == 2 && buf[0] == '1') { +			/* +			 * If host writes '1' to a control path, +			 * this 
controller wakes all read/write threads. +			 */ +			global_run_operation = true; +			pthread_cond_broadcast(&cond_wakeup); +			pr_debug("Wake up all read/write threads\n"); +		} else if (rlen == 2 && buf[0] == '0') { +			/* +			 * If host writes '0' to a control path, read/write +			 * threads will wait for notification from Host. +			 */ +			global_run_operation = false; +			pr_debug("Stop all read/write threads\n"); +		} else +			pr_info("Invalid host notification: %s\n", buf); +	} + +	return NULL; + +error: +	exit(EXIT_FAILURE); +} diff --git a/tools/virtio/virtio-trace/trace-agent-rw.c b/tools/virtio/virtio-trace/trace-agent-rw.c new file mode 100644 index 00000000000..3aace5ea484 --- /dev/null +++ b/tools/virtio/virtio-trace/trace-agent-rw.c @@ -0,0 +1,192 @@ +/* + * Read/write thread of a guest agent for virtio-trace + * + * Copyright (C) 2012 Hitachi, Ltd. + * Created by Yoshihiro Yunomae <yoshihiro.yunomae.ez@hitachi.com> + *            Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> + * + * Licensed under GPL version 2 only. 
+ * + */ + +#define _GNU_SOURCE +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/syscall.h> +#include "trace-agent.h" + +#define READ_WAIT_USEC	100000 + +void *rw_thread_info_new(void) +{ +	struct rw_thread_info *rw_ti; + +	rw_ti = zalloc(sizeof(struct rw_thread_info)); +	if (rw_ti == NULL) { +		pr_err("rw_thread_info zalloc error\n"); +		exit(EXIT_FAILURE); +	} + +	rw_ti->cpu_num = -1; +	rw_ti->in_fd = -1; +	rw_ti->out_fd = -1; +	rw_ti->read_pipe = -1; +	rw_ti->write_pipe = -1; +	rw_ti->pipe_size = PIPE_INIT; + +	return rw_ti; +} + +void *rw_thread_init(int cpu, const char *in_path, const char *out_path, +				bool stdout_flag, unsigned long pipe_size, +				struct rw_thread_info *rw_ti) +{ +	int data_pipe[2]; + +	rw_ti->cpu_num = cpu; + +	/* set read(input) fd */ +	rw_ti->in_fd = open(in_path, O_RDONLY); +	if (rw_ti->in_fd == -1) { +		pr_err("Could not open in_fd (CPU:%d)\n", cpu); +		goto error; +	} + +	/* set write(output) fd */ +	if (!stdout_flag) { +		/* virtio-serial output mode */ +		rw_ti->out_fd = open(out_path, O_WRONLY); +		if (rw_ti->out_fd == -1) { +			pr_err("Could not open out_fd (CPU:%d)\n", cpu); +			goto error; +		} +	} else +		/* stdout mode */ +		rw_ti->out_fd = STDOUT_FILENO; + +	if (pipe2(data_pipe, O_NONBLOCK) < 0) { +		pr_err("Could not create pipe in rw-thread(%d)\n", cpu); +		goto error; +	} + +	/* +	 * Size of pipe is 64kB in default based on fs/pipe.c. +	 * To read/write trace data speedy, pipe size is changed. 
+	 */ +	if (fcntl(*data_pipe, F_SETPIPE_SZ, pipe_size) < 0) { +		pr_err("Could not change pipe size in rw-thread(%d)\n", cpu); +		goto error; +	} + +	rw_ti->read_pipe = data_pipe[1]; +	rw_ti->write_pipe = data_pipe[0]; +	rw_ti->pipe_size = pipe_size; + +	return NULL; + +error: +	exit(EXIT_FAILURE); +} + +/* Bind a thread to a cpu */ +static void bind_cpu(int cpu_num) +{ +	cpu_set_t mask; + +	CPU_ZERO(&mask); +	CPU_SET(cpu_num, &mask); + +	/* bind my thread to cpu_num by assigning zero to the first argument */ +	if (sched_setaffinity(0, sizeof(mask), &mask) == -1) +		pr_err("Could not set CPU#%d affinity\n", (int)cpu_num); +} + +static void *rw_thread_main(void *thread_info) +{ +	ssize_t rlen, wlen; +	ssize_t ret; +	struct rw_thread_info *ts = (struct rw_thread_info *)thread_info; + +	bind_cpu(ts->cpu_num); + +	while (1) { +		/* Wait for a read order of trace data by Host OS */ +		if (!global_run_operation) { +			pthread_mutex_lock(&mutex_notify); +			pthread_cond_wait(&cond_wakeup, &mutex_notify); +			pthread_mutex_unlock(&mutex_notify); +		} + +		if (global_sig_receive) +			break; + +		/* +		 * Each thread read trace_pipe_raw of each cpu bounding the +		 * thread, so contention of multi-threads does not occur. +		 */ +		rlen = splice(ts->in_fd, NULL, ts->read_pipe, NULL, +				ts->pipe_size, SPLICE_F_MOVE | SPLICE_F_MORE); + +		if (rlen < 0) { +			pr_err("Splice_read in rw-thread(%d)\n", ts->cpu_num); +			goto error; +		} else if (rlen == 0) { +			/* +			 * If trace data do not exist or are unreadable not +			 * for exceeding the page size, splice_read returns +			 * NULL. Then, this waits for being filled the data in a +			 * ring-buffer. 
+			 */ +			usleep(READ_WAIT_USEC); +			pr_debug("Read retry(cpu:%d)\n", ts->cpu_num); +			continue; +		} + +		wlen = 0; + +		do { +			ret = splice(ts->write_pipe, NULL, ts->out_fd, NULL, +					rlen - wlen, +					SPLICE_F_MOVE | SPLICE_F_MORE); + +			if (ret < 0) { +				pr_err("Splice_write in rw-thread(%d)\n", +								ts->cpu_num); +				goto error; +			} else if (ret == 0) +				/* +				 * When host reader is not in time for reading +				 * trace data, guest will be stopped. This is +				 * because char dev in QEMU is not supported +				 * non-blocking mode. Then, writer might be +				 * sleep in that case. +				 * This sleep will be removed by supporting +				 * non-blocking mode. +				 */ +				sleep(1); +			wlen += ret; +		} while (wlen < rlen); +	} + +	return NULL; + +error: +	exit(EXIT_FAILURE); +} + + +pthread_t rw_thread_run(struct rw_thread_info *rw_ti) +{ +	int ret; +	pthread_t rw_thread_per_cpu; + +	ret = pthread_create(&rw_thread_per_cpu, NULL, rw_thread_main, rw_ti); +	if (ret != 0) { +		pr_err("Could not create a rw thread(%d)\n", rw_ti->cpu_num); +		exit(EXIT_FAILURE); +	} + +	return rw_thread_per_cpu; +} diff --git a/tools/virtio/virtio-trace/trace-agent.c b/tools/virtio/virtio-trace/trace-agent.c new file mode 100644 index 00000000000..0a0a7dd4eff --- /dev/null +++ b/tools/virtio/virtio-trace/trace-agent.c @@ -0,0 +1,270 @@ +/* + * Guest agent for virtio-trace + * + * Copyright (C) 2012 Hitachi, Ltd. + * Created by Yoshihiro Yunomae <yoshihiro.yunomae.ez@hitachi.com> + *            Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> + * + * Licensed under GPL version 2 only. 
+ * + */ + +#define _GNU_SOURCE +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include "trace-agent.h" + +#define PAGE_SIZE		(sysconf(_SC_PAGE_SIZE)) +#define PIPE_DEF_BUFS		16 +#define PIPE_MIN_SIZE		(PAGE_SIZE*PIPE_DEF_BUFS) +#define PIPE_MAX_SIZE		(1024*1024) +#define READ_PATH_FMT	\ +		"/sys/kernel/debug/tracing/per_cpu/cpu%d/trace_pipe_raw" +#define WRITE_PATH_FMT		"/dev/virtio-ports/trace-path-cpu%d" +#define CTL_PATH		"/dev/virtio-ports/agent-ctl-path" + +pthread_mutex_t mutex_notify = PTHREAD_MUTEX_INITIALIZER; +pthread_cond_t cond_wakeup = PTHREAD_COND_INITIALIZER; + +static int get_total_cpus(void) +{ +	int nr_cpus = (int)sysconf(_SC_NPROCESSORS_CONF); + +	if (nr_cpus <= 0) { +		pr_err("Could not read cpus\n"); +		goto error; +	} else if (nr_cpus > MAX_CPUS) { +		pr_err("Exceed max cpus(%d)\n", (int)MAX_CPUS); +		goto error; +	} + +	return nr_cpus; + +error: +	exit(EXIT_FAILURE); +} + +static void *agent_info_new(void) +{ +	struct agent_info *s; +	int i; + +	s = zalloc(sizeof(struct agent_info)); +	if (s == NULL) { +		pr_err("agent_info zalloc error\n"); +		exit(EXIT_FAILURE); +	} + +	s->pipe_size = PIPE_INIT; +	s->use_stdout = false; +	s->cpus = get_total_cpus(); +	s->ctl_fd = -1; + +	/* read/write threads init */ +	for (i = 0; i < s->cpus; i++) +		s->rw_ti[i] = rw_thread_info_new(); + +	return s; +} + +static unsigned long parse_size(const char *arg) +{ +	unsigned long value, round; +	char *ptr; + +	value = strtoul(arg, &ptr, 10); +	switch (*ptr) { +	case 'K': case 'k': +		value <<= 10; +		break; +	case 'M': case 'm': +		value <<= 20; +		break; +	default: +		break; +	} + +	if (value > PIPE_MAX_SIZE) { +		pr_err("Pipe size must be less than 1MB\n"); +		goto error; +	} else if (value < PIPE_MIN_SIZE) { +		pr_err("Pipe size must be over 64KB\n"); +		goto error; +	} + +	/* Align buffer size with page unit */ +	round = value & (PAGE_SIZE - 1); +	value = value - round; + +	return value; +error: +	return 0; +} + +static 
void usage(char const *prg) +{ +	pr_err("usage: %s [-h] [-o] [-s <size of pipe>]\n", prg); +} + +static const char *make_path(int cpu_num, bool this_is_write_path) +{ +	int ret; +	char *buf; + +	buf = zalloc(PATH_MAX); +	if (buf == NULL) { +		pr_err("Could not allocate buffer\n"); +		goto error; +	} + +	if (this_is_write_path) +		/* write(output) path */ +		ret = snprintf(buf, PATH_MAX, WRITE_PATH_FMT, cpu_num); +	else +		/* read(input) path */ +		ret = snprintf(buf, PATH_MAX, READ_PATH_FMT, cpu_num); + +	if (ret <= 0) { +		pr_err("Failed to generate %s path(CPU#%d):%d\n", +			this_is_write_path ? "read" : "write", cpu_num, ret); +		goto error; +	} + +	return buf; + +error: +	free(buf); +	return NULL; +} + +static const char *make_input_path(int cpu_num) +{ +	return make_path(cpu_num, false); +} + +static const char *make_output_path(int cpu_num) +{ +	return make_path(cpu_num, true); +} + +static void *agent_info_init(struct agent_info *s) +{ +	int cpu; +	const char *in_path = NULL; +	const char *out_path = NULL; + +	/* init read/write threads */ +	for (cpu = 0; cpu < s->cpus; cpu++) { +		/* set read(input) path per read/write thread */ +		in_path = make_input_path(cpu); +		if (in_path == NULL) +			goto error; + +		/* set write(output) path per read/write thread*/ +		if (!s->use_stdout) { +			out_path = make_output_path(cpu); +			if (out_path == NULL) +				goto error; +		} else +			/* stdout mode */ +			pr_debug("stdout mode\n"); + +		rw_thread_init(cpu, in_path, out_path, s->use_stdout, +						s->pipe_size, s->rw_ti[cpu]); +	} + +	/* init controller of read/write threads */ +	s->ctl_fd = rw_ctl_init((const char *)CTL_PATH); + +	return NULL; + +error: +	exit(EXIT_FAILURE); +} + +static void *parse_args(int argc, char *argv[], struct agent_info *s) +{ +	int cmd; +	unsigned long size; + +	while ((cmd = getopt(argc, argv, "hos:")) != -1) { +		switch (cmd) { +		/* stdout mode */ +		case 'o': +			s->use_stdout = true; +			break; +		/* size of pipe */ +		case 's': +			
size = parse_size(optarg); +			if (size == 0) +				goto error; +			s->pipe_size = size; +			break; +		case 'h': +		default: +			usage(argv[0]); +			goto error; +		} +	} + +	agent_info_init(s); + +	return NULL; + +error: +	exit(EXIT_FAILURE); +} + +static void agent_main_loop(struct agent_info *s) +{ +	int cpu; +	pthread_t rw_thread_per_cpu[MAX_CPUS]; + +	/* Start all read/write threads */ +	for (cpu = 0; cpu < s->cpus; cpu++) +		rw_thread_per_cpu[cpu] = rw_thread_run(s->rw_ti[cpu]); + +	rw_ctl_loop(s->ctl_fd); + +	/* Finish all read/write threads */ +	for (cpu = 0; cpu < s->cpus; cpu++) { +		int ret; + +		ret = pthread_join(rw_thread_per_cpu[cpu], NULL); +		if (ret != 0) { +			pr_err("pthread_join() error:%d (cpu %d)\n", ret, cpu); +			exit(EXIT_FAILURE); +		} +	} +} + +static void agent_info_free(struct agent_info *s) +{ +	int i; + +	close(s->ctl_fd); +	for (i = 0; i < s->cpus; i++) { +		close(s->rw_ti[i]->in_fd); +		close(s->rw_ti[i]->out_fd); +		close(s->rw_ti[i]->read_pipe); +		close(s->rw_ti[i]->write_pipe); +		free(s->rw_ti[i]); +	} +	free(s); +} + +int main(int argc, char *argv[]) +{ +	struct agent_info *s = NULL; + +	s = agent_info_new(); +	parse_args(argc, argv, s); + +	agent_main_loop(s); + +	agent_info_free(s); + +	return 0; +} diff --git a/tools/virtio/virtio-trace/trace-agent.h b/tools/virtio/virtio-trace/trace-agent.h new file mode 100644 index 00000000000..8de79bfeaa7 --- /dev/null +++ b/tools/virtio/virtio-trace/trace-agent.h @@ -0,0 +1,75 @@ +#ifndef __TRACE_AGENT_H__ +#define __TRACE_AGENT_H__ +#include <pthread.h> +#include <stdbool.h> + +#define MAX_CPUS	256 +#define PIPE_INIT       (1024*1024) + +/* + * agent_info - structure managing total information of guest agent + * @pipe_size:	size of pipe (default 1MB) + * @use_stdout:	set to true when o option is added (default false) + * @cpus:	total number of CPUs + * @ctl_fd:	fd of control path, /dev/virtio-ports/agent-ctl-path + * @rw_ti:	structure managing information of read/write threads + */ 
+struct agent_info { +	unsigned long pipe_size; +	bool use_stdout; +	int cpus; +	int ctl_fd; +	struct rw_thread_info *rw_ti[MAX_CPUS]; +}; + +/* + * rw_thread_info - structure managing a read/write thread a cpu + * @cpu_num:	cpu number operating this read/write thread + * @in_fd:	fd of reading trace data path in cpu_num + * @out_fd:	fd of writing trace data path in cpu_num + * @read_pipe:	fd of read pipe + * @write_pipe:	fd of write pipe + * @pipe_size:	size of pipe (default 1MB) + */ +struct rw_thread_info { +	int cpu_num; +	int in_fd; +	int out_fd; +	int read_pipe; +	int write_pipe; +	unsigned long pipe_size; +}; + +/* use for stopping rw threads */ +extern bool global_sig_receive; + +/* use for notification */ +extern bool global_run_operation; +extern pthread_mutex_t mutex_notify; +extern pthread_cond_t cond_wakeup; + +/* for controller of read/write threads */ +extern int rw_ctl_init(const char *ctl_path); +extern void *rw_ctl_loop(int ctl_fd); + +/* for trace read/write thread */ +extern void *rw_thread_info_new(void); +extern void *rw_thread_init(int cpu, const char *in_path, const char *out_path, +			bool stdout_flag, unsigned long pipe_size, +			struct rw_thread_info *rw_ti); +extern pthread_t rw_thread_run(struct rw_thread_info *rw_ti); + +static inline void *zalloc(size_t size) +{ +	return calloc(1, size); +} + +#define pr_err(format, ...) fprintf(stderr, format, ## __VA_ARGS__) +#define pr_info(format, ...) fprintf(stdout, format, ## __VA_ARGS__) +#ifdef DEBUG +#define pr_debug(format, ...) fprintf(stderr, format, ## __VA_ARGS__) +#else +#define pr_debug(format, ...) 
do {} while (0) +#endif + +#endif /*__TRACE_AGENT_H__*/ diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c new file mode 100644 index 00000000000..00ea679b382 --- /dev/null +++ b/tools/virtio/virtio_test.c @@ -0,0 +1,290 @@ +#define _GNU_SOURCE +#include <getopt.h> +#include <string.h> +#include <poll.h> +#include <sys/eventfd.h> +#include <stdlib.h> +#include <assert.h> +#include <unistd.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <fcntl.h> +#include <stdbool.h> +#include <linux/vhost.h> +#include <linux/virtio.h> +#include <linux/virtio_ring.h> +#include "../../drivers/vhost/test.h" + +/* Unused */ +void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end; + +struct vq_info { +	int kick; +	int call; +	int num; +	int idx; +	void *ring; +	/* copy used for control */ +	struct vring vring; +	struct virtqueue *vq; +}; + +struct vdev_info { +	struct virtio_device vdev; +	int control; +	struct pollfd fds[1]; +	struct vq_info vqs[1]; +	int nvqs; +	void *buf; +	size_t buf_size; +	struct vhost_memory *mem; +}; + +bool vq_notify(struct virtqueue *vq) +{ +	struct vq_info *info = vq->priv; +	unsigned long long v = 1; +	int r; +	r = write(info->kick, &v, sizeof v); +	assert(r == sizeof v); +	return true; +} + +void vq_callback(struct virtqueue *vq) +{ +} + + +void vhost_vq_setup(struct vdev_info *dev, struct vq_info *info) +{ +	struct vhost_vring_state state = { .index = info->idx }; +	struct vhost_vring_file file = { .index = info->idx }; +	unsigned long long features = dev->vdev.features[0]; +	struct vhost_vring_addr addr = { +		.index = info->idx, +		.desc_user_addr = (uint64_t)(unsigned long)info->vring.desc, +		.avail_user_addr = (uint64_t)(unsigned long)info->vring.avail, +		.used_user_addr = (uint64_t)(unsigned long)info->vring.used, +	}; +	int r; +	r = ioctl(dev->control, VHOST_SET_FEATURES, &features); +	assert(r >= 0); +	state.num = info->vring.num; +	r = ioctl(dev->control, VHOST_SET_VRING_NUM, 
&state); +	assert(r >= 0); +	state.num = 0; +	r = ioctl(dev->control, VHOST_SET_VRING_BASE, &state); +	assert(r >= 0); +	r = ioctl(dev->control, VHOST_SET_VRING_ADDR, &addr); +	assert(r >= 0); +	file.fd = info->kick; +	r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file); +	assert(r >= 0); +	file.fd = info->call; +	r = ioctl(dev->control, VHOST_SET_VRING_CALL, &file); +	assert(r >= 0); +} + +static void vq_info_add(struct vdev_info *dev, int num) +{ +	struct vq_info *info = &dev->vqs[dev->nvqs]; +	int r; +	info->idx = dev->nvqs; +	info->kick = eventfd(0, EFD_NONBLOCK); +	info->call = eventfd(0, EFD_NONBLOCK); +	r = posix_memalign(&info->ring, 4096, vring_size(num, 4096)); +	assert(r >= 0); +	memset(info->ring, 0, vring_size(num, 4096)); +	vring_init(&info->vring, num, info->ring, 4096); +	info->vq = vring_new_virtqueue(info->idx, +				       info->vring.num, 4096, &dev->vdev, +				       true, info->ring, +				       vq_notify, vq_callback, "test"); +	assert(info->vq); +	info->vq->priv = info; +	vhost_vq_setup(dev, info); +	dev->fds[info->idx].fd = info->call; +	dev->fds[info->idx].events = POLLIN; +	dev->nvqs++; +} + +static void vdev_info_init(struct vdev_info* dev, unsigned long long features) +{ +	int r; +	memset(dev, 0, sizeof *dev); +	dev->vdev.features[0] = features; +	dev->vdev.features[1] = features >> 32; +	dev->buf_size = 1024; +	dev->buf = malloc(dev->buf_size); +	assert(dev->buf); +        dev->control = open("/dev/vhost-test", O_RDWR); +	assert(dev->control >= 0); +	r = ioctl(dev->control, VHOST_SET_OWNER, NULL); +	assert(r >= 0); +	dev->mem = malloc(offsetof(struct vhost_memory, regions) + +			  sizeof dev->mem->regions[0]); +	assert(dev->mem); +	memset(dev->mem, 0, offsetof(struct vhost_memory, regions) + +                          sizeof dev->mem->regions[0]); +	dev->mem->nregions = 1; +	dev->mem->regions[0].guest_phys_addr = (long)dev->buf; +	dev->mem->regions[0].userspace_addr = (long)dev->buf; +	dev->mem->regions[0].memory_size = dev->buf_size; 
+	r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem); +	assert(r >= 0); +} + +/* TODO: this is pretty bad: we get a cache line bounce + * for the wait queue on poll and another one on read, + * plus the read which is there just to clear the + * current state. */ +static void wait_for_interrupt(struct vdev_info *dev) +{ +	int i; +	unsigned long long val; +	poll(dev->fds, dev->nvqs, -1); +	for (i = 0; i < dev->nvqs; ++i) +		if (dev->fds[i].revents & POLLIN) { +			read(dev->fds[i].fd, &val, sizeof val); +		} +} + +static void run_test(struct vdev_info *dev, struct vq_info *vq, +		     bool delayed, int bufs) +{ +	struct scatterlist sl; +	long started = 0, completed = 0; +	long completed_before; +	int r, test = 1; +	unsigned len; +	long long spurious = 0; +	r = ioctl(dev->control, VHOST_TEST_RUN, &test); +	assert(r >= 0); +	for (;;) { +		virtqueue_disable_cb(vq->vq); +		completed_before = completed; +		do { +			if (started < bufs) { +				sg_init_one(&sl, dev->buf, dev->buf_size); +				r = virtqueue_add_outbuf(vq->vq, &sl, 1, +							 dev->buf + started, +							 GFP_ATOMIC); +				if (likely(r == 0)) { +					++started; +					if (unlikely(!virtqueue_kick(vq->vq))) +						r = -1; +				} +			} else +				r = -1; + +			/* Flush out completed bufs if any */ +			if (virtqueue_get_buf(vq->vq, &len)) { +				++completed; +				r = 0; +			} + +		} while (r == 0); +		if (completed == completed_before) +			++spurious; +		assert(completed <= bufs); +		assert(started <= bufs); +		if (completed == bufs) +			break; +		if (delayed) { +			if (virtqueue_enable_cb_delayed(vq->vq)) +				wait_for_interrupt(dev); +		} else { +			if (virtqueue_enable_cb(vq->vq)) +				wait_for_interrupt(dev); +		} +	} +	test = 0; +	r = ioctl(dev->control, VHOST_TEST_RUN, &test); +	assert(r >= 0); +	fprintf(stderr, "spurious wakeus: 0x%llx\n", spurious); +} + +const char optstring[] = "h"; +const struct option longopts[] = { +	{ +		.name = "help", +		.val = 'h', +	}, +	{ +		.name = "event-idx", +		.val = 'E', 
+	}, +	{ +		.name = "no-event-idx", +		.val = 'e', +	}, +	{ +		.name = "indirect", +		.val = 'I', +	}, +	{ +		.name = "no-indirect", +		.val = 'i', +	}, +	{ +		.name = "delayed-interrupt", +		.val = 'D', +	}, +	{ +		.name = "no-delayed-interrupt", +		.val = 'd', +	}, +	{ +	} +}; + +static void help(void) +{ +	fprintf(stderr, "Usage: virtio_test [--help]" +		" [--no-indirect]" +		" [--no-event-idx]" +		" [--delayed-interrupt]" +		"\n"); +} + +int main(int argc, char **argv) +{ +	struct vdev_info dev; +	unsigned long long features = (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | +		(1ULL << VIRTIO_RING_F_EVENT_IDX); +	int o; +	bool delayed = false; + +	for (;;) { +		o = getopt_long(argc, argv, optstring, longopts, NULL); +		switch (o) { +		case -1: +			goto done; +		case '?': +			help(); +			exit(2); +		case 'e': +			features &= ~(1ULL << VIRTIO_RING_F_EVENT_IDX); +			break; +		case 'h': +			help(); +			goto done; +		case 'i': +			features &= ~(1ULL << VIRTIO_RING_F_INDIRECT_DESC); +			break; +		case 'D': +			delayed = true; +			break; +		default: +			assert(0); +			break; +		} +	} + +done: +	vdev_info_init(&dev, features); +	vq_info_add(&dev, 256); +	run_test(&dev, &dev.vqs[0], delayed, 0x100000); +	return 0; +} diff --git a/tools/virtio/vringh_test.c b/tools/virtio/vringh_test.c new file mode 100644 index 00000000000..14a4f4cab5b --- /dev/null +++ b/tools/virtio/vringh_test.c @@ -0,0 +1,746 @@ +/* Simple test of virtio code, entirely in userpsace. 
*/ +#define _GNU_SOURCE +#include <sched.h> +#include <err.h> +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/virtio.h> +#include <linux/vringh.h> +#include <linux/virtio_ring.h> +#include <linux/uaccess.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/wait.h> +#include <fcntl.h> + +#define USER_MEM (1024*1024) +void *__user_addr_min, *__user_addr_max; +void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end; +static u64 user_addr_offset; + +#define RINGSIZE 256 +#define ALIGN 4096 + +static bool never_notify_host(struct virtqueue *vq) +{ +	abort(); +} + +static void never_callback_guest(struct virtqueue *vq) +{ +	abort(); +} + +static bool getrange_iov(struct vringh *vrh, u64 addr, struct vringh_range *r) +{ +	if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset) +		return false; +	if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset) +		return false; + +	r->start = (u64)(unsigned long)__user_addr_min - user_addr_offset; +	r->end_incl = (u64)(unsigned long)__user_addr_max - 1 - user_addr_offset; +	r->offset = user_addr_offset; +	return true; +} + +/* We return single byte ranges. 
*/ +static bool getrange_slow(struct vringh *vrh, u64 addr, struct vringh_range *r) +{ +	if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset) +		return false; +	if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset) +		return false; + +	r->start = addr; +	r->end_incl = r->start; +	r->offset = user_addr_offset; +	return true; +} + +struct guest_virtio_device { +	struct virtio_device vdev; +	int to_host_fd; +	unsigned long notifies; +}; + +static bool parallel_notify_host(struct virtqueue *vq) +{ +	int rc; +	struct guest_virtio_device *gvdev; + +	gvdev = container_of(vq->vdev, struct guest_virtio_device, vdev); +	rc = write(gvdev->to_host_fd, "", 1); +	if (rc < 0) +		return false; +	gvdev->notifies++; +	return true; +} + +static bool no_notify_host(struct virtqueue *vq) +{ +	return true; +} + +#define NUM_XFERS (10000000) + +/* We aim for two "distant" cpus. */ +static void find_cpus(unsigned int *first, unsigned int *last) +{ +	unsigned int i; + +	*first = -1U; +	*last = 0; +	for (i = 0; i < 4096; i++) { +		cpu_set_t set; +		CPU_ZERO(&set); +		CPU_SET(i, &set); +		if (sched_setaffinity(getpid(), sizeof(set), &set) == 0) { +			if (i < *first) +				*first = i; +			if (i > *last) +				*last = i; +		} +	} +} + +/* Opencoded version for fast mode */ +static inline int vringh_get_head(struct vringh *vrh, u16 *head) +{ +	u16 avail_idx, i; +	int err; + +	err = get_user(avail_idx, &vrh->vring.avail->idx); +	if (err) +		return err; + +	if (vrh->last_avail_idx == avail_idx) +		return 0; + +	/* Only get avail ring entries after they have been exposed by guest. 
*/ +	virtio_rmb(vrh->weak_barriers); + +	i = vrh->last_avail_idx & (vrh->vring.num - 1); + +	err = get_user(*head, &vrh->vring.avail->ring[i]); +	if (err) +		return err; + +	vrh->last_avail_idx++; +	return 1; +} + +static int parallel_test(unsigned long features, +			 bool (*getrange)(struct vringh *vrh, +					  u64 addr, struct vringh_range *r), +			 bool fast_vringh) +{ +	void *host_map, *guest_map; +	int fd, mapsize, to_guest[2], to_host[2]; +	unsigned long xfers = 0, notifies = 0, receives = 0; +	unsigned int first_cpu, last_cpu; +	cpu_set_t cpu_set; +	char buf[128]; + +	/* Create real file to mmap. */ +	fd = open("/tmp/vringh_test-file", O_RDWR|O_CREAT|O_TRUNC, 0600); +	if (fd < 0) +		err(1, "Opening /tmp/vringh_test-file"); + +	/* Extra room at the end for some data, and indirects */ +	mapsize = vring_size(RINGSIZE, ALIGN) +		+ RINGSIZE * 2 * sizeof(int) +		+ RINGSIZE * 6 * sizeof(struct vring_desc); +	mapsize = (mapsize + getpagesize() - 1) & ~(getpagesize() - 1); +	ftruncate(fd, mapsize); + +	/* Parent and child use separate addresses, to check our mapping logic! */ +	host_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); +	guest_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + +	pipe(to_guest); +	pipe(to_host); + +	CPU_ZERO(&cpu_set); +	find_cpus(&first_cpu, &last_cpu); +	printf("Using CPUS %u and %u\n", first_cpu, last_cpu); +	fflush(stdout); + +	if (fork() != 0) { +		struct vringh vrh; +		int status, err, rlen = 0; +		char rbuf[5]; + +		/* We are the host: never access guest addresses! 
*/ +		munmap(guest_map, mapsize); + +		__user_addr_min = host_map; +		__user_addr_max = __user_addr_min + mapsize; +		user_addr_offset = host_map - guest_map; +		assert(user_addr_offset); + +		close(to_guest[0]); +		close(to_host[1]); + +		vring_init(&vrh.vring, RINGSIZE, host_map, ALIGN); +		vringh_init_user(&vrh, features, RINGSIZE, true, +				 vrh.vring.desc, vrh.vring.avail, vrh.vring.used); +		CPU_SET(first_cpu, &cpu_set); +		if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set)) +			errx(1, "Could not set affinity to cpu %u", first_cpu); + +		while (xfers < NUM_XFERS) { +			struct iovec host_riov[2], host_wiov[2]; +			struct vringh_iov riov, wiov; +			u16 head, written; + +			if (fast_vringh) { +				for (;;) { +					err = vringh_get_head(&vrh, &head); +					if (err != 0) +						break; +					err = vringh_need_notify_user(&vrh); +					if (err < 0) +						errx(1, "vringh_need_notify_user: %i", +						     err); +					if (err) { +						write(to_guest[1], "", 1); +						notifies++; +					} +				} +				if (err != 1) +					errx(1, "vringh_get_head"); +				written = 0; +				goto complete; +			} else { +				vringh_iov_init(&riov, +						host_riov, +						ARRAY_SIZE(host_riov)); +				vringh_iov_init(&wiov, +						host_wiov, +						ARRAY_SIZE(host_wiov)); + +				err = vringh_getdesc_user(&vrh, &riov, &wiov, +							  getrange, &head); +			} +			if (err == 0) { +				err = vringh_need_notify_user(&vrh); +				if (err < 0) +					errx(1, "vringh_need_notify_user: %i", +					     err); +				if (err) { +					write(to_guest[1], "", 1); +					notifies++; +				} + +				if (!vringh_notify_enable_user(&vrh)) +					continue; + +				/* Swallow all notifies at once. */ +				if (read(to_host[0], buf, sizeof(buf)) < 1) +					break; + +				vringh_notify_disable_user(&vrh); +				receives++; +				continue; +			} +			if (err != 1) +				errx(1, "vringh_getdesc_user: %i", err); + +			/* We simply copy bytes. 
*/ +			if (riov.used) { +				rlen = vringh_iov_pull_user(&riov, rbuf, +							    sizeof(rbuf)); +				if (rlen != 4) +					errx(1, "vringh_iov_pull_user: %i", +					     rlen); +				assert(riov.i == riov.used); +				written = 0; +			} else { +				err = vringh_iov_push_user(&wiov, rbuf, rlen); +				if (err != rlen) +					errx(1, "vringh_iov_push_user: %i", +					     err); +				assert(wiov.i == wiov.used); +				written = err; +			} +		complete: +			xfers++; + +			err = vringh_complete_user(&vrh, head, written); +			if (err != 0) +				errx(1, "vringh_complete_user: %i", err); +		} + +		err = vringh_need_notify_user(&vrh); +		if (err < 0) +			errx(1, "vringh_need_notify_user: %i", err); +		if (err) { +			write(to_guest[1], "", 1); +			notifies++; +		} +		wait(&status); +		if (!WIFEXITED(status)) +			errx(1, "Child died with signal %i?", WTERMSIG(status)); +		if (WEXITSTATUS(status) != 0) +			errx(1, "Child exited %i?", WEXITSTATUS(status)); +		printf("Host: notified %lu, pinged %lu\n", notifies, receives); +		return 0; +	} else { +		struct guest_virtio_device gvdev; +		struct virtqueue *vq; +		unsigned int *data; +		struct vring_desc *indirects; +		unsigned int finished = 0; + +		/* We pass sg[]s pointing into here, but we need RINGSIZE+1 */ +		data = guest_map + vring_size(RINGSIZE, ALIGN); +		indirects = (void *)data + (RINGSIZE + 1) * 2 * sizeof(int); + +		/* We are the guest. */ +		munmap(host_map, mapsize); + +		close(to_guest[1]); +		close(to_host[0]); + +		gvdev.vdev.features[0] = features; +		gvdev.to_host_fd = to_host[1]; +		gvdev.notifies = 0; + +		CPU_SET(first_cpu, &cpu_set); +		if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set)) +			err(1, "Could not set affinity to cpu %u", first_cpu); + +		vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &gvdev.vdev, true, +					 guest_map, fast_vringh ? no_notify_host +					 : parallel_notify_host, +					 never_callback_guest, "guest vq"); + +		/* Don't kfree indirects. 
*/ +		__kfree_ignore_start = indirects; +		__kfree_ignore_end = indirects + RINGSIZE * 6; + +		while (xfers < NUM_XFERS) { +			struct scatterlist sg[4]; +			unsigned int num_sg, len; +			int *dbuf, err; +			bool output = !(xfers % 2); + +			/* Consume bufs. */ +			while ((dbuf = virtqueue_get_buf(vq, &len)) != NULL) { +				if (len == 4) +					assert(*dbuf == finished - 1); +				else if (!fast_vringh) +					assert(*dbuf == finished); +				finished++; +			} + +			/* Produce a buffer. */ +			dbuf = data + (xfers % (RINGSIZE + 1)); + +			if (output) +				*dbuf = xfers; +			else +				*dbuf = -1; + +			switch ((xfers / sizeof(*dbuf)) % 4) { +			case 0: +				/* Nasty three-element sg list. */ +				sg_init_table(sg, num_sg = 3); +				sg_set_buf(&sg[0], (void *)dbuf, 1); +				sg_set_buf(&sg[1], (void *)dbuf + 1, 2); +				sg_set_buf(&sg[2], (void *)dbuf + 3, 1); +				break; +			case 1: +				sg_init_table(sg, num_sg = 2); +				sg_set_buf(&sg[0], (void *)dbuf, 1); +				sg_set_buf(&sg[1], (void *)dbuf + 1, 3); +				break; +			case 2: +				sg_init_table(sg, num_sg = 1); +				sg_set_buf(&sg[0], (void *)dbuf, 4); +				break; +			case 3: +				sg_init_table(sg, num_sg = 4); +				sg_set_buf(&sg[0], (void *)dbuf, 1); +				sg_set_buf(&sg[1], (void *)dbuf + 1, 1); +				sg_set_buf(&sg[2], (void *)dbuf + 2, 1); +				sg_set_buf(&sg[3], (void *)dbuf + 3, 1); +				break; +			} + +			/* May allocate an indirect, so force it to allocate +			 * user addr */ +			__kmalloc_fake = indirects + (xfers % RINGSIZE) * 4; +			if (output) +				err = virtqueue_add_outbuf(vq, sg, num_sg, dbuf, +							   GFP_KERNEL); +			else +				err = virtqueue_add_inbuf(vq, sg, num_sg, +							  dbuf, GFP_KERNEL); + +			if (err == -ENOSPC) { +				if (!virtqueue_enable_cb_delayed(vq)) +					continue; +				/* Swallow all notifies at once. 
*/ +				if (read(to_guest[0], buf, sizeof(buf)) < 1) +					break; +				 +				receives++; +				virtqueue_disable_cb(vq); +				continue; +			} + +			if (err) +				errx(1, "virtqueue_add_in/outbuf: %i", err); + +			xfers++; +			virtqueue_kick(vq); +		} + +		/* Any extra? */ +		while (finished != xfers) { +			int *dbuf; +			unsigned int len; + +			/* Consume bufs. */ +			dbuf = virtqueue_get_buf(vq, &len); +			if (dbuf) { +				if (len == 4) +					assert(*dbuf == finished - 1); +				else +					assert(len == 0); +				finished++; +				continue; +			} + +			if (!virtqueue_enable_cb_delayed(vq)) +				continue; +			if (read(to_guest[0], buf, sizeof(buf)) < 1) +				break; +				 +			receives++; +			virtqueue_disable_cb(vq); +		} + +		printf("Guest: notified %lu, pinged %lu\n", +		       gvdev.notifies, receives); +		vring_del_virtqueue(vq); +		return 0; +	} +} + +int main(int argc, char *argv[]) +{ +	struct virtio_device vdev; +	struct virtqueue *vq; +	struct vringh vrh; +	struct scatterlist guest_sg[RINGSIZE], *sgs[2]; +	struct iovec host_riov[2], host_wiov[2]; +	struct vringh_iov riov, wiov; +	struct vring_used_elem used[RINGSIZE]; +	char buf[28]; +	u16 head; +	int err; +	unsigned i; +	void *ret; +	bool (*getrange)(struct vringh *vrh, u64 addr, struct vringh_range *r); +	bool fast_vringh = false, parallel = false; + +	getrange = getrange_iov; +	vdev.features[0] = 0; + +	while (argv[1]) { +		if (strcmp(argv[1], "--indirect") == 0) +			vdev.features[0] |= (1 << VIRTIO_RING_F_INDIRECT_DESC); +		else if (strcmp(argv[1], "--eventidx") == 0) +			vdev.features[0] |= (1 << VIRTIO_RING_F_EVENT_IDX); +		else if (strcmp(argv[1], "--slow-range") == 0) +			getrange = getrange_slow; +		else if (strcmp(argv[1], "--fast-vringh") == 0) +			fast_vringh = true; +		else if (strcmp(argv[1], "--parallel") == 0) +			parallel = true; +		else +			errx(1, "Unknown arg %s", argv[1]); +		argv++; +	} + +	if (parallel) +		return parallel_test(vdev.features[0], getrange, fast_vringh); + +	if 
(posix_memalign(&__user_addr_min, PAGE_SIZE, USER_MEM) != 0) +		abort(); +	__user_addr_max = __user_addr_min + USER_MEM; +	memset(__user_addr_min, 0, vring_size(RINGSIZE, ALIGN)); + +	/* Set up guest side. */ +	vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true, +				 __user_addr_min, +				 never_notify_host, never_callback_guest, +				 "guest vq"); + +	/* Set up host side. */ +	vring_init(&vrh.vring, RINGSIZE, __user_addr_min, ALIGN); +	vringh_init_user(&vrh, vdev.features[0], RINGSIZE, true, +			 vrh.vring.desc, vrh.vring.avail, vrh.vring.used); + +	/* No descriptor to get yet... */ +	err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); +	if (err != 0) +		errx(1, "vringh_getdesc_user: %i", err); + +	/* Guest puts in a descriptor. */ +	memcpy(__user_addr_max - 1, "a", 1); +	sg_init_table(guest_sg, 1); +	sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1); +	sg_init_table(guest_sg+1, 1); +	sg_set_buf(&guest_sg[1], __user_addr_max - 3, 2); +	sgs[0] = &guest_sg[0]; +	sgs[1] = &guest_sg[1]; + +	/* May allocate an indirect, so force it to allocate user addr */ +	__kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN); +	err = virtqueue_add_sgs(vq, sgs, 1, 1, &err, GFP_KERNEL); +	if (err) +		errx(1, "virtqueue_add_sgs: %i", err); +	__kmalloc_fake = NULL; + +	/* Host retreives it. 
*/ +	vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); +	vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); + +	err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); +	if (err != 1) +		errx(1, "vringh_getdesc_user: %i", err); + +	assert(riov.used == 1); +	assert(riov.iov[0].iov_base == __user_addr_max - 1); +	assert(riov.iov[0].iov_len == 1); +	if (getrange != getrange_slow) { +		assert(wiov.used == 1); +		assert(wiov.iov[0].iov_base == __user_addr_max - 3); +		assert(wiov.iov[0].iov_len == 2); +	} else { +		assert(wiov.used == 2); +		assert(wiov.iov[0].iov_base == __user_addr_max - 3); +		assert(wiov.iov[0].iov_len == 1); +		assert(wiov.iov[1].iov_base == __user_addr_max - 2); +		assert(wiov.iov[1].iov_len == 1); +	} + +	err = vringh_iov_pull_user(&riov, buf, 5); +	if (err != 1) +		errx(1, "vringh_iov_pull_user: %i", err); +	assert(buf[0] == 'a'); +	assert(riov.i == 1); +	assert(vringh_iov_pull_user(&riov, buf, 5) == 0); + +	memcpy(buf, "bcdef", 5); +	err = vringh_iov_push_user(&wiov, buf, 5); +	if (err != 2) +		errx(1, "vringh_iov_push_user: %i", err); +	assert(memcmp(__user_addr_max - 3, "bc", 2) == 0); +	assert(wiov.i == wiov.used); +	assert(vringh_iov_push_user(&wiov, buf, 5) == 0); + +	/* Host is done. */ +	err = vringh_complete_user(&vrh, head, err); +	if (err != 0) +		errx(1, "vringh_complete_user: %i", err); + +	/* Guest should see used token now. */ +	__kfree_ignore_start = __user_addr_min + vring_size(RINGSIZE, ALIGN); +	__kfree_ignore_end = __kfree_ignore_start + 1; +	ret = virtqueue_get_buf(vq, &i); +	if (ret != &err) +		errx(1, "virtqueue_get_buf: %p", ret); +	assert(i == 2); + +	/* Guest puts in a huge descriptor. */ +	sg_init_table(guest_sg, RINGSIZE); +	for (i = 0; i < RINGSIZE; i++) { +		sg_set_buf(&guest_sg[i], +			   __user_addr_max - USER_MEM/4, USER_MEM/4); +	} + +	/* Fill contents with recognisable garbage. 
*/ +	for (i = 0; i < USER_MEM/4; i++) +		((char *)__user_addr_max - USER_MEM/4)[i] = i; + +	/* This will allocate an indirect, so force it to allocate user addr */ +	__kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN); +	err = virtqueue_add_outbuf(vq, guest_sg, RINGSIZE, &err, GFP_KERNEL); +	if (err) +		errx(1, "virtqueue_add_outbuf (large): %i", err); +	__kmalloc_fake = NULL; + +	/* Host picks it up (allocates new iov). */ +	vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); +	vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); + +	err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); +	if (err != 1) +		errx(1, "vringh_getdesc_user: %i", err); + +	assert(riov.max_num & VRINGH_IOV_ALLOCATED); +	assert(riov.iov != host_riov); +	if (getrange != getrange_slow) +		assert(riov.used == RINGSIZE); +	else +		assert(riov.used == RINGSIZE * USER_MEM/4); + +	assert(!(wiov.max_num & VRINGH_IOV_ALLOCATED)); +	assert(wiov.used == 0); + +	/* Pull data back out (in odd chunks), should be as expected. */ +	for (i = 0; i < RINGSIZE * USER_MEM/4; i += 3) { +		err = vringh_iov_pull_user(&riov, buf, 3); +		if (err != 3 && i + err != RINGSIZE * USER_MEM/4) +			errx(1, "vringh_iov_pull_user large: %i", err); +		assert(buf[0] == (char)i); +		assert(err < 2 || buf[1] == (char)(i + 1)); +		assert(err < 3 || buf[2] == (char)(i + 2)); +	} +	assert(riov.i == riov.used); +	vringh_iov_cleanup(&riov); +	vringh_iov_cleanup(&wiov); + +	/* Complete using multi interface, just because we can. */ +	used[0].id = head; +	used[0].len = 0; +	err = vringh_complete_multi_user(&vrh, used, 1); +	if (err) +		errx(1, "vringh_complete_multi_user(1): %i", err); + +	/* Free up those descriptors. */ +	ret = virtqueue_get_buf(vq, &i); +	if (ret != &err) +		errx(1, "virtqueue_get_buf: %p", ret); + +	/* Add lots of descriptors. 
*/ +	sg_init_table(guest_sg, 1); +	sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1); +	for (i = 0; i < RINGSIZE; i++) { +		err = virtqueue_add_outbuf(vq, guest_sg, 1, &err, GFP_KERNEL); +		if (err) +			errx(1, "virtqueue_add_outbuf (multiple): %i", err); +	} + +	/* Now get many, and consume them all at once. */ +	vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); +	vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); + +	for (i = 0; i < RINGSIZE; i++) { +		err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); +		if (err != 1) +			errx(1, "vringh_getdesc_user: %i", err); +		used[i].id = head; +		used[i].len = 0; +	} +	/* Make sure it wraps around ring, to test! */ +	assert(vrh.vring.used->idx % RINGSIZE != 0); +	err = vringh_complete_multi_user(&vrh, used, RINGSIZE); +	if (err) +		errx(1, "vringh_complete_multi_user: %i", err); + +	/* Free those buffers. */ +	for (i = 0; i < RINGSIZE; i++) { +		unsigned len; +		assert(virtqueue_get_buf(vq, &len) != NULL); +	} + +	/* Test weird (but legal!) indirect. */ +	if (vdev.features[0] & (1 << VIRTIO_RING_F_INDIRECT_DESC)) { +		char *data = __user_addr_max - USER_MEM/4; +		struct vring_desc *d = __user_addr_max - USER_MEM/2; +		struct vring vring; + +		/* Force creation of direct, which we modify. */ +		vdev.features[0] &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC); +		vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true, +					 __user_addr_min, +					 never_notify_host, +					 never_callback_guest, +					 "guest vq"); + +		sg_init_table(guest_sg, 4); +		sg_set_buf(&guest_sg[0], d, sizeof(*d)*2); +		sg_set_buf(&guest_sg[1], d + 2, sizeof(*d)*1); +		sg_set_buf(&guest_sg[2], data + 6, 4); +		sg_set_buf(&guest_sg[3], d + 3, sizeof(*d)*3); + +		err = virtqueue_add_outbuf(vq, guest_sg, 4, &err, GFP_KERNEL); +		if (err) +			errx(1, "virtqueue_add_outbuf (indirect): %i", err); + +		vring_init(&vring, RINGSIZE, __user_addr_min, ALIGN); + +		/* They're used in order, but double-check... 
*/ +		assert(vring.desc[0].addr == (unsigned long)d); +		assert(vring.desc[1].addr == (unsigned long)(d+2)); +		assert(vring.desc[2].addr == (unsigned long)data + 6); +		assert(vring.desc[3].addr == (unsigned long)(d+3)); +		vring.desc[0].flags |= VRING_DESC_F_INDIRECT; +		vring.desc[1].flags |= VRING_DESC_F_INDIRECT; +		vring.desc[3].flags |= VRING_DESC_F_INDIRECT; + +		/* First indirect */ +		d[0].addr = (unsigned long)data; +		d[0].len = 1; +		d[0].flags = VRING_DESC_F_NEXT; +		d[0].next = 1; +		d[1].addr = (unsigned long)data + 1; +		d[1].len = 2; +		d[1].flags = 0; + +		/* Second indirect */ +		d[2].addr = (unsigned long)data + 3; +		d[2].len = 3; +		d[2].flags = 0; + +		/* Third indirect */ +		d[3].addr = (unsigned long)data + 10; +		d[3].len = 5; +		d[3].flags = VRING_DESC_F_NEXT; +		d[3].next = 1; +		d[4].addr = (unsigned long)data + 15; +		d[4].len = 6; +		d[4].flags = VRING_DESC_F_NEXT; +		d[4].next = 2; +		d[5].addr = (unsigned long)data + 21; +		d[5].len = 7; +		d[5].flags = 0; + +		/* Host picks it up (allocates new iov). */ +		vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); +		vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); + +		err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); +		if (err != 1) +			errx(1, "vringh_getdesc_user: %i", err); + +		if (head != 0) +			errx(1, "vringh_getdesc_user: head %i not 0", head); + +		assert(riov.max_num & VRINGH_IOV_ALLOCATED); +		if (getrange != getrange_slow) +			assert(riov.used == 7); +		else +			assert(riov.used == 28); +		err = vringh_iov_pull_user(&riov, buf, 29); +		assert(err == 28); + +		/* Data should be linear. */ +		for (i = 0; i < err; i++) +			assert(buf[i] == i); +		vringh_iov_cleanup(&riov); +	} + +	/* Don't leak memory... */ +	vring_del_virtqueue(vq); +	free(__user_addr_min); + +	return 0; +}  | 
