Diffstat (limited to 'drivers/hv')
-rw-r--r--  drivers/hv/Kconfig        |   22
-rw-r--r--  drivers/hv/Makefile       |    8
-rw-r--r--  drivers/hv/channel.c      |  829
-rw-r--r--  drivers/hv/channel_mgmt.c |  873
-rw-r--r--  drivers/hv/connection.c   |  455
-rw-r--r--  drivers/hv/hv.c           |  426
-rw-r--r--  drivers/hv/hv_balloon.c   | 1553
-rw-r--r--  drivers/hv/hv_fcopy.c     |  414
-rw-r--r--  drivers/hv/hv_kvp.c       |  715
-rw-r--r--  drivers/hv/hv_snapshot.c  |  281
-rw-r--r--  drivers/hv/hv_util.c      |  446
-rw-r--r--  drivers/hv/hyperv_vmbus.h |  682
-rw-r--r--  drivers/hv/ring_buffer.c  |  561
-rw-r--r--  drivers/hv/vmbus_drv.c    |  973
14 files changed, 8238 insertions, 0 deletions
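The hunks below introduce the public VMBus channel API exported from channel.c (vmbus_open(), vmbus_sendpacket(), vmbus_recvpacket(), vmbus_close() and the GPADL helpers). As orientation before the raw diff, here is a minimal sketch of how a hypothetical client driver might drive that API from its probe path; the ring-buffer size, the my_dev context, the hello payload and the VM_PKT_DATA_INBAND packet type are illustrative assumptions, not code from this commit.

/*
 * Hypothetical VMBus client, for illustration only; not part of this commit.
 */
#include <linux/hyperv.h>
#include <linux/slab.h>

#define MY_RING_SIZE	(PAGE_SIZE * 4)		/* assumed ring size, page aligned */

struct my_dev {
	struct hv_device *hdev;
	u8 recv_buf[PAGE_SIZE];
};

/* Channel callback: read whatever the host has posted on the inbound ring. */
static void my_channel_cb(void *context)
{
	struct my_dev *md = context;
	u32 recvlen = 0;
	u64 requestid;

	if (vmbus_recvpacket(md->hdev->channel, md->recv_buf,
			     sizeof(md->recv_buf), &recvlen, &requestid))
		return;
	if (recvlen)
		pr_info("received %u byte packet, id %llu\n", recvlen, requestid);
}

static int my_probe(struct hv_device *hdev,
		    const struct hv_vmbus_device_id *id)
{
	struct my_dev *md;
	u32 hello = 0;		/* illustrative payload */
	int ret;

	md = kzalloc(sizeof(*md), GFP_KERNEL);
	if (!md)
		return -ENOMEM;
	md->hdev = hdev;

	/* Open the channel: send/recv ring sizes, no user data, our callback. */
	ret = vmbus_open(hdev->channel, MY_RING_SIZE, MY_RING_SIZE,
			 NULL, 0, my_channel_cb, md);
	if (ret) {
		kfree(md);
		return ret;
	}

	/* Send one in-band packet; a completion is requested so the host replies. */
	ret = vmbus_sendpacket(hdev->channel, &hello, sizeof(hello), 1,
			       VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret) {
		vmbus_close(hdev->channel);
		kfree(md);
		return ret;
	}

	hv_set_drvdata(hdev, md);
	return 0;
}

static int my_remove(struct hv_device *hdev)
{
	struct my_dev *md = hv_get_drvdata(hdev);

	vmbus_close(hdev->channel);
	kfree(md);
	return 0;
}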
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig new file mode 100644 index 00000000000..0403b51d20b --- /dev/null +++ b/drivers/hv/Kconfig @@ -0,0 +1,22 @@ +menu "Microsoft Hyper-V guest support" + +config HYPERV +	tristate "Microsoft Hyper-V client drivers" +	depends on X86 && ACPI && PCI && X86_LOCAL_APIC && HYPERVISOR_GUEST +	help +	  Select this option to run Linux as a Hyper-V client operating +	  system. + +config HYPERV_UTILS +	tristate "Microsoft Hyper-V Utilities driver" +	depends on HYPERV && CONNECTOR && NLS +	help +	  Select this option to enable the Hyper-V Utilities. + +config HYPERV_BALLOON +	tristate "Microsoft Hyper-V Balloon driver" +	depends on HYPERV +	help +	  Select this option to enable Hyper-V Balloon driver. + +endmenu diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile new file mode 100644 index 00000000000..5e4dfa4cfe2 --- /dev/null +++ b/drivers/hv/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_HYPERV)		+= hv_vmbus.o +obj-$(CONFIG_HYPERV_UTILS)	+= hv_utils.o +obj-$(CONFIG_HYPERV_BALLOON)	+= hv_balloon.o + +hv_vmbus-y := vmbus_drv.o \ +		 hv.o connection.o channel.o \ +		 channel_mgmt.o ring_buffer.o +hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_fcopy.o diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c new file mode 100644 index 00000000000..284cf66489f --- /dev/null +++ b/drivers/hv/channel.c @@ -0,0 +1,829 @@ +/* + * Copyright (c) 2009, Microsoft Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Authors: + *   Haiyang Zhang <haiyangz@microsoft.com> + *   Hank Janssen  <hjanssen@microsoft.com> + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/hyperv.h> +#include <linux/uio.h> + +#include "hyperv_vmbus.h" + +#define NUM_PAGES_SPANNED(addr, len) \ +((PAGE_ALIGN(addr + len) >> PAGE_SHIFT) - (addr >> PAGE_SHIFT)) + +/* + * vmbus_setevent- Trigger an event notification on the specified + * channel. + */ +static void vmbus_setevent(struct vmbus_channel *channel) +{ +	struct hv_monitor_page *monitorpage; + +	if (channel->offermsg.monitor_allocated) { +		/* Each u32 represents 32 channels */ +		sync_set_bit(channel->offermsg.child_relid & 31, +			(unsigned long *) vmbus_connection.send_int_page + +			(channel->offermsg.child_relid >> 5)); + +		/* Get the child to parent monitor page */ +		monitorpage = vmbus_connection.monitor_pages[1]; + +		sync_set_bit(channel->monitor_bit, +			(unsigned long *)&monitorpage->trigger_group +					[channel->monitor_grp].pending); + +	} else { +		vmbus_set_event(channel); +	} +} + +/* + * vmbus_open - Open the specified channel. 
+ */ +int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, +		     u32 recv_ringbuffer_size, void *userdata, u32 userdatalen, +		     void (*onchannelcallback)(void *context), void *context) +{ +	struct vmbus_channel_open_channel *open_msg; +	struct vmbus_channel_msginfo *open_info = NULL; +	void *in, *out; +	unsigned long flags; +	int ret, t, err = 0; + +	spin_lock_irqsave(&newchannel->sc_lock, flags); +	if (newchannel->state == CHANNEL_OPEN_STATE) { +		newchannel->state = CHANNEL_OPENING_STATE; +	} else { +		spin_unlock_irqrestore(&newchannel->sc_lock, flags); +		return -EINVAL; +	} +	spin_unlock_irqrestore(&newchannel->sc_lock, flags); + +	newchannel->onchannel_callback = onchannelcallback; +	newchannel->channel_callback_context = context; + +	/* Allocate the ring buffer */ +	out = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, +		get_order(send_ringbuffer_size + recv_ringbuffer_size)); + +	if (!out) +		return -ENOMEM; + + +	in = (void *)((unsigned long)out + send_ringbuffer_size); + +	newchannel->ringbuffer_pages = out; +	newchannel->ringbuffer_pagecount = (send_ringbuffer_size + +					   recv_ringbuffer_size) >> PAGE_SHIFT; + +	ret = hv_ringbuffer_init( +		&newchannel->outbound, out, send_ringbuffer_size); + +	if (ret != 0) { +		err = ret; +		goto error0; +	} + +	ret = hv_ringbuffer_init( +		&newchannel->inbound, in, recv_ringbuffer_size); +	if (ret != 0) { +		err = ret; +		goto error0; +	} + + +	/* Establish the gpadl for the ring buffer */ +	newchannel->ringbuffer_gpadlhandle = 0; + +	ret = vmbus_establish_gpadl(newchannel, +					 newchannel->outbound.ring_buffer, +					 send_ringbuffer_size + +					 recv_ringbuffer_size, +					 &newchannel->ringbuffer_gpadlhandle); + +	if (ret != 0) { +		err = ret; +		goto error0; +	} + +	/* Create and init the channel open message */ +	open_info = kmalloc(sizeof(*open_info) + +			   sizeof(struct vmbus_channel_open_channel), +			   GFP_KERNEL); +	if (!open_info) { +		err = -ENOMEM; +		goto error0; +	} + +	init_completion(&open_info->waitevent); + +	open_msg = (struct vmbus_channel_open_channel *)open_info->msg; +	open_msg->header.msgtype = CHANNELMSG_OPENCHANNEL; +	open_msg->openid = newchannel->offermsg.child_relid; +	open_msg->child_relid = newchannel->offermsg.child_relid; +	open_msg->ringbuffer_gpadlhandle = newchannel->ringbuffer_gpadlhandle; +	open_msg->downstream_ringbuffer_pageoffset = send_ringbuffer_size >> +						  PAGE_SHIFT; +	open_msg->target_vp = newchannel->target_vp; + +	if (userdatalen > MAX_USER_DEFINED_BYTES) { +		err = -EINVAL; +		goto error0; +	} + +	if (userdatalen) +		memcpy(open_msg->userdata, userdata, userdatalen); + +	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); +	list_add_tail(&open_info->msglistentry, +		      &vmbus_connection.chn_msg_list); +	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + +	ret = vmbus_post_msg(open_msg, +			       sizeof(struct vmbus_channel_open_channel)); + +	if (ret != 0) +		goto error1; + +	t = wait_for_completion_timeout(&open_info->waitevent, 5*HZ); +	if (t == 0) { +		err = -ETIMEDOUT; +		goto error1; +	} + + +	if (open_info->response.open_result.status) +		err = open_info->response.open_result.status; + +	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); +	list_del(&open_info->msglistentry); +	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + +	if (err == 0) +		newchannel->state = CHANNEL_OPENED_STATE; + +	kfree(open_info); +	return err; + +error1: +	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, 
flags); +	list_del(&open_info->msglistentry); +	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + +error0: +	free_pages((unsigned long)out, +		get_order(send_ringbuffer_size + recv_ringbuffer_size)); +	kfree(open_info); +	return err; +} +EXPORT_SYMBOL_GPL(vmbus_open); + +/* + * create_gpadl_header - Creates a gpadl for the specified buffer + */ +static int create_gpadl_header(void *kbuffer, u32 size, +					 struct vmbus_channel_msginfo **msginfo, +					 u32 *messagecount) +{ +	int i; +	int pagecount; +	struct vmbus_channel_gpadl_header *gpadl_header; +	struct vmbus_channel_gpadl_body *gpadl_body; +	struct vmbus_channel_msginfo *msgheader; +	struct vmbus_channel_msginfo *msgbody = NULL; +	u32 msgsize; + +	int pfnsum, pfncount, pfnleft, pfncurr, pfnsize; + +	pagecount = size >> PAGE_SHIFT; + +	/* do we need a gpadl body msg */ +	pfnsize = MAX_SIZE_CHANNEL_MESSAGE - +		  sizeof(struct vmbus_channel_gpadl_header) - +		  sizeof(struct gpa_range); +	pfncount = pfnsize / sizeof(u64); + +	if (pagecount > pfncount) { +		/* we need a gpadl body */ +		/* fill in the header */ +		msgsize = sizeof(struct vmbus_channel_msginfo) + +			  sizeof(struct vmbus_channel_gpadl_header) + +			  sizeof(struct gpa_range) + pfncount * sizeof(u64); +		msgheader =  kzalloc(msgsize, GFP_KERNEL); +		if (!msgheader) +			goto nomem; + +		INIT_LIST_HEAD(&msgheader->submsglist); +		msgheader->msgsize = msgsize; + +		gpadl_header = (struct vmbus_channel_gpadl_header *) +			msgheader->msg; +		gpadl_header->rangecount = 1; +		gpadl_header->range_buflen = sizeof(struct gpa_range) + +					 pagecount * sizeof(u64); +		gpadl_header->range[0].byte_offset = 0; +		gpadl_header->range[0].byte_count = size; +		for (i = 0; i < pfncount; i++) +			gpadl_header->range[0].pfn_array[i] = slow_virt_to_phys( +				kbuffer + PAGE_SIZE * i) >> PAGE_SHIFT; +		*msginfo = msgheader; +		*messagecount = 1; + +		pfnsum = pfncount; +		pfnleft = pagecount - pfncount; + +		/* how many pfns can we fit */ +		pfnsize = MAX_SIZE_CHANNEL_MESSAGE - +			  sizeof(struct vmbus_channel_gpadl_body); +		pfncount = pfnsize / sizeof(u64); + +		/* fill in the body */ +		while (pfnleft) { +			if (pfnleft > pfncount) +				pfncurr = pfncount; +			else +				pfncurr = pfnleft; + +			msgsize = sizeof(struct vmbus_channel_msginfo) + +				  sizeof(struct vmbus_channel_gpadl_body) + +				  pfncurr * sizeof(u64); +			msgbody = kzalloc(msgsize, GFP_KERNEL); + +			if (!msgbody) { +				struct vmbus_channel_msginfo *pos = NULL; +				struct vmbus_channel_msginfo *tmp = NULL; +				/* +				 * Free up all the allocated messages. +				 */ +				list_for_each_entry_safe(pos, tmp, +					&msgheader->submsglist, +					msglistentry) { + +					list_del(&pos->msglistentry); +					kfree(pos); +				} + +				goto nomem; +			} + +			msgbody->msgsize = msgsize; +			(*messagecount)++; +			gpadl_body = +				(struct vmbus_channel_gpadl_body *)msgbody->msg; + +			/* +			 * Gpadl is u32 and we are using a pointer which could +			 * be 64-bit +			 * This is governed by the guest/host protocol and +			 * so the hypervisor gurantees that this is ok. 
+			 */ +			for (i = 0; i < pfncurr; i++) +				gpadl_body->pfn[i] = slow_virt_to_phys( +					kbuffer + PAGE_SIZE * (pfnsum + i)) >> +					PAGE_SHIFT; + +			/* add to msg header */ +			list_add_tail(&msgbody->msglistentry, +				      &msgheader->submsglist); +			pfnsum += pfncurr; +			pfnleft -= pfncurr; +		} +	} else { +		/* everything fits in a header */ +		msgsize = sizeof(struct vmbus_channel_msginfo) + +			  sizeof(struct vmbus_channel_gpadl_header) + +			  sizeof(struct gpa_range) + pagecount * sizeof(u64); +		msgheader = kzalloc(msgsize, GFP_KERNEL); +		if (msgheader == NULL) +			goto nomem; +		msgheader->msgsize = msgsize; + +		gpadl_header = (struct vmbus_channel_gpadl_header *) +			msgheader->msg; +		gpadl_header->rangecount = 1; +		gpadl_header->range_buflen = sizeof(struct gpa_range) + +					 pagecount * sizeof(u64); +		gpadl_header->range[0].byte_offset = 0; +		gpadl_header->range[0].byte_count = size; +		for (i = 0; i < pagecount; i++) +			gpadl_header->range[0].pfn_array[i] = slow_virt_to_phys( +				kbuffer + PAGE_SIZE * i) >> PAGE_SHIFT; + +		*msginfo = msgheader; +		*messagecount = 1; +	} + +	return 0; +nomem: +	kfree(msgheader); +	kfree(msgbody); +	return -ENOMEM; +} + +/* + * vmbus_establish_gpadl - Estabish a GPADL for the specified buffer + * + * @channel: a channel + * @kbuffer: from kmalloc or vmalloc + * @size: page-size multiple + * @gpadl_handle: some funky thing + */ +int vmbus_establish_gpadl(struct vmbus_channel *channel, void *kbuffer, +			       u32 size, u32 *gpadl_handle) +{ +	struct vmbus_channel_gpadl_header *gpadlmsg; +	struct vmbus_channel_gpadl_body *gpadl_body; +	struct vmbus_channel_msginfo *msginfo = NULL; +	struct vmbus_channel_msginfo *submsginfo; +	u32 msgcount; +	struct list_head *curr; +	u32 next_gpadl_handle; +	unsigned long flags; +	int ret = 0; +	int t; + +	next_gpadl_handle = atomic_read(&vmbus_connection.next_gpadl_handle); +	atomic_inc(&vmbus_connection.next_gpadl_handle); + +	ret = create_gpadl_header(kbuffer, size, &msginfo, &msgcount); +	if (ret) +		return ret; + +	init_completion(&msginfo->waitevent); + +	gpadlmsg = (struct vmbus_channel_gpadl_header *)msginfo->msg; +	gpadlmsg->header.msgtype = CHANNELMSG_GPADL_HEADER; +	gpadlmsg->child_relid = channel->offermsg.child_relid; +	gpadlmsg->gpadl = next_gpadl_handle; + + +	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); +	list_add_tail(&msginfo->msglistentry, +		      &vmbus_connection.chn_msg_list); + +	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + +	ret = vmbus_post_msg(gpadlmsg, msginfo->msgsize - +			       sizeof(*msginfo)); +	if (ret != 0) +		goto cleanup; + +	if (msgcount > 1) { +		list_for_each(curr, &msginfo->submsglist) { + +			submsginfo = (struct vmbus_channel_msginfo *)curr; +			gpadl_body = +			     (struct vmbus_channel_gpadl_body *)submsginfo->msg; + +			gpadl_body->header.msgtype = +				CHANNELMSG_GPADL_BODY; +			gpadl_body->gpadl = next_gpadl_handle; + +			ret = vmbus_post_msg(gpadl_body, +					       submsginfo->msgsize - +					       sizeof(*submsginfo)); +			if (ret != 0) +				goto cleanup; + +		} +	} +	t = wait_for_completion_timeout(&msginfo->waitevent, 5*HZ); +	BUG_ON(t == 0); + + +	/* At this point, we received the gpadl created msg */ +	*gpadl_handle = gpadlmsg->gpadl; + +cleanup: +	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); +	list_del(&msginfo->msglistentry); +	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + +	kfree(msginfo); +	return ret; +} +EXPORT_SYMBOL_GPL(vmbus_establish_gpadl); + +/* 
+ * vmbus_teardown_gpadl -Teardown the specified GPADL handle + */ +int vmbus_teardown_gpadl(struct vmbus_channel *channel, u32 gpadl_handle) +{ +	struct vmbus_channel_gpadl_teardown *msg; +	struct vmbus_channel_msginfo *info; +	unsigned long flags; +	int ret, t; + +	info = kmalloc(sizeof(*info) + +		       sizeof(struct vmbus_channel_gpadl_teardown), GFP_KERNEL); +	if (!info) +		return -ENOMEM; + +	init_completion(&info->waitevent); + +	msg = (struct vmbus_channel_gpadl_teardown *)info->msg; + +	msg->header.msgtype = CHANNELMSG_GPADL_TEARDOWN; +	msg->child_relid = channel->offermsg.child_relid; +	msg->gpadl = gpadl_handle; + +	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); +	list_add_tail(&info->msglistentry, +		      &vmbus_connection.chn_msg_list); +	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); +	ret = vmbus_post_msg(msg, +			       sizeof(struct vmbus_channel_gpadl_teardown)); + +	BUG_ON(ret != 0); +	t = wait_for_completion_timeout(&info->waitevent, 5*HZ); +	BUG_ON(t == 0); + +	/* Received a torndown response */ +	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); +	list_del(&info->msglistentry); +	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + +	kfree(info); +	return ret; +} +EXPORT_SYMBOL_GPL(vmbus_teardown_gpadl); + +static void reset_channel_cb(void *arg) +{ +	struct vmbus_channel *channel = arg; + +	channel->onchannel_callback = NULL; +} + +static void vmbus_close_internal(struct vmbus_channel *channel) +{ +	struct vmbus_channel_close_channel *msg; +	int ret; + +	channel->state = CHANNEL_OPEN_STATE; +	channel->sc_creation_callback = NULL; +	/* Stop callback and cancel the timer asap */ +	if (channel->target_cpu != smp_processor_id()) +		smp_call_function_single(channel->target_cpu, reset_channel_cb, +					 channel, true); +	else +		reset_channel_cb(channel); + +	/* Send a closing message */ + +	msg = &channel->close_msg.msg; + +	msg->header.msgtype = CHANNELMSG_CLOSECHANNEL; +	msg->child_relid = channel->offermsg.child_relid; + +	ret = vmbus_post_msg(msg, sizeof(struct vmbus_channel_close_channel)); + +	BUG_ON(ret != 0); +	/* Tear down the gpadl for the channel's ring buffer */ +	if (channel->ringbuffer_gpadlhandle) +		vmbus_teardown_gpadl(channel, +					  channel->ringbuffer_gpadlhandle); + +	/* Cleanup the ring buffers for this channel */ +	hv_ringbuffer_cleanup(&channel->outbound); +	hv_ringbuffer_cleanup(&channel->inbound); + +	free_pages((unsigned long)channel->ringbuffer_pages, +		get_order(channel->ringbuffer_pagecount * PAGE_SIZE)); + + +} + +/* + * vmbus_close - Close the specified channel + */ +void vmbus_close(struct vmbus_channel *channel) +{ +	struct list_head *cur, *tmp; +	struct vmbus_channel *cur_channel; + +	if (channel->primary_channel != NULL) { +		/* +		 * We will only close sub-channels when +		 * the primary is closed. +		 */ +		return; +	} +	/* +	 * Close all the sub-channels first and then close the +	 * primary channel. +	 */ +	list_for_each_safe(cur, tmp, &channel->sc_list) { +		cur_channel = list_entry(cur, struct vmbus_channel, sc_list); +		if (cur_channel->state != CHANNEL_OPENED_STATE) +			continue; +		vmbus_close_internal(cur_channel); +	} +	/* +	 * Now close the primary. +	 */ +	vmbus_close_internal(channel); +} +EXPORT_SYMBOL_GPL(vmbus_close); + +/** + * vmbus_sendpacket() - Send the specified buffer on the given channel + * @channel: Pointer to vmbus_channel structure. + * @buffer: Pointer to the buffer you want to receive the data into. 
+ * @bufferlen: Maximum size of what the the buffer will hold + * @requestid: Identifier of the request + * @type: Type of packet that is being send e.g. negotiate, time + * packet etc. + * + * Sends data in @buffer directly to hyper-v via the vmbus + * This will send the data unparsed to hyper-v. + * + * Mainly used by Hyper-V drivers. + */ +int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, +			   u32 bufferlen, u64 requestid, +			   enum vmbus_packet_type type, u32 flags) +{ +	struct vmpacket_descriptor desc; +	u32 packetlen = sizeof(struct vmpacket_descriptor) + bufferlen; +	u32 packetlen_aligned = ALIGN(packetlen, sizeof(u64)); +	struct kvec bufferlist[3]; +	u64 aligned_data = 0; +	int ret; +	bool signal = false; + + +	/* Setup the descriptor */ +	desc.type = type; /* VmbusPacketTypeDataInBand; */ +	desc.flags = flags; /* VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; */ +	/* in 8-bytes granularity */ +	desc.offset8 = sizeof(struct vmpacket_descriptor) >> 3; +	desc.len8 = (u16)(packetlen_aligned >> 3); +	desc.trans_id = requestid; + +	bufferlist[0].iov_base = &desc; +	bufferlist[0].iov_len = sizeof(struct vmpacket_descriptor); +	bufferlist[1].iov_base = buffer; +	bufferlist[1].iov_len = bufferlen; +	bufferlist[2].iov_base = &aligned_data; +	bufferlist[2].iov_len = (packetlen_aligned - packetlen); + +	ret = hv_ringbuffer_write(&channel->outbound, bufferlist, 3, &signal); + +	if (ret == 0 && signal) +		vmbus_setevent(channel); + +	return ret; +} +EXPORT_SYMBOL(vmbus_sendpacket); + +/* + * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer + * packets using a GPADL Direct packet type. + */ +int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, +				     struct hv_page_buffer pagebuffers[], +				     u32 pagecount, void *buffer, u32 bufferlen, +				     u64 requestid) +{ +	int ret; +	int i; +	struct vmbus_channel_packet_page_buffer desc; +	u32 descsize; +	u32 packetlen; +	u32 packetlen_aligned; +	struct kvec bufferlist[3]; +	u64 aligned_data = 0; +	bool signal = false; + +	if (pagecount > MAX_PAGE_BUFFER_COUNT) +		return -EINVAL; + + +	/* +	 * Adjust the size down since vmbus_channel_packet_page_buffer is the +	 * largest size we support +	 */ +	descsize = sizeof(struct vmbus_channel_packet_page_buffer) - +			  ((MAX_PAGE_BUFFER_COUNT - pagecount) * +			  sizeof(struct hv_page_buffer)); +	packetlen = descsize + bufferlen; +	packetlen_aligned = ALIGN(packetlen, sizeof(u64)); + +	/* Setup the descriptor */ +	desc.type = VM_PKT_DATA_USING_GPA_DIRECT; +	desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; +	desc.dataoffset8 = descsize >> 3; /* in 8-bytes grandularity */ +	desc.length8 = (u16)(packetlen_aligned >> 3); +	desc.transactionid = requestid; +	desc.rangecount = pagecount; + +	for (i = 0; i < pagecount; i++) { +		desc.range[i].len = pagebuffers[i].len; +		desc.range[i].offset = pagebuffers[i].offset; +		desc.range[i].pfn	 = pagebuffers[i].pfn; +	} + +	bufferlist[0].iov_base = &desc; +	bufferlist[0].iov_len = descsize; +	bufferlist[1].iov_base = buffer; +	bufferlist[1].iov_len = bufferlen; +	bufferlist[2].iov_base = &aligned_data; +	bufferlist[2].iov_len = (packetlen_aligned - packetlen); + +	ret = hv_ringbuffer_write(&channel->outbound, bufferlist, 3, &signal); + +	if (ret == 0 && signal) +		vmbus_setevent(channel); + +	return ret; +} +EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); + +/* + * vmbus_sendpacket_multipagebuffer - Send a multi-page buffer packet + * using a GPADL Direct packet type. 
+ */ +int vmbus_sendpacket_multipagebuffer(struct vmbus_channel *channel, +				struct hv_multipage_buffer *multi_pagebuffer, +				void *buffer, u32 bufferlen, u64 requestid) +{ +	int ret; +	struct vmbus_channel_packet_multipage_buffer desc; +	u32 descsize; +	u32 packetlen; +	u32 packetlen_aligned; +	struct kvec bufferlist[3]; +	u64 aligned_data = 0; +	bool signal = false; +	u32 pfncount = NUM_PAGES_SPANNED(multi_pagebuffer->offset, +					 multi_pagebuffer->len); + +	if (pfncount > MAX_MULTIPAGE_BUFFER_COUNT) +		return -EINVAL; + +	/* +	 * Adjust the size down since vmbus_channel_packet_multipage_buffer is +	 * the largest size we support +	 */ +	descsize = sizeof(struct vmbus_channel_packet_multipage_buffer) - +			  ((MAX_MULTIPAGE_BUFFER_COUNT - pfncount) * +			  sizeof(u64)); +	packetlen = descsize + bufferlen; +	packetlen_aligned = ALIGN(packetlen, sizeof(u64)); + + +	/* Setup the descriptor */ +	desc.type = VM_PKT_DATA_USING_GPA_DIRECT; +	desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; +	desc.dataoffset8 = descsize >> 3; /* in 8-bytes grandularity */ +	desc.length8 = (u16)(packetlen_aligned >> 3); +	desc.transactionid = requestid; +	desc.rangecount = 1; + +	desc.range.len = multi_pagebuffer->len; +	desc.range.offset = multi_pagebuffer->offset; + +	memcpy(desc.range.pfn_array, multi_pagebuffer->pfn_array, +	       pfncount * sizeof(u64)); + +	bufferlist[0].iov_base = &desc; +	bufferlist[0].iov_len = descsize; +	bufferlist[1].iov_base = buffer; +	bufferlist[1].iov_len = bufferlen; +	bufferlist[2].iov_base = &aligned_data; +	bufferlist[2].iov_len = (packetlen_aligned - packetlen); + +	ret = hv_ringbuffer_write(&channel->outbound, bufferlist, 3, &signal); + +	if (ret == 0 && signal) +		vmbus_setevent(channel); + +	return ret; +} +EXPORT_SYMBOL_GPL(vmbus_sendpacket_multipagebuffer); + +/** + * vmbus_recvpacket() - Retrieve the user packet on the specified channel + * @channel: Pointer to vmbus_channel structure. + * @buffer: Pointer to the buffer you want to receive the data into. + * @bufferlen: Maximum size of what the the buffer will hold + * @buffer_actual_len: The actual size of the data after it was received + * @requestid: Identifier of the request + * + * Receives directly from the hyper-v vmbus and puts the data it received + * into Buffer. This will receive the data unparsed from hyper-v. + * + * Mainly used by Hyper-V drivers. 
+ */ +int vmbus_recvpacket(struct vmbus_channel *channel, void *buffer, +			u32 bufferlen, u32 *buffer_actual_len, u64 *requestid) +{ +	struct vmpacket_descriptor desc; +	u32 packetlen; +	u32 userlen; +	int ret; +	bool signal = false; + +	*buffer_actual_len = 0; +	*requestid = 0; + + +	ret = hv_ringbuffer_peek(&channel->inbound, &desc, +			     sizeof(struct vmpacket_descriptor)); +	if (ret != 0) +		return 0; + +	packetlen = desc.len8 << 3; +	userlen = packetlen - (desc.offset8 << 3); + +	*buffer_actual_len = userlen; + +	if (userlen > bufferlen) { + +		pr_err("Buffer too small - got %d needs %d\n", +			   bufferlen, userlen); +		return -ETOOSMALL; +	} + +	*requestid = desc.trans_id; + +	/* Copy over the packet to the user buffer */ +	ret = hv_ringbuffer_read(&channel->inbound, buffer, userlen, +			     (desc.offset8 << 3), &signal); + +	if (signal) +		vmbus_setevent(channel); + +	return 0; +} +EXPORT_SYMBOL(vmbus_recvpacket); + +/* + * vmbus_recvpacket_raw - Retrieve the raw packet on the specified channel + */ +int vmbus_recvpacket_raw(struct vmbus_channel *channel, void *buffer, +			      u32 bufferlen, u32 *buffer_actual_len, +			      u64 *requestid) +{ +	struct vmpacket_descriptor desc; +	u32 packetlen; +	int ret; +	bool signal = false; + +	*buffer_actual_len = 0; +	*requestid = 0; + + +	ret = hv_ringbuffer_peek(&channel->inbound, &desc, +			     sizeof(struct vmpacket_descriptor)); +	if (ret != 0) +		return 0; + + +	packetlen = desc.len8 << 3; + +	*buffer_actual_len = packetlen; + +	if (packetlen > bufferlen) { +		pr_err("Buffer too small - needed %d bytes but " +			"got space for only %d bytes\n", +			packetlen, bufferlen); +		return -ENOBUFS; +	} + +	*requestid = desc.trans_id; + +	/* Copy over the entire packet to the user buffer */ +	ret = hv_ringbuffer_read(&channel->inbound, buffer, packetlen, 0, +				 &signal); + +	if (signal) +		vmbus_setevent(channel); + +	return ret; +} +EXPORT_SYMBOL_GPL(vmbus_recvpacket_raw); diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c new file mode 100644 index 00000000000..ed9350d4276 --- /dev/null +++ b/drivers/hv/channel_mgmt.c @@ -0,0 +1,873 @@ +/* + * Copyright (c) 2009, Microsoft Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + * Authors: + *   Haiyang Zhang <haiyangz@microsoft.com> + *   Hank Janssen  <hjanssen@microsoft.com> + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/completion.h> +#include <linux/hyperv.h> + +#include "hyperv_vmbus.h" + +struct vmbus_channel_message_table_entry { +	enum vmbus_channel_message_type message_type; +	void (*message_handler)(struct vmbus_channel_message_header *msg); +}; + + +/** + * vmbus_prep_negotiate_resp() - Create default response for Hyper-V Negotiate message + * @icmsghdrp: Pointer to msg header structure + * @icmsg_negotiate: Pointer to negotiate message structure + * @buf: Raw buffer channel data + * + * @icmsghdrp is of type &struct icmsg_hdr. + * @negop is of type &struct icmsg_negotiate. + * Set up and fill in default negotiate response message. + * + * The fw_version specifies the  framework version that + * we can support and srv_version specifies the service + * version we can support. + * + * Mainly used by Hyper-V drivers. + */ +bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, +				struct icmsg_negotiate *negop, u8 *buf, +				int fw_version, int srv_version) +{ +	int icframe_major, icframe_minor; +	int icmsg_major, icmsg_minor; +	int fw_major, fw_minor; +	int srv_major, srv_minor; +	int i; +	bool found_match = false; + +	icmsghdrp->icmsgsize = 0x10; +	fw_major = (fw_version >> 16); +	fw_minor = (fw_version & 0xFFFF); + +	srv_major = (srv_version >> 16); +	srv_minor = (srv_version & 0xFFFF); + +	negop = (struct icmsg_negotiate *)&buf[ +		sizeof(struct vmbuspipe_hdr) + +		sizeof(struct icmsg_hdr)]; + +	icframe_major = negop->icframe_vercnt; +	icframe_minor = 0; + +	icmsg_major = negop->icmsg_vercnt; +	icmsg_minor = 0; + +	/* +	 * Select the framework version number we will +	 * support. +	 */ + +	for (i = 0; i < negop->icframe_vercnt; i++) { +		if ((negop->icversion_data[i].major == fw_major) && +		   (negop->icversion_data[i].minor == fw_minor)) { +			icframe_major = negop->icversion_data[i].major; +			icframe_minor = negop->icversion_data[i].minor; +			found_match = true; +		} +	} + +	if (!found_match) +		goto fw_error; + +	found_match = false; + +	for (i = negop->icframe_vercnt; +		 (i < negop->icframe_vercnt + negop->icmsg_vercnt); i++) { +		if ((negop->icversion_data[i].major == srv_major) && +		   (negop->icversion_data[i].minor == srv_minor)) { +			icmsg_major = negop->icversion_data[i].major; +			icmsg_minor = negop->icversion_data[i].minor; +			found_match = true; +		} +	} + +	/* +	 * Respond with the framework and service +	 * version numbers we can support. 
+	 */ + +fw_error: +	if (!found_match) { +		negop->icframe_vercnt = 0; +		negop->icmsg_vercnt = 0; +	} else { +		negop->icframe_vercnt = 1; +		negop->icmsg_vercnt = 1; +	} + +	negop->icversion_data[0].major = icframe_major; +	negop->icversion_data[0].minor = icframe_minor; +	negop->icversion_data[1].major = icmsg_major; +	negop->icversion_data[1].minor = icmsg_minor; +	return found_match; +} + +EXPORT_SYMBOL_GPL(vmbus_prep_negotiate_resp); + +/* + * alloc_channel - Allocate and initialize a vmbus channel object + */ +static struct vmbus_channel *alloc_channel(void) +{ +	struct vmbus_channel *channel; + +	channel = kzalloc(sizeof(*channel), GFP_ATOMIC); +	if (!channel) +		return NULL; + +	spin_lock_init(&channel->inbound_lock); +	spin_lock_init(&channel->sc_lock); + +	INIT_LIST_HEAD(&channel->sc_list); +	INIT_LIST_HEAD(&channel->percpu_list); + +	channel->controlwq = create_workqueue("hv_vmbus_ctl"); +	if (!channel->controlwq) { +		kfree(channel); +		return NULL; +	} + +	return channel; +} + +/* + * release_hannel - Release the vmbus channel object itself + */ +static void release_channel(struct work_struct *work) +{ +	struct vmbus_channel *channel = container_of(work, +						     struct vmbus_channel, +						     work); + +	destroy_workqueue(channel->controlwq); + +	kfree(channel); +} + +/* + * free_channel - Release the resources used by the vmbus channel object + */ +static void free_channel(struct vmbus_channel *channel) +{ + +	/* +	 * We have to release the channel's workqueue/thread in the vmbus's +	 * workqueue/thread context +	 * ie we can't destroy ourselves. +	 */ +	INIT_WORK(&channel->work, release_channel); +	queue_work(vmbus_connection.work_queue, &channel->work); +} + +static void percpu_channel_enq(void *arg) +{ +	struct vmbus_channel *channel = arg; +	int cpu = smp_processor_id(); + +	list_add_tail(&channel->percpu_list, &hv_context.percpu_list[cpu]); +} + +static void percpu_channel_deq(void *arg) +{ +	struct vmbus_channel *channel = arg; + +	list_del(&channel->percpu_list); +} + +/* + * vmbus_process_rescind_offer - + * Rescind the offer by initiating a device removal + */ +static void vmbus_process_rescind_offer(struct work_struct *work) +{ +	struct vmbus_channel *channel = container_of(work, +						     struct vmbus_channel, +						     work); +	unsigned long flags; +	struct vmbus_channel *primary_channel; +	struct vmbus_channel_relid_released msg; + +	if (channel->device_obj) +		vmbus_device_unregister(channel->device_obj); +	memset(&msg, 0, sizeof(struct vmbus_channel_relid_released)); +	msg.child_relid = channel->offermsg.child_relid; +	msg.header.msgtype = CHANNELMSG_RELID_RELEASED; +	vmbus_post_msg(&msg, sizeof(struct vmbus_channel_relid_released)); + +	if (channel->target_cpu != smp_processor_id()) +		smp_call_function_single(channel->target_cpu, +					 percpu_channel_deq, channel, true); +	else +		percpu_channel_deq(channel); + +	if (channel->primary_channel == NULL) { +		spin_lock_irqsave(&vmbus_connection.channel_lock, flags); +		list_del(&channel->listentry); +		spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags); +	} else { +		primary_channel = channel->primary_channel; +		spin_lock_irqsave(&primary_channel->sc_lock, flags); +		list_del(&channel->sc_list); +		spin_unlock_irqrestore(&primary_channel->sc_lock, flags); +	} +	free_channel(channel); +} + +void vmbus_free_channels(void) +{ +	struct vmbus_channel *channel; + +	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { +		vmbus_device_unregister(channel->device_obj); +		
kfree(channel->device_obj); +		free_channel(channel); +	} +} + +/* + * vmbus_process_offer - Process the offer by creating a channel/device + * associated with this offer + */ +static void vmbus_process_offer(struct work_struct *work) +{ +	struct vmbus_channel *newchannel = container_of(work, +							struct vmbus_channel, +							work); +	struct vmbus_channel *channel; +	bool fnew = true; +	bool enq = false; +	int ret; +	unsigned long flags; + +	/* The next possible work is rescind handling */ +	INIT_WORK(&newchannel->work, vmbus_process_rescind_offer); + +	/* Make sure this is a new offer */ +	spin_lock_irqsave(&vmbus_connection.channel_lock, flags); + +	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { +		if (!uuid_le_cmp(channel->offermsg.offer.if_type, +			newchannel->offermsg.offer.if_type) && +			!uuid_le_cmp(channel->offermsg.offer.if_instance, +				newchannel->offermsg.offer.if_instance)) { +			fnew = false; +			break; +		} +	} + +	if (fnew) { +		list_add_tail(&newchannel->listentry, +			      &vmbus_connection.chn_list); +		enq = true; +	} + +	spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags); + +	if (enq) { +		if (newchannel->target_cpu != smp_processor_id()) +			smp_call_function_single(newchannel->target_cpu, +						 percpu_channel_enq, +						 newchannel, true); +		else +			percpu_channel_enq(newchannel); +	} +	if (!fnew) { +		/* +		 * Check to see if this is a sub-channel. +		 */ +		if (newchannel->offermsg.offer.sub_channel_index != 0) { +			/* +			 * Process the sub-channel. +			 */ +			newchannel->primary_channel = channel; +			spin_lock_irqsave(&channel->sc_lock, flags); +			list_add_tail(&newchannel->sc_list, &channel->sc_list); +			spin_unlock_irqrestore(&channel->sc_lock, flags); + +			if (newchannel->target_cpu != smp_processor_id()) +				smp_call_function_single(newchannel->target_cpu, +							 percpu_channel_enq, +							 newchannel, true); +			else +				percpu_channel_enq(newchannel); + +			newchannel->state = CHANNEL_OPEN_STATE; +			if (channel->sc_creation_callback != NULL) +				channel->sc_creation_callback(newchannel); + +			return; +		} + +		free_channel(newchannel); +		return; +	} + +	/* +	 * This state is used to indicate a successful open +	 * so that when we do close the channel normally, we +	 * can cleanup properly +	 */ +	newchannel->state = CHANNEL_OPEN_STATE; + +	/* +	 * Start the process of binding this offer to the driver +	 * We need to set the DeviceObject field before calling +	 * vmbus_child_dev_add() +	 */ +	newchannel->device_obj = vmbus_device_create( +		&newchannel->offermsg.offer.if_type, +		&newchannel->offermsg.offer.if_instance, +		newchannel); + +	/* +	 * Add the new device to the bus. This will kick off device-driver +	 * binding which eventually invokes the device driver's AddDevice() +	 * method. +	 */ +	ret = vmbus_device_register(newchannel->device_obj); +	if (ret != 0) { +		pr_err("unable to add child device object (relid %d)\n", +			   newchannel->offermsg.child_relid); + +		spin_lock_irqsave(&vmbus_connection.channel_lock, flags); +		list_del(&newchannel->listentry); +		spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags); +		kfree(newchannel->device_obj); + +		free_channel(newchannel); +	} +} + +enum { +	IDE = 0, +	SCSI, +	NIC, +	MAX_PERF_CHN, +}; + +/* + * This is an array of device_ids (device types) that are performance critical. + * We attempt to distribute the interrupt load for these devices across + * all available CPUs. 
+ */ +static const struct hv_vmbus_device_id hp_devs[] = { +	/* IDE */ +	{ HV_IDE_GUID, }, +	/* Storage - SCSI */ +	{ HV_SCSI_GUID, }, +	/* Network */ +	{ HV_NIC_GUID, }, +}; + + +/* + * We use this state to statically distribute the channel interrupt load. + */ +static u32  next_vp; + +/* + * Starting with Win8, we can statically distribute the incoming + * channel interrupt load by binding a channel to VCPU. We + * implement here a simple round robin scheme for distributing + * the interrupt load. + * We will bind channels that are not performance critical to cpu 0 and + * performance critical channels (IDE, SCSI and Network) will be uniformly + * distributed across all available CPUs. + */ +static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_guid) +{ +	u32 cur_cpu; +	int i; +	bool perf_chn = false; +	u32 max_cpus = num_online_cpus(); + +	for (i = IDE; i < MAX_PERF_CHN; i++) { +		if (!memcmp(type_guid->b, hp_devs[i].guid, +				 sizeof(uuid_le))) { +			perf_chn = true; +			break; +		} +	} +	if ((vmbus_proto_version == VERSION_WS2008) || +	    (vmbus_proto_version == VERSION_WIN7) || (!perf_chn)) { +		/* +		 * Prior to win8, all channel interrupts are +		 * delivered on cpu 0. +		 * Also if the channel is not a performance critical +		 * channel, bind it to cpu 0. +		 */ +		channel->target_cpu = 0; +		channel->target_vp = 0; +		return; +	} +	cur_cpu = (++next_vp % max_cpus); +	channel->target_cpu = cur_cpu; +	channel->target_vp = hv_context.vp_index[cur_cpu]; +} + +/* + * vmbus_onoffer - Handler for channel offers from vmbus in parent partition. + * + */ +static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) +{ +	struct vmbus_channel_offer_channel *offer; +	struct vmbus_channel *newchannel; + +	offer = (struct vmbus_channel_offer_channel *)hdr; + +	/* Allocate the channel object and save this offer. */ +	newchannel = alloc_channel(); +	if (!newchannel) { +		pr_err("Unable to allocate channel object\n"); +		return; +	} + +	/* +	 * By default we setup state to enable batched +	 * reading. A specific service can choose to +	 * disable this prior to opening the channel. +	 */ +	newchannel->batched_reading = true; + +	/* +	 * Setup state for signalling the host. +	 */ +	newchannel->sig_event = (struct hv_input_signal_event *) +				(ALIGN((unsigned long) +				&newchannel->sig_buf, +				HV_HYPERCALL_PARAM_ALIGN)); + +	newchannel->sig_event->connectionid.asu32 = 0; +	newchannel->sig_event->connectionid.u.id = VMBUS_EVENT_CONNECTION_ID; +	newchannel->sig_event->flag_number = 0; +	newchannel->sig_event->rsvdz = 0; + +	if (vmbus_proto_version != VERSION_WS2008) { +		newchannel->is_dedicated_interrupt = +				(offer->is_dedicated_interrupt != 0); +		newchannel->sig_event->connectionid.u.id = +				offer->connection_id; +	} + +	init_vp_index(newchannel, &offer->offer.if_type); + +	memcpy(&newchannel->offermsg, offer, +	       sizeof(struct vmbus_channel_offer_channel)); +	newchannel->monitor_grp = (u8)offer->monitorid / 32; +	newchannel->monitor_bit = (u8)offer->monitorid % 32; + +	INIT_WORK(&newchannel->work, vmbus_process_offer); +	queue_work(newchannel->controlwq, &newchannel->work); +} + +/* + * vmbus_onoffer_rescind - Rescind offer handler. 
+ * + * We queue a work item to process this offer synchronously + */ +static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) +{ +	struct vmbus_channel_rescind_offer *rescind; +	struct vmbus_channel *channel; + +	rescind = (struct vmbus_channel_rescind_offer *)hdr; +	channel = relid2channel(rescind->child_relid); + +	if (channel == NULL) +		/* Just return here, no channel found */ +		return; + +	/* work is initialized for vmbus_process_rescind_offer() from +	 * vmbus_process_offer() where the channel got created */ +	queue_work(channel->controlwq, &channel->work); +} + +/* + * vmbus_onoffers_delivered - + * This is invoked when all offers have been delivered. + * + * Nothing to do here. + */ +static void vmbus_onoffers_delivered( +			struct vmbus_channel_message_header *hdr) +{ +} + +/* + * vmbus_onopen_result - Open result handler. + * + * This is invoked when we received a response to our channel open request. + * Find the matching request, copy the response and signal the requesting + * thread. + */ +static void vmbus_onopen_result(struct vmbus_channel_message_header *hdr) +{ +	struct vmbus_channel_open_result *result; +	struct vmbus_channel_msginfo *msginfo; +	struct vmbus_channel_message_header *requestheader; +	struct vmbus_channel_open_channel *openmsg; +	unsigned long flags; + +	result = (struct vmbus_channel_open_result *)hdr; + +	/* +	 * Find the open msg, copy the result and signal/unblock the wait event +	 */ +	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); + +	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list, +				msglistentry) { +		requestheader = +			(struct vmbus_channel_message_header *)msginfo->msg; + +		if (requestheader->msgtype == CHANNELMSG_OPENCHANNEL) { +			openmsg = +			(struct vmbus_channel_open_channel *)msginfo->msg; +			if (openmsg->child_relid == result->child_relid && +			    openmsg->openid == result->openid) { +				memcpy(&msginfo->response.open_result, +				       result, +				       sizeof( +					struct vmbus_channel_open_result)); +				complete(&msginfo->waitevent); +				break; +			} +		} +	} +	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); +} + +/* + * vmbus_ongpadl_created - GPADL created handler. + * + * This is invoked when we received a response to our gpadl create request. + * Find the matching request, copy the response and signal the requesting + * thread. 
+ */ +static void vmbus_ongpadl_created(struct vmbus_channel_message_header *hdr) +{ +	struct vmbus_channel_gpadl_created *gpadlcreated; +	struct vmbus_channel_msginfo *msginfo; +	struct vmbus_channel_message_header *requestheader; +	struct vmbus_channel_gpadl_header *gpadlheader; +	unsigned long flags; + +	gpadlcreated = (struct vmbus_channel_gpadl_created *)hdr; + +	/* +	 * Find the establish msg, copy the result and signal/unblock the wait +	 * event +	 */ +	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); + +	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list, +				msglistentry) { +		requestheader = +			(struct vmbus_channel_message_header *)msginfo->msg; + +		if (requestheader->msgtype == CHANNELMSG_GPADL_HEADER) { +			gpadlheader = +			(struct vmbus_channel_gpadl_header *)requestheader; + +			if ((gpadlcreated->child_relid == +			     gpadlheader->child_relid) && +			    (gpadlcreated->gpadl == gpadlheader->gpadl)) { +				memcpy(&msginfo->response.gpadl_created, +				       gpadlcreated, +				       sizeof( +					struct vmbus_channel_gpadl_created)); +				complete(&msginfo->waitevent); +				break; +			} +		} +	} +	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); +} + +/* + * vmbus_ongpadl_torndown - GPADL torndown handler. + * + * This is invoked when we received a response to our gpadl teardown request. + * Find the matching request, copy the response and signal the requesting + * thread. + */ +static void vmbus_ongpadl_torndown( +			struct vmbus_channel_message_header *hdr) +{ +	struct vmbus_channel_gpadl_torndown *gpadl_torndown; +	struct vmbus_channel_msginfo *msginfo; +	struct vmbus_channel_message_header *requestheader; +	struct vmbus_channel_gpadl_teardown *gpadl_teardown; +	unsigned long flags; + +	gpadl_torndown = (struct vmbus_channel_gpadl_torndown *)hdr; + +	/* +	 * Find the open msg, copy the result and signal/unblock the wait event +	 */ +	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); + +	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list, +				msglistentry) { +		requestheader = +			(struct vmbus_channel_message_header *)msginfo->msg; + +		if (requestheader->msgtype == CHANNELMSG_GPADL_TEARDOWN) { +			gpadl_teardown = +			(struct vmbus_channel_gpadl_teardown *)requestheader; + +			if (gpadl_torndown->gpadl == gpadl_teardown->gpadl) { +				memcpy(&msginfo->response.gpadl_torndown, +				       gpadl_torndown, +				       sizeof( +					struct vmbus_channel_gpadl_torndown)); +				complete(&msginfo->waitevent); +				break; +			} +		} +	} +	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); +} + +/* + * vmbus_onversion_response - Version response handler + * + * This is invoked when we received a response to our initiate contact request. + * Find the matching request, copy the response and signal the requesting + * thread. 
+ */ +static void vmbus_onversion_response( +		struct vmbus_channel_message_header *hdr) +{ +	struct vmbus_channel_msginfo *msginfo; +	struct vmbus_channel_message_header *requestheader; +	struct vmbus_channel_version_response *version_response; +	unsigned long flags; + +	version_response = (struct vmbus_channel_version_response *)hdr; +	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); + +	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list, +				msglistentry) { +		requestheader = +			(struct vmbus_channel_message_header *)msginfo->msg; + +		if (requestheader->msgtype == +		    CHANNELMSG_INITIATE_CONTACT) { +			memcpy(&msginfo->response.version_response, +			      version_response, +			      sizeof(struct vmbus_channel_version_response)); +			complete(&msginfo->waitevent); +		} +	} +	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); +} + +/* Channel message dispatch table */ +static struct vmbus_channel_message_table_entry +	channel_message_table[CHANNELMSG_COUNT] = { +	{CHANNELMSG_INVALID,			NULL}, +	{CHANNELMSG_OFFERCHANNEL,		vmbus_onoffer}, +	{CHANNELMSG_RESCIND_CHANNELOFFER,	vmbus_onoffer_rescind}, +	{CHANNELMSG_REQUESTOFFERS,		NULL}, +	{CHANNELMSG_ALLOFFERS_DELIVERED,	vmbus_onoffers_delivered}, +	{CHANNELMSG_OPENCHANNEL,		NULL}, +	{CHANNELMSG_OPENCHANNEL_RESULT,	vmbus_onopen_result}, +	{CHANNELMSG_CLOSECHANNEL,		NULL}, +	{CHANNELMSG_GPADL_HEADER,		NULL}, +	{CHANNELMSG_GPADL_BODY,		NULL}, +	{CHANNELMSG_GPADL_CREATED,		vmbus_ongpadl_created}, +	{CHANNELMSG_GPADL_TEARDOWN,		NULL}, +	{CHANNELMSG_GPADL_TORNDOWN,		vmbus_ongpadl_torndown}, +	{CHANNELMSG_RELID_RELEASED,		NULL}, +	{CHANNELMSG_INITIATE_CONTACT,		NULL}, +	{CHANNELMSG_VERSION_RESPONSE,		vmbus_onversion_response}, +	{CHANNELMSG_UNLOAD,			NULL}, +}; + +/* + * vmbus_onmessage - Handler for channel protocol messages. + * + * This is invoked in the vmbus worker thread context. + */ +void vmbus_onmessage(void *context) +{ +	struct hv_message *msg = context; +	struct vmbus_channel_message_header *hdr; +	int size; + +	hdr = (struct vmbus_channel_message_header *)msg->u.payload; +	size = msg->header.payload_size; + +	if (hdr->msgtype >= CHANNELMSG_COUNT) { +		pr_err("Received invalid channel message type %d size %d\n", +			   hdr->msgtype, size); +		print_hex_dump_bytes("", DUMP_PREFIX_NONE, +				     (unsigned char *)msg->u.payload, size); +		return; +	} + +	if (channel_message_table[hdr->msgtype].message_handler) +		channel_message_table[hdr->msgtype].message_handler(hdr); +	else +		pr_err("Unhandled channel message type %d\n", hdr->msgtype); +} + +/* + * vmbus_request_offers - Send a request to get all our pending offers. + */ +int vmbus_request_offers(void) +{ +	struct vmbus_channel_message_header *msg; +	struct vmbus_channel_msginfo *msginfo; +	int ret, t; + +	msginfo = kmalloc(sizeof(*msginfo) + +			  sizeof(struct vmbus_channel_message_header), +			  GFP_KERNEL); +	if (!msginfo) +		return -ENOMEM; + +	init_completion(&msginfo->waitevent); + +	msg = (struct vmbus_channel_message_header *)msginfo->msg; + +	msg->msgtype = CHANNELMSG_REQUESTOFFERS; + + +	ret = vmbus_post_msg(msg, +			       sizeof(struct vmbus_channel_message_header)); +	if (ret != 0) { +		pr_err("Unable to request offers - %d\n", ret); + +		goto cleanup; +	} + +	t = wait_for_completion_timeout(&msginfo->waitevent, 5*HZ); +	if (t == 0) { +		ret = -ETIMEDOUT; +		goto cleanup; +	} + + + +cleanup: +	kfree(msginfo); + +	return ret; +} + +/* + * Retrieve the (sub) channel on which to send an outgoing request. 
+ * When a primary channel has multiple sub-channels, we choose a + * channel whose VCPU binding is closest to the VCPU on which + * this call is being made. + */ +struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary) +{ +	struct list_head *cur, *tmp; +	int cur_cpu = hv_context.vp_index[smp_processor_id()]; +	struct vmbus_channel *cur_channel; +	struct vmbus_channel *outgoing_channel = primary; +	int cpu_distance, new_cpu_distance; + +	if (list_empty(&primary->sc_list)) +		return outgoing_channel; + +	list_for_each_safe(cur, tmp, &primary->sc_list) { +		cur_channel = list_entry(cur, struct vmbus_channel, sc_list); +		if (cur_channel->state != CHANNEL_OPENED_STATE) +			continue; + +		if (cur_channel->target_vp == cur_cpu) +			return cur_channel; + +		cpu_distance = ((outgoing_channel->target_vp > cur_cpu) ? +				(outgoing_channel->target_vp - cur_cpu) : +				(cur_cpu - outgoing_channel->target_vp)); + +		new_cpu_distance = ((cur_channel->target_vp > cur_cpu) ? +				(cur_channel->target_vp - cur_cpu) : +				(cur_cpu - cur_channel->target_vp)); + +		if (cpu_distance < new_cpu_distance) +			continue; + +		outgoing_channel = cur_channel; +	} + +	return outgoing_channel; +} +EXPORT_SYMBOL_GPL(vmbus_get_outgoing_channel); + +static void invoke_sc_cb(struct vmbus_channel *primary_channel) +{ +	struct list_head *cur, *tmp; +	struct vmbus_channel *cur_channel; + +	if (primary_channel->sc_creation_callback == NULL) +		return; + +	list_for_each_safe(cur, tmp, &primary_channel->sc_list) { +		cur_channel = list_entry(cur, struct vmbus_channel, sc_list); + +		primary_channel->sc_creation_callback(cur_channel); +	} +} + +void vmbus_set_sc_create_callback(struct vmbus_channel *primary_channel, +				void (*sc_cr_cb)(struct vmbus_channel *new_sc)) +{ +	primary_channel->sc_creation_callback = sc_cr_cb; +} +EXPORT_SYMBOL_GPL(vmbus_set_sc_create_callback); + +bool vmbus_are_subchannels_present(struct vmbus_channel *primary) +{ +	bool ret; + +	ret = !list_empty(&primary->sc_list); + +	if (ret) { +		/* +		 * Invoke the callback on sub-channel creation. +		 * This will present a uniform interface to the +		 * clients. +		 */ +		invoke_sc_cb(primary); +	} + +	return ret; +} +EXPORT_SYMBOL_GPL(vmbus_are_subchannels_present); diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c new file mode 100644 index 00000000000..ae22e3c1fc4 --- /dev/null +++ b/drivers/hv/connection.c @@ -0,0 +1,455 @@ +/* + * + * Copyright (c) 2009, Microsoft Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + * Authors: + *   Haiyang Zhang <haiyangz@microsoft.com> + *   Hank Janssen  <hjanssen@microsoft.com> + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/delay.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/hyperv.h> +#include <linux/export.h> +#include <asm/hyperv.h> +#include "hyperv_vmbus.h" + + +struct vmbus_connection vmbus_connection = { +	.conn_state		= DISCONNECTED, +	.next_gpadl_handle	= ATOMIC_INIT(0xE1E10), +}; + +/* + * Negotiated protocol version with the host. + */ +__u32 vmbus_proto_version; +EXPORT_SYMBOL_GPL(vmbus_proto_version); + +static __u32 vmbus_get_next_version(__u32 current_version) +{ +	switch (current_version) { +	case (VERSION_WIN7): +		return VERSION_WS2008; + +	case (VERSION_WIN8): +		return VERSION_WIN7; + +	case (VERSION_WIN8_1): +		return VERSION_WIN8; + +	case (VERSION_WS2008): +	default: +		return VERSION_INVAL; +	} +} + +static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, +					__u32 version) +{ +	int ret = 0; +	struct vmbus_channel_initiate_contact *msg; +	unsigned long flags; + +	init_completion(&msginfo->waitevent); + +	msg = (struct vmbus_channel_initiate_contact *)msginfo->msg; + +	msg->header.msgtype = CHANNELMSG_INITIATE_CONTACT; +	msg->vmbus_version_requested = version; +	msg->interrupt_page = virt_to_phys(vmbus_connection.int_page); +	msg->monitor_page1 = virt_to_phys(vmbus_connection.monitor_pages[0]); +	msg->monitor_page2 = virt_to_phys(vmbus_connection.monitor_pages[1]); +	if (version == VERSION_WIN8_1) +		msg->target_vcpu = hv_context.vp_index[smp_processor_id()]; + +	/* +	 * Add to list before we send the request since we may +	 * receive the response before returning from this routine +	 */ +	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); +	list_add_tail(&msginfo->msglistentry, +		      &vmbus_connection.chn_msg_list); + +	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + +	ret = vmbus_post_msg(msg, +			       sizeof(struct vmbus_channel_initiate_contact)); +	if (ret != 0) { +		spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); +		list_del(&msginfo->msglistentry); +		spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, +					flags); +		return ret; +	} + +	/* Wait for the connection response */ +	wait_for_completion(&msginfo->waitevent); + +	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); +	list_del(&msginfo->msglistentry); +	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + +	/* Check if successful */ +	if (msginfo->response.version_response.version_supported) { +		vmbus_connection.conn_state = CONNECTED; +	} else { +		return -ECONNREFUSED; +	} + +	return ret; +} + +/* + * vmbus_connect - Sends a connect request on the partition service connection + */ +int vmbus_connect(void) +{ +	int ret = 0; +	struct vmbus_channel_msginfo *msginfo = NULL; +	__u32 version; + +	/* Initialize the vmbus connection */ +	vmbus_connection.conn_state = CONNECTING; +	vmbus_connection.work_queue = create_workqueue("hv_vmbus_con"); +	if (!vmbus_connection.work_queue) { +		ret = -ENOMEM; +		goto cleanup; +	} + +	INIT_LIST_HEAD(&vmbus_connection.chn_msg_list); +	spin_lock_init(&vmbus_connection.channelmsg_lock); + +	INIT_LIST_HEAD(&vmbus_connection.chn_list); +	spin_lock_init(&vmbus_connection.channel_lock); + +	/* +	 * Setup the vmbus event connection for channel interrupt +	 * abstraction stuff +	 */ +	
vmbus_connection.int_page = +	(void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0); +	if (vmbus_connection.int_page == NULL) { +		ret = -ENOMEM; +		goto cleanup; +	} + +	vmbus_connection.recv_int_page = vmbus_connection.int_page; +	vmbus_connection.send_int_page = +		(void *)((unsigned long)vmbus_connection.int_page + +			(PAGE_SIZE >> 1)); + +	/* +	 * Setup the monitor notification facility. The 1st page for +	 * parent->child and the 2nd page for child->parent +	 */ +	vmbus_connection.monitor_pages[0] = (void *)__get_free_pages((GFP_KERNEL|__GFP_ZERO), 0); +	vmbus_connection.monitor_pages[1] = (void *)__get_free_pages((GFP_KERNEL|__GFP_ZERO), 0); +	if ((vmbus_connection.monitor_pages[0] == NULL) || +	    (vmbus_connection.monitor_pages[1] == NULL)) { +		ret = -ENOMEM; +		goto cleanup; +	} + +	msginfo = kzalloc(sizeof(*msginfo) + +			  sizeof(struct vmbus_channel_initiate_contact), +			  GFP_KERNEL); +	if (msginfo == NULL) { +		ret = -ENOMEM; +		goto cleanup; +	} + +	/* +	 * Negotiate a compatible VMBUS version number with the +	 * host. We start with the highest number we can support +	 * and work our way down until we negotiate a compatible +	 * version. +	 */ + +	version = VERSION_CURRENT; + +	do { +		ret = vmbus_negotiate_version(msginfo, version); +		if (ret == -ETIMEDOUT) +			goto cleanup; + +		if (vmbus_connection.conn_state == CONNECTED) +			break; + +		version = vmbus_get_next_version(version); +	} while (version != VERSION_INVAL); + +	if (version == VERSION_INVAL) +		goto cleanup; + +	vmbus_proto_version = version; +	pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d; Vmbus version:%d.%d\n", +		    host_info_eax, host_info_ebx >> 16, +		    host_info_ebx & 0xFFFF, host_info_ecx, +		    host_info_edx >> 24, host_info_edx & 0xFFFFFF, +		    version >> 16, version & 0xFFFF); + +	kfree(msginfo); +	return 0; + +cleanup: +	pr_err("Unable to connect to host\n"); +	vmbus_connection.conn_state = DISCONNECTED; + +	if (vmbus_connection.work_queue) +		destroy_workqueue(vmbus_connection.work_queue); + +	if (vmbus_connection.int_page) { +		free_pages((unsigned long)vmbus_connection.int_page, 0); +		vmbus_connection.int_page = NULL; +	} + +	free_pages((unsigned long)vmbus_connection.monitor_pages[0], 0); +	free_pages((unsigned long)vmbus_connection.monitor_pages[1], 0); +	vmbus_connection.monitor_pages[0] = NULL; +	vmbus_connection.monitor_pages[1] = NULL; + +	kfree(msginfo); + +	return ret; +} + +/* + * Map the given relid to the corresponding channel based on the + * per-cpu list of channels that have been affinitized to this CPU. + * This will be used in the channel callback path as we can do this + * mapping in a lock-free fashion. 
+ */ +static struct vmbus_channel *pcpu_relid2channel(u32 relid) +{ +	struct vmbus_channel *channel; +	struct vmbus_channel *found_channel  = NULL; +	int cpu = smp_processor_id(); +	struct list_head *pcpu_head = &hv_context.percpu_list[cpu]; + +	list_for_each_entry(channel, pcpu_head, percpu_list) { +		if (channel->offermsg.child_relid == relid) { +			found_channel = channel; +			break; +		} +	} + +	return found_channel; +} + +/* + * relid2channel - Get the channel object given its + * child relative id (ie channel id) + */ +struct vmbus_channel *relid2channel(u32 relid) +{ +	struct vmbus_channel *channel; +	struct vmbus_channel *found_channel  = NULL; +	unsigned long flags; +	struct list_head *cur, *tmp; +	struct vmbus_channel *cur_sc; + +	spin_lock_irqsave(&vmbus_connection.channel_lock, flags); +	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { +		if (channel->offermsg.child_relid == relid) { +			found_channel = channel; +			break; +		} else if (!list_empty(&channel->sc_list)) { +			/* +			 * Deal with sub-channels. +			 */ +			list_for_each_safe(cur, tmp, &channel->sc_list) { +				cur_sc = list_entry(cur, struct vmbus_channel, +							sc_list); +				if (cur_sc->offermsg.child_relid == relid) { +					found_channel = cur_sc; +					break; +				} +			} +		} +	} +	spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags); + +	return found_channel; +} + +/* + * process_chn_event - Process a channel event notification + */ +static void process_chn_event(u32 relid) +{ +	struct vmbus_channel *channel; +	void *arg; +	bool read_state; +	u32 bytes_to_read; + +	/* +	 * Find the channel based on this relid and invokes the +	 * channel callback to process the event +	 */ +	channel = pcpu_relid2channel(relid); + +	if (!channel) { +		pr_err("channel not found for relid - %u\n", relid); +		return; +	} + +	/* +	 * A channel once created is persistent even when there +	 * is no driver handling the device. An unloading driver +	 * sets the onchannel_callback to NULL on the same CPU +	 * as where this interrupt is handled (in an interrupt context). +	 * Thus, checking and invoking the driver specific callback takes +	 * care of orderly unloading of the driver. +	 */ + +	if (channel->onchannel_callback != NULL) { +		arg = channel->channel_callback_context; +		read_state = channel->batched_reading; +		/* +		 * This callback reads the messages sent by the host. +		 * We can optimize host to guest signaling by ensuring: +		 * 1. While reading the channel, we disable interrupts from +		 *    host. +		 * 2. Ensure that we process all posted messages from the host +		 *    before returning from this callback. +		 * 3. Once we return, enable signaling from the host. Once this +		 *    state is set we check to see if additional packets are +		 *    available to read. In this case we repeat the process. 
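+		 *
+		 *    hv_begin_read() and hv_end_read() below implement the
+		 *    mask/unmask (and re-check) steps for channels that have
+		 *    batched_reading set.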
+		 */ + +		do { +			if (read_state) +				hv_begin_read(&channel->inbound); +			channel->onchannel_callback(arg); +			if (read_state) +				bytes_to_read = hv_end_read(&channel->inbound); +			else +				bytes_to_read = 0; +		} while (read_state && (bytes_to_read != 0)); +	} else { +		pr_err("no channel callback for relid - %u\n", relid); +	} + +} + +/* + * vmbus_on_event - Handler for events + */ +void vmbus_on_event(unsigned long data) +{ +	u32 dword; +	u32 maxdword; +	int bit; +	u32 relid; +	u32 *recv_int_page = NULL; +	void *page_addr; +	int cpu = smp_processor_id(); +	union hv_synic_event_flags *event; + +	if ((vmbus_proto_version == VERSION_WS2008) || +		(vmbus_proto_version == VERSION_WIN7)) { +		maxdword = MAX_NUM_CHANNELS_SUPPORTED >> 5; +		recv_int_page = vmbus_connection.recv_int_page; +	} else { +		/* +		 * When the host is win8 and beyond, the event page +		 * can be directly checked to get the id of the channel +		 * that has the interrupt pending. +		 */ +		maxdword = HV_EVENT_FLAGS_DWORD_COUNT; +		page_addr = hv_context.synic_event_page[cpu]; +		event = (union hv_synic_event_flags *)page_addr + +						 VMBUS_MESSAGE_SINT; +		recv_int_page = event->flags32; +	} + + + +	/* Check events */ +	if (!recv_int_page) +		return; +	for (dword = 0; dword < maxdword; dword++) { +		if (!recv_int_page[dword]) +			continue; +		for (bit = 0; bit < 32; bit++) { +			if (sync_test_and_clear_bit(bit, +				(unsigned long *)&recv_int_page[dword])) { +				relid = (dword << 5) + bit; + +				if (relid == 0) +					/* +					 * Special case - vmbus +					 * channel protocol msg +					 */ +					continue; + +				process_chn_event(relid); +			} +		} +	} +} + +/* + * vmbus_post_msg - Send a msg on the vmbus's message connection + */ +int vmbus_post_msg(void *buffer, size_t buflen) +{ +	union hv_connection_id conn_id; +	int ret = 0; +	int retries = 0; + +	conn_id.asu32 = 0; +	conn_id.u.id = VMBUS_MESSAGE_CONNECTION_ID; + +	/* +	 * hv_post_message() can have transient failures because of +	 * insufficient resources. Retry the operation a couple of +	 * times before giving up. +	 */ +	while (retries < 3) { +		ret =  hv_post_message(conn_id, 1, buffer, buflen); +		if (ret != HV_STATUS_INSUFFICIENT_BUFFERS) +			return ret; +		retries++; +		msleep(100); +	} +	return ret; +} + +/* + * vmbus_set_event - Send an event notification to the parent + */ +int vmbus_set_event(struct vmbus_channel *channel) +{ +	u32 child_relid = channel->offermsg.child_relid; + +	if (!channel->is_dedicated_interrupt) { +		/* Each u32 represents 32 channels */ +		sync_set_bit(child_relid & 31, +			(unsigned long *)vmbus_connection.send_int_page + +			(child_relid >> 5)); +	} + +	return hv_signal_event(channel->sig_event); +} diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c new file mode 100644 index 00000000000..edfc8488cb0 --- /dev/null +++ b/drivers/hv/hv.c @@ -0,0 +1,426 @@ +/* + * Copyright (c) 2009, Microsoft Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Authors: + *   Haiyang Zhang <haiyangz@microsoft.com> + *   Hank Janssen  <hjanssen@microsoft.com> + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/hyperv.h> +#include <linux/version.h> +#include <linux/interrupt.h> +#include <asm/hyperv.h> +#include "hyperv_vmbus.h" + +/* The one and only */ +struct hv_context hv_context = { +	.synic_initialized	= false, +	.hypercall_page		= NULL, +}; + +/* + * query_hypervisor_info - Get version info of the windows hypervisor + */ +unsigned int host_info_eax; +unsigned int host_info_ebx; +unsigned int host_info_ecx; +unsigned int host_info_edx; + +static int query_hypervisor_info(void) +{ +	unsigned int eax; +	unsigned int ebx; +	unsigned int ecx; +	unsigned int edx; +	unsigned int max_leaf; +	unsigned int op; + +	/* +	* Its assumed that this is called after confirming that Viridian +	* is present. Query id and revision. +	*/ +	eax = 0; +	ebx = 0; +	ecx = 0; +	edx = 0; +	op = HVCPUID_VENDOR_MAXFUNCTION; +	cpuid(op, &eax, &ebx, &ecx, &edx); + +	max_leaf = eax; + +	if (max_leaf >= HVCPUID_VERSION) { +		eax = 0; +		ebx = 0; +		ecx = 0; +		edx = 0; +		op = HVCPUID_VERSION; +		cpuid(op, &eax, &ebx, &ecx, &edx); +		host_info_eax = eax; +		host_info_ebx = ebx; +		host_info_ecx = ecx; +		host_info_edx = edx; +	} +	return max_leaf; +} + +/* + * do_hypercall- Invoke the specified hypercall + */ +static u64 do_hypercall(u64 control, void *input, void *output) +{ +#ifdef CONFIG_X86_64 +	u64 hv_status = 0; +	u64 input_address = (input) ? virt_to_phys(input) : 0; +	u64 output_address = (output) ? virt_to_phys(output) : 0; +	void *hypercall_page = hv_context.hypercall_page; + +	__asm__ __volatile__("mov %0, %%r8" : : "r" (output_address) : "r8"); +	__asm__ __volatile__("call *%3" : "=a" (hv_status) : +			     "c" (control), "d" (input_address), +			     "m" (hypercall_page)); + +	return hv_status; + +#else + +	u32 control_hi = control >> 32; +	u32 control_lo = control & 0xFFFFFFFF; +	u32 hv_status_hi = 1; +	u32 hv_status_lo = 1; +	u64 input_address = (input) ? virt_to_phys(input) : 0; +	u32 input_address_hi = input_address >> 32; +	u32 input_address_lo = input_address & 0xFFFFFFFF; +	u64 output_address = (output) ? virt_to_phys(output) : 0; +	u32 output_address_hi = output_address >> 32; +	u32 output_address_lo = output_address & 0xFFFFFFFF; +	void *hypercall_page = hv_context.hypercall_page; + +	__asm__ __volatile__ ("call *%8" : "=d"(hv_status_hi), +			      "=a"(hv_status_lo) : "d" (control_hi), +			      "a" (control_lo), "b" (input_address_hi), +			      "c" (input_address_lo), "D"(output_address_hi), +			      "S"(output_address_lo), "m" (hypercall_page)); + +	return hv_status_lo | ((u64)hv_status_hi << 32); +#endif /* !x86_64 */ +} + +/* + * hv_init - Main initialization routine. 
+ * + * This routine must be called before any other routines in here are called + */ +int hv_init(void) +{ +	int max_leaf; +	union hv_x64_msr_hypercall_contents hypercall_msr; +	void *virtaddr = NULL; + +	memset(hv_context.synic_event_page, 0, sizeof(void *) * NR_CPUS); +	memset(hv_context.synic_message_page, 0, +	       sizeof(void *) * NR_CPUS); +	memset(hv_context.vp_index, 0, +	       sizeof(int) * NR_CPUS); +	memset(hv_context.event_dpc, 0, +	       sizeof(void *) * NR_CPUS); + +	max_leaf = query_hypervisor_info(); + +	/* +	 * Write our OS ID. +	 */ +	hv_context.guestid = generate_guest_id(0, LINUX_VERSION_CODE, 0); +	wrmsrl(HV_X64_MSR_GUEST_OS_ID, hv_context.guestid); + +	/* See if the hypercall page is already set */ +	rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + +	virtaddr = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_EXEC); + +	if (!virtaddr) +		goto cleanup; + +	hypercall_msr.enable = 1; + +	hypercall_msr.guest_physical_address = vmalloc_to_pfn(virtaddr); +	wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + +	/* Confirm that hypercall page did get setup. */ +	hypercall_msr.as_uint64 = 0; +	rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + +	if (!hypercall_msr.enable) +		goto cleanup; + +	hv_context.hypercall_page = virtaddr; + +	return 0; + +cleanup: +	if (virtaddr) { +		if (hypercall_msr.enable) { +			hypercall_msr.as_uint64 = 0; +			wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); +		} + +		vfree(virtaddr); +	} + +	return -ENOTSUPP; +} + +/* + * hv_cleanup - Cleanup routine. + * + * This routine is called normally during driver unloading or exiting. + */ +void hv_cleanup(void) +{ +	union hv_x64_msr_hypercall_contents hypercall_msr; + +	/* Reset our OS id */ +	wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); + +	if (hv_context.hypercall_page) { +		hypercall_msr.as_uint64 = 0; +		wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); +		vfree(hv_context.hypercall_page); +		hv_context.hypercall_page = NULL; +	} +} + +/* + * hv_post_message - Post a message using the hypervisor message IPC. + * + * This involves a hypercall. + */ +int hv_post_message(union hv_connection_id connection_id, +		  enum hv_message_type message_type, +		  void *payload, size_t payload_size) +{ +	struct aligned_input { +		u64 alignment8; +		struct hv_input_post_message msg; +	}; + +	struct hv_input_post_message *aligned_msg; +	u16 status; +	unsigned long addr; + +	if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) +		return -EMSGSIZE; + +	addr = (unsigned long)kmalloc(sizeof(struct aligned_input), GFP_ATOMIC); +	if (!addr) +		return -ENOMEM; + +	aligned_msg = (struct hv_input_post_message *) +			(ALIGN(addr, HV_HYPERCALL_PARAM_ALIGN)); + +	aligned_msg->connectionid = connection_id; +	aligned_msg->message_type = message_type; +	aligned_msg->payload_size = payload_size; +	memcpy((void *)aligned_msg->payload, payload, payload_size); + +	status = do_hypercall(HVCALL_POST_MESSAGE, aligned_msg, NULL) +		& 0xFFFF; + +	kfree((void *)addr); + +	return status; +} + + +/* + * hv_signal_event - + * Signal an event on the specified connection using the hypervisor event IPC. + * + * This involves a hypercall. 
+ */ +u16 hv_signal_event(void *con_id) +{ +	u16 status; + +	status = (do_hypercall(HVCALL_SIGNAL_EVENT, con_id, NULL) & 0xFFFF); + +	return status; +} + + +int hv_synic_alloc(void) +{ +	size_t size = sizeof(struct tasklet_struct); +	int cpu; + +	for_each_online_cpu(cpu) { +		hv_context.event_dpc[cpu] = kmalloc(size, GFP_ATOMIC); +		if (hv_context.event_dpc[cpu] == NULL) { +			pr_err("Unable to allocate event dpc\n"); +			goto err; +		} +		tasklet_init(hv_context.event_dpc[cpu], vmbus_on_event, cpu); + +		hv_context.synic_message_page[cpu] = +			(void *)get_zeroed_page(GFP_ATOMIC); + +		if (hv_context.synic_message_page[cpu] == NULL) { +			pr_err("Unable to allocate SYNIC message page\n"); +			goto err; +		} + +		hv_context.synic_event_page[cpu] = +			(void *)get_zeroed_page(GFP_ATOMIC); + +		if (hv_context.synic_event_page[cpu] == NULL) { +			pr_err("Unable to allocate SYNIC event page\n"); +			goto err; +		} +	} + +	return 0; +err: +	return -ENOMEM; +} + +static void hv_synic_free_cpu(int cpu) +{ +	kfree(hv_context.event_dpc[cpu]); +	if (hv_context.synic_event_page[cpu]) +		free_page((unsigned long)hv_context.synic_event_page[cpu]); +	if (hv_context.synic_message_page[cpu]) +		free_page((unsigned long)hv_context.synic_message_page[cpu]); +} + +void hv_synic_free(void) +{ +	int cpu; + +	for_each_online_cpu(cpu) +		hv_synic_free_cpu(cpu); +} + +/* + * hv_synic_init - Initialize the Synthethic Interrupt Controller. + * + * If it is already initialized by another entity (ie x2v shim), we need to + * retrieve the initialized message and event pages.  Otherwise, we create and + * initialize the message and event pages. + */ +void hv_synic_init(void *arg) +{ +	u64 version; +	union hv_synic_simp simp; +	union hv_synic_siefp siefp; +	union hv_synic_sint shared_sint; +	union hv_synic_scontrol sctrl; +	u64 vp_index; + +	int cpu = smp_processor_id(); + +	if (!hv_context.hypercall_page) +		return; + +	/* Check the version */ +	rdmsrl(HV_X64_MSR_SVERSION, version); + +	/* Setup the Synic's message page */ +	rdmsrl(HV_X64_MSR_SIMP, simp.as_uint64); +	simp.simp_enabled = 1; +	simp.base_simp_gpa = virt_to_phys(hv_context.synic_message_page[cpu]) +		>> PAGE_SHIFT; + +	wrmsrl(HV_X64_MSR_SIMP, simp.as_uint64); + +	/* Setup the Synic's event page */ +	rdmsrl(HV_X64_MSR_SIEFP, siefp.as_uint64); +	siefp.siefp_enabled = 1; +	siefp.base_siefp_gpa = virt_to_phys(hv_context.synic_event_page[cpu]) +		>> PAGE_SHIFT; + +	wrmsrl(HV_X64_MSR_SIEFP, siefp.as_uint64); + +	/* Setup the shared SINT. */ +	rdmsrl(HV_X64_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + +	shared_sint.as_uint64 = 0; +	shared_sint.vector = HYPERVISOR_CALLBACK_VECTOR; +	shared_sint.masked = false; +	shared_sint.auto_eoi = true; + +	wrmsrl(HV_X64_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + +	/* Enable the global synic bit */ +	rdmsrl(HV_X64_MSR_SCONTROL, sctrl.as_uint64); +	sctrl.enable = 1; + +	wrmsrl(HV_X64_MSR_SCONTROL, sctrl.as_uint64); + +	hv_context.synic_initialized = true; + +	/* +	 * Setup the mapping between Hyper-V's notion +	 * of cpuid and Linux' notion of cpuid. +	 * This array will be indexed using Linux cpuid. +	 */ +	rdmsrl(HV_X64_MSR_VP_INDEX, vp_index); +	hv_context.vp_index[cpu] = (u32)vp_index; + +	INIT_LIST_HEAD(&hv_context.percpu_list[cpu]); +	return; +} + +/* + * hv_synic_cleanup - Cleanup routine for hv_synic_init(). 
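+ *
+ * Like hv_synic_init(), this is written as a per-cpu callback (it uses
+ * smp_processor_id()), so the caller is expected to run it on each CPU
+ * that was initialized.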
+ */ +void hv_synic_cleanup(void *arg) +{ +	union hv_synic_sint shared_sint; +	union hv_synic_simp simp; +	union hv_synic_siefp siefp; +	int cpu = smp_processor_id(); + +	if (!hv_context.synic_initialized) +		return; + +	rdmsrl(HV_X64_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + +	shared_sint.masked = 1; + +	/* Need to correctly cleanup in the case of SMP!!! */ +	/* Disable the interrupt */ +	wrmsrl(HV_X64_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + +	rdmsrl(HV_X64_MSR_SIMP, simp.as_uint64); +	simp.simp_enabled = 0; +	simp.base_simp_gpa = 0; + +	wrmsrl(HV_X64_MSR_SIMP, simp.as_uint64); + +	rdmsrl(HV_X64_MSR_SIEFP, siefp.as_uint64); +	siefp.siefp_enabled = 0; +	siefp.base_siefp_gpa = 0; + +	wrmsrl(HV_X64_MSR_SIEFP, siefp.as_uint64); + +	free_page((unsigned long)hv_context.synic_message_page[cpu]); +	free_page((unsigned long)hv_context.synic_event_page[cpu]); +} diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c new file mode 100644 index 00000000000..5e90c5d771a --- /dev/null +++ b/drivers/hv/hv_balloon.c @@ -0,0 +1,1553 @@ +/* + * Copyright (c) 2012, Microsoft Corporation. + * + * Author: + *   K. Y. Srinivasan <kys@microsoft.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT.  See the GNU General Public License for more + * details. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/mman.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/kthread.h> +#include <linux/completion.h> +#include <linux/memory_hotplug.h> +#include <linux/memory.h> +#include <linux/notifier.h> +#include <linux/percpu_counter.h> + +#include <linux/hyperv.h> + +/* + * We begin with definitions supporting the Dynamic Memory protocol + * with the host. + * + * Begin protocol definitions. + */ + + + +/* + * Protocol versions. The low word is the minor version, the high word the major + * version. + * + * History: + * Initial version 1.0 + * Changed to 0.1 on 2009/03/25 + * Changes to 0.2 on 2009/05/14 + * Changes to 0.3 on 2009/12/03 + * Changed to 1.0 on 2011/04/05 + */ + +#define DYNMEM_MAKE_VERSION(Major, Minor) ((__u32)(((Major) << 16) | (Minor))) +#define DYNMEM_MAJOR_VERSION(Version) ((__u32)(Version) >> 16) +#define DYNMEM_MINOR_VERSION(Version) ((__u32)(Version) & 0xff) + +enum { +	DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3), +	DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0), + +	DYNMEM_PROTOCOL_VERSION_WIN7 = DYNMEM_PROTOCOL_VERSION_1, +	DYNMEM_PROTOCOL_VERSION_WIN8 = DYNMEM_PROTOCOL_VERSION_2, + +	DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN8 +}; + + + +/* + * Message Types + */ + +enum dm_message_type { +	/* +	 * Version 0.3 +	 */ +	DM_ERROR			= 0, +	DM_VERSION_REQUEST		= 1, +	DM_VERSION_RESPONSE		= 2, +	DM_CAPABILITIES_REPORT		= 3, +	DM_CAPABILITIES_RESPONSE	= 4, +	DM_STATUS_REPORT		= 5, +	DM_BALLOON_REQUEST		= 6, +	DM_BALLOON_RESPONSE		= 7, +	DM_UNBALLOON_REQUEST		= 8, +	DM_UNBALLOON_RESPONSE		= 9, +	DM_MEM_HOT_ADD_REQUEST		= 10, +	DM_MEM_HOT_ADD_RESPONSE		= 11, +	DM_VERSION_03_MAX		= 11, +	/* +	 * Version 1.0. 
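+	 *
+	 * DM_INFO_MESSAGE below was added in the 1.0 (WIN8) protocol; with
+	 * DYNMEM_MAKE_VERSION() that version encodes as 0x00010000 - major
+	 * version in the high word, minor version in the low word.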
+	 */ +	DM_INFO_MESSAGE			= 12, +	DM_VERSION_1_MAX		= 12 +}; + + +/* + * Structures defining the dynamic memory management + * protocol. + */ + +union dm_version { +	struct { +		__u16 minor_version; +		__u16 major_version; +	}; +	__u32 version; +} __packed; + + +union dm_caps { +	struct { +		__u64 balloon:1; +		__u64 hot_add:1; +		/* +		 * To support guests that may have alignment +		 * limitations on hot-add, the guest can specify +		 * its alignment requirements; a value of n +		 * represents an alignment of 2^n in mega bytes. +		 */ +		__u64 hot_add_alignment:4; +		__u64 reservedz:58; +	} cap_bits; +	__u64 caps; +} __packed; + +union dm_mem_page_range { +	struct  { +		/* +		 * The PFN number of the first page in the range. +		 * 40 bits is the architectural limit of a PFN +		 * number for AMD64. +		 */ +		__u64 start_page:40; +		/* +		 * The number of pages in the range. +		 */ +		__u64 page_cnt:24; +	} finfo; +	__u64  page_range; +} __packed; + + + +/* + * The header for all dynamic memory messages: + * + * type: Type of the message. + * size: Size of the message in bytes; including the header. + * trans_id: The guest is responsible for manufacturing this ID. + */ + +struct dm_header { +	__u16 type; +	__u16 size; +	__u32 trans_id; +} __packed; + +/* + * A generic message format for dynamic memory. + * Specific message formats are defined later in the file. + */ + +struct dm_message { +	struct dm_header hdr; +	__u8 data[]; /* enclosed message */ +} __packed; + + +/* + * Specific message types supporting the dynamic memory protocol. + */ + +/* + * Version negotiation message. Sent from the guest to the host. + * The guest is free to try different versions until the host + * accepts the version. + * + * dm_version: The protocol version requested. + * is_last_attempt: If TRUE, this is the last version guest will request. + * reservedz: Reserved field, set to zero. + */ + +struct dm_version_request { +	struct dm_header hdr; +	union dm_version version; +	__u32 is_last_attempt:1; +	__u32 reservedz:31; +} __packed; + +/* + * Version response message; Host to Guest and indicates + * if the host has accepted the version sent by the guest. + * + * is_accepted: If TRUE, host has accepted the version and the guest + * should proceed to the next stage of the protocol. FALSE indicates that + * guest should re-try with a different version. + * + * reservedz: Reserved field, set to zero. + */ + +struct dm_version_response { +	struct dm_header hdr; +	__u64 is_accepted:1; +	__u64 reservedz:63; +} __packed; + +/* + * Message reporting capabilities. This is sent from the guest to the + * host. + */ + +struct dm_capabilities { +	struct dm_header hdr; +	union dm_caps caps; +	__u64 min_page_cnt; +	__u64 max_page_number; +} __packed; + +/* + * Response to the capabilities message. This is sent from the host to the + * guest. This message notifies if the host has accepted the guest's + * capabilities. If the host has not accepted, the guest must shutdown + * the service. + * + * is_accepted: Indicates if the host has accepted guest's capabilities. + * reservedz: Must be 0. + */ + +struct dm_capabilities_resp_msg { +	struct dm_header hdr; +	__u64 is_accepted:1; +	__u64 reservedz:63; +} __packed; + +/* + * This message is used to report memory pressure from the guest. + * This message is not part of any transaction and there is no + * response to this message. + * + * num_avail: Available memory in pages. + * num_committed: Committed memory in pages. 
+ * page_file_size: The accumulated size of all page files
+ *		   in the system in pages.
+ * zero_free: The number of zero and free pages.
+ * page_file_writes: The writes to the page file in pages.
+ * io_diff: An indicator of file cache efficiency or page file activity,
+ *	    calculated as File Cache Page Fault Count - Page Read Count.
+ *	    This value is in pages.
+ *
+ * Some of these metrics are Windows specific and fortunately
+ * the algorithm on the host side that computes the guest memory
+ * pressure only uses the num_committed value.
+ */
+
+struct dm_status {
+	struct dm_header hdr;
+	__u64 num_avail;
+	__u64 num_committed;
+	__u64 page_file_size;
+	__u64 zero_free;
+	__u32 page_file_writes;
+	__u32 io_diff;
+} __packed;
+
+
+/*
+ * Message to ask the guest to allocate memory - balloon up message.
+ * This message is sent from the host to the guest. The guest may not be
+ * able to allocate as much memory as requested.
+ *
+ * num_pages: number of pages to allocate.
+ */
+
+struct dm_balloon {
+	struct dm_header hdr;
+	__u32 num_pages;
+	__u32 reservedz;
+} __packed;
+
+
+/*
+ * Balloon response message; this message is sent from the guest
+ * to the host in response to the balloon message.
+ *
+ * reservedz: Reserved; must be set to zero.
+ * more_pages: If FALSE, this is the last message of the transaction.
+ * If TRUE, there will be at least one more message from the guest.
+ *
+ * range_count: The number of ranges in the range array.
+ *
+ * range_array: An array of page ranges returned to the host.
+ *
+ */
+
+struct dm_balloon_response {
+	struct dm_header hdr;
+	__u32 reservedz;
+	__u32 more_pages:1;
+	__u32 range_count:31;
+	union dm_mem_page_range range_array[];
+} __packed;
+
+/*
+ * Un-balloon message; this message is sent from the host
+ * to the guest to give the guest more memory.
+ *
+ * more_pages: If FALSE, this is the last message of the transaction.
+ * If TRUE, there will be at least one more message from the guest.
+ *
+ * reservedz: Reserved; must be set to zero.
+ *
+ * range_count: The number of ranges in the range array.
+ *
+ * range_array: An array of page ranges returned to the host.
+ *
+ */
+
+struct dm_unballoon_request {
+	struct dm_header hdr;
+	__u32 more_pages:1;
+	__u32 reservedz:31;
+	__u32 range_count;
+	union dm_mem_page_range range_array[];
+} __packed;
+
+/*
+ * Un-balloon response message; this message is sent from the guest
+ * to the host in response to an unballoon request.
+ *
+ */
+
+struct dm_unballoon_response {
+	struct dm_header hdr;
+} __packed;
+
+
+/*
+ * Hot add request message. Message sent from the host to the guest.
+ *
+ * mem_range: Memory range to hot add.
+ *
+ * On Linux we currently don't support this since we cannot hot add
+ * arbitrary granularity of memory.
+ */
+
+struct dm_hot_add {
+	struct dm_header hdr;
+	union dm_mem_page_range range;
+} __packed;
+
+/*
+ * Hot add response message.
+ * This message is sent by the guest to report the status of a hot add request.
+ * If page_count is less than the requested page count, then the host should
+ * assume all further hot add requests will fail, since this indicates that
+ * the guest has hit an upper physical memory barrier.
+ *
+ * Hot adds may also fail due to low resources; in this case, the guest must
+ * not complete this message until the hot add can succeed, and the host must
+ * not send a new hot add request until the response is sent.
+ * If the VSC fails to hot add memory DYNMEM_NUMBER_OF_UNSUCCESSFUL_HOTADD_ATTEMPTS
+ * times, it fails the request.
+ *
+ *
+ * page_count: number of pages that were successfully hot added.
+ *
+ * result: result of the operation; 1: success, 0: failure.
+ *
+ */
+
+struct dm_hot_add_response {
+	struct dm_header hdr;
+	__u32 page_count;
+	__u32 result;
+} __packed;
+
+/*
+ * Types of information sent from the host to the guest.
+ */
+
+enum dm_info_type {
+	INFO_TYPE_MAX_PAGE_CNT = 0,
+	MAX_INFO_TYPE
+};
+
+
+/*
+ * Header for the information message.
+ */
+
+struct dm_info_header {
+	enum dm_info_type type;
+	__u32 data_size;
+} __packed;
+
+/*
+ * This message is sent from the host to the guest to pass
+ * some relevant information (win8 addition).
+ *
+ * reserved: not used.
+ * info_size: size of the information blob.
+ * info: information blob.
+ */
+
+struct dm_info_msg {
+	struct dm_header hdr;
+	__u32 reserved;
+	__u32 info_size;
+	__u8  info[];
+};
+
+/*
+ * End protocol definitions.
+ */
+
+/*
+ * State to manage hot adding memory into the guest.
+ * The range start_pfn : end_pfn specifies the range
+ * that the host has asked us to hot add. The range
+ * start_pfn : ha_end_pfn specifies the range that we have
+ * currently hot added. We hot add in multiples of 128M
+ * chunks; it is possible that we may not be able to bring
+ * online all the pages in the region. The range
+ * covered_start_pfn : covered_end_pfn defines the pages that can
+ * be brought online.
+ */
+
+struct hv_hotadd_state {
+	struct list_head list;
+	unsigned long start_pfn;
+	unsigned long covered_start_pfn;
+	unsigned long covered_end_pfn;
+	unsigned long ha_end_pfn;
+	unsigned long end_pfn;
+};
+
+struct balloon_state {
+	__u32 num_pages;
+	struct work_struct wrk;
+};
+
+struct hot_add_wrk {
+	union dm_mem_page_range ha_page_range;
+	union dm_mem_page_range ha_region_range;
+	struct work_struct wrk;
+};
+
+static bool hot_add = true;
+static bool do_hot_add;
+/*
+ * Delay reporting memory pressure by
+ * the specified number of seconds.
+ */
+static uint pressure_report_delay = 45;
+
+/*
+ * The last time we posted a pressure report to the host.
+ */
+static unsigned long last_post_time;
+
+module_param(hot_add, bool, (S_IRUGO | S_IWUSR));
+MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add");
+
+module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR));
+MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure");
+static atomic_t trans_id = ATOMIC_INIT(0);
+
+static int dm_ring_size = (5 * PAGE_SIZE);
+
+/*
+ * Driver specific state.
+ */
+
+enum hv_dm_state {
+	DM_INITIALIZING = 0,
+	DM_INITIALIZED,
+	DM_BALLOON_UP,
+	DM_BALLOON_DOWN,
+	DM_HOT_ADD,
+	DM_INIT_ERROR
+};
+
+
+static __u8 recv_buffer[PAGE_SIZE];
+static __u8 *send_buffer;
+#define PAGES_IN_2M	512
+#define HA_CHUNK (32 * 1024)
+
+struct hv_dynmem_device {
+	struct hv_device *dev;
+	enum hv_dm_state state;
+	struct completion host_event;
+	struct completion config_event;
+
+	/*
+	 * Number of pages we have currently ballooned out.
+	 */
+	unsigned int num_pages_ballooned;
+
+	/*
+	 * State to manage the ballooning (up) operation.
+	 */
+	struct balloon_state balloon_wrk;
+
+	/*
+	 * State to execute the "hot-add" operation.
+	 */
+	struct hot_add_wrk ha_wrk;
+
+	/*
+	 * This state tracks if the host has specified a hot-add
+	 * region.
+	 */
+	bool host_specified_ha_region;
+
+	/*
+	 * State to synchronize hot-add.
+	 */
+	struct completion  ol_waitevent;
+	bool ha_waiting;
+	/*
+	 * This thread handles hot-add
+	 * requests from the host as well as notifying
+	 * the host with regards to memory pressure in
+	 * the guest.
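+	 * (See dm_thread_func(), which wakes up roughly once a second to
+	 * post the pressure report via post_status().)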
+	 */ +	struct task_struct *thread; + +	/* +	 * A list of hot-add regions. +	 */ +	struct list_head ha_region_list; + +	/* +	 * We start with the highest version we can support +	 * and downgrade based on the host; we save here the +	 * next version to try. +	 */ +	__u32 next_version; +}; + +static struct hv_dynmem_device dm_device; + +static void post_status(struct hv_dynmem_device *dm); +#ifdef CONFIG_MEMORY_HOTPLUG + +static void hv_bring_pgs_online(unsigned long start_pfn, unsigned long size) +{ +	int i; + +	for (i = 0; i < size; i++) { +		struct page *pg; +		pg = pfn_to_page(start_pfn + i); +		__online_page_set_limits(pg); +		__online_page_increment_counters(pg); +		__online_page_free(pg); +	} +} + +static void hv_mem_hot_add(unsigned long start, unsigned long size, +				unsigned long pfn_count, +				struct hv_hotadd_state *has) +{ +	int ret = 0; +	int i, nid; +	unsigned long start_pfn; +	unsigned long processed_pfn; +	unsigned long total_pfn = pfn_count; + +	for (i = 0; i < (size/HA_CHUNK); i++) { +		start_pfn = start + (i * HA_CHUNK); +		has->ha_end_pfn +=  HA_CHUNK; + +		if (total_pfn > HA_CHUNK) { +			processed_pfn = HA_CHUNK; +			total_pfn -= HA_CHUNK; +		} else { +			processed_pfn = total_pfn; +			total_pfn = 0; +		} + +		has->covered_end_pfn +=  processed_pfn; + +		init_completion(&dm_device.ol_waitevent); +		dm_device.ha_waiting = true; + +		nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); +		ret = add_memory(nid, PFN_PHYS((start_pfn)), +				(HA_CHUNK << PAGE_SHIFT)); + +		if (ret) { +			pr_info("hot_add memory failed error is %d\n", ret); +			if (ret == -EEXIST) { +				/* +				 * This error indicates that the error +				 * is not a transient failure. This is the +				 * case where the guest's physical address map +				 * precludes hot adding memory. Stop all further +				 * memory hot-add. +				 */ +				do_hot_add = false; +			} +			has->ha_end_pfn -= HA_CHUNK; +			has->covered_end_pfn -=  processed_pfn; +			break; +		} + +		/* +		 * Wait for the memory block to be onlined. +		 * Since the hot add has succeeded, it is ok to +		 * proceed even if the pages in the hot added region +		 * have not been "onlined" within the allowed time. +		 */ +		wait_for_completion_timeout(&dm_device.ol_waitevent, 5*HZ); +		post_status(&dm_device); +	} + +	return; +} + +static void hv_online_page(struct page *pg) +{ +	struct list_head *cur; +	struct hv_hotadd_state *has; +	unsigned long cur_start_pgp; +	unsigned long cur_end_pgp; + +	if (dm_device.ha_waiting) { +		dm_device.ha_waiting = false; +		complete(&dm_device.ol_waitevent); +	} + +	list_for_each(cur, &dm_device.ha_region_list) { +		has = list_entry(cur, struct hv_hotadd_state, list); +		cur_start_pgp = (unsigned long) +				pfn_to_page(has->covered_start_pfn); +		cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn); + +		if (((unsigned long)pg >= cur_start_pgp) && +			((unsigned long)pg < cur_end_pgp)) { +			/* +			 * This frame is currently backed; online the +			 * page. 
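+			 * The covered_start_pfn..covered_end_pfn window
+			 * tracks which pfns in this region are actually
+			 * backed, so only those pages are onlined here.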
+			 */ +			__online_page_set_limits(pg); +			__online_page_increment_counters(pg); +			__online_page_free(pg); +			has->covered_start_pfn++; +		} +	} +} + +static bool pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) +{ +	struct list_head *cur; +	struct hv_hotadd_state *has; +	unsigned long residual, new_inc; + +	if (list_empty(&dm_device.ha_region_list)) +		return false; + +	list_for_each(cur, &dm_device.ha_region_list) { +		has = list_entry(cur, struct hv_hotadd_state, list); + +		/* +		 * If the pfn range we are dealing with is not in the current +		 * "hot add block", move on. +		 */ +		if ((start_pfn >= has->end_pfn)) +			continue; +		/* +		 * If the current hot add-request extends beyond +		 * our current limit; extend it. +		 */ +		if ((start_pfn + pfn_cnt) > has->end_pfn) { +			residual = (start_pfn + pfn_cnt - has->end_pfn); +			/* +			 * Extend the region by multiples of HA_CHUNK. +			 */ +			new_inc = (residual / HA_CHUNK) * HA_CHUNK; +			if (residual % HA_CHUNK) +				new_inc += HA_CHUNK; + +			has->end_pfn += new_inc; +		} + +		/* +		 * If the current start pfn is not where the covered_end +		 * is, update it. +		 */ + +		if (has->covered_end_pfn != start_pfn) { +			has->covered_end_pfn = start_pfn; +			has->covered_start_pfn = start_pfn; +		} +		return true; + +	} + +	return false; +} + +static unsigned long handle_pg_range(unsigned long pg_start, +					unsigned long pg_count) +{ +	unsigned long start_pfn = pg_start; +	unsigned long pfn_cnt = pg_count; +	unsigned long size; +	struct list_head *cur; +	struct hv_hotadd_state *has; +	unsigned long pgs_ol = 0; +	unsigned long old_covered_state; + +	if (list_empty(&dm_device.ha_region_list)) +		return 0; + +	list_for_each(cur, &dm_device.ha_region_list) { +		has = list_entry(cur, struct hv_hotadd_state, list); + +		/* +		 * If the pfn range we are dealing with is not in the current +		 * "hot add block", move on. +		 */ +		if ((start_pfn >= has->end_pfn)) +			continue; + +		old_covered_state = has->covered_end_pfn; + +		if (start_pfn < has->ha_end_pfn) { +			/* +			 * This is the case where we are backing pages +			 * in an already hot added region. Bring +			 * these pages online first. +			 */ +			pgs_ol = has->ha_end_pfn - start_pfn; +			if (pgs_ol > pfn_cnt) +				pgs_ol = pfn_cnt; +			hv_bring_pgs_online(start_pfn, pgs_ol); +			has->covered_end_pfn +=  pgs_ol; +			has->covered_start_pfn +=  pgs_ol; +			pfn_cnt -= pgs_ol; +		} + +		if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) { +			/* +			 * We have some residual hot add range +			 * that needs to be hot added; hot add +			 * it now. Hot add a multiple of +			 * of HA_CHUNK that fully covers the pages +			 * we have. +			 */ +			size = (has->end_pfn - has->ha_end_pfn); +			if (pfn_cnt <= size) { +				size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK); +				if (pfn_cnt % HA_CHUNK) +					size += HA_CHUNK; +			} else { +				pfn_cnt = size; +			} +			hv_mem_hot_add(has->ha_end_pfn, size, pfn_cnt, has); +		} +		/* +		 * If we managed to online any pages that were given to us, +		 * we declare success. 
+		 */ +		return has->covered_end_pfn - old_covered_state; + +	} + +	return 0; +} + +static unsigned long process_hot_add(unsigned long pg_start, +					unsigned long pfn_cnt, +					unsigned long rg_start, +					unsigned long rg_size) +{ +	struct hv_hotadd_state *ha_region = NULL; + +	if (pfn_cnt == 0) +		return 0; + +	if (!dm_device.host_specified_ha_region) +		if (pfn_covered(pg_start, pfn_cnt)) +			goto do_pg_range; + +	/* +	 * If the host has specified a hot-add range; deal with it first. +	 */ + +	if (rg_size != 0) { +		ha_region = kzalloc(sizeof(struct hv_hotadd_state), GFP_KERNEL); +		if (!ha_region) +			return 0; + +		INIT_LIST_HEAD(&ha_region->list); + +		list_add_tail(&ha_region->list, &dm_device.ha_region_list); +		ha_region->start_pfn = rg_start; +		ha_region->ha_end_pfn = rg_start; +		ha_region->covered_start_pfn = pg_start; +		ha_region->covered_end_pfn = pg_start; +		ha_region->end_pfn = rg_start + rg_size; +	} + +do_pg_range: +	/* +	 * Process the page range specified; bringing them +	 * online if possible. +	 */ +	return handle_pg_range(pg_start, pfn_cnt); +} + +#endif + +static void hot_add_req(struct work_struct *dummy) +{ +	struct dm_hot_add_response resp; +#ifdef CONFIG_MEMORY_HOTPLUG +	unsigned long pg_start, pfn_cnt; +	unsigned long rg_start, rg_sz; +#endif +	struct hv_dynmem_device *dm = &dm_device; + +	memset(&resp, 0, sizeof(struct dm_hot_add_response)); +	resp.hdr.type = DM_MEM_HOT_ADD_RESPONSE; +	resp.hdr.size = sizeof(struct dm_hot_add_response); + +#ifdef CONFIG_MEMORY_HOTPLUG +	pg_start = dm->ha_wrk.ha_page_range.finfo.start_page; +	pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt; + +	rg_start = dm->ha_wrk.ha_region_range.finfo.start_page; +	rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt; + +	if ((rg_start == 0) && (!dm->host_specified_ha_region)) { +		unsigned long region_size; +		unsigned long region_start; + +		/* +		 * The host has not specified the hot-add region. +		 * Based on the hot-add page range being specified, +		 * compute a hot-add region that can cover the pages +		 * that need to be hot-added while ensuring the alignment +		 * and size requirements of Linux as it relates to hot-add. +		 */ +		region_start = pg_start; +		region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK; +		if (pfn_cnt % HA_CHUNK) +			region_size += HA_CHUNK; + +		region_start = (pg_start / HA_CHUNK) * HA_CHUNK; + +		rg_start = region_start; +		rg_sz = region_size; +	} + +	if (do_hot_add) +		resp.page_count = process_hot_add(pg_start, pfn_cnt, +						rg_start, rg_sz); +#endif +	/* +	 * The result field of the response structure has the +	 * following semantics: +	 * +	 * 1. If all or some pages hot-added: Guest should return success. +	 * +	 * 2. If no pages could be hot-added: +	 * +	 * If the guest returns success, then the host +	 * will not attempt any further hot-add operations. This +	 * signifies a permanent failure. +	 * +	 * If the guest returns failure, then this failure will be +	 * treated as a transient failure and the host may retry the +	 * hot-add operation after some delay. 
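+	 *
+	 * Hence the code below reports success both when pages were added
+	 * and when hot-add is disabled (a permanent condition), and reports
+	 * failure only when hot-add is enabled but nothing could be added.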
+	 */ +	if (resp.page_count > 0) +		resp.result = 1; +	else if (!do_hot_add) +		resp.result = 1; +	else +		resp.result = 0; + +	if (!do_hot_add || (resp.page_count == 0)) +		pr_info("Memory hot add failed\n"); + +	dm->state = DM_INITIALIZED; +	resp.hdr.trans_id = atomic_inc_return(&trans_id); +	vmbus_sendpacket(dm->dev->channel, &resp, +			sizeof(struct dm_hot_add_response), +			(unsigned long)NULL, +			VM_PKT_DATA_INBAND, 0); +} + +static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg) +{ +	struct dm_info_header *info_hdr; + +	info_hdr = (struct dm_info_header *)msg->info; + +	switch (info_hdr->type) { +	case INFO_TYPE_MAX_PAGE_CNT: +		pr_info("Received INFO_TYPE_MAX_PAGE_CNT\n"); +		pr_info("Data Size is %d\n", info_hdr->data_size); +		break; +	default: +		pr_info("Received Unknown type: %d\n", info_hdr->type); +	} +} + +static unsigned long compute_balloon_floor(void) +{ +	unsigned long min_pages; +#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) +	/* Simple continuous piecewiese linear function: +	 *  max MiB -> min MiB  gradient +	 *       0         0 +	 *      16        16 +	 *      32        24 +	 *     128        72    (1/2) +	 *     512       168    (1/4) +	 *    2048       360    (1/8) +	 *    8192       552    (1/32) +	 *   32768      1320 +	 *  131072      4392 +	 */ +	if (totalram_pages < MB2PAGES(128)) +		min_pages = MB2PAGES(8) + (totalram_pages >> 1); +	else if (totalram_pages < MB2PAGES(512)) +		min_pages = MB2PAGES(40) + (totalram_pages >> 2); +	else if (totalram_pages < MB2PAGES(2048)) +		min_pages = MB2PAGES(104) + (totalram_pages >> 3); +	else +		min_pages = MB2PAGES(296) + (totalram_pages >> 5); +#undef MB2PAGES +	return min_pages; +} + +/* + * Post our status as it relates memory pressure to the + * host. Host expects the guests to post this status + * periodically at 1 second intervals. + * + * The metrics specified in this protocol are very Windows + * specific and so we cook up numbers here to convey our memory + * pressure. + */ + +static void post_status(struct hv_dynmem_device *dm) +{ +	struct dm_status status; +	struct sysinfo val; +	unsigned long now = jiffies; +	unsigned long last_post = last_post_time; + +	if (pressure_report_delay > 0) { +		--pressure_report_delay; +		return; +	} + +	if (!time_after(now, (last_post_time + HZ))) +		return; + +	si_meminfo(&val); +	memset(&status, 0, sizeof(struct dm_status)); +	status.hdr.type = DM_STATUS_REPORT; +	status.hdr.size = sizeof(struct dm_status); +	status.hdr.trans_id = atomic_inc_return(&trans_id); + +	/* +	 * The host expects the guest to report free memory. +	 * Further, the host expects the pressure information to +	 * include the ballooned out pages. +	 * For a given amount of memory that we are managing, we +	 * need to compute a floor below which we should not balloon. +	 * Compute this and add it to the pressure report. +	 */ +	status.num_avail = val.freeram; +	status.num_committed = vm_memory_committed() + +				dm->num_pages_ballooned + +				compute_balloon_floor(); + +	/* +	 * If our transaction ID is no longer current, just don't +	 * send the status. This can happen if we were interrupted +	 * after we picked our transaction ID. +	 */ +	if (status.hdr.trans_id != atomic_read(&trans_id)) +		return; + +	/* +	 * If the last post time that we sampled has changed, +	 * we have raced, don't post the status. 
+	 */ +	if (last_post != last_post_time) +		return; + +	last_post_time = jiffies; +	vmbus_sendpacket(dm->dev->channel, &status, +				sizeof(struct dm_status), +				(unsigned long)NULL, +				VM_PKT_DATA_INBAND, 0); + +} + +static void free_balloon_pages(struct hv_dynmem_device *dm, +			 union dm_mem_page_range *range_array) +{ +	int num_pages = range_array->finfo.page_cnt; +	__u64 start_frame = range_array->finfo.start_page; +	struct page *pg; +	int i; + +	for (i = 0; i < num_pages; i++) { +		pg = pfn_to_page(i + start_frame); +		__free_page(pg); +		dm->num_pages_ballooned--; +	} +} + + + +static int  alloc_balloon_pages(struct hv_dynmem_device *dm, int num_pages, +			 struct dm_balloon_response *bl_resp, int alloc_unit, +			 bool *alloc_error) +{ +	int i = 0; +	struct page *pg; + +	if (num_pages < alloc_unit) +		return 0; + +	for (i = 0; (i * alloc_unit) < num_pages; i++) { +		if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) > +			PAGE_SIZE) +			return i * alloc_unit; + +		/* +		 * We execute this code in a thread context. Furthermore, +		 * we don't want the kernel to try too hard. +		 */ +		pg = alloc_pages(GFP_HIGHUSER | __GFP_NORETRY | +				__GFP_NOMEMALLOC | __GFP_NOWARN, +				get_order(alloc_unit << PAGE_SHIFT)); + +		if (!pg) { +			*alloc_error = true; +			return i * alloc_unit; +		} + + +		dm->num_pages_ballooned += alloc_unit; + +		/* +		 * If we allocatted 2M pages; split them so we +		 * can free them in any order we get. +		 */ + +		if (alloc_unit != 1) +			split_page(pg, get_order(alloc_unit << PAGE_SHIFT)); + +		bl_resp->range_count++; +		bl_resp->range_array[i].finfo.start_page = +			page_to_pfn(pg); +		bl_resp->range_array[i].finfo.page_cnt = alloc_unit; +		bl_resp->hdr.size += sizeof(union dm_mem_page_range); + +	} + +	return num_pages; +} + + + +static void balloon_up(struct work_struct *dummy) +{ +	int num_pages = dm_device.balloon_wrk.num_pages; +	int num_ballooned = 0; +	struct dm_balloon_response *bl_resp; +	int alloc_unit; +	int ret; +	bool alloc_error = false; +	bool done = false; +	int i; + + +	/* +	 * We will attempt 2M allocations. However, if we fail to +	 * allocate 2M chunks, we will go back to 4k allocations. +	 */ +	alloc_unit = 512; + +	while (!done) { +		bl_resp = (struct dm_balloon_response *)send_buffer; +		memset(send_buffer, 0, PAGE_SIZE); +		bl_resp->hdr.type = DM_BALLOON_RESPONSE; +		bl_resp->hdr.size = sizeof(struct dm_balloon_response); +		bl_resp->more_pages = 1; + + +		num_pages -= num_ballooned; +		num_ballooned = alloc_balloon_pages(&dm_device, num_pages, +						bl_resp, alloc_unit, +						 &alloc_error); + +		if ((alloc_error) && (alloc_unit != 1)) { +			alloc_unit = 1; +			continue; +		} + +		if ((alloc_error) || (num_ballooned == num_pages)) { +			bl_resp->more_pages = 0; +			done = true; +			dm_device.state = DM_INITIALIZED; +		} + +		/* +		 * We are pushing a lot of data through the channel; +		 * deal with transient failures caused because of the +		 * lack of space in the ring buffer. +		 */ + +		do { +			bl_resp->hdr.trans_id = atomic_inc_return(&trans_id); +			ret = vmbus_sendpacket(dm_device.dev->channel, +						bl_resp, +						bl_resp->hdr.size, +						(unsigned long)NULL, +						VM_PKT_DATA_INBAND, 0); + +			if (ret == -EAGAIN) +				msleep(20); +			post_status(&dm_device); +		} while (ret == -EAGAIN); + +		if (ret) { +			/* +			 * Free up the memory we allocatted. 
+			 */ +			pr_info("Balloon response failed\n"); + +			for (i = 0; i < bl_resp->range_count; i++) +				free_balloon_pages(&dm_device, +						 &bl_resp->range_array[i]); + +			done = true; +		} +	} + +} + +static void balloon_down(struct hv_dynmem_device *dm, +			struct dm_unballoon_request *req) +{ +	union dm_mem_page_range *range_array = req->range_array; +	int range_count = req->range_count; +	struct dm_unballoon_response resp; +	int i; + +	for (i = 0; i < range_count; i++) { +		free_balloon_pages(dm, &range_array[i]); +		post_status(&dm_device); +	} + +	if (req->more_pages == 1) +		return; + +	memset(&resp, 0, sizeof(struct dm_unballoon_response)); +	resp.hdr.type = DM_UNBALLOON_RESPONSE; +	resp.hdr.trans_id = atomic_inc_return(&trans_id); +	resp.hdr.size = sizeof(struct dm_unballoon_response); + +	vmbus_sendpacket(dm_device.dev->channel, &resp, +				sizeof(struct dm_unballoon_response), +				(unsigned long)NULL, +				VM_PKT_DATA_INBAND, 0); + +	dm->state = DM_INITIALIZED; +} + +static void balloon_onchannelcallback(void *context); + +static int dm_thread_func(void *dm_dev) +{ +	struct hv_dynmem_device *dm = dm_dev; +	int t; + +	while (!kthread_should_stop()) { +		t = wait_for_completion_interruptible_timeout( +						&dm_device.config_event, 1*HZ); +		/* +		 * The host expects us to post information on the memory +		 * pressure every second. +		 */ + +		if (t == 0) +			post_status(dm); + +	} + +	return 0; +} + + +static void version_resp(struct hv_dynmem_device *dm, +			struct dm_version_response *vresp) +{ +	struct dm_version_request version_req; +	int ret; + +	if (vresp->is_accepted) { +		/* +		 * We are done; wakeup the +		 * context waiting for version +		 * negotiation. +		 */ +		complete(&dm->host_event); +		return; +	} +	/* +	 * If there are more versions to try, continue +	 * with negotiations; if not +	 * shutdown the service since we are not able +	 * to negotiate a suitable version number +	 * with the host. 
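+	 *
+	 * Only one downgrade is attempted: from the WIN8 (1.0) version
+	 * requested in balloon_probe() to the WIN7 (0.3) version below,
+	 * which is marked as the last attempt.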
+	 */ +	if (dm->next_version == 0) +		goto version_error; + +	dm->next_version = 0; +	memset(&version_req, 0, sizeof(struct dm_version_request)); +	version_req.hdr.type = DM_VERSION_REQUEST; +	version_req.hdr.size = sizeof(struct dm_version_request); +	version_req.hdr.trans_id = atomic_inc_return(&trans_id); +	version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN7; +	version_req.is_last_attempt = 1; + +	ret = vmbus_sendpacket(dm->dev->channel, &version_req, +				sizeof(struct dm_version_request), +				(unsigned long)NULL, +				VM_PKT_DATA_INBAND, 0); + +	if (ret) +		goto version_error; + +	return; + +version_error: +	dm->state = DM_INIT_ERROR; +	complete(&dm->host_event); +} + +static void cap_resp(struct hv_dynmem_device *dm, +			struct dm_capabilities_resp_msg *cap_resp) +{ +	if (!cap_resp->is_accepted) { +		pr_info("Capabilities not accepted by host\n"); +		dm->state = DM_INIT_ERROR; +	} +	complete(&dm->host_event); +} + +static void balloon_onchannelcallback(void *context) +{ +	struct hv_device *dev = context; +	u32 recvlen; +	u64 requestid; +	struct dm_message *dm_msg; +	struct dm_header *dm_hdr; +	struct hv_dynmem_device *dm = hv_get_drvdata(dev); +	struct dm_balloon *bal_msg; +	struct dm_hot_add *ha_msg; +	union dm_mem_page_range *ha_pg_range; +	union dm_mem_page_range *ha_region; + +	memset(recv_buffer, 0, sizeof(recv_buffer)); +	vmbus_recvpacket(dev->channel, recv_buffer, +			 PAGE_SIZE, &recvlen, &requestid); + +	if (recvlen > 0) { +		dm_msg = (struct dm_message *)recv_buffer; +		dm_hdr = &dm_msg->hdr; + +		switch (dm_hdr->type) { +		case DM_VERSION_RESPONSE: +			version_resp(dm, +				 (struct dm_version_response *)dm_msg); +			break; + +		case DM_CAPABILITIES_RESPONSE: +			cap_resp(dm, +				 (struct dm_capabilities_resp_msg *)dm_msg); +			break; + +		case DM_BALLOON_REQUEST: +			if (dm->state == DM_BALLOON_UP) +				pr_warn("Currently ballooning\n"); +			bal_msg = (struct dm_balloon *)recv_buffer; +			dm->state = DM_BALLOON_UP; +			dm_device.balloon_wrk.num_pages = bal_msg->num_pages; +			schedule_work(&dm_device.balloon_wrk.wrk); +			break; + +		case DM_UNBALLOON_REQUEST: +			dm->state = DM_BALLOON_DOWN; +			balloon_down(dm, +				 (struct dm_unballoon_request *)recv_buffer); +			break; + +		case DM_MEM_HOT_ADD_REQUEST: +			if (dm->state == DM_HOT_ADD) +				pr_warn("Currently hot-adding\n"); +			dm->state = DM_HOT_ADD; +			ha_msg = (struct dm_hot_add *)recv_buffer; +			if (ha_msg->hdr.size == sizeof(struct dm_hot_add)) { +				/* +				 * This is a normal hot-add request specifying +				 * hot-add memory. +				 */ +				ha_pg_range = &ha_msg->range; +				dm->ha_wrk.ha_page_range = *ha_pg_range; +				dm->ha_wrk.ha_region_range.page_range = 0; +			} else { +				/* +				 * Host is specifying that we first hot-add +				 * a region and then partially populate this +				 * region. +				 */ +				dm->host_specified_ha_region = true; +				ha_pg_range = &ha_msg->range; +				ha_region = &ha_pg_range[1]; +				dm->ha_wrk.ha_page_range = *ha_pg_range; +				dm->ha_wrk.ha_region_range = *ha_region; +			} +			schedule_work(&dm_device.ha_wrk.wrk); +			break; + +		case DM_INFO_MESSAGE: +			process_info(dm, (struct dm_info_msg *)dm_msg); +			break; + +		default: +			pr_err("Unhandled message: type: %d\n", dm_hdr->type); + +		} +	} + +} + +static int balloon_probe(struct hv_device *dev, +			const struct hv_vmbus_device_id *dev_id) +{ +	int ret, t; +	struct dm_version_request version_req; +	struct dm_capabilities cap_msg; + +	do_hot_add = hot_add; + +	/* +	 * First allocate a send buffer. 
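+	 * A single page is sufficient: balloon responses are built in this
+	 * buffer and alloc_balloon_pages() caps the response at PAGE_SIZE.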
+	 */ + +	send_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); +	if (!send_buffer) +		return -ENOMEM; + +	ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0, +			balloon_onchannelcallback, dev); + +	if (ret) +		goto probe_error0; + +	dm_device.dev = dev; +	dm_device.state = DM_INITIALIZING; +	dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN7; +	init_completion(&dm_device.host_event); +	init_completion(&dm_device.config_event); +	INIT_LIST_HEAD(&dm_device.ha_region_list); +	INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up); +	INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req); +	dm_device.host_specified_ha_region = false; + +	dm_device.thread = +		 kthread_run(dm_thread_func, &dm_device, "hv_balloon"); +	if (IS_ERR(dm_device.thread)) { +		ret = PTR_ERR(dm_device.thread); +		goto probe_error1; +	} + +#ifdef CONFIG_MEMORY_HOTPLUG +	set_online_page_callback(&hv_online_page); +#endif + +	hv_set_drvdata(dev, &dm_device); +	/* +	 * Initiate the hand shake with the host and negotiate +	 * a version that the host can support. We start with the +	 * highest version number and go down if the host cannot +	 * support it. +	 */ +	memset(&version_req, 0, sizeof(struct dm_version_request)); +	version_req.hdr.type = DM_VERSION_REQUEST; +	version_req.hdr.size = sizeof(struct dm_version_request); +	version_req.hdr.trans_id = atomic_inc_return(&trans_id); +	version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN8; +	version_req.is_last_attempt = 0; + +	ret = vmbus_sendpacket(dev->channel, &version_req, +				sizeof(struct dm_version_request), +				(unsigned long)NULL, +				VM_PKT_DATA_INBAND, 0); +	if (ret) +		goto probe_error2; + +	t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); +	if (t == 0) { +		ret = -ETIMEDOUT; +		goto probe_error2; +	} + +	/* +	 * If we could not negotiate a compatible version with the host +	 * fail the probe function. +	 */ +	if (dm_device.state == DM_INIT_ERROR) { +		ret = -ETIMEDOUT; +		goto probe_error2; +	} +	/* +	 * Now submit our capabilities to the host. +	 */ +	memset(&cap_msg, 0, sizeof(struct dm_capabilities)); +	cap_msg.hdr.type = DM_CAPABILITIES_REPORT; +	cap_msg.hdr.size = sizeof(struct dm_capabilities); +	cap_msg.hdr.trans_id = atomic_inc_return(&trans_id); + +	cap_msg.caps.cap_bits.balloon = 1; +	cap_msg.caps.cap_bits.hot_add = 1; + +	/* +	 * Specify our alignment requirements as it relates +	 * memory hot-add. Specify 128MB alignment. +	 */ +	cap_msg.caps.cap_bits.hot_add_alignment = 7; + +	/* +	 * Currently the host does not use these +	 * values and we set them to what is done in the +	 * Windows driver. +	 */ +	cap_msg.min_page_cnt = 0; +	cap_msg.max_page_number = -1; + +	ret = vmbus_sendpacket(dev->channel, &cap_msg, +				sizeof(struct dm_capabilities), +				(unsigned long)NULL, +				VM_PKT_DATA_INBAND, 0); +	if (ret) +		goto probe_error2; + +	t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); +	if (t == 0) { +		ret = -ETIMEDOUT; +		goto probe_error2; +	} + +	/* +	 * If the host does not like our capabilities, +	 * fail the probe function. 
+	 */ +	if (dm_device.state == DM_INIT_ERROR) { +		ret = -ETIMEDOUT; +		goto probe_error2; +	} + +	dm_device.state = DM_INITIALIZED; + +	return 0; + +probe_error2: +#ifdef CONFIG_MEMORY_HOTPLUG +	restore_online_page_callback(&hv_online_page); +#endif +	kthread_stop(dm_device.thread); + +probe_error1: +	vmbus_close(dev->channel); +probe_error0: +	kfree(send_buffer); +	return ret; +} + +static int balloon_remove(struct hv_device *dev) +{ +	struct hv_dynmem_device *dm = hv_get_drvdata(dev); +	struct list_head *cur, *tmp; +	struct hv_hotadd_state *has; + +	if (dm->num_pages_ballooned != 0) +		pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned); + +	cancel_work_sync(&dm->balloon_wrk.wrk); +	cancel_work_sync(&dm->ha_wrk.wrk); + +	vmbus_close(dev->channel); +	kthread_stop(dm->thread); +	kfree(send_buffer); +#ifdef CONFIG_MEMORY_HOTPLUG +	restore_online_page_callback(&hv_online_page); +#endif +	list_for_each_safe(cur, tmp, &dm->ha_region_list) { +		has = list_entry(cur, struct hv_hotadd_state, list); +		list_del(&has->list); +		kfree(has); +	} + +	return 0; +} + +static const struct hv_vmbus_device_id id_table[] = { +	/* Dynamic Memory Class ID */ +	/* 525074DC-8985-46e2-8057-A307DC18A502 */ +	{ HV_DM_GUID, }, +	{ }, +}; + +MODULE_DEVICE_TABLE(vmbus, id_table); + +static  struct hv_driver balloon_drv = { +	.name = "hv_balloon", +	.id_table = id_table, +	.probe =  balloon_probe, +	.remove =  balloon_remove, +}; + +static int __init init_balloon_drv(void) +{ + +	return vmbus_driver_register(&balloon_drv); +} + +module_init(init_balloon_drv); + +MODULE_DESCRIPTION("Hyper-V Balloon"); +MODULE_LICENSE("GPL"); diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c new file mode 100644 index 00000000000..23b2ce294c4 --- /dev/null +++ b/drivers/hv/hv_fcopy.c @@ -0,0 +1,414 @@ +/* + * An implementation of file copy service. + * + * Copyright (C) 2014, Microsoft, Inc. + * + * Author : K. Y. Srinivasan <ksrinivasan@novell.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT.  See the GNU General Public License for more + * details. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/semaphore.h> +#include <linux/fs.h> +#include <linux/nls.h> +#include <linux/workqueue.h> +#include <linux/cdev.h> +#include <linux/hyperv.h> +#include <linux/sched.h> +#include <linux/uaccess.h> +#include <linux/miscdevice.h> + +#include "hyperv_vmbus.h" + +#define WIN8_SRV_MAJOR		1 +#define WIN8_SRV_MINOR		1 +#define WIN8_SRV_VERSION	(WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) + +/* + * Global state maintained for transaction that is being processed. + * For a class of integration services, including the "file copy service", + * the specified protocol is a "request/response" protocol which means that + * there can only be single outstanding transaction from the host at any + * given point in time. We use this to simplify memory management in this + * driver - we cache and process only one message at a time. 
+ * + * While the request/response protocol is guaranteed by the host, we further + * ensure this by serializing packet processing in this driver - we do not + * read additional packets from the VMBUs until the current packet is fully + * handled. + * + * The transaction "active" state is set when we receive a request from the + * host and we cleanup this state when the transaction is completed - when we + * respond to the host with our response. When the transaction active state is + * set, we defer handling incoming packets. + */ + +static struct { +	bool active; /* transaction status - active or not */ +	int recv_len; /* number of bytes received. */ +	struct hv_fcopy_hdr  *fcopy_msg; /* current message */ +	struct hv_start_fcopy  message; /*  sent to daemon */ +	struct vmbus_channel *recv_channel; /* chn we got the request */ +	u64 recv_req_id; /* request ID. */ +	void *fcopy_context; /* for the channel callback */ +	struct semaphore read_sema; +} fcopy_transaction; + +static bool opened; /* currently device opened */ + +/* + * Before we can accept copy messages from the host, we need + * to handshake with the user level daemon. This state tracks + * if we are in the handshake phase. + */ +static bool in_hand_shake = true; +static void fcopy_send_data(void); +static void fcopy_respond_to_host(int error); +static void fcopy_work_func(struct work_struct *dummy); +static DECLARE_DELAYED_WORK(fcopy_work, fcopy_work_func); +static u8 *recv_buffer; + +static void fcopy_work_func(struct work_struct *dummy) +{ +	/* +	 * If the timer fires, the user-mode component has not responded; +	 * process the pending transaction. +	 */ +	fcopy_respond_to_host(HV_E_FAIL); +} + +static int fcopy_handle_handshake(u32 version) +{ +	switch (version) { +	case FCOPY_CURRENT_VERSION: +		break; +	default: +		/* +		 * For now we will fail the registration. +		 * If and when we have multiple versions to +		 * deal with, we will be backward compatible. +		 * We will add this code when needed. +		 */ +		return -EINVAL; +	} +	pr_info("FCP: user-mode registering done. Daemon version: %d\n", +		version); +	fcopy_transaction.active = false; +	if (fcopy_transaction.fcopy_context) +		hv_fcopy_onchannelcallback(fcopy_transaction.fcopy_context); +	in_hand_shake = false; +	return 0; +} + +static void fcopy_send_data(void) +{ +	struct hv_start_fcopy *smsg_out = &fcopy_transaction.message; +	int operation = fcopy_transaction.fcopy_msg->operation; +	struct hv_start_fcopy *smsg_in; + +	/* +	 * The  strings sent from the host are encoded in +	 * in utf16; convert it to utf8 strings. +	 * The host assures us that the utf16 strings will not exceed +	 * the max lengths specified. We will however, reserve room +	 * for the string terminating character - in the utf16s_utf8s() +	 * function we limit the size of the buffer where the converted +	 * string is placed to W_MAX_PATH -1 to guarantee +	 * that the strings can be properly terminated! 
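+	 *
+	 * Worked example of the sizing rule: smsg_out is zeroed below and
+	 * utf16s_to_utf8s() is limited to W_MAX_PATH - 1 output bytes, so
+	 * the final byte of file_name/path_name always remains '\0' and the
+	 * daemon is handed NUL-terminated strings.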
+	 */ + +	switch (operation) { +	case START_FILE_COPY: +		memset(smsg_out, 0, sizeof(struct hv_start_fcopy)); +		smsg_out->hdr.operation = operation; +		smsg_in = (struct hv_start_fcopy *)fcopy_transaction.fcopy_msg; + +		utf16s_to_utf8s((wchar_t *)smsg_in->file_name, W_MAX_PATH, +				UTF16_LITTLE_ENDIAN, +				(__u8 *)smsg_out->file_name, W_MAX_PATH - 1); + +		utf16s_to_utf8s((wchar_t *)smsg_in->path_name, W_MAX_PATH, +				UTF16_LITTLE_ENDIAN, +				(__u8 *)smsg_out->path_name, W_MAX_PATH - 1); + +		smsg_out->copy_flags = smsg_in->copy_flags; +		smsg_out->file_size = smsg_in->file_size; +		break; + +	default: +		break; +	} +	up(&fcopy_transaction.read_sema); +	return; +} + +/* + * Send a response back to the host. + */ + +static void +fcopy_respond_to_host(int error) +{ +	struct icmsg_hdr *icmsghdr; +	u32 buf_len; +	struct vmbus_channel *channel; +	u64 req_id; + +	/* +	 * Copy the global state for completing the transaction. Note that +	 * only one transaction can be active at a time. This is guaranteed +	 * by the file copy protocol implemented by the host. Furthermore, +	 * the "transaction active" state we maintain ensures that there can +	 * only be one active transaction at a time. +	 */ + +	buf_len = fcopy_transaction.recv_len; +	channel = fcopy_transaction.recv_channel; +	req_id = fcopy_transaction.recv_req_id; + +	fcopy_transaction.active = false; + +	icmsghdr = (struct icmsg_hdr *) +			&recv_buffer[sizeof(struct vmbuspipe_hdr)]; + +	if (channel->onchannel_callback == NULL) +		/* +		 * We have raced with util driver being unloaded; +		 * silently return. +		 */ +		return; + +	icmsghdr->status = error; +	icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; +	vmbus_sendpacket(channel, recv_buffer, buf_len, req_id, +				VM_PKT_DATA_INBAND, 0); +} + +void hv_fcopy_onchannelcallback(void *context) +{ +	struct vmbus_channel *channel = context; +	u32 recvlen; +	u64 requestid; +	struct hv_fcopy_hdr *fcopy_msg; +	struct icmsg_hdr *icmsghdr; +	struct icmsg_negotiate *negop = NULL; +	int util_fw_version; +	int fcopy_srv_version; + +	if (fcopy_transaction.active) { +		/* +		 * We will defer processing this callback once +		 * the current transaction is complete. +		 */ +		fcopy_transaction.fcopy_context = context; +		return; +	} + +	vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 2, &recvlen, +			 &requestid); +	if (recvlen <= 0) +		return; + +	icmsghdr = (struct icmsg_hdr *)&recv_buffer[ +			sizeof(struct vmbuspipe_hdr)]; +	if (icmsghdr->icmsgtype == ICMSGTYPE_NEGOTIATE) { +		util_fw_version = UTIL_FW_VERSION; +		fcopy_srv_version = WIN8_SRV_VERSION; +		vmbus_prep_negotiate_resp(icmsghdr, negop, recv_buffer, +				util_fw_version, fcopy_srv_version); +	} else { +		fcopy_msg = (struct hv_fcopy_hdr *)&recv_buffer[ +				sizeof(struct vmbuspipe_hdr) + +				sizeof(struct icmsg_hdr)]; + +		/* +		 * Stash away this global state for completing the +		 * transaction; note transactions are serialized. +		 */ + +		fcopy_transaction.active = true; +		fcopy_transaction.recv_len = recvlen; +		fcopy_transaction.recv_channel = channel; +		fcopy_transaction.recv_req_id = requestid; +		fcopy_transaction.fcopy_msg = fcopy_msg; + +		/* +		 * Send the information to the user-level daemon. 
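+		 *
+		 * The delayed work armed below is a watchdog: if the daemon
+		 * does not write a response within 5 seconds,
+		 * fcopy_work_func() fails the transaction with HV_E_FAIL;
+		 * a timely response cancels it in fcopy_write() before the
+		 * host is answered.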
+		 */ +		schedule_delayed_work(&fcopy_work, 5*HZ); +		fcopy_send_data(); +		return; +	} +	icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; +	vmbus_sendpacket(channel, recv_buffer, recvlen, requestid, +			VM_PKT_DATA_INBAND, 0); +} + +/* + * Create a char device that can support read/write for passing + * the payload. + */ + +static ssize_t fcopy_read(struct file *file, char __user *buf, +		size_t count, loff_t *ppos) +{ +	void *src; +	size_t copy_size; +	int operation; + +	/* +	 * Wait until there is something to be read. +	 */ +	if (down_interruptible(&fcopy_transaction.read_sema)) +		return -EINTR; + +	/* +	 * The channel may be rescinded and in this case, we will wakeup the +	 * the thread blocked on the semaphore and we will use the opened +	 * state to correctly handle this case. +	 */ +	if (!opened) +		return -ENODEV; + +	operation = fcopy_transaction.fcopy_msg->operation; + +	if (operation == START_FILE_COPY) { +		src = &fcopy_transaction.message; +		copy_size = sizeof(struct hv_start_fcopy); +		if (count < copy_size) +			return 0; +	} else { +		src = fcopy_transaction.fcopy_msg; +		copy_size = sizeof(struct hv_do_fcopy); +		if (count < copy_size) +			return 0; +	} +	if (copy_to_user(buf, src, copy_size)) +		return -EFAULT; + +	return copy_size; +} + +static ssize_t fcopy_write(struct file *file, const char __user *buf, +			size_t count, loff_t *ppos) +{ +	int response = 0; + +	if (count != sizeof(int)) +		return -EINVAL; + +	if (copy_from_user(&response, buf, sizeof(int))) +		return -EFAULT; + +	if (in_hand_shake) { +		if (fcopy_handle_handshake(response)) +			return -EINVAL; +		return sizeof(int); +	} + +	/* +	 * Complete the transaction by forwarding the result +	 * to the host. But first, cancel the timeout. +	 */ +	if (cancel_delayed_work_sync(&fcopy_work)) +		fcopy_respond_to_host(response); + +	return sizeof(int); +} + +static int fcopy_open(struct inode *inode, struct file *f) +{ +	/* +	 * The user level daemon that will open this device is +	 * really an extension of this driver. We can have only +	 * active open at a time. +	 */ +	if (opened) +		return -EBUSY; + +	/* +	 * The daemon is alive; setup the state. +	 */ +	opened = true; +	return 0; +} + +static int fcopy_release(struct inode *inode, struct file *f) +{ +	/* +	 * The daemon has exited; reset the state. +	 */ +	in_hand_shake = true; +	opened = false; +	return 0; +} + + +static const struct file_operations fcopy_fops = { +	.read           = fcopy_read, +	.write          = fcopy_write, +	.release	= fcopy_release, +	.open		= fcopy_open, +}; + +static struct miscdevice fcopy_misc = { +	.minor          = MISC_DYNAMIC_MINOR, +	.name           = "vmbus/hv_fcopy", +	.fops           = &fcopy_fops, +}; + +static int fcopy_dev_init(void) +{ +	return misc_register(&fcopy_misc); +} + +static void fcopy_dev_deinit(void) +{ + +	/* +	 * The device is going away - perhaps because the +	 * host has rescinded the channel. Setup state so that +	 * user level daemon can gracefully exit if it is blocked +	 * on the read semaphore. +	 */ +	opened = false; +	/* +	 * Signal the semaphore as the device is +	 * going away. +	 */ +	up(&fcopy_transaction.read_sema); +	misc_deregister(&fcopy_misc); +} + +int hv_fcopy_init(struct hv_util_service *srv) +{ +	recv_buffer = srv->recv_buffer; + +	/* +	 * When this driver loads, the user level daemon that +	 * processes the host requests may not yet be running. +	 * Defer processing channel callbacks until the daemon +	 * has registered. 
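+	 *
+	 * Marking the (empty) transaction active makes
+	 * hv_fcopy_onchannelcallback() stash the channel in fcopy_context
+	 * and return; fcopy_handle_handshake() clears the flag and replays
+	 * the deferred callback once registration completes.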
+	 */ +	fcopy_transaction.active = true; +	sema_init(&fcopy_transaction.read_sema, 0); + +	return fcopy_dev_init(); +} + +void hv_fcopy_deinit(void) +{ +	cancel_delayed_work_sync(&fcopy_work); +	fcopy_dev_deinit(); +} diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c new file mode 100644 index 00000000000..521c14625b3 --- /dev/null +++ b/drivers/hv/hv_kvp.c @@ -0,0 +1,715 @@ +/* + * An implementation of key value pair (KVP) functionality for Linux. + * + * + * Copyright (C) 2010, Novell, Inc. + * Author : K. Y. Srinivasan <ksrinivasan@novell.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT.  See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/net.h> +#include <linux/nls.h> +#include <linux/connector.h> +#include <linux/workqueue.h> +#include <linux/hyperv.h> + + +/* + * Pre win8 version numbers used in ws2008 and ws 2008 r2 (win7) + */ +#define WS2008_SRV_MAJOR	1 +#define WS2008_SRV_MINOR	0 +#define WS2008_SRV_VERSION     (WS2008_SRV_MAJOR << 16 | WS2008_SRV_MINOR) + +#define WIN7_SRV_MAJOR   3 +#define WIN7_SRV_MINOR   0 +#define WIN7_SRV_VERSION     (WIN7_SRV_MAJOR << 16 | WIN7_SRV_MINOR) + +#define WIN8_SRV_MAJOR   4 +#define WIN8_SRV_MINOR   0 +#define WIN8_SRV_VERSION     (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) + +/* + * Global state maintained for transaction that is being processed. + * Note that only one transaction can be active at any point in time. + * + * This state is set when we receive a request from the host; we + * cleanup this state when the transaction is completed - when we respond + * to the host with the key value. + */ + +static struct { +	bool active; /* transaction status - active or not */ +	int recv_len; /* number of bytes received. */ +	struct hv_kvp_msg  *kvp_msg; /* current message */ +	struct vmbus_channel *recv_channel; /* chn we got the request */ +	u64 recv_req_id; /* request ID. */ +	void *kvp_context; /* for the channel callback */ +} kvp_transaction; + +/* + * Before we can accept KVP messages from the host, we need + * to handshake with the user level daemon. This state tracks + * if we are in the handshake phase. + */ +static bool in_hand_shake = true; + +/* + * This state maintains the version number registered by the daemon. + */ +static int dm_reg_value; + +static void kvp_send_key(struct work_struct *dummy); + + +static void kvp_respond_to_host(struct hv_kvp_msg *msg, int error); +static void kvp_work_func(struct work_struct *dummy); +static void kvp_register(int); + +static DECLARE_DELAYED_WORK(kvp_work, kvp_work_func); +static DECLARE_WORK(kvp_sendkey_work, kvp_send_key); + +static struct cb_id kvp_id = { CN_KVP_IDX, CN_KVP_VAL }; +static const char kvp_name[] = "kvp_kernel_module"; +static u8 *recv_buffer; +/* + * Register the kernel component with the user-level daemon. + * As part of this registration, pass the LIC version number. 
+ * This number has no meaning, it satisfies the registration protocol. + */ +#define HV_DRV_VERSION           "3.1" + +static void +kvp_register(int reg_value) +{ + +	struct cn_msg *msg; +	struct hv_kvp_msg *kvp_msg; +	char *version; + +	msg = kzalloc(sizeof(*msg) + sizeof(struct hv_kvp_msg), GFP_ATOMIC); + +	if (msg) { +		kvp_msg = (struct hv_kvp_msg *)msg->data; +		version = kvp_msg->body.kvp_register.version; +		msg->id.idx =  CN_KVP_IDX; +		msg->id.val = CN_KVP_VAL; + +		kvp_msg->kvp_hdr.operation = reg_value; +		strcpy(version, HV_DRV_VERSION); +		msg->len = sizeof(struct hv_kvp_msg); +		cn_netlink_send(msg, 0, 0, GFP_ATOMIC); +		kfree(msg); +	} +} +static void +kvp_work_func(struct work_struct *dummy) +{ +	/* +	 * If the timer fires, the user-mode component has not responded; +	 * process the pending transaction. +	 */ +	kvp_respond_to_host(NULL, HV_E_FAIL); +} + +static void poll_channel(struct vmbus_channel *channel) +{ +	if (channel->target_cpu != smp_processor_id()) +		smp_call_function_single(channel->target_cpu, +					 hv_kvp_onchannelcallback, +					 channel, true); +	else +		hv_kvp_onchannelcallback(channel); +} + + +static int kvp_handle_handshake(struct hv_kvp_msg *msg) +{ +	int ret = 1; + +	switch (msg->kvp_hdr.operation) { +	case KVP_OP_REGISTER: +		dm_reg_value = KVP_OP_REGISTER; +		pr_info("KVP: IP injection functionality not available\n"); +		pr_info("KVP: Upgrade the KVP daemon\n"); +		break; +	case KVP_OP_REGISTER1: +		dm_reg_value = KVP_OP_REGISTER1; +		break; +	default: +		pr_info("KVP: incompatible daemon\n"); +		pr_info("KVP: KVP version: %d, Daemon version: %d\n", +			KVP_OP_REGISTER1, msg->kvp_hdr.operation); +		ret = 0; +	} + +	if (ret) { +		/* +		 * We have a compatible daemon; complete the handshake. +		 */ +		pr_info("KVP: user-mode registering done.\n"); +		kvp_register(dm_reg_value); +		kvp_transaction.active = false; +		if (kvp_transaction.kvp_context) +			poll_channel(kvp_transaction.kvp_context); +	} +	return ret; +} + + +/* + * Callback when data is received from user mode. + */ + +static void +kvp_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) +{ +	struct hv_kvp_msg *message; +	struct hv_kvp_msg_enumerate *data; +	int	error = 0; + +	message = (struct hv_kvp_msg *)msg->data; + +	/* +	 * If we are negotiating the version information +	 * with the daemon; handle that first. +	 */ + +	if (in_hand_shake) { +		if (kvp_handle_handshake(message)) +			in_hand_shake = false; +		return; +	} + +	/* +	 * Based on the version of the daemon, we propagate errors from the +	 * daemon differently. +	 */ + +	data = &message->body.kvp_enum_data; + +	switch (dm_reg_value) { +	case KVP_OP_REGISTER: +		/* +		 * Null string is used to pass back error condition. +		 */ +		if (data->data.key[0] == 0) +			error = HV_S_CONT; +		break; + +	case KVP_OP_REGISTER1: +		/* +		 * We use the message header information from +		 * the user level daemon to transmit errors. +		 */ +		error = message->error; +		break; +	} + +	/* +	 * Complete the transaction by forwarding the key value +	 * to the host. But first, cancel the timeout. +	 */ +	if (cancel_delayed_work_sync(&kvp_work)) +		kvp_respond_to_host(message, error); +} + + +static int process_ob_ipinfo(void *in_msg, void *out_msg, int op) +{ +	struct hv_kvp_msg *in = in_msg; +	struct hv_kvp_ip_msg *out = out_msg; +	int len; + +	switch (op) { +	case KVP_OP_GET_IP_INFO: +		/* +		 * Transform all parameters into utf16 encoding. 
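+		 * utf8s_to_utf16s() returns the number of UTF-16 code units
+		 * written, or a negative errno for an invalid sequence, hence
+		 * the "if (len < 0) return len;" check after every
+		 * conversion below.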
+		 */ +		len = utf8s_to_utf16s((char *)in->body.kvp_ip_val.ip_addr, +				strlen((char *)in->body.kvp_ip_val.ip_addr), +				UTF16_HOST_ENDIAN, +				(wchar_t *)out->kvp_ip_val.ip_addr, +				MAX_IP_ADDR_SIZE); +		if (len < 0) +			return len; + +		len = utf8s_to_utf16s((char *)in->body.kvp_ip_val.sub_net, +				strlen((char *)in->body.kvp_ip_val.sub_net), +				UTF16_HOST_ENDIAN, +				(wchar_t *)out->kvp_ip_val.sub_net, +				MAX_IP_ADDR_SIZE); +		if (len < 0) +			return len; + +		len = utf8s_to_utf16s((char *)in->body.kvp_ip_val.gate_way, +				strlen((char *)in->body.kvp_ip_val.gate_way), +				UTF16_HOST_ENDIAN, +				(wchar_t *)out->kvp_ip_val.gate_way, +				MAX_GATEWAY_SIZE); +		if (len < 0) +			return len; + +		len = utf8s_to_utf16s((char *)in->body.kvp_ip_val.dns_addr, +				strlen((char *)in->body.kvp_ip_val.dns_addr), +				UTF16_HOST_ENDIAN, +				(wchar_t *)out->kvp_ip_val.dns_addr, +				MAX_IP_ADDR_SIZE); +		if (len < 0) +			return len; + +		len = utf8s_to_utf16s((char *)in->body.kvp_ip_val.adapter_id, +				strlen((char *)in->body.kvp_ip_val.adapter_id), +				UTF16_HOST_ENDIAN, +				(wchar_t *)out->kvp_ip_val.adapter_id, +				MAX_IP_ADDR_SIZE); +		if (len < 0) +			return len; + +		out->kvp_ip_val.dhcp_enabled = +			in->body.kvp_ip_val.dhcp_enabled; +		out->kvp_ip_val.addr_family = +			in->body.kvp_ip_val.addr_family; +	} + +	return 0; +} + +static void process_ib_ipinfo(void *in_msg, void *out_msg, int op) +{ +	struct hv_kvp_ip_msg *in = in_msg; +	struct hv_kvp_msg *out = out_msg; + +	switch (op) { +	case KVP_OP_SET_IP_INFO: +		/* +		 * Transform all parameters into utf8 encoding. +		 */ +		utf16s_to_utf8s((wchar_t *)in->kvp_ip_val.ip_addr, +				MAX_IP_ADDR_SIZE, +				UTF16_LITTLE_ENDIAN, +				(__u8 *)out->body.kvp_ip_val.ip_addr, +				MAX_IP_ADDR_SIZE); + +		utf16s_to_utf8s((wchar_t *)in->kvp_ip_val.sub_net, +				MAX_IP_ADDR_SIZE, +				UTF16_LITTLE_ENDIAN, +				(__u8 *)out->body.kvp_ip_val.sub_net, +				MAX_IP_ADDR_SIZE); + +		utf16s_to_utf8s((wchar_t *)in->kvp_ip_val.gate_way, +				MAX_GATEWAY_SIZE, +				UTF16_LITTLE_ENDIAN, +				(__u8 *)out->body.kvp_ip_val.gate_way, +				MAX_GATEWAY_SIZE); + +		utf16s_to_utf8s((wchar_t *)in->kvp_ip_val.dns_addr, +				MAX_IP_ADDR_SIZE, +				UTF16_LITTLE_ENDIAN, +				(__u8 *)out->body.kvp_ip_val.dns_addr, +				MAX_IP_ADDR_SIZE); + +		out->body.kvp_ip_val.dhcp_enabled = in->kvp_ip_val.dhcp_enabled; + +	default: +		utf16s_to_utf8s((wchar_t *)in->kvp_ip_val.adapter_id, +				MAX_ADAPTER_ID_SIZE, +				UTF16_LITTLE_ENDIAN, +				(__u8 *)out->body.kvp_ip_val.adapter_id, +				MAX_ADAPTER_ID_SIZE); + +		out->body.kvp_ip_val.addr_family = in->kvp_ip_val.addr_family; +	} +} + + + + +static void +kvp_send_key(struct work_struct *dummy) +{ +	struct cn_msg *msg; +	struct hv_kvp_msg *message; +	struct hv_kvp_msg *in_msg; +	__u8 operation = kvp_transaction.kvp_msg->kvp_hdr.operation; +	__u8 pool = kvp_transaction.kvp_msg->kvp_hdr.pool; +	__u32 val32; +	__u64 val64; + +	msg = kzalloc(sizeof(*msg) + sizeof(struct hv_kvp_msg) , GFP_ATOMIC); +	if (!msg) +		return; + +	msg->id.idx =  CN_KVP_IDX; +	msg->id.val = CN_KVP_VAL; + +	message = (struct hv_kvp_msg *)msg->data; +	message->kvp_hdr.operation = operation; +	message->kvp_hdr.pool = pool; +	in_msg = kvp_transaction.kvp_msg; + +	/* +	 * The key/value strings sent from the host are encoded in +	 * in utf16; convert it to utf8 strings. +	 * The host assures us that the utf16 strings will not exceed +	 * the max lengths specified. 
We will however, reserve room +	 * for the string terminating character - in the utf16s_utf8s() +	 * function we limit the size of the buffer where the converted +	 * string is placed to HV_KVP_EXCHANGE_MAX_*_SIZE -1 to gaurantee +	 * that the strings can be properly terminated! +	 */ + +	switch (message->kvp_hdr.operation) { +	case KVP_OP_SET_IP_INFO: +		process_ib_ipinfo(in_msg, message, KVP_OP_SET_IP_INFO); +		break; +	case KVP_OP_GET_IP_INFO: +		process_ib_ipinfo(in_msg, message, KVP_OP_GET_IP_INFO); +		break; +	case KVP_OP_SET: +		switch (in_msg->body.kvp_set.data.value_type) { +		case REG_SZ: +			/* +			 * The value is a string - utf16 encoding. +			 */ +			message->body.kvp_set.data.value_size = +				utf16s_to_utf8s( +				(wchar_t *)in_msg->body.kvp_set.data.value, +				in_msg->body.kvp_set.data.value_size, +				UTF16_LITTLE_ENDIAN, +				message->body.kvp_set.data.value, +				HV_KVP_EXCHANGE_MAX_VALUE_SIZE - 1) + 1; +				break; + +		case REG_U32: +			/* +			 * The value is a 32 bit scalar. +			 * We save this as a utf8 string. +			 */ +			val32 = in_msg->body.kvp_set.data.value_u32; +			message->body.kvp_set.data.value_size = +				sprintf(message->body.kvp_set.data.value, +					"%d", val32) + 1; +			break; + +		case REG_U64: +			/* +			 * The value is a 64 bit scalar. +			 * We save this as a utf8 string. +			 */ +			val64 = in_msg->body.kvp_set.data.value_u64; +			message->body.kvp_set.data.value_size = +				sprintf(message->body.kvp_set.data.value, +					"%llu", val64) + 1; +			break; + +		} +	case KVP_OP_GET: +		message->body.kvp_set.data.key_size = +			utf16s_to_utf8s( +			(wchar_t *)in_msg->body.kvp_set.data.key, +			in_msg->body.kvp_set.data.key_size, +			UTF16_LITTLE_ENDIAN, +			message->body.kvp_set.data.key, +			HV_KVP_EXCHANGE_MAX_KEY_SIZE - 1) + 1; +			break; + +	case KVP_OP_DELETE: +		message->body.kvp_delete.key_size = +			utf16s_to_utf8s( +			(wchar_t *)in_msg->body.kvp_delete.key, +			in_msg->body.kvp_delete.key_size, +			UTF16_LITTLE_ENDIAN, +			message->body.kvp_delete.key, +			HV_KVP_EXCHANGE_MAX_KEY_SIZE - 1) + 1; +			break; + +	case KVP_OP_ENUMERATE: +		message->body.kvp_enum_data.index = +			in_msg->body.kvp_enum_data.index; +			break; +	} + +	msg->len = sizeof(struct hv_kvp_msg); +	cn_netlink_send(msg, 0, 0, GFP_ATOMIC); +	kfree(msg); + +	return; +} + +/* + * Send a response back to the host. + */ + +static void +kvp_respond_to_host(struct hv_kvp_msg *msg_to_host, int error) +{ +	struct hv_kvp_msg  *kvp_msg; +	struct hv_kvp_exchg_msg_value  *kvp_data; +	char	*key_name; +	char	*value; +	struct icmsg_hdr *icmsghdrp; +	int	keylen = 0; +	int	valuelen = 0; +	u32	buf_len; +	struct vmbus_channel *channel; +	u64	req_id; +	int ret; + +	/* +	 * If a transaction is not active; log and return. +	 */ + +	if (!kvp_transaction.active) { +		/* +		 * This is a spurious call! +		 */ +		pr_warn("KVP: Transaction not active\n"); +		return; +	} +	/* +	 * Copy the global state for completing the transaction. Note that +	 * only one transaction can be active at a time. +	 */ + +	buf_len = kvp_transaction.recv_len; +	channel = kvp_transaction.recv_channel; +	req_id = kvp_transaction.recv_req_id; + +	kvp_transaction.active = false; + +	icmsghdrp = (struct icmsg_hdr *) +			&recv_buffer[sizeof(struct vmbuspipe_hdr)]; + +	if (channel->onchannel_callback == NULL) +		/* +		 * We have raced with util driver being unloaded; +		 * silently return. +		 */ +		return; + +	icmsghdrp->status = error; + +	/* +	 * If the error parameter is set, terminate the host's enumeration +	 * on this pool. 
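+	 * This is also how a KVP_OP_ENUMERATE pass ends: older daemons
+	 * signal completion with an empty key, which kvp_cn_callback() maps
+	 * to HV_S_CONT, while newer daemons report the status in
+	 * message->error; either way we stop here rather than copy a
+	 * key/value pair.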
+	 */ +	if (error) { +		/* +		 * Something failed or we have timedout; +		 * terminate the current host-side iteration. +		 */ +		goto response_done; +	} + +	kvp_msg = (struct hv_kvp_msg *) +			&recv_buffer[sizeof(struct vmbuspipe_hdr) + +			sizeof(struct icmsg_hdr)]; + +	switch (kvp_transaction.kvp_msg->kvp_hdr.operation) { +	case KVP_OP_GET_IP_INFO: +		ret = process_ob_ipinfo(msg_to_host, +				 (struct hv_kvp_ip_msg *)kvp_msg, +				 KVP_OP_GET_IP_INFO); +		if (ret < 0) +			icmsghdrp->status = HV_E_FAIL; + +		goto response_done; +	case KVP_OP_SET_IP_INFO: +		goto response_done; +	case KVP_OP_GET: +		kvp_data = &kvp_msg->body.kvp_get.data; +		goto copy_value; + +	case KVP_OP_SET: +	case KVP_OP_DELETE: +		goto response_done; + +	default: +		break; +	} + +	kvp_data = &kvp_msg->body.kvp_enum_data.data; +	key_name = msg_to_host->body.kvp_enum_data.data.key; + +	/* +	 * The windows host expects the key/value pair to be encoded +	 * in utf16. Ensure that the key/value size reported to the host +	 * will be less than or equal to the MAX size (including the +	 * terminating character). +	 */ +	keylen = utf8s_to_utf16s(key_name, strlen(key_name), UTF16_HOST_ENDIAN, +				(wchar_t *) kvp_data->key, +				(HV_KVP_EXCHANGE_MAX_KEY_SIZE / 2) - 2); +	kvp_data->key_size = 2*(keylen + 1); /* utf16 encoding */ + +copy_value: +	value = msg_to_host->body.kvp_enum_data.data.value; +	valuelen = utf8s_to_utf16s(value, strlen(value), UTF16_HOST_ENDIAN, +				(wchar_t *) kvp_data->value, +				(HV_KVP_EXCHANGE_MAX_VALUE_SIZE / 2) - 2); +	kvp_data->value_size = 2*(valuelen + 1); /* utf16 encoding */ + +	/* +	 * If the utf8s to utf16s conversion failed; notify host +	 * of the error. +	 */ +	if ((keylen < 0) || (valuelen < 0)) +		icmsghdrp->status = HV_E_FAIL; + +	kvp_data->value_type = REG_SZ; /* all our values are strings */ + +response_done: +	icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; + +	vmbus_sendpacket(channel, recv_buffer, buf_len, req_id, +				VM_PKT_DATA_INBAND, 0); +	poll_channel(channel); +} + +/* + * This callback is invoked when we get a KVP message from the host. + * The host ensures that only one KVP transaction can be active at a time. + * KVP implementation in Linux needs to forward the key to a user-mde + * component to retrive the corresponding value. Consequently, we cannot + * respond to the host in the conext of this callback. Since the host + * guarantees that at most only one transaction can be active at a time, + * we stash away the transaction state in a set of global variables. + */ + +void hv_kvp_onchannelcallback(void *context) +{ +	struct vmbus_channel *channel = context; +	u32 recvlen; +	u64 requestid; + +	struct hv_kvp_msg *kvp_msg; + +	struct icmsg_hdr *icmsghdrp; +	struct icmsg_negotiate *negop = NULL; +	int util_fw_version; +	int kvp_srv_version; + +	if (kvp_transaction.active) { +		/* +		 * We will defer processing this callback once +		 * the current transaction is complete. +		 */ +		kvp_transaction.kvp_context = context; +		return; +	} + +	vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 4, &recvlen, +			 &requestid); + +	if (recvlen > 0) { +		icmsghdrp = (struct icmsg_hdr *)&recv_buffer[ +			sizeof(struct vmbuspipe_hdr)]; + +		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { +			/* +			 * Based on the host, select appropriate +			 * framework and service versions we will +			 * negotiate. 
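+			 *
+			 * The mapping coded below (versions are
+			 * major << 16 | minor):
+			 *   VERSION_WS2008 -> WS2008_SRV_VERSION (1.0)
+			 *   VERSION_WIN7   -> WIN7_SRV_VERSION   (3.0)
+			 *   anything newer -> WIN8_SRV_VERSION   (4.0)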
+			 */ +			switch (vmbus_proto_version) { +			case (VERSION_WS2008): +				util_fw_version = UTIL_WS2K8_FW_VERSION; +				kvp_srv_version = WS2008_SRV_VERSION; +				break; +			case (VERSION_WIN7): +				util_fw_version = UTIL_FW_VERSION; +				kvp_srv_version = WIN7_SRV_VERSION; +				break; +			default: +				util_fw_version = UTIL_FW_VERSION; +				kvp_srv_version = WIN8_SRV_VERSION; +			} +			vmbus_prep_negotiate_resp(icmsghdrp, negop, +				 recv_buffer, util_fw_version, +				 kvp_srv_version); + +		} else { +			kvp_msg = (struct hv_kvp_msg *)&recv_buffer[ +				sizeof(struct vmbuspipe_hdr) + +				sizeof(struct icmsg_hdr)]; + +			/* +			 * Stash away this global state for completing the +			 * transaction; note transactions are serialized. +			 */ + +			kvp_transaction.recv_len = recvlen; +			kvp_transaction.recv_channel = channel; +			kvp_transaction.recv_req_id = requestid; +			kvp_transaction.active = true; +			kvp_transaction.kvp_msg = kvp_msg; + +			/* +			 * Get the information from the +			 * user-mode component. +			 * component. This transaction will be +			 * completed when we get the value from +			 * the user-mode component. +			 * Set a timeout to deal with +			 * user-mode not responding. +			 */ +			schedule_work(&kvp_sendkey_work); +			schedule_delayed_work(&kvp_work, 5*HZ); + +			return; + +		} + +		icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION +			| ICMSGHDRFLAG_RESPONSE; + +		vmbus_sendpacket(channel, recv_buffer, +				       recvlen, requestid, +				       VM_PKT_DATA_INBAND, 0); +	} + +} + +int +hv_kvp_init(struct hv_util_service *srv) +{ +	int err; + +	err = cn_add_callback(&kvp_id, kvp_name, kvp_cn_callback); +	if (err) +		return err; +	recv_buffer = srv->recv_buffer; + +	/* +	 * When this driver loads, the user level daemon that +	 * processes the host requests may not yet be running. +	 * Defer processing channel callbacks until the daemon +	 * has registered. +	 */ +	kvp_transaction.active = true; + +	return 0; +} + +void hv_kvp_deinit(void) +{ +	cn_del_callback(&kvp_id); +	cancel_delayed_work_sync(&kvp_work); +	cancel_work_sync(&kvp_sendkey_work); +} diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c new file mode 100644 index 00000000000..34f14fddb66 --- /dev/null +++ b/drivers/hv/hv_snapshot.c @@ -0,0 +1,281 @@ +/* + * An implementation of host initiated guest snapshot. + * + * + * Copyright (C) 2013, Microsoft, Inc. + * Author : K. Y. Srinivasan <kys@microsoft.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT.  See the GNU General Public License for more + * details. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/net.h> +#include <linux/nls.h> +#include <linux/connector.h> +#include <linux/workqueue.h> +#include <linux/hyperv.h> + +#define VSS_MAJOR  5 +#define VSS_MINOR  0 +#define VSS_VERSION    (VSS_MAJOR << 16 | VSS_MINOR) + + + +/* + * Global state maintained for transaction that is being processed. + * Note that only one transaction can be active at any point in time. + * + * This state is set when we receive a request from the host; we + * cleanup this state when the transaction is completed - when we respond + * to the host with the key value. 
+ */ + +static struct { +	bool active; /* transaction status - active or not */ +	int recv_len; /* number of bytes received. */ +	struct vmbus_channel *recv_channel; /* chn we got the request */ +	u64 recv_req_id; /* request ID. */ +	struct hv_vss_msg  *msg; /* current message */ +} vss_transaction; + + +static void vss_respond_to_host(int error); + +static struct cb_id vss_id = { CN_VSS_IDX, CN_VSS_VAL }; +static const char vss_name[] = "vss_kernel_module"; +static __u8 *recv_buffer; + +static void vss_send_op(struct work_struct *dummy); +static DECLARE_WORK(vss_send_op_work, vss_send_op); + +/* + * Callback when data is received from user mode. + */ + +static void +vss_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) +{ +	struct hv_vss_msg *vss_msg; + +	vss_msg = (struct hv_vss_msg *)msg->data; + +	if (vss_msg->vss_hdr.operation == VSS_OP_REGISTER) { +		pr_info("VSS daemon registered\n"); +		vss_transaction.active = false; +		if (vss_transaction.recv_channel != NULL) +			hv_vss_onchannelcallback(vss_transaction.recv_channel); +		return; + +	} +	vss_respond_to_host(vss_msg->error); +} + + +static void vss_send_op(struct work_struct *dummy) +{ +	int op = vss_transaction.msg->vss_hdr.operation; +	struct cn_msg *msg; +	struct hv_vss_msg *vss_msg; + +	msg = kzalloc(sizeof(*msg) + sizeof(*vss_msg), GFP_ATOMIC); +	if (!msg) +		return; + +	vss_msg = (struct hv_vss_msg *)msg->data; + +	msg->id.idx =  CN_VSS_IDX; +	msg->id.val = CN_VSS_VAL; + +	vss_msg->vss_hdr.operation = op; +	msg->len = sizeof(struct hv_vss_msg); + +	cn_netlink_send(msg, 0, 0, GFP_ATOMIC); +	kfree(msg); + +	return; +} + +/* + * Send a response back to the host. + */ + +static void +vss_respond_to_host(int error) +{ +	struct icmsg_hdr *icmsghdrp; +	u32	buf_len; +	struct vmbus_channel *channel; +	u64	req_id; + +	/* +	 * If a transaction is not active; log and return. +	 */ + +	if (!vss_transaction.active) { +		/* +		 * This is a spurious call! +		 */ +		pr_warn("VSS: Transaction not active\n"); +		return; +	} +	/* +	 * Copy the global state for completing the transaction. Note that +	 * only one transaction can be active at a time. +	 */ + +	buf_len = vss_transaction.recv_len; +	channel = vss_transaction.recv_channel; +	req_id = vss_transaction.recv_req_id; +	vss_transaction.active = false; + +	icmsghdrp = (struct icmsg_hdr *) +			&recv_buffer[sizeof(struct vmbuspipe_hdr)]; + +	if (channel->onchannel_callback == NULL) +		/* +		 * We have raced with util driver being unloaded; +		 * silently return. +		 */ +		return; + +	icmsghdrp->status = error; + +	icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; + +	vmbus_sendpacket(channel, recv_buffer, buf_len, req_id, +				VM_PKT_DATA_INBAND, 0); + +} + +/* + * This callback is invoked when we get a VSS message from the host. + * The host ensures that only one VSS transaction can be active at a time. + */ + +void hv_vss_onchannelcallback(void *context) +{ +	struct vmbus_channel *channel = context; +	u32 recvlen; +	u64 requestid; +	struct hv_vss_msg *vss_msg; + + +	struct icmsg_hdr *icmsghdrp; +	struct icmsg_negotiate *negop = NULL; + +	if (vss_transaction.active) { +		/* +		 * We will defer processing this callback once +		 * the current transaction is complete. 
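+		 * Recording the channel lets vss_cn_callback() re-invoke
+		 * hv_vss_onchannelcallback() when the daemon registers, so a
+		 * request deferred here is not lost.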
+		 */ +		vss_transaction.recv_channel = channel; +		return; +	} + +	vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 2, &recvlen, +			 &requestid); + +	if (recvlen > 0) { +		icmsghdrp = (struct icmsg_hdr *)&recv_buffer[ +			sizeof(struct vmbuspipe_hdr)]; + +		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { +			vmbus_prep_negotiate_resp(icmsghdrp, negop, +				 recv_buffer, UTIL_FW_VERSION, +				 VSS_VERSION); +		} else { +			vss_msg = (struct hv_vss_msg *)&recv_buffer[ +				sizeof(struct vmbuspipe_hdr) + +				sizeof(struct icmsg_hdr)]; + +			/* +			 * Stash away this global state for completing the +			 * transaction; note transactions are serialized. +			 */ + +			vss_transaction.recv_len = recvlen; +			vss_transaction.recv_channel = channel; +			vss_transaction.recv_req_id = requestid; +			vss_transaction.active = true; +			vss_transaction.msg = (struct hv_vss_msg *)vss_msg; + +			switch (vss_msg->vss_hdr.operation) { +				/* +				 * Initiate a "freeze/thaw" +				 * operation in the guest. +				 * We respond to the host once +				 * the operation is complete. +				 * +				 * We send the message to the +				 * user space daemon and the +				 * operation is performed in +				 * the daemon. +				 */ +			case VSS_OP_FREEZE: +			case VSS_OP_THAW: +				schedule_work(&vss_send_op_work); +				return; + +			case VSS_OP_HOT_BACKUP: +				vss_msg->vss_cf.flags = +					 VSS_HBU_NO_AUTO_RECOVERY; +				vss_respond_to_host(0); +				return; + +			case VSS_OP_GET_DM_INFO: +				vss_msg->dm_info.flags = 0; +				vss_respond_to_host(0); +				return; + +			default: +				vss_respond_to_host(0); +				return; + +			} + +		} + +		icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION +			| ICMSGHDRFLAG_RESPONSE; + +		vmbus_sendpacket(channel, recv_buffer, +				       recvlen, requestid, +				       VM_PKT_DATA_INBAND, 0); +	} + +} + +int +hv_vss_init(struct hv_util_service *srv) +{ +	int err; + +	err = cn_add_callback(&vss_id, vss_name, vss_cn_callback); +	if (err) +		return err; +	recv_buffer = srv->recv_buffer; + +	/* +	 * When this driver loads, the user level daemon that +	 * processes the host requests may not yet be running. +	 * Defer processing channel callbacks until the daemon +	 * has registered. +	 */ +	vss_transaction.active = true; +	return 0; +} + +void hv_vss_deinit(void) +{ +	cn_del_callback(&vss_id); +	cancel_work_sync(&vss_send_op_work); +} diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c new file mode 100644 index 00000000000..3b9c9ef0deb --- /dev/null +++ b/drivers/hv/hv_util.c @@ -0,0 +1,446 @@ +/* + * Copyright (c) 2010, Microsoft Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + * Authors: + *   Haiyang Zhang <haiyangz@microsoft.com> + *   Hank Janssen  <hjanssen@microsoft.com> + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/sysctl.h> +#include <linux/reboot.h> +#include <linux/hyperv.h> + +#include "hyperv_vmbus.h" + +#define SD_MAJOR	3 +#define SD_MINOR	0 +#define SD_VERSION	(SD_MAJOR << 16 | SD_MINOR) + +#define SD_WS2008_MAJOR		1 +#define SD_WS2008_VERSION	(SD_WS2008_MAJOR << 16 | SD_MINOR) + +#define TS_MAJOR	3 +#define TS_MINOR	0 +#define TS_VERSION	(TS_MAJOR << 16 | TS_MINOR) + +#define TS_WS2008_MAJOR		1 +#define TS_WS2008_VERSION	(TS_WS2008_MAJOR << 16 | TS_MINOR) + +#define HB_MAJOR	3 +#define HB_MINOR 0 +#define HB_VERSION	(HB_MAJOR << 16 | HB_MINOR) + +#define HB_WS2008_MAJOR	1 +#define HB_WS2008_VERSION	(HB_WS2008_MAJOR << 16 | HB_MINOR) + +static int sd_srv_version; +static int ts_srv_version; +static int hb_srv_version; +static int util_fw_version; + +static void shutdown_onchannelcallback(void *context); +static struct hv_util_service util_shutdown = { +	.util_cb = shutdown_onchannelcallback, +}; + +static void timesync_onchannelcallback(void *context); +static struct hv_util_service util_timesynch = { +	.util_cb = timesync_onchannelcallback, +}; + +static void heartbeat_onchannelcallback(void *context); +static struct hv_util_service util_heartbeat = { +	.util_cb = heartbeat_onchannelcallback, +}; + +static struct hv_util_service util_kvp = { +	.util_cb = hv_kvp_onchannelcallback, +	.util_init = hv_kvp_init, +	.util_deinit = hv_kvp_deinit, +}; + +static struct hv_util_service util_vss = { +	.util_cb = hv_vss_onchannelcallback, +	.util_init = hv_vss_init, +	.util_deinit = hv_vss_deinit, +}; + +static struct hv_util_service util_fcopy = { +	.util_cb = hv_fcopy_onchannelcallback, +	.util_init = hv_fcopy_init, +	.util_deinit = hv_fcopy_deinit, +}; + +static void perform_shutdown(struct work_struct *dummy) +{ +	orderly_poweroff(true); +} + +/* + * Perform the shutdown operation in a thread context. 
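+ * The shutdown channel callback below runs in VMBus tasklet context where
+ * sleeping is not allowed, while orderly_poweroff() may block (it invokes
+ * a user-mode helper); the callback therefore only schedules the work item
+ * declared next, and the actual power-off happens from a worker thread.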
+ */ +static DECLARE_WORK(shutdown_work, perform_shutdown); + +static void shutdown_onchannelcallback(void *context) +{ +	struct vmbus_channel *channel = context; +	u32 recvlen; +	u64 requestid; +	bool execute_shutdown = false; +	u8  *shut_txf_buf = util_shutdown.recv_buffer; + +	struct shutdown_msg_data *shutdown_msg; + +	struct icmsg_hdr *icmsghdrp; +	struct icmsg_negotiate *negop = NULL; + +	vmbus_recvpacket(channel, shut_txf_buf, +			 PAGE_SIZE, &recvlen, &requestid); + +	if (recvlen > 0) { +		icmsghdrp = (struct icmsg_hdr *)&shut_txf_buf[ +			sizeof(struct vmbuspipe_hdr)]; + +		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { +			vmbus_prep_negotiate_resp(icmsghdrp, negop, +					shut_txf_buf, util_fw_version, +					sd_srv_version); +		} else { +			shutdown_msg = +				(struct shutdown_msg_data *)&shut_txf_buf[ +					sizeof(struct vmbuspipe_hdr) + +					sizeof(struct icmsg_hdr)]; + +			switch (shutdown_msg->flags) { +			case 0: +			case 1: +				icmsghdrp->status = HV_S_OK; +				execute_shutdown = true; + +				pr_info("Shutdown request received -" +					    " graceful shutdown initiated\n"); +				break; +			default: +				icmsghdrp->status = HV_E_FAIL; +				execute_shutdown = false; + +				pr_info("Shutdown request received -" +					    " Invalid request\n"); +				break; +			} +		} + +		icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION +			| ICMSGHDRFLAG_RESPONSE; + +		vmbus_sendpacket(channel, shut_txf_buf, +				       recvlen, requestid, +				       VM_PKT_DATA_INBAND, 0); +	} + +	if (execute_shutdown == true) +		schedule_work(&shutdown_work); +} + +/* + * Set guest time to host UTC time. + */ +static inline void do_adj_guesttime(u64 hosttime) +{ +	s64 host_tns; +	struct timespec host_ts; + +	host_tns = (hosttime - WLTIMEDELTA) * 100; +	host_ts = ns_to_timespec(host_tns); + +	do_settimeofday(&host_ts); +} + +/* + * Set the host time in a process context. + */ + +struct adj_time_work { +	struct work_struct work; +	u64	host_time; +}; + +static void hv_set_host_time(struct work_struct *work) +{ +	struct adj_time_work	*wrk; + +	wrk = container_of(work, struct adj_time_work, work); +	do_adj_guesttime(wrk->host_time); +	kfree(wrk); +} + +/* + * Synchronize time with host after reboot, restore, etc. + * + * ICTIMESYNCFLAG_SYNC flag bit indicates reboot, restore events of the VM. + * After reboot the flag ICTIMESYNCFLAG_SYNC is included in the first time + * message after the timesync channel is opened. Since the hv_utils module is + * loaded after hv_vmbus, the first message is usually missed. The other + * thing is, systime is automatically set to emulated hardware clock which may + * not be UTC time or in the same time zone. So, to override these effects, we + * use the first 50 time samples for initial system time setting. + */ +static inline void adj_guesttime(u64 hosttime, u8 flags) +{ +	struct adj_time_work    *wrk; +	static s32 scnt = 50; + +	wrk = kmalloc(sizeof(struct adj_time_work), GFP_ATOMIC); +	if (wrk == NULL) +		return; + +	wrk->host_time = hosttime; +	if ((flags & ICTIMESYNCFLAG_SYNC) != 0) { +		INIT_WORK(&wrk->work, hv_set_host_time); +		schedule_work(&wrk->work); +		return; +	} + +	if ((flags & ICTIMESYNCFLAG_SAMPLE) != 0 && scnt > 0) { +		scnt--; +		INIT_WORK(&wrk->work, hv_set_host_time); +		schedule_work(&wrk->work); +	} else +		kfree(wrk); +} + +/* + * Time Sync Channel message handler. 
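+ * The parenttime value carried in these messages is a Windows-style
+ * timestamp: 100ns units counted from January 1, 1601 (UTC).
+ * do_adj_guesttime() above converts it with
+ *
+ *	host_tns = (hosttime - WLTIMEDELTA) * 100;
+ *
+ * where WLTIMEDELTA is the 1601-to-1970 epoch offset in the same 100ns
+ * units, yielding nanoseconds since the Unix epoch.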
+ */ +static void timesync_onchannelcallback(void *context) +{ +	struct vmbus_channel *channel = context; +	u32 recvlen; +	u64 requestid; +	struct icmsg_hdr *icmsghdrp; +	struct ictimesync_data *timedatap; +	u8 *time_txf_buf = util_timesynch.recv_buffer; +	struct icmsg_negotiate *negop = NULL; + +	vmbus_recvpacket(channel, time_txf_buf, +			 PAGE_SIZE, &recvlen, &requestid); + +	if (recvlen > 0) { +		icmsghdrp = (struct icmsg_hdr *)&time_txf_buf[ +				sizeof(struct vmbuspipe_hdr)]; + +		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { +			vmbus_prep_negotiate_resp(icmsghdrp, negop, +						time_txf_buf, +						util_fw_version, +						ts_srv_version); +		} else { +			timedatap = (struct ictimesync_data *)&time_txf_buf[ +				sizeof(struct vmbuspipe_hdr) + +				sizeof(struct icmsg_hdr)]; +			adj_guesttime(timedatap->parenttime, timedatap->flags); +		} + +		icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION +			| ICMSGHDRFLAG_RESPONSE; + +		vmbus_sendpacket(channel, time_txf_buf, +				recvlen, requestid, +				VM_PKT_DATA_INBAND, 0); +	} +} + +/* + * Heartbeat functionality. + * Every two seconds, Hyper-V send us a heartbeat request message. + * we respond to this message, and Hyper-V knows we are alive. + */ +static void heartbeat_onchannelcallback(void *context) +{ +	struct vmbus_channel *channel = context; +	u32 recvlen; +	u64 requestid; +	struct icmsg_hdr *icmsghdrp; +	struct heartbeat_msg_data *heartbeat_msg; +	u8 *hbeat_txf_buf = util_heartbeat.recv_buffer; +	struct icmsg_negotiate *negop = NULL; + +	vmbus_recvpacket(channel, hbeat_txf_buf, +			 PAGE_SIZE, &recvlen, &requestid); + +	if (recvlen > 0) { +		icmsghdrp = (struct icmsg_hdr *)&hbeat_txf_buf[ +				sizeof(struct vmbuspipe_hdr)]; + +		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { +			vmbus_prep_negotiate_resp(icmsghdrp, negop, +				hbeat_txf_buf, util_fw_version, +				hb_srv_version); +		} else { +			heartbeat_msg = +				(struct heartbeat_msg_data *)&hbeat_txf_buf[ +					sizeof(struct vmbuspipe_hdr) + +					sizeof(struct icmsg_hdr)]; + +			heartbeat_msg->seq_num += 1; +		} + +		icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION +			| ICMSGHDRFLAG_RESPONSE; + +		vmbus_sendpacket(channel, hbeat_txf_buf, +				       recvlen, requestid, +				       VM_PKT_DATA_INBAND, 0); +	} +} + +static int util_probe(struct hv_device *dev, +			const struct hv_vmbus_device_id *dev_id) +{ +	struct hv_util_service *srv = +		(struct hv_util_service *)dev_id->driver_data; +	int ret; + +	srv->recv_buffer = kmalloc(PAGE_SIZE * 4, GFP_KERNEL); +	if (!srv->recv_buffer) +		return -ENOMEM; +	if (srv->util_init) { +		ret = srv->util_init(srv); +		if (ret) { +			ret = -ENODEV; +			goto error1; +		} +	} + +	/* +	 * The set of services managed by the util driver are not performance +	 * critical and do not need batched reading. Furthermore, some services +	 * such as KVP can only handle one message from the host at a time. +	 * Turn off batched reading for all util drivers before we open the +	 * channel. +	 */ + +	set_channel_read_state(dev->channel, false); + +	ret = vmbus_open(dev->channel, 4 * PAGE_SIZE, 4 * PAGE_SIZE, NULL, 0, +			srv->util_cb, dev->channel); +	if (ret) +		goto error; + +	hv_set_drvdata(dev, srv); +	/* +	 * Based on the host; initialize the framework and +	 * service version numbers we will negotiate. 
+	 */ +	switch (vmbus_proto_version) { +	case (VERSION_WS2008): +		util_fw_version = UTIL_WS2K8_FW_VERSION; +		sd_srv_version = SD_WS2008_VERSION; +		ts_srv_version = TS_WS2008_VERSION; +		hb_srv_version = HB_WS2008_VERSION; +		break; + +	default: +		util_fw_version = UTIL_FW_VERSION; +		sd_srv_version = SD_VERSION; +		ts_srv_version = TS_VERSION; +		hb_srv_version = HB_VERSION; +	} + +	return 0; + +error: +	if (srv->util_deinit) +		srv->util_deinit(); +error1: +	kfree(srv->recv_buffer); +	return ret; +} + +static int util_remove(struct hv_device *dev) +{ +	struct hv_util_service *srv = hv_get_drvdata(dev); + +	vmbus_close(dev->channel); +	if (srv->util_deinit) +		srv->util_deinit(); +	kfree(srv->recv_buffer); + +	return 0; +} + +static const struct hv_vmbus_device_id id_table[] = { +	/* Shutdown guid */ +	{ HV_SHUTDOWN_GUID, +	  .driver_data = (unsigned long)&util_shutdown +	}, +	/* Time synch guid */ +	{ HV_TS_GUID, +	  .driver_data = (unsigned long)&util_timesynch +	}, +	/* Heartbeat guid */ +	{ HV_HEART_BEAT_GUID, +	  .driver_data = (unsigned long)&util_heartbeat +	}, +	/* KVP guid */ +	{ HV_KVP_GUID, +	  .driver_data = (unsigned long)&util_kvp +	}, +	/* VSS GUID */ +	{ HV_VSS_GUID, +	  .driver_data = (unsigned long)&util_vss +	}, +	/* File copy GUID */ +	{ HV_FCOPY_GUID, +	  .driver_data = (unsigned long)&util_fcopy +	}, +	{ }, +}; + +MODULE_DEVICE_TABLE(vmbus, id_table); + +/* The one and only one */ +static  struct hv_driver util_drv = { +	.name = "hv_util", +	.id_table = id_table, +	.probe =  util_probe, +	.remove =  util_remove, +}; + +static int __init init_hyperv_utils(void) +{ +	pr_info("Registering HyperV Utility Driver\n"); + +	return vmbus_driver_register(&util_drv); +} + +static void exit_hyperv_utils(void) +{ +	pr_info("De-Registered HyperV Utility Driver\n"); + +	vmbus_driver_unregister(&util_drv); +} + +module_init(init_hyperv_utils); +module_exit(exit_hyperv_utils); + +MODULE_DESCRIPTION("Hyper-V Utilities"); +MODULE_LICENSE("GPL"); diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h new file mode 100644 index 00000000000..22b750749a3 --- /dev/null +++ b/drivers/hv/hyperv_vmbus.h @@ -0,0 +1,682 @@ +/* + * + * Copyright (c) 2011, Microsoft Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Authors: + *   Haiyang Zhang <haiyangz@microsoft.com> + *   Hank Janssen  <hjanssen@microsoft.com> + *   K. Y. Srinivasan <kys@microsoft.com> + * + */ + +#ifndef _HYPERV_VMBUS_H +#define _HYPERV_VMBUS_H + +#include <linux/list.h> +#include <asm/sync_bitops.h> +#include <linux/atomic.h> +#include <linux/hyperv.h> + +/* + * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent + * is set by CPUID(HVCPUID_VERSION_FEATURES). 
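+ *
+ * For illustration only (a sketch of the usual detection sequence, not
+ * additional definitions in this header): a guest first checks the
+ * hypervisor-present bit, then walks the vendor leaves:
+ *
+ *	cpuid(HVCPUID_VERSION_FEATURES, &eax, &ebx, &ecx, &edx);
+ *	if (ecx & HV_PRESENT_BIT)
+ *		max_leaf = cpuid_eax(HVCPUID_VENDOR_MAXFUNCTION);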
+ */ +enum hv_cpuid_function { +	HVCPUID_VERSION_FEATURES		= 0x00000001, +	HVCPUID_VENDOR_MAXFUNCTION		= 0x40000000, +	HVCPUID_INTERFACE			= 0x40000001, + +	/* +	 * The remaining functions depend on the value of +	 * HVCPUID_INTERFACE +	 */ +	HVCPUID_VERSION			= 0x40000002, +	HVCPUID_FEATURES			= 0x40000003, +	HVCPUID_ENLIGHTENMENT_INFO	= 0x40000004, +	HVCPUID_IMPLEMENTATION_LIMITS		= 0x40000005, +}; + +/* Define version of the synthetic interrupt controller. */ +#define HV_SYNIC_VERSION		(1) + +/* Define the expected SynIC version. */ +#define HV_SYNIC_VERSION_1		(0x1) + +/* Define synthetic interrupt controller message constants. */ +#define HV_MESSAGE_SIZE			(256) +#define HV_MESSAGE_PAYLOAD_BYTE_COUNT	(240) +#define HV_MESSAGE_PAYLOAD_QWORD_COUNT	(30) +#define HV_ANY_VP			(0xFFFFFFFF) + +/* Define synthetic interrupt controller flag constants. */ +#define HV_EVENT_FLAGS_COUNT		(256 * 8) +#define HV_EVENT_FLAGS_BYTE_COUNT	(256) +#define HV_EVENT_FLAGS_DWORD_COUNT	(256 / sizeof(u32)) + +/* Define hypervisor message types. */ +enum hv_message_type { +	HVMSG_NONE			= 0x00000000, + +	/* Memory access messages. */ +	HVMSG_UNMAPPED_GPA		= 0x80000000, +	HVMSG_GPA_INTERCEPT		= 0x80000001, + +	/* Timer notification messages. */ +	HVMSG_TIMER_EXPIRED			= 0x80000010, + +	/* Error messages. */ +	HVMSG_INVALID_VP_REGISTER_VALUE	= 0x80000020, +	HVMSG_UNRECOVERABLE_EXCEPTION	= 0x80000021, +	HVMSG_UNSUPPORTED_FEATURE		= 0x80000022, + +	/* Trace buffer complete messages. */ +	HVMSG_EVENTLOG_BUFFERCOMPLETE	= 0x80000040, + +	/* Platform-specific processor intercept messages. */ +	HVMSG_X64_IOPORT_INTERCEPT		= 0x80010000, +	HVMSG_X64_MSR_INTERCEPT		= 0x80010001, +	HVMSG_X64_CPUID_INTERCEPT		= 0x80010002, +	HVMSG_X64_EXCEPTION_INTERCEPT	= 0x80010003, +	HVMSG_X64_APIC_EOI			= 0x80010004, +	HVMSG_X64_LEGACY_FP_ERROR		= 0x80010005 +}; + +/* Define the number of synthetic interrupt sources. */ +#define HV_SYNIC_SINT_COUNT		(16) +#define HV_SYNIC_STIMER_COUNT		(4) + +/* Define invalid partition identifier. */ +#define HV_PARTITION_ID_INVALID		((u64)0x0) + +/* Define port identifier type. */ +union hv_port_id { +	u32 asu32; +	struct { +		u32 id:24; +		u32 reserved:8; +	} u ; +}; + +/* Define port type. */ +enum hv_port_type { +	HVPORT_MSG	= 1, +	HVPORT_EVENT		= 2, +	HVPORT_MONITOR	= 3 +}; + +/* Define port information structure. */ +struct hv_port_info { +	enum hv_port_type port_type; +	u32 padding; +	union { +		struct { +			u32 target_sint; +			u32 target_vp; +			u64 rsvdz; +		} message_port_info; +		struct { +			u32 target_sint; +			u32 target_vp; +			u16 base_flag_bumber; +			u16 flag_count; +			u32 rsvdz; +		} event_port_info; +		struct { +			u64 monitor_address; +			u64 rsvdz; +		} monitor_port_info; +	}; +}; + +struct hv_connection_info { +	enum hv_port_type port_type; +	u32 padding; +	union { +		struct { +			u64 rsvdz; +		} message_connection_info; +		struct { +			u64 rsvdz; +		} event_connection_info; +		struct { +			u64 monitor_address; +		} monitor_connection_info; +	}; +}; + +/* Define synthetic interrupt controller message flags. */ +union hv_message_flags { +	u8 asu8; +	struct { +		u8 msg_pending:1; +		u8 reserved:7; +	}; +}; + +/* Define synthetic interrupt controller message header. */ +struct hv_message_header { +	enum hv_message_type message_type; +	u8 payload_size; +	union hv_message_flags message_flags; +	u8 reserved[2]; +	union { +		u64 sender; +		union hv_port_id port; +	}; +}; + +/* Define timer message payload structure. 
*/ +struct hv_timer_message_payload { +	u32 timer_index; +	u32 reserved; +	u64 expiration_time;	/* When the timer expired */ +	u64 delivery_time;	/* When the message was delivered */ +}; + +/* Define synthetic interrupt controller message format. */ +struct hv_message { +	struct hv_message_header header; +	union { +		u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; +	} u ; +}; + +/* Define the number of message buffers associated with each port. */ +#define HV_PORT_MESSAGE_BUFFER_COUNT	(16) + +/* Define the synthetic interrupt message page layout. */ +struct hv_message_page { +	struct hv_message sint_message[HV_SYNIC_SINT_COUNT]; +}; + +/* Define the synthetic interrupt controller event flags format. */ +union hv_synic_event_flags { +	u8 flags8[HV_EVENT_FLAGS_BYTE_COUNT]; +	u32 flags32[HV_EVENT_FLAGS_DWORD_COUNT]; +}; + +/* Define the synthetic interrupt flags page layout. */ +struct hv_synic_event_flags_page { +	union hv_synic_event_flags sintevent_flags[HV_SYNIC_SINT_COUNT]; +}; + +/* Define SynIC control register. */ +union hv_synic_scontrol { +	u64 as_uint64; +	struct { +		u64 enable:1; +		u64 reserved:63; +	}; +}; + +/* Define synthetic interrupt source. */ +union hv_synic_sint { +	u64 as_uint64; +	struct { +		u64 vector:8; +		u64 reserved1:8; +		u64 masked:1; +		u64 auto_eoi:1; +		u64 reserved2:46; +	}; +}; + +/* Define the format of the SIMP register */ +union hv_synic_simp { +	u64 as_uint64; +	struct { +		u64 simp_enabled:1; +		u64 preserved:11; +		u64 base_simp_gpa:52; +	}; +}; + +/* Define the format of the SIEFP register */ +union hv_synic_siefp { +	u64 as_uint64; +	struct { +		u64 siefp_enabled:1; +		u64 preserved:11; +		u64 base_siefp_gpa:52; +	}; +}; + +/* Definitions for the monitored notification facility */ +union hv_monitor_trigger_group { +	u64 as_uint64; +	struct { +		u32 pending; +		u32 armed; +	}; +}; + +struct hv_monitor_parameter { +	union hv_connection_id connectionid; +	u16 flagnumber; +	u16 rsvdz; +}; + +union hv_monitor_trigger_state { +	u32 asu32; + +	struct { +		u32 group_enable:4; +		u32 rsvdz:28; +	}; +}; + +/* struct hv_monitor_page Layout */ +/* ------------------------------------------------------ */ +/* | 0   | TriggerState (4 bytes) | Rsvd1 (4 bytes)     | */ +/* | 8   | TriggerGroup[0]                              | */ +/* | 10  | TriggerGroup[1]                              | */ +/* | 18  | TriggerGroup[2]                              | */ +/* | 20  | TriggerGroup[3]                              | */ +/* | 28  | Rsvd2[0]                                     | */ +/* | 30  | Rsvd2[1]                                     | */ +/* | 38  | Rsvd2[2]                                     | */ +/* | 40  | NextCheckTime[0][0]    | NextCheckTime[0][1] | */ +/* | ...                                                | */ +/* | 240 | Latency[0][0..3]                             | */ +/* | 340 | Rsvz3[0]                                     | */ +/* | 440 | Parameter[0][0]                              | */ +/* | 448 | Parameter[0][1]                              | */ +/* | ...                                                
| */ +/* | 840 | Rsvd4[0]                                     | */ +/* ------------------------------------------------------ */ +struct hv_monitor_page { +	union hv_monitor_trigger_state trigger_state; +	u32 rsvdz1; + +	union hv_monitor_trigger_group trigger_group[4]; +	u64 rsvdz2[3]; + +	s32 next_checktime[4][32]; + +	u16 latency[4][32]; +	u64 rsvdz3[32]; + +	struct hv_monitor_parameter parameter[4][32]; + +	u8 rsvdz4[1984]; +}; + +/* Declare the various hypercall operations. */ +enum hv_call_code { +	HVCALL_POST_MESSAGE	= 0x005c, +	HVCALL_SIGNAL_EVENT	= 0x005d, +}; + +/* Definition of the hv_post_message hypercall input structure. */ +struct hv_input_post_message { +	union hv_connection_id connectionid; +	u32 reserved; +	enum hv_message_type message_type; +	u32 payload_size; +	u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; +}; + +/* + * Versioning definitions used for guests reporting themselves to the + * hypervisor, and visa versa. + */ + +/* Version info reported by guest OS's */ +enum hv_guest_os_vendor { +	HVGUESTOS_VENDOR_MICROSOFT	= 0x0001 +}; + +enum hv_guest_os_microsoft_ids { +	HVGUESTOS_MICROSOFT_UNDEFINED	= 0x00, +	HVGUESTOS_MICROSOFT_MSDOS		= 0x01, +	HVGUESTOS_MICROSOFT_WINDOWS3X	= 0x02, +	HVGUESTOS_MICROSOFT_WINDOWS9X	= 0x03, +	HVGUESTOS_MICROSOFT_WINDOWSNT	= 0x04, +	HVGUESTOS_MICROSOFT_WINDOWSCE	= 0x05 +}; + +/* + * Declare the MSR used to identify the guest OS. + */ +#define HV_X64_MSR_GUEST_OS_ID	0x40000000 + +union hv_x64_msr_guest_os_id_contents { +	u64 as_uint64; +	struct { +		u64 build_number:16; +		u64 service_version:8; /* Service Pack, etc. */ +		u64 minor_version:8; +		u64 major_version:8; +		u64 os_id:8; /* enum hv_guest_os_microsoft_ids (if Vendor=MS) */ +		u64 vendor_id:16; /* enum hv_guest_os_vendor */ +	}; +}; + +/* + * Declare the MSR used to setup pages used to communicate with the hypervisor. + */ +#define HV_X64_MSR_HYPERCALL	0x40000001 + +union hv_x64_msr_hypercall_contents { +	u64 as_uint64; +	struct { +		u64 enable:1; +		u64 reserved:11; +		u64 guest_physical_address:52; +	}; +}; + + +enum { +	VMBUS_MESSAGE_CONNECTION_ID	= 1, +	VMBUS_MESSAGE_PORT_ID		= 1, +	VMBUS_EVENT_CONNECTION_ID	= 2, +	VMBUS_EVENT_PORT_ID		= 2, +	VMBUS_MONITOR_CONNECTION_ID	= 3, +	VMBUS_MONITOR_PORT_ID		= 3, +	VMBUS_MESSAGE_SINT		= 2, +}; + +/* #defines */ + +#define HV_PRESENT_BIT			0x80000000 + +/* + * The guest OS needs to register the guest ID with the hypervisor. + * The guest ID is a 64 bit entity and the structure of this ID is + * specified in the Hyper-V specification: + * + * http://msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx + * + * While the current guideline does not specify how Linux guest ID(s) + * need to be generated, our plan is to publish the guidelines for + * Linux and other guest operating systems that currently are hosted + * on Hyper-V. The implementation here conforms to this yet + * unpublished guidelines. + * + * + * Bit(s) + * 63 - Indicates if the OS is Open Source or not; 1 is Open Source + * 62:56 - Os Type; Linux is 0x100 + * 55:48 - Distro specific identification + * 47:16 - Linux kernel version number + * 15:0  - Distro specific identification + * + * + */ + +#define HV_LINUX_VENDOR_ID		0x8100 + +/* + * Generate the guest ID based on the guideline described above. 
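+ *
+ * Worked example (hypothetical inputs): with d_info1 = 0, kernel_version =
+ * LINUX_VERSION_CODE and d_info2 = 0, the helper below yields
+ *
+ *	0x8100000000000000ULL | ((__u64)LINUX_VERSION_CODE << 16)
+ *
+ * i.e. HV_LINUX_VENDOR_ID in the top bits (bit 63 set to mark an open
+ * source OS), the kernel version in bits 47:16 and zeroed distro fields.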
+ */ + +static inline  __u64 generate_guest_id(__u8 d_info1, __u32 kernel_version, +					__u16 d_info2) +{ +	__u64 guest_id = 0; + +	guest_id = (((__u64)HV_LINUX_VENDOR_ID) << 48); +	guest_id |= (((__u64)(d_info1)) << 48); +	guest_id |= (((__u64)(kernel_version)) << 16); +	guest_id |= ((__u64)(d_info2)); + +	return guest_id; +} + + +#define HV_CPU_POWER_MANAGEMENT		(1 << 0) +#define HV_RECOMMENDATIONS_MAX		4 + +#define HV_X64_MAX			5 +#define HV_CAPS_MAX			8 + + +#define HV_HYPERCALL_PARAM_ALIGN	sizeof(u64) + + +/* Service definitions */ + +#define HV_SERVICE_PARENT_PORT				(0) +#define HV_SERVICE_PARENT_CONNECTION			(0) + +#define HV_SERVICE_CONNECT_RESPONSE_SUCCESS		(0) +#define HV_SERVICE_CONNECT_RESPONSE_INVALID_PARAMETER	(1) +#define HV_SERVICE_CONNECT_RESPONSE_UNKNOWN_SERVICE	(2) +#define HV_SERVICE_CONNECT_RESPONSE_CONNECTION_REJECTED	(3) + +#define HV_SERVICE_CONNECT_REQUEST_MESSAGE_ID		(1) +#define HV_SERVICE_CONNECT_RESPONSE_MESSAGE_ID		(2) +#define HV_SERVICE_DISCONNECT_REQUEST_MESSAGE_ID	(3) +#define HV_SERVICE_DISCONNECT_RESPONSE_MESSAGE_ID	(4) +#define HV_SERVICE_MAX_MESSAGE_ID				(4) + +#define HV_SERVICE_PROTOCOL_VERSION (0x0010) +#define HV_CONNECT_PAYLOAD_BYTE_COUNT 64 + +/* #define VMBUS_REVISION_NUMBER	6 */ + +/* Our local vmbus's port and connection id. Anything >0 is fine */ +/* #define VMBUS_PORT_ID		11 */ + +/* 628180B8-308D-4c5e-B7DB-1BEB62E62EF4 */ +static const uuid_le VMBUS_SERVICE_ID = { +	.b = { +		0xb8, 0x80, 0x81, 0x62, 0x8d, 0x30, 0x5e, 0x4c, +		0xb7, 0xdb, 0x1b, 0xeb, 0x62, 0xe6, 0x2e, 0xf4 +	}, +}; + + + +struct hv_context { +	/* We only support running on top of Hyper-V +	* So at this point this really can only contain the Hyper-V ID +	*/ +	u64 guestid; + +	void *hypercall_page; + +	bool synic_initialized; + +	void *synic_message_page[NR_CPUS]; +	void *synic_event_page[NR_CPUS]; +	/* +	 * Hypervisor's notion of virtual processor ID is different from +	 * Linux' notion of CPU ID. This information can only be retrieved +	 * in the context of the calling CPU. Setup a map for easy access +	 * to this information: +	 * +	 * vp_index[a] is the Hyper-V's processor ID corresponding to +	 * Linux cpuid 'a'. +	 */ +	u32 vp_index[NR_CPUS]; +	/* +	 * Starting with win8, we can take channel interrupts on any CPU; +	 * we will manage the tasklet that handles events on a per CPU +	 * basis. +	 */ +	struct tasklet_struct *event_dpc[NR_CPUS]; +	/* +	 * To optimize the mapping of relid to channel, maintain +	 * per-cpu list of the channels based on their CPU affinity. +	 */ +	struct list_head percpu_list[NR_CPUS]; +}; + +extern struct hv_context hv_context; + +struct hv_ring_buffer_debug_info { +	u32 current_interrupt_mask; +	u32 current_read_index; +	u32 current_write_index; +	u32 bytes_avail_toread; +	u32 bytes_avail_towrite; +}; + +/* Hv Interface */ + +extern int hv_init(void); + +extern void hv_cleanup(void); + +extern int hv_post_message(union hv_connection_id connection_id, +			 enum hv_message_type message_type, +			 void *payload, size_t payload_size); + +extern u16 hv_signal_event(void *con_id); + +extern int hv_synic_alloc(void); + +extern void hv_synic_free(void); + +extern void hv_synic_init(void *irqarg); + +extern void hv_synic_cleanup(void *arg); + +/* + * Host version information. 
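+ * These are filled in once at initialization time from the hypervisor's
+ * version CPUID leaf and are used when logging the host build number.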
+ */ +extern unsigned int host_info_eax; +extern unsigned int host_info_ebx; +extern unsigned int host_info_ecx; +extern unsigned int host_info_edx; + +/* Interface */ + + +int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, void *buffer, +		   u32 buflen); + +void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info); + +int hv_ringbuffer_write(struct hv_ring_buffer_info *ring_info, +		    struct kvec *kv_list, +		    u32 kv_count, bool *signal); + +int hv_ringbuffer_peek(struct hv_ring_buffer_info *ring_info, void *buffer, +		   u32 buflen); + +int hv_ringbuffer_read(struct hv_ring_buffer_info *ring_info, +		   void *buffer, +		   u32 buflen, +		   u32 offset, bool *signal); + + +void hv_ringbuffer_get_debuginfo(struct hv_ring_buffer_info *ring_info, +			    struct hv_ring_buffer_debug_info *debug_info); + +void hv_begin_read(struct hv_ring_buffer_info *rbi); + +u32 hv_end_read(struct hv_ring_buffer_info *rbi); + +/* + * Maximum channels is determined by the size of the interrupt page + * which is PAGE_SIZE. 1/2 of PAGE_SIZE is for send endpoint interrupt + * and the other is receive endpoint interrupt + */ +#define MAX_NUM_CHANNELS	((PAGE_SIZE >> 1) << 3)	/* 16348 channels */ + +/* The value here must be in multiple of 32 */ +/* TODO: Need to make this configurable */ +#define MAX_NUM_CHANNELS_SUPPORTED	256 + + +enum vmbus_connect_state { +	DISCONNECTED, +	CONNECTING, +	CONNECTED, +	DISCONNECTING +}; + +#define MAX_SIZE_CHANNEL_MESSAGE	HV_MESSAGE_PAYLOAD_BYTE_COUNT + +struct vmbus_connection { +	enum vmbus_connect_state conn_state; + +	atomic_t next_gpadl_handle; + +	/* +	 * Represents channel interrupts. Each bit position represents a +	 * channel.  When a channel sends an interrupt via VMBUS, it finds its +	 * bit in the sendInterruptPage, set it and calls Hv to generate a port +	 * event. 
The other end receives the port event and parse the +	 * recvInterruptPage to see which bit is set +	 */ +	void *int_page; +	void *send_int_page; +	void *recv_int_page; + +	/* +	 * 2 pages - 1st page for parent->child notification and 2nd +	 * is child->parent notification +	 */ +	struct hv_monitor_page *monitor_pages[2]; +	struct list_head chn_msg_list; +	spinlock_t channelmsg_lock; + +	/* List of channels */ +	struct list_head chn_list; +	spinlock_t channel_lock; + +	struct workqueue_struct *work_queue; +}; + + +struct vmbus_msginfo { +	/* Bookkeeping stuff */ +	struct list_head msglist_entry; + +	/* The message itself */ +	unsigned char msg[0]; +}; + + +extern struct vmbus_connection vmbus_connection; + +/* General vmbus interface */ + +struct hv_device *vmbus_device_create(const uuid_le *type, +				      const uuid_le *instance, +				      struct vmbus_channel *channel); + +int vmbus_device_register(struct hv_device *child_device_obj); +void vmbus_device_unregister(struct hv_device *device_obj); + +/* static void */ +/* VmbusChildDeviceDestroy( */ +/* struct hv_device *); */ + +struct vmbus_channel *relid2channel(u32 relid); + +void vmbus_free_channels(void); + +/* Connection interface */ + +int vmbus_connect(void); + +int vmbus_post_msg(void *buffer, size_t buflen); + +int vmbus_set_event(struct vmbus_channel *channel); + +void vmbus_on_event(unsigned long data); + +int hv_fcopy_init(struct hv_util_service *); +void hv_fcopy_deinit(void); +void hv_fcopy_onchannelcallback(void *); + + +#endif /* _HYPERV_VMBUS_H */ diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c new file mode 100644 index 00000000000..15db66b7414 --- /dev/null +++ b/drivers/hv/ring_buffer.c @@ -0,0 +1,561 @@ +/* + * + * Copyright (c) 2009, Microsoft Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Authors: + *   Haiyang Zhang <haiyangz@microsoft.com> + *   Hank Janssen  <hjanssen@microsoft.com> + *   K. Y. Srinivasan <kys@microsoft.com> + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/hyperv.h> +#include <linux/uio.h> + +#include "hyperv_vmbus.h" + +void hv_begin_read(struct hv_ring_buffer_info *rbi) +{ +	rbi->ring_buffer->interrupt_mask = 1; +	mb(); +} + +u32 hv_end_read(struct hv_ring_buffer_info *rbi) +{ +	u32 read; +	u32 write; + +	rbi->ring_buffer->interrupt_mask = 0; +	mb(); + +	/* +	 * Now check to see if the ring buffer is still empty. +	 * If it is not, we raced and we need to process new +	 * incoming messages. +	 */ +	hv_get_ringbuffer_availbytes(rbi, &read, &write); + +	return read; +} + +/* + * When we write to the ring buffer, check if the host needs to + * be signaled. Here is the details of this protocol: + * + *	1. 
The host guarantees that while it is draining the + *	   ring buffer, it will set the interrupt_mask to + *	   indicate it does not need to be interrupted when + *	   new data is placed. + * + *	2. The host guarantees that it will completely drain + *	   the ring buffer before exiting the read loop. Further, + *	   once the ring buffer is empty, it will clear the + *	   interrupt_mask and re-check to see if new data has + *	   arrived. + */ + +static bool hv_need_to_signal(u32 old_write, struct hv_ring_buffer_info *rbi) +{ +	mb(); +	if (rbi->ring_buffer->interrupt_mask) +		return false; + +	/* check interrupt_mask before read_index */ +	rmb(); +	/* +	 * This is the only case we need to signal when the +	 * ring transitions from being empty to non-empty. +	 */ +	if (old_write == rbi->ring_buffer->read_index) +		return true; + +	return false; +} + +/* + * To optimize the flow management on the send-side, + * when the sender is blocked because of lack of + * sufficient space in the ring buffer, potential the + * consumer of the ring buffer can signal the producer. + * This is controlled by the following parameters: + * + * 1. pending_send_sz: This is the size in bytes that the + *    producer is trying to send. + * 2. The feature bit feat_pending_send_sz set to indicate if + *    the consumer of the ring will signal when the ring + *    state transitions from being full to a state where + *    there is room for the producer to send the pending packet. + */ + +static bool hv_need_to_signal_on_read(u32 old_rd, +					 struct hv_ring_buffer_info *rbi) +{ +	u32 prev_write_sz; +	u32 cur_write_sz; +	u32 r_size; +	u32 write_loc = rbi->ring_buffer->write_index; +	u32 read_loc = rbi->ring_buffer->read_index; +	u32 pending_sz = rbi->ring_buffer->pending_send_sz; + +	/* +	 * If the other end is not blocked on write don't bother. +	 */ +	if (pending_sz == 0) +		return false; + +	r_size = rbi->ring_datasize; +	cur_write_sz = write_loc >= read_loc ? r_size - (write_loc - read_loc) : +			read_loc - write_loc; + +	prev_write_sz = write_loc >= old_rd ? r_size - (write_loc - old_rd) : +			old_rd - write_loc; + + +	if ((prev_write_sz < pending_sz) && (cur_write_sz >= pending_sz)) +		return true; + +	return false; +} + +/* + * hv_get_next_write_location() + * + * Get the next write location for the specified ring buffer + * + */ +static inline u32 +hv_get_next_write_location(struct hv_ring_buffer_info *ring_info) +{ +	u32 next = ring_info->ring_buffer->write_index; + +	return next; +} + +/* + * hv_set_next_write_location() + * + * Set the next write location for the specified ring buffer + * + */ +static inline void +hv_set_next_write_location(struct hv_ring_buffer_info *ring_info, +		     u32 next_write_location) +{ +	ring_info->ring_buffer->write_index = next_write_location; +} + +/* + * hv_get_next_read_location() + * + * Get the next read location for the specified ring buffer + */ +static inline u32 +hv_get_next_read_location(struct hv_ring_buffer_info *ring_info) +{ +	u32 next = ring_info->ring_buffer->read_index; + +	return next; +} + +/* + * hv_get_next_readlocation_withoffset() + * + * Get the next read location + offset for the specified ring buffer. 
+ * This allows the caller to skip + */ +static inline u32 +hv_get_next_readlocation_withoffset(struct hv_ring_buffer_info *ring_info, +				 u32 offset) +{ +	u32 next = ring_info->ring_buffer->read_index; + +	next += offset; +	next %= ring_info->ring_datasize; + +	return next; +} + +/* + * + * hv_set_next_read_location() + * + * Set the next read location for the specified ring buffer + * + */ +static inline void +hv_set_next_read_location(struct hv_ring_buffer_info *ring_info, +		    u32 next_read_location) +{ +	ring_info->ring_buffer->read_index = next_read_location; +} + + +/* + * + * hv_get_ring_buffer() + * + * Get the start of the ring buffer + */ +static inline void * +hv_get_ring_buffer(struct hv_ring_buffer_info *ring_info) +{ +	return (void *)ring_info->ring_buffer->buffer; +} + + +/* + * + * hv_get_ring_buffersize() + * + * Get the size of the ring buffer + */ +static inline u32 +hv_get_ring_buffersize(struct hv_ring_buffer_info *ring_info) +{ +	return ring_info->ring_datasize; +} + +/* + * + * hv_get_ring_bufferindices() + * + * Get the read and write indices as u64 of the specified ring buffer + * + */ +static inline u64 +hv_get_ring_bufferindices(struct hv_ring_buffer_info *ring_info) +{ +	return (u64)ring_info->ring_buffer->write_index << 32; +} + +/* + * + * hv_copyfrom_ringbuffer() + * + * Helper routine to copy to source from ring buffer. + * Assume there is enough room. Handles wrap-around in src case only!! + * + */ +static u32 hv_copyfrom_ringbuffer( +	struct hv_ring_buffer_info	*ring_info, +	void				*dest, +	u32				destlen, +	u32				start_read_offset) +{ +	void *ring_buffer = hv_get_ring_buffer(ring_info); +	u32 ring_buffer_size = hv_get_ring_buffersize(ring_info); + +	u32 frag_len; + +	/* wrap-around detected at the src */ +	if (destlen > ring_buffer_size - start_read_offset) { +		frag_len = ring_buffer_size - start_read_offset; + +		memcpy(dest, ring_buffer + start_read_offset, frag_len); +		memcpy(dest + frag_len, ring_buffer, destlen - frag_len); +	} else + +		memcpy(dest, ring_buffer + start_read_offset, destlen); + + +	start_read_offset += destlen; +	start_read_offset %= ring_buffer_size; + +	return start_read_offset; +} + + +/* + * + * hv_copyto_ringbuffer() + * + * Helper routine to copy from source to ring buffer. + * Assume there is enough room. Handles wrap-around in dest case only!! + * + */ +static u32 hv_copyto_ringbuffer( +	struct hv_ring_buffer_info	*ring_info, +	u32				start_write_offset, +	void				*src, +	u32				srclen) +{ +	void *ring_buffer = hv_get_ring_buffer(ring_info); +	u32 ring_buffer_size = hv_get_ring_buffersize(ring_info); +	u32 frag_len; + +	/* wrap-around detected! 
*/ +	if (srclen > ring_buffer_size - start_write_offset) { +		frag_len = ring_buffer_size - start_write_offset; +		memcpy(ring_buffer + start_write_offset, src, frag_len); +		memcpy(ring_buffer, src + frag_len, srclen - frag_len); +	} else +		memcpy(ring_buffer + start_write_offset, src, srclen); + +	start_write_offset += srclen; +	start_write_offset %= ring_buffer_size; + +	return start_write_offset; +} + +/* + * + * hv_ringbuffer_get_debuginfo() + * + * Get various debug metrics for the specified ring buffer + * + */ +void hv_ringbuffer_get_debuginfo(struct hv_ring_buffer_info *ring_info, +			    struct hv_ring_buffer_debug_info *debug_info) +{ +	u32 bytes_avail_towrite; +	u32 bytes_avail_toread; + +	if (ring_info->ring_buffer) { +		hv_get_ringbuffer_availbytes(ring_info, +					&bytes_avail_toread, +					&bytes_avail_towrite); + +		debug_info->bytes_avail_toread = bytes_avail_toread; +		debug_info->bytes_avail_towrite = bytes_avail_towrite; +		debug_info->current_read_index = +			ring_info->ring_buffer->read_index; +		debug_info->current_write_index = +			ring_info->ring_buffer->write_index; +		debug_info->current_interrupt_mask = +			ring_info->ring_buffer->interrupt_mask; +	} +} + +/* + * + * hv_ringbuffer_init() + * + *Initialize the ring buffer + * + */ +int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, +		   void *buffer, u32 buflen) +{ +	if (sizeof(struct hv_ring_buffer) != PAGE_SIZE) +		return -EINVAL; + +	memset(ring_info, 0, sizeof(struct hv_ring_buffer_info)); + +	ring_info->ring_buffer = (struct hv_ring_buffer *)buffer; +	ring_info->ring_buffer->read_index = +		ring_info->ring_buffer->write_index = 0; + +	ring_info->ring_size = buflen; +	ring_info->ring_datasize = buflen - sizeof(struct hv_ring_buffer); + +	spin_lock_init(&ring_info->ring_lock); + +	return 0; +} + +/* + * + * hv_ringbuffer_cleanup() + * + * Cleanup the ring buffer + * + */ +void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info) +{ +} + +/* + * + * hv_ringbuffer_write() + * + * Write to the ring buffer + * + */ +int hv_ringbuffer_write(struct hv_ring_buffer_info *outring_info, +		    struct kvec *kv_list, u32 kv_count, bool *signal) +{ +	int i = 0; +	u32 bytes_avail_towrite; +	u32 bytes_avail_toread; +	u32 totalbytes_towrite = 0; + +	u32 next_write_location; +	u32 old_write; +	u64 prev_indices = 0; +	unsigned long flags; + +	for (i = 0; i < kv_count; i++) +		totalbytes_towrite += kv_list[i].iov_len; + +	totalbytes_towrite += sizeof(u64); + +	spin_lock_irqsave(&outring_info->ring_lock, flags); + +	hv_get_ringbuffer_availbytes(outring_info, +				&bytes_avail_toread, +				&bytes_avail_towrite); + + +	/* If there is only room for the packet, assume it is full. 
*/ +	/* Otherwise, the next time around, we think the ring buffer */ +	/* is empty since the read index == write index */ +	if (bytes_avail_towrite <= totalbytes_towrite) { +		spin_unlock_irqrestore(&outring_info->ring_lock, flags); +		return -EAGAIN; +	} + +	/* Write to the ring buffer */ +	next_write_location = hv_get_next_write_location(outring_info); + +	old_write = next_write_location; + +	for (i = 0; i < kv_count; i++) { +		next_write_location = hv_copyto_ringbuffer(outring_info, +						     next_write_location, +						     kv_list[i].iov_base, +						     kv_list[i].iov_len); +	} + +	/* Set previous packet start */ +	prev_indices = hv_get_ring_bufferindices(outring_info); + +	next_write_location = hv_copyto_ringbuffer(outring_info, +					     next_write_location, +					     &prev_indices, +					     sizeof(u64)); + +	/* Issue a full memory barrier before updating the write index */ +	mb(); + +	/* Now, update the write location */ +	hv_set_next_write_location(outring_info, next_write_location); + + +	spin_unlock_irqrestore(&outring_info->ring_lock, flags); + +	*signal = hv_need_to_signal(old_write, outring_info); +	return 0; +} + + +/* + * + * hv_ringbuffer_peek() + * + * Read without advancing the read index + * + */ +int hv_ringbuffer_peek(struct hv_ring_buffer_info *Inring_info, +		   void *Buffer, u32 buflen) +{ +	u32 bytes_avail_towrite; +	u32 bytes_avail_toread; +	u32 next_read_location = 0; +	unsigned long flags; + +	spin_lock_irqsave(&Inring_info->ring_lock, flags); + +	hv_get_ringbuffer_availbytes(Inring_info, +				&bytes_avail_toread, +				&bytes_avail_towrite); + +	/* Make sure there is something to read */ +	if (bytes_avail_toread < buflen) { + +		spin_unlock_irqrestore(&Inring_info->ring_lock, flags); + +		return -EAGAIN; +	} + +	/* Convert to byte offset */ +	next_read_location = hv_get_next_read_location(Inring_info); + +	next_read_location = hv_copyfrom_ringbuffer(Inring_info, +						Buffer, +						buflen, +						next_read_location); + +	spin_unlock_irqrestore(&Inring_info->ring_lock, flags); + +	return 0; +} + + +/* + * + * hv_ringbuffer_read() + * + * Read and advance the read index + * + */ +int hv_ringbuffer_read(struct hv_ring_buffer_info *inring_info, void *buffer, +		   u32 buflen, u32 offset, bool *signal) +{ +	u32 bytes_avail_towrite; +	u32 bytes_avail_toread; +	u32 next_read_location = 0; +	u64 prev_indices = 0; +	unsigned long flags; +	u32 old_read; + +	if (buflen <= 0) +		return -EINVAL; + +	spin_lock_irqsave(&inring_info->ring_lock, flags); + +	hv_get_ringbuffer_availbytes(inring_info, +				&bytes_avail_toread, +				&bytes_avail_towrite); + +	old_read = bytes_avail_toread; + +	/* Make sure there is something to read */ +	if (bytes_avail_toread < buflen) { +		spin_unlock_irqrestore(&inring_info->ring_lock, flags); + +		return -EAGAIN; +	} + +	next_read_location = +		hv_get_next_readlocation_withoffset(inring_info, offset); + +	next_read_location = hv_copyfrom_ringbuffer(inring_info, +						buffer, +						buflen, +						next_read_location); + +	next_read_location = hv_copyfrom_ringbuffer(inring_info, +						&prev_indices, +						sizeof(u64), +						next_read_location); + +	/* Make sure all reads are done before we update the read index since */ +	/* the writer may start writing to the read area once the read index */ +	/*is updated */ +	mb(); + +	/* Update the read index */ +	hv_set_next_read_location(inring_info, next_read_location); + +	spin_unlock_irqrestore(&inring_info->ring_lock, flags); + +	*signal = hv_need_to_signal_on_read(old_read, inring_info); 
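+	/*
+	 * The caller checks *signal and, when it is set, notifies the host
+	 * so that a sender blocked on pending_send_sz can resume writing.
+	 */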
+ +	return 0; +} diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c new file mode 100644 index 00000000000..4d6b26979fb --- /dev/null +++ b/drivers/hv/vmbus_drv.c @@ -0,0 +1,973 @@ +/* + * Copyright (c) 2009, Microsoft Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Authors: + *   Haiyang Zhang <haiyangz@microsoft.com> + *   Hank Janssen  <hjanssen@microsoft.com> + *   K. Y. Srinivasan <kys@microsoft.com> + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/interrupt.h> +#include <linux/sysctl.h> +#include <linux/slab.h> +#include <linux/acpi.h> +#include <linux/completion.h> +#include <linux/hyperv.h> +#include <linux/kernel_stat.h> +#include <asm/hyperv.h> +#include <asm/hypervisor.h> +#include <asm/mshyperv.h> +#include "hyperv_vmbus.h" + +static struct acpi_device  *hv_acpi_dev; + +static struct tasklet_struct msg_dpc; +static struct completion probe_event; +static int irq; + +struct resource hyperv_mmio = { +	.name  = "hyperv mmio", +	.flags = IORESOURCE_MEM, +}; +EXPORT_SYMBOL_GPL(hyperv_mmio); + +static int vmbus_exists(void) +{ +	if (hv_acpi_dev == NULL) +		return -ENODEV; + +	return 0; +} + +#define VMBUS_ALIAS_LEN ((sizeof((struct hv_vmbus_device_id *)0)->guid) * 2) +static void print_alias_name(struct hv_device *hv_dev, char *alias_name) +{ +	int i; +	for (i = 0; i < VMBUS_ALIAS_LEN; i += 2) +		sprintf(&alias_name[i], "%02x", hv_dev->dev_type.b[i/2]); +} + +static u8 channel_monitor_group(struct vmbus_channel *channel) +{ +	return (u8)channel->offermsg.monitorid / 32; +} + +static u8 channel_monitor_offset(struct vmbus_channel *channel) +{ +	return (u8)channel->offermsg.monitorid % 32; +} + +static u32 channel_pending(struct vmbus_channel *channel, +			   struct hv_monitor_page *monitor_page) +{ +	u8 monitor_group = channel_monitor_group(channel); +	return monitor_page->trigger_group[monitor_group].pending; +} + +static u32 channel_latency(struct vmbus_channel *channel, +			   struct hv_monitor_page *monitor_page) +{ +	u8 monitor_group = channel_monitor_group(channel); +	u8 monitor_offset = channel_monitor_offset(channel); +	return monitor_page->latency[monitor_group][monitor_offset]; +} + +static u32 channel_conn_id(struct vmbus_channel *channel, +			   struct hv_monitor_page *monitor_page) +{ +	u8 monitor_group = channel_monitor_group(channel); +	u8 monitor_offset = channel_monitor_offset(channel); +	return monitor_page->parameter[monitor_group][monitor_offset].connectionid.u.id; +} + +static ssize_t id_show(struct device *dev, struct device_attribute *dev_attr, +		       char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); + +	if (!hv_dev->channel) +		return -ENODEV; +	return sprintf(buf, "%d\n", hv_dev->channel->offermsg.child_relid); +} +static DEVICE_ATTR_RO(id); + +static ssize_t state_show(struct device *dev, 
struct device_attribute *dev_attr, +			  char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); + +	if (!hv_dev->channel) +		return -ENODEV; +	return sprintf(buf, "%d\n", hv_dev->channel->state); +} +static DEVICE_ATTR_RO(state); + +static ssize_t monitor_id_show(struct device *dev, +			       struct device_attribute *dev_attr, char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); + +	if (!hv_dev->channel) +		return -ENODEV; +	return sprintf(buf, "%d\n", hv_dev->channel->offermsg.monitorid); +} +static DEVICE_ATTR_RO(monitor_id); + +static ssize_t class_id_show(struct device *dev, +			       struct device_attribute *dev_attr, char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); + +	if (!hv_dev->channel) +		return -ENODEV; +	return sprintf(buf, "{%pUl}\n", +		       hv_dev->channel->offermsg.offer.if_type.b); +} +static DEVICE_ATTR_RO(class_id); + +static ssize_t device_id_show(struct device *dev, +			      struct device_attribute *dev_attr, char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); + +	if (!hv_dev->channel) +		return -ENODEV; +	return sprintf(buf, "{%pUl}\n", +		       hv_dev->channel->offermsg.offer.if_instance.b); +} +static DEVICE_ATTR_RO(device_id); + +static ssize_t modalias_show(struct device *dev, +			     struct device_attribute *dev_attr, char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); +	char alias_name[VMBUS_ALIAS_LEN + 1]; + +	print_alias_name(hv_dev, alias_name); +	return sprintf(buf, "vmbus:%s\n", alias_name); +} +static DEVICE_ATTR_RO(modalias); + +static ssize_t server_monitor_pending_show(struct device *dev, +					   struct device_attribute *dev_attr, +					   char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); + +	if (!hv_dev->channel) +		return -ENODEV; +	return sprintf(buf, "%d\n", +		       channel_pending(hv_dev->channel, +				       vmbus_connection.monitor_pages[1])); +} +static DEVICE_ATTR_RO(server_monitor_pending); + +static ssize_t client_monitor_pending_show(struct device *dev, +					   struct device_attribute *dev_attr, +					   char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); + +	if (!hv_dev->channel) +		return -ENODEV; +	return sprintf(buf, "%d\n", +		       channel_pending(hv_dev->channel, +				       vmbus_connection.monitor_pages[1])); +} +static DEVICE_ATTR_RO(client_monitor_pending); + +static ssize_t server_monitor_latency_show(struct device *dev, +					   struct device_attribute *dev_attr, +					   char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); + +	if (!hv_dev->channel) +		return -ENODEV; +	return sprintf(buf, "%d\n", +		       channel_latency(hv_dev->channel, +				       vmbus_connection.monitor_pages[0])); +} +static DEVICE_ATTR_RO(server_monitor_latency); + +static ssize_t client_monitor_latency_show(struct device *dev, +					   struct device_attribute *dev_attr, +					   char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); + +	if (!hv_dev->channel) +		return -ENODEV; +	return sprintf(buf, "%d\n", +		       channel_latency(hv_dev->channel, +				       vmbus_connection.monitor_pages[1])); +} +static DEVICE_ATTR_RO(client_monitor_latency); + +static ssize_t server_monitor_conn_id_show(struct device *dev, +					   struct device_attribute *dev_attr, +					   char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); + +	if (!hv_dev->channel) +		return -ENODEV; +	return sprintf(buf, "%d\n", +		       channel_conn_id(hv_dev->channel, +				       
vmbus_connection.monitor_pages[0])); +} +static DEVICE_ATTR_RO(server_monitor_conn_id); + +static ssize_t client_monitor_conn_id_show(struct device *dev, +					   struct device_attribute *dev_attr, +					   char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); + +	if (!hv_dev->channel) +		return -ENODEV; +	return sprintf(buf, "%d\n", +		       channel_conn_id(hv_dev->channel, +				       vmbus_connection.monitor_pages[1])); +} +static DEVICE_ATTR_RO(client_monitor_conn_id); + +static ssize_t out_intr_mask_show(struct device *dev, +				  struct device_attribute *dev_attr, char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); +	struct hv_ring_buffer_debug_info outbound; + +	if (!hv_dev->channel) +		return -ENODEV; +	hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound, &outbound); +	return sprintf(buf, "%d\n", outbound.current_interrupt_mask); +} +static DEVICE_ATTR_RO(out_intr_mask); + +static ssize_t out_read_index_show(struct device *dev, +				   struct device_attribute *dev_attr, char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); +	struct hv_ring_buffer_debug_info outbound; + +	if (!hv_dev->channel) +		return -ENODEV; +	hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound, &outbound); +	return sprintf(buf, "%d\n", outbound.current_read_index); +} +static DEVICE_ATTR_RO(out_read_index); + +static ssize_t out_write_index_show(struct device *dev, +				    struct device_attribute *dev_attr, +				    char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); +	struct hv_ring_buffer_debug_info outbound; + +	if (!hv_dev->channel) +		return -ENODEV; +	hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound, &outbound); +	return sprintf(buf, "%d\n", outbound.current_write_index); +} +static DEVICE_ATTR_RO(out_write_index); + +static ssize_t out_read_bytes_avail_show(struct device *dev, +					 struct device_attribute *dev_attr, +					 char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); +	struct hv_ring_buffer_debug_info outbound; + +	if (!hv_dev->channel) +		return -ENODEV; +	hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound, &outbound); +	return sprintf(buf, "%d\n", outbound.bytes_avail_toread); +} +static DEVICE_ATTR_RO(out_read_bytes_avail); + +static ssize_t out_write_bytes_avail_show(struct device *dev, +					  struct device_attribute *dev_attr, +					  char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); +	struct hv_ring_buffer_debug_info outbound; + +	if (!hv_dev->channel) +		return -ENODEV; +	hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound, &outbound); +	return sprintf(buf, "%d\n", outbound.bytes_avail_towrite); +} +static DEVICE_ATTR_RO(out_write_bytes_avail); + +static ssize_t in_intr_mask_show(struct device *dev, +				 struct device_attribute *dev_attr, char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); +	struct hv_ring_buffer_debug_info inbound; + +	if (!hv_dev->channel) +		return -ENODEV; +	hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound); +	return sprintf(buf, "%d\n", inbound.current_interrupt_mask); +} +static DEVICE_ATTR_RO(in_intr_mask); + +static ssize_t in_read_index_show(struct device *dev, +				  struct device_attribute *dev_attr, char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); +	struct hv_ring_buffer_debug_info inbound; + +	if (!hv_dev->channel) +		return -ENODEV; +	hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound); +	return sprintf(buf, "%d\n", inbound.current_read_index); +} +static 
DEVICE_ATTR_RO(in_read_index); + +static ssize_t in_write_index_show(struct device *dev, +				   struct device_attribute *dev_attr, char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); +	struct hv_ring_buffer_debug_info inbound; + +	if (!hv_dev->channel) +		return -ENODEV; +	hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound); +	return sprintf(buf, "%d\n", inbound.current_write_index); +} +static DEVICE_ATTR_RO(in_write_index); + +static ssize_t in_read_bytes_avail_show(struct device *dev, +					struct device_attribute *dev_attr, +					char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); +	struct hv_ring_buffer_debug_info inbound; + +	if (!hv_dev->channel) +		return -ENODEV; +	hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound); +	return sprintf(buf, "%d\n", inbound.bytes_avail_toread); +} +static DEVICE_ATTR_RO(in_read_bytes_avail); + +static ssize_t in_write_bytes_avail_show(struct device *dev, +					 struct device_attribute *dev_attr, +					 char *buf) +{ +	struct hv_device *hv_dev = device_to_hv_device(dev); +	struct hv_ring_buffer_debug_info inbound; + +	if (!hv_dev->channel) +		return -ENODEV; +	hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound); +	return sprintf(buf, "%d\n", inbound.bytes_avail_towrite); +} +static DEVICE_ATTR_RO(in_write_bytes_avail); + +/* Set up per device attributes in /sys/bus/vmbus/devices/<bus device> */ +static struct attribute *vmbus_attrs[] = { +	&dev_attr_id.attr, +	&dev_attr_state.attr, +	&dev_attr_monitor_id.attr, +	&dev_attr_class_id.attr, +	&dev_attr_device_id.attr, +	&dev_attr_modalias.attr, +	&dev_attr_server_monitor_pending.attr, +	&dev_attr_client_monitor_pending.attr, +	&dev_attr_server_monitor_latency.attr, +	&dev_attr_client_monitor_latency.attr, +	&dev_attr_server_monitor_conn_id.attr, +	&dev_attr_client_monitor_conn_id.attr, +	&dev_attr_out_intr_mask.attr, +	&dev_attr_out_read_index.attr, +	&dev_attr_out_write_index.attr, +	&dev_attr_out_read_bytes_avail.attr, +	&dev_attr_out_write_bytes_avail.attr, +	&dev_attr_in_intr_mask.attr, +	&dev_attr_in_read_index.attr, +	&dev_attr_in_write_index.attr, +	&dev_attr_in_read_bytes_avail.attr, +	&dev_attr_in_write_bytes_avail.attr, +	NULL, +}; +ATTRIBUTE_GROUPS(vmbus); + +/* + * vmbus_uevent - add uevent for our device + * + * This routine is invoked when a device is added or removed on the vmbus to + * generate a uevent to udev in the userspace. The udev will then look at its + * rule and the uevent generated here to load the appropriate driver + * + * The alias string will be of the form vmbus:guid where guid is the string + * representation of the device guid (each byte of the guid will be + * represented with two hex characters. + */ +static int vmbus_uevent(struct device *device, struct kobj_uevent_env *env) +{ +	struct hv_device *dev = device_to_hv_device(device); +	int ret; +	char alias_name[VMBUS_ALIAS_LEN + 1]; + +	print_alias_name(dev, alias_name); +	ret = add_uevent_var(env, "MODALIAS=vmbus:%s", alias_name); +	return ret; +} + +static const uuid_le null_guid; + +static inline bool is_null_guid(const __u8 *guid) +{ +	if (memcmp(guid, &null_guid, sizeof(uuid_le))) +		return false; +	return true; +} + +/* + * Return a matching hv_vmbus_device_id pointer. + * If there is no match, return NULL. 
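+ * The id_table is expected to be terminated by an entry whose GUID is all
+ * zeroes, which is what is_null_guid() checks for.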
+ */ +static const struct hv_vmbus_device_id *hv_vmbus_get_id( +					const struct hv_vmbus_device_id *id, +					const __u8 *guid) +{ +	for (; !is_null_guid(id->guid); id++) +		if (!memcmp(&id->guid, guid, sizeof(uuid_le))) +			return id; + +	return NULL; +} + + + +/* + * vmbus_match - Attempt to match the specified device to the specified driver + */ +static int vmbus_match(struct device *device, struct device_driver *driver) +{ +	struct hv_driver *drv = drv_to_hv_drv(driver); +	struct hv_device *hv_dev = device_to_hv_device(device); + +	if (hv_vmbus_get_id(drv->id_table, hv_dev->dev_type.b)) +		return 1; + +	return 0; +} + +/* + * vmbus_probe - Add the new vmbus's child device + */ +static int vmbus_probe(struct device *child_device) +{ +	int ret = 0; +	struct hv_driver *drv = +			drv_to_hv_drv(child_device->driver); +	struct hv_device *dev = device_to_hv_device(child_device); +	const struct hv_vmbus_device_id *dev_id; + +	dev_id = hv_vmbus_get_id(drv->id_table, dev->dev_type.b); +	if (drv->probe) { +		ret = drv->probe(dev, dev_id); +		if (ret != 0) +			pr_err("probe failed for device %s (%d)\n", +			       dev_name(child_device), ret); + +	} else { +		pr_err("probe not set for driver %s\n", +		       dev_name(child_device)); +		ret = -ENODEV; +	} +	return ret; +} + +/* + * vmbus_remove - Remove a vmbus device + */ +static int vmbus_remove(struct device *child_device) +{ +	struct hv_driver *drv = drv_to_hv_drv(child_device->driver); +	struct hv_device *dev = device_to_hv_device(child_device); + +	if (drv->remove) +		drv->remove(dev); +	else +		pr_err("remove not set for driver %s\n", +			dev_name(child_device)); + +	return 0; +} + + +/* + * vmbus_shutdown - Shutdown a vmbus device + */ +static void vmbus_shutdown(struct device *child_device) +{ +	struct hv_driver *drv; +	struct hv_device *dev = device_to_hv_device(child_device); + + +	/* The device may not be attached yet */ +	if (!child_device->driver) +		return; + +	drv = drv_to_hv_drv(child_device->driver); + +	if (drv->shutdown) +		drv->shutdown(dev); + +	return; +} + + +/* + * vmbus_device_release - Final callback release of the vmbus child device + */ +static void vmbus_device_release(struct device *device) +{ +	struct hv_device *hv_dev = device_to_hv_device(device); + +	kfree(hv_dev); + +} + +/* The one and only one */ +static struct bus_type  hv_bus = { +	.name =		"vmbus", +	.match =		vmbus_match, +	.shutdown =		vmbus_shutdown, +	.remove =		vmbus_remove, +	.probe =		vmbus_probe, +	.uevent =		vmbus_uevent, +	.dev_groups =		vmbus_groups, +}; + +struct onmessage_work_context { +	struct work_struct work; +	struct hv_message msg; +}; + +static void vmbus_onmessage_work(struct work_struct *work) +{ +	struct onmessage_work_context *ctx; + +	ctx = container_of(work, struct onmessage_work_context, +			   work); +	vmbus_onmessage(&ctx->msg); +	kfree(ctx); +} + +static void vmbus_on_msg_dpc(unsigned long data) +{ +	int cpu = smp_processor_id(); +	void *page_addr = hv_context.synic_message_page[cpu]; +	struct hv_message *msg = (struct hv_message *)page_addr + +				  VMBUS_MESSAGE_SINT; +	struct onmessage_work_context *ctx; + +	while (1) { +		if (msg->header.message_type == HVMSG_NONE) { +			/* no msg */ +			break; +		} else { +			ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC); +			if (ctx == NULL) +				continue; +			INIT_WORK(&ctx->work, vmbus_onmessage_work); +			memcpy(&ctx->msg, msg, sizeof(*msg)); +			queue_work(vmbus_connection.work_queue, &ctx->work); +		} + +		msg->header.message_type = HVMSG_NONE; + +		/* +		 * Make sure the write to 
MessageType (ie set to +		 * HVMSG_NONE) happens before we read the +		 * MessagePending and EOMing. Otherwise, the EOMing +		 * will not deliver any more messages since there is +		 * no empty slot +		 */ +		mb(); + +		if (msg->header.message_flags.msg_pending) { +			/* +			 * This will cause message queue rescan to +			 * possibly deliver another msg from the +			 * hypervisor +			 */ +			wrmsrl(HV_X64_MSR_EOM, 0); +		} +	} +} + +static void vmbus_isr(void) +{ +	int cpu = smp_processor_id(); +	void *page_addr; +	struct hv_message *msg; +	union hv_synic_event_flags *event; +	bool handled = false; + +	page_addr = hv_context.synic_event_page[cpu]; +	if (page_addr == NULL) +		return; + +	event = (union hv_synic_event_flags *)page_addr + +					 VMBUS_MESSAGE_SINT; +	/* +	 * Check for events before checking for messages. This is the order +	 * in which events and messages are checked in Windows guests on +	 * Hyper-V, and the Windows team suggested we do the same. +	 */ + +	if ((vmbus_proto_version == VERSION_WS2008) || +		(vmbus_proto_version == VERSION_WIN7)) { + +		/* Since we are a child, we only need to check bit 0 */ +		if (sync_test_and_clear_bit(0, +			(unsigned long *) &event->flags32[0])) { +			handled = true; +		} +	} else { +		/* +		 * Our host is win8 or above. The signaling mechanism +		 * has changed and we can directly look at the event page. +		 * If bit n is set then we have an interrup on the channel +		 * whose id is n. +		 */ +		handled = true; +	} + +	if (handled) +		tasklet_schedule(hv_context.event_dpc[cpu]); + + +	page_addr = hv_context.synic_message_page[cpu]; +	msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; + +	/* Check if there are actual msgs to be processed */ +	if (msg->header.message_type != HVMSG_NONE) +		tasklet_schedule(&msg_dpc); +} + +/* + * vmbus_bus_init -Main vmbus driver initialization routine. + * + * Here, we + *	- initialize the vmbus driver context + *	- invoke the vmbus hv main init routine + *	- get the irq resource + *	- retrieve the channel offers + */ +static int vmbus_bus_init(int irq) +{ +	int ret; + +	/* Hypervisor initialization...setup hypercall page..etc */ +	ret = hv_init(); +	if (ret != 0) { +		pr_err("Unable to initialize the hypervisor - 0x%x\n", ret); +		return ret; +	} + +	tasklet_init(&msg_dpc, vmbus_on_msg_dpc, 0); + +	ret = bus_register(&hv_bus); +	if (ret) +		goto err_cleanup; + +	hv_setup_vmbus_irq(vmbus_isr); + +	ret = hv_synic_alloc(); +	if (ret) +		goto err_alloc; +	/* +	 * Initialize the per-cpu interrupt state and +	 * connect to the host. +	 */ +	on_each_cpu(hv_synic_init, NULL, 1); +	ret = vmbus_connect(); +	if (ret) +		goto err_alloc; + +	vmbus_request_offers(); + +	return 0; + +err_alloc: +	hv_synic_free(); +	hv_remove_vmbus_irq(); + +	bus_unregister(&hv_bus); + +err_cleanup: +	hv_cleanup(); + +	return ret; +} + +/** + * __vmbus_child_driver_register - Register a vmbus's driver + * @drv: Pointer to driver structure you want to register + * @owner: owner module of the drv + * @mod_name: module name string + * + * Registers the given driver with Linux through the 'driver_register()' call + * and sets up the hyper-v vmbus handling for this driver. + * It will return the state of the 'driver_register()' call. 
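+ *
+ * Drivers normally reach this through the vmbus_driver_register() wrapper
+ * macro, which supplies THIS_MODULE and KBUILD_MODNAME as the owner and
+ * mod_name arguments.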
+ * + */ +int __vmbus_driver_register(struct hv_driver *hv_driver, struct module *owner, const char *mod_name) +{ +	int ret; + +	pr_info("registering driver %s\n", hv_driver->name); + +	ret = vmbus_exists(); +	if (ret < 0) +		return ret; + +	hv_driver->driver.name = hv_driver->name; +	hv_driver->driver.owner = owner; +	hv_driver->driver.mod_name = mod_name; +	hv_driver->driver.bus = &hv_bus; + +	ret = driver_register(&hv_driver->driver); + +	return ret; +} +EXPORT_SYMBOL_GPL(__vmbus_driver_register); + +/** + * vmbus_driver_unregister() - Unregister a vmbus's driver + * @drv: Pointer to driver structure you want to un-register + * + * Un-register the given driver that was previous registered with a call to + * vmbus_driver_register() + */ +void vmbus_driver_unregister(struct hv_driver *hv_driver) +{ +	pr_info("unregistering driver %s\n", hv_driver->name); + +	if (!vmbus_exists()) +		driver_unregister(&hv_driver->driver); +} +EXPORT_SYMBOL_GPL(vmbus_driver_unregister); + +/* + * vmbus_device_create - Creates and registers a new child device + * on the vmbus. + */ +struct hv_device *vmbus_device_create(const uuid_le *type, +				      const uuid_le *instance, +				      struct vmbus_channel *channel) +{ +	struct hv_device *child_device_obj; + +	child_device_obj = kzalloc(sizeof(struct hv_device), GFP_KERNEL); +	if (!child_device_obj) { +		pr_err("Unable to allocate device object for child device\n"); +		return NULL; +	} + +	child_device_obj->channel = channel; +	memcpy(&child_device_obj->dev_type, type, sizeof(uuid_le)); +	memcpy(&child_device_obj->dev_instance, instance, +	       sizeof(uuid_le)); + + +	return child_device_obj; +} + +/* + * vmbus_device_register - Register the child device + */ +int vmbus_device_register(struct hv_device *child_device_obj) +{ +	int ret = 0; + +	static atomic_t device_num = ATOMIC_INIT(0); + +	dev_set_name(&child_device_obj->device, "vmbus_0_%d", +		     atomic_inc_return(&device_num)); + +	child_device_obj->device.bus = &hv_bus; +	child_device_obj->device.parent = &hv_acpi_dev->dev; +	child_device_obj->device.release = vmbus_device_release; + +	/* +	 * Register with the LDM. This will kick off the driver/device +	 * binding...which will eventually call vmbus_match() and vmbus_probe() +	 */ +	ret = device_register(&child_device_obj->device); + +	if (ret) +		pr_err("Unable to register child device\n"); +	else +		pr_debug("child device %s registered\n", +			dev_name(&child_device_obj->device)); + +	return ret; +} + +/* + * vmbus_device_unregister - Remove the specified child device + * from the vmbus. + */ +void vmbus_device_unregister(struct hv_device *device_obj) +{ +	pr_debug("child device %s unregistered\n", +		dev_name(&device_obj->device)); + +	/* +	 * Kick off the process of unregistering the device. +	 * This will call vmbus_remove() and eventually vmbus_device_release() +	 */ +	device_unregister(&device_obj->device); +} + + +/* + * VMBUS is an acpi enumerated device. Get the the information we + * need from DSDT. 
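+ * Specifically: the interrupt to use for the VMBus ISR and, via the parent
+ * device on Gen2 firmware, the MMIO range reserved for Hyper-V.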
+ */ + +static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx) +{ +	switch (res->type) { +	case ACPI_RESOURCE_TYPE_IRQ: +		irq = res->data.irq.interrupts[0]; +		break; + +	case ACPI_RESOURCE_TYPE_ADDRESS64: +		hyperv_mmio.start = res->data.address64.minimum; +		hyperv_mmio.end = res->data.address64.maximum; +		break; +	} + +	return AE_OK; +} + +static int vmbus_acpi_add(struct acpi_device *device) +{ +	acpi_status result; +	int ret_val = -ENODEV; + +	hv_acpi_dev = device; + +	result = acpi_walk_resources(device->handle, METHOD_NAME__CRS, +					vmbus_walk_resources, NULL); + +	if (ACPI_FAILURE(result)) +		goto acpi_walk_err; +	/* +	 * The parent of the vmbus acpi device (Gen2 firmware) is the VMOD that +	 * has the mmio ranges. Get that. +	 */ +	if (device->parent) { +		result = acpi_walk_resources(device->parent->handle, +					METHOD_NAME__CRS, +					vmbus_walk_resources, NULL); + +		if (ACPI_FAILURE(result)) +			goto acpi_walk_err; +		if (hyperv_mmio.start && hyperv_mmio.end) +			request_resource(&iomem_resource, &hyperv_mmio); +	} +	ret_val = 0; + +acpi_walk_err: +	complete(&probe_event); +	return ret_val; +} + +static const struct acpi_device_id vmbus_acpi_device_ids[] = { +	{"VMBUS", 0}, +	{"VMBus", 0}, +	{"", 0}, +}; +MODULE_DEVICE_TABLE(acpi, vmbus_acpi_device_ids); + +static struct acpi_driver vmbus_acpi_driver = { +	.name = "vmbus", +	.ids = vmbus_acpi_device_ids, +	.ops = { +		.add = vmbus_acpi_add, +	}, +}; + +static int __init hv_acpi_init(void) +{ +	int ret, t; + +	if (x86_hyper != &x86_hyper_ms_hyperv) +		return -ENODEV; + +	init_completion(&probe_event); + +	/* +	 * Get irq resources first. +	 */ +	ret = acpi_bus_register_driver(&vmbus_acpi_driver); + +	if (ret) +		return ret; + +	t = wait_for_completion_timeout(&probe_event, 5*HZ); +	if (t == 0) { +		ret = -ETIMEDOUT; +		goto cleanup; +	} + +	if (irq <= 0) { +		ret = -ENODEV; +		goto cleanup; +	} + +	ret = vmbus_bus_init(irq); +	if (ret) +		goto cleanup; + +	return 0; + +cleanup: +	acpi_bus_unregister_driver(&vmbus_acpi_driver); +	hv_acpi_dev = NULL; +	return ret; +} + +static void __exit vmbus_exit(void) +{ +	hv_remove_vmbus_irq(); +	vmbus_free_channels(); +	bus_unregister(&hv_bus); +	hv_cleanup(); +	acpi_bus_unregister_driver(&vmbus_acpi_driver); +} + + +MODULE_LICENSE("GPL"); + +subsys_initcall(hv_acpi_init); +module_exit(vmbus_exit);  | 
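For context, here is a minimal sketch of how a client driver consumes the bus interface registered above. The "hv_sample" name and the GUID bytes are placeholders invented for this illustration, not values from the patch; a real driver matches on the interface-type GUID the host offers and would normally open its channel with vmbus_open() in probe.

#include <linux/module.h>
#include <linux/hyperv.h>

/* Hypothetical interface-type GUID; a real driver uses the GUID of the
 * device class it serves. */
static const struct hv_vmbus_device_id id_table[] = {
	{ .guid = { 0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0,
		    0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0 } },
	{ },			/* all-zero GUID terminates the table */
};
MODULE_DEVICE_TABLE(vmbus, id_table);

static int hv_sample_probe(struct hv_device *dev,
			   const struct hv_vmbus_device_id *dev_id)
{
	/* Reached through vmbus_probe() after vmbus_match() finds our GUID.
	 * A real driver would vmbus_open() the channel here. */
	dev_info(&dev->device, "hv_sample bound\n");
	return 0;
}

static int hv_sample_remove(struct hv_device *dev)
{
	/* A real driver would vmbus_close() its channel here. */
	return 0;
}

static struct hv_driver hv_sample_drv = {
	.name = "hv_sample",
	.id_table = id_table,
	.probe = hv_sample_probe,
	.remove = hv_sample_remove,
};

static int __init hv_sample_init(void)
{
	/* Expands to __vmbus_driver_register(&hv_sample_drv, THIS_MODULE,
	 * KBUILD_MODNAME). */
	return vmbus_driver_register(&hv_sample_drv);
}

static void __exit hv_sample_exit(void)
{
	vmbus_driver_unregister(&hv_sample_drv);
}

module_init(hv_sample_init);
module_exit(hv_sample_exit);
MODULE_LICENSE("GPL");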
