108 files changed, 70040 insertions, 11477 deletions
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 668dc234b8e..125d8450573 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -36,7 +36,9 @@
 #include <linux/ioport.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
+#include <linux/mutex.h>
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/reboot.h>
 #include <linux/spinlock.h>
 #include <linux/timer.h>
@@ -52,6 +54,7 @@
 #define DAC960_GAM_MINOR	252
 
 
+static DEFINE_MUTEX(DAC960_mutex);
 static DAC960_Controller_T *DAC960_Controllers[DAC960_MaxControllers];
 static int DAC960_ControllerCount;
 static struct proc_dir_entry *DAC960_ProcDirectoryEntry;
@@ -77,23 +80,28 @@ static int DAC960_open(struct block_device *bdev, fmode_t mode)
 	struct gendisk *disk = bdev->bd_disk;
 	DAC960_Controller_T *p = disk->queue->queuedata;
 	int drive_nr = (long)disk->private_data;
+	int ret = -ENXIO;
 
+	mutex_lock(&DAC960_mutex);
 	if (p->FirmwareType == DAC960_V1_Controller) {
 		if (p->V1.LogicalDriveInformation[drive_nr].
 		    LogicalDriveState == DAC960_V1_LogicalDrive_Offline)
-			return -ENXIO;
+			goto out;
 	} else {
 		DAC960_V2_LogicalDeviceInfo_T *i =
 			p->V2.LogicalDeviceInformation[drive_nr];
 		if (!i || i->LogicalDeviceState == DAC960_V2_LogicalDevice_Offline)
-			return -ENXIO;
+			goto out;
 	}
 
 	check_disk_change(bdev);
 
 	if (!get_capacity(p->disks[drive_nr]))
-		return -ENXIO;
-	return 0;
+		goto out;
+	ret = 0;
+out:
+	mutex_unlock(&DAC960_mutex);
+	return ret;
 }
 
 static int DAC960_getgeo(struct block_device *bdev, struct hd_geometry *geo)
@@ -132,13 +140,14 @@ static int DAC960_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static int DAC960_media_changed(struct gendisk *disk)
+static unsigned int DAC960_check_events(struct gendisk *disk,
+					unsigned int clearing)
 {
 	DAC960_Controller_T *p = disk->queue->queuedata;
 	int drive_nr = (long)disk->private_data;
 
 	if (!p->LogicalDriveInitiallyAccessible[drive_nr])
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	return 0;
 }
 
@@ -151,11 +160,11 @@ static int DAC960_revalidate_disk(struct gendisk *disk)
 	return 0;
 }
 
-static struct block_device_operations DAC960_BlockDeviceOperations = {
+static const struct block_device_operations DAC960_BlockDeviceOperations = {
 	.owner			= THIS_MODULE,
 	.open			= DAC960_open,
 	.getgeo			= DAC960_getgeo,
-	.media_changed		= DAC960_media_changed,
+	.check_events		= DAC960_check_events,
 	.revalidate_disk	= DAC960_revalidate_disk,
 };
 
@@ -1168,7 +1177,8 @@ static bool DAC960_V1_EnableMemoryMailboxInterface(DAC960_Controller_T
   int TimeoutCounter;
   int i;
 
-  
+  memset(&CommandMailbox, 0, sizeof(DAC960_V1_CommandMailbox_T));
+
   if (pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32)))
 	return DAC960_Failure(Controller, "DMA mask out of range");
   Controller->BounceBufferLimit = DMA_BIT_MASK(32);
@@ -1781,7 +1791,7 @@ static bool DAC960_V2_ReadControllerConfiguration(DAC960_Controller_T
   unsigned short LogicalDeviceNumber = 0;
   int ModelNameLength;
 
-  /* Get data into dma-able area, then copy into permanant location */
+  /* Get data into dma-able area, then copy into permanent location */
   if (!DAC960_V2_NewControllerInfo(Controller))
     return DAC960_Failure(Controller, "GET CONTROLLER INFO");
   memcpy(ControllerInfo, Controller->V2.NewControllerInformation,
@@ -2531,9 +2541,8 @@ static bool DAC960_RegisterBlockDevice(DAC960_Controller_T *Controller)
   	Controller->RequestQueue[n] = RequestQueue;
   	blk_queue_bounce_limit(RequestQueue, Controller->BounceBufferLimit);
   	RequestQueue->queuedata = Controller;
-  	blk_queue_max_hw_segments(RequestQueue, Controller->DriverScatterGatherLimit);
-	blk_queue_max_phys_segments(RequestQueue, Controller->DriverScatterGatherLimit);
-	blk_queue_max_sectors(RequestQueue, Controller->MaxBlocksPerCommand);
+	blk_queue_max_segments(RequestQueue, Controller->DriverScatterGatherLimit);
+	blk_queue_max_hw_sectors(RequestQueue, Controller->MaxBlocksPerCommand);
 	disk->queue = RequestQueue;
 	sprintf(disk->disk_name, "rd/c%dd%d", Controller->ControllerNumber, n);
 	disk->major = MajorNumber;
@@ -4619,7 +4628,8 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
   DAC960_Controller_T *Controller = Command->Controller;
   DAC960_CommandType_T CommandType = Command->CommandType;
   DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
-  DAC960_V2_IOCTL_Opcode_T CommandOpcode = CommandMailbox->Common.IOCTL_Opcode;
+  DAC960_V2_IOCTL_Opcode_T IOCTLOpcode = CommandMailbox->Common.IOCTL_Opcode;
+  DAC960_V2_CommandOpcode_T CommandOpcode = CommandMailbox->SCSI_10.CommandOpcode;
   DAC960_V2_CommandStatus_T CommandStatus = Command->V2.CommandStatus;
 
   if (CommandType == DAC960_ReadCommand ||
@@ -4691,7 +4701,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
     {
       if (Controller->ShutdownMonitoringTimer)
 	      return;
-      if (CommandOpcode == DAC960_V2_GetControllerInfo)
+      if (IOCTLOpcode == DAC960_V2_GetControllerInfo)
 	{
 	  DAC960_V2_ControllerInfo_T *NewControllerInfo =
 	    Controller->V2.NewControllerInformation;
@@ -4711,14 +4721,14 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
 	  memcpy(ControllerInfo, NewControllerInfo,
 		 sizeof(DAC960_V2_ControllerInfo_T));
 	}
-      else if (CommandOpcode == DAC960_V2_GetEvent)
+      else if (IOCTLOpcode == DAC960_V2_GetEvent)
 	{
 	  if (CommandStatus == DAC960_V2_NormalCompletion) {
 	    DAC960_V2_ReportEvent(Controller, Controller->V2.Event);
 	  }
 	  Controller->V2.NextEventSequenceNumber++;
 	}
-      else if (CommandOpcode == DAC960_V2_GetPhysicalDeviceInfoValid &&
+      else if (IOCTLOpcode == DAC960_V2_GetPhysicalDeviceInfoValid &&
 	       CommandStatus == DAC960_V2_NormalCompletion)
 	{
 	  DAC960_V2_PhysicalDeviceInfo_T *NewPhysicalDeviceInfo =
@@ -4907,7 +4917,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
 	  NewPhysicalDeviceInfo->LogicalUnit++;
 	  Controller->V2.PhysicalDeviceIndex++;
 	}
-      else if (CommandOpcode == DAC960_V2_GetPhysicalDeviceInfoValid)
+      else if (IOCTLOpcode == DAC960_V2_GetPhysicalDeviceInfoValid)
 	{
 	  unsigned int DeviceIndex;
 	  for (DeviceIndex = Controller->V2.PhysicalDeviceIndex;
@@ -4930,7 +4940,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
 	    }
 	  Controller->V2.NeedPhysicalDeviceInformation = false;
 	}
-      else if (CommandOpcode == DAC960_V2_GetLogicalDeviceInfoValid &&
+      else if (IOCTLOpcode == DAC960_V2_GetLogicalDeviceInfoValid &&
 	       CommandStatus == DAC960_V2_NormalCompletion)
 	{
 	  DAC960_V2_LogicalDeviceInfo_T *NewLogicalDeviceInfo =
@@ -5057,7 +5067,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
 			 [LogicalDeviceNumber] = true;
 	  NewLogicalDeviceInfo->LogicalDeviceNumber++;
 	}
-      else if (CommandOpcode == DAC960_V2_GetLogicalDeviceInfoValid)
+      else if (IOCTLOpcode == DAC960_V2_GetLogicalDeviceInfoValid)
 	{
 	  int LogicalDriveNumber;
 	  for (LogicalDriveNumber = 0;
@@ -6401,12 +6411,12 @@ static bool DAC960_V2_ExecuteUserCommand(DAC960_Controller_T *Controller,
 					.ScatterGatherSegments[0]
 					.SegmentByteCount =
 	    CommandMailbox->ControllerInfo.DataTransferSize;
-	  DAC960_ExecuteCommand(Command);
-	  while (Controller->V2.NewControllerInformation->PhysicalScanActive)
-	    {
-	      DAC960_ExecuteCommand(Command);
-	      sleep_on_timeout(&Controller->CommandWaitQueue, HZ);
-	    }
+	  while (1) {
+	    DAC960_ExecuteCommand(Command);
+	    if (!Controller->V2.NewControllerInformation->PhysicalScanActive)
+		break;
+	    msleep(1000);
+	  }
 	  DAC960_UserCritical("Discovery Completed\n", Controller);
  	}
     }
@@ -6421,16 +6431,10 @@ static bool DAC960_V2_ExecuteUserCommand(DAC960_Controller_T *Controller,
   return true;
 }
 
-
-/*
-  DAC960_ProcReadStatus implements reading /proc/rd/status.
-*/
-
-static int DAC960_ProcReadStatus(char *Page, char **Start, off_t Offset,
-				 int Count, int *EOF, void *Data)
+static int dac960_proc_show(struct seq_file *m, void *v)
 {
   unsigned char *StatusMessage = "OK\n";
-  int ControllerNumber, BytesAvailable;
+  int ControllerNumber;
   for (ControllerNumber = 0;
        ControllerNumber < DAC960_ControllerCount;
        ControllerNumber++)
@@ -6443,52 +6447,49 @@ static int DAC960_ProcReadStatus(char *Page, char **Start, off_t Offset,
 	  break;
 	}
     }
-  BytesAvailable = strlen(StatusMessage) - Offset;
-  if (Count >= BytesAvailable)
-    {
-      Count = BytesAvailable;
-      *EOF = true;
-    }
-  if (Count <= 0) return 0;
-  *Start = Page;
-  memcpy(Page, &StatusMessage[Offset], Count);
-  return Count;
+  seq_puts(m, StatusMessage);
+  return 0;
 }
 
+static int dac960_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, dac960_proc_show, NULL);
+}
 
-/*
-  DAC960_ProcReadInitialStatus implements reading /proc/rd/cN/initial_status.
-*/
+static const struct file_operations dac960_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= dac960_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
 
-static int DAC960_ProcReadInitialStatus(char *Page, char **Start, off_t Offset,
-					int Count, int *EOF, void *Data)
+static int dac960_initial_status_proc_show(struct seq_file *m, void *v)
 {
-  DAC960_Controller_T *Controller = (DAC960_Controller_T *) Data;
-  int BytesAvailable = Controller->InitialStatusLength - Offset;
-  if (Count >= BytesAvailable)
-    {
-      Count = BytesAvailable;
-      *EOF = true;
-    }
-  if (Count <= 0) return 0;
-  *Start = Page;
-  memcpy(Page, &Controller->CombinedStatusBuffer[Offset], Count);
-  return Count;
+	DAC960_Controller_T *Controller = (DAC960_Controller_T *)m->private;
+	seq_printf(m, "%.*s", Controller->InitialStatusLength, Controller->CombinedStatusBuffer);
+	return 0;
 }
 
+static int dac960_initial_status_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, dac960_initial_status_proc_show, PDE_DATA(inode));
+}
 
-/*
-  DAC960_ProcReadCurrentStatus implements reading /proc/rd/cN/current_status.
-*/
+static const struct file_operations dac960_initial_status_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= dac960_initial_status_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
 
-static int DAC960_ProcReadCurrentStatus(char *Page, char **Start, off_t Offset,
-					int Count, int *EOF, void *Data)
+static int dac960_current_status_proc_show(struct seq_file *m, void *v)
 {
-  DAC960_Controller_T *Controller = (DAC960_Controller_T *) Data;
+  DAC960_Controller_T *Controller = (DAC960_Controller_T *) m->private;
   unsigned char *StatusMessage =
     "No Rebuild or Consistency Check in Progress\n";
   int ProgressMessageLength = strlen(StatusMessage);
-  int BytesAvailable;
   if (jiffies != Controller->LastCurrentStatusTime)
     {
       Controller->CurrentStatusLength = 0;
@@ -6512,56 +6513,48 @@ static int DAC960_ProcReadCurrentStatus(char *Page, char **Start, off_t Offset,
 	}
       Controller->LastCurrentStatusTime = jiffies;
     }
-  BytesAvailable = Controller->CurrentStatusLength - Offset;
-  if (Count >= BytesAvailable)
-    {
-      Count = BytesAvailable;
-      *EOF = true;
-    }
-  if (Count <= 0) return 0;
-  *Start = Page;
-  memcpy(Page, &Controller->CurrentStatusBuffer[Offset], Count);
-  return Count;
+	seq_printf(m, "%.*s", Controller->CurrentStatusLength, Controller->CurrentStatusBuffer);
+	return 0;
 }
 
+static int dac960_current_status_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, dac960_current_status_proc_show, PDE_DATA(inode));
+}
 
-/*
-  DAC960_ProcReadUserCommand implements reading /proc/rd/cN/user_command.
-*/
+static const struct file_operations dac960_current_status_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= dac960_current_status_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
 
-static int DAC960_ProcReadUserCommand(char *Page, char **Start, off_t Offset,
-				      int Count, int *EOF, void *Data)
+static int dac960_user_command_proc_show(struct seq_file *m, void *v)
 {
-  DAC960_Controller_T *Controller = (DAC960_Controller_T *) Data;
-  int BytesAvailable = Controller->UserStatusLength - Offset;
-  if (Count >= BytesAvailable)
-    {
-      Count = BytesAvailable;
-      *EOF = true;
-    }
-  if (Count <= 0) return 0;
-  *Start = Page;
-  memcpy(Page, &Controller->UserStatusBuffer[Offset], Count);
-  return Count;
-}
+	DAC960_Controller_T *Controller = (DAC960_Controller_T *)m->private;
 
+	seq_printf(m, "%.*s", Controller->UserStatusLength, Controller->UserStatusBuffer);
+	return 0;
+}
 
-/*
-  DAC960_ProcWriteUserCommand implements writing /proc/rd/cN/user_command.
-*/
+static int dac960_user_command_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, dac960_user_command_proc_show, PDE_DATA(inode));
+}
 
-static int DAC960_ProcWriteUserCommand(struct file *file,
+static ssize_t dac960_user_command_proc_write(struct file *file,
 				       const char __user *Buffer,
-				       unsigned long Count, void *Data)
+				       size_t Count, loff_t *pos)
 {
-  DAC960_Controller_T *Controller = (DAC960_Controller_T *) Data;
+  DAC960_Controller_T *Controller = PDE_DATA(file_inode(file));
   unsigned char CommandBuffer[80];
   int Length;
   if (Count > sizeof(CommandBuffer)-1) return -EINVAL;
   if (copy_from_user(CommandBuffer, Buffer, Count)) return -EFAULT;
   CommandBuffer[Count] = '\0';
   Length = strlen(CommandBuffer);
-  if (CommandBuffer[Length-1] == '\n')
+  if (Length > 0 && CommandBuffer[Length-1] == '\n')
     CommandBuffer[--Length] = '\0';
   if (Controller->FirmwareType == DAC960_V1_Controller)
     return (DAC960_V1_ExecuteUserCommand(Controller, CommandBuffer)
@@ -6571,6 +6564,14 @@ static int DAC960_ProcWriteUserCommand(struct file *file,
 	    ? Count : -EBUSY);
 }
 
+static const struct file_operations dac960_user_command_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= dac960_user_command_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= dac960_user_command_proc_write,
+};
 
 /*
   DAC960_CreateProcEntries creates the /proc/rd/... entries for the
@@ -6579,30 +6580,21 @@ static int DAC960_ProcWriteUserCommand(struct file *file,
 
 static void DAC960_CreateProcEntries(DAC960_Controller_T *Controller)
 {
-	struct proc_dir_entry *StatusProcEntry;
 	struct proc_dir_entry *ControllerProcEntry;
-	struct proc_dir_entry *UserCommandProcEntry;
 
 	if (DAC960_ProcDirectoryEntry == NULL) {
-  		DAC960_ProcDirectoryEntry = proc_mkdir("rd", NULL);
-  		StatusProcEntry = create_proc_read_entry("status", 0,
-					   DAC960_ProcDirectoryEntry,
-					   DAC960_ProcReadStatus, NULL);
+		DAC960_ProcDirectoryEntry = proc_mkdir("rd", NULL);
+		proc_create("status", 0, DAC960_ProcDirectoryEntry,
+			    &dac960_proc_fops);
 	}
 
-      sprintf(Controller->ControllerName, "c%d", Controller->ControllerNumber);
-      ControllerProcEntry = proc_mkdir(Controller->ControllerName,
-				       DAC960_ProcDirectoryEntry);
-      create_proc_read_entry("initial_status", 0, ControllerProcEntry,
-			     DAC960_ProcReadInitialStatus, Controller);
-      create_proc_read_entry("current_status", 0, ControllerProcEntry,
-			     DAC960_ProcReadCurrentStatus, Controller);
-      UserCommandProcEntry =
-	create_proc_read_entry("user_command", S_IWUSR | S_IRUSR,
-			       ControllerProcEntry, DAC960_ProcReadUserCommand,
-			       Controller);
-      UserCommandProcEntry->write_proc = DAC960_ProcWriteUserCommand;
-      Controller->ControllerProcEntry = ControllerProcEntry;
+	sprintf(Controller->ControllerName, "c%d", Controller->ControllerNumber);
+	ControllerProcEntry = proc_mkdir(Controller->ControllerName,
+					 DAC960_ProcDirectoryEntry);
+	proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller);
+	proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller);
+	proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller);
+	Controller->ControllerProcEntry = ControllerProcEntry;
 }
 
 
@@ -6634,7 +6626,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
   long ErrorCode = 0;
   if (!capable(CAP_SYS_ADMIN)) return -EACCES;
 
-  lock_kernel();
+  mutex_lock(&DAC960_mutex);
   switch (Request)
     {
     case DAC960_IOCTL_GET_CONTROLLER_COUNT:
@@ -6652,7 +6644,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
 	else ErrorCode = get_user(ControllerNumber,
 			     &UserSpaceControllerInfo->ControllerNumber);
 	if (ErrorCode != 0)
-		break;;
+		break;
 	ErrorCode = -ENXIO;
 	if (ControllerNumber < 0 ||
 	    ControllerNumber > DAC960_ControllerCount - 1) {
@@ -6660,7 +6652,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
 	}
 	Controller = DAC960_Controllers[ControllerNumber];
 	if (Controller == NULL)
-		break;;
+		break;
 	memset(&ControllerInfo, 0, sizeof(DAC960_ControllerInfo_T));
 	ControllerInfo.ControllerNumber = ControllerNumber;
 	ControllerInfo.FirmwareType = Controller->FirmwareType;
@@ -7043,18 +7035,16 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
 		ErrorCode = -EFAULT;
 		break;
 	}
-	while (Controller->V2.HealthStatusBuffer->StatusChangeCounter
-	       == HealthStatusBuffer.StatusChangeCounter &&
-	       Controller->V2.HealthStatusBuffer->NextEventSequenceNumber
-	       == HealthStatusBuffer.NextEventSequenceNumber)
-	  {
-	    interruptible_sleep_on_timeout(&Controller->HealthStatusWaitQueue,
-					   DAC960_MonitoringTimerInterval);
-	    if (signal_pending(current)) {
-	    	ErrorCode = -EINTR;
-	    	break;
-	    }
-	  }
+	ErrorCode = wait_event_interruptible_timeout(Controller->HealthStatusWaitQueue,
+			!(Controller->V2.HealthStatusBuffer->StatusChangeCounter
+			    == HealthStatusBuffer.StatusChangeCounter &&
+			  Controller->V2.HealthStatusBuffer->NextEventSequenceNumber
+			    == HealthStatusBuffer.NextEventSequenceNumber),
+			DAC960_MonitoringTimerInterval);
+	if (ErrorCode == -ERESTARTSYS) {
+		ErrorCode = -EINTR;
+		break;
+	}
 	if (copy_to_user(GetHealthStatus.HealthStatusBuffer,
 			 Controller->V2.HealthStatusBuffer,
 			 sizeof(DAC960_V2_HealthStatusBuffer_T)))
@@ -7062,16 +7052,18 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
 	else
 		ErrorCode =  0;
       }
+      break;
       default:
 	ErrorCode = -ENOTTY;
     }
-  unlock_kernel();
+  mutex_unlock(&DAC960_mutex);
   return ErrorCode;
 }
 
 static const struct file_operations DAC960_gam_fops = {
 	.owner		= THIS_MODULE,
-	.unlocked_ioctl	= DAC960_gam_ioctl
+	.unlocked_ioctl	= DAC960_gam_ioctl,
+	.llseek		= noop_llseek,
 };
 
 static struct miscdevice DAC960_gam_dev = {
@@ -7114,7 +7106,7 @@ static struct DAC960_privdata DAC960_BA_privdata = {
 
 static struct DAC960_privdata DAC960_LP_privdata = {
 	.HardwareType =		DAC960_LP_Controller,
-	.FirmwareType 	=	DAC960_LP_Controller,
+	.FirmwareType 	=	DAC960_V2_Controller,
 	.InterruptHandler =	DAC960_LP_InterruptHandler,
 	.MemoryWindowSize =	DAC960_LP_RegisterWindowSize,
 };
@@ -7147,7 +7139,7 @@ static struct DAC960_privdata DAC960_P_privdata = {
 	.MemoryWindowSize =	DAC960_PD_RegisterWindowSize,
 };
 
-static struct pci_device_id DAC960_id_table[] = {
+static const struct pci_device_id DAC960_id_table[] = {
 	{
 		.vendor 	= PCI_VENDOR_ID_MYLEX,
 		.device		= PCI_DEVICE_ID_MYLEX_DAC960_GEM,
@@ -7209,7 +7201,7 @@ static struct pci_driver DAC960_pci_driver = {
 	.remove		= DAC960_Remove,
 };
 
-static int DAC960_init_module(void)
+static int __init DAC960_init_module(void)
 {
 	int ret;
 
@@ -7221,7 +7213,7 @@ static int DAC960_init_module(void)
 	return ret;
 }
 
-static void DAC960_cleanup_module(void)
+static void __exit DAC960_cleanup_module(void)
 {
 	int i;
 
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index f42fa50d355..014a1cfc41c 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -15,6 +15,9 @@ menuconfig BLK_DEV
 
 if BLK_DEV
 
+config BLK_DEV_NULL_BLK
+	tristate "Null test block driver"
+
 config BLK_DEV_FD
 	tristate "Normal floppy disk support"
 	depends on ARCH_MAY_HAVE_PC_FDC
@@ -63,18 +66,16 @@ config AMIGA_Z2RAM
 	  To compile this driver as a module, choose M here: the
 	  module will be called z2ram.
 
-config BLK_DEV_XD
-	tristate "XT hard disk support"
-	depends on ISA && ISA_DMA_API
-	select CHECK_SIGNATURE
+config GDROM
+	tristate "SEGA Dreamcast GD-ROM drive"
+	depends on SH_DREAMCAST
 	help
-	  Very old 8 bit hard disk controllers used in the IBM XT computer
-	  will be supported if you say Y here.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called xd.
-
-	  It's pretty unlikely that you have one of these: say N.
+	  A standard SEGA Dreamcast comes with a modified CD ROM drive called a
+	  "GD-ROM" by SEGA to signify it is capable of reading special disks
+	  with up to 1 GB of data. This drive will also read standard CD ROM
+	  disks. Select this option to access any disks in your GD ROM drive.
+	  Most users will want to say "Y" here.
+	  You can also build this as a module which will be called gdrom.
 
 config PARIDE
 	tristate "Parallel port IDE device support"
@@ -103,22 +104,15 @@ config PARIDE
 	  "MicroSolutions backpack protocol", "DataStor Commuter protocol"
 	  etc.).
 
-config GDROM
-	tristate "SEGA Dreamcast GD-ROM drive"
-	depends on SH_DREAMCAST
-	help
-	  A standard SEGA Dreamcast comes with a modified CD ROM drive called a
-	  "GD-ROM" by SEGA to signify it is capable of reading special disks
-	  with up to 1 GB of data. This drive will also read standard CD ROM
-	  disks. Select this option to access any disks in your GD ROM drive.
-	  Most users will want to say "Y" here.
-	  You can also build this as a module which will be called gdrom.ko
-
 source "drivers/block/paride/Kconfig"
 
+source "drivers/block/mtip32xx/Kconfig"
+
+source "drivers/block/zram/Kconfig"
+
 config BLK_CPQ_DA
 	tristate "Compaq SMART2 support"
-	depends on PCI && VIRT_TO_BUS
+	depends on PCI && VIRT_TO_BUS && 0
 	help
 	  This is the driver for Compaq Smart Array controllers.  Everyone
 	  using these boards should say Y here.  See the file
@@ -129,6 +123,7 @@ config BLK_CPQ_DA
 config BLK_CPQ_CISS_DA
 	tristate "Compaq Smart Array 5xxx support"
 	depends on PCI
+	select CHECK_SIGNATURE
 	help
 	  This is the driver for Compaq Smart Array 5xxx controllers.
 	  Everyone using these boards should say Y here.
@@ -164,8 +159,8 @@ config BLK_DEV_DAC960
 	  module will be called DAC960.
 
 config BLK_DEV_UMEM
-	tristate "Micro Memory MM5415 Battery Backed RAM support (EXPERIMENTAL)"
-	depends on PCI && EXPERIMENTAL
+	tristate "Micro Memory MM5415 Battery Backed RAM support"
+	depends on PCI
 	---help---
 	  Saying Y here will include support for the MM5415 family of
 	  battery backed (Non-volatile) RAM cards.
@@ -256,6 +251,21 @@ config BLK_DEV_LOOP
 
 	  Most users will answer N here.
 
+config BLK_DEV_LOOP_MIN_COUNT
+	int "Number of loop devices to pre-create at init time"
+	depends on BLK_DEV_LOOP
+	default 8
+	help
+	  Static number of loop devices to be unconditionally pre-created
+	  at init time.
+
+	  This default value can be overridden on the kernel command
+	  line or with module-parameter loop.max_loop.
+
+	  The historic default is 8. If a late 2011 version of losetup(8)
+	  is used, it can be set to 0, since needed loop devices can be
+	  dynamically allocated with the /dev/loop-control interface.
+
 config BLK_DEV_CRYPTOLOOP
 	tristate "Cryptoloop Support"
 	select CRYPTO
@@ -271,6 +281,8 @@ config BLK_DEV_CRYPTOLOOP
 	  instead, which can be configured to be on-disk compatible with the
 	  cryptoloop device.
 
+source "drivers/block/drbd/Kconfig"
+
 config BLK_DEV_NBD
 	tristate "Network block device support"
 	depends on NET
@@ -298,6 +310,43 @@ config BLK_DEV_NBD
 
 	  If unsure, say N.
 
+config BLK_DEV_NVME
+	tristate "NVM Express block device"
+	depends on PCI
+	---help---
+	  The NVM Express driver is for solid state drives directly
+	  connected to the PCI or PCI Express bus.  If you know you
+	  don't have one of these, it is safe to answer N.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called nvme.
+
+config BLK_DEV_SKD
+	tristate "STEC S1120 Block Driver"
+	depends on PCI
+	depends on 64BIT
+	---help---
+	Saying Y or M here will enable support for the
+	STEC, Inc. S1120 PCIe SSD.
+
+	Use devices /dev/skd$N and /dev/skd$Np$M.
+
+config BLK_DEV_OSD
+	tristate "OSD object-as-blkdev support"
+	depends on SCSI_OSD_ULD
+	---help---
+	  Saying Y or M here will allow the exporting of a single SCSI
+	  OSD (object-based storage) object as a Linux block device.
+
+	  For example, if you create a 2G object on an OSD device,
+	  you can then use this module to present that 2G object as
+	  a Linux block device.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called osdblk.
+
+	  If unsure, say N.
+
 config BLK_DEV_SX8
 	tristate "Promise SATA SX8 support"
 	depends on PCI
@@ -307,18 +356,6 @@ config BLK_DEV_SX8
 
 	  Use devices /dev/sx8/$N and /dev/sx8/$Np$M.
 
-config BLK_DEV_UB
-	tristate "Low Performance USB Block driver"
-	depends on USB
-	help
-	  This driver supports certain USB attached storage devices
-	  such as flash keys.
-
-	  If you enable this driver, it is recommended to avoid conflicts
-	  with usb-storage by enabling USB_LIBUSUAL.
-
-	  If unsure, say N.
-
 config BLK_DEV_RAM
 	tristate "RAM block device support"
 	---help---
@@ -333,7 +370,8 @@ config BLK_DEV_RAM
 	  For details, read <file:Documentation/blockdev/ramdisk.txt>.
 
 	  To compile this driver as a module, choose M here: the
-	  module will be called rd.
+	  module will be called brd. An alias "rd" has been defined
+	  for historical reasons.
 
 	  Most normal users won't need the RAM disk functionality, and can
 	  thus say N here.
@@ -396,8 +434,8 @@ config CDROM_PKTCDVD_BUFFERS
 	  a disc is opened for writing.
 
 config CDROM_PKTCDVD_WCACHE
-	bool "Enable write caching (EXPERIMENTAL)"
-	depends on CDROM_PKTCDVD && EXPERIMENTAL
+	bool "Enable write caching"
+	depends on CDROM_PKTCDVD
 	help
 	  If enabled, write caching will be set for the CD-R/W device. For now
 	  this option is dangerous unless the CD-RW media is known good, as we
@@ -438,7 +476,7 @@ source "drivers/s390/block/Kconfig"
 
 config XILINX_SYSACE
 	tristate "Xilinx SystemACE support"
-	depends on 4xx
+	depends on 4xx || MICROBLAZE
 	help
 	  Include support for the Xilinx SystemACE CompactFlash interface
 
@@ -446,14 +484,36 @@ config XEN_BLKDEV_FRONTEND
 	tristate "Xen virtual block device support"
 	depends on XEN
 	default y
+	select XEN_XENBUS_FRONTEND
 	help
 	  This driver implements the front-end of the Xen virtual
 	  block device driver.  It communicates with a back-end driver
 	  in another domain which drives the actual block device.
 
+config XEN_BLKDEV_BACKEND
+	tristate "Xen block-device backend driver"
+	depends on XEN_BACKEND
+	help
+	  The block-device backend driver allows the kernel to export its
+	  block devices to other guests via a high-performance shared-memory
+	  interface.
+
+	  The corresponding Linux frontend driver is enabled by the
+	  CONFIG_XEN_BLKDEV_FRONTEND configuration option.
+
+	  The backend driver attaches itself to any block device specified
+	  in the XenBus configuration. There are no limits on what the block
+	  device is, as long as it has a major and minor.
+
+	  If you are compiling a kernel to run in a Xen block backend driver
+	  domain (often this is domain 0) you should say Y here. To
+	  compile this driver as a module, choose M here: the module
+	  will be called xen-blkback.
+
+
 config VIRTIO_BLK
-	tristate "Virtio block driver (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && VIRTIO
+	tristate "Virtio block driver"
+	depends on VIRTIO
 	---help---
 	  This is the virtual block driver for virtio.  It can be used with
           lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
@@ -461,7 +521,7 @@ config VIRTIO_BLK
 config BLK_DEV_HD
 	bool "Very old hard disk (MFM/RLL/IDE) driver"
 	depends on HAVE_IDE
-	depends on !ARM || ARCH_RPC || ARCH_SHARK || BROKEN
+	depends on !ARM || ARCH_RPC || BROKEN
 	help
 	  This is a very old hard disk driver that lacks the enhanced
 	  functionality of the newer ones.
@@ -470,4 +530,31 @@ config BLK_DEV_HD
 
 	  If unsure, say N.
 
+config BLK_DEV_RBD
+	tristate "Rados block device (RBD)"
+	depends on INET && BLOCK
+	select CEPH_LIB
+	select LIBCRC32C
+	select CRYPTO_AES
+	select CRYPTO
+	default n
+	help
+	  Say Y here if you want to include the Rados block device, which stripes
+	  a block device over objects stored in the Ceph distributed object
+	  store.
+
+	  More information at http://ceph.newdream.net/.
+
+	  If unsure, say N.
+
+config BLK_DEV_RSXX
+	tristate "IBM Flash Adapter 900GB Full Height PCIe Device Driver"
+	depends on PCI
+	help
+	  Device driver for IBM's high speed PCIe SSD
+	  storage device: Flash Adapter 900GB Full Height.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called rsxx.
+
 endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 7755a5e2a85..02b688d1438 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -15,7 +15,6 @@ obj-$(CONFIG_ATARI_FLOPPY)	+= ataflop.o
 obj-$(CONFIG_AMIGA_Z2RAM)	+= z2ram.o
 obj-$(CONFIG_BLK_DEV_RAM)	+= brd.o
 obj-$(CONFIG_BLK_DEV_LOOP)	+= loop.o
-obj-$(CONFIG_BLK_DEV_XD)	+= xd.o
 obj-$(CONFIG_BLK_CPQ_DA)	+= cpqarray.o
 obj-$(CONFIG_BLK_CPQ_CISS_DA)  += cciss.o
 obj-$(CONFIG_BLK_DEV_DAC960)	+= DAC960.o
@@ -23,17 +22,28 @@ obj-$(CONFIG_XILINX_SYSACE)	+= xsysace.o
 obj-$(CONFIG_CDROM_PKTCDVD)	+= pktcdvd.o
 obj-$(CONFIG_MG_DISK)		+= mg_disk.o
 obj-$(CONFIG_SUNVDC)		+= sunvdc.o
+obj-$(CONFIG_BLK_DEV_NVME)	+= nvme.o
+obj-$(CONFIG_BLK_DEV_SKD)	+= skd.o
+obj-$(CONFIG_BLK_DEV_OSD)	+= osdblk.o
 
 obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
 obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
 obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
 obj-$(CONFIG_VIRTIO_BLK)	+= virtio_blk.o
 
-obj-$(CONFIG_VIODASD)		+= viodasd.o
 obj-$(CONFIG_BLK_DEV_SX8)	+= sx8.o
-obj-$(CONFIG_BLK_DEV_UB)	+= ub.o
 obj-$(CONFIG_BLK_DEV_HD)	+= hd.o
 
 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= xen-blkfront.o
+obj-$(CONFIG_XEN_BLKDEV_BACKEND)	+= xen-blkback/
+obj-$(CONFIG_BLK_DEV_DRBD)     += drbd/
+obj-$(CONFIG_BLK_DEV_RBD)     += rbd.o
+obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX)	+= mtip32xx/
 
-swim_mod-objs	:= swim.o swim_asm.o
+obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
+obj-$(CONFIG_BLK_DEV_NULL_BLK)	+= null_blk.o
+obj-$(CONFIG_ZRAM) += zram/
+
+nvme-y		:= nvme-core.o nvme-scsi.o
+skd-y		:= skd_main.o
+swim_mod-y	:= swim.o swim_asm.o
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 9c6e5b0fe89..758da2287d9 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -54,17 +54,20 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 
 #include <linux/fd.h>
 #include <linux/hdreg.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/mutex.h>
 #include <linux/amifdreg.h>
 #include <linux/amifd.h>
-#include <linux/buffer_head.h>
+#include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/interrupt.h>
+#include <linux/platform_device.h>
 
 #include <asm/setup.h>
 #include <asm/uaccess.h>
@@ -106,13 +109,12 @@
 #define FD_HD_3 	0x55555555  /* high-density 3.5" (1760K) drive */
 #define FD_DD_5 	0xaaaaaaaa  /* double-density 5.25" (440K) drive */
 
+static DEFINE_MUTEX(amiflop_mutex);
 static unsigned long int fd_def_df0 = FD_DD_3;     /* default for df0 if it doesn't identify */
 
 module_param(fd_def_df0, ulong, 0);
 MODULE_LICENSE("GPL");
 
-static struct request_queue *floppy_queue;
-
 /*
  *  Macros
  */
@@ -161,6 +163,7 @@ static volatile int selected = -1;	/* currently selected drive */
 static int writepending;
 static int writefromint;
 static char *raw_buf;
+static int fdc_queue;
 
 static DEFINE_SPINLOCK(amiflop_lock);
 
@@ -340,7 +343,7 @@ static int fd_motor_on(int nr)
 		unit[nr].motor = 1;
 		fd_select(nr);
 
-		INIT_COMPLETION(motor_on_completion);
+		reinit_completion(&motor_on_completion);
 		motor_on_timer.data = nr;
 		mod_timer(&motor_on_timer, jiffies + HZ/2);
 
@@ -1331,6 +1334,42 @@ static int get_track(int drive, int track)
 	return -1;
 }
 
+/*
+ * Pick the next request to service.
+ *
+ * Drives are polled round-robin, starting with the slot after the one
+ * fdc_queue was left on by the previous call, so each drive with pending
+ * work gets one request per turn.  Slots marked FD_NODRIVE are skipped.
+ *
+ * Returns the fetched request, or NULL once every slot has been tried
+ * without finding one.
+ */
+static struct request *set_next_request(void)
+{
+	struct request_queue *q;
+	struct request *next = NULL;
+	int tries;
+
+	/* move on from the slot used by the previous call */
+	if (++fdc_queue == FD_MAX_UNITS)
+		fdc_queue = 0;
+
+	for (tries = 0; tries < FD_MAX_UNITS; tries++) {
+		if (unit[fdc_queue].type->code != FD_NODRIVE) {
+			q = unit[fdc_queue].gendisk->queue;
+			if (q)
+				next = blk_fetch_request(q);
+			if (next)
+				break;	/* fdc_queue stays on this drive */
+		}
+
+		if (++fdc_queue == FD_MAX_UNITS)
+			fdc_queue = 0;
+	}
+
+	return next;
+}
+
 static void redo_fd_request(void)
 {
 	struct request *rq;
@@ -1342,7 +1381,7 @@ static void redo_fd_request(void)
 	int err;
 
 next_req:
-	rq = blk_fetch_request(floppy_queue);
+	rq = set_next_request();
 	if (!rq) {
 		/* Nothing left to do */
 		return;
@@ -1367,7 +1406,7 @@ next_segment:
 
 		track = block / (floppy->dtype->sects * floppy->type->sect_mult);
 		sector = block % (floppy->dtype->sects * floppy->type->sect_mult);
-		data = rq->buffer + 512 * cnt;
+		data = bio_data(rq->bio) + 512 * cnt;
 #ifdef DEBUG
 		printk("access to track %d, sector %d, with buffer at "
 		       "0x%08lx\n", track, sector, data);
@@ -1421,7 +1460,7 @@ static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static int fd_ioctl(struct block_device *bdev, fmode_t mode,
+static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode,
 		    unsigned int cmd, unsigned long param)
 {
 	struct amiga_floppy_struct *p = bdev->bd_disk->private_data;
@@ -1498,6 +1537,18 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
 	return 0;
 }
 
+/* Serialise ioctls: run fd_locked_ioctl() with amiflop_mutex held. */
+static int fd_ioctl(struct block_device *bdev, fmode_t mode,
+			     unsigned int cmd, unsigned long param)
+{
+	int err;
+
+	mutex_lock(&amiflop_mutex);
+	err = fd_locked_ioctl(bdev, mode, cmd, param);
+	mutex_unlock(&amiflop_mutex);
+	return err;
+}
+
 static void fd_probe(int dev)
 {
 	unsigned long code;
@@ -1540,10 +1591,13 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 	int old_dev;
 	unsigned long flags;
 
+	mutex_lock(&amiflop_mutex);
 	old_dev = fd_device[drive];
 
-	if (fd_ref[drive] && old_dev != system)
+	if (fd_ref[drive] && old_dev != system) {
+		mutex_unlock(&amiflop_mutex);
 		return -EBUSY;
+	}
 
 	if (mode & (FMODE_READ|FMODE_WRITE)) {
 		check_disk_change(bdev);
@@ -1556,8 +1610,10 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 			fd_deselect (drive);
 			rel_fdc();
 
-			if (wrprot)
+			if (wrprot) {
+				mutex_unlock(&amiflop_mutex);
 				return -EROFS;
+			}
 		}
 	}
 
@@ -1574,14 +1630,16 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 	printk(KERN_INFO "fd%d: accessing %s-disk with %s-layout\n",drive,
 	       unit[drive].type->name, data_types[system].name);
 
+	mutex_unlock(&amiflop_mutex);
 	return 0;
 }
 
-static int floppy_release(struct gendisk *disk, fmode_t mode)
+static void floppy_release(struct gendisk *disk, fmode_t mode)
 {
 	struct amiga_floppy_struct *p = disk->private_data;
 	int drive = p - unit;
 
+	mutex_lock(&amiflop_mutex);
 	if (unit[drive].dirty == 1) {
 		del_timer (flush_track_timer + drive);
 		non_int_flush_track (drive);
@@ -1595,16 +1653,16 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
 /* the mod_use counter is handled this way */
 	floppy_off (drive | 0x40000000);
 #endif
-	return 0;
+	mutex_unlock(&amiflop_mutex);
 }
 
 /*
- * floppy-change is never called from an interrupt, so we can relax a bit
+ * check_events is never called from an interrupt, so we can relax a bit
  * here, sleep etc. Note that floppy-on tries to set current_DOR to point
  * to the desired drive, but it will probably not survive the sleep if
  * several floppies are used at the same time: thus the loop.
  */
-static int amiga_floppy_change(struct gendisk *disk)
+static unsigned amiga_check_events(struct gendisk *disk, unsigned int clearing)
 {
 	struct amiga_floppy_struct *p = disk->private_data;
 	int drive = p - unit;
@@ -1627,25 +1685,25 @@ static int amiga_floppy_change(struct gendisk *disk)
 		p->dirty = 0;
 		writepending = 0; /* if this was true before, too bad! */
 		writefromint = 0;
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	}
 	return 0;
 }
 
-static struct block_device_operations floppy_fops = {
+static const struct block_device_operations floppy_fops = {
 	.owner		= THIS_MODULE,
 	.open		= floppy_open,
 	.release	= floppy_release,
-	.locked_ioctl	= fd_ioctl,
+	.ioctl		= fd_ioctl,
 	.getgeo		= fd_getgeo,
-	.media_changed	= amiga_floppy_change,
+	.check_events	= amiga_check_events,
 };
 
 static int __init fd_probe_drives(void)
 {
 	int drive,drives,nomem;
 
-	printk(KERN_INFO "FD: probing units\n" KERN_INFO "found ");
+	printk(KERN_INFO "FD: probing units\nfound ");
 	drives=0;
 	nomem=0;
 	for(drive=0;drive<FD_MAX_UNITS;drive++) {
@@ -1659,6 +1717,13 @@ static int __init fd_probe_drives(void)
 			continue;
 		}
 		unit[drive].gendisk = disk;
+
+		disk->queue = blk_init_queue(do_fd_request, &amiflop_lock);
+		if (!disk->queue) {
+			unit[drive].type->code = FD_NODRIVE;
+			continue;
+		}
+
 		drives++;
 		if ((unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL)) == NULL) {
 			printk("no mem for ");
@@ -1672,7 +1737,6 @@ static int __init fd_probe_drives(void)
 		disk->fops = &floppy_fops;
 		sprintf(disk->disk_name, "fd%d", drive);
 		disk->private_data = &unit[drive];
-		disk->queue = floppy_queue;
 		set_capacity(disk, 880*2);
 		add_disk(disk);
 	}
@@ -1695,34 +1759,18 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data)
 	return get_disk(unit[drive].gendisk);
 }
 
-static int __init amiga_floppy_init(void)
+static int __init amiga_floppy_probe(struct platform_device *pdev)
 {
 	int i, ret;
 
-	if (!MACH_IS_AMIGA)
-		return -ENODEV;
-
-	if (!AMIGAHW_PRESENT(AMI_FLOPPY))
-		return -ENODEV;
-
 	if (register_blkdev(FLOPPY_MAJOR,"fd"))
 		return -EBUSY;
 
-	/*
-	 *  We request DSKPTR, DSKLEN and DSKDATA only, because the other
-	 *  floppy registers are too spreaded over the custom register space
-	 */
-	ret = -EBUSY;
-	if (!request_mem_region(CUSTOM_PHYSADDR+0x20, 8, "amiflop [Paula]")) {
-		printk("fd: cannot get floppy registers\n");
-		goto out_blkdev;
-	}
-
 	ret = -ENOMEM;
-	if ((raw_buf = (char *)amiga_chip_alloc (RAW_BUF_SIZE, "Floppy")) ==
-	    NULL) {
+	raw_buf = amiga_chip_alloc(RAW_BUF_SIZE, "Floppy");
+	if (!raw_buf) {
 		printk("fd: cannot get chip mem buffer\n");
-		goto out_memregion;
+		goto out_blkdev;
 	}
 
 	ret = -EBUSY;
@@ -1736,11 +1784,6 @@ static int __init amiga_floppy_init(void)
 		goto out_irq2;
 	}
 
-	ret = -ENOMEM;
-	floppy_queue = blk_init_queue(do_fd_request, &amiflop_lock);
-	if (!floppy_queue)
-		goto out_queue;
-
 	ret = -ENODEV;
 	if (fd_probe_drives() < 1) /* No usable drives */
 		goto out_probe;
@@ -1784,33 +1827,29 @@ static int __init amiga_floppy_init(void)
 	return 0;
 
 out_probe:
-	blk_cleanup_queue(floppy_queue);
-out_queue:
 	free_irq(IRQ_AMIGA_CIAA_TB, NULL);
 out_irq2:
 	free_irq(IRQ_AMIGA_DSKBLK, NULL);
 out_irq:
 	amiga_chip_free(raw_buf);
-out_memregion:
-	release_mem_region(CUSTOM_PHYSADDR+0x20, 8);
 out_blkdev:
 	unregister_blkdev(FLOPPY_MAJOR,"fd");
 	return ret;
 }
 
-module_init(amiga_floppy_init);
-#ifdef MODULE
-
 #if 0 /* not safe to unload */
-void cleanup_module(void)
+static int __exit amiga_floppy_remove(struct platform_device *pdev)
 {
 	int i;
 
 	for( i = 0; i < FD_MAX_UNITS; i++) {
 		if (unit[i].type->code != FD_NODRIVE) {
+			struct request_queue *q = unit[i].gendisk->queue;
 			del_gendisk(unit[i].gendisk);
 			put_disk(unit[i].gendisk);
 			kfree(unit[i].trackbuf);
+			if (q)
+				blk_cleanup_queue(q);
 		}
 	}
 	blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
@@ -1818,13 +1857,25 @@ void cleanup_module(void)
 	free_irq(IRQ_AMIGA_DSKBLK, NULL);
 	custom.dmacon = DMAF_DISK; /* disable DMA */
 	amiga_chip_free(raw_buf);
-	blk_cleanup_queue(floppy_queue);
-	release_mem_region(CUSTOM_PHYSADDR+0x20, 8);
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
 }
 #endif
 
-#else
+static struct platform_driver amiga_floppy_driver = {
+	.driver   = {
+		.name	= "amiga-floppy",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init amiga_floppy_init(void)
+{
+	return platform_driver_probe(&amiga_floppy_driver, amiga_floppy_probe);
+}
+
+module_init(amiga_floppy_init);
+
+#ifndef MODULE
 static int __init amiga_floppy_setup (char *str)
 {
 	int n;
@@ -1839,3 +1890,5 @@ static int __init amiga_floppy_setup (char *str)
 
 __setup("floppy=", amiga_floppy_setup);
 #endif
+
+MODULE_ALIAS("platform:amiga-floppy");
diff --git a/drivers/block/aoe/Makefile b/drivers/block/aoe/Makefile
index e76d997183c..06ea82cdf27 100644
--- a/drivers/block/aoe/Makefile
+++ b/drivers/block/aoe/Makefile
@@ -3,4 +3,4 @@
 #
 
 obj-$(CONFIG_ATA_OVER_ETH)	+= aoe.o
-aoe-objs := aoeblk.o aoechr.o aoecmd.o aoedev.o aoemain.o aoenet.o
+aoe-y := aoeblk.o aoechr.o aoecmd.o aoedev.o aoemain.o aoenet.o
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 5e41e6dd657..9220f8e833d 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -1,5 +1,5 @@
-/* Copyright (c) 2007 Coraid, Inc.  See COPYING for GPL terms. */
-#define VERSION "47"
+/* Copyright (c) 2013 Coraid, Inc.  See COPYING for GPL terms. */
+#define VERSION "85"
 #define AOE_MAJOR 152
 #define DEVICE_NAME "aoe"
 
@@ -10,10 +10,7 @@
 #define AOE_PARTITIONS (16)
 #endif
 
-#define SYSMINOR(aoemajor, aoeminor) ((aoemajor) * NPERSHELF + (aoeminor))
-#define AOEMAJOR(sysminor) ((sysminor) / NPERSHELF)
-#define AOEMINOR(sysminor) ((sysminor) % NPERSHELF)
-#define WHITESPACE " \t\v\f\n"
+#define WHITESPACE " \t\v\f\n,"
 
 enum {
 	AOECMD_ATA,
@@ -75,105 +72,136 @@ enum {
 	DEVFL_UP = 1,	/* device is installed in system and ready for AoE->ATA commands */
 	DEVFL_TKILL = (1<<1),	/* flag for timer to know when to kill self */
 	DEVFL_EXT = (1<<2),	/* device accepts lba48 commands */
-	DEVFL_CLOSEWAIT = (1<<3), /* device is waiting for all closes to revalidate */
-	DEVFL_GDALLOC = (1<<4),	/* need to alloc gendisk */
+	DEVFL_GDALLOC = (1<<3),	/* need to alloc gendisk */
+	DEVFL_GD_NOW = (1<<4),	/* allocating gendisk */
 	DEVFL_KICKME = (1<<5),	/* slow polling network card catch */
 	DEVFL_NEWSIZE = (1<<6),	/* need to update dev size in block layer */
-
-	BUFFL_FAIL = 1,
+	DEVFL_FREEING = (1<<7),	/* set when device is being cleaned up */
+	DEVFL_FREED = (1<<8),	/* device has been cleaned up */
 };
 
 enum {
 	DEFAULTBCNT = 2 * 512,	/* 2 sectors */
-	NPERSHELF = 16,		/* number of slots per shelf address */
-	FREETAG = -1,
 	MIN_BUFS = 16,
-	NTARGETS = 8,
+	NTARGETS = 4,
 	NAOEIFS = 8,
-	NSKBPOOLMAX = 128,
+	NSKBPOOLMAX = 256,
+	NFACTIVE = 61,
 
 	TIMERTICK = HZ / 10,
-	MINTIMER = HZ >> 2,
-	MAXTIMER = HZ << 1,
-	HELPWAIT = 20,
+	RTTSCALE = 8,
+	RTTDSCALE = 3,
+	RTTAVG_INIT = USEC_PER_SEC / 4 << RTTSCALE,
+	RTTDEV_INIT = RTTAVG_INIT / 4,
+
+	HARD_SCORN_SECS = 10,	/* try another remote port after this */
+	MAX_TAINT = 1000,	/* cap on aoetgt taint */
 };
 
 struct buf {
-	struct list_head bufs;
-	ulong stime;	/* for disk stats */
-	ulong flags;
 	ulong nframesout;
-	ulong resid;
-	ulong bv_resid;
-	ulong bv_off;
-	sector_t sector;
 	struct bio *bio;
-	struct bio_vec *bv;
+	struct bvec_iter iter;
+	struct request *rq;
+};
+
+enum frame_flags {
+	FFL_PROBE = 1,
 };
 
 struct frame {
-	int tag;
+	struct list_head head;
+	u32 tag;
+	struct timeval sent;	/* high-res time packet was sent */
+	u32 sent_jiffs;		/* low-res jiffies-based sent time */
 	ulong waited;
+	ulong waited_total;
+	struct aoetgt *t;		/* parent target I belong to */
+	struct sk_buff *skb;		/* command skb freed on module exit */
+	struct sk_buff *r_skb;		/* response skb for async processing */
 	struct buf *buf;
-	char *bufaddr;
-	ulong bcnt;
-	sector_t lba;
-	struct sk_buff *skb;
+	struct bvec_iter iter;
+	char flags;
 };
 
 struct aoeif {
 	struct net_device *nd;
-	unsigned char lost;
-	unsigned char lostjumbo;
-	ushort maxbcnt;
+	ulong lost;
+	int bcnt;
 };
 
 struct aoetgt {
 	unsigned char addr[6];
-	ushort nframes;
-	struct frame *frames;
+	ushort nframes;		/* cap on frames to use */
+	struct aoedev *d;			/* parent device I belong to */
+	struct list_head ffree;			/* list of free frames */
 	struct aoeif ifs[NAOEIFS];
 	struct aoeif *ifp;	/* current aoeif in use */
-	ushort nout;
-	ushort maxout;
-	u16 lasttag;		/* last tag sent */
-	u16 useme;
-	ulong lastwadj;		/* last window adjustment */
+	ushort nout;		/* number of AoE commands outstanding */
+	ushort maxout;		/* current value for max outstanding */
+	ushort next_cwnd;	/* incr maxout after decrementing to zero */
+	ushort ssthresh;	/* slow start threshold */
+	ulong falloc;		/* number of allocated frames */
+	int taint;		/* how much we want to avoid this aoetgt */
+	int minbcnt;
 	int wpkts, rpkts;
-	int dataref;
+	char nout_probes;
 };
 
 struct aoedev {
 	struct aoedev *next;
 	ulong sysminor;
 	ulong aoemajor;
+	u32 rttavg;		/* scaled AoE round trip time average */
+	u32 rttdev;		/* scaled round trip time mean deviation */
 	u16 aoeminor;
 	u16 flags;
 	u16 nopen;		/* (bd_openers isn't available without sleeping) */
-	u16 rttavg;		/* round trip average of requests/responses */
-	u16 mintimer;
 	u16 fw_ver;		/* version of blade's firmware */
+	u16 lasttag;		/* last tag sent */
+	u16 useme;
+	ulong ref;
 	struct work_struct work;/* disk create work struct */
 	struct gendisk *gd;
-	struct request_queue blkq;
-	struct hd_geometry geo; 
+	struct dentry *debugfs;
+	struct request_queue *blkq;
+	struct hd_geometry geo;
 	sector_t ssize;
 	struct timer_list timer;
 	spinlock_t lock;
-	struct sk_buff_head sendq;
 	struct sk_buff_head skbpool;
 	mempool_t *bufpool;	/* for deadlock-free Buf allocation */
-	struct list_head bufq;	/* queue of bios to work on */
-	struct buf *inprocess;	/* the one we're currently working on */
-	struct aoetgt *targets[NTARGETS];
+	struct {		/* pointers to work in progress */
+		struct buf *buf;
+		struct bio *nxbio;
+		struct request *rq;
+	} ip;
+	ulong maxbcnt;
+	struct list_head factive[NFACTIVE];	/* hash of active frames */
+	struct list_head rexmitq; /* deferred retransmissions */
+	struct aoetgt **targets;
+	ulong ntargets;		/* number of allocated aoetgt pointers */
 	struct aoetgt **tgt;	/* target in use when working */
-	struct aoetgt **htgt;	/* target needing rexmit assistance */
+	ulong kicked;
+	char ident[512];
 };
 
+/* kthread tracking */
+struct ktstate {
+	struct completion rendez;
+	struct task_struct *task;
+	wait_queue_head_t *waitq;
+	int (*fn) (int);
+	char name[12];
+	spinlock_t *lock;
+	int id;
+	int active;
+};
 
 int aoeblk_init(void);
 void aoeblk_exit(void);
 void aoeblk_gdalloc(void *);
+void aoedisk_rm_debugfs(struct aoedev *d);
 void aoedisk_rm_sysfs(struct aoedev *d);
 
 int aoechr_init(void);
@@ -182,22 +210,31 @@ void aoechr_error(char *);
 
 void aoecmd_work(struct aoedev *d);
 void aoecmd_cfg(ushort aoemajor, unsigned char aoeminor);
-void aoecmd_ata_rsp(struct sk_buff *);
+struct sk_buff *aoecmd_ata_rsp(struct sk_buff *);
 void aoecmd_cfg_rsp(struct sk_buff *);
 void aoecmd_sleepwork(struct work_struct *);
+void aoecmd_wreset(struct aoetgt *t);
 void aoecmd_cleanslate(struct aoedev *);
+void aoecmd_exit(void);
+int aoecmd_init(void);
 struct sk_buff *aoecmd_ata_id(struct aoedev *);
+void aoe_freetframe(struct frame *);
+void aoe_flush_iocq(void);
+void aoe_flush_iocq_by_index(int);
+void aoe_end_request(struct aoedev *, struct request *, int);
+int aoe_ktstart(struct ktstate *k);
+void aoe_ktstop(struct ktstate *k);
 
 int aoedev_init(void);
 void aoedev_exit(void);
-struct aoedev *aoedev_by_aoeaddr(int maj, int min);
-struct aoedev *aoedev_by_sysminor_m(ulong sysminor);
+struct aoedev *aoedev_by_aoeaddr(ulong maj, int min, int do_alloc);
 void aoedev_downdev(struct aoedev *d);
 int aoedev_flush(const char __user *str, size_t size);
+void aoe_failbuf(struct aoedev *, struct buf *);
+void aoedev_put(struct aoedev *);
 
 int aoenet_init(void);
 void aoenet_exit(void);
 void aoenet_xmit(struct sk_buff_head *);
 int is_aoe_netif(struct net_device *ifp);
 int set_aoe_iflist(const char __user *str, size_t size);
-
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 2307a271bdc..dd73e1ff175 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -1,19 +1,35 @@
-/* Copyright (c) 2007 Coraid, Inc.  See COPYING for GPL terms. */
+/* Copyright (c) 2013 Coraid, Inc.  See COPYING for GPL terms. */
 /*
  * aoeblk.c
  * block device routines
  */
 
+#include <linux/kernel.h>
 #include <linux/hdreg.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/fs.h>
 #include <linux/ioctl.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
 #include <linux/genhd.h>
 #include <linux/netdevice.h>
+#include <linux/mutex.h>
+#include <linux/export.h>
+#include <linux/moduleparam.h>
+#include <linux/debugfs.h>
+#include <scsi/sg.h>
 #include "aoe.h"
 
+static DEFINE_MUTEX(aoeblk_mutex);
 static struct kmem_cache *buf_pool_cache;
+static struct dentry *aoe_debugfs_dir;
+
+/* GPFS needs a larger value than the default. */
+static int aoe_maxsectors;
+module_param(aoe_maxsectors, int, 0644);
+MODULE_PARM_DESC(aoe_maxsectors,
+	"When nonzero, set the maximum number of sectors per I/O request");
 
 static ssize_t aoedisk_show_state(struct device *dev,
 				  struct device_attribute *attr, char *page)
@@ -53,7 +69,7 @@ static ssize_t aoedisk_show_netif(struct device *dev,
 	nd = nds;
 	ne = nd + ARRAY_SIZE(nds);
 	t = d->targets;
-	te = t + NTARGETS;
+	te = t + d->ntargets;
 	for (; t < te && *t; t++) {
 		ifp = (*t)->ifs;
 		e = ifp + NAOEIFS;
@@ -85,6 +101,63 @@ static ssize_t aoedisk_show_fwver(struct device *dev,
 
 	return snprintf(page, PAGE_SIZE, "0x%04x\n", (unsigned int) d->fw_ver);
 }
+/* sysfs "payload": print the device's maxbcnt as an unsigned decimal line. */
+static ssize_t aoedisk_show_payload(struct device *dev,
+				    struct device_attribute *attr, char *page)
+{
+	const struct aoedev *aoe = dev_to_disk(dev)->private_data;
+
+	return snprintf(page, PAGE_SIZE, "%lu\n", aoe->maxbcnt);
+}
+
+/* debugfs show: dump RTT stats, counters and per-target state of s->private */
+static int aoedisk_debugfs_show(struct seq_file *s, void *ignored)
+{
+	struct aoedev *d = s->private;
+	struct aoetgt **t, **te;
+	struct aoeif *ifp, *ife;
+	unsigned long flags;
+	char c;
+
+	seq_printf(s, "rttavg: %d rttdev: %d\n",
+		d->rttavg >> RTTSCALE,
+		d->rttdev >> RTTDSCALE);
+	seq_printf(s, "nskbpool: %d\n", skb_queue_len(&d->skbpool));
+	seq_printf(s, "kicked: %ld\n", d->kicked);
+	seq_printf(s, "maxbcnt: %ld\n", d->maxbcnt);
+	seq_printf(s, "ref: %ld\n", d->ref);
+
+	spin_lock_irqsave(&d->lock, flags);
+	t = d->targets;
+	te = t + d->ntargets;
+	for (; t < te && *t; t++) {
+		c = '\t';	/* separator: tab first, then commas */
+		seq_printf(s, "falloc: %ld\n", (*t)->falloc);
+		seq_printf(s, "ffree: %p\n",
+			list_empty(&(*t)->ffree) ? NULL : (*t)->ffree.next);
+		seq_printf(s, "%pm:%d:%d:%d\n", (*t)->addr, (*t)->nout,
+			(*t)->maxout, (*t)->nframes);
+		seq_printf(s, "\tssthresh:%d\n", (*t)->ssthresh);
+		seq_printf(s, "\ttaint:%d\n", (*t)->taint);
+		seq_printf(s, "\tr:%d\n", (*t)->rpkts);
+		seq_printf(s, "\tw:%d\n", (*t)->wpkts);
+		ifp = (*t)->ifs;
+		ife = ifp + ARRAY_SIZE((*t)->ifs);
+		/* check the bound first: ifp may equal ife, one past ifs[] */
+		for (; ifp < ife && ifp->nd; ifp++) {
+			seq_printf(s, "%c%s", c, ifp->nd->name);
+			c = ',';
+		}
+		seq_puts(s, "\n");
+	}
+	spin_unlock_irqrestore(&d->lock, flags);
+	return 0;
+}
+
+static int aoe_debugfs_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, aoedisk_debugfs_show, inode->i_private);
+}
 
 static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL);
 static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL);
@@ -93,12 +166,14 @@ static struct device_attribute dev_attr_firmware_version = {
 	.attr = { .name = "firmware-version", .mode = S_IRUGO },
 	.show = aoedisk_show_fwver,
 };
+static DEVICE_ATTR(payload, S_IRUGO, aoedisk_show_payload, NULL);
 
 static struct attribute *aoe_attrs[] = {
 	&dev_attr_state.attr,
 	&dev_attr_mac.attr,
 	&dev_attr_netif.attr,
 	&dev_attr_firmware_version.attr,
+	&dev_attr_payload.attr,
 	NULL,
 };
 
@@ -106,6 +181,44 @@ static const struct attribute_group attr_group = {
 	.attrs = aoe_attrs,
 };
 
+static const struct file_operations aoe_debugfs_fops = {
+	.open = aoe_debugfs_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+/* Add d's debugfs file, named by disk_name past its first '/' (if any). */
+static void
+aoedisk_add_debugfs(struct aoedev *d)
+{
+	struct dentry *de;
+	char *name, *slash;
+
+	if (!aoe_debugfs_dir)
+		return;
+	name = d->gd->disk_name;
+	slash = strchr(name, '/');
+	if (slash)
+		name = slash + 1;
+	BUG_ON(*name == '\0');
+	de = debugfs_create_file(name, 0444, aoe_debugfs_dir, d,
+				 &aoe_debugfs_fops);
+	if (IS_ERR_OR_NULL(de)) {
+		pr_info("aoe: cannot create debugfs file for %s\n",
+			d->gd->disk_name);
+		return;
+	}
+	BUG_ON(d->debugfs);
+	d->debugfs = de;
+}
+void
+aoedisk_rm_debugfs(struct aoedev *d)
+{
+	debugfs_remove(d->debugfs);
+	d->debugfs = NULL;
+}
+
 static int
 aoedisk_add_sysfs(struct aoedev *d)
 {
@@ -123,17 +236,29 @@ aoeblk_open(struct block_device *bdev, fmode_t mode)
 	struct aoedev *d = bdev->bd_disk->private_data;
 	ulong flags;
 
+	if (!virt_addr_valid(d)) {
+		pr_crit("aoe: invalid device pointer in %s\n",
+			__func__);
+		WARN_ON(1);
+		return -ENODEV;
+	}
+	if (!(d->flags & DEVFL_UP) || d->flags & DEVFL_TKILL)
+		return -ENODEV;
+
+	mutex_lock(&aoeblk_mutex);
 	spin_lock_irqsave(&d->lock, flags);
-	if (d->flags & DEVFL_UP) {
+	if (d->flags & DEVFL_UP && !(d->flags & DEVFL_TKILL)) {
 		d->nopen++;
 		spin_unlock_irqrestore(&d->lock, flags);
+		mutex_unlock(&aoeblk_mutex);
 		return 0;
 	}
 	spin_unlock_irqrestore(&d->lock, flags);
+	mutex_unlock(&aoeblk_mutex);
 	return -ENODEV;
 }
 
-static int
+static void
 aoeblk_release(struct gendisk *disk, fmode_t mode)
 {
 	struct aoedev *d = disk->private_data;
@@ -144,78 +269,28 @@ aoeblk_release(struct gendisk *disk, fmode_t mode)
 	if (--d->nopen == 0) {
 		spin_unlock_irqrestore(&d->lock, flags);
 		aoecmd_cfg(d->aoemajor, d->aoeminor);
-		return 0;
+		return;
 	}
 	spin_unlock_irqrestore(&d->lock, flags);
-
-	return 0;
 }
 
-static int
-aoeblk_make_request(struct request_queue *q, struct bio *bio)
+static void
+aoeblk_request(struct request_queue *q)
 {
-	struct sk_buff_head queue;
 	struct aoedev *d;
-	struct buf *buf;
-	ulong flags;
-
-	blk_queue_bounce(q, &bio);
-
-	if (bio == NULL) {
-		printk(KERN_ERR "aoe: bio is NULL\n");
-		BUG();
-		return 0;
-	}
-	d = bio->bi_bdev->bd_disk->private_data;
-	if (d == NULL) {
-		printk(KERN_ERR "aoe: bd_disk->private_data is NULL\n");
-		BUG();
-		bio_endio(bio, -ENXIO);
-		return 0;
-	} else if (bio->bi_io_vec == NULL) {
-		printk(KERN_ERR "aoe: bi_io_vec is NULL\n");
-		BUG();
-		bio_endio(bio, -ENXIO);
-		return 0;
-	}
-	buf = mempool_alloc(d->bufpool, GFP_NOIO);
-	if (buf == NULL) {
-		printk(KERN_INFO "aoe: buf allocation failure\n");
-		bio_endio(bio, -ENOMEM);
-		return 0;
-	}
-	memset(buf, 0, sizeof(*buf));
-	INIT_LIST_HEAD(&buf->bufs);
-	buf->stime = jiffies;
-	buf->bio = bio;
-	buf->resid = bio->bi_size;
-	buf->sector = bio->bi_sector;
-	buf->bv = &bio->bi_io_vec[bio->bi_idx];
-	buf->bv_resid = buf->bv->bv_len;
-	WARN_ON(buf->bv_resid == 0);
-	buf->bv_off = buf->bv->bv_offset;
-
-	spin_lock_irqsave(&d->lock, flags);
+	struct request *rq;
 
+	d = q->queuedata;
 	if ((d->flags & DEVFL_UP) == 0) {
-		printk(KERN_INFO "aoe: device %ld.%d is not up\n",
+		pr_info_ratelimited("aoe: device %ld.%d is not up\n",
 			d->aoemajor, d->aoeminor);
-		spin_unlock_irqrestore(&d->lock, flags);
-		mempool_free(buf, d->bufpool);
-		bio_endio(bio, -ENXIO);
-		return 0;
+		while ((rq = blk_peek_request(q))) {
+			blk_start_request(rq);
+			aoe_end_request(d, rq, 1);
+		}
+		return;
 	}
-
-	list_add_tail(&buf->bufs, &d->bufq);
-
 	aoecmd_work(d);
-	__skb_queue_head_init(&queue);
-	skb_queue_splice_init(&d->sendq, &queue);
-
-	spin_unlock_irqrestore(&d->lock, flags);
-	aoenet_xmit(&queue);
-
-	return 0;
 }
 
 static int
@@ -234,9 +309,38 @@ aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static struct block_device_operations aoe_bdops = {
+/* Block ioctl: only HDIO_GET_IDENTITY is served; the rest get -ENOTTY. */
+static int
+aoeblk_ioctl(struct block_device *bdev, fmode_t mode, uint cmd, ulong arg)
+{
+	struct aoedev *d;
+
+	if (!arg)
+		return -EINVAL;
+	d = bdev->bd_disk->private_data;
+	if (!(d->flags & DEVFL_UP)) {
+		pr_err("aoe: disk not up\n");
+		return -ENODEV;
+	}
+	switch (cmd) {
+	case HDIO_GET_IDENTITY:
+		if (copy_to_user((void __user *) arg, &d->ident,
+				 sizeof(d->ident)))
+			return -EFAULT;
+		return 0;
+	case SG_IO:
+		/* udev calls scsi_id, which uses SG_IO, resulting in noise */
+		break;
+	default:
+		pr_info("aoe: unknown ioctl 0x%x\n", cmd);
+	}
+	return -ENOTTY;
+}
+
+static const struct block_device_operations aoe_bdops = {
 	.open = aoeblk_open,
 	.release = aoeblk_release,
+	.ioctl = aoeblk_ioctl,
 	.getgeo = aoeblk_getgeo,
 	.owner = THIS_MODULE,
 };
@@ -247,37 +351,67 @@ aoeblk_gdalloc(void *vp)
 {
 	struct aoedev *d = vp;
 	struct gendisk *gd;
+	mempool_t *mp;
+	struct request_queue *q;
+	enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, };
 	ulong flags;
+	int late = 0;
+
+	spin_lock_irqsave(&d->lock, flags);
+	if (d->flags & DEVFL_GDALLOC
+	&& !(d->flags & DEVFL_TKILL)
+	&& !(d->flags & DEVFL_GD_NOW))
+		d->flags |= DEVFL_GD_NOW;
+	else
+		late = 1;
+	spin_unlock_irqrestore(&d->lock, flags);
+	if (late)
+		return;
 
 	gd = alloc_disk(AOE_PARTITIONS);
 	if (gd == NULL) {
-		printk(KERN_ERR
-			"aoe: cannot allocate disk structure for %ld.%d\n",
+		pr_err("aoe: cannot allocate disk structure for %ld.%d\n",
 			d->aoemajor, d->aoeminor);
 		goto err;
 	}
 
-	d->bufpool = mempool_create_slab_pool(MIN_BUFS, buf_pool_cache);
-	if (d->bufpool == NULL) {
+	mp = mempool_create(MIN_BUFS, mempool_alloc_slab, mempool_free_slab,
+		buf_pool_cache);
+	if (mp == NULL) {
 		printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n",
 			d->aoemajor, d->aoeminor);
 		goto err_disk;
 	}
-
-	blk_queue_make_request(&d->blkq, aoeblk_make_request);
-	if (bdi_init(&d->blkq.backing_dev_info))
+	q = blk_init_queue(aoeblk_request, &d->lock);
+	if (q == NULL) {
+		pr_err("aoe: cannot allocate block queue for %ld.%d\n",
+			d->aoemajor, d->aoeminor);
 		goto err_mempool;
+	}
+
 	spin_lock_irqsave(&d->lock, flags);
+	WARN_ON(!(d->flags & DEVFL_GD_NOW));
+	WARN_ON(!(d->flags & DEVFL_GDALLOC));
+	WARN_ON(d->flags & DEVFL_TKILL);
+	WARN_ON(d->gd);
+	WARN_ON(d->flags & DEVFL_UP);
+	blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS);
+	q->backing_dev_info.name = "aoe";
+	q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE;
+	d->bufpool = mp;
+	d->blkq = gd->queue = q;
+	q->queuedata = d;
+	d->gd = gd;
+	if (aoe_maxsectors)
+		blk_queue_max_hw_sectors(q, aoe_maxsectors);
 	gd->major = AOE_MAJOR;
-	gd->first_minor = d->sysminor * AOE_PARTITIONS;
+	gd->first_minor = d->sysminor;
 	gd->fops = &aoe_bdops;
 	gd->private_data = d;
 	set_capacity(gd, d->ssize);
 	snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d",
 		d->aoemajor, d->aoeminor);
 
-	gd->queue = &d->blkq;
-	d->gd = gd;
 	d->flags &= ~DEVFL_GDALLOC;
 	d->flags |= DEVFL_UP;
 
@@ -285,21 +419,30 @@ aoeblk_gdalloc(void *vp)
 
 	add_disk(gd);
 	aoedisk_add_sysfs(d);
+	aoedisk_add_debugfs(d);
+
+	spin_lock_irqsave(&d->lock, flags);
+	WARN_ON(!(d->flags & DEVFL_GD_NOW));
+	d->flags &= ~DEVFL_GD_NOW;
+	spin_unlock_irqrestore(&d->lock, flags);
 	return;
 
 err_mempool:
-	mempool_destroy(d->bufpool);
+	mempool_destroy(mp);
 err_disk:
 	put_disk(gd);
 err:
 	spin_lock_irqsave(&d->lock, flags);
-	d->flags &= ~DEVFL_GDALLOC;
+	d->flags &= ~DEVFL_GD_NOW;
+	schedule_work(&d->work);
 	spin_unlock_irqrestore(&d->lock, flags);
 }
 
 void
 aoeblk_exit(void)
 {
+	debugfs_remove_recursive(aoe_debugfs_dir);
+	aoe_debugfs_dir = NULL;
 	kmem_cache_destroy(buf_pool_cache);
 }
 
@@ -311,7 +454,11 @@ aoeblk_init(void)
 					   0, 0, NULL);
 	if (buf_pool_cache == NULL)
 		return -ENOMEM;
-
+	aoe_debugfs_dir = debugfs_create_dir("aoe", NULL);
+	if (IS_ERR_OR_NULL(aoe_debugfs_dir)) {
+		pr_info("aoe: cannot create debugfs directory\n");
+		aoe_debugfs_dir = NULL;
+	}
 	return 0;
 }
 
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index 200efc4d2c1..ab41be625a5 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2007 Coraid, Inc.  See COPYING for GPL terms. */
+/* Copyright (c) 2012 Coraid, Inc.  See COPYING for GPL terms. */
 /*
  * aoechr.c
  * AoE character device driver
@@ -8,8 +8,10 @@
 #include <linux/blkdev.h>
 #include <linux/completion.h>
 #include <linux/delay.h>
-#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
 #include <linux/skbuff.h>
+#include <linux/export.h>
 #include "aoe.h"
 
 enum {
@@ -36,6 +38,12 @@ struct ErrMsg {
 	char *msg;
 };
 
+static DEFINE_MUTEX(aoechr_mutex);
+
+/* A ring buffer of error messages, to be read through
+ * "/dev/etherd/err".  When no messages are present,
+ * readers will block waiting for messages to appear.
+ */
 static struct ErrMsg emsgs[NMSG];
 static int emsgs_head_idx, emsgs_tail_idx;
 static struct completion emsgs_comp;
@@ -83,34 +91,34 @@ revalidate(const char __user *str, size_t size)
 	if (copy_from_user(buf, str, size))
 		return -EFAULT;
 
-	/* should be e%d.%d format */
 	n = sscanf(buf, "e%d.%d", &major, &minor);
 	if (n != 2) {
-		printk(KERN_ERR "aoe: invalid device specification\n");
+		pr_err("aoe: invalid device specification %s\n", buf);
 		return -EINVAL;
 	}
-	d = aoedev_by_aoeaddr(major, minor);
+	d = aoedev_by_aoeaddr(major, minor, 0);
 	if (!d)
 		return -EINVAL;
 	spin_lock_irqsave(&d->lock, flags);
 	aoecmd_cleanslate(d);
+	aoecmd_cfg(major, minor);
 loop:
 	skb = aoecmd_ata_id(d);
 	spin_unlock_irqrestore(&d->lock, flags);
 	/* try again if we are able to sleep a bit,
 	 * otherwise give up this revalidation
 	 */
-	if (!skb && !msleep_interruptible(200)) {
+	if (!skb && !msleep_interruptible(250)) {
 		spin_lock_irqsave(&d->lock, flags);
 		goto loop;
 	}
+	aoedev_put(d);
 	if (skb) {
 		struct sk_buff_head queue;
 		__skb_queue_head_init(&queue);
 		__skb_queue_tail(&queue, skb);
 		aoenet_xmit(&queue);
 	}
-	aoecmd_cfg(major, minor);
 	return 0;
 }
 
@@ -131,13 +139,12 @@ bail:		spin_unlock_irqrestore(&emsgs_lock, flags);
 		return;
 	}
 
-	mp = kmalloc(n, GFP_ATOMIC);
+	mp = kmemdup(msg, n, GFP_ATOMIC);
 	if (mp == NULL) {
 		printk(KERN_ERR "aoe: allocation failure, len=%ld\n", n);
 		goto bail;
 	}
 
-	memcpy(mp, msg, n);
 	em->msg = mp;
 	em->flags |= EMFL_VALID;
 	em->len = n;
@@ -171,6 +178,7 @@ aoechr_write(struct file *filp, const char __user *buf, size_t cnt, loff_t *offp
 		break;
 	case MINOR_FLUSH:
 		ret = aoedev_flush(buf, cnt);
+		break;
 	}
 	if (ret == 0)
 		ret = cnt;
@@ -182,16 +190,16 @@ aoechr_open(struct inode *inode, struct file *filp)
 {
 	int n, i;
 
-	lock_kernel();
+	mutex_lock(&aoechr_mutex);
 	n = iminor(inode);
 	filp->private_data = (void *) (unsigned long) n;
 
 	for (i = 0; i < ARRAY_SIZE(chardevs); ++i)
 		if (chardevs[i].minor == n) {
-			unlock_kernel();
+			mutex_unlock(&aoechr_mutex);
 			return 0;
 		}
-	unlock_kernel();
+	mutex_unlock(&aoechr_mutex);
 	return -EINVAL;
 }
 
@@ -264,15 +272,21 @@ static const struct file_operations aoe_fops = {
 	.open = aoechr_open,
 	.release = aoechr_rel,
 	.owner = THIS_MODULE,
+	.llseek = noop_llseek,
 };
 
+static char *aoe_devnode(struct device *dev, umode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "etherd/%s", dev_name(dev));
+}
+
 int __init
 aoechr_init(void)
 {
 	int n, i;
 
 	n = register_chrdev(AOE_MAJOR, "aoechr", &aoe_fops);
-	if (n < 0) { 
+	if (n < 0) {
 		printk(KERN_ERR "aoe: can't register char device\n");
 		return n;
 	}
@@ -283,6 +297,8 @@ aoechr_init(void)
 		unregister_chrdev(AOE_MAJOR, "aoechr");
 		return PTR_ERR(aoe_class);
 	}
+	aoe_class->devnode = aoe_devnode;
+
 	for (i = 0; i < ARRAY_SIZE(chardevs); ++i)
 		device_create(aoe_class, NULL,
 			      MKDEV(AOE_MAJOR, chardevs[i].minor), NULL,
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 31693bc2444..422b7d84f68 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1,60 +1,113 @@
-/* Copyright (c) 2007 Coraid, Inc.  See COPYING for GPL terms. */
+/* Copyright (c) 2013 Coraid, Inc.  See COPYING for GPL terms. */
 /*
  * aoecmd.c
  * Filesystem request handling methods
  */
 
 #include <linux/ata.h>
+#include <linux/slab.h>
 #include <linux/hdreg.h>
 #include <linux/blkdev.h>
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
 #include <linux/genhd.h>
 #include <linux/moduleparam.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
 #include <net/net_namespace.h>
 #include <asm/unaligned.h>
+#include <linux/uio.h>
 #include "aoe.h"
 
+#define MAXIOC (8192)	/* default meant to avoid most soft lockups */
+
+static void ktcomplete(struct frame *, struct sk_buff *);
+static int count_targets(struct aoedev *d, int *untainted);
+
+static struct buf *nextbuf(struct aoedev *);
+
 static int aoe_deadsecs = 60 * 3;
 module_param(aoe_deadsecs, int, 0644);
 MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");
 
-static int aoe_maxout = 16;
+static int aoe_maxout = 64;
 module_param(aoe_maxout, int, 0644);
 MODULE_PARM_DESC(aoe_maxout,
 	"Only aoe_maxout outstanding packets for every MAC on eX.Y.");
 
+/* The number of online cpus during module initialization gives us a
+ * convenient heuristic cap on the parallelism used for ktio threads
+ * doing I/O completion.  It is not important that the cap equal the
+ * actual number of running CPUs at any given time, but because of CPU
+ * hotplug, we take care to use ncpus instead of using
+ * num_online_cpus() after module initialization.
+ */
+static int ncpus;
+
+/* mutex used for synchronization while spawning threads */
+static DEFINE_MUTEX(ktio_spawn_lock);
+
+static wait_queue_head_t *ktiowq;
+static struct ktstate *kts;
+
+/* io completion queue */
+struct iocq_ktio {
+	struct list_head head;
+	spinlock_t lock;
+};
+static struct iocq_ktio *iocq;
+
+static struct page *empty_page;
+
 static struct sk_buff *
 new_skb(ulong len)
 {
 	struct sk_buff *skb;
 
-	skb = alloc_skb(len, GFP_ATOMIC);
+	skb = alloc_skb(len + MAX_HEADER, GFP_ATOMIC);
 	if (skb) {
+		skb_reserve(skb, MAX_HEADER);
 		skb_reset_mac_header(skb);
 		skb_reset_network_header(skb);
 		skb->protocol = __constant_htons(ETH_P_AOE);
-		skb->priority = 0;
-		skb->next = skb->prev = NULL;
-
-		/* tell the network layer not to perform IP checksums
-		 * or to get the NIC to do it
-		 */
-		skb->ip_summed = CHECKSUM_NONE;
+		skb_checksum_none_assert(skb);
 	}
 	return skb;
 }
 
 static struct frame *
-getframe(struct aoetgt *t, int tag)
+getframe_deferred(struct aoedev *d, u32 tag)
+{
+	struct list_head *head, *pos, *nx;
+	struct frame *f;
+
+	head = &d->rexmitq;
+	list_for_each_safe(pos, nx, head) {
+		f = list_entry(pos, struct frame, head);
+		if (f->tag == tag) {
+			list_del(pos);
+			return f;
+		}
+	}
+	return NULL;
+}
+
+static struct frame *
+getframe(struct aoedev *d, u32 tag)
 {
-	struct frame *f, *e;
+	struct frame *f;
+	struct list_head *head, *pos, *nx;
+	u32 n;
 
-	f = t->frames;
-	e = f + t->nframes;
-	for (; f<e; f++)
-		if (f->tag == tag)
+	n = tag % NFACTIVE;
+	head = &d->factive[n];
+	list_for_each_safe(pos, nx, head) {
+		f = list_entry(pos, struct frame, head);
+		if (f->tag == tag) {
+			list_del(pos);
 			return f;
+		}
+	}
 	return NULL;
 }
 
@@ -64,18 +117,18 @@ getframe(struct aoetgt *t, int tag)
  * This driver reserves tag -1 to mean "unused frame."
  */
 static int
-newtag(struct aoetgt *t)
+newtag(struct aoedev *d)
 {
 	register ulong n;
 
 	n = jiffies & 0xffff;
-	return n |= (++t->lasttag & 0x7fff) << 16;
+	return n |= (++d->lasttag & 0x7fff) << 16;
 }
 
-static int
+static u32
 aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h)
 {
-	u32 host_tag = newtag(t);
+	u32 host_tag = newtag(d);
 
 	memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
 	memcpy(h->dst, t->addr, sizeof h->dst);
@@ -100,16 +153,18 @@ put_lba(struct aoe_atahdr *ah, sector_t lba)
 	ah->lba5 = lba >>= 8;
 }
 
-static void
+static struct aoeif *
 ifrotate(struct aoetgt *t)
 {
-	t->ifp++;
-	if (t->ifp >= &t->ifs[NAOEIFS] || t->ifp->nd == NULL)
-		t->ifp = t->ifs;
-	if (t->ifp->nd == NULL) {
-		printk(KERN_INFO "aoe: no interface to rotate to\n");
-		BUG();
-	}
+	struct aoeif *ifp;
+
+	ifp = t->ifp;
+	ifp++;
+	if (ifp >= &t->ifs[NAOEIFS] || ifp->nd == NULL)
+		ifp = t->ifs;
+	if (ifp->nd == NULL)
+		return NULL;
+	return t->ifp = ifp;
 }
 
 static void
@@ -134,134 +189,172 @@ skb_pool_get(struct aoedev *d)
 	return NULL;
 }
 
-/* freeframe is where we do our load balancing so it's a little hairy. */
+void
+aoe_freetframe(struct frame *f)
+{
+	struct aoetgt *t;
+
+	t = f->t;
+	f->buf = NULL;
+	memset(&f->iter, 0, sizeof(f->iter));
+	f->r_skb = NULL;
+	f->flags = 0;
+	list_add(&f->head, &t->ffree);
+}
+
 static struct frame *
-freeframe(struct aoedev *d)
+newtframe(struct aoedev *d, struct aoetgt *t)
 {
-	struct frame *f, *e, *rf;
-	struct aoetgt **t;
+	struct frame *f;
 	struct sk_buff *skb;
+	struct list_head *pos;
+
+	if (list_empty(&t->ffree)) {
+		if (t->falloc >= NSKBPOOLMAX*2)
+			return NULL;
+		f = kcalloc(1, sizeof(*f), GFP_ATOMIC);
+		if (f == NULL)
+			return NULL;
+		t->falloc++;
+		f->t = t;
+	} else {
+		pos = t->ffree.next;
+		list_del(pos);
+		f = list_entry(pos, struct frame, head);
+	}
+
+	skb = f->skb;
+	if (skb == NULL) {
+		f->skb = skb = new_skb(ETH_ZLEN);
+		if (!skb) {
+bail:			aoe_freetframe(f);
+			return NULL;
+		}
+	}
+
+	if (atomic_read(&skb_shinfo(skb)->dataref) != 1) {
+		skb = skb_pool_get(d);
+		if (skb == NULL)
+			goto bail;
+		skb_pool_put(d, f->skb);
+		f->skb = skb;
+	}
+
+	skb->truesize -= skb->data_len;
+	skb_shinfo(skb)->nr_frags = skb->data_len = 0;
+	skb_trim(skb, 0);
+	return f;
+}
 
-	if (d->targets[0] == NULL) {	/* shouldn't happen, but I'm paranoid */
+static struct frame *
+newframe(struct aoedev *d)
+{
+	struct frame *f;
+	struct aoetgt *t, **tt;
+	int totout = 0;
+	int use_tainted;
+	int has_untainted;
+
+	if (!d->targets || !d->targets[0]) {
 		printk(KERN_ERR "aoe: NULL TARGETS!\n");
 		return NULL;
 	}
-	t = d->tgt;
-	t++;
-	if (t >= &d->targets[NTARGETS] || !*t)
-		t = d->targets;
-	for (;;) {
-		if ((*t)->nout < (*t)->maxout
-		&& t != d->htgt
-		&& (*t)->ifp->nd) {
-			rf = NULL;
-			f = (*t)->frames;
-			e = f + (*t)->nframes;
-			for (; f < e; f++) {
-				if (f->tag != FREETAG)
-					continue;
-				skb = f->skb;
-				if (!skb
-				&& !(f->skb = skb = new_skb(ETH_ZLEN)))
-					continue;
-				if (atomic_read(&skb_shinfo(skb)->dataref)
-					!= 1) {
-					if (!rf)
-						rf = f;
-					continue;
-				}
-gotone:				skb_shinfo(skb)->nr_frags = skb->data_len = 0;
-				skb_trim(skb, 0);
-				d->tgt = t;
-				ifrotate(*t);
+	tt = d->tgt;	/* last used target */
+	for (use_tainted = 0, has_untainted = 0;;) {
+		tt++;
+		if (tt >= &d->targets[d->ntargets] || !*tt)
+			tt = d->targets;
+		t = *tt;
+		if (!t->taint) {
+			has_untainted = 1;
+			totout += t->nout;
+		}
+		if (t->nout < t->maxout
+		&& (use_tainted || !t->taint)
+		&& t->ifp->nd) {
+			f = newtframe(d, t);
+			if (f) {
+				ifrotate(t);
+				d->tgt = tt;
 				return f;
 			}
-			/* Work can be done, but the network layer is
-			   holding our precious packets.  Try to grab
-			   one from the pool. */
-			f = rf;
-			if (f == NULL) {	/* more paranoia */
-				printk(KERN_ERR
-					"aoe: freeframe: %s.\n",
-					"unexpected null rf");
-				d->flags |= DEVFL_KICKME;
-				return NULL;
-			}
-			skb = skb_pool_get(d);
-			if (skb) {
-				skb_pool_put(d, f->skb);
-				f->skb = skb;
-				goto gotone;
-			}
-			(*t)->dataref++;
-			if ((*t)->nout == 0)
-				d->flags |= DEVFL_KICKME;
 		}
-		if (t == d->tgt)	/* we've looped and found nada */
-			break;
-		t++;
-		if (t >= &d->targets[NTARGETS] || !*t)
-			t = d->targets;
+		if (tt == d->tgt) {	/* we've looped and found nada */
+			if (!use_tainted && !has_untainted)
+				use_tainted = 1;
+			else
+				break;
+		}
+	}
+	if (totout == 0) {
+		d->kicked++;
+		d->flags |= DEVFL_KICKME;
 	}
 	return NULL;
 }
 
-static int
-aoecmd_ata_rw(struct aoedev *d)
+static void
+skb_fillup(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter)
 {
-	struct frame *f;
+	int frag = 0;
+	struct bio_vec bv;
+
+	__bio_for_each_segment(bv, bio, iter, iter)
+		skb_fill_page_desc(skb, frag++, bv.bv_page,
+				   bv.bv_offset, bv.bv_len);
+}
+
+static void
+fhash(struct frame *f)
+{
+	struct aoedev *d = f->t->d;
+	u32 n;
+
+	n = f->tag % NFACTIVE;
+	list_add_tail(&f->head, &d->factive[n]);
+}
+
+static void
+ata_rw_frameinit(struct frame *f)
+{
+	struct aoetgt *t;
 	struct aoe_hdr *h;
 	struct aoe_atahdr *ah;
-	struct buf *buf;
-	struct bio_vec *bv;
-	struct aoetgt *t;
 	struct sk_buff *skb;
-	ulong bcnt;
 	char writebit, extbit;
 
-	writebit = 0x10;
-	extbit = 0x4;
-
-	f = freeframe(d);
-	if (f == NULL)
-		return 0;
-	t = *d->tgt;
-	buf = d->inprocess;
-	bv = buf->bv;
-	bcnt = t->ifp->maxbcnt;
-	if (bcnt == 0)
-		bcnt = DEFAULTBCNT;
-	if (bcnt > buf->bv_resid)
-		bcnt = buf->bv_resid;
-	/* initialize the headers & frame */
 	skb = f->skb;
 	h = (struct aoe_hdr *) skb_mac_header(skb);
-	ah = (struct aoe_atahdr *) (h+1);
-	skb_put(skb, sizeof *h + sizeof *ah);
+	ah = (struct aoe_atahdr *) (h + 1);
+	skb_put(skb, sizeof(*h) + sizeof(*ah));
 	memset(h, 0, skb->len);
-	f->tag = aoehdr_atainit(d, t, h);
+
+	writebit = 0x10;
+	extbit = 0x4;
+
+	t = f->t;
+	f->tag = aoehdr_atainit(t->d, t, h);
+	fhash(f);
 	t->nout++;
 	f->waited = 0;
-	f->buf = buf;
-	f->bufaddr = page_address(bv->bv_page) + buf->bv_off;
-	f->bcnt = bcnt;
-	f->lba = buf->sector;
+	f->waited_total = 0;
 
 	/* set up ata header */
-	ah->scnt = bcnt >> 9;
-	put_lba(ah, buf->sector);
-	if (d->flags & DEVFL_EXT) {
+	ah->scnt = f->iter.bi_size >> 9;
+	put_lba(ah, f->iter.bi_sector);
+	if (t->d->flags & DEVFL_EXT) {
 		ah->aflags |= AOEAFL_EXT;
 	} else {
 		extbit = 0;
 		ah->lba3 &= 0x0f;
 		ah->lba3 |= 0xe0;	/* LBA bit + obsolete 0xa0 */
 	}
-	if (bio_data_dir(buf->bio) == WRITE) {
-		skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt);
+	if (f->buf && bio_data_dir(f->buf->bio) == WRITE) {
+		skb_fillup(skb, f->buf->bio, f->iter);
 		ah->aflags |= AOEAFL_WRITE;
-		skb->len += bcnt;
-		skb->data_len = bcnt;
+		skb->len += f->iter.bi_size;
+		skb->data_len = f->iter.bi_size;
+		skb->truesize += f->iter.bi_size;
 		t->wpkts++;
 	} else {
 		t->rpkts++;
@@ -269,26 +362,48 @@ aoecmd_ata_rw(struct aoedev *d)
 	}
 
 	ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit;
+	skb->dev = t->ifp->nd;
+}
+
+static int
+aoecmd_ata_rw(struct aoedev *d)
+{
+	struct frame *f;
+	struct buf *buf;
+	struct sk_buff *skb;
+	struct sk_buff_head queue;
+
+	buf = nextbuf(d);
+	if (buf == NULL)
+		return 0;
+	f = newframe(d);
+	if (f == NULL)
+		return 0;
+
+	/* initialize the headers & frame */
+	f->buf = buf;
+	f->iter = buf->iter;
+	f->iter.bi_size = min_t(unsigned long,
+				d->maxbcnt ?: DEFAULTBCNT,
+				f->iter.bi_size);
+	bio_advance_iter(buf->bio, &buf->iter, f->iter.bi_size);
+
+	if (!buf->iter.bi_size)
+		d->ip.buf = NULL;
 
 	/* mark all tracking fields and load out */
 	buf->nframesout += 1;
-	buf->bv_off += bcnt;
-	buf->bv_resid -= bcnt;
-	buf->resid -= bcnt;
-	buf->sector += bcnt >> 9;
-	if (buf->resid == 0) {
-		d->inprocess = NULL;
-	} else if (buf->bv_resid == 0) {
-		buf->bv = ++bv;
-		buf->bv_resid = bv->bv_len;
-		WARN_ON(buf->bv_resid == 0);
-		buf->bv_off = bv->bv_offset;
-	}
 
-	skb->dev = t->ifp->nd;
-	skb = skb_clone(skb, GFP_ATOMIC);
-	if (skb)
-		__skb_queue_tail(&d->sendq, skb);
+	ata_rw_frameinit(f);
+
+	skb = skb_clone(f->skb, GFP_ATOMIC);
+	if (skb) {
+		do_gettimeofday(&f->sent);
+		f->sent_jiffs = (u32) jiffies;
+		__skb_queue_head_init(&queue);
+		__skb_queue_tail(&queue, skb);
+		aoenet_xmit(&queue);
+	}
 	return 1;
 }
 
@@ -303,8 +418,8 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu
 	struct sk_buff *skb;
 	struct net_device *ifp;
 
-	read_lock(&dev_base_lock);
-	for_each_netdev(&init_net, ifp) {
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, ifp) {
 		dev_hold(ifp);
 		if (!is_aoe_netif(ifp))
 			goto cont;
@@ -331,64 +446,90 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu
 cont:
 		dev_put(ifp);
 	}
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 }
 
 static void
-resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
+resend(struct aoedev *d, struct frame *f)
 {
 	struct sk_buff *skb;
+	struct sk_buff_head queue;
 	struct aoe_hdr *h;
-	struct aoe_atahdr *ah;
+	struct aoetgt *t;
 	char buf[128];
 	u32 n;
 
-	ifrotate(t);
-	n = newtag(t);
+	t = f->t;
+	n = newtag(d);
 	skb = f->skb;
+	if (ifrotate(t) == NULL) {
+		/* probably can't happen, but set it up to fail anyway */
+		pr_info("aoe: resend: no interfaces to rotate to.\n");
+		ktcomplete(f, NULL);
+		return;
+	}
 	h = (struct aoe_hdr *) skb_mac_header(skb);
-	ah = (struct aoe_atahdr *) (h+1);
 
-	snprintf(buf, sizeof buf,
-		"%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n",
-		"retransmit", d->aoemajor, d->aoeminor, f->tag, jiffies, n,
-		h->src, h->dst, t->nout);
-	aoechr_error(buf);
+	if (!(f->flags & FFL_PROBE)) {
+		snprintf(buf, sizeof(buf),
+			"%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n",
+			"retransmit", d->aoemajor, d->aoeminor,
+			f->tag, jiffies, n,
+			h->src, h->dst, t->nout);
+		aoechr_error(buf);
+	}
 
 	f->tag = n;
+	fhash(f);
 	h->tag = cpu_to_be32(n);
 	memcpy(h->dst, t->addr, sizeof h->dst);
 	memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
 
-	switch (ah->cmdstat) {
-	default:
-		break;
-	case ATA_CMD_PIO_READ:
-	case ATA_CMD_PIO_READ_EXT:
-	case ATA_CMD_PIO_WRITE:
-	case ATA_CMD_PIO_WRITE_EXT:
-		put_lba(ah, f->lba);
-
-		n = f->bcnt;
-		if (n > DEFAULTBCNT)
-			n = DEFAULTBCNT;
-		ah->scnt = n >> 9;
-		if (ah->aflags & AOEAFL_WRITE) {
-			skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
-				offset_in_page(f->bufaddr), n);
-			skb->len = sizeof *h + sizeof *ah + n;
-			skb->data_len = n;
-		}
-	}
 	skb->dev = t->ifp->nd;
 	skb = skb_clone(skb, GFP_ATOMIC);
 	if (skb == NULL)
 		return;
-	__skb_queue_tail(&d->sendq, skb);
+	do_gettimeofday(&f->sent);
+	f->sent_jiffs = (u32) jiffies;
+	__skb_queue_head_init(&queue);
+	__skb_queue_tail(&queue, skb);
+	aoenet_xmit(&queue);
+}
+
+static int
+tsince_hr(struct frame *f)
+{
+	struct timeval now;
+	int n;
+
+	do_gettimeofday(&now);
+	n = now.tv_usec - f->sent.tv_usec;
+	n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC;
+
+	if (n < 0)
+		n = -n;
+
+	/* For relatively long periods, use jiffies to avoid
+	 * discrepancies caused by updates to the system time.
+	 *
+	 * On a system with HZ of 1000, 32 bits is over 49 days
+	 * worth of jiffies, or over 71 minutes worth of usecs.
+	 *
+	 * Jiffies overflow is handled by subtraction of unsigned ints:
+	 * (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe
+	 * $3 = 4
+	 * (gdb)
+	 */
+	if (n > USEC_PER_SEC / 4) {
+		n = ((u32) jiffies) - f->sent_jiffs;
+		n *= USEC_PER_SEC / HZ;
+	}
+
+	return n;
 }
 
 static int
-tsince(int tag)
+tsince(u32 tag)
 {
 	int n;
 
@@ -396,7 +537,7 @@ tsince(int tag)
 	n -= tag & 0xffff;
 	if (n < 0)
 		n += 1<<16;
-	return n;
+	return jiffies_to_usecs(n + 1);
 }
 
 static struct aoeif *
@@ -412,195 +553,399 @@ getif(struct aoetgt *t, struct net_device *nd)
 	return NULL;
 }
 
-static struct aoeif *
-addif(struct aoetgt *t, struct net_device *nd)
-{
-	struct aoeif *p;
-
-	p = getif(t, NULL);
-	if (!p)
-		return NULL;
-	p->nd = nd;
-	p->maxbcnt = DEFAULTBCNT;
-	p->lost = 0;
-	p->lostjumbo = 0;
-	return p;
-}
-
 static void
 ejectif(struct aoetgt *t, struct aoeif *ifp)
 {
 	struct aoeif *e;
+	struct net_device *nd;
 	ulong n;
 
+	nd = ifp->nd;
 	e = t->ifs + NAOEIFS - 1;
 	n = (e - ifp) * sizeof *ifp;
 	memmove(ifp, ifp+1, n);
 	e->nd = NULL;
+	dev_put(nd);
 }
 
-static int
-sthtith(struct aoedev *d)
+static struct frame *
+reassign_frame(struct frame *f)
 {
-	struct frame *f, *e, *nf;
+	struct frame *nf;
 	struct sk_buff *skb;
-	struct aoetgt *ht = *d->htgt;
 
-	f = ht->frames;
-	e = f + ht->nframes;
-	for (; f < e; f++) {
-		if (f->tag == FREETAG)
+	nf = newframe(f->t->d);
+	if (!nf)
+		return NULL;
+	if (nf->t == f->t) {
+		aoe_freetframe(nf);
+		return NULL;
+	}
+
+	skb = nf->skb;
+	nf->skb = f->skb;
+	nf->buf = f->buf;
+	nf->iter = f->iter;
+	nf->waited = 0;
+	nf->waited_total = f->waited_total;
+	nf->sent = f->sent;
+	nf->sent_jiffs = f->sent_jiffs;
+	f->skb = skb;
+
+	return nf;
+}
+
+static void
+probe(struct aoetgt *t)
+{
+	struct aoedev *d;
+	struct frame *f;
+	struct sk_buff *skb;
+	struct sk_buff_head queue;
+	size_t n, m;
+	int frag;
+
+	d = t->d;
+	f = newtframe(d, t);
+	if (!f) {
+		pr_err("%s %pm for e%ld.%d: %s\n",
+			"aoe: cannot probe remote address",
+			t->addr,
+			(long) d->aoemajor, d->aoeminor,
+			"no frame available");
+		return;
+	}
+	f->flags |= FFL_PROBE;
+	ifrotate(t);
+	f->iter.bi_size = t->d->maxbcnt ? t->d->maxbcnt : DEFAULTBCNT;
+	ata_rw_frameinit(f);
+	skb = f->skb;
+	for (frag = 0, n = f->iter.bi_size; n > 0; ++frag, n -= m) {
+		if (n < PAGE_SIZE)
+			m = n;
+		else
+			m = PAGE_SIZE;
+		skb_fill_page_desc(skb, frag, empty_page, 0, m);
+	}
+	skb->len += f->iter.bi_size;
+	skb->data_len = f->iter.bi_size;
+	skb->truesize += f->iter.bi_size;
+
+	skb = skb_clone(f->skb, GFP_ATOMIC);
+	if (skb) {
+		do_gettimeofday(&f->sent);
+		f->sent_jiffs = (u32) jiffies;
+		__skb_queue_head_init(&queue);
+		__skb_queue_tail(&queue, skb);
+		aoenet_xmit(&queue);
+	}
+}
+
+static long
+rto(struct aoedev *d)
+{
+	long t;
+
+	t = 2 * d->rttavg >> RTTSCALE;
+	t += 8 * d->rttdev >> RTTDSCALE;
+	if (t == 0)
+		t = 1;
+
+	return t;
+}
+
+static void
+rexmit_deferred(struct aoedev *d)
+{
+	struct aoetgt *t;
+	struct frame *f;
+	struct frame *nf;
+	struct list_head *pos, *nx, *head;
+	int since;
+	int untainted;
+
+	count_targets(d, &untainted);
+
+	head = &d->rexmitq;
+	list_for_each_safe(pos, nx, head) {
+		f = list_entry(pos, struct frame, head);
+		t = f->t;
+		if (t->taint) {
+			if (!(f->flags & FFL_PROBE)) {
+				nf = reassign_frame(f);
+				if (nf) {
+					if (t->nout_probes == 0
+					&& untainted > 0) {
+						probe(t);
+						t->nout_probes++;
+					}
+					list_replace(&f->head, &nf->head);
+					pos = &nf->head;
+					aoe_freetframe(f);
+					f = nf;
+					t = f->t;
+				}
+			} else if (untainted < 1) {
+				/* don't probe w/o other untainted aoetgts */
+				goto stop_probe;
+			} else if (tsince_hr(f) < t->taint * rto(d)) {
+				/* reprobe slowly when taint is high */
+				continue;
+			}
+		} else if (f->flags & FFL_PROBE) {
+stop_probe:		/* don't probe untainted aoetgts */
+			list_del(pos);
+			aoe_freetframe(f);
+			/* leaving d->kicked, because this is routine */
+			f->t->d->flags |= DEVFL_KICKME;
 			continue;
-		nf = freeframe(d);
-		if (!nf)
-			return 0;
-		skb = nf->skb;
-		*nf = *f;
-		f->skb = skb;
-		f->tag = FREETAG;
-		nf->waited = 0;
-		ht->nout--;
-		(*d->tgt)->nout++;
-		resend(d, *d->tgt, nf);
+		}
+		if (t->nout >= t->maxout)
+			continue;
+		list_del(pos);
+		t->nout++;
+		if (f->flags & FFL_PROBE)
+			t->nout_probes++;
+		since = tsince_hr(f);
+		f->waited += since;
+		f->waited_total += since;
+		resend(d, f);
 	}
-	/* he's clean, he's useless.  take away his interfaces */
-	memset(ht->ifs, 0, sizeof ht->ifs);
-	d->htgt = NULL;
-	return 1;
 }
 
-static inline unsigned char
-ata_scnt(unsigned char *packet) {
-	struct aoe_hdr *h;
-	struct aoe_atahdr *ah;
+/* An aoetgt accumulates demerits quickly, and successful
+ * probing redeems the aoetgt slowly.
+ */
+static void
+scorn(struct aoetgt *t)
+{
+	int n;
 
-	h = (struct aoe_hdr *) packet;
-	ah = (struct aoe_atahdr *) (h+1);
-	return ah->scnt;
+	n = t->taint++;
+	t->taint += t->taint * 2;
+	if (n > t->taint)
+		t->taint = n;
+	if (t->taint > MAX_TAINT)
+		t->taint = MAX_TAINT;
+}
+
+static int
+count_targets(struct aoedev *d, int *untainted)
+{
+	int i, good;
+
+	for (i = good = 0; i < d->ntargets && d->targets[i]; ++i)
+		if (d->targets[i]->taint == 0)
+			good++;
+
+	if (untainted)
+		*untainted = good;
+	return i;
 }
 
 static void
 rexmit_timer(ulong vp)
 {
-	struct sk_buff_head queue;
 	struct aoedev *d;
-	struct aoetgt *t, **tt, **te;
+	struct aoetgt *t;
 	struct aoeif *ifp;
-	struct frame *f, *e;
+	struct frame *f;
+	struct list_head *head, *pos, *nx;
+	LIST_HEAD(flist);
 	register long timeout;
 	ulong flags, n;
+	int i;
+	int utgts;	/* number of aoetgt descriptors (not slots) */
+	int since;
 
 	d = (struct aoedev *) vp;
 
-	/* timeout is always ~150% of the moving average */
-	timeout = d->rttavg;
-	timeout += timeout >> 1;
-
 	spin_lock_irqsave(&d->lock, flags);
 
+	/* timeout based on observed timings and variations */
+	timeout = rto(d);
+
+	utgts = count_targets(d, NULL);
+
 	if (d->flags & DEVFL_TKILL) {
 		spin_unlock_irqrestore(&d->lock, flags);
 		return;
 	}
-	tt = d->targets;
-	te = tt + NTARGETS;
-	for (; tt < te && *tt; tt++) {
-		t = *tt;
-		f = t->frames;
-		e = f + t->nframes;
-		for (; f < e; f++) {
-			if (f->tag == FREETAG
-			|| tsince(f->tag) < timeout)
-				continue;
-			n = f->waited += timeout;
-			n /= HZ;
-			if (n > aoe_deadsecs) {
-				/* waited too long.  device failure. */
-				aoedev_downdev(d);
-				break;
-			}
 
-			if (n > HELPWAIT /* see if another target can help */
-			&& (tt != d->targets || d->targets[1]))
-				d->htgt = tt;
+	/* collect all frames to rexmit into flist */
+	for (i = 0; i < NFACTIVE; i++) {
+		head = &d->factive[i];
+		list_for_each_safe(pos, nx, head) {
+			f = list_entry(pos, struct frame, head);
+			if (tsince_hr(f) < timeout)
+				break;	/* end of expired frames */
+			/* move to flist for later processing */
+			list_move_tail(pos, &flist);
+		}
+	}
 
-			if (t->nout == t->maxout) {
-				if (t->maxout > 1)
-					t->maxout--;
-				t->lastwadj = jiffies;
-			}
+	/* process expired frames */
+	while (!list_empty(&flist)) {
+		pos = flist.next;
+		f = list_entry(pos, struct frame, head);
+		since = tsince_hr(f);
+		n = f->waited_total + since;
+		n /= USEC_PER_SEC;
+		if (aoe_deadsecs
+		&& n > aoe_deadsecs
+		&& !(f->flags & FFL_PROBE)) {
+			/* Waited too long.  Device failure.
+			 * Hang all frames on first hash bucket for downdev
+			 * to clean up.
+			 */
+			list_splice(&flist, &d->factive[0]);
+			aoedev_downdev(d);
+			goto out;
+		}
 
+		t = f->t;
+		n = f->waited + since;
+		n /= USEC_PER_SEC;
+		if (aoe_deadsecs && utgts > 0
+		&& (n > aoe_deadsecs / utgts || n > HARD_SCORN_SECS))
+			scorn(t); /* avoid this target */
+
+		if (t->maxout != 1) {
+			t->ssthresh = t->maxout / 2;
+			t->maxout = 1;
+		}
+
+		if (f->flags & FFL_PROBE) {
+			t->nout_probes--;
+		} else {
 			ifp = getif(t, f->skb->dev);
 			if (ifp && ++ifp->lost > (t->nframes << 1)
 			&& (ifp != t->ifs || t->ifs[1].nd)) {
 				ejectif(t, ifp);
 				ifp = NULL;
 			}
-
-			if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512
-			&& ifp && ++ifp->lostjumbo > (t->nframes << 1)
-			&& ifp->maxbcnt != DEFAULTBCNT) {
-				printk(KERN_INFO
-					"aoe: e%ld.%d: "
-					"too many lost jumbo on "
-					"%s:%pm - "
-					"falling back to %d frames.\n",
-					d->aoemajor, d->aoeminor,
-					ifp->nd->name, t->addr,
-					DEFAULTBCNT);
-				ifp->maxbcnt = 0;
-			}
-			resend(d, t, f);
 		}
-
-		/* window check */
-		if (t->nout == t->maxout
-		&& t->maxout < t->nframes
-		&& (jiffies - t->lastwadj)/HZ > 10) {
-			t->maxout++;
-			t->lastwadj = jiffies;
-		}
-	}
-
-	if (!skb_queue_empty(&d->sendq)) {
-		n = d->rttavg <<= 1;
-		if (n > MAXTIMER)
-			d->rttavg = MAXTIMER;
+		list_move_tail(pos, &d->rexmitq);
+		t->nout--;
 	}
+	rexmit_deferred(d);
 
-	if (d->flags & DEVFL_KICKME || d->htgt) {
+out:
+	if ((d->flags & DEVFL_KICKME) && d->blkq) {
 		d->flags &= ~DEVFL_KICKME;
-		aoecmd_work(d);
+		d->blkq->request_fn(d->blkq);
 	}
 
-	__skb_queue_head_init(&queue);
-	skb_queue_splice_init(&d->sendq, &queue);
-
 	d->timer.expires = jiffies + TIMERTICK;
 	add_timer(&d->timer);
 
 	spin_unlock_irqrestore(&d->lock, flags);
+}
 
-	aoenet_xmit(&queue);
+static unsigned long
+rqbiocnt(struct request *r)
+{
+	struct bio *bio;
+	unsigned long n = 0;
+
+	__rq_for_each_bio(bio, r)
+		n++;
+	return n;
+}
+
+/* This can be removed if we are certain that no users of the block
+ * layer will ever use zero-count pages in bios.  Otherwise we have to
+ * protect against the put_page sometimes done by the network layer.
+ *
+ * See http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for
+ * discussion.
+ *
+ * We cannot use get_page in the workaround, because it insists on a
+ * positive page count as a precondition.  So we use _count directly.
+ */
+static void
+bio_pageinc(struct bio *bio)
+{
+	struct bio_vec bv;
+	struct page *page;
+	struct bvec_iter iter;
+
+	bio_for_each_segment(bv, bio, iter) {
+		/* Non-zero page count for non-head members of
+		 * compound pages is no longer allowed by the kernel.
+		 */
+		page = compound_head(bv.bv_page);
+		atomic_inc(&page->_count);
+	}
+}
+
+static void
+bio_pagedec(struct bio *bio)
+{
+	struct page *page;
+	struct bio_vec bv;
+	struct bvec_iter iter;
+
+	bio_for_each_segment(bv, bio, iter) {
+		page = compound_head(bv.bv_page);
+		atomic_dec(&page->_count);
+	}
+}
+
+static void
+bufinit(struct buf *buf, struct request *rq, struct bio *bio)
+{
+	memset(buf, 0, sizeof(*buf));
+	buf->rq = rq;
+	buf->bio = bio;
+	buf->iter = bio->bi_iter;
+	bio_pageinc(bio);
+}
+
+static struct buf *
+nextbuf(struct aoedev *d)
+{
+	struct request *rq;
+	struct request_queue *q;
+	struct buf *buf;
+	struct bio *bio;
+
+	q = d->blkq;
+	if (q == NULL)
+		return NULL;	/* initializing */
+	if (d->ip.buf)
+		return d->ip.buf;
+	rq = d->ip.rq;
+	if (rq == NULL) {
+		rq = blk_peek_request(q);
+		if (rq == NULL)
+			return NULL;
+		blk_start_request(rq);
+		d->ip.rq = rq;
+		d->ip.nxbio = rq->bio;
+		rq->special = (void *) rqbiocnt(rq);
+	}
+	buf = mempool_alloc(d->bufpool, GFP_ATOMIC);
+	if (buf == NULL) {
+		pr_err("aoe: nextbuf: unable to mempool_alloc!\n");
+		return NULL;
+	}
+	bio = d->ip.nxbio;
+	bufinit(buf, rq, bio);
+	bio = bio->bi_next;
+	d->ip.nxbio = bio;
+	if (bio == NULL)
+		d->ip.rq = NULL;
+	return d->ip.buf = buf;
 }
 
 /* enters with d->lock held */
 void
 aoecmd_work(struct aoedev *d)
 {
-	struct buf *buf;
-loop:
-	if (d->htgt && !sthtith(d))
-		return;
-	if (d->inprocess == NULL) {
-		if (list_empty(&d->bufq))
-			return;
-		buf = container_of(d->bufq.next, struct buf, bufs);
-		list_del(d->bufq.next);
-		d->inprocess = buf;
-	}
-	if (aoecmd_ata_rw(d))
-		goto loop;
+	rexmit_deferred(d);
+	while (aoecmd_ata_rw(d))
+		;
 }
 
 /* this function performs work that has been deferred until sleeping is OK
@@ -609,28 +954,36 @@ void
 aoecmd_sleepwork(struct work_struct *work)
 {
 	struct aoedev *d = container_of(work, struct aoedev, work);
+	struct block_device *bd;
+	u64 ssize;
 
 	if (d->flags & DEVFL_GDALLOC)
 		aoeblk_gdalloc(d);
 
 	if (d->flags & DEVFL_NEWSIZE) {
-		struct block_device *bd;
-		unsigned long flags;
-		u64 ssize;
-
 		ssize = get_capacity(d->gd);
 		bd = bdget_disk(d->gd, 0);
-
 		if (bd) {
 			mutex_lock(&bd->bd_inode->i_mutex);
 			i_size_write(bd->bd_inode, (loff_t)ssize<<9);
 			mutex_unlock(&bd->bd_inode->i_mutex);
 			bdput(bd);
 		}
-		spin_lock_irqsave(&d->lock, flags);
+		spin_lock_irq(&d->lock);
 		d->flags |= DEVFL_UP;
 		d->flags &= ~DEVFL_NEWSIZE;
-		spin_unlock_irqrestore(&d->lock, flags);
+		spin_unlock_irq(&d->lock);
+	}
+}
+
+static void
+ata_ident_fixstring(u16 *id, int ns)
+{
+	u16 s;
+
+	while (ns-- > 0) {
+		s = *id;
+		*id++ = s >> 8 | s << 8;
 	}
 }
 
@@ -669,6 +1022,11 @@ ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
 		d->geo.sectors = get_unaligned_le16(&id[56 << 1]);
 	}
 
+	ata_ident_fixstring((u16 *) &id[10<<1], 10);	/* serial */
+	ata_ident_fixstring((u16 *) &id[23<<1], 4);	/* firmware */
+	ata_ident_fixstring((u16 *) &id[27<<1], 20);	/* model */
+	memcpy(d->ident, id, sizeof(d->ident));
+
 	if (d->ssize != ssize)
 		printk(KERN_INFO
 			"aoe: %pm e%ld.%d v%04x has %llu sectors\n",
@@ -688,26 +1046,28 @@ ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
 }
 
 static void
-calc_rttavg(struct aoedev *d, int rtt)
+calc_rttavg(struct aoedev *d, struct aoetgt *t, int rtt)
 {
 	register long n;
 
 	n = rtt;
-	if (n < 0) {
-		n = -rtt;
-		if (n < MINTIMER)
-			n = MINTIMER;
-		else if (n > MAXTIMER)
-			n = MAXTIMER;
-		d->mintimer += (n - d->mintimer) >> 1;
-	} else if (n < d->mintimer)
-		n = d->mintimer;
-	else if (n > MAXTIMER)
-		n = MAXTIMER;
-
-	/* g == .25; cf. Congestion Avoidance and Control, Jacobson & Karels; 1988 */
-	n -= d->rttavg;
-	d->rttavg += n >> 2;
+
+	/* cf. Congestion Avoidance and Control, Jacobson & Karels, 1988 */
+	n -= d->rttavg >> RTTSCALE;
+	d->rttavg += n;
+	if (n < 0)
+		n = -n;
+	n -= d->rttdev >> RTTDSCALE;
+	d->rttdev += n;
+
+	if (!t || t->maxout >= t->nframes)
+		return;
+	if (t->maxout < t->ssthresh)
+		t->maxout += 1;
+	else if (t->nout == t->maxout && t->next_cwnd-- == 0) {
+		t->maxout += 1;
+		t->next_cwnd = t->maxout;
+	}
 }
 
 static struct aoetgt *
@@ -716,166 +1076,352 @@ gettgt(struct aoedev *d, char *addr)
 	struct aoetgt **t, **e;
 
 	t = d->targets;
-	e = t + NTARGETS;
+	e = t + d->ntargets;
 	for (; t < e && *t; t++)
 		if (memcmp((*t)->addr, addr, sizeof((*t)->addr)) == 0)
 			return *t;
 	return NULL;
 }
 
-static inline void
-diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector)
+static void
+bvcpy(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter, long cnt)
 {
-	unsigned long n_sect = bio->bi_size >> 9;
-	const int rw = bio_data_dir(bio);
-	struct hd_struct *part;
-	int cpu;
+	int soff = 0;
+	struct bio_vec bv;
 
-	cpu = part_stat_lock();
-	part = disk_map_sector_rcu(disk, sector);
+	iter.bi_size = cnt;
 
-	part_stat_inc(cpu, part, ios[rw]);
-	part_stat_add(cpu, part, ticks[rw], duration);
-	part_stat_add(cpu, part, sectors[rw], n_sect);
-	part_stat_add(cpu, part, io_ticks, duration);
-
-	part_stat_unlock();
+	__bio_for_each_segment(bv, bio, iter, iter) {
+		char *p = page_address(bv.bv_page) + bv.bv_offset;
+		skb_copy_bits(skb, soff, p, bv.bv_len);
+		soff += bv.bv_len;
+	}
 }
 
 void
-aoecmd_ata_rsp(struct sk_buff *skb)
+aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
+{
+	struct bio *bio;
+	int bok;
+	struct request_queue *q;
+
+	q = d->blkq;
+	if (rq == d->ip.rq)
+		d->ip.rq = NULL;
+	do {
+		bio = rq->bio;
+		bok = !fastfail && test_bit(BIO_UPTODATE, &bio->bi_flags);
+	} while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_iter.bi_size));
+
+	/* cf. http://lkml.org/lkml/2006/10/31/28 */
+	if (!fastfail)
+		__blk_run_queue(q);
+}
+
+static void
+aoe_end_buf(struct aoedev *d, struct buf *buf)
+{
+	struct request *rq;
+	unsigned long n;
+
+	if (buf == d->ip.buf)
+		d->ip.buf = NULL;
+	rq = buf->rq;
+	bio_pagedec(buf->bio);
+	mempool_free(buf, d->bufpool);
+	n = (unsigned long) rq->special;
+	rq->special = (void *) --n;
+	if (n == 0)
+		aoe_end_request(d, rq, 0);
+}
+
+static void
+ktiocomplete(struct frame *f)
 {
-	struct sk_buff_head queue;
-	struct aoedev *d;
 	struct aoe_hdr *hin, *hout;
 	struct aoe_atahdr *ahin, *ahout;
-	struct frame *f;
 	struct buf *buf;
+	struct sk_buff *skb;
 	struct aoetgt *t;
 	struct aoeif *ifp;
-	register long n;
-	ulong flags;
-	char ebuf[128];
-	u16 aoemajor;
+	struct aoedev *d;
+	long n;
+	int untainted;
 
-	hin = (struct aoe_hdr *) skb_mac_header(skb);
-	aoemajor = get_unaligned_be16(&hin->major);
-	d = aoedev_by_aoeaddr(aoemajor, hin->minor);
-	if (d == NULL) {
-		snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
-			"for unknown device %d.%d\n",
-			 aoemajor, hin->minor);
-		aoechr_error(ebuf);
+	if (f == NULL)
 		return;
-	}
-
-	spin_lock_irqsave(&d->lock, flags);
 
-	n = get_unaligned_be32(&hin->tag);
-	t = gettgt(d, hin->src);
-	if (t == NULL) {
-		printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n",
-			d->aoemajor, d->aoeminor, hin->src);
-		spin_unlock_irqrestore(&d->lock, flags);
-		return;
-	}
-	f = getframe(t, n);
-	if (f == NULL) {
-		calc_rttavg(d, -tsince(n));
-		spin_unlock_irqrestore(&d->lock, flags);
-		snprintf(ebuf, sizeof ebuf,
-			"%15s e%d.%d    tag=%08x@%08lx\n",
-			"unexpected rsp",
-			get_unaligned_be16(&hin->major),
-			hin->minor,
-			get_unaligned_be32(&hin->tag),
-			jiffies);
-		aoechr_error(ebuf);
-		return;
-	}
-
-	calc_rttavg(d, tsince(f->tag));
+	t = f->t;
+	d = t->d;
+	skb = f->r_skb;
+	buf = f->buf;
+	if (f->flags & FFL_PROBE)
+		goto out;
+	if (!skb)		/* just fail the buf. */
+		goto noskb;
 
-	ahin = (struct aoe_atahdr *) (hin+1);
 	hout = (struct aoe_hdr *) skb_mac_header(f->skb);
 	ahout = (struct aoe_atahdr *) (hout+1);
-	buf = f->buf;
 
+	hin = (struct aoe_hdr *) skb->data;
+	skb_pull(skb, sizeof(*hin));
+	ahin = (struct aoe_atahdr *) skb->data;
+	skb_pull(skb, sizeof(*ahin));
 	if (ahin->cmdstat & 0xa9) {	/* these bits cleared on success */
-		printk(KERN_ERR
-			"aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
+		pr_err("aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
 			ahout->cmdstat, ahin->cmdstat,
 			d->aoemajor, d->aoeminor);
-		if (buf)
-			buf->flags |= BUFFL_FAIL;
-	} else {
-		if (d->htgt && t == *d->htgt) /* I'll help myself, thank you. */
-			d->htgt = NULL;
-		n = ahout->scnt << 9;
-		switch (ahout->cmdstat) {
-		case ATA_CMD_PIO_READ:
-		case ATA_CMD_PIO_READ_EXT:
-			if (skb->len - sizeof *hin - sizeof *ahin < n) {
-				printk(KERN_ERR
-					"aoe: %s.  skb->len=%d need=%ld\n",
-					"runt data size in read", skb->len, n);
-				/* fail frame f?  just returning will rexmit. */
-				spin_unlock_irqrestore(&d->lock, flags);
-				return;
-			}
-			memcpy(f->bufaddr, ahin+1, n);
-		case ATA_CMD_PIO_WRITE:
-		case ATA_CMD_PIO_WRITE_EXT:
-			ifp = getif(t, skb->dev);
-			if (ifp) {
-				ifp->lost = 0;
-				if (n > DEFAULTBCNT)
-					ifp->lostjumbo = 0;
-			}
-			if (f->bcnt -= n) {
-				f->lba += n >> 9;
-				f->bufaddr += n;
-				resend(d, t, f);
-				goto xmit;
-			}
+noskb:		if (buf)
+			clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+		goto out;
+	}
+
+	n = ahout->scnt << 9;
+	switch (ahout->cmdstat) {
+	case ATA_CMD_PIO_READ:
+	case ATA_CMD_PIO_READ_EXT:
+		if (skb->len < n) {
+			pr_err("%s e%ld.%d.  skb->len=%d need=%ld\n",
+				"aoe: runt data size in read from",
+				(long) d->aoemajor, d->aoeminor,
+			       skb->len, n);
+			clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
 			break;
-		case ATA_CMD_ID_ATA:
-			if (skb->len - sizeof *hin - sizeof *ahin < 512) {
-				printk(KERN_INFO
-					"aoe: runt data size in ataid.  skb->len=%d\n",
-					skb->len);
-				spin_unlock_irqrestore(&d->lock, flags);
-				return;
-			}
-			ataid_complete(d, t, (char *) (ahin+1));
+		}
+		if (n > f->iter.bi_size) {
+			pr_err_ratelimited("%s e%ld.%d.  bytes=%ld need=%u\n",
+				"aoe: too-large data size in read from",
+				(long) d->aoemajor, d->aoeminor,
+				n, f->iter.bi_size);
+			clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
 			break;
-		default:
-			printk(KERN_INFO
-				"aoe: unrecognized ata command %2.2Xh for %d.%d\n",
-				ahout->cmdstat,
-				get_unaligned_be16(&hin->major),
-				hin->minor);
 		}
+		bvcpy(skb, f->buf->bio, f->iter, n);
+	case ATA_CMD_PIO_WRITE:
+	case ATA_CMD_PIO_WRITE_EXT:
+		spin_lock_irq(&d->lock);
+		ifp = getif(t, skb->dev);
+		if (ifp)
+			ifp->lost = 0;
+		spin_unlock_irq(&d->lock);
+		break;
+	case ATA_CMD_ID_ATA:
+		if (skb->len < 512) {
+			pr_info("%s e%ld.%d.  skb->len=%d need=512\n",
+				"aoe: runt data size in ataid from",
+				(long) d->aoemajor, d->aoeminor,
+				skb->len);
+			break;
+		}
+		if (skb_linearize(skb))
+			break;
+		spin_lock_irq(&d->lock);
+		ataid_complete(d, t, skb->data);
+		spin_unlock_irq(&d->lock);
+		break;
+	default:
+		pr_info("aoe: unrecognized ata command %2.2Xh for %d.%d\n",
+			ahout->cmdstat,
+			be16_to_cpu(get_unaligned(&hin->major)),
+			hin->minor);
+	}
+out:
+	spin_lock_irq(&d->lock);
+	if (t->taint > 0
+	&& --t->taint > 0
+	&& t->nout_probes == 0) {
+		count_targets(d, &untainted);
+		if (untainted > 0) {
+			probe(t);
+			t->nout_probes++;
+		}
+	}
+
+	aoe_freetframe(f);
+
+	if (buf && --buf->nframesout == 0 && buf->iter.bi_size == 0)
+		aoe_end_buf(d, buf);
+
+	spin_unlock_irq(&d->lock);
+	aoedev_put(d);
+	dev_kfree_skb(skb);
+}
+
+/* Enters and returns with iocq[id].lock held; drops it around each frame.
+ * Returns 0 once the queue is empty, 1 after MAXIOC frames (more may remain).
+ */
+static int
+ktio(int id)
+{
+	struct frame *f;
+	struct list_head *pos;
+	int i;
+	int actual_id;
+
+	for (i = 0; ; ++i) {
+		if (i == MAXIOC)
+			return 1;
+		if (list_empty(&iocq[id].head))
+			return 0;
+		pos = iocq[id].head.next;
+		list_del(pos);
+		f = list_entry(pos, struct frame, head);
+		/* ktiocomplete() releases f and drops the device reference,
+		 * so look up the device's own queue index before the call. */
+		actual_id = f->t->d->aoeminor % ncpus;
+		spin_unlock_irq(&iocq[id].lock);
+		ktiocomplete(f);
+		/* Figure out if extra threads are required. */
+		if (!kts[actual_id].active) {
+			BUG_ON(id != 0);
+			mutex_lock(&ktio_spawn_lock);
+			if (!kts[actual_id].active
+				&& aoe_ktstart(&kts[actual_id]) == 0)
+				kts[actual_id].active = 1;
+			mutex_unlock(&ktio_spawn_lock);
+		}
+		spin_lock_irq(&iocq[id].lock);
 	}
+}
 
-	if (buf && --buf->nframesout == 0 && buf->resid == 0) {
-		diskstats(d->gd, buf->bio, jiffies - buf->stime, buf->sector);
-		n = (buf->flags & BUFFL_FAIL) ? -EIO : 0;
-		bio_endio(buf->bio, n);
-		mempool_free(buf, d->bufpool);
+static int
+kthread(void *vp)	/* body of each thread started by aoe_ktstart() */
+{
+	struct ktstate *k;
+	DECLARE_WAITQUEUE(wait, current);
+	int more;
+
+	k = vp;			/* the struct ktstate passed to kthread_run() */
+	current->flags |= PF_NOFREEZE;
+	set_user_nice(current, -10);
+	complete(&k->rendez);	/* tell spawner we're running */
+	do {
+		spin_lock_irq(k->lock);
+		more = k->fn(k->id);	/* fn is called with k->lock held */
+		if (!more) {
+			add_wait_queue(k->waitq, &wait); /* queued while k->lock is still held */
+			__set_current_state(TASK_INTERRUPTIBLE);
+		}
+		spin_unlock_irq(k->lock);
+		if (!more) {
+			schedule();	/* no work reported: sleep until woken */
+			remove_wait_queue(k->waitq, &wait);
+		} else
+			cond_resched();	/* fn reported more work: loop again */
+	} while (!kthread_should_stop());
+	complete(&k->rendez);	/* tell spawner we're stopping */
+	return 0;
+}
+
+void
+aoe_ktstop(struct ktstate *k)
+{
+	kthread_stop(k->task);
+	wait_for_completion(&k->rendez);
+}
+
+int
+aoe_ktstart(struct ktstate *k)	/* spawn k's thread; 0 on success, -ENOMEM on failure */
+{
+	struct task_struct *task;
+
+	init_completion(&k->rendez);
+	task = kthread_run(kthread, k, "%s", k->name);
+	if (task == NULL || IS_ERR(task))
+		return -ENOMEM;	/* every spawn failure is reported as -ENOMEM */
+	k->task = task;
+	wait_for_completion(&k->rendez); /* allow kthread to start */
+	init_completion(&k->rendez);	/* for waiting for exit later */
+	return 0;
+}
+
+/* pass it off to kthreads for processing */
+static void
+ktcomplete(struct frame *f, struct sk_buff *skb)
+{
+	int id;
+	ulong flags;
+
+	f->r_skb = skb;
+	id = f->t->d->aoeminor % ncpus;
+	spin_lock_irqsave(&iocq[id].lock, flags);
+	if (!kts[id].active) {
+		spin_unlock_irqrestore(&iocq[id].lock, flags);
+		/* The thread with id has not been spawned yet,
+		 * so delegate the work to the main thread and
+		 * try spawning a new thread.
+		 */
+		id = 0;
+		spin_lock_irqsave(&iocq[id].lock, flags);
 	}
+	list_add_tail(&f->head, &iocq[id].head);
+	spin_unlock_irqrestore(&iocq[id].lock, flags);
+	wake_up(&ktiowq[id]);
+}
 
-	f->buf = NULL;
-	f->tag = FREETAG;
-	t->nout--;
+struct sk_buff *
+aoecmd_ata_rsp(struct sk_buff *skb)
+{
+	struct aoedev *d;
+	struct aoe_hdr *h;
+	struct frame *f;
+	u32 n;
+	ulong flags;
+	char ebuf[128];
+	u16 aoemajor;
+
+	h = (struct aoe_hdr *) skb->data;
+	aoemajor = be16_to_cpu(get_unaligned(&h->major));
+	d = aoedev_by_aoeaddr(aoemajor, h->minor, 0);
+	if (d == NULL) {
+		snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
+			"for unknown device %d.%d\n",
+			aoemajor, h->minor);
+		aoechr_error(ebuf);
+		return skb;
+	}
+
+	spin_lock_irqsave(&d->lock, flags);
 
+	n = be32_to_cpu(get_unaligned(&h->tag));
+	f = getframe(d, n);
+	if (f) {
+		calc_rttavg(d, f->t, tsince_hr(f));
+		f->t->nout--;
+		if (f->flags & FFL_PROBE)
+			f->t->nout_probes--;
+	} else {
+		f = getframe_deferred(d, n);
+		if (f) {
+			calc_rttavg(d, NULL, tsince_hr(f));
+		} else {
+			calc_rttavg(d, NULL, tsince(n));
+			spin_unlock_irqrestore(&d->lock, flags);
+			aoedev_put(d);
+			snprintf(ebuf, sizeof(ebuf),
+				 "%15s e%d.%d    tag=%08x@%08lx s=%pm d=%pm\n",
+				 "unexpected rsp",
+				 get_unaligned_be16(&h->major),
+				 h->minor,
+				 get_unaligned_be32(&h->tag),
+				 jiffies,
+				 h->src,
+				 h->dst);
+			aoechr_error(ebuf);
+			return skb;
+		}
+	}
 	aoecmd_work(d);
-xmit:
-	__skb_queue_head_init(&queue);
-	skb_queue_splice_init(&d->sendq, &queue);
 
 	spin_unlock_irqrestore(&d->lock, flags);
-	aoenet_xmit(&queue);
+
+	ktcomplete(f, skb);
+
+	/*
+	 * Note here that we do not perform an aoedev_put, as we are
+	 * leaving this reference for the ktio to release.
+	 */
+	return NULL;
 }
 
 void
@@ -887,7 +1433,7 @@ aoecmd_cfg(ushort aoemajor, unsigned char aoeminor)
 	aoecmd_cfg_pkts(aoemajor, aoeminor, &queue);
 	aoenet_xmit(&queue);
 }
- 
+
 struct sk_buff *
 aoecmd_ata_id(struct aoedev *d)
 {
@@ -897,7 +1443,7 @@ aoecmd_ata_id(struct aoedev *d)
 	struct sk_buff *skb;
 	struct aoetgt *t;
 
-	f = freeframe(d);
+	f = newframe(d);
 	if (f == NULL)
 		return NULL;
 
@@ -910,8 +1456,10 @@ aoecmd_ata_id(struct aoedev *d)
 	skb_put(skb, sizeof *h + sizeof *ah);
 	memset(h, 0, skb->len);
 	f->tag = aoehdr_atainit(d, t, h);
+	fhash(f);
 	t->nout++;
 	f->waited = 0;
+	f->waited_total = 0;
 
 	/* set up ata header */
 	ah->scnt = 1;
@@ -920,46 +1468,120 @@ aoecmd_ata_id(struct aoedev *d)
 
 	skb->dev = t->ifp->nd;
 
-	d->rttavg = MAXTIMER;
+	d->rttavg = RTTAVG_INIT;
+	d->rttdev = RTTDEV_INIT;
 	d->timer.function = rexmit_timer;
 
-	return skb_clone(skb, GFP_ATOMIC);
+	skb = skb_clone(skb, GFP_ATOMIC);
+	if (skb) {
+		do_gettimeofday(&f->sent);
+		f->sent_jiffs = (u32) jiffies;
+	}
+
+	return skb;
 }
- 
+
+static struct aoetgt **
+grow_targets(struct aoedev *d)	/* double d->targets; NULL if allocation fails */
+{
+	ulong oldn, newn;
+	struct aoetgt **tt;
+
+	oldn = d->ntargets;
+	newn = oldn * 2;
+	tt = kcalloc(newn, sizeof(*d->targets), GFP_ATOMIC);	/* zeroed, so new slots are NULL */
+	if (!tt)
+		return NULL;	/* d->targets is left untouched */
+	memmove(tt, d->targets, sizeof(*d->targets) * oldn);
+	d->tgt = tt + (d->tgt - d->targets);	/* same index, new array */
+	kfree(d->targets);
+	d->targets = tt;
+	d->ntargets = newn;
+
+	return &d->targets[oldn];	/* first slot of the newly added half */
+}
+
 static struct aoetgt *
 addtgt(struct aoedev *d, char *addr, ulong nframes)
 {
 	struct aoetgt *t, **tt, **te;
-	struct frame *f, *e;
 
 	tt = d->targets;
-	te = tt + NTARGETS;
+	te = tt + d->ntargets;
 	for (; tt < te && *tt; tt++)
 		;
 
 	if (tt == te) {
-		printk(KERN_INFO
-			"aoe: device addtgt failure; too many targets\n");
-		return NULL;
+		tt = grow_targets(d);
+		if (!tt)
+			goto nomem;
 	}
-	t = kcalloc(1, sizeof *t, GFP_ATOMIC);
-	f = kcalloc(nframes, sizeof *f, GFP_ATOMIC);
-	if (!t || !f) {
-		kfree(f);
-		kfree(t);
-		printk(KERN_INFO "aoe: cannot allocate memory to add target\n");
-		return NULL;
-	}
-
+	t = kzalloc(sizeof(*t), GFP_ATOMIC);
+	if (!t)
+		goto nomem;
 	t->nframes = nframes;
-	t->frames = f;
-	e = f + nframes;
-	for (; f < e; f++)
-		f->tag = FREETAG;
+	t->d = d;
 	memcpy(t->addr, addr, sizeof t->addr);
 	t->ifp = t->ifs;
-	t->maxout = t->nframes;
+	aoecmd_wreset(t);
+	t->maxout = t->nframes / 2;
+	INIT_LIST_HEAD(&t->ffree);
 	return *tt = t;
+
+ nomem:
+	pr_info("aoe: cannot allocate memory to add target\n");
+	return NULL;
+}
+
+static void
+setdbcnt(struct aoedev *d)
+{
+	struct aoetgt **t, **e;
+	int bcnt = 0;
+
+	t = d->targets;
+	e = t + d->ntargets;
+	for (; t < e && *t; t++)
+		if (bcnt == 0 || bcnt > (*t)->minbcnt)
+			bcnt = (*t)->minbcnt;
+	if (bcnt != d->maxbcnt) {
+		d->maxbcnt = bcnt;
+		pr_info("aoe: e%ld.%d: setting %d byte data frames\n",
+			d->aoemajor, d->aoeminor, bcnt);
+	}
+}
+
+static void
+setifbcnt(struct aoetgt *t, struct net_device *nd, int bcnt)
+{
+	struct aoedev *d;
+	struct aoeif *p, *e;
+	int minbcnt;
+
+	d = t->d;
+	minbcnt = bcnt;
+	p = t->ifs;
+	e = p + NAOEIFS;
+	for (; p < e; p++) {
+		if (p->nd == NULL)
+			break;		/* end of the valid interfaces */
+		if (p->nd == nd) {
+			p->bcnt = bcnt;	/* we're updating */
+			nd = NULL;
+		} else if (minbcnt > p->bcnt)
+			minbcnt = p->bcnt; /* find the min interface */
+	}
+	if (nd) {
+		if (p == e) {
+			pr_err("aoe: device setifbcnt failure; too many interfaces.\n");
+			return;
+		}
+		dev_hold(nd);
+		p->nd = nd;
+		p->bcnt = bcnt;
+	}
+	t->minbcnt = minbcnt;
+	setdbcnt(d);
 }
 
 void
@@ -969,11 +1591,12 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
 	struct aoe_hdr *h;
 	struct aoe_cfghdr *ch;
 	struct aoetgt *t;
-	struct aoeif *ifp;
-	ulong flags, sysminor, aoemajor;
+	ulong flags, aoemajor;
 	struct sk_buff *sl;
+	struct sk_buff_head queue;
 	u16 n;
 
+	sl = NULL;
 	h = (struct aoe_hdr *) skb_mac_header(skb);
 	ch = (struct aoe_cfghdr *) (h+1);
 
@@ -987,10 +1610,13 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
 			"Check shelf dip switches.\n");
 		return;
 	}
-
-	sysminor = SYSMINOR(aoemajor, h->minor);
-	if (sysminor * AOE_PARTITIONS + AOE_PARTITIONS > MINORMASK) {
-		printk(KERN_INFO "aoe: e%ld.%d: minor number too large\n",
+	if (aoemajor == 0xffff) {
+		pr_info("aoe: e%ld.%d: broadcast shelf number invalid\n",
+			aoemajor, (int) h->minor);
+		return;
+	}
+	if (h->minor == 0xff) {
+		pr_info("aoe: e%ld.%d: broadcast slot number invalid\n",
 			aoemajor, (int) h->minor);
 		return;
 	}
@@ -999,63 +1625,41 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
 	if (n > aoe_maxout)	/* keep it reasonable */
 		n = aoe_maxout;
 
-	d = aoedev_by_sysminor_m(sysminor);
+	d = aoedev_by_aoeaddr(aoemajor, h->minor, 1);
 	if (d == NULL) {
-		printk(KERN_INFO "aoe: device sysminor_m failure\n");
+		pr_info("aoe: device allocation failure\n");
 		return;
 	}
 
 	spin_lock_irqsave(&d->lock, flags);
 
 	t = gettgt(d, h->src);
-	if (!t) {
+	if (t) {
+		t->nframes = n;
+		if (n < t->maxout)
+			aoecmd_wreset(t);
+	} else {
 		t = addtgt(d, h->src, n);
-		if (!t) {
-			spin_unlock_irqrestore(&d->lock, flags);
-			return;
-		}
-	}
-	ifp = getif(t, skb->dev);
-	if (!ifp) {
-		ifp = addif(t, skb->dev);
-		if (!ifp) {
-			printk(KERN_INFO
-				"aoe: device addif failure; "
-				"too many interfaces?\n");
-			spin_unlock_irqrestore(&d->lock, flags);
-			return;
-		}
-	}
-	if (ifp->maxbcnt) {
-		n = ifp->nd->mtu;
-		n -= sizeof (struct aoe_hdr) + sizeof (struct aoe_atahdr);
-		n /= 512;
-		if (n > ch->scnt)
-			n = ch->scnt;
-		n = n ? n * 512 : DEFAULTBCNT;
-		if (n != ifp->maxbcnt) {
-			printk(KERN_INFO
-				"aoe: e%ld.%d: setting %d%s%s:%pm\n",
-				d->aoemajor, d->aoeminor, n,
-				" byte data frames on ", ifp->nd->name,
-				t->addr);
-			ifp->maxbcnt = n;
-		}
+		if (!t)
+			goto bail;
 	}
+	n = skb->dev->mtu;
+	n -= sizeof(struct aoe_hdr) + sizeof(struct aoe_atahdr);
+	n /= 512;
+	if (n > ch->scnt)
+		n = ch->scnt;
+	n = n ? n * 512 : DEFAULTBCNT;
+	setifbcnt(t, skb->dev, n);
 
 	/* don't change users' perspective */
-	if (d->nopen) {
-		spin_unlock_irqrestore(&d->lock, flags);
-		return;
+	if (d->nopen == 0) {
+		d->fw_ver = be16_to_cpu(ch->fwver);
+		sl = aoecmd_ata_id(d);
 	}
-	d->fw_ver = be16_to_cpu(ch->fwver);
-
-	sl = aoecmd_ata_id(d);
-
+bail:
 	spin_unlock_irqrestore(&d->lock, flags);
-
+	aoedev_put(d);
 	if (sl) {
-		struct sk_buff_head queue;
 		__skb_queue_head_init(&queue);
 		__skb_queue_tail(&queue, sl);
 		aoenet_xmit(&queue);
@@ -1063,23 +1667,160 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
 }
 
 void
+aoecmd_wreset(struct aoetgt *t)
+{
+	t->maxout = 1;
+	t->ssthresh = t->nframes / 2;
+	t->next_cwnd = t->nframes;
+}
+
+void
 aoecmd_cleanslate(struct aoedev *d)
 {
 	struct aoetgt **t, **te;
-	struct aoeif *p, *e;
 
-	d->mintimer = MINTIMER;
+	d->rttavg = RTTAVG_INIT;
+	d->rttdev = RTTDEV_INIT;
+	d->maxbcnt = 0;
 
 	t = d->targets;
-	te = t + NTARGETS;
-	for (; t < te && *t; t++) {
-		(*t)->maxout = (*t)->nframes;
-		p = (*t)->ifs;
-		e = p + NAOEIFS;
-		for (; p < e; p++) {
-			p->lostjumbo = 0;
-			p->lost = 0;
-			p->maxbcnt = DEFAULTBCNT;
+	te = t + d->ntargets;
+	for (; t < te && *t; t++)
+		aoecmd_wreset(*t);
+}
+
+void
+aoe_failbuf(struct aoedev *d, struct buf *buf)
+{
+	if (buf == NULL)
+		return;
+	buf->iter.bi_size = 0;
+	clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+	if (buf->nframesout == 0)
+		aoe_end_buf(d, buf);
+}
+
+void
+aoe_flush_iocq(void)
+{
+	int i;
+
+	for (i = 0; i < ncpus; i++) {
+		if (kts[i].active)
+			aoe_flush_iocq_by_index(i);
+	}
+}
+
+void
+aoe_flush_iocq_by_index(int id)
+{
+	struct frame *f;
+	struct aoedev *d;
+	LIST_HEAD(flist);
+	struct list_head *pos;
+	struct sk_buff *skb;
+	ulong flags;
+
+	spin_lock_irqsave(&iocq[id].lock, flags);
+	list_splice_init(&iocq[id].head, &flist);
+	spin_unlock_irqrestore(&iocq[id].lock, flags);
+	while (!list_empty(&flist)) {
+		pos = flist.next;
+		list_del(pos);
+		f = list_entry(pos, struct frame, head);
+		d = f->t->d;
+		skb = f->r_skb;
+		spin_lock_irqsave(&d->lock, flags);
+		if (f->buf) {
+			f->buf->nframesout--;
+			aoe_failbuf(d, f->buf);
 		}
+		aoe_freetframe(f);
+		spin_unlock_irqrestore(&d->lock, flags);
+		dev_kfree_skb(skb);
+		aoedev_put(d);
 	}
 }
+
+/* Allocate the shared zero page and the ncpus completion queues, then start
+ * ktio thread 0.  Returns 0, or -ENOMEM after freeing all it had allocated.
+ */
+int __init
+aoecmd_init(void)
+{
+	void *p;
+	int i;
+	int ret = -ENOMEM;	/* every failure below is reported as -ENOMEM */
+
+	/* get_zeroed_page returns page with ref count 1 */
+	p = (void *) get_zeroed_page(GFP_KERNEL | __GFP_REPEAT);
+	if (!p)
+		return -ENOMEM;
+	empty_page = virt_to_page(p);
+
+	ncpus = num_online_cpus();
+
+	iocq = kcalloc(ncpus, sizeof(struct iocq_ktio), GFP_KERNEL);
+	if (!iocq)
+		goto iocq_fail;
+
+	kts = kcalloc(ncpus, sizeof(struct ktstate), GFP_KERNEL);
+	if (!kts)
+		goto kts_fail;
+
+	ktiowq = kcalloc(ncpus, sizeof(wait_queue_head_t), GFP_KERNEL);
+	if (!ktiowq)
+		goto ktiowq_fail;
+
+	mutex_init(&ktio_spawn_lock);
+
+	for (i = 0; i < ncpus; i++) {
+		INIT_LIST_HEAD(&iocq[i].head);
+		spin_lock_init(&iocq[i].lock);
+		init_waitqueue_head(&ktiowq[i]);
+		snprintf(kts[i].name, sizeof(kts[i].name), "aoe_ktio%d", i);
+		kts[i].fn = ktio;
+		kts[i].waitq = &ktiowq[i];
+		kts[i].lock = &iocq[i].lock;
+		kts[i].id = i;
+		kts[i].active = 0;
+	}
+	kts[0].active = 1;
+	if (aoe_ktstart(&kts[0]))
+		goto ktstart_fail;
+	return 0;
+
+ktstart_fail:
+	kfree(ktiowq);
+ktiowq_fail:
+	kfree(kts);
+kts_fail:
+	kfree(iocq);
+iocq_fail:
+	free_page((unsigned long) p);	/* was leaked on every failure path */
+	empty_page = NULL;
+
+	return ret;
+}
+
+void
+aoecmd_exit(void)	/* module teardown: releases what aoecmd_init() set up */
+{
+	int i;
+
+	for (i = 0; i < ncpus; i++)
+		if (kts[i].active)	/* only threads that were started */
+			aoe_ktstop(&kts[i]);
+
+	aoe_flush_iocq();	/* fail frames still sitting on the iocq lists */
+
+	/* Free up the iocq and thread-specific configuration
+	 * allocated during startup.
+	 */
+	kfree(iocq);
+	kfree(kts);
+	kfree(ktiowq);
+
+	free_page((unsigned long) page_address(empty_page));
+	empty_page = NULL;
+}
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index eeea477d960..e774c50b684 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2007 Coraid, Inc.  See COPYING for GPL terms. */
+/* Copyright (c) 2013 Coraid, Inc.  See COPYING for GPL terms. */
 /*
  * aoedev.c
  * AoE device utility functions; maintains device list.
@@ -8,30 +8,141 @@
 #include <linux/blkdev.h>
 #include <linux/netdevice.h>
 #include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
+#include <linux/kdev_t.h>
+#include <linux/moduleparam.h>
+#include <linux/string.h>
 #include "aoe.h"
 
 static void dummy_timer(ulong);
-static void aoedev_freedev(struct aoedev *);
 static void freetgt(struct aoedev *d, struct aoetgt *t);
 static void skbpoolfree(struct aoedev *d);
 
+static int aoe_dyndevs = 1;
+module_param(aoe_dyndevs, int, 0644);
+MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices.");
+
 static struct aoedev *devlist;
 static DEFINE_SPINLOCK(devlist_lock);
 
-struct aoedev *
-aoedev_by_aoeaddr(int maj, int min)
+/* Because some systems will have one, many, or no
+ *   - partitions,
+ *   - slots per shelf,
+ *   - or shelves,
+ * we need some flexibility in the way the minor numbers
+ * are allocated.  So they are dynamic.
+ */
+#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS)
+
+static DEFINE_SPINLOCK(used_minors_lock);
+static DECLARE_BITMAP(used_minors, N_DEVS);
+
+static int
+minor_get_dyn(ulong *sysminor)
 {
-	struct aoedev *d;
 	ulong flags;
+	ulong n;
+	int error = 0;
+
+	spin_lock_irqsave(&used_minors_lock, flags);
+	n = find_first_zero_bit(used_minors, N_DEVS);
+	if (n < N_DEVS)
+		set_bit(n, used_minors);
+	else
+		error = -1;
+	spin_unlock_irqrestore(&used_minors_lock, flags);
+
+	*sysminor = n * AOE_PARTITIONS;
+	return error;
+}
 
-	spin_lock_irqsave(&devlist_lock, flags);
+static int
+minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin)
+{
+	ulong flags;
+	ulong n;
+	int error = 0;
+	enum {
+		/* for backwards compatibility when !aoe_dyndevs,
+		 * a static number of supported slots per shelf */
+		NPERSHELF = 16,
+	};
+
+	if (aoemin >= NPERSHELF) {
+		pr_err("aoe: %s %d slots per shelf\n",
+			"static minor device numbers support only",
+			NPERSHELF);
+		error = -1;
+		goto out;
+	}
 
-	for (d=devlist; d; d=d->next)
-		if (d->aoemajor == maj && d->aoeminor == min)
-			break;
+	n = aoemaj * NPERSHELF + aoemin;
+	if (n >= N_DEVS) {
+		pr_err("aoe: %s with e%ld.%d\n",
+			"cannot use static minor device numbers",
+			aoemaj, aoemin);
+		error = -1;
+		goto out;
+	}
+
+	spin_lock_irqsave(&used_minors_lock, flags);
+	if (test_bit(n, used_minors)) {
+		pr_err("aoe: %s %lu\n",
+			"existing device already has static minor number",
+			n);
+		error = -1;
+	} else
+		set_bit(n, used_minors);
+	spin_unlock_irqrestore(&used_minors_lock, flags);
+	*sysminor = n * AOE_PARTITIONS;
+out:
+	return error;
+}
+
+static int
+minor_get(ulong *sysminor, ulong aoemaj, int aoemin)
+{
+	if (aoe_dyndevs)
+		return minor_get_dyn(sysminor);
+	else
+		return minor_get_static(sysminor, aoemaj, aoemin);
+}
+
+static void
+minor_free(ulong minor)	/* release a sysminor handed out by minor_get() */
+{
+	ulong flags;
+
+	minor /= AOE_PARTITIONS;	/* sysminor -> bit index in used_minors */
+	BUG_ON(minor >= N_DEVS);
+
+	spin_lock_irqsave(&used_minors_lock, flags);
+	BUG_ON(!test_bit(minor, used_minors));	/* bit must have been set by minor_get */
+	clear_bit(minor, used_minors);
+	spin_unlock_irqrestore(&used_minors_lock, flags);
+}
+
+/*
+ * Users who grab a pointer to the device with aoedev_by_aoeaddr
+ * automatically get a reference count and must be responsible
+ * for performing an aoedev_put.  With the addition of async
+ * kthread processing I'm no longer confident that we can
+ * guarantee consistency in the face of device flushes.
+ *
+ * For the time being, we only bother to add extra references for
+ * frames sitting on the iocq.  When the kthreads finish processing
+ * these frames, they will aoedev_put the device.
+ */
+
+void
+aoedev_put(struct aoedev *d)
+{
+	ulong flags;
 
+	spin_lock_irqsave(&devlist_lock, flags);
+	d->ref--;
 	spin_unlock_irqrestore(&devlist_lock, flags);
-	return d;
 }
 
 static void
@@ -46,127 +157,247 @@ dummy_timer(ulong vp)
 	add_timer(&d->timer);
 }
 
+static void
+aoe_failip(struct aoedev *d)
+{
+	struct request *rq;
+	struct bio *bio;
+	unsigned long n;
+
+	aoe_failbuf(d, d->ip.buf);
+
+	rq = d->ip.rq;
+	if (rq == NULL)
+		return;
+	while ((bio = d->ip.nxbio)) {
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+		d->ip.nxbio = bio->bi_next;
+		n = (unsigned long) rq->special;
+		rq->special = (void *) --n;
+	}
+	if ((unsigned long) rq->special == 0)
+		aoe_end_request(d, rq, 0);
+}
+
+static void
+downdev_frame(struct list_head *pos)
+{
+	struct frame *f;
+
+	f = list_entry(pos, struct frame, head);
+	list_del(pos);
+	if (f->buf) {
+		f->buf->nframesout--;
+		aoe_failbuf(f->t->d, f->buf);
+	}
+	aoe_freetframe(f);
+}
+
 void
 aoedev_downdev(struct aoedev *d)
 {
-	struct aoetgt **t, **te;
-	struct frame *f, *e;
-	struct buf *buf;
-	struct bio *bio;
+	struct aoetgt *t, **tt, **te;
+	struct list_head *head, *pos, *nx;
+	struct request *rq;
+	int i;
 
-	t = d->targets;
-	te = t + NTARGETS;
-	for (; t < te && *t; t++) {
-		f = (*t)->frames;
-		e = f + (*t)->nframes;
-		for (; f < e; f->tag = FREETAG, f->buf = NULL, f++) {
-			if (f->tag == FREETAG || f->buf == NULL)
-				continue;
-			buf = f->buf;
-			bio = buf->bio;
-			if (--buf->nframesout == 0
-			&& buf != d->inprocess) {
-				mempool_free(buf, d->bufpool);
-				bio_endio(bio, -EIO);
-			}
-		}
-		(*t)->maxout = (*t)->nframes;
-		(*t)->nout = 0;
+	d->flags &= ~DEVFL_UP;
+
+	/* clean out active and to-be-retransmitted buffers */
+	for (i = 0; i < NFACTIVE; i++) {
+		head = &d->factive[i];
+		list_for_each_safe(pos, nx, head)
+			downdev_frame(pos);
 	}
-	buf = d->inprocess;
-	if (buf) {
-		bio = buf->bio;
-		mempool_free(buf, d->bufpool);
-		bio_endio(bio, -EIO);
+	head = &d->rexmitq;
+	list_for_each_safe(pos, nx, head)
+		downdev_frame(pos);
+
+	/* reset window dressings */
+	tt = d->targets;
+	te = tt + d->ntargets;
+	for (; tt < te && (t = *tt); tt++) {
+		aoecmd_wreset(t);
+		t->nout = 0;
 	}
-	d->inprocess = NULL;
-	d->htgt = NULL;
-
-	while (!list_empty(&d->bufq)) {
-		buf = container_of(d->bufq.next, struct buf, bufs);
-		list_del(d->bufq.next);
-		bio = buf->bio;
-		mempool_free(buf, d->bufpool);
-		bio_endio(bio, -EIO);
+
+	/* clean out the in-process request (if any) */
+	aoe_failip(d);
+
+	/* fast fail all pending I/O */
+	if (d->blkq) {
+		while ((rq = blk_peek_request(d->blkq))) {
+			blk_start_request(rq);
+			aoe_end_request(d, rq, 1);
+		}
 	}
 
 	if (d->gd)
 		set_capacity(d->gd, 0);
+}
 
-	d->flags &= ~DEVFL_UP;
+/* return whether the user asked for this particular
+ * device to be flushed
+ */
+static int
+user_req(char *s, size_t slen, struct aoedev *d)
+{
+	const char *p;
+	size_t lim;
+
+	if (!d->gd)
+		return 0;
+	p = kbasename(d->gd->disk_name);
+	lim = sizeof(d->gd->disk_name);
+	lim -= p - d->gd->disk_name;
+	if (slen < lim)
+		lim = slen;
+
+	return !strncmp(s, p, lim);
 }
 
 static void
-aoedev_freedev(struct aoedev *d)
+freedev(struct aoedev *d)
 {
 	struct aoetgt **t, **e;
+	int freeing = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&d->lock, flags);
+	if (d->flags & DEVFL_TKILL
+	&& !(d->flags & DEVFL_FREEING)) {
+		d->flags |= DEVFL_FREEING;
+		freeing = 1;
+	}
+	spin_unlock_irqrestore(&d->lock, flags);
+	if (!freeing)
+		return;
 
+	del_timer_sync(&d->timer);
 	if (d->gd) {
+		aoedisk_rm_debugfs(d);
 		aoedisk_rm_sysfs(d);
 		del_gendisk(d->gd);
 		put_disk(d->gd);
+		blk_cleanup_queue(d->blkq);
 	}
 	t = d->targets;
-	e = t + NTARGETS;
+	e = t + d->ntargets;
 	for (; t < e && *t; t++)
 		freetgt(d, *t);
 	if (d->bufpool)
 		mempool_destroy(d->bufpool);
 	skbpoolfree(d);
-	kfree(d);
+	minor_free(d->sysminor);
+
+	spin_lock_irqsave(&d->lock, flags);
+	d->flags |= DEVFL_FREED;
+	spin_unlock_irqrestore(&d->lock, flags);
 }
 
-int
-aoedev_flush(const char __user *str, size_t cnt)
+enum flush_parms {
+	NOT_EXITING = 0,
+	EXITING = 1,
+};
+
+static int
+flush(const char __user *str, size_t cnt, int exiting)
 {
 	ulong flags;
 	struct aoedev *d, **dd;
-	struct aoedev *rmd = NULL;
 	char buf[16];
 	int all = 0;
+	int specified = 0;	/* flush a specific device */
+	unsigned int skipflags;
+
+	skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL;
 
-	if (cnt >= 3) {
+	if (!exiting && cnt >= 3) {
 		if (cnt > sizeof buf)
 			cnt = sizeof buf;
 		if (copy_from_user(buf, str, cnt))
 			return -EFAULT;
 		all = !strncmp(buf, "all", 3);
+		if (!all)
+			specified = 1;
 	}
 
 	flush_scheduled_work();
+	/* pass one: without sleeping, do aoedev_downdev */
 	spin_lock_irqsave(&devlist_lock, flags);
-	dd = &devlist;
-	while ((d = *dd)) {
+	for (d = devlist; d; d = d->next) {
 		spin_lock(&d->lock);
-		if ((!all && (d->flags & DEVFL_UP))
-		|| (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
-		|| d->nopen) {
-			spin_unlock(&d->lock);
-			dd = &d->next;
-			continue;
-		}
-		*dd = d->next;
+		if (exiting) {
+			/* unconditionally take each device down */
+		} else if (specified) {
+			if (!user_req(buf, cnt, d))
+				goto cont;
+		} else if ((!all && (d->flags & DEVFL_UP))
+		|| d->flags & skipflags
+		|| d->nopen
+		|| d->ref)
+			goto cont;
+
 		aoedev_downdev(d);
 		d->flags |= DEVFL_TKILL;
+cont:
 		spin_unlock(&d->lock);
-		d->next = rmd;
-		rmd = d;
 	}
 	spin_unlock_irqrestore(&devlist_lock, flags);
-	while ((d = rmd)) {
-		rmd = d->next;
-		del_timer_sync(&d->timer);
-		aoedev_freedev(d);	/* must be able to sleep */
+
+	/* pass two: call freedev, which might sleep,
+	 * for aoedevs marked with DEVFL_TKILL
+	 */
+restart:
+	spin_lock_irqsave(&devlist_lock, flags);
+	for (d = devlist; d; d = d->next) {
+		spin_lock(&d->lock);
+		if (d->flags & DEVFL_TKILL
+		&& !(d->flags & DEVFL_FREEING)) {
+			spin_unlock(&d->lock);
+			spin_unlock_irqrestore(&devlist_lock, flags);
+			freedev(d);
+			goto restart;
+		}
+		spin_unlock(&d->lock);
 	}
+
+	/* pass three: remove aoedevs marked with DEVFL_FREED */
+	for (dd = &devlist, d = *dd; d; d = *dd) {
+		struct aoedev *doomed = NULL;
+
+		spin_lock(&d->lock);
+		if (d->flags & DEVFL_FREED) {
+			*dd = d->next;
+			doomed = d;
+		} else {
+			dd = &d->next;
+		}
+		spin_unlock(&d->lock);
+		if (doomed)
+			kfree(doomed->targets);
+		kfree(doomed);
+	}
+	spin_unlock_irqrestore(&devlist_lock, flags);
+
 	return 0;
 }
 
-/* I'm not really sure that this is a realistic problem, but if the
-network driver goes gonzo let's just leak memory after complaining. */
+int
+aoedev_flush(const char __user *str, size_t cnt)
+{
+	return flush(str, cnt, NOT_EXITING);
+}
+
+/* This has been confirmed to occur once with Tms=3*1000 due to the
+ * driver changing link and not processing its transmit ring.  The
+ * problem is hard enough to solve by returning an error that I'm
+ * still punting on "solving" this.
+ */
 static void
 skbfree(struct sk_buff *skb)
 {
-	enum { Sms = 100, Tms = 3*1000};
+	enum { Sms = 250, Tms = 30 * 1000};
 	int i = Tms / Sms;
 
 	if (skb == NULL)
@@ -180,6 +411,7 @@ skbfree(struct sk_buff *skb)
 			"cannot free skb -- memory leaked.");
 		return;
 	}
+	skb->truesize -= skb->data_len;
 	skb_shinfo(skb)->nr_frags = skb->data_len = 0;
 	skb_trim(skb, 0);
 	dev_kfree_skb(skb);
@@ -196,26 +428,43 @@ skbpoolfree(struct aoedev *d)
 	__skb_queue_head_init(&d->skbpool);
 }
 
-/* find it or malloc it */
+/* find it or allocate it */
 struct aoedev *
-aoedev_by_sysminor_m(ulong sysminor)
+aoedev_by_aoeaddr(ulong maj, int min, int do_alloc)
 {
 	struct aoedev *d;
+	int i;
 	ulong flags;
+	ulong sysminor = 0;
 
 	spin_lock_irqsave(&devlist_lock, flags);
 
 	for (d=devlist; d; d=d->next)
-		if (d->sysminor == sysminor)
+		if (d->aoemajor == maj && d->aoeminor == min) {
+			spin_lock(&d->lock);
+			if (d->flags & DEVFL_TKILL) {
+				spin_unlock(&d->lock);
+				d = NULL;
+				goto out;
+			}
+			d->ref++;
+			spin_unlock(&d->lock);
 			break;
-	if (d)
+		}
+	if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0)
 		goto out;
 	d = kcalloc(1, sizeof *d, GFP_ATOMIC);
 	if (!d)
 		goto out;
+	d->targets = kcalloc(NTARGETS, sizeof(*d->targets), GFP_ATOMIC);
+	if (!d->targets) {
+		kfree(d);
+		d = NULL;
+		goto out;
+	}
+	d->ntargets = NTARGETS;
 	INIT_WORK(&d->work, aoecmd_sleepwork);
 	spin_lock_init(&d->lock);
-	skb_queue_head_init(&d->sendq);
 	skb_queue_head_init(&d->skbpool);
 	init_timer(&d->timer);
 	d->timer.data = (ulong) d;
@@ -224,11 +473,15 @@ aoedev_by_sysminor_m(ulong sysminor)
 	add_timer(&d->timer);
 	d->bufpool = NULL;	/* defer to aoeblk_gdalloc */
 	d->tgt = d->targets;
-	INIT_LIST_HEAD(&d->bufq);
+	d->ref = 1;
+	for (i = 0; i < NFACTIVE; i++)
+		INIT_LIST_HEAD(&d->factive[i]);
+	INIT_LIST_HEAD(&d->rexmitq);
 	d->sysminor = sysminor;
-	d->aoemajor = AOEMAJOR(sysminor);
-	d->aoeminor = AOEMINOR(sysminor);
-	d->mintimer = MINTIMER;
+	d->aoemajor = maj;
+	d->aoeminor = min;
+	d->rttavg = RTTAVG_INIT;
+	d->rttdev = RTTDEV_INIT;
 	d->next = devlist;
 	devlist = d;
  out:
@@ -239,35 +492,31 @@ aoedev_by_sysminor_m(ulong sysminor)
 static void
 freetgt(struct aoedev *d, struct aoetgt *t)
 {
-	struct frame *f, *e;
+	struct frame *f;
+	struct list_head *pos, *nx, *head;
+	struct aoeif *ifp;
+
+	for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) {
+		if (!ifp->nd)
+			break;
+		dev_put(ifp->nd);
+	}
 
-	f = t->frames;
-	e = f + t->nframes;
-	for (; f < e; f++)
+	head = &t->ffree;
+	list_for_each_safe(pos, nx, head) {
+		list_del(pos);
+		f = list_entry(pos, struct frame, head);
 		skbfree(f->skb);
-	kfree(t->frames);
+		kfree(f);
+	}
 	kfree(t);
 }
 
 void
 aoedev_exit(void)
 {
-	struct aoedev *d;
-	ulong flags;
-
 	flush_scheduled_work();
-
-	while ((d = devlist)) {
-		devlist = d->next;
-
-		spin_lock_irqsave(&d->lock, flags);
-		aoedev_downdev(d);
-		d->flags |= DEVFL_TKILL;
-		spin_unlock_irqrestore(&d->lock, flags);
-
-		del_timer_sync(&d->timer);
-		aoedev_freedev(d);
-	}
+	flush(NULL, 0, EXITING);
 }
 
 int __init
diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c
index 7f83ad90e76..4b987c2fefb 100644
--- a/drivers/block/aoe/aoemain.c
+++ b/drivers/block/aoe/aoemain.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2007 Coraid, Inc.  See COPYING for GPL terms. */
+/* Copyright (c) 2012 Coraid, Inc.  See COPYING for GPL terms. */
 /*
  * aoemain.c
  * Module initialization routines, discover timer
@@ -61,6 +61,7 @@ aoe_exit(void)
 
 	aoenet_exit();
 	unregister_blkdev(AOE_MAJOR, DEVICE_NAME);
+	aoecmd_exit();
 	aoechr_exit();
 	aoedev_exit();
 	aoeblk_exit();		/* free cache after de-allocating bufs */
@@ -83,17 +84,20 @@ aoe_init(void)
 	ret = aoenet_init();
 	if (ret)
 		goto net_fail;
+	ret = aoecmd_init();
+	if (ret)
+		goto cmd_fail;
 	ret = register_blkdev(AOE_MAJOR, DEVICE_NAME);
 	if (ret < 0) {
 		printk(KERN_ERR "aoe: can't register major\n");
 		goto blkreg_fail;
 	}
-
 	printk(KERN_INFO "aoe: AoE v%s initialised.\n", VERSION);
 	discover_timer(TINIT);
 	return 0;
-
  blkreg_fail:
+	aoecmd_exit();
+ cmd_fail:
 	aoenet_exit();
  net_fail:
 	aoeblk_exit();
@@ -101,7 +105,7 @@ aoe_init(void)
 	aoechr_exit();
  chr_fail:
 	aoedev_exit();
-	
+
 	printk(KERN_INFO "aoe: initialisation failure.\n");
 	return ret;
 }
diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
index ce0d62cd71b..63773a90581 100644
--- a/drivers/block/aoe/aoenet.c
+++ b/drivers/block/aoe/aoenet.c
@@ -1,9 +1,10 @@
-/* Copyright (c) 2007 Coraid, Inc.  See COPYING for GPL terms. */
+/* Copyright (c) 2013 Coraid, Inc.  See COPYING for GPL terms. */
 /*
  * aoenet.c
  * Ethernet portion of AoE driver
  */
 
+#include <linux/gfp.h>
 #include <linux/hdreg.h>
 #include <linux/blkdev.h>
 #include <linux/netdevice.h>
@@ -30,7 +31,10 @@ enum {
 
 static char aoe_iflist[IFLISTSZ];
 module_param_string(aoe_iflist, aoe_iflist, IFLISTSZ, 0600);
-MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=\"dev1 [dev2 ...]\"");
+MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=dev1[,dev2...]");
+
+static wait_queue_head_t txwq;
+static struct ktstate kts;
 
 #ifndef MODULE
 static int __init aoe_iflist_setup(char *str)
@@ -43,6 +47,28 @@ static int __init aoe_iflist_setup(char *str)
 __setup("aoe_iflist=", aoe_iflist_setup);
 #endif
 
+static spinlock_t txlock;
+static struct sk_buff_head skbtxq;
+
+/* drain skbtxq; enters and returns with txlock held, dropped around xmit */
+static int
+tx(int id) __must_hold(&txlock)
+{
+	struct sk_buff *skb;
+	struct net_device *ifp;
+
+	while ((skb = skb_dequeue(&skbtxq))) {
+		spin_unlock_irq(&txlock);
+		ifp = skb->dev;	/* read before xmit; named in the warning */
+		if (dev_queue_xmit(skb) == NET_XMIT_DROP && net_ratelimit())
+			pr_warn("aoe: packet could not be sent on %s.  %s\n",
+				ifp ? ifp->name : "netif",
+				"consider increasing tx_queue_len");
+		spin_lock_irq(&txlock);
+	}
+	return 0;	/* unconditional; drops are only logged above */
+}
+
 int
 is_aoe_netif(struct net_device *ifp)
 {
@@ -87,21 +113,27 @@ void
 aoenet_xmit(struct sk_buff_head *queue)
 {
 	struct sk_buff *skb, *tmp;
+	ulong flags;
 
 	skb_queue_walk_safe(queue, skb, tmp) {
 		__skb_unlink(skb, queue);
-		dev_queue_xmit(skb);
+		spin_lock_irqsave(&txlock, flags);
+		skb_queue_tail(&skbtxq, skb);
+		spin_unlock_irqrestore(&txlock, flags);
+		wake_up(&txwq);
 	}
 }
 
-/* 
- * (1) len doesn't include the header by default.  I want this. 
+/*
+ * (1) len doesn't include the header by default.  I want this.
  */
 static int
 aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct aoe_hdr *h;
+	struct aoe_atahdr *ah;
 	u32 n;
+	int sn;
 
 	if (dev_net(ifp) != &init_net)
 		goto exit;
@@ -109,13 +141,16 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
 	skb = skb_share_check(skb, GFP_ATOMIC);
 	if (skb == NULL)
 		return 0;
-	if (skb_linearize(skb))
-		goto exit;
 	if (!is_aoe_netif(ifp))
 		goto exit;
 	skb_push(skb, ETH_HLEN);	/* (1) */
-
-	h = (struct aoe_hdr *) skb_mac_header(skb);
+	sn = sizeof(*h) + sizeof(*ah);
+	if (skb->len >= sn) {
+		sn -= skb_headlen(skb);
+		if (sn > 0 && !__pskb_pull_tail(skb, sn))
+			goto exit;
+	}
+	h = (struct aoe_hdr *) skb->data;
 	n = get_unaligned_be32(&h->tag);
 	if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31))
 		goto exit;
@@ -136,7 +171,8 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
 
 	switch (h->cmd) {
 	case AOECMD_ATA:
-		aoecmd_ata_rsp(skb);
+		/* ata_rsp may keep skb for later processing or give it back */
+		skb = aoecmd_ata_rsp(skb);
 		break;
 	case AOECMD_CFG:
 		aoecmd_cfg_rsp(skb);
@@ -144,8 +180,12 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
 	default:
 		if (h->cmd >= AOECMD_VEND_MIN)
 			break;	/* don't complain about vendor commands */
-		printk(KERN_INFO "aoe: unknown cmd %d\n", h->cmd);
+		pr_info("aoe: unknown AoE command type 0x%02x\n", h->cmd);
+		break;
 	}
+
+	if (!skb)
+		return 0;
 exit:
 	dev_kfree_skb(skb);
 	return 0;
@@ -159,6 +199,16 @@ static struct packet_type aoe_pt __read_mostly = {
 int __init
 aoenet_init(void)
 {
+	skb_queue_head_init(&skbtxq);
+	init_waitqueue_head(&txwq);
+	spin_lock_init(&txlock);
+	kts.lock = &txlock;
+	kts.fn = tx;
+	kts.waitq = &txwq;
+	kts.id = 0;
+	snprintf(kts.name, sizeof(kts.name), "aoe_tx%d", kts.id);
+	if (aoe_ktstart(&kts))
+		return -EAGAIN;
 	dev_add_pack(&aoe_pt);
 	return 0;
 }
@@ -166,6 +216,8 @@ aoenet_init(void)
 void
 aoenet_exit(void)
 {
+	aoe_ktstop(&kts);
+	skb_queue_purge(&skbtxq);
 	dev_remove_pack(&aoe_pt);
 }
 
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index f5e7180d7f4..2104b1b4ccd 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -67,6 +67,9 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include <linux/completion.h>
+#include <linux/wait.h>
 
 #include <asm/atafd.h>
 #include <asm/atafdreg.h>
@@ -78,8 +81,9 @@
 
 #undef DEBUG
 
-static struct request_queue *floppy_queue;
+static DEFINE_MUTEX(ataflop_mutex);
 static struct request *fd_request;
+static int fdc_queue;
 
 /* Disk types: DD, HD, ED */
 static struct atari_disk_type {
@@ -299,7 +303,7 @@ module_param_array(UserSteprate, int, NULL, 0);
 /* Synchronization of FDC access. */
 static volatile int fdc_busy = 0;
 static DECLARE_WAIT_QUEUE_HEAD(fdc_wait);
-static DECLARE_WAIT_QUEUE_HEAD(format_wait);
+static DECLARE_COMPLETION(format_wait);
 
 static unsigned long changed_floppies = 0xff, fake_change = 0;
 #define	CHECK_CHANGE_DELAY	HZ/2
@@ -359,13 +363,13 @@ static void finish_fdc( void );
 static void finish_fdc_done( int dummy );
 static void setup_req_params( int drive );
 static void redo_fd_request( void);
-static int fd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
+static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
                      cmd, unsigned long param);
 static void fd_probe( int drive );
 static int fd_test_drive_present( int drive );
 static void config_types( void );
 static int floppy_open(struct block_device *bdev, fmode_t mode);
-static int floppy_release(struct gendisk *disk, fmode_t mode);
+static void floppy_release(struct gendisk *disk, fmode_t mode);
 
 /************************* End of Prototypes **************************/
 
@@ -606,7 +610,7 @@ static void fd_error( void )
 	if (IsFormatting) {
 		IsFormatting = 0;
 		FormatError = 1;
-		wake_up( &format_wait );
+		complete(&format_wait);
 		return;
 	}
 
@@ -648,9 +652,8 @@ static int do_format(int drive, int type, struct atari_format_descr *desc)
 	DPRINT(("do_format( dr=%d tr=%d he=%d offs=%d )\n",
 		drive, desc->track, desc->head, desc->sect_offset ));
 
+	wait_event(fdc_wait, cmpxchg(&fdc_busy, 0, 1) == 0);
 	local_irq_save(flags);
-	while( fdc_busy ) sleep_on( &fdc_wait );
-	fdc_busy = 1;
 	stdma_lock(floppy_irq, NULL);
 	atari_turnon_irq( IRQ_MFP_FDC ); /* should be already, just to be sure */
 	local_irq_restore(flags);
@@ -704,7 +707,7 @@ static int do_format(int drive, int type, struct atari_format_descr *desc)
 	ReqSide  = desc->head;
 	do_fd_action( drive );
 
-	sleep_on( &format_wait );
+	wait_for_completion(&format_wait);
 
 	redo_fd_request();
 	return( FormatError ? -EIO : 0 );	
@@ -1227,7 +1230,7 @@ static void fd_writetrack_done( int status )
 		goto err_end;
 	}
 
-	wake_up( &format_wait );
+	complete(&format_wait);
 	return;
 
   err_end:
@@ -1322,23 +1325,24 @@ static void finish_fdc_done( int dummy )
  * due to unrecognised disk changes.
  */
 
-static int check_floppy_change(struct gendisk *disk)
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing)
 {
 	struct atari_floppy_struct *p = disk->private_data;
 	unsigned int drive = p - unit;
 	if (test_bit (drive, &fake_change)) {
 		/* simulated change (e.g. after formatting) */
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	}
 	if (test_bit (drive, &changed_floppies)) {
 		/* surely changed (the WP signal changed at least once) */
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	}
 	if (UD.wpstat) {
 		/* WP is on -> could be changed: to be sure, buffers should be
 		 * invalidated...
 		 */
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	}
 
 	return 0;
@@ -1390,6 +1394,29 @@ static void setup_req_params( int drive )
 			ReqTrack, ReqSector, (unsigned long)ReqData ));
 }
 
+/*
+ * Round-robin over the drive queues: fetch one request, or NULL if none
+ */
+static struct request *set_next_request(void)
+{
+	struct request_queue *q;
+	int old_pos = fdc_queue;	/* where this scan started */
+	struct request *rq = NULL;
+
+	do {
+		q = unit[fdc_queue].disk->queue;
+		if (++fdc_queue == FD_MAX_UNITS)	/* advance, wrapping to 0 */
+			fdc_queue = 0;
+		if (q) {
+			rq = blk_fetch_request(q);
+			if (rq)
+				break;
+		}
+	} while (fdc_queue != old_pos);	/* stop after one full pass */
+
+	return rq;
+}
+
 
 static void redo_fd_request(void)
 {
@@ -1404,7 +1431,7 @@ static void redo_fd_request(void)
 
 repeat:
 	if (!fd_request) {
-		fd_request = blk_fetch_request(floppy_queue);
+		fd_request = set_next_request();
 		if (!fd_request)
 			goto the_end;
 	}
@@ -1457,7 +1484,7 @@ repeat:
 	ReqCnt = 0;
 	ReqCmd = rq_data_dir(fd_request);
 	ReqBlock = blk_rq_pos(fd_request);
-	ReqBuffer = fd_request->buffer;
+	ReqBuffer = bio_data(fd_request->bio);
 	setup_req_params( drive );
 	do_fd_action( drive );
 
@@ -1470,22 +1497,16 @@ repeat:
 
 void do_fd_request(struct request_queue * q)
 {
- 	unsigned long flags;
-
 	DPRINT(("do_fd_request for pid %d\n",current->pid));
-	while( fdc_busy ) sleep_on( &fdc_wait );
-	fdc_busy = 1;
+	wait_event(fdc_wait, cmpxchg(&fdc_busy, 0, 1) == 0);
 	stdma_lock(floppy_irq, NULL);
 
 	atari_disable_irq( IRQ_MFP_FDC );
-	local_save_flags(flags);	/* The request function is called with ints
-	local_irq_disable();		 * disabled... so must save the IPL for later */ 
 	redo_fd_request();
-	local_irq_restore(flags);
 	atari_enable_irq( IRQ_MFP_FDC );
 }
 
-static int fd_ioctl(struct block_device *bdev, fmode_t mode,
+static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode,
 		    unsigned int cmd, unsigned long param)
 {
 	struct gendisk *disk = bdev->bd_disk;
@@ -1550,7 +1571,7 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
 		 * or the next access will revalidate - and clear UDT :-(
 		 */
 
-		if (check_floppy_change(disk))
+		if (floppy_check_events(disk, 0))
 		        floppy_revalidate(disk);
 
 		if (UD.flags & FTD_MSG)
@@ -1627,7 +1648,7 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
 				drive, dtp->blocks, dtp->spt, dtp->stretch);
 
 		/* sanity check */
-		if (!dtp || setprm.track != dtp->blocks/dtp->spt/2 ||
+		if (setprm.track != dtp->blocks/dtp->spt/2 ||
 		    setprm.head != 2) {
 			redo_fd_request();
 			return -EINVAL;
@@ -1670,6 +1691,17 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
 	}
 }
 
+static int fd_ioctl(struct block_device *bdev, fmode_t mode,
+			     unsigned int cmd, unsigned long arg)
+{
+	int ret;
+
+	mutex_lock(&ataflop_mutex);	/* held across the whole ioctl */
+	ret = fd_locked_ioctl(bdev, mode, cmd, arg);
+	mutex_unlock(&ataflop_mutex);
+
+	return ret;	/* fd_locked_ioctl()'s result, unchanged */
+}
 
 /* Initialize the 'unit' variable for drive 'drive' */
 
@@ -1843,25 +1875,36 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 	return 0;
 }
 
+static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
+{
+	int ret;
+
+	mutex_lock(&ataflop_mutex);
+	ret = floppy_open(bdev, mode);
+	mutex_unlock(&ataflop_mutex);
 
-static int floppy_release(struct gendisk *disk, fmode_t mode)
+	return ret;
+}
+
+static void floppy_release(struct gendisk *disk, fmode_t mode)
 {
 	struct atari_floppy_struct *p = disk->private_data;
+	mutex_lock(&ataflop_mutex);
 	if (p->ref < 0)
 		p->ref = 0;
 	else if (!p->ref--) {
 		printk(KERN_ERR "floppy_release with fd_ref == 0");
 		p->ref = 0;
 	}
-	return 0;
+	mutex_unlock(&ataflop_mutex);
 }
 
-static struct block_device_operations floppy_fops = {
+static const struct block_device_operations floppy_fops = {
 	.owner		= THIS_MODULE,
-	.open		= floppy_open,
+	.open		= floppy_unlocked_open,
 	.release	= floppy_release,
-	.locked_ioctl	= fd_ioctl,
-	.media_changed	= check_floppy_change,
+	.ioctl		= fd_ioctl,
+	.check_events	= floppy_check_events,
 	.revalidate_disk= floppy_revalidate,
 };
 
@@ -1909,14 +1952,10 @@ static int __init atari_floppy_init (void)
 		goto Enomem;
 	}
 	TrackBuffer = DMABuffer + 512;
-	PhysDMABuffer = virt_to_phys(DMABuffer);
+	PhysDMABuffer = atari_stram_to_phys(DMABuffer);
 	PhysTrackBuffer = virt_to_phys(TrackBuffer);
 	BufferDrive = BufferSide = BufferTrack = -1;
 
-	floppy_queue = blk_init_queue(do_fd_request, &ataflop_lock);
-	if (!floppy_queue)
-		goto Enomem;
-
 	for (i = 0; i < FD_MAX_UNITS; i++) {
 		unit[i].track = -1;
 		unit[i].flags = 0;
@@ -1925,7 +1964,10 @@ static int __init atari_floppy_init (void)
 		sprintf(unit[i].disk->disk_name, "fd%d", i);
 		unit[i].disk->fops = &floppy_fops;
 		unit[i].disk->private_data = &unit[i];
-		unit[i].disk->queue = floppy_queue;
+		unit[i].disk->queue = blk_init_queue(do_fd_request,
+					&ataflop_lock);
+		if (!unit[i].disk->queue)
+			goto Enomem;
 		set_capacity(unit[i].disk, MAX_DISK_SIZE * 2);
 		add_disk(unit[i].disk);
 	}
@@ -1940,10 +1982,14 @@ static int __init atari_floppy_init (void)
 
 	return 0;
 Enomem:
-	while (i--)
+	while (i--) {
+		struct request_queue *q = unit[i].disk->queue;
+
 		put_disk(unit[i].disk);
-	if (floppy_queue)
-		blk_cleanup_queue(floppy_queue);
+		if (q)
+			blk_cleanup_queue(q);
+	}
+
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
 	return -ENOMEM;
 }
@@ -1992,12 +2038,14 @@ static void __exit atari_floppy_exit(void)
 	int i;
 	blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
 	for (i = 0; i < FD_MAX_UNITS; i++) {
+		struct request_queue *q = unit[i].disk->queue;
+
 		del_gendisk(unit[i].disk);
 		put_disk(unit[i].disk);
+		blk_cleanup_queue(q);
 	}
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
 
-	blk_cleanup_queue(floppy_queue);
 	del_timer_sync(&fd_timer);
 	atari_stram_free( DMABuffer );
 }
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 4bf8705b3ac..c7d138eca73 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -15,9 +15,10 @@
 #include <linux/blkdev.h>
 #include <linux/bio.h>
 #include <linux/highmem.h>
-#include <linux/gfp.h>
+#include <linux/mutex.h>
 #include <linux/radix-tree.h>
-#include <linux/buffer_head.h> /* invalidate_bh_lrus() */
+#include <linux/fs.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 
@@ -34,10 +35,6 @@
  */
 struct brd_device {
 	int		brd_number;
-	int		brd_refcnt;
-	loff_t		brd_offset;
-	loff_t		brd_sizelimit;
-	unsigned	brd_blocksize;
 
 	struct request_queue	*brd_queue;
 	struct gendisk		*brd_disk;
@@ -54,6 +51,7 @@ struct brd_device {
 /*
  * Look up and return a brd's page for a given sector.
  */
+static DEFINE_MUTEX(brd_mutex);
 static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
 {
 	pgoff_t idx;
@@ -119,13 +117,13 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
 
 	spin_lock(&brd->brd_lock);
 	idx = sector >> PAGE_SECTORS_SHIFT;
+	page->index = idx;
 	if (radix_tree_insert(&brd->brd_pages, idx, page)) {
 		__free_page(page);
 		page = radix_tree_lookup(&brd->brd_pages, idx);
 		BUG_ON(!page);
 		BUG_ON(page->index != idx);
-	} else
-		page->index = idx;
+	}
 	spin_unlock(&brd->brd_lock);
 
 	radix_tree_preload_end();
@@ -133,6 +131,28 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
 	return page;
 }
 
+static void brd_free_page(struct brd_device *brd, sector_t sector)
+{
+	struct page *page;
+	pgoff_t idx;
+
+	spin_lock(&brd->brd_lock);
+	idx = sector >> PAGE_SECTORS_SHIFT;	/* sector number -> page index */
+	page = radix_tree_delete(&brd->brd_pages, idx);
+	spin_unlock(&brd->brd_lock);
+	if (page)	/* free only after brd_lock has been dropped */
+		__free_page(page);
+}
+
+static void brd_zero_page(struct brd_device *brd, sector_t sector)
+{
+	struct page *page;
+
+	page = brd_lookup_page(brd, sector);
+	if (page)	/* no page found for this sector: nothing to clear */
+		clear_highpage(page);
+}
+
 /*
  * Free all backing store pages and radix tree. This must only be called when
  * there are no other users of the device.
@@ -180,15 +200,33 @@ static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
 
 	copy = min_t(size_t, n, PAGE_SIZE - offset);
 	if (!brd_insert_page(brd, sector))
-		return -ENOMEM;
+		return -ENOSPC;
 	if (copy < n) {
 		sector += copy >> SECTOR_SHIFT;
 		if (!brd_insert_page(brd, sector))
-			return -ENOMEM;
+			return -ENOSPC;
 	}
 	return 0;
 }
 
+static void discard_from_brd(struct brd_device *brd,
+			sector_t sector, size_t n)
+{
+	while (n >= PAGE_SIZE) {	/* whole pages only; a shorter tail is skipped */
+		/*
+		 * Don't want to actually discard pages here because
+		 * re-allocating the pages can result in writeback
+		 * deadlocks under heavy load.
+		 */
+		if (0)	/* constant false: pages are zeroed, not freed */
+			brd_free_page(brd, sector);
+		else
+			brd_zero_page(brd, sector);
+		sector += PAGE_SIZE >> SECTOR_SHIFT;
+		n -= PAGE_SIZE;
+	}
+}
+
 /*
  * Copy n bytes from src to the brd starting at sector. Does not sleep.
  */
@@ -204,9 +242,9 @@ static void copy_to_brd(struct brd_device *brd, const void *src,
 	page = brd_lookup_page(brd, sector);
 	BUG_ON(!page);
 
-	dst = kmap_atomic(page, KM_USER1);
+	dst = kmap_atomic(page);
 	memcpy(dst + offset, src, copy);
-	kunmap_atomic(dst, KM_USER1);
+	kunmap_atomic(dst);
 
 	if (copy < n) {
 		src += copy;
@@ -215,9 +253,9 @@ static void copy_to_brd(struct brd_device *brd, const void *src,
 		page = brd_lookup_page(brd, sector);
 		BUG_ON(!page);
 
-		dst = kmap_atomic(page, KM_USER1);
+		dst = kmap_atomic(page);
 		memcpy(dst, src, copy);
-		kunmap_atomic(dst, KM_USER1);
+		kunmap_atomic(dst);
 	}
 }
 
@@ -235,9 +273,9 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
 	copy = min_t(size_t, n, PAGE_SIZE - offset);
 	page = brd_lookup_page(brd, sector);
 	if (page) {
-		src = kmap_atomic(page, KM_USER1);
+		src = kmap_atomic(page);
 		memcpy(dst, src + offset, copy);
-		kunmap_atomic(src, KM_USER1);
+		kunmap_atomic(src);
 	} else
 		memset(dst, 0, copy);
 
@@ -247,9 +285,9 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
 		copy = n - copy;
 		page = brd_lookup_page(brd, sector);
 		if (page) {
-			src = kmap_atomic(page, KM_USER1);
+			src = kmap_atomic(page);
 			memcpy(dst, src, copy);
-			kunmap_atomic(src, KM_USER1);
+			kunmap_atomic(src);
 		} else
 			memset(dst, 0, copy);
 	}
@@ -271,7 +309,7 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page,
 			goto out;
 	}
 
-	mem = kmap_atomic(page, KM_USER0);
+	mem = kmap_atomic(page);
 	if (rw == READ) {
 		copy_from_brd(mem + off, brd, sector, len);
 		flush_dcache_page(page);
@@ -279,35 +317,40 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page,
 		flush_dcache_page(page);
 		copy_to_brd(brd, mem + off, sector, len);
 	}
-	kunmap_atomic(mem, KM_USER0);
+	kunmap_atomic(mem);
 
 out:
 	return err;
 }
 
-static int brd_make_request(struct request_queue *q, struct bio *bio)
+static void brd_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct block_device *bdev = bio->bi_bdev;
 	struct brd_device *brd = bdev->bd_disk->private_data;
 	int rw;
-	struct bio_vec *bvec;
+	struct bio_vec bvec;
 	sector_t sector;
-	int i;
+	struct bvec_iter iter;
 	int err = -EIO;
 
-	sector = bio->bi_sector;
-	if (sector + (bio->bi_size >> SECTOR_SHIFT) >
-						get_capacity(bdev->bd_disk))
+	sector = bio->bi_iter.bi_sector;
+	if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
 		goto out;
 
+	if (unlikely(bio->bi_rw & REQ_DISCARD)) {
+		err = 0;
+		discard_from_brd(brd, sector, bio->bi_iter.bi_size);
+		goto out;
+	}
+
 	rw = bio_rw(bio);
 	if (rw == READA)
 		rw = READ;
 
-	bio_for_each_segment(bvec, bio, i) {
-		unsigned int len = bvec->bv_len;
-		err = brd_do_bvec(brd, bvec->bv_page, len,
-					bvec->bv_offset, rw, sector);
+	bio_for_each_segment(bvec, bio, iter) {
+		unsigned int len = bvec.bv_len;
+		err = brd_do_bvec(brd, bvec.bv_page, len,
+					bvec.bv_offset, rw, sector);
 		if (err)
 			break;
 		sector += len >> SECTOR_SHIFT;
@@ -315,12 +358,19 @@ static int brd_make_request(struct request_queue *q, struct bio *bio)
 
 out:
 	bio_endio(bio, err);
+}
 
-	return 0;
+static int brd_rw_page(struct block_device *bdev, sector_t sector,
+		       struct page *page, int rw)
+{
+	struct brd_device *brd = bdev->bd_disk->private_data;
+	int err = brd_do_bvec(brd, page, PAGE_CACHE_SIZE, 0, rw, sector);
+	page_endio(page, rw & WRITE, err);	/* called on success and on error */
+	return err;	/* brd_do_bvec() result, passed through */
 }
 
 #ifdef CONFIG_BLK_DEV_XIP
-static int brd_direct_access (struct block_device *bdev, sector_t sector,
+static int brd_direct_access(struct block_device *bdev, sector_t sector,
 			void **kaddr, unsigned long *pfn)
 {
 	struct brd_device *brd = bdev->bd_disk->private_data;
@@ -334,7 +384,7 @@ static int brd_direct_access (struct block_device *bdev, sector_t sector,
 		return -ERANGE;
 	page = brd_insert_page(brd, sector);
 	if (!page)
-		return -ENOMEM;
+		return -ENOSPC;
 	*kaddr = page_address(page);
 	*pfn = page_to_pfn(page);
 
@@ -355,29 +405,31 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode,
 	 * ram device BLKFLSBUF has special semantics, we want to actually
 	 * release and destroy the ramdisk data.
 	 */
+	mutex_lock(&brd_mutex);
 	mutex_lock(&bdev->bd_mutex);
 	error = -EBUSY;
 	if (bdev->bd_openers <= 1) {
 		/*
-		 * Invalidate the cache first, so it isn't written
-		 * back to the device.
+		 * Kill the cache first, so it isn't written back to the
+		 * device.
 		 *
 		 * Another thread might instantiate more buffercache here,
 		 * but there is not much we can do to close that race.
 		 */
-		invalidate_bh_lrus();
-		truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
+		kill_bdev(bdev);
 		brd_free_pages(brd);
 		error = 0;
 	}
 	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&brd_mutex);
 
 	return error;
 }
 
-static struct block_device_operations brd_fops = {
+static const struct block_device_operations brd_fops = {
 	.owner =		THIS_MODULE,
-	.locked_ioctl =		brd_ioctl,
+	.rw_page =		brd_rw_page,
+	.ioctl =		brd_ioctl,
 #ifdef CONFIG_BLK_DEV_XIP
 	.direct_access =	brd_direct_access,
 #endif
@@ -390,11 +442,11 @@ static int rd_nr;
 int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
 static int max_part;
 static int part_shift;
-module_param(rd_nr, int, 0);
+module_param(rd_nr, int, S_IRUGO);
 MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
-module_param(rd_size, int, 0);
+module_param(rd_size, int, S_IRUGO);
 MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
-module_param(max_part, int, 0);
+module_param(max_part, int, S_IRUGO);
 MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
@@ -433,10 +485,14 @@ static struct brd_device *brd_alloc(int i)
 	if (!brd->brd_queue)
 		goto out_free_dev;
 	blk_queue_make_request(brd->brd_queue, brd_make_request);
-	blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG, NULL);
-	blk_queue_max_sectors(brd->brd_queue, 1024);
+	blk_queue_max_hw_sectors(brd->brd_queue, 1024);
 	blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
 
+	brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
+	brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
+	brd->brd_queue->limits.discard_zeroes_data = 1;
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
+
 	disk = brd->brd_disk = alloc_disk(1 << part_shift);
 	if (!disk)
 		goto out_free_queue;
@@ -498,8 +554,8 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data)
 	struct kobject *kobj;
 
 	mutex_lock(&brd_devices_mutex);
-	brd = brd_init_one(dev & MINORMASK);
-	kobj = brd ? get_disk(brd->brd_disk) : ERR_PTR(-ENOMEM);
+	brd = brd_init_one(MINOR(dev) >> part_shift);
+	kobj = brd ? get_disk(brd->brd_disk) : NULL;
 	mutex_unlock(&brd_devices_mutex);
 
 	*part = 0;
@@ -521,25 +577,39 @@ static int __init brd_init(void)
 	 *
 	 * (1) if rd_nr is specified, create that many upfront, and this
 	 *     also becomes a hard limit.
-	 * (2) if rd_nr is not specified, create 1 rd device on module
-	 *     load, user can further extend brd device by create dev node
-	 *     themselves and have kernel automatically instantiate actual
-	 *     device on-demand.
+	 * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT
+	 *     (default 16) rd devices on module load; users can add more
+	 *     by creating device nodes themselves, and the kernel will
+	 *     automatically instantiate the actual devices on demand.
 	 */
 
 	part_shift = 0;
-	if (max_part > 0)
+	if (max_part > 0) {
 		part_shift = fls(max_part);
 
+		/*
+		 * Adjust max_part according to part_shift, as it is exported
+		 * to user space so that users can determine the correct minor
+		 * number if they want to create more devices.
+		 *
+		 * Note that -1 is required because partition 0 is reserved
+		 * for the whole disk.
+		 */
+		max_part = (1UL << part_shift) - 1;
+	}
+
+	if ((1UL << part_shift) > DISK_MAX_PARTS)
+		return -EINVAL;
+
 	if (rd_nr > 1UL << (MINORBITS - part_shift))
 		return -EINVAL;
 
 	if (rd_nr) {
 		nr = rd_nr;
-		range = rd_nr;
+		range = rd_nr << part_shift;
 	} else {
 		nr = CONFIG_BLK_DEV_RAM_COUNT;
-		range = 1UL << (MINORBITS - part_shift);
+		range = 1UL << MINORBITS;
 	}
 
 	if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
@@ -578,7 +648,7 @@ static void __exit brd_exit(void)
 	unsigned long range;
 	struct brd_device *brd, *next;
 
-	range = rd_nr ? rd_nr :  1UL << (MINORBITS - part_shift);
+	range = rd_nr ? rd_nr << part_shift : 1UL << MINORBITS;
 
 	list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
 		brd_del_one(brd);
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index b22cec97ea1..4595c22f33f 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -24,6 +24,7 @@
 #include <linux/interrupt.h>
 #include <linux/types.h>
 #include <linux/pci.h>
+#include <linux/pci-aspm.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
@@ -35,12 +36,14 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/init.h>
+#include <linux/jiffies.h>
 #include <linux/hdreg.h>
 #include <linux/spinlock.h>
 #include <linux/compat.h>
-#include <linux/blktrace_api.h>
+#include <linux/mutex.h>
+#include <linux/bitmap.h>
+#include <linux/io.h>
 #include <asm/uaccess.h>
-#include <asm/io.h>
 
 #include <linux/dma-mapping.h>
 #include <linux/blkdev.h>
@@ -54,17 +57,32 @@
 #include <linux/kthread.h>
 
 #define CCISS_DRIVER_VERSION(maj,min,submin) ((maj<<16)|(min<<8)|(submin))
-#define DRIVER_NAME "HP CISS Driver (v 3.6.20)"
-#define DRIVER_VERSION CCISS_DRIVER_VERSION(3, 6, 20)
+#define DRIVER_NAME "HP CISS Driver (v 3.6.26)"
+#define DRIVER_VERSION CCISS_DRIVER_VERSION(3, 6, 26)
 
 /* Embedded module documentation macros - see modules.h */
 MODULE_AUTHOR("Hewlett-Packard Company");
 MODULE_DESCRIPTION("Driver for HP Smart Array Controllers");
-MODULE_SUPPORTED_DEVICE("HP SA5i SA5i+ SA532 SA5300 SA5312 SA641 SA642 SA6400"
-			" SA6i P600 P800 P400 P400i E200 E200i E500 P700m"
-			" Smart Array G2 Series SAS/SATA Controllers");
-MODULE_VERSION("3.6.20");
+MODULE_SUPPORTED_DEVICE("HP Smart Array Controllers");
+MODULE_VERSION("3.6.26");
 MODULE_LICENSE("GPL");
+static int cciss_tape_cmds = 6;
+module_param(cciss_tape_cmds, int, 0644);
+MODULE_PARM_DESC(cciss_tape_cmds,
+	"number of commands to allocate for tape devices (default: 6)");
+static int cciss_simple_mode;
+module_param(cciss_simple_mode, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cciss_simple_mode,
+	"Use 'simple mode' rather than 'performant mode'");
+
+static int cciss_allow_hpsa;	/* module parameter; 0 unless set */
+module_param(cciss_allow_hpsa, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cciss_allow_hpsa,
+	"Prevent cciss driver from accessing hardware known to be "
+	"supported by the hpsa driver");
+
+static DEFINE_MUTEX(cciss_mutex);
+static struct proc_dir_entry *proc_cciss;
 
 #include "cciss_cmd.h"
 #include "cciss.h"
@@ -92,15 +110,6 @@ static const struct pci_device_id cciss_pci_device_id[] = {
 	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSD,     0x103C, 0x3215},
 	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSC,     0x103C, 0x3237},
 	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSC,     0x103C, 0x323D},
-	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSE,     0x103C, 0x3241},
-	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSE,     0x103C, 0x3243},
-	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSE,     0x103C, 0x3245},
-	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSE,     0x103C, 0x3247},
-	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSE,     0x103C, 0x3249},
-	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSE,     0x103C, 0x324A},
-	{PCI_VENDOR_ID_HP,     PCI_DEVICE_ID_HP_CISSE,     0x103C, 0x324B},
-	{PCI_VENDOR_ID_HP,     PCI_ANY_ID,	PCI_ANY_ID, PCI_ANY_ID,
-		PCI_CLASS_STORAGE_RAID << 8, 0xffff << 8, 0},
 	{0,}
 };
 
@@ -130,15 +139,9 @@ static struct board_type products[] = {
 	{0x3214103C, "Smart Array E200i", &SA5_access},
 	{0x3215103C, "Smart Array E200i", &SA5_access},
 	{0x3237103C, "Smart Array E500", &SA5_access},
+	{0x3223103C, "Smart Array P800", &SA5_access},
+	{0x3234103C, "Smart Array P400", &SA5_access},
 	{0x323D103C, "Smart Array P700m", &SA5_access},
-	{0x3241103C, "Smart Array P212", &SA5_access},
-	{0x3243103C, "Smart Array P410", &SA5_access},
-	{0x3245103C, "Smart Array P410i", &SA5_access},
-	{0x3247103C, "Smart Array P411", &SA5_access},
-	{0x3249103C, "Smart Array P812", &SA5_access},
-	{0x324A103C, "Smart Array P712m", &SA5_access},
-	{0x324B103C, "Smart Array P711m", &SA5_access},
-	{0xFFFF103C, "Unknown Smart Array", &SA5_access},
 };
 
 /* How long to wait (in milliseconds) for board to go into simple mode */
@@ -155,47 +158,68 @@ static struct board_type products[] = {
 
 static ctlr_info_t *hba[MAX_CTLR];
 
+static struct task_struct *cciss_scan_thread;
+static DEFINE_MUTEX(scan_mutex);
+static LIST_HEAD(scan_q);
+
 static void do_cciss_request(struct request_queue *q);
-static irqreturn_t do_cciss_intr(int irq, void *dev_id);
+static irqreturn_t do_cciss_intx(int irq, void *dev_id);
+static irqreturn_t do_cciss_msix_intr(int irq, void *dev_id);
 static int cciss_open(struct block_device *bdev, fmode_t mode);
-static int cciss_release(struct gendisk *disk, fmode_t mode);
+static int cciss_unlocked_open(struct block_device *bdev, fmode_t mode);
+static void cciss_release(struct gendisk *disk, fmode_t mode);
 static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 		       unsigned int cmd, unsigned long arg);
 static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 static int cciss_revalidate(struct gendisk *disk);
-static int rebuild_lun_table(ctlr_info_t *h, int first_time);
+static int rebuild_lun_table(ctlr_info_t *h, int first_time, int via_ioctl);
 static int deregister_disk(ctlr_info_t *h, int drv_index,
-			   int clear_all);
+			   int clear_all, int via_ioctl);
 
-static void cciss_read_capacity(int ctlr, int logvol, int withirq,
+static void cciss_read_capacity(ctlr_info_t *h, int logvol,
 			sector_t *total_size, unsigned int *block_size);
-static void cciss_read_capacity_16(int ctlr, int logvol, int withirq,
+static void cciss_read_capacity_16(ctlr_info_t *h, int logvol,
 			sector_t *total_size, unsigned int *block_size);
-static void cciss_geometry_inquiry(int ctlr, int logvol,
-			int withirq, sector_t total_size,
+static void cciss_geometry_inquiry(ctlr_info_t *h, int logvol,
+			sector_t total_size,
 			unsigned int block_size, InquiryData_struct *inq_buff,
 				   drive_info_struct *drv);
-static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *,
-					   __u32);
+static void cciss_interrupt_mode(ctlr_info_t *);
+static int cciss_enter_simple_mode(struct ctlr_info *h);
 static void start_io(ctlr_info_t *h);
-static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
-		   __u8 page_code, unsigned char *scsi3addr, int cmd_type);
-static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
+static int sendcmd_withirq(ctlr_info_t *h, __u8 cmd, void *buff, size_t size,
 			__u8 page_code, unsigned char scsi3addr[],
 			int cmd_type);
 static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c,
 	int attempt_retry);
 static int process_sendcmd_error(ctlr_info_t *h, CommandList_struct *c);
 
-static void fail_all_cmds(unsigned long ctlr);
+static int add_to_scan_list(struct ctlr_info *h);
 static int scan_thread(void *data);
 static int check_for_unit_attention(ctlr_info_t *h, CommandList_struct *c);
+static void cciss_hba_release(struct device *dev);
+static void cciss_device_release(struct device *dev);
+static void cciss_free_gendisk(ctlr_info_t *h, int drv_index);
+static void cciss_free_drive_info(ctlr_info_t *h, int drv_index);
+static inline u32 next_command(ctlr_info_t *h);
+static int cciss_find_cfg_addrs(struct pci_dev *pdev, void __iomem *vaddr,
+				u32 *cfg_base_addr, u64 *cfg_base_addr_index,
+				u64 *cfg_offset);
+static int cciss_pci_find_memory_BAR(struct pci_dev *pdev,
+				     unsigned long *memory_bar);
+static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag);
+static int write_driver_ver_to_cfgtable(CfgTable_struct __iomem *cfgtable);
+
+/* performant mode helper functions */
+static void  calc_bucket_map(int *bucket, int num_buckets, int nsgs,
+				int *bucket_map);
+static void cciss_put_controller_into_performant_mode(ctlr_info_t *h);
 
 #ifdef CONFIG_PROC_FS
-static void cciss_procinit(int i);
+static void cciss_procinit(ctlr_info_t *h);
 #else
-static void cciss_procinit(int i)
+static void cciss_procinit(ctlr_info_t *h)
 {
 }
 #endif				/* CONFIG_PROC_FS */
@@ -205,11 +229,11 @@ static int cciss_compat_ioctl(struct block_device *, fmode_t,
 			      unsigned, unsigned long);
 #endif
 
-static struct block_device_operations cciss_fops = {
+static const struct block_device_operations cciss_fops = {
 	.owner = THIS_MODULE,
-	.open = cciss_open,
+	.open = cciss_unlocked_open,
 	.release = cciss_release,
-	.locked_ioctl = cciss_ioctl,
+	.ioctl = cciss_ioctl,
 	.getgeo = cciss_getgeo,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = cciss_compat_ioctl,
@@ -217,25 +241,135 @@ static struct block_device_operations cciss_fops = {
 	.revalidate_disk = cciss_revalidate,
 };
 
+/* set_performant_mode: Modify the tag for cciss performant
+ * set bit 0 for pull model, bits 3-1 for block fetch
+ * register number
+ */
+static void set_performant_mode(ctlr_info_t *h, CommandList_struct *c)
+{
+	if (likely(h->transMethod & CFGTBL_Trans_Performant))
+		c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1);
+}
+
 /*
  * Enqueuing and dequeuing functions for cmdlists.
  */
-static inline void addQ(struct hlist_head *list, CommandList_struct *c)
+static inline void addQ(struct list_head *list, CommandList_struct *c)
 {
-	hlist_add_head(&c->list, list);
+	list_add_tail(&c->list, list);
 }
 
 static inline void removeQ(CommandList_struct *c)
 {
-	if (WARN_ON(hlist_unhashed(&c->list)))
+	/*
+	 * After kexec/dump some commands might still
+	 * be in flight, which the firmware will try
+	 * to complete. Resetting the firmware doesn't work
+	 * with old fw revisions, so we have to mark
+	 * them off as 'stale' to prevent the driver from
+	 * falling over.
+	 */
+	if (WARN_ON(list_empty(&c->list))) {
+		c->cmd_type = CMD_MSG_STALE;
 		return;
+	}
+
+	list_del_init(&c->list);
+}
+
+static void enqueue_cmd_and_start_io(ctlr_info_t *h,
+	CommandList_struct *c)
+{
+	unsigned long flags;
+	set_performant_mode(h, c);
+	spin_lock_irqsave(&h->lock, flags);
+	addQ(&h->reqQ, c);
+	h->Qdepth++;
+	if (h->Qdepth > h->maxQsinceinit)
+		h->maxQsinceinit = h->Qdepth;
+	start_io(h);
+	spin_unlock_irqrestore(&h->lock, flags);
+}
 
-	hlist_del_init(&c->list);
+static void cciss_free_sg_chain_blocks(SGDescriptor_struct **cmd_sg_list,
+	int nr_cmds)
+{
+	int i;
+
+	if (!cmd_sg_list)
+		return;
+	for (i = 0; i < nr_cmds; i++) {
+		kfree(cmd_sg_list[i]);
+		cmd_sg_list[i] = NULL;
+	}
+	kfree(cmd_sg_list);
+}
+
+/* Returns nr_cmds chain blocks, or NULL if chainsize <= 0 or out of memory. */
+static SGDescriptor_struct **cciss_allocate_sg_chain_blocks(
+	ctlr_info_t *h, int chainsize, int nr_cmds)
+{
+	int j;
+	SGDescriptor_struct **cmd_sg_list;
+
+	if (chainsize <= 0)
+		return NULL;
+
+	cmd_sg_list = kcalloc(nr_cmds, sizeof(*cmd_sg_list), GFP_KERNEL);
+	if (!cmd_sg_list)
+		return NULL;
+
+	for (j = 0; j < nr_cmds; j++) {
+		/* slots past j stay NULL (kcalloc), so "clean" may kfree all */
+		cmd_sg_list[j] = kmalloc((chainsize *
+			sizeof(*cmd_sg_list[j])), GFP_KERNEL);
+		if (!cmd_sg_list[j]) {
+			dev_err(&h->pdev->dev, "Cannot get memory "
+				"for s/g chains.\n");
+			goto clean;
+		}
+	}
+	return cmd_sg_list;
+clean:
+	cciss_free_sg_chain_blocks(cmd_sg_list, nr_cmds);
+	return NULL;
+}
+
+static void cciss_unmap_sg_chain_block(ctlr_info_t *h, CommandList_struct *c)
+{
+	SGDescriptor_struct *chain_sg;
+	u64bit temp64;
+
+	if (c->Header.SGTotal <= h->max_cmd_sgentries)
+		return;
+
+	chain_sg = &c->SG[h->max_cmd_sgentries - 1];
+	temp64.val32.lower = chain_sg->Addr.lower;
+	temp64.val32.upper = chain_sg->Addr.upper;
+	pci_unmap_single(h->pdev, temp64.val, chain_sg->Len, PCI_DMA_TODEVICE);
+}
+
+static void cciss_map_sg_chain_block(ctlr_info_t *h, CommandList_struct *c,
+	SGDescriptor_struct *chain_block, int len)
+{
+	SGDescriptor_struct *chain_sg;
+	u64bit temp64;
+
+	chain_sg = &c->SG[h->max_cmd_sgentries - 1];
+	chain_sg->Ext = CCISS_SG_CHAIN;
+	chain_sg->Len = len;
+	temp64.val = pci_map_single(h->pdev, chain_block, len,
+				PCI_DMA_TODEVICE);
+	chain_sg->Addr.lower = temp64.val32.lower;
+	chain_sg->Addr.upper = temp64.val32.upper;
 }
 
 #include "cciss_scsi.c"		/* For SCSI tape support */
 
-#define RAID_UNKNOWN 6
+static const char *raid_label[] = { "0", "4", "1(1+0)", "5", "5+1", "ADG",
+	"UNKNOWN"
+};
+#define RAID_UNKNOWN (ARRAY_SIZE(raid_label)-1)
 
 #ifdef CONFIG_PROC_FS
 
@@ -245,11 +379,6 @@ static inline void removeQ(CommandList_struct *c)
 #define ENG_GIG 1000000000
 #define ENG_GIG_FACTOR (ENG_GIG/512)
 #define ENGAGE_SCSI	"engage scsi"
-static const char *raid_label[] = { "0", "4", "1(1+0)", "5", "5+1", "ADG",
-	"UNKNOWN"
-};
-
-static struct proc_dir_entry *proc_cciss;
 
 static void cciss_seq_show_header(struct seq_file *seq)
 {
@@ -269,32 +398,31 @@ static void cciss_seq_show_header(struct seq_file *seq)
 		h->product_name,
 		(unsigned long)h->board_id,
 		h->firm_ver[0], h->firm_ver[1], h->firm_ver[2],
-		h->firm_ver[3], (unsigned int)h->intr[SIMPLE_MODE_INT],
+		h->firm_ver[3], (unsigned int)h->intr[h->intr_mode],
 		h->num_luns,
 		h->Qdepth, h->commands_outstanding,
 		h->maxQsinceinit, h->max_outstanding, h->maxSG);
 
 #ifdef CONFIG_CISS_SCSI_TAPE
-	cciss_seq_tape_report(seq, h->ctlr);
+	cciss_seq_tape_report(seq, h);
 #endif /* CONFIG_CISS_SCSI_TAPE */
 }
 
 static void *cciss_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	ctlr_info_t *h = seq->private;
-	unsigned ctlr = h->ctlr;
 	unsigned long flags;
 
 	/* prevent displaying bogus info during configuration
 	 * or deconfiguration of a logical volume
 	 */
-	spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
+	spin_lock_irqsave(&h->lock, flags);
 	if (h->busy_configuring) {
-		spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
+		spin_unlock_irqrestore(&h->lock, flags);
 		return ERR_PTR(-EBUSY);
 	}
 	h->busy_configuring = 1;
-	spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
+	spin_unlock_irqrestore(&h->lock, flags);
 
 	if (*pos == 0)
 		cciss_seq_show_header(seq);
@@ -308,11 +436,14 @@ static int cciss_seq_show(struct seq_file *seq, void *v)
 	ctlr_info_t *h = seq->private;
 	unsigned ctlr = h->ctlr;
 	loff_t *pos = v;
-	drive_info_struct *drv = &h->drv[*pos];
+	drive_info_struct *drv = h->drv[*pos];
 
 	if (*pos > h->highest_lun)
 		return 0;
 
+	if (drv == NULL) /* it's possible for h->drv[] to have holes. */
+		return 0;
+
 	if (drv->heads == 0)
 		return 0;
 
@@ -321,7 +452,7 @@ static int cciss_seq_show(struct seq_file *seq, void *v)
 	vol_sz_frac *= 100;
 	sector_div(vol_sz_frac, ENG_GIG_FACTOR);
 
-	if (drv->raid_level > 5)
+	if (drv->raid_level < 0 || drv->raid_level > RAID_UNKNOWN)
 		drv->raid_level = RAID_UNKNOWN;
 	seq_printf(seq, "cciss/c%dd%d:"
 			"\t%4u.%02uGB\tRAID %s\n",
@@ -353,7 +484,7 @@ static void cciss_seq_stop(struct seq_file *seq, void *v)
 	h->busy_configuring = 0;
 }
 
-static struct seq_operations cciss_seq_ops = {
+static const struct seq_operations cciss_seq_ops = {
 	.start = cciss_seq_start,
 	.show  = cciss_seq_show,
 	.next  = cciss_seq_next,
@@ -366,7 +497,7 @@ static int cciss_seq_open(struct inode *inode, struct file *file)
 	struct seq_file *seq = file->private_data;
 
 	if (!ret)
-		seq->private = PDE(inode)->data;
+		seq->private = PDE_DATA(inode);
 
 	return ret;
 }
@@ -398,12 +529,9 @@ cciss_proc_write(struct file *file, const char __user *buf,
 	if (strncmp(ENGAGE_SCSI, buffer, sizeof ENGAGE_SCSI - 1) == 0) {
 		struct seq_file *seq = file->private_data;
 		ctlr_info_t *h = seq->private;
-		int rc;
 
-		rc = cciss_engage_scsi(h->ctlr);
-		if (rc != 0)
-			err = -rc;
-		else
+		err = cciss_engage_scsi(h);
+		if (err == 0)
 			err = length;
 	} else
 #endif /* CONFIG_CISS_SCSI_TAPE */
@@ -416,7 +544,7 @@ out:
 	return err;
 }
 
-static struct file_operations cciss_proc_fops = {
+static const struct file_operations cciss_proc_fops = {
 	.owner	 = THIS_MODULE,
 	.open    = cciss_seq_open,
 	.read    = seq_read,
@@ -425,7 +553,7 @@ static struct file_operations cciss_proc_fops = {
 	.write	 = cciss_proc_write,
 };
 
-static void __devinit cciss_procinit(int i)
+static void cciss_procinit(ctlr_info_t *h)
 {
 	struct proc_dir_entry *pde;
 
@@ -433,9 +561,9 @@ static void __devinit cciss_procinit(int i)
 		proc_cciss = proc_mkdir("driver/cciss", NULL);
 	if (!proc_cciss)
 		return;
-	pde = proc_create_data(hba[i]->devname, S_IWUSR | S_IRUSR | S_IRGRP |
+	pde = proc_create_data(h->devname, S_IWUSR | S_IRUSR | S_IRGRP |
 					S_IROTH, proc_cciss,
-					&cciss_proc_fops, hba[i]);
+					&cciss_proc_fops, h);
 }
 #endif				/* CONFIG_PROC_FS */
 
@@ -444,10 +572,92 @@ static void __devinit cciss_procinit(int i)
 #define to_hba(n) container_of(n, struct ctlr_info, dev)
 #define to_drv(n) container_of(n, drive_info_struct, dev)
 
-static struct device_type cciss_host_type = {
-	.name		= "cciss_host",
+/* List of controllers which cannot be hard reset on kexec with reset_devices */
+static u32 unresettable_controller[] = {
+	0x324a103C, /* Smart Array P712m */
+	0x324b103C, /* SmartArray P711m */
+	0x3223103C, /* Smart Array P800 */
+	0x3234103C, /* Smart Array P400 */
+	0x3235103C, /* Smart Array P400i */
+	0x3211103C, /* Smart Array E200i */
+	0x3212103C, /* Smart Array E200 */
+	0x3213103C, /* Smart Array E200i */
+	0x3214103C, /* Smart Array E200i */
+	0x3215103C, /* Smart Array E200i */
+	0x3237103C, /* Smart Array E500 */
+	0x323D103C, /* Smart Array P700m */
+	0x409C0E11, /* Smart Array 6400 */
+	0x409D0E11, /* Smart Array 6400 EM */
+};
+
+/* List of controllers which cannot even be soft reset */
+static u32 soft_unresettable_controller[] = {
+	0x409C0E11, /* Smart Array 6400 */
+	0x409D0E11, /* Smart Array 6400 EM */
 };
 
+static int ctlr_is_hard_resettable(u32 board_id)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(unresettable_controller); i++)
+		if (unresettable_controller[i] == board_id)
+			return 0;
+	return 1;
+}
+
+static int ctlr_is_soft_resettable(u32 board_id)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(soft_unresettable_controller); i++)
+		if (soft_unresettable_controller[i] == board_id)
+			return 0;
+	return 1;
+}
+
+static int ctlr_is_resettable(u32 board_id)
+{
+	return ctlr_is_hard_resettable(board_id) ||
+		ctlr_is_soft_resettable(board_id);
+}
+
+static ssize_t host_show_resettable(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct ctlr_info *h = to_hba(dev);
+
+	return snprintf(buf, 20, "%d\n", ctlr_is_resettable(h->board_id));
+}
+static DEVICE_ATTR(resettable, S_IRUGO, host_show_resettable, NULL);
+
+static ssize_t host_store_rescan(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	struct ctlr_info *h = to_hba(dev);
+
+	add_to_scan_list(h);
+	wake_up_process(cciss_scan_thread);
+	wait_for_completion_interruptible(&h->scan_wait);
+
+	return count;
+}
+static DEVICE_ATTR(rescan, S_IWUSR, NULL, host_store_rescan);
+
+static ssize_t host_show_transport_mode(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	struct ctlr_info *h = to_hba(dev);
+
+	return snprintf(buf, 20, "%s\n",
+		h->transMethod & CFGTBL_Trans_Performant ?
+			"performant" : "simple");
+}
+static DEVICE_ATTR(transport_mode, S_IRUGO, host_show_transport_mode, NULL);
+
 static ssize_t dev_show_unique_id(struct device *dev,
 				 struct device_attribute *attr,
 				 char *buf)
@@ -458,12 +668,12 @@ static ssize_t dev_show_unique_id(struct device *dev,
 	unsigned long flags;
 	int ret = 0;
 
-	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
+	spin_lock_irqsave(&h->lock, flags);
 	if (h->busy_configuring)
 		ret = -EBUSY;
 	else
 		memcpy(sn, drv->serial_no, sizeof(sn));
-	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+	spin_unlock_irqrestore(&h->lock, flags);
 
 	if (ret)
 		return ret;
@@ -476,7 +686,7 @@ static ssize_t dev_show_unique_id(struct device *dev,
 				sn[8], sn[9], sn[10], sn[11],
 				sn[12], sn[13], sn[14], sn[15]);
 }
-DEVICE_ATTR(unique_id, S_IRUGO, dev_show_unique_id, NULL);
+static DEVICE_ATTR(unique_id, S_IRUGO, dev_show_unique_id, NULL);
 
 static ssize_t dev_show_vendor(struct device *dev,
 			       struct device_attribute *attr,
@@ -488,19 +698,19 @@ static ssize_t dev_show_vendor(struct device *dev,
 	unsigned long flags;
 	int ret = 0;
 
-	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
+	spin_lock_irqsave(&h->lock, flags);
 	if (h->busy_configuring)
 		ret = -EBUSY;
 	else
 		memcpy(vendor, drv->vendor, VENDOR_LEN + 1);
-	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+	spin_unlock_irqrestore(&h->lock, flags);
 
 	if (ret)
 		return ret;
 	else
 		return snprintf(buf, sizeof(vendor) + 1, "%s\n", drv->vendor);
 }
-DEVICE_ATTR(vendor, S_IRUGO, dev_show_vendor, NULL);
+static DEVICE_ATTR(vendor, S_IRUGO, dev_show_vendor, NULL);
 
 static ssize_t dev_show_model(struct device *dev,
 			      struct device_attribute *attr,
@@ -512,19 +722,19 @@ static ssize_t dev_show_model(struct device *dev,
 	unsigned long flags;
 	int ret = 0;
 
-	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
+	spin_lock_irqsave(&h->lock, flags);
 	if (h->busy_configuring)
 		ret = -EBUSY;
 	else
 		memcpy(model, drv->model, MODEL_LEN + 1);
-	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+	spin_unlock_irqrestore(&h->lock, flags);
 
 	if (ret)
 		return ret;
 	else
 		return snprintf(buf, sizeof(model) + 1, "%s\n", drv->model);
 }
-DEVICE_ATTR(model, S_IRUGO, dev_show_model, NULL);
+static DEVICE_ATTR(model, S_IRUGO, dev_show_model, NULL);
 
 static ssize_t dev_show_rev(struct device *dev,
 			    struct device_attribute *attr,
@@ -536,25 +746,117 @@ static ssize_t dev_show_rev(struct device *dev,
 	unsigned long flags;
 	int ret = 0;
 
-	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
+	spin_lock_irqsave(&h->lock, flags);
 	if (h->busy_configuring)
 		ret = -EBUSY;
 	else
 		memcpy(rev, drv->rev, REV_LEN + 1);
-	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+	spin_unlock_irqrestore(&h->lock, flags);
 
 	if (ret)
 		return ret;
 	else
 		return snprintf(buf, sizeof(rev) + 1, "%s\n", drv->rev);
 }
-DEVICE_ATTR(rev, S_IRUGO, dev_show_rev, NULL);
+static DEVICE_ATTR(rev, S_IRUGO, dev_show_rev, NULL);
+
+static ssize_t cciss_show_lunid(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	drive_info_struct *drv = to_drv(dev);
+	struct ctlr_info *h = to_hba(drv->dev.parent);
+	unsigned long flags;
+	unsigned char lunid[8];
+
+	spin_lock_irqsave(&h->lock, flags);
+	if (h->busy_configuring) {
+		spin_unlock_irqrestore(&h->lock, flags);
+		return -EBUSY;
+	}
+	if (!drv->heads) {
+		spin_unlock_irqrestore(&h->lock, flags);
+		return -ENOTTY;
+	}
+	memcpy(lunid, drv->LunID, sizeof(lunid));
+	spin_unlock_irqrestore(&h->lock, flags);
+	return snprintf(buf, 20, "0x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+		lunid[0], lunid[1], lunid[2], lunid[3],
+		lunid[4], lunid[5], lunid[6], lunid[7]);
+}
+static DEVICE_ATTR(lunid, S_IRUGO, cciss_show_lunid, NULL);
+
+static ssize_t cciss_show_raid_level(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	drive_info_struct *drv = to_drv(dev);
+	struct ctlr_info *h = to_hba(drv->dev.parent);
+	int raid;
+	unsigned long flags;
+
+	spin_lock_irqsave(&h->lock, flags);
+	if (h->busy_configuring) {
+		spin_unlock_irqrestore(&h->lock, flags);
+		return -EBUSY;
+	}
+	raid = drv->raid_level;
+	spin_unlock_irqrestore(&h->lock, flags);
+	if (raid < 0 || raid > RAID_UNKNOWN)
+		raid = RAID_UNKNOWN;
+
+	return snprintf(buf, strlen(raid_label[raid]) + 7, "RAID %s\n",
+			raid_label[raid]);
+}
+static DEVICE_ATTR(raid_level, S_IRUGO, cciss_show_raid_level, NULL);
+
+static ssize_t cciss_show_usage_count(struct device *dev,
+				      struct device_attribute *attr, char *buf)
+{
+	drive_info_struct *drv = to_drv(dev);
+	struct ctlr_info *h = to_hba(drv->dev.parent);
+	unsigned long flags;
+	int count;
+
+	spin_lock_irqsave(&h->lock, flags);
+	if (h->busy_configuring) {
+		spin_unlock_irqrestore(&h->lock, flags);
+		return -EBUSY;
+	}
+	count = drv->usage_count;
+	spin_unlock_irqrestore(&h->lock, flags);
+	return snprintf(buf, 20, "%d\n", count);
+}
+static DEVICE_ATTR(usage_count, S_IRUGO, cciss_show_usage_count, NULL);
+
+static struct attribute *cciss_host_attrs[] = {
+	&dev_attr_rescan.attr,
+	&dev_attr_resettable.attr,
+	&dev_attr_transport_mode.attr,
+	NULL
+};
+
+static struct attribute_group cciss_host_attr_group = {
+	.attrs = cciss_host_attrs,
+};
+
+static const struct attribute_group *cciss_host_attr_groups[] = {
+	&cciss_host_attr_group,
+	NULL
+};
+
+static struct device_type cciss_host_type = {
+	.name		= "cciss_host",
+	.groups		= cciss_host_attr_groups,
+	.release	= cciss_hba_release,
+};
 
 static struct attribute *cciss_dev_attrs[] = {
 	&dev_attr_unique_id.attr,
 	&dev_attr_model.attr,
 	&dev_attr_vendor.attr,
 	&dev_attr_rev.attr,
+	&dev_attr_lunid.attr,
+	&dev_attr_raid_level.attr,
+	&dev_attr_usage_count.attr,
 	NULL
 };
 
@@ -562,7 +864,7 @@ static struct attribute_group cciss_dev_attr_group = {
 	.attrs = cciss_dev_attrs,
 };
 
-static struct attribute_group *cciss_dev_attr_groups[] = {
+static const struct attribute_group *cciss_dev_attr_groups[] = {
 	&cciss_dev_attr_group,
 	NULL
 };
@@ -570,12 +872,24 @@ static struct attribute_group *cciss_dev_attr_groups[] = {
 static struct device_type cciss_dev_type = {
 	.name		= "cciss_device",
 	.groups		= cciss_dev_attr_groups,
+	.release	= cciss_device_release,
 };
 
 static struct bus_type cciss_bus_type = {
 	.name		= "cciss",
 };
 
+/*
+ * cciss_hba_release is called when the reference count
+ * of h->dev goes to zero.
+ */
+static void cciss_hba_release(struct device *dev)
+{
+	/*
+	 * nothing to do, but need this to avoid a warning
+	 * about not having a release handler from lib/kref.c.
+	 */
+}
 
 /*
  * Initialize sysfs entry for each controller.  This sets up and registers
@@ -599,6 +913,16 @@ static int cciss_create_hba_sysfs_entry(struct ctlr_info *h)
 static void cciss_destroy_hba_sysfs_entry(struct ctlr_info *h)
 {
 	device_del(&h->dev);
+	put_device(&h->dev); /* final put. */
+}
+
+/* cciss_device_release is called when the reference count
+ * of h->drv[x]->dev goes to zero.
+ */
+static void cciss_device_release(struct device *dev)
+{
+	drive_info_struct *drv = to_drv(dev);
+	kfree(drv);
 }
 
 /*
@@ -607,85 +931,109 @@ static void cciss_destroy_hba_sysfs_entry(struct ctlr_info *h)
  * /sys/bus/pci/devices/<dev/ccis#/. We also create a link from
  * /sys/block/cciss!c#d# to this entry.
  */
-static int cciss_create_ld_sysfs_entry(struct ctlr_info *h,
-				       drive_info_struct *drv,
+static long cciss_create_ld_sysfs_entry(struct ctlr_info *h,
 				       int drv_index)
 {
-	device_initialize(&drv->dev);
-	drv->dev.type = &cciss_dev_type;
-	drv->dev.bus = &cciss_bus_type;
-	dev_set_name(&drv->dev, "c%dd%d", h->ctlr, drv_index);
-	drv->dev.parent = &h->dev;
-	return device_add(&drv->dev);
+	struct device *dev;
+
+	if (h->drv[drv_index]->device_initialized)
+		return 0;
+
+	dev = &h->drv[drv_index]->dev;
+	device_initialize(dev);
+	dev->type = &cciss_dev_type;
+	dev->bus = &cciss_bus_type;
+	dev_set_name(dev, "c%dd%d", h->ctlr, drv_index);
+	dev->parent = &h->dev;
+	h->drv[drv_index]->device_initialized = 1;
+	return device_add(dev);
 }
 
 /*
  * Remove sysfs entries for a logical drive.
  */
-static void cciss_destroy_ld_sysfs_entry(drive_info_struct *drv)
+static void cciss_destroy_ld_sysfs_entry(struct ctlr_info *h, int drv_index,
+	int ctlr_exiting)
 {
-	device_del(&drv->dev);
+	struct device *dev = &h->drv[drv_index]->dev;
+
+	/* special case for c*d0, we only destroy it on controller exit */
+	if (drv_index == 0 && !ctlr_exiting)
+		return;
+
+	device_del(dev);
+	put_device(dev); /* the "final" put. */
+	h->drv[drv_index] = NULL;
 }
 
 /*
  * For operations that cannot sleep, a command block is allocated at init,
  * and managed by cmd_alloc() and cmd_free() using a simple bitmap to track
- * which ones are free or in use.  For operations that can wait for kmalloc
- * to possible sleep, this routine can be called with get_from_pool set to 0.
- * cmd_free() MUST be called with a got_from_pool set to 0 if cmd_alloc was.
+ * which ones are free or in use.
  */
-static CommandList_struct *cmd_alloc(ctlr_info_t *h, int get_from_pool)
+static CommandList_struct *cmd_alloc(ctlr_info_t *h)
 {
 	CommandList_struct *c;
 	int i;
 	u64bit temp64;
 	dma_addr_t cmd_dma_handle, err_dma_handle;
 
-	if (!get_from_pool) {
-		c = (CommandList_struct *) pci_alloc_consistent(h->pdev,
-			sizeof(CommandList_struct), &cmd_dma_handle);
-		if (c == NULL)
+	do {
+		i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds);
+		if (i == h->nr_cmds)
 			return NULL;
-		memset(c, 0, sizeof(CommandList_struct));
+	} while (test_and_set_bit(i, h->cmd_pool_bits) != 0);
+	c = h->cmd_pool + i;
+	memset(c, 0, sizeof(CommandList_struct));
+	cmd_dma_handle = h->cmd_pool_dhandle + i * sizeof(CommandList_struct);
+	c->err_info = h->errinfo_pool + i;
+	memset(c->err_info, 0, sizeof(ErrorInfo_struct));
+	err_dma_handle = h->errinfo_pool_dhandle
+	    + i * sizeof(ErrorInfo_struct);
+	h->nr_allocs++;
 
-		c->cmdindex = -1;
+	c->cmdindex = i;
 
-		c->err_info = (ErrorInfo_struct *)
-		    pci_alloc_consistent(h->pdev, sizeof(ErrorInfo_struct),
-			    &err_dma_handle);
+	INIT_LIST_HEAD(&c->list);
+	c->busaddr = (__u32) cmd_dma_handle;
+	temp64.val = (__u64) err_dma_handle;
+	c->ErrDesc.Addr.lower = temp64.val32.lower;
+	c->ErrDesc.Addr.upper = temp64.val32.upper;
+	c->ErrDesc.Len = sizeof(ErrorInfo_struct);
 
-		if (c->err_info == NULL) {
-			pci_free_consistent(h->pdev,
-				sizeof(CommandList_struct), c, cmd_dma_handle);
-			return NULL;
-		}
-		memset(c->err_info, 0, sizeof(ErrorInfo_struct));
-	} else {		/* get it out of the controllers pool */
-
-		do {
-			i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds);
-			if (i == h->nr_cmds)
-				return NULL;
-		} while (test_and_set_bit
-			 (i & (BITS_PER_LONG - 1),
-			  h->cmd_pool_bits + (i / BITS_PER_LONG)) != 0);
-#ifdef CCISS_DEBUG
-		printk(KERN_DEBUG "cciss: using command buffer %d\n", i);
-#endif
-		c = h->cmd_pool + i;
-		memset(c, 0, sizeof(CommandList_struct));
-		cmd_dma_handle = h->cmd_pool_dhandle
-		    + i * sizeof(CommandList_struct);
-		c->err_info = h->errinfo_pool + i;
-		memset(c->err_info, 0, sizeof(ErrorInfo_struct));
-		err_dma_handle = h->errinfo_pool_dhandle
-		    + i * sizeof(ErrorInfo_struct);
-		h->nr_allocs++;
+	c->ctlr = h->ctlr;
+	return c;
+}
 
-		c->cmdindex = i;
+/* allocate a command using pci_alloc_consistent, used for ioctls,
+ * etc., not for the main i/o path.
+ */
+static CommandList_struct *cmd_special_alloc(ctlr_info_t *h)
+{
+	CommandList_struct *c;
+	u64bit temp64;
+	dma_addr_t cmd_dma_handle, err_dma_handle;
+
+	c = (CommandList_struct *) pci_alloc_consistent(h->pdev,
+		sizeof(CommandList_struct), &cmd_dma_handle);
+	if (c == NULL)
+		return NULL;
+	memset(c, 0, sizeof(CommandList_struct));
+
+	c->cmdindex = -1;
+
+	c->err_info = (ErrorInfo_struct *)
+	    pci_alloc_consistent(h->pdev, sizeof(ErrorInfo_struct),
+		    &err_dma_handle);
+
+	if (c->err_info == NULL) {
+		pci_free_consistent(h->pdev,
+			sizeof(CommandList_struct), c, cmd_dma_handle);
+		return NULL;
 	}
+	memset(c->err_info, 0, sizeof(ErrorInfo_struct));
 
-	INIT_HLIST_NODE(&c->list);
+	INIT_LIST_HEAD(&c->list);
 	c->busaddr = (__u32) cmd_dma_handle;
 	temp64.val = (__u64) err_dma_handle;
 	c->ErrDesc.Addr.lower = temp64.val32.lower;
@@ -696,27 +1044,25 @@ static CommandList_struct *cmd_alloc(ctlr_info_t *h, int get_from_pool)
 	return c;
 }
 
-/*
- * Frees a command block that was previously allocated with cmd_alloc().
- */
-static void cmd_free(ctlr_info_t *h, CommandList_struct *c, int got_from_pool)
+static void cmd_free(ctlr_info_t *h, CommandList_struct *c)
 {
 	int i;
+
+	i = c - h->cmd_pool;
+	clear_bit(i, h->cmd_pool_bits);
+	h->nr_frees++;
+}
+
+static void cmd_special_free(ctlr_info_t *h, CommandList_struct *c)
+{
 	u64bit temp64;
 
-	if (!got_from_pool) {
-		temp64.val32.lower = c->ErrDesc.Addr.lower;
-		temp64.val32.upper = c->ErrDesc.Addr.upper;
-		pci_free_consistent(h->pdev, sizeof(ErrorInfo_struct),
-				    c->err_info, (dma_addr_t) temp64.val);
-		pci_free_consistent(h->pdev, sizeof(CommandList_struct),
-				    c, (dma_addr_t) c->busaddr);
-	} else {
-		i = c - h->cmd_pool;
-		clear_bit(i & (BITS_PER_LONG - 1),
-			  h->cmd_pool_bits + (i / BITS_PER_LONG));
-		h->nr_frees++;
-	}
+	temp64.val32.lower = c->ErrDesc.Addr.lower;
+	temp64.val32.upper = c->ErrDesc.Addr.upper;
+	pci_free_consistent(h->pdev, sizeof(ErrorInfo_struct),
+			    c->err_info, (dma_addr_t) temp64.val);
+	pci_free_consistent(h->pdev, sizeof(CommandList_struct), c,
+		(dma_addr_t) cciss_tag_discard_error_bits(h, (u32) c->busaddr));
 }
 
 static inline ctlr_info_t *get_host(struct gendisk *disk)
@@ -734,14 +1080,11 @@ static inline drive_info_struct *get_drv(struct gendisk *disk)
  */
 static int cciss_open(struct block_device *bdev, fmode_t mode)
 {
-	ctlr_info_t *host = get_host(bdev->bd_disk);
+	ctlr_info_t *h = get_host(bdev->bd_disk);
 	drive_info_struct *drv = get_drv(bdev->bd_disk);
 
-#ifdef CCISS_DEBUG
-	printk(KERN_DEBUG "cciss_open %s\n", bdev->bd_disk->disk_name);
-#endif				/* CCISS_DEBUG */
-
-	if (host->busy_initializing || drv->busy_configuring)
+	dev_dbg(&h->pdev->dev, "cciss_open %s\n", bdev->bd_disk->disk_name);
+	if (drv->busy_configuring)
 		return -EBUSY;
 	/*
 	 * Root is allowed to open raw volume zero even if it's not configured
@@ -757,7 +1100,8 @@ static int cciss_open(struct block_device *bdev, fmode_t mode)
 			if (MINOR(bdev->bd_dev) & 0x0f) {
 				return -ENXIO;
 				/* if it is, make sure we have a LUN ID */
-			} else if (drv->LunID == 0) {
+			} else if (memcmp(drv->LunID, CTLR_LUNID,
+				sizeof(drv->LunID))) {
 				return -ENXIO;
 			}
 		}
@@ -765,39 +1109,40 @@ static int cciss_open(struct block_device *bdev, fmode_t mode)
 			return -EPERM;
 	}
 	drv->usage_count++;
-	host->usage_count++;
+	h->usage_count++;
 	return 0;
 }
 
+static int cciss_unlocked_open(struct block_device *bdev, fmode_t mode)
+{
+	int ret;
+
+	mutex_lock(&cciss_mutex);
+	ret = cciss_open(bdev, mode);
+	mutex_unlock(&cciss_mutex);
+
+	return ret;
+}
+
 /*
  * Close.  Sync first.
  */
-static int cciss_release(struct gendisk *disk, fmode_t mode)
+static void cciss_release(struct gendisk *disk, fmode_t mode)
 {
-	ctlr_info_t *host = get_host(disk);
-	drive_info_struct *drv = get_drv(disk);
-
-#ifdef CCISS_DEBUG
-	printk(KERN_DEBUG "cciss_release %s\n", disk->disk_name);
-#endif				/* CCISS_DEBUG */
+	ctlr_info_t *h;
+	drive_info_struct *drv;
 
+	mutex_lock(&cciss_mutex);
+	h = get_host(disk);
+	drv = get_drv(disk);
+	dev_dbg(&h->pdev->dev, "cciss_release %s\n", disk->disk_name);
 	drv->usage_count--;
-	host->usage_count--;
-	return 0;
+	h->usage_count--;
+	mutex_unlock(&cciss_mutex);
 }
 
 #ifdef CONFIG_COMPAT
 
-static int do_ioctl(struct block_device *bdev, fmode_t mode,
-		    unsigned cmd, unsigned long arg)
-{
-	int ret;
-	lock_kernel();
-	ret = cciss_ioctl(bdev, mode, cmd, arg);
-	unlock_kernel();
-	return ret;
-}
-
 static int cciss_ioctl32_passthru(struct block_device *bdev, fmode_t mode,
 				  unsigned cmd, unsigned long arg);
 static int cciss_ioctl32_big_passthru(struct block_device *bdev, fmode_t mode,
@@ -822,7 +1167,7 @@ static int cciss_compat_ioctl(struct block_device *bdev, fmode_t mode,
 	case CCISS_REGNEWD:
 	case CCISS_RESCANDISK:
 	case CCISS_GETLUNINFO:
-		return do_ioctl(bdev, mode, cmd, arg);
+		return cciss_ioctl(bdev, mode, cmd, arg);
 
 	case CCISS_PASSTHRU32:
 		return cciss_ioctl32_passthru(bdev, mode, cmd, arg);
@@ -844,6 +1189,7 @@ static int cciss_ioctl32_passthru(struct block_device *bdev, fmode_t mode,
 	int err;
 	u32 cp;
 
+	memset(&arg64, 0, sizeof(arg64));
 	err = 0;
 	err |=
 	    copy_from_user(&arg64.LUN_info, &arg32->LUN_info,
@@ -862,7 +1208,7 @@ static int cciss_ioctl32_passthru(struct block_device *bdev, fmode_t mode,
 	if (err)
 		return -EFAULT;
 
-	err = do_ioctl(bdev, mode, CCISS_PASSTHRU, (unsigned long)p);
+	err = cciss_ioctl(bdev, mode, CCISS_PASSTHRU, (unsigned long)p);
 	if (err)
 		return err;
 	err |=
@@ -884,6 +1230,7 @@ static int cciss_ioctl32_big_passthru(struct block_device *bdev, fmode_t mode,
 	int err;
 	u32 cp;
 
+	memset(&arg64, 0, sizeof(arg64));
 	err = 0;
 	err |=
 	    copy_from_user(&arg64.LUN_info, &arg32->LUN_info,
@@ -903,7 +1250,7 @@ static int cciss_ioctl32_big_passthru(struct block_device *bdev, fmode_t mode,
 	if (err)
 		return -EFAULT;
 
-	err = do_ioctl(bdev, mode, CCISS_BIG_PASSTHRU, (unsigned long)p);
+	err = cciss_ioctl(bdev, mode, CCISS_BIG_PASSTHRU, (unsigned long)p);
 	if (err)
 		return err;
 	err |=
@@ -928,494 +1275,471 @@ static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static void check_ioctl_unit_attention(ctlr_info_t *host, CommandList_struct *c)
+static void check_ioctl_unit_attention(ctlr_info_t *h, CommandList_struct *c)
 {
 	if (c->err_info->CommandStatus == CMD_TARGET_STATUS &&
 			c->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION)
-		(void)check_for_unit_attention(host, c);
+		(void)check_for_unit_attention(h, c);
 }
-/*
- * ioctl
- */
-static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
-		       unsigned int cmd, unsigned long arg)
+
+static int cciss_getpciinfo(ctlr_info_t *h, void __user *argp)
 {
-	struct gendisk *disk = bdev->bd_disk;
-	ctlr_info_t *host = get_host(disk);
-	drive_info_struct *drv = get_drv(disk);
-	int ctlr = host->ctlr;
-	void __user *argp = (void __user *)arg;
+	cciss_pci_info_struct pciinfo;
 
-#ifdef CCISS_DEBUG
-	printk(KERN_DEBUG "cciss_ioctl: Called with cmd=%x %lx\n", cmd, arg);
-#endif				/* CCISS_DEBUG */
+	if (!argp)
+		return -EINVAL;
+	pciinfo.domain = pci_domain_nr(h->pdev->bus);
+	pciinfo.bus = h->pdev->bus->number;
+	pciinfo.dev_fn = h->pdev->devfn;
+	pciinfo.board_id = h->board_id;
+	if (copy_to_user(argp, &pciinfo, sizeof(cciss_pci_info_struct)))
+		return -EFAULT;
+	return 0;
+}
 
-	switch (cmd) {
-	case CCISS_GETPCIINFO:
-		{
-			cciss_pci_info_struct pciinfo;
-
-			if (!arg)
-				return -EINVAL;
-			pciinfo.domain = pci_domain_nr(host->pdev->bus);
-			pciinfo.bus = host->pdev->bus->number;
-			pciinfo.dev_fn = host->pdev->devfn;
-			pciinfo.board_id = host->board_id;
-			if (copy_to_user
-			    (argp, &pciinfo, sizeof(cciss_pci_info_struct)))
-				return -EFAULT;
-			return 0;
-		}
-	case CCISS_GETINTINFO:
-		{
-			cciss_coalint_struct intinfo;
-			if (!arg)
-				return -EINVAL;
-			intinfo.delay =
-			    readl(&host->cfgtable->HostWrite.CoalIntDelay);
-			intinfo.count =
-			    readl(&host->cfgtable->HostWrite.CoalIntCount);
-			if (copy_to_user
-			    (argp, &intinfo, sizeof(cciss_coalint_struct)))
-				return -EFAULT;
-			return 0;
-		}
-	case CCISS_SETINTINFO:
-		{
-			cciss_coalint_struct intinfo;
-			unsigned long flags;
-			int i;
-
-			if (!arg)
-				return -EINVAL;
-			if (!capable(CAP_SYS_ADMIN))
-				return -EPERM;
-			if (copy_from_user
-			    (&intinfo, argp, sizeof(cciss_coalint_struct)))
-				return -EFAULT;
-			if ((intinfo.delay == 0) && (intinfo.count == 0))
-			{
-//                      printk("cciss_ioctl: delay and count cannot be 0\n");
-				return -EINVAL;
-			}
-			spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
-			/* Update the field, and then ring the doorbell */
-			writel(intinfo.delay,
-			       &(host->cfgtable->HostWrite.CoalIntDelay));
-			writel(intinfo.count,
-			       &(host->cfgtable->HostWrite.CoalIntCount));
-			writel(CFGTBL_ChangeReq, host->vaddr + SA5_DOORBELL);
-
-			for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
-				if (!(readl(host->vaddr + SA5_DOORBELL)
-				      & CFGTBL_ChangeReq))
-					break;
-				/* delay and try again */
-				udelay(1000);
-			}
-			spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
-			if (i >= MAX_IOCTL_CONFIG_WAIT)
-				return -EAGAIN;
-			return 0;
-		}
-	case CCISS_GETNODENAME:
-		{
-			NodeName_type NodeName;
-			int i;
-
-			if (!arg)
-				return -EINVAL;
-			for (i = 0; i < 16; i++)
-				NodeName[i] =
-				    readb(&host->cfgtable->ServerName[i]);
-			if (copy_to_user(argp, NodeName, sizeof(NodeName_type)))
-				return -EFAULT;
-			return 0;
-		}
-	case CCISS_SETNODENAME:
-		{
-			NodeName_type NodeName;
-			unsigned long flags;
-			int i;
+static int cciss_getintinfo(ctlr_info_t *h, void __user *argp)
+{
+	cciss_coalint_struct intinfo;
+	unsigned long flags;
 
-			if (!arg)
-				return -EINVAL;
-			if (!capable(CAP_SYS_ADMIN))
-				return -EPERM;
+	if (!argp)
+		return -EINVAL;
+	spin_lock_irqsave(&h->lock, flags);
+	intinfo.delay = readl(&h->cfgtable->HostWrite.CoalIntDelay);
+	intinfo.count = readl(&h->cfgtable->HostWrite.CoalIntCount);
+	spin_unlock_irqrestore(&h->lock, flags);
+	if (copy_to_user
+	    (argp, &intinfo, sizeof(cciss_coalint_struct)))
+		return -EFAULT;
+	return 0;
+}
 
-			if (copy_from_user
-			    (NodeName, argp, sizeof(NodeName_type)))
-				return -EFAULT;
+static int cciss_setintinfo(ctlr_info_t *h, void __user *argp)
+{
+	cciss_coalint_struct intinfo;
+	unsigned long flags;
+	int i;
 
-			spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
+	if (!argp)
+		return -EINVAL;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (copy_from_user(&intinfo, argp, sizeof(intinfo)))
+		return -EFAULT;
+	if ((intinfo.delay == 0) && (intinfo.count == 0))
+		return -EINVAL;
+	spin_lock_irqsave(&h->lock, flags);
+	/* Update the field, and then ring the doorbell */
+	writel(intinfo.delay, &(h->cfgtable->HostWrite.CoalIntDelay));
+	writel(intinfo.count, &(h->cfgtable->HostWrite.CoalIntCount));
+	writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
 
-			/* Update the field, and then ring the doorbell */
-			for (i = 0; i < 16; i++)
-				writeb(NodeName[i],
-				       &host->cfgtable->ServerName[i]);
+	for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
+		if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
+			break;
+		udelay(1000); /* delay and try again */
+	}
+	spin_unlock_irqrestore(&h->lock, flags);
+	if (i >= MAX_IOCTL_CONFIG_WAIT)
+		return -EAGAIN;
+	return 0;
+}
 
-			writel(CFGTBL_ChangeReq, host->vaddr + SA5_DOORBELL);
+static int cciss_getnodename(ctlr_info_t *h, void __user *argp)
+{
+	NodeName_type NodeName;
+	unsigned long flags;
+	int i;
 
-			for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
-				if (!(readl(host->vaddr + SA5_DOORBELL)
-				      & CFGTBL_ChangeReq))
-					break;
-				/* delay and try again */
-				udelay(1000);
-			}
-			spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
-			if (i >= MAX_IOCTL_CONFIG_WAIT)
-				return -EAGAIN;
-			return 0;
-		}
+	if (!argp)
+		return -EINVAL;
+	spin_lock_irqsave(&h->lock, flags);
+	for (i = 0; i < 16; i++)
+		NodeName[i] = readb(&h->cfgtable->ServerName[i]);
+	spin_unlock_irqrestore(&h->lock, flags);
+	if (copy_to_user(argp, NodeName, sizeof(NodeName_type)))
+		return -EFAULT;
+	return 0;
+}
 
-	case CCISS_GETHEARTBEAT:
-		{
-			Heartbeat_type heartbeat;
-
-			if (!arg)
-				return -EINVAL;
-			heartbeat = readl(&host->cfgtable->HeartBeat);
-			if (copy_to_user
-			    (argp, &heartbeat, sizeof(Heartbeat_type)))
-				return -EFAULT;
-			return 0;
-		}
-	case CCISS_GETBUSTYPES:
-		{
-			BusTypes_type BusTypes;
-
-			if (!arg)
-				return -EINVAL;
-			BusTypes = readl(&host->cfgtable->BusTypes);
-			if (copy_to_user
-			    (argp, &BusTypes, sizeof(BusTypes_type)))
-				return -EFAULT;
-			return 0;
-		}
-	case CCISS_GETFIRMVER:
-		{
-			FirmwareVer_type firmware;
+static int cciss_setnodename(ctlr_info_t *h, void __user *argp)
+{
+	NodeName_type NodeName;
+	unsigned long flags;
+	int i;
 
-			if (!arg)
-				return -EINVAL;
-			memcpy(firmware, host->firm_ver, 4);
+	if (!argp)
+		return -EINVAL;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (copy_from_user(NodeName, argp, sizeof(NodeName_type)))
+		return -EFAULT;
+	spin_lock_irqsave(&h->lock, flags);
+	/* Update the field, and then ring the doorbell */
+	for (i = 0; i < 16; i++)
+		writeb(NodeName[i], &h->cfgtable->ServerName[i]);
+	writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
+	for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
+		if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
+			break;
+		udelay(1000); /* delay and try again */
+	}
+	spin_unlock_irqrestore(&h->lock, flags);
+	if (i >= MAX_IOCTL_CONFIG_WAIT)
+		return -EAGAIN;
+	return 0;
+}
 
-			if (copy_to_user
-			    (argp, firmware, sizeof(FirmwareVer_type)))
-				return -EFAULT;
-			return 0;
-		}
-	case CCISS_GETDRIVVER:
-		{
-			DriverVer_type DriverVer = DRIVER_VERSION;
+static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp)
+{
+	Heartbeat_type heartbeat;
+	unsigned long flags;
 
-			if (!arg)
-				return -EINVAL;
+	if (!argp)
+		return -EINVAL;
+	spin_lock_irqsave(&h->lock, flags);
+	heartbeat = readl(&h->cfgtable->HeartBeat);
+	spin_unlock_irqrestore(&h->lock, flags);
+	if (copy_to_user(argp, &heartbeat, sizeof(Heartbeat_type)))
+		return -EFAULT;
+	return 0;
+}
 
-			if (copy_to_user
-			    (argp, &DriverVer, sizeof(DriverVer_type)))
-				return -EFAULT;
-			return 0;
-		}
+static int cciss_getbustypes(ctlr_info_t *h, void __user *argp)
+{
+	BusTypes_type BusTypes;
+	unsigned long flags;
 
-	case CCISS_DEREGDISK:
-	case CCISS_REGNEWD:
-	case CCISS_REVALIDVOLS:
-		return rebuild_lun_table(host, 0);
+	if (!argp)
+		return -EINVAL;
+	spin_lock_irqsave(&h->lock, flags);
+	BusTypes = readl(&h->cfgtable->BusTypes);
+	spin_unlock_irqrestore(&h->lock, flags);
+	if (copy_to_user(argp, &BusTypes, sizeof(BusTypes_type)))
+		return -EFAULT;
+	return 0;
+}
 
-	case CCISS_GETLUNINFO:{
-			LogvolInfo_struct luninfo;
+static int cciss_getfirmver(ctlr_info_t *h, void __user *argp)
+{
+	FirmwareVer_type firmware;
 
-			luninfo.LunID = drv->LunID;
-			luninfo.num_opens = drv->usage_count;
-			luninfo.num_parts = 0;
-			if (copy_to_user(argp, &luninfo,
-					 sizeof(LogvolInfo_struct)))
-				return -EFAULT;
-			return 0;
+	if (!argp)
+		return -EINVAL;
+	memcpy(firmware, h->firm_ver, 4);
+
+	if (copy_to_user
+	    (argp, firmware, sizeof(FirmwareVer_type)))
+		return -EFAULT;
+	return 0;
+}
+
+static int cciss_getdrivver(ctlr_info_t *h, void __user *argp)
+{
+	DriverVer_type DriverVer = DRIVER_VERSION;
+
+	if (!argp)
+		return -EINVAL;
+	if (copy_to_user(argp, &DriverVer, sizeof(DriverVer_type)))
+		return -EFAULT;
+	return 0;
+}
+
+static int cciss_getluninfo(ctlr_info_t *h,
+	struct gendisk *disk, void __user *argp)
+{
+	LogvolInfo_struct luninfo;
+	drive_info_struct *drv = get_drv(disk);
+
+	if (!argp)
+		return -EINVAL;
+	memcpy(&luninfo.LunID, drv->LunID, sizeof(luninfo.LunID));
+	luninfo.num_opens = drv->usage_count;
+	luninfo.num_parts = 0;
+	if (copy_to_user(argp, &luninfo, sizeof(LogvolInfo_struct)))
+		return -EFAULT;
+	return 0;
+}
+
+static int cciss_passthru(ctlr_info_t *h, void __user *argp)
+{
+	IOCTL_Command_struct iocommand;
+	CommandList_struct *c;
+	char *buff = NULL;
+	u64bit temp64;
+	DECLARE_COMPLETION_ONSTACK(wait);
+
+	if (!argp)
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+
+	if (copy_from_user
+	    (&iocommand, argp, sizeof(IOCTL_Command_struct)))
+		return -EFAULT;
+	if ((iocommand.buf_size < 1) &&
+	    (iocommand.Request.Type.Direction != XFER_NONE)) {
+		return -EINVAL;
+	}
+	if (iocommand.buf_size > 0) {
+		buff = kmalloc(iocommand.buf_size, GFP_KERNEL);
+		if (buff == NULL)
+			return -ENOMEM;	/* allocation failed, not a bad address */
+	}
+	if (iocommand.Request.Type.Direction == XFER_WRITE) {
+		/* Copy the data into the buffer we created */
+		if (copy_from_user(buff, iocommand.buf, iocommand.buf_size)) {
+			kfree(buff);
+			return -EFAULT;
 		}
-	case CCISS_PASSTHRU:
-		{
-			IOCTL_Command_struct iocommand;
-			CommandList_struct *c;
-			char *buff = NULL;
-			u64bit temp64;
-			unsigned long flags;
-			DECLARE_COMPLETION_ONSTACK(wait);
-
-			if (!arg)
-				return -EINVAL;
-
-			if (!capable(CAP_SYS_RAWIO))
-				return -EPERM;
-
-			if (copy_from_user
-			    (&iocommand, argp, sizeof(IOCTL_Command_struct)))
-				return -EFAULT;
-			if ((iocommand.buf_size < 1) &&
-			    (iocommand.Request.Type.Direction != XFER_NONE)) {
-				return -EINVAL;
-			}
-#if 0				/* 'buf_size' member is 16-bits, and always smaller than kmalloc limit */
-			/* Check kmalloc limits */
-			if (iocommand.buf_size > 128000)
-				return -EINVAL;
-#endif
-			if (iocommand.buf_size > 0) {
-				buff = kmalloc(iocommand.buf_size, GFP_KERNEL);
-				if (buff == NULL)
-					return -EFAULT;
-			}
-			if (iocommand.Request.Type.Direction == XFER_WRITE) {
-				/* Copy the data into the buffer we created */
-				if (copy_from_user
-				    (buff, iocommand.buf, iocommand.buf_size)) {
-					kfree(buff);
-					return -EFAULT;
-				}
-			} else {
-				memset(buff, 0, iocommand.buf_size);
-			}
-			if ((c = cmd_alloc(host, 0)) == NULL) {
-				kfree(buff);
-				return -ENOMEM;
-			}
-			// Fill in the command type
-			c->cmd_type = CMD_IOCTL_PEND;
-			// Fill in Command Header
-			c->Header.ReplyQueue = 0;	// unused in simple mode
-			if (iocommand.buf_size > 0)	// buffer to fill
-			{
-				c->Header.SGList = 1;
-				c->Header.SGTotal = 1;
-			} else	// no buffers to fill
-			{
-				c->Header.SGList = 0;
-				c->Header.SGTotal = 0;
-			}
-			c->Header.LUN = iocommand.LUN_info;
-			c->Header.Tag.lower = c->busaddr;	// use the kernel address the cmd block for tag
-
-			// Fill in Request block
-			c->Request = iocommand.Request;
-
-			// Fill in the scatter gather information
-			if (iocommand.buf_size > 0) {
-				temp64.val = pci_map_single(host->pdev, buff,
-					iocommand.buf_size,
-					PCI_DMA_BIDIRECTIONAL);
-				c->SG[0].Addr.lower = temp64.val32.lower;
-				c->SG[0].Addr.upper = temp64.val32.upper;
-				c->SG[0].Len = iocommand.buf_size;
-				c->SG[0].Ext = 0;	// we are not chaining
-			}
-			c->waiting = &wait;
-
-			/* Put the request on the tail of the request queue */
-			spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
-			addQ(&host->reqQ, c);
-			host->Qdepth++;
-			start_io(host);
-			spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
-
-			wait_for_completion(&wait);
-
-			/* unlock the buffers from DMA */
-			temp64.val32.lower = c->SG[0].Addr.lower;
-			temp64.val32.upper = c->SG[0].Addr.upper;
-			pci_unmap_single(host->pdev, (dma_addr_t) temp64.val,
-					 iocommand.buf_size,
-					 PCI_DMA_BIDIRECTIONAL);
-
-			check_ioctl_unit_attention(host, c);
-
-			/* Copy the error information out */
-			iocommand.error_info = *(c->err_info);
-			if (copy_to_user
-			    (argp, &iocommand, sizeof(IOCTL_Command_struct))) {
-				kfree(buff);
-				cmd_free(host, c, 0);
-				return -EFAULT;
-			}
+	} else {
+		memset(buff, 0, iocommand.buf_size);
+	}
+	c = cmd_special_alloc(h);
+	if (!c) {
+		kfree(buff);
+		return -ENOMEM;
+	}
+	/* Fill in the command type */
+	c->cmd_type = CMD_IOCTL_PEND;
+	/* Fill in Command Header */
+	c->Header.ReplyQueue = 0;   /* unused in simple mode */
+	if (iocommand.buf_size > 0) { /* buffer to fill */
+		c->Header.SGList = 1;
+		c->Header.SGTotal = 1;
+	} else { /* no buffers to fill */
+		c->Header.SGList = 0;
+		c->Header.SGTotal = 0;
+	}
+	c->Header.LUN = iocommand.LUN_info;
+	/* use the address of the cmd block (c->busaddr) as the tag */
+	c->Header.Tag.lower = c->busaddr;
 
-			if (iocommand.Request.Type.Direction == XFER_READ) {
-				/* Copy the data out of the buffer we created */
-				if (copy_to_user
-				    (iocommand.buf, buff, iocommand.buf_size)) {
-					kfree(buff);
-					cmd_free(host, c, 0);
-					return -EFAULT;
-				}
-			}
+	/* Fill in Request block */
+	c->Request = iocommand.Request;
+
+	/* Fill in the scatter gather information */
+	if (iocommand.buf_size > 0) {
+		temp64.val = pci_map_single(h->pdev, buff,
+			iocommand.buf_size, PCI_DMA_BIDIRECTIONAL);
+		c->SG[0].Addr.lower = temp64.val32.lower;
+		c->SG[0].Addr.upper = temp64.val32.upper;
+		c->SG[0].Len = iocommand.buf_size;
+		c->SG[0].Ext = 0;  /* we are not chaining */
+	}
+	c->waiting = &wait;
+
+	enqueue_cmd_and_start_io(h, c);
+	wait_for_completion(&wait);
+
+	/* unlock the buffers from DMA */
+	temp64.val32.lower = c->SG[0].Addr.lower;
+	temp64.val32.upper = c->SG[0].Addr.upper;
+	pci_unmap_single(h->pdev, (dma_addr_t) temp64.val, iocommand.buf_size,
+			 PCI_DMA_BIDIRECTIONAL);
+	check_ioctl_unit_attention(h, c);
+
+	/* Copy the error information out */
+	iocommand.error_info = *(c->err_info);
+	if (copy_to_user(argp, &iocommand, sizeof(IOCTL_Command_struct))) {
+		kfree(buff);
+		cmd_special_free(h, c);
+		return -EFAULT;
+	}
+
+	if (iocommand.Request.Type.Direction == XFER_READ) {
+		/* Copy the data out of the buffer we created */
+		if (copy_to_user(iocommand.buf, buff, iocommand.buf_size)) {
 			kfree(buff);
-			cmd_free(host, c, 0);
-			return 0;
+			cmd_special_free(h, c);
+			return -EFAULT;
 		}
-	case CCISS_BIG_PASSTHRU:{
-			BIG_IOCTL_Command_struct *ioc;
-			CommandList_struct *c;
-			unsigned char **buff = NULL;
-			int *buff_size = NULL;
-			u64bit temp64;
-			unsigned long flags;
-			BYTE sg_used = 0;
-			int status = 0;
-			int i;
-			DECLARE_COMPLETION_ONSTACK(wait);
-			__u32 left;
-			__u32 sz;
-			BYTE __user *data_ptr;
-
-			if (!arg)
-				return -EINVAL;
-			if (!capable(CAP_SYS_RAWIO))
-				return -EPERM;
-			ioc = (BIG_IOCTL_Command_struct *)
-			    kmalloc(sizeof(*ioc), GFP_KERNEL);
-			if (!ioc) {
-				status = -ENOMEM;
-				goto cleanup1;
-			}
-			if (copy_from_user(ioc, argp, sizeof(*ioc))) {
+	}
+	kfree(buff);
+	cmd_special_free(h, c);
+	return 0;
+}
+
+static int cciss_bigpassthru(ctlr_info_t *h, void __user *argp)
+{
+	BIG_IOCTL_Command_struct *ioc;
+	CommandList_struct *c;
+	unsigned char **buff = NULL;
+	int *buff_size = NULL;
+	u64bit temp64;
+	BYTE sg_used = 0;
+	int status = 0;
+	int i;
+	DECLARE_COMPLETION_ONSTACK(wait);
+	__u32 left;
+	__u32 sz;
+	BYTE __user *data_ptr;
+
+	if (!argp)
+		return -EINVAL;
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+	ioc = kmalloc(sizeof(*ioc), GFP_KERNEL);
+	if (!ioc) {
+		status = -ENOMEM;
+		goto cleanup1;
+	}
+	if (copy_from_user(ioc, argp, sizeof(*ioc))) {
+		status = -EFAULT;
+		goto cleanup1;
+	}
+	if ((ioc->buf_size < 1) &&
+	    (ioc->Request.Type.Direction != XFER_NONE)) {
+		status = -EINVAL;
+		goto cleanup1;
+	}
+	/* Check kmalloc limits  using all SGs */
+	if (ioc->malloc_size > MAX_KMALLOC_SIZE) {
+		status = -EINVAL;
+		goto cleanup1;
+	}
+	if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) {
+		status = -EINVAL;
+		goto cleanup1;
+	}
+	buff = kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL);
+	if (!buff) {
+		status = -ENOMEM;
+		goto cleanup1;
+	}
+	buff_size = kmalloc(MAXSGENTRIES * sizeof(int), GFP_KERNEL);
+	if (!buff_size) {
+		status = -ENOMEM;
+		goto cleanup1;
+	}
+	left = ioc->buf_size;
+	data_ptr = ioc->buf;
+	while (left) {
+		sz = (left > ioc->malloc_size) ? ioc->malloc_size : left;
+		buff_size[sg_used] = sz;
+		buff[sg_used] = kmalloc(sz, GFP_KERNEL);
+		if (buff[sg_used] == NULL) {
+			status = -ENOMEM;
+			goto cleanup1;
+		}
+		if (ioc->Request.Type.Direction == XFER_WRITE) {
+			if (copy_from_user(buff[sg_used], data_ptr, sz)) {
 				status = -EFAULT;
 				goto cleanup1;
 			}
-			if ((ioc->buf_size < 1) &&
-			    (ioc->Request.Type.Direction != XFER_NONE)) {
-				status = -EINVAL;
-				goto cleanup1;
-			}
-			/* Check kmalloc limits  using all SGs */
-			if (ioc->malloc_size > MAX_KMALLOC_SIZE) {
-				status = -EINVAL;
-				goto cleanup1;
-			}
-			if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) {
-				status = -EINVAL;
-				goto cleanup1;
-			}
-			buff =
-			    kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL);
-			if (!buff) {
-				status = -ENOMEM;
-				goto cleanup1;
-			}
-			buff_size = kmalloc(MAXSGENTRIES * sizeof(int),
-						   GFP_KERNEL);
-			if (!buff_size) {
-				status = -ENOMEM;
-				goto cleanup1;
-			}
-			left = ioc->buf_size;
-			data_ptr = ioc->buf;
-			while (left) {
-				sz = (left >
-				      ioc->malloc_size) ? ioc->
-				    malloc_size : left;
-				buff_size[sg_used] = sz;
-				buff[sg_used] = kmalloc(sz, GFP_KERNEL);
-				if (buff[sg_used] == NULL) {
-					status = -ENOMEM;
-					goto cleanup1;
-				}
-				if (ioc->Request.Type.Direction == XFER_WRITE) {
-					if (copy_from_user
-					    (buff[sg_used], data_ptr, sz)) {
-						status = -EFAULT;
-						goto cleanup1;
-					}
-				} else {
-					memset(buff[sg_used], 0, sz);
-				}
-				left -= sz;
-				data_ptr += sz;
-				sg_used++;
-			}
-			if ((c = cmd_alloc(host, 0)) == NULL) {
-				status = -ENOMEM;
-				goto cleanup1;
-			}
-			c->cmd_type = CMD_IOCTL_PEND;
-			c->Header.ReplyQueue = 0;
+		} else {
+			memset(buff[sg_used], 0, sz);
+		}
+		left -= sz;
+		data_ptr += sz;
+		sg_used++;
+	}
+	c = cmd_special_alloc(h);
+	if (!c) {
+		status = -ENOMEM;
+		goto cleanup1;
+	}
+	c->cmd_type = CMD_IOCTL_PEND;
+	c->Header.ReplyQueue = 0;
+	c->Header.SGList = sg_used;
+	c->Header.SGTotal = sg_used;
+	c->Header.LUN = ioc->LUN_info;
+	c->Header.Tag.lower = c->busaddr;
 
-			if (ioc->buf_size > 0) {
-				c->Header.SGList = sg_used;
-				c->Header.SGTotal = sg_used;
-			} else {
-				c->Header.SGList = 0;
-				c->Header.SGTotal = 0;
-			}
-			c->Header.LUN = ioc->LUN_info;
-			c->Header.Tag.lower = c->busaddr;
-
-			c->Request = ioc->Request;
-			if (ioc->buf_size > 0) {
-				int i;
-				for (i = 0; i < sg_used; i++) {
-					temp64.val =
-					    pci_map_single(host->pdev, buff[i],
-						    buff_size[i],
-						    PCI_DMA_BIDIRECTIONAL);
-					c->SG[i].Addr.lower =
-					    temp64.val32.lower;
-					c->SG[i].Addr.upper =
-					    temp64.val32.upper;
-					c->SG[i].Len = buff_size[i];
-					c->SG[i].Ext = 0;	/* we are not chaining */
-				}
-			}
-			c->waiting = &wait;
-			/* Put the request on the tail of the request queue */
-			spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
-			addQ(&host->reqQ, c);
-			host->Qdepth++;
-			start_io(host);
-			spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
-			wait_for_completion(&wait);
-			/* unlock the buffers from DMA */
-			for (i = 0; i < sg_used; i++) {
-				temp64.val32.lower = c->SG[i].Addr.lower;
-				temp64.val32.upper = c->SG[i].Addr.upper;
-				pci_unmap_single(host->pdev,
-					(dma_addr_t) temp64.val, buff_size[i],
-					PCI_DMA_BIDIRECTIONAL);
-			}
-			check_ioctl_unit_attention(host, c);
-			/* Copy the error information out */
-			ioc->error_info = *(c->err_info);
-			if (copy_to_user(argp, ioc, sizeof(*ioc))) {
-				cmd_free(host, c, 0);
+	c->Request = ioc->Request;
+	for (i = 0; i < sg_used; i++) {
+		temp64.val = pci_map_single(h->pdev, buff[i], buff_size[i],
+				    PCI_DMA_BIDIRECTIONAL);
+		c->SG[i].Addr.lower = temp64.val32.lower;
+		c->SG[i].Addr.upper = temp64.val32.upper;
+		c->SG[i].Len = buff_size[i];
+		c->SG[i].Ext = 0;	/* we are not chaining */
+	}
+	c->waiting = &wait;
+	enqueue_cmd_and_start_io(h, c);
+	wait_for_completion(&wait);
+	/* unlock the buffers from DMA */
+	for (i = 0; i < sg_used; i++) {
+		temp64.val32.lower = c->SG[i].Addr.lower;
+		temp64.val32.upper = c->SG[i].Addr.upper;
+		pci_unmap_single(h->pdev,
+			(dma_addr_t) temp64.val, buff_size[i],
+			PCI_DMA_BIDIRECTIONAL);
+	}
+	check_ioctl_unit_attention(h, c);
+	/* Copy the error information out */
+	ioc->error_info = *(c->err_info);
+	if (copy_to_user(argp, ioc, sizeof(*ioc))) {
+		cmd_special_free(h, c);
+		status = -EFAULT;
+		goto cleanup1;
+	}
+	if (ioc->Request.Type.Direction == XFER_READ) {
+		/* Copy the data out of the buffer we created */
+		BYTE __user *ptr = ioc->buf;
+		for (i = 0; i < sg_used; i++) {
+			if (copy_to_user(ptr, buff[i], buff_size[i])) {
+				cmd_special_free(h, c);
 				status = -EFAULT;
 				goto cleanup1;
 			}
-			if (ioc->Request.Type.Direction == XFER_READ) {
-				/* Copy the data out of the buffer we created */
-				BYTE __user *ptr = ioc->buf;
-				for (i = 0; i < sg_used; i++) {
-					if (copy_to_user
-					    (ptr, buff[i], buff_size[i])) {
-						cmd_free(host, c, 0);
-						status = -EFAULT;
-						goto cleanup1;
-					}
-					ptr += buff_size[i];
-				}
-			}
-			cmd_free(host, c, 0);
-			status = 0;
-		      cleanup1:
-			if (buff) {
-				for (i = 0; i < sg_used; i++)
-					kfree(buff[i]);
-				kfree(buff);
-			}
-			kfree(buff_size);
-			kfree(ioc);
-			return status;
+			ptr += buff_size[i];
 		}
+	}
+	cmd_special_free(h, c);
+	status = 0;
+cleanup1:
+	if (buff) {
+		for (i = 0; i < MAXSGENTRIES; i++) /* unused slots are NULL */
+			kfree(buff[i]);
+		kfree(buff);
+	}
+	kfree(buff_size);
+	kfree(ioc);
+	return status;
+}
+
+static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
+	unsigned int cmd, unsigned long arg)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	ctlr_info_t *h = get_host(disk);
+	void __user *argp = (void __user *)arg;
 
-	/* scsi_cmd_ioctl handles these, below, though some are not */
+	dev_dbg(&h->pdev->dev, "cciss_ioctl: Called with cmd=%x %lx\n",
+		cmd, arg);
+	switch (cmd) {
+	case CCISS_GETPCIINFO:
+		return cciss_getpciinfo(h, argp);
+	case CCISS_GETINTINFO:
+		return cciss_getintinfo(h, argp);
+	case CCISS_SETINTINFO:
+		return cciss_setintinfo(h, argp);
+	case CCISS_GETNODENAME:
+		return cciss_getnodename(h, argp);
+	case CCISS_SETNODENAME:
+		return cciss_setnodename(h, argp);
+	case CCISS_GETHEARTBEAT:
+		return cciss_getheartbeat(h, argp);
+	case CCISS_GETBUSTYPES:
+		return cciss_getbustypes(h, argp);
+	case CCISS_GETFIRMVER:
+		return cciss_getfirmver(h, argp);
+	case CCISS_GETDRIVVER:
+		return cciss_getdrivver(h, argp);
+	case CCISS_DEREGDISK:
+	case CCISS_REGNEWD:
+	case CCISS_REVALIDVOLS:
+		return rebuild_lun_table(h, 0, 1);
+	case CCISS_GETLUNINFO:
+		return cciss_getluninfo(h, disk, argp);
+	case CCISS_PASSTHRU:
+		return cciss_passthru(h, argp);
+	case CCISS_BIG_PASSTHRU:
+		return cciss_bigpassthru(h, argp);
+
+	/* scsi_cmd_blk_ioctl handles these, below, though some are not */
 	/* very meaningful for cciss.  SG_IO is the main one people want. */
 
 	case SG_GET_VERSION_NUM:
@@ -1426,9 +1750,9 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	case SG_EMULATED_HOST:
 	case SG_IO:
 	case SCSI_IOCTL_SEND_COMMAND:
-		return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, argp);
+		return scsi_cmd_blk_ioctl(bdev, mode, cmd, argp);
 
-	/* scsi_cmd_ioctl would normally handle these, below, but */
+	/* scsi_cmd_blk_ioctl would normally handle these, below, but */
 	/* they aren't a good fit for cciss, as CD-ROMs are */
 	/* not supported, and we don't have any bus/target/lun */
 	/* which we present to the kernel. */
@@ -1465,7 +1789,10 @@ static void cciss_check_queues(ctlr_info_t *h)
 		/* make sure the disk has been added and the drive is real
 		 * because this can be called from the middle of init_one.
 		 */
-		if (!(h->drv[curr_queue].queue) || !(h->drv[curr_queue].heads))
+		if (!h->drv[curr_queue])
+			continue;
+		if (!(h->drv[curr_queue]->queue) ||
+			!(h->drv[curr_queue]->heads))
 			continue;
 		blk_start_queue(h->gendisk[curr_queue]->queue);
 
@@ -1487,55 +1814,61 @@ static void cciss_check_queues(ctlr_info_t *h)
 
 static void cciss_softirq_done(struct request *rq)
 {
-	CommandList_struct *cmd = rq->completion_data;
-	ctlr_info_t *h = hba[cmd->ctlr];
-	unsigned long flags;
+	CommandList_struct *c = rq->completion_data;
+	ctlr_info_t *h = hba[c->ctlr];
+	SGDescriptor_struct *curr_sg = c->SG;
 	u64bit temp64;
+	unsigned long flags;
 	int i, ddir;
+	int sg_index = 0;
 
-	if (cmd->Request.Type.Direction == XFER_READ)
+	if (c->Request.Type.Direction == XFER_READ)
 		ddir = PCI_DMA_FROMDEVICE;
 	else
 		ddir = PCI_DMA_TODEVICE;
 
 	/* command did not need to be retried */
 	/* unmap the DMA mapping for all the scatter gather elements */
-	for (i = 0; i < cmd->Header.SGList; i++) {
-		temp64.val32.lower = cmd->SG[i].Addr.lower;
-		temp64.val32.upper = cmd->SG[i].Addr.upper;
-		pci_unmap_page(h->pdev, temp64.val, cmd->SG[i].Len, ddir);
+	for (i = 0; i < c->Header.SGList; i++) {
+		if (curr_sg[sg_index].Ext == CCISS_SG_CHAIN) {
+			cciss_unmap_sg_chain_block(h, c);
+			/* Point to the next block */
+			curr_sg = h->cmd_sg_list[c->cmdindex];
+			sg_index = 0;
+		}
+		temp64.val32.lower = curr_sg[sg_index].Addr.lower;
+		temp64.val32.upper = curr_sg[sg_index].Addr.upper;
+		pci_unmap_page(h->pdev, temp64.val, curr_sg[sg_index].Len,
+				ddir);
+		++sg_index;
 	}
 
-#ifdef CCISS_DEBUG
-	printk("Done with %p\n", rq);
-#endif				/* CCISS_DEBUG */
+	dev_dbg(&h->pdev->dev, "Done with %p\n", rq);
 
 	/* set the residual count for pc requests */
-	if (blk_pc_request(rq))
-		rq->resid_len = cmd->err_info->ResidualCnt;
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
+		rq->resid_len = c->err_info->ResidualCnt;
 
 	blk_end_request_all(rq, (rq->errors == 0) ? 0 : -EIO);
 
 	spin_lock_irqsave(&h->lock, flags);
-	cmd_free(h, cmd, 1);
+	cmd_free(h, c);
 	cciss_check_queues(h);
 	spin_unlock_irqrestore(&h->lock, flags);
 }
 
-static void log_unit_to_scsi3addr(ctlr_info_t *h, unsigned char scsi3addr[],
-	uint32_t log_unit)
+static inline void log_unit_to_scsi3addr(ctlr_info_t *h,
+	unsigned char scsi3addr[], uint32_t log_unit)
 {
-	log_unit = h->drv[log_unit].LunID & 0x03fff;
-	memset(&scsi3addr[4], 0, 4);
-	memcpy(&scsi3addr[0], &log_unit, 4);
-	scsi3addr[3] |= 0x40;
+	memcpy(scsi3addr, h->drv[log_unit]->LunID,
+		sizeof(h->drv[log_unit]->LunID));
 }
 
 /* This function gets the SCSI vendor, model, and revision of a logical drive
  * via the inquiry page 0.  Model, vendor, and rev are set to empty strings if
  * they cannot be read.
  */
-static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
+static void cciss_get_device_descr(ctlr_info_t *h, int logvol,
 				   char *vendor, char *model, char *rev)
 {
 	int rc;
@@ -1550,15 +1883,9 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
 	if (!inq_buf)
 		return;
 
-	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
-	if (withirq)
-		rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf,
-			     sizeof(InquiryData_struct), 0,
-				scsi3addr, TYPE_CMD);
-	else
-		rc = sendcmd(CISS_INQUIRY, ctlr, inq_buf,
-			     sizeof(InquiryData_struct), 0,
-				scsi3addr, TYPE_CMD);
+	log_unit_to_scsi3addr(h, scsi3addr, logvol);
+	rc = sendcmd_withirq(h, CISS_INQUIRY, inq_buf, sizeof(*inq_buf), 0,
+			scsi3addr, TYPE_CMD);
 	if (rc == IO_OK) {
 		memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN);
 		vendor[VENDOR_LEN] = '\0';
@@ -1577,7 +1904,7 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
  * number cannot be had, for whatever reason, 16 bytes of 0xff
  * are returned instead.
  */
-static void cciss_get_serial_no(int ctlr, int logvol, int withirq,
+static void cciss_get_serial_no(ctlr_info_t *h, int logvol,
 				unsigned char *serial_no, int buflen)
 {
 #define PAGE_83_INQ_BYTES 64
@@ -1592,54 +1919,61 @@ static void cciss_get_serial_no(int ctlr, int logvol, int withirq,
 	if (!buf)
 		return;
 	memset(serial_no, 0, buflen);
-	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
-	if (withirq)
-		rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf,
-			PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
-	else
-		rc = sendcmd(CISS_INQUIRY, ctlr, buf,
-			PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
+	log_unit_to_scsi3addr(h, scsi3addr, logvol);
+	rc = sendcmd_withirq(h, CISS_INQUIRY, buf,
+		PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
 	if (rc == IO_OK)
 		memcpy(serial_no, &buf[8], buflen);
 	kfree(buf);
 	return;
 }
 
-static void cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
+/*
+ * cciss_add_disk sets up the block device queue for a logical drive
+ */
+static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
 				int drv_index)
 {
 	disk->queue = blk_init_queue(do_cciss_request, &h->lock);
+	if (!disk->queue)
+		goto init_queue_failure;
 	sprintf(disk->disk_name, "cciss/c%dd%d", h->ctlr, drv_index);
 	disk->major = h->major;
 	disk->first_minor = drv_index << NWD_SHIFT;
 	disk->fops = &cciss_fops;
-	disk->private_data = &h->drv[drv_index];
-	disk->driverfs_dev = &h->drv[drv_index].dev;
+	if (cciss_create_ld_sysfs_entry(h, drv_index))
+		goto cleanup_queue;
+	disk->private_data = h->drv[drv_index];
+	disk->driverfs_dev = &h->drv[drv_index]->dev;
 
 	/* Set up queue information */
 	blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask);
 
 	/* This is a hardware imposed limit. */
-	blk_queue_max_hw_segments(disk->queue, MAXSGENTRIES);
-
-	/* This is a limit in the driver and could be eliminated. */
-	blk_queue_max_phys_segments(disk->queue, MAXSGENTRIES);
+	blk_queue_max_segments(disk->queue, h->maxsgentries);
 
-	blk_queue_max_sectors(disk->queue, h->cciss_max_sectors);
+	blk_queue_max_hw_sectors(disk->queue, h->cciss_max_sectors);
 
 	blk_queue_softirq_done(disk->queue, cciss_softirq_done);
 
 	disk->queue->queuedata = h;
 
 	blk_queue_logical_block_size(disk->queue,
-				     h->drv[drv_index].block_size);
+				     h->drv[drv_index]->block_size);
 
 	/* Make sure all queue data is written out before */
-	/* setting h->drv[drv_index].queue, as setting this */
+	/* setting h->drv[drv_index]->queue, as setting this */
 	/* allows the interrupt handler to start the queue */
 	wmb();
-	h->drv[drv_index].queue = disk->queue;
+	h->drv[drv_index]->queue = disk->queue;
 	add_disk(disk);
+	return 0;
+
+cleanup_queue:
+	blk_cleanup_queue(disk->queue);
+	disk->queue = NULL;
+init_queue_failure:
+	return -1;
 }
 
 /* This function will check the usage_count of the drive to be updated/added.
@@ -1652,9 +1986,9 @@ static void cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
  * is also the controller node.  Any changes to disk 0 will show up on
  * the next reboot.
  */
-static void cciss_update_drive_info(int ctlr, int drv_index, int first_time)
+static void cciss_update_drive_info(ctlr_info_t *h, int drv_index,
+	int first_time, int via_ioctl)
 {
-	ctlr_info_t *h = hba[ctlr];
 	struct gendisk *disk;
 	InquiryData_struct *inq_buff = NULL;
 	unsigned int block_size;
@@ -1662,35 +1996,25 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time)
 	unsigned long flags = 0;
 	int ret = 0;
 	drive_info_struct *drvinfo;
-	int was_only_controller_node;
 
 	/* Get information about the disk and modify the driver structure */
 	inq_buff = kmalloc(sizeof(InquiryData_struct), GFP_KERNEL);
-	drvinfo = kmalloc(sizeof(*drvinfo), GFP_KERNEL);
+	drvinfo = kzalloc(sizeof(*drvinfo), GFP_KERNEL);
 	if (inq_buff == NULL || drvinfo == NULL)
 		goto mem_msg;
 
-	/* See if we're trying to update the "controller node"
-	 * this will happen the when the first logical drive gets
-	 * created by ACU.
-	 */
-	was_only_controller_node = (drv_index == 0 &&
-				h->drv[0].raid_level == -1);
-
 	/* testing to see if 16-byte CDBs are already being used */
 	if (h->cciss_read == CCISS_READ_16) {
-		cciss_read_capacity_16(h->ctlr, drv_index, 1,
+		cciss_read_capacity_16(h, drv_index,
 			&total_size, &block_size);
 
 	} else {
-		cciss_read_capacity(ctlr, drv_index, 1,
-				    &total_size, &block_size);
-
+		cciss_read_capacity(h, drv_index, &total_size, &block_size);
 		/* if read_capacity returns all F's this volume is >2TB */
 		/* in size so we switch to 16-byte CDB's for all */
 		/* read/write ops */
 		if (total_size == 0xFFFFFFFFULL) {
-			cciss_read_capacity_16(ctlr, drv_index, 1,
+			cciss_read_capacity_16(h, drv_index,
 			&total_size, &block_size);
 			h->cciss_read = CCISS_READ_16;
 			h->cciss_write = CCISS_WRITE_16;
@@ -1700,25 +2024,28 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time)
 		}
 	}
 
-	cciss_geometry_inquiry(ctlr, drv_index, 1, total_size, block_size,
+	cciss_geometry_inquiry(h, drv_index, total_size, block_size,
 			       inq_buff, drvinfo);
 	drvinfo->block_size = block_size;
 	drvinfo->nr_blocks = total_size + 1;
 
-	cciss_get_device_descr(ctlr, drv_index, 1, drvinfo->vendor,
+	cciss_get_device_descr(h, drv_index, drvinfo->vendor,
 				drvinfo->model, drvinfo->rev);
-	cciss_get_serial_no(ctlr, drv_index, 1, drvinfo->serial_no,
+	cciss_get_serial_no(h, drv_index, drvinfo->serial_no,
 			sizeof(drvinfo->serial_no));
+	/* Save the lunid in case we deregister the disk, below. */
+	memcpy(drvinfo->LunID, h->drv[drv_index]->LunID,
+		sizeof(drvinfo->LunID));
 
 	/* Is it the same disk we already know, and nothing's changed? */
-	if (h->drv[drv_index].raid_level != -1 &&
+	if (h->drv[drv_index]->raid_level != -1 &&
 		((memcmp(drvinfo->serial_no,
-				h->drv[drv_index].serial_no, 16) == 0) &&
-		drvinfo->block_size == h->drv[drv_index].block_size &&
-		drvinfo->nr_blocks == h->drv[drv_index].nr_blocks &&
-		drvinfo->heads == h->drv[drv_index].heads &&
-		drvinfo->sectors == h->drv[drv_index].sectors &&
-		drvinfo->cylinders == h->drv[drv_index].cylinders))
+				h->drv[drv_index]->serial_no, 16) == 0) &&
+		drvinfo->block_size == h->drv[drv_index]->block_size &&
+		drvinfo->nr_blocks == h->drv[drv_index]->nr_blocks &&
+		drvinfo->heads == h->drv[drv_index]->heads &&
+		drvinfo->sectors == h->drv[drv_index]->sectors &&
+		drvinfo->cylinders == h->drv[drv_index]->cylinders))
 			/* The disk is unchanged, nothing to update */
 			goto freeret;
 
@@ -1728,18 +2055,17 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time)
 	 * If the disk already exists then deregister it before proceeding
 	 * (unless it's the first disk (for the controller node).
 	 */
-	if (h->drv[drv_index].raid_level != -1 && drv_index != 0) {
-		printk(KERN_WARNING "disk %d has changed.\n", drv_index);
-		spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
-		h->drv[drv_index].busy_configuring = 1;
-		spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+	if (h->drv[drv_index]->raid_level != -1 && drv_index != 0) {
+		dev_warn(&h->pdev->dev, "disk %d has changed.\n", drv_index);
+		spin_lock_irqsave(&h->lock, flags);
+		h->drv[drv_index]->busy_configuring = 1;
+		spin_unlock_irqrestore(&h->lock, flags);
 
-		/* deregister_disk sets h->drv[drv_index].queue = NULL
+		/* deregister_disk sets h->drv[drv_index]->queue = NULL
 		 * which keeps the interrupt handler from starting
 		 * the queue.
 		 */
-		ret = deregister_disk(h, drv_index, 0);
-		h->drv[drv_index].busy_configuring = 0;
+		ret = deregister_disk(h, drv_index, 0, via_ioctl);
 	}
 
 	/* If the disk is in use return */
@@ -1747,22 +2073,31 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time)
 		goto freeret;
 
 	/* Save the new information from cciss_geometry_inquiry
-	 * and serial number inquiry.
+	 * and serial number inquiry.  If the disk was deregistered
+	 * above, then h->drv[drv_index] will be NULL.
 	 */
-	h->drv[drv_index].block_size = drvinfo->block_size;
-	h->drv[drv_index].nr_blocks = drvinfo->nr_blocks;
-	h->drv[drv_index].heads = drvinfo->heads;
-	h->drv[drv_index].sectors = drvinfo->sectors;
-	h->drv[drv_index].cylinders = drvinfo->cylinders;
-	h->drv[drv_index].raid_level = drvinfo->raid_level;
-	memcpy(h->drv[drv_index].serial_no, drvinfo->serial_no, 16);
-	memcpy(h->drv[drv_index].vendor, drvinfo->vendor, VENDOR_LEN + 1);
-	memcpy(h->drv[drv_index].model, drvinfo->model, MODEL_LEN + 1);
-	memcpy(h->drv[drv_index].rev, drvinfo->rev, REV_LEN + 1);
+	if (h->drv[drv_index] == NULL) {
+		drvinfo->device_initialized = 0;
+		h->drv[drv_index] = drvinfo;
+		drvinfo = NULL; /* so it won't be freed below. */
+	} else {
+		/* special case for cxd0 */
+		h->drv[drv_index]->block_size = drvinfo->block_size;
+		h->drv[drv_index]->nr_blocks = drvinfo->nr_blocks;
+		h->drv[drv_index]->heads = drvinfo->heads;
+		h->drv[drv_index]->sectors = drvinfo->sectors;
+		h->drv[drv_index]->cylinders = drvinfo->cylinders;
+		h->drv[drv_index]->raid_level = drvinfo->raid_level;
+		memcpy(h->drv[drv_index]->serial_no, drvinfo->serial_no, 16);
+		memcpy(h->drv[drv_index]->vendor, drvinfo->vendor,
+			VENDOR_LEN + 1);
+		memcpy(h->drv[drv_index]->model, drvinfo->model, MODEL_LEN + 1);
+		memcpy(h->drv[drv_index]->rev, drvinfo->rev, REV_LEN + 1);
+	}
 
 	++h->num_luns;
 	disk = h->gendisk[drv_index];
-	set_capacity(disk, h->drv[drv_index].nr_blocks);
+	set_capacity(disk, h->drv[drv_index]->nr_blocks);
 
 	/* If it's not disk 0 (drv_index != 0)
 	 * or if it was disk 0, but there was previously
@@ -1770,41 +2105,90 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time)
 	 * (raid_leve == -1) then we want to update the
 	 * logical drive's information.
 	 */
-	if (drv_index || first_time)
-		cciss_add_disk(h, disk, drv_index);
+	if (drv_index || first_time) {
+		if (cciss_add_disk(h, disk, drv_index) != 0) {
+			cciss_free_gendisk(h, drv_index);
+			cciss_free_drive_info(h, drv_index);
+			dev_warn(&h->pdev->dev, "could not update disk %d\n",
+				drv_index);
+			--h->num_luns;
+		}
+	}
 
 freeret:
 	kfree(inq_buff);
 	kfree(drvinfo);
 	return;
 mem_msg:
-	printk(KERN_ERR "cciss: out of memory\n");
+	dev_err(&h->pdev->dev, "out of memory\n");
 	goto freeret;
 }
 
 /* This function will find the first index of the controllers drive array
- * that has a -1 for the raid_level and will return that index.  This is
- * where new drives will be added.  If the index to be returned is greater
- * than the highest_lun index for the controller then highest_lun is set
- * to this new index.  If there are no available indexes then -1 is returned.
- * "controller_node" is used to know if this is a real logical drive, or just
- * the controller node, which determines if this counts towards highest_lun.
+ * that has a null drv pointer, allocate the drive info struct there, and
+ * return that index.  This is where new drives will be added.
+ * If the index to be returned is greater than the highest_lun index for
+ * the controller then highest_lun is set to this new index.
+ * If there are no available indexes or if the allocation fails, then -1
+ * is returned.  "controller_node" is used to know if this is a real
+ * logical drive, or just the controller node, which determines if this
+ * counts towards highest_lun.
  */
-static int cciss_find_free_drive_index(int ctlr, int controller_node)
+static int cciss_alloc_drive_info(ctlr_info_t *h, int controller_node)
 {
 	int i;
+	drive_info_struct *drv;
 
+	/* Search for an empty slot for our drive info */
 	for (i = 0; i < CISS_MAX_LUN; i++) {
-		if (hba[ctlr]->drv[i].raid_level == -1) {
-			if (i > hba[ctlr]->highest_lun)
-				if (!controller_node)
-					hba[ctlr]->highest_lun = i;
+
+		/* if not cxd0 case, and it's occupied, skip it. */
+		if (h->drv[i] && i != 0)
+			continue;
+		/*
+		 * If it's cxd0 case, and drv is alloc'ed already, and a
+		 * disk is configured there, skip it.
+		 */
+		if (i == 0 && h->drv[i] && h->drv[i]->raid_level != -1)
+			continue;
+
+		/*
+		 * We've found an empty slot.  Update highest_lun
+		 * provided this isn't just the fake cxd0 controller node.
+		 */
+		if (i > h->highest_lun && !controller_node)
+			h->highest_lun = i;
+
+		/* If adding a real disk at cxd0, and it's already alloc'ed */
+		if (i == 0 && h->drv[i] != NULL)
 			return i;
-		}
+
+		/*
+		 * Found an empty slot, not already alloc'ed.  Allocate it.
+		 * Mark it with raid_level == -1, so we know it's new later on.
+		 */
+		drv = kzalloc(sizeof(*drv), GFP_KERNEL);
+		if (!drv)
+			return -1;
+		drv->raid_level = -1; /* so we know it's new */
+		h->drv[i] = drv;
+		return i;
 	}
 	return -1;
 }
 
+static void cciss_free_drive_info(ctlr_info_t *h, int drv_index)
+{
+	kfree(h->drv[drv_index]);
+	h->drv[drv_index] = NULL;
+}
+
+static void cciss_free_gendisk(ctlr_info_t *h, int drv_index)
+{
+	put_disk(h->gendisk[drv_index]);
+	h->gendisk[drv_index] = NULL;
+}
+
 /* cciss_add_gendisk finds a free hba[]->drv structure
  * and allocates a gendisk if needed, and sets the lunid
  * in the drvinfo structure.   It returns the index into
@@ -1814,38 +2198,41 @@ static int cciss_find_free_drive_index(int ctlr, int controller_node)
  * a means to talk to the controller in case no logical
  * drives have yet been configured.
  */
-static int cciss_add_gendisk(ctlr_info_t *h, __u32 lunid, int controller_node)
+static int cciss_add_gendisk(ctlr_info_t *h, unsigned char lunid[],
+	int controller_node)
 {
 	int drv_index;
 
-	drv_index = cciss_find_free_drive_index(h->ctlr, controller_node);
+	drv_index = cciss_alloc_drive_info(h, controller_node);
 	if (drv_index == -1)
 		return -1;
+
 	/*Check if the gendisk needs to be allocated */
 	if (!h->gendisk[drv_index]) {
 		h->gendisk[drv_index] =
 			alloc_disk(1 << NWD_SHIFT);
 		if (!h->gendisk[drv_index]) {
-			printk(KERN_ERR "cciss%d: could not "
-				"allocate a new disk %d\n",
-				h->ctlr, drv_index);
-			return -1;
+			dev_err(&h->pdev->dev,
+				"could not allocate a new disk %d\n",
+				drv_index);
+			goto err_free_drive_info;
 		}
 	}
-	h->drv[drv_index].LunID = lunid;
-	if (cciss_create_ld_sysfs_entry(h, &h->drv[drv_index], drv_index))
+	memcpy(h->drv[drv_index]->LunID, lunid,
+		sizeof(h->drv[drv_index]->LunID));
+	if (cciss_create_ld_sysfs_entry(h, drv_index))
 		goto err_free_disk;
-
 	/* Don't need to mark this busy because nobody */
 	/* else knows about this disk yet to contend */
 	/* for access to it. */
-	h->drv[drv_index].busy_configuring = 0;
+	h->drv[drv_index]->busy_configuring = 0;
 	wmb();
 	return drv_index;
 
 err_free_disk:
-	put_disk(h->gendisk[drv_index]);
-	h->gendisk[drv_index] = NULL;
+	cciss_free_gendisk(h, drv_index);
+err_free_drive_info:
+	cciss_free_drive_info(h, drv_index);
 	return -1;
 }
 
@@ -1862,21 +2249,24 @@ static void cciss_add_controller_node(ctlr_info_t *h)
 	if (h->gendisk[0] != NULL) /* already did this? Then bail. */
 		return;
 
-	drv_index = cciss_add_gendisk(h, 0, 1);
-	if (drv_index == -1) {
-		printk(KERN_WARNING "cciss%d: could not "
-			"add disk 0.\n", h->ctlr);
-		return;
-	}
-	h->drv[drv_index].block_size = 512;
-	h->drv[drv_index].nr_blocks = 0;
-	h->drv[drv_index].heads = 0;
-	h->drv[drv_index].sectors = 0;
-	h->drv[drv_index].cylinders = 0;
-	h->drv[drv_index].raid_level = -1;
-	memset(h->drv[drv_index].serial_no, 0, 16);
+	drv_index = cciss_add_gendisk(h, CTLR_LUNID, 1);
+	if (drv_index == -1)
+		goto error;
+	h->drv[drv_index]->block_size = 512;
+	h->drv[drv_index]->nr_blocks = 0;
+	h->drv[drv_index]->heads = 0;
+	h->drv[drv_index]->sectors = 0;
+	h->drv[drv_index]->cylinders = 0;
+	h->drv[drv_index]->raid_level = -1;
+	memset(h->drv[drv_index]->serial_no, 0, 16);
 	disk = h->gendisk[drv_index];
-	cciss_add_disk(h, disk, drv_index);
+	if (cciss_add_disk(h, disk, drv_index) == 0)
+		return;
+	cciss_free_gendisk(h, drv_index);
+	cciss_free_drive_info(h, drv_index);
+error:
+	dev_warn(&h->pdev->dev, "could not add disk 0.\n");
+	return;
 }
 
 /* This function will add and remove logical drives from the Logical
@@ -1887,9 +2277,9 @@ static void cciss_add_controller_node(ctlr_info_t *h)
  * INPUT
  * h		= The controller to perform the operations on
  */
-static int rebuild_lun_table(ctlr_info_t *h, int first_time)
+static int rebuild_lun_table(ctlr_info_t *h, int first_time,
+	int via_ioctl)
 {
-	int ctlr = h->ctlr;
 	int num_luns;
 	ReportLunData_struct *ld_buff = NULL;
 	int return_code;
@@ -1897,34 +2287,34 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time)
 	int i;
 	int drv_found;
 	int drv_index = 0;
-	__u32 lunid = 0;
+	unsigned char lunid[8] = CTLR_LUNID;
 	unsigned long flags;
 
 	if (!capable(CAP_SYS_RAWIO))
 		return -EPERM;
 
 	/* Set busy_configuring flag for this operation */
-	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
+	spin_lock_irqsave(&h->lock, flags);
 	if (h->busy_configuring) {
-		spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+		spin_unlock_irqrestore(&h->lock, flags);
 		return -EBUSY;
 	}
 	h->busy_configuring = 1;
-	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+	spin_unlock_irqrestore(&h->lock, flags);
 
 	ld_buff = kzalloc(sizeof(ReportLunData_struct), GFP_KERNEL);
 	if (ld_buff == NULL)
 		goto mem_msg;
 
-	return_code = sendcmd_withirq(CISS_REPORT_LOG, ctlr, ld_buff,
+	return_code = sendcmd_withirq(h, CISS_REPORT_LOG, ld_buff,
 				      sizeof(ReportLunData_struct),
 				      0, CTLR_LUNID, TYPE_CMD);
 
 	if (return_code == IO_OK)
 		listlength = be32_to_cpu(*(__be32 *) ld_buff->LUNListLength);
 	else {	/* reading number of logical volumes failed */
-		printk(KERN_WARNING "cciss: report logical volume"
-		       " command failed\n");
+		dev_warn(&h->pdev->dev,
+			"report logical volume command failed\n");
 		listlength = 0;
 		goto freeret;
 	}
@@ -1932,7 +2322,7 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time)
 	num_luns = listlength / 8;	/* 8 bytes per entry */
 	if (num_luns > CISS_MAX_LUN) {
 		num_luns = CISS_MAX_LUN;
-		printk(KERN_WARNING "cciss: more luns configured"
+		dev_warn(&h->pdev->dev, "more luns configured"
 		       " on controller than can be handled by"
 		       " this driver.\n");
 	}
@@ -1950,25 +2340,25 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time)
 		drv_found = 0;
 
 		/* skip holes in the array from already deleted drives */
-		if (h->drv[i].raid_level == -1)
+		if (h->drv[i] == NULL)
 			continue;
 
 		for (j = 0; j < num_luns; j++) {
-			memcpy(&lunid, &ld_buff->LUN[j][0], 4);
-			lunid = le32_to_cpu(lunid);
-			if (h->drv[i].LunID == lunid) {
+			memcpy(lunid, &ld_buff->LUN[j][0], sizeof(lunid));
+			if (memcmp(h->drv[i]->LunID, lunid,
+				sizeof(lunid)) == 0) {
 				drv_found = 1;
 				break;
 			}
 		}
 		if (!drv_found) {
 			/* Deregister it from the OS, it's gone. */
-			spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
-			h->drv[i].busy_configuring = 1;
-			spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
-			return_code = deregister_disk(h, i, 1);
-			cciss_destroy_ld_sysfs_entry(&h->drv[i]);
-			h->drv[i].busy_configuring = 0;
+			spin_lock_irqsave(&h->lock, flags);
+			h->drv[i]->busy_configuring = 1;
+			spin_unlock_irqrestore(&h->lock, flags);
+			return_code = deregister_disk(h, i, 1, via_ioctl);
+			if (h->drv[i] != NULL)
+				h->drv[i]->busy_configuring = 0;
 		}
 	}
 
@@ -1982,17 +2372,16 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time)
 
 		drv_found = 0;
 
-		memcpy(&lunid, &ld_buff->LUN[i][0], 4);
-		lunid = le32_to_cpu(lunid);
-
+		memcpy(lunid, &ld_buff->LUN[i][0], sizeof(lunid));
 		/* Find if the LUN is already in the drive array
 		 * of the driver.  If so then update its info
 		 * if not in use.  If it does not exist then find
 		 * the first free index and add it.
 		 */
 		for (j = 0; j <= h->highest_lun; j++) {
-			if (h->drv[j].raid_level != -1 &&
-				h->drv[j].LunID == lunid) {
+			if (h->drv[j] != NULL &&
+				memcmp(h->drv[j]->LunID, lunid,
+					sizeof(h->drv[j]->LunID)) == 0) {
 				drv_index = j;
 				drv_found = 1;
 				break;
@@ -2005,7 +2394,7 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time)
 			if (drv_index == -1)
 				goto freeret;
 		}
-		cciss_update_drive_info(ctlr, drv_index, first_time);
+		cciss_update_drive_info(h, drv_index, first_time, via_ioctl);
 	}		/* end for */
 
 freeret:
@@ -2017,11 +2406,30 @@ freeret:
 	 */
 	return -1;
 mem_msg:
-	printk(KERN_ERR "cciss: out of memory\n");
+	dev_err(&h->pdev->dev, "out of memory\n");
 	h->busy_configuring = 0;
 	goto freeret;
 }
 
+static void cciss_clear_drive_info(drive_info_struct *drive_info)
+{
+	/* zero out the disk size info */
+	drive_info->nr_blocks = 0;
+	drive_info->block_size = 0;
+	drive_info->heads = 0;
+	drive_info->sectors = 0;
+	drive_info->cylinders = 0;
+	drive_info->raid_level = -1;
+	memset(drive_info->serial_no, 0, sizeof(drive_info->serial_no));
+	memset(drive_info->model, 0, sizeof(drive_info->model));
+	memset(drive_info->rev, 0, sizeof(drive_info->rev));
+	memset(drive_info->vendor, 0, sizeof(drive_info->vendor));
+	/*
+	 * don't clear the LUNID though, we need to remember which
+	 * one this one is.
+	 */
+}
+
 /* This function will deregister the disk and it's queue from the
  * kernel.  It must be called with the controller lock held and the
  * drv structures busy_configuring flag set.  It's parameters are:
@@ -2036,43 +2444,48 @@ mem_msg:
  *             the disk in preparation for re-adding it.  In this case
  *             the highest_lun should be left unchanged and the LunID
  *             should not be cleared.
+ * via_ioctl
+ *    This indicates whether we've reached this path via ioctl.
+ *    This affects the maximum usage count allowed for c0d0 to be messed with.
+ *    If this path is reached via ioctl(), then the max_usage_count will
+ *    be 1, as the process calling ioctl() has got to have the device open.
+ *    If we get here via sysfs, then the max usage count will be zero.
 */
 static int deregister_disk(ctlr_info_t *h, int drv_index,
-			   int clear_all)
+			   int clear_all, int via_ioctl)
 {
 	int i;
 	struct gendisk *disk;
 	drive_info_struct *drv;
+	int recalculate_highest_lun;
 
 	if (!capable(CAP_SYS_RAWIO))
 		return -EPERM;
 
-	drv = &h->drv[drv_index];
+	drv = h->drv[drv_index];
 	disk = h->gendisk[drv_index];
 
 	/* make sure logical volume is NOT is use */
 	if (clear_all || (h->gendisk[0] == disk)) {
-		if (drv->usage_count > 1)
+		if (drv->usage_count > via_ioctl)
 			return -EBUSY;
 	} else if (drv->usage_count > 0)
 		return -EBUSY;
 
+	recalculate_highest_lun = (drv == h->drv[h->highest_lun]);
+
 	/* invalidate the devices and deregister the disk.  If it is disk
 	 * zero do not deregister it but just zero out it's values.  This
 	 * allows us to delete disk zero but keep the controller registered.
 	 */
 	if (h->gendisk[0] != disk) {
 		struct request_queue *q = disk->queue;
-		if (disk->flags & GENHD_FL_UP)
+		if (disk->flags & GENHD_FL_UP) {
+			cciss_destroy_ld_sysfs_entry(h, drv_index, 0);
 			del_gendisk(disk);
-		if (q) {
-			blk_cleanup_queue(q);
-			/* Set drv->queue to NULL so that we do not try
-			 * to call blk_start_queue on this queue in the
-			 * interrupt handler
-			 */
-			drv->queue = NULL;
 		}
+		if (q)
+			blk_cleanup_queue(q);
 		/* If clear_all is set then we are deleting the logical
 		 * drive, not just refreshing its info.  For drives
 		 * other than disk 0 we will call put_disk.  We do not
@@ -2095,43 +2508,28 @@ static int deregister_disk(ctlr_info_t *h, int drv_index,
 		}
 	} else {
 		set_capacity(disk, 0);
+		cciss_clear_drive_info(drv);
 	}
 
 	--h->num_luns;
-	/* zero out the disk size info */
-	drv->nr_blocks = 0;
-	drv->block_size = 0;
-	drv->heads = 0;
-	drv->sectors = 0;
-	drv->cylinders = 0;
-	drv->raid_level = -1;	/* This can be used as a flag variable to
-				 * indicate that this element of the drive
-				 * array is free.
-				 */
-
-	if (clear_all) {
-		/* check to see if it was the last disk */
-		if (drv == h->drv + h->highest_lun) {
-			/* if so, find the new hightest lun */
-			int i, newhighest = -1;
-			for (i = 0; i <= h->highest_lun; i++) {
-				/* if the disk has size > 0, it is available */
-				if (h->drv[i].heads)
-					newhighest = i;
-			}
-			h->highest_lun = newhighest;
-		}
 
-		drv->LunID = 0;
+	/* if it was the last disk, find the new highest lun */
+	if (clear_all && recalculate_highest_lun) {
+		int newhighest = -1;
+		for (i = 0; i <= h->highest_lun; i++) {
+			/* if the disk has size > 0, it is available */
+			if (h->drv[i] && h->drv[i]->heads)
+				newhighest = i;
+		}
+		h->highest_lun = newhighest;
 	}
 	return 0;
 }
 
-static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
+static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff,
 		size_t size, __u8 page_code, unsigned char *scsi3addr,
 		int cmd_type)
 {
-	ctlr_info_t *h = hba[ctlr];
 	u64bit buff_dma_handle;
 	int status = IO_OK;
 
@@ -2173,7 +2571,7 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
 			c->Request.Type.Direction = XFER_READ;
 			c->Request.Timeout = 0;
 			c->Request.CDB[0] = cmd;
-			c->Request.CDB[6] = (size >> 24) & 0xFF;	//MSB
+			c->Request.CDB[6] = (size >> 24) & 0xFF; /* MSB */
 			c->Request.CDB[7] = (size >> 16) & 0xFF;
 			c->Request.CDB[8] = (size >> 8) & 0xFF;
 			c->Request.CDB[9] = size & 0xFF;
@@ -2207,6 +2605,8 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
 			c->Request.Timeout = 0;
 			c->Request.CDB[0] = BMIC_WRITE;
 			c->Request.CDB[6] = BMIC_CACHE_FLUSH;
+			c->Request.CDB[7] = (size >> 8) & 0xFF;
+			c->Request.CDB[8] = size & 0xFF;
 			break;
 		case TEST_UNIT_READY:
 			c->Request.CDBLen = 6;
@@ -2215,13 +2615,12 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
 			c->Request.Timeout = 0;
 			break;
 		default:
-			printk(KERN_WARNING
-			       "cciss%d:  Unknown Command 0x%c\n", ctlr, cmd);
+			dev_warn(&h->pdev->dev, "Unknown Command 0x%02x\n", cmd);
 			return IO_ERROR;
 		}
 	} else if (cmd_type == TYPE_MSG) {
 		switch (cmd) {
-		case 0:	/* ABORT message */
+		case CCISS_ABORT_MSG:
 			c->Request.CDBLen = 12;
 			c->Request.Type.Attribute = ATTR_SIMPLE;
 			c->Request.Type.Direction = XFER_WRITE;
@@ -2231,16 +2630,16 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
 			/* buff contains the tag of the command to abort */
 			memcpy(&c->Request.CDB[4], buff, 8);
 			break;
-		case 1:	/* RESET message */
+		case CCISS_RESET_MSG:
 			c->Request.CDBLen = 16;
 			c->Request.Type.Attribute = ATTR_SIMPLE;
 			c->Request.Type.Direction = XFER_NONE;
 			c->Request.Timeout = 0;
 			memset(&c->Request.CDB[0], 0, sizeof(c->Request.CDB));
 			c->Request.CDB[0] = cmd;	/* reset */
-			c->Request.CDB[1] = 0x03;	/* reset a target */
+			c->Request.CDB[1] = CCISS_RESET_TYPE_TARGET;
 			break;
-		case 3:	/* No-Op message */
+		case CCISS_NOOP_MSG:
 			c->Request.CDBLen = 1;
 			c->Request.Type.Attribute = ATTR_SIMPLE;
 			c->Request.Type.Direction = XFER_WRITE;
@@ -2248,13 +2647,12 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
 			c->Request.CDB[0] = cmd;
 			break;
 		default:
-			printk(KERN_WARNING
-			       "cciss%d: unknown message type %d\n", ctlr, cmd);
+			dev_warn(&h->pdev->dev,
+				"unknown message type %d\n", cmd);
 			return IO_ERROR;
 		}
 	} else {
-		printk(KERN_WARNING
-		       "cciss%d: unknown command type %d\n", ctlr, cmd_type);
+		dev_warn(&h->pdev->dev, "unknown command type %d\n", cmd_type);
 		return IO_ERROR;
 	}
 	/* Fill in the scatter gather information */
@@ -2270,6 +2668,31 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
 	return status;
 }
 
+/* Queue a reset_type reset message addressed to CTLR_LUNID (scsi3addr is
+ * unused).  Returns 0 once queued, -ENOMEM, or fill_cmd()'s error status. */
+static int cciss_send_reset(ctlr_info_t *h, unsigned char *scsi3addr,
+			    u8 reset_type)
+{
+	CommandList_struct *c;
+	int return_status;
+
+	c = cmd_alloc(h);
+	if (!c)
+		return -ENOMEM;
+	return_status = fill_cmd(h, c, CCISS_RESET_MSG, NULL, 0, 0,
+		CTLR_LUNID, TYPE_MSG);
+	if (return_status != IO_OK) {
+		cmd_free(h, c); /* matches cmd_alloc() above */
+		return return_status;
+	}
+	c->Request.CDB[1] = reset_type; /* fill_cmd defaults to target reset */
+	c->waiting = NULL;
+	enqueue_cmd_and_start_io(h, c);
+	/* Don't wait for completion (the reset won't complete) and don't free
+	 * the command: it is the last one sent before re-initialization. */
+	return 0;
+}
+
 static int check_target_status(ctlr_info_t *h, CommandList_struct *c)
 {
 	switch (c->err_info->ScsiStatus) {
@@ -2280,15 +2703,16 @@ static int check_target_status(ctlr_info_t *h, CommandList_struct *c)
 		case 0: return IO_OK; /* no sense */
 		case 1: return IO_OK; /* recovered error */
 		default:
-			printk(KERN_WARNING "cciss%d: cmd 0x%02x "
+			if (check_for_unit_attention(h, c))
+				return IO_NEEDS_RETRY;
+			dev_warn(&h->pdev->dev, "cmd 0x%02x "
 				"check condition, sense key = 0x%02x\n",
-				h->ctlr, c->Request.CDB[0],
-				c->err_info->SenseInfo[2]);
+				c->Request.CDB[0], c->err_info->SenseInfo[2]);
 		}
 		break;
 	default:
-		printk(KERN_WARNING "cciss%d: cmd 0x%02x"
-			"scsi status = 0x%02x\n", h->ctlr,
+		dev_warn(&h->pdev->dev, "cmd 0x%02x "
+			"scsi status = 0x%02x\n",
 			c->Request.CDB[0], c->err_info->ScsiStatus);
 		break;
 	}
@@ -2311,43 +2735,46 @@ static int process_sendcmd_error(ctlr_info_t *h, CommandList_struct *c)
 		/* expected for inquiry and report lun commands */
 		break;
 	case CMD_INVALID:
-		printk(KERN_WARNING "cciss: cmd 0x%02x is "
+		dev_warn(&h->pdev->dev, "cmd 0x%02x is "
 		       "reported invalid\n", c->Request.CDB[0]);
 		return_status = IO_ERROR;
 		break;
 	case CMD_PROTOCOL_ERR:
-		printk(KERN_WARNING "cciss: cmd 0x%02x has "
-		       "protocol error \n", c->Request.CDB[0]);
+		dev_warn(&h->pdev->dev, "cmd 0x%02x has "
+		       "protocol error\n", c->Request.CDB[0]);
 		return_status = IO_ERROR;
 		break;
 	case CMD_HARDWARE_ERR:
-		printk(KERN_WARNING "cciss: cmd 0x%02x had "
+		dev_warn(&h->pdev->dev, "cmd 0x%02x had "
 		       " hardware error\n", c->Request.CDB[0]);
 		return_status = IO_ERROR;
 		break;
 	case CMD_CONNECTION_LOST:
-		printk(KERN_WARNING "cciss: cmd 0x%02x had "
+		dev_warn(&h->pdev->dev, "cmd 0x%02x had "
 		       "connection lost\n", c->Request.CDB[0]);
 		return_status = IO_ERROR;
 		break;
 	case CMD_ABORTED:
-		printk(KERN_WARNING "cciss: cmd 0x%02x was "
+		dev_warn(&h->pdev->dev, "cmd 0x%02x was "
 		       "aborted\n", c->Request.CDB[0]);
 		return_status = IO_ERROR;
 		break;
 	case CMD_ABORT_FAILED:
-		printk(KERN_WARNING "cciss: cmd 0x%02x reports "
+		dev_warn(&h->pdev->dev, "cmd 0x%02x reports "
 		       "abort failed\n", c->Request.CDB[0]);
 		return_status = IO_ERROR;
 		break;
 	case CMD_UNSOLICITED_ABORT:
-		printk(KERN_WARNING
-		       "cciss%d: unsolicited abort 0x%02x\n", h->ctlr,
+		dev_warn(&h->pdev->dev, "unsolicited abort 0x%02x\n",
 			c->Request.CDB[0]);
 		return_status = IO_NEEDS_RETRY;
 		break;
+	case CMD_UNABORTABLE:
+		dev_warn(&h->pdev->dev, "cmd unabortable\n");
+		return_status = IO_ERROR;
+		break;
 	default:
-		printk(KERN_WARNING "cciss: cmd 0x%02x returned "
+		dev_warn(&h->pdev->dev, "cmd 0x%02x returned "
 		       "unknown status %x\n", c->Request.CDB[0],
 		       c->err_info->CommandStatus);
 		return_status = IO_ERROR;
@@ -2360,17 +2787,11 @@ static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c,
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	u64bit buff_dma_handle;
-	unsigned long flags;
 	int return_status = IO_OK;
 
 resend_cmd2:
 	c->waiting = &wait;
-	/* Put the request on the tail of the queue and send it */
-	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
-	addQ(&h->reqQ, c);
-	h->Qdepth++;
-	start_io(h);
-	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+	enqueue_cmd_and_start_io(h, c);
 
 	wait_for_completion(&wait);
 
@@ -2381,13 +2802,13 @@ resend_cmd2:
 
 	if (return_status == IO_NEEDS_RETRY &&
 		c->retry_count < MAX_CMD_RETRIES) {
-		printk(KERN_WARNING "cciss%d: retrying 0x%02x\n", h->ctlr,
+		dev_warn(&h->pdev->dev, "retrying 0x%02x\n",
 			c->Request.CDB[0]);
 		c->retry_count++;
 		/* erase the old error information */
 		memset(c->err_info, 0, sizeof(ErrorInfo_struct));
 		return_status = IO_OK;
-		INIT_COMPLETION(wait);
+		reinit_completion(&wait);
 		goto resend_cmd2;
 	}
 
@@ -2400,28 +2821,27 @@ command_done:
 	return return_status;
 }
 
-static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
+static int sendcmd_withirq(ctlr_info_t *h, __u8 cmd, void *buff, size_t size,
 			   __u8 page_code, unsigned char scsi3addr[],
 			int cmd_type)
 {
-	ctlr_info_t *h = hba[ctlr];
 	CommandList_struct *c;
 	int return_status;
 
-	c = cmd_alloc(h, 0);
+	c = cmd_special_alloc(h);	/* released by cmd_special_free() below */
 	if (!c)
 		return -ENOMEM;
-	return_status = fill_cmd(c, cmd, ctlr, buff, size, page_code,
+	return_status = fill_cmd(h, c, cmd, buff, size, page_code,
 		scsi3addr, cmd_type);
 	if (return_status == IO_OK)
 		return_status = sendcmd_withirq_core(h, c, 1);
 
-	cmd_free(h, c, 0);
+	cmd_special_free(h, c);	/* reached whether or not the command was sent */
 	return return_status;
 }
 
-static void cciss_geometry_inquiry(int ctlr, int logvol,
-				   int withirq, sector_t total_size,
+static void cciss_geometry_inquiry(ctlr_info_t *h, int logvol,
+				   sector_t total_size,
 				   unsigned int block_size,
 				   InquiryData_struct *inq_buff,
 				   drive_info_struct *drv)
@@ -2431,22 +2851,16 @@ static void cciss_geometry_inquiry(int ctlr, int logvol,
 	unsigned char scsi3addr[8];
 
 	memset(inq_buff, 0, sizeof(InquiryData_struct));
-	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
-	if (withirq)
-		return_code = sendcmd_withirq(CISS_INQUIRY, ctlr,
-					      inq_buff, sizeof(*inq_buff),
-					      0xC1, scsi3addr, TYPE_CMD);
-	else
-		return_code = sendcmd(CISS_INQUIRY, ctlr, inq_buff,
-				      sizeof(*inq_buff), 0xC1, scsi3addr,
-				      TYPE_CMD);
+	log_unit_to_scsi3addr(h, scsi3addr, logvol);
+	return_code = sendcmd_withirq(h, CISS_INQUIRY, inq_buff,
+			sizeof(*inq_buff), 0xC1, scsi3addr, TYPE_CMD);
 	if (return_code == IO_OK) {
 		if (inq_buff->data_byte[8] == 0xFF) {
-			printk(KERN_WARNING
-			       "cciss: reading geometry failed, volume "
+			dev_warn(&h->pdev->dev,
+			       "reading geometry failed, volume "
 			       "does not support reading geometry\n");
 			drv->heads = 255;
-			drv->sectors = 32;	// Sectors per track
+			drv->sectors = 32;	/* Sectors per track */
 			drv->cylinders = total_size + 1;
 			drv->raid_level = RAID_UNKNOWN;
 		} else {
@@ -2467,14 +2881,12 @@ static void cciss_geometry_inquiry(int ctlr, int logvol,
 			drv->cylinders = real_size;
 		}
 	} else {		/* Get geometry failed */
-		printk(KERN_WARNING "cciss: reading geometry failed\n");
+		dev_warn(&h->pdev->dev, "reading geometry failed\n");
 	}
-	printk(KERN_INFO "      heads=%d, sectors=%d, cylinders=%d\n\n",
-	       drv->heads, drv->sectors, drv->cylinders);
 }
 
 static void
-cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
+cciss_read_capacity(ctlr_info_t *h, int logvol, sector_t *total_size,
 		    unsigned int *block_size)
 {
 	ReadCapdata_struct *buf;
@@ -2483,35 +2895,26 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
 
 	buf = kzalloc(sizeof(ReadCapdata_struct), GFP_KERNEL);
 	if (!buf) {
-		printk(KERN_WARNING "cciss: out of memory\n");
+		dev_warn(&h->pdev->dev, "out of memory\n");
 		return;
 	}
 
-	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
-	if (withirq)
-		return_code = sendcmd_withirq(CCISS_READ_CAPACITY,
-				ctlr, buf, sizeof(ReadCapdata_struct),
-					0, scsi3addr, TYPE_CMD);
-	else
-		return_code = sendcmd(CCISS_READ_CAPACITY,
-				ctlr, buf, sizeof(ReadCapdata_struct),
-					0, scsi3addr, TYPE_CMD);
+	log_unit_to_scsi3addr(h, scsi3addr, logvol);
+	return_code = sendcmd_withirq(h, CCISS_READ_CAPACITY, buf,
+		sizeof(ReadCapdata_struct), 0, scsi3addr, TYPE_CMD);
 	if (return_code == IO_OK) {
 		*total_size = be32_to_cpu(*(__be32 *) buf->total_size);
 		*block_size = be32_to_cpu(*(__be32 *) buf->block_size);
 	} else {		/* read capacity command failed */
-		printk(KERN_WARNING "cciss: read capacity failed\n");
+		dev_warn(&h->pdev->dev, "read capacity failed\n");
 		*total_size = 0;
 		*block_size = BLOCK_SIZE;
 	}
-	if (*total_size != 0)
-		printk(KERN_INFO "      blocks= %llu block_size= %d\n",
-		(unsigned long long)*total_size+1, *block_size);
 	kfree(buf);
 }
 
-static void
-cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size, 				unsigned int *block_size)
+static void cciss_read_capacity_16(ctlr_info_t *h, int logvol,
+	sector_t *total_size, unsigned int *block_size)
 {
 	ReadCapdata_struct_16 *buf;
 	int return_code;
@@ -2519,30 +2922,23 @@ cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size,
 
 	buf = kzalloc(sizeof(ReadCapdata_struct_16), GFP_KERNEL);
 	if (!buf) {
-		printk(KERN_WARNING "cciss: out of memory\n");
+		dev_warn(&h->pdev->dev, "out of memory\n");
 		return;
 	}
 
-	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
-	if (withirq) {
-		return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16,
-			ctlr, buf, sizeof(ReadCapdata_struct_16),
-				0, scsi3addr, TYPE_CMD);
-	}
-	else {
-		return_code = sendcmd(CCISS_READ_CAPACITY_16,
-			ctlr, buf, sizeof(ReadCapdata_struct_16),
-				0, scsi3addr, TYPE_CMD);
-	}
+	log_unit_to_scsi3addr(h, scsi3addr, logvol);
+	return_code = sendcmd_withirq(h, CCISS_READ_CAPACITY_16,
+		buf, sizeof(ReadCapdata_struct_16),
+			0, scsi3addr, TYPE_CMD);
 	if (return_code == IO_OK) {
 		*total_size = be64_to_cpu(*(__be64 *) buf->total_size);
 		*block_size = be32_to_cpu(*(__be32 *) buf->block_size);
 	} else {		/* read capacity command failed */
-		printk(KERN_WARNING "cciss: read capacity failed\n");
+		dev_warn(&h->pdev->dev, "read capacity failed\n");
 		*total_size = 0;
 		*block_size = BLOCK_SIZE;
 	}
-	printk(KERN_INFO "      blocks= %llu block_size= %d\n",
+	dev_info(&h->pdev->dev, "      blocks= %llu block_size= %d\n",
 	       (unsigned long long)*total_size+1, *block_size);
 	kfree(buf);
 }
@@ -2557,8 +2953,11 @@ static int cciss_revalidate(struct gendisk *disk)
 	sector_t total_size;
 	InquiryData_struct *inq_buff = NULL;
 
-	for (logvol = 0; logvol < CISS_MAX_LUN; logvol++) {
-		if (h->drv[logvol].LunID == drv->LunID) {
+	for (logvol = 0; logvol <= h->highest_lun; logvol++) {
+		if (!h->drv[logvol])
+			continue;
+		if (memcmp(h->drv[logvol]->LunID, drv->LunID,
+			sizeof(drv->LunID)) == 0) {
 			FOUND = 1;
 			break;
 		}
@@ -2569,17 +2968,17 @@ static int cciss_revalidate(struct gendisk *disk)
 
 	inq_buff = kmalloc(sizeof(InquiryData_struct), GFP_KERNEL);
 	if (inq_buff == NULL) {
-		printk(KERN_WARNING "cciss: out of memory\n");
+		dev_warn(&h->pdev->dev, "out of memory\n");
 		return 1;
 	}
 	if (h->cciss_read == CCISS_READ_10) {
-		cciss_read_capacity(h->ctlr, logvol, 1,
+		cciss_read_capacity(h, logvol,
 					&total_size, &block_size);
 	} else {
-		cciss_read_capacity_16(h->ctlr, logvol, 1,
+		cciss_read_capacity_16(h, logvol,
 					&total_size, &block_size);
 	}
-	cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size,
+	cciss_geometry_inquiry(h, logvol, total_size, block_size,
 			       inq_buff, drv);
 
 	blk_queue_logical_block_size(drv->queue, drv->block_size);
@@ -2590,167 +2989,6 @@ static int cciss_revalidate(struct gendisk *disk)
 }
 
 /*
- *   Wait polling for a command to complete.
- *   The memory mapped FIFO is polled for the completion.
- *   Used only at init time, interrupts from the HBA are disabled.
- */
-static unsigned long pollcomplete(int ctlr)
-{
-	unsigned long done;
-	int i;
-
-	/* Wait (up to 20 seconds) for a command to complete */
-
-	for (i = 20 * HZ; i > 0; i--) {
-		done = hba[ctlr]->access.command_completed(hba[ctlr]);
-		if (done == FIFO_EMPTY)
-			schedule_timeout_uninterruptible(1);
-		else
-			return done;
-	}
-	/* Invalid address to tell caller we ran out of time */
-	return 1;
-}
-
-/* Send command c to controller h and poll for it to complete.
- * Turns interrupts off on the board.  Used at driver init time
- * and during SCSI error recovery.
- */
-static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c)
-{
-	int i;
-	unsigned long complete;
-	int status = IO_ERROR;
-	u64bit buff_dma_handle;
-
-resend_cmd1:
-
-	/* Disable interrupt on the board. */
-	h->access.set_intr_mask(h, CCISS_INTR_OFF);
-
-	/* Make sure there is room in the command FIFO */
-	/* Actually it should be completely empty at this time */
-	/* unless we are in here doing error handling for the scsi */
-	/* tape side of the driver. */
-	for (i = 200000; i > 0; i--) {
-		/* if fifo isn't full go */
-		if (!(h->access.fifo_full(h)))
-			break;
-		udelay(10);
-		printk(KERN_WARNING "cciss cciss%d: SendCmd FIFO full,"
-		       " waiting!\n", h->ctlr);
-	}
-	h->access.submit_command(h, c); /* Send the cmd */
-	do {
-		complete = pollcomplete(h->ctlr);
-
-#ifdef CCISS_DEBUG
-		printk(KERN_DEBUG "cciss: command completed\n");
-#endif				/* CCISS_DEBUG */
-
-		if (complete == 1) {
-			printk(KERN_WARNING
-			       "cciss cciss%d: SendCmd Timeout out, "
-			       "No command list address returned!\n", h->ctlr);
-			status = IO_ERROR;
-			break;
-		}
-
-		/* Make sure it's the command we're expecting. */
-		if ((complete & ~CISS_ERROR_BIT) != c->busaddr) {
-			printk(KERN_WARNING "cciss%d: Unexpected command "
-				"completion.\n", h->ctlr);
-			continue;
-		}
-
-		/* It is our command.  If no error, we're done. */
-		if (!(complete & CISS_ERROR_BIT)) {
-			status = IO_OK;
-			break;
-		}
-
-		/* There is an error... */
-
-		/* if data overrun or underun on Report command ignore it */
-		if (((c->Request.CDB[0] == CISS_REPORT_LOG) ||
-		     (c->Request.CDB[0] == CISS_REPORT_PHYS) ||
-		     (c->Request.CDB[0] == CISS_INQUIRY)) &&
-			((c->err_info->CommandStatus == CMD_DATA_OVERRUN) ||
-			 (c->err_info->CommandStatus == CMD_DATA_UNDERRUN))) {
-			complete = c->busaddr;
-			status = IO_OK;
-			break;
-		}
-
-		if (c->err_info->CommandStatus == CMD_UNSOLICITED_ABORT) {
-			printk(KERN_WARNING "cciss%d: unsolicited abort %p\n",
-				h->ctlr, c);
-			if (c->retry_count < MAX_CMD_RETRIES) {
-				printk(KERN_WARNING "cciss%d: retrying %p\n",
-				   h->ctlr, c);
-				c->retry_count++;
-				/* erase the old error information */
-				memset(c->err_info, 0, sizeof(c->err_info));
-				goto resend_cmd1;
-			}
-			printk(KERN_WARNING "cciss%d: retried %p too many "
-				"times\n", h->ctlr, c);
-			status = IO_ERROR;
-			break;
-		}
-
-		if (c->err_info->CommandStatus == CMD_UNABORTABLE) {
-			printk(KERN_WARNING "cciss%d: command could not be "
-				"aborted.\n", h->ctlr);
-			status = IO_ERROR;
-			break;
-		}
-
-		if (c->err_info->CommandStatus == CMD_TARGET_STATUS) {
-			status = check_target_status(h, c);
-			break;
-		}
-
-		printk(KERN_WARNING "cciss%d: sendcmd error\n", h->ctlr);
-		printk(KERN_WARNING "cmd = 0x%02x, CommandStatus = 0x%02x\n",
-			c->Request.CDB[0], c->err_info->CommandStatus);
-		status = IO_ERROR;
-		break;
-
-	} while (1);
-
-	/* unlock the data buffer from DMA */
-	buff_dma_handle.val32.lower = c->SG[0].Addr.lower;
-	buff_dma_handle.val32.upper = c->SG[0].Addr.upper;
-	pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val,
-			 c->SG[0].Len, PCI_DMA_BIDIRECTIONAL);
-	return status;
-}
-
-/*
- * Send a command to the controller, and wait for it to complete.
- * Used at init time, and during SCSI error recovery.
- */
-static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
-	__u8 page_code, unsigned char *scsi3addr, int cmd_type)
-{
-	CommandList_struct *c;
-	int status;
-
-	c = cmd_alloc(hba[ctlr], 1);
-	if (!c) {
-		printk(KERN_WARNING "cciss: unable to get memory");
-		return IO_ERROR;
-	}
-	status = fill_cmd(c, cmd, ctlr, buff, size, page_code,
-		scsi3addr, cmd_type);
-	if (status == IO_OK)
-		status = sendcmd_core(hba[ctlr], c);
-	cmd_free(hba[ctlr], c, 1);
-	return status;
-}
-
-/*
  * Map (physical) PCI mem into (virtual) kernel space
  */
 static void __iomem *remap_pci_mem(ulong base, ulong size)
@@ -2770,11 +3008,11 @@ static void start_io(ctlr_info_t *h)
 {
 	CommandList_struct *c;
 
-	while (!hlist_empty(&h->reqQ)) {
-		c = hlist_entry(h->reqQ.first, CommandList_struct, list);
+	while (!list_empty(&h->reqQ)) {
+		c = list_entry(h->reqQ.next, CommandList_struct, list);
 		/* can't do anything if fifo is full */
 		if ((h->access.fifo_full(h))) {
-			printk(KERN_WARNING "cciss: fifo full\n");
+			dev_warn(&h->pdev->dev, "fifo full\n");
 			break;
 		}
 
@@ -2790,7 +3028,7 @@ static void start_io(ctlr_info_t *h)
 	}
 }
 
-/* Assumes that CCISS_LOCK(h->ctlr) is held. */
+/* Assumes that h->lock is held. */
 /* Zeros out the error record and then resends the command back */
 /* to the controller */
 static inline void resend_cciss_cmd(ctlr_info_t *h, CommandList_struct *c)
@@ -2831,7 +3069,7 @@ static inline int evaluate_target_status(ctlr_info_t *h,
 	driver_byte = DRIVER_OK;
 	msg_byte = cmd->err_info->CommandStatus; /* correct?  seems too device specific */
 
-	if (blk_pc_request(cmd->rq))
+	if (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC)
 		host_byte = DID_PASSTHROUGH;
 	else
 		host_byte = DID_OK;
@@ -2840,8 +3078,8 @@ static inline int evaluate_target_status(ctlr_info_t *h,
 		host_byte, driver_byte);
 
 	if (cmd->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION) {
-		if (!blk_pc_request(cmd->rq))
-			printk(KERN_WARNING "cciss: cmd %p "
+		if (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC)
+			dev_warn(&h->pdev->dev, "cmd %p "
 			       "has SCSI Status 0x%x\n",
 			       cmd, cmd->err_info->ScsiStatus);
 		return error_value;
@@ -2850,17 +3088,19 @@ static inline int evaluate_target_status(ctlr_info_t *h,
 	/* check the sense key */
 	sense_key = 0xf & cmd->err_info->SenseInfo[2];
 	/* no status or recovered error */
-	if (((sense_key == 0x0) || (sense_key == 0x1)) && !blk_pc_request(cmd->rq))
+	if (((sense_key == 0x0) || (sense_key == 0x1)) &&
+	    (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC))
 		error_value = 0;
 
 	if (check_for_unit_attention(h, cmd)) {
-		*retry_cmd = !blk_pc_request(cmd->rq);
+		*retry_cmd = !(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC);
 		return 0;
 	}
 
-	if (!blk_pc_request(cmd->rq)) { /* Not SG_IO or similar? */
+	/* Not SG_IO or similar? */
+	if (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC) {
 		if (error_value != 0)
-			printk(KERN_WARNING "cciss: cmd %p has CHECK CONDITION"
+			dev_warn(&h->pdev->dev, "cmd %p has CHECK CONDITION"
 			       " sense key = 0x%x\n", cmd, sense_key);
 		return error_value;
 	}
@@ -2900,90 +3140,104 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 		rq->errors = evaluate_target_status(h, cmd, &retry_cmd);
 		break;
 	case CMD_DATA_UNDERRUN:
-		if (blk_fs_request(cmd->rq)) {
-			printk(KERN_WARNING "cciss: cmd %p has"
+		if (cmd->rq->cmd_type == REQ_TYPE_FS) {	/* fs requests only */
+			dev_warn(&h->pdev->dev, "cmd %p has"
 			       " completed with data underrun "
 			       "reported\n", cmd);
 			cmd->rq->resid_len = cmd->err_info->ResidualCnt;
 		}
 		break;
 	case CMD_DATA_OVERRUN:
-		if (blk_fs_request(cmd->rq))
-			printk(KERN_WARNING "cciss: cmd %p has"
+		if (cmd->rq->cmd_type == REQ_TYPE_FS)	/* fs requests only */
+			dev_warn(&h->pdev->dev, "cmd %p has"
 			       " completed with data overrun "
 			       "reported\n", cmd);
 		break;
 	case CMD_INVALID:
-		printk(KERN_WARNING "cciss: cmd %p is "
+		dev_warn(&h->pdev->dev, "cmd %p is "
 		       "reported invalid\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	case CMD_PROTOCOL_ERR:
-		printk(KERN_WARNING "cciss: cmd %p has "
-		       "protocol error \n", cmd);
+		dev_warn(&h->pdev->dev, "cmd %p has "
+		       "protocol error\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	case CMD_HARDWARE_ERR:
-		printk(KERN_WARNING "cciss: cmd %p had "
+		dev_warn(&h->pdev->dev, "cmd %p had"
 		       " hardware error\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	case CMD_CONNECTION_LOST:
-		printk(KERN_WARNING "cciss: cmd %p had "
+		dev_warn(&h->pdev->dev, "cmd %p had "
 		       "connection lost\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	case CMD_ABORTED:
-		printk(KERN_WARNING "cciss: cmd %p was "
+		dev_warn(&h->pdev->dev, "cmd %p was "
 		       "aborted\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ABORT);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ABORT);
 		break;
 	case CMD_ABORT_FAILED:
-		printk(KERN_WARNING "cciss: cmd %p reports "
+		dev_warn(&h->pdev->dev, "cmd %p reports "
 		       "abort failed\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	case CMD_UNSOLICITED_ABORT:
-		printk(KERN_WARNING "cciss%d: unsolicited "
+		dev_warn(&h->pdev->dev, "unsolicited "
 		       "abort %p\n", cmd);
 		if (cmd->retry_count < MAX_CMD_RETRIES) {
 			retry_cmd = 1;
-			printk(KERN_WARNING
-			       "cciss%d: retrying %p\n", h->ctlr, cmd);
+			dev_warn(&h->pdev->dev, "retrying %p\n", cmd);
 			cmd->retry_count++;
 		} else
-			printk(KERN_WARNING
-			       "cciss%d: %p retried too "
-			       "many times\n", h->ctlr, cmd);
+			dev_warn(&h->pdev->dev,
+				"%p retried too many times\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ABORT);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ABORT);
 		break;
 	case CMD_TIMEOUT:
-		printk(KERN_WARNING "cciss: cmd %p timedout\n", cmd);
+		dev_warn(&h->pdev->dev, "cmd %p timedout\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ERROR);
+		break;
+	case CMD_UNABORTABLE:	/* same status encoding as the DID_ERROR cases above */
+		dev_warn(&h->pdev->dev, "cmd %p unabortable\n", cmd);
+		rq->errors = make_status_bytes(SAM_STAT_GOOD,
+			cmd->err_info->CommandStatus, DRIVER_OK,
+			cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC ?
+				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	default:
-		printk(KERN_WARNING "cciss: cmd %p returned "
+		dev_warn(&h->pdev->dev, "cmd %p returned "
 		       "unknown status %x\n", cmd,
 		       cmd->err_info->CommandStatus);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ERROR);
 	}
 
 after_error_processing:
@@ -2997,6 +3251,37 @@ after_error_processing:
 	blk_complete_request(cmd->rq);
 }
 
+static inline u32 cciss_tag_contains_index(u32 tag)
+{
+#define DIRECT_LOOKUP_BIT 0x10	/* OR-ed into a tag by cciss_mark_tag_indexed() */
+	return tag & DIRECT_LOOKUP_BIT;	/* nonzero iff the flag bit is set */
+}
+
+static inline u32 cciss_tag_to_index(u32 tag)
+{
+#define DIRECT_LOOKUP_SHIFT 5	/* cciss_set_tag_index() shifts the index up by this */
+	return tag >> DIRECT_LOOKUP_SHIFT;	/* drop the low 5 bits, keep the index */
+}
+
+static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag)
+{
+#define CCISS_PERF_ERROR_BITS ((1 << DIRECT_LOOKUP_SHIFT) - 1)	/* all bits below the index */
+#define CCISS_SIMPLE_ERROR_BITS 0x03	/* low two bits */
+	if (likely(h->transMethod & CFGTBL_Trans_Performant))
+		return tag & ~CCISS_PERF_ERROR_BITS;	/* performant transport */
+	return tag & ~CCISS_SIMPLE_ERROR_BITS;	/* any other transport */
+}
+
+static inline void cciss_mark_tag_indexed(u32 *tag)
+{
+	*tag |= DIRECT_LOOKUP_BIT;	/* the bit cciss_tag_contains_index() tests */
+}
+
+static inline void cciss_set_tag_index(u32 *tag, u32 index)
+{
+	*tag |= (index << DIRECT_LOOKUP_SHIFT);	/* OR-ed in: bits already set in *tag are kept */
+}
+
 /*
  * Get a request and submit it to the controller.
  */
@@ -3008,28 +3293,27 @@ static void do_cciss_request(struct request_queue *q)
 	int seg;
 	struct request *creq;
 	u64bit temp64;
-	struct scatterlist tmp_sg[MAXSGENTRIES];
+	struct scatterlist *tmp_sg;
+	SGDescriptor_struct *curr_sg;
 	drive_info_struct *drv;
 	int i, dir;
-
-	/* We call start_io here in case there is a command waiting on the
-	 * queue that has not been sent.
-	 */
-	if (blk_queue_plugged(q))
-		goto startio;
+	int sg_index = 0;
+	int chained = 0;
 
       queue:
 	creq = blk_peek_request(q);
 	if (!creq)
 		goto startio;
 
-	BUG_ON(creq->nr_phys_segments > MAXSGENTRIES);
+	BUG_ON(creq->nr_phys_segments > h->maxsgentries);
 
-	if ((c = cmd_alloc(h, 1)) == NULL)
+	c = cmd_alloc(h);
+	if (!c)
 		goto full;
 
 	blk_start_request(creq);
 
+	tmp_sg = h->scatter_list[c->cmdindex];
 	spin_unlock_irq(q->queue_lock);
 
 	c->cmd_type = CMD_RWREQ;
@@ -3037,29 +3321,25 @@ static void do_cciss_request(struct request_queue *q)
 
 	/* fill in the request */
 	drv = creq->rq_disk->private_data;
-	c->Header.ReplyQueue = 0;	// unused in simple mode
+	c->Header.ReplyQueue = 0;	/* unused in simple mode */
 	/* got command from pool, so use the command block index instead */
 	/* for direct lookups. */
 	/* The first 2 bits are reserved for controller error reporting. */
-	c->Header.Tag.lower = (c->cmdindex << 3);
-	c->Header.Tag.lower |= 0x04;	/* flag for direct lookup. */
-	c->Header.LUN.LogDev.VolId = drv->LunID;
-	c->Header.LUN.LogDev.Mode = 1;
-	c->Request.CDBLen = 10;	// 12 byte commands not in FW yet;
-	c->Request.Type.Type = TYPE_CMD;	// It is a command.
+	cciss_set_tag_index(&c->Header.Tag.lower, c->cmdindex);
+	cciss_mark_tag_indexed(&c->Header.Tag.lower);
+	memcpy(&c->Header.LUN, drv->LunID, sizeof(drv->LunID));
+	c->Request.CDBLen = 10;	/* 12 byte commands not in FW yet; */
+	c->Request.Type.Type = TYPE_CMD;	/* It is a command. */
 	c->Request.Type.Attribute = ATTR_SIMPLE;
 	c->Request.Type.Direction =
 	    (rq_data_dir(creq) == READ) ? XFER_READ : XFER_WRITE;
-	c->Request.Timeout = 0;	// Don't time out
+	c->Request.Timeout = 0;	/* Don't time out */
 	c->Request.CDB[0] =
 	    (rq_data_dir(creq) == READ) ? h->cciss_read : h->cciss_write;
 	start_blk = blk_rq_pos(creq);
-#ifdef CCISS_DEBUG
-	printk(KERN_DEBUG "ciss: sector =%d nr_sectors=%d\n",
+	dev_dbg(&h->pdev->dev, "sector =%d nr_sectors=%d\n",
 	       (int)blk_rq_pos(creq), (int)blk_rq_sectors(creq));
-#endif				/* CCISS_DEBUG */
-
-	sg_init_table(tmp_sg, MAXSGENTRIES);
+	sg_init_table(tmp_sg, h->maxsgentries);
 	seg = blk_rq_map_sg(q, creq, tmp_sg);
 
 	/* get the DMA records for the setup */
@@ -3068,33 +3348,55 @@ static void do_cciss_request(struct request_queue *q)
 	else
 		dir = PCI_DMA_TODEVICE;
 
+	curr_sg = c->SG;
+	sg_index = 0;
+	chained = 0;
+
 	for (i = 0; i < seg; i++) {
-		c->SG[i].Len = tmp_sg[i].length;
+		if (((sg_index+1) == (h->max_cmd_sgentries)) &&
+			!chained && ((seg - i) > 1)) {
+			/* Point to next chain block. */
+			curr_sg = h->cmd_sg_list[c->cmdindex];
+			sg_index = 0;
+			chained = 1;
+		}
+		curr_sg[sg_index].Len = tmp_sg[i].length;
 		temp64.val = (__u64) pci_map_page(h->pdev, sg_page(&tmp_sg[i]),
-						  tmp_sg[i].offset,
-						  tmp_sg[i].length, dir);
-		c->SG[i].Addr.lower = temp64.val32.lower;
-		c->SG[i].Addr.upper = temp64.val32.upper;
-		c->SG[i].Ext = 0;	// we are not chaining
+						tmp_sg[i].offset,
+						tmp_sg[i].length, dir);
+		curr_sg[sg_index].Addr.lower = temp64.val32.lower;
+		curr_sg[sg_index].Addr.upper = temp64.val32.upper;
+		curr_sg[sg_index].Ext = 0;  /* we are not chaining */
+		++sg_index;
 	}
+	if (chained)
+		cciss_map_sg_chain_block(h, c, h->cmd_sg_list[c->cmdindex],
+			(seg - (h->max_cmd_sgentries - 1)) *
+				sizeof(SGDescriptor_struct));
+
 	/* track how many SG entries we are using */
 	if (seg > h->maxSG)
 		h->maxSG = seg;
 
-#ifdef CCISS_DEBUG
-	printk(KERN_DEBUG "cciss: Submitting %u sectors in %d segments\n",
-	       blk_rq_sectors(creq), seg);
-#endif				/* CCISS_DEBUG */
+	dev_dbg(&h->pdev->dev, "Submitting %u sectors in %d segments "
+			"chained[%d]\n",
+			blk_rq_sectors(creq), seg, chained);
 
-	c->Header.SGList = c->Header.SGTotal = seg;
-	if (likely(blk_fs_request(creq))) {
+	c->Header.SGTotal = seg + chained;
+	if (seg <= h->max_cmd_sgentries)
+		c->Header.SGList = c->Header.SGTotal;
+	else
+		c->Header.SGList = h->max_cmd_sgentries;
+	set_performant_mode(h, c);
+
+	if (likely(creq->cmd_type == REQ_TYPE_FS)) {
 		if(h->cciss_read == CCISS_READ_10) {
 			c->Request.CDB[1] = 0;
-			c->Request.CDB[2] = (start_blk >> 24) & 0xff;	//MSB
+			c->Request.CDB[2] = (start_blk >> 24) & 0xff; /* MSB */
 			c->Request.CDB[3] = (start_blk >> 16) & 0xff;
 			c->Request.CDB[4] = (start_blk >> 8) & 0xff;
 			c->Request.CDB[5] = start_blk & 0xff;
-			c->Request.CDB[6] = 0;	// (sect >> 24) & 0xff; MSB
+			c->Request.CDB[6] = 0; /* (sect >> 24) & 0xff; MSB */
 			c->Request.CDB[7] = (blk_rq_sectors(creq) >> 8) & 0xff;
 			c->Request.CDB[8] = blk_rq_sectors(creq) & 0xff;
 			c->Request.CDB[9] = c->Request.CDB[11] = c->Request.CDB[12] = 0;
@@ -3103,7 +3405,7 @@ static void do_cciss_request(struct request_queue *q)
 
 			c->Request.CDBLen = 16;
 			c->Request.CDB[1]= 0;
-			c->Request.CDB[2]= (upper32 >> 24) & 0xff;	//MSB
+			c->Request.CDB[2]= (upper32 >> 24) & 0xff; /* MSB */
 			c->Request.CDB[3]= (upper32 >> 16) & 0xff;
 			c->Request.CDB[4]= (upper32 >>  8) & 0xff;
 			c->Request.CDB[5]= upper32 & 0xff;
@@ -3117,11 +3419,12 @@ static void do_cciss_request(struct request_queue *q)
 			c->Request.CDB[13]= blk_rq_sectors(creq) & 0xff;
 			c->Request.CDB[14] = c->Request.CDB[15] = 0;
 		}
-	} else if (blk_pc_request(creq)) {
+	} else if (creq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		c->Request.CDBLen = creq->cmd_len;
 		memcpy(c->Request.CDB, creq->cmd, BLK_MAX_CDB);
 	} else {
-		printk(KERN_WARNING "cciss%d: bad request type %d\n", h->ctlr, creq->cmd_type);
+		dev_warn(&h->pdev->dev, "bad request type %d\n",
+			creq->cmd_type);
 		BUG();
 	}
 
@@ -3154,88 +3457,306 @@ static inline int interrupt_pending(ctlr_info_t *h)
 
 static inline long interrupt_not_for_us(ctlr_info_t *h)
 {
-	return (((h->access.intr_pending(h) == 0) ||
-		 (h->interrupts_enabled == 0)));
+	return ((h->access.intr_pending(h) == 0) ||
+		(h->interrupts_enabled == 0));
 }
 
-static irqreturn_t do_cciss_intr(int irq, void *dev_id)
+static inline int bad_tag(ctlr_info_t *h, u32 tag_index,
+			u32 raw_tag)
 {
-	ctlr_info_t *h = dev_id;
+	if (unlikely(tag_index >= h->nr_cmds)) {	/* index is not below h->nr_cmds */
+		dev_warn(&h->pdev->dev, "bad tag 0x%08x ignored.\n", raw_tag);
+		return 1;	/* bad: callers skip this completion */
+	}
+	return 0;	/* tag_index is in range */
+}
+
+static inline void finish_cmd(ctlr_info_t *h, CommandList_struct *c,
+				u32 raw_tag)
+{
+	removeQ(c);	/* unlink first; presumably from h->cmpQ -- confirm in removeQ() */
+	if (likely(c->cmd_type == CMD_RWREQ))
+		complete_command(h, c, 0);
+	else if (c->cmd_type == CMD_IOCTL_PEND)
+		complete(c->waiting);	/* wake whoever waits on c->waiting */
+#ifdef CONFIG_CISS_SCSI_TAPE
+	else if (c->cmd_type == CMD_SCSI)
+		complete_scsi_command(c, 0, raw_tag);
+#endif	/* CONFIG_CISS_SCSI_TAPE; any other cmd_type is only unlinked */
+}
+
+static inline u32 next_command(ctlr_info_t *h)
+{
+	u32 a;	/* tag to return, or FIFO_EMPTY */
+
+	if (unlikely(!(h->transMethod & CFGTBL_Trans_Performant)))
+		return h->access.command_completed(h);	/* not performant: ask the access method */
+
+	if ((*(h->reply_pool_head) & 1) == (h->reply_pool_wraparound)) {
+		a = *(h->reply_pool_head); /* Next cmd in ring buffer */
+		(h->reply_pool_head)++;
+		h->commands_outstanding--;
+	} else {
+		a = FIFO_EMPTY;	/* low bit differs from expected value: nothing to take */
+	}
+	/* Head reached the end of the pool: rewind and flip the expected low bit */
+	if (h->reply_pool_head == (h->reply_pool + h->max_commands)) {
+		h->reply_pool_head = h->reply_pool;
+		h->reply_pool_wraparound ^= 1;
+	}
+	return a;
+}
+
+/* Finish the indexed ("direct lookup") command in raw_tag; return the next tag */
+static inline u32 process_indexed_cmd(ctlr_info_t *h, u32 raw_tag)
+{
+	u32 tag_index;
 	CommandList_struct *c;
+
+	tag_index = cciss_tag_to_index(raw_tag);
+	if (bad_tag(h, tag_index, raw_tag))
+		return next_command(h);	/* out-of-range index: skip it, keep draining */
+	c = h->cmd_pool + tag_index;	/* the index selects the command in cmd_pool */
+	finish_cmd(h, c, raw_tag);
+	return next_command(h);
+}
+
+/* Finish a non-indexed command, matched by bus address; return the next tag */
+static inline u32 process_nonindexed_cmd(ctlr_info_t *h, u32 raw_tag)
+{
+	CommandList_struct *c = NULL;
+	__u32 busaddr_masked, tag_masked;
+
+	tag_masked = cciss_tag_discard_error_bits(h, raw_tag);
+	list_for_each_entry(c, &h->cmpQ, list) {	/* linear search of cmpQ */
+		busaddr_masked = cciss_tag_discard_error_bits(h, c->busaddr);
+		if (busaddr_masked == tag_masked) {
+			finish_cmd(h, c, raw_tag);
+			return next_command(h);
+		}
+	}
+	bad_tag(h, h->nr_cmds + 1, raw_tag);	/* no match: this index always fails, so it only logs */
+	return next_command(h);
+}
+
+/* Some controllers, like p400, will give us one interrupt
+ * after a soft reset, even if we turned interrupts off.
+ * Only need to check for this in the cciss_xxx_discard_completions
+ * functions.  Returns 1 if the interrupt should be ignored, else 0.
+ */
+static int ignore_bogus_interrupt(ctlr_info_t *h)
+{
+	if (likely(!reset_devices))
+		return 0;	/* reset_devices not set: never ignore */
+
+	if (likely(h->interrupts_enabled))
+		return 0;	/* we have interrupts on, so treat it as real */
+
+	dev_info(&h->pdev->dev, "Received interrupt while interrupts disabled "
+		"(known firmware bug.)  Ignoring.\n");
+
+	return 1;	/* callers return IRQ_NONE on this */
+}
+
+static irqreturn_t cciss_intx_discard_completions(int irq, void *dev_id)
+{
+	ctlr_info_t *h = dev_id;
 	unsigned long flags;
-	__u32 a, a1, a2;
+	u32 raw_tag;	/* fetched only to drain; never looked up */
+
+	if (ignore_bogus_interrupt(h))
+		return IRQ_NONE;
 
 	if (interrupt_not_for_us(h))
 		return IRQ_NONE;
-	/*
-	 * If there are completed commands in the completion queue,
-	 * we had better do something about it.
-	 */
-	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
+	spin_lock_irqsave(&h->lock, flags);
 	while (interrupt_pending(h)) {
-		while ((a = get_next_completion(h)) != FIFO_EMPTY) {
-			a1 = a;
-			if ((a & 0x04)) {
-				a2 = (a >> 3);
-				if (a2 >= h->nr_cmds) {
-					printk(KERN_WARNING
-					       "cciss: controller cciss%d failed, stopping.\n",
-					       h->ctlr);
-					fail_all_cmds(h->ctlr);
-					return IRQ_HANDLED;
-				}
+		raw_tag = get_next_completion(h);
+		while (raw_tag != FIFO_EMPTY)
+			raw_tag = next_command(h);	/* discard: no command is completed */
+	}
+	spin_unlock_irqrestore(&h->lock, flags);
+	return IRQ_HANDLED;
+}
 
-				c = h->cmd_pool + a2;
-				a = c->busaddr;
+static irqreturn_t cciss_msix_discard_completions(int irq, void *dev_id)
+{
+	ctlr_info_t *h = dev_id;
+	unsigned long flags;
+	u32 raw_tag;	/* fetched only to drain; never looked up */
 
-			} else {
-				struct hlist_node *tmp;
+	if (ignore_bogus_interrupt(h))
+		return IRQ_NONE;
 
-				a &= ~3;
-				c = NULL;
-				hlist_for_each_entry(c, tmp, &h->cmpQ, list) {
-					if (c->busaddr == a)
-						break;
-				}
-			}
-			/*
-			 * If we've found the command, take it off the
-			 * completion Q and free it
-			 */
-			if (c && c->busaddr == a) {
-				removeQ(c);
-				if (c->cmd_type == CMD_RWREQ) {
-					complete_command(h, c, 0);
-				} else if (c->cmd_type == CMD_IOCTL_PEND) {
-					complete(c->waiting);
-				}
-#				ifdef CONFIG_CISS_SCSI_TAPE
-				else if (c->cmd_type == CMD_SCSI)
-					complete_scsi_command(c, 0, a1);
-#				endif
-				continue;
-			}
+	spin_lock_irqsave(&h->lock, flags);	/* no interrupt_pending() test in this variant */
+	raw_tag = get_next_completion(h);
+	while (raw_tag != FIFO_EMPTY)
+		raw_tag = next_command(h);	/* discard: no command is completed */
+	spin_unlock_irqrestore(&h->lock, flags);
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t do_cciss_intx(int irq, void *dev_id)
+{
+	ctlr_info_t *h = dev_id;
+	unsigned long flags;
+	u32 raw_tag;
+
+	if (interrupt_not_for_us(h))
+		return IRQ_NONE;
+	spin_lock_irqsave(&h->lock, flags);	/* h->lock is held across all completion handling */
+	while (interrupt_pending(h)) {
+		raw_tag = get_next_completion(h);
+		while (raw_tag != FIFO_EMPTY) {	/* each process_* call returns the next tag */
+			if (cciss_tag_contains_index(raw_tag))
+				raw_tag = process_indexed_cmd(h, raw_tag);
+			else
+				raw_tag = process_nonindexed_cmd(h, raw_tag);
 		}
 	}
+	spin_unlock_irqrestore(&h->lock, flags);
+	return IRQ_HANDLED;
+}
+
+/* Add a second interrupt handler for MSI/MSI-X mode. In this mode we never
+ * check the interrupt pending register because it is not set.
+ */
+static irqreturn_t do_cciss_msix_intr(int irq, void *dev_id)
+{
+	ctlr_info_t *h = dev_id;
+	unsigned long flags;
+	u32 raw_tag;
 
-	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+	spin_lock_irqsave(&h->lock, flags);
+	raw_tag = get_next_completion(h);
+	while (raw_tag != FIFO_EMPTY) {
+		if (cciss_tag_contains_index(raw_tag))
+			raw_tag = process_indexed_cmd(h, raw_tag);
+		else
+			raw_tag = process_nonindexed_cmd(h, raw_tag);
+	}
+	spin_unlock_irqrestore(&h->lock, flags);
 	return IRQ_HANDLED;
 }
 
+/**
+ * add_to_scan_list() - add controller to rescan queue
+ * @h:		      Pointer to the controller.
+ *
+ * Adds the controller to the rescan queue if not already on the queue.
+ *
+ * returns 1 if added to the queue, 0 if skipped (could be on the
+ * queue already, or the controller could be initializing or shutting
+ * down).
+ **/
+static int add_to_scan_list(struct ctlr_info *h)
+{
+	struct ctlr_info *test_h;
+	int found = 0;
+	int ret = 0;
+
+	if (h->busy_initializing)
+		return 0;
+
+	if (!mutex_trylock(&h->busy_shutting_down))
+		return 0;
+
+	mutex_lock(&scan_mutex);
+	list_for_each_entry(test_h, &scan_q, scan_list) {
+		if (test_h == h) {
+			found = 1;
+			break;
+		}
+	}
+	if (!found && !h->busy_scanning) {
+		reinit_completion(&h->scan_wait);
+		list_add_tail(&h->scan_list, &scan_q);
+		ret = 1;
+	}
+	mutex_unlock(&scan_mutex);
+	mutex_unlock(&h->busy_shutting_down);
+
+	return ret;
+}
+
+/**
+ * remove_from_scan_list() - remove controller from rescan queue
+ * @h:			   Pointer to the controller.
+ *
+ * Removes the controller from the rescan queue if present. Blocks if
+ * the controller is currently conducting a rescan.  The controller
+ * can be in one of three states:
+ * 1. Doesn't need a scan
+ * 2. On the scan list, but not scanning yet (we remove it)
+ * 3. Busy scanning (and not on the list). In this case we want to wait for
+ *    the scan to complete to make sure the scanning thread for this
+ *    controller is completely idle.
+ **/
+static void remove_from_scan_list(struct ctlr_info *h)
+{
+	struct ctlr_info *test_h, *tmp_h;
+
+	mutex_lock(&scan_mutex);
+	list_for_each_entry_safe(test_h, tmp_h, &scan_q, scan_list) {
+		if (test_h == h) { /* state 2. */
+			list_del(&h->scan_list);
+			complete_all(&h->scan_wait);
+			mutex_unlock(&scan_mutex);
+			return;
+		}
+	}
+	if (h->busy_scanning) { /* state 3. */
+		mutex_unlock(&scan_mutex);
+		wait_for_completion(&h->scan_wait);
+	} else { /* state 1, nothing to do. */
+		mutex_unlock(&scan_mutex);
+	}
+}
+
+/**
+ * scan_thread() - kernel thread used to rescan controllers
+ * @data:	 Ignored.
+ *
+ * A kernel thread used to scan for drive topology changes on
+ * controllers. The thread processes only one controller at a time
+ * using a queue.  Controllers are added to the queue using
+ * add_to_scan_list() and removed from the queue either after done
+ * processing or using remove_from_scan_list().
+ *
+ * returns 0.
+ **/
 static int scan_thread(void *data)
 {
-	ctlr_info_t *h = data;
-	int rc;
-	DECLARE_COMPLETION_ONSTACK(wait);
-	h->rescan_wait = &wait;
+	struct ctlr_info *h;
 
-	for (;;) {
-		rc = wait_for_completion_interruptible(&wait);
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
 		if (kthread_should_stop())
 			break;
-		if (!rc)
-			rebuild_lun_table(h, 0);
+
+		while (1) {
+			mutex_lock(&scan_mutex);
+			if (list_empty(&scan_q)) {
+				mutex_unlock(&scan_mutex);
+				break;
+			}
+
+			h = list_entry(scan_q.next,
+				       struct ctlr_info,
+				       scan_list);
+			list_del(&h->scan_list);
+			h->busy_scanning = 1;
+			mutex_unlock(&scan_mutex);
+
+			rebuild_lun_table(h, 0, 0);
+			complete_all(&h->scan_wait);
+			mutex_lock(&scan_mutex);
+			h->busy_scanning = 0;
+			mutex_unlock(&scan_mutex);
+		}
 	}
+
 	return 0;
 }
 
@@ -3246,36 +3767,48 @@ static int check_for_unit_attention(ctlr_info_t *h, CommandList_struct *c)
 
 	switch (c->err_info->SenseInfo[12]) {
 	case STATE_CHANGED:
-		printk(KERN_WARNING "cciss%d: a state change "
-			"detected, command retried\n", h->ctlr);
+		dev_warn(&h->pdev->dev, "a state change "
+			"detected, command retried\n");
 		return 1;
 	break;
 	case LUN_FAILED:
-		printk(KERN_WARNING "cciss%d: LUN failure "
-			"detected, action required\n", h->ctlr);
+		dev_warn(&h->pdev->dev, "LUN failure "
+			"detected, action required\n");
 		return 1;
 	break;
 	case REPORT_LUNS_CHANGED:
-		printk(KERN_WARNING "cciss%d: report LUN data "
-			"changed\n", h->ctlr);
-		if (h->rescan_wait)
-			complete(h->rescan_wait);
+		dev_warn(&h->pdev->dev, "report LUN data changed\n");
+	/*
+	 * Here, we could call add_to_scan_list and wake up the scan thread,
+	 * except that it's quite likely that we will get more than one
+	 * REPORT_LUNS_CHANGED condition in quick succession, which means
+	 * that those which occur after the first one will likely happen
+	 * *during* the scan_thread's rescan.  And the rescan code is not
+	 * robust enough to restart in the middle, undoing what it has already
+	 * done, and it's not clear that it's even possible to do this, since
+	 * part of what it does is notify the block layer, which starts
+	 * doing its own i/o to read partition tables and so on, and the
+	 * driver doesn't have visibility to know what might need undoing.
+	 * In any event, if possible, it is horribly complicated to get right
+	 * so we just don't do it for now.
+	 *
+	 * Note: this REPORT_LUNS_CHANGED condition only occurs on the MSA2012.
+	 */
 		return 1;
 	break;
 	case POWER_OR_RESET:
-		printk(KERN_WARNING "cciss%d: a power on "
-			"or device reset detected\n", h->ctlr);
+		dev_warn(&h->pdev->dev,
+			"a power on or device reset detected\n");
 		return 1;
 	break;
 	case UNIT_ATTENTION_CLEARED:
-		printk(KERN_WARNING "cciss%d: unit attention "
-		    "cleared by another initiator\n", h->ctlr);
+		dev_warn(&h->pdev->dev,
+			"unit attention cleared by another initiator\n");
 		return 1;
 	break;
 	default:
-		printk(KERN_WARNING "cciss%d: unknown "
-			"unit attention detected\n", h->ctlr);
-				return 1;
+		dev_warn(&h->pdev->dev, "unknown unit attention detected\n");
+		return 1;
 	}
 }
 
@@ -3284,39 +3817,41 @@ static int check_for_unit_attention(ctlr_info_t *h, CommandList_struct *c)
  *   the io functions.
  *   This is for debug only.
  */
-#ifdef CCISS_DEBUG
-static void print_cfg_table(CfgTable_struct *tb)
+static void print_cfg_table(ctlr_info_t *h)
 {
 	int i;
 	char temp_name[17];
+	CfgTable_struct *tb = h->cfgtable;
 
-	printk("Controller Configuration information\n");
-	printk("------------------------------------\n");
+	dev_dbg(&h->pdev->dev, "Controller Configuration information\n");
+	dev_dbg(&h->pdev->dev, "------------------------------------\n");
 	for (i = 0; i < 4; i++)
 		temp_name[i] = readb(&(tb->Signature[i]));
 	temp_name[4] = '\0';
-	printk("   Signature = %s\n", temp_name);
-	printk("   Spec Number = %d\n", readl(&(tb->SpecValence)));
-	printk("   Transport methods supported = 0x%x\n",
+	dev_dbg(&h->pdev->dev, "   Signature = %s\n", temp_name);
+	dev_dbg(&h->pdev->dev, "   Spec Number = %d\n",
+		readl(&(tb->SpecValence)));
+	dev_dbg(&h->pdev->dev, "   Transport methods supported = 0x%x\n",
 	       readl(&(tb->TransportSupport)));
-	printk("   Transport methods active = 0x%x\n",
+	dev_dbg(&h->pdev->dev, "   Transport methods active = 0x%x\n",
 	       readl(&(tb->TransportActive)));
-	printk("   Requested transport Method = 0x%x\n",
+	dev_dbg(&h->pdev->dev, "   Requested transport Method = 0x%x\n",
 	       readl(&(tb->HostWrite.TransportRequest)));
-	printk("   Coalesce Interrupt Delay = 0x%x\n",
+	dev_dbg(&h->pdev->dev, "   Coalesce Interrupt Delay = 0x%x\n",
 	       readl(&(tb->HostWrite.CoalIntDelay)));
-	printk("   Coalesce Interrupt Count = 0x%x\n",
+	dev_dbg(&h->pdev->dev, "   Coalesce Interrupt Count = 0x%x\n",
 	       readl(&(tb->HostWrite.CoalIntCount)));
-	printk("   Max outstanding commands = 0x%d\n",
+	dev_dbg(&h->pdev->dev, "   Max outstanding commands = 0x%d\n",
 	       readl(&(tb->CmdsOutMax)));
-	printk("   Bus Types = 0x%x\n", readl(&(tb->BusTypes)));
+	dev_dbg(&h->pdev->dev, "   Bus Types = 0x%x\n",
+		readl(&(tb->BusTypes)));
 	for (i = 0; i < 16; i++)
 		temp_name[i] = readb(&(tb->ServerName[i]));
 	temp_name[16] = '\0';
-	printk("   Server Name = %s\n", temp_name);
-	printk("   Heartbeat Counter = 0x%x\n\n\n", readl(&(tb->HeartBeat)));
+	dev_dbg(&h->pdev->dev, "   Server Name = %s\n", temp_name);
+	dev_dbg(&h->pdev->dev, "   Heartbeat Counter = 0x%x\n\n\n",
+		readl(&(tb->HeartBeat)));
 }
-#endif				/* CCISS_DEBUG */
 
 static int find_PCI_BAR_index(struct pci_dev *pdev, unsigned long pci_bar_addr)
 {
@@ -3340,7 +3875,7 @@ static int find_PCI_BAR_index(struct pci_dev *pdev, unsigned long pci_bar_addr)
 				offset += 8;
 				break;
 			default:	/* reserved in PCI 2.2 */
-				printk(KERN_WARNING
+				dev_warn(&pdev->dev,
 				       "Base address is invalid\n");
 				return -1;
 				break;
@@ -3352,12 +3887,186 @@ static int find_PCI_BAR_index(struct pci_dev *pdev, unsigned long pci_bar_addr)
 	return -1;
 }
 
+/* Fill in bucket_map[], given nsgs (the max number of
+ * scatter gather elements supported) and bucket[],
+ * which is an array of num_buckets integers.  The bucket[]
+ * array contains the DMA transfer sizes (in 16 byte
+ * increments) which the controller uses to fetch commands.
+ * bucket_map[i] is set to the index of the first bucket that
+ * can hold a command with i scatter gather elements, or to
+ * num_buckets if none can.  The point of it is to allow the
+ * controller to only do as much DMA as needed to fetch the
+ * command, with the DMA transfer size encoded in the lower
+ * bits of the command address.
+ */
+static void  calc_bucket_map(int bucket[], int num_buckets,
+	int nsgs, int *bucket_map)
+{
+	int i, j, b, size;
+
+	/* even a command with 0 SGs requires 4 blocks */
+#define MINIMUM_TRANSFER_BLOCKS 4
+#define NUM_BUCKETS 8
+	/* Note, bucket_map must have nsgs+1 entries. */
+	for (i = 0; i <= nsgs; i++) {
+		/* Compute size of a command with i SG entries */
+		size = i + MINIMUM_TRANSFER_BLOCKS;
+		b = num_buckets; /* fallback when no bucket fits */
+		/* Find the bucket that is just big enough */
+		for (j = 0; j < num_buckets; j++) {
+			if (bucket[j] >= size) {
+				b = j;
+				break;
+			}
+		}
+		/* for a command with i SG entries, use bucket b. */
+		bucket_map[i] = b;
+	}
+}
+
+static void cciss_wait_for_mode_change_ack(ctlr_info_t *h)
+{
+	int i;
+
+	/* Poll the doorbell until CFGTBL_ChangeReq reads back clear (10-20ms
+	 * per try, MAX_CONFIG_WAIT tries, timeout is silent).  Rarely slow,
+	 * e.g. hot replacing a failed 144GB drive in a RAID 5 set on entry. */
+	for (i = 0; i < MAX_CONFIG_WAIT; i++) {
+		if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
+			break;
+		usleep_range(10000, 20000);
+	}
+}
+
+static void cciss_enter_performant_mode(ctlr_info_t *h, u32 use_short_tags)
+{
+	/* This is a bit complicated.  There are 8 registers on
+	 * the controller which we write to to tell it 8 different
+	 * sizes of commands which there may be.  It's a way of
+	 * reducing the DMA done to fetch each command.  Encoded into
+	 * each command's tag are 3 bits which communicate to the controller
+	 * which of the eight sizes that command fits within.  The size of
+	 * each command depends on how many scatter gather entries there are.
+	 * Each SG entry requires 16 bytes.  The eight registers are programmed
+	 * with the number of 16-byte blocks a command of that size requires.
+	 * The smallest command possible requires 5 such 16 byte blocks.
+	 * the largest command possible requires MAXSGENTRIES + 4 16-byte
+	 * blocks.  Note, this only extends to the SG entries contained
+	 * within the command block, and does not extend to chained blocks
+	 * of SG elements.   bft[] contains the eight values we write to
+	 * the registers.  They are not evenly distributed, but have more
+	 * sizes for small commands, and fewer sizes for larger commands.
+	 */
+	__u32 trans_offset;
+	int bft[8] = { 5, 6, 8, 10, 12, 20, 28, MAXSGENTRIES + 4};
+			/*
+			 *  5 = 1 s/g entry or 4k
+			 *  6 = 2 s/g entry or 8k
+			 *  8 = 4 s/g entry or 16k
+			 * 10 = 6 s/g entry or 24k
+			 */
+	unsigned long register_value;
+	BUILD_BUG_ON(28 > MAXSGENTRIES + 4);
+
+	h->reply_pool_wraparound = 1; /* spec: init to 1 */
+
+	/* Controller spec: zero out this buffer. */
+	memset(h->reply_pool, 0, h->max_commands * sizeof(__u64));
+	h->reply_pool_head = h->reply_pool;
+
+	trans_offset = readl(&(h->cfgtable->TransMethodOffset));
+	calc_bucket_map(bft, ARRAY_SIZE(bft), h->maxsgentries,
+				h->blockFetchTable);
+	writel(bft[0], &h->transtable->BlockFetch0);
+	writel(bft[1], &h->transtable->BlockFetch1);
+	writel(bft[2], &h->transtable->BlockFetch2);
+	writel(bft[3], &h->transtable->BlockFetch3);
+	writel(bft[4], &h->transtable->BlockFetch4);
+	writel(bft[5], &h->transtable->BlockFetch5);
+	writel(bft[6], &h->transtable->BlockFetch6);
+	writel(bft[7], &h->transtable->BlockFetch7);
+
+	/* size of controller ring buffer */
+	writel(h->max_commands, &h->transtable->RepQSize);
+	writel(1, &h->transtable->RepQCount);
+	writel(0, &h->transtable->RepQCtrAddrLow32);
+	writel(0, &h->transtable->RepQCtrAddrHigh32);
+	writel(h->reply_pool_dhandle, &h->transtable->RepQAddr0Low32);
+	writel(0, &h->transtable->RepQAddr0High32);
+	writel(CFGTBL_Trans_Performant | use_short_tags,
+			&(h->cfgtable->HostWrite.TransportRequest));
+
+	writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
+	cciss_wait_for_mode_change_ack(h);
+	register_value = readl(&(h->cfgtable->TransportActive));
+	if (!(register_value & CFGTBL_Trans_Performant))
+		dev_warn(&h->pdev->dev, "cciss: unable to get board into"
+					" performant mode\n");
+}
+
+static void cciss_put_controller_into_performant_mode(ctlr_info_t *h)
+{
+	__u32 trans_support;
+
+	if (cciss_simple_mode)
+		return;
+
+	dev_dbg(&h->pdev->dev, "Trying to put board into Performant mode\n");
+	/* Attempt to put controller into performant mode if supported */
+	/* Does board support performant mode? */
+	trans_support = readl(&(h->cfgtable->TransportSupport));
+	if (!(trans_support & PERFORMANT_MODE))
+		return;
+
+	dev_dbg(&h->pdev->dev, "Placing controller into performant mode\n");
+	/* Performant mode demands commands on a 32 byte boundary
+	 * pci_alloc_consistent aligns on page boundaries already.
+	 * Just need to check if divisible by 32
+	 */
+	if ((sizeof(CommandList_struct) % 32) != 0) {
+		dev_warn(&h->pdev->dev, "%s %d %s\n",
+			"cciss info: command size[",
+			(int)sizeof(CommandList_struct),
+			"] not divisible by 32, no performant mode..");
+		return;
+	}
+
+	/* Performant mode ring buffer and supporting data structures */
+	h->reply_pool = (__u64 *)pci_alloc_consistent(
+		h->pdev, h->max_commands * sizeof(__u64),
+		&(h->reply_pool_dhandle));
+
+	/* Need a block fetch table for performant mode */
+	h->blockFetchTable = kmalloc(((h->maxsgentries+1) *
+		sizeof(__u32)), GFP_KERNEL);
+
+	if ((h->reply_pool == NULL) || (h->blockFetchTable == NULL))
+		goto clean_up;
+
+	cciss_enter_performant_mode(h,
+		trans_support & CFGTBL_Trans_use_short_tags);
+
+	/* Change the access methods to the performant access methods */
+	h->access = SA5_performant_access;
+	h->transMethod = CFGTBL_Trans_Performant;
+
+	return;
+clean_up:
+	/* free whatever was allocated; don't leave dangling pointers in h */
+	kfree(h->blockFetchTable);
+	h->blockFetchTable = NULL;
+	if (h->reply_pool) {
+		pci_free_consistent(h->pdev, h->max_commands * sizeof(__u64),
+				h->reply_pool, h->reply_pool_dhandle);
+		h->reply_pool = NULL;
+	}
+} /* cciss_put_controller_into_performant_mode */
+
 /* If MSI/MSI-X is supported by the kernel we will try to enable it on
  * controllers that are capable. If not, we use IO-APIC mode.
  */
 
-static void __devinit cciss_interrupt_mode(ctlr_info_t *c,
-					   struct pci_dev *pdev, __u32 board_id)
+static void cciss_interrupt_mode(ctlr_info_t *h)
 {
 #ifdef CONFIG_PCI_MSI
 	int err;
@@ -3366,262 +4075,309 @@ static void __devinit cciss_interrupt_mode(ctlr_info_t *c,
 	};
 
 	/* Some boards advertise MSI but don't really support it */
-	if ((board_id == 0x40700E11) ||
-	    (board_id == 0x40800E11) ||
-	    (board_id == 0x40820E11) || (board_id == 0x40830E11))
+	if ((h->board_id == 0x40700E11) || (h->board_id == 0x40800E11) ||
+	    (h->board_id == 0x40820E11) || (h->board_id == 0x40830E11))
 		goto default_int_mode;
 
-	if (pci_find_capability(pdev, PCI_CAP_ID_MSIX)) {
-		err = pci_enable_msix(pdev, cciss_msix_entries, 4);
+	if (pci_find_capability(h->pdev, PCI_CAP_ID_MSIX)) {
+		err = pci_enable_msix_exact(h->pdev, cciss_msix_entries, 4);
 		if (!err) {
-			c->intr[0] = cciss_msix_entries[0].vector;
-			c->intr[1] = cciss_msix_entries[1].vector;
-			c->intr[2] = cciss_msix_entries[2].vector;
-			c->intr[3] = cciss_msix_entries[3].vector;
-			c->msix_vector = 1;
+			h->intr[0] = cciss_msix_entries[0].vector;
+			h->intr[1] = cciss_msix_entries[1].vector;
+			h->intr[2] = cciss_msix_entries[2].vector;
+			h->intr[3] = cciss_msix_entries[3].vector;
+			h->msix_vector = 1;
 			return;
-		}
-		if (err > 0) {
-			printk(KERN_WARNING "cciss: only %d MSI-X vectors "
-			       "available\n", err);
-			goto default_int_mode;
 		} else {
-			printk(KERN_WARNING "cciss: MSI-X init failed %d\n",
-			       err);
-			goto default_int_mode;
+			dev_warn(&h->pdev->dev,
+				"MSI-X init failed %d\n", err);
 		}
 	}
-	if (pci_find_capability(pdev, PCI_CAP_ID_MSI)) {
-		if (!pci_enable_msi(pdev)) {
-			c->msi_vector = 1;
-		} else {
-			printk(KERN_WARNING "cciss: MSI init failed\n");
-		}
+	if (pci_find_capability(h->pdev, PCI_CAP_ID_MSI)) {
+		if (!pci_enable_msi(h->pdev))
+			h->msi_vector = 1;
+		else
+			dev_warn(&h->pdev->dev, "MSI init failed\n");
 	}
 default_int_mode:
 #endif				/* CONFIG_PCI_MSI */
 	/* if we get here we're going to use the default interrupt mode */
-	c->intr[SIMPLE_MODE_INT] = pdev->irq;
+	h->intr[h->intr_mode] = h->pdev->irq;
 	return;
 }
 
-static int __devinit cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
+static int cciss_lookup_board_id(struct pci_dev *pdev, u32 *board_id)
 {
-	ushort subsystem_vendor_id, subsystem_device_id, command;
-	__u32 board_id, scratchpad = 0;
-	__u64 cfg_offset;
-	__u32 cfg_base_addr;
-	__u64 cfg_base_addr_index;
-	int i, err;
-
-	/* check to see if controller has been disabled */
-	/* BEFORE trying to enable it */
-	(void)pci_read_config_word(pdev, PCI_COMMAND, &command);
-	if (!(command & 0x02)) {
-		printk(KERN_WARNING
-		       "cciss: controller appears to be disabled\n");
-		return -ENODEV;
-	}
+	int i;
+	u32 subsystem_vendor_id, subsystem_device_id;
 
-	err = pci_enable_device(pdev);
-	if (err) {
-		printk(KERN_ERR "cciss: Unable to Enable PCI device\n");
-		return err;
-	}
+	subsystem_vendor_id = pdev->subsystem_vendor;
+	subsystem_device_id = pdev->subsystem_device;
+	*board_id = ((subsystem_device_id << 16) & 0xffff0000) |
+			subsystem_vendor_id;
 
-	err = pci_request_regions(pdev, "cciss");
-	if (err) {
-		printk(KERN_ERR "cciss: Cannot obtain PCI resources, "
-		       "aborting\n");
-		return err;
+	for (i = 0; i < ARRAY_SIZE(products); i++) {
+		/* Stand aside for hpsa driver on request */
+		if (cciss_allow_hpsa)
+			return -ENODEV;
+		if (*board_id == products[i].board_id)
+			return i;
 	}
+	dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n",
+		*board_id);
+	return -ENODEV;
+}
 
-	subsystem_vendor_id = pdev->subsystem_vendor;
-	subsystem_device_id = pdev->subsystem_device;
-	board_id = (((__u32) (subsystem_device_id << 16) & 0xffff0000) |
-		    subsystem_vendor_id);
+static inline bool cciss_board_disabled(ctlr_info_t *h)
+{
+	u16 command;
 
-#ifdef CCISS_DEBUG
-	printk("command = %x\n", command);
-	printk("irq = %x\n", pdev->irq);
-	printk("board_id = %x\n", board_id);
-#endif				/* CCISS_DEBUG */
+	(void) pci_read_config_word(h->pdev, PCI_COMMAND, &command);
+	return ((command & PCI_COMMAND_MEMORY) == 0);
+}
 
-/* If the kernel supports MSI/MSI-X we will try to enable that functionality,
- * else we use the IO-APIC interrupt assigned to us by system ROM.
- */
-	cciss_interrupt_mode(c, pdev, board_id);
+static int cciss_pci_find_memory_BAR(struct pci_dev *pdev,
+				     unsigned long *memory_bar)
+{
+	int i;
 
-	/* find the memory BAR */
-	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
-		if (pci_resource_flags(pdev, i) & IORESOURCE_MEM)
-			break;
-	}
-	if (i == DEVICE_COUNT_RESOURCE) {
-		printk(KERN_WARNING "cciss: No memory BAR found\n");
-		err = -ENODEV;
-		goto err_out_free_res;
-	}
+	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++)
+		if (pci_resource_flags(pdev, i) & IORESOURCE_MEM) {
+			/* addressing mode bits already removed */
+			*memory_bar = pci_resource_start(pdev, i);
+			dev_dbg(&pdev->dev, "memory BAR = %lx\n",
+				*memory_bar);
+			return 0;
+		}
+	dev_warn(&pdev->dev, "no memory BAR found\n");
+	return -ENODEV;
+}
 
-	c->paddr = pci_resource_start(pdev, i); /* addressing mode bits
-						 * already removed
-						 */
+static int cciss_wait_for_board_state(struct pci_dev *pdev,
+				      void __iomem *vaddr, int wait_for_ready)
+#define BOARD_READY 1
+#define BOARD_NOT_READY 0
+{
+	int i, iterations;
+	u32 scratchpad;
 
-#ifdef CCISS_DEBUG
-	printk("address 0 = %lx\n", c->paddr);
-#endif				/* CCISS_DEBUG */
-	c->vaddr = remap_pci_mem(c->paddr, 0x250);
+	if (wait_for_ready)
+		iterations = CCISS_BOARD_READY_ITERATIONS;
+	else
+		iterations = CCISS_BOARD_NOT_READY_ITERATIONS;
 
-	/* Wait for the board to become ready.  (PCI hotplug needs this.)
-	 * We poll for up to 120 secs, once per 100ms. */
-	for (i = 0; i < 1200; i++) {
-		scratchpad = readl(c->vaddr + SA5_SCRATCHPAD_OFFSET);
-		if (scratchpad == CCISS_FIRMWARE_READY)
-			break;
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(HZ / 10);	/* wait 100ms */
+	for (i = 0; i < iterations; i++) {
+		scratchpad = readl(vaddr + SA5_SCRATCHPAD_OFFSET);
+		if (wait_for_ready) {
+			if (scratchpad == CCISS_FIRMWARE_READY)
+				return 0;
+		} else {
+			if (scratchpad != CCISS_FIRMWARE_READY)
+				return 0;
+		}
+		msleep(CCISS_BOARD_READY_POLL_INTERVAL_MSECS);
 	}
-	if (scratchpad != CCISS_FIRMWARE_READY) {
-		printk(KERN_WARNING "cciss: Board not ready.  Timed out.\n");
-		err = -ENODEV;
-		goto err_out_free_res;
+	dev_warn(&pdev->dev, "board not ready, timed out.\n");
+	return -ENODEV;
+}
+
+static int cciss_find_cfg_addrs(struct pci_dev *pdev, void __iomem *vaddr,
+				u32 *cfg_base_addr, u64 *cfg_base_addr_index,
+				u64 *cfg_offset)
+{
+	*cfg_base_addr = readl(vaddr + SA5_CTCFG_OFFSET);
+	*cfg_offset = readl(vaddr + SA5_CTMEM_OFFSET);
+	*cfg_base_addr &= (u32) 0x0000ffff;
+	*cfg_base_addr_index = find_PCI_BAR_index(pdev, *cfg_base_addr);
+	if (*cfg_base_addr_index == -1) {
+		dev_warn(&pdev->dev, "cannot find cfg_base_addr_index, "
+			"*cfg_base_addr = 0x%08x\n", *cfg_base_addr);
+		return -ENODEV;
 	}
+	return 0;
+}
 
-	/* get the address index number */
-	cfg_base_addr = readl(c->vaddr + SA5_CTCFG_OFFSET);
-	cfg_base_addr &= (__u32) 0x0000ffff;
-#ifdef CCISS_DEBUG
-	printk("cfg base address = %x\n", cfg_base_addr);
-#endif				/* CCISS_DEBUG */
-	cfg_base_addr_index = find_PCI_BAR_index(pdev, cfg_base_addr);
-#ifdef CCISS_DEBUG
-	printk("cfg base address index = %llx\n",
-		(unsigned long long)cfg_base_addr_index);
-#endif				/* CCISS_DEBUG */
-	if (cfg_base_addr_index == -1) {
-		printk(KERN_WARNING "cciss: Cannot find cfg_base_addr_index\n");
-		err = -ENODEV;
-		goto err_out_free_res;
+static int cciss_find_cfgtables(ctlr_info_t *h)
+{
+	u64 cfg_offset;
+	u32 cfg_base_addr;
+	u64 cfg_base_addr_index;
+	u32 trans_offset;
+	int rc;
+
+	rc = cciss_find_cfg_addrs(h->pdev, h->vaddr, &cfg_base_addr,
+		&cfg_base_addr_index, &cfg_offset);
+	if (rc)
+		return rc;
+	h->cfgtable = remap_pci_mem(pci_resource_start(h->pdev,
+		cfg_base_addr_index) + cfg_offset, sizeof(*h->cfgtable));
+	if (!h->cfgtable)
+		return -ENOMEM;
+	rc = write_driver_ver_to_cfgtable(h->cfgtable);
+	if (rc)
+		return rc;
+	/* Find performant mode table. */
+	trans_offset = readl(&h->cfgtable->TransMethodOffset);
+	h->transtable = remap_pci_mem(pci_resource_start(h->pdev,
+				cfg_base_addr_index)+cfg_offset+trans_offset,
+				sizeof(*h->transtable));
+	if (!h->transtable)
+		return -ENOMEM;
+	return 0;
+}
+
+static void cciss_get_max_perf_mode_cmds(struct ctlr_info *h)
+{
+	h->max_commands = readl(&(h->cfgtable->MaxPerformantModeCommands));
+
+	/* Limit commands in memory limited kdump scenario. */
+	if (reset_devices && h->max_commands > 32)
+		h->max_commands = 32;
+
+	if (h->max_commands < 16) {
+		dev_warn(&h->pdev->dev, "Controller reports "
+			"max supported commands of %d, an obvious lie. "
+			"Using 16.  Ensure that firmware is up to date.\n",
+			h->max_commands);
+		h->max_commands = 16;
 	}
+}
 
-	cfg_offset = readl(c->vaddr + SA5_CTMEM_OFFSET);
-#ifdef CCISS_DEBUG
-	printk("cfg offset = %llx\n", (unsigned long long)cfg_offset);
-#endif				/* CCISS_DEBUG */
-	c->cfgtable = remap_pci_mem(pci_resource_start(pdev,
-						       cfg_base_addr_index) +
-				    cfg_offset, sizeof(CfgTable_struct));
-	c->board_id = board_id;
-
-#ifdef CCISS_DEBUG
-	print_cfg_table(c->cfgtable);
-#endif				/* CCISS_DEBUG */
-
-	/* Some controllers support Zero Memory Raid (ZMR).
-	 * When configured in ZMR mode the number of supported
-	 * commands drops to 64. So instead of just setting an
-	 * arbitrary value we make the driver a little smarter.
-	 * We read the config table to tell us how many commands
-	 * are supported on the controller then subtract 4 to
-	 * leave a little room for ioctl calls.
+/* Interrogate the hardware for some limits:
+ * max commands, max SG elements without chaining, and with chaining,
+ * SG chain block size, etc.
+ */
+static void cciss_find_board_params(ctlr_info_t *h)
+{
+	cciss_get_max_perf_mode_cmds(h);
+	h->nr_cmds = h->max_commands - 4 - cciss_tape_cmds;
+	h->maxsgentries = readl(&(h->cfgtable->MaxSGElements));
+	/*
+	 * The P600 may exhibit poor performance under some workloads
+	 * if we use the value in the configuration table. Limit this
+	 * controller to MAXSGENTRIES (32) instead.
 	 */
-	c->max_commands = readl(&(c->cfgtable->CmdsOutMax));
-	for (i = 0; i < ARRAY_SIZE(products); i++) {
-		if (board_id == products[i].board_id) {
-			c->product_name = products[i].product_name;
-			c->access = *(products[i].access);
-			c->nr_cmds = c->max_commands - 4;
-			break;
-		}
-	}
-	if ((readb(&c->cfgtable->Signature[0]) != 'C') ||
-	    (readb(&c->cfgtable->Signature[1]) != 'I') ||
-	    (readb(&c->cfgtable->Signature[2]) != 'S') ||
-	    (readb(&c->cfgtable->Signature[3]) != 'S')) {
-		printk("Does not appear to be a valid CISS config table\n");
-		err = -ENODEV;
-		goto err_out_free_res;
-	}
-	/* We didn't find the controller in our list. We know the
-	 * signature is valid. If it's an HP device let's try to
-	 * bind to the device and fire it up. Otherwise we bail.
+	if (h->board_id == 0x3225103C)
+		h->maxsgentries = MAXSGENTRIES;
+	/*
+	 * Limit in-command s/g elements to 32 to save dma'able memory.
+	 * However spec says if 0, use 31
 	 */
-	if (i == ARRAY_SIZE(products)) {
-		if (subsystem_vendor_id == PCI_VENDOR_ID_HP) {
-			c->product_name = products[i-1].product_name;
-			c->access = *(products[i-1].access);
-			c->nr_cmds = c->max_commands - 4;
-			printk(KERN_WARNING "cciss: This is an unknown "
-				"Smart Array controller.\n"
-				"cciss: Please update to the latest driver "
-				"available from www.hp.com.\n");
-		} else {
-			printk(KERN_WARNING "cciss: Sorry, I don't know how"
-				" to access the Smart Array controller %08lx\n"
-					, (unsigned long)board_id);
-			err = -ENODEV;
-			goto err_out_free_res;
-		}
+	h->max_cmd_sgentries = 31;
+	if (h->maxsgentries > 512) {
+		h->max_cmd_sgentries = 32;
+		h->chainsize = h->maxsgentries - h->max_cmd_sgentries + 1;
+		h->maxsgentries--; /* save one for chain pointer */
+	} else {
+		h->maxsgentries = 31; /* default to traditional values */
+		h->chainsize = 0;
 	}
-#ifdef CONFIG_X86
-	{
-		/* Need to enable prefetch in the SCSI core for 6400 in x86 */
-		__u32 prefetch;
-		prefetch = readl(&(c->cfgtable->SCSI_Prefetch));
-		prefetch |= 0x100;
-		writel(prefetch, &(c->cfgtable->SCSI_Prefetch));
+}
+
+static inline bool CISS_signature_present(ctlr_info_t *h)
+{
+	if (!check_signature(h->cfgtable->Signature, "CISS", 4)) {
+		dev_warn(&h->pdev->dev, "not a valid CISS config table\n");
+		return false;
 	}
+	return true;
+}
+
+/* Need to enable prefetch in the SCSI core for 6400 in x86 */
+static inline void cciss_enable_scsi_prefetch(ctlr_info_t *h)
+{
+#ifdef CONFIG_X86
+	u32 prefetch;
+
+	prefetch = readl(&(h->cfgtable->SCSI_Prefetch));
+	prefetch |= 0x100;
+	writel(prefetch, &(h->cfgtable->SCSI_Prefetch));
 #endif
+}
 
-	/* Disabling DMA prefetch and refetch for the P600.
-	 * An ASIC bug may result in accesses to invalid memory addresses.
-	 * We've disabled prefetch for some time now. Testing with XEN
-	 * kernels revealed a bug in the refetch if dom0 resides on a P600.
-	 */
-	if(board_id == 0x3225103C) {
-		__u32 dma_prefetch;
-		__u32 dma_refetch;
-		dma_prefetch = readl(c->vaddr + I2O_DMA1_CFG);
-		dma_prefetch |= 0x8000;
-		writel(dma_prefetch, c->vaddr + I2O_DMA1_CFG);
-		pci_read_config_dword(pdev, PCI_COMMAND_PARITY, &dma_refetch);
-		dma_refetch |= 0x1;
-		pci_write_config_dword(pdev, PCI_COMMAND_PARITY, dma_refetch);
-	}
-
-#ifdef CCISS_DEBUG
-	printk("Trying to put board into Simple mode\n");
-#endif				/* CCISS_DEBUG */
-	c->max_commands = readl(&(c->cfgtable->CmdsOutMax));
-	/* Update the field, and then ring the doorbell */
-	writel(CFGTBL_Trans_Simple, &(c->cfgtable->HostWrite.TransportRequest));
-	writel(CFGTBL_ChangeReq, c->vaddr + SA5_DOORBELL);
+/* Disable DMA prefetch for the P600.  Otherwise an ASIC bug may result
+ * in a prefetch beyond physical memory.
+ */
+static inline void cciss_p600_dma_prefetch_quirk(ctlr_info_t *h)
+{
+	u32 dma_prefetch;
+	__u32 dma_refetch;
 
-	/* under certain very rare conditions, this can take awhile.
-	 * (e.g.: hot replace a failed 144GB drive in a RAID 5 set right
-	 * as we enter this code.) */
-	for (i = 0; i < MAX_CONFIG_WAIT; i++) {
-		if (!(readl(c->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
-			break;
-		/* delay and try again */
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(10);
+	if (h->board_id != 0x3225103C)
+		return;
+	dma_prefetch = readl(h->vaddr + I2O_DMA1_CFG);
+	dma_prefetch |= 0x8000;
+	writel(dma_prefetch, h->vaddr + I2O_DMA1_CFG);
+	pci_read_config_dword(h->pdev, PCI_COMMAND_PARITY, &dma_refetch);
+	dma_refetch |= 0x1;
+	pci_write_config_dword(h->pdev, PCI_COMMAND_PARITY, dma_refetch);
+}
+
+static int cciss_pci_init(ctlr_info_t *h)
+{
+	int prod_index, err;
+
+	prod_index = cciss_lookup_board_id(h->pdev, &h->board_id);
+	if (prod_index < 0)
+		return -ENODEV;
+	h->product_name = products[prod_index].product_name;
+	h->access = *(products[prod_index].access);
+
+	if (cciss_board_disabled(h)) {
+		dev_warn(&h->pdev->dev, "controller appears to be disabled\n");
+		return -ENODEV;
 	}
 
-#ifdef CCISS_DEBUG
-	printk(KERN_DEBUG "I counter got to %d %x\n", i,
-	       readl(c->vaddr + SA5_DOORBELL));
-#endif				/* CCISS_DEBUG */
-#ifdef CCISS_DEBUG
-	print_cfg_table(c->cfgtable);
-#endif				/* CCISS_DEBUG */
+	pci_disable_link_state(h->pdev, PCIE_LINK_STATE_L0S |
+				PCIE_LINK_STATE_L1 | PCIE_LINK_STATE_CLKPM);
 
-	if (!(readl(&(c->cfgtable->TransportActive)) & CFGTBL_Trans_Simple)) {
-		printk(KERN_WARNING "cciss: unable to get board into"
-		       " simple mode\n");
+	err = pci_enable_device(h->pdev);
+	if (err) {
+		dev_warn(&h->pdev->dev, "Unable to Enable PCI device\n");
+		return err;
+	}
+
+	err = pci_request_regions(h->pdev, "cciss");
+	if (err) {
+		dev_warn(&h->pdev->dev,
+			"Cannot obtain PCI resources, aborting\n");
+		return err;
+	}
+
+	dev_dbg(&h->pdev->dev, "irq = %x\n", h->pdev->irq);
+	dev_dbg(&h->pdev->dev, "board_id = %x\n", h->board_id);
+
+/* If the kernel supports MSI/MSI-X we will try to enable that functionality,
+ * else we use the IO-APIC interrupt assigned to us by system ROM.
+ */
+	cciss_interrupt_mode(h);
+	err = cciss_pci_find_memory_BAR(h->pdev, &h->paddr);
+	if (err)
+		goto err_out_free_res;
+	h->vaddr = remap_pci_mem(h->paddr, 0x250);
+	if (!h->vaddr) {
+		err = -ENOMEM;
+		goto err_out_free_res;
+	}
+	err = cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_READY);
+	if (err)
+		goto err_out_free_res;
+	err = cciss_find_cfgtables(h);
+	if (err)
+		goto err_out_free_res;
+	print_cfg_table(h);
+	cciss_find_board_params(h);
+
+	if (!CISS_signature_present(h)) {
 		err = -ENODEV;
 		goto err_out_free_res;
 	}
+	cciss_enable_scsi_prefetch(h);
+	cciss_p600_dma_prefetch_quirk(h);
+	err = cciss_enter_simple_mode(h);
+	if (err)
+		goto err_out_free_res;
+	cciss_put_controller_into_performant_mode(h);
 	return 0;
 
 err_out_free_res:
@@ -3629,49 +4385,56 @@ err_out_free_res:
 	 * Deliberately omit pci_disable_device(): it does something nasty to
 	 * Smart Array controllers that pci_enable_device does not undo
 	 */
-	pci_release_regions(pdev);
+	if (h->transtable)
+		iounmap(h->transtable);
+	if (h->cfgtable)
+		iounmap(h->cfgtable);
+	if (h->vaddr)
+		iounmap(h->vaddr);
+	pci_release_regions(h->pdev);
 	return err;
 }
 
 /* Function to find the first free pointer into our hba[] array
  * Returns -1 if no free entries are left.
  */
-static int alloc_cciss_hba(void)
+static int alloc_cciss_hba(struct pci_dev *pdev)
 {
 	int i;
 
 	for (i = 0; i < MAX_CTLR; i++) {
 		if (!hba[i]) {
-			ctlr_info_t *p;
+			ctlr_info_t *h;
 
-			p = kzalloc(sizeof(ctlr_info_t), GFP_KERNEL);
-			if (!p)
+			h = kzalloc(sizeof(ctlr_info_t), GFP_KERNEL);
+			if (!h)
 				goto Enomem;
-			hba[i] = p;
+			hba[i] = h;
 			return i;
 		}
 	}
-	printk(KERN_WARNING "cciss: This driver supports a maximum"
+	dev_warn(&pdev->dev, "This driver supports a maximum"
 	       " of %d controllers.\n", MAX_CTLR);
 	return -1;
 Enomem:
-	printk(KERN_ERR "cciss: out of memory.\n");
+	dev_warn(&pdev->dev, "out of memory.\n");
 	return -1;
 }
 
-static void free_hba(int i)
+static void free_hba(ctlr_info_t *h)
 {
-	ctlr_info_t *p = hba[i];
-	int n;
+	int i;
 
-	hba[i] = NULL;
-	for (n = 0; n < CISS_MAX_LUN; n++)
-		put_disk(p->gendisk[n]);
-	kfree(p);
+	hba[h->ctlr] = NULL;
+	for (i = 0; i < h->highest_lun + 1; i++)
+		if (h->gendisk[i] != NULL)
+			put_disk(h->gendisk[i]);
+	kfree(h);
 }
 
 /* Send a message CDB to the firmware. */
-static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, unsigned char type)
+static int cciss_message(struct pci_dev *pdev, unsigned char opcode,
+			 unsigned char type)
 {
 	typedef struct {
 		CommandListHeader_struct CommandHeader;
@@ -3735,7 +4498,7 @@ static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, u
 		tag = readl(vaddr + SA5_REPLY_PORT_OFFSET);
 		if ((tag & ~3) == paddr32)
 			break;
-		schedule_timeout_uninterruptible(HZ);
+		msleep(CCISS_POST_RESET_NOOP_TIMEOUT_MSECS);
 	}
 
 	iounmap(vaddr);
@@ -3743,7 +4506,8 @@ static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, u
 	/* we leak the DMA buffer here ... no choice since the controller could
 	   still complete the command. */
 	if (i == 10) {
-		printk(KERN_ERR "cciss: controller message %02x:%02x timed out\n",
+		dev_err(&pdev->dev,
+			"controller message %02x:%02x timed out\n",
 			opcode, type);
 		return -ETIMEDOUT;
 	}
@@ -3751,172 +4515,513 @@ static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, u
 	pci_free_consistent(pdev, cmd_sz, cmd, paddr64);
 
 	if (tag & 2) {
-		printk(KERN_ERR "cciss: controller message %02x:%02x failed\n",
+		dev_err(&pdev->dev, "controller message %02x:%02x failed\n",
 			opcode, type);
 		return -EIO;
 	}
 
-	printk(KERN_INFO "cciss: controller message %02x:%02x succeeded\n",
+	dev_info(&pdev->dev, "controller message %02x:%02x succeeded\n",
 		opcode, type);
 	return 0;
 }
 
-#define cciss_soft_reset_controller(p) cciss_message(p, 1, 0)
 #define cciss_noop(p) cciss_message(p, 3, 0)
 
-static __devinit int cciss_reset_msi(struct pci_dev *pdev)
+static int cciss_controller_hard_reset(struct pci_dev *pdev,
+	void __iomem *vaddr, u32 use_doorbell)
 {
-/* the #defines are stolen from drivers/pci/msi.h. */
-#define msi_control_reg(base)		(base + PCI_MSI_FLAGS)
-#define PCI_MSIX_FLAGS_ENABLE		(1 << 15)
-
+	u16 pmcsr;
 	int pos;
-	u16 control = 0;
-
-	pos = pci_find_capability(pdev, PCI_CAP_ID_MSI);
-	if (pos) {
-		pci_read_config_word(pdev, msi_control_reg(pos), &control);
-		if (control & PCI_MSI_FLAGS_ENABLE) {
-			printk(KERN_INFO "cciss: resetting MSI\n");
-			pci_write_config_word(pdev, msi_control_reg(pos), control & ~PCI_MSI_FLAGS_ENABLE);
-		}
-	}
 
-	pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
-	if (pos) {
-		pci_read_config_word(pdev, msi_control_reg(pos), &control);
-		if (control & PCI_MSIX_FLAGS_ENABLE) {
-			printk(KERN_INFO "cciss: resetting MSI-X\n");
-			pci_write_config_word(pdev, msi_control_reg(pos), control & ~PCI_MSIX_FLAGS_ENABLE);
+	if (use_doorbell) {
+		/* For everything after the P600, the PCI power state method
+		 * of resetting the controller doesn't work, so we have this
+		 * other way using the doorbell register.
+		 */
+		dev_info(&pdev->dev, "using doorbell to reset controller\n");
+		writel(use_doorbell, vaddr + SA5_DOORBELL);
+	} else { /* Try to do it the PCI power state way */
+
+		/* Quoting from the Open CISS Specification: "The Power
+		 * Management Control/Status Register (CSR) controls the power
+		 * state of the device.  The normal operating state is D0,
+		 * CSR=00h.  The software off state is D3, CSR=03h.  To reset
+		 * the controller, place the interface device in D3 then to D0,
+		 * this causes a secondary PCI reset which will reset the
+		 * controller." */
+
+		pos = pci_find_capability(pdev, PCI_CAP_ID_PM);
+		if (pos == 0) {
+			dev_err(&pdev->dev,
+				"cciss_controller_hard_reset: "
+				"PCI PM not supported\n");
+			return -ENODEV;
 		}
+		dev_info(&pdev->dev, "using PCI PM to reset controller\n");
+		/* enter the D3hot power management state */
+		pci_read_config_word(pdev, pos + PCI_PM_CTRL, &pmcsr);
+		pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
+		pmcsr |= PCI_D3hot;
+		pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr);
+
+		msleep(500);
+
+		/* enter the D0 power management state */
+		pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
+		pmcsr |= PCI_D0;
+		pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr);
+
+		/*
+		 * The P600 requires a small delay when changing states.
+		 * Otherwise we may think the board did not reset and we bail.
+		 * This is for kdump only and is particular to the P600.
+		 */
+		msleep(500);
 	}
+	return 0;
+}
+
+static void init_driver_version(char *driver_version, int len)
+{
+	memset(driver_version, 0, len);
+	strncpy(driver_version, "cciss " DRIVER_NAME, len - 1);
+}
+
+static int write_driver_ver_to_cfgtable(CfgTable_struct __iomem *cfgtable)
+{
+	char *driver_version;
+	int i, size = sizeof(cfgtable->driver_version);
+
+	driver_version = kmalloc(size, GFP_KERNEL);
+	if (!driver_version)
+		return -ENOMEM;
 
+	init_driver_version(driver_version, size);
+	for (i = 0; i < size; i++)
+		writeb(driver_version[i], &cfgtable->driver_version[i]);
+	kfree(driver_version);
 	return 0;
 }
 
+static void read_driver_ver_from_cfgtable(CfgTable_struct __iomem *cfgtable,
+					  unsigned char *driver_ver)
+{
+	int i;
+
+	for (i = 0; i < sizeof(cfgtable->driver_version); i++)
+		driver_ver[i] = readb(&cfgtable->driver_version[i]);
+}
+
+static int controller_reset_failed(CfgTable_struct __iomem *cfgtable)
+{
+
+	char *driver_ver, *old_driver_ver;
+	int rc, size = sizeof(cfgtable->driver_version);
+
+	old_driver_ver = kmalloc(2 * size, GFP_KERNEL);
+	if (!old_driver_ver)
+		return -ENOMEM;
+	driver_ver = old_driver_ver + size;
+
+	/* After a reset, the 32 bytes of "driver version" in the cfgtable
+	 * should have been changed, otherwise we know the reset failed.
+	 */
+	init_driver_version(old_driver_ver, size);
+	read_driver_ver_from_cfgtable(cfgtable, driver_ver);
+	rc = !memcmp(driver_ver, old_driver_ver, size);
+	kfree(old_driver_ver);
+	return rc;
+}
+
 /* This does a hard reset of the controller using PCI power management
- * states. */
-static __devinit int cciss_hard_reset_controller(struct pci_dev *pdev)
+ * states or using the doorbell register. */
+static int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
 {
-	u16 pmcsr, saved_config_space[32];
-	int i, pos;
+	u64 cfg_offset;
+	u32 cfg_base_addr;
+	u64 cfg_base_addr_index;
+	void __iomem *vaddr;
+	unsigned long paddr;
+	u32 misc_fw_support;
+	int rc;
+	CfgTable_struct __iomem *cfgtable;
+	u32 use_doorbell;
+	u32 board_id;
+	u16 command_register;
+
+	/* For controllers as old as the P600, this is very nearly
+	 * the same thing as
+	 *
+	 * pci_save_state(pci_dev);
+	 * pci_set_power_state(pci_dev, PCI_D3hot);
+	 * pci_set_power_state(pci_dev, PCI_D0);
+	 * pci_restore_state(pci_dev);
+	 *
+	 * For controllers newer than the P600, the pci power state
+	 * method of resetting doesn't work so we have another way
+	 * using the doorbell register.
+	 */
+
+	/* Exclude 640x boards.  These are two pci devices in one slot
+	 * which share a battery backed cache module.  One controls the
+	 * cache, the other accesses the cache through the one that controls
+	 * it.  If we reset the one controlling the cache, the other will
+	 * likely not be happy.  Just forbid resetting this conjoined mess.
+	 */
+	cciss_lookup_board_id(pdev, &board_id);
+	if (!ctlr_is_resettable(board_id)) {
+		dev_warn(&pdev->dev, "Cannot reset Smart Array 640x "
+				"due to shared cache module.\n");
+		return -ENODEV;
+	}
+
+	/* if controller is soft- but not hard resettable... */
+	if (!ctlr_is_hard_resettable(board_id))
+		return -ENOTSUPP; /* try soft reset later. */
+
+	/* Save the PCI command register */
+	pci_read_config_word(pdev, 4, &command_register);
+	/* Turn the board off.  This is so that later pci_restore_state()
+	 * won't turn the board on before the rest of config space is ready.
+	 */
+	pci_disable_device(pdev);
+	pci_save_state(pdev);
+
+	/* find the first memory BAR, so we can find the cfg table */
+	rc = cciss_pci_find_memory_BAR(pdev, &paddr);
+	if (rc)
+		return rc;
+	vaddr = remap_pci_mem(paddr, 0x250);
+	if (!vaddr)
+		return -ENOMEM;
+
+	/* find cfgtable in order to check if reset via doorbell is supported */
+	rc = cciss_find_cfg_addrs(pdev, vaddr, &cfg_base_addr,
+					&cfg_base_addr_index, &cfg_offset);
+	if (rc)
+		goto unmap_vaddr;
+	cfgtable = remap_pci_mem(pci_resource_start(pdev,
+		       cfg_base_addr_index) + cfg_offset, sizeof(*cfgtable));
+	if (!cfgtable) {
+		rc = -ENOMEM;
+		goto unmap_vaddr;
+	}
+	rc = write_driver_ver_to_cfgtable(cfgtable);
+	if (rc)
+		goto unmap_cfgtable;
+
+	/* If reset via doorbell register is supported, use that.
+	 * There are two such methods.  Favor the newest method.
+	 */
+	misc_fw_support = readl(&cfgtable->misc_fw_support);
+	use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET2;
+	if (use_doorbell) {
+		use_doorbell = DOORBELL_CTLR_RESET2;
+	} else {
+		use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET;
+		if (use_doorbell) {
+			dev_warn(&pdev->dev, "Controller claims that "
+				"'Bit 2 doorbell reset' is "
+				"supported, but not 'bit 5 doorbell reset'.  "
+				"Firmware update is recommended.\n");
+			rc = -ENOTSUPP; /* use the soft reset */
+			goto unmap_cfgtable;
+		}
+	}
+
+	rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell);
+	if (rc)
+		goto unmap_cfgtable;
+	pci_restore_state(pdev);
+	rc = pci_enable_device(pdev);
+	if (rc) {
+		dev_warn(&pdev->dev, "failed to enable device.\n");
+		goto unmap_cfgtable;
+	}
+	pci_write_config_word(pdev, 4, command_register);
+
+	/* Some devices (notably the HP Smart Array 5i Controller)
+	   need a little pause here */
+	msleep(CCISS_POST_RESET_PAUSE_MSECS);
+
+	/* Wait for board to become not ready, then ready. */
+	dev_info(&pdev->dev, "Waiting for board to reset.\n");
+	rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
+	if (rc) {
+		dev_warn(&pdev->dev, "Failed waiting for board to hard reset."
+				"  Will try soft reset.\n");
+		rc = -ENOTSUPP; /* Not expected, but try soft reset later */
+		goto unmap_cfgtable;
+	}
+	rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_READY);
+	if (rc) {
+		dev_warn(&pdev->dev,
+			"failed waiting for board to become ready "
+			"after hard reset\n");
+		goto unmap_cfgtable;
+	}
+
+	rc = controller_reset_failed(cfgtable);
+	if (rc < 0)
+		goto unmap_cfgtable;
+	if (rc) {
+		dev_warn(&pdev->dev, "Unable to successfully hard reset "
+			"controller. Will try soft reset.\n");
+		rc = -ENOTSUPP; /* Not expected, but try soft reset later */
+	} else {
+		dev_info(&pdev->dev, "Board ready after hard reset.\n");
+	}
 
-	printk(KERN_INFO "cciss: using PCI PM to reset controller\n");
+unmap_cfgtable:
+	iounmap(cfgtable);
 
-	/* This is very nearly the same thing as
+unmap_vaddr:
+	iounmap(vaddr);
+	return rc;
+}
 
-	   pci_save_state(pci_dev);
-	   pci_set_power_state(pci_dev, PCI_D3hot);
-	   pci_set_power_state(pci_dev, PCI_D0);
-	   pci_restore_state(pci_dev);
+static int cciss_init_reset_devices(struct pci_dev *pdev)
+{
+	int rc, i;
 
-	   but we can't use these nice canned kernel routines on
-	   kexec, because they also check the MSI/MSI-X state in PCI
-	   configuration space and do the wrong thing when it is
-	   set/cleared.  Also, the pci_save/restore_state functions
-	   violate the ordering requirements for restoring the
-	   configuration space from the CCISS document (see the
-	   comment below).  So we roll our own .... */
+	if (!reset_devices)
+		return 0;
 
-	for (i = 0; i < 32; i++)
-		pci_read_config_word(pdev, 2*i, &saved_config_space[i]);
+	/* Reset the controller with a PCI power-cycle or via doorbell */
+	rc = cciss_kdump_hard_reset_controller(pdev);
 
-	pos = pci_find_capability(pdev, PCI_CAP_ID_PM);
-	if (pos == 0) {
-		printk(KERN_ERR "cciss_reset_controller: PCI PM not supported\n");
+	/* -ENOTSUPP here means we cannot reset the controller
+	 * but it's already (and still) up and running in
+	 * "performant mode".  Or, it might be 640x, which can't reset
+	 * due to concerns about shared bbwc between 6402/6404 pair.
+	 */
+	if (rc == -ENOTSUPP)
+		return rc; /* just try to do the kdump anyhow. */
+	if (rc)
 		return -ENODEV;
+
+	/* Now try to get the controller to respond to a no-op */
+	dev_warn(&pdev->dev, "Waiting for controller to respond to no-op\n");
+	for (i = 0; i < CCISS_POST_RESET_NOOP_RETRIES; i++) {
+		if (cciss_noop(pdev) == 0)
+			break;
+		else
+			dev_warn(&pdev->dev, "no-op failed%s\n",
+				(i < CCISS_POST_RESET_NOOP_RETRIES - 1 ?
+					"; re-trying" : ""));
+		msleep(CCISS_POST_RESET_NOOP_INTERVAL_MSECS);
 	}
+	return 0;
+}
 
-	/* Quoting from the Open CISS Specification: "The Power
-	 * Management Control/Status Register (CSR) controls the power
-	 * state of the device.  The normal operating state is D0,
-	 * CSR=00h.  The software off state is D3, CSR=03h.  To reset
-	 * the controller, place the interface device in D3 then to
-	 * D0, this causes a secondary PCI reset which will reset the
-	 * controller." */
-
-	/* enter the D3hot power management state */
-	pci_read_config_word(pdev, pos + PCI_PM_CTRL, &pmcsr);
-	pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
-	pmcsr |= PCI_D3hot;
-	pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr);
-
-	schedule_timeout_uninterruptible(HZ >> 1);
-
-	/* enter the D0 power management state */
-	pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
-	pmcsr |= PCI_D0;
-	pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr);
-
-	schedule_timeout_uninterruptible(HZ >> 1);
-
-	/* Restore the PCI configuration space.  The Open CISS
-	 * Specification says, "Restore the PCI Configuration
-	 * Registers, offsets 00h through 60h. It is important to
-	 * restore the command register, 16-bits at offset 04h,
-	 * last. Do not restore the configuration status register,
-	 * 16-bits at offset 06h."  Note that the offset is 2*i. */
-	for (i = 0; i < 32; i++) {
-		if (i == 2 || i == 3)
-			continue;
-		pci_write_config_word(pdev, 2*i, saved_config_space[i]);
+static int cciss_allocate_cmd_pool(ctlr_info_t *h)
+{
+	h->cmd_pool_bits = kmalloc(BITS_TO_LONGS(h->nr_cmds) *
+		sizeof(unsigned long), GFP_KERNEL);
+	h->cmd_pool = pci_alloc_consistent(h->pdev,
+		h->nr_cmds * sizeof(CommandList_struct),
+		&(h->cmd_pool_dhandle));
+	h->errinfo_pool = pci_alloc_consistent(h->pdev,
+		h->nr_cmds * sizeof(ErrorInfo_struct),
+		&(h->errinfo_pool_dhandle));
+	if ((h->cmd_pool_bits == NULL)
+		|| (h->cmd_pool == NULL)
+		|| (h->errinfo_pool == NULL)) {
+		dev_err(&h->pdev->dev, "out of memory\n");
+		return -ENOMEM;
 	}
-	wmb();
-	pci_write_config_word(pdev, 4, saved_config_space[2]);
+	return 0;
+}
+
+static int cciss_allocate_scatterlists(ctlr_info_t *h)
+{
+	int i;
+
+	/* zero it, so that on free we need not know how many were alloc'ed */
+	h->scatter_list = kzalloc(h->max_commands *
+				sizeof(struct scatterlist *), GFP_KERNEL);
+	if (!h->scatter_list)
+		return -ENOMEM;
 
+	for (i = 0; i < h->nr_cmds; i++) {
+		h->scatter_list[i] = kmalloc(sizeof(struct scatterlist) *
+						h->maxsgentries, GFP_KERNEL);
+		if (h->scatter_list[i] == NULL) {
+			dev_err(&h->pdev->dev, "could not allocate "
+				"s/g lists\n");
+			return -ENOMEM;
+		}
+	}
 	return 0;
 }
 
+static void cciss_free_scatterlists(ctlr_info_t *h)
+{
+	int i;
+
+	if (h->scatter_list) {
+		for (i = 0; i < h->nr_cmds; i++)
+			kfree(h->scatter_list[i]);
+		kfree(h->scatter_list);
+	}
+}
+
+static void cciss_free_cmd_pool(ctlr_info_t *h)
+{
+	kfree(h->cmd_pool_bits);
+	if (h->cmd_pool)
+		pci_free_consistent(h->pdev,
+			h->nr_cmds * sizeof(CommandList_struct),
+			h->cmd_pool, h->cmd_pool_dhandle);
+	if (h->errinfo_pool)
+		pci_free_consistent(h->pdev,
+			h->nr_cmds * sizeof(ErrorInfo_struct),
+			h->errinfo_pool, h->errinfo_pool_dhandle);
+}
+
+static int cciss_request_irq(ctlr_info_t *h,
+	irqreturn_t (*msixhandler)(int, void *),
+	irqreturn_t (*intxhandler)(int, void *))
+{
+	if (h->msix_vector || h->msi_vector) {
+		if (!request_irq(h->intr[h->intr_mode], msixhandler,
+				0, h->devname, h))
+			return 0;
+		dev_err(&h->pdev->dev, "Unable to get msi irq %d"
+			" for %s\n", h->intr[h->intr_mode],
+			h->devname);
+		return -1;
+	}
+
+	if (!request_irq(h->intr[h->intr_mode], intxhandler,
+			IRQF_SHARED, h->devname, h))
+		return 0;
+	dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n",
+		h->intr[h->intr_mode], h->devname);
+	return -1;
+}
+
+static int cciss_kdump_soft_reset(ctlr_info_t *h)
+{
+	if (cciss_send_reset(h, CTLR_LUNID, CCISS_RESET_TYPE_CONTROLLER)) {
+		dev_warn(&h->pdev->dev, "Resetting array controller failed.\n");
+		return -EIO;
+	}
+
+	dev_info(&h->pdev->dev, "Waiting for board to soft reset.\n");
+	if (cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_NOT_READY)) {
+		dev_warn(&h->pdev->dev, "Soft reset had no effect.\n");
+		return -1;
+	}
+
+	dev_info(&h->pdev->dev, "Board reset, awaiting READY status.\n");
+	if (cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_READY)) {
+		dev_warn(&h->pdev->dev, "Board failed to become ready "
+			"after soft reset.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static void cciss_undo_allocations_after_kdump_soft_reset(ctlr_info_t *h)
+{
+	int ctlr = h->ctlr;
+
+	free_irq(h->intr[h->intr_mode], h);
+#ifdef CONFIG_PCI_MSI
+	if (h->msix_vector)
+		pci_disable_msix(h->pdev);
+	else if (h->msi_vector)
+		pci_disable_msi(h->pdev);
+#endif /* CONFIG_PCI_MSI */
+	cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds);
+	cciss_free_scatterlists(h);
+	cciss_free_cmd_pool(h);
+	kfree(h->blockFetchTable);
+	if (h->reply_pool)
+		pci_free_consistent(h->pdev, h->max_commands * sizeof(__u64),
+				h->reply_pool, h->reply_pool_dhandle);
+	if (h->transtable)
+		iounmap(h->transtable);
+	if (h->cfgtable)
+		iounmap(h->cfgtable);
+	if (h->vaddr)
+		iounmap(h->vaddr);
+	unregister_blkdev(h->major, h->devname);
+	cciss_destroy_hba_sysfs_entry(h);
+	pci_release_regions(h->pdev);
+	kfree(h);
+	hba[ctlr] = NULL;
+}
+
 /*
  *  This is it.  Find all the controllers and register them.  I really hate
  *  stealing all these major device numbers.
  *  returns the number of block devices registered.
  */
-static int __devinit cciss_init_one(struct pci_dev *pdev,
-				    const struct pci_device_id *ent)
+static int cciss_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	int i;
 	int j = 0;
 	int rc;
+	int try_soft_reset = 0;
 	int dac, return_code;
-	InquiryData_struct *inq_buff = NULL;
-
-	if (reset_devices) {
-		/* Reset the controller with a PCI power-cycle */
-		if (cciss_hard_reset_controller(pdev) || cciss_reset_msi(pdev))
-			return -ENODEV;
-
-		/* Now try to get the controller to respond to a no-op. Some
-		   devices (notably the HP Smart Array 5i Controller) need
-		   up to 30 seconds to respond. */
-		for (i=0; i<30; i++) {
-			if (cciss_noop(pdev) == 0)
-				break;
+	InquiryData_struct *inq_buff;
+	ctlr_info_t *h;
+	unsigned long flags;
 
-			schedule_timeout_uninterruptible(HZ);
-		}
-		if (i == 30) {
-			printk(KERN_ERR "cciss: controller seems dead\n");
-			return -EBUSY;
-		}
+	/*
+	 * By default the cciss driver is used for all older HP Smart Array
+	 * controllers. There are module parameters that allow a user to
+	 * override this behavior and instead use the hpsa SCSI driver. If
+	 * this is the case cciss may be loaded first from the kdump initrd
+	 * image and cause a kernel panic. So if reset_devices is true and
+	 * cciss_allow_hpsa is set just bail.
+	 */
+	if ((reset_devices) && (cciss_allow_hpsa == 1))
+		return -ENODEV;
+	rc = cciss_init_reset_devices(pdev);
+	if (rc) {
+		if (rc != -ENOTSUPP)
+			return rc;
+		/* If the reset fails in a particular way (it has no way to do
+		 * a proper hard reset, so returns -ENOTSUPP) we can try to do
+		 * a soft reset once we get the controller configured up to the
+		 * point that it can accept a command.
+		 */
+		try_soft_reset = 1;
+		rc = 0;
 	}
 
-	i = alloc_cciss_hba();
+reinit_after_soft_reset:
+
+	i = alloc_cciss_hba(pdev);
 	if (i < 0)
-		return -1;
+		return -ENOMEM;
 
-	hba[i]->busy_initializing = 1;
-	INIT_HLIST_HEAD(&hba[i]->cmpQ);
-	INIT_HLIST_HEAD(&hba[i]->reqQ);
+	h = hba[i];
+	h->pdev = pdev;
+	h->busy_initializing = 1;
+	h->intr_mode = cciss_simple_mode ? SIMPLE_MODE_INT : PERF_MODE_INT;
+	INIT_LIST_HEAD(&h->cmpQ);
+	INIT_LIST_HEAD(&h->reqQ);
+	mutex_init(&h->busy_shutting_down);
 
-	if (cciss_pci_init(hba[i], pdev) != 0)
-		goto clean0;
+	if (cciss_pci_init(h) != 0)
+		goto clean_no_release_regions;
+
+	sprintf(h->devname, "cciss%d", i);
+	h->ctlr = i;
+
+	if (cciss_tape_cmds < 2)
+		cciss_tape_cmds = 2;
+	if (cciss_tape_cmds > 16)
+		cciss_tape_cmds = 16;
 
-	sprintf(hba[i]->devname, "cciss%d", i);
-	hba[i]->ctlr = i;
-	hba[i]->pdev = pdev;
+	init_completion(&h->scan_wait);
 
-	if (cciss_create_hba_sysfs_entry(hba[i]))
+	if (cciss_create_hba_sysfs_entry(h))
 		goto clean0;
 
 	/* configure PCI DMA stuff */
@@ -3925,7 +5030,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 	else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32)))
 		dac = 0;
 	else {
-		printk(KERN_ERR "cciss: no suitable DMA available\n");
+		dev_err(&h->pdev->dev, "no suitable DMA available\n");
 		goto clean1;
 	}
 
@@ -3935,238 +5040,292 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 	 * 8 controller support.
 	 */
 	if (i < MAX_CTLR_ORIG)
-		hba[i]->major = COMPAQ_CISS_MAJOR + i;
-	rc = register_blkdev(hba[i]->major, hba[i]->devname);
+		h->major = COMPAQ_CISS_MAJOR + i;
+	rc = register_blkdev(h->major, h->devname);
 	if (rc == -EBUSY || rc == -EINVAL) {
-		printk(KERN_ERR
-		       "cciss:  Unable to get major number %d for %s "
-		       "on hba %d\n", hba[i]->major, hba[i]->devname, i);
+		dev_err(&h->pdev->dev,
+		       "Unable to get major number %d for %s "
+		       "on hba %d\n", h->major, h->devname, i);
 		goto clean1;
 	} else {
 		if (i >= MAX_CTLR_ORIG)
-			hba[i]->major = rc;
+			h->major = rc;
 	}
 
 	/* make sure the board interrupts are off */
-	hba[i]->access.set_intr_mask(hba[i], CCISS_INTR_OFF);
-	if (request_irq(hba[i]->intr[SIMPLE_MODE_INT], do_cciss_intr,
-			IRQF_DISABLED | IRQF_SHARED, hba[i]->devname, hba[i])) {
-		printk(KERN_ERR "cciss: Unable to get irq %d for %s\n",
-		       hba[i]->intr[SIMPLE_MODE_INT], hba[i]->devname);
+	h->access.set_intr_mask(h, CCISS_INTR_OFF);
+	rc = cciss_request_irq(h, do_cciss_msix_intr, do_cciss_intx);
+	if (rc)
 		goto clean2;
-	}
 
-	printk(KERN_INFO "%s: <0x%x> at PCI %s IRQ %d%s using DAC\n",
-	       hba[i]->devname, pdev->device, pci_name(pdev),
-	       hba[i]->intr[SIMPLE_MODE_INT], dac ? "" : " not");
-
-	hba[i]->cmd_pool_bits =
-	    kmalloc(DIV_ROUND_UP(hba[i]->nr_cmds, BITS_PER_LONG)
-			* sizeof(unsigned long), GFP_KERNEL);
-	hba[i]->cmd_pool = (CommandList_struct *)
-	    pci_alloc_consistent(hba[i]->pdev,
-		    hba[i]->nr_cmds * sizeof(CommandList_struct),
-		    &(hba[i]->cmd_pool_dhandle));
-	hba[i]->errinfo_pool = (ErrorInfo_struct *)
-	    pci_alloc_consistent(hba[i]->pdev,
-		    hba[i]->nr_cmds * sizeof(ErrorInfo_struct),
-		    &(hba[i]->errinfo_pool_dhandle));
-	if ((hba[i]->cmd_pool_bits == NULL)
-	    || (hba[i]->cmd_pool == NULL)
-	    || (hba[i]->errinfo_pool == NULL)) {
-		printk(KERN_ERR "cciss: out of memory");
+	dev_info(&h->pdev->dev, "%s: <0x%x> at PCI %s IRQ %d%s using DAC\n",
+	       h->devname, pdev->device, pci_name(pdev),
+	       h->intr[h->intr_mode], dac ? "" : " not");
+
+	if (cciss_allocate_cmd_pool(h))
 		goto clean4;
-	}
-	spin_lock_init(&hba[i]->lock);
+
+	if (cciss_allocate_scatterlists(h))
+		goto clean4;
+
+	h->cmd_sg_list = cciss_allocate_sg_chain_blocks(h,
+		h->chainsize, h->nr_cmds);
+	if (!h->cmd_sg_list && h->chainsize > 0)
+		goto clean4;
+
+	spin_lock_init(&h->lock);
 
 	/* Initialize the pdev driver private data.
-	   have it point to hba[i].  */
-	pci_set_drvdata(pdev, hba[i]);
+	   have it point to h.  */
+	pci_set_drvdata(pdev, h);
 	/* command and error info recs zeroed out before
 	   they are used */
-	memset(hba[i]->cmd_pool_bits, 0,
-	       DIV_ROUND_UP(hba[i]->nr_cmds, BITS_PER_LONG)
-			* sizeof(unsigned long));
+	bitmap_zero(h->cmd_pool_bits, h->nr_cmds);
 
-	hba[i]->num_luns = 0;
-	hba[i]->highest_lun = -1;
+	h->num_luns = 0;
+	h->highest_lun = -1;
 	for (j = 0; j < CISS_MAX_LUN; j++) {
-		hba[i]->drv[j].raid_level = -1;
-		hba[i]->drv[j].queue = NULL;
-		hba[i]->gendisk[j] = NULL;
+		h->drv[j] = NULL;
+		h->gendisk[j] = NULL;
 	}
 
-	cciss_scsi_setup(i);
+	/* At this point, the controller is ready to take commands.
+	 * Now, if reset_devices and the hard reset didn't work, try
+	 * the soft reset and see if that works.
+	 */
+	if (try_soft_reset) {
+
+		/* This is kind of gross.  We may or may not get a completion
+		 * from the soft reset command, and if we do, then the value
+		 * from the fifo may or may not be valid.  So, we wait 10 secs
+		 * after the reset throwing away any completions we get during
+		 * that time.  Unregister the interrupt handler and register
+		 * fake ones to scoop up any residual completions.
+		 */
+		spin_lock_irqsave(&h->lock, flags);
+		h->access.set_intr_mask(h, CCISS_INTR_OFF);
+		spin_unlock_irqrestore(&h->lock, flags);
+		free_irq(h->intr[h->intr_mode], h);
+		rc = cciss_request_irq(h, cciss_msix_discard_completions,
+					cciss_intx_discard_completions);
+		if (rc) {
+			dev_warn(&h->pdev->dev, "Failed to request_irq after "
+				"soft reset.\n");
+			goto clean4;
+		}
+
+		rc = cciss_kdump_soft_reset(h);
+		if (rc) {
+			dev_warn(&h->pdev->dev, "Soft reset failed.\n");
+			goto clean4;
+		}
+
+		dev_info(&h->pdev->dev, "Board READY.\n");
+		dev_info(&h->pdev->dev,
+			"Waiting for stale completions to drain.\n");
+		h->access.set_intr_mask(h, CCISS_INTR_ON);
+		msleep(10000);
+		h->access.set_intr_mask(h, CCISS_INTR_OFF);
+
+		rc = controller_reset_failed(h->cfgtable);
+		if (rc)
+			dev_info(&h->pdev->dev,
+				"Soft reset appears to have failed.\n");
+
+		/* since the controller's reset, we have to go back and re-init
+		 * everything.  Easiest to just forget what we've done and do it
+		 * all over again.
+		 */
+		cciss_undo_allocations_after_kdump_soft_reset(h);
+		try_soft_reset = 0;
+		if (rc)
+			/* don't go to clean4, we already unallocated */
+			return -ENODEV;
+
+		goto reinit_after_soft_reset;
+	}
+
+	cciss_scsi_setup(h);
 
 	/* Turn the interrupts on so we can service requests */
-	hba[i]->access.set_intr_mask(hba[i], CCISS_INTR_ON);
+	h->access.set_intr_mask(h, CCISS_INTR_ON);
 
 	/* Get the firmware version */
 	inq_buff = kzalloc(sizeof(InquiryData_struct), GFP_KERNEL);
 	if (inq_buff == NULL) {
-		printk(KERN_ERR "cciss: out of memory\n");
+		dev_err(&h->pdev->dev, "out of memory\n");
 		goto clean4;
 	}
 
-	return_code = sendcmd_withirq(CISS_INQUIRY, i, inq_buff,
+	return_code = sendcmd_withirq(h, CISS_INQUIRY, inq_buff,
 		sizeof(InquiryData_struct), 0, CTLR_LUNID, TYPE_CMD);
 	if (return_code == IO_OK) {
-		hba[i]->firm_ver[0] = inq_buff->data_byte[32];
-		hba[i]->firm_ver[1] = inq_buff->data_byte[33];
-		hba[i]->firm_ver[2] = inq_buff->data_byte[34];
-		hba[i]->firm_ver[3] = inq_buff->data_byte[35];
+		h->firm_ver[0] = inq_buff->data_byte[32];
+		h->firm_ver[1] = inq_buff->data_byte[33];
+		h->firm_ver[2] = inq_buff->data_byte[34];
+		h->firm_ver[3] = inq_buff->data_byte[35];
 	} else {	 /* send command failed */
-		printk(KERN_WARNING "cciss: unable to determine firmware"
+		dev_warn(&h->pdev->dev, "unable to determine firmware"
 			" version of controller\n");
 	}
+	kfree(inq_buff);
 
-	cciss_procinit(i);
-
-	hba[i]->cciss_max_sectors = 2048;
-
-	hba[i]->busy_initializing = 0;
+	cciss_procinit(h);
 
-	rebuild_lun_table(hba[i], 1);
-	hba[i]->cciss_scan_thread = kthread_run(scan_thread, hba[i],
-				"cciss_scan%02d", i);
-	if (IS_ERR(hba[i]->cciss_scan_thread))
-		return PTR_ERR(hba[i]->cciss_scan_thread);
+	h->cciss_max_sectors = 8192;
 
-	return 1;
+	rebuild_lun_table(h, 1, 0);
+	cciss_engage_scsi(h);
+	h->busy_initializing = 0;
+	return 0;
 
 clean4:
-	kfree(inq_buff);
-	kfree(hba[i]->cmd_pool_bits);
-	if (hba[i]->cmd_pool)
-		pci_free_consistent(hba[i]->pdev,
-				    hba[i]->nr_cmds * sizeof(CommandList_struct),
-				    hba[i]->cmd_pool, hba[i]->cmd_pool_dhandle);
-	if (hba[i]->errinfo_pool)
-		pci_free_consistent(hba[i]->pdev,
-				    hba[i]->nr_cmds * sizeof(ErrorInfo_struct),
-				    hba[i]->errinfo_pool,
-				    hba[i]->errinfo_pool_dhandle);
-	free_irq(hba[i]->intr[SIMPLE_MODE_INT], hba[i]);
+	cciss_free_cmd_pool(h);
+	cciss_free_scatterlists(h);
+	cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds);
+	free_irq(h->intr[h->intr_mode], h);
 clean2:
-	unregister_blkdev(hba[i]->major, hba[i]->devname);
+	unregister_blkdev(h->major, h->devname);
 clean1:
-	cciss_destroy_hba_sysfs_entry(hba[i]);
+	cciss_destroy_hba_sysfs_entry(h);
 clean0:
-	hba[i]->busy_initializing = 0;
-	/* cleanup any queues that may have been initialized */
-	for (j=0; j <= hba[i]->highest_lun; j++){
-		drive_info_struct *drv = &(hba[i]->drv[j]);
-		if (drv->queue)
-			blk_cleanup_queue(drv->queue);
-	}
+	pci_release_regions(pdev);
+clean_no_release_regions:
+	h->busy_initializing = 0;
+
 	/*
 	 * Deliberately omit pci_disable_device(): it does something nasty to
 	 * Smart Array controllers that pci_enable_device does not undo
 	 */
-	pci_release_regions(pdev);
 	pci_set_drvdata(pdev, NULL);
-	free_hba(i);
-	return -1;
+	free_hba(h);
+	return -ENODEV;
 }
 
 static void cciss_shutdown(struct pci_dev *pdev)
 {
-	ctlr_info_t *tmp_ptr;
-	int i;
-	char flush_buf[4];
+	ctlr_info_t *h;
+	char *flush_buf;
 	int return_code;
 
-	tmp_ptr = pci_get_drvdata(pdev);
-	if (tmp_ptr == NULL)
-		return;
-	i = tmp_ptr->ctlr;
-	if (hba[i] == NULL)
+	h = pci_get_drvdata(pdev);
+	flush_buf = kzalloc(4, GFP_KERNEL);
+	if (!flush_buf) {
+		dev_warn(&h->pdev->dev, "cache not flushed, out of memory.\n");
 		return;
+	}
+	/* write all data in the battery backed cache to disk */
+	return_code = sendcmd_withirq(h, CCISS_CACHE_FLUSH, flush_buf,
+		4, 0, CTLR_LUNID, TYPE_CMD);
+	kfree(flush_buf);
+	if (return_code != IO_OK)
+		dev_warn(&h->pdev->dev, "Error flushing cache\n");
+	h->access.set_intr_mask(h, CCISS_INTR_OFF);
+	free_irq(h->intr[h->intr_mode], h);
+}
 
-	/* Turn board interrupts off  and send the flush cache command */
-	/* sendcmd will turn off interrupt, and send the flush...
-	 * To write all data in the battery backed cache to disks */
-	memset(flush_buf, 0, 4);
-	return_code = sendcmd(CCISS_CACHE_FLUSH, i, flush_buf, 4, 0,
-		CTLR_LUNID, TYPE_CMD);
-	if (return_code == IO_OK) {
-		printk(KERN_INFO "Completed flushing cache on controller %d\n", i);
-	} else {
-		printk(KERN_WARNING "Error flushing cache on controller %d\n", i);
+static int cciss_enter_simple_mode(struct ctlr_info *h)
+{
+	u32 trans_support;
+
+	trans_support = readl(&(h->cfgtable->TransportSupport));
+	if (!(trans_support & SIMPLE_MODE))
+		return -ENOTSUPP;
+
+	h->max_commands = readl(&(h->cfgtable->CmdsOutMax));
+	writel(CFGTBL_Trans_Simple, &(h->cfgtable->HostWrite.TransportRequest));
+	writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
+	cciss_wait_for_mode_change_ack(h);
+	print_cfg_table(h);
+	if (!(readl(&(h->cfgtable->TransportActive)) & CFGTBL_Trans_Simple)) {
+		dev_warn(&h->pdev->dev, "unable to get board into simple mode\n");
+		return -ENODEV;
 	}
-	free_irq(hba[i]->intr[2], hba[i]);
+	h->transMethod = CFGTBL_Trans_Simple;
+	return 0;
 }
 
-static void __devexit cciss_remove_one(struct pci_dev *pdev)
+
+static void cciss_remove_one(struct pci_dev *pdev)
 {
-	ctlr_info_t *tmp_ptr;
+	ctlr_info_t *h;
 	int i, j;
 
 	if (pci_get_drvdata(pdev) == NULL) {
-		printk(KERN_ERR "cciss: Unable to remove device \n");
+		dev_err(&pdev->dev, "Unable to remove device\n");
 		return;
 	}
 
-	tmp_ptr = pci_get_drvdata(pdev);
-	i = tmp_ptr->ctlr;
+	h = pci_get_drvdata(pdev);
+	i = h->ctlr;
 	if (hba[i] == NULL) {
-		printk(KERN_ERR "cciss: device appears to "
-		       "already be removed \n");
+		dev_err(&pdev->dev, "device appears to already be removed\n");
 		return;
 	}
 
-	kthread_stop(hba[i]->cciss_scan_thread);
+	mutex_lock(&h->busy_shutting_down);
 
-	remove_proc_entry(hba[i]->devname, proc_cciss);
-	unregister_blkdev(hba[i]->major, hba[i]->devname);
+	remove_from_scan_list(h);
+	remove_proc_entry(h->devname, proc_cciss);
+	unregister_blkdev(h->major, h->devname);
 
 	/* remove it from the disk list */
 	for (j = 0; j < CISS_MAX_LUN; j++) {
-		struct gendisk *disk = hba[i]->gendisk[j];
+		struct gendisk *disk = h->gendisk[j];
 		if (disk) {
 			struct request_queue *q = disk->queue;
 
-			if (disk->flags & GENHD_FL_UP)
+			if (disk->flags & GENHD_FL_UP) {
+				cciss_destroy_ld_sysfs_entry(h, j, 1);
 				del_gendisk(disk);
+			}
 			if (q)
 				blk_cleanup_queue(q);
 		}
 	}
 
 #ifdef CONFIG_CISS_SCSI_TAPE
-	cciss_unregister_scsi(i);	/* unhook from SCSI subsystem */
+	cciss_unregister_scsi(h);	/* unhook from SCSI subsystem */
 #endif
 
 	cciss_shutdown(pdev);
 
 #ifdef CONFIG_PCI_MSI
-	if (hba[i]->msix_vector)
-		pci_disable_msix(hba[i]->pdev);
-	else if (hba[i]->msi_vector)
-		pci_disable_msi(hba[i]->pdev);
+	if (h->msix_vector)
+		pci_disable_msix(h->pdev);
+	else if (h->msi_vector)
+		pci_disable_msi(h->pdev);
 #endif				/* CONFIG_PCI_MSI */
 
-	iounmap(hba[i]->vaddr);
-
-	pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(CommandList_struct),
-			    hba[i]->cmd_pool, hba[i]->cmd_pool_dhandle);
-	pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(ErrorInfo_struct),
-			    hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle);
-	kfree(hba[i]->cmd_pool_bits);
+	iounmap(h->transtable);
+	iounmap(h->cfgtable);
+	iounmap(h->vaddr);
+
+	cciss_free_cmd_pool(h);
+	/* Free up sg elements */
+	for (j = 0; j < h->nr_cmds; j++)
+		kfree(h->scatter_list[j]);
+	kfree(h->scatter_list);
+	cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds);
+	kfree(h->blockFetchTable);
+	if (h->reply_pool)
+		pci_free_consistent(h->pdev, h->max_commands * sizeof(__u64),
+				h->reply_pool, h->reply_pool_dhandle);
 	/*
 	 * Deliberately omit pci_disable_device(): it does something nasty to
 	 * Smart Array controllers that pci_enable_device does not undo
 	 */
 	pci_release_regions(pdev);
 	pci_set_drvdata(pdev, NULL);
-	cciss_destroy_hba_sysfs_entry(hba[i]);
-	free_hba(i);
+	cciss_destroy_hba_sysfs_entry(h);
+	mutex_unlock(&h->busy_shutting_down);
+	free_hba(h);
 }
 
 static struct pci_driver cciss_pci_driver = {
 	.name = "cciss",
 	.probe = cciss_init_one,
-	.remove = __devexit_p(cciss_remove_one),
+	.remove = cciss_remove_one,
 	.id_table = cciss_pci_device_id,	/* id_table */
 	.shutdown = cciss_shutdown,
 };
@@ -4184,23 +5343,32 @@ static int __init cciss_init(void)
 	 * boundary. Given that we use pci_alloc_consistent() to allocate an
 	 * array of them, the size must be a multiple of 8 bytes.
 	 */
-	BUILD_BUG_ON(sizeof(CommandList_struct) % 8);
-
+	BUILD_BUG_ON(sizeof(CommandList_struct) % COMMANDLIST_ALIGNMENT);
 	printk(KERN_INFO DRIVER_NAME "\n");
 
 	err = bus_register(&cciss_bus_type);
 	if (err)
 		return err;
 
+	/* Start the scan thread */
+	cciss_scan_thread = kthread_run(scan_thread, NULL, "cciss_scan");
+	if (IS_ERR(cciss_scan_thread)) {
+		err = PTR_ERR(cciss_scan_thread);
+		goto err_bus_unregister;
+	}
+
 	/* Register for our PCI devices */
 	err = pci_register_driver(&cciss_pci_driver);
 	if (err)
-		goto err_bus_register;
+		goto err_thread_stop;
 
-	return 0;
+	return err;
 
-err_bus_register:
+err_thread_stop:
+	kthread_stop(cciss_scan_thread);
+err_bus_unregister:
 	bus_unregister(&cciss_bus_type);
+
 	return err;
 }
 
@@ -4212,54 +5380,16 @@ static void __exit cciss_cleanup(void)
 	/* double check that all controller entrys have been removed */
 	for (i = 0; i < MAX_CTLR; i++) {
 		if (hba[i] != NULL) {
-			printk(KERN_WARNING "cciss: had to remove"
-			       " controller %d\n", i);
+			dev_warn(&hba[i]->pdev->dev,
+				"had to remove controller\n");
 			cciss_remove_one(hba[i]->pdev);
 		}
 	}
-	remove_proc_entry("driver/cciss", NULL);
+	kthread_stop(cciss_scan_thread);
+	if (proc_cciss)
+		remove_proc_entry("driver/cciss", NULL);
 	bus_unregister(&cciss_bus_type);
 }
 
-static void fail_all_cmds(unsigned long ctlr)
-{
-	/* If we get here, the board is apparently dead. */
-	ctlr_info_t *h = hba[ctlr];
-	CommandList_struct *c;
-	unsigned long flags;
-
-	printk(KERN_WARNING "cciss%d: controller not responding.\n", h->ctlr);
-	h->alive = 0;		/* the controller apparently died... */
-
-	spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
-
-	pci_disable_device(h->pdev);	/* Make sure it is really dead. */
-
-	/* move everything off the request queue onto the completed queue */
-	while (!hlist_empty(&h->reqQ)) {
-		c = hlist_entry(h->reqQ.first, CommandList_struct, list);
-		removeQ(c);
-		h->Qdepth--;
-		addQ(&h->cmpQ, c);
-	}
-
-	/* Now, fail everything on the completed queue with a HW error */
-	while (!hlist_empty(&h->cmpQ)) {
-		c = hlist_entry(h->cmpQ.first, CommandList_struct, list);
-		removeQ(c);
-		c->err_info->CommandStatus = CMD_HARDWARE_ERR;
-		if (c->cmd_type == CMD_RWREQ) {
-			complete_command(h, c, 0);
-		} else if (c->cmd_type == CMD_IOCTL_PEND)
-			complete(c->waiting);
-#ifdef CONFIG_CISS_SCSI_TAPE
-		else if (c->cmd_type == CMD_SCSI)
-			complete_scsi_command(c, 0, 0);
-#endif
-	}
-	spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
-	return;
-}
-
 module_init(cciss_init);
 module_exit(cciss_cleanup);
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index 06a5db25b29..7fda30e4a24 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -2,6 +2,7 @@
 #define CCISS_H
 
 #include <linux/genhd.h>
+#include <linux/mutex.h>
 
 #include "cciss_cmd.h"
 
@@ -24,12 +25,12 @@ struct access_method {
 	void (*submit_command)(ctlr_info_t *h, CommandList_struct *c);
 	void (*set_intr_mask)(ctlr_info_t *h, unsigned long val);
 	unsigned long (*fifo_full)(ctlr_info_t *h);
-	unsigned long (*intr_pending)(ctlr_info_t *h);
+	bool (*intr_pending)(ctlr_info_t *h);
 	unsigned long (*command_completed)(ctlr_info_t *h);
 };
 typedef struct _drive_info_struct
 {
- 	__u32   LunID;	
+	unsigned char LunID[8];
 	int 	usage_count;
 	struct request_queue *queue;
 	sector_t nr_blocks;
@@ -51,14 +52,15 @@ typedef struct _drive_info_struct
 	char vendor[VENDOR_LEN + 1]; /* SCSI vendor string */
 	char model[MODEL_LEN + 1];   /* SCSI model string */
 	char rev[REV_LEN + 1];       /* SCSI revision string */
+	char device_initialized;     /* indicates whether dev is initialized */
 } drive_info_struct;
 
-struct ctlr_info 
+struct ctlr_info
 {
 	int	ctlr;
 	char	devname[8];
 	char    *product_name;
-	char	firm_ver[4]; // Firmware version 
+	char	firm_ver[4]; /* Firmware version */
 	struct pci_dev *pdev;
 	__u32	board_id;
 	void __iomem *vaddr;
@@ -73,32 +75,43 @@ struct ctlr_info
 	int	num_luns;
 	int 	highest_lun;
 	int	usage_count;  /* number of opens all all minor devices */
-#	define DOORBELL_INT	0
-#	define PERF_MODE_INT	1
+	/* Need space for temp sg list
+	 * number of scatter/gathers supported
+	 * number of scatter/gathers in chained block
+	 */
+	struct	scatterlist **scatter_list;
+	int	maxsgentries;
+	int	chainsize;
+	int	max_cmd_sgentries;
+	SGDescriptor_struct **cmd_sg_list;
+
+#	define PERF_MODE_INT	0
+#	define DOORBELL_INT	1
 #	define SIMPLE_MODE_INT	2
 #	define MEMQ_MODE_INT	3
 	unsigned int intr[4];
 	unsigned int msix_vector;
 	unsigned int msi_vector;
+	int	intr_mode;
 	int 	cciss_max_sectors;
 	BYTE	cciss_read;
 	BYTE	cciss_write;
 	BYTE	cciss_read_capacity;
 
-	// information about each logical volume
-	drive_info_struct drv[CISS_MAX_LUN];
+	/* information about each logical volume */
+	drive_info_struct *drv[CISS_MAX_LUN];
 
 	struct access_method access;
 
 	/* queue and queue Info */ 
-	struct hlist_head reqQ;
-	struct hlist_head cmpQ;
+	struct list_head reqQ;
+	struct list_head cmpQ;
 	unsigned int Qdepth;
 	unsigned int maxQsinceinit;
 	unsigned int maxSG;
 	spinlock_t lock;
 
-	//* pointers to command and error info pool */ 
+	/* pointers to command and error info pool */
 	CommandList_struct 	*cmd_pool;
 	dma_addr_t		cmd_pool_dhandle; 
 	ErrorInfo_struct 	*errinfo_pool;
@@ -108,27 +121,44 @@ struct ctlr_info
 	int			nr_frees; 
 	int			busy_configuring;
 	int			busy_initializing;
+	int			busy_scanning;
+	struct mutex		busy_shutting_down;
 
 	/* This element holds the zero based queue number of the last
 	 * queue to be started.  It is used for fairness.
 	*/
 	int			next_to_run;
 
-	// Disk structures we need to pass back
+	/* Disk structures we need to pass back */
 	struct gendisk   *gendisk[CISS_MAX_LUN];
 #ifdef CONFIG_CISS_SCSI_TAPE
-	void *scsi_ctlr; /* ptr to structure containing scsi related stuff */
-	/* list of block side commands the scsi error handling sucked up */
-	/* and saved for later processing */
+	struct cciss_scsi_adapter_data_t *scsi_ctlr;
 #endif
 	unsigned char alive;
-	struct completion *rescan_wait;
-	struct task_struct *cciss_scan_thread;
+	struct list_head scan_list;
+	struct completion scan_wait;
 	struct device dev;
+	/*
+	 * Performant mode tables.
+	 */
+	u32 trans_support;
+	u32 trans_offset;
+	struct TransTable_struct *transtable;
+	unsigned long transMethod;
+
+	/*
+	 * Performant mode completion buffer
+	 */
+	u64 *reply_pool;
+	dma_addr_t reply_pool_dhandle;
+	u64 *reply_pool_head;
+	size_t reply_pool_size;
+	unsigned char reply_pool_wraparound;
+	u32 *blockFetchTable;
 };
 
-/*  Defining the diffent access_menthods */
-/*
+/*  Defining the different access_methods
+ *
  * Memory mapped FIFO interface (SMART 53xx cards)
  */
 #define SA5_DOORBELL	0x20
@@ -147,20 +177,54 @@ struct ctlr_info
 #define SA5B_INTR_PENDING	0x04
 #define FIFO_EMPTY		0xffffffff	
 #define CCISS_FIRMWARE_READY	0xffff0000 /* value in scratchpad register */
+/* Perf. mode flags */
+#define SA5_PERF_INTR_PENDING	0x04
+#define SA5_PERF_INTR_OFF	0x05
+#define SA5_OUTDB_STATUS_PERF_BIT	0x01
+#define SA5_OUTDB_CLEAR_PERF_BIT	0x01
+#define SA5_OUTDB_CLEAR         0xA0
+#define SA5_OUTDB_CLEAR_PERF_BIT        0x01
+#define SA5_OUTDB_STATUS        0x9C
+
 
 #define  CISS_ERROR_BIT		0x02
 
 #define CCISS_INTR_ON 	1 
 #define CCISS_INTR_OFF	0
+
+
+/* CCISS_BOARD_READY_WAIT_SECS is how long to wait for a board
+ * to become ready, in seconds, before giving up on it.
+ * CCISS_BOARD_READY_POLL_INTERVAL_MSECS is how long to wait
+ * between polling the board to see if it is ready, in
+ * milliseconds.  CCISS_BOARD_READY_ITERATIONS is derived
+ * from the above.
+ */
+#define CCISS_BOARD_READY_WAIT_SECS (120)
+#define CCISS_BOARD_NOT_READY_WAIT_SECS (100)
+#define CCISS_BOARD_READY_POLL_INTERVAL_MSECS (100)
+#define CCISS_BOARD_READY_ITERATIONS \
+	((CCISS_BOARD_READY_WAIT_SECS * 1000) / \
+		CCISS_BOARD_READY_POLL_INTERVAL_MSECS)
+#define CCISS_BOARD_NOT_READY_ITERATIONS \
+	((CCISS_BOARD_NOT_READY_WAIT_SECS * 1000) / \
+		CCISS_BOARD_READY_POLL_INTERVAL_MSECS)
+#define CCISS_POST_RESET_PAUSE_MSECS (3000)
+#define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (4000)
+#define CCISS_POST_RESET_NOOP_RETRIES (12)
+#define CCISS_POST_RESET_NOOP_TIMEOUT_MSECS (10000)
+
 /* 
 	Send the command to the hardware 
 */
 static void SA5_submit_command( ctlr_info_t *h, CommandList_struct *c) 
 {
 #ifdef CCISS_DEBUG
-	 printk("Sending %x - down to controller\n", c->busaddr );
-#endif /* CCISS_DEBUG */ 
+	printk(KERN_WARNING "cciss%d: Sending %08x - down to controller\n",
+			h->ctlr, c->busaddr);
+#endif /* CCISS_DEBUG */
          writel(c->busaddr, h->vaddr + SA5_REQUEST_PORT_OFFSET);
+	readl(h->vaddr + SA5_SCRATCHPAD_OFFSET);
 	 h->commands_outstanding++;
 	 if ( h->commands_outstanding > h->max_outstanding)
 		h->max_outstanding = h->commands_outstanding;
@@ -177,11 +241,13 @@ static void SA5_intr_mask(ctlr_info_t *h, unsigned long val)
 	{ /* Turn interrupts on */
 		h->interrupts_enabled = 1;
 		writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+		(void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
 	} else /* Turn them off */
 	{
 		h->interrupts_enabled = 0;
         	writel( SA5_INTR_OFF, 
 			h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+		(void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
 	}
 }
 /*
@@ -195,13 +261,31 @@ static void SA5B_intr_mask(ctlr_info_t *h, unsigned long val)
         { /* Turn interrupts on */
 		h->interrupts_enabled = 1;
                 writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+		(void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
         } else /* Turn them off */
         {
 		h->interrupts_enabled = 0;
                 writel( SA5B_INTR_OFF,
                         h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+		(void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
         }
 }
+
+/* Performant mode intr_mask */
+static void SA5_performant_intr_mask(ctlr_info_t *h, unsigned long val)
+{
+	if (val) { /* turn on interrupts */
+		h->interrupts_enabled = 1;
+		writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+		(void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+	} else {
+		h->interrupts_enabled = 0;
+		writel(SA5_PERF_INTR_OFF,
+				h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+		(void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+	}
+}
+
 /*
  *  Returns true if fifo is full.  
  * 
@@ -238,10 +322,44 @@ static unsigned long SA5_completed(ctlr_info_t *h)
 	return ( register_value); 
 
 }
+
+/* Performant mode command completed */
+static unsigned long SA5_performant_completed(ctlr_info_t *h)
+{
+	unsigned long register_value = FIFO_EMPTY;
+
+	/* flush the controller write of the reply queue by reading
+	 * outbound doorbell status register.
+	 */
+	register_value = readl(h->vaddr + SA5_OUTDB_STATUS);
+	/* msi auto clears the interrupt pending bit. */
+	if (!(h->msi_vector || h->msix_vector)) {
+		writel(SA5_OUTDB_CLEAR_PERF_BIT, h->vaddr + SA5_OUTDB_CLEAR);
+		/* Do a read in order to flush the write to the controller
+		 * (as per spec.)
+		 */
+		register_value = readl(h->vaddr + SA5_OUTDB_STATUS);
+	}
+
+	if ((*(h->reply_pool_head) & 1) == (h->reply_pool_wraparound)) {
+		register_value = *(h->reply_pool_head);
+		(h->reply_pool_head)++;
+		h->commands_outstanding--;
+	} else {
+		register_value = FIFO_EMPTY;
+	}
+	/* Check for wraparound */
+	if (h->reply_pool_head == (h->reply_pool + h->max_commands)) {
+		h->reply_pool_head = h->reply_pool;
+		h->reply_pool_wraparound ^= 1;
+	}
+
+	return register_value;
+}
 /*
  *	Returns true if an interrupt is pending.. 
  */
-static unsigned long SA5_intr_pending(ctlr_info_t *h)
+static bool SA5_intr_pending(ctlr_info_t *h)
 {
 	unsigned long register_value  = 
 		readl(h->vaddr + SA5_INTR_STATUS);
@@ -256,7 +374,7 @@ static unsigned long SA5_intr_pending(ctlr_info_t *h)
 /*
  *      Returns true if an interrupt is pending..
  */
-static unsigned long SA5B_intr_pending(ctlr_info_t *h)
+static bool SA5B_intr_pending(ctlr_info_t *h)
 {
         unsigned long register_value  =
                 readl(h->vaddr + SA5_INTR_STATUS);
@@ -268,6 +386,20 @@ static unsigned long SA5B_intr_pending(ctlr_info_t *h)
         return 0 ;
 }
 
+static bool SA5_performant_intr_pending(ctlr_info_t *h)
+{
+	unsigned long register_value = readl(h->vaddr + SA5_INTR_STATUS);
+
+	if (!register_value)
+		return false;
+
+	if (h->msi_vector || h->msix_vector)
+		return true;
+
+	/* Read outbound doorbell to flush */
+	register_value = readl(h->vaddr + SA5_OUTDB_STATUS);
+	return register_value & SA5_OUTDB_STATUS_PERF_BIT;
+}
 
 static struct access_method SA5_access = {
 	SA5_submit_command,
@@ -285,6 +417,14 @@ static struct access_method SA5B_access = {
         SA5_completed,
 };
 
+static struct access_method SA5_performant_access = {
+	SA5_submit_command,
+	SA5_performant_intr_mask,
+	SA5_fifo_full,
+	SA5_performant_intr_pending,
+	SA5_performant_completed,
+};
+
 struct board_type {
 	__u32	board_id;
 	char	*product_name;
@@ -292,7 +432,4 @@ struct board_type {
 	int nr_cmds; /* Max cmds this kind of ctlr can handle. */
 };
 
-#define CCISS_LOCK(i)	(&hba[i]->lock)
-
 #endif /* CCISS_H */
-
diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h
index cd665b00c7c..d9be6b4d49a 100644
--- a/drivers/block/cciss_cmd.h
+++ b/drivers/block/cciss_cmd.h
@@ -1,30 +1,16 @@
 #ifndef CCISS_CMD_H
 #define CCISS_CMD_H
-//###########################################################################
-//DEFINES
-//###########################################################################
+
+#include <linux/cciss_defs.h>
+
+/* DEFINES */
 #define CISS_VERSION "1.00"
 
-//general boundary defintions
-#define SENSEINFOBYTES          32//note that this value may vary between host implementations
-#define MAXSGENTRIES            31
+/* general boundary definitions */
+#define MAXSGENTRIES            32
+#define CCISS_SG_CHAIN          0x80000000
 #define MAXREPLYQS              256
 
-//Command Status value
-#define CMD_SUCCESS             0x0000
-#define CMD_TARGET_STATUS       0x0001
-#define CMD_DATA_UNDERRUN       0x0002
-#define CMD_DATA_OVERRUN        0x0003
-#define CMD_INVALID             0x0004
-#define CMD_PROTOCOL_ERR        0x0005
-#define CMD_HARDWARE_ERR        0x0006
-#define CMD_CONNECTION_LOST     0x0007
-#define CMD_ABORTED             0x0008
-#define CMD_ABORT_FAILED        0x0009
-#define CMD_UNSOLICITED_ABORT   0x000A
-#define CMD_TIMEOUT             0x000B
-#define CMD_UNABORTABLE		0x000C
-
 /* Unit Attentions ASC's as defined for the MSA2012sa */
 #define POWER_OR_RESET			0x29
 #define STATE_CHANGED			0x2a
@@ -48,30 +34,13 @@
 #define ASYM_ACCESS_CHANGED		0x06
 #define LUN_CAPACITY_CHANGED		0x09
 
-//transfer direction
-#define XFER_NONE               0x00
-#define XFER_WRITE              0x01
-#define XFER_READ               0x02
-#define XFER_RSVD               0x03
-
-//task attribute
-#define ATTR_UNTAGGED           0x00
-#define ATTR_SIMPLE             0x04
-#define ATTR_HEADOFQUEUE        0x05
-#define ATTR_ORDERED            0x06
-#define ATTR_ACA                0x07
-
-//cdb type
-#define TYPE_CMD				0x00
-#define TYPE_MSG				0x01
-
-//config space register offsets
+/* config space register offsets */
 #define CFG_VENDORID            0x00
 #define CFG_DEVICEID            0x02
 #define CFG_I2OBAR              0x10
 #define CFG_MEM1BAR             0x14
 
-//i2o space register offsets
+/* i2o space register offsets */
 #define I2O_IBDB_SET            0x20
 #define I2O_IBDB_CLEAR          0x70
 #define I2O_INT_STATUS          0x30
@@ -80,11 +49,15 @@
 #define I2O_OBPOST_Q            0x44
 #define I2O_DMA1_CFG		0x214
 
-//Configuration Table
+/* Configuration Table */
 #define CFGTBL_ChangeReq        0x00000001l
 #define CFGTBL_AccCmds          0x00000001l
+#define DOORBELL_CTLR_RESET     0x00000004l
+#define DOORBELL_CTLR_RESET2    0x00000020l
 
 #define CFGTBL_Trans_Simple     0x00000002l
+#define CFGTBL_Trans_Performant 0x00000004l
+#define CFGTBL_Trans_use_short_tags 0x20000000l
 
 #define CFGTBL_BusType_Ultra2   0x00000001l
 #define CFGTBL_BusType_Ultra3   0x00000002l
@@ -102,24 +75,17 @@ typedef union _u64bit
    __u64	val;
 } u64bit;
 
-// Type defs used in the following structs
-#define BYTE __u8
-#define WORD __u16
-#define HWORD __u16
-#define DWORD __u32
+/* Type defs used in the following structs */
 #define QWORD vals32 
 
-//###########################################################################
-//STRUCTURES
-//###########################################################################
-#define CISS_MAX_LUN	1024
+/* STRUCTURES */
 #define CISS_MAX_PHYS_LUN	1024
-// SCSI-3 Cmmands 
+/* SCSI-3 Commands */
 
 #pragma pack(1)	
 
 #define CISS_INQUIRY 0x12
-//Date returned
+/* Data returned */
 typedef struct _InquiryData_struct
 {
   BYTE data_byte[36];
@@ -127,7 +93,7 @@ typedef struct _InquiryData_struct
 
 #define CISS_REPORT_LOG 0xc2    /* Report Logical LUNs */
 #define CISS_REPORT_PHYS 0xc3   /* Report Physical LUNs */
-// Data returned
+/* Data returned */
 typedef struct _ReportLUNdata_struct
 {
   BYTE LUNListLength[4];
@@ -138,8 +104,8 @@ typedef struct _ReportLUNdata_struct
 #define CCISS_READ_CAPACITY 0x25 /* Read Capacity */ 
 typedef struct _ReadCapdata_struct
 {
-  BYTE total_size[4];	// Total size in blocks
-  BYTE block_size[4];	// Size of blocks in bytes
+  BYTE total_size[4];	/* Total size in blocks */
+  BYTE block_size[4];	/* Size of blocks in bytes */
 } ReadCapdata_struct;
 
 #define CCISS_READ_CAPACITY_16 0x9e /* Read Capacity 16 */
@@ -171,52 +137,21 @@ typedef struct _ReadCapdata_struct_16
 #define CDB_LEN10	10
 #define CDB_LEN16	16
 
-// BMIC commands 
+/* BMIC commands */
 #define BMIC_READ 0x26
 #define BMIC_WRITE 0x27
 #define BMIC_CACHE_FLUSH 0xc2
-#define CCISS_CACHE_FLUSH 0x01	//C2 was already being used by CCISS
-
-//Command List Structure
-typedef union _SCSI3Addr_struct {
-   struct {
-    BYTE Dev;
-    BYTE Bus:6;
-    BYTE Mode:2;        // b00
-  } PeripDev;
-   struct {
-    BYTE DevLSB;
-    BYTE DevMSB:6;
-    BYTE Mode:2;        // b01
-  } LogDev;
-   struct {
-    BYTE Dev:5;
-    BYTE Bus:3;
-    BYTE Targ:6;
-    BYTE Mode:2;        // b10
-  } LogUnit;
-} SCSI3Addr_struct;
-
-typedef struct _PhysDevAddr_struct {
-  DWORD             TargetId:24;
-  DWORD             Bus:6;
-  DWORD             Mode:2;
-  SCSI3Addr_struct  Target[2]; //2 level target device addr
-} PhysDevAddr_struct;
-  
-typedef struct _LogDevAddr_struct {
-  DWORD            VolId:30;
-  DWORD            Mode:2;
-  BYTE             reserved[4];
-} LogDevAddr_struct;
-
-typedef union _LUNAddr_struct {
-  BYTE               LunAddrBytes[8];
-  SCSI3Addr_struct   SCSI3Lun[4];
-  PhysDevAddr_struct PhysDev;
-  LogDevAddr_struct  LogDev;
-} LUNAddr_struct;
+#define CCISS_CACHE_FLUSH 0x01	/* C2 was already being used by CCISS */
 
+#define CCISS_ABORT_MSG 0x00
+#define CCISS_RESET_MSG 0x01
+#define CCISS_RESET_TYPE_CONTROLLER 0x00
+#define CCISS_RESET_TYPE_BUS 0x01
+#define CCISS_RESET_TYPE_TARGET 0x03
+#define CCISS_RESET_TYPE_LUN 0x04
+#define CCISS_NOOP_MSG 0x03
+
+/* Command List Structure */
 #define CTLR_LUNID "\0\0\0\0\0\0\0\0"
 
 typedef struct _CommandListHeader_struct {
@@ -226,16 +161,6 @@ typedef struct _CommandListHeader_struct {
   QWORD             Tag;
   LUNAddr_struct    LUN;
 } CommandListHeader_struct;
-typedef struct _RequestBlock_struct {
-  BYTE   CDBLen;
-  struct {
-    BYTE Type:3;
-    BYTE Attribute:3;
-    BYTE Direction:2;
-  } Type;
-  HWORD  Timeout;
-  BYTE   CDB[16];
-} RequestBlock_struct;
 typedef struct _ErrDescriptor_struct {
   QWORD  Addr;
   DWORD  Len;
@@ -246,39 +171,29 @@ typedef struct _SGDescriptor_struct {
   DWORD  Ext;
 } SGDescriptor_struct;
 
-typedef union _MoreErrInfo_struct{
-  struct {
-    BYTE  Reserved[3];
-    BYTE  Type;
-    DWORD ErrorInfo;
-  }Common_Info;
-  struct{
-    BYTE  Reserved[2];
-    BYTE  offense_size;//size of offending entry
-    BYTE  offense_num; //byte # of offense 0-base
-    DWORD offense_value;
-  }Invalid_Cmd;
-}MoreErrInfo_struct;
-typedef struct _ErrorInfo_struct {
-  BYTE               ScsiStatus;
-  BYTE               SenseLen;
-  HWORD              CommandStatus;
-  DWORD              ResidualCnt;
-  MoreErrInfo_struct MoreErrInfo;
-  BYTE               SenseInfo[SENSEINFOBYTES];
-} ErrorInfo_struct;
-
 /* Command types */
 #define CMD_RWREQ       0x00
 #define CMD_IOCTL_PEND  0x01
 #define CMD_SCSI	0x03
 #define CMD_MSG_DONE	0x04
 #define CMD_MSG_TIMEOUT 0x05
+#define CMD_MSG_STALE	0xff
 
-/* This structure needs to be divisible by 8 for new
- * indexing method.
+/* This structure needs to be divisible by COMMANDLIST_ALIGNMENT
+ * because the low bits of the address are used to indicate
+ * whether the tag contains an index or an address.  PAD_32 and
+ * PAD_64 can be adjusted independently as needed for 32-bit
+ * and 64-bit systems.
  */
-#define PADSIZE (sizeof(long) - 4)
+#define COMMANDLIST_ALIGNMENT (32)
+#define IS_64_BIT ((sizeof(long) - 4)/4)
+#define IS_32_BIT (!IS_64_BIT)
+#define PAD_32 (0)
+#define PAD_64 (4)
+#define PADSIZE (IS_32_BIT * PAD_32 + IS_64_BIT * PAD_64)
+#define DIRECT_LOOKUP_BIT 0x10
+#define DIRECT_LOOKUP_SHIFT 5
+
 typedef struct _CommandList_struct {
   CommandListHeader_struct Header;
   RequestBlock_struct      Request;
@@ -290,15 +205,15 @@ typedef struct _CommandList_struct {
   int			   ctlr;
   int			   cmd_type; 
   long			   cmdindex;
-  struct hlist_node list;
+  struct list_head list;
   struct request *	   rq;
   struct completion *waiting;
   int	 retry_count;
   void * scsi_cmd;
-  char   pad[PADSIZE];
+  char pad[PADSIZE];
 } CommandList_struct;
 
-//Configuration Table Structure
+/* Configuration Table Structure */
 typedef struct _HostWrite_struct {
   DWORD TransportRequest;
   DWORD Reserved;
@@ -309,15 +224,46 @@ typedef struct _HostWrite_struct {
 typedef struct _CfgTable_struct {
   BYTE             Signature[4];
   DWORD            SpecValence;
+#define SIMPLE_MODE	0x02
+#define PERFORMANT_MODE	0x04
+#define MEMQ_MODE	0x08
   DWORD            TransportSupport;
   DWORD            TransportActive;
   HostWrite_struct HostWrite;
   DWORD            CmdsOutMax;
   DWORD            BusTypes;
-  DWORD            Reserved; 
+  DWORD            TransMethodOffset;
   BYTE             ServerName[16];
   DWORD            HeartBeat;
   DWORD            SCSI_Prefetch;
+  DWORD            MaxSGElements;
+  DWORD            MaxLogicalUnits;
+  DWORD            MaxPhysicalDrives;
+  DWORD            MaxPhysicalDrivesPerLogicalUnit;
+  DWORD            MaxPerformantModeCommands;
+  u8		   reserved[0x78 - 0x58];
+  u32		   misc_fw_support; /* offset 0x78 */
+#define MISC_FW_DOORBELL_RESET (0x02)
+#define MISC_FW_DOORBELL_RESET2 (0x10)
+	u8	   driver_version[32];
 } CfgTable_struct;
+
+struct TransTable_struct {
+  u32 BlockFetch0;
+  u32 BlockFetch1;
+  u32 BlockFetch2;
+  u32 BlockFetch3;
+  u32 BlockFetch4;
+  u32 BlockFetch5;
+  u32 BlockFetch6;
+  u32 BlockFetch7;
+  u32 RepQSize;
+  u32 RepQCount;
+  u32 RepQCtrAddrLow32;
+  u32 RepQCtrAddrHigh32;
+  u32 RepQAddr0Low32;
+  u32 RepQAddr0High32;
+};
+
 #pragma pack()	 
-#endif // CCISS_CMD_H
+#endif /* CCISS_CMD_H */
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 3315268b4ec..ecd845cd28d 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -33,7 +33,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 
 #include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_device.h>
@@ -44,24 +44,24 @@
 #define CCISS_ABORT_MSG 0x00
 #define CCISS_RESET_MSG 0x01
 
-static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
+static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff,
 	size_t size,
 	__u8 page_code, unsigned char *scsi3addr,
 	int cmd_type);
 
-static CommandList_struct *cmd_alloc(ctlr_info_t *h, int get_from_pool);
-static void cmd_free(ctlr_info_t *h, CommandList_struct *c, int got_from_pool);
+static CommandList_struct *cmd_alloc(ctlr_info_t *h);
+static CommandList_struct *cmd_special_alloc(ctlr_info_t *h);
+static void cmd_free(ctlr_info_t *h, CommandList_struct *c);
+static void cmd_special_free(ctlr_info_t *h, CommandList_struct *c);
 
-static int cciss_scsi_proc_info(
-		struct Scsi_Host *sh,
+static int cciss_scsi_write_info(struct Scsi_Host *sh,
 		char *buffer, /* data buffer */
-		char **start, 	   /* where data in buffer starts */
-		off_t offset,	   /* offset from start of imaginary file */
-		int length, 	   /* length of data in buffer */
-		int func);	   /* 0 == read, 1 == write */
+		int length); 	   /* length of data in buffer */
+static int cciss_scsi_show_info(struct seq_file *m,
+				struct Scsi_Host *sh);
 
-static int cciss_scsi_queue_command (struct scsi_cmnd *cmd,
-		void (* done)(struct scsi_cmnd *));
+static int cciss_scsi_queue_command (struct Scsi_Host *h,
+				     struct scsi_cmnd *cmd);
 static int cciss_eh_device_reset_handler(struct scsi_cmnd *);
 static int cciss_eh_abort_handler(struct scsi_cmnd *);
 
@@ -80,11 +80,10 @@ static struct scsi_host_template cciss_driver_template = {
 	.module			= THIS_MODULE,
 	.name			= "cciss",
 	.proc_name		= "cciss",
-	.proc_info		= cciss_scsi_proc_info,
+	.write_info		= cciss_scsi_write_info,
+	.show_info		= cciss_scsi_show_info,
 	.queuecommand		= cciss_scsi_queue_command,
-	.can_queue		= SCSI_CCISS_CAN_QUEUE,
 	.this_id		= 7,
-	.sg_tablesize		= MAXSGENTRIES,
 	.cmd_per_lun		= 1,
 	.use_clustering		= DISABLE_CLUSTERING,
 	/* Can't have eh_bus_reset_handler or eh_host_reset_handler for cciss */
@@ -93,47 +92,48 @@ static struct scsi_host_template cciss_driver_template = {
 };
 
 #pragma pack(1)
+
+#define SCSI_PAD_32 8
+#define SCSI_PAD_64 8
+
 struct cciss_scsi_cmd_stack_elem_t {
 	CommandList_struct cmd;
 	ErrorInfo_struct Err;
 	__u32 busaddr;
-	__u32 pad;
+	int cmdindex;
+	u8 pad[IS_32_BIT * SCSI_PAD_32 + IS_64_BIT * SCSI_PAD_64];
 };
 
 #pragma pack()
 
-#define CMD_STACK_SIZE (SCSI_CCISS_CAN_QUEUE * \
-		CCISS_MAX_SCSI_DEVS_PER_HBA + 2)
-			// plus two for init time usage
-
 #pragma pack(1)
 struct cciss_scsi_cmd_stack_t {
 	struct cciss_scsi_cmd_stack_elem_t *pool;
-	struct cciss_scsi_cmd_stack_elem_t *elem[CMD_STACK_SIZE];
+	struct cciss_scsi_cmd_stack_elem_t **elem;
 	dma_addr_t cmd_pool_handle;
 	int top;
+	int nelems;
 };
 #pragma pack()
 
 struct cciss_scsi_adapter_data_t {
 	struct Scsi_Host *scsi_host;
 	struct cciss_scsi_cmd_stack_t cmd_stack;
+	SGDescriptor_struct **cmd_sg_list;
 	int registered;
 	spinlock_t lock; // to protect ccissscsi[ctlr]; 
 };
 
-#define CPQ_TAPE_LOCK(ctlr, flags) spin_lock_irqsave( \
-	&(((struct cciss_scsi_adapter_data_t *) \
-	hba[ctlr]->scsi_ctlr)->lock), flags);
-#define CPQ_TAPE_UNLOCK(ctlr, flags) spin_unlock_irqrestore( \
-	&(((struct cciss_scsi_adapter_data_t *) \
-	hba[ctlr]->scsi_ctlr)->lock), flags);
+#define CPQ_TAPE_LOCK(h, flags) spin_lock_irqsave( \
+	&h->scsi_ctlr->lock, flags);
+#define CPQ_TAPE_UNLOCK(h, flags) spin_unlock_irqrestore( \
+	&h->scsi_ctlr->lock, flags);
 
 static CommandList_struct *
 scsi_cmd_alloc(ctlr_info_t *h)
 {
 	/* assume only one process in here at a time, locking done by caller. */
-	/* use CCISS_LOCK(ctlr) */
+	/* use h->lock */
 	/* might be better to rewrite how we allocate scsi commands in a way that */
 	/* needs no locking at all. */
 
@@ -143,7 +143,7 @@ scsi_cmd_alloc(ctlr_info_t *h)
 	struct cciss_scsi_cmd_stack_t *stk;
 	u64bit temp64;
 
-	sa = (struct cciss_scsi_adapter_data_t *) h->scsi_ctlr;
+	sa = h->scsi_ctlr;
 	stk = &sa->cmd_stack; 
 
 	if (stk->top < 0) 
@@ -154,6 +154,7 @@ scsi_cmd_alloc(ctlr_info_t *h)
 	memset(&c->Err, 0, sizeof(c->Err));
 	/* set physical addr of cmd and addr of scsi parameters */
 	c->cmd.busaddr = c->busaddr; 
+	c->cmd.cmdindex = c->cmdindex;
 	/* (__u32) (stk->cmd_pool_handle + 
 		(sizeof(struct cciss_scsi_cmd_stack_elem_t)*stk->top)); */
 
@@ -173,74 +174,90 @@ scsi_cmd_alloc(ctlr_info_t *h)
 }
 
 static void 
-scsi_cmd_free(ctlr_info_t *h, CommandList_struct *cmd)
+scsi_cmd_free(ctlr_info_t *h, CommandList_struct *c)
 {
 	/* assume only one process in here at a time, locking done by caller. */
-	/* use CCISS_LOCK(ctlr) */
+	/* use h->lock */
 	/* drop the free memory chunk on top of the stack. */
 
 	struct cciss_scsi_adapter_data_t *sa;
 	struct cciss_scsi_cmd_stack_t *stk;
 
-	sa = (struct cciss_scsi_adapter_data_t *) h->scsi_ctlr;
+	sa = h->scsi_ctlr;
 	stk = &sa->cmd_stack; 
-	if (stk->top >= CMD_STACK_SIZE) {
-		printk("cciss: scsi_cmd_free called too many times.\n");
+	stk->top++;
+	if (stk->top >= stk->nelems) {
+		dev_err(&h->pdev->dev,
+			"scsi_cmd_free called too many times.\n");
 		BUG();
 	}
-	stk->top++;
-	stk->elem[stk->top] = (struct cciss_scsi_cmd_stack_elem_t *) cmd;
+	stk->elem[stk->top] = (struct cciss_scsi_cmd_stack_elem_t *) c;
 }
 
 static int
-scsi_cmd_stack_setup(int ctlr, struct cciss_scsi_adapter_data_t *sa)
+scsi_cmd_stack_setup(ctlr_info_t *h, struct cciss_scsi_adapter_data_t *sa)
 {
 	int i;
 	struct cciss_scsi_cmd_stack_t *stk;
 	size_t size;
 
-	stk = &sa->cmd_stack; 
-	size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * CMD_STACK_SIZE;
+	stk = &sa->cmd_stack;
+	stk->nelems = cciss_tape_cmds + 2;
+	sa->cmd_sg_list = cciss_allocate_sg_chain_blocks(h,
+		h->chainsize, stk->nelems);
+	if (!sa->cmd_sg_list && h->chainsize > 0)
+		return -ENOMEM;
 
-	// pci_alloc_consistent guarantees 32-bit DMA address will
-	// be used
+	size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * stk->nelems;
 
+	/* Check alignment, see cciss_cmd.h near CommandList_struct def. */
+	BUILD_BUG_ON((sizeof(*stk->pool) % COMMANDLIST_ALIGNMENT) != 0);
+	/* pci_alloc_consistent guarantees 32-bit DMA address will be used */
 	stk->pool = (struct cciss_scsi_cmd_stack_elem_t *)
-		pci_alloc_consistent(hba[ctlr]->pdev, size, &stk->cmd_pool_handle);
+		pci_alloc_consistent(h->pdev, size, &stk->cmd_pool_handle);
 
 	if (stk->pool == NULL) {
-		printk("stk->pool is null\n");
+		cciss_free_sg_chain_blocks(sa->cmd_sg_list, stk->nelems);
+		sa->cmd_sg_list = NULL;
+		return -ENOMEM;
+	}
+	stk->elem = kmalloc(sizeof(stk->elem[0]) * stk->nelems, GFP_KERNEL);
+	if (!stk->elem) {
+		pci_free_consistent(h->pdev, size, stk->pool,
+		stk->cmd_pool_handle);
 		return -1;
 	}
-
-	for (i=0; i<CMD_STACK_SIZE; i++) {
+	for (i = 0; i < stk->nelems; i++) {
 		stk->elem[i] = &stk->pool[i];
 		stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle + 
 			(sizeof(struct cciss_scsi_cmd_stack_elem_t) * i));
+		stk->elem[i]->cmdindex = i;
 	}
-	stk->top = CMD_STACK_SIZE-1;
+	stk->top = stk->nelems-1;
 	return 0;
 }
 
 static void
-scsi_cmd_stack_free(int ctlr)
+scsi_cmd_stack_free(ctlr_info_t *h)
 {
 	struct cciss_scsi_adapter_data_t *sa;
 	struct cciss_scsi_cmd_stack_t *stk;
 	size_t size;
 
-	sa = (struct cciss_scsi_adapter_data_t *) hba[ctlr]->scsi_ctlr;
+	sa = h->scsi_ctlr;
 	stk = &sa->cmd_stack; 
-	if (stk->top != CMD_STACK_SIZE-1) {
-		printk( "cciss: %d scsi commands are still outstanding.\n",
-			CMD_STACK_SIZE - stk->top);
-		// BUG();
-		printk("WE HAVE A BUG HERE!!! stk=0x%p\n", stk);
+	if (stk->top != stk->nelems-1) {
+		dev_warn(&h->pdev->dev,
+			"bug: %d scsi commands are still outstanding.\n",
+			stk->nelems - stk->top);
 	}
-	size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * CMD_STACK_SIZE;
+	size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * stk->nelems;
 
-	pci_free_consistent(hba[ctlr]->pdev, size, stk->pool, stk->cmd_pool_handle);
+	pci_free_consistent(h->pdev, size, stk->pool, stk->cmd_pool_handle);
 	stk->pool = NULL;
+	cciss_free_sg_chain_blocks(sa->cmd_sg_list, stk->nelems);
+	kfree(stk->elem);
+	stk->elem = NULL;
 }
 
 #if 0
@@ -330,20 +347,20 @@ print_cmd(CommandList_struct *cp)
 #endif
 
 static int 
-find_bus_target_lun(int ctlr, int *bus, int *target, int *lun)
+find_bus_target_lun(ctlr_info_t *h, int *bus, int *target, int *lun)
 {
 	/* finds an unused bus, target, lun for a new device */
-	/* assumes hba[ctlr]->scsi_ctlr->lock is held */ 
+	/* assumes h->scsi_ctlr->lock is held */
 	int i, found=0;
 	unsigned char target_taken[CCISS_MAX_SCSI_DEVS_PER_HBA];
 
 	memset(&target_taken[0], 0, CCISS_MAX_SCSI_DEVS_PER_HBA);
 
 	target_taken[SELF_SCSI_ID] = 1;	
-	for (i=0;i<ccissscsi[ctlr].ndevices;i++)
-		target_taken[ccissscsi[ctlr].dev[i].target] = 1;
+	for (i = 0; i < ccissscsi[h->ctlr].ndevices; i++)
+		target_taken[ccissscsi[h->ctlr].dev[i].target] = 1;
 	
-	for (i=0;i<CCISS_MAX_SCSI_DEVS_PER_HBA;i++) {
+	for (i = 0; i < CCISS_MAX_SCSI_DEVS_PER_HBA; i++) {
 		if (!target_taken[i]) {
 			*bus = 0; *target=i; *lun = 0; found=1;
 			break;
@@ -357,19 +374,19 @@ struct scsi2map {
 };
 
 static int 
-cciss_scsi_add_entry(int ctlr, int hostno, 
+cciss_scsi_add_entry(ctlr_info_t *h, int hostno,
 		struct cciss_scsi_dev_t *device,
 		struct scsi2map *added, int *nadded)
 {
-	/* assumes hba[ctlr]->scsi_ctlr->lock is held */ 
-	int n = ccissscsi[ctlr].ndevices;
+	/* assumes h->scsi_ctlr->lock is held */
+	int n = ccissscsi[h->ctlr].ndevices;
 	struct cciss_scsi_dev_t *sd;
 	int i, bus, target, lun;
 	unsigned char addr1[8], addr2[8];
 
 	if (n >= CCISS_MAX_SCSI_DEVS_PER_HBA) {
-		printk("cciss%d: Too many devices, "
-			"some will be inaccessible.\n", ctlr);
+		dev_warn(&h->pdev->dev, "Too many devices, "
+			"some will be inaccessible.\n");
 		return -1;
 	}
 
@@ -385,7 +402,7 @@ cciss_scsi_add_entry(int ctlr, int hostno,
 		memcpy(addr1, device->scsi3addr, 8);
 		addr1[4] = 0;
 		for (i = 0; i < n; i++) {
-			sd = &ccissscsi[ctlr].dev[i];
+			sd = &ccissscsi[h->ctlr].dev[i];
 			memcpy(addr2, sd->scsi3addr, 8);
 			addr2[4] = 0;
 			/* differ only in byte 4? */
@@ -398,9 +415,9 @@ cciss_scsi_add_entry(int ctlr, int hostno,
 		}
 	}
 
-	sd = &ccissscsi[ctlr].dev[n];
+	sd = &ccissscsi[h->ctlr].dev[n];
 	if (lun == 0) {
-		if (find_bus_target_lun(ctlr,
+		if (find_bus_target_lun(h,
 			&sd->bus, &sd->target, &sd->lun) != 0)
 			return -1;
 	} else {
@@ -419,37 +436,37 @@ cciss_scsi_add_entry(int ctlr, int hostno,
 	memcpy(sd->device_id, device->device_id, sizeof(sd->device_id));
 	sd->devtype = device->devtype;
 
-	ccissscsi[ctlr].ndevices++;
+	ccissscsi[h->ctlr].ndevices++;
 
 	/* initially, (before registering with scsi layer) we don't 
 	   know our hostno and we don't want to print anything first 
 	   time anyway (the scsi layer's inquiries will show that info) */
 	if (hostno != -1)
-		printk("cciss%d: %s device c%db%dt%dl%d added.\n", 
-			ctlr, scsi_device_type(sd->devtype), hostno,
+		dev_info(&h->pdev->dev, "%s device c%db%dt%dl%d added.\n",
+			scsi_device_type(sd->devtype), hostno,
 			sd->bus, sd->target, sd->lun);
 	return 0;
 }
 
 static void
-cciss_scsi_remove_entry(int ctlr, int hostno, int entry,
+cciss_scsi_remove_entry(ctlr_info_t *h, int hostno, int entry,
 	struct scsi2map *removed, int *nremoved)
 {
-	/* assumes hba[ctlr]->scsi_ctlr->lock is held */ 
+	/* assumes h->scsi_ctlr->lock is held */
 	int i;
 	struct cciss_scsi_dev_t sd;
 
 	if (entry < 0 || entry >= CCISS_MAX_SCSI_DEVS_PER_HBA) return;
-	sd = ccissscsi[ctlr].dev[entry];
+	sd = ccissscsi[h->ctlr].dev[entry];
 	removed[*nremoved].bus    = sd.bus;
 	removed[*nremoved].target = sd.target;
 	removed[*nremoved].lun    = sd.lun;
 	(*nremoved)++;
-	for (i=entry;i<ccissscsi[ctlr].ndevices-1;i++)
-		ccissscsi[ctlr].dev[i] = ccissscsi[ctlr].dev[i+1];
-	ccissscsi[ctlr].ndevices--;
-	printk("cciss%d: %s device c%db%dt%dl%d removed.\n",
-		ctlr, scsi_device_type(sd.devtype), hostno,
+	for (i = entry; i < ccissscsi[h->ctlr].ndevices-1; i++)
+		ccissscsi[h->ctlr].dev[i] = ccissscsi[h->ctlr].dev[i+1];
+	ccissscsi[h->ctlr].ndevices--;
+	dev_info(&h->pdev->dev, "%s device c%db%dt%dl%d removed.\n",
+		scsi_device_type(sd.devtype), hostno,
 			sd.bus, sd.target, sd.lun);
 }
 
@@ -464,24 +481,24 @@ cciss_scsi_remove_entry(int ctlr, int hostno, int entry,
 	(a)[1] == (b)[1] && \
 	(a)[0] == (b)[0])
 
-static void fixup_botched_add(int ctlr, char *scsi3addr)
+static void fixup_botched_add(ctlr_info_t *h, char *scsi3addr)
 {
 	/* called when scsi_add_device fails in order to re-adjust */
 	/* ccissscsi[] to match the mid layer's view. */
 	unsigned long flags;
 	int i, j;
-	CPQ_TAPE_LOCK(ctlr, flags);
-	for (i = 0; i < ccissscsi[ctlr].ndevices; i++) {
+	CPQ_TAPE_LOCK(h, flags);
+	for (i = 0; i < ccissscsi[h->ctlr].ndevices; i++) {
 		if (memcmp(scsi3addr,
-				ccissscsi[ctlr].dev[i].scsi3addr, 8) == 0) {
-			for (j = i; j < ccissscsi[ctlr].ndevices-1; j++)
-				ccissscsi[ctlr].dev[j] =
-					ccissscsi[ctlr].dev[j+1];
-			ccissscsi[ctlr].ndevices--;
+				ccissscsi[h->ctlr].dev[i].scsi3addr, 8) == 0) {
+			for (j = i; j < ccissscsi[h->ctlr].ndevices-1; j++)
+				ccissscsi[h->ctlr].dev[j] =
+					ccissscsi[h->ctlr].dev[j+1];
+			ccissscsi[h->ctlr].ndevices--;
 			break;
 		}
 	}
-	CPQ_TAPE_UNLOCK(ctlr, flags);
+	CPQ_TAPE_UNLOCK(h, flags);
 }
 
 static int device_is_the_same(struct cciss_scsi_dev_t *dev1,
@@ -501,7 +518,7 @@ static int device_is_the_same(struct cciss_scsi_dev_t *dev1,
 }
 
 static int
-adjust_cciss_scsi_table(int ctlr, int hostno,
+adjust_cciss_scsi_table(ctlr_info_t *h, int hostno,
 	struct cciss_scsi_dev_t sd[], int nsds)
 {
 	/* sd contains scsi3 addresses and devtypes, but
@@ -522,16 +539,15 @@ adjust_cciss_scsi_table(int ctlr, int hostno,
 			GFP_KERNEL);
 
 	if (!added || !removed) {
-		printk(KERN_WARNING "cciss%d: Out of memory in "
-			"adjust_cciss_scsi_table\n", ctlr);
+		dev_warn(&h->pdev->dev,
+			"Out of memory in adjust_cciss_scsi_table\n");
 		goto free_and_out;
 	}
 
-	CPQ_TAPE_LOCK(ctlr, flags);
+	CPQ_TAPE_LOCK(h, flags);
 
 	if (hostno != -1)  /* if it's not the first time... */
-		sh = ((struct cciss_scsi_adapter_data_t *)
-			hba[ctlr]->scsi_ctlr)->scsi_host;
+		sh = h->scsi_ctlr->scsi_host;
 
 	/* find any devices in ccissscsi[] that are not in 
 	   sd[] and remove them from ccissscsi[] */
@@ -539,8 +555,8 @@ adjust_cciss_scsi_table(int ctlr, int hostno,
 	i = 0;
 	nremoved = 0;
 	nadded = 0;
-	while(i<ccissscsi[ctlr].ndevices) {
-		csd = &ccissscsi[ctlr].dev[i];
+	while (i < ccissscsi[h->ctlr].ndevices) {
+		csd = &ccissscsi[h->ctlr].dev[i];
 		found=0;
 		for (j=0;j<nsds;j++) {
 			if (SCSI3ADDR_EQ(sd[j].scsi3addr,
@@ -555,20 +571,18 @@ adjust_cciss_scsi_table(int ctlr, int hostno,
 
 		if (found == 0) { /* device no longer present. */ 
 			changes++;
-			/* printk("cciss%d: %s device c%db%dt%dl%d removed.\n",
-				ctlr, scsi_device_type(csd->devtype), hostno,
-					csd->bus, csd->target, csd->lun); */
-			cciss_scsi_remove_entry(ctlr, hostno, i,
+			cciss_scsi_remove_entry(h, hostno, i,
 				removed, &nremoved);
 			/* remove ^^^, hence i not incremented */
 		} else if (found == 1) { /* device is different in some way */
 			changes++;
-			printk("cciss%d: device c%db%dt%dl%d has changed.\n",
-				ctlr, hostno, csd->bus, csd->target, csd->lun);
-			cciss_scsi_remove_entry(ctlr, hostno, i,
+			dev_info(&h->pdev->dev,
+				"device c%db%dt%dl%d has changed.\n",
+				hostno, csd->bus, csd->target, csd->lun);
+			cciss_scsi_remove_entry(h, hostno, i,
 				removed, &nremoved);
 			/* remove ^^^, hence i not incremented */
-			if (cciss_scsi_add_entry(ctlr, hostno, &sd[j],
+			if (cciss_scsi_add_entry(h, hostno, &sd[j],
 				added, &nadded) != 0)
 				/* we just removed one, so add can't fail. */
 					BUG();
@@ -590,8 +604,8 @@ adjust_cciss_scsi_table(int ctlr, int hostno,
 
 	for (i=0;i<nsds;i++) {
 		found=0;
-		for (j=0;j<ccissscsi[ctlr].ndevices;j++) {
-			csd = &ccissscsi[ctlr].dev[j];
+		for (j = 0; j < ccissscsi[h->ctlr].ndevices; j++) {
+			csd = &ccissscsi[h->ctlr].dev[j];
 			if (SCSI3ADDR_EQ(sd[i].scsi3addr,
 				csd->scsi3addr)) {
 				if (device_is_the_same(&sd[i], csd))
@@ -603,18 +617,18 @@ adjust_cciss_scsi_table(int ctlr, int hostno,
 		}
 		if (!found) {
 			changes++;
-			if (cciss_scsi_add_entry(ctlr, hostno, &sd[i],
+			if (cciss_scsi_add_entry(h, hostno, &sd[i],
 				added, &nadded) != 0)
 				break;
 		} else if (found == 1) {
 			/* should never happen... */
 			changes++;
-			printk(KERN_WARNING "cciss%d: device "
-				"unexpectedly changed\n", ctlr);
+			dev_warn(&h->pdev->dev,
+				"device unexpectedly changed\n");
 			/* but if it does happen, we just ignore that device */
 		}
 	}
-	CPQ_TAPE_UNLOCK(ctlr, flags);
+	CPQ_TAPE_UNLOCK(h, flags);
 
 	/* Don't notify scsi mid layer of any changes the first time through */
 	/* (or if there are no changes) scsi_scan_host will do it later the */
@@ -634,9 +648,9 @@ adjust_cciss_scsi_table(int ctlr, int hostno,
 			/* We don't expect to get here. */
 			/* future cmds to this device will get selection */
 			/* timeout as if the device was gone. */
-			printk(KERN_WARNING "cciss%d: didn't find "
+			dev_warn(&h->pdev->dev, "didn't find "
 				"c%db%dt%dl%d\n for removal.",
-				ctlr, hostno, removed[i].bus,
+				hostno, removed[i].bus,
 				removed[i].target, removed[i].lun);
 		}
 	}
@@ -648,13 +662,12 @@ adjust_cciss_scsi_table(int ctlr, int hostno,
 			added[i].target, added[i].lun);
 		if (rc == 0)
 			continue;
-		printk(KERN_WARNING "cciss%d: scsi_add_device "
+		dev_warn(&h->pdev->dev, "scsi_add_device "
 			"c%db%dt%dl%d failed, device not added.\n",
-			ctlr, hostno,
-			added[i].bus, added[i].target, added[i].lun);
+			hostno, added[i].bus, added[i].target, added[i].lun);
 		/* now we have to remove it from ccissscsi, */
 		/* since it didn't get added to scsi mid layer */
-		fixup_botched_add(ctlr, added[i].scsi3addr);
+		fixup_botched_add(h, added[i].scsi3addr);
 	}
 
 free_and_out:
@@ -664,33 +677,33 @@ free_and_out:
 }
 
 static int
-lookup_scsi3addr(int ctlr, int bus, int target, int lun, char *scsi3addr)
+lookup_scsi3addr(ctlr_info_t *h, int bus, int target, int lun, char *scsi3addr)
 {
 	int i;
 	struct cciss_scsi_dev_t *sd;
 	unsigned long flags;
 
-	CPQ_TAPE_LOCK(ctlr, flags);
-	for (i=0;i<ccissscsi[ctlr].ndevices;i++) {
-		sd = &ccissscsi[ctlr].dev[i];
+	CPQ_TAPE_LOCK(h, flags);
+	for (i = 0; i < ccissscsi[h->ctlr].ndevices; i++) {
+		sd = &ccissscsi[h->ctlr].dev[i];
 		if (sd->bus == bus &&
 		    sd->target == target &&
 		    sd->lun == lun) {
 			memcpy(scsi3addr, &sd->scsi3addr[0], 8);
-			CPQ_TAPE_UNLOCK(ctlr, flags);
+			CPQ_TAPE_UNLOCK(h, flags);
 			return 0;
 		}
 	}
-	CPQ_TAPE_UNLOCK(ctlr, flags);
+	CPQ_TAPE_UNLOCK(h, flags);
 	return -1;
 }
 
 static void 
-cciss_scsi_setup(int cntl_num)
+cciss_scsi_setup(ctlr_info_t *h)
 {
 	struct cciss_scsi_adapter_data_t * shba;
 
-	ccissscsi[cntl_num].ndevices = 0;
+	ccissscsi[h->ctlr].ndevices = 0;
 	shba = (struct cciss_scsi_adapter_data_t *)
 		kmalloc(sizeof(*shba), GFP_KERNEL);	
 	if (shba == NULL)
@@ -698,33 +711,35 @@ cciss_scsi_setup(int cntl_num)
 	shba->scsi_host = NULL;
 	spin_lock_init(&shba->lock);
 	shba->registered = 0;
-	if (scsi_cmd_stack_setup(cntl_num, shba) != 0) {
+	if (scsi_cmd_stack_setup(h, shba) != 0) {
 		kfree(shba);
 		shba = NULL;
 	}
-	hba[cntl_num]->scsi_ctlr = (void *) shba;
+	h->scsi_ctlr = shba;
 	return;
 }
 
-static void
-complete_scsi_command( CommandList_struct *cp, int timeout, __u32 tag)
+static void complete_scsi_command(CommandList_struct *c, int timeout,
+	__u32 tag)
 {
 	struct scsi_cmnd *cmd;
-	ctlr_info_t *ctlr;
+	ctlr_info_t *h;
 	ErrorInfo_struct *ei;
 
-	ei = cp->err_info;
+	ei = c->err_info;
 
 	/* First, see if it was a message rather than a command */
-	if (cp->Request.Type.Type == TYPE_MSG)  {
-		cp->cmd_type = CMD_MSG_DONE;
+	if (c->Request.Type.Type == TYPE_MSG)  {
+		c->cmd_type = CMD_MSG_DONE;
 		return;
 	}
 
-	cmd = (struct scsi_cmnd *) cp->scsi_cmd;	
-	ctlr = hba[cp->ctlr];
+	cmd = (struct scsi_cmnd *) c->scsi_cmd;
+	h = hba[c->ctlr];
 
 	scsi_dma_unmap(cmd);
+	if (c->Header.SGTotal > h->max_cmd_sgentries)
+		cciss_unmap_sg_chain_block(h, c);
 
 	cmd->result = (DID_OK << 16); 		/* host byte */
 	cmd->result |= (COMMAND_COMPLETE << 8);	/* msg byte */
@@ -747,17 +762,7 @@ complete_scsi_command( CommandList_struct *cp, int timeout, __u32 tag)
 		{
 			case CMD_TARGET_STATUS:
 				/* Pass it up to the upper layers... */
-				if( ei->ScsiStatus)
-                		{
-#if 0
-                    			printk(KERN_WARNING "cciss: cmd %p "
-					"has SCSI Status = %x\n",
-                        			cp,  
-						ei->ScsiStatus); 
-#endif
-					cmd->result |= (ei->ScsiStatus < 1);
-                		}
-				else {  /* scsi status is zero??? How??? */
+				if (!ei->ScsiStatus) {
 					
 	/* Ordinarily, this case should never happen, but there is a bug
 	   in some released firmware revisions that allows it to happen
@@ -773,13 +778,13 @@ complete_scsi_command( CommandList_struct *cp, int timeout, __u32 tag)
 			case CMD_DATA_UNDERRUN: /* let mid layer handle it. */
 			break;
 			case CMD_DATA_OVERRUN:
-				printk(KERN_WARNING "cciss: cp %p has"
+				dev_warn(&h->pdev->dev, "%p has"
 					" completed with data overrun "
-					"reported\n", cp);
+					"reported\n", c);
 			break;
 			case CMD_INVALID: {
-				/* print_bytes(cp, sizeof(*cp), 1, 0);
-				print_cmd(cp); */
+				/* print_bytes(c, sizeof(*c), 1, 0);
+				print_cmd(c); */
      /* We get CMD_INVALID if you address a non-existent tape drive instead
 	of a selection timeout (no response).  You will see this if you yank 
 	out a tape drive, then try to access it. This is kind of a shame
@@ -789,54 +794,56 @@ complete_scsi_command( CommandList_struct *cp, int timeout, __u32 tag)
 				}
 			break;
 			case CMD_PROTOCOL_ERR:
-                                printk(KERN_WARNING "cciss: cp %p has "
-					"protocol error \n", cp);
+				cmd->result = DID_ERROR << 16;
+				dev_warn(&h->pdev->dev,
+					"%p has protocol error\n", c);
                         break;
 			case CMD_HARDWARE_ERR:
 				cmd->result = DID_ERROR << 16;
-                                printk(KERN_WARNING "cciss: cp %p had " 
-                                        " hardware error\n", cp);
+				dev_warn(&h->pdev->dev,
+					"%p had hardware error\n", c);
                         break;
 			case CMD_CONNECTION_LOST:
 				cmd->result = DID_ERROR << 16;
-				printk(KERN_WARNING "cciss: cp %p had "
-					"connection lost\n", cp);
+				dev_warn(&h->pdev->dev,
+					"%p had connection lost\n", c);
 			break;
 			case CMD_ABORTED:
 				cmd->result = DID_ABORT << 16;
-				printk(KERN_WARNING "cciss: cp %p was "
-					"aborted\n", cp);
+				dev_warn(&h->pdev->dev, "%p was aborted\n", c);
 			break;
 			case CMD_ABORT_FAILED:
 				cmd->result = DID_ERROR << 16;
-				printk(KERN_WARNING "cciss: cp %p reports "
-					"abort failed\n", cp);
+				dev_warn(&h->pdev->dev,
+					"%p reports abort failed\n", c);
 			break;
 			case CMD_UNSOLICITED_ABORT:
 				cmd->result = DID_ABORT << 16;
-				printk(KERN_WARNING "cciss: cp %p aborted "
-					"do to an unsolicited abort\n", cp);
+				dev_warn(&h->pdev->dev, "%p aborted due to an "
+					"unsolicited abort\n", c);
 			break;
 			case CMD_TIMEOUT:
 				cmd->result = DID_TIME_OUT << 16;
-				printk(KERN_WARNING "cciss: cp %p timedout\n",
-					cp);
+				dev_warn(&h->pdev->dev, "%p timedout\n", c);
+			break;
+			case CMD_UNABORTABLE:
+				cmd->result = DID_ERROR << 16;
+				dev_warn(&h->pdev->dev, "c %p command "
+					"unabortable\n", c);
 			break;
 			default:
 				cmd->result = DID_ERROR << 16;
-				printk(KERN_WARNING "cciss: cp %p returned "
-					"unknown status %x\n", cp, 
+				dev_warn(&h->pdev->dev,
+					"%p returned unknown status %x\n", c,
 						ei->CommandStatus); 
 		}
 	}
-	// printk("c:%p:c%db%dt%dl%d ", cmd, ctlr->ctlr, cmd->channel, 
-	//	cmd->target, cmd->lun);
 	cmd->scsi_done(cmd);
-	scsi_cmd_free(ctlr, cp);
+	scsi_cmd_free(h, c);
 }
 
 static int
-cciss_scsi_detect(int ctlr)
+cciss_scsi_detect(ctlr_info_t *h)
 {
 	struct Scsi_Host *sh;
 	int error;
@@ -847,13 +854,17 @@ cciss_scsi_detect(int ctlr)
 	sh->io_port = 0;	// good enough?  FIXME, 
 	sh->n_io_port = 0;	// I don't think we use these two...
 	sh->this_id = SELF_SCSI_ID;  
+	sh->can_queue = cciss_tape_cmds;
+	sh->sg_tablesize = h->maxsgentries;
+	sh->max_cmd_len = MAX_COMMAND_SIZE;
+	sh->max_sectors = h->cciss_max_sectors;
 
 	((struct cciss_scsi_adapter_data_t *) 
-		hba[ctlr]->scsi_ctlr)->scsi_host = (void *) sh;
-	sh->hostdata[0] = (unsigned long) hba[ctlr];
-	sh->irq = hba[ctlr]->intr[SIMPLE_MODE_INT];
+		h->scsi_ctlr)->scsi_host = sh;
+	sh->hostdata[0] = (unsigned long) h;
+	sh->irq = h->intr[SIMPLE_MODE_INT];
 	sh->unique_id = sh->irq;
-	error = scsi_add_host(sh, &hba[ctlr]->pdev->dev);
+	error = scsi_add_host(sh, &h->pdev->dev);
 	if (error)
 		goto fail_host_put;
 	scsi_scan_host(sh);
@@ -867,20 +878,20 @@ cciss_scsi_detect(int ctlr)
 
 static void
 cciss_unmap_one(struct pci_dev *pdev,
-		CommandList_struct *cp,
+		CommandList_struct *c,
 		size_t buflen,
 		int data_direction)
 {
 	u64bit addr64;
 
-	addr64.val32.lower = cp->SG[0].Addr.lower;
-	addr64.val32.upper = cp->SG[0].Addr.upper;
+	addr64.val32.lower = c->SG[0].Addr.lower;
+	addr64.val32.upper = c->SG[0].Addr.upper;
 	pci_unmap_single(pdev, (dma_addr_t) addr64.val, buflen, data_direction);
 }
 
 static void
 cciss_map_one(struct pci_dev *pdev,
-		CommandList_struct *cp,
+		CommandList_struct *c,
 		unsigned char *buf,
 		size_t buflen,
 		int data_direction)
@@ -888,164 +899,153 @@ cciss_map_one(struct pci_dev *pdev,
 	__u64 addr64;
 
 	addr64 = (__u64) pci_map_single(pdev, buf, buflen, data_direction);
-	cp->SG[0].Addr.lower = 
+	c->SG[0].Addr.lower =
 	  (__u32) (addr64 & (__u64) 0x00000000FFFFFFFF);
-	cp->SG[0].Addr.upper =
+	c->SG[0].Addr.upper =
 	  (__u32) ((addr64 >> 32) & (__u64) 0x00000000FFFFFFFF);
-	cp->SG[0].Len = buflen;
-	cp->Header.SGList = (__u8) 1;   /* no. SGs contig in this cmd */
-	cp->Header.SGTotal = (__u16) 1; /* total sgs in this cmd list */
+	c->SG[0].Len = buflen;
+	c->Header.SGList = (__u8) 1;   /* no. SGs contig in this cmd */
+	c->Header.SGTotal = (__u16) 1; /* total sgs in this cmd list */
 }
 
 static int
-cciss_scsi_do_simple_cmd(ctlr_info_t *c,
-			CommandList_struct *cp,
+cciss_scsi_do_simple_cmd(ctlr_info_t *h,
+			CommandList_struct *c,
 			unsigned char *scsi3addr, 
 			unsigned char *cdb,
 			unsigned char cdblen,
 			unsigned char *buf, int bufsize,
 			int direction)
 {
-	unsigned long flags;
 	DECLARE_COMPLETION_ONSTACK(wait);
 
-	cp->cmd_type = CMD_IOCTL_PEND;		// treat this like an ioctl 
-	cp->scsi_cmd = NULL;
-	cp->Header.ReplyQueue = 0;  // unused in simple mode
-	memcpy(&cp->Header.LUN, scsi3addr, sizeof(cp->Header.LUN));
-	cp->Header.Tag.lower = cp->busaddr;  // Use k. address of cmd as tag
+	c->cmd_type = CMD_IOCTL_PEND; /* treat this like an ioctl */
+	c->scsi_cmd = NULL;
+	c->Header.ReplyQueue = 0;  /* unused in simple mode */
+	memcpy(&c->Header.LUN, scsi3addr, sizeof(c->Header.LUN));
+	c->Header.Tag.lower = c->busaddr;  /* Use k. address of cmd as tag */
 	// Fill in the request block...
 
 	/* printk("Using scsi3addr 0x%02x%0x2%0x2%0x2%0x2%0x2%0x2%0x2\n", 
 		scsi3addr[0], scsi3addr[1], scsi3addr[2], scsi3addr[3],
 		scsi3addr[4], scsi3addr[5], scsi3addr[6], scsi3addr[7]); */
 
-	memset(cp->Request.CDB, 0, sizeof(cp->Request.CDB));
-	memcpy(cp->Request.CDB, cdb, cdblen);
-	cp->Request.Timeout = 0;
-	cp->Request.CDBLen = cdblen;
-	cp->Request.Type.Type = TYPE_CMD;
-	cp->Request.Type.Attribute = ATTR_SIMPLE;
-	cp->Request.Type.Direction = direction;
+	memset(c->Request.CDB, 0, sizeof(c->Request.CDB));
+	memcpy(c->Request.CDB, cdb, cdblen);
+	c->Request.Timeout = 0;
+	c->Request.CDBLen = cdblen;
+	c->Request.Type.Type = TYPE_CMD;
+	c->Request.Type.Attribute = ATTR_SIMPLE;
+	c->Request.Type.Direction = direction;
 
 	/* Fill in the SG list and do dma mapping */
-	cciss_map_one(c->pdev, cp, (unsigned char *) buf,
+	cciss_map_one(h->pdev, c, (unsigned char *) buf,
 			bufsize, DMA_FROM_DEVICE); 
 
-	cp->waiting = &wait;
-
-	/* Put the request on the tail of the request queue */
-	spin_lock_irqsave(CCISS_LOCK(c->ctlr), flags);
-	addQ(&c->reqQ, cp);
-	c->Qdepth++;
-	start_io(c);
-	spin_unlock_irqrestore(CCISS_LOCK(c->ctlr), flags);
-
+	c->waiting = &wait;
+	enqueue_cmd_and_start_io(h, c);
 	wait_for_completion(&wait);
 
 	/* undo the dma mapping */
-	cciss_unmap_one(c->pdev, cp, bufsize, DMA_FROM_DEVICE);
+	cciss_unmap_one(h->pdev, c, bufsize, DMA_FROM_DEVICE);
 	return(0);
 }
 
 static void 
-cciss_scsi_interpret_error(CommandList_struct *cp)
+cciss_scsi_interpret_error(ctlr_info_t *h, CommandList_struct *c)
 {
 	ErrorInfo_struct *ei;
 
-	ei = cp->err_info; 
+	ei = c->err_info;
 	switch(ei->CommandStatus)
 	{
 		case CMD_TARGET_STATUS:
-			printk(KERN_WARNING "cciss: cmd %p has "
-				"completed with errors\n", cp);
-			printk(KERN_WARNING "cciss: cmd %p "
-				"has SCSI Status = %x\n",
-					cp,  
-					ei->ScsiStatus);
+			dev_warn(&h->pdev->dev,
+				"cmd %p has completed with errors\n", c);
+			dev_warn(&h->pdev->dev,
+				"cmd %p has SCSI Status = %x\n",
+				c, ei->ScsiStatus);
 			if (ei->ScsiStatus == 0)
-				printk(KERN_WARNING 
-				"cciss:SCSI status is abnormally zero.  "
+				dev_warn(&h->pdev->dev,
+				"SCSI status is abnormally zero.  "
 				"(probably indicates selection timeout "
 				"reported incorrectly due to a known "
 				"firmware bug, circa July, 2001.)\n");
 		break;
 		case CMD_DATA_UNDERRUN: /* let mid layer handle it. */
-			printk("UNDERRUN\n");
+			dev_info(&h->pdev->dev, "UNDERRUN\n");
 		break;
 		case CMD_DATA_OVERRUN:
-			printk(KERN_WARNING "cciss: cp %p has"
+			dev_warn(&h->pdev->dev, "%p has"
 				" completed with data overrun "
-				"reported\n", cp);
+				"reported\n", c);
 		break;
 		case CMD_INVALID: {
 			/* controller unfortunately reports SCSI passthru's */
 			/* to non-existent targets as invalid commands. */
-			printk(KERN_WARNING "cciss: cp %p is "
-				"reported invalid (probably means "
-				"target device no longer present)\n", 
-				cp); 
-			/* print_bytes((unsigned char *) cp, sizeof(*cp), 1, 0);
-			print_cmd(cp);  */
+			dev_warn(&h->pdev->dev,
+				"%p is reported invalid (probably means "
+				"target device no longer present)\n", c);
+			/* print_bytes((unsigned char *) c, sizeof(*c), 1, 0);
+			print_cmd(c);  */
 			}
 		break;
 		case CMD_PROTOCOL_ERR:
-			printk(KERN_WARNING "cciss: cp %p has "
-				"protocol error \n", cp);
+			dev_warn(&h->pdev->dev, "%p has protocol error\n", c);
 		break;
 		case CMD_HARDWARE_ERR:
 			/* cmd->result = DID_ERROR << 16; */
-			printk(KERN_WARNING "cciss: cp %p had " 
-				" hardware error\n", cp);
+			dev_warn(&h->pdev->dev, "%p had hardware error\n", c);
 		break;
 		case CMD_CONNECTION_LOST:
-			printk(KERN_WARNING "cciss: cp %p had "
-				"connection lost\n", cp);
+			dev_warn(&h->pdev->dev, "%p had connection lost\n", c);
 		break;
 		case CMD_ABORTED:
-			printk(KERN_WARNING "cciss: cp %p was "
-				"aborted\n", cp);
+			dev_warn(&h->pdev->dev, "%p was aborted\n", c);
 		break;
 		case CMD_ABORT_FAILED:
-			printk(KERN_WARNING "cciss: cp %p reports "
-				"abort failed\n", cp);
+			dev_warn(&h->pdev->dev,
+				"%p reports abort failed\n", c);
 		break;
 		case CMD_UNSOLICITED_ABORT:
-			printk(KERN_WARNING "cciss: cp %p aborted "
-				"do to an unsolicited abort\n", cp);
+			dev_warn(&h->pdev->dev,
+				"%p aborted due to an unsolicited abort\n", c);
 		break;
 		case CMD_TIMEOUT:
-			printk(KERN_WARNING "cciss: cp %p timedout\n",
-				cp);
+			dev_warn(&h->pdev->dev, "%p timedout\n", c);
+		break;
+		case CMD_UNABORTABLE:
+			dev_warn(&h->pdev->dev,
+				"%p unabortable\n", c);
 		break;
 		default:
-			printk(KERN_WARNING "cciss: cp %p returned "
-				"unknown status %x\n", cp, 
-					ei->CommandStatus); 
+			dev_warn(&h->pdev->dev,
+				"%p returned unknown status %x\n",
+				c, ei->CommandStatus);
 	}
 }
 
 static int
-cciss_scsi_do_inquiry(ctlr_info_t *c, unsigned char *scsi3addr, 
+cciss_scsi_do_inquiry(ctlr_info_t *h, unsigned char *scsi3addr,
 	unsigned char page, unsigned char *buf,
 	unsigned char bufsize)
 {
 	int rc;
-	CommandList_struct *cp;
+	CommandList_struct *c;
 	char cdb[6];
 	ErrorInfo_struct *ei;
 	unsigned long flags;
 
-	spin_lock_irqsave(CCISS_LOCK(c->ctlr), flags);
-	cp = scsi_cmd_alloc(c);
-	spin_unlock_irqrestore(CCISS_LOCK(c->ctlr), flags);
+	spin_lock_irqsave(&h->lock, flags);
+	c = scsi_cmd_alloc(h);
+	spin_unlock_irqrestore(&h->lock, flags);
 
-	if (cp == NULL) {			/* trouble... */
+	if (c == NULL) {			/* trouble... */
 		printk("cmd_alloc returned NULL!\n");
 		return -1;
 	}
 
-	ei = cp->err_info; 
+	ei = c->err_info;
 
 	cdb[0] = CISS_INQUIRY;
 	cdb[1] = (page != 0);
@@ -1053,24 +1053,24 @@ cciss_scsi_do_inquiry(ctlr_info_t *c, unsigned char *scsi3addr,
 	cdb[3] = 0;
 	cdb[4] = bufsize;
 	cdb[5] = 0;
-	rc = cciss_scsi_do_simple_cmd(c, cp, scsi3addr, cdb, 
+	rc = cciss_scsi_do_simple_cmd(h, c, scsi3addr, cdb,
 				6, buf, bufsize, XFER_READ);
 
 	if (rc != 0) return rc; /* something went wrong */
 
 	if (ei->CommandStatus != 0 && 
 	    ei->CommandStatus != CMD_DATA_UNDERRUN) {
-		cciss_scsi_interpret_error(cp);
+		cciss_scsi_interpret_error(h, c);
 		rc = -1;
 	}
-	spin_lock_irqsave(CCISS_LOCK(c->ctlr), flags);
-	scsi_cmd_free(c, cp);
-	spin_unlock_irqrestore(CCISS_LOCK(c->ctlr), flags);
+	spin_lock_irqsave(&h->lock, flags);
+	scsi_cmd_free(h, c);
+	spin_unlock_irqrestore(&h->lock, flags);
 	return rc;	
 }
 
 /* Get the device id from inquiry page 0x83 */
-static int cciss_scsi_get_device_id(ctlr_info_t *c, unsigned char *scsi3addr,
+static int cciss_scsi_get_device_id(ctlr_info_t *h, unsigned char *scsi3addr,
 	unsigned char *device_id, int buflen)
 {
 	int rc;
@@ -1081,7 +1081,7 @@ static int cciss_scsi_get_device_id(ctlr_info_t *c, unsigned char *scsi3addr,
 	buf = kzalloc(64, GFP_KERNEL);
 	if (!buf)
 		return -1;
-	rc = cciss_scsi_do_inquiry(c, scsi3addr, 0x83, buf, 64);
+	rc = cciss_scsi_do_inquiry(h, scsi3addr, 0x83, buf, 64);
 	if (rc == 0)
 		memcpy(device_id, &buf[8], buflen);
 	kfree(buf);
@@ -1089,20 +1089,20 @@ static int cciss_scsi_get_device_id(ctlr_info_t *c, unsigned char *scsi3addr,
 }
 
 static int
-cciss_scsi_do_report_phys_luns(ctlr_info_t *c, 
+cciss_scsi_do_report_phys_luns(ctlr_info_t *h,
 		ReportLunData_struct *buf, int bufsize)
 {
 	int rc;
-	CommandList_struct *cp;
+	CommandList_struct *c;
 	unsigned char cdb[12];
 	unsigned char scsi3addr[8]; 
 	ErrorInfo_struct *ei;
 	unsigned long flags;
 
-	spin_lock_irqsave(CCISS_LOCK(c->ctlr), flags);
-	cp = scsi_cmd_alloc(c);
-	spin_unlock_irqrestore(CCISS_LOCK(c->ctlr), flags);
-	if (cp == NULL) {			/* trouble... */
+	spin_lock_irqsave(&h->lock, flags);
+	c = scsi_cmd_alloc(h);
+	spin_unlock_irqrestore(&h->lock, flags);
+	if (c == NULL) {			/* trouble... */
 		printk("cmd_alloc returned NULL!\n");
 		return -1;
 	}
@@ -1121,27 +1121,27 @@ cciss_scsi_do_report_phys_luns(ctlr_info_t *c,
 	cdb[10] = 0;
 	cdb[11] = 0;
 
-	rc = cciss_scsi_do_simple_cmd(c, cp, scsi3addr, 
+	rc = cciss_scsi_do_simple_cmd(h, c, scsi3addr,
 				cdb, 12, 
 				(unsigned char *) buf, 
 				bufsize, XFER_READ);
 
 	if (rc != 0) return rc; /* something went wrong */
 
-	ei = cp->err_info; 
+	ei = c->err_info;
 	if (ei->CommandStatus != 0 && 
 	    ei->CommandStatus != CMD_DATA_UNDERRUN) {
-		cciss_scsi_interpret_error(cp);
+		cciss_scsi_interpret_error(h, c);
 		rc = -1;
 	}
-	spin_lock_irqsave(CCISS_LOCK(c->ctlr), flags);
-	scsi_cmd_free(c, cp);
-	spin_unlock_irqrestore(CCISS_LOCK(c->ctlr), flags);
+	spin_lock_irqsave(&h->lock, flags);
+	scsi_cmd_free(h, c);
+	spin_unlock_irqrestore(&h->lock, flags);
 	return rc;	
 }
 
 static void
-cciss_update_non_disk_devices(int cntl_num, int hostno)
+cciss_update_non_disk_devices(ctlr_info_t *h, int hostno)
 {
 	/* the idea here is we could get notified from /proc
 	   that some devices have changed, so we do a report 
@@ -1174,7 +1174,6 @@ cciss_update_non_disk_devices(int cntl_num, int hostno)
 	ReportLunData_struct *ld_buff;
 	unsigned char *inq_buff;
 	unsigned char scsi3addr[8];
-	ctlr_info_t *c;
 	__u32 num_luns=0;
 	unsigned char *ch;
 	struct cciss_scsi_dev_t *currentsd, *this_device;
@@ -1182,7 +1181,6 @@ cciss_update_non_disk_devices(int cntl_num, int hostno)
 	int reportlunsize = sizeof(*ld_buff) + CISS_MAX_PHYS_LUN * 8;
 	int i;
 
-	c = (ctlr_info_t *) hba[cntl_num];	
 	ld_buff = kzalloc(reportlunsize, GFP_KERNEL);
 	inq_buff = kmalloc(OBDR_TAPE_INQ_SIZE, GFP_KERNEL);
 	currentsd = kzalloc(sizeof(*currentsd) *
@@ -1192,7 +1190,7 @@ cciss_update_non_disk_devices(int cntl_num, int hostno)
 		goto out;
 	}
 	this_device = &currentsd[CCISS_MAX_SCSI_DEVS_PER_HBA];
-	if (cciss_scsi_do_report_phys_luns(c, ld_buff, reportlunsize) == 0) {
+	if (cciss_scsi_do_report_phys_luns(h, ld_buff, reportlunsize) == 0) {
 		ch = &ld_buff->LUNListLength[0];
 		num_luns = ((ch[0]<<24) | (ch[1]<<16) | (ch[2]<<8) | ch[3]) / 8;
 		if (num_luns > CISS_MAX_PHYS_LUN) {
@@ -1216,7 +1214,7 @@ cciss_update_non_disk_devices(int cntl_num, int hostno)
 		memset(inq_buff, 0, OBDR_TAPE_INQ_SIZE);
 		memcpy(&scsi3addr[0], &ld_buff->LUN[i][0], 8);
 
-		if (cciss_scsi_do_inquiry(hba[cntl_num], scsi3addr, 0, inq_buff,
+		if (cciss_scsi_do_inquiry(h, scsi3addr, 0, inq_buff,
 			(unsigned char) OBDR_TAPE_INQ_SIZE) != 0)
 			/* Inquiry failed (msg printed already) */
 			continue; /* so we will skip this device. */
@@ -1234,7 +1232,7 @@ cciss_update_non_disk_devices(int cntl_num, int hostno)
 			sizeof(this_device->revision));
 		memset(this_device->device_id, 0,
 			sizeof(this_device->device_id));
-		cciss_scsi_get_device_id(hba[cntl_num], scsi3addr,
+		cciss_scsi_get_device_id(h, scsi3addr,
 			this_device->device_id, sizeof(this_device->device_id));
 
 		switch (this_device->devtype)
@@ -1261,7 +1259,7 @@ cciss_update_non_disk_devices(int cntl_num, int hostno)
 		  case 0x08: /* medium changer */
 			if (ncurrent >= CCISS_MAX_SCSI_DEVS_PER_HBA) {
 				printk(KERN_INFO "cciss%d: %s ignored, "
-					"too many devices.\n", cntl_num,
+					"too many devices.\n", h->ctlr,
 					scsi_device_type(this_device->devtype));
 				break;
 			}
@@ -1273,7 +1271,7 @@ cciss_update_non_disk_devices(int cntl_num, int hostno)
 		}
 	}
 
-	adjust_cciss_scsi_table(cntl_num, hostno, currentsd, ncurrent);
+	adjust_cciss_scsi_table(h, hostno, currentsd, ncurrent);
 out:
 	kfree(inq_buff);
 	kfree(ld_buff);
@@ -1292,125 +1290,136 @@ is_keyword(char *ptr, int len, char *verb)  // Thanks to ncr53c8xx.c
 }
 
 static int
-cciss_scsi_user_command(int ctlr, int hostno, char *buffer, int length)
+cciss_scsi_user_command(ctlr_info_t *h, int hostno, char *buffer, int length)
 {
 	int arg_len;
 
 	if ((arg_len = is_keyword(buffer, length, "rescan")) != 0)
-		cciss_update_non_disk_devices(ctlr, hostno);
+		cciss_update_non_disk_devices(h, hostno);
 	else
 		return -EINVAL;
 	return length;
 }
 
-
 static int
-cciss_scsi_proc_info(struct Scsi_Host *sh,
+cciss_scsi_write_info(struct Scsi_Host *sh,
 		char *buffer, /* data buffer */
-		char **start, 	   /* where data in buffer starts */
-		off_t offset,	   /* offset from start of imaginary file */
-		int length, 	   /* length of data in buffer */
-		int func)	   /* 0 == read, 1 == write */
+		int length) 	   /* length of data in buffer */
 {
+	ctlr_info_t *h = (ctlr_info_t *) sh->hostdata[0];
+	if (h == NULL)  /* This really shouldn't ever happen. */
+		return -EINVAL;
 
-	int buflen, datalen;
-	ctlr_info_t *ci;
-	int i;
-	int cntl_num;
+	return cciss_scsi_user_command(h, sh->host_no,
+			buffer, length);	
+} 
+
+static int
+cciss_scsi_show_info(struct seq_file *m, struct Scsi_Host *sh)
+{
 
+	ctlr_info_t *h = (ctlr_info_t *) sh->hostdata[0];
+	int i;
 
-	ci = (ctlr_info_t *) sh->hostdata[0];
-	if (ci == NULL)  /* This really shouldn't ever happen. */
+	if (h == NULL)  /* This really shouldn't ever happen. */
 		return -EINVAL;
 
-	cntl_num = ci->ctlr;	/* Get our index into the hba[] array */
-
-	if (func == 0) {	/* User is reading from /proc/scsi/ciss*?/?*  */
-		buflen = sprintf(buffer, "cciss%d: SCSI host: %d\n",
-				cntl_num, sh->host_no);
-
-		/* this information is needed by apps to know which cciss
-		   device corresponds to which scsi host number without
-		   having to open a scsi target device node.  The device
-		   information is not a duplicate of /proc/scsi/scsi because
-		   the two may be out of sync due to scsi hotplug, rather
-		   this info is for an app to be able to use to know how to
-		   get them back in sync. */
-
-		for (i=0;i<ccissscsi[cntl_num].ndevices;i++) {
-			struct cciss_scsi_dev_t *sd = &ccissscsi[cntl_num].dev[i];
-			buflen += sprintf(&buffer[buflen], "c%db%dt%dl%d %02d "
-				"0x%02x%02x%02x%02x%02x%02x%02x%02x\n",
-				sh->host_no, sd->bus, sd->target, sd->lun,
-				sd->devtype,
-				sd->scsi3addr[0], sd->scsi3addr[1],
-				sd->scsi3addr[2], sd->scsi3addr[3],
-				sd->scsi3addr[4], sd->scsi3addr[5],
-				sd->scsi3addr[6], sd->scsi3addr[7]);
-		}
-		datalen = buflen - offset;
-		if (datalen < 0) { 	/* they're reading past EOF. */
-			datalen = 0;
-			*start = buffer+buflen;	
-		} else
-			*start = buffer + offset;
-		return(datalen);
-	} else 	/* User is writing to /proc/scsi/cciss*?/?*  ... */
-		return cciss_scsi_user_command(cntl_num, sh->host_no,
-			buffer, length);	
-} 
+	seq_printf(m, "cciss%d: SCSI host: %d\n",
+			h->ctlr, sh->host_no);
+
+	/* This information is needed by apps to know which cciss
+	   device corresponds to which scsi host number without
+	   having to open a scsi target device node.  The device
+	   information is not a duplicate of /proc/scsi/scsi: the
+	   two may be out of sync due to scsi hotplug, and an app
+	   can use this info to work out how to get them back in
+	   sync. */
+
+	for (i = 0; i < ccissscsi[h->ctlr].ndevices; i++) {
+		struct cciss_scsi_dev_t *sd =
+			&ccissscsi[h->ctlr].dev[i];
+		seq_printf(m, "c%db%dt%dl%d %02d "
+			"0x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+			sh->host_no, sd->bus, sd->target, sd->lun,
+			sd->devtype,
+			sd->scsi3addr[0], sd->scsi3addr[1],
+			sd->scsi3addr[2], sd->scsi3addr[3],
+			sd->scsi3addr[4], sd->scsi3addr[5],
+			sd->scsi3addr[6], sd->scsi3addr[7]);
+	}
+	return 0;
+}
 
 /* cciss_scatter_gather takes a struct scsi_cmnd, (cmd), and does the pci 
    dma mapping  and fills in the scatter gather entries of the 
-   cciss command, cp. */
+   cciss command, c. */
 
-static void
-cciss_scatter_gather(struct pci_dev *pdev, 
-		CommandList_struct *cp,	
-		struct scsi_cmnd *cmd)
+static void cciss_scatter_gather(ctlr_info_t *h, CommandList_struct *c,
+	struct scsi_cmnd *cmd)
 {
 	unsigned int len;
 	struct scatterlist *sg;
 	__u64 addr64;
-	int use_sg, i;
-
-	BUG_ON(scsi_sg_count(cmd) > MAXSGENTRIES);
-
-	use_sg = scsi_dma_map(cmd);
-	if (use_sg) {	/* not too many addrs? */
-		scsi_for_each_sg(cmd, sg, use_sg, i) {
+	int request_nsgs, i, chained, sg_index;
+	struct cciss_scsi_adapter_data_t *sa = h->scsi_ctlr;
+	SGDescriptor_struct *curr_sg;
+
+	BUG_ON(scsi_sg_count(cmd) > h->maxsgentries);
+
+	chained = 0;
+	sg_index = 0;
+	curr_sg = c->SG;
+	request_nsgs = scsi_dma_map(cmd);
+	if (request_nsgs) {
+		scsi_for_each_sg(cmd, sg, request_nsgs, i) {
+			if (sg_index + 1 == h->max_cmd_sgentries &&
+				!chained && request_nsgs - i > 1) {
+				chained = 1;
+				sg_index = 0;
+				curr_sg = sa->cmd_sg_list[c->cmdindex];
+			}
 			addr64 = (__u64) sg_dma_address(sg);
 			len  = sg_dma_len(sg);
-			cp->SG[i].Addr.lower =
-				(__u32) (addr64 & (__u64) 0x00000000FFFFFFFF);
-			cp->SG[i].Addr.upper =
-				(__u32) ((addr64 >> 32) & (__u64) 0x00000000FFFFFFFF);
-			cp->SG[i].Len = len;
-			cp->SG[i].Ext = 0;  // we are not chaining
+			curr_sg[sg_index].Addr.lower =
+				(__u32) (addr64 & 0x0FFFFFFFFULL);
+			curr_sg[sg_index].Addr.upper =
+				(__u32) ((addr64 >> 32) & 0x0FFFFFFFFULL);
+			curr_sg[sg_index].Len = len;
+			curr_sg[sg_index].Ext = 0;
+			++sg_index;
 		}
+		if (chained)
+			cciss_map_sg_chain_block(h, c,
+				sa->cmd_sg_list[c->cmdindex],
+				(request_nsgs - (h->max_cmd_sgentries - 1)) *
+					sizeof(SGDescriptor_struct));
 	}
-
-	cp->Header.SGList = (__u8) use_sg;   /* no. SGs contig in this cmd */
-	cp->Header.SGTotal = (__u16) use_sg; /* total sgs in this cmd list */
+	/* track how many SG entries we are using */
+	if (request_nsgs > h->maxSG)
+		h->maxSG = request_nsgs;
+	c->Header.SGTotal = (u16) request_nsgs + chained;
+	if (request_nsgs > h->max_cmd_sgentries)
+		c->Header.SGList = h->max_cmd_sgentries;
+	else
+		c->Header.SGList = c->Header.SGTotal;
 	return;
 }
 
 
 static int
-cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd *))
+cciss_scsi_queue_command_lck(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
 {
-	ctlr_info_t **c;
-	int ctlr, rc;
+	ctlr_info_t *h;
+	int rc;
 	unsigned char scsi3addr[8];
-	CommandList_struct *cp;
+	CommandList_struct *c;
 	unsigned long flags;
 
 	// Get the ptr to our adapter structure (hba[i]) out of cmd->host.
 	// We violate cmd->host privacy here.  (Is there another way?)
-	c = (ctlr_info_t **) &cmd->device->host->hostdata[0];	
-	ctlr = (*c)->ctlr;
+	h = (ctlr_info_t *) cmd->device->host->hostdata[0];
 
-	rc = lookup_scsi3addr(ctlr, cmd->device->channel, cmd->device->id, 
+	rc = lookup_scsi3addr(h, cmd->device->channel, cmd->device->id,
 			cmd->device->lun, scsi3addr);
 	if (rc != 0) {
 		/* the scsi nexus does not match any that we presented... */
@@ -1422,19 +1431,14 @@ cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd
 		return 0;
 	}
 
-	/* printk("cciss_queue_command, p=%p, cmd=0x%02x, c%db%dt%dl%d\n", 
-		cmd, cmd->cmnd[0], ctlr, cmd->channel, cmd->target, cmd->lun);*/
-	// printk("q:%p:c%db%dt%dl%d ", cmd, ctlr, cmd->channel, 
-	//	cmd->target, cmd->lun);
-
 	/* Ok, we have a reasonable scsi nexus, so send the cmd down, and
            see what the device thinks of it. */
 
-	spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
-	cp = scsi_cmd_alloc(*c);
-	spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
-	if (cp == NULL) {			/* trouble... */
-		printk("scsi_cmd_alloc returned NULL!\n");
+	spin_lock_irqsave(&h->lock, flags);
+	c = scsi_cmd_alloc(h);
+	spin_unlock_irqrestore(&h->lock, flags);
+	if (c == NULL) {			/* trouble... */
+		dev_warn(&h->pdev->dev, "scsi_cmd_alloc returned NULL!\n");
 		/* FIXME: next 3 lines are -> BAD! <- */
 		cmd->result = DID_NO_CONNECT << 16;
 		done(cmd);
@@ -1445,35 +1449,41 @@ cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd
 
 	cmd->scsi_done = done;    // save this for use by completion code 
 
-	// save cp in case we have to abort it 
-	cmd->host_scribble = (unsigned char *) cp; 
+	/* save c in case we have to abort it */
+	cmd->host_scribble = (unsigned char *) c;
 
-	cp->cmd_type = CMD_SCSI;
-	cp->scsi_cmd = cmd;
-	cp->Header.ReplyQueue = 0;  // unused in simple mode
-	memcpy(&cp->Header.LUN.LunAddrBytes[0], &scsi3addr[0], 8);
-	cp->Header.Tag.lower = cp->busaddr;  // Use k. address of cmd as tag
+	c->cmd_type = CMD_SCSI;
+	c->scsi_cmd = cmd;
+	c->Header.ReplyQueue = 0;  /* unused in simple mode */
+	memcpy(&c->Header.LUN.LunAddrBytes[0], &scsi3addr[0], 8);
+	c->Header.Tag.lower = c->busaddr;  /* Use k. address of cmd as tag */
 	
 	// Fill in the request block...
 
-	cp->Request.Timeout = 0;
-	memset(cp->Request.CDB, 0, sizeof(cp->Request.CDB));
-	BUG_ON(cmd->cmd_len > sizeof(cp->Request.CDB));
-	cp->Request.CDBLen = cmd->cmd_len;
-	memcpy(cp->Request.CDB, cmd->cmnd, cmd->cmd_len);
-	cp->Request.Type.Type = TYPE_CMD;
-	cp->Request.Type.Attribute = ATTR_SIMPLE;
+	c->Request.Timeout = 0;
+	memset(c->Request.CDB, 0, sizeof(c->Request.CDB));
+	BUG_ON(cmd->cmd_len > sizeof(c->Request.CDB));
+	c->Request.CDBLen = cmd->cmd_len;
+	memcpy(c->Request.CDB, cmd->cmnd, cmd->cmd_len);
+	c->Request.Type.Type = TYPE_CMD;
+	c->Request.Type.Attribute = ATTR_SIMPLE;
 	switch(cmd->sc_data_direction)
 	{
-	  case DMA_TO_DEVICE: cp->Request.Type.Direction = XFER_WRITE; break;
-	  case DMA_FROM_DEVICE: cp->Request.Type.Direction = XFER_READ; break;
-	  case DMA_NONE: cp->Request.Type.Direction = XFER_NONE; break;
+	  case DMA_TO_DEVICE:
+		c->Request.Type.Direction = XFER_WRITE;
+		break;
+	  case DMA_FROM_DEVICE:
+		c->Request.Type.Direction = XFER_READ;
+		break;
+	  case DMA_NONE:
+		c->Request.Type.Direction = XFER_NONE;
+		break;
 	  case DMA_BIDIRECTIONAL:
 		// This can happen if a buggy application does a scsi passthru
 		// and sets both inlen and outlen to non-zero. ( see
 		// ../scsi/scsi_ioctl.c:scsi_ioctl_send_command() )
 
-	  	cp->Request.Type.Direction = XFER_RSVD;
+		c->Request.Type.Direction = XFER_RSVD;
 		// This is technically wrong, and cciss controllers should
 		// reject it with CMD_INVALID, which is the most correct 
 		// response, but non-fibre backends appear to let it 
@@ -1484,28 +1494,20 @@ cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd
 		break;
 
 	  default: 
-		printk("cciss: unknown data direction: %d\n", 
+		dev_warn(&h->pdev->dev, "unknown data direction: %d\n",
 			cmd->sc_data_direction);
 		BUG();
 		break;
 	}
-
-	cciss_scatter_gather((*c)->pdev, cp, cmd); // Fill the SG list
-
-	/* Put the request on the tail of the request queue */
-
-	spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
-	addQ(&(*c)->reqQ, cp);
-	(*c)->Qdepth++;
-	start_io(*c);
-	spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
-
+	cciss_scatter_gather(h, c, cmd);
+	enqueue_cmd_and_start_io(h, c);
 	/* the cmd'll come back via intr handler in complete_scsi_command()  */
 	return 0;
 }
 
-static void 
-cciss_unregister_scsi(int ctlr)
+static DEF_SCSI_QCMD(cciss_scsi_queue_command)
+
+static void cciss_unregister_scsi(ctlr_info_t *h)
 {
 	struct cciss_scsi_adapter_data_t *sa;
 	struct cciss_scsi_cmd_stack_t *stk;
@@ -1513,59 +1515,58 @@ cciss_unregister_scsi(int ctlr)
 
 	/* we are being forcibly unloaded, and may not refuse. */
 
-	spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
-	sa = (struct cciss_scsi_adapter_data_t *) hba[ctlr]->scsi_ctlr;
+	spin_lock_irqsave(&h->lock, flags);
+	sa = h->scsi_ctlr;
 	stk = &sa->cmd_stack; 
 
 	/* if we weren't ever actually registered, don't unregister */ 
 	if (sa->registered) {
-		spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
+		spin_unlock_irqrestore(&h->lock, flags);
 		scsi_remove_host(sa->scsi_host);
 		scsi_host_put(sa->scsi_host);
-		spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
+		spin_lock_irqsave(&h->lock, flags);
 	}
 
 	/* set scsi_host to NULL so our detect routine will 
 	   find us on register */
 	sa->scsi_host = NULL;
-	spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
-	scsi_cmd_stack_free(ctlr);
+	spin_unlock_irqrestore(&h->lock, flags);
+	scsi_cmd_stack_free(h);
 	kfree(sa);
 }
 
-static int 
-cciss_engage_scsi(int ctlr)
+static int cciss_engage_scsi(ctlr_info_t *h)
 {
 	struct cciss_scsi_adapter_data_t *sa;
 	struct cciss_scsi_cmd_stack_t *stk;
 	unsigned long flags;
 
-	spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
-	sa = (struct cciss_scsi_adapter_data_t *) hba[ctlr]->scsi_ctlr;
+	spin_lock_irqsave(&h->lock, flags);
+	sa = h->scsi_ctlr;
 	stk = &sa->cmd_stack; 
 
 	if (sa->registered) {
-		printk("cciss%d: SCSI subsystem already engaged.\n", ctlr);
-		spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
-		return ENXIO;
+		dev_info(&h->pdev->dev, "SCSI subsystem already engaged.\n");
+		spin_unlock_irqrestore(&h->lock, flags);
+		return -ENXIO;
 	}
 	sa->registered = 1;
-	spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
-	cciss_update_non_disk_devices(ctlr, -1);
-	cciss_scsi_detect(ctlr);
+	spin_unlock_irqrestore(&h->lock, flags);
+	cciss_update_non_disk_devices(h, -1);
+	cciss_scsi_detect(h);
 	return 0;
 }
 
 static void
-cciss_seq_tape_report(struct seq_file *seq, int ctlr)
+cciss_seq_tape_report(struct seq_file *seq, ctlr_info_t *h)
 {
 	unsigned long flags;
 
-	CPQ_TAPE_LOCK(ctlr, flags);
+	CPQ_TAPE_LOCK(h, flags);
 	seq_printf(seq,
 		"Sequential access devices: %d\n\n",
-			ccissscsi[ctlr].ndevices);
-	CPQ_TAPE_UNLOCK(ctlr, flags);
+			ccissscsi[h->ctlr].ndevices);
+	CPQ_TAPE_UNLOCK(h, flags);
 }
 
 static int wait_for_device_to_become_ready(ctlr_info_t *h,
@@ -1576,10 +1577,10 @@ static int wait_for_device_to_become_ready(ctlr_info_t *h,
 	int waittime = HZ;
 	CommandList_struct *c;
 
-	c = cmd_alloc(h, 1);
+	c = cmd_alloc(h);
 	if (!c) {
-		printk(KERN_WARNING "cciss%d: out of memory in "
-			"wait_for_device_to_become_ready.\n", h->ctlr);
+		dev_warn(&h->pdev->dev, "out of memory in "
+			"wait_for_device_to_become_ready.\n");
 		return IO_ERROR;
 	}
 
@@ -1597,7 +1598,7 @@ static int wait_for_device_to_become_ready(ctlr_info_t *h,
 			waittime = waittime * 2;
 
 		/* Send the Test Unit Ready */
-		rc = fill_cmd(c, TEST_UNIT_READY, h->ctlr, NULL, 0, 0,
+		rc = fill_cmd(h, c, TEST_UNIT_READY, NULL, 0, 0,
 			lunaddr, TYPE_CMD);
 		if (rc == 0)
 			rc = sendcmd_withirq_core(h, c, 0);
@@ -1623,18 +1624,18 @@ static int wait_for_device_to_become_ready(ctlr_info_t *h,
 			}
 		}
 retry_tur:
-		printk(KERN_WARNING "cciss%d: Waiting %d secs "
+		dev_warn(&h->pdev->dev, "Waiting %d secs "
 			"for device to become ready.\n",
-			h->ctlr, waittime / HZ);
+			waittime / HZ);
 		rc = 1; /* device not ready. */
 	}
 
 	if (rc)
-		printk("cciss%d: giving up on device.\n", h->ctlr);
+		dev_warn(&h->pdev->dev, "giving up on device.\n");
 	else
-		printk(KERN_WARNING "cciss%d: device is ready.\n", h->ctlr);
+		dev_warn(&h->pdev->dev, "device is ready.\n");
 
-	cmd_free(h, c, 1);
+	cmd_free(h, c);
 	return rc;
 }
 
@@ -1654,26 +1655,24 @@ static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd)
 	int rc;
 	CommandList_struct *cmd_in_trouble;
 	unsigned char lunaddr[8];
-	ctlr_info_t **c;
-	int ctlr;
+	ctlr_info_t *h;
 
 	/* find the controller to which the command to be aborted was sent */
-	c = (ctlr_info_t **) &scsicmd->device->host->hostdata[0];	
-	if (c == NULL) /* paranoia */
+	h = (ctlr_info_t *) scsicmd->device->host->hostdata[0];
+	if (h == NULL) /* paranoia */
 		return FAILED;
-	ctlr = (*c)->ctlr;
-	printk(KERN_WARNING "cciss%d: resetting tape drive or medium changer.\n", ctlr);
+	dev_warn(&h->pdev->dev, "resetting tape drive or medium changer.\n");
 	/* find the command that's giving us trouble */
 	cmd_in_trouble = (CommandList_struct *) scsicmd->host_scribble;
 	if (cmd_in_trouble == NULL) /* paranoia */
 		return FAILED;
 	memcpy(lunaddr, &cmd_in_trouble->Header.LUN.LunAddrBytes[0], 8);
 	/* send a reset to the SCSI LUN which the command was sent to */
-	rc = sendcmd_withirq(CCISS_RESET_MSG, ctlr, NULL, 0, 0, lunaddr,
+	rc = sendcmd_withirq(h, CCISS_RESET_MSG, NULL, 0, 0, lunaddr,
 		TYPE_MSG);
-	if (rc == 0 && wait_for_device_to_become_ready(*c, lunaddr) == 0)
+	if (rc == 0 && wait_for_device_to_become_ready(h, lunaddr) == 0)
 		return SUCCESS;
-	printk(KERN_WARNING "cciss%d: resetting device failed.\n", ctlr);
+	dev_warn(&h->pdev->dev, "resetting device failed.\n");
 	return FAILED;
 }
 
@@ -1682,22 +1681,20 @@ static int  cciss_eh_abort_handler(struct scsi_cmnd *scsicmd)
 	int rc;
 	CommandList_struct *cmd_to_abort;
 	unsigned char lunaddr[8];
-	ctlr_info_t **c;
-	int ctlr;
+	ctlr_info_t *h;
 
 	/* find the controller to which the command to be aborted was sent */
-	c = (ctlr_info_t **) &scsicmd->device->host->hostdata[0];	
-	if (c == NULL) /* paranoia */
+	h = (ctlr_info_t *) scsicmd->device->host->hostdata[0];
+	if (h == NULL) /* paranoia */
 		return FAILED;
-	ctlr = (*c)->ctlr;
-	printk(KERN_WARNING "cciss%d: aborting tardy SCSI cmd\n", ctlr);
+	dev_warn(&h->pdev->dev, "aborting tardy SCSI cmd\n");
 
 	/* find the command to be aborted */
 	cmd_to_abort = (CommandList_struct *) scsicmd->host_scribble;
 	if (cmd_to_abort == NULL) /* paranoia */
 		return FAILED;
 	memcpy(lunaddr, &cmd_to_abort->Header.LUN.LunAddrBytes[0], 8);
-	rc = sendcmd_withirq(CCISS_ABORT_MSG, ctlr, &cmd_to_abort->Header.Tag,
+	rc = sendcmd_withirq(h, CCISS_ABORT_MSG, &cmd_to_abort->Header.Tag,
 		0, 0, lunaddr, TYPE_MSG);
 	if (rc == 0)
 		return SUCCESS;
@@ -1710,5 +1707,6 @@ static int  cciss_eh_abort_handler(struct scsi_cmnd *scsicmd)
 /* If no tape support, then these become defined out of existence */
 
 #define cciss_scsi_setup(cntl_num)
+#define cciss_engage_scsi(h)
 
 #endif /* CONFIG_CISS_SCSI_TAPE */
diff --git a/drivers/block/cciss_scsi.h b/drivers/block/cciss_scsi.h
index 7b750245ae7..e71d986727c 100644
--- a/drivers/block/cciss_scsi.h
+++ b/drivers/block/cciss_scsi.h
@@ -25,24 +25,20 @@
 
 #include <scsi/scsicam.h> /* possibly irrelevant, since we don't show disks */
 
-		// the scsi id of the adapter...
+		/* the scsi id of the adapter... */
 #define SELF_SCSI_ID 15
-		// 15 is somewhat arbitrary, since the scsi-2 bus
-		// that's presented by the driver to the OS is
-		// fabricated.  The "real" scsi-3 bus the 
-		// hardware presents is fabricated too.
-		// The actual, honest-to-goodness physical
-		// bus that the devices are attached to is not 
-		// addressible natively, and may in fact turn
-		// out to be not scsi at all.
+		/* 15 is somewhat arbitrary, since the scsi-2 bus
+		   that's presented by the driver to the OS is
+		   fabricated.  The "real" scsi-3 bus the
+		   hardware presents is fabricated too.
+		   The actual, honest-to-goodness physical
+		   bus that the devices are attached to is not
+		   addressable natively, and may in fact turn
+		   out to be not scsi at all. */
 
-#define SCSI_CCISS_CAN_QUEUE 2
 
 /* 
 
-Note, cmd_per_lun could give us some trouble, so I'm setting it very low.
-Likewise, SCSI_CCISS_CAN_QUEUE is set very conservatively.
-
 If the upper scsi layer tries to track how many commands we have 
 outstanding, it will be operating under the misapprehension that it is
 the only one sending us requests.  We also have the block interface,
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 44fa2018f6b..2b944038453 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -32,8 +32,10 @@
 #include <linux/blkpg.h>
 #include <linux/timer.h>
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/hdreg.h>
+#include <linux/mutex.h>
 #include <linux/spinlock.h>
 #include <linux/blkdev.h>
 #include <linux/genhd.h>
@@ -66,6 +68,7 @@ MODULE_LICENSE("GPL");
 
 #define CPQARRAY_DMA_MASK	0xFFFFFFFF	/* 32 bit DMA */
 
+static DEFINE_MUTEX(cpqarray_mutex);
 static int nr_ctlr;
 static ctlr_info_t *hba[MAX_CTLR];
 
@@ -156,8 +159,8 @@ static int sendcmd(
 	unsigned int blkcnt,
 	unsigned int log_unit );
 
-static int ida_open(struct block_device *bdev, fmode_t mode);
-static int ida_release(struct gendisk *disk, fmode_t mode);
+static int ida_unlocked_open(struct block_device *bdev, fmode_t mode);
+static void ida_release(struct gendisk *disk, fmode_t mode);
 static int ida_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg);
 static int ida_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 static int ida_ctlr_ioctl(ctlr_info_t *h, int dsk, ida_ioctl_t *io);
@@ -177,7 +180,6 @@ static int cpqarray_register_ctlr(int ctlr, struct pci_dev *pdev);
 
 #ifdef CONFIG_PROC_FS
 static void ida_procinit(int i);
-static int ida_proc_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data);
 #else
 static void ida_procinit(int i) {}
 #endif
@@ -193,11 +195,11 @@ static inline ctlr_info_t *get_host(struct gendisk *disk)
 }
 
 
-static struct block_device_operations ida_fops  = {
+static const struct block_device_operations ida_fops  = {
 	.owner		= THIS_MODULE,
-	.open		= ida_open,
+	.open		= ida_unlocked_open,
 	.release	= ida_release,
-	.locked_ioctl	= ida_ioctl,
+	.ioctl		= ida_ioctl,
 	.getgeo		= ida_getgeo,
 	.revalidate_disk= ida_revalidate,
 };
@@ -206,6 +208,7 @@ static struct block_device_operations ida_fops  = {
 #ifdef CONFIG_PROC_FS
 
 static struct proc_dir_entry *proc_array;
+static const struct file_operations ida_proc_fops;
 
 /*
  * Get us a file in /proc/array that says something about each controller.
@@ -218,19 +221,16 @@ static void __init ida_procinit(int i)
 		if (!proc_array) return;
 	}
 
-	create_proc_read_entry(hba[i]->devname, 0, proc_array,
-			       ida_proc_get_info, hba[i]);
+	proc_create_data(hba[i]->devname, 0, proc_array, &ida_proc_fops, hba[i]);
 }
 
 /*
  * Report information about this controller.
  */
-static int ida_proc_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data)
+static int ida_proc_show(struct seq_file *m, void *v)
 {
-	off_t pos = 0;
-	off_t len = 0;
-	int size, i, ctlr;
-	ctlr_info_t *h = (ctlr_info_t*)data;
+	int i, ctlr;
+	ctlr_info_t *h = (ctlr_info_t*)m->private;
 	drv_info_t *drv;
 #ifdef CPQ_PROC_PRINT_QUEUES
 	cmdlist_t *c;
@@ -238,7 +238,7 @@ static int ida_proc_get_info(char *buffer, char **start, off_t offset, int lengt
 #endif
 
 	ctlr = h->ctlr;
-	size = sprintf(buffer, "%s:  Compaq %s Controller\n"
+	seq_printf(m, "%s:  Compaq %s Controller\n"
 		"       Board ID: 0x%08lx\n"
 		"       Firmware Revision: %c%c%c%c\n"
 		"       Controller Sig: 0x%08lx\n"
@@ -258,55 +258,54 @@ static int ida_proc_get_info(char *buffer, char **start, off_t offset, int lengt
 		h->log_drives, h->phys_drives,
 		h->Qdepth, h->maxQsinceinit);
 
-	pos += size; len += size;
-	
-	size = sprintf(buffer+len, "Logical Drive Info:\n");
-	pos += size; len += size;
+	seq_puts(m, "Logical Drive Info:\n");
 
 	for(i=0; i<h->log_drives; i++) {
 		drv = &h->drv[i];
-		size = sprintf(buffer+len, "ida/c%dd%d: blksz=%d nr_blks=%d\n",
+		seq_printf(m, "ida/c%dd%d: blksz=%d nr_blks=%d\n",
 				ctlr, i, drv->blk_size, drv->nr_blks);
-		pos += size; len += size;
 	}
 
 #ifdef CPQ_PROC_PRINT_QUEUES
 	spin_lock_irqsave(IDA_LOCK(h->ctlr), flags); 
-	size = sprintf(buffer+len, "\nCurrent Queues:\n");
-	pos += size; len += size;
+	seq_puts(m, "\nCurrent Queues:\n");
 
 	c = h->reqQ;
-	size = sprintf(buffer+len, "reqQ = %p", c); pos += size; len += size;
+	seq_printf(m, "reqQ = %p", c);
 	if (c) c=c->next;
 	while(c && c != h->reqQ) {
-		size = sprintf(buffer+len, "->%p", c);
-		pos += size; len += size;
+		seq_printf(m, "->%p", c);
 		c=c->next;
 	}
 
 	c = h->cmpQ;
-	size = sprintf(buffer+len, "\ncmpQ = %p", c); pos += size; len += size;
+	seq_printf(m, "\ncmpQ = %p", c);
 	if (c) c=c->next;
 	while(c && c != h->cmpQ) {
-		size = sprintf(buffer+len, "->%p", c);
-		pos += size; len += size;
+		seq_printf(m, "->%p", c);
 		c=c->next;
 	}
 
-	size = sprintf(buffer+len, "\n"); pos += size; len += size;
+	seq_putc(m, '\n');
 	spin_unlock_irqrestore(IDA_LOCK(h->ctlr), flags); 
 #endif
-	size = sprintf(buffer+len, "nr_allocs = %d\nnr_frees = %d\n",
+	seq_printf(m, "nr_allocs = %d\nnr_frees = %d\n",
 			h->nr_allocs, h->nr_frees);
-	pos += size; len += size;
-
-	*eof = 1;
-	*start = buffer+offset;
-	len -= offset;
-	if (len>length)
-		len = length;
-	return len;
+	return 0;
+}
+
+static int ida_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ida_proc_show, PDE_DATA(inode));
 }
+
+static const struct file_operations ida_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ida_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
 #endif /* CONFIG_PROC_FS */
 
 module_param_array(eisa, int, NULL, 0);
@@ -321,7 +320,7 @@ static void release_io_mem(ctlr_info_t *c)
 	c->io_mem_length = 0;
 }
 
-static void __devexit cpqarray_remove_one(int i)
+static void cpqarray_remove_one(int i)
 {
 	int j;
 	char buff[4];
@@ -353,7 +352,7 @@ static void __devexit cpqarray_remove_one(int i)
 	free_hba(i);
 }
 
-static void __devexit cpqarray_remove_one_pci (struct pci_dev *pdev)
+static void cpqarray_remove_one_pci(struct pci_dev *pdev)
 {
 	int i;
 	ctlr_info_t *tmp_ptr;
@@ -378,7 +377,7 @@ static void __devexit cpqarray_remove_one_pci (struct pci_dev *pdev)
 /* removing an instance that was not removed automatically..
  * must be an eisa card.
  */
-static void __devexit cpqarray_remove_one_eisa (int i)
+static void cpqarray_remove_one_eisa(int i)
 {
 	if (hba[i] == NULL) {
 		printk(KERN_ERR "cpqarray: controller %d appears to have"
@@ -389,7 +388,7 @@ static void __devexit cpqarray_remove_one_eisa (int i)
 }
 
 /* pdev is NULL for eisa */
-static int __init cpqarray_register_ctlr( int i, struct pci_dev *pdev)
+static int cpqarray_register_ctlr(int i, struct pci_dev *pdev)
 {
 	struct request_queue *q;
 	int j;
@@ -451,11 +450,8 @@ static int __init cpqarray_register_ctlr( int i, struct pci_dev *pdev)
 		blk_queue_bounce_limit(q, hba[i]->pci_dev->dma_mask);
 
 	/* This is a hardware imposed limit. */
-	blk_queue_max_hw_segments(q, SG_MAX);
+	blk_queue_max_segments(q, SG_MAX);
 
-	/* This is a driver limit and could be eliminated. */
-	blk_queue_max_phys_segments(q, SG_MAX);
-	
 	init_timer(&hba[i]->timer);
 	hba[i]->timer.expires = jiffies + IDA_TIMER;
 	hba[i]->timer.data = (unsigned long)hba[i];
@@ -509,8 +505,8 @@ Enomem4:
 	return -1;
 }
 
-static int __init cpqarray_init_one( struct pci_dev *pdev,
-	const struct pci_device_id *ent)
+static int cpqarray_init_one(struct pci_dev *pdev,
+			     const struct pci_device_id *ent)
 {
 	int i;
 
@@ -540,7 +536,7 @@ static int __init cpqarray_init_one( struct pci_dev *pdev,
 static struct pci_driver cpqarray_pci_driver = {
 	.name = "cpqarray",
 	.probe = cpqarray_init_one,
-	.remove = __devexit_p(cpqarray_remove_one_pci),
+	.remove = cpqarray_remove_one_pci,
 	.id_table = cpqarray_pci_device_id,
 };
 
@@ -624,6 +620,7 @@ static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
 	}
 	vendor_id = pdev->vendor;
 	device_id = pdev->device;
+	revision  = pdev->revision;
 	irq = pdev->irq;
 
 	for(i=0; i<6; i++)
@@ -636,7 +633,6 @@ static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
 	}
 
 	pci_read_config_word(pdev, PCI_COMMAND, &command);
-	pci_read_config_byte(pdev, PCI_CLASS_REVISION, &revision);
 	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &cache_line_size);
 	pci_read_config_byte(pdev, PCI_LATENCY_TIMER, &latency_timer);
 
@@ -746,7 +742,7 @@ __setup("smart2=", cpqarray_setup);
 /*
  * Find an EISA controller's signature.  Set up an hba if we find it.
  */
-static int __init cpqarray_eisa_detect(void)
+static int cpqarray_eisa_detect(void)
 {
 	int i=0, j;
 	__u32 board_id;
@@ -846,14 +842,28 @@ static int ida_open(struct block_device *bdev, fmode_t mode)
 	return 0;
 }
 
+static int ida_unlocked_open(struct block_device *bdev, fmode_t mode)
+{
+	int ret;
+
+	mutex_lock(&cpqarray_mutex);
+	ret = ida_open(bdev, mode);
+	mutex_unlock(&cpqarray_mutex);
+
+	return ret;
+}
+
 /*
  * Close.  Sync first.
  */
-static int ida_release(struct gendisk *disk, fmode_t mode)
+static void ida_release(struct gendisk *disk, fmode_t mode)
 {
-	ctlr_info_t *host = get_host(disk);
+	ctlr_info_t *host;
+
+	mutex_lock(&cpqarray_mutex);
+	host = get_host(disk);
 	host->usage_count--;
-	return 0;
+	mutex_unlock(&cpqarray_mutex);
 }
 
 /*
@@ -899,9 +909,6 @@ static void do_ida_request(struct request_queue *q)
 	struct scatterlist tmp_sg[SG_MAX];
 	int i, dir, seg;
 
-	if (blk_queue_plugged(q))
-		goto startio;
-
 queue_next:
 	creq = blk_peek_request(q);
 	if (!creq)
@@ -1134,7 +1141,7 @@ static int ida_getgeo(struct block_device *bdev, struct hd_geometry *geo)
  *  ida_ioctl does some miscellaneous stuff like reporting drive geometry,
  *  setting readahead and submitting commands from userspace to the controller.
  */
-static int ida_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
+static int ida_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
 {
 	drv_info_t *drv = get_drv(bdev->bd_disk);
 	ctlr_info_t *host = get_host(bdev->bd_disk);
@@ -1168,7 +1175,8 @@ out_passthru:
 		return error;
 	case IDAGETCTLRSIG:
 		if (!arg) return -EINVAL;
-		put_user(host->ctlr_sig, (int __user *)arg);
+		if (put_user(host->ctlr_sig, (int __user *)arg))
+			return -EFAULT;
 		return 0;
 	case IDAREVALIDATEVOLS:
 		if (MINOR(bdev->bd_dev) != 0)
@@ -1176,7 +1184,8 @@ out_passthru:
 		return revalidate_allvol(host);
 	case IDADRIVERVERSION:
 		if (!arg) return -EINVAL;
-		put_user(DRIVER_VERSION, (unsigned long __user *)arg);
+		if (put_user(DRIVER_VERSION, (unsigned long __user *)arg))
+			return -EFAULT;
 		return 0;
 	case IDAGETPCIINFO:
 	{
@@ -1184,6 +1193,7 @@ out_passthru:
 		ida_pci_info_struct pciinfo;
 
 		if (!arg) return -EINVAL;
+		memset(&pciinfo, 0, sizeof(pciinfo));
 		pciinfo.bus = host->pci_dev->bus->number;
 		pciinfo.dev_fn = host->pci_dev->devfn;
 		pciinfo.board_id = host->board_id;
@@ -1198,6 +1208,19 @@ out_passthru:
 	}
 		
 }
+
+static int ida_ioctl(struct block_device *bdev, fmode_t mode,
+			     unsigned int cmd, unsigned long param)
+{
+	int ret;
+
+	mutex_lock(&cpqarray_mutex);
+	ret = ida_locked_ioctl(bdev, mode, cmd, param);
+	mutex_unlock(&cpqarray_mutex);
+
+	return ret;
+}
+
 /*
  * ida_ctlr_ioctl is for passing commands to the controller from userspace.
  * The command block (io) has already been copied to kernel space for us,
@@ -1231,17 +1254,11 @@ static int ida_ctlr_ioctl(ctlr_info_t *h, int dsk, ida_ioctl_t *io)
 	/* Pre submit processing */
 	switch(io->cmd) {
 	case PASSTHRU_A:
-		p = kmalloc(io->sg[0].size, GFP_KERNEL);
-		if (!p) 
-		{ 
-			error = -ENOMEM; 
-			cmd_free(h, c, 0); 
-			return(error);
-		}
-		if (copy_from_user(p, io->sg[0].addr, io->sg[0].size)) {
-			kfree(p);
-			cmd_free(h, c, 0); 
-			return -EFAULT;
+		p = memdup_user(io->sg[0].addr, io->sg[0].size);
+		if (IS_ERR(p)) {
+			error = PTR_ERR(p);
+			cmd_free(h, c, 0);
+			return error;
 		}
 		c->req.hdr.blk = pci_map_single(h->pci_dev, &(io->c), 
 				sizeof(ida_ioctl_t), 
@@ -1272,18 +1289,12 @@ static int ida_ctlr_ioctl(ctlr_info_t *h, int dsk, ida_ioctl_t *io)
 	case DIAG_PASS_THRU:
 	case COLLECT_BUFFER:
 	case WRITE_FLASH_ROM:
-		p = kmalloc(io->sg[0].size, GFP_KERNEL);
-		if (!p) 
- 		{ 
-                        error = -ENOMEM; 
-                        cmd_free(h, c, 0);
-                        return(error);
+		p = memdup_user(io->sg[0].addr, io->sg[0].size);
+		if (IS_ERR(p)) {
+			error = PTR_ERR(p);
+			cmd_free(h, c, 0);
+			return error;
                 }
-		if (copy_from_user(p, io->sg[0].addr, io->sg[0].size)) {
-			kfree(p);
-                        cmd_free(h, c, 0);
-			return -EFAULT;
-		}
 		c->req.sg[0].size = io->sg[0].size;
 		c->req.sg[0].addr = pci_map_single(h->pci_dev, p, 
 			c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL); 
diff --git a/drivers/block/cryptoloop.c b/drivers/block/cryptoloop.c
index 8b6bb764b0a..99e773cb70d 100644
--- a/drivers/block/cryptoloop.c
+++ b/drivers/block/cryptoloop.c
@@ -25,9 +25,9 @@
 #include <linux/string.h>
 #include <linux/crypto.h>
 #include <linux/blkdev.h>
-#include <linux/loop.h>
 #include <linux/scatterlist.h>
 #include <asm/uaccess.h>
+#include "loop.h"
 
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("loop blockdevice transferfunction adaptor / CryptoAPI");
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig
new file mode 100644
index 00000000000..7845bd6ee41
--- /dev/null
+++ b/drivers/block/drbd/Kconfig
@@ -0,0 +1,73 @@
+#
+# DRBD device driver configuration
+#
+
+comment "DRBD disabled because PROC_FS or INET not selected"
+	depends on PROC_FS='n' || INET='n'
+
+config BLK_DEV_DRBD
+	tristate "DRBD Distributed Replicated Block Device support"
+	depends on PROC_FS && INET
+	select LRU_CACHE
+	select LIBCRC32C
+	default n
+	help
+
+	  NOTE: In order to authenticate connections you have to select
+	  CRYPTO_HMAC and a hash function as well.
+
+	  DRBD is a shared-nothing, synchronously replicated block device. It
+	  is designed to serve as a building block for high availability
+	  clusters and in this context, is a "drop-in" replacement for shared
+	  storage. Simplistically, you could see it as a network RAID 1.
+
+	  Each minor device has a role, which can be 'primary' or 'secondary'.
+	  On the node with the primary device the application is supposed to
+	  run and to access the device (/dev/drbdX). Every write is sent to
+	  the local 'lower level block device' and, across the network, to the
+	  node with the device in 'secondary' state.  The secondary device
+	  simply writes the data to its lower level block device.
+
+	  DRBD can also be used in dual-Primary mode (device writable on both
+	  nodes), which means it can exhibit shared disk semantics in a
+	  shared-nothing cluster.  Needless to say, on top of dual-Primary
+	  DRBD a cluster file system is necessary to maintain cache
+	  coherency.
+
+	  For automatic failover you need a cluster manager (e.g. heartbeat).
+	  See also: http://www.drbd.org/, http://www.linux-ha.org
+
+	  If unsure, say N.
+
+config DRBD_FAULT_INJECTION
+	bool "DRBD fault injection"
+	depends on BLK_DEV_DRBD
+	help
+
+	  Say Y here if you want to simulate IO errors, in order to test DRBD's
+	  behavior.
+
+	  The actual simulation of IO errors is done by writing 3 values to
+	  /sys/module/drbd/parameters/
+
+	  enable_faults: bitmask of...
+	  1	meta data write
+	  2	meta data read
+	  4	resync data write
+	  8	resync data read
+	  16	data write
+	  32	data read
+	  64	read ahead
+	  128	kmalloc of bitmap
+	  256	allocation of peer_requests
+	  512	insert data corruption on receiving side
+
+	  fault_devs: bitmask of minor numbers
+	  fault_rate: frequency in percent
+
+	  Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
+		echo 16 > /sys/module/drbd/parameters/enable_faults
+		echo 1 > /sys/module/drbd/parameters/fault_devs
+		echo 5 > /sys/module/drbd/parameters/fault_rate
+
+	  If unsure, say N.
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
new file mode 100644
index 00000000000..8b450338075
--- /dev/null
+++ b/drivers/block/drbd/Makefile
@@ -0,0 +1,7 @@
+drbd-y := drbd_bitmap.o drbd_proc.o
+drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
+drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
+drbd-y += drbd_interval.o drbd_state.o
+drbd-y += drbd_nla.o
+
+obj-$(CONFIG_BLK_DEV_DRBD)     += drbd.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
new file mode 100644
index 00000000000..05a1780ffa8
--- /dev/null
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -0,0 +1,1338 @@
+/*
+   drbd_actlog.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/slab.h>
+#include <linux/crc32c.h>
+#include <linux/drbd.h>
+#include <linux/drbd_limits.h>
+#include <linux/dynamic_debug.h>
+#include "drbd_int.h"
+
+
+enum al_transaction_types {
+	AL_TR_UPDATE = 0,		/* what a zeroed transaction buffer carries */
+	AL_TR_INITIALIZED = 0xffff	/* stamped by drbd_initialize_al() */
+};
+/* all fields on disk are in big endian */
+struct __packed al_transaction_on_disk {
+	/* don't we all like magic */
+	__be32	magic;
+
+	/* to identify the most recent transaction block
+	 * in the on disk ring buffer */
+	__be32	tr_number;
+
+	/* checksum on the full 4k block, with this field set to 0. */
+	__be32	crc32c;
+
+	/* type of transaction, special transaction types like:
+	 * purge-all, set-all-idle, set-all-active, ... to-be-defined
+	 * see also enum al_transaction_types */
+	__be16	transaction_type;
+
+	/* we currently allow only a few thousand extents,
+	 * so 16bit will be enough for the slot number. */
+
+	/* how many updates in this transaction */
+	__be16	n_updates;
+
+	/* maximum slot number, "al-extents" in drbd.conf speak.
+	 * Having this in each transaction should make reconfiguration
+	 * of that parameter easier. */
+	__be16	context_size;
+
+	/* slot number the context starts with */
+	__be16	context_start_slot_nr;
+
+	/* Some reserved bytes.  Expected usage is a 64bit counter of
+	 * sectors-written since device creation, and other data generation tag
+	 * supporting usage */
+	__be32	__reserved[4];
+
+	/* --- 36 bytes used --- */
+
+	/* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes
+	 * in one transaction, then use the remaining bytes in the 4k block for
+	 * context information.  "Flexible" number of updates per transaction
+	 * does not help, as we have to account for the case when all update
+	 * slots are used anyways, so it would only complicate code without
+	 * additional benefit.
+	 */
+	__be16	update_slot_nr[AL_UPDATES_PER_TRANSACTION];
+
+	/* but the extent number is 32bit, which at an extent size of 4 MiB
+	 * allows covering device sizes of up to 2**54 Byte (16 PiB) */
+	__be32	update_extent_nr[AL_UPDATES_PER_TRANSACTION];
+
+	/* --- 420 bytes used (36 + 64*6) --- */
+
+	/* 4096 - 420 = 3676 = 919 * 4 */
+	__be32	context[AL_CONTEXT_PER_TRANSACTION];
+};
+
+struct update_odbm_work {
+	struct drbd_work w;		/* w.cb is w_update_odbm() */
+	struct drbd_device *device;
+	unsigned int enr;		/* resync extent whose bitmap page gets written */
+};
+
+struct update_al_work {
+	struct drbd_work w;		/* w.cb is w_al_write_transaction() */
+	struct drbd_device *device;
+	struct completion event;	/* completed once the transaction attempt is done */
+	int err;			/* result of _al_write_transaction() */
+};
+
+/* Claim the md_io buffer; NULL if disk <= D_FAILED ended the wait first. */
+void *drbd_md_get_buffer(struct drbd_device *device)
+{
+	int r;	/* old md_io_in_use value from the cmpxchg; 0 means we got it */
+
+	wait_event(device->misc_wait,
+		   (r = atomic_cmpxchg(&device->md_io_in_use, 0, 1)) == 0 ||
+		   device->state.disk <= D_FAILED);
+
+	return r ? NULL : page_address(device->md_io_page);
+}
+
+void drbd_md_put_buffer(struct drbd_device *device)
+{
+	if (atomic_dec_and_test(&device->md_io_in_use))	/* count reached zero */
+		wake_up(&device->misc_wait);	/* e.g. sleepers in drbd_md_get_buffer() */
+}
+
+void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_backing_dev *bdev,
+				     unsigned int *done)
+{
+	long dt;
+
+	rcu_read_lock();
+	dt = rcu_dereference(bdev->disk_conf)->disk_timeout;
+	rcu_read_unlock();
+	dt = dt * HZ / 10;	/* disk_timeout counts tenths of a second */
+	if (dt == 0)
+		dt = MAX_SCHEDULE_TIMEOUT;	/* 0: wait without a time limit */
+
+	dt = wait_event_timeout(device->misc_wait,
+			*done || test_bit(FORCE_DETACH, &device->flags), dt);
+	if (dt == 0) {	/* timed out with neither condition true */
+		drbd_err(device, "meta-data IO operation timed out\n");
+		drbd_chk_io_error(device, 1, DRBD_FORCE_DETACH);
+	}
+}
+
+static int _drbd_md_sync_page_io(struct drbd_device *device,
+				 struct drbd_backing_dev *bdev,
+				 struct page *page, sector_t sector,
+				 int rw, int size)
+{
+	struct bio *bio;
+	int err;
+
+	device->md_io.done = 0;
+	device->md_io.error = -ENODEV;
+
+	if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags))
+		rw |= REQ_FUA | REQ_FLUSH;
+	rw |= REQ_SYNC;
+
+	bio = bio_alloc_drbd(GFP_NOIO);	/* NOTE(review): used unchecked - presumably cannot fail; confirm */
+	bio->bi_bdev = bdev->md_bdev;
+	bio->bi_iter.bi_sector = sector;
+	err = -EIO;	/* stays -EIO unless the bio ends up BIO_UPTODATE */
+	if (bio_add_page(bio, page, size, 0) != size)
+		goto out;
+	bio->bi_private = &device->md_io;
+	bio->bi_end_io = drbd_md_io_complete;
+	bio->bi_rw = rw;
+
+	if (!(rw & WRITE) && device->state.disk == D_DISKLESS && device->ldev == NULL)
+		/* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
+		;
+	else if (!get_ldev_if_state(device, D_ATTACHING)) {
+		/* Corresponding put_ldev in drbd_md_io_complete() */
+		drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
+		err = -ENODEV;
+		goto out;
+	}
+
+	bio_get(bio); /* one bio_put() is in the completion handler */
+	atomic_inc(&device->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
+	if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
+		bio_endio(bio, -EIO);	/* injected fault: complete without submitting */
+	else
+		submit_bio(rw, bio);
+	wait_until_done_or_force_detached(device, bdev, &device->md_io.done);
+	if (bio_flagged(bio, BIO_UPTODATE))
+		err = device->md_io.error;
+
+ out:
+	bio_put(bio);	/* drops the reference held since allocation */
+	return err;
+}
+
+int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev,
+			 sector_t sector, int rw)
+{
+	int err;
+	struct page *iop = device->md_io_page;	/* always the device's md_io page */
+
+	D_ASSERT(device, atomic_read(&device->md_io_in_use) == 1);
+
+	BUG_ON(!bdev->md_bdev);
+
+	dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
+	     current->comm, current->pid, __func__,
+	     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
+	     (void*)_RET_IP_ );
+
+	if (sector < drbd_md_first_sector(bdev) ||
+	    sector + 7 > drbd_md_last_sector(bdev))	/* a 4k block spans 8 sectors */
+		drbd_alert(device, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
+		     current->comm, current->pid, __func__,
+		     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+
+	/* we do all our meta data IO in aligned 4k blocks; only alerted above, IO is still issued */
+	err = _drbd_md_sync_page_io(device, bdev, iop, sector, rw, 4096);
+	if (err) {
+		drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
+		    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
+	}
+	return err;
+}
+
+static struct bm_extent *find_active_resync_extent(struct drbd_device *device, unsigned int enr)
+{
+	struct lc_element *tmp;
+	tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT);	/* AL extent nr -> resync extent nr */
+	if (unlikely(tmp != NULL)) {
+		struct bm_extent  *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+		if (test_bit(BME_NO_WRITES, &bm_ext->flags))
+			return bm_ext;
+	}
+	return NULL;	/* not in the resync cache, or BME_NO_WRITES is not set */
+}
+
+static struct lc_element *_al_get(struct drbd_device *device, unsigned int enr, bool nonblock)
+{
+	struct lc_element *al_ext;
+	struct bm_extent *bm_ext;
+	int wake;
+
+	spin_lock_irq(&device->al_lock);
+	bm_ext = find_active_resync_extent(device, enr);
+	if (bm_ext) {
+		wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);	/* true only if we set it first */
+		spin_unlock_irq(&device->al_lock);
+		if (wake)
+			wake_up(&device->al_wait);
+		return NULL;	/* resync holds BME_NO_WRITES on this area */
+	}
+	if (nonblock)
+		al_ext = lc_try_get(device->act_log, enr);
+	else
+		al_ext = lc_get(device->act_log, enr);
+	spin_unlock_irq(&device->al_lock);
+	return al_ext;	/* whatever lc_try_get()/lc_get() returned, possibly NULL */
+}
+
+bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i)
+{
+	/* for bios crossing activity log extent boundaries,
+	 * we may need to activate two extents in one go */
+	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+
+	D_ASSERT(device, (unsigned)(last - first) <= 1);
+	D_ASSERT(device, atomic_read(&device->local_cnt) > 0);
+
+	/* FIXME figure out a fast path for bios crossing AL extent boundaries */
+	if (first != last)
+		return false;
+
+	return _al_get(device, first, true);	/* pointer -> bool: true iff an element was obtained */
+}
+
+bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i)
+{
+	/* for bios crossing activity log extent boundaries,
+	 * we may need to activate two extents in one go */
+	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+	unsigned enr;
+	bool need_transaction = false;
+
+	D_ASSERT(device, first <= last);
+	D_ASSERT(device, atomic_read(&device->local_cnt) > 0);
+
+	for (enr = first; enr <= last; enr++) {
+		struct lc_element *al_ext;
+		wait_event(device->al_wait,	/* sleeps until _al_get() yields an element */
+				(al_ext = _al_get(device, enr, false)) != NULL);
+		if (al_ext->lc_number != enr)	/* presumably: change to enr is still pending */
+			need_transaction = true;
+	}
+	return need_transaction;
+}
+
+static int al_write_transaction(struct drbd_device *device, bool delegate);
+
+/* When called through generic_make_request(), we must delegate
+ * activity log I/O to the worker thread: a further request
+ * submitted via generic_make_request() within the same task
+ * would be queued on current->bio_list, and would only start
+ * after this function returns (see generic_make_request()).
+ *
+ * However, if we *are* the worker, we must not delegate to ourselves.
+ */
+
+/*
+ * @delegate:   delegate activity log I/O to the worker thread
+ */
+void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
+{
+	bool locked = false;
+
+	BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task);
+
+	/* Serialize multiple transactions.
+	 * This uses test_and_set_bit, memory barrier is implicit.
+	 */
+	wait_event(device->al_wait,
+			device->act_log->pending_changes == 0 ||
+			(locked = lc_try_lock_for_transaction(device->act_log)));
+
+	if (locked) {	/* false: nothing was pending any more, nothing to do */
+		/* Double check: it may have been committed by someone else,
+		 * while we have been waiting for the lock. */
+		if (device->act_log->pending_changes) {
+			bool write_al_updates;
+
+			rcu_read_lock();
+			write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
+			rcu_read_unlock();
+
+			if (write_al_updates)
+				al_write_transaction(device, delegate);	/* result ignored, see FIXME */
+			spin_lock_irq(&device->al_lock);
+			/* FIXME
+			if (err)
+				we need an "lc_cancel" here;
+			*/
+			lc_committed(device->act_log);
+			spin_unlock_irq(&device->al_lock);
+		}
+		lc_unlock(device->act_log);
+		wake_up(&device->al_wait);
+	}
+}
+
+/*
+ * @delegate:   delegate activity log I/O to the worker thread
+ */
+void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate)
+{
+	BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task);
+
+	if (drbd_al_begin_io_prepare(device, i))	/* true: a transaction is needed */
+		drbd_al_begin_io_commit(device, delegate);
+}
+
+int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i)
+{
+	struct lru_cache *al = device->act_log;
+	/* for bios crossing activity log extent boundaries,
+	 * we may need to activate two extents in one go */
+	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+	unsigned nr_al_extents;
+	unsigned available_update_slots;
+	unsigned enr;
+
+	D_ASSERT(device, first <= last);
+
+	nr_al_extents = 1 + last - first; /* worst case: all touched extents are cold. */
+	available_update_slots = min(al->nr_elements - al->used,
+				al->max_pending_changes - al->pending_changes);
+
+	/* We want all necessary updates for a given request within the same transaction
+	 * We could first check how many updates are *actually* needed,
+	 * and use that instead of the worst-case nr_al_extents */
+	if (available_update_slots < nr_al_extents)
+		return -EWOULDBLOCK;
+
+	/* Is resync active in this area? */
+	for (enr = first; enr <= last; enr++) {
+		struct lc_element *tmp;
+		tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT);
+		if (unlikely(tmp != NULL)) {
+			struct bm_extent  *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+			if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+				if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags))
+					return -EBUSY;	/* we were the one to set BME_PRIORITY */
+				return -EWOULDBLOCK;	/* BME_PRIORITY was already set */
+			}
+		}
+	}
+
+	/* Checkout the refcounts.
+	 * Given that we checked for available elements and update slots above,
+	 * this has to be successful. */
+	for (enr = first; enr <= last; enr++) {
+		struct lc_element *al_ext;
+		al_ext = lc_get_cumulative(device->act_log, enr);
+		if (!al_ext)
+			drbd_info(device, "LOGIC BUG for enr=%u\n", enr);
+	}
+	return 0;
+}
+
+void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i)
+{
+	/* for bios crossing activity log extent boundaries,
+	 * we may need to release two extents in one go */
+	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+	unsigned enr;
+	struct lc_element *extent;
+	unsigned long flags;
+
+	D_ASSERT(device, first <= last);
+	spin_lock_irqsave(&device->al_lock, flags);
+
+	for (enr = first; enr <= last; enr++) {
+		extent = lc_find(device->act_log, enr);
+		if (!extent) {
+			drbd_err(device, "al_complete_io() called on inactive extent %u\n", enr);
+			continue;	/* log it, but still release the remaining extents */
+		}
+		lc_put(device->act_log, extent);
+	}
+	spin_unlock_irqrestore(&device->al_lock, flags);
+	wake_up(&device->al_wait);	/* unconditional, after dropping al_lock */
+}
+
+#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
+/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
+ * are still coupled, or assume too much about their relation.
+ * Code below will not work if this is violated.
+ * Will be cleaned up with some followup patch.
+ */
+# error FIXME
+#endif
+
+static unsigned int al_extent_to_bm_page(unsigned int al_enr)
+{
+	return al_enr >>
+		/* log2 of bits per page (8 bits per byte) */
+		((PAGE_SHIFT + 3) -
+		/* log2 of BM blocks (one bit each) per AL extent */
+		 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
+}
+
+static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
+{
+	return rs_enr >>
+		/* log2 of bits per page (8 bits per byte) */
+		((PAGE_SHIFT + 3) -
+		/* log2 of BM blocks (one bit each) per resync extent */
+		 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
+}
+
+static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
+{
+	const unsigned int stripes = device->ldev->md.al_stripes;
+	const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k;
+
+	/* transaction number, modulo on-disk ring buffer wrap around */
+	unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k);
+
+	/* ... to aligned 4k on disk block; consecutive t land in different stripes */
+	t = ((t % stripes) * stripe_size_4kB) + t/stripes;
+
+	/* ... to 512 byte sector in activity log (8 sectors per 4k block) */
+	t *= 8;
+
+	/* ... plus offset to the on disk position */
+	return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
+}
+
+static int
+_al_write_transaction(struct drbd_device *device)
+{
+	struct al_transaction_on_disk *buffer;
+	struct lc_element *e;
+	sector_t sector;
+	int i, mx;
+	unsigned extent_nr;
+	unsigned crc = 0;
+	int err = 0;
+
+	if (!get_ldev(device)) {
+		drbd_err(device, "disk is %s, cannot start al transaction\n",
+			drbd_disk_str(device->state.disk));
+		return -EIO;
+	}
+
+	/* The bitmap write may have failed, causing a state change. */
+	if (device->state.disk < D_INCONSISTENT) {
+		drbd_err(device,
+			"disk is %s, cannot write al transaction\n",
+			drbd_disk_str(device->state.disk));
+		put_ldev(device);
+		return -EIO;
+	}
+
+	buffer = drbd_md_get_buffer(device); /* protects md_io_buffer, al_tr_cycle, ... */
+	if (!buffer) {
+		drbd_err(device, "disk failed while waiting for md_io buffer\n");
+		put_ldev(device);
+		return -ENODEV;
+	}
+
+	memset(buffer, 0, sizeof(*buffer));	/* also leaves transaction_type == AL_TR_UPDATE */
+	buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
+	buffer->tr_number = cpu_to_be32(device->al_tr_number);
+
+	i = 0;
+
+	/* Even though no one can start to change this list
+	 * once we set the LC_LOCKED -- from drbd_al_begin_io(),
+	 * lc_try_lock_for_transaction() --, someone may still
+	 * be in the process of changing it. */
+	spin_lock_irq(&device->al_lock);
+	list_for_each_entry(e, &device->act_log->to_be_changed, list) {
+		if (i == AL_UPDATES_PER_TRANSACTION) {
+			i++;	/* overshoot on purpose so the BUG_ON below fires */
+			break;
+		}
+		buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
+		buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
+		if (e->lc_number != LC_FREE)	/* slot currently holds another extent */
+			drbd_bm_mark_for_writeout(device,
+					al_extent_to_bm_page(e->lc_number));
+		i++;
+	}
+	spin_unlock_irq(&device->al_lock);
+	BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
+
+	buffer->n_updates = cpu_to_be16(i);
+	for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {	/* mark the unused update slots */
+		buffer->update_slot_nr[i] = cpu_to_be16(-1);
+		buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
+	}
+
+	buffer->context_size = cpu_to_be16(device->act_log->nr_elements);
+	buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle);
+
+	mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
+		   device->act_log->nr_elements - device->al_tr_cycle);
+	for (i = 0; i < mx; i++) {	/* copy a window of slots starting at al_tr_cycle */
+		unsigned idx = device->al_tr_cycle + i;
+		extent_nr = lc_element_by_index(device->act_log, idx)->lc_number;
+		buffer->context[i] = cpu_to_be32(extent_nr);
+	}
+	for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
+		buffer->context[i] = cpu_to_be32(LC_FREE);
+
+	device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;	/* next call continues after this window */
+	if (device->al_tr_cycle >= device->act_log->nr_elements)
+		device->al_tr_cycle = 0;
+
+	sector = al_tr_number_to_on_disk_sector(device);
+
+	crc = crc32c(0, buffer, 4096);	/* crc32c field is still 0 from the memset */
+	buffer->crc32c = cpu_to_be32(crc);
+
+	if (drbd_bm_write_hinted(device))
+		err = -EIO;
+	else {
+		bool write_al_updates;
+		rcu_read_lock();
+		write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
+		rcu_read_unlock();
+		if (write_al_updates) {
+			if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
+				err = -EIO;
+				drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+			} else {
+				device->al_tr_number++;	/* advance only after a successful write */
+				device->al_writ_cnt++;
+			}
+		}
+	}
+
+	drbd_md_put_buffer(device);
+	put_ldev(device);
+
+	return err;
+}
+
+/* Work callback set up by al_write_transaction(): run it, then signal the waiter. */
+static int w_al_write_transaction(struct drbd_work *w, int unused)
+{
+	struct update_al_work *aw = container_of(w, struct update_al_work, w);
+	struct drbd_device *device = aw->device;
+	int err;
+
+	err = _al_write_transaction(device);
+	aw->err = err;	/* the waiter reads this after the completion */
+	complete(&aw->event);
+
+	return err != -EIO ? err : 0;	/* -EIO reaches the waiter via aw->err only */
+}
+
+/* Calls from worker context (see w_restart_disk_io()) need to write the
+   transaction directly. Others came through generic_make_request(),
+   those need to delegate it to the worker. */
+static int al_write_transaction(struct drbd_device *device, bool delegate)
+{
+	if (delegate) {
+		struct update_al_work al_work;	/* on stack: we wait below before returning */
+		init_completion(&al_work.event);
+		al_work.w.cb = w_al_write_transaction;
+		al_work.device = device;
+		drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
+				      &al_work.w);
+		wait_for_completion(&al_work.event);
+		return al_work.err;	/* set by w_al_write_transaction() */
+	} else
+		return _al_write_transaction(device);
+}
+
+static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext)
+{
+	int rv;
+
+	spin_lock_irq(&device->al_lock);
+	rv = (al_ext->refcnt == 0);	/* only unreferenced elements are deleted */
+	if (likely(rv))
+		lc_del(device->act_log, al_ext);
+	spin_unlock_irq(&device->al_lock);
+
+	return rv;	/* 1 if deleted, 0 if still referenced */
+}
+
+/**
+ * drbd_al_shrink() - Removes all active extents from the activity log
+ * @device:	DRBD device.
+ *
+ * Removes all active extents from the activity log, waiting until
+ * the reference count of each entry dropped to 0 first, of course.
+ *
+ * You need to lock device->act_log with lc_try_lock() / lc_unlock()
+ */
+void drbd_al_shrink(struct drbd_device *device)
+{
+	struct lc_element *al_ext;
+	int i;
+
+	D_ASSERT(device, test_bit(__LC_LOCKED, &device->act_log->flags));
+
+	for (i = 0; i < device->act_log->nr_elements; i++) {
+		al_ext = lc_element_by_index(device->act_log, i);
+		if (al_ext->lc_number == LC_FREE)
+			continue;	/* slot holds no extent */
+		wait_event(device->al_wait, _try_lc_del(device, al_ext));
+	}
+
+	wake_up(&device->al_wait);
+}
+
+int drbd_initialize_al(struct drbd_device *device, void *buffer)
+{
+	struct al_transaction_on_disk *al = buffer;
+	struct drbd_md *md = &device->ldev->md;
+	sector_t al_base = md->md_offset + md->al_offset;
+	int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
+	int i;
+
+	memset(al, 0, 4096);	/* NOTE(review): IO below writes device->md_io_page; buffer is presumably that page - confirm */
+	al->magic = cpu_to_be32(DRBD_AL_MAGIC);
+	al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED);
+	al->crc32c = cpu_to_be32(crc32c(0, al, 4096));	/* computed while the field is still 0 */
+
+	for (i = 0; i < al_size_4k; i++) {	/* same block written to every 4k AL slot */
+		int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE);
+		if (err)
+			return err;	/* stop at the first failed write */
+	}
+	return 0;
+}
+
+static int w_update_odbm(struct drbd_work *w, int unused)
+{
+	struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
+	struct drbd_device *device = udw->device;
+	struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
+
+	if (!get_ldev(device)) {
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_warn(device, "Can not update on disk bitmap, local IO disabled.\n");
+		kfree(udw);
+		return 0;
+	}
+
+	drbd_bm_write_page(device, rs_extent_to_bm_page(udw->enr));
+	put_ldev(device);
+
+	kfree(udw);	/* kmalloc'ed by drbd_try_clear_on_disk_bm(); freed on every path here */
+
+	if (drbd_bm_total_weight(device) <= device->rs_failed) {
+		switch (device->state.conn) {
+		case C_SYNC_SOURCE:  case C_SYNC_TARGET:
+		case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
+			drbd_resync_finished(device);	/* then falls through to the break */
+		default:
+			/* nothing to do */
+			break;
+		}
+	}
+	drbd_bcast_event(device, &sib);
+
+	return 0;
+}
+
+
+/* ATTENTION. The AL's extents are 4MB each, while the extents in the
+ * resync LRU-cache are 16MB each.
+ * The caller of this function has to hold a get_ldev() reference.
+ *
+ * TODO will be obsoleted once we have a caching lru of the on disk bitmap
+ */
+static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t sector,
+				      int count, int success)
+{
+	struct lc_element *e;
+	struct update_odbm_work *udw;
+
+	unsigned int enr;
+
+	D_ASSERT(device, atomic_read(&device->local_cnt));
+
+	/* I simply assume that a sector/size pair never crosses
+	 * a 16 MB extent border. (Currently this is true...) */
+	enr = BM_SECT_TO_EXT(sector);
+
+	e = lc_get(device->resync, enr);
+	if (e) {
+		struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
+		if (ext->lce.lc_number == enr) {
+			if (success)
+				ext->rs_left -= count;
+			else
+				ext->rs_failed += count;
+			if (ext->rs_left < ext->rs_failed) {
+				drbd_warn(device, "BAD! sector=%llus enr=%u rs_left=%d "
+				    "rs_failed=%d count=%d cstate=%s\n",
+				     (unsigned long long)sector,
+				     ext->lce.lc_number, ext->rs_left,
+				     ext->rs_failed, count,
+				     drbd_conn_str(device->state.conn));
+
+				/* We don't expect to be able to clear more bits
+				 * than have been set when we originally counted
+				 * the set bits to cache that value in ext->rs_left.
+				 * Whatever the reason (disconnect during resync,
+				 * delayed local completion of an application write),
+				 * try to fix it up by recounting here. */
+				ext->rs_left = drbd_bm_e_weight(device, enr);
+			}
+		} else {
+			/* Normally this element should be in the cache,
+			 * since drbd_rs_begin_io() pulled it already in.
+			 *
+			 * But maybe an application write finished, and we set
+			 * something outside the resync lru_cache in sync.
+			 */
+			int rs_left = drbd_bm_e_weight(device, enr);
+			if (ext->flags != 0) {
+				drbd_warn(device, "changing resync lce: %d[%u;%02lx]"
+				     " -> %d[%u;00]\n",
+				     ext->lce.lc_number, ext->rs_left,
+				     ext->flags, enr, rs_left);
+				ext->flags = 0;
+			}
+			if (ext->rs_failed) {
+				drbd_warn(device, "Kicking resync_lru element enr=%u "
+				     "out with rs_failed=%d\n",
+				     ext->lce.lc_number, ext->rs_failed);
+			}
+			ext->rs_left = rs_left;
+			ext->rs_failed = success ? 0 : count;
+			/* we don't keep a persistent log of the resync lru,
+			 * we can commit any change right away. */
+			lc_committed(device->resync);
+		}
+		lc_put(device->resync, &ext->lce);
+		/* no race, we are within the al_lock! */
+
+		if (ext->rs_left == ext->rs_failed) {
+			ext->rs_failed = 0;
+
+			udw = kmalloc(sizeof(*udw), GFP_ATOMIC);	/* freed by w_update_odbm() */
+			if (udw) {
+				udw->enr = ext->lce.lc_number;
+				udw->w.cb = w_update_odbm;
+				udw->device = device;
+				drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
+						      &udw->w);
+			} else {
+				drbd_warn(device, "Could not kmalloc an udw\n");
+			}
+		}
+	} else {
+		drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n",
+		    device->resync_locked,
+		    device->resync->nr_elements,
+		    device->resync->flags);
+	}
+}
+
+void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go)
+{
+	unsigned long now = jiffies;
+	unsigned long last = device->rs_mark_time[device->rs_last_mark];
+	int next = (device->rs_last_mark + 1) % DRBD_SYNC_MARKS;	/* slot after rs_last_mark, wrapping */
+	if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {	/* at most one new mark per step */
+		if (device->rs_mark_left[device->rs_last_mark] != still_to_go &&
+		    device->state.conn != C_PAUSED_SYNC_T &&
+		    device->state.conn != C_PAUSED_SYNC_S) {
+			device->rs_mark_time[next] = now;
+			device->rs_mark_left[next] = still_to_go;
+			device->rs_last_mark = next;
+		}
+	}
+}
+
+/* clear the bits corresponding to the piece of storage in question:
+ * size bytes of data starting from sector.  Only clear the bits of the
+ * one or more fully covered, _aligned_ BM_BLOCK_SIZE blocks.
+ *
+ * called by worker on C_SYNC_TARGET and receiver on SyncSource.
+ *
+ */
+void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size,
+		       const char *file, const unsigned int line)
+{
+	/* Is called from worker and receiver context _only_ */
+	unsigned long sbnr, ebnr, lbnr;
+	unsigned long count = 0;
+	sector_t esector, nr_sectors;
+	int wake_up = 0;
+	unsigned long flags;
+
+	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
+		drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
+				(unsigned long long)sector, size);
+		return;
+	}
+
+	if (!get_ldev(device))
+		return; /* no disk, no metadata, no bitmap to clear bits in */
+
+	nr_sectors = drbd_get_capacity(device->this_bdev);
+	esector = sector + (size >> 9) - 1;
+
+	if (!expect(sector < nr_sectors))
+		goto out;
+	if (!expect(esector < nr_sectors))
+		esector = nr_sectors - 1;
+
+	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+	/* we clear it (in sync).
+	 * round up start sector, round down end sector.  we make sure we only
+	 * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
+	if (unlikely(esector < BM_SECT_PER_BIT-1))
+		goto out;
+	if (unlikely(esector == (nr_sectors-1)))
+		ebnr = lbnr;
+	else
+		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+
+	if (sbnr > ebnr)
+		goto out;
+
+	/*
+	 * ok, (capacity & 7) != 0 sometimes, but who cares...
+	 * we count rs_{total,left} in bits, not sectors.
+	 */
+	count = drbd_bm_clear_bits(device, sbnr, ebnr);
+	if (count) {
+		drbd_advance_rs_marks(device, drbd_bm_total_weight(device));
+		spin_lock_irqsave(&device->al_lock, flags);
+		drbd_try_clear_on_disk_bm(device, sector, count, true);
+		spin_unlock_irqrestore(&device->al_lock, flags);
+
+		/* just wake_up unconditionally now: various lc_committed(),
+		 * lc_put() in drbd_try_clear_on_disk_bm(). */
+		wake_up = 1;
+	}
+out:
+	put_ldev(device);
+	if (wake_up)
+		wake_up(&device->al_wait);
+}
+
+/*
+ * this is intended to set one request worth of data out of sync.
+ * affects at least 1 bit,
+ * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits.
+ *
+ * called by tl_clear and drbd_send_dblock (==drbd_make_request).
+ * so this can be _any_ process.
+ */
+int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size,
+			    const char *file, const unsigned int line)
+{
+	unsigned long sbnr, ebnr, flags;
+	sector_t esector, nr_sectors;
+	unsigned int enr, count = 0;
+	struct lc_element *e;
+
+	/* this should be an empty REQ_FLUSH */
+	if (size == 0)
+		return 0;
+
+	if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
+		drbd_err(device, "sector: %llus, size: %d\n",
+			(unsigned long long)sector, size);
+		return 0;
+	}
+
+	if (!get_ldev(device))
+		return 0; /* no disk, no metadata, no bitmap to set bits in */
+
+	nr_sectors = drbd_get_capacity(device->this_bdev);
+	esector = sector + (size >> 9) - 1;
+
+	if (!expect(sector < nr_sectors))
+		goto out;
+	if (!expect(esector < nr_sectors))
+		esector = nr_sectors - 1;	/* clamp to the last sector of the device */
+
+	/* we set it out of sync,
+	 * we do not need to round anything here */
+	sbnr = BM_SECT_TO_BIT(sector);
+	ebnr = BM_SECT_TO_BIT(esector);
+
+	/* ok, (capacity & 7) != 0 sometimes, but who cares...
+	 * we count rs_{total,left} in bits, not sectors.  */
+	spin_lock_irqsave(&device->al_lock, flags);
+	count = drbd_bm_set_bits(device, sbnr, ebnr);
+
+	enr = BM_SECT_TO_EXT(sector);
+	e = lc_find(device->resync, enr);
+	if (e)	/* keep a cached resync extent's rs_left in step */
+		lc_entry(e, struct bm_extent, lce)->rs_left += count;
+	spin_unlock_irqrestore(&device->al_lock, flags);
+
+out:
+	put_ldev(device);
+
+	return count;	/* drbd_bm_set_bits() result; 0 on every early exit */
+}
+
+/*
+ * Try to take a reference on resync extent @enr in the resync LRU and mark
+ * it BME_NO_WRITES.
+ *
+ * Gives up right away (returns NULL) once resync_locked exceeds half of the
+ * resync LRU's nr_elements, and also if lc_get() cannot provide an element.
+ * When the element returned by lc_get() carried a different number, its
+ * rs_left/rs_failed counters are re-initialised for @enr and waiters on
+ * al_wait are woken after the lock has been dropped.
+ */
+static
+struct bm_extent *_bme_get(struct drbd_device *device, unsigned int enr)
+{
+	struct bm_extent *ext = NULL;
+	struct lc_element *elem;
+	unsigned long lru_flags;
+	bool relabeled = false;
+
+	spin_lock_irq(&device->al_lock);
+	if (device->resync_locked > device->resync->nr_elements/2) {
+		spin_unlock_irq(&device->al_lock);
+		return NULL;
+	}
+
+	elem = lc_get(device->resync, enr);
+	if (elem) {
+		ext = lc_entry(elem, struct bm_extent, lce);
+		if (ext->lce.lc_number != enr) {
+			ext->rs_left = drbd_bm_e_weight(device, enr);
+			ext->rs_failed = 0;
+			lc_committed(device->resync);
+			relabeled = true;
+		}
+		/* first reference on this element: account for it */
+		if (ext->lce.refcnt == 1)
+			device->resync_locked++;
+		set_bit(BME_NO_WRITES, &ext->flags);
+	}
+	/* sample the LRU flags while still holding the lock */
+	lru_flags = device->resync->flags;
+	spin_unlock_irq(&device->al_lock);
+
+	if (relabeled)
+		wake_up(&device->al_wait);
+
+	if (ext)
+		return ext;
+
+	if (lru_flags & LC_STARVING)
+		drbd_warn(device, "Have to wait for element"
+		     " (resync LRU too small?)\n");
+	BUG_ON(lru_flags & LC_LOCKED);
+	return NULL;
+}
+
+/* Report, with al_lock held, what lc_is_used() says about extent @enr of
+ * the activity log. */
+static int _is_in_al(struct drbd_device *device, unsigned int enr)
+{
+	int in_use;
+
+	spin_lock_irq(&device->al_lock);
+	in_use = lc_is_used(device->act_log, enr);
+	spin_unlock_irq(&device->al_lock);
+	return in_use;
+}
+
+/**
+ * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
+ * @device:	DRBD device.
+ * @sector:	The sector number.
+ *
+ * This function sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
+ *
+ * First waits until _bme_get() hands out the extent (which marks it
+ * BME_NO_WRITES), then waits for each of the AL_EXT_PER_BM_SECT activity log
+ * extents it covers to become unused. If drbd_rs_c_min_rate_throttle()
+ * returned true and BME_PRIORITY is found set on the extent, the reference
+ * is dropped again and the whole sequence is retried after sleeping HZ/10.
+ */
+int drbd_rs_begin_io(struct drbd_device *device, sector_t sector)
+{
+	unsigned int enr = BM_SECT_TO_EXT(sector);
+	struct bm_extent *bm_ext;
+	int i, sig;
+	bool sa;
+
+retry:
+	sig = wait_event_interruptible(device->al_wait,
+			(bm_ext = _bme_get(device, enr)));
+	if (sig)
+		return -EINTR;
+
+	if (test_bit(BME_LOCKED, &bm_ext->flags))
+		return 0; /* extent is already marked BME_LOCKED */
+
+	/* step aside only while we are above c-min-rate; unless disabled. */
+	sa = drbd_rs_c_min_rate_throttle(device);
+
+	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+		sig = wait_event_interruptible(device->al_wait,
+					       !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) ||
+					       (sa && test_bit(BME_PRIORITY, &bm_ext->flags)));
+
+		if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) {
+			spin_lock_irq(&device->al_lock);
+			if (lc_put(device->resync, &bm_ext->lce) == 0) {
+				bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */
+				device->resync_locked--;
+				wake_up(&device->al_wait);
+			}
+			spin_unlock_irq(&device->al_lock);
+			if (sig)
+				return -EINTR;
+			if (schedule_timeout_interruptible(HZ/10)) /* nonzero return is treated as interrupted */
+				return -EINTR;
+			goto retry;
+		}
+	}
+	set_bit(BME_LOCKED, &bm_ext->flags);
+	return 0;
+}
+
+/**
+ * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
+ * @device:	DRBD device.
+ * @sector:	The sector number.
+ *
+ * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
+ * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
+ * if there is still application IO going on in this area.
+ *
+ * -EAGAIN is also returned when no extent could be obtained, either because
+ * resync_locked exceeds nr_elements - 3 or because lc_get() failed.
+ *
+ * If a different extent is still remembered in resync_wenr from an earlier
+ * -EAGAIN, its BME_NO_WRITES flag and reference are dropped first. When
+ * -EAGAIN is returned after the requested extent was obtained, its number
+ * is remembered in resync_wenr for the next call. The whole function runs
+ * with al_lock held.
+ */
+int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
+{
+	unsigned int enr = BM_SECT_TO_EXT(sector);
+	const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	int i;
+
+	spin_lock_irq(&device->al_lock);
+	if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) {
+		/* in case you have very heavy scattered io, it may
+		 * stall the syncer undefined if we give up the ref count
+		 * when we try again and requeue.
+		 *
+		 * if we don't give up the refcount, but the next time
+		 * we are scheduled this extent has been "synced" by new
+		 * application writes, we'd miss the lc_put on the
+		 * extent we keep the refcount on.
+		 * so we remembered which extent we had to try again, and
+		 * if the next requested one is something else, we do
+		 * the lc_put here...
+		 * we also have to wake_up
+		 */
+		e = lc_find(device->resync, device->resync_wenr);
+		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+		if (bm_ext) {
+			D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+			D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
+			clear_bit(BME_NO_WRITES, &bm_ext->flags);
+			device->resync_wenr = LC_FREE;
+			if (lc_put(device->resync, &bm_ext->lce) == 0)
+				device->resync_locked--;
+			wake_up(&device->al_wait);
+		} else {
+			drbd_alert(device, "LOGIC BUG\n");
+		}
+	}
+	/* TRY. */
+	e = lc_try_get(device->resync, enr);
+	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+	if (bm_ext) {
+		if (test_bit(BME_LOCKED, &bm_ext->flags))
+			goto proceed;
+		if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
+			device->resync_locked++;
+		} else {
+			/* we did set the BME_NO_WRITES,
+			 * but then could not set BME_LOCKED,
+			 * so we tried again.
+			 * drop the extra reference. */
+			bm_ext->lce.refcnt--;
+			D_ASSERT(device, bm_ext->lce.refcnt > 0);
+		}
+		goto check_al;
+	} else {
+		/* do we rather want to try later? */
+		if (device->resync_locked > device->resync->nr_elements-3)
+			goto try_again;
+		/* Do or do not. There is no try. -- Yoda */
+		e = lc_get(device->resync, enr);
+		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+		if (!bm_ext) {
+			const unsigned long rs_flags = device->resync->flags;
+			if (rs_flags & LC_STARVING)
+				drbd_warn(device, "Have to wait for element"
+				     " (resync LRU too small?)\n");
+			BUG_ON(rs_flags & LC_LOCKED);
+			goto try_again;
+		}
+		if (bm_ext->lce.lc_number != enr) {
+			bm_ext->rs_left = drbd_bm_e_weight(device, enr);
+			bm_ext->rs_failed = 0;
+			lc_committed(device->resync);
+			wake_up(&device->al_wait);
+			D_ASSERT(device, test_bit(BME_LOCKED, &bm_ext->flags) == 0);
+		}
+		set_bit(BME_NO_WRITES, &bm_ext->flags);
+		D_ASSERT(device, bm_ext->lce.refcnt == 1);
+		device->resync_locked++;
+		goto check_al;
+	}
+check_al:
+	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+		if (lc_is_used(device->act_log, al_enr+i))
+			goto try_again; /* activity log extent still in use */
+	}
+	set_bit(BME_LOCKED, &bm_ext->flags);
+proceed:
+	device->resync_wenr = LC_FREE;
+	spin_unlock_irq(&device->al_lock);
+	return 0;
+
+try_again:
+	if (bm_ext)
+		device->resync_wenr = enr; /* remember it for the next call */
+	spin_unlock_irq(&device->al_lock);
+	return -EAGAIN;
+}
+
+/*
+ * Drop one reference on the resync extent covering @sector.
+ *
+ * When lc_put() reports that no reference is left, the extent's flags word
+ * is reset to 0, resync_locked is decremented and waiters on al_wait are
+ * woken. If the extent is not in the resync LRU, or its reference count is
+ * already 0, an error is logged and nothing else happens.
+ */
+void drbd_rs_complete_io(struct drbd_device *device, sector_t sector)
+{
+	const unsigned int enr = BM_SECT_TO_EXT(sector);
+	struct bm_extent *ext;
+	struct lc_element *elem;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&device->al_lock, irq_flags);
+	elem = lc_find(device->resync, enr);
+	if (!elem) {
+		spin_unlock_irqrestore(&device->al_lock, irq_flags);
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_err(device, "drbd_rs_complete_io() called, but extent not found\n");
+		return;
+	}
+
+	ext = lc_entry(elem, struct bm_extent, lce);
+	if (ext->lce.refcnt == 0) {
+		spin_unlock_irqrestore(&device->al_lock, irq_flags);
+		drbd_err(device, "drbd_rs_complete_io(,%llu [=%u]) called, "
+		    "but refcnt is 0!?\n",
+		    (unsigned long long)sector, enr);
+		return;
+	}
+
+	if (lc_put(device->resync, &ext->lce) == 0) {
+		/* drops BME_LOCKED, BME_NO_WRITES and BME_PRIORITY in one go */
+		ext->flags = 0;
+		device->resync_locked--;
+		wake_up(&device->al_wait);
+	}
+	spin_unlock_irqrestore(&device->al_lock, irq_flags);
+}
+
+/**
+ * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
+ * @device:	DRBD device.
+ *
+ * With al_lock held, resets the resync LRU through lc_reset() if
+ * get_ldev_if_state(device, D_FAILED) succeeds, and in any case zeroes
+ * resync_locked and sets resync_wenr back to LC_FREE. Waiters on al_wait
+ * are woken after the lock has been released.
+ */
+void drbd_rs_cancel_all(struct drbd_device *device)
+{
+	spin_lock_irq(&device->al_lock);
+
+	if (get_ldev_if_state(device, D_FAILED)) { /* Makes sure ->resync is there. */
+		lc_reset(device->resync);
+		put_ldev(device);
+	}
+	device->resync_locked = 0;
+	device->resync_wenr = LC_FREE;
+	spin_unlock_irq(&device->al_lock);
+	wake_up(&device->al_wait);
+}
+
+/**
+ * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
+ * @device:	DRBD device.
+ *
+ * Returns 0 upon success, -EAGAIN if at least one reference count was
+ * not zero.
+ *
+ * Walks every slot of the resync LRU with al_lock held. The extent
+ * remembered in resync_wenr, if encountered, first loses its BME_NO_WRITES
+ * flag and the reference that was kept for it. The walk stops with -EAGAIN
+ * at the first extent that is still referenced; extents deleted before that
+ * point stay deleted. If get_ldev_if_state(device, D_FAILED) fails, the LRU
+ * is not touched, waiters on al_wait are woken and 0 is returned.
+ */
+int drbd_rs_del_all(struct drbd_device *device)
+{
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	int i;
+
+	spin_lock_irq(&device->al_lock);
+
+	if (get_ldev_if_state(device, D_FAILED)) {
+		/* ok, ->resync is there. */
+		for (i = 0; i < device->resync->nr_elements; i++) {
+			e = lc_element_by_index(device->resync, i);
+			bm_ext = lc_entry(e, struct bm_extent, lce);
+			if (bm_ext->lce.lc_number == LC_FREE)
+				continue; /* unused slot */
+			if (bm_ext->lce.lc_number == device->resync_wenr) {
+				drbd_info(device, "dropping %u in drbd_rs_del_all, apparently"
+				     " got 'synced' by application io\n",
+				     device->resync_wenr);
+				D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+				D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
+				clear_bit(BME_NO_WRITES, &bm_ext->flags);
+				device->resync_wenr = LC_FREE;
+				lc_put(device->resync, &bm_ext->lce);
+			}
+			if (bm_ext->lce.refcnt != 0) {
+				drbd_info(device, "Retrying drbd_rs_del_all() later. "
+				     "refcnt=%d\n", bm_ext->lce.refcnt);
+				put_ldev(device);
+				spin_unlock_irq(&device->al_lock);
+				return -EAGAIN;
+			}
+			D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+			D_ASSERT(device, !test_bit(BME_NO_WRITES, &bm_ext->flags));
+			lc_del(device->resync, &bm_ext->lce);
+		}
+		D_ASSERT(device, device->resync->used == 0);
+		put_ldev(device);
+	}
+	spin_unlock_irq(&device->al_lock);
+	wake_up(&device->al_wait);
+
+	return 0;
+}
+
+/**
+ * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
+ * @device:	DRBD device.
+ * @sector:	The sector number.
+ * @size:	Size of failed IO operation, in bytes.
+ *
+ * Only complete, aligned bitmap blocks inside the given range are looked at:
+ * the start is rounded up and the end is rounded down, except that a range
+ * ending on the last sector of the device includes the last bit. The count
+ * reported by drbd_bm_count_bits() for that bit range is added to
+ * device->rs_failed and, if a local disk reference can be taken, handed to
+ * drbd_try_clear_on_disk_bm(). Waiters on al_wait are woken if the count
+ * was not zero. A nonsensical @size or an out-of-range @sector is rejected.
+ */
+void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size)
+{
+	/* Is called from worker and receiver context _only_ */
+	unsigned long sbnr, ebnr, lbnr;
+	unsigned long count;
+	sector_t esector, nr_sectors;
+	int wake_up = 0;
+
+	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
+		drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
+				(unsigned long long)sector, size);
+		return;
+	}
+	nr_sectors = drbd_get_capacity(device->this_bdev);
+	esector = sector + (size >> 9) - 1; /* last sector, inclusive */
+
+	if (!expect(sector < nr_sectors))
+		return;
+	if (!expect(esector < nr_sectors))
+		esector = nr_sectors - 1;
+
+	lbnr = BM_SECT_TO_BIT(nr_sectors-1); /* bit of the last device sector */
+
+	/*
+	 * round up start sector, round down end sector.  we make sure we only
+	 * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
+	if (unlikely(esector < BM_SECT_PER_BIT-1))
+		return;
+	if (unlikely(esector == (nr_sectors-1)))
+		ebnr = lbnr;
+	else
+		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+
+	if (sbnr > ebnr)
+		return; /* no complete bitmap block inside the range */
+
+	/*
+	 * ok, (capacity & 7) != 0 sometimes, but who cares...
+	 * we count rs_{total,left} in bits, not sectors.
+	 */
+	spin_lock_irq(&device->al_lock);
+	count = drbd_bm_count_bits(device, sbnr, ebnr);
+	if (count) {
+		device->rs_failed += count;
+
+		if (get_ldev(device)) {
+			drbd_try_clear_on_disk_bm(device, sector, count, false);
+			put_ldev(device);
+		}
+
+		/* just wake_up unconditional now, various lc_changed(),
+		 * lc_put() in drbd_try_clear_on_disk_bm(). */
+		wake_up = 1;
+	}
+	spin_unlock_irq(&device->al_lock);
+	if (wake_up)
+		wake_up(&device->al_wait);
+}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
new file mode 100644
index 00000000000..1aa29f8fdfe
--- /dev/null
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -0,0 +1,1694 @@
+/*
+   drbd_bitmap.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/bitops.h>
+#include <linux/vmalloc.h>
+#include <linux/string.h>
+#include <linux/drbd.h>
+#include <linux/slab.h>
+#include <asm/kmap_types.h>
+
+#include "drbd_int.h"
+
+
+/* OPAQUE outside this file!
+ * interface defined in drbd_int.h
+
+ * convention:
+ * function name drbd_bm_... => used elsewhere, "public".
+ * function name      bm_... => internal to implementation, "private".
+ */
+
+
+/*
+ * LIMITATIONS:
+ * We want to support >= peta byte of backend storage, while for now still using
+ * a granularity of one bit per 4KiB of storage.
+ * 1 << 50		bytes backend storage (1 PiB)
+ * 1 << (50 - 12)	bits needed
+ *	38 --> we need u64 to index and count bits
+ * 1 << (38 - 3)	bitmap bytes needed
+ *	35 --> we still need u64 to index and count bytes
+ *			(that's 32 GiB of bitmap for 1 PiB storage)
+ * 1 << (35 - 2)	32bit longs needed
+ *	33 --> we'd even need u64 to index and count 32bit long words.
+ * 1 << (35 - 3)	64bit longs needed
+ *	32 --> we could get away with a 32bit unsigned int to index and count
+ *	64bit long words, but I rather stay with unsigned long for now.
+ *	We probably should neither count nor point to bytes or long words
+ *	directly, but either by bitnumber, or by page index and offset.
+ * 1 << (35 - 12)
+ *	23 --> we need that many 4KiB pages of bitmap.
+ *	1 << (23 + 3) --> on a 64bit arch,
+ *	we need 64 MiB to store the array of page pointers.
+ *
+ * Because I'm lazy, and because the resulting patch was too large, too ugly
+ * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
+ * (1 << 32) bits * 4k storage.
+ *
+
+ * bitmap storage and IO:
+ *	Bitmap is stored little endian on disk, and is kept little endian in
+ *	core memory. Currently we still hold the full bitmap in core as long
+ *	as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
+ *	seems excessive.
+ *
+ *	We plan to reduce the amount of in-core bitmap pages by paging them in
+ *	and out against their on-disk location as necessary, but need to make
+ *	sure we don't cause too much meta data IO, and must not deadlock in
+ *	tight memory situations. This needs some more work.
+ */
+
+/*
+ * In-core state of one device's bitmap; allocated by drbd_bm_init() and
+ * reachable through device->bitmap.
+ *
+ * NOTE
+ *  Access to the *bm_pages is protected by bm_lock.
+ *  It is safe to read the other members within the lock.
+ *
+ *  drbd_bm_set_bits is called from bio_endio callbacks,
+ *  We may be called with irq already disabled,
+ *  so we need spin_lock_irqsave().
+ *  And we need the kmap_atomic.
+ */
+struct drbd_bitmap {
+	struct page **bm_pages; /* array of bm_number_of_pages page pointers */
+	spinlock_t bm_lock;
+
+	/* see LIMITATIONS: above */
+
+	unsigned long bm_set;       /* nr of set bits; THINK maybe atomic_t? */
+	unsigned long bm_bits; /* nr of bits in the bitmap */
+	size_t   bm_words; /* bitmap length in long words, bounds bm_memset() */
+	size_t   bm_number_of_pages; /* nr of entries in bm_pages */
+	sector_t bm_dev_capacity; /* value returned by drbd_bm_capacity() */
+	struct mutex bm_change; /* serializes resize operations */
+
+	wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
+
+	enum bm_flag bm_flags; /* BM_LOCKED_MASK bits and BM_P_VMALLOCED */
+
+	/* debugging aid, in case we are still racy somewhere */
+	char          *bm_why; /* reason given to drbd_bm_lock() */
+	struct task_struct *bm_task; /* task that called drbd_bm_lock() */
+};
+
+/*
+ * Log, rate limited, that the current task is in @func while the bitmap is
+ * locked, naming the reason and the task recorded by drbd_bm_lock().
+ *
+ * bm_task is NULL until the first drbd_bm_lock() has completed and is reset
+ * to NULL by drbd_bm_unlock(); neither is synchronised with this function.
+ * Work on a local copy and print a placeholder instead of dereferencing a
+ * NULL pointer.
+ */
+#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
+static void __bm_print_lock_info(struct drbd_device *device, const char *func)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	struct task_struct *holder;
+
+	if (!__ratelimit(&drbd_ratelimit_state))
+		return;
+
+	holder = b->bm_task;
+	drbd_err(device, "FIXME %s[%d] in %s, bitmap locked for '%s' by %s[%d]\n",
+		 current->comm, task_pid_nr(current),
+		 func, b->bm_why ?: "?",
+		 holder ? holder->comm : "?",
+		 holder ? task_pid_nr(holder) : 0);
+}
+
+/*
+ * Take the bitmap's bm_change mutex on behalf of the current task.
+ * @device:	DRBD device.
+ * @why:	reason string, remembered in bm_why for diagnostics.
+ * @flags:	the BM_LOCKED_MASK bits of this value are or-ed into bm_flags.
+ *
+ * If the mutex cannot be taken right away, a warning naming the recorded
+ * holder is logged before blocking in mutex_lock(). Only complains and
+ * returns if the device has no bitmap.
+ */
+void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	int trylock_failed;
+
+	if (!b) {
+		drbd_err(device, "FIXME no bitmap in drbd_bm_lock!?\n");
+		return;
+	}
+
+	trylock_failed = !mutex_trylock(&b->bm_change);
+
+	if (trylock_failed) {
+		/* We do not hold the mutex here, so the holder may not have
+		 * recorded itself yet or may be unlocking right now: bm_task
+		 * can be NULL. Look at one copy and do not dereference NULL. */
+		struct task_struct *holder = b->bm_task;
+
+		drbd_warn(device, "%s[%d] going to '%s' but bitmap already locked for '%s' by %s[%d]\n",
+			  current->comm, task_pid_nr(current),
+			  why, b->bm_why ?: "?",
+			  holder ? holder->comm : "?",
+			  holder ? task_pid_nr(holder) : 0);
+		mutex_lock(&b->bm_change);
+	}
+	if (BM_LOCKED_MASK & b->bm_flags)
+		drbd_err(device, "FIXME bitmap already locked in bm_lock\n");
+	b->bm_flags |= flags & BM_LOCKED_MASK;
+
+	/* debugging aid: remember who holds the lock, and why */
+	b->bm_why  = why;
+	b->bm_task = current;
+}
+
+/*
+ * Counterpart of drbd_bm_lock(): clear the BM_LOCKED_MASK bits, forget the
+ * recorded reason and task, then release the bm_change mutex.
+ * Only complains and returns if the device has no bitmap.
+ */
+void drbd_bm_unlock(struct drbd_device *device)
+{
+	struct drbd_bitmap *bitmap = device->bitmap;
+
+	if (bitmap == NULL) {
+		drbd_err(device, "FIXME no bitmap in drbd_bm_unlock!?\n");
+		return;
+	}
+
+	if ((bitmap->bm_flags & BM_LOCKED_MASK) == 0)
+		drbd_err(device, "FIXME bitmap not locked in bm_unlock\n");
+
+	bitmap->bm_flags &= ~BM_LOCKED_MASK;
+	bitmap->bm_why  = NULL;
+	bitmap->bm_task = NULL;
+	mutex_unlock(&bitmap->bm_change);
+}
+
+/* we store some "meta" info about our pages in page->private:
+ * the page index in the low bits, and the flag bits defined below,
+ * which are bit numbers for set_bit()/test_bit() and friends. */
+/* at a granularity of 4k storage per bitmap bit:
+ * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks
+ *  1<<38 bits,
+ *  1<<23 4k bitmap pages.
+ * Use 24 bits as page index, covers 2 peta byte storage
+ * at a granularity of 4k per bit.
+ * Used to report the failed page idx on io error from the endio handlers.
+ */
+#define BM_PAGE_IDX_MASK	((1UL<<24)-1)
+/* this page is currently read in, or written back */
+#define BM_PAGE_IO_LOCK		31
+/* if there has been an IO error for this page */
+#define BM_PAGE_IO_ERROR	30
+/* this is to be able to intelligently skip disk IO,
+ * set if bits have been set since last IO. */
+#define BM_PAGE_NEED_WRITEOUT	29
+/* to mark for lazy writeout once syncer cleared all clearable bits,
+ * set if bits have been cleared since last IO. */
+#define BM_PAGE_LAZY_WRITEOUT	28
+/* pages marked with this "HINT" will be considered for writeout
+ * on activity log transactions */
+#define BM_PAGE_HINT_WRITEOUT	27
+
+/* bm_store_page_idx() writes the page's private word with a plain, non-atomic
+ * assignment. That is fine only because it is used directly after the page
+ * was allocated. Every other bm_set_page_* and bm_clear_page_* helper has to
+ * use atomic bit manipulation: set_out_of_sync (and therefore bitmap changes)
+ * may happen from various contexts, and wait_on_bit/wake_up_bit requires it
+ * all to be atomic as well. */
+static void bm_store_page_idx(struct page *page, unsigned long idx)
+{
+	/* the index has to fit into the BM_PAGE_IDX_MASK bits */
+	BUG_ON((idx & ~BM_PAGE_IDX_MASK) != 0);
+	set_page_private(page, idx);
+}
+
+/* Extract the BM_PAGE_IDX_MASK bits (the page index) from page_private(). */
+static unsigned long bm_page_to_idx(struct page *page)
+{
+	unsigned long priv = page_private(page);
+
+	return priv & BM_PAGE_IDX_MASK;
+}
+
+/* As it is very unlikely that the same page is under IO from more than one
+ * context, one lock bit per page plus one wait queue per bitmap is enough.
+ * Sleeps on bm_io_wait until BM_PAGE_IO_LOCK could be set for page @page_nr.
+ */
+static void bm_page_lock_io(struct drbd_device *device, int page_nr)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	struct page *page = b->bm_pages[page_nr];
+
+	wait_event(b->bm_io_wait,
+		   !test_and_set_bit(BM_PAGE_IO_LOCK, &page_private(page)));
+}
+
+/* Clear BM_PAGE_IO_LOCK on page @page_nr and wake up bm_io_wait. */
+static void bm_page_unlock_io(struct drbd_device *device, int page_nr)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	struct page *page = b->bm_pages[page_nr];
+
+	clear_bit_unlock(BM_PAGE_IO_LOCK, &page_private(page));
+	wake_up(&b->bm_io_wait);
+}
+
+/* To be called _before_ submit_io: if the page is changed while it is in
+ * flight, the flag gets set again and the page will be submitted once more
+ * later. Clears both BM_PAGE_NEED_WRITEOUT and BM_PAGE_LAZY_WRITEOUT. */
+static void bm_set_page_unchanged(struct page *page)
+{
+	unsigned long *priv = &page_private(page);
+
+	/* two separate bit operations; use cmpxchg? */
+	clear_bit(BM_PAGE_NEED_WRITEOUT, priv);
+	clear_bit(BM_PAGE_LAZY_WRITEOUT, priv);
+}
+
+/* Set BM_PAGE_NEED_WRITEOUT in the page's private word. */
+static void bm_set_page_need_writeout(struct page *page)
+{
+	unsigned long *priv = &page_private(page);
+
+	set_bit(BM_PAGE_NEED_WRITEOUT, priv);
+}
+
+/**
+ * drbd_bm_mark_for_writeout() - flag a bitmap page with the writeout "hint"
+ * @device:	DRBD device.
+ * @page_nr:	index of the bitmap page to flag
+ *
+ * Meant to be used from within an activity log transaction: a few pages get
+ * this hint, and a following drbd_bm_write_hinted() then writes out only
+ * those changed pages that carry it. A @page_nr beyond the number of bitmap
+ * pages is logged and otherwise ignored.
+ */
+void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
+{
+	struct drbd_bitmap *b = device->bitmap;
+
+	if (page_nr >= b->bm_number_of_pages) {
+		drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n",
+			 page_nr, (int)b->bm_number_of_pages);
+		return;
+	}
+	set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(b->bm_pages[page_nr]));
+}
+
+/* Nonzero if neither BM_PAGE_NEED_WRITEOUT nor BM_PAGE_LAZY_WRITEOUT is
+ * set on @page. */
+static int bm_test_page_unchanged(struct page *page)
+{
+	const unsigned long changed = (1UL<<BM_PAGE_NEED_WRITEOUT) |
+				      (1UL<<BM_PAGE_LAZY_WRITEOUT);
+	volatile const unsigned long *priv = &page_private(page);
+
+	return !(*priv & changed);
+}
+
+/* Set BM_PAGE_IO_ERROR in the page's private word. */
+static void bm_set_page_io_err(struct page *page)
+{
+	unsigned long *priv = &page_private(page);
+
+	set_bit(BM_PAGE_IO_ERROR, priv);
+}
+
+/* Clear BM_PAGE_IO_ERROR in the page's private word. */
+static void bm_clear_page_io_err(struct page *page)
+{
+	unsigned long *priv = &page_private(page);
+
+	clear_bit(BM_PAGE_IO_ERROR, priv);
+}
+
+/* Set BM_PAGE_LAZY_WRITEOUT in the page's private word. */
+static void bm_set_page_lazy_writeout(struct page *page)
+{
+	unsigned long *priv = &page_private(page);
+
+	set_bit(BM_PAGE_LAZY_WRITEOUT, priv);
+}
+
+/* Result of test_bit() for BM_PAGE_LAZY_WRITEOUT on the page's private word. */
+static int bm_test_page_lazy_writeout(struct page *page)
+{
+	unsigned long *priv = &page_private(page);
+
+	return test_bit(BM_PAGE_LAZY_WRITEOUT, priv);
+}
+
+/* Index of the bitmap page that holds long word @long_nr.
+ * on a 32bit box, this would allow for exactly (2<<38) bits. */
+static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
+{
+	/* same as (long_nr * sizeof(long)) >> PAGE_SHIFT, done as one shift;
+	 * assumes LN2_BPL is log2 of the bits per long -- TODO confirm */
+	unsigned int pidx = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
+
+	BUG_ON(pidx >= b->bm_number_of_pages);
+	return pidx;
+}
+
+/* Index of the bitmap page that holds bit number @bitnr. */
+static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
+{
+	/* (bitnr / 8) >> PAGE_SHIFT, folded into a single shift */
+	unsigned int pidx = bitnr >> (PAGE_SHIFT + 3);
+
+	BUG_ON(pidx >= b->bm_number_of_pages);
+	return pidx;
+}
+
+/* Map bitmap page @idx with kmap_atomic() and return it as a long array. */
+static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
+{
+	return (unsigned long *) kmap_atomic(b->bm_pages[idx]);
+}
+
+/* Plain wrapper around __bm_map_pidx(). */
+static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
+{
+	unsigned long *p_addr = __bm_map_pidx(b, idx);
+
+	return p_addr;
+}
+
+/* Undo a kmap_atomic() mapping of a bitmap page via kunmap_atomic(). */
+static void __bm_unmap(unsigned long *p_addr)
+{
+	kunmap_atomic(p_addr);
+}
+
+/* Plain wrapper around __bm_unmap(). */
+static void bm_unmap(unsigned long *p_addr)
+{
+	/* no "return <void expression>;" in a void function */
+	__bm_unmap(p_addr);
+}
+
+/* long word offset of _bitmap_ sector */
+#define S2W(s)	((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
+/* word offset from start of bitmap to word number _in_page_,
+ * i.e. modulo longs per page:
+ *	(X) % (PAGE_SIZE/sizeof(long))
+ * hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
+ * so do it explicitly:
+ */
+#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
+
+/* Long words per page */
+#define LWPP (PAGE_SIZE/sizeof(long))
+
+/*
+ * actually most functions herein should take a struct drbd_bitmap*, not a
+ * struct drbd_device*, but for the debug macros I like to have the device around
+ * to be able to report device specific.
+ */
+
+
+/* Free the first @number pages of @pages and clear their slots.
+ * A NULL @pages is accepted; a NULL slot is reported and skipped. */
+static void bm_free_pages(struct page **pages, unsigned long number)
+{
+	unsigned long n;
+
+	if (pages == NULL)
+		return;
+
+	for (n = 0; n < number; n++) {
+		if (pages[n]) {
+			__free_page(pages[n]);
+			pages[n] = NULL;
+			continue;
+		}
+		printk(KERN_ALERT "drbd: bm_free_pages tried to free "
+				  "a NULL pointer; i=%lu n=%lu\n",
+				  n, number);
+	}
+}
+
+/* Release @ptr with vfree() if @v is nonzero, with kfree() otherwise. */
+static void bm_vk_free(void *ptr, int v)
+{
+	if (!v) {
+		kfree(ptr);
+		return;
+	}
+	vfree(ptr);
+}
+
+/*
+ * Build a page pointer array for @want bitmap pages.
+ * "have" and "want" are NUMBER OF PAGES.
+ *
+ * Returns b->bm_pages itself if the page count already matches, a newly
+ * allocated array otherwise, or NULL if an allocation failed (in which case
+ * any pages allocated here have been freed again).
+ *
+ * When growing, the existing page pointers are carried over and the
+ * additional pages are allocated, each tagged with its index. When
+ * shrinking, only the first @want pointers are copied; the surplus pages
+ * are deliberately not freed here.
+ *
+ * b->bm_pages and b->bm_number_of_pages are not modified. When a new array
+ * is returned, BM_P_VMALLOCED in b->bm_flags has been updated to tell how
+ * that array was allocated.
+ *
+ * The byte count and the loop index are as wide as @want, so that a large
+ * page count is not truncated to 32 bits on a 64bit arch.
+ */
+static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
+{
+	struct page **old_pages = b->bm_pages;
+	struct page **new_pages, *page;
+	unsigned long i;
+	size_t bytes;
+	unsigned int vmalloced = 0;
+	unsigned long have = b->bm_number_of_pages;
+
+	BUG_ON(have == 0 && old_pages != NULL);
+	BUG_ON(have != 0 && old_pages == NULL);
+
+	if (have == want)
+		return old_pages;
+
+	/* Trying kmalloc first, falling back to vmalloc.
+	 * GFP_NOIO, as this is called while drbd IO is "suspended",
+	 * and during resize or attach on diskless Primary,
+	 * we must not block on IO to ourselves.
+	 * Context is receiver thread or dmsetup. */
+	bytes = sizeof(struct page *)*want;
+	new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN);
+	if (!new_pages) {
+		new_pages = __vmalloc(bytes,
+				GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO,
+				PAGE_KERNEL);
+		if (!new_pages)
+			return NULL;
+		vmalloced = 1;
+	}
+
+	if (want >= have) {
+		for (i = 0; i < have; i++)
+			new_pages[i] = old_pages[i];
+		for (; i < want; i++) {
+			page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
+			if (!page) {
+				/* undo only what was allocated here */
+				bm_free_pages(new_pages + have, i - have);
+				bm_vk_free(new_pages, vmalloced);
+				return NULL;
+			}
+			/* we want to know which page it is
+			 * from the endio handlers */
+			bm_store_page_idx(page, i);
+			new_pages[i] = page;
+		}
+	} else {
+		for (i = 0; i < want; i++)
+			new_pages[i] = old_pages[i];
+		/* NOT HERE, we are outside the spinlock!
+		bm_free_pages(old_pages + want, have - want);
+		*/
+	}
+
+	if (vmalloced)
+		b->bm_flags |= BM_P_VMALLOCED;
+	else
+		b->bm_flags &= ~BM_P_VMALLOCED;
+
+	return new_pages;
+}
+
+/*
+ * called on driver init only. TODO call when a device is created.
+ * Allocates a zeroed struct drbd_bitmap, initialises its spinlock, mutex
+ * and wait queue, and stores it in device->bitmap.
+ * Returns 0 on success, -ENOMEM if the allocation failed.
+ */
+int drbd_bm_init(struct drbd_device *device)
+{
+	struct drbd_bitmap *b;
+
+	/* there should not be a bitmap yet */
+	WARN_ON(device->bitmap != NULL);
+
+	b = kzalloc(sizeof(*b), GFP_KERNEL);
+	if (b == NULL)
+		return -ENOMEM;
+
+	spin_lock_init(&b->bm_lock);
+	mutex_init(&b->bm_change);
+	init_waitqueue_head(&b->bm_io_wait);
+	device->bitmap = b;
+	return 0;
+}
+
+/* Return the bm_dev_capacity recorded in the device's bitmap, or 0 if the
+ * device has no bitmap (which expect() complains about). */
+sector_t drbd_bm_capacity(struct drbd_device *device)
+{
+	if (expect(device->bitmap))
+		return device->bitmap->bm_dev_capacity;
+	return 0;
+}
+
+/* called on driver unload. TODO: call when a device is destroyed.
+ * Frees all bitmap pages, the page pointer array and the struct drbd_bitmap
+ * itself, then resets device->bitmap to NULL.
+ */
+void drbd_bm_cleanup(struct drbd_device *device)
+{
+	struct drbd_bitmap *b;
+
+	if (!expect(device->bitmap))
+		return;
+
+	b = device->bitmap;
+	bm_free_pages(b->bm_pages, b->bm_number_of_pages);
+	bm_vk_free(b->bm_pages, (BM_P_VMALLOCED & b->bm_flags));
+	kfree(b);
+	device->bitmap = NULL;
+}
+
+/*
+ * since (b->bm_bits % BITS_PER_LONG) != 0,
+ * this masks out the remaining bits.
+ * Returns the number of bits cleared.
+ *
+ * Works on the last bitmap page only. In the long word that holds the last
+ * valid bit, everything above that bit is cleared; if bm_bits is a multiple
+ * of BITS_PER_LONG, that word is left alone. On a 32bit arch, an odd
+ * trailing long word is zeroed as well. The count covers both.
+ */
+#define BITS_PER_PAGE		(1UL << (PAGE_SHIFT + 3))
+#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE - 1)
+#define BITS_PER_LONG_MASK	(BITS_PER_LONG - 1)
+static int bm_clear_surplus(struct drbd_bitmap *b)
+{
+	unsigned long mask;
+	unsigned long *p_addr, *bm;
+	int tmp;
+	int cleared = 0;
+
+	/* number of bits modulo bits per page */
+	tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
+	/* mask the used bits of the word containing the last bit */
+	mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
+	/* bitmap is always stored little endian,
+	 * on disk and in core memory alike */
+	mask = cpu_to_lel(mask);
+
+	p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
+	bm = p_addr + (tmp/BITS_PER_LONG);
+	if (mask) {
+		/* If mask != 0, we are not exactly aligned, so bm now points
+		 * to the long containing the last bit.
+		 * If mask == 0, bm already points to the word immediately
+		 * after the last (long word aligned) bit. */
+		cleared = hweight_long(*bm & ~mask);
+		*bm &= mask;
+		bm++;
+	}
+
+	if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
+		/* on a 32bit arch, we may need to zero out
+		 * a padding long to align with a 64bit remote */
+		cleared += hweight_long(*bm);
+		*bm = 0;
+	}
+	bm_unmap(p_addr);
+	return cleared;
+}
+
+/*
+ * Counterpart of bm_clear_surplus(): on the last bitmap page, set every bit
+ * above the last valid one in the long word holding it, and, on a 32bit
+ * arch, fill an odd trailing long word with ones as well.
+ */
+static void bm_set_surplus(struct drbd_bitmap *b)
+{
+	unsigned long *page_addr, *word;
+	unsigned long used;
+	int bits_in_page;
+
+	/* number of bits modulo bits per page */
+	bits_in_page = (b->bm_bits & BITS_PER_PAGE_MASK);
+	/* the used bits of the word containing the last bit,
+	 * in the little endian layout the bitmap is kept in */
+	used = (1UL << (bits_in_page & BITS_PER_LONG_MASK)) -1;
+	used = cpu_to_lel(used);
+
+	page_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
+	word = page_addr + (bits_in_page/BITS_PER_LONG);
+	if (used) {
+		/* not long word aligned: word is the one containing the last
+		 * bit. Otherwise it already is the one right behind it. */
+		*word |= ~used;
+		word++;
+	}
+
+	/* on a 32bit arch, there may be a padding long
+	 * to align with a 64bit remote; set all of it */
+	if (BITS_PER_LONG == 32 && ((word - page_addr) & 1) == 1)
+		*word = ~0UL;
+
+	bm_unmap(page_addr);
+}
+
+/* you better not modify the bitmap while this is running,
+ * or its results will be stale
+ *
+ * Returns the number of set bits, summed over all bitmap pages.
+ * This is not a pure reader: the long word holding the last valid bit is
+ * masked in place before it is counted, and on a 32bit arch an unused
+ * padding long behind it is zeroed.
+ * NOTE(review): bm_number_of_pages - 1 and bm_bits - 1 wrap around for an
+ * empty bitmap; presumably callers only get here with a non-empty one --
+ * confirm. */
+static unsigned long bm_count_bits(struct drbd_bitmap *b)
+{
+	unsigned long *p_addr;
+	unsigned long bits = 0;
+	unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
+	int idx, i, last_word;
+
+	/* all but last page */
+	for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
+		p_addr = __bm_map_pidx(b, idx);
+		for (i = 0; i < LWPP; i++)
+			bits += hweight_long(p_addr[i]);
+		__bm_unmap(p_addr);
+		cond_resched();
+	}
+	/* last (or only) page */
+	last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
+	p_addr = __bm_map_pidx(b, idx);
+	for (i = 0; i < last_word; i++)
+		bits += hweight_long(p_addr[i]);
+	p_addr[last_word] &= cpu_to_lel(mask); /* modifies the bitmap page */
+	bits += hweight_long(p_addr[last_word]);
+	/* 32bit arch, may have an unused padding long */
+	if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
+		p_addr[last_word+1] = 0;
+	__bm_unmap(p_addr);
+	return bits;
+}
+
+/* Fill @len long words of the bitmap, starting at long word @offset, with
+ * the byte value @c (offset and len in long words). Works page by page and
+ * flags every page it visits as needing writeout. A range reaching beyond
+ * bm_words is reported and ignored. */
+static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
+{
+	const size_t end = offset + len;
+	unsigned long *page_addr, *dst;
+	unsigned int pidx;
+	size_t chunk;
+
+	if (end > b->bm_words) {
+		printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
+		return;
+	}
+
+	for (; offset < end; offset += chunk) {
+		/* up to the next page boundary, or up to the end */
+		chunk = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
+		pidx = bm_word_to_page_idx(b, offset);
+		page_addr = bm_map_pidx(b, pidx);
+		dst = page_addr + MLPP(offset);
+		if (dst + chunk <= page_addr + LWPP)
+			memset(dst, c, chunk * sizeof(long));
+		else
+			printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
+			       page_addr, dst, (int)chunk);
+		bm_unmap(page_addr);
+		bm_set_page_need_writeout(b->bm_pages[pidx]);
+	}
+}
+
+/* For the layout, see comment above drbd_md_set_sector_offsets().
+ * Returns the size of the on-disk bitmap area in bits: its length in
+ * sectors shifted left by 9 + 3. The area is taken to end at md_size_sect
+ * if al_offset is 8, and at al_offset otherwise. */
+static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev)
+{
+	/* keep each subtraction in the types of the fields involved,
+	 * and widen only the result */
+	if (ldev->md.al_offset == 8)
+		return (u64)(ldev->md.md_size_sect - ldev->md.bm_offset) << (9 + 3);
+
+	return (u64)(ldev->md.al_offset - ldev->md.bm_offset) << (9 + 3);
+}
+
+/*
+ * Make sure the bitmap has enough room for capacity sectors of attached
+ * storage; resize if necessary.  Called whenever we may have changed the
+ * device size.  capacity == 0 frees the bitmap pages.  In case this is
+ * actually a resize, we copy the old bitmap into the new one; when growing,
+ * the new bits are set if set_new_bits is non-zero, cleared otherwise.
+ * Returns 0 on success, -ENOMEM (no bitmap, or allocation failed) or -ENOSPC.
+ */
+int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bits)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long bits, words, owords, obits;
+	unsigned long want, have, onpages; /* number of pages */
+	struct page **npages, **opages = NULL;
+	int err = 0, growing;
+	int opages_vmalloced;
+
+	if (!expect(b))
+		return -ENOMEM;
+
+	drbd_bm_lock(device, "resize", BM_LOCKED_MASK);
+
+	drbd_info(device, "drbd_bm_resize called with capacity == %llu\n",
+			(unsigned long long)capacity);
+
+	if (capacity == b->bm_dev_capacity)	/* size unchanged: nothing to do */
+		goto out;
+
+	opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags);
+
+	if (capacity == 0) {	/* detach everything under bm_lock, free after dropping it */
+		spin_lock_irq(&b->bm_lock);
+		opages = b->bm_pages;
+		onpages = b->bm_number_of_pages;
+		owords = b->bm_words;
+		b->bm_pages = NULL;
+		b->bm_number_of_pages =
+		b->bm_set   =
+		b->bm_bits  =
+		b->bm_words =
+		b->bm_dev_capacity = 0;
+		spin_unlock_irq(&b->bm_lock);
+		bm_free_pages(opages, onpages);
+		bm_vk_free(opages, opages_vmalloced);
+		goto out;
+	}
+	bits  = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
+
+	/* if we would use
+	   words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
+	   a 32bit host could present the wrong number of words
+	   to a 64bit host.
+	*/
+	words = ALIGN(bits, 64) >> LN2_BPL;
+
+	if (get_ldev(device)) {	/* with a local disk: the bits must fit the on-disk bitmap area */
+		u64 bits_on_disk = drbd_md_on_disk_bits(device->ldev);
+		put_ldev(device);
+		if (bits > bits_on_disk) {
+			drbd_info(device, "bits = %lu\n", bits);
+			drbd_info(device, "bits_on_disk = %llu\n", bits_on_disk);
+			err = -ENOSPC;
+			goto out;
+		}
+	}
+
+	want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
+	have = b->bm_number_of_pages;
+	if (want == have) {	/* same number of pages: keep the page array */
+		D_ASSERT(device, b->bm_pages != NULL);
+		npages = b->bm_pages;
+	} else {
+		if (drbd_insert_fault(device, DRBD_FAULT_BM_ALLOC))
+			npages = NULL;
+		else
+			npages = bm_realloc_pages(b, want);
+	}
+
+	if (!npages) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	spin_lock_irq(&b->bm_lock);
+	opages = b->bm_pages;
+	owords = b->bm_words;
+	obits  = b->bm_bits;
+
+	growing = bits > obits;
+	if (opages && growing && set_new_bits)
+		bm_set_surplus(b);
+
+	b->bm_pages = npages;
+	b->bm_number_of_pages = want;
+	b->bm_bits  = bits;
+	b->bm_words = words;
+	b->bm_dev_capacity = capacity;
+
+	if (growing) {	/* initialize the words added behind the old end */
+		if (set_new_bits) {
+			bm_memset(b, owords, 0xff, words-owords);
+			b->bm_set += bits - obits;
+		} else
+			bm_memset(b, owords, 0x00, words-owords);
+
+	}
+
+	if (want < have) {
+		/* implicit: (opages != NULL) && (opages != npages) */
+		bm_free_pages(opages + want, have - want);
+	}
+
+	(void)bm_clear_surplus(b);
+
+	spin_unlock_irq(&b->bm_lock);
+	if (opages != npages)	/* the old page array was replaced: free it */
+		bm_vk_free(opages, opages_vmalloced);
+	if (!growing)	/* not growing: recount the set bits */
+		b->bm_set = bm_count_bits(b);
+	drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
+
+ out:
+	drbd_bm_unlock(device);
+	return err;
+}
+
+/* Return the number of set bits (bm_set), or 0 if there is no bitmap.
+ * Inherently racy: if not protected by other means, the return value may
+ * be out of date when leaving this function...
+ * we still need to lock it, since it is important that this returns
+ * bm_set == 0 precisely.
+ *
+ * maybe bm_set should be atomic_t ?
+ */
+unsigned long _drbd_bm_total_weight(struct drbd_device *device)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long s;
+	unsigned long flags;
+
+	if (!expect(b))
+		return 0;
+	if (!expect(b->bm_pages))
+		return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);	/* irqsave variant: restores the caller's irq state */
+	s = b->bm_set;
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+
+	return s;
+}
+
+unsigned long drbd_bm_total_weight(struct drbd_device *device)
+{
+	unsigned long weight = 0;
+	/* if I don't have a disk, I don't know about out-of-sync status: report 0 */
+	if (get_ldev_if_state(device, D_NEGOTIATING)) {
+		weight = _drbd_bm_total_weight(device);
+		put_ldev(device);
+	}
+	return weight;
+}
+
+/* Size of the bitmap in long words; 0 if no bitmap (pages) are present. */
+size_t drbd_bm_words(struct drbd_device *device)
+{
+	struct drbd_bitmap *b = device->bitmap;
+
+	/* b->bm_pages is only looked at once b itself checked out */
+	if (!expect(b) || !expect(b->bm_pages))
+		return 0;
+	return b->bm_words;
+}
+
+/* Number of bits in the bitmap (bm_bits); 0 if the device has no bitmap. */
+unsigned long drbd_bm_bits(struct drbd_device *device)
+{
+	struct drbd_bitmap *b = device->bitmap;
+
+	/* a missing bitmap has no bits */
+	return expect(b) ? b->bm_bits : 0;
+}
+
+/* merge (bitwise OR) number words from buffer into the bitmap starting at
+ * word offset, keeping bm_set up to date.  buffer[i] is expected to be
+ * little endian unsigned long.  bitmap must be locked by drbd_bm_lock.
+ * currently only used from receive_bitmap.
+ */
+void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number,
+			unsigned long *buffer)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long *p_addr, *bm;
+	unsigned long word, bits;
+	unsigned int idx;
+	size_t end, do_now;
+
+	end = offset + number;
+
+	if (!expect(b))
+		return;
+	if (!expect(b->bm_pages))
+		return;
+	if (number == 0)
+		return;
+	WARN_ON(offset >= b->bm_words);	/* out of range is only warned about, not rejected */
+	WARN_ON(end    >  b->bm_words);
+
+	spin_lock_irq(&b->bm_lock);
+	while (offset < end) {	/* one mapped page per iteration */
+		do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+		idx = bm_word_to_page_idx(b, offset);
+		p_addr = bm_map_pidx(b, idx);
+		bm = p_addr + MLPP(offset);
+		offset += do_now;
+		while (do_now--) {
+			bits = hweight_long(*bm);	/* weight before the merge */
+			word = *bm | *buffer++;
+			*bm++ = word;
+			b->bm_set += hweight_long(word) - bits;	/* bits newly set in this word */
+		}
+		bm_unmap(p_addr);
+		bm_set_page_need_writeout(b->bm_pages[idx]);
+	}
+	/* with 32bit <-> 64bit cross-platform connect
+	 * this is only correct for current usage,
+	 * where we _know_ that we are 64 bit aligned,
+	 * and know that this function is used in this way, too...
+	 */
+	if (end == b->bm_words)
+		b->bm_set -= bm_clear_surplus(b);
+	spin_unlock_irq(&b->bm_lock);
+}
+
+/* copy number words from the bitmap starting at word offset into the buffer.
+ * buffer[i] will be little endian unsigned long.  An empty or out of range
+ * request is logged and copies nothing. */
+void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number,
+		     unsigned long *buffer)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long *p_addr, *bm;
+	size_t end, do_now;
+
+	end = offset + number;
+
+	if (!expect(b))
+		return;
+	if (!expect(b->bm_pages))
+		return;
+
+	spin_lock_irq(&b->bm_lock);
+	if ((offset >= b->bm_words) ||
+	    (end    >  b->bm_words) ||
+	    (number <= 0))	/* number is a size_t: effectively number == 0 */
+		drbd_err(device, "offset=%lu number=%lu bm_words=%lu\n",
+			(unsigned long)	offset,
+			(unsigned long)	number,
+			(unsigned long) b->bm_words);
+	else {
+		while (offset < end) {	/* one mapped page per iteration */
+			do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+			p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
+			bm = p_addr + MLPP(offset);
+			offset += do_now;
+			while (do_now--)
+				*buffer++ = *bm++;
+			bm_unmap(p_addr);
+		}
+	}
+	spin_unlock_irq(&b->bm_lock);
+}
+
+/* Mark every bit of the bitmap as set; bm_set then equals bm_bits. */
+void drbd_bm_set_all(struct drbd_device *device)
+{
+	struct drbd_bitmap *b = device->bitmap;
+
+	if (!expect(b) || !expect(b->bm_pages))
+		return;
+
+	spin_lock_irq(&b->bm_lock);
+	/* fill every word; bm_clear_surplus() then deals with the bits past bm_bits */
+	bm_memset(b, 0, 0xff, b->bm_words);
+	(void)bm_clear_surplus(b);
+	b->bm_set = b->bm_bits;
+	spin_unlock_irq(&b->bm_lock);
+}
+
+/* Zero the whole bitmap and reset the set-bit counter, under bm_lock. */
+void drbd_bm_clear_all(struct drbd_device *device)
+{
+	struct drbd_bitmap *b = device->bitmap;
+
+	if (!expect(b) || !expect(b->bm_pages))
+		return;
+
+	spin_lock_irq(&b->bm_lock);
+	/* from word offset 0, all bm_words long words */
+	bm_memset(b, 0, 0, b->bm_words);
+	b->bm_set = 0;
+	spin_unlock_irq(&b->bm_lock);
+}
+
+struct bm_aio_ctx {	/* state shared by the bios of one bitmap IO request */
+	struct drbd_device *device;
+	atomic_t in_flight;	/* outstanding completions; all done when this drops to zero */
+	unsigned int done;	/* set to 1 by the completion that drops in_flight to zero */
+	unsigned flags;		/* combination of the bits defined below */
+#define BM_AIO_COPY_PAGES	1
+#define BM_AIO_WRITE_HINTED	2
+#define BM_WRITE_ALL_PAGES	4
+	int error;		/* completed-last non-zero error code, 0 if none */
+	struct kref kref;	/* set up with two references by the users in this file */
+};
+
+static void bm_aio_ctx_destroy(struct kref *kref)	/* release function handed to kref_put() */
+{
+	struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref);
+
+	put_ldev(ctx->device);	/* pairs with the get_ldev_if_state() done when the ctx was set up */
+	kfree(ctx);
+}
+
+/* bi_end_io of the bitmap page bios; bv_page may be a copy, or may be the original */
+static void bm_async_io_complete(struct bio *bio, int error)
+{
+	struct bm_aio_ctx *ctx = bio->bi_private;
+	struct drbd_device *device = ctx->device;
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
+	int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+
+	/* strange behavior of some lower level drivers...
+	 * fail the request by clearing the uptodate flag,
+	 * but do not return any error?!
+	 * do we want to WARN() on this? */
+	if (!error && !uptodate)
+		error = -EIO;
+
+	if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&	/* IO was done on the original page */
+	    !bm_test_page_unchanged(b->bm_pages[idx]))
+		drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx);
+
+	if (error) {
+		/* ctx error will hold the completed-last non-zero error code,
+		 * in case error codes differ. */
+		ctx->error = error;
+		bm_set_page_io_err(b->bm_pages[idx]);
+		/* Not identical to on disk version of it.
+		 * Is BM_PAGE_IO_ERROR enough? */
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_err(device, "IO ERROR %d on bitmap page idx %u\n",
+					error, idx);
+	} else {
+		bm_clear_page_io_err(b->bm_pages[idx]);
+		dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx);
+	}
+
+	bm_page_unlock_io(device, idx);	/* pairs with bm_page_lock_io() in bm_page_io_async() */
+
+	if (ctx->flags & BM_AIO_COPY_PAGES)	/* the temporary copy goes back to its pool */
+		mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool);
+
+	bio_put(bio);
+
+	if (atomic_dec_and_test(&ctx->in_flight)) {	/* this was the last outstanding completion */
+		ctx->done = 1;
+		wake_up(&device->misc_wait);
+		kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+	}
+}
+
+static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
+{	/* build and submit one bio (direction rw) for bitmap page page_nr; completes in bm_async_io_complete() */
+	struct bio *bio = bio_alloc_drbd(GFP_NOIO);
+	struct drbd_device *device = ctx->device;
+	struct drbd_bitmap *b = device->bitmap;
+	struct page *page;
+	unsigned int len;
+
+	sector_t on_disk_sector =
+		device->ldev->md.md_offset + device->ldev->md.bm_offset;
+	on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);	/* page index -> 512 byte sectors */
+
+	/* this might happen with very small
+	 * flexible external meta data device,
+	 * or with PAGE_SIZE > 4k */
+	len = min_t(unsigned int, PAGE_SIZE,
+		(drbd_md_last_sector(device->ldev) - on_disk_sector + 1)<<9);
+
+	/* serialize IO on this page */
+	bm_page_lock_io(device, page_nr);
+	/* before memcpy and submit,
+	 * so it can be redirtied any time */
+	bm_set_page_unchanged(b->bm_pages[page_nr]);
+
+	if (ctx->flags & BM_AIO_COPY_PAGES) {	/* do the IO on a private copy of the page */
+		page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
+		copy_highpage(page, b->bm_pages[page_nr]);
+		bm_store_page_idx(page, page_nr);
+	} else
+		page = b->bm_pages[page_nr];
+	bio->bi_bdev = device->ldev->md_bdev;
+	bio->bi_iter.bi_sector = on_disk_sector;
+	/* bio_add_page of a single page to an empty bio will always succeed,
+	 * according to api.  Do we want to assert that? */
+	bio_add_page(bio, page, len, 0);
+	bio->bi_private = ctx;
+	bio->bi_end_io = bm_async_io_complete;
+
+	if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
+		bio->bi_rw |= rw;
+		bio_endio(bio, -EIO);	/* injected fault: complete with -EIO instead of submitting */
+	} else {
+		submit_bio(rw, bio);
+		/* this should not count as user activity and cause the
+		 * resync to throttle -- see drbd_rs_should_slow_down(). */
+		atomic_add(len >> 9, &device->rs_sect_ev);
+	}
+}
+
+/* bm_rw: read/write the whole bitmap from/to its on disk location and wait.
+ * flags: BM_AIO_* / BM_WRITE_ALL_PAGES bits; a non-zero lazy_writeout_upper_idx
+ * stops at that page index.  Returns 0, -ENOMEM, -ENODEV or -EIO. */
+static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
+{
+	struct bm_aio_ctx *ctx;
+	struct drbd_bitmap *b = device->bitmap;
+	int num_pages, i, count = 0;
+	unsigned long now;
+	char ppb[10];
+	int err = 0;
+
+	/*
+	 * We are protected against bitmap disappearing/resizing by holding an
+	 * ldev reference (caller must have called get_ldev()).
+	 * For read/write, we are protected against changes to the bitmap by
+	 * the bitmap lock (see drbd_bitmap_io).
+	 * For lazy writeout, we don't care for ongoing changes to the bitmap,
+	 * as we submit copies of pages anyways.
+	 */
+
+	ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+	if (!ctx)
+		return -ENOMEM;
+
+	*ctx = (struct bm_aio_ctx) {
+		.device = device,
+		.in_flight = ATOMIC_INIT(1),
+		.done = 0,
+		.flags = flags,
+		.error = 0,
+		.kref = { ATOMIC_INIT(2) },	/* one put at the end of this function, one for "all IO done" */
+	};
+
+	if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
+		drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
+		kfree(ctx);
+		return -ENODEV;
+	}
+
+	if (!ctx->flags)
+		WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
+
+	num_pages = b->bm_number_of_pages;
+
+	now = jiffies;
+
+	/* let the layers below us try to merge these bios... */
+	for (i = 0; i < num_pages; i++) {
+		/* a non-zero lazy_writeout_upper_idx ends the walk at that page */
+		if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
+			break;
+		if (rw & WRITE) {
+			if ((flags & BM_AIO_WRITE_HINTED) &&
+			    !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
+				    &page_private(b->bm_pages[i])))
+				continue;
+
+			if (!(flags & BM_WRITE_ALL_PAGES) &&	/* ignore completely unchanged pages */
+			    bm_test_page_unchanged(b->bm_pages[i])) {
+				dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
+				continue;
+			}
+			/* during lazy writeout,
+			 * ignore those pages not marked for lazy writeout. */
+			if (lazy_writeout_upper_idx &&
+			    !bm_test_page_lazy_writeout(b->bm_pages[i])) {
+				dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i);
+				continue;
+			}
+		}
+		atomic_inc(&ctx->in_flight);
+		bm_page_io_async(ctx, i, rw);
+		++count;
+		cond_resched();
+	}
+
+	/*
+	 * We initialize ctx->in_flight to one to make sure bm_async_io_complete
+	 * will not set ctx->done early, and decrement / test it here.  If there
+	 * are still some bios in flight, we need to wait for them here.
+	 * If all IO is done already (or nothing had been submitted), there is
+	 * no need to wait.  Still, we need to put the kref associated with the
+	 * "in_flight reached zero, all done" event.
+	 */
+	if (!atomic_dec_and_test(&ctx->in_flight))
+		wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
+	else
+		kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+
+	/* summary for global bitmap IO */
+	if (flags == 0)
+		drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n",
+			 rw == WRITE ? "WRITE" : "READ",
+			 count, jiffies - now);
+
+	if (ctx->error) {
+		drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n");
+		drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+		err = -EIO; /* ctx->error ? */
+	}
+
+	if (atomic_read(&ctx->in_flight))
+		err = -EIO; /* Disk timeout/force-detach during IO... */
+
+	now = jiffies;
+	if (rw == WRITE) {
+		drbd_md_flush(device);
+	} else /* rw == READ */ {
+		b->bm_set = bm_count_bits(b);
+		drbd_info(device, "recounting of set bits took additional %lu jiffies\n",
+		     jiffies - now);
+	}
+	now = b->bm_set;	/* "now" is reused: from here on it holds the number of set bits */
+
+	if (flags == 0)
+		drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
+		     ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
+
+	kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+	return err;
+}
+
+/**
+ * drbd_bm_read() - Read the whole bitmap from its on disk location.
+ * @device:	DRBD device.
+ */
+int drbd_bm_read(struct drbd_device *device) __must_hold(local)
+{
+	return bm_rw(device, READ, 0, 0);	/* no flags, no lazy writeout limit; returns bm_rw()'s result */
+}
+
+/**
+ * drbd_bm_write() - Write the whole bitmap to its on disk location.
+ * @device:	DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
+ */
+int drbd_bm_write(struct drbd_device *device) __must_hold(local)
+{
+	return bm_rw(device, WRITE, 0, 0);	/* flags == 0: IO on the original pages; returns bm_rw()'s result */
+}
+
+/**
+ * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
+ * @device:	DRBD device.
+ *
+ * Will write all pages.
+ */
+int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
+{
+	return bm_rw(device, WRITE, BM_WRITE_ALL_PAGES, 0);	/* the flag disables the "page unchanged" skip */
+}
+
+/**
+ * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
+ * @device:	DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
+ * In contrast to drbd_bm_write(), this will copy the bitmap pages
+ * to temporary writeout pages. It is intended to trigger a full write-out
+ * while still allowing the bitmap to change, for example if a resync or online
+ * verify is aborted due to a failed peer disk, while local IO continues, or
+ * pending resync acks are still being processed.
+ */
+int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
+{
+	return bm_rw(device, WRITE, BM_AIO_COPY_PAGES, 0);	/* returns bm_rw()'s result */
+}
+
+/**
+ * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
+ * @device:	DRBD device.
+ */
+int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local)
+{
+	return bm_rw(device, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);	/* hinted pages only, written from copies */
+}
+
+/**
+ * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap
+ * @device:	DRBD device.
+ * @idx:	bitmap page index
+ *
+ * We don't want to special case on logical_block_size of the backend device,
+ * so we submit PAGE_SIZE aligned pieces.
+ * Note that on "most" systems, PAGE_SIZE is 4k.
+ * In case this becomes an issue on systems with larger PAGE_SIZE,
+ * we may want to change this again to write 4k aligned 4k pieces.
+ * Return: 0 if written or unchanged, -ENOMEM, -ENODEV, -EIO or the IO error.
+ */
+int drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local)
+{
+	struct bm_aio_ctx *ctx;
+	int err;
+
+	if (bm_test_page_unchanged(device->bitmap->bm_pages[idx])) {
+		dynamic_drbd_dbg(device, "skipped bm page write for idx %u\n", idx);
+		return 0;
+	}
+
+	ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+	if (!ctx)
+		return -ENOMEM;
+
+	*ctx = (struct bm_aio_ctx) {
+		.device = device,
+		.in_flight = ATOMIC_INIT(1),	/* exactly one bio gets submitted below */
+		.done = 0,
+		.flags = BM_AIO_COPY_PAGES,
+		.error = 0,
+		.kref = { ATOMIC_INIT(2) },	/* one put below, one by the completion */
+	};
+
+	if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
+		drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n");
+		kfree(ctx);
+		return -ENODEV;
+	}
+
+	bm_page_io_async(ctx, idx, WRITE_SYNC);
+	wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
+
+	if (ctx->error)
+		drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+		/* that causes us to detach, so the in memory bitmap will be
+		 * gone in a moment as well. */
+
+	device->bm_writ_cnt++;	/* note: not part of the if above, counted in any case */
+	err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error;	/* still in flight: the wait ended early */
+	kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+	return err;
+}
+
+/* NOTE
+ * find_first_bit returns int, we return unsigned long.
+ * For this to work on 32bit arch with bitnumbers > (1<<32),
+ * we'd need to return u64, and get a whole lot of other places
+ * fixed where we still use unsigned long.
+ * Searches from bit bm_fo on for a set (or, if find_zero_bit, a cleared) bit;
+ * returns its bit number (NOT a sector!), or DRBD_END_OF_BITMAP if none.
+ */
+static unsigned long __bm_find_next(struct drbd_device *device, unsigned long bm_fo,
+	const int find_zero_bit)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long *p_addr;
+	unsigned long bit_offset;
+	unsigned i;
+
+
+	if (bm_fo > b->bm_bits) {	/* start beyond the bitmap: log it, report "nothing found" */
+		drbd_err(device, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
+		bm_fo = DRBD_END_OF_BITMAP;
+	} else {
+		while (bm_fo < b->bm_bits) {	/* one page per iteration */
+			/* bit offset of the first bit in the page */
+			bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
+			p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo));
+
+			if (find_zero_bit)
+				i = find_next_zero_bit_le(p_addr,
+						PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
+			else
+				i = find_next_bit_le(p_addr,
+						PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
+
+			__bm_unmap(p_addr);
+			if (i < PAGE_SIZE*8) {	/* hit within this page */
+				bm_fo = bit_offset + i;
+				if (bm_fo >= b->bm_bits)	/* a hit behind the last valid bit does not count */
+					break;
+				goto found;
+			}
+			bm_fo = bit_offset + PAGE_SIZE*8;	/* continue with the first bit of the next page */
+		}
+		bm_fo = DRBD_END_OF_BITMAP;
+	}
+ found:
+	return bm_fo;
+}
+
+/* Locked wrapper around __bm_find_next(): holds bm_lock (irqs off) for the
+ * search.  Returns DRBD_END_OF_BITMAP if there is no bitmap to search. */
+static unsigned long bm_find_next(struct drbd_device *device,
+	unsigned long bm_fo, const int find_zero_bit)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long found;
+
+	if (!expect(b) || !expect(b->bm_pages))
+		return DRBD_END_OF_BITMAP;
+
+	spin_lock_irq(&b->bm_lock);
+	/* BM_DONT_TEST set: let bm_print_lock_info() report this access */
+	if (b->bm_flags & BM_DONT_TEST)
+		bm_print_lock_info(device);
+	found = __bm_find_next(device, bm_fo, find_zero_bit);
+	spin_unlock_irq(&b->bm_lock);
+
+	return found;
+}
+
+unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
+{
+	return bm_find_next(device, bm_fo, 0);	/* 0: find a set bit; DRBD_END_OF_BITMAP if none */
+}
+
+#if 0
+/* not yet needed for anything: locked search for the next cleared bit from bm_fo on. */
+unsigned long drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
+{
+	return bm_find_next(device, bm_fo, 1);	/* 1: find a cleared bit */
+}
+#endif
+
+/* Find the next set bit from bm_fo on; does not spin_lock_irqsave.
+ * you must take drbd_bm_lock() first */
+unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
+{
+	/* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
+	return __bm_find_next(device, bm_fo, 0);	/* 0: find a set bit; DRBD_END_OF_BITMAP if none */
+}
+
+unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
+{
+	/* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
+	return __bm_find_next(device, bm_fo, 1);	/* 1: find a cleared bit; takes no bm_lock itself */
+}
+
+/* Sets (val != 0) or clears the bits s..e inclusive, updates bm_set, and
+ * returns number of bits actually changed:
+ * for val != 0, we change 0 -> 1, return code positive
+ * for val == 0, we change 1 -> 0, return code negative
+ * wants bitnr, not sector; for only a few bits (e - s about BITS_PER_LONG).
+ * Must hold bitmap lock already. */
+static int __bm_change_bits_to(struct drbd_device *device, const unsigned long s,
+	unsigned long e, int val)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long *p_addr = NULL;
+	unsigned long bitnr;
+	unsigned int last_page_nr = -1U;
+	int c = 0;
+	int changed_total = 0;
+
+	if (e >= b->bm_bits) {	/* end out of range: log it and clamp to the last bit */
+		drbd_err(device, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
+				s, e, b->bm_bits);
+		e = b->bm_bits ? b->bm_bits -1 : 0;
+	}
+	for (bitnr = s; bitnr <= e; bitnr++) {
+		unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
+		if (page_nr != last_page_nr) {	/* crossing into another page: settle the previous one */
+			if (p_addr)
+				__bm_unmap(p_addr);
+			if (c < 0)	/* bits were cleared on the previous page */
+				bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+			else if (c > 0)	/* bits were set on the previous page */
+				bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+			changed_total += c;
+			c = 0;
+			p_addr = __bm_map_pidx(b, page_nr);
+			last_page_nr = page_nr;
+		}
+		if (val)
+			c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
+		else
+			c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
+	}
+	if (p_addr)
+		__bm_unmap(p_addr);
+	if (c < 0)	/* same settling for the last page touched */
+		bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+	else if (c > 0)
+		bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+	changed_total += c;
+	b->bm_set += changed_total;
+	return changed_total;
+}
+
+/* returns number of bits actually changed; 0 if there is no bitmap.
+ * for val != 0, we change 0 -> 1, return code positive
+ * for val == 0, we change 1 -> 0, return code negative
+ * wants bitnr, not sector */
+static int bm_change_bits_to(struct drbd_device *device, const unsigned long s,
+	const unsigned long e, int val)
+{
+	unsigned long flags;
+	struct drbd_bitmap *b = device->bitmap;
+	int c = 0;
+
+	if (!expect(b))
+		return 0;	/* nothing was changed: do not report a phantom bit */
+	if (!expect(b->bm_pages))
+		return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
+		bm_print_lock_info(device);
+
+	c = __bm_change_bits_to(device, s, e, val);
+
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return c;
+}
+
+/* Set bits s..e (bit numbers, inclusive); returns number of bits changed 0 -> 1 */
+int drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
+{
+	return bm_change_bits_to(device, s, e, 1);
+}
+
+/* Clear bits s..e (bit numbers, inclusive); returns number of bits changed 1 -> 0 */
+int drbd_bm_clear_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
+{
+	return -bm_change_bits_to(device, s, e, 0);	/* the helper reports clears as a negative count */
+}
+
+/* sets all bits in full words of page page_nr, from first_word up to, but
+ * not including, last_word; adds the newly set bits to bm_set */
+static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
+		int page_nr, int first_word, int last_word)
+{
+	int i;
+	int bits;
+	int changed = 0;
+	unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
+	for (i = first_word; i < last_word; i++) {
+		bits = hweight_long(paddr[i]);	/* bits that were set already */
+		paddr[i] = ~0UL;
+		changed += BITS_PER_LONG - bits;
+	}
+	kunmap_atomic(paddr);
+	if (changed) {
+		/* We only need lazy writeout, the information is still in the
+		 * remote bitmap as well, and is reconstructed during the next
+		 * bitmap exchange, if lost locally due to a crash. */
+		bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
+		b->bm_set += changed;
+	}
+}
+
+/* Same thing as drbd_bm_set_bits,
+ * but more efficient for a large bit range.
+ * You must first drbd_bm_lock(); bm_lock is dropped between pages to reschedule.
+ * Can be called to set the whole bitmap in one go.
+ * Sets bits from s to e _inclusive_. */
+void _drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
+{
+	/* First set_bit from the first bit (s)
+	 * up to the next long boundary (sl),
+	 * then assign full words up to the last long boundary (el),
+	 * then set_bit up to and including the last bit (e).
+	 *
+	 * Do not use memset, because we must account for changes,
+	 * so we need to loop over the words with hweight() anyways.
+	 */
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long sl = ALIGN(s,BITS_PER_LONG);
+	unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
+	int first_page;
+	int last_page;
+	int page_nr;
+	int first_word;
+	int last_word;
+
+	if (e - s <= 3*BITS_PER_LONG) {
+		/* don't bother; el and sl may even be wrong. */
+		spin_lock_irq(&b->bm_lock);
+		__bm_change_bits_to(device, s, e, 1);
+		spin_unlock_irq(&b->bm_lock);
+		return;
+	}
+
+	/* difference is large enough that we can trust sl and el */
+
+	spin_lock_irq(&b->bm_lock);
+
+	/* bits filling the current long */
+	if (sl)	/* sl == 0 only if s == 0: then there are no leading bits */
+		__bm_change_bits_to(device, s, sl-1, 1);
+
+	first_page = sl >> (3 + PAGE_SHIFT);
+	last_page = el >> (3 + PAGE_SHIFT);
+
+	/* MLPP: modulo longs per page */
+	/* LWPP: long words per page */
+	first_word = MLPP(sl >> LN2_BPL);
+	last_word = LWPP;
+
+	/* first and full pages, unless first page == last page */
+	for (page_nr = first_page; page_nr < last_page; page_nr++) {
+		bm_set_full_words_within_one_page(device->bitmap, page_nr, first_word, last_word);
+		spin_unlock_irq(&b->bm_lock);
+		cond_resched();
+		first_word = 0;
+		spin_lock_irq(&b->bm_lock);
+	}
+	/* last page (respectively only page, for first page == last page) */
+	last_word = MLPP(el >> LN2_BPL);
+
+	/* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1. (or multiples).
+	 * ==> e = 32767, el = 32768, last_page = 1 (i.e. a second page, with 4k pages),
+	 * and now last_word = 0.
+	 * We do not want to touch last_page in this case,
+	 * as we did not allocate it, it is not present in bitmap->bm_pages.
+	 */
+	if (last_word)
+		bm_set_full_words_within_one_page(device->bitmap, last_page, first_word, last_word);
+
+	/* possibly trailing bits.
+	 * example: (e & 63) == 63, el will be e+1.
+	 * if that even was the very last bit,
+	 * it would trigger an assert in __bm_change_bits_to()
+	 */
+	if (el <= e)
+		__bm_change_bits_to(device, el, e, 1);
+	spin_unlock_irq(&b->bm_lock);
+}
+
+/* returns bit state
+ * wants bitnr, NOT sector.
+ * inherently racy... area needs to be locked by means of {al,rs}_lru
+ *  1 ... bit set
+ *  0 ... bit not set (also: no bitmap, or bitnr > bm_bits, which is logged)
+ * -1 ... bitnr == bm_bits: first out of bounds access, stop testing for bits!
+ */
+int drbd_bm_test_bit(struct drbd_device *device, const unsigned long bitnr)
+{
+	unsigned long flags;
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long *p_addr;
+	int i;
+
+	if (!expect(b))
+		return 0;
+	if (!expect(b->bm_pages))
+		return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (BM_DONT_TEST & b->bm_flags)
+		bm_print_lock_info(device);
+	if (bitnr < b->bm_bits) {
+		p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
+		i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0;	/* offset within the mapped page */
+		bm_unmap(p_addr);
+	} else if (bitnr == b->bm_bits) {
+		i = -1;
+	} else { /* (bitnr > b->bm_bits) */
+		drbd_err(device, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
+		i = 0;
+	}
+
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return i;
+}
+
+/* returns number of bits set in the range [s, e] (bit numbers); 1 if there is no bitmap */
+int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
+{
+	unsigned long flags;
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long *p_addr = NULL;
+	unsigned long bitnr;
+	unsigned int page_nr = -1U;
+	int c = 0;
+
+	/* If this is called without a bitmap, that is a bug.  But just to be
+	 * robust in case we screwed up elsewhere, in that case pretend there
+	 * was one dirty bit in the requested area, so we won't try to do a
+	 * local read there (no bitmap probably implies no disk) */
+	if (!expect(b))
+		return 1;
+	if (!expect(b->bm_pages))
+		return 1;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (BM_DONT_TEST & b->bm_flags)
+		bm_print_lock_info(device);
+	for (bitnr = s; bitnr <= e; bitnr++) {
+		unsigned int idx = bm_bit_to_page_idx(b, bitnr);
+		if (page_nr != idx) {	/* entering another page: swap the mapping */
+			page_nr = idx;
+			if (p_addr)
+				bm_unmap(p_addr);
+			p_addr = bm_map_pidx(b, idx);
+		}
+		if (expect(bitnr < b->bm_bits))	/* mask, do not shift the 32bit page_nr: that wraps for bitnr >= 2^32 */
+			c += (0 != test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
+		else
+			drbd_err(device, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
+	}
+	if (p_addr)
+		bm_unmap(p_addr);
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return c;
+}
+
+
+/* Returns the number of set bits in bitmap extent enr.  Inherently racy...
+ * return value may be already out-of-date when this function returns.
+ * but the general usage is that this is only used during a cstate when bits are
+ * only cleared, not set, and typically only care for the case when the return
+ * value is zero, or we already "locked" this "bitmap extent" by other means.
+ *
+ * enr is bm-extent number, since we chose to name one sector (512 bytes)
+ * worth of the bitmap a "bitmap extent".
+ *
+ * TODO
+ * I think since we use it like a reference count, we should use the real
+ * reference count of some bitmap extent element from some lru instead...
+ * Returns 0 if enr starts beyond the bitmap (logged) or there is no bitmap.
+ */
+int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	int count, s, e;
+	unsigned long flags;
+	unsigned long *p_addr, *bm;
+
+	if (!expect(b))
+		return 0;
+	if (!expect(b->bm_pages))
+		return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (BM_DONT_TEST & b->bm_flags)
+		bm_print_lock_info(device);
+
+	s = S2W(enr);
+	e = min((size_t)S2W(enr+1), b->bm_words);	/* clamp the end to the size of the bitmap */
+	count = 0;
+	if (s < b->bm_words) {
+		int n = e-s;	/* number of long words to weigh */
+		p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
+		bm = p_addr + MLPP(s);
+		while (n--)
+			count += hweight_long(*bm++);
+		bm_unmap(p_addr);
+	} else {
+		drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s);
+	}
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return count;
+}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
new file mode 100644
index 00000000000..a76ceb344d6
--- /dev/null
+++ b/drivers/block/drbd/drbd_int.h
@@ -0,0 +1,2229 @@
+/*
+  drbd_int.h
+
+  This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+  Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+  Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+  Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+  drbd is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2, or (at your option)
+  any later version.
+
+  drbd is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with drbd; see the file COPYING.  If not, write to
+  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#ifndef _DRBD_INT_H
+#define _DRBD_INT_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#include <linux/ratelimit.h>
+#include <linux/tcp.h>
+#include <linux/mutex.h>
+#include <linux/major.h>
+#include <linux/blkdev.h>
+#include <linux/genhd.h>
+#include <linux/idr.h>
+#include <net/tcp.h>
+#include <linux/lru_cache.h>
+#include <linux/prefetch.h>
+#include <linux/drbd_genl_api.h>
+#include <linux/drbd.h>
+#include "drbd_strings.h"
+#include "drbd_state.h"
+#include "drbd_protocol.h"
+
+#ifdef __CHECKER__
+# define __protected_by(x)       __attribute__((require_context(x,1,999,"rdwr")))
+# define __protected_read_by(x)  __attribute__((require_context(x,1,999,"read")))
+# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write")))
+# define __must_hold(x)       __attribute__((context(x,1,1), require_context(x,1,999,"call")))
+#else
+# define __protected_by(x)
+# define __protected_read_by(x)
+# define __protected_write_by(x)
+# define __must_hold(x)
+#endif
+
+#define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0)
+
+/* module parameter, defined in drbd_main.c */
+extern unsigned int minor_count;
+extern bool disable_sendpage;
+extern bool allow_oos;
+void tl_abort_disk_io(struct drbd_device *device);
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+extern int enable_faults;
+extern int fault_rate;
+extern int fault_devs;
+#endif
+
+extern char usermode_helper[];
+
+
+/* I don't remember why XCPU ...
+ * This is used to wake the asender,
+ * and to interrupt the sending task
+ * on disconnect.
+ */
+#define DRBD_SIG SIGXCPU
+
+/* This is used to stop/restart our threads.
+ * Cannot use SIGTERM nor SIGKILL, since these
+ * are sent out by init on runlevel changes
+ * I choose SIGHUP for now.
+ */
+#define DRBD_SIGKILL SIGHUP
+
+#define ID_IN_SYNC      (4711ULL)
+#define ID_OUT_OF_SYNC  (4712ULL)
+#define ID_SYNCER (-1ULL)
+
+#define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL)
+
+struct drbd_device;
+struct drbd_connection;
+
+#define __drbd_printk_device(level, device, fmt, args...) \
+	dev_printk(level, disk_to_dev((device)->vdisk), fmt, ## args)
+#define __drbd_printk_peer_device(level, peer_device, fmt, args...) \
+	dev_printk(level, disk_to_dev((peer_device)->device->vdisk), fmt, ## args)
+#define __drbd_printk_resource(level, resource, fmt, args...) \
+	printk(level "drbd %s: " fmt, (resource)->name, ## args)
+#define __drbd_printk_connection(level, connection, fmt, args...) \
+	printk(level "drbd %s: " fmt, (connection)->resource->name, ## args)
+
+/* drbd_printk() dispatches on the static type of obj:
+ * __drbd_printk_if_same_type() expands to two comma separated arguments for
+ * __builtin_choose_expr() -- a compile time type check of obj against "type"
+ * (or "const type"), and the call of the matching __drbd_printk_*() helper.
+ * The nested __builtin_choose_expr()s try struct drbd_device *,
+ * struct drbd_resource *, struct drbd_connection * and
+ * struct drbd_peer_device * in that order; for any other type the expression
+ * selected is a call to drbd_printk_with_wrong_object_type().
+ * NOTE(review): presumably that function is intentionally left undefined so
+ * that a wrong object type shows up at link time -- confirm. */
+void drbd_printk_with_wrong_object_type(void);
+
+#define __drbd_printk_if_same_type(obj, type, func, level, fmt, args...) \
+	(__builtin_types_compatible_p(typeof(obj), type) || \
+	 __builtin_types_compatible_p(typeof(obj), const type)), \
+	func(level, (const type)(obj), fmt, ## args)
+
+#define drbd_printk(level, obj, fmt, args...) \
+	__builtin_choose_expr( \
+	  __drbd_printk_if_same_type(obj, struct drbd_device *, \
+			     __drbd_printk_device, level, fmt, ## args), \
+	  __builtin_choose_expr( \
+	    __drbd_printk_if_same_type(obj, struct drbd_resource *, \
+			       __drbd_printk_resource, level, fmt, ## args), \
+	    __builtin_choose_expr( \
+	      __drbd_printk_if_same_type(obj, struct drbd_connection *, \
+				 __drbd_printk_connection, level, fmt, ## args), \
+	      __builtin_choose_expr( \
+		__drbd_printk_if_same_type(obj, struct drbd_peer_device *, \
+				 __drbd_printk_peer_device, level, fmt, ## args), \
+		drbd_printk_with_wrong_object_type()))))
+
+#define drbd_dbg(obj, fmt, args...) \
+	drbd_printk(KERN_DEBUG, obj, fmt, ## args)
+#define drbd_alert(obj, fmt, args...) \
+	drbd_printk(KERN_ALERT, obj, fmt, ## args)
+#define drbd_err(obj, fmt, args...) \
+	drbd_printk(KERN_ERR, obj, fmt, ## args)
+#define drbd_warn(obj, fmt, args...) \
+	drbd_printk(KERN_WARNING, obj, fmt, ## args)
+#define drbd_info(obj, fmt, args...) \
+	drbd_printk(KERN_INFO, obj, fmt, ## args)
+#define drbd_emerg(obj, fmt, args...) \
+	drbd_printk(KERN_EMERG, obj, fmt, ## args)
+
+#define dynamic_drbd_dbg(device, fmt, args...) \
+	dynamic_dev_dbg(disk_to_dev(device->vdisk), fmt, ## args)
+
+/* Log-only assertion: if exp evaluates to false, an error naming the
+ * expression, the file and the line is logged via drbd_err() for the given
+ * object.  Execution continues either way. */
+#define D_ASSERT(device, exp)	do { \
+	if (!(exp)) \
+		drbd_err(device, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__); \
+	} while (0)
+
+/**
+ * expect  -  Make an assertion
+ * @exp: condition expected to be true; evaluated exactly once
+ *
+ * Unlike the assert macro, this macro returns a boolean result: the truth
+ * value of @exp.  If @exp is false, an error naming the expression and the
+ * calling function is logged first.
+ *
+ * The message is logged through drbd_err(device, ...), so a variable named
+ * "device" must be in scope wherever this macro is used.
+ */
+#define expect(exp) ({								\
+		bool _bool = (exp);						\
+		if (!_bool)							\
+			drbd_err(device, "ASSERTION %s FAILED in %s\n",		\
+			        #exp, __func__);				\
+		_bool;								\
+		})
+
+/* Defines to control fault insertion */
+enum {
+	DRBD_FAULT_MD_WR = 0,	/* meta data write */
+	DRBD_FAULT_MD_RD = 1,	/*           read  */
+	DRBD_FAULT_RS_WR = 2,	/* resync          */
+	DRBD_FAULT_RS_RD = 3,
+	DRBD_FAULT_DT_WR = 4,	/* data            */
+	DRBD_FAULT_DT_RD = 5,
+	DRBD_FAULT_DT_RA = 6,	/* data read ahead */
+	DRBD_FAULT_BM_ALLOC = 7,	/* bitmap allocation */
+	DRBD_FAULT_AL_EE = 8,	/* alloc ee */
+	DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */
+
+	DRBD_FAULT_MAX,
+};
+
+extern unsigned int
+_drbd_insert_fault(struct drbd_device *device, unsigned int type);
+
+/* Decide whether a fault of the given type should be injected for device.
+ * type is used as a bit index into enable_faults.
+ * Without CONFIG_DRBD_FAULT_INJECTION this always returns 0; otherwise
+ * _drbd_insert_fault() is only consulted when fault_rate is non-zero and
+ * the bit for this type is set in enable_faults.  Returns 0 or 1. */
+static inline int
+drbd_insert_fault(struct drbd_device *device, unsigned int type)
+{
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+	if (!fault_rate)
+		return 0;
+	if (!(enable_faults & (1<<type)))
+		return 0;
+	return _drbd_insert_fault(device, type) != 0;
+#else
+	return 0;
+#endif
+}
+
+/* integer division, round _UP_ to the next integer.
+ * Note: both A and B are evaluated twice, so the arguments must not have
+ * side effects. */
+#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
+/* usual integer division */
+#define div_floor(A, B) ((A)/(B))
+
+extern struct ratelimit_state drbd_ratelimit_state;
+extern struct idr drbd_devices; /* RCU, updates: genl_lock() */
+extern struct list_head drbd_resources; /* RCU, updates: genl_lock() */
+
+extern const char *cmdname(enum drbd_packet cmd);
+
+/* for sending/receiving the bitmap,
+ * possibly in some encoding scheme */
+struct bm_xfer_ctx {
+	/* "const"
+	 * stores total bits and long words
+	 * of the bitmap, so we don't need to
+	 * call the accessor functions over and again. */
+	unsigned long bm_bits;
+	unsigned long bm_words;
+	/* during xfer, current position within the bitmap */
+	unsigned long bit_offset;
+	unsigned long word_offset;
+
+	/* statistics; index: (h->command == P_BITMAP) */
+	unsigned packets[2];
+	unsigned bytes[2];
+};
+
+extern void INFO_bm_xfer_stats(struct drbd_device *device,
+		const char *direction, struct bm_xfer_ctx *c);
+
+/* Recompute c->word_offset from c->bit_offset.
+ *
+ * word_offset counts "native long words" (32 or 64 bit), aligned at 64 bit.
+ * An encoded packet may end at an unaligned bit offset.  In case a fallback
+ * clear text packet is transmitted in between, we adjust this offset back to
+ * the last 64bit aligned "native long word", which makes coding and decoding
+ * the plain text bitmap much more convenient. */
+static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
+{
+#if BITS_PER_LONG == 64
+	unsigned long words = c->bit_offset >> 6;
+#elif BITS_PER_LONG == 32
+	/* clear the lowest bit, so the word count stays even */
+	unsigned long words = (c->bit_offset >> 5) & ~(1UL);
+#else
+# error "unsupported BITS_PER_LONG"
+#endif
+
+	c->word_offset = words;
+}
+
+extern unsigned int drbd_header_size(struct drbd_connection *connection);
+
+/**********************************************************************/
+enum drbd_thread_state {
+	NONE,
+	RUNNING,
+	EXITING,
+	RESTARTING
+};
+
+struct drbd_thread {
+	spinlock_t t_lock;
+	struct task_struct *task;
+	struct completion stop;
+	enum drbd_thread_state t_state;
+	int (*function) (struct drbd_thread *);
+	struct drbd_resource *resource;
+	struct drbd_connection *connection;
+	int reset_cpu_mask;
+	const char *name;
+};
+
+/* Read the current t_state of a drbd thread without taking a lock. */
+static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
+{
+	enum drbd_thread_state state;
+
+	/* THINK testing the t_state seems to be uncritical in all cases
+	 * (but thread_{start,stop}), so we can read it *without* the lock.
+	 *	--lge */
+	smp_rmb();
+	state = thi->t_state;
+
+	return state;
+}
+
+struct drbd_work {
+	struct list_head list;
+	int (*cb)(struct drbd_work *, int cancel);
+};
+
+struct drbd_device_work {
+	struct drbd_work w;
+	struct drbd_device *device;
+};
+
+#include "drbd_interval.h"
+
+extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *);
+
+struct drbd_request {
+	struct drbd_work w;
+	struct drbd_device *device;
+
+	/* if local IO is not allowed, will be NULL.
+	 * if local IO _is_ allowed, holds the locally submitted bio clone,
+	 * or, after local IO completion, the ERR_PTR(error).
+	 * see drbd_request_endio(). */
+	struct bio *private_bio;
+
+	struct drbd_interval i;
+
+	/* epoch: used to check on "completion" whether this req was in
+	 * the current epoch, and we therefore have to close it,
+	 * causing a p_barrier packet to be sent, starting a new epoch.
+	 *
+	 * This corresponds to "barrier" in struct p_barrier[_ack],
+	 * and to "barrier_nr" in struct drbd_epoch (and various
+	 * comments/function parameters/local variable names).
+	 */
+	unsigned int epoch;
+
+	struct list_head tl_requests; /* ring list in the transfer log */
+	struct bio *master_bio;       /* master bio pointer */
+	unsigned long start_time;
+
+	/* once it hits 0, we may complete the master_bio */
+	atomic_t completion_ref;
+	/* once it hits 0, we may destroy this drbd_request object */
+	struct kref kref;
+
+	unsigned rq_state; /* see comments above _req_mod() */
+};
+
+struct drbd_epoch {
+	struct drbd_connection *connection;
+	struct list_head list;
+	unsigned int barrier_nr;
+	atomic_t epoch_size; /* increased on every request added. */
+	atomic_t active;     /* increased on every req. added, and dec on every finished. */
+	unsigned long flags;
+};
+
+/* Prototype declarations of functions defined in drbd_receiver.c */
+int drbdd_init(struct drbd_thread *);
+int drbd_asender(struct drbd_thread *);
+
+/* drbd_epoch flag bits */
+enum {
+	DE_HAVE_BARRIER_NUMBER,
+};
+
+enum epoch_event {
+	EV_PUT,
+	EV_GOT_BARRIER_NR,
+	EV_BECAME_LAST,
+	EV_CLEANUP = 32, /* used as flag */
+};
+
+struct digest_info {
+	int digest_size;
+	void *digest;
+};
+
+struct drbd_peer_request {
+	struct drbd_work w;
+	struct drbd_peer_device *peer_device;
+	struct drbd_epoch *epoch; /* for writes */
+	struct page *pages;
+	atomic_t pending_bios;
+	struct drbd_interval i;
+	/* see comments on ee flag bits below */
+	unsigned long flags;
+	union {
+		u64 block_id;
+		struct digest_info *digest;
+	};
+};
+
+/* ee flag bits.
+ * While corresponding bios are in flight, the only modification will be
+ * set_bit WAS_ERROR, which has to be atomic.
+ * If no bios are in flight yet, or all have been completed,
+ * non-atomic modification to ee->flags is ok.
+ */
+enum {
+	__EE_CALL_AL_COMPLETE_IO,
+	__EE_MAY_SET_IN_SYNC,
+
+	/* is this a TRIM aka REQ_DISCARD? */
+	__EE_IS_TRIM,
+	/* our lower level cannot handle trim,
+	 * and we want to fall back to zeroout instead */
+	__EE_IS_TRIM_USE_ZEROOUT,
+
+	/* In case a barrier failed,
+	 * we need to resubmit without the barrier flag. */
+	__EE_RESUBMITTED,
+
+	/* we may have several bios per peer request.
+	 * if any of those fail, we set this flag atomically
+	 * from the endio callback */
+	__EE_WAS_ERROR,
+
+	/* This ee has a pointer to a digest instead of a block id */
+	__EE_HAS_DIGEST,
+
+	/* Conflicting local requests need to be restarted after this request */
+	__EE_RESTART_REQUESTS,
+
+	/* The peer wants a write ACK for this (wire proto C) */
+	__EE_SEND_WRITE_ACK,
+
+	/* Is set when net_conf had two_primaries set while creating this peer_req */
+	__EE_IN_INTERVAL_TREE,
+};
+#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
+#define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
+#define EE_IS_TRIM             (1<<__EE_IS_TRIM)
+#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT)
+#define EE_RESUBMITTED         (1<<__EE_RESUBMITTED)
+#define EE_WAS_ERROR           (1<<__EE_WAS_ERROR)
+#define EE_HAS_DIGEST          (1<<__EE_HAS_DIGEST)
+#define EE_RESTART_REQUESTS	(1<<__EE_RESTART_REQUESTS)
+#define EE_SEND_WRITE_ACK	(1<<__EE_SEND_WRITE_ACK)
+#define EE_IN_INTERVAL_TREE	(1<<__EE_IN_INTERVAL_TREE)
+
+/* flag bits per device */
+enum {
+	UNPLUG_REMOTE,		/* sending a "UnplugRemote" could help */
+	MD_DIRTY,		/* current uuids and flags not yet on disk */
+	USE_DEGR_WFC_T,		/* degr-wfc-timeout instead of wfc-timeout. */
+	CL_ST_CHG_SUCCESS,
+	CL_ST_CHG_FAIL,
+	CRASHED_PRIMARY,	/* This node was a crashed primary.
+				 * Gets cleared when the state.conn
+				 * goes into C_CONNECTED state. */
+	CONSIDER_RESYNC,
+
+	MD_NO_FUA,		/* User wants us to not use FUA/FLUSH on meta data dev */
+	SUSPEND_IO,		/* suspend application io */
+	BITMAP_IO,		/* suspend application io;
+				   once no more io in flight, start bitmap io */
+	BITMAP_IO_QUEUED,       /* Started bitmap IO */
+	GO_DISKLESS,		/* Disk is being detached, on io-error or admin request. */
+	WAS_IO_ERROR,		/* Local disk failed, returned IO error */
+	WAS_READ_ERROR,		/* Local disk READ failed (set additionally to the above) */
+	FORCE_DETACH,		/* Force-detach from local disk, aborting any pending local IO */
+	RESYNC_AFTER_NEG,       /* Resync after online grow after the attach&negotiate finished. */
+	RESIZE_PENDING,		/* Size change detected locally, waiting for the response from
+				 * the peer, if it changed there as well. */
+	NEW_CUR_UUID,		/* Create new current UUID when thawing IO */
+	AL_SUSPENDED,		/* Activity logging is currently suspended. */
+	AHEAD_TO_SYNC_SOURCE,   /* Ahead -> SyncSource queued */
+	B_RS_H_DONE,		/* Before resync handler done (already executed) */
+	DISCARD_MY_DATA,	/* discard_my_data flag per volume */
+	READ_BALANCE_RR,
+};
+
+struct drbd_bitmap; /* opaque for drbd_device */
+
+/* definition of bits in bm_flags to be used in drbd_bm_lock
+ * and drbd_bitmap_io and friends. */
+enum bm_flag {
+	/* do we need to kfree, or vfree bm_pages? */
+	BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */
+
+	/* currently locked for bulk operation */
+	BM_LOCKED_MASK = 0xf,
+
+	/* in detail, that is: */
+	BM_DONT_CLEAR = 0x1,
+	BM_DONT_SET   = 0x2,
+	BM_DONT_TEST  = 0x4,
+
+	/* so we can mark it locked for bulk operation,
+	 * and still allow all non-bulk operations */
+	BM_IS_LOCKED  = 0x8,
+
+	/* (test bit, count bit) allowed (common case) */
+	BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED,
+
+	/* testing bits, as well as setting new bits allowed, but clearing bits
+	 * would be unexpected.  Used during bitmap receive.  Setting new bits
+	 * requires sending of "out-of-sync" information, though. */
+	BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED,
+
+	/* for drbd_bm_write_copy_pages, everything is allowed,
+	 * only concurrent bulk operations are locked out. */
+	BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED,
+};
+
+struct drbd_work_queue {
+	struct list_head q;
+	spinlock_t q_lock;  /* to protect the list. */
+	wait_queue_head_t q_wait;
+};
+
+struct drbd_socket {
+	struct mutex mutex;
+	struct socket    *socket;
+	/* this way we get our
+	 * send/receive buffers off the stack */
+	void *sbuf;
+	void *rbuf;
+};
+
+struct drbd_md {
+	u64 md_offset;		/* sector offset to 'super' block */
+
+	u64 la_size_sect;	/* last agreed size, unit sectors */
+	spinlock_t uuid_lock;
+	u64 uuid[UI_SIZE];
+	u64 device_uuid;
+	u32 flags;
+	u32 md_size_sect;
+
+	s32 al_offset;	/* signed relative sector offset to activity log */
+	s32 bm_offset;	/* signed relative sector offset to bitmap */
+
+	/* cached value of bdev->disk_conf->meta_dev_idx (see below) */
+	s32 meta_dev_idx;
+
+	/* see al_tr_number_to_on_disk_sector() */
+	u32 al_stripes;
+	u32 al_stripe_size_4k;
+	u32 al_size_4k; /* cached product of the above */
+};
+
+struct drbd_backing_dev {
+	struct block_device *backing_bdev;
+	struct block_device *md_bdev;
+	struct drbd_md md;
+	struct disk_conf *disk_conf; /* RCU, for updates: resource->conf_update */
+	sector_t known_size; /* last known size of that backing device */
+};
+
+struct drbd_md_io {
+	unsigned int done;
+	int error;
+};
+
+struct bm_io_work {
+	struct drbd_work w;
+	char *why;
+	enum bm_flag flags;
+	int (*io_fn)(struct drbd_device *device);
+	void (*done)(struct drbd_device *device, int rv);
+};
+
+enum write_ordering_e {
+	WO_none,
+	WO_drain_io,
+	WO_bdev_flush,
+};
+
+/* Header followed by a variable number of integer values.
+ * The values are stored directly behind the header, so the structure must be
+ * allocated with room for them.
+ * NOTE(review): presumably fifo_alloc() allocates room for "size" entries
+ * and head_index is the current position within values[] -- confirm
+ * against the users. */
+struct fifo_buffer {
+	unsigned int head_index;
+	unsigned int size;
+	int total; /* sum of all values */
+	int values[];	/* C99 flexible array member, replaces the GNU values[0] */
+};
+extern struct fifo_buffer *fifo_alloc(int fifo_size);
+
+/* flag bits per connection */
+enum {
+	NET_CONGESTED,		/* The data socket is congested */
+	RESOLVE_CONFLICTS,	/* Set on one node, cleared on the peer! */
+	SEND_PING,		/* whether asender should send a ping asap */
+	SIGNAL_ASENDER,		/* whether asender wants to be interrupted */
+	GOT_PING_ACK,		/* set when we receive a ping_ack packet, ping_wait gets woken */
+	CONN_WD_ST_CHG_REQ,	/* A cluster wide state change on the connection is active */
+	CONN_WD_ST_CHG_OKAY,
+	CONN_WD_ST_CHG_FAIL,
+	CONN_DRY_RUN,		/* Expect disconnect after resync handshake. */
+	CREATE_BARRIER,		/* next P_DATA is preceded by a P_BARRIER */
+	STATE_SENT,		/* Do not change state/UUIDs while this is set */
+	CALLBACK_PENDING,	/* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
+				 * pending, from drbd worker context.
+				 * If set, bdi_write_congested() returns true,
+				 * so shrink_page_list() would not recurse into,
+				 * and potentially deadlock on, this drbd worker.
+				 */
+	DISCONNECT_SENT,
+};
+
+struct drbd_resource {
+	char *name;
+	struct kref kref;
+	struct idr devices;		/* volume number to device mapping */
+	struct list_head connections;
+	struct list_head resources;
+	struct res_opts res_opts;
+	struct mutex conf_update;	/* mutex for read-copy-update of net_conf and disk_conf */
+	struct mutex adm_mutex;		/* mutex to serialize administrative requests */
+	spinlock_t req_lock;
+
+	unsigned susp:1;		/* IO suspended by user */
+	unsigned susp_nod:1;		/* IO suspended because no data */
+	unsigned susp_fen:1;		/* IO suspended because fence peer handler runs */
+
+	cpumask_var_t cpu_mask;
+};
+
+struct drbd_connection {
+	struct list_head connections;
+	struct drbd_resource *resource;
+	struct kref kref;
+	struct idr peer_devices;	/* volume number to peer device mapping */
+	enum drbd_conns cstate;		/* Only C_STANDALONE to C_WF_REPORT_PARAMS */
+	struct mutex cstate_mutex;	/* Protects graceful disconnects */
+	unsigned int connect_cnt;	/* Inc each time a connection is established */
+
+	unsigned long flags;
+	struct net_conf *net_conf;	/* content protected by rcu */
+	wait_queue_head_t ping_wait;	/* Woken upon reception of a ping, and a state change */
+
+	struct sockaddr_storage my_addr;
+	int my_addr_len;
+	struct sockaddr_storage peer_addr;
+	int peer_addr_len;
+
+	struct drbd_socket data;	/* data/barrier/cstate/parameter packets */
+	struct drbd_socket meta;	/* ping/ack (metadata) packets */
+	int agreed_pro_version;		/* actually used protocol version */
+	u32 agreed_features;
+	unsigned long last_received;	/* in jiffies, either socket */
+	unsigned int ko_count;
+
+	struct list_head transfer_log;	/* all requests not yet fully processed */
+
+	struct crypto_hash *cram_hmac_tfm;
+	struct crypto_hash *integrity_tfm;  /* checksums we compute, updates protected by connection->data->mutex */
+	struct crypto_hash *peer_integrity_tfm;  /* checksums we verify, only accessed from receiver thread  */
+	struct crypto_hash *csums_tfm;
+	struct crypto_hash *verify_tfm;
+	void *int_dig_in;
+	void *int_dig_vv;
+
+	/* receiver side */
+	struct drbd_epoch *current_epoch;
+	spinlock_t epoch_lock;
+	unsigned int epochs;
+	enum write_ordering_e write_ordering;
+	atomic_t current_tle_nr;	/* transfer log epoch number */
+	unsigned current_tle_writes;	/* writes seen within this tl epoch */
+
+	unsigned long last_reconnect_jif;
+	struct drbd_thread receiver;
+	struct drbd_thread worker;
+	struct drbd_thread asender;
+
+	/* sender side */
+	struct drbd_work_queue sender_work;
+
+	struct {
+		/* whether this sender thread
+		 * has processed a single write yet. */
+		bool seen_any_write_yet;
+
+		/* Which barrier number to send with the next P_BARRIER */
+		int current_epoch_nr;
+
+		/* how many write requests have been sent
+		 * with req->epoch == current_epoch_nr.
+		 * If none, no P_BARRIER will be sent. */
+		unsigned current_epoch_writes;
+	} send;
+};
+
+struct submit_worker {
+	struct workqueue_struct *wq;
+	struct work_struct worker;
+
+	spinlock_t lock;
+	struct list_head writes;
+};
+
+struct drbd_peer_device {
+	struct list_head peer_devices;
+	struct drbd_device *device;
+	struct drbd_connection *connection;
+};
+
+struct drbd_device {
+	struct drbd_resource *resource;
+	struct list_head peer_devices;
+	int vnr;			/* volume number within the connection */
+	struct kref kref;
+
+	/* things that are stored as / read from meta data on disk */
+	unsigned long flags;
+
+	/* configured by drbdsetup */
+	struct drbd_backing_dev *ldev __protected_by(local);
+
+	sector_t p_size;     /* partner's disk size */
+	struct request_queue *rq_queue;
+	struct block_device *this_bdev;
+	struct gendisk	    *vdisk;
+
+	unsigned long last_reattach_jif;
+	struct drbd_work resync_work;
+	struct drbd_work unplug_work;
+	struct drbd_work go_diskless;
+	struct drbd_work md_sync_work;
+	struct drbd_work start_resync_work;
+	struct timer_list resync_timer;
+	struct timer_list md_sync_timer;
+	struct timer_list start_resync_timer;
+	struct timer_list request_timer;
+#ifdef DRBD_DEBUG_MD_SYNC
+	struct {
+		unsigned int line;
+		const char* func;
+	} last_md_mark_dirty;
+#endif
+
+	/* Used after attach while negotiating new disk state. */
+	union drbd_state new_state_tmp;
+
+	union drbd_dev_state state;
+	wait_queue_head_t misc_wait;
+	wait_queue_head_t state_wait;  /* upon each state change. */
+	unsigned int send_cnt;
+	unsigned int recv_cnt;
+	unsigned int read_cnt;
+	unsigned int writ_cnt;
+	unsigned int al_writ_cnt;
+	unsigned int bm_writ_cnt;
+	atomic_t ap_bio_cnt;	 /* Requests we need to complete */
+	atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
+	atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
+	atomic_t unacked_cnt;	 /* Need to send replies for */
+	atomic_t local_cnt;	 /* Waiting for local completion */
+
+	/* Interval tree of pending local requests */
+	struct rb_root read_requests;
+	struct rb_root write_requests;
+
+	/* blocks to resync in this run [unit BM_BLOCK_SIZE] */
+	unsigned long rs_total;
+	/* number of resync blocks that failed in this run */
+	unsigned long rs_failed;
+	/* Syncer's start time [unit jiffies] */
+	unsigned long rs_start;
+	/* cumulated time in PausedSyncX state [unit jiffies] */
+	unsigned long rs_paused;
+	/* skipped because csum was equal [unit BM_BLOCK_SIZE] */
+	unsigned long rs_same_csum;
+#define DRBD_SYNC_MARKS 8
+#define DRBD_SYNC_MARK_STEP (3*HZ)
+	/* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
+	unsigned long rs_mark_left[DRBD_SYNC_MARKS];
+	/* mark's time [unit jiffies] */
+	unsigned long rs_mark_time[DRBD_SYNC_MARKS];
+	/* current index into rs_mark_{left,time} */
+	int rs_last_mark;
+	unsigned long rs_last_bcast; /* [unit jiffies] */
+
+	/* where does the admin want us to start? (sector) */
+	sector_t ov_start_sector;
+	sector_t ov_stop_sector;
+	/* where are we now? (sector) */
+	sector_t ov_position;
+	/* Start sector of out of sync range (to merge printk reporting). */
+	sector_t ov_last_oos_start;
+	/* size of out-of-sync range in sectors. */
+	sector_t ov_last_oos_size;
+	unsigned long ov_left; /* in bits */
+
+	struct drbd_bitmap *bitmap;
+	unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
+
+	/* Used to track operations of resync... */
+	struct lru_cache *resync;
+	/* Number of locked elements in resync LRU */
+	unsigned int resync_locked;
+	/* resync extent number waiting for application requests */
+	unsigned int resync_wenr;
+
+	int open_cnt;
+	u64 *p_uuid;
+
+	struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
+	struct list_head sync_ee;   /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
+	struct list_head done_ee;   /* need to send P_WRITE_ACK */
+	struct list_head read_ee;   /* [RS]P_DATA_REQUEST being read */
+	struct list_head net_ee;    /* zero-copy network send in progress */
+
+	int next_barrier_nr;
+	struct list_head resync_reads;
+	atomic_t pp_in_use;		/* allocated from page pool */
+	atomic_t pp_in_use_by_net;	/* sendpage()d, still referenced by tcp */
+	wait_queue_head_t ee_wait;
+	struct page *md_io_page;	/* one page buffer for md_io */
+	struct drbd_md_io md_io;
+	atomic_t md_io_in_use;		/* protects the md_io, md_io_page and md_io_tmpp */
+	spinlock_t al_lock;
+	wait_queue_head_t al_wait;
+	struct lru_cache *act_log;	/* activity log */
+	unsigned int al_tr_number;
+	int al_tr_cycle;
+	wait_queue_head_t seq_wait;
+	atomic_t packet_seq;
+	unsigned int peer_seq;
+	spinlock_t peer_seq_lock;
+	unsigned int minor;
+	unsigned long comm_bm_set; /* communicated number of set bits. */
+	struct bm_io_work bm_io_work;
+	u64 ed_uuid; /* UUID of the exposed data */
+	struct mutex own_state_mutex;
+	struct mutex *state_mutex; /* either own_state_mutex or first_peer_device(device)->connection->cstate_mutex */
+	char congestion_reason;  /* Why we were congested... */
+	atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
+	atomic_t rs_sect_ev; /* for submitted resync data rate, both */
+	int rs_last_sect_ev; /* counter to compare with */
+	int rs_last_events;  /* counter of read or write "events" (unit sectors)
+			      * on the lower level device when we last looked. */
+	int c_sync_rate; /* current resync rate after syncer throttle magic */
+	struct fifo_buffer *rs_plan_s; /* correction values of resync planner (RCU, connection->conn_update) */
+	int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
+	atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
+	unsigned int peer_max_bio_size;
+	unsigned int local_max_bio_size;
+
+	/* any requests that would block in drbd_make_request()
+	 * are deferred to this single-threaded work queue */
+	struct submit_worker submit;
+};
+
+struct drbd_config_context {
+	/* assigned from drbd_genlmsghdr */
+	unsigned int minor;
+	/* assigned from request attributes, if present */
+	unsigned int volume;
+#define VOLUME_UNSPECIFIED		(-1U)
+	/* pointer into the request skb,
+	 * limited lifetime! */
+	char *resource_name;
+	struct nlattr *my_addr;
+	struct nlattr *peer_addr;
+
+	/* reply buffer */
+	struct sk_buff *reply_skb;
+	/* pointer into reply buffer */
+	struct drbd_genlmsghdr *reply_dh;
+	/* resolved from attributes, if possible */
+	struct drbd_device *device;
+	struct drbd_resource *resource;
+	struct drbd_connection *connection;
+};
+
+/* Look up @minor in the global drbd_devices idr and hand back whatever
+ * idr_find() returns for that id. */
+static inline struct drbd_device *minor_to_device(unsigned int minor)
+{
+	struct drbd_device *device = idr_find(&drbd_devices, minor);
+
+	return device;
+}
+
+/* First entry on device->peer_devices, as returned by
+ * list_first_entry_or_null() (i.e. NULL for an empty list). */
+static inline struct drbd_peer_device *first_peer_device(struct drbd_device *device)
+{
+	struct list_head *peers = &device->peer_devices;
+
+	return list_first_entry_or_null(peers, struct drbd_peer_device, peer_devices);
+}
+
+#define for_each_resource(resource, _resources) \
+	list_for_each_entry(resource, _resources, resources)
+
+#define for_each_resource_rcu(resource, _resources) \
+	list_for_each_entry_rcu(resource, _resources, resources)
+
+#define for_each_resource_safe(resource, tmp, _resources) \
+	list_for_each_entry_safe(resource, tmp, _resources, resources)
+
+#define for_each_connection(connection, resource) \
+	list_for_each_entry(connection, &resource->connections, connections)
+
+#define for_each_connection_rcu(connection, resource) \
+	list_for_each_entry_rcu(connection, &resource->connections, connections)
+
+#define for_each_connection_safe(connection, tmp, resource) \
+	list_for_each_entry_safe(connection, tmp, &resource->connections, connections)
+
+#define for_each_peer_device(peer_device, device) \
+	list_for_each_entry(peer_device, &device->peer_devices, peer_devices)
+
+#define for_each_peer_device_rcu(peer_device, device) \
+	list_for_each_entry_rcu(peer_device, &device->peer_devices, peer_devices)
+
+#define for_each_peer_device_safe(peer_device, tmp, device) \
+	list_for_each_entry_safe(peer_device, tmp, &device->peer_devices, peer_devices)
+
+/* Accessor for the minor number stored in the device. */
+static inline unsigned int device_to_minor(struct drbd_device *device)
+{
+	unsigned int minor = device->minor;
+
+	return minor;
+}
+
+/*
+ * function declarations
+ *************************/
+
+/* drbd_main.c */
+
+enum dds_flags {
+	DDSF_FORCED    = 1,
+	DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */
+};
+
+extern void drbd_init_set_defaults(struct drbd_device *device);
+extern int  drbd_thread_start(struct drbd_thread *thi);
+extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
+#ifdef CONFIG_SMP
+extern void drbd_thread_current_set_cpu(struct drbd_thread *thi);
+#else
+#define drbd_thread_current_set_cpu(A) ({})
+#endif
+extern void tl_release(struct drbd_connection *, unsigned int barrier_nr,
+		       unsigned int set_size);
+extern void tl_clear(struct drbd_connection *);
+extern void drbd_free_sock(struct drbd_connection *connection);
+extern int drbd_send(struct drbd_connection *connection, struct socket *sock,
+		     void *buf, size_t size, unsigned msg_flags);
+extern int drbd_send_all(struct drbd_connection *, struct socket *, void *, size_t,
+			 unsigned);
+
+extern int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd);
+extern int drbd_send_protocol(struct drbd_connection *connection);
+extern int drbd_send_uuids(struct drbd_peer_device *);
+extern int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *);
+extern void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *);
+extern int drbd_send_sizes(struct drbd_peer_device *, int trigger_reply, enum dds_flags flags);
+extern int drbd_send_state(struct drbd_peer_device *, union drbd_state s);
+extern int drbd_send_current_state(struct drbd_peer_device *);
+extern int drbd_send_sync_param(struct drbd_peer_device *);
+extern void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr,
+			    u32 set_size);
+extern int drbd_send_ack(struct drbd_peer_device *, enum drbd_packet,
+			 struct drbd_peer_request *);
+extern void drbd_send_ack_rp(struct drbd_peer_device *, enum drbd_packet,
+			     struct p_block_req *rp);
+extern void drbd_send_ack_dp(struct drbd_peer_device *, enum drbd_packet,
+			     struct p_data *dp, int data_size);
+extern int drbd_send_ack_ex(struct drbd_peer_device *, enum drbd_packet,
+			    sector_t sector, int blksize, u64 block_id);
+extern int drbd_send_out_of_sync(struct drbd_peer_device *, struct drbd_request *);
+extern int drbd_send_block(struct drbd_peer_device *, enum drbd_packet,
+			   struct drbd_peer_request *);
+extern int drbd_send_dblock(struct drbd_peer_device *, struct drbd_request *req);
+extern int drbd_send_drequest(struct drbd_peer_device *, int cmd,
+			      sector_t sector, int size, u64 block_id);
+extern int drbd_send_drequest_csum(struct drbd_peer_device *, sector_t sector,
+				   int size, void *digest, int digest_size,
+				   enum drbd_packet cmd);
+extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int size);
+
+extern int drbd_send_bitmap(struct drbd_device *device);
+extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
+extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
+extern void drbd_free_bc(struct drbd_backing_dev *ldev);
+extern void drbd_device_cleanup(struct drbd_device *device);
+void drbd_print_uuids(struct drbd_device *device, const char *text);
+
+extern void conn_md_sync(struct drbd_connection *connection);
+extern void drbd_md_write(struct drbd_device *device, void *buffer);
+extern void drbd_md_sync(struct drbd_device *device);
+extern int  drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev);
+extern void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local);
+extern void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local);
+extern void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local);
+extern void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void drbd_md_set_flag(struct drbd_device *device, int flags) __must_hold(local);
+extern void drbd_md_clear_flag(struct drbd_device *device, int flags)__must_hold(local);
+extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
+#ifndef DRBD_DEBUG_MD_SYNC
+extern void drbd_md_mark_dirty(struct drbd_device *device);
+#else
+#define drbd_md_mark_dirty(m)	drbd_md_mark_dirty_(m, __LINE__ , __func__ )
+extern void drbd_md_mark_dirty_(struct drbd_device *device,
+		unsigned int line, const char *func);
+#endif
+extern void drbd_queue_bitmap_io(struct drbd_device *device,
+				 int (*io_fn)(struct drbd_device *),
+				 void (*done)(struct drbd_device *, int),
+				 char *why, enum bm_flag flags);
+extern int drbd_bitmap_io(struct drbd_device *device,
+		int (*io_fn)(struct drbd_device *),
+		char *why, enum bm_flag flags);
+extern int drbd_bitmap_io_from_worker(struct drbd_device *device,
+		int (*io_fn)(struct drbd_device *),
+		char *why, enum bm_flag flags);
+extern int drbd_bmio_set_n_write(struct drbd_device *device);
+extern int drbd_bmio_clear_n_write(struct drbd_device *device);
+extern void drbd_ldev_destroy(struct drbd_device *device);
+
+/* Meta data layout
+ *
+ * We currently have two possible layouts.
+ * Offsets in (512 byte) sectors.
+ * external:
+ *   |----------- md_size_sect ------------------|
+ *   [ 4k superblock ][ activity log ][  Bitmap  ]
+ *   | al_offset == 8 |
+ *   | bm_offset = al_offset + X      |
+ *  ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ *  Variants:
+ *     old, indexed fixed size meta data:
+ *
+ * internal:
+ *            |----------- md_size_sect ------------------|
+ * [data.....][  Bitmap  ][ activity log ][ 4k superblock ][padding*]
+ *                        | al_offset < 0 |
+ *            | bm_offset = al_offset - Y |
+ *  ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ *  [padding*] are zero or up to 7 unused 512 Byte sectors to the
+ *  end of the device, so that the [4k superblock] will be 4k aligned.
+ *
+ *  The activity log consists of 4k transaction blocks,
+ *  which are written in a ring-buffer, or striped ring-buffer like fashion.
+ *  The activity log size used to be fixed 32kB,
+ *  but is about to become configurable.
+ */
+
+/* Our old fixed size meta data layout
+ * allows up to about 3.8TB, so if you want more,
+ * you need to use the "flexible" meta data format. */
+#define MD_128MB_SECT (128LLU << 11)  /* 128 MB, unit sectors */
+#define MD_4kB_SECT	 8	/* 4 kB, unit sectors */
+#define MD_32kB_SECT	64	/* 32 kB, unit sectors */
+
+/* One activity log extent represents 4M of storage */
+#define AL_EXTENT_SHIFT 22
+#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
+
+/* We could make these currently hardcoded constants configurable
+ * variables at create-md time (or even re-configurable at runtime?).
+ * Which will require some more changes to the DRBD "super block"
+ * and attach code.
+ *
+ * updates per transaction:
+ *   This many changes to the active set can be logged with one transaction.
+ *   This number is arbitrary.
+ * context per transaction:
+ *   This many context extent numbers are logged with each transaction.
+ *   This number is resulting from the transaction block size (4k), the layout
+ *   of the transaction header, and the number of updates per transaction.
+ *   See drbd_actlog.c:struct al_transaction_on_disk
+ * */
+#define AL_UPDATES_PER_TRANSACTION	 64	// arbitrary
+#define AL_CONTEXT_PER_TRANSACTION	919	// (4096 - 36 - 6*64)/4
+
+#if BITS_PER_LONG == 32
+#define LN2_BPL 5	/* log2(BITS_PER_LONG) */
+#define cpu_to_lel(A) cpu_to_le32(A)	/* "lel" helpers pick the le32/le64 variant matching BITS_PER_LONG */
+#define lel_to_cpu(A) le32_to_cpu(A)
+#elif BITS_PER_LONG == 64
+#define LN2_BPL 6	/* log2(BITS_PER_LONG) */
+#define cpu_to_lel(A) cpu_to_le64(A)
+#define lel_to_cpu(A) le64_to_cpu(A)
+#else
+#error "LN2 of BITS_PER_LONG unknown!"
+#endif
+
+/* resync bitmap */
+/* 16MB sized 'bitmap extent' to track syncer usage */
+struct bm_extent {
+	int rs_left; /* number of bits set (out of sync) in this extent. */
+	int rs_failed; /* number of failed resync requests in this extent. */
+	unsigned long flags;	/* BME_* bit numbers, see the defines below */
+	struct lc_element lce;	/* NOTE(review): presumably links this extent into an lru_cache -- confirm */
+};
+
+#define BME_NO_WRITES  0  /* bm_extent.flags: no more requests on this one! */
+#define BME_LOCKED     1  /* bm_extent.flags: syncer active on this one. */
+#define BME_PRIORITY   2  /* finish resync IO on this extent ASAP! App IO waiting! */
+
+/* drbd_bitmap.c */
+/*
+ * We need to store one bit for a block.
+ * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
+ * Bit 0 ==> local node thinks this block is binary identical on both nodes
+ * Bit 1 ==> local node thinks this block needs to be synced.
+ */
+
+#define SLEEP_TIME (HZ/10)
+
+/* We do bitmap IO in units of 4k blocks.
+ * We also still have a hardcoded 4k per bit relation. */
+#define BM_BLOCK_SHIFT	12			 /* 4k per bit */
+#define BM_BLOCK_SIZE	 (1<<BM_BLOCK_SHIFT)
+/* mostly arbitrarily set the represented size of one bitmap extent,
+ * aka resync extent, to 16 MiB (which is also 512 Byte worth of bitmap
+ * at 4k per bit resolution) */
+#define BM_EXT_SHIFT	 24	/* 16 MiB per resync extent */
+#define BM_EXT_SIZE	 (1<<BM_EXT_SHIFT)
+
+#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
+#error "HAVE YOU FIXED drbdmeta AS WELL??"
+#endif
+
+/* thus many _storage_ sectors are described by one bit */
+#define BM_SECT_TO_BIT(x)   ((x)>>(BM_BLOCK_SHIFT-9))
+#define BM_BIT_TO_SECT(x)   ((sector_t)(x)<<(BM_BLOCK_SHIFT-9))
+#define BM_SECT_PER_BIT     BM_BIT_TO_SECT(1)
+
+/* bit to represented kilo byte conversion */
+#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10))
+
+/* in which _bitmap_ extent (resp. sector) the bit for a certain
+ * _storage_ sector is located in */
+#define BM_SECT_TO_EXT(x)   ((x)>>(BM_EXT_SHIFT-9))
+
+/* how many _storage_ sectors we have per bitmap sector */
+#define BM_EXT_TO_SECT(x)   ((sector_t)(x) << (BM_EXT_SHIFT-9))
+#define BM_SECT_PER_EXT     BM_EXT_TO_SECT(1)
+
+/* in one sector of the bitmap, we have this many activity_log extents. */
+#define AL_EXT_PER_BM_SECT  (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
+
+#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
+#define BM_BLOCKS_PER_BM_EXT_MASK  ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
+
+/* the extent in "PER_EXTENT" below is an activity log extent
+ * we need that many (long words/bytes) to store the bitmap
+ *		     of one AL_EXTENT_SIZE chunk of storage.
+ * we can store the bitmap for that many AL_EXTENTS within
+ * one sector of the _on_disk_ bitmap:
+ * bit	 0	  bit 37   bit 38	     bit (512*8)-1
+ *	     ...|........|........|.. // ..|........|
+ * sect. 0	 `296	  `304			   ^(512*8*8)-1
+ *
+#define BM_WORDS_PER_EXT    ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG )
+#define BM_BYTES_PER_EXT    ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 )  // 128
+#define BM_EXT_PER_SECT	    ( 512 / BM_BYTES_PER_EXTENT )	 //   4
+ */
+
+#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
+/* we have a certain meta data variant that has a fixed on-disk size of 128
+ * MiB, of which 4k are our "superblock", and 32k are the fixed size activity
+ * log, leaving this many sectors for the bitmap.
+ */
+
+#define DRBD_MAX_SECTORS_FIXED_BM \
+	  ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9)))
+#if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
+#define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_32
+#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
+#else
+#define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_FIXED_BM
+/* 16 TB in units of sectors */
+#if BITS_PER_LONG == 32
+/* adjust by one page worth of bitmap,
+ * so we won't wrap around in drbd_bm_find_next_bit.
+ * you should use 64bit OS for that much storage, anyways. */
+#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
+#else
+/* we allow up to 1 PiB now on 64bit architecture with "flexible" meta data */
+#define DRBD_MAX_SECTORS_FLEX (1UL << 51)
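+/* corresponds to (1UL << 38) bits right now. NOTE(review): (1UL << 38) bits at 4k per bit is 1 PiB == (1UL << 41) sectors, while (1UL << 51) sectors is 1 EiB -- confirm the intended limit. */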
+#endif
+#endif
+
+/* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE,
+ * so for typical PAGE_CACHE_SIZE of 4k, that is (1<<20) Byte.
+ * Since we may live in a mixed-platform cluster,
+ * we limit us to a platform agnostic constant here for now.
+ * A followup commit may allow even bigger BIO sizes,
+ * once we have thought that through. */
+#define DRBD_MAX_BIO_SIZE (1U << 20)
+#if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+#error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+#endif
+#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12)       /* Works always = 4k */
+
+#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
+#define DRBD_MAX_BIO_SIZE_P95    (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
+
+/* For now, don't allow more than one activity log extent worth of data
+ * to be discarded in one go. We may need to rework drbd_al_begin_io()
+ * to allow for even larger discard ranges */
+#define DRBD_MAX_DISCARD_SIZE	AL_EXTENT_SIZE
+#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9)
+
+extern int  drbd_bm_init(struct drbd_device *device);
+extern int  drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits);
+extern void drbd_bm_cleanup(struct drbd_device *device);
+extern void drbd_bm_set_all(struct drbd_device *device);
+extern void drbd_bm_clear_all(struct drbd_device *device);
+/* set/clear/test only a few bits at a time */
+extern int  drbd_bm_set_bits(
+		struct drbd_device *device, unsigned long s, unsigned long e);
+extern int  drbd_bm_clear_bits(
+		struct drbd_device *device, unsigned long s, unsigned long e);
+extern int drbd_bm_count_bits(
+	struct drbd_device *device, const unsigned long s, const unsigned long e);
+/* bm_set_bits variant for use while holding drbd_bm_lock,
+ * may process the whole bitmap in one go */
+extern void _drbd_bm_set_bits(struct drbd_device *device,
+		const unsigned long s, const unsigned long e);
+extern int  drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr);
+extern int  drbd_bm_e_weight(struct drbd_device *device, unsigned long enr);
+extern int  drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local);
+extern int  drbd_bm_read(struct drbd_device *device) __must_hold(local);
+extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr);
+extern int  drbd_bm_write(struct drbd_device *device) __must_hold(local);
+extern int  drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local);
+extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local);
+extern int  drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local);
+extern size_t	     drbd_bm_words(struct drbd_device *device);
+extern unsigned long drbd_bm_bits(struct drbd_device *device);
+extern sector_t      drbd_bm_capacity(struct drbd_device *device);
+
+#define DRBD_END_OF_BITMAP	(~(unsigned long)0)
+extern unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo);
+/* bm_find_next variants for use while you hold drbd_bm_lock() */
+extern unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo);
+extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo);
+extern unsigned long _drbd_bm_total_weight(struct drbd_device *device);
+extern unsigned long drbd_bm_total_weight(struct drbd_device *device);
+extern int drbd_bm_rs_done(struct drbd_device *device);
+/* for receive_bitmap */
+extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset,
+		size_t number, unsigned long *buffer);
+/* for _drbd_send_bitmap */
+extern void drbd_bm_get_lel(struct drbd_device *device, size_t offset,
+		size_t number, unsigned long *buffer);
+
+extern void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags);
+extern void drbd_bm_unlock(struct drbd_device *device);
+/* drbd_main.c */
+
+extern struct kmem_cache *drbd_request_cache;
+extern struct kmem_cache *drbd_ee_cache;	/* peer requests */
+extern struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
+extern struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
+extern mempool_t *drbd_request_mempool;
+extern mempool_t *drbd_ee_mempool;
+
+/* drbd's page pool, used to buffer data received from the peer,
+ * or data requested by the peer.
+ *
+ * This does not have an emergency reserve.
+ *
+ * When allocating from this pool, it first takes pages from the pool.
+ * Only if the pool is depleted will it try to allocate from the system.
+ *
+ * The assumption is that pages taken from this pool will be processed,
+ * and given back, "quickly", and then can be recycled, so we can avoid
+ * frequent calls to alloc_page(), and still will be able to make progress even
+ * under memory pressure.
+ */
+extern struct page *drbd_pp_pool;
+extern spinlock_t   drbd_pp_lock;
+extern int	    drbd_pp_vacant;
+extern wait_queue_head_t drbd_pp_wait;
+
+/* We also need a standard (emergency-reserve backed) page pool
+ * for meta data IO (activity log, bitmap).
+ * We can keep it global, as long as it is used as "N pages at a time".
+ * 128 should be plenty, currently we probably can get away with as few as 1.
+ */
+#define DRBD_MIN_POOL_PAGES	128
+extern mempool_t *drbd_md_io_page_pool;
+
+/* We also need to make sure we get a bio
+ * when we need it for housekeeping purposes */
+extern struct bio_set *drbd_md_io_bio_set;
+/* to allocate from that set */
+extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
+
+extern rwlock_t global_state_lock;
+
+extern int conn_lowest_minor(struct drbd_connection *connection);
+extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor);
+extern void drbd_destroy_device(struct kref *kref);
+extern void drbd_delete_device(struct drbd_device *device);
+
+extern struct drbd_resource *drbd_create_resource(const char *name);
+extern void drbd_free_resource(struct drbd_resource *resource);
+
+extern int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts);
+extern struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts);
+extern void drbd_destroy_connection(struct kref *kref);
+extern struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len,
+					    void *peer_addr, int peer_addr_len);
+extern struct drbd_resource *drbd_find_resource(const char *name);
+extern void drbd_destroy_resource(struct kref *kref);
+extern void conn_free_crypto(struct drbd_connection *connection);
+
+extern int proc_details;
+
+/* drbd_req */
+extern void do_submit(struct work_struct *ws);
+extern void __drbd_make_request(struct drbd_device *, struct bio *, unsigned long);
+extern void drbd_make_request(struct request_queue *q, struct bio *bio);
+extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req);
+extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
+extern int is_valid_ar_handle(struct drbd_request *, sector_t);
+
+
+/* drbd_nl.c */
+extern int drbd_msg_put_info(struct sk_buff *skb, const char *info);
+extern void drbd_suspend_io(struct drbd_device *device);
+extern void drbd_resume_io(struct drbd_device *device);
+extern char *ppsize(char *buf, unsigned long long size);
+extern sector_t drbd_new_dev_size(struct drbd_device *, struct drbd_backing_dev *, sector_t, int);
+enum determine_dev_size {	/* result of drbd_determine_dev_size(); the negative values are the error cases */
+	DS_ERROR_SHRINK = -3,
+	DS_ERROR_SPACE_MD = -2,
+	DS_ERROR = -1,
+	DS_UNCHANGED = 0,
+	DS_SHRUNK = 1,
+	DS_GREW = 2,
+	DS_GREW_FROM_ZERO = 3,
+};
+extern enum determine_dev_size
+drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
+extern void resync_after_online_grow(struct drbd_device *);
+extern void drbd_reconsider_max_bio_size(struct drbd_device *device);
+extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
+					enum drbd_role new_role,
+					int force);
+extern bool conn_try_outdate_peer(struct drbd_connection *connection);
+extern void conn_try_outdate_peer_async(struct drbd_connection *connection);
+extern int drbd_khelper(struct drbd_device *device, char *cmd);
+
+/* drbd_worker.c */
+/* bi_end_io handlers */
+extern void drbd_md_io_complete(struct bio *bio, int error);
+extern void drbd_peer_request_endio(struct bio *bio, int error);
+extern void drbd_request_endio(struct bio *bio, int error);
+extern int drbd_worker(struct drbd_thread *thi);
+enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor);
+void drbd_resync_after_changed(struct drbd_device *device);
+extern void drbd_start_resync(struct drbd_device *device, enum drbd_conns side);
+extern void resume_next_sg(struct drbd_device *device);
+extern void suspend_other_sg(struct drbd_device *device);
+extern int drbd_resync_finished(struct drbd_device *device);
+/* maybe rather drbd_main.c ? */
+extern void *drbd_md_get_buffer(struct drbd_device *device);
+extern void drbd_md_put_buffer(struct drbd_device *device);
+extern int drbd_md_sync_page_io(struct drbd_device *device,
+		struct drbd_backing_dev *bdev, sector_t sector, int rw);
+extern void drbd_ov_out_of_sync_found(struct drbd_device *, sector_t, int);
+extern void wait_until_done_or_force_detached(struct drbd_device *device,
+		struct drbd_backing_dev *bdev, unsigned int *done);
+extern void drbd_rs_controller_reset(struct drbd_device *device);
+
+static inline void ov_out_of_sync_print(struct drbd_device *device)
+{
+	if (device->ov_last_oos_size) {	/* a recorded out-of-sync range is pending: log it */
+		drbd_err(device, "Out of sync: start=%llu, size=%lu (sectors)\n",
+		     (unsigned long long)device->ov_last_oos_start,
+		     (unsigned long)device->ov_last_oos_size);
+	}
+	device->ov_last_oos_size = 0;	/* reset, so the same range is not logged again */
+}
+
+
+extern void drbd_csum_bio(struct crypto_hash *, struct bio *, void *);
+extern void drbd_csum_ee(struct crypto_hash *, struct drbd_peer_request *, void *);
+/* worker callbacks */
+extern int w_e_end_data_req(struct drbd_work *, int);
+extern int w_e_end_rsdata_req(struct drbd_work *, int);
+extern int w_e_end_csum_rs_req(struct drbd_work *, int);
+extern int w_e_end_ov_reply(struct drbd_work *, int);
+extern int w_e_end_ov_req(struct drbd_work *, int);
+extern int w_ov_finished(struct drbd_work *, int);
+extern int w_resync_timer(struct drbd_work *, int);
+extern int w_send_write_hint(struct drbd_work *, int);
+extern int w_send_dblock(struct drbd_work *, int);
+extern int w_send_read_req(struct drbd_work *, int);
+extern int w_e_reissue(struct drbd_work *, int);
+extern int w_restart_disk_io(struct drbd_work *, int);
+extern int w_send_out_of_sync(struct drbd_work *, int);
+extern int w_start_resync(struct drbd_work *, int);
+
+extern void resync_timer_fn(unsigned long data);
+extern void start_resync_timer_fn(unsigned long data);
+
+extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
+
+/* drbd_receiver.c */
+extern int drbd_receiver(struct drbd_thread *thi);
+extern int drbd_asender(struct drbd_thread *thi);
+extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
+extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector);
+extern int drbd_submit_peer_request(struct drbd_device *,
+				    struct drbd_peer_request *, const unsigned,
+				    const int);
+extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
+extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
+						     sector_t, unsigned int,
+						     bool,
+						     gfp_t) __must_hold(local);
+extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
+				 int);
+#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0)
+#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1)
+extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool);
+extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled);
+extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed);
+extern int drbd_connected(struct drbd_peer_device *);
+
+/* Set a socket option with @optval living in kernel memory; returns the handler's result.
+ * Yes, there is kernel_setsockopt, but only since 2.6.18, so we have our own copy here. */
+static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
+				  char *optval, int optlen)
+{
+	mm_segment_t oldfs = get_fs();	/* saved so it can be restored below */
+	char __user *uoptval;
+	int err;
+
+	uoptval = (char __user __force *)optval;	/* same pointer, only re-annotated as __user */
+
+	set_fs(KERNEL_DS);	/* so the __user-taking handlers accept the kernel pointer */
+	if (level == SOL_SOCKET)	/* socket level options are not dispatched via sock->ops */
+		err = sock_setsockopt(sock, level, optname, uoptval, optlen);
+	else
+		err = sock->ops->setsockopt(sock, level, optname, uoptval,
+					    optlen);
+	set_fs(oldfs);	/* restore the previous address limit */
+	return err;
+}
+
+static inline void drbd_tcp_cork(struct socket *sock)
+{
+	int val = 1;	/* non-zero: set TCP_CORK on @sock */
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
+			(char*)&val, sizeof(val));	/* return value is explicitly discarded */
+}
+
+static inline void drbd_tcp_uncork(struct socket *sock)
+{
+	int val = 0;	/* zero: clear TCP_CORK on @sock */
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
+			(char*)&val, sizeof(val));	/* return value is explicitly discarded */
+}
+
+static inline void drbd_tcp_nodelay(struct socket *sock)
+{
+	int val = 1;	/* non-zero: set TCP_NODELAY on @sock */
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+			(char*)&val, sizeof(val));	/* return value is explicitly discarded */
+}
+
+static inline void drbd_tcp_quickack(struct socket *sock)
+{
+	int val = 2;	/* NOTE(review): 2 rather than 1 -- presumably the even value asks the TCP stack for a one-shot quick ACK; confirm */
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
+			(char*)&val, sizeof(val));	/* return value is explicitly discarded */
+}
+
+/* Sets the size of our virtual device; @size is given in 512 byte sectors. */
+static inline void drbd_set_my_capacity(struct drbd_device *device,
+					sector_t size)
+{
+	/* set_capacity(device->this_bdev->bd_disk, size); */
+	set_capacity(device->vdisk, size);
+	device->this_bdev->bd_inode->i_size = (loff_t)size << 9;	/* sectors -> bytes, widened to loff_t before the shift */
+}
+
+/* Submit one of our private bios. A bio without bi_bdev is completed with -ENODEV;
+ * if drbd_insert_fault() fires for @fault_type the bio is completed with -EIO
+ * instead of being handed to generic_make_request(). */
+static inline void drbd_generic_make_request(struct drbd_device *device,
+					     int fault_type, struct bio *bio)
+{
+	__release(local);	/* static checker annotation, pairs with the __must_hold(local) users */
+	if (!bio->bi_bdev) {	/* no target device: complain loudly and fail the bio */
+		printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
+				"bio->bi_bdev == NULL\n",
+		       device_to_minor(device));
+		dump_stack();
+		bio_endio(bio, -ENODEV);
+		return;
+	}
+
+	if (drbd_insert_fault(device, fault_type))
+		bio_endio(bio, -EIO);	/* injected fault: the bio never reaches the lower layer */
+	else
+		generic_make_request(bio);
+}
+
+void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo);
+
+/* drbd_proc.c */
+extern struct proc_dir_entry *drbd_proc;
+extern const struct file_operations drbd_proc_fops;
+extern const char *drbd_conn_str(enum drbd_conns s);
+extern const char *drbd_role_str(enum drbd_role s);
+
+/* drbd_actlog.c */
+extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
+extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate);
+extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate);
+extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector);
+extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector);
+extern int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector);
+extern void drbd_rs_cancel_all(struct drbd_device *device);
+extern int drbd_rs_del_all(struct drbd_device *device);
+extern void drbd_rs_failed_io(struct drbd_device *device,
+		sector_t sector, int size);
+extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go);
+extern void __drbd_set_in_sync(struct drbd_device *device, sector_t sector,
+		int size, const char *file, const unsigned int line);
+#define drbd_set_in_sync(device, sector, size) \
+	__drbd_set_in_sync(device, sector, size, __FILE__, __LINE__)
+extern int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector,
+		int size, const char *file, const unsigned int line);
+#define drbd_set_out_of_sync(device, sector, size) \
+	__drbd_set_out_of_sync(device, sector, size, __FILE__, __LINE__)
+extern void drbd_al_shrink(struct drbd_device *device);
+extern int drbd_initialize_al(struct drbd_device *, void *);
+
+/* drbd_nl.c */
+/* state info broadcast: argument of drbd_bcast_event() */
+struct sib_info {
+	enum drbd_state_info_bcast_reason sib_reason;	/* NOTE(review): presumably selects which union member is valid -- confirm in drbd_nl.c */
+	union {
+		struct {	/* helper program: name and exit code */
+			char *helper_name;
+			unsigned helper_exit_code;
+		};
+		struct {	/* NOTE(review): os/ns presumably are the old and new state -- confirm */
+			union drbd_state os;
+			union drbd_state ns;
+		};
+	};
+};
+void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib);
+
+/*
+ * inline helper functions
+ *************************/
+
+/* see also page_chain_add and friends in drbd_receiver.c */
+static inline struct page *page_chain_next(struct page *page)
+{
+	return (struct page *)page_private(page);	/* the private field of @page holds the pointer to the next page */
+}
+#define page_chain_for_each(page) /* walks the chain by advancing @page itself; prefetches the following page */ \
+	for (; page && ({ prefetch(page_chain_next(page)); 1; }); \
+			page = page_chain_next(page))
+#define page_chain_for_each_safe(page, n) /* @n is loaded with the successor before the loop body runs */ \
+	for (; page && ({ n = page_chain_next(page); 1; }); page = n)
+
+/* Returns 1 as soon as one page of the peer_req->pages chain has page_count() > 1, else 0. */
+static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req)
+{
+	struct page *page = peer_req->pages;	/* local cursor; the macro below advances it */
+	page_chain_for_each(page) {
+		if (page_count(page) > 1)
+			return 1;
+	}
+	return 0;
+}
+
+static inline enum drbd_state_rv
+_drbd_set_state(struct drbd_device *device, union drbd_state ns,
+		enum chg_state_flags flags, struct completion *done)
+{
+	enum drbd_state_rv rv;
+
+	read_lock(&global_state_lock);	/* __drbd_set_state() runs under the read side of global_state_lock */
+	rv = __drbd_set_state(device, ns, flags, done);
+	read_unlock(&global_state_lock);
+
+	return rv;	/* result of __drbd_set_state(), passed through unchanged */
+}
+
+static inline union drbd_state drbd_read_state(struct drbd_device *device)
+{
+	struct drbd_resource *resource = device->resource;
+	union drbd_state rv;
+
+	rv.i = device->state.i;	/* start from a copy of the device's own state word */
+	rv.susp = resource->susp;	/* the three susp* fields are then taken from the resource */
+	rv.susp_nod = resource->susp_nod;
+	rv.susp_fen = resource->susp_fen;
+
+	return rv;
+}
+
+enum drbd_force_detach_flags {	/* kind of failure handed to __drbd_chk_io_error_() */
+	DRBD_READ_ERROR,	/* under EP_PASS_ON: logged, disk lowered to D_INCONSISTENT; else also sets WAS_READ_ERROR */
+	DRBD_WRITE_ERROR,	/* under EP_PASS_ON: handled like DRBD_READ_ERROR */
+	DRBD_META_IO_ERROR,	/* takes the detach path even under EP_PASS_ON */
+	DRBD_FORCE_DETACH,	/* takes the detach path even under EP_PASS_ON; also sets FORCE_DETACH */
+};
+
+#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)	/* passes the caller's function name as @where */
+static inline void __drbd_chk_io_error_(struct drbd_device *device,
+		enum drbd_force_detach_flags df,
+		const char *where)	/* @where is only used in the log messages */
+{
+	enum drbd_io_error_p ep;
+
+	rcu_read_lock();
+	ep = rcu_dereference(device->ldev->disk_conf)->on_io_error;	/* NOTE(review): device->ldev is used unchecked; presumably callers hold a reference -- confirm */
+	rcu_read_unlock();
+	switch (ep) {	/* no default case: any other policy value does nothing here */
+	case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */
+		if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) {
+			if (__ratelimit(&drbd_ratelimit_state))
+				drbd_err(device, "Local IO failed in %s.\n", where);
+			if (device->state.disk > D_INCONSISTENT)	/* only ever lowers the disk state */
+				_drbd_set_state(_NS(device, disk, D_INCONSISTENT), CS_HARD, NULL);
+			break;
+		}
+		/* NOTE fall through for DRBD_META_IO_ERROR or DRBD_FORCE_DETACH */
+	case EP_DETACH:
+	case EP_CALL_HELPER:
+		/* Remember whether we saw a READ or WRITE error.
+		 *
+		 * Recovery of the affected area for WRITE failure is covered
+		 * by the activity log.
+		 * READ errors may fall outside that area though. Certain READ
+		 * errors can be "healed" by writing good data to the affected
+		 * blocks, which triggers block re-allocation in lower layers.
+		 *
+		 * If we can not write the bitmap after a READ error,
+		 * we may need to trigger a full sync (see w_go_diskless()).
+		 *
+		 * Force-detach is not really an IO error, but rather a
+		 * desperate measure to try to deal with a completely
+		 * unresponsive lower level IO stack.
+		 * Still it should be treated as a WRITE error.
+		 *
+		 * Meta IO error is always WRITE error:
+		 * we read meta data only once during attach,
+		 * which will fail in case of errors.
+		 */
+		set_bit(WAS_IO_ERROR, &device->flags);
+		if (df == DRBD_READ_ERROR)
+			set_bit(WAS_READ_ERROR, &device->flags);
+		if (df == DRBD_FORCE_DETACH)
+			set_bit(FORCE_DETACH, &device->flags);
+		if (device->state.disk > D_FAILED) {	/* only ever lowers the disk state */
+			_drbd_set_state(_NS(device, disk, D_FAILED), CS_HARD, NULL);
+			drbd_err(device,
+				"Local IO failed in %s. Detaching...\n", where);
+		}
+		break;
+	}
+}
+
+/**
+ * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers
+ * @device:	 DRBD device.
+ * @error:	 Error code passed to the IO completion callback; if 0, nothing is done
+ * @forcedetach: Kind of failure (enum drbd_force_detach_flags), handed on to __drbd_chk_io_error_()
+ *
+ * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED)
+ */
+#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)	/* @where becomes the caller's function name */
+static inline void drbd_chk_io_error_(struct drbd_device *device,
+	int error, enum drbd_force_detach_flags forcedetach, const char *where)
+{
+	if (error) {
+		unsigned long flags;
+		spin_lock_irqsave(&device->resource->req_lock, flags);	/* the __ variant runs with req_lock held */
+		__drbd_chk_io_error_(device, forcedetach, where);
+		spin_unlock_irqrestore(&device->resource->req_lock, flags);
+	}
+}
+
+
+/**
+ * drbd_md_first_sector() - Returns the first sector number of the meta data area
+ * @bdev:	Meta data block device.
+ *
+ * BTW, for internal meta data, this happens to be the maximum capacity
+ * we could agree upon with our peer node.
+ */
+static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
+{
+	switch (bdev->md.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		return bdev->md.md_offset + bdev->md.bm_offset;	/* internal: the bitmap comes first; bm_offset is below zero per the layout comment */
+	case DRBD_MD_INDEX_FLEX_EXT:
+	default:
+		return bdev->md.md_offset;	/* external layouts: the area starts at md_offset */
+	}
+}
+
+/**
+ * drbd_md_last_sector() - Return the last sector number of the meta data area
+ * @bdev:	Meta data block device.
+ */
+static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
+{
+	switch (bdev->md.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		return bdev->md.md_offset + MD_4kB_SECT -1;	/* internal: last sector of the 4k block starting at md_offset */
+	case DRBD_MD_INDEX_FLEX_EXT:
+	default:
+		return bdev->md.md_offset + bdev->md.md_size_sect -1;	/* external: md_offset plus the whole md_size_sect */
+	}
+}
+
+/* Returns the number of 512 byte sectors of the device, or 0 if @bdev is NULL */
+static inline sector_t drbd_get_capacity(struct block_device *bdev)
+{
+	/* return bdev ? get_capacity(bdev->bd_disk) : 0; */
+	return bdev ? i_size_read(bdev->bd_inode) >> 9 : 0;	/* inode size is shifted down by 9, i.e. divided by 512 */
+}
+
+/**
+ * drbd_get_max_capacity() - Returns the capacity we announce to our peer
+ * @bdev:	Meta data block device.
+ *
+ * returns the capacity we announce to our peer.  we clip ourselves at the
+ * various MAX_SECTORS, because if we don't, current implementation will
+ * oops sooner or later
+ */
+static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
+{
+	sector_t s;
+
+	switch (bdev->md.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		s = drbd_get_capacity(bdev->backing_bdev)	/* a backing device of size 0 yields 0 */
+			? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+				drbd_md_first_sector(bdev))	/* usable data ends where the meta data begins */
+			: 0;
+		break;
+	case DRBD_MD_INDEX_FLEX_EXT:
+		s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+				drbd_get_capacity(bdev->backing_bdev));
+		/* clip at maximum size the meta device can support */
+		s = min_t(sector_t, s,
+			BM_EXT_TO_SECT(bdev->md.md_size_sect
+				     - bdev->md.bm_offset));	/* bitmap sectors, see the layout comment */
+		break;
+	default:	/* any other index: clipped at DRBD_MAX_SECTORS instead of the FLEX limit */
+		s = min_t(sector_t, DRBD_MAX_SECTORS,
+				drbd_get_capacity(bdev->backing_bdev));
+	}
+	return s;
+}
+
+/**
+ * drbd_md_ss() - Return the sector number of our meta data super block
+ * @bdev:	Meta data block device.
+ */
+static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
+{
+	switch (bdev->md.meta_dev_idx) {
+	case DRBD_MD_INDEX_FLEX_EXT:
+		/* flexible external meta data: super block at sector 0 */
+		return 0;
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		/* Since drbd08, internal meta data is always "flexible".
+		 * position: last 4k aligned block of 4k size */
+		return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8;
+	default:
+		/* external, some index; this is the old fixed size layout */
+		return MD_128MB_SECT * bdev->md.meta_dev_idx;
+	}
+}
+
+static inline void drbd_queue_work_front(struct drbd_work_queue *q,
+					 struct drbd_work *w)
+{
+	unsigned long irq_state;
+	spin_lock_irqsave(&q->q_lock, irq_state);
+	list_add(&w->list, &q->q);	/* goes to the front of the queue */
+	spin_unlock_irqrestore(&q->q_lock, irq_state);
+	wake_up(&q->q_wait);	/* issued after q_lock was dropped */
+}
+
+static inline void drbd_queue_work(struct drbd_work_queue *q,
+				   struct drbd_work *w)
+{
+	unsigned long irq_state;
+	spin_lock_irqsave(&q->q_lock, irq_state);
+	list_add_tail(&w->list, &q->q);	/* goes to the end of the queue */
+	spin_unlock_irqrestore(&q->q_lock, irq_state);
+	wake_up(&q->q_wait);	/* issued after q_lock was dropped */
+}
+
+extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
+
+static inline void wake_asender(struct drbd_connection *connection)
+{	/* Send DRBD_SIG to the asender task, but only while SIGNAL_ASENDER is set. */
+	if (test_bit(SIGNAL_ASENDER, &connection->flags))
+		force_sig(DRBD_SIG, connection->asender.task);
+}
+
+static inline void request_ping(struct drbd_connection *connection)
+{	/* Set the SEND_PING connection flag, then call wake_asender(). */
+	set_bit(SEND_PING, &connection->flags);
+	wake_asender(connection);	/* signals only if SIGNAL_ASENDER is set */
+}
+
+extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *);
+extern void *drbd_prepare_command(struct drbd_peer_device *, struct drbd_socket *);
+extern int conn_send_command(struct drbd_connection *, struct drbd_socket *,
+			     enum drbd_packet, unsigned int, void *,
+			     unsigned int);
+extern int drbd_send_command(struct drbd_peer_device *, struct drbd_socket *,
+			     enum drbd_packet, unsigned int, void *,
+			     unsigned int);
+
+extern int drbd_send_ping(struct drbd_connection *connection);
+extern int drbd_send_ping_ack(struct drbd_connection *connection);
+extern int drbd_send_state_req(struct drbd_peer_device *, union drbd_state, union drbd_state);
+extern int conn_send_state_req(struct drbd_connection *, union drbd_state, union drbd_state);
+
+static inline void drbd_thread_stop(struct drbd_thread *thi)
+{
+	_drbd_thread_stop(thi, false, true);	/* presumably (restart=false, wait=true) -- confirm against _drbd_thread_stop() */
+}
+
+static inline void drbd_thread_stop_nowait(struct drbd_thread *thi)
+{
+	_drbd_thread_stop(thi, false, false);	/* presumably (restart=false, wait=false) -- confirm against _drbd_thread_stop() */
+}
+
+static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
+{
+	_drbd_thread_stop(thi, true, false);	/* presumably (restart=true, wait=false) -- confirm against _drbd_thread_stop() */
+}
+
+/* counts how many answer packets we expect from our peer,
+ * for either explicit application requests,
+ * or implicit barrier packets as necessary.
+ * increased:
+ *  w_send_barrier
+ *  _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ);
+ *    it is much easier and equally valid to count what we queue for the
+ *    worker, even before it actually was queued or sent.
+ *    (drbd_make_request_common; recovery path on read io-error)
+ * decreased:
+ *  got_BarrierAck (respective tl_clear, tl_clear_barrier)
+ *  _req_mod(req, DATA_RECEIVED)
+ *     [from receive_DataReply]
+ *  _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED)
+ *     [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
+ *     for some reason it is NOT decreased in got_NegAck,
+ *     but in the resulting cleanup code from report_params.
+ *     we should try to remember the reason for that...
+ *  _req_mod(req, SEND_FAILED or SEND_CANCELED)
+ *  _req_mod(req, CONNECTION_LOST_WHILE_PENDING)
+ *     [from tl_clear_barrier]
+ */
+static inline void inc_ap_pending(struct drbd_device *device)
+{
+	atomic_inc(&device->ap_pending_cnt);	/* counterpart: dec_ap_pending() */
+}
+
+#define ERR_IF_CNT_IS_NEGATIVE(which, func, line) do {			\
+	if (atomic_read(&device->which) < 0)				\
+		drbd_err(device, "in %s:%d: " #which " = %d < 0 !\n",	\
+			func, line, atomic_read(&device->which));	\
+} while (0) /* one statement, even in front of an else; needs `device` in scope */
+
+#define dec_ap_pending(device) _dec_ap_pending(device, __FUNCTION__, __LINE__)
+static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line)
+{	/* @func and @line name the call site; they are only used for the error message */
+	if (atomic_dec_and_test(&device->ap_pending_cnt))
+		wake_up(&device->misc_wait);	/* counter reached zero: wake misc_wait */
+	ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line);	/* complain if the counter went negative */
+}
+
+/* counts how many resync-related answers we still expect from the peer
+ *		     increase			decrease
+ * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
+ * C_SYNC_SOURCE sends P_RS_DATA_REPLY   (and expects P_WRITE_ACK with ID_SYNCER)
+ *					   (or P_NEG_ACK with ID_SYNCER)
+ */
+static inline void inc_rs_pending(struct drbd_device *device)
+{
+	atomic_inc(&device->rs_pending_cnt);	/* counterpart: dec_rs_pending() */
+}
+
+#define dec_rs_pending(device) _dec_rs_pending(device, __FUNCTION__, __LINE__)
+static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line)
+{	/* @func and @line name the call site; they are only used for the error message */
+	atomic_dec(&device->rs_pending_cnt);
+	ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line);	/* complain if the counter went negative */
+}
+
+/* counts how many answers we still need to send to the peer.
+ * increased on
+ *  receive_Data	unless protocol A;
+ *			we need to send a P_RECV_ACK (proto B)
+ *			or P_WRITE_ACK (proto C)
+ *  receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK
+ *  receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA
+ *  receive_Barrier_*	we need to send a P_BARRIER_ACK
+ */
+static inline void inc_unacked(struct drbd_device *device)
+{
+	atomic_inc(&device->unacked_cnt);	/* counterparts: dec_unacked(), sub_unacked() */
+}
+
+#define dec_unacked(device) _dec_unacked(device, __FUNCTION__, __LINE__)
+static inline void _dec_unacked(struct drbd_device *device, const char *func, int line)
+{	/* @func and @line name the call site; they are only used for the error message */
+	atomic_dec(&device->unacked_cnt);
+	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);	/* complain if the counter went negative */
+}
+
+#define sub_unacked(device, n) _sub_unacked(device, n, __FUNCTION__, __LINE__)
+static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line)
+{	/* Like dec_unacked(), but takes @n off the counter in one step. */
+	atomic_sub(n, &device->unacked_cnt);
+	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);	/* complain if the counter went negative */
+}
+
+/**
+ * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev
+ * @M:		DRBD device.
+ *
+ * On non-zero return, call put_ldev() when finished working with device->ldev.
+ */
+#define get_ldev(M) __cond_lock(local, _get_ldev_if_state(M,D_INCONSISTENT))	/* minimum disk state: D_INCONSISTENT */
+#define get_ldev_if_state(M,MINS) __cond_lock(local, _get_ldev_if_state(M,MINS))	/* caller picks the minimum disk state */
+
+static inline void put_ldev(struct drbd_device *device)
+{	/* Drop one local_cnt reference; dropping the last one triggers the follow-up work below. */
+	int i = atomic_dec_return(&device->local_cnt);
+
+	/* This may be called from some endio handler,
+	 * so we must not sleep here. */
+
+	__release(local);
+	D_ASSERT(device, i >= 0);	/* the count must never drop below zero */
+	if (i == 0) {
+		if (device->state.disk == D_DISKLESS)
+			/* even internal references gone, safe to destroy */
+			drbd_ldev_destroy(device);
+		if (device->state.disk == D_FAILED) {
+			/* all application IO references gone. */
+			if (!test_and_set_bit(GO_DISKLESS, &device->flags))	/* queue go_diskless only once */
+				drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+						&device->go_diskless);
+		}
+		wake_up(&device->misc_wait);	/* wake whoever sleeps on misc_wait */
+	}
+}
+
+#ifndef __CHECKER__
+static inline int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins)
+{	/* Returns non-zero with a local_cnt reference held if the disk state is >= @mins, else 0 without one. */
+	int io_allowed;
+
+	/* never get a reference while D_DISKLESS */
+	if (device->state.disk == D_DISKLESS)
+		return 0;
+
+	atomic_inc(&device->local_cnt);	/* take the reference first, then look at the state */
+	io_allowed = (device->state.disk >= mins);
+	if (!io_allowed)
+		put_ldev(device);	/* state too low: give the reference back */
+	return io_allowed;
+}
+#else
+extern int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins);	/* under __CHECKER__ only a declaration is provided here */
+#endif
+
+/* you must have a "get_ldev" reference */
+static inline void drbd_get_syncer_progress(struct drbd_device *device,
+		unsigned long *bits_left, unsigned int *per_mil_done)
+{	/* Outputs: *bits_left = bits still to go, *per_mil_done = progress in 1/1000. */
+	/* this is to break it at compile time when we change that, in case we
+	 * want to support more than (1<<32) bits on a 32bit arch. */
+	typecheck(unsigned long, device->rs_total);
+
+	/* note: both rs_total and rs_left are in bits, i.e. in
+	 * units of BM_BLOCK_SIZE.
+	 * for the percentage, we don't care. */
+
+	if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+		*bits_left = device->ov_left;	/* verify states: take ov_left */
+	else
+		*bits_left = drbd_bm_total_weight(device) - device->rs_failed;
+	/* further down: shift right to prevent overflow,
+	 * +1 on the total to prevent division by zero */
+	if (*bits_left > device->rs_total) {
+		/* doh. maybe a logic bug somewhere.
+		 * may also be just a race condition
+		 * between this and a disconnect during sync.
+		 * for now, just prevent in-kernel buffer overflow.
+		 */
+		smp_rmb();
+		drbd_warn(device, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
+				drbd_conn_str(device->state.conn),
+				*bits_left, device->rs_total, device->rs_failed);
+		*per_mil_done = 0;	/* inconsistent counters: report no progress */
+	} else {
+		/* Make sure the division happens in long context.
+		 * We allow up to one petabyte storage right now,
+		 * at a granularity of 4k per bit that is 2**38 bits.
+		 * After shift right and multiplication by 1000,
+		 * this should still fit easily into a 32bit long,
+		 * so we don't need a 64bit division on 32bit arch.
+		 * Note: currently we don't support such large bitmaps on 32bit
+		 * arch anyways, but no harm done to be prepared for it here.
+		 */
+		unsigned int shift = device->rs_total > UINT_MAX ? 16 : 10;
+		unsigned long left = *bits_left >> shift;
+		unsigned long total = 1UL + (device->rs_total >> shift);
+		unsigned long tmp = 1000UL - left * 1000UL/total;	/* done = 1000 - share still left */
+		*per_mil_done = tmp;
+	}
+}
+
+
+/* this throttles on-the-fly application requests
+ * according to max_buffers settings;
+ * maybe re-implement using semaphores? */
+static inline int drbd_get_max_buffers(struct drbd_device *device)
+{
+	int limit = 1000000;  /* arbitrary limit on open requests */
+	struct net_conf *conf;
+
+	rcu_read_lock();
+	conf = rcu_dereference(first_peer_device(device)->connection->net_conf);
+	if (conf)	/* a net_conf is present: its max_buffers wins */
+		limit = conf->max_buffers;
+	rcu_read_unlock();
+	return limit;
+}
+
+static inline int drbd_state_is_stable(struct drbd_device *device)
+{	/* Returns 1 if neither the connection nor the disk state is listed as "not stable" below, else 0. */
+	union drbd_dev_state s = device->state;	/* work on a snapshot of the state */
+
+	/* DO NOT add a default clause, we want the compiler to warn us
+	 * for any newly introduced state we may have forgotten to add here */
+
+	switch ((enum drbd_conns)s.conn) {
+	/* new io only accepted when there is no connection, ... */
+	case C_STANDALONE:
+	case C_WF_CONNECTION:
+	/* ... or there is a well established connection. */
+	case C_CONNECTED:
+	case C_SYNC_SOURCE:
+	case C_SYNC_TARGET:
+	case C_VERIFY_S:
+	case C_VERIFY_T:
+	case C_PAUSED_SYNC_S:
+	case C_PAUSED_SYNC_T:
+	case C_AHEAD:
+	case C_BEHIND:
+		/* transitional states, IO allowed */
+	case C_DISCONNECTING:
+	case C_UNCONNECTED:
+	case C_TIMEOUT:
+	case C_BROKEN_PIPE:
+	case C_NETWORK_FAILURE:
+	case C_PROTOCOL_ERROR:
+	case C_TEAR_DOWN:
+	case C_WF_REPORT_PARAMS:
+	case C_STARTING_SYNC_S:
+	case C_STARTING_SYNC_T:
+		break;
+
+		/* Allow IO in BM exchange states with new protocols */
+	case C_WF_BITMAP_S:
+		if (first_peer_device(device)->connection->agreed_pro_version < 96)
+			return 0;	/* agreed protocol older than 96: not stable */
+		break;
+
+		/* no new io accepted in these states */
+	case C_WF_BITMAP_T:
+	case C_WF_SYNC_UUID:
+	case C_MASK:
+		/* not "stable" */
+		return 0;
+	}
+
+	switch ((enum drbd_disk_state)s.disk) {
+	case D_DISKLESS:
+	case D_INCONSISTENT:
+	case D_OUTDATED:
+	case D_CONSISTENT:
+	case D_UP_TO_DATE:
+	case D_FAILED:
+		/* disk state is stable as well. */
+		break;
+
+	/* no new io accepted during transitional states */
+	case D_ATTACHING:
+	case D_NEGOTIATING:
+	case D_UNKNOWN:
+	case D_MASK:
+		/* not "stable" */
+		return 0;
+	}
+
+	return 1;	/* both switches fell through: stable */
+}
+
+static inline int drbd_suspended(struct drbd_device *device)
+{
+	if (device->resource->susp || device->resource->susp_fen)
+		return 1;	/* one of the first two suspend flags is set */
+	return device->resource->susp_nod != 0;
+}
+
+static inline bool may_inc_ap_bio(struct drbd_device *device)
+{
+	const int max_buffers = drbd_get_max_buffers(device);
+
+	/* Nothing new may start while IO is suspended, be it on the
+	 * resource (drbd_suspended) or via the SUSPEND_IO device flag. */
+	if (drbd_suspended(device) || test_bit(SUSPEND_IO, &device->flags))
+		return false;
+
+	/* to avoid potential deadlock or bitmap corruption,
+	 * in various places, we only allow new application io
+	 * to start during "stable" states; in particular not
+	 * when attaching or detaching the disk. */
+	if (!drbd_state_is_stable(device))
+		return false;
+
+	/* since some older kernels don't have atomic_add_unless,
+	 * and we are within the spinlock anyways, we have this workaround.  */
+	if (atomic_read(&device->ap_bio_cnt) > max_buffers)
+		return false;
+
+	/* last check: new application io is refused for as long as
+	 * the BITMAP_IO device flag is set */
+	return !test_bit(BITMAP_IO, &device->flags);
+}
+
+static inline bool inc_ap_bio_cond(struct drbd_device *device)
+{
+	bool allowed;
+
+	/* check and increment happen together, under req_lock */
+	spin_lock_irq(&device->resource->req_lock);
+	allowed = may_inc_ap_bio(device);
+	if (allowed)
+		atomic_inc(&device->ap_bio_cnt);
+	spin_unlock_irq(&device->resource->req_lock);
+	return allowed;
+}
+
+static inline void inc_ap_bio(struct drbd_device *device)
+{
+	/* we wait here
+	 *    as long as the device is suspended,
+	 *    until the bitmap is no longer on the fly during connection
+	 *    handshake, and as long as we would exceed the max_buffer limit.
+	 *
+	 * to avoid races with the reconnect code,
+	 * we need to atomic_inc within the spinlock. */
+
+	wait_event(device->misc_wait, inc_ap_bio_cond(device));	/* the condition itself increments ap_bio_cnt */
+}
+
+static inline void dec_ap_bio(struct drbd_device *device)
+{	/* Drop one ap_bio_cnt reference; may queue the bitmap IO work and wake misc_wait. */
+	int mxb = drbd_get_max_buffers(device);
+	int ap_bio = atomic_dec_return(&device->ap_bio_cnt);
+
+	D_ASSERT(device, ap_bio >= 0);	/* the count must never drop below zero */
+
+	if (ap_bio == 0 && test_bit(BITMAP_IO, &device->flags)) {
+		if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))	/* queue bm_io_work only once */
+			drbd_queue_work(&first_peer_device(device)->
+				connection->sender_work,
+				&device->bm_io_work.w);
+	}
+
+	/* this currently does wake_up for every dec_ap_bio!
+	 * maybe rather introduce some type of hysteresis?
+	 * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
+	if (ap_bio < mxb)
+		wake_up(&device->misc_wait);
+}
+
+static inline bool verify_can_do_stop_sector(struct drbd_device *device)
+{
+	int apv = first_peer_device(device)->connection->agreed_pro_version;
+	return apv >= 97 && apv != 100;	/* protocol 97 and up, except 100 */
+}
+
+static inline int drbd_set_ed_uuid(struct drbd_device *device, u64 val)
+{
+	const u64 previous = device->ed_uuid;
+	device->ed_uuid = val;
+	return previous != val;	/* non-zero if the stored value changed */
+}
+
+static inline int drbd_queue_order_type(struct drbd_device *device)
+{	/* Always reports QUEUE_ORDERED_NONE; @device is not looked at. */
+	/* sorry, we currently have no working implementation
+	 * of distributed TCQ stuff */
+#ifndef QUEUE_ORDERED_NONE
+#define QUEUE_ORDERED_NONE 0	/* fallback if nothing has defined it yet */
+#endif
+	return QUEUE_ORDERED_NONE;
+}
+
+static inline void drbd_md_flush(struct drbd_device *device)
+{
+	int err;
+
+	if (!device->ldev) {
+		drbd_warn(device, "device->ldev == NULL in drbd_md_flush\n");
+		return;
+	}
+	/* MD_NO_FUA set (see the failure path below): no md flushes */
+	if (test_bit(MD_NO_FUA, &device->flags))
+		return;
+
+	err = blkdev_issue_flush(device->ldev->md_bdev, GFP_NOIO, NULL);
+	if (!err)
+		return;
+	set_bit(MD_NO_FUA, &device->flags);
+	drbd_err(device, "meta data flush failed with status %d, disabling md-flushes\n", err);
+}
+
+static inline struct drbd_connection *first_connection(struct drbd_resource *resource)
+{	/* First entry of resource->connections, or NULL if that list is empty. */
+	return list_first_entry_or_null(&resource->connections,
+				struct drbd_connection, connections);
+}
+
+#endif
diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c
new file mode 100644
index 00000000000..89c497c630b
--- /dev/null
+++ b/drivers/block/drbd/drbd_interval.c
@@ -0,0 +1,207 @@
+#include <asm/bug.h>
+#include <linux/rbtree_augmented.h>
+#include "drbd_interval.h"
+
+/**
+ * interval_end  -  return end of @node
+ */
+static inline
+sector_t interval_end(struct rb_node *node)
+{
+	/* @node is the rb member embedded in a struct drbd_interval */
+	return rb_entry(node, struct drbd_interval, rb)->end;
+}
+
+/**
+ * compute_subtree_last  -  compute end of @node
+ *
+ * The end of an interval is the highest (start + (size >> 9)) value of this
+ * node and of its children.  Called for @node and its parents whenever the end
+ * may have changed.
+ */
+static inline sector_t
+compute_subtree_last(struct drbd_interval *node)
+{
+	struct rb_node *left = node->rb.rb_left;
+	struct rb_node *right = node->rb.rb_right;
+	/* own end: start sector plus size, converted from bytes to sectors */
+	sector_t last = node->sector + (node->size >> 9);
+
+	/* a child's ->end is read as the highest end within its subtree,
+	 * so only the two direct children are looked at here */
+	if (left && interval_end(left) > last)
+		last = interval_end(left);
+	if (right && interval_end(right) > last)
+		last = interval_end(right);
+
+	return last;
+}
+
+static void augment_propagate(struct rb_node *rb, struct rb_node *stop)
+{
+	for (; rb != stop; rb = rb_parent(rb)) {
+		struct drbd_interval *cur = rb_entry(rb, struct drbd_interval, rb);
+		sector_t last = compute_subtree_last(cur);
+
+		if (cur->end == last)	/* unchanged: stop walking up */
+			break;
+		cur->end = last;
+	}
+}
+
+static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new)
+{
+	sector_t end = rb_entry(rb_old, struct drbd_interval, rb)->end;
+
+	/* @rb_new takes over the cached subtree end of @rb_old */
+	rb_entry(rb_new, struct drbd_interval, rb)->end = end;
+}
+
+static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new)
+{
+	struct drbd_interval *old_iv = rb_entry(rb_old, struct drbd_interval, rb);
+	struct drbd_interval *new_iv = rb_entry(rb_new, struct drbd_interval, rb);
+
+	new_iv->end = old_iv->end;	/* hand the cached value over first ... */
+	old_iv->end = compute_subtree_last(old_iv);	/* ... then recompute the old node */
+}
+
+static const struct rb_augment_callbacks augment_callbacks = {
+	.propagate = augment_propagate,
+	.copy = augment_copy,
+	.rotate = augment_rotate,
+};
+
+/**
+ * drbd_insert_interval  -  insert a new interval into a tree
+ */
+bool
+drbd_insert_interval(struct rb_root *root, struct drbd_interval *this)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	BUG_ON(!IS_ALIGNED(this->size, 512));
+
+	/* Descend to a free leaf position; intervals are ordered by start
+	 * sector, equal start sectors are ordered by object address. */
+	while (*link) {
+		struct drbd_interval *here =
+			rb_entry(*link, struct drbd_interval, rb);
+
+		parent = *link;
+		if (this == here)
+			return false;	/* this very object is in the tree already */
+		if (this->sector < here->sector ||
+		    (this->sector == here->sector && this < here))
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(&this->rb, parent, link);
+	rb_insert_augmented(&this->rb, root, &augment_callbacks);
+	return true;
+}
+
+/**
+ * drbd_contains_interval  -  check if a tree contains a given interval
+ * @root:	tree to search
+ * @sector:	start sector of @interval
+ * @interval:	may not be a valid pointer
+ *
+ * Returns whether @root contains the node @interval with start sector @sector.
+ * @interval is only compared by address here, never dereferenced.  Returns
+ * %false if @interval is in the tree but with a different sector number.
+ */
+bool
+drbd_contains_interval(struct rb_root *root, sector_t sector,
+		       struct drbd_interval *interval)
+{
+	struct rb_node *node;
+
+	for (node = root->rb_node; node; ) {
+		struct drbd_interval *here =
+			rb_entry(node, struct drbd_interval, rb);
+
+		/* same ordering as used for insertion: start sector first,
+		 * object address as the tie breaker */
+		if (sector == here->sector && interval == here)
+			return true;
+		if (sector < here->sector ||
+		    (sector == here->sector && interval < here))
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+
+	return false;
+}
+
+/**
+ * drbd_remove_interval  -  remove an interval from a tree
+ */
+void
+drbd_remove_interval(struct rb_root *root, struct drbd_interval *this)
+{
+	rb_erase_augmented(&this->rb, root, &augment_callbacks);	/* same callbacks as used on insertion */
+}
+
+/**
+ * drbd_find_overlap  - search for an interval overlapping with [sector, sector + size)
+ * @sector:	start sector
+ * @size:	size, aligned to 512 bytes
+ *
+ * Returns an interval overlapping with [sector, sector + size), or NULL if
+ * there is none.  When there is more than one overlapping interval in the
+ * tree, the interval with the lowest start sector is returned, and all other
+ * overlapping intervals will be on the right side of the tree, reachable with
+ * rb_next().
+ */
+struct drbd_interval *
+drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size)
+{
+	struct rb_node *node = root->rb_node;
+	struct drbd_interval *overlap = NULL;
+	sector_t end = sector + (size >> 9);	/* first sector behind the searched range */
+
+	BUG_ON(!IS_ALIGNED(size, 512));
+
+	while (node) {
+		struct drbd_interval *here =
+			rb_entry(node, struct drbd_interval, rb);
+
+		if (node->rb_left &&
+		    sector < interval_end(node->rb_left)) {	/* left subtree reaches beyond @sector */
+			/* Overlap if any must be on left side */
+			node = node->rb_left;
+		} else if (here->sector < end &&
+			   sector < here->sector + (here->size >> 9)) {	/* @here itself overlaps */
+			overlap = here;
+			break;
+		} else if (sector >= here->sector) {
+			/* Overlap if any must be on right side */
+			node = node->rb_right;
+		} else
+			break;	/* @here starts at or behind @end and nothing on the left qualified */
+	}
+	return overlap;
+}
+
+struct drbd_interval *
+drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size)
+{
+	const sector_t end = sector + (size >> 9);
+	struct rb_node *node;
+
+	/* Continue after @i in tree order and return the next interval
+	 * that overlaps [sector, sector + size), NULL if there is none. */
+	while ((node = rb_next(&i->rb)) != NULL) {
+		i = rb_entry(node, struct drbd_interval, rb);
+		if (i->sector >= end)
+			break;	/* starts at or behind the end of the range */
+		if (sector < i->sector + (i->size >> 9))
+			return i;
+	}
+	return NULL;
+}
diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h
new file mode 100644
index 00000000000..f38fcb00c10
--- /dev/null
+++ b/drivers/block/drbd/drbd_interval.h
@@ -0,0 +1,40 @@
+#ifndef __DRBD_INTERVAL_H
+#define __DRBD_INTERVAL_H
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+struct drbd_interval {
+	struct rb_node rb;
+	sector_t sector;	/* start sector of the interval */
+	unsigned int size;	/* size in bytes */
+	sector_t end;		/* highest interval end in subtree */
+	unsigned int local:1;	/* local or remote request? */
+	unsigned int waiting:1;	/* flags are unsigned: a signed 1-bit field only holds 0 and -1 */
+};
+
+static inline void drbd_clear_interval(struct drbd_interval *i)
+{
+	RB_CLEAR_NODE(&i->rb);	/* pairs with the RB_EMPTY_NODE() test in drbd_interval_empty() */
+}
+
+static inline bool drbd_interval_empty(struct drbd_interval *i)
+{
+	return RB_EMPTY_NODE(&i->rb);	/* pairs with RB_CLEAR_NODE() in drbd_clear_interval() */
+}
+
+extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *);
+extern bool drbd_contains_interval(struct rb_root *, sector_t,
+				   struct drbd_interval *);
+extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *);
+extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t,
+					unsigned int);
+extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t,
+					unsigned int);
+
+#define drbd_for_each_overlap(i, root, sector, size)		\
+	for (i = drbd_find_overlap(root, sector, size);		\
+	     i;							\
+	     i = drbd_next_overlap(i, sector, size))	/* loop header only; @i is NULL once the loop has run to completion */
+
+#endif  /* __DRBD_INTERVAL_H */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
new file mode 100644
index 00000000000..960645c26e6
--- /dev/null
+++ b/drivers/block/drbd/drbd_main.c
@@ -0,0 +1,3908 @@
+/*
+   drbd.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
+   from Logicworks, Inc. for making SDP replication support possible.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <asm/uaccess.h>
+#include <asm/types.h>
+#include <net/sock.h>
+#include <linux/ctype.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/reboot.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+#include <linux/workqueue.h>
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+#include <linux/vmalloc.h>
+
+#include <linux/drbd_limits.h>
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
+
+#include "drbd_vli.h"
+
+static DEFINE_MUTEX(drbd_main_mutex);
+static int drbd_open(struct block_device *bdev, fmode_t mode);
+static void drbd_release(struct gendisk *gd, fmode_t mode);
+static int w_md_sync(struct drbd_work *w, int unused);
+static void md_sync_timer_fn(unsigned long data);
+static int w_bitmap_io(struct drbd_work *w, int unused);
+static int w_go_diskless(struct drbd_work *w, int unused);
+
+MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
+	      "Lars Ellenberg <lars@linbit.com>");
+MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
+MODULE_VERSION(REL_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices ("
+		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
+MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
+
+#include <linux/moduleparam.h>
+/* allow_open_on_secondary */
+MODULE_PARM_DESC(allow_oos, "DONT USE!");
+/* thanks to these macros, if compiled into the kernel (not-module),
+ * this becomes the boot parameter drbd.minor_count */
+module_param(minor_count, uint, 0444);
+module_param(disable_sendpage, bool, 0644);
+module_param(allow_oos, bool, 0);
+module_param(proc_details, int, 0644);
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+int enable_faults;
+int fault_rate;
+static int fault_count;
+int fault_devs;
+/* bitmap of enabled faults */
+module_param(enable_faults, int, 0664);
+/* fault rate % value - applies to all enabled faults */
+module_param(fault_rate, int, 0664);
+/* count of faults inserted */
+module_param(fault_count, int, 0664);
+/* bitmap of devices to insert faults on */
+module_param(fault_devs, int, 0644);
+#endif
+
+/* module parameter, defined */
+unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
+bool disable_sendpage;
+bool allow_oos;
+int proc_details;       /* Detail level in proc drbd*/
+
+/* Module parameter for setting the user mode helper program
+ * to run. Default is /sbin/drbdadm */
+char usermode_helper[80] = "/sbin/drbdadm";
+
+module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
+
+/* in 2.6.x, our device mapping and config info contains our virtual gendisks
+ * as member "struct gendisk *vdisk;"
+ */
+struct idr drbd_devices;
+struct list_head drbd_resources;
+
+struct kmem_cache *drbd_request_cache;
+struct kmem_cache *drbd_ee_cache;	/* peer requests */
+struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
+struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
+mempool_t *drbd_request_mempool;
+mempool_t *drbd_ee_mempool;
+mempool_t *drbd_md_io_page_pool;
+struct bio_set *drbd_md_io_bio_set;
+
+/* I do not use a standard mempool, because:
+   1) I want to hand out the pre-allocated objects first.
+   2) I want to be able to interrupt sleeping allocation with a signal.
+   Note: This is a singly linked list; the next pointer is the private
+	 member of struct page.
+ */
+struct page *drbd_pp_pool;
+spinlock_t   drbd_pp_lock;
+int          drbd_pp_vacant;
+wait_queue_head_t drbd_pp_wait;
+
+DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
+
+static const struct block_device_operations drbd_ops = {
+	.owner =   THIS_MODULE,
+	.open =    drbd_open,
+	.release = drbd_release,
+};
+
+struct bio *bio_alloc_drbd(gfp_t gfp_mask)
+{
+	/* without a dedicated bio_set, use the generic bio allocator */
+	if (drbd_md_io_bio_set == NULL)
+		return bio_alloc(gfp_mask, 1);
+
+	/*
+	 * A failed allocation shows up as NULL from bio_alloc_bioset(),
+	 * so its result is passed on to the caller as is.
+	 */
+	return bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+}
+
+#ifdef __CHECKER__
+/* When checking with sparse, and this is an inline function, sparse will
+   give tons of false positives. When this is a real function, sparse works.
+ */
+int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins)
+{	/* NOTE(review): unlike the static inline variant used for regular builds, there is no D_DISKLESS early return and no put_ldev() here -- confirm this is intended */
+	int io_allowed;
+
+	atomic_inc(&device->local_cnt);
+	io_allowed = (device->state.disk >= mins);
+	if (!io_allowed) {
+		if (atomic_dec_and_test(&device->local_cnt))	/* reference given back, and it was the last one */
+			wake_up(&device->misc_wait);
+	}
+	return io_allowed;
+}
+
+#endif
+
+/**
+ * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch
+ * @connection:	DRBD connection.
+ * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
+ * @set_size:	Expected number of requests before that barrier.
+ *
+ * In case the passed barrier_nr or set_size does not match the oldest
+ * epoch of not yet barrier-acked requests, this function will cause a
+ * termination of the connection.
+ */
+void tl_release(struct drbd_connection *connection, unsigned int barrier_nr,
+		unsigned int set_size)
+{
+	struct drbd_request *r;
+	struct drbd_request *req = NULL;
+	unsigned int expect_epoch = 0;	/* unsigned like @barrier_nr: compared with it, printed with %u */
+	unsigned int expect_size = 0;	/* unsigned like @set_size: compared with it, printed with %u */
+
+	spin_lock_irq(&connection->resource->req_lock);
+
+	/* find oldest not yet barrier-acked write request,
+	 * count writes in its epoch. */
+	list_for_each_entry(r, &connection->transfer_log, tl_requests) {
+		const unsigned s = r->rq_state;
+		if (!req) {
+			if (!(s & RQ_WRITE))
+				continue;
+			if (!(s & RQ_NET_MASK))
+				continue;
+			if (s & RQ_NET_DONE)
+				continue;
+			req = r;
+			expect_epoch = req->epoch;
+			expect_size++;
+		} else {
+			if (r->epoch != expect_epoch)
+				break;	/* first request of another epoch: done counting */
+			if (!(s & RQ_WRITE))
+				continue;
+			/* if (s & RQ_DONE): not expected */
+			/* if (!(s & RQ_NET_MASK)): not expected */
+			expect_size++;
+		}
+	}
+
+	/* first some paranoia code */
+	if (req == NULL) {
+		drbd_err(connection, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
+			 barrier_nr);
+		goto bail;
+	}
+	if (expect_epoch != barrier_nr) {
+		drbd_err(connection, "BAD! BarrierAck #%u received, expected #%u!\n",
+			 barrier_nr, expect_epoch);
+		goto bail;
+	}
+
+	if (expect_size != set_size) {
+		drbd_err(connection, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
+			 barrier_nr, set_size, expect_size);
+		goto bail;
+	}
+
+	/* Clean up list of requests processed during current epoch. */
+	/* this extra list walk restart is paranoia,
+	 * to catch requests being barrier-acked "unexpectedly".
+	 * It usually should find the same req again, or some READ preceding it. */
+	list_for_each_entry(req, &connection->transfer_log, tl_requests)
+		if (req->epoch == expect_epoch)
+			break;
+	list_for_each_entry_safe_from(req, r, &connection->transfer_log, tl_requests) {
+		if (req->epoch != expect_epoch)
+			break;
+		_req_mod(req, BARRIER_ACKED);
+	}
+	spin_unlock_irq(&connection->resource->req_lock);
+
+	return;
+
+bail:	/* mismatch: drop the lock, then force the connection into C_PROTOCOL_ERROR */
+	spin_unlock_irq(&connection->resource->req_lock);
+	conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
+}
+
+
+/**
+ * _tl_restart() - Walks the transfer log, and applies an action to all requests
+ * @connection:	DRBD connection.
+ * @what:       The action/event to perform with all request objects
+ *
+ * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
+ * RESTART_FROZEN_DISK_IO.
+ */
+/* must hold resource->req_lock */
+void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what)
+{
+	struct drbd_request *req, *r;
+
+	list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests)
+		_req_mod(req, what);
+}
+
+/* Locked variant of _tl_restart(): takes resource->req_lock (irqs off) around it. */
+void tl_restart(struct drbd_connection *connection, enum drbd_req_event what)
+{
+	struct drbd_resource *resource = connection->resource;
+
+	spin_lock_irq(&resource->req_lock);
+	_tl_restart(connection, what);
+	spin_unlock_irq(&resource->req_lock);
+}
+
+/**
+ * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
+ * @connection:	DRBD connection.
+ *
+ * Implemented as tl_restart() with the CONNECTION_LOST_WHILE_PENDING event.
+ *
+ * This is called after the connection to the peer was lost. The storage covered
+ * by the requests on the transfer log gets marked as out of sync. Called from
+ * the receiver thread and the worker thread.
+ */
+void tl_clear(struct drbd_connection *connection)
+{
+	const enum drbd_req_event what = CONNECTION_LOST_WHILE_PENDING;
+
+	tl_restart(connection, what);
+}
+
+/**
+ * tl_abort_disk_io() - Abort disk I/O for all requests for a certain device in the TL
+ * @device:	DRBD device.
+ *
+ * Walks the transfer log of the device's first peer connection under
+ * resource->req_lock and applies ABORT_DISK_IO to every request that has
+ * RQ_LOCAL_PENDING set and belongs to @device.
+ */
+void tl_abort_disk_io(struct drbd_device *device)
+{
+	struct drbd_connection *connection = first_peer_device(device)->connection;
+	struct drbd_request *cur, *next;
+
+	spin_lock_irq(&connection->resource->req_lock);
+	list_for_each_entry_safe(cur, next, &connection->transfer_log, tl_requests) {
+		if ((cur->rq_state & RQ_LOCAL_PENDING) && cur->device == device)
+			_req_mod(cur, ABORT_DISK_IO);
+	}
+	spin_unlock_irq(&connection->resource->req_lock);
+}
+
+/*
+ * drbd_thread_setup() - thread function handed to kthread_create()
+ * @arg:	the struct drbd_thread describing this thread
+ *
+ * Runs thi->function, and runs it again for as long as t_state reads
+ * RESTARTING when the function returns.  On the final pass it clears
+ * thi->task, sets t_state to NONE, completes thi->stop and drops the
+ * connection, resource and module references.
+ *
+ * Returns the value of the last thi->function call.
+ */
+static int drbd_thread_setup(void *arg)
+{
+	struct drbd_thread *thi = (struct drbd_thread *) arg;
+	struct drbd_resource *resource = thi->resource;
+	unsigned long flags;
+	int retval;
+
+	/* task name: "drbd_<first letter of thread name>_<resource name>" */
+	snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s",
+		 thi->name[0],
+		 resource->name);
+
+restart:
+	retval = thi->function(thi);
+
+	spin_lock_irqsave(&thi->t_lock, flags);
+
+	/* if the receiver has been "EXITING", the last thing it did
+	 * was set the conn state to "StandAlone",
+	 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
+	 * and receiver thread will be "started".
+	 * drbd_thread_start needs to set "RESTARTING" in that case.
+	 * t_state check and assignment needs to be within the same spinlock,
+	 * so either thread_start sees EXITING, and can remap to RESTARTING,
+	 * or thread_start see NONE, and can proceed as normal.
+	 */
+
+	if (thi->t_state == RESTARTING) {
+		drbd_info(resource, "Restarting %s thread\n", thi->name);
+		thi->t_state = RUNNING;
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		goto restart;
+	}
+
+	/* final exit: publish "no task, state NONE" before waking any waiter */
+	thi->task = NULL;
+	thi->t_state = NONE;
+	smp_mb();
+	complete_all(&thi->stop);
+	spin_unlock_irqrestore(&thi->t_lock, flags);
+
+	drbd_info(resource, "Terminating %s\n", current->comm);
+
+	/* Release mod reference taken when thread was started */
+
+	/* mirrors the kref_get()/try_module_get() calls in drbd_thread_start() */
+	if (thi->connection)
+		kref_put(&thi->connection->kref, drbd_destroy_connection);
+	kref_put(&resource->kref, drbd_destroy_resource);
+	module_put(THIS_MODULE);
+	return retval;
+}
+
+/* Put @thi into its initial, not yet started state for @resource. */
+static void drbd_thread_init(struct drbd_resource *resource, struct drbd_thread *thi,
+			     int (*func) (struct drbd_thread *), const char *name)
+{
+	spin_lock_init(&thi->t_lock);
+
+	/* what the thread is and what it runs */
+	thi->name = name;
+	thi->function = func;
+	thi->resource = resource;
+	thi->connection = NULL;
+
+	/* nothing has been started yet */
+	thi->t_state = NONE;
+	thi->task = NULL;
+}
+
+/*
+ * drbd_thread_start() - start the thread described by @thi, or flag a restart
+ *
+ * Depending on thi->t_state:
+ *  NONE:	take module, resource and (if set) connection references,
+ *		create the kthread and wake it.
+ *  EXITING:	switch to RESTARTING; drbd_thread_setup() loops on that state.
+ *  RUNNING, RESTARTING (and anything else): nothing to do.
+ *
+ * Returns false if the module reference could not be taken or the kthread
+ * could not be created, true in every other case.
+ *
+ * NOTE(review): when kthread_create() fails, t_state has already been set to
+ * RUNNING and is not reset on that path, and thi->task is not assigned.
+ * Later calls then hit the RUNNING case and do nothing -- confirm whether
+ * that is intended.
+ */
+int drbd_thread_start(struct drbd_thread *thi)
+{
+	struct drbd_resource *resource = thi->resource;
+	struct task_struct *nt;
+	unsigned long flags;
+
+	/* is used from state engine doing drbd_thread_stop_nowait,
+	 * while holding the req lock irqsave */
+	spin_lock_irqsave(&thi->t_lock, flags);
+
+	switch (thi->t_state) {
+	case NONE:
+		drbd_info(resource, "Starting %s thread (from %s [%d])\n",
+			 thi->name, current->comm, current->pid);
+
+		/* Get ref on module for thread - this is released when thread exits */
+		if (!try_module_get(THIS_MODULE)) {
+			drbd_err(resource, "Failed to get module reference in drbd_thread_start\n");
+			spin_unlock_irqrestore(&thi->t_lock, flags);
+			return false;
+		}
+
+		/* references dropped again at the end of drbd_thread_setup() */
+		kref_get(&resource->kref);
+		if (thi->connection)
+			kref_get(&thi->connection->kref);
+
+		init_completion(&thi->stop);
+		thi->reset_cpu_mask = 1;
+		thi->t_state = RUNNING;
+		/* t_lock is released before kthread_create() is called */
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
+
+		nt = kthread_create(drbd_thread_setup, (void *) thi,
+				    "drbd_%c_%s", thi->name[0], thi->resource->name);
+
+		if (IS_ERR(nt)) {
+			drbd_err(resource, "Couldn't start thread\n");
+
+			/* undo the references taken above */
+			if (thi->connection)
+				kref_put(&thi->connection->kref, drbd_destroy_connection);
+			kref_put(&resource->kref, drbd_destroy_resource);
+			module_put(THIS_MODULE);
+			return false;
+		}
+		spin_lock_irqsave(&thi->t_lock, flags);
+		thi->task = nt;
+		thi->t_state = RUNNING;
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		wake_up_process(nt);
+		break;
+	case EXITING:
+		thi->t_state = RESTARTING;
+		drbd_info(resource, "Restarting %s thread (from %s [%d])\n",
+				thi->name, current->comm, current->pid);
+		/* fall through */
+	case RUNNING:
+	case RESTARTING:
+	default:
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		break;
+	}
+
+	return true;
+}
+
+
+/*
+ * _drbd_thread_stop() - ask a DRBD thread to exit or to restart
+ * @thi:	thread descriptor
+ * @restart:	non-zero requests state RESTARTING, zero requests EXITING
+ * @wait:	non-zero blocks on thi->stop before returning
+ *
+ * If t_state is NONE nothing is signalled; with @restart set the thread is
+ * started through drbd_thread_start() instead.  Both early-return paths
+ * below skip the wait, regardless of @wait.
+ */
+void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
+{
+	unsigned long flags;
+
+	enum drbd_thread_state ns = restart ? RESTARTING : EXITING;
+
+	/* may be called from state engine, holding the req lock irqsave */
+	spin_lock_irqsave(&thi->t_lock, flags);
+
+	if (thi->t_state == NONE) {
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		if (restart)
+			drbd_thread_start(thi);
+		return;
+	}
+
+	/* only act if the requested state is not already set */
+	if (thi->t_state != ns) {
+		if (thi->task == NULL) {
+			spin_unlock_irqrestore(&thi->t_lock, flags);
+			return;
+		}
+
+		thi->t_state = ns;
+		smp_mb();
+		init_completion(&thi->stop);
+		/* a thread stopping itself is not sent the signal */
+		if (thi->task != current)
+			force_sig(DRBD_SIGKILL, thi->task);
+	}
+
+	spin_unlock_irqrestore(&thi->t_lock, flags);
+
+	if (wait)
+		wait_for_completion(&thi->stop);
+}
+
+/*
+ * Return the minor of the first peer device found in connection->peer_devices
+ * when searching from volume number 0, or -1 if the idr yields none.
+ */
+int conn_lowest_minor(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *first;
+	int vnr = 0;
+	int minor;
+
+	rcu_read_lock();
+	first = idr_get_next(&connection->peer_devices, &vnr);
+	if (first)
+		minor = device_to_minor(first->device);
+	else
+		minor = -1;
+	rcu_read_unlock();
+
+	return minor;
+}
+
+#ifdef CONFIG_SMP
+/**
+ * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
+ * @cpu_mask:	mask to fill in; bits are only set here, never cleared
+ *
+ * Forces all threads of a resource onto the same CPU. This is beneficial for
+ * DRBD's performance. May be overwritten by user's configuration.
+ *
+ * Counts, per CPU, how many resources on drbd_resources already have that CPU
+ * in their cpu_mask, and selects the online CPU with the lowest count.  If the
+ * counter array cannot be allocated, or no online CPU was seen, all CPUs are
+ * set in @cpu_mask instead.
+ */
+static void drbd_calc_cpu_mask(cpumask_var_t *cpu_mask)
+{
+	unsigned int *resources_per_cpu, min_index = ~0;
+
+	/* kcalloc() zeroes the array and checks the size multiplication for overflow */
+	resources_per_cpu = kcalloc(nr_cpu_ids, sizeof(*resources_per_cpu), GFP_KERNEL);
+	if (resources_per_cpu) {
+		struct drbd_resource *resource;
+		unsigned int cpu, min = ~0;
+
+		rcu_read_lock();
+		for_each_resource_rcu(resource, &drbd_resources) {
+			for_each_cpu(cpu, resource->cpu_mask)
+				resources_per_cpu[cpu]++;
+		}
+		rcu_read_unlock();
+		/* strict '<' keeps the first CPU among equally loaded ones */
+		for_each_online_cpu(cpu) {
+			if (resources_per_cpu[cpu] < min) {
+				min = resources_per_cpu[cpu];
+				min_index = cpu;
+			}
+		}
+		kfree(resources_per_cpu);
+	}
+	/* min_index still at its initial value: no CPU was selected */
+	if (min_index == ~0) {
+		cpumask_setall(*cpu_mask);
+		return;
+	}
+	cpumask_set_cpu(min_index, *cpu_mask);
+}
+
+/**
+ * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
+ * @thi:	drbd_thread object
+ *
+ * Does nothing unless thi->reset_cpu_mask is set; otherwise clears that flag
+ * and applies the resource's cpu_mask to current.
+ *
+ * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
+ * prematurely.
+ */
+void drbd_thread_current_set_cpu(struct drbd_thread *thi)
+{
+	struct drbd_resource *resource = thi->resource;
+
+	if (thi->reset_cpu_mask) {
+		thi->reset_cpu_mask = 0;
+		set_cpus_allowed_ptr(current, resource->cpu_mask);
+	}
+}
+#else
+#define drbd_calc_cpu_mask(A) ({})
+#endif
+
+/**
+ * drbd_header_size  -  size of a packet header
+ * @connection:	connection whose agreed protocol version selects the header
+ *
+ * Returns sizeof(struct p_header100) for agreed protocol version 100 and up,
+ * sizeof(struct p_header80) otherwise.
+ *
+ * The header size is a multiple of 8, so any payload following the header is
+ * word aligned on 64-bit architectures.  (The bitmap send and receive code
+ * relies on this.)
+ */
+unsigned int drbd_header_size(struct drbd_connection *connection)
+{
+	/* layout assumptions, enforced at build time */
+	BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8));
+	BUILD_BUG_ON(sizeof(struct p_header80) !=
+		     sizeof(struct p_header95));
+	BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8));
+
+	if (connection->agreed_pro_version < 100)
+		return sizeof(struct p_header80);
+	return sizeof(struct p_header100);
+}
+
+/* Fill in a p_header80 (16 bit length field); returns the size of that header. */
+static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size)
+{
+	h->length  = cpu_to_be16(size);
+	h->command = cpu_to_be16(cmd);
+	h->magic   = cpu_to_be32(DRBD_MAGIC);
+
+	return sizeof(*h);
+}
+
+/* Fill in a p_header95 (32 bit length field); returns the size of that header. */
+static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size)
+{
+	h->length  = cpu_to_be32(size);
+	h->command = cpu_to_be16(cmd);
+	h->magic   = cpu_to_be16(DRBD_MAGIC_BIG);
+
+	return sizeof(*h);
+}
+
+/*
+ * Fill in a p_header100, the only header variant here that carries a volume
+ * number; returns the size of that header.
+ */
+static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd,
+				      int size, int vnr)
+{
+	h->pad     = 0;
+	h->length  = cpu_to_be32(size);
+	h->command = cpu_to_be16(cmd);
+	h->volume  = cpu_to_be16(vnr);
+	h->magic   = cpu_to_be32(DRBD_MAGIC_100);
+
+	return sizeof(*h);
+}
+
+/*
+ * Write the packet header matching the agreed protocol version into @buffer
+ * and return its size.  Between versions 95 and 99 the p_header95 layout is
+ * used only when @size exceeds DRBD_MAX_SIZE_H80_PACKET; @vnr is only used
+ * from version 100 on.
+ */
+static unsigned int prepare_header(struct drbd_connection *connection, int vnr,
+				   void *buffer, enum drbd_packet cmd, int size)
+{
+	const int apv = connection->agreed_pro_version;
+
+	if (apv >= 100)
+		return prepare_header100(buffer, cmd, size, vnr);
+
+	if (apv >= 95 && size > DRBD_MAX_SIZE_H80_PACKET)
+		return prepare_header95(buffer, cmd, size);
+
+	return prepare_header80(buffer, cmd, size);
+}
+
+/*
+ * Return the address in sock->sbuf just behind the space reserved for the
+ * packet header, or NULL if @sock has no socket.  Takes no lock.
+ */
+static void *__conn_prepare_command(struct drbd_connection *connection,
+				    struct drbd_socket *sock)
+{
+	return sock->socket ? sock->sbuf + drbd_header_size(connection) : NULL;
+}
+
+/*
+ * Take sock->mutex and return the payload position in the send buffer.
+ * On success the mutex stays locked (conn_send_command() and
+ * drbd_send_command() release it); on NULL return it has been dropped again.
+ */
+void *conn_prepare_command(struct drbd_connection *connection, struct drbd_socket *sock)
+{
+	void *payload;
+
+	mutex_lock(&sock->mutex);
+	payload = __conn_prepare_command(connection, sock);
+	if (payload)
+		return payload;
+
+	/* no socket, nothing will be sent: do not keep the mutex */
+	mutex_unlock(&sock->mutex);
+	return NULL;
+}
+
+/* Per-peer-device front end for conn_prepare_command(); same locking rules. */
+void *drbd_prepare_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock)
+{
+	struct drbd_connection *connection = peer_device->connection;
+
+	return conn_prepare_command(connection, sock);
+}
+
+/*
+ * Prepend the packet header in sock->sbuf and send header plus @header_size
+ * bytes of sub-header from the send buffer, followed by @size bytes from
+ * @data if @data is set.  Returns the result of the last drbd_send_all() call
+ * made.  Does not touch sock->mutex.
+ *
+ * Called with @data == NULL and the size of the data blocks in @size
+ * for commands that send data blocks.  For those commands, omit the
+ * MSG_MORE flag: this will increase the likelihood that data blocks
+ * which are page aligned on the sender will end up page aligned on the
+ * receiver.
+ */
+static int __send_command(struct drbd_connection *connection, int vnr,
+			  struct drbd_socket *sock, enum drbd_packet cmd,
+			  unsigned int header_size, void *data,
+			  unsigned int size)
+{
+	int err;
+
+	/* the length written into the header covers sub-header plus @size */
+	header_size += prepare_header(connection, vnr, sock->sbuf, cmd,
+				      header_size + size);
+
+	if (!data)
+		return drbd_send_all(connection, sock->socket, sock->sbuf,
+				     header_size, 0);
+
+	err = drbd_send_all(connection, sock->socket, sock->sbuf, header_size,
+			    MSG_MORE);
+	if (err)
+		return err;
+
+	return drbd_send_all(connection, sock->socket, data, size, 0);
+}
+
+/* Connection-level send: __send_command() with volume number 0. */
+static int __conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock,
+			       enum drbd_packet cmd, unsigned int header_size,
+			       void *data, unsigned int size)
+{
+	const int vnr = 0;
+
+	return __send_command(connection, vnr, sock, cmd, header_size, data, size);
+}
+
+/*
+ * Send a connection-level packet and release sock->mutex, whether or not the
+ * send succeeded.  Counterpart of a successful conn_prepare_command().
+ */
+int conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock,
+		      enum drbd_packet cmd, unsigned int header_size,
+		      void *data, unsigned int size)
+{
+	const int rv = __conn_send_command(connection, sock, cmd, header_size,
+					   data, size);
+
+	mutex_unlock(&sock->mutex);
+	return rv;
+}
+
+/*
+ * Send a packet tagged with the peer device's volume number and release
+ * sock->mutex, whether or not the send succeeded.  Counterpart of a
+ * successful drbd_prepare_command().
+ */
+int drbd_send_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock,
+		      enum drbd_packet cmd, unsigned int header_size,
+		      void *data, unsigned int size)
+{
+	struct drbd_connection *connection = peer_device->connection;
+	int rv;
+
+	rv = __send_command(connection, peer_device->device->vnr,
+			    sock, cmd, header_size, data, size);
+	mutex_unlock(&sock->mutex);
+	return rv;
+}
+
+/* Send a payload-less P_PING on the meta socket; -EIO if there is no socket. */
+int drbd_send_ping(struct drbd_connection *connection)
+{
+	struct drbd_socket *sock = &connection->meta;
+
+	if (conn_prepare_command(connection, sock))
+		return conn_send_command(connection, sock, P_PING, 0, NULL, 0);
+
+	return -EIO;
+}
+
+/* Send a payload-less P_PING_ACK on the meta socket; -EIO if there is no socket. */
+int drbd_send_ping_ack(struct drbd_connection *connection)
+{
+	struct drbd_socket *sock = &connection->meta;
+
+	if (conn_prepare_command(connection, sock))
+		return conn_send_command(connection, sock, P_PING_ACK, 0, NULL, 0);
+
+	return -EIO;
+}
+
+/*
+ * Send the resync parameters to the peer on the data socket.
+ *
+ * Packet size and command depend on the agreed protocol version (apv):
+ * P_SYNC_PARAM below 89, P_SYNC_PARAM89 from 89 on.  The rate/plan values
+ * come from the local disk_conf if a local disk can be referenced, and from
+ * the DRBD_*_DEF defaults otherwise.
+ *
+ * Returns -EIO if the socket is gone, else the result of drbd_send_command().
+ */
+int drbd_send_sync_param(struct drbd_peer_device *peer_device)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_connection *connection = peer_device->connection;
+	struct drbd_socket *sock = &connection->data;
+	const int apv = connection->agreed_pro_version;
+	struct p_rs_param_95 *p;
+	enum drbd_packet cmd;
+	struct net_conf *nc;
+	struct disk_conf *dc;
+	int size;
+
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+
+	if (apv <= 87)
+		size = sizeof(struct p_rs_param);
+	else if (apv == 88)
+		/* version 88 appends verify_alg as a NUL terminated string */
+		size = sizeof(struct p_rs_param) + strlen(nc->verify_alg) + 1;
+	else if (apv <= 94)
+		size = sizeof(struct p_rs_param_89);
+	else /* apv >= 95 */
+		size = sizeof(struct p_rs_param_95);
+
+	if (apv >= 89)
+		cmd = P_SYNC_PARAM89;
+	else
+		cmd = P_SYNC_PARAM;
+
+	/* initialize verify_alg and csums_alg */
+	memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+
+	if (!get_ldev(device)) {
+		p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF);
+		p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF);
+		p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF);
+		p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF);
+		p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF);
+	} else {
+		dc = rcu_dereference(device->ldev->disk_conf);
+		p->resync_rate = cpu_to_be32(dc->resync_rate);
+		p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead);
+		p->c_delay_target = cpu_to_be32(dc->c_delay_target);
+		p->c_fill_target = cpu_to_be32(dc->c_fill_target);
+		p->c_max_rate = cpu_to_be32(dc->c_max_rate);
+		put_ldev(device);
+	}
+
+	if (apv >= 88)
+		strcpy(p->verify_alg, nc->verify_alg);
+	if (apv >= 89)
+		strcpy(p->csums_alg, nc->csums_alg);
+	rcu_read_unlock();
+
+	return drbd_send_command(peer_device, sock, cmd, size, NULL, 0);
+}
+
+/*
+ * __drbd_send_protocol() - send the protocol/net options packet @cmd
+ * @connection:	DRBD connection
+ * @cmd:	packet command to send the options with
+ *
+ * Locking: this function neither takes nor releases connection->data.mutex.
+ * It uses the unlocked helpers __conn_prepare_command() and
+ * __conn_send_command(), so the caller has to hold the mutex and release it
+ * afterwards on every return path, as drbd_send_protocol() does.
+ *
+ * Returns -EIO if there is no data socket, -EOPNOTSUPP if a tentative
+ * (--dry-run) connect is configured but the agreed protocol version is
+ * below 92, otherwise the result of __conn_send_command().
+ */
+int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd)
+{
+	struct drbd_socket *sock;
+	struct p_protocol *p;
+	struct net_conf *nc;
+	int size, cf;
+
+	sock = &connection->data;
+	p = __conn_prepare_command(connection, sock);
+	if (!p)
+		return -EIO;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+
+	if (nc->tentative && connection->agreed_pro_version < 92) {
+		rcu_read_unlock();
+		/* sock->mutex is owned by the caller; unlocking it here as
+		 * well would release it twice. */
+		drbd_err(connection, "--dry-run is not supported by peer");
+		return -EOPNOTSUPP;
+	}
+
+	/* from version 87 on, integrity_alg follows as a NUL terminated string */
+	size = sizeof(*p);
+	if (connection->agreed_pro_version >= 87)
+		size += strlen(nc->integrity_alg) + 1;
+
+	p->protocol      = cpu_to_be32(nc->wire_protocol);
+	p->after_sb_0p   = cpu_to_be32(nc->after_sb_0p);
+	p->after_sb_1p   = cpu_to_be32(nc->after_sb_1p);
+	p->after_sb_2p   = cpu_to_be32(nc->after_sb_2p);
+	p->two_primaries = cpu_to_be32(nc->two_primaries);
+	cf = 0;
+	if (nc->discard_my_data)
+		cf |= CF_DISCARD_MY_DATA;
+	if (nc->tentative)
+		cf |= CF_DRY_RUN;
+	p->conn_flags    = cpu_to_be32(cf);
+
+	if (connection->agreed_pro_version >= 87)
+		strcpy(p->integrity_alg, nc->integrity_alg);
+	rcu_read_unlock();
+
+	return __conn_send_command(connection, sock, cmd, size, NULL, 0);
+}
+
+/* Send P_PROTOCOL with the data socket mutex held around __drbd_send_protocol(). */
+int drbd_send_protocol(struct drbd_connection *connection)
+{
+	struct drbd_socket *sock = &connection->data;
+	int rv;
+
+	mutex_lock(&sock->mutex);
+	rv = __drbd_send_protocol(connection, P_PROTOCOL);
+	mutex_unlock(&sock->mutex);
+
+	return rv;
+}
+
+/*
+ * Send the UUID set of the local disk plus the bitmap weight and a flags word
+ * as P_UUIDS.  @uuid_flags is or-ed with:
+ *   1 - net_conf->discard_my_data is set
+ *   2 - CRASHED_PRIMARY is set in device->flags
+ *   4 - device->new_state_tmp.disk is D_INCONSISTENT
+ *
+ * Returns 0 without sending if no local disk reference can be taken,
+ * -EIO if there is no data socket, else the result of drbd_send_command().
+ */
+static int _drbd_send_uuids(struct drbd_peer_device *peer_device, u64 uuid_flags)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_socket *sock = &peer_device->connection->data;
+	struct p_uuids *p;
+	int idx;
+
+	if (!get_ldev_if_state(device, D_NEGOTIATING))
+		return 0;
+
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p) {
+		put_ldev(device);
+		return -EIO;
+	}
+
+	/* copy the UUIDs under uuid_lock */
+	spin_lock_irq(&device->ldev->md.uuid_lock);
+	for (idx = UI_CURRENT; idx < UI_SIZE; idx++)
+		p->uuid[idx] = cpu_to_be64(device->ldev->md.uuid[idx]);
+	spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+	device->comm_bm_set = drbd_bm_total_weight(device);
+	p->uuid[UI_SIZE] = cpu_to_be64(device->comm_bm_set);
+
+	rcu_read_lock();
+	if (rcu_dereference(peer_device->connection->net_conf)->discard_my_data)
+		uuid_flags |= 1;
+	rcu_read_unlock();
+	if (test_bit(CRASHED_PRIMARY, &device->flags))
+		uuid_flags |= 2;
+	if (device->new_state_tmp.disk == D_INCONSISTENT)
+		uuid_flags |= 4;
+	p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
+
+	put_ldev(device);
+	return drbd_send_command(peer_device, sock, P_UUIDS, sizeof(*p), NULL, 0);
+}
+
+/* Send the UUIDs with no caller-supplied flag bits. */
+int drbd_send_uuids(struct drbd_peer_device *peer_device)
+{
+	const u64 extra_flags = 0;
+
+	return _drbd_send_uuids(peer_device, extra_flags);
+}
+
+/* Send the UUIDs with flag bit 8 set in addition to the computed flags. */
+int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *peer_device)
+{
+	const u64 extra_flags = 8;
+
+	return _drbd_send_uuids(peer_device, extra_flags);
+}
+
+/*
+ * Log the current, bitmap and first/last history UUID of the local disk,
+ * prefixed with @text.  If no local disk reference can be taken, log
+ * device->ed_uuid instead.
+ */
+void drbd_print_uuids(struct drbd_device *device, const char *text)
+{
+	u64 *uuid;
+
+	if (!get_ldev_if_state(device, D_NEGOTIATING)) {
+		drbd_info(device, "%s effective data uuid: %016llX\n",
+				text,
+				(unsigned long long)device->ed_uuid);
+		return;
+	}
+
+	uuid = device->ldev->md.uuid;
+	drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX\n",
+	     text,
+	     (unsigned long long)uuid[UI_CURRENT],
+	     (unsigned long long)uuid[UI_BITMAP],
+	     (unsigned long long)uuid[UI_HISTORY_START],
+	     (unsigned long long)uuid[UI_HISTORY_END]);
+	put_ldev(device);
+}
+
+/*
+ * Derive a new bitmap UUID, store it, sync the meta data and send the value
+ * to the peer as P_SYNC_UUID.  The new value is the old bitmap UUID plus
+ * UUID_NEW_BM_OFFSET, or random bytes if the old one was 0 or
+ * UUID_JUST_CREATED.  The send is silently skipped without a data socket.
+ */
+void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_socket *sock = &peer_device->connection->data;
+	struct p_rs_uuid *p;
+	u64 uuid;
+
+	D_ASSERT(device, device->state.disk == D_UP_TO_DATE);
+
+	uuid = device->ldev->md.uuid[UI_BITMAP];
+	if (!uuid || uuid == UUID_JUST_CREATED)
+		get_random_bytes(&uuid, sizeof(u64));
+	else
+		uuid = uuid + UUID_NEW_BM_OFFSET;
+
+	drbd_uuid_set(device, UI_BITMAP, uuid);
+	drbd_print_uuids(device, "updated sync UUID");
+	drbd_md_sync(device);
+
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return;
+
+	p->uuid = cpu_to_be64(uuid);
+	drbd_send_command(peer_device, sock, P_SYNC_UUID, sizeof(*p), NULL, 0);
+}
+
+/*
+ * Send our disk sizes, maximum bio size, queue order type and @flags to the
+ * peer as P_SIZES.  With @trigger_reply set, c_size is sent as 0 instead of
+ * the current capacity of device->this_bdev.
+ *
+ * Returns -EIO if there is no data socket, else the result of
+ * drbd_send_command().
+ */
+int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_socket *sock;
+	struct p_sizes *p;
+	/* values reported when no local disk can be referenced */
+	sector_t d_size = 0, u_size = 0;
+	int q_order_type = QUEUE_ORDERED_NONE;
+	unsigned int max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
+
+	if (get_ldev_if_state(device, D_NEGOTIATING)) {
+		D_ASSERT(device, device->ldev->backing_bdev);
+		d_size = drbd_get_max_capacity(device->ldev);
+		rcu_read_lock();
+		u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
+		rcu_read_unlock();
+		q_order_type = drbd_queue_order_type(device);
+		/* sectors to bytes */
+		max_bio_size = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
+		max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
+		put_ldev(device);
+	}
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+
+	/* older peers get a smaller upper bound */
+	if (peer_device->connection->agreed_pro_version <= 94)
+		max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+	else if (peer_device->connection->agreed_pro_version < 100)
+		max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95);
+
+	p->d_size = cpu_to_be64(d_size);
+	p->u_size = cpu_to_be64(u_size);
+	if (trigger_reply)
+		p->c_size = cpu_to_be64(0);
+	else
+		p->c_size = cpu_to_be64(drbd_get_capacity(device->this_bdev));
+	p->max_bio_size = cpu_to_be32(max_bio_size);
+	p->queue_order_type = cpu_to_be16(q_order_type);
+	p->dds_flags = cpu_to_be16(flags);
+	return drbd_send_command(peer_device, sock, P_SIZES, sizeof(*p), NULL, 0);
+}
+
+/**
+ * drbd_send_current_state() - Sends the drbd state to the peer
+ * @peer_device:	DRBD peer device.
+ *
+ * The state word is read from the device only after the send mutex has been
+ * taken.  Returns -EIO if there is no data socket, else the result of
+ * drbd_send_command().
+ */
+int drbd_send_current_state(struct drbd_peer_device *peer_device)
+{
+	struct drbd_socket *sock = &peer_device->connection->data;
+	struct p_state *p = drbd_prepare_command(peer_device, sock);
+
+	if (!p)
+		return -EIO;
+
+	/* Within the send mutex */
+	p->state = cpu_to_be32(peer_device->device->state.i);
+	return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0);
+}
+
+/**
+ * drbd_send_state() - After a state change, sends the new state to the peer
+ * @peer_device:      DRBD peer device.
+ * @state:     the state to send, not necessarily the current state.
+ *
+ * Each state change queues an "after_state_ch" work, which will eventually
+ * send the resulting new state to the peer. If more state changes happen
+ * between queuing and processing of the after_state_ch work, we still
+ * want to send each intermediary state in the order it occurred.
+ *
+ * Returns -EIO if there is no data socket, else the result of
+ * drbd_send_command().
+ */
+int drbd_send_state(struct drbd_peer_device *peer_device, union drbd_state state)
+{
+	struct drbd_socket *sock = &peer_device->connection->data;
+	struct p_state *p = drbd_prepare_command(peer_device, sock);
+
+	if (!p)
+		return -EIO;
+
+	/* Within the send mutex */
+	p->state = cpu_to_be32(state.i);
+	return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0);
+}
+
+/*
+ * Send a state change request (@mask, @val) for this peer device as
+ * P_STATE_CHG_REQ on the data socket.  Returns -EIO without a socket.
+ */
+int drbd_send_state_req(struct drbd_peer_device *peer_device, union drbd_state mask, union drbd_state val)
+{
+	struct drbd_socket *sock = &peer_device->connection->data;
+	struct p_req_state *p = drbd_prepare_command(peer_device, sock);
+
+	if (!p)
+		return -EIO;
+
+	p->val = cpu_to_be32(val.i);
+	p->mask = cpu_to_be32(mask.i);
+	return drbd_send_command(peer_device, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0);
+}
+
+/*
+ * Send a connection-wide state change request (@mask, @val) on the data
+ * socket.  Below agreed protocol version 100 it is sent as P_STATE_CHG_REQ,
+ * otherwise as P_CONN_ST_CHG_REQ.  Returns -EIO without a socket.
+ */
+int conn_send_state_req(struct drbd_connection *connection, union drbd_state mask, union drbd_state val)
+{
+	struct drbd_socket *sock = &connection->data;
+	struct p_req_state *p;
+	enum drbd_packet cmd;
+
+	if (connection->agreed_pro_version < 100)
+		cmd = P_STATE_CHG_REQ;
+	else
+		cmd = P_CONN_ST_CHG_REQ;
+
+	p = conn_prepare_command(connection, sock);
+	if (!p)
+		return -EIO;
+
+	p->val = cpu_to_be32(val.i);
+	p->mask = cpu_to_be32(mask.i);
+	return conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0);
+}
+
+/*
+ * Answer a state change request with @retcode as P_STATE_CHG_REPLY on the
+ * meta socket.  Nothing is sent, and nothing reported, without a socket.
+ */
+void drbd_send_sr_reply(struct drbd_peer_device *peer_device, enum drbd_state_rv retcode)
+{
+	struct drbd_socket *sock = &peer_device->connection->meta;
+	struct p_req_state_reply *p = drbd_prepare_command(peer_device, sock);
+
+	if (!p)
+		return;
+
+	p->retcode = cpu_to_be32(retcode);
+	drbd_send_command(peer_device, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0);
+}
+
+/*
+ * Answer a connection-wide state change request with @retcode on the meta
+ * socket: P_STATE_CHG_REPLY below agreed protocol version 100,
+ * P_CONN_ST_CHG_REPLY otherwise.  Nothing is sent without a socket.
+ */
+void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode)
+{
+	struct drbd_socket *sock = &connection->meta;
+	struct p_req_state_reply *p;
+	enum drbd_packet cmd;
+
+	if (connection->agreed_pro_version < 100)
+		cmd = P_STATE_CHG_REPLY;
+	else
+		cmd = P_CONN_ST_CHG_REPLY;
+
+	p = conn_prepare_command(connection, sock);
+	if (!p)
+		return;
+
+	p->retcode = cpu_to_be32(retcode);
+	conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0);
+}
+
+/* Store @code (must fit in four bits) in bits 0-3 of p->encoding. */
+static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
+{
+	BUG_ON(code & ~0xf);
+
+	p->encoding &= ~0xf;
+	p->encoding |= code;
+}
+
+/* Set (@set non-zero) or clear bit 7 of p->encoding, the "start" flag. */
+static void dcbp_set_start(struct p_compressed_bm *p, int set)
+{
+	if (set)
+		p->encoding |= 0x80;
+	else
+		p->encoding &= ~0x80;
+}
+
+/*
+ * Store the number of pad bits @n (0..7) in bits 4-6 of p->encoding.
+ * Only those three bits are rewritten; the code nibble in bits 0-3 and the
+ * start flag in bit 7 keep their value.
+ */
+static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n)
+{
+	BUG_ON(n & ~0x7);
+	/* the mask must be ~(0x7 << 4): "~0x7 << 4" shifts a negative value
+	 * and would also clear bits 0-3 */
+	p->encoding = (p->encoding & ~(0x7 << 4)) | (n << 4);
+}
+
+/*
+ * fill_bitmap_rle_bits() - encode a stretch of the bitmap as run lengths
+ * @device:	DRBD device whose bitmap is scanned
+ * @p:		compressed bitmap packet; p->code receives the encoded bits
+ * @size:	number of bytes available in p->code
+ * @c:		transfer context; c->bit_offset is where encoding starts
+ *
+ * Alternately looks for the next set / next clear bit starting at
+ * c->bit_offset and stores each run length through vli_encode_bits() until
+ * the buffer is full or the end of the bitmap is reached.
+ *
+ * Returns the number of bytes used in p->code on success (c->bit_offset and
+ * c->word_offset then point behind the encoded bits), 0 if this method is
+ * not used (disabled, peer protocol version below 90, nothing left to send,
+ * encoder error, or the result would not be smaller than the plain bits),
+ * and -1 if a zero run length shows up after the first iteration.
+ */
+static int fill_bitmap_rle_bits(struct drbd_device *device,
+			 struct p_compressed_bm *p,
+			 unsigned int size,
+			 struct bm_xfer_ctx *c)
+{
+	struct bitstream bs;
+	unsigned long plain_bits;
+	unsigned long tmp;
+	unsigned long rl;
+	unsigned len;
+	unsigned toggle;
+	int bits, use_rle;
+
+	/* may we use this feature? */
+	rcu_read_lock();
+	use_rle = rcu_dereference(first_peer_device(device)->connection->net_conf)->use_rle;
+	rcu_read_unlock();
+	if (!use_rle || first_peer_device(device)->connection->agreed_pro_version < 90)
+		return 0;
+
+	if (c->bit_offset >= c->bm_bits)
+		return 0; /* nothing to do. */
+
+	/* use at most thus many bytes */
+	bitstream_init(&bs, p->code, size, 0);
+	memset(p->code, 0, size);
+	/* plain bits covered in this code string */
+	plain_bits = 0;
+
+	/* p->encoding & 0x80 stores whether the first run length is set.
+	 * bit offset is implicit.
+	 * start with toggle == 2 to be able to tell the first iteration */
+	toggle = 2;
+
+	/* see how much plain bits we can stuff into one packet
+	 * using RLE and VLI. */
+	do {
+		/* toggle == 0: we are inside a run of set bits, find its end;
+		 * otherwise (1, or 2 on the first pass): find the next set bit */
+		tmp = (toggle == 0) ? _drbd_bm_find_next_zero(device, c->bit_offset)
+				    : _drbd_bm_find_next(device, c->bit_offset);
+		if (tmp == -1UL)
+			tmp = c->bm_bits;
+		rl = tmp - c->bit_offset;
+
+		if (toggle == 2) { /* first iteration */
+			if (rl == 0) {
+				/* the first checked bit was set,
+				 * store start value, */
+				dcbp_set_start(p, 1);
+				/* but skip encoding of zero run length */
+				/* !2 == 0, so the next pass measures a run of set bits */
+				toggle = !toggle;
+				continue;
+			}
+			dcbp_set_start(p, 0);
+		}
+
+		/* paranoia: catch zero runlength.
+		 * can only happen if bitmap is modified while we scan it. */
+		if (rl == 0) {
+			drbd_err(device, "unexpected zero runlength while encoding bitmap "
+			    "t:%u bo:%lu\n", toggle, c->bit_offset);
+			return -1;
+		}
+
+		bits = vli_encode_bits(&bs, rl);
+		if (bits == -ENOBUFS) /* buffer full */
+			break;
+		if (bits <= 0) {
+			drbd_err(device, "error while encoding bitmap: %d\n", bits);
+			return 0;
+		}
+
+		/* the run was stored: account for it and advance */
+		toggle = !toggle;
+		plain_bits += rl;
+		c->bit_offset = tmp;
+	} while (c->bit_offset < c->bm_bits);
+
+	/* bytes used, counting a partially filled last byte as a whole one */
+	len = bs.cur.b - p->code + !!bs.cur.bit;
+
+	if (plain_bits < (len << 3)) {
+		/* incompressible with this method.
+		 * we need to rewind both word and bit position. */
+		c->bit_offset -= plain_bits;
+		bm_xfer_ctx_bit_to_word_offset(c);
+		c->bit_offset = c->word_offset * BITS_PER_LONG;
+		return 0;
+	}
+
+	/* RLE + VLI was able to compress it just fine.
+	 * update c->word_offset. */
+	bm_xfer_ctx_bit_to_word_offset(c);
+
+	/* store pad_bits */
+	dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
+
+	return len;
+}
+
+/**
+ * send_bitmap_rle_or_plain() - send the next chunk of the bitmap
+ * @device:	DRBD device whose bitmap is sent
+ * @c:		transfer context; offsets and statistics are updated
+ *
+ * Tries fill_bitmap_rle_bits() first and sends a P_COMPRESSED_BITMAP packet
+ * if that produced data; otherwise sends a buffer of plain bitmap words as
+ * P_BITMAP.  Uses __send_command(), which does not touch the socket mutex.
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c)
+{
+	struct drbd_socket *sock = &first_peer_device(device)->connection->data;
+	unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
+	struct p_compressed_bm *p = sock->sbuf + header_size;
+	int len, err;
+
+	len = fill_bitmap_rle_bits(device, p,
+			DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c);
+	if (len < 0)
+		return -EIO;
+
+	if (len) {
+		dcbp_set_code(p, RLE_VLI_Bits);
+		err = __send_command(first_peer_device(device)->connection, device->vnr, sock,
+				     P_COMPRESSED_BITMAP, sizeof(*p) + len,
+				     NULL, 0);
+		/* index 0 of the statistics counts compressed packets */
+		c->packets[0]++;
+		c->bytes[0] += header_size + sizeof(*p) + len;
+
+		if (c->bit_offset >= c->bm_bits)
+			len = 0; /* DONE */
+	} else {
+		/* was not compressible.
+		 * send a buffer full of plain text bits instead. */
+		unsigned int data_size;
+		unsigned long num_words;
+		/* shadows the outer p: same buffer position, viewed as words */
+		unsigned long *p = sock->sbuf + header_size;
+
+		data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
+		num_words = min_t(size_t, data_size / sizeof(*p),
+				  c->bm_words - c->word_offset);
+		len = num_words * sizeof(*p);
+		if (len)
+			drbd_bm_get_lel(device, c->word_offset, num_words, p);
+		/* a packet is sent even when len is 0 (no words left) */
+		err = __send_command(first_peer_device(device)->connection, device->vnr, sock, P_BITMAP, len, NULL, 0);
+		c->word_offset += num_words;
+		c->bit_offset = c->word_offset * BITS_PER_LONG;
+
+		/* index 1 of the statistics counts plain packets */
+		c->packets[1]++;
+		c->bytes[1] += header_size + len;
+
+		if (c->bit_offset > c->bm_bits)
+			c->bit_offset = c->bm_bits;
+	}
+	/* len == 0 here means the whole bitmap has been sent */
+	if (!err) {
+		if (len == 0) {
+			INFO_bm_xfer_stats(device, "send", c);
+			return 0;
+		} else
+			return 1;
+	}
+	return -EIO;
+}
+
+/*
+ * Send the complete bitmap, chunk by chunk, through
+ * send_bitmap_rle_or_plain().  The caller holds the data socket mutex.
+ *
+ * If MDF_FULL_SYNC is set in the meta data, all bits are set and the bitmap
+ * is written out first; the flag is only cleared if that write succeeded.
+ *
+ * Returns 1 on success and 0 on failure (boolean sense, unlike the senders
+ * around it).  See the comment at receive_bitmap().
+ */
+static int _drbd_send_bitmap(struct drbd_device *device)
+{
+	struct bm_xfer_ctx c;
+	int rv;
+
+	if (!expect(device->bitmap))
+		return 0;
+
+	if (get_ldev(device)) {
+		if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC)) {
+			drbd_info(device, "Writing the whole bitmap, MDF_FullSync was set.\n");
+			drbd_bm_set_all(device);
+			if (!drbd_bm_write(device)) {
+				drbd_md_clear_flag(device, MDF_FULL_SYNC);
+				drbd_md_sync(device);
+			} else {
+				/* write_bm did fail! Leave full sync flag set in Meta P_DATA
+				 * but otherwise process as per normal - need to tell other
+				 * side that a full resync is required! */
+				drbd_err(device, "Failed to write bitmap to disk!\n");
+			}
+		}
+		put_ldev(device);
+	}
+
+	/* all other members of the context start out as zero */
+	c = (struct bm_xfer_ctx) {
+		.bm_bits = drbd_bm_bits(device),
+		.bm_words = drbd_bm_words(device),
+	};
+
+	/* a positive return value asks for another round */
+	for (;;) {
+		rv = send_bitmap_rle_or_plain(device, &c);
+		if (rv <= 0)
+			break;
+	}
+
+	return rv == 0;
+}
+
+/*
+ * Send the whole bitmap with the data socket mutex held.
+ * Returns 0 on success, 1 if sending failed and -1 if there is no socket.
+ */
+int drbd_send_bitmap(struct drbd_device *device)
+{
+	struct drbd_socket *sock = &first_peer_device(device)->connection->data;
+	int rv;
+
+	mutex_lock(&sock->mutex);
+	if (!sock->socket)
+		rv = -1;
+	else
+		rv = !_drbd_send_bitmap(device);
+	mutex_unlock(&sock->mutex);
+
+	return rv;
+}
+
+/*
+ * Send a P_BARRIER_ACK on the meta socket.  @barrier_nr is stored as passed
+ * in, without byte order conversion; @set_size is converted to big endian.
+ * Nothing is sent while cstate is below C_WF_REPORT_PARAMS or without a
+ * socket.
+ */
+void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr, u32 set_size)
+{
+	struct drbd_socket *sock = &connection->meta;
+	struct p_barrier_ack *p;
+
+	if (connection->cstate < C_WF_REPORT_PARAMS)
+		return;
+
+	p = conn_prepare_command(connection, sock);
+	if (p) {
+		p->set_size = cpu_to_be32(set_size);
+		p->barrier = barrier_nr;
+		conn_send_command(connection, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0);
+	}
+}
+
+/**
+ * _drbd_send_ack() - Sends an ack packet
+ * @peer_device:	DRBD peer device.
+ * @cmd:	Packet command code.
+ * @sector:	sector, needs to be in big endian byte order
+ * @blksize:	size in byte, needs to be in big endian byte order
+ * @block_id:	Id, big endian byte order
+ *
+ * The packet goes out on the meta socket and carries the next value of the
+ * device's packet_seq counter.  Returns -EIO if the device's connection state
+ * is below C_CONNECTED or there is no socket, else the result of
+ * drbd_send_command().
+ */
+static int _drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+			  u64 sector, u32 blksize, u64 block_id)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_socket *sock = &peer_device->connection->meta;
+	struct p_block_ack *p;
+
+	if (device->state.conn < C_CONNECTED)
+		return -EIO;
+
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+
+	/* the three values below arrive in wire byte order already */
+	p->sector = sector;
+	p->blksize = blksize;
+	p->block_id = block_id;
+	/* sequence number is drawn while the send mutex is held */
+	p->seq_num = cpu_to_be32(atomic_inc_return(&device->packet_seq));
+
+	return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0);
+}
+
+/*
+ * Acknowledge a data packet.
+ * dp->sector and dp->block_id are already/still in network byte order and
+ * are passed on untouched.  data_size is the payload size according to
+ * dp->head; if a peer integrity transform is in use, its digest size is
+ * subtracted before the size is sent.  The send result is not reported.
+ */
+void drbd_send_ack_dp(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+		      struct p_data *dp, int data_size)
+{
+	struct drbd_connection *connection = peer_device->connection;
+	int payload = data_size;
+
+	if (connection->peer_integrity_tfm)
+		payload -= crypto_hash_digestsize(connection->peer_integrity_tfm);
+
+	_drbd_send_ack(peer_device, cmd, dp->sector, cpu_to_be32(payload),
+		       dp->block_id);
+}
+
+/*
+ * Acknowledge a block request: sector, blksize and block_id are copied from
+ * @rp without any byte order conversion.  The send result is not reported.
+ */
+void drbd_send_ack_rp(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+		      struct p_block_req *rp)
+{
+	_drbd_send_ack(peer_device, cmd,
+		       rp->sector,
+		       rp->blksize,
+		       rp->block_id);
+}
+
+/**
+ * drbd_send_ack() - Sends an ack packet
+ * @peer_device:	DRBD peer device
+ * @cmd:	packet command code
+ * @peer_req:	peer request
+ *
+ * Sector and size of @peer_req are converted to big endian here;
+ * peer_req->block_id is passed through unchanged.
+ */
+int drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+		  struct drbd_peer_request *peer_req)
+{
+	u64 be_sector = cpu_to_be64(peer_req->i.sector);
+	u32 be_blksize = cpu_to_be32(peer_req->i.size);
+
+	return _drbd_send_ack(peer_device, cmd, be_sector, be_blksize,
+			      peer_req->block_id);
+}
+
+/* This function misuses the block_id field to signal if the blocks
+ * are in sync or not.
+ * sector, blksize and block_id are all converted to big endian here. */
+int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+		     sector_t sector, int blksize, u64 block_id)
+{
+	u64 be_sector = cpu_to_be64(sector);
+	u32 be_blksize = cpu_to_be32(blksize);
+	u64 be_block_id = cpu_to_be64(block_id);
+
+	return _drbd_send_ack(peer_device, cmd, be_sector, be_blksize, be_block_id);
+}
+
+/* Send a block request packet of type @cmd on the data socket.
+ * @sector and @size are converted to big endian; @block_id is copied
+ * into the packet as is.  Returns -EIO if no command buffer could be
+ * prepared, otherwise the result of drbd_send_command(). */
+int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd,
+		       sector_t sector, int size, u64 block_id)
+{
+	struct drbd_socket *data_sock = &peer_device->connection->data;
+	struct p_block_req *req = drbd_prepare_command(peer_device, data_sock);
+
+	if (!req)
+		return -EIO;
+
+	req->sector = cpu_to_be64(sector);
+	req->block_id = block_id;
+	req->blksize = cpu_to_be32(size);
+
+	return drbd_send_command(peer_device, data_sock, cmd, sizeof(*req), NULL, 0);
+}
+
+/* Send a block request of type @cmd that carries a digest.
+ * The @digest_size bytes at @digest are passed to drbd_send_command()
+ * together with the p_block_req header.  Returns -EIO if no command
+ * buffer could be prepared. */
+int drbd_send_drequest_csum(struct drbd_peer_device *peer_device, sector_t sector, int size,
+			    void *digest, int digest_size, enum drbd_packet cmd)
+{
+	struct drbd_socket *data_sock = &peer_device->connection->data;
+	struct p_block_req *req;
+
+	/* FIXME: Put the digest into the preallocated socket buffer.  */
+
+	req = drbd_prepare_command(peer_device, data_sock);
+	if (!req)
+		return -EIO;
+
+	req->sector = cpu_to_be64(sector);
+	req->block_id = ID_SYNCER /* unused */;
+	req->blksize = cpu_to_be32(size);
+
+	return drbd_send_command(peer_device, data_sock, cmd, sizeof(*req),
+				 digest, digest_size);
+}
+
+/* Send a P_OV_REQUEST for @size bytes at @sector on the data socket.
+ * Returns -EIO if no command buffer could be prepared, otherwise the
+ * result of drbd_send_command(). */
+int drbd_send_ov_request(struct drbd_peer_device *peer_device, sector_t sector, int size)
+{
+	struct drbd_socket *data_sock = &peer_device->connection->data;
+	struct p_block_req *req = drbd_prepare_command(peer_device, data_sock);
+
+	if (!req)
+		return -EIO;
+
+	req->sector = cpu_to_be64(sector);
+	req->block_id = ID_SYNCER /* unused */;
+	req->blksize = cpu_to_be32(size);
+
+	return drbd_send_command(peer_device, data_sock, P_OV_REQUEST,
+				 sizeof(*req), NULL, 0);
+}
+
+/* called on sndtimeo
+ * returns false if we should retry,
+ * true if we think connection is dead
+ */
+static int we_should_drop_the_connection(struct drbd_connection *connection, struct socket *sock)
+{
+	/* no retries on the meta socket */
+	if (connection->meta.socket == sock)
+		return true;
+	/* no retries without a running asender */
+	if (!connection->asender.task)
+		return true;
+	if (get_t_state(&connection->asender) != RUNNING)
+		return true;
+	/* no retries before C_WF_REPORT_PARAMS */
+	if (connection->cstate < C_WF_REPORT_PARAMS)
+		return true;
+
+	/* use up one retry; give up once the counter hits zero */
+	if (--connection->ko_count == 0)
+		return true;
+
+	drbd_err(connection, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
+		 current->comm, current->pid, connection->ko_count);
+	request_ping(connection);
+	return false;
+}
+
+/* Set NET_CONGESTED on the connection once more than 4/5 of the data
+ * socket's send buffer is queued.  This function only sets the flag;
+ * it never clears it. */
+static void drbd_update_congested(struct drbd_connection *connection)
+{
+	struct sock *sk = connection->data.socket->sk;
+
+	if (sk->sk_wmem_queued <= sk->sk_sndbuf * 4 / 5)
+		return;
+
+	set_bit(NET_CONGESTED, &connection->flags);
+}
+
+/* The idea of sendpage seems to be to put some kind of reference
+ * to the page into the skb, and to hand it over to the NIC. In
+ * this process get_page() gets called.
+ *
+ * As soon as the page was really sent over the network put_page()
+ * gets called by some part of the network layer. [ NIC driver? ]
+ *
+ * [ get_page() / put_page() increment/decrement the count. If count
+ *   reaches 0 the page will be freed. ]
+ *
+ * This works nicely with pages from FSs.
+ * But this means that in protocol A we might signal IO completion too early!
+ *
+ * In order not to corrupt data during a resync we must make sure
+ * that we do not reuse our own buffer pages (EEs) too early, therefore
+ * we have the net_ee list.
+ *
+ * XFS seems to have problems, still, it submits pages with page_count == 0!
+ * As a workaround, we disable sendpage on pages
+ * with page_count == 0 or PageSlab.
+ *
+ * _drbd_no_send_page() is the variant that does not use sendpage: it maps
+ * the page and pushes the bytes through drbd_send_all().  On success the
+ * device's send_cnt is advanced by size >> 9.
+ */
+static int _drbd_no_send_page(struct drbd_peer_device *peer_device, struct page *page,
+			      int offset, size_t size, unsigned msg_flags)
+{
+	struct socket *data_socket = peer_device->connection->data.socket;
+	void *mapped = kmap(page);
+	int err;
+
+	err = drbd_send_all(peer_device->connection, data_socket,
+			    mapped + offset, size, msg_flags);
+	kunmap(page);
+	if (err)
+		return err;
+
+	peer_device->device->send_cnt += size >> 9;
+	return 0;
+}
+
+/* Send @size bytes of @page, starting at @offset, on the data socket via
+ * the socket's sendpage operation.  Falls back to the copying
+ * _drbd_no_send_page() when sendpage is disabled or the page is not
+ * suitable for it (see below).
+ *
+ * Returns 0 once everything was sent (and then adds size >> 9 to the
+ * device's send_cnt); otherwise the negative value reported by sendpage,
+ * or -EIO if the loop ended without one. */
+static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *page,
+		    int offset, size_t size, unsigned msg_flags)
+{
+	struct socket *socket = peer_device->connection->data.socket;
+	mm_segment_t oldfs = get_fs();
+	int len = size;
+	int err = -EIO;
+
+	/* e.g. XFS meta- & log-data is in slab pages, which have a
+	 * page_count of 0 and/or have PageSlab() set.
+	 * we cannot use send_page for those, as that does get_page();
+	 * put_page(); and would cause either a VM_BUG directly, or
+	 * __page_cache_release a page that would actually still be referenced
+	 * by someone, leading to some obscure delayed Oops somewhere else. */
+	if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
+		return _drbd_no_send_page(peer_device, page, offset, size, msg_flags);
+
+	msg_flags |= MSG_NOSIGNAL;
+	drbd_update_congested(peer_device->connection);
+	set_fs(KERNEL_DS);
+	do {
+		int sent;
+
+		sent = socket->ops->sendpage(socket, page, offset, len, msg_flags);
+		if (sent <= 0) {
+			if (sent == -EAGAIN) {
+				if (we_should_drop_the_connection(peer_device->connection, socket))
+					break;
+				/* len is unchanged, so the loop condition
+				 * holds and the same range is tried again */
+				continue;
+			}
+			drbd_warn(peer_device->device, "%s: size=%d len=%d sent=%d\n",
+			     __func__, (int)size, len, sent);
+			/* sent == 0 keeps the -EIO default */
+			if (sent < 0)
+				err = sent;
+			break;
+		}
+		len    -= sent;
+		offset += sent;
+	} while (len > 0 /* THINK && device->cstate >= C_CONNECTED*/);
+	set_fs(oldfs);
+	/* cleared unconditionally once the send loop is over */
+	clear_bit(NET_CONGESTED, &peer_device->connection->flags);
+
+	if (len == 0) {
+		err = 0;
+		peer_device->device->send_cnt += size >> 9;
+	}
+	return err;
+}
+
+/* Send every segment of @bio through the copying path
+ * (_drbd_no_send_page()).  Stops at, and returns, the first error;
+ * returns 0 if all segments went out. */
+static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio)
+{
+	struct bio_vec seg;
+	struct bvec_iter pos;
+	int err = 0;
+
+	bio_for_each_segment(seg, bio, pos) {
+		/* hint all but last page with MSG_MORE */
+		unsigned more = bio_iter_last(seg, pos) ? 0 : MSG_MORE;
+
+		err = _drbd_no_send_page(peer_device, seg.bv_page,
+					 seg.bv_offset, seg.bv_len, more);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+/* Send every segment of @bio through _drbd_send_page().
+ * Stops at, and returns, the first error; returns 0 if all segments
+ * went out. */
+static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *bio)
+{
+	struct bio_vec seg;
+	struct bvec_iter pos;
+	int err = 0;
+
+	bio_for_each_segment(seg, bio, pos) {
+		/* hint all but last page with MSG_MORE */
+		unsigned more = bio_iter_last(seg, pos) ? 0 : MSG_MORE;
+
+		err = _drbd_send_page(peer_device, seg.bv_page,
+				      seg.bv_offset, seg.bv_len, more);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+/* Send the page chain of @peer_req through _drbd_send_page(), at most
+ * PAGE_SIZE bytes per page and peer_req->i.size bytes in total.
+ * Returns the first error, or 0. */
+static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device,
+			    struct drbd_peer_request *peer_req)
+{
+	struct page *page = peer_req->pages;
+	unsigned remaining = peer_req->i.size;
+
+	page_chain_for_each(page) {
+		unsigned chunk = min_t(unsigned, remaining, PAGE_SIZE);
+		/* hint all but last page with MSG_MORE */
+		unsigned more = page_chain_next(page) ? MSG_MORE : 0;
+		int err = _drbd_send_page(peer_device, page, 0, chunk, more);
+
+		if (err)
+			return err;
+		remaining -= chunk;
+	}
+	return 0;
+}
+
+/* Translate the REQ_* bits of @bi_rw into DP_* wire flags.
+ * REQ_SYNC is always translated; REQ_FUA, REQ_FLUSH and REQ_DISCARD only
+ * when the agreed protocol version is at least 95. */
+static u32 bio_flags_to_wire(struct drbd_connection *connection, unsigned long bi_rw)
+{
+	u32 wire = (bi_rw & REQ_SYNC) ? DP_RW_SYNC : 0;
+
+	if (connection->agreed_pro_version < 95)
+		return wire;
+
+	if (bi_rw & REQ_FUA)
+		wire |= DP_FUA;
+	if (bi_rw & REQ_FLUSH)
+		wire |= DP_FLUSH;
+	if (bi_rw & REQ_DISCARD)
+		wire |= DP_DISCARD;
+	return wire;
+}
+
+/* Used to send write or TRIM aka REQ_DISCARD requests
+ * R_PRIMARY -> Peer	(P_DATA, P_TRIM)
+ *
+ * Builds the p_data header (or p_trim for discards) in the data socket's
+ * command buffer, sends it with __send_command() and, for P_DATA, sends
+ * the payload of req->master_bio afterwards.
+ *
+ * Returns -EIO if no command buffer could be prepared, otherwise the
+ * result of the last send step.
+ */
+int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_socket *sock;
+	struct p_data *p;
+	unsigned int dp_flags = 0;
+	int dgs;
+	int err;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	/* digest size in bytes, 0 if no integrity transform is set */
+	dgs = peer_device->connection->integrity_tfm ?
+	      crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0;
+
+	if (!p)
+		return -EIO;
+	p->sector = cpu_to_be64(req->i.sector);
+	/* the request pointer itself is used as block_id */
+	p->block_id = (unsigned long)req;
+	p->seq_num = cpu_to_be32(atomic_inc_return(&device->packet_seq));
+	dp_flags = bio_flags_to_wire(peer_device->connection, req->master_bio->bi_rw);
+	if (device->state.conn >= C_SYNC_SOURCE &&
+	    device->state.conn <= C_PAUSED_SYNC_T)
+		dp_flags |= DP_MAY_SET_IN_SYNC;
+	if (peer_device->connection->agreed_pro_version >= 100) {
+		if (req->rq_state & RQ_EXP_RECEIVE_ACK)
+			dp_flags |= DP_SEND_RECEIVE_ACK;
+		if (req->rq_state & RQ_EXP_WRITE_ACK)
+			dp_flags |= DP_SEND_WRITE_ACK;
+	}
+	p->dp_flags = cpu_to_be32(dp_flags);
+
+	if (dp_flags & DP_DISCARD) {
+		/* the same buffer is reused as a p_trim packet */
+		struct p_trim *t = (struct p_trim*)p;
+		t->size = cpu_to_be32(req->i.size);
+		err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0);
+		goto out;
+	}
+
+	/* our digest is still only over the payload.
+	 * TRIM does not carry any payload. */
+	if (dgs)
+		/* the digest is written right behind the p_data header */
+		drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1);
+	err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size);
+	if (!err) {
+		/* For protocol A, we have to memcpy the payload into
+		 * socket buffers, as we may complete right away
+		 * as soon as we handed it over to tcp, at which point the data
+		 * pages may become invalid.
+		 *
+		 * For data-integrity enabled, we copy it as well, so we can be
+		 * sure that even if the bio pages may still be modified, it
+		 * won't change the data on the wire, thus if the digest checks
+		 * out ok after sending on this side, but does not fit on the
+		 * receiving side, we sure have detected corruption elsewhere.
+		 */
+		if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || dgs)
+			err = _drbd_send_bio(peer_device, req->master_bio);
+		else
+			err = _drbd_send_zc_bio(peer_device, req->master_bio);
+
+		/* double check digest, sometimes buffers have been modified in flight. */
+		if (dgs > 0 && dgs <= 64) {
+			/* 64 byte, 512 bit, is the largest digest size
+			 * currently supported in kernel crypto. */
+			unsigned char digest[64];
+			drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest);
+			/* a mismatch is only reported, err is not touched */
+			if (memcmp(p + 1, digest, dgs)) {
+				drbd_warn(device,
+					"Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
+					(unsigned long long)req->i.sector, req->i.size);
+			}
+		} /* else if (dgs > 64) {
+		     ... Be noisy about digest too large ...
+		} */
+	}
+out:
+	mutex_unlock(&sock->mutex);  /* locked by drbd_prepare_command() */
+
+	return err;
+}
+
+/* answer packet, used to send data back for read requests:
+ *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
+ *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
+ *
+ * Sends a p_data header of type @cmd, followed by the digest if an
+ * integrity transform is set, and then the pages of @peer_req.
+ * Returns -EIO if no command buffer could be prepared, otherwise the
+ * result of the last send step.
+ */
+int drbd_send_block(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+		    struct drbd_peer_request *peer_req)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_socket *sock;
+	struct p_data *p;
+	int err;
+	int dgs;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+
+	/* digest size in bytes, 0 if no integrity transform is set */
+	dgs = peer_device->connection->integrity_tfm ?
+	      crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0;
+
+	if (!p)
+		return -EIO;
+	p->sector = cpu_to_be64(peer_req->i.sector);
+	/* passed through without byte order conversion */
+	p->block_id = peer_req->block_id;
+	p->seq_num = 0;  /* unused */
+	p->dp_flags = 0;
+	if (dgs)
+		/* the digest is written right behind the p_data header */
+		drbd_csum_ee(peer_device->connection->integrity_tfm, peer_req, p + 1);
+	err = __send_command(peer_device->connection, device->vnr, sock, cmd, sizeof(*p) + dgs, NULL, peer_req->i.size);
+	if (!err)
+		err = _drbd_send_zc_ee(peer_device, peer_req);
+	mutex_unlock(&sock->mutex);  /* locked by drbd_prepare_command() */
+
+	return err;
+}
+
+/* Send a P_OUT_OF_SYNC packet describing sector and size of @req on the
+ * data socket.  Returns -EIO if no command buffer could be prepared,
+ * otherwise the result of drbd_send_command(). */
+int drbd_send_out_of_sync(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_socket *data_sock = &peer_device->connection->data;
+	struct p_block_desc *desc = drbd_prepare_command(peer_device, data_sock);
+
+	if (!desc)
+		return -EIO;
+
+	desc->sector = cpu_to_be64(req->i.sector);
+	desc->blksize = cpu_to_be32(req->i.size);
+
+	return drbd_send_command(peer_device, data_sock, P_OUT_OF_SYNC,
+				 sizeof(*desc), NULL, 0);
+}
+
+/*
+  drbd_send distinguishes two cases:
+
+  Packets sent via the data socket "sock"
+  and packets sent via the meta data socket "msock"
+
+		    sock                      msock
+  -----------------+-------------------------+------------------------------
+  timeout           conf.timeout / 2          conf.timeout / 2
+  timeout action    send a ping via msock     Abort communication
+					      and close all sockets
+*/
+
+/*
+ * you must have down()ed the appropriate [m]sock_mutex elsewhere!
+ *
+ * Sends up to @size bytes from @buf on @sock, looping until everything
+ * went out or sending failed.
+ *
+ * Returns -EBADR if @sock is NULL; otherwise the number of bytes sent so
+ * far, which is less than @size when sending failed.  A failure is not
+ * returned as a negative value, it is turned into a request for
+ * C_TIMEOUT or C_BROKEN_PIPE on the connection.
+ */
+int drbd_send(struct drbd_connection *connection, struct socket *sock,
+	      void *buf, size_t size, unsigned msg_flags)
+{
+	struct kvec iov;
+	struct msghdr msg;
+	int rv, sent = 0;
+
+	if (!sock)
+		return -EBADR;
+
+	/* THINK  if (signal_pending) return ... ? */
+
+	iov.iov_base = buf;
+	iov.iov_len  = size;
+
+	msg.msg_name       = NULL;
+	msg.msg_namelen    = 0;
+	msg.msg_control    = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags      = msg_flags | MSG_NOSIGNAL;
+
+	/* on the data socket: reload the retry budget from the net_conf
+	 * and update the congestion flag */
+	if (sock == connection->data.socket) {
+		rcu_read_lock();
+		connection->ko_count = rcu_dereference(connection->net_conf)->ko_count;
+		rcu_read_unlock();
+		drbd_update_congested(connection);
+	}
+	do {
+		/* STRANGE
+		 * tcp_sendmsg does _not_ use its size parameter at all ?
+		 *
+		 * -EAGAIN on timeout, -EINTR on signal.
+		 */
+/* THINK
+ * do we need to block DRBD_SIG if sock == &meta.socket ??
+ * otherwise wake_asender() might interrupt some send_*Ack !
+ */
+		rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
+		if (rv == -EAGAIN) {
+			if (we_should_drop_the_connection(connection, sock))
+				break;
+			else
+				continue;
+		}
+		/* a signal is flushed and counted as "nothing sent" */
+		if (rv == -EINTR) {
+			flush_signals(current);
+			rv = 0;
+		}
+		if (rv < 0)
+			break;
+		sent += rv;
+		iov.iov_base += rv;
+		iov.iov_len  -= rv;
+	} while (sent < size);
+
+	if (sock == connection->data.socket)
+		clear_bit(NET_CONGESTED, &connection->flags);
+
+	/* rv still holds the result of the last kernel_sendmsg() call */
+	if (rv <= 0) {
+		if (rv != -EAGAIN) {
+			drbd_err(connection, "%s_sendmsg returned %d\n",
+				 sock == connection->meta.socket ? "msock" : "sock",
+				 rv);
+			conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
+		} else
+			conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
+	}
+
+	return sent;
+}
+
+/**
+ * drbd_send_all  -  Send an entire buffer
+ *
+ * Returns 0 upon success and a negative error value otherwise.
+ * A short send by drbd_send() is reported as -EIO.
+ */
+int drbd_send_all(struct drbd_connection *connection, struct socket *sock, void *buffer,
+		  size_t size, unsigned msg_flags)
+{
+	int sent = drbd_send(connection, sock, buffer, size, msg_flags);
+
+	if (sent < 0)
+		return sent;
+	return sent != size ? -EIO : 0;
+}
+
+/* Block device open.
+ * A device that is not R_PRIMARY refuses writers with -EROFS and, unless
+ * allow_oos is set, readers with -EMEDIUMTYPE.  A successful open bumps
+ * device->open_cnt. */
+static int drbd_open(struct block_device *bdev, fmode_t mode)
+{
+	struct drbd_device *device = bdev->bd_disk->private_data;
+	unsigned long irq_flags;
+	int err;
+
+	mutex_lock(&drbd_main_mutex);
+	/* to have a stable device->state.role
+	 * and no race with updating open_cnt */
+	spin_lock_irqsave(&device->resource->req_lock, irq_flags);
+
+	if (device->state.role == R_PRIMARY)
+		err = 0;
+	else if (mode & FMODE_WRITE)
+		err = -EROFS;
+	else if (!allow_oos)
+		err = -EMEDIUMTYPE;
+	else
+		err = 0;
+
+	if (err == 0)
+		device->open_cnt++;
+
+	spin_unlock_irqrestore(&device->resource->req_lock, irq_flags);
+	mutex_unlock(&drbd_main_mutex);
+
+	return err;
+}
+
+/* Block device release: drops one count from device->open_cnt under
+ * drbd_main_mutex.  @mode is not used. */
+static void drbd_release(struct gendisk *gd, fmode_t mode)
+{
+	struct drbd_device *device = gd->private_data;
+	mutex_lock(&drbd_main_mutex);
+	device->open_cnt--;
+	mutex_unlock(&drbd_main_mutex);
+}
+
+/* Put device->state back to its defaults: secondary, standalone and
+ * diskless, with peer role and peer disk unknown. */
+static void drbd_set_defaults(struct drbd_device *device)
+{
+	/* Beware! The actual layout differs
+	 * between big endian and little endian */
+	device->state = (union drbd_dev_state) {
+		{ .role = R_SECONDARY,
+		  .peer = R_UNKNOWN,
+		  .conn = C_STANDALONE,
+		  .disk = D_DISKLESS,
+		  .pdsk = D_UNKNOWN,
+		} };
+}
+
+/* Initialize the members of a drbd_device that need more than zeroing:
+ * default state, counters, locks, list heads, work callbacks, timers,
+ * wait queues and the initial bio size limits. */
+void drbd_init_set_defaults(struct drbd_device *device)
+{
+	/* the memset(,0,) did most of this.
+	 * note: only assignments, no allocation in here */
+
+	drbd_set_defaults(device);
+
+	atomic_set(&device->ap_bio_cnt, 0);
+	atomic_set(&device->ap_pending_cnt, 0);
+	atomic_set(&device->rs_pending_cnt, 0);
+	atomic_set(&device->unacked_cnt, 0);
+	atomic_set(&device->local_cnt, 0);
+	atomic_set(&device->pp_in_use_by_net, 0);
+	atomic_set(&device->rs_sect_in, 0);
+	atomic_set(&device->rs_sect_ev, 0);
+	atomic_set(&device->ap_in_flight, 0);
+	atomic_set(&device->md_io_in_use, 0);
+
+	/* state_mutex starts out pointing at the device's own mutex */
+	mutex_init(&device->own_state_mutex);
+	device->state_mutex = &device->own_state_mutex;
+
+	spin_lock_init(&device->al_lock);
+	spin_lock_init(&device->peer_seq_lock);
+
+	/* peer request lists and the list heads of the work items */
+	INIT_LIST_HEAD(&device->active_ee);
+	INIT_LIST_HEAD(&device->sync_ee);
+	INIT_LIST_HEAD(&device->done_ee);
+	INIT_LIST_HEAD(&device->read_ee);
+	INIT_LIST_HEAD(&device->net_ee);
+	INIT_LIST_HEAD(&device->resync_reads);
+	INIT_LIST_HEAD(&device->resync_work.list);
+	INIT_LIST_HEAD(&device->unplug_work.list);
+	INIT_LIST_HEAD(&device->go_diskless.list);
+	INIT_LIST_HEAD(&device->md_sync_work.list);
+	INIT_LIST_HEAD(&device->start_resync_work.list);
+	INIT_LIST_HEAD(&device->bm_io_work.w.list);
+
+	/* callbacks of the work items */
+	device->resync_work.cb  = w_resync_timer;
+	device->unplug_work.cb  = w_send_write_hint;
+	device->go_diskless.cb  = w_go_diskless;
+	device->md_sync_work.cb = w_md_sync;
+	device->bm_io_work.w.cb = w_bitmap_io;
+	device->start_resync_work.cb = w_start_resync;
+
+	/* all four timers get the device as callback argument */
+	init_timer(&device->resync_timer);
+	init_timer(&device->md_sync_timer);
+	init_timer(&device->start_resync_timer);
+	init_timer(&device->request_timer);
+	device->resync_timer.function = resync_timer_fn;
+	device->resync_timer.data = (unsigned long) device;
+	device->md_sync_timer.function = md_sync_timer_fn;
+	device->md_sync_timer.data = (unsigned long) device;
+	device->start_resync_timer.function = start_resync_timer_fn;
+	device->start_resync_timer.data = (unsigned long) device;
+	device->request_timer.function = request_timer_fn;
+	device->request_timer.data = (unsigned long) device;
+
+	init_waitqueue_head(&device->misc_wait);
+	init_waitqueue_head(&device->state_wait);
+	init_waitqueue_head(&device->ee_wait);
+	init_waitqueue_head(&device->al_wait);
+	init_waitqueue_head(&device->seq_wait);
+
+	device->resync_wenr = LC_FREE;
+	device->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
+	device->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
+}
+
+/* Reset a device: zero the statistics and resync bookkeeping, set the
+ * capacity to 0, shrink and clean up the bitmap if there is one, free the
+ * backing device description and put the state back to its defaults.
+ * Complains if the receiver thread state is not NONE and asserts that
+ * the peer request lists and work lists are empty. */
+void drbd_device_cleanup(struct drbd_device *device)
+{
+	int i;
+	if (first_peer_device(device)->connection->receiver.t_state != NONE)
+		drbd_err(device, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
+				first_peer_device(device)->connection->receiver.t_state);
+
+	/* one chained assignment: every counter below becomes 0 */
+	device->al_writ_cnt  =
+	device->bm_writ_cnt  =
+	device->read_cnt     =
+	device->recv_cnt     =
+	device->send_cnt     =
+	device->writ_cnt     =
+	device->p_size       =
+	device->rs_start     =
+	device->rs_total     =
+	device->rs_failed    = 0;
+	device->rs_last_events = 0;
+	device->rs_last_sect_ev = 0;
+	for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+		device->rs_mark_left[i] = 0;
+		device->rs_mark_time[i] = 0;
+	}
+	D_ASSERT(device, first_peer_device(device)->connection->net_conf == NULL);
+
+	drbd_set_my_capacity(device, 0);
+	if (device->bitmap) {
+		/* maybe never allocated. */
+		drbd_bm_resize(device, 0, 1);
+		drbd_bm_cleanup(device);
+	}
+
+	drbd_free_bc(device->ldev);
+	device->ldev = NULL;
+
+	clear_bit(AL_SUSPENDED, &device->flags);
+
+	D_ASSERT(device, list_empty(&device->active_ee));
+	D_ASSERT(device, list_empty(&device->sync_ee));
+	D_ASSERT(device, list_empty(&device->done_ee));
+	D_ASSERT(device, list_empty(&device->read_ee));
+	D_ASSERT(device, list_empty(&device->net_ee));
+	D_ASSERT(device, list_empty(&device->resync_reads));
+	D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q));
+	D_ASSERT(device, list_empty(&device->resync_work.list));
+	D_ASSERT(device, list_empty(&device->unplug_work.list));
+	D_ASSERT(device, list_empty(&device->go_diskless.list));
+
+	drbd_set_defaults(device);
+}
+
+
+/* Free everything drbd_create_mempools() may have set up: the private
+ * page pool first, then bio set and mempools, then the slab caches.
+ * Pointers that are still NULL are skipped, and all of them are NULL
+ * afterwards. */
+static void drbd_destroy_mempools(void)
+{
+	struct page *page;
+
+	/* the pool is a singly linked list chained through page_private() */
+	for (page = drbd_pp_pool; page; page = drbd_pp_pool) {
+		drbd_pp_pool = (struct page *)page_private(page);
+		__free_page(page);
+		drbd_pp_vacant--;
+	}
+
+	/* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */
+
+	if (drbd_md_io_bio_set) {
+		bioset_free(drbd_md_io_bio_set);
+		drbd_md_io_bio_set = NULL;
+	}
+	if (drbd_md_io_page_pool) {
+		mempool_destroy(drbd_md_io_page_pool);
+		drbd_md_io_page_pool = NULL;
+	}
+	if (drbd_ee_mempool) {
+		mempool_destroy(drbd_ee_mempool);
+		drbd_ee_mempool = NULL;
+	}
+	if (drbd_request_mempool) {
+		mempool_destroy(drbd_request_mempool);
+		drbd_request_mempool = NULL;
+	}
+	if (drbd_ee_cache) {
+		kmem_cache_destroy(drbd_ee_cache);
+		drbd_ee_cache = NULL;
+	}
+	if (drbd_request_cache) {
+		kmem_cache_destroy(drbd_request_cache);
+		drbd_request_cache = NULL;
+	}
+	if (drbd_bm_ext_cache) {
+		kmem_cache_destroy(drbd_bm_ext_cache);
+		drbd_bm_ext_cache = NULL;
+	}
+	if (drbd_al_ext_cache) {
+		kmem_cache_destroy(drbd_al_ext_cache);
+		drbd_al_ext_cache = NULL;
+	}
+}
+
+/* Create the slab caches, mempools, bio set and the private page pool
+ * used by DRBD.  The request/ee mempools and the page pool are sized for
+ * (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count entries.
+ *
+ * Returns 0 on success.  On any allocation failure everything set up so
+ * far is torn down again via drbd_destroy_mempools() and -ENOMEM is
+ * returned. */
+static int drbd_create_mempools(void)
+{
+	struct page *page;
+	const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
+	int i;
+
+	/* prepare our caches and mempools */
+	drbd_request_mempool = NULL;
+	drbd_ee_cache        = NULL;
+	drbd_request_cache   = NULL;
+	drbd_bm_ext_cache    = NULL;
+	drbd_al_ext_cache    = NULL;
+	drbd_pp_pool         = NULL;
+	drbd_pp_vacant       = 0;
+	drbd_md_io_page_pool = NULL;
+	drbd_md_io_bio_set   = NULL;
+
+	/* caches */
+	drbd_request_cache = kmem_cache_create(
+		"drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
+	if (drbd_request_cache == NULL)
+		goto Enomem;
+
+	drbd_ee_cache = kmem_cache_create(
+		"drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL);
+	if (drbd_ee_cache == NULL)
+		goto Enomem;
+
+	drbd_bm_ext_cache = kmem_cache_create(
+		"drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
+	if (drbd_bm_ext_cache == NULL)
+		goto Enomem;
+
+	drbd_al_ext_cache = kmem_cache_create(
+		"drbd_al", sizeof(struct lc_element), 0, 0, NULL);
+	if (drbd_al_ext_cache == NULL)
+		goto Enomem;
+
+	/* mempools */
+	drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
+	if (drbd_md_io_bio_set == NULL)
+		goto Enomem;
+
+	drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
+	if (drbd_md_io_page_pool == NULL)
+		goto Enomem;
+
+	drbd_request_mempool = mempool_create(number,
+		mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
+	if (drbd_request_mempool == NULL)
+		goto Enomem;
+
+	drbd_ee_mempool = mempool_create(number,
+		mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
+	if (drbd_ee_mempool == NULL)
+		goto Enomem;
+
+	/* drbd's page pool */
+	spin_lock_init(&drbd_pp_lock);
+
+	for (i = 0; i < number; i++) {
+		page = alloc_page(GFP_HIGHUSER);
+		if (!page)
+			goto Enomem;
+		set_page_private(page, (unsigned long)drbd_pp_pool);
+		drbd_pp_pool = page;
+		/* Count each page as it joins the pool: the error path
+		 * decrements once per freed page, so the counter has to
+		 * match the pool even when we bail out half way. */
+		drbd_pp_vacant++;
+	}
+
+	return 0;
+
+Enomem:
+	drbd_destroy_mempools(); /* in case we allocated some */
+	return -ENOMEM;
+}
+
+/* Reboot notifier callback (registered through drbd_notifier below).
+ * Currently a no-op that always returns NOTIFY_DONE. */
+static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
+	void *unused)
+{
+	/* just so we have it.  you never know what interesting things we
+	 * might want to do here some day...
+	 */
+
+	return NOTIFY_DONE;
+}
+
+/* notifier block that carries drbd_notify_sys() */
+static struct notifier_block drbd_notifier = {
+	.notifier_call = drbd_notify_sys,
+};
+
+/* Free whatever is still on the device's peer request lists (active,
+ * sync, read, done, net - in that order) and log an error for every
+ * list that was not empty. */
+static void drbd_release_all_peer_reqs(struct drbd_device *device)
+{
+	int leftover;
+
+	leftover = drbd_free_peer_reqs(device, &device->active_ee);
+	if (leftover != 0)
+		drbd_err(device, "%d EEs in active list found!\n", leftover);
+
+	leftover = drbd_free_peer_reqs(device, &device->sync_ee);
+	if (leftover != 0)
+		drbd_err(device, "%d EEs in sync list found!\n", leftover);
+
+	leftover = drbd_free_peer_reqs(device, &device->read_ee);
+	if (leftover != 0)
+		drbd_err(device, "%d EEs in read list found!\n", leftover);
+
+	leftover = drbd_free_peer_reqs(device, &device->done_ee);
+	if (leftover != 0)
+		drbd_err(device, "%d EEs in done list found!\n", leftover);
+
+	leftover = drbd_free_peer_reqs(device, &device->net_ee);
+	if (leftover != 0)
+		drbd_err(device, "%d EEs in net list found!\n", leftover);
+}
+
+/* caution. no locking. */
+/* kref release function of a drbd_device: frees the device and what hangs
+ * off it, then drops one reference on each connection of its resource
+ * and one on the resource itself. */
+void drbd_destroy_device(struct kref *kref)
+{
+	struct drbd_device *device = container_of(kref, struct drbd_device, kref);
+	/* remembered up front: device is freed before the resource and its
+	 * connections are put */
+	struct drbd_resource *resource = device->resource;
+	struct drbd_connection *connection;
+
+	del_timer_sync(&device->request_timer);
+
+	/* paranoia asserts */
+	D_ASSERT(device, device->open_cnt == 0);
+	/* end paranoia asserts */
+
+	/* cleanup stuff that may have been allocated during
+	 * device (re-)configuration or state changes */
+
+	if (device->this_bdev)
+		bdput(device->this_bdev);
+
+	drbd_free_bc(device->ldev);
+	device->ldev = NULL;
+
+	drbd_release_all_peer_reqs(device);
+
+	lc_destroy(device->act_log);
+	lc_destroy(device->resync);
+
+	kfree(device->p_uuid);
+	/* device->p_uuid = NULL; */
+
+	if (device->bitmap) /* should no longer be there. */
+		drbd_bm_cleanup(device);
+	__free_page(device->md_io_page);
+	put_disk(device->vdisk);
+	blk_cleanup_queue(device->rq_queue);
+	kfree(device->rs_plan_s);
+	/* the peer device is looked up through device, so it goes first */
+	kfree(first_peer_device(device));
+	kfree(device);
+
+	for_each_connection(connection, resource)
+		kref_put(&connection->kref, drbd_destroy_connection);
+	kref_put(&resource->kref, drbd_destroy_resource);
+}
+
+/* One global retry thread, if we need to push back some bio and have it
+ * reinserted through our make request function.
+ */
+static struct retry_worker {
+	struct workqueue_struct *wq;	/* workqueue the worker is queued on */
+	struct work_struct worker;	/* work item, see do_retry() */
+
+	spinlock_t lock;		/* taken around accesses to writes */
+	struct list_head writes;	/* requests, linked via tl_requests */
+} retry;
+
+/* Work function of the retry worker: takes over everything queued on
+ * retry->writes and pushes each request's master bio through
+ * __drbd_make_request() again, keeping the original start_time. */
+static void do_retry(struct work_struct *ws)
+{
+	struct retry_worker *retry = container_of(ws, struct retry_worker, worker);
+	LIST_HEAD(writes);
+	struct drbd_request *req, *tmp;
+
+	/* move the whole list to a private head, so the lock is not held
+	 * while the requests are resubmitted */
+	spin_lock_irq(&retry->lock);
+	list_splice_init(&retry->writes, &writes);
+	spin_unlock_irq(&retry->lock);
+
+	list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
+		/* everything needed later is copied out of req before the
+		 * reference on it is put below */
+		struct drbd_device *device = req->device;
+		struct bio *bio = req->master_bio;
+		unsigned long start_time = req->start_time;
+		bool expected;
+
+		expected =
+			expect(atomic_read(&req->completion_ref) == 0) &&
+			expect(req->rq_state & RQ_POSTPONED) &&
+			expect((req->rq_state & RQ_LOCAL_PENDING) == 0 ||
+				(req->rq_state & RQ_LOCAL_ABORTED) != 0);
+
+		/* only reported; the request is retried regardless */
+		if (!expected)
+			drbd_err(device, "req=%p completion_ref=%d rq_state=%x\n",
+				req, atomic_read(&req->completion_ref),
+				req->rq_state);
+
+		/* We still need to put one kref associated with the
+		 * "completion_ref" going zero in the code path that queued it
+		 * here.  The request object may still be referenced by a
+		 * frozen local req->private_bio, in case we force-detached.
+		 */
+		kref_put(&req->kref, drbd_req_destroy);
+
+		/* A single suspended or otherwise blocking device may stall
+		 * all others as well.  Fortunately, this code path is to
+		 * recover from a situation that "should not happen":
+		 * concurrent writes in multi-primary setup.
+		 * In a "normal" lifecycle, this workqueue is supposed to be
+		 * destroyed without ever doing anything.
+		 * If it turns out to be an issue anyways, we can do per
+		 * resource (replication group) or per device (minor) retry
+		 * workqueues instead.
+		 */
+
+		/* We are not just doing generic_make_request(),
+		 * as we want to keep the start_time information. */
+		inc_ap_bio(device);
+		__drbd_make_request(device, bio, start_time);
+	}
+}
+
+/* Queue @req for a retry: move it to the tail of retry.writes and kick
+ * the retry work item on retry.wq. */
+void drbd_restart_request(struct drbd_request *req)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&retry.lock, irq_flags);
+	list_move_tail(&req->tl_requests, &retry.writes);
+	spin_unlock_irqrestore(&retry.lock, irq_flags);
+
+	/* Drop the extra reference that would otherwise
+	 * have been dropped by complete_master_bio.
+	 * do_retry() needs to grab a new one. */
+	dec_ap_bio(req->device);
+
+	queue_work(retry.wq, &retry.worker);
+}
+
+/* kref release function of a drbd_resource: destroys the devices idr,
+ * frees the cpu mask and the name, then the resource itself. */
+void drbd_destroy_resource(struct kref *kref)
+{
+	struct drbd_resource *res = container_of(kref, struct drbd_resource, kref);
+
+	idr_destroy(&res->devices);
+	free_cpumask_var(res->cpu_mask);
+	kfree(res->name);
+	kfree(res);
+}
+
+/* Unlink every connection of @resource and drop one reference on each,
+ * then drop one reference on the resource itself. */
+void drbd_free_resource(struct drbd_resource *resource)
+{
+	struct drbd_connection *conn, *next;
+
+	for_each_connection_safe(conn, next, resource) {
+		list_del(&conn->connections);
+		kref_put(&conn->kref, drbd_destroy_connection);
+	}
+
+	kref_put(&resource->kref, drbd_destroy_resource);
+}
+
+/* Module teardown: unregisters the reboot notifier, removes the proc
+ * entry, destroys the retry workqueue, unregisters generic netlink,
+ * deletes all devices and resources, frees the mempools and finally
+ * releases the block major and the device idr. */
+static void drbd_cleanup(void)
+{
+	unsigned int i;
+	struct drbd_device *device;
+	struct drbd_resource *resource, *tmp;
+
+	unregister_reboot_notifier(&drbd_notifier);
+
+	/* first remove proc,
+	 * drbdsetup uses its presence to detect
+	 * whether DRBD is loaded.
+	 * If we would get stuck in proc removal,
+	 * but have netlink already deregistered,
+	 * some drbdsetup commands may wait forever
+	 * for an answer.
+	 */
+	if (drbd_proc)
+		remove_proc_entry("drbd", NULL);
+
+	if (retry.wq)
+		destroy_workqueue(retry.wq);
+
+	drbd_genl_unregister();
+
+	idr_for_each_entry(&drbd_devices, device, i)
+		drbd_delete_device(device);
+
+	/* not _rcu since, no other updater anymore. Genl already unregistered */
+	for_each_resource_safe(resource, tmp, &drbd_resources) {
+		list_del(&resource->resources);
+		drbd_free_resource(resource);
+	}
+
+	drbd_destroy_mempools();
+	unregister_blkdev(DRBD_MAJOR, "drbd");
+
+	idr_destroy(&drbd_devices);
+
+	printk(KERN_INFO "drbd: module cleanup done.\n");
+}
+
+/**
+ * drbd_congested() - Callback for the flusher thread
+ * @congested_data:	User data, used as the struct drbd_device pointer
+ * @bdi_bits:		Bits the BDI flusher thread is currently interested in
+ *
+ * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
+ *
+ * The cause is recorded in device->congestion_reason:
+ * '-' not congested, 'd' DRBD has frozen IO, 'c' callback pending,
+ * 'b' backing device, 'n' network, 'a' backing device and network.
+ */
+static int drbd_congested(void *congested_data, int bdi_bits)
+{
+	struct drbd_device *device = congested_data;
+	struct request_queue *q;
+	char reason = '-';
+	int r = 0;
+
+	if (!may_inc_ap_bio(device)) {
+		/* DRBD has frozen IO */
+		r = bdi_bits;
+		reason = 'd';
+		goto out;
+	}
+
+	if (test_bit(CALLBACK_PENDING, &first_peer_device(device)->connection->flags)) {
+		r |= (1 << BDI_async_congested);
+		/* Without good local data, we would need to read from remote,
+		 * and that would need the worker thread as well, which is
+		 * currently blocked waiting for that usermode helper to
+		 * finish.
+		 */
+		if (!get_ldev_if_state(device, D_UP_TO_DATE))
+			r |= (1 << BDI_sync_congested);
+		else
+			put_ldev(device);
+		/* report only the bits that were asked for */
+		r &= bdi_bits;
+		reason = 'c';
+		goto out;
+	}
+
+	/* ask the backing device, if there is one */
+	if (get_ldev(device)) {
+		q = bdev_get_queue(device->ldev->backing_bdev);
+		r = bdi_congested(&q->backing_dev_info, bdi_bits);
+		put_ldev(device);
+		if (r)
+			reason = 'b';
+	}
+
+	/* network congestion is reported as async congestion only */
+	if (bdi_bits & (1 << BDI_async_congested) &&
+	    test_bit(NET_CONGESTED, &first_peer_device(device)->connection->flags)) {
+		r |= (1 << BDI_async_congested);
+		reason = reason == 'b' ? 'a' : 'n';
+	}
+
+out:
+	device->congestion_reason = reason;
+	return r;
+}
+
+/* Set up an empty drbd_work_queue: its lock, list head and wait queue. */
+static void drbd_init_workqueue(struct drbd_work_queue *wq)
+{
+	spin_lock_init(&wq->q_lock);
+	INIT_LIST_HEAD(&wq->q);
+	init_waitqueue_head(&wq->q_wait);
+}
+
+/* A work item bundled with a completion; w_complete() signals @done
+ * when the item is run. */
+struct completion_work {
+	struct drbd_work w;
+	struct completion done;
+};
+
+/* Work callback: signals the completion embedded in the enclosing
+ * completion_work.  @cancel is ignored; always returns 0. */
+static int w_complete(struct drbd_work *w, int cancel)
+{
+	struct completion_work *cw = container_of(w, struct completion_work, w);
+
+	complete(&cw->done);
+	return 0;
+}
+
+void drbd_flush_workqueue(struct drbd_work_queue *work_queue)
+{
+	struct completion_work barrier;	/* lives on our stack; we wait for it below */
+
+	init_completion(&barrier.done);
+	barrier.w.cb = w_complete;
+	drbd_queue_work(work_queue, &barrier.w);
+	wait_for_completion(&barrier.done);	/* returns after w_complete() ran for it */
+}
+
+struct drbd_resource *drbd_find_resource(const char *name)
+{
+	struct drbd_resource *resource, *match = NULL;
+
+	if (!name || !name[0])
+		return NULL;
+
+	rcu_read_lock();
+	for_each_resource_rcu(resource, &drbd_resources) {
+		if (strcmp(resource->name, name) != 0)
+			continue;
+		/* take the reference while still inside the RCU section */
+		kref_get(&resource->kref);
+		match = resource;
+		break;
+	}
+	rcu_read_unlock();
+	return match;
+}
+
+struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len,
+				     void *peer_addr, int peer_addr_len)
+{
+	struct drbd_resource *resource;
+	struct drbd_connection *connection;
+
+	rcu_read_lock();
+	for_each_resource_rcu(resource, &drbd_resources) {
+		for_each_connection_rcu(connection, resource) {
+			if (connection->my_addr_len != my_addr_len ||
+			    connection->peer_addr_len != peer_addr_len ||
+			    memcmp(&connection->my_addr, my_addr, my_addr_len) ||
+			    memcmp(&connection->peer_addr, peer_addr, peer_addr_len))
+				continue;	/* lengths or address bytes differ */
+			kref_get(&connection->kref);	/* returned with this reference held */
+			goto out;
+		}
+	}
+	connection = NULL;
+out:
+	rcu_read_unlock();
+	return connection;
+}
+
+static int drbd_alloc_socket(struct drbd_socket *socket)
+{
+	/* One page each for rbuf and sbuf.  If only sbuf fails, rbuf stays
+	 * allocated; conn_create() releases it through drbd_free_socket(). */
+	socket->rbuf = (void *) __get_free_page(GFP_KERNEL);
+	if (!socket->rbuf)
+		return -ENOMEM;
+	socket->sbuf = (void *) __get_free_page(GFP_KERNEL);
+	return socket->sbuf ? 0 : -ENOMEM;
+}
+
+static void drbd_free_socket(struct drbd_socket *socket)
+{
+	free_page((unsigned long) socket->rbuf);	/* the two pages from drbd_alloc_socket() */
+	free_page((unsigned long) socket->sbuf);
+}
+
+void conn_free_crypto(struct drbd_connection *connection)
+{
+	drbd_free_sock(connection);
+
+	/* release every hash transform and digest buffer, clearing each pointer */
+	crypto_free_hash(connection->csums_tfm);
+	connection->csums_tfm = NULL;
+	crypto_free_hash(connection->verify_tfm);
+	connection->verify_tfm = NULL;
+	crypto_free_hash(connection->cram_hmac_tfm);
+	connection->cram_hmac_tfm = NULL;
+	crypto_free_hash(connection->integrity_tfm);
+	connection->integrity_tfm = NULL;
+	crypto_free_hash(connection->peer_integrity_tfm);
+	connection->peer_integrity_tfm = NULL;
+	kfree(connection->int_dig_in);
+	connection->int_dig_in = NULL;
+	kfree(connection->int_dig_vv);
+	connection->int_dig_vv = NULL;
+}
+
+int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts)
+{
+	struct drbd_connection *connection;
+	cpumask_var_t new_cpu_mask;
+	int err;
+
+	if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL))
+		return -ENOMEM;
+	/* From here on new_cpu_mask is released at "fail:".  Below: parse
+	 * res_opts->cpu_mask, copy *res_opts into the resource and, if the
+	 * resulting mask differs from resource->cpu_mask, set reset_cpu_mask
+	 * on every connection's receiver, asender and worker thread. */
+
+	/* silently ignore cpu mask on UP kernel */
+	if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
+		err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE,
+				   cpumask_bits(new_cpu_mask), nr_cpu_ids);
+		if (err) {
+			drbd_warn(resource, "bitmap_parse() failed with %d\n", err);
+			/* nothing in @resource was modified yet; return the parse error */
+			goto fail;
+		}
+	}
+	resource->res_opts = *res_opts;
+	if (cpumask_empty(new_cpu_mask))
+		drbd_calc_cpu_mask(&new_cpu_mask);	/* presumably fills in a default mask - confirm */
+	if (!cpumask_equal(resource->cpu_mask, new_cpu_mask)) {
+		cpumask_copy(resource->cpu_mask, new_cpu_mask);
+		for_each_connection_rcu(connection, resource) {	/* NOTE(review): no rcu_read_lock() visible here - confirm */
+			connection->receiver.reset_cpu_mask = 1;
+			connection->asender.reset_cpu_mask = 1;
+			connection->worker.reset_cpu_mask = 1;
+		}
+	}
+	err = 0;
+
+fail:
+	free_cpumask_var(new_cpu_mask);
+	return err;
+
+}
+
+/* Allocate a resource called @name and add it to drbd_resources; returns NULL on failure. */
+struct drbd_resource *drbd_create_resource(const char *name)
+{
+	struct drbd_resource *resource = kzalloc(sizeof(*resource), GFP_KERNEL);
+
+	if (!resource)
+		return NULL;
+	resource->name = kstrdup(name, GFP_KERNEL);
+	if (!resource->name)
+		goto fail_free_resource;
+	if (!zalloc_cpumask_var(&resource->cpu_mask, GFP_KERNEL))
+		goto fail_free_name;
+	kref_init(&resource->kref);
+	idr_init(&resource->devices);
+	INIT_LIST_HEAD(&resource->connections);
+	mutex_init(&resource->conf_update);
+	mutex_init(&resource->adm_mutex);
+	spin_lock_init(&resource->req_lock);
+	/* Publish last: RCU readers must never find a half-initialised resource. */
+	list_add_tail_rcu(&resource->resources, &drbd_resources);
+	return resource;
+
+fail_free_name:
+	kfree(resource->name);
+fail_free_resource:
+	kfree(resource);
+	return NULL;
+}
+
+/* Create a connection plus a new resource called @name; NULL on failure.  Caller must be under genl_lock(). */
+struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
+{
+	struct drbd_resource *resource;
+	struct drbd_connection *connection;
+
+	connection = kzalloc(sizeof(struct drbd_connection), GFP_KERNEL);
+	if (!connection)
+		return NULL;
+
+	if (drbd_alloc_socket(&connection->data))
+		goto fail;
+	if (drbd_alloc_socket(&connection->meta))
+		goto fail;
+
+	connection->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
+	if (!connection->current_epoch)
+		goto fail;
+
+	INIT_LIST_HEAD(&connection->transfer_log);
+
+	INIT_LIST_HEAD(&connection->current_epoch->list);
+	connection->epochs = 1;	/* the current_epoch allocated above */
+	spin_lock_init(&connection->epoch_lock);
+	connection->write_ordering = WO_bdev_flush;
+
+	connection->send.seen_any_write_yet = false;
+	connection->send.current_epoch_nr = 0;
+	connection->send.current_epoch_writes = 0;
+
+	resource = drbd_create_resource(name);	/* also adds it to drbd_resources */
+	if (!resource)
+		goto fail;
+
+	connection->cstate = C_STANDALONE;
+	mutex_init(&connection->cstate_mutex);
+	init_waitqueue_head(&connection->ping_wait);
+	idr_init(&connection->peer_devices);
+
+	drbd_init_workqueue(&connection->sender_work);
+	mutex_init(&connection->data.mutex);
+	mutex_init(&connection->meta.mutex);
+
+	drbd_thread_init(resource, &connection->receiver, drbd_receiver, "receiver");
+	connection->receiver.connection = connection;
+	drbd_thread_init(resource, &connection->worker, drbd_worker, "worker");
+	connection->worker.connection = connection;
+	drbd_thread_init(resource, &connection->asender, drbd_asender, "asender");
+	connection->asender.connection = connection;
+
+	kref_init(&connection->kref);
+
+	connection->resource = resource;
+
+	if (set_resource_options(resource, res_opts))
+		goto fail_resource;
+
+	kref_get(&resource->kref);	/* the connection's reference; put in drbd_destroy_connection() */
+	list_add_tail_rcu(&connection->connections, &resource->connections);
+	return connection;
+
+fail_resource:
+	list_del(&resource->resources);	/* undo the list_add_tail_rcu() done by drbd_create_resource() */
+	drbd_free_resource(resource);
+fail:
+	kfree(connection->current_epoch);	/* still NULL (kzalloc) if we failed before allocating it */
+	drbd_free_socket(&connection->meta);
+	drbd_free_socket(&connection->data);
+	kfree(connection);
+	return NULL;
+}
+
+void drbd_destroy_connection(struct kref *kref)
+{
+	struct drbd_connection *connection = container_of(kref, struct drbd_connection, kref);
+	struct drbd_resource *resource = connection->resource;	/* saved: needed after kfree(connection) */
+
+	if (atomic_read(&connection->current_epoch->epoch_size) !=  0)
+		drbd_err(connection, "epoch_size:%d\n", atomic_read(&connection->current_epoch->epoch_size));
+	kfree(connection->current_epoch);	/* freed even if the size check above complained */
+
+	idr_destroy(&connection->peer_devices);
+
+	drbd_free_socket(&connection->meta);
+	drbd_free_socket(&connection->data);
+	kfree(connection->int_dig_in);
+	kfree(connection->int_dig_vv);
+	kfree(connection);
+	kref_put(&resource->kref, drbd_destroy_resource);	/* pairs with the kref_get() in conn_create() */
+}
+
+static int init_submitter(struct drbd_device *device)
+{
+	/* opencoded create_singlethread_workqueue(),
+	 * to be able to say "drbd%u_submit", ..., minor */
+	device->submit.wq = alloc_workqueue("drbd%u_submit",
+			WQ_UNBOUND | WQ_MEM_RECLAIM, 1, device->minor);
+	if (!device->submit.wq)
+		return -ENOMEM;
+
+	INIT_WORK(&device->submit.worker, do_submit);	/* do_submit() is the work function */
+	spin_lock_init(&device->submit.lock);
+	INIT_LIST_HEAD(&device->submit.writes);
+	return 0;
+}
+
+/* Create minor @minor as volume adm_ctx->volume of adm_ctx->resource; returns NO_ERROR or an ERR_* code. */
+enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor)
+{
+	struct drbd_resource *resource = adm_ctx->resource;
+	struct drbd_connection *connection;
+	struct drbd_device *device;
+	struct drbd_peer_device *peer_device, *tmp_peer_device;
+	struct gendisk *disk;
+	struct request_queue *q;
+	int id;
+	int vnr = adm_ctx->volume;
+	enum drbd_ret_code err = ERR_NOMEM;	/* default for every failure that sets nothing else */
+
+	device = minor_to_device(minor);
+	if (device)
+		return ERR_MINOR_EXISTS;
+
+	/* GFP_KERNEL, we are outside of all write-out paths */
+	device = kzalloc(sizeof(struct drbd_device), GFP_KERNEL);
+	if (!device)
+		return ERR_NOMEM;
+	kref_init(&device->kref);
+
+	kref_get(&resource->kref);	/* put again at out_no_q on failure */
+	device->resource = resource;
+	device->minor = minor;
+	device->vnr = vnr;
+
+	drbd_init_set_defaults(device);
+
+	q = blk_alloc_queue(GFP_KERNEL);
+	if (!q)
+		goto out_no_q;
+	device->rq_queue = q;
+	q->queuedata   = device;
+
+	disk = alloc_disk(1);
+	if (!disk)
+		goto out_no_disk;
+	device->vdisk = disk;
+
+	set_disk_ro(disk, true);
+
+	disk->queue = q;
+	disk->major = DRBD_MAJOR;
+	disk->first_minor = minor;
+	disk->fops = &drbd_ops;
+	sprintf(disk->disk_name, "drbd%d", minor);
+	disk->private_data = device;
+
+	device->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));	/* NOTE(review): result is not NULL-checked */
+	/* we have no partitions. we contain only ourselves. */
+	device->this_bdev->bd_contains = device->this_bdev;
+
+	q->backing_dev_info.congested_fn = drbd_congested;
+	q->backing_dev_info.congested_data = device;
+
+	blk_queue_make_request(q, drbd_make_request);
+	blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
+	/* Setting the max_hw_sectors to an odd value of 8kibyte here
+	   This triggers a max_bio_size message upon first attach or connect */
+	blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
+	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
+	blk_queue_merge_bvec(q, drbd_merge_bvec);
+	q->queue_lock = &resource->req_lock;
+
+	device->md_io_page = alloc_page(GFP_KERNEL);
+	if (!device->md_io_page)
+		goto out_no_io_page;
+
+	if (drbd_bm_init(device))
+		goto out_no_bitmap;
+	device->read_requests = RB_ROOT;
+	device->write_requests = RB_ROOT;
+
+	id = idr_alloc(&drbd_devices, device, minor, minor + 1, GFP_KERNEL);
+	if (id < 0) {
+		if (id == -ENOSPC) {
+			err = ERR_MINOR_EXISTS;
+			drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already");
+		}
+		goto out_no_minor_idr;
+	}
+	kref_get(&device->kref);	/* taken for the drbd_devices entry */
+
+	id = idr_alloc(&resource->devices, device, vnr, vnr + 1, GFP_KERNEL);
+	if (id < 0) {
+		if (id == -ENOSPC) {
+			err = ERR_MINOR_EXISTS;	/* code kept as is; the clash is on the volume number */
+			drbd_msg_put_info(adm_ctx->reply_skb, "requested volume exists already");
+		}
+		goto out_idr_remove_minor;
+	}
+	kref_get(&device->kref);	/* taken for the resource->devices entry */
+
+	INIT_LIST_HEAD(&device->peer_devices);
+	for_each_connection(connection, resource) {
+		peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL);
+		if (!peer_device)
+			goto out_idr_remove_from_resource;
+		peer_device->connection = connection;
+		peer_device->device = device;
+
+		list_add(&peer_device->peer_devices, &device->peer_devices);
+		kref_get(&device->kref);	/* one reference per peer_device */
+
+		id = idr_alloc(&connection->peer_devices, peer_device, vnr, vnr + 1, GFP_KERNEL);
+		if (id < 0) {
+			if (id == -ENOSPC) {
+				err = ERR_INVALID_REQUEST;
+				drbd_msg_put_info(adm_ctx->reply_skb, "requested volume exists already");
+			}
+			goto out_idr_remove_from_resource;
+		}
+		kref_get(&connection->kref);	/* put again in the error path below */
+	}
+
+	if (init_submitter(device)) {
+		err = ERR_NOMEM;
+		drbd_msg_put_info(adm_ctx->reply_skb, "unable to create submit workqueue");
+		goto out_idr_remove_from_resource;
+	}
+
+	add_disk(disk);
+
+	/* inherit the connection state */
+	device->state.conn = first_connection(resource)->cstate;
+	if (device->state.conn == C_WF_REPORT_PARAMS) {
+		for_each_peer_device(peer_device, device)
+			drbd_connected(peer_device);
+	}
+
+	return NO_ERROR;
+
+out_idr_remove_from_resource:
+	/* Removes the vnr entry of every connection.  Never use a leftover "connection" cursor here. */
+	for_each_connection(connection, resource) {
+		peer_device = idr_find(&connection->peer_devices, vnr);
+		if (peer_device) {
+			idr_remove(&connection->peer_devices, vnr);
+			kref_put(&connection->kref, drbd_destroy_connection);
+		}
+	}
+	for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
+		list_del(&peer_device->peer_devices);
+		kfree(peer_device);
+	}
+	idr_remove(&resource->devices, vnr);
+out_idr_remove_minor:
+	idr_remove(&drbd_devices, minor);
+	synchronize_rcu();
+out_no_minor_idr:
+	drbd_bm_cleanup(device);
+out_no_bitmap:
+	__free_page(device->md_io_page);
+out_no_io_page:
+	put_disk(disk);
+out_no_disk:
+	blk_cleanup_queue(q);
+out_no_q:
+	kref_put(&resource->kref, drbd_destroy_resource);
+	kfree(device);	/* freed directly; the device references taken above are not put one by one */
+	return err;
+}
+
+void drbd_delete_device(struct drbd_device *device)
+{
+	struct drbd_resource *resource = device->resource;
+	struct drbd_connection *connection;
+	int refs = 3;	/* kref_init() plus the two idr references taken in drbd_create_device() */
+
+	for_each_connection(connection, resource) {
+		idr_remove(&connection->peer_devices, device->vnr);
+		refs++;	/* drbd_create_device() took one device reference per connection */
+	}
+	idr_remove(&resource->devices, device->vnr);
+	idr_remove(&drbd_devices, device_to_minor(device));
+	del_gendisk(device->vdisk);
+	synchronize_rcu();	/* waits for RCU readers before the references are dropped */
+	kref_sub(&device->kref, refs, drbd_destroy_device);
+}
+
+int __init drbd_init(void)	/* returns 0 on success, an error code otherwise */
+{
+	int err;
+
+	if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
+		printk(KERN_ERR
+		       "drbd: invalid minor_count (%d)\n", minor_count);
+#ifdef MODULE
+		return -EINVAL;
+#else
+		minor_count = DRBD_MINOR_COUNT_DEF;	/* without MODULE: continue with the default */
+#endif
+	}
+
+	err = register_blkdev(DRBD_MAJOR, "drbd");
+	if (err) {
+		printk(KERN_ERR
+		       "drbd: unable to register block device major %d\n",
+		       DRBD_MAJOR);
+		return err;	/* nothing else is set up yet, so no drbd_cleanup() */
+	}
+
+	register_reboot_notifier(&drbd_notifier);
+
+	/*
+	 * allocate all necessary structs
+	 */
+	init_waitqueue_head(&drbd_pp_wait);
+
+	drbd_proc = NULL; /* play safe for drbd_cleanup */
+	idr_init(&drbd_devices);
+
+	rwlock_init(&global_state_lock);
+	INIT_LIST_HEAD(&drbd_resources);
+
+	err = drbd_genl_register();
+	if (err) {
+		printk(KERN_ERR "drbd: unable to register generic netlink family\n");
+		goto fail;
+	}
+
+	err = drbd_create_mempools();
+	if (err)
+		goto fail;
+
+	err = -ENOMEM;	/* reported if either of the next two steps fails */
+	drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
+	if (!drbd_proc)	{
+		printk(KERN_ERR "drbd: unable to register proc file\n");
+		goto fail;
+	}
+
+	retry.wq = create_singlethread_workqueue("drbd-reissue");
+	if (!retry.wq) {
+		printk(KERN_ERR "drbd: unable to create retry workqueue\n");
+		goto fail;
+	}
+	INIT_WORK(&retry.worker, do_retry);
+	spin_lock_init(&retry.lock);
+	INIT_LIST_HEAD(&retry.writes);
+
+	printk(KERN_INFO "drbd: initialized. "
+	       "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
+	       API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
+	printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
+	printk(KERN_INFO "drbd: registered as block device major %d\n",
+		DRBD_MAJOR);
+
+	return 0; /* Success! */
+
+fail:
+	drbd_cleanup();	/* presumably copes with a partially completed setup - confirm */
+	if (err == -ENOMEM)
+		printk(KERN_ERR "drbd: ran out of memory\n");
+	else
+		printk(KERN_ERR "drbd: initialization failure\n");
+	return err;
+}
+
+void drbd_free_bc(struct drbd_backing_dev *ldev)
+{
+	const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
+
+	if (!ldev)	/* NULL is accepted and ignored */
+		return;
+	blkdev_put(ldev->backing_bdev, mode);
+	blkdev_put(ldev->md_bdev, mode);
+	kfree(ldev->disk_conf);
+	kfree(ldev);
+}
+
+void drbd_free_sock(struct drbd_connection *connection)
+{
+	if (connection->data.socket) {	/* NOTE(review): tested before data.mutex is taken */
+		mutex_lock(&connection->data.mutex);
+		kernel_sock_shutdown(connection->data.socket, SHUT_RDWR);
+		sock_release(connection->data.socket);
+		connection->data.socket = NULL;	/* cleared while still holding the mutex */
+		mutex_unlock(&connection->data.mutex);
+	}
+	if (connection->meta.socket) {	/* same sequence for the meta socket */
+		mutex_lock(&connection->meta.mutex);
+		kernel_sock_shutdown(connection->meta.socket, SHUT_RDWR);
+		sock_release(connection->meta.socket);
+		connection->meta.socket = NULL;
+		mutex_unlock(&connection->meta.mutex);
+	}
+}
+
+/* meta data management */
+
+void conn_md_sync(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+
+		kref_get(&device->kref);	/* holds the device while we are outside the RCU section */
+		rcu_read_unlock();	/* presumably because drbd_md_sync() may sleep - confirm */
+		drbd_md_sync(device);
+		kref_put(&device->kref, drbd_destroy_device);
+		rcu_read_lock();	/* re-entered before the iteration continues */
+	}
+	rcu_read_unlock();
+}
+
+/* On-disk meta data super block, aligned 4kByte; drbd_md_write() stores all fields big endian */
+struct meta_data_on_disk {
+	u64 la_size_sect;      /* last agreed size. */
+	u64 uuid[UI_SIZE];   /* UUIDs. */
+	u64 device_uuid;
+	u64 reserved_u64_1;
+	u32 flags;             /* MDF */
+	u32 magic;
+	u32 md_size_sect;
+	u32 al_offset;         /* offset to this block */
+	u32 al_nr_extents;     /* important for restoring the AL (userspace) */
+	      /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
+	u32 bm_offset;         /* offset to the bitmap, from here */
+	u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
+	u32 la_peer_max_bio_size;   /* last peer max_bio_size */
+
+	/* see al_tr_number_to_on_disk_sector() */
+	u32 al_stripes;
+	u32 al_stripe_size_4k;
+
+	u8 reserved_u8[4096 - (7*8 + 10*4)];	/* pads the 7 u64 (with UI_SIZE == 4) and 10 u32 above to 4096 bytes */
+} __packed;
+
+
+
+void drbd_md_write(struct drbd_device *device, void *b)
+{
+	struct meta_data_on_disk *buffer = b;	/* @b is filled in as the on-disk super block */
+	sector_t sector;
+	int i;
+
+	memset(buffer, 0, sizeof(*buffer));	/* fields not set below are written as zero */
+
+	buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(device->this_bdev));
+	for (i = UI_CURRENT; i < UI_SIZE; i++)
+		buffer->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]);
+	buffer->flags = cpu_to_be32(device->ldev->md.flags);
+	buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN);	/* drbd_md_read() rejects this magic as unclean */
+
+	buffer->md_size_sect  = cpu_to_be32(device->ldev->md.md_size_sect);
+	buffer->al_offset     = cpu_to_be32(device->ldev->md.al_offset);
+	buffer->al_nr_extents = cpu_to_be32(device->act_log->nr_elements);
+	buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
+	buffer->device_uuid = cpu_to_be64(device->ldev->md.device_uuid);
+
+	buffer->bm_offset = cpu_to_be32(device->ldev->md.bm_offset);
+	buffer->la_peer_max_bio_size = cpu_to_be32(device->peer_max_bio_size);
+
+	buffer->al_stripes = cpu_to_be32(device->ldev->md.al_stripes);
+	buffer->al_stripe_size_4k = cpu_to_be32(device->ldev->md.al_stripe_size_4k);
+
+	D_ASSERT(device, drbd_md_ss(device->ldev) == device->ldev->md.md_offset);
+	sector = device->ldev->md.md_offset;
+
+	if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {	/* non-zero means the write failed */
+		/* this was a try anyways ... */
+		drbd_err(device, "meta data update failed!\n");
+		drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+	}
+}
+
+/**
+ * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
+ * @device:	DRBD device.
+ */
+void drbd_md_sync(struct drbd_device *device)
+{
+	struct meta_data_on_disk *buffer;
+
+	/* Don't accidentally change the DRBD meta data layout. */
+	BUILD_BUG_ON(UI_SIZE != 4);
+	BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
+
+	del_timer(&device->md_sync_timer);
+	/* timer may be rearmed by drbd_md_mark_dirty() now. */
+	if (!test_and_clear_bit(MD_DIRTY, &device->flags))
+		return;	/* was not dirty: nothing to write */
+
+	/* We use here D_FAILED and not D_ATTACHING because we try to write
+	 * metadata even if we detach due to a disk failure! */
+	if (!get_ldev_if_state(device, D_FAILED))
+		return;	/* note: MD_DIRTY has already been cleared above */
+
+	buffer = drbd_md_get_buffer(device);
+	if (!buffer)
+		goto out;	/* nothing is written; MD_DIRTY stays cleared */
+
+	drbd_md_write(device, buffer);
+
+	/* Update device->ldev->md.la_size_sect,
+	 * since we updated it on metadata. */
+	device->ldev->md.la_size_sect = drbd_get_capacity(device->this_bdev);
+
+	drbd_md_put_buffer(device);
+out:
+	put_ldev(device);
+}
+
+static int check_activity_log_stripe_size(struct drbd_device *device,
+		struct meta_data_on_disk *on_disk,
+		struct drbd_md *in_core)
+{
+	u32 stripes = be32_to_cpu(on_disk->al_stripes);
+	u32 stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k);
+	u64 total_4k;
+
+	/* Neither value on disk: this is the old fixed size activity log. */
+	if (!stripes && !stripe_size_4k) {
+		stripes = 1;
+		stripe_size_4k = MD_32kB_SECT/8;
+	}
+
+	/* Everything below is a plausibility check of the striping. */
+
+	/* A single unset value is invalid. */
+	if (!stripes || !stripe_size_4k)
+		goto invalid;
+
+	total_4k = (u64)stripes * stripe_size_4k;
+
+	/* Upper bound: 16 * 1024 * 1024 / 4 blocks of 4k, i.e. 16 GB.  This
+	 * keeps al_tr_number_to_on_disk_sector() clear of overflows, and is
+	 * no real restriction: beyond 72 * 4k blocks, more space only means
+	 * more history. */
+	if (total_4k > (16 * 1024 * 1024/4))
+		goto invalid;
+
+	/* Lower bound: 32kB, i.e. 8 transaction slots, as existing setups
+	 * depend on that. */
+	if (total_4k < MD_32kB_SECT/8)
+		goto invalid;
+
+	in_core->al_stripes = stripes;
+	in_core->al_stripe_size_4k = stripe_size_4k;
+	in_core->al_size_4k = total_4k;
+
+	return 0;
+invalid:
+	drbd_err(device, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n",
+			stripes, stripe_size_4k);
+	return -EINVAL;
+}
+
+static int check_offsets_and_sizes(struct drbd_device *device, struct drbd_backing_dev *bdev)
+{
+	sector_t capacity = drbd_get_capacity(bdev->md_bdev);
+	struct drbd_md *in_core = &bdev->md;
+	s32 on_disk_al_sect;	/* activity log size in sectors, derived from the offsets */
+	s32 on_disk_bm_sect;	/* bitmap space in sectors, derived from the offsets */
+
+	/* The on-disk size of the activity log, calculated from offsets, and
+	 * the size of the activity log calculated from the stripe settings,
+	 * should match.
+	 * Though we could relax this a bit: it is ok, if the striped activity log
+	 * fits in the available on-disk activity log size.
+	 * Right now, that would break how resize is implemented.
+	 * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware
+	 * of possible unused padding space in the on disk layout. */
+	if (in_core->al_offset < 0) {
+		if (in_core->bm_offset > in_core->al_offset)
+			goto err;
+		on_disk_al_sect = -in_core->al_offset;
+		on_disk_bm_sect = in_core->al_offset - in_core->bm_offset;
+	} else {
+		if (in_core->al_offset != MD_4kB_SECT)
+			goto err;
+		if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT)
+			goto err;
+
+		on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT;
+		on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset;
+	}
+
+	/* old fixed size meta data is exactly that: fixed. */
+	if (in_core->meta_dev_idx >= 0) {
+		if (in_core->md_size_sect != MD_128MB_SECT
+		||  in_core->al_offset != MD_4kB_SECT
+		||  in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT
+		||  in_core->al_stripes != 1
+		||  in_core->al_stripe_size_4k != MD_32kB_SECT/8)
+			goto err;
+	}
+
+	if (capacity < in_core->md_size_sect)	/* checked first so the subtraction below cannot wrap */
+		goto err;
+	if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev))
+		goto err;
+
+	/* should be aligned, and at least 32k */
+	if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT))
+		goto err;
+
+	/* should fit (for now: exactly) into the available on-disk space;
+	 * overflow prevention is in check_activity_log_stripe_size() above. */
+	if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT)
+		goto err;
+
+	/* again, should be aligned */
+	if (in_core->bm_offset & 7)
+		goto err;
+
+	/* FIXME check for device grow with flex external meta data? */
+
+	/* can the available bitmap space cover the last agreed device size? */
+	if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512)
+		goto err;
+
+	return 0;
+
+err:
+	drbd_err(device, "meta data offsets don't make sense: idx=%d "
+			"al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, "
+			"md_size_sect=%u, la_size=%llu, md_capacity=%llu\n",
+			in_core->meta_dev_idx,
+			in_core->al_stripes, in_core->al_stripe_size_4k,
+			in_core->al_offset, in_core->bm_offset, in_core->md_size_sect,
+			(unsigned long long)in_core->la_size_sect,
+			(unsigned long long)capacity);
+
+	return -EINVAL;	/* every failed check ends up here with the same message */
+}
+
+
+/**
+ * drbd_md_read() - Reads in the meta data super block
+ * @device:	DRBD device.
+ * @bdev:	Device from which the meta data should be read in.
+ *
+ * Return NO_ERROR on success, and an enum drbd_ret_code in case
+ * something goes wrong.
+ *
+ * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS,
+ * even before @bdev is assigned to @device->ldev.
+ */
+int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
+{
+	struct meta_data_on_disk *buffer;
+	u32 magic, flags;
+	int i, rv = NO_ERROR;
+
+	if (device->state.disk != D_DISKLESS)
+		return ERR_DISK_CONFIGURED;
+
+	buffer = drbd_md_get_buffer(device);
+	if (!buffer)
+		return ERR_NOMEM;
+
+	/* First, figure out where our meta data superblock is located,
+	 * and read it. */
+	bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
+	bdev->md.md_offset = drbd_md_ss(bdev);
+
+	if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) {
+		/* NOTE: can't do normal error processing here as this is
+		   called BEFORE disk is attached */
+		drbd_err(device, "Error while reading metadata.\n");
+		rv = ERR_IO_MD_DISK;
+		goto err;
+	}
+
+	magic = be32_to_cpu(buffer->magic);
+	flags = be32_to_cpu(buffer->flags);
+	if (magic == DRBD_MD_MAGIC_84_UNCLEAN ||
+	    (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) {
+			/* btw: that's Activity Log clean, not "all" clean. */
+		drbd_err(device, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n");
+		rv = ERR_MD_UNCLEAN;
+		goto err;
+	}
+
+	rv = ERR_MD_INVALID;	/* result of every "goto err" from here to "rv = NO_ERROR" */
+	if (magic != DRBD_MD_MAGIC_08) {
+		if (magic == DRBD_MD_MAGIC_07)
+			drbd_err(device, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
+		else
+			drbd_err(device, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
+		goto err;
+	}
+
+	if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
+		drbd_err(device, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
+		    be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
+		goto err;
+	}
+
+
+	/* convert to in_core endian */
+	bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect);
+	for (i = UI_CURRENT; i < UI_SIZE; i++)
+		bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
+	bdev->md.flags = be32_to_cpu(buffer->flags);
+	bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
+
+	bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect);
+	bdev->md.al_offset = be32_to_cpu(buffer->al_offset);
+	bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset);
+
+	if (check_activity_log_stripe_size(device, buffer, &bdev->md))
+		goto err;
+	if (check_offsets_and_sizes(device, bdev))
+		goto err;
+	/* NOTE(review): both fields below were copied from this same buffer above, so these checks look redundant - confirm */
+	if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
+		drbd_err(device, "unexpected bm_offset: %d (expected %d)\n",
+		    be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
+		goto err;
+	}
+	if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
+		drbd_err(device, "unexpected md_size: %u (expected %u)\n",
+		    be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
+		goto err;
+	}
+
+	rv = NO_ERROR;
+
+	spin_lock_irq(&device->resource->req_lock);
+	if (device->state.conn < C_CONNECTED) {
+		unsigned int peer;
+		peer = be32_to_cpu(buffer->la_peer_max_bio_size);
+		peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);	/* never go below DRBD_MAX_BIO_SIZE_SAFE */
+		device->peer_max_bio_size = peer;
+	}
+	spin_unlock_irq(&device->resource->req_lock);
+
+ err:
+	drbd_md_put_buffer(device);	/* also reached on success, despite the label name */
+
+	return rv;
+}
+
+/**
+ * drbd_md_mark_dirty() - Mark meta data super block as dirty
+ * @device:	DRBD device.
+ *
+ * Call this function if you change anything that should be written to
+ * the meta-data super block. If MD_DIRTY was not set yet, it is set and
+ * md_sync_timer is armed to fire in five seconds (one second with DEBUG).
+ */
+#ifdef DEBUG
+void drbd_md_mark_dirty_(struct drbd_device *device, unsigned int line, const char *func)
+{
+	if (!test_and_set_bit(MD_DIRTY, &device->flags)) {
+		mod_timer(&device->md_sync_timer, jiffies + HZ);
+		device->last_md_mark_dirty.line = line;	/* remember who marked it dirty first */
+		device->last_md_mark_dirty.func = func;
+	}
+}
+#else
+void drbd_md_mark_dirty(struct drbd_device *device)
+{
+	if (!test_and_set_bit(MD_DIRTY, &device->flags))	/* only the first marking arms the timer */
+		mod_timer(&device->md_sync_timer, jiffies + 5*HZ);
+}
+#endif
+
+void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local)
+{
+	int i;	/* shifts the history UUIDs one slot towards UI_HISTORY_END */
+
+	for (i = UI_HISTORY_END; i > UI_HISTORY_START; i--)	/* top down: read each slot before it is overwritten */
+		device->ldev->md.uuid[i] = device->ldev->md.uuid[i-1];
+}
+
+/* Store @val in UUID slot @idx and mark the meta data dirty.  For UI_CURRENT,
+ * bit 0 is first forced to 1 if we are R_PRIMARY and to 0 otherwise, and the
+ * result is also passed to drbd_set_ed_uuid(). */
+void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
+{
+	if (idx == UI_CURRENT) {
+		val = device->state.role == R_PRIMARY ? (val | 1) : (val & ~((u64)1));
+
+		drbd_set_ed_uuid(device, val);
+	}
+
+	device->ldev->md.uuid[idx] = val;
+	drbd_md_mark_dirty(device);
+}
+
+void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
+	__drbd_uuid_set(device, idx, val);	/* same as __drbd_uuid_set(), but under md.uuid_lock */
+	spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
+}
+
+void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
+	if (device->ldev->md.uuid[idx]) {	/* a non-zero old value is kept in the history */
+		drbd_uuid_move_history(device);
+		device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[idx];
+	}
+	__drbd_uuid_set(device, idx, val);
+	spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
+}
+
+/**
+ * drbd_uuid_new_current() - Creates a new current UUID
+ * @device:	DRBD device.
+ *
+ * Creates a new current UUID, and rotates the old current UUID into
+ * the bitmap slot. Causes an incremental resync upon next connect.
+ */
+void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local)
+{
+	u64 val;
+	unsigned long long bm_uuid;
+
+	get_random_bytes(&val, sizeof(u64));	/* the new UUID is random; bit 0 is adjusted by __drbd_uuid_set() */
+
+	spin_lock_irq(&device->ldev->md.uuid_lock);
+	bm_uuid = device->ldev->md.uuid[UI_BITMAP];
+
+	if (bm_uuid)
+		drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid);	/* it is overwritten below regardless */
+
+	device->ldev->md.uuid[UI_BITMAP] = device->ldev->md.uuid[UI_CURRENT];
+	__drbd_uuid_set(device, UI_CURRENT, val);
+	spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+	drbd_print_uuids(device, "new current UUID");
+	/* get it to stable storage _now_ */
+	drbd_md_sync(device);
+}
+
+void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local)
+{
+	unsigned long flags;
+	if (device->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)	/* NOTE(review): read without md.uuid_lock */
+		return;	/* already zero: nothing to change, nothing marked dirty */
+
+	spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
+	if (val == 0) {
+		drbd_uuid_move_history(device);	/* the old bitmap UUID moves into the history */
+		device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
+		device->ldev->md.uuid[UI_BITMAP] = 0;
+	} else {
+		unsigned long long bm_uuid = device->ldev->md.uuid[UI_BITMAP];
+		if (bm_uuid)
+			drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid);
+
+		device->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);	/* stored with bit 0 cleared */
+	}
+	spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
+
+	drbd_md_mark_dirty(device);
+}
+
+/**
+ * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
+ * @device:	DRBD device.
+ *
+ * Sets all bitmap bits and writes the bitmap; -EIO if get_ldev_if_state() fails.
+ */
+int drbd_bmio_set_n_write(struct drbd_device *device)
+{
+	int rv;
+
+	if (!get_ldev_if_state(device, D_ATTACHING))
+		return -EIO;
+
+	/* MDF_FULL_SYNC is only cleared again if the bitmap write succeeds */
+	drbd_md_set_flag(device, MDF_FULL_SYNC);
+	drbd_md_sync(device);
+	drbd_bm_set_all(device);
+
+	rv = drbd_bm_write(device);
+	if (rv == 0) {
+		drbd_md_clear_flag(device, MDF_FULL_SYNC);
+		drbd_md_sync(device);
+	}
+
+	put_ldev(device);
+	return rv;
+}
+
+/**
+ * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
+ * @device:	DRBD device.
+ *
+ * Clears all bitmap bits and writes the bitmap; -EIO if get_ldev_if_state() fails.
+ */
+int drbd_bmio_clear_n_write(struct drbd_device *device)
+{
+	int rv;
+
+	drbd_resume_al(device);	/* done even if the local disk turns out to be unavailable */
+	if (!get_ldev_if_state(device, D_ATTACHING))
+		return -EIO;
+
+	drbd_bm_clear_all(device);
+	rv = drbd_bm_write(device);
+	put_ldev(device);
+	return rv;
+}
+
+static int w_bitmap_io(struct drbd_work *w, int unused)
+{
+	struct drbd_device *device =
+		container_of(w, struct drbd_device, bm_io_work.w);
+	struct bm_io_work *work = &device->bm_io_work;
+	int rv = -EIO;	/* handed to work->done() if there is no local disk */
+
+	D_ASSERT(device, atomic_read(&device->ap_bio_cnt) == 0);
+
+	if (get_ldev(device)) {
+		drbd_bm_lock(device, work->why, work->flags);
+		rv = work->io_fn(device);	/* the bitmap io function that was queued */
+		drbd_bm_unlock(device);
+		put_ldev(device);
+	}
+
+	clear_bit_unlock(BITMAP_IO, &device->flags);
+	wake_up(&device->misc_wait);	/* wakes waiters on misc_wait after BITMAP_IO was cleared */
+
+	if (work->done)
+		work->done(device, rv);
+
+	clear_bit(BITMAP_IO_QUEUED, &device->flags);	/* cleared only after done() has run */
+	work->why = NULL;
+	work->flags = 0;
+
+	return 0;
+}
+
+void drbd_ldev_destroy(struct drbd_device *device)
+{
+	lc_destroy(device->resync);
+	device->resync = NULL;
+	lc_destroy(device->act_log);
+	device->act_log = NULL;
+	__no_warn(local,
+		drbd_free_bc(device->ldev);
+		device->ldev = NULL;);
+	/* resync, act_log and ldev are released and NULLed; clear GO_DISKLESS. */
+	clear_bit(GO_DISKLESS, &device->flags);
+}
+
+static int w_go_diskless(struct drbd_work *w, int unused)
+{
+	struct drbd_device *device =
+		container_of(w, struct drbd_device, go_diskless);
+
+	D_ASSERT(device, device->state.disk == D_FAILED);
+	/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
+	 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
+	 * the protected members anymore, though, so once put_ldev reaches zero
+	 * again, it will be safe to free them. */
+
+	/* Try to write changed bitmap pages, read errors may have just
+	 * set some bits outside the area covered by the activity log.
+	 *
+	 * If we have an IO error during the bitmap writeout,
+	 * we will want a full sync next time, just in case.
+	 * (Do we want a specific meta data flag for this?)
+	 *
+	 * If that does not make it to stable storage either,
+	 * we cannot do anything about that anymore.
+	 *
+	 * We still need to check if both bitmap and ldev are present, we may
+	 * end up here after a failed attach, before ldev was even assigned.
+	 */
+	if (device->bitmap && device->ldev) {
+		/* An interrupted resync or similar is allowed to recount bits
+		 * while we detach.
+		 * Any modifications would not be expected anymore, though.
+		 */
+		if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
+					"detach", BM_LOCKED_TEST_ALLOWED)) {
+			if (test_bit(WAS_READ_ERROR, &device->flags)) {
+				drbd_md_set_flag(device, MDF_FULL_SYNC);
+				drbd_md_sync(device);
+			}
+		}
+	}
+	/* Whatever happened to the bitmap write-out, force the disk to D_DISKLESS. */
+	drbd_force_state(device, NS(disk, D_DISKLESS));
+	return 0;
+}
+
+/**
+ * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
+ * @device:	DRBD device.
+ * @io_fn:	IO callback to be called when bitmap IO is possible
+ * @done:	callback to be called after the bitmap IO was performed
+ * @why:	Descriptive text of the reason for doing the IO
+ * @flags:	Bitmap lock flags, stored in bm_io_work for the worker
+ *
+ * While IO on the bitmap happens we freeze application IO, thus we ensure
+ * that drbd_set_out_of_sync() can not be called. MAY ONLY be called from
+ * worker context; MUST NOT be used while a previous such work is pending!
+ */
+void drbd_queue_bitmap_io(struct drbd_device *device,
+			  int (*io_fn)(struct drbd_device *),
+			  void (*done)(struct drbd_device *, int),
+			  char *why, enum bm_flag flags)
+{
+	D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
+
+	D_ASSERT(device, !test_bit(BITMAP_IO_QUEUED, &device->flags));
+	D_ASSERT(device, !test_bit(BITMAP_IO, &device->flags));
+	D_ASSERT(device, list_empty(&device->bm_io_work.w.list));
+	if (device->bm_io_work.why)
+		drbd_err(device, "FIXME going to queue '%s' but '%s' still pending?\n",
+			why, device->bm_io_work.why);
+
+	device->bm_io_work.io_fn = io_fn;
+	device->bm_io_work.done = done;
+	device->bm_io_work.why = why;
+	device->bm_io_work.flags = flags;
+	/* Queue right away only if no application IO is counted in ap_bio_cnt. */
+	spin_lock_irq(&device->resource->req_lock);
+	set_bit(BITMAP_IO, &device->flags);
+	if (atomic_read(&device->ap_bio_cnt) == 0) {
+		if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
+			drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+					&device->bm_io_work.w);
+	}
+	spin_unlock_irq(&device->resource->req_lock);
+}
+
+/**
+ * drbd_bitmap_io() -  Does an IO operation on the whole bitmap
+ * @device:	DRBD device.
+ * @io_fn:	IO callback to be called when bitmap IO is possible
+ * @why:	Descriptive text of the reason for doing the IO
+ * @flags:	Bitmap lock flags, handed to drbd_bm_lock()
+ *
+ * Application IO is suspended while @io_fn runs, unless @flags contains
+ * BM_LOCKED_SET_ALLOWED.  This function MAY NOT be called from worker context.
+ */
+int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *),
+		char *why, enum bm_flag flags)
+{
+	const bool freeze_io = (flags & BM_LOCKED_SET_ALLOWED) == 0;
+	int rv;
+
+	D_ASSERT(device, current != first_peer_device(device)->connection->worker.task);
+	if (freeze_io)
+		drbd_suspend_io(device);
+
+	drbd_bm_lock(device, why, flags);
+	rv = io_fn(device);
+	drbd_bm_unlock(device);
+
+	if (freeze_io)
+		drbd_resume_io(device);
+	return rv;
+}
+
+void drbd_md_set_flag(struct drbd_device *device, int flag) __must_hold(local)
+{
+	if ((device->ldev->md.flags & flag) == flag)
+		return;	/* all requested bits already set: nothing gets dirty */
+	drbd_md_mark_dirty(device);
+	device->ldev->md.flags |= flag;	/* OR @flag into the md flags */
+}
+
+void drbd_md_clear_flag(struct drbd_device *device, int flag) __must_hold(local)
+{
+	if ((device->ldev->md.flags & flag) == 0)
+		return;	/* none of the requested bits is set: nothing gets dirty */
+	drbd_md_mark_dirty(device);
+	device->ldev->md.flags &= ~flag;	/* drop every bit of @flag */
+}
+int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
+{
+	return !!(bdev->md.flags & flag);	/* 1 if any bit of @flag is set, else 0 */
+}
+
+static void md_sync_timer_fn(unsigned long data)
+{
+	struct drbd_device *device = (struct drbd_device *)data;
+
+	if (!list_empty(&device->md_sync_work.list))
+		return;	/* still on a list: must not double-queue! */
+	drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
+			      &device->md_sync_work);
+}
+
+static int w_md_sync(struct drbd_work *w, int unused)
+{
+	struct drbd_device *device =
+		container_of(w, struct drbd_device, md_sync_work);
+	/* @w is the md_sync_work embedded in the device: warn, then sync meta data. */
+	drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
+#ifdef DEBUG
+	drbd_warn(device, "last md_mark_dirty: %s:%u\n",
+		device->last_md_mark_dirty.func, device->last_md_mark_dirty.line);
+#endif
+	drbd_md_sync(device);
+	return 0;	/* always reports success */
+}
+
+/* Map a packet command to its printable name; "Unknown" if there is none. */
+const char *cmdname(enum drbd_packet cmd)
+{
+	/* THINK may need to become several global tables
+	 * when we want to support more than
+	 * one PRO_VERSION */
+	static const char *cmdnames[] = {
+		[P_DATA]	        = "Data",
+		[P_DATA_REPLY]	        = "DataReply",
+		[P_RS_DATA_REPLY]	= "RSDataReply",
+		[P_BARRIER]	        = "Barrier",
+		[P_BITMAP]	        = "ReportBitMap",
+		[P_BECOME_SYNC_TARGET]  = "BecomeSyncTarget",
+		[P_BECOME_SYNC_SOURCE]  = "BecomeSyncSource",
+		[P_UNPLUG_REMOTE]	= "UnplugRemote",
+		[P_DATA_REQUEST]	= "DataRequest",
+		[P_RS_DATA_REQUEST]     = "RSDataRequest",
+		[P_SYNC_PARAM]	        = "SyncParam",
+		[P_SYNC_PARAM89]	= "SyncParam89",
+		[P_PROTOCOL]            = "ReportProtocol",
+		[P_UUIDS]	        = "ReportUUIDs",
+		[P_SIZES]	        = "ReportSizes",
+		[P_STATE]	        = "ReportState",
+		[P_SYNC_UUID]           = "ReportSyncUUID",
+		[P_AUTH_CHALLENGE]      = "AuthChallenge",
+		[P_AUTH_RESPONSE]	= "AuthResponse",
+		[P_PING]		= "Ping",
+		[P_PING_ACK]	        = "PingAck",
+		[P_RECV_ACK]	        = "RecvAck",
+		[P_WRITE_ACK]	        = "WriteAck",
+		[P_RS_WRITE_ACK]	= "RSWriteAck",
+		[P_SUPERSEDED]          = "Superseded",
+		[P_NEG_ACK]	        = "NegAck",
+		[P_NEG_DREPLY]	        = "NegDReply",
+		[P_NEG_RS_DREPLY]	= "NegRSDReply",
+		[P_BARRIER_ACK]	        = "BarrierAck",
+		[P_STATE_CHG_REQ]       = "StateChgRequest",
+		[P_STATE_CHG_REPLY]     = "StateChgReply",
+		[P_OV_REQUEST]          = "OVRequest",
+		[P_OV_REPLY]            = "OVReply",
+		[P_OV_RESULT]           = "OVResult",
+		[P_CSUM_RS_REQUEST]     = "CsumRSRequest",
+		[P_RS_IS_IN_SYNC]	= "CsumRSIsInSync",
+		[P_COMPRESSED_BITMAP]   = "CBitmap",
+		[P_DELAY_PROBE]         = "DelayProbe",
+		[P_OUT_OF_SYNC]		= "OutOfSync",
+		[P_RS_CANCEL]		= "RSCancel",
+		[P_CONN_ST_CHG_REQ]	= "conn_st_chg_req",
+		[P_CONN_ST_CHG_REPLY]	= "conn_st_chg_reply",
+		[P_RETRY_WRITE]		= "retry_write",
+		[P_PROTOCOL_UPDATE]	= "protocol_update",
+
+		/* enum drbd_packet, but not commands - obsoleted flags:
+		 *	P_MAY_IGNORE
+		 *	P_MAX_OPT_CMD
+		 */
+	};
+
+	/* too big for the array: 0xfffX */
+	if (cmd == P_INITIAL_META)
+		return "InitialMeta";
+	if (cmd == P_INITIAL_DATA)
+		return "InitialData";
+	if (cmd == P_CONNECTION_FEATURES)
+		return "ConnectionFeatures";
+	if (cmd >= ARRAY_SIZE(cmdnames) || !cmdnames[cmd])	/* out of range, or a hole */
+		return "Unknown";
+	return cmdnames[cmd];
+}
+
+/**
+ * drbd_wait_misc  -  wait for a request to make progress
+ * @device:	device associated with the request
+ * @i:		the struct drbd_interval embedded in struct drbd_request or
+ *		struct drbd_peer_request
+ */
+int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i)
+{
+	struct net_conf *nc;
+	DEFINE_WAIT(wait);
+	long timeout;
+
+	rcu_read_lock();
+	nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+	if (!nc) {
+		rcu_read_unlock();
+		return -ETIMEDOUT;	/* no net_conf to take a timeout from */
+	}
+	timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT;
+	rcu_read_unlock();
+	/* req_lock is dropped for the sleep below and retaken before returning. */
+	/* Indicate to wake up device->misc_wait on progress.  */
+	i->waiting = true;
+	prepare_to_wait(&device->misc_wait, &wait, TASK_INTERRUPTIBLE);
+	spin_unlock_irq(&device->resource->req_lock);
+	timeout = schedule_timeout(timeout);
+	finish_wait(&device->misc_wait, &wait);
+	spin_lock_irq(&device->resource->req_lock);
+	if (!timeout || device->state.conn < C_CONNECTED)
+		return -ETIMEDOUT;	/* timed out, or below C_CONNECTED */
+	if (signal_pending(current))
+		return -ERESTARTSYS;
+	return 0;	/* time left, still connected, no signal pending */
+}
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+/* Fault insertion support including random number generator shamelessly
+ * stolen from kernel/rcutorture.c */
+struct fault_random_state {
+	unsigned long state;	/* current value of the linear congruential generator */
+	unsigned long count;	/* calls left before get_random_bytes() is mixed in */
+};
+
+#define FAULT_RANDOM_MULT 39916801  /* prime */
+#define FAULT_RANDOM_ADD	479001701 /* prime */
+#define FAULT_RANDOM_REFRESH 10000
+
+/* Crude but fast random-number generator: a linear congruential generator
+ * whose state is perturbed by get_random_bytes() whenever count hits zero,
+ * after which count restarts at FAULT_RANDOM_REFRESH. */
+static unsigned long _drbd_fault_random(struct fault_random_state *rsp)
+{
+	long reseed;
+
+	if (rsp->count == 0) {
+		get_random_bytes(&reseed, sizeof(reseed));
+		rsp->state += reseed;
+		rsp->count = FAULT_RANDOM_REFRESH;
+	} else {
+		rsp->count--;
+	}
+	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
+	return swahw32(rsp->state);
+}
+
+/* Printable name of a DRBD_FAULT_* type; "**Unknown**" from DRBD_FAULT_MAX on. */
+static char *_drbd_fault_str(unsigned int type)
+{
+	static char *fault_names[] = {
+		[DRBD_FAULT_MD_WR] = "Meta-data write",
+		[DRBD_FAULT_MD_RD] = "Meta-data read",
+		[DRBD_FAULT_RS_WR] = "Resync write",
+		[DRBD_FAULT_RS_RD] = "Resync read",
+		[DRBD_FAULT_DT_WR] = "Data write",
+		[DRBD_FAULT_DT_RD] = "Data read",
+		[DRBD_FAULT_DT_RA] = "Data read ahead",
+		[DRBD_FAULT_BM_ALLOC] = "BM allocation",
+		[DRBD_FAULT_AL_EE] = "EE allocation",
+		[DRBD_FAULT_RECEIVE] = "receive data corruption",
+	};
+	return (type >= DRBD_FAULT_MAX) ? "**Unknown**" : fault_names[type];
+}
+
+/* Decide whether to simulate a failure of @type on @device (nonzero: yes). */
+unsigned int _drbd_insert_fault(struct drbd_device *device, unsigned int type)
+{
+	static struct fault_random_state rrs = {0, 0};
+	unsigned int minor = device_to_minor(device);
+	unsigned int ret;
+
+	/* fault_devs is a bitmask indexed by minor.  Minors beyond its width
+	 * can never be selected; shifting that far would be undefined. */
+	ret = (fault_devs == 0 ||
+	       (minor < 8 * sizeof(fault_devs) && ((1ULL << minor) & fault_devs) != 0)) &&
+	      ((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate;
+	if (ret) {
+		fault_count++;
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_warn(device, "***Simulating %s failure\n",
+				_drbd_fault_str(type));
+	}
+	return ret;
+}
+#endif
+
+const char *drbd_buildtag(void)
+{
+	/* DRBD built from external sources has here a reference to the
+	   git hash of the source code. */
+	/* Filled in on first use: a leading NUL marks the tag as not yet built. */
+	static char buildtag[38] = "\0uilt-in";
+
+	if (buildtag[0] == 0) {
+#ifdef MODULE
+		snprintf(buildtag, sizeof(buildtag), "srcversion: %-24s",
+			 THIS_MODULE->srcversion);	/* bounded by the buffer size */
+#else
+		buildtag[0] = 'b';
+#endif
+	}
+	return buildtag;
+}
+
+module_init(drbd_init)
+module_exit(drbd_cleanup)
+/* Symbols exported to other kernel modules. */
+EXPORT_SYMBOL(drbd_conn_str);
+EXPORT_SYMBOL(drbd_role_str);
+EXPORT_SYMBOL(drbd_disk_str);
+EXPORT_SYMBOL(drbd_set_st_err_str);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
new file mode 100644
index 00000000000..3f2e1673808
--- /dev/null
+++ b/drivers/block/drbd/drbd_nl.c
@@ -0,0 +1,3660 @@
+/*
+   drbd_nl.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <linux/in.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/blkpg.h>
+#include <linux/cpumask.h>
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+#include <asm/unaligned.h>
+#include <linux/drbd_limits.h>
+#include <linux/kthread.h>
+
+#include <net/genetlink.h>
+
+/* .doit */
+// int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info);
+// int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_down(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
+/* .dumpit */
+int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
+
+#include <linux/drbd_genl_api.h>
+#include "drbd_nla.h"
+#include <linux/genl_magic_func.h>
+
+/* Used with blkdev_get_by_path() to claim our meta data device(s). */
+static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
+
+static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
+{
+	genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
+	if (genlmsg_reply(skb, info))	/* a failed send is only logged, not returned */
+		printk(KERN_ERR "drbd: error sending genl reply\n");
+}
+
+/* Append @info to @skb as T_info_text nested in DRBD_NLA_CFG_REPLY; NULL or
+ * empty text is skipped.  On a fresh drbd_adm_prepare()d reply_skb this cannot
+ * fail: the only possible reason is lack of space, and 4k are available. */
+int drbd_msg_put_info(struct sk_buff *skb, const char *info)
+{
+	struct nlattr *nest;
+	int rc;
+
+	if (!info || !info[0])
+		return 0;
+	nest = nla_nest_start(skb, DRBD_NLA_CFG_REPLY);
+	if (!nest)
+		return -EMSGSIZE;
+
+	rc = nla_put_string(skb, T_info_text, info);
+	if (rc) {
+		nla_nest_cancel(skb, nest);
+		return rc;
+	}
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
+/* This would be a good candidate for a "pre_doit" hook,
+ * and per-family private info->pointers.
+ * But we need to stay compatible with older kernels.
+ * If it returns successfully, adm_ctx members are valid.
+ * Failures: ERR_* code with reply_skb kept, or errno with reply_skb NULL.
+ * At this point, we still rely on the global genl_lock().
+ * If we want to avoid that, and allow "genl_family.parallel_ops", we may need
+ * to add additional synchronization against object destruction/modification.
+ */
+#define DRBD_ADM_NEED_MINOR	1
+#define DRBD_ADM_NEED_RESOURCE	2
+#define DRBD_ADM_NEED_CONNECTION 4
+static int drbd_adm_prepare(struct drbd_config_context *adm_ctx,
+	struct sk_buff *skb, struct genl_info *info, unsigned flags)
+{
+	struct drbd_genlmsghdr *d_in = info->userhdr;
+	const u8 cmd = info->genlhdr->cmd;
+	int err;
+
+	memset(adm_ctx, 0, sizeof(*adm_ctx));
+
+	/* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */
+	if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN))
+	       return -EPERM;
+
+	adm_ctx->reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!adm_ctx->reply_skb) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	adm_ctx->reply_dh = genlmsg_put_reply(adm_ctx->reply_skb,
+					info, &drbd_genl_family, 0, cmd);
+	/* put of a few bytes into a fresh skb of >= 4k will always succeed.
+	 * but anyways */
+	if (!adm_ctx->reply_dh) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	adm_ctx->reply_dh->minor = d_in->minor;
+	adm_ctx->reply_dh->ret_code = NO_ERROR;
+
+	adm_ctx->volume = VOLUME_UNSPECIFIED;
+	if (info->attrs[DRBD_NLA_CFG_CONTEXT]) {
+		struct nlattr *nla;
+		/* parse and validate only */
+		err = drbd_cfg_context_from_attrs(NULL, info);
+		if (err)
+			goto fail;
+
+		/* It was present, and valid,
+		 * copy it over to the reply skb. */
+		err = nla_put_nohdr(adm_ctx->reply_skb,
+				info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
+				info->attrs[DRBD_NLA_CFG_CONTEXT]);
+		if (err)
+			goto fail;
+
+		/* and assign stuff to the adm_ctx */
+		nla = nested_attr_tb[__nla_type(T_ctx_volume)];
+		if (nla)
+			adm_ctx->volume = nla_get_u32(nla);
+		nla = nested_attr_tb[__nla_type(T_ctx_resource_name)];
+		if (nla)
+			adm_ctx->resource_name = nla_data(nla);
+		adm_ctx->my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
+		adm_ctx->peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
+		if ((adm_ctx->my_addr &&
+		     nla_len(adm_ctx->my_addr) > sizeof(adm_ctx->connection->my_addr)) ||
+		    (adm_ctx->peer_addr &&
+		     nla_len(adm_ctx->peer_addr) > sizeof(adm_ctx->connection->peer_addr))) {
+			err = -EINVAL;
+			goto fail;
+		}
+	}
+	/* The minor always comes from the request's drbd_genlmsghdr. */
+	adm_ctx->minor = d_in->minor;
+	adm_ctx->device = minor_to_device(d_in->minor);
+
+	/* We are protected by the global genl_lock().
+	 * But we may explicitly drop it/retake it in drbd_adm_set_role(),
+	 * so make sure this object stays around. */
+	if (adm_ctx->device)
+		kref_get(&adm_ctx->device->kref);
+
+	if (adm_ctx->resource_name) {
+		adm_ctx->resource = drbd_find_resource(adm_ctx->resource_name);
+	}
+
+	if (!adm_ctx->device && (flags & DRBD_ADM_NEED_MINOR)) {
+		drbd_msg_put_info(adm_ctx->reply_skb, "unknown minor");
+		return ERR_MINOR_INVALID;
+	}
+	if (!adm_ctx->resource && (flags & DRBD_ADM_NEED_RESOURCE)) {
+		drbd_msg_put_info(adm_ctx->reply_skb, "unknown resource");
+		if (adm_ctx->resource_name)
+			return ERR_RES_NOT_KNOWN;
+		return ERR_INVALID_REQUEST;
+	}
+
+	if (flags & DRBD_ADM_NEED_CONNECTION) {
+		if (adm_ctx->resource) {
+			drbd_msg_put_info(adm_ctx->reply_skb, "no resource name expected");
+			return ERR_INVALID_REQUEST;
+		}
+		if (adm_ctx->device) {
+			drbd_msg_put_info(adm_ctx->reply_skb, "no minor number expected");
+			return ERR_INVALID_REQUEST;
+		}
+		if (adm_ctx->my_addr && adm_ctx->peer_addr)
+			adm_ctx->connection = conn_get_by_addrs(nla_data(adm_ctx->my_addr),
+							  nla_len(adm_ctx->my_addr),
+							  nla_data(adm_ctx->peer_addr),
+							  nla_len(adm_ctx->peer_addr));
+		if (!adm_ctx->connection) {
+			drbd_msg_put_info(adm_ctx->reply_skb, "unknown connection");
+			return ERR_INVALID_REQUEST;
+		}
+	}
+
+	/* some more paranoia, if the request was over-determined */
+	if (adm_ctx->device && adm_ctx->resource &&
+	    adm_ctx->device->resource != adm_ctx->resource) {
+		pr_warning("request: minor=%u, resource=%s; but that minor belongs to resource %s\n",
+				adm_ctx->minor, adm_ctx->resource->name,
+				adm_ctx->device->resource->name);
+		drbd_msg_put_info(adm_ctx->reply_skb, "minor exists in different resource");
+		return ERR_INVALID_REQUEST;
+	}
+	if (adm_ctx->device &&
+	    adm_ctx->volume != VOLUME_UNSPECIFIED &&
+	    adm_ctx->volume != adm_ctx->device->vnr) {
+		pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
+				adm_ctx->minor, adm_ctx->volume,
+				adm_ctx->device->vnr,
+				adm_ctx->device->resource->name);
+		drbd_msg_put_info(adm_ctx->reply_skb, "minor exists as different volume");
+		return ERR_INVALID_REQUEST;
+	}
+
+	/* still, provide adm_ctx->resource always, if possible. */
+	if (!adm_ctx->resource) {
+		adm_ctx->resource = adm_ctx->device ? adm_ctx->device->resource
+			: adm_ctx->connection ? adm_ctx->connection->resource : NULL;
+		if (adm_ctx->resource)
+			kref_get(&adm_ctx->resource->kref);
+	}
+
+	return NO_ERROR;
+	/* Errors before any object lookup: free the reply skb and hand back err. */
+fail:
+	nlmsg_free(adm_ctx->reply_skb);
+	adm_ctx->reply_skb = NULL;
+	return err;
+}
+
+static int drbd_adm_finish(struct drbd_config_context *adm_ctx,
+	struct genl_info *info, int retcode)
+{
+	struct sk_buff *reply = adm_ctx->reply_skb;
+	/* Release every object adm_ctx still references, then send the reply. */
+	if (adm_ctx->device) {
+		kref_put(&adm_ctx->device->kref, drbd_destroy_device);
+		adm_ctx->device = NULL;
+	}
+	if (adm_ctx->connection) {
+		kref_put(&adm_ctx->connection->kref, drbd_destroy_connection);
+		adm_ctx->connection = NULL;
+	}
+	if (adm_ctx->resource) {
+		kref_put(&adm_ctx->resource->kref, drbd_destroy_resource);
+		adm_ctx->resource = NULL;
+	}
+	if (!reply)
+		return -ENOMEM;	/* there is no reply skb to send */
+	adm_ctx->reply_dh->ret_code = retcode;
+	drbd_adm_send_reply(reply, info);
+	return 0;
+}
+
+/* Fill envp[3] (DRBD_PEER_AF=...) and envp[4] (DRBD_PEER_ADDRESS=...) from the
+ * connection's peer address.  The 20 and 60 byte limits have to match the
+ * buffers the callers put into those slots; nothing is written while either
+ * address length is still zero. */
+static void setup_khelper_env(struct drbd_connection *connection, char **envp)
+{
+	struct sockaddr *peer = (struct sockaddr *)&connection->peer_addr;
+	char *afs;
+
+	/* FIXME: A future version will not allow this case. */
+	if (connection->my_addr_len == 0 || connection->peer_addr_len == 0)
+		return;
+
+	if (peer->sa_family == AF_INET6) {
+		afs = "ipv6";
+		snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6",
+			 &((struct sockaddr_in6 *)peer)->sin6_addr);
+	} else {
+		/* Any family other than AF_INET is labelled "ssocks", yet its
+		 * address is still formatted like an IPv4 one. */
+		afs = (peer->sa_family == AF_INET) ? "ipv4" : "ssocks";
+		snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+			 &((struct sockaddr_in *)peer)->sin_addr);
+	}
+	snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs);
+}
+
+int drbd_khelper(struct drbd_device *device, char *cmd)
+{
+	char *envp[] = { "HOME=/",
+			"TERM=linux",
+			"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+			 (char[20]) { }, /* address family */
+			 (char[60]) { }, /* address */
+			NULL };
+	char mb[12];
+	char *argv[] = {usermode_helper, cmd, mb, NULL };
+	struct drbd_connection *connection = first_peer_device(device)->connection;
+	struct sib_info sib;
+	int ret;
+	/* Runs "usermode_helper cmd minor-<nr>", with events sent before and after. */
+	if (current == connection->worker.task)
+		set_bit(CALLBACK_PENDING, &connection->flags);
+
+	snprintf(mb, 12, "minor-%d", device_to_minor(device));
+	setup_khelper_env(connection, envp);
+
+	/* The helper may take some time.
+	 * write out any unsynced meta data changes now */
+	drbd_md_sync(device);
+
+	drbd_info(device, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
+	sib.sib_reason = SIB_HELPER_PRE;
+	sib.helper_name = cmd;
+	drbd_bcast_event(device, &sib);
+	ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
+	if (ret)
+		drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n",
+				usermode_helper, cmd, mb,
+				(ret >> 8) & 0xff, ret);
+	else
+		drbd_info(device, "helper command: %s %s %s exit code %u (0x%x)\n",
+				usermode_helper, cmd, mb,
+				(ret >> 8) & 0xff, ret);
+	sib.sib_reason = SIB_HELPER_POST;
+	sib.helper_exit_code = ret;
+	drbd_bcast_event(device, &sib);
+	/* CALLBACK_PENDING was only set if we are the connection's worker task. */
+	if (current == connection->worker.task)
+		clear_bit(CALLBACK_PENDING, &connection->flags);
+	/* Callers get 0 or the helper's status, never a negative errno. */
+	if (ret < 0) /* Ignore any ERRNOs we got. */
+		ret = 0;
+
+	return ret;
+}
+
+static int conn_khelper(struct drbd_connection *connection, char *cmd)
+{
+	char *envp[] = { "HOME=/",
+			"TERM=linux",
+			"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+			 (char[20]) { }, /* address family */
+			 (char[60]) { }, /* address */
+			NULL };
+	char *resource_name = connection->resource->name;
+	char *argv[] = {usermode_helper, cmd, resource_name, NULL };
+	int ret;
+	/* Runs "usermode_helper cmd <resource name>" for the whole connection. */
+	setup_khelper_env(connection, envp);
+	conn_md_sync(connection);
+
+	drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name);
+	/* TODO: conn_bcast_event() ?? */
+
+	ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
+	if (ret)
+		drbd_warn(connection, "helper command: %s %s %s exit code %u (0x%x)\n",
+			  usermode_helper, cmd, resource_name,
+			  (ret >> 8) & 0xff, ret);
+	else
+		drbd_info(connection, "helper command: %s %s %s exit code %u (0x%x)\n",
+			  usermode_helper, cmd, resource_name,
+			  (ret >> 8) & 0xff, ret);
+	/* TODO: conn_bcast_event() ?? */
+	/* Callers get 0 or the helper's status, never a negative errno. */
+	if (ret < 0) /* Ignore any ERRNOs we got. */
+		ret = 0;
+
+	return ret;
+}
+
+/* Return the highest fencing policy among this connection's volumes that pass
+ * get_ldev_if_state(D_CONSISTENT), or FP_NOT_AVAIL if no volume does. */
+static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connection)
+{
+	enum drbd_fencing_p fp = FP_NOT_AVAIL;
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		if (get_ldev_if_state(device, D_CONSISTENT)) {
+			struct disk_conf *dc = rcu_dereference(device->ldev->disk_conf);
+			fp = max_t(enum drbd_fencing_p, fp, dc->fencing);
+			put_ldev(device);
+		}
+	}
+	rcu_read_unlock();
+	if (fp == FP_NOT_AVAIL) {
+		/* IO suspending works on the whole resource: one device is enough.
+		 * A connection without any peer device has nothing to un-suspend. */
+		vnr = 0;
+		peer_device = idr_get_next(&connection->peer_devices, &vnr);
+		if (peer_device)
+			drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0));
+	}
+	return fp;
+}
+
+bool conn_try_outdate_peer(struct drbd_connection *connection)
+{
+	unsigned int connect_cnt;
+	union drbd_state mask = { };
+	union drbd_state val = { };
+	enum drbd_fencing_p fp;
+	char *ex_to_string;
+	int r;
+	/* Run the fence-peer helper and turn its exit code into a state change. */
+	spin_lock_irq(&connection->resource->req_lock);
+	if (connection->cstate >= C_WF_REPORT_PARAMS) {
+		drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n");
+		spin_unlock_irq(&connection->resource->req_lock);
+		return false;
+	}
+	/* Remember connect_cnt: it is compared again after the helper has run. */
+	connect_cnt = connection->connect_cnt;
+	spin_unlock_irq(&connection->resource->req_lock);
+
+	fp = highest_fencing_policy(connection);
+	switch (fp) {
+	case FP_NOT_AVAIL:
+		drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n");
+		goto out;
+	case FP_DONT_CARE:
+		return true;
+	default: ;
+	}
+	/* Bits 8..15 of the helper's return value carry its exit code. */
+	r = conn_khelper(connection, "fence-peer");
+
+	switch ((r>>8) & 0xff) {
+	case 3: /* peer is inconsistent */
+		ex_to_string = "peer is inconsistent or worse";
+		mask.pdsk = D_MASK;
+		val.pdsk = D_INCONSISTENT;
+		break;
+	case 4: /* peer got outdated, or was already outdated */
+		ex_to_string = "peer was fenced";
+		mask.pdsk = D_MASK;
+		val.pdsk = D_OUTDATED;
+		break;
+	case 5: /* peer was down */
+		if (conn_highest_disk(connection) == D_UP_TO_DATE) {
+			/* we will(have) create(d) a new UUID anyways... */
+			ex_to_string = "peer is unreachable, assumed to be dead";
+			mask.pdsk = D_MASK;
+			val.pdsk = D_OUTDATED;
+		} else {
+			ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
+		}
+		break;
+	case 6: /* Peer is primary, voluntarily outdate myself.
+		 * This is useful when an unconnected R_SECONDARY is asked to
+		 * become R_PRIMARY, but finds the other peer being active. */
+		ex_to_string = "peer is active";
+		drbd_warn(connection, "Peer is primary, outdating myself.\n");
+		mask.disk = D_MASK;
+		val.disk = D_OUTDATED;
+		break;
+	case 7:
+		if (fp != FP_STONITH)
+			drbd_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n");
+		ex_to_string = "peer was stonithed";
+		mask.pdsk = D_MASK;
+		val.pdsk = D_OUTDATED;
+		break;
+	default:
+		/* The script is broken ... */
+		drbd_err(connection, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
+		return false; /* Eventually leave IO frozen */
+	}
+
+	drbd_info(connection, "fence-peer helper returned %d (%s)\n",
+		  (r>>8) & 0xff, ex_to_string);
+
+ out:
+
+	/* Not using
+	   conn_request_state(connection, mask, val, CS_VERBOSE);
+	   here, because we might have been able to re-establish the connection
+	   in the meantime. */
+	spin_lock_irq(&connection->resource->req_lock);
+	if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) {
+		if (connection->connect_cnt != connect_cnt)
+			/* In case the connection was established and dropped
+			   while the fence-peer handler was running, ignore it */
+			drbd_info(connection, "Ignoring fence-peer exit code\n");
+		else
+			_conn_request_state(connection, mask, val, CS_VERBOSE);
+	}
+	spin_unlock_irq(&connection->resource->req_lock);
+	/* True once the highest peer disk state is D_OUTDATED or below. */
+	return conn_highest_pdsk(connection) <= D_OUTDATED;
+}
+
+static int _try_outdate_peer_async(void *data)
+{
+	struct drbd_connection *conn = data;
+
+	conn_try_outdate_peer(conn);
+	/* Drop the reference conn_try_outdate_peer_async() took for this thread. */
+	kref_put(&conn->kref, drbd_destroy_connection);
+	return 0;
+}
+
+void conn_try_outdate_peer_async(struct drbd_connection *connection)
+{
+	struct task_struct *helper;
+
+	/* Reference for the new thread; it drops it once it is done. */
+	kref_get(&connection->kref);
+	/* We may just have force_sig()'ed this thread to get it out of some
+	 * blocking network function.  kthread_run() internally waits with
+	 * wait_on_completion_killable() and would mistake our pending signal
+	 * for a new fatal one and fail, so clear signals first. */
+	flush_signals(current);
+	helper = kthread_run(_try_outdate_peer_async, connection, "drbd_async_h");
+	if (!IS_ERR(helper))
+		return;
+	drbd_err(connection, "out of mem, failed to invoke fence-peer helper\n");
+	kref_put(&connection->kref, drbd_destroy_connection);
+}
+/* Request role @new_role for @device, retrying up to max_tries times; returns the last state-change result. */
+enum drbd_state_rv
+drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
+{
+	const int max_tries = 4; /* upper bound for the retry loop below */
+	enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
+	struct net_conf *nc;
+	int try = 0;
+	int forced = 0; /* set once disk = D_UP_TO_DATE was added to the request because of @force */
+	union drbd_state mask, val;
+
+	if (new_role == R_PRIMARY) {
+		struct drbd_connection *connection;
+
+		/* Detect dead peers as soon as possible.  */
+
+		rcu_read_lock();
+		for_each_connection(connection, device->resource)
+			request_ping(connection);
+		rcu_read_unlock();
+	}
+
+	mutex_lock(device->state_mutex);
+
+	mask.i = 0; mask.role = R_MASK; /* start out requesting a change of the role field only */
+	val.i  = 0; val.role  = new_role;
+
+	while (try++ < max_tries) {
+		rv = _drbd_request_state(device, mask, val, CS_WAIT_COMPLETE);
+
+		/* in case we first succeeded to outdate,
+		 * but now suddenly could establish a connection */
+		if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
+			val.pdsk = 0; /* retry without any pdsk change in the request */
+			mask.pdsk = 0;
+			continue;
+		}
+
+		if (rv == SS_NO_UP_TO_DATE_DISK && force &&
+		    (device->state.disk < D_UP_TO_DATE &&
+		     device->state.disk >= D_INCONSISTENT)) {
+			mask.disk = D_MASK; /* @force: retry, additionally requesting disk = D_UP_TO_DATE */
+			val.disk  = D_UP_TO_DATE;
+			forced = 1;
+			continue;
+		}
+
+		if (rv == SS_NO_UP_TO_DATE_DISK &&
+		    device->state.disk == D_CONSISTENT && mask.pdsk == 0) {
+			D_ASSERT(device, device->state.pdsk == D_UNKNOWN);
+
+			if (conn_try_outdate_peer(first_peer_device(device)->connection)) { /* non-zero: ask for D_UP_TO_DATE too */
+				val.disk = D_UP_TO_DATE;
+				mask.disk = D_MASK;
+			}
+			continue;
+		}
+
+		if (rv == SS_NOTHING_TO_DO)
+			goto out;
+		if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
+			if (!conn_try_outdate_peer(first_peer_device(device)->connection) && force) { /* zero return, but @force is set */
+				drbd_warn(device, "Forced into split brain situation!\n");
+				mask.pdsk = D_MASK;
+				val.pdsk  = D_OUTDATED;
+
+			}
+			continue;
+		}
+		if (rv == SS_TWO_PRIMARIES) {
+			/* Maybe the peer is detected as dead very soon...
+			   retry at most once more in this case. */
+			int timeo;
+			rcu_read_lock();
+			nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+			timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1; /* presumably ping_timeo is in 1/10 s - TODO confirm */
+			rcu_read_unlock();
+			schedule_timeout_interruptible(timeo);
+			if (try < max_tries)
+				try = max_tries - 1; /* the loop condition then admits exactly one more pass */
+			continue;
+		}
+		if (rv < SS_SUCCESS) {
+			rv = _drbd_request_state(device, mask, val, /* same request again, now with CS_VERBOSE */
+						CS_VERBOSE + CS_WAIT_COMPLETE);
+			if (rv < SS_SUCCESS)
+				goto out;
+		}
+		break;
+	}
+
+	if (rv < SS_SUCCESS) /* also reached when the loop ran out of tries */
+		goto out;
+
+	if (forced)
+		drbd_warn(device, "Forced to consider local data as UpToDate!\n");
+
+	/* Wait until nothing is on the fly :) */
+	wait_event(device->misc_wait, atomic_read(&device->ap_pending_cnt) == 0);
+
+	/* FIXME also wait for all pending P_BARRIER_ACK? */
+
+	if (new_role == R_SECONDARY) {
+		set_disk_ro(device->vdisk, true);
+		if (get_ldev(device)) {
+			device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; /* clear bit 0 of the current UUID */
+			put_ldev(device);
+		}
+	} else {
+		/* Called from drbd_adm_set_role only, which holds resource->adm_mutex.
+		 * NOTE(review): confirm which lock protects net_conf here (conf_update?). */
+		nc = first_peer_device(device)->connection->net_conf;
+		if (nc)
+			nc->discard_my_data = 0; /* without copy; single bit op is atomic */
+
+		set_disk_ro(device->vdisk, false);
+		if (get_ldev(device)) {
+			if (((device->state.conn < C_CONNECTED ||
+			       device->state.pdsk <= D_FAILED)
+			      && device->ldev->md.uuid[UI_BITMAP] == 0) || forced)
+				drbd_uuid_new_current(device);
+
+			device->ldev->md.uuid[UI_CURRENT] |=  (u64)1; /* set bit 0 of the current UUID */
+			put_ldev(device);
+		}
+	}
+
+	/* writeout of activity log covered areas of the bitmap
+	 * to stable storage done in after state change already */
+
+	if (device->state.conn >= C_WF_REPORT_PARAMS) {
+		/* if this was forced, we should consider sync */
+		if (forced)
+			drbd_send_uuids(first_peer_device(device));
+		drbd_send_current_state(first_peer_device(device));
+	}
+
+	drbd_md_sync(device);
+
+	kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
+out:
+	mutex_unlock(device->state_mutex); /* every path after mutex_lock() above leaves through here */
+	return rv;
+}
+/* Map an error code returned by a *_from_attrs() helper to a short text for the netlink reply. */
+static const char *from_attrs_err_to_txt(int err)
+{
+	return	err == -ENOMSG ? "required attribute missing" :
+		err == -EOPNOTSUPP ? "unknown mandatory attribute" :
+		err == -EEXIST ? "can not change invariant setting" :
+		"invalid attribute value"; /* fallback for every other value */
+}
+/* Netlink handler: switch adm_ctx.device to R_PRIMARY (cmd DRBD_ADM_PRIMARY) or R_SECONDARY (any other cmd). */
+int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct set_role_parms parms;
+	int err;
+	enum drbd_ret_code retcode;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb) /* no reply skb: return without calling drbd_adm_finish() */
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	memset(&parms, 0, sizeof(parms)); /* all-zero parms when no SET_ROLE_PARMS attribute was sent */
+	if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) {
+		err = set_role_parms_from_attrs(&parms, info);
+		if (err) {
+			retcode = ERR_MANDATORY_TAG;
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+			goto out;
+		}
+	}
+	genl_unlock(); /* drbd_set_role() runs without the genl lock, under adm_mutex; re-locked below */
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+
+	if (info->genlhdr->cmd == DRBD_ADM_PRIMARY)
+		retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate);
+	else
+		retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0); /* force is never passed for secondary */
+
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+	genl_lock();
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0; /* retcode has been handed to drbd_adm_finish() instead */
+}
+
+/* Initializes the md.*_offset members and md.md_size_sect of @bdev, so we
+ * are able to find the on disk meta data.  @device is not used here.
+ *
+ * We currently have two possible layouts:
+ * external:
+ *   |----------- md_size_sect ------------------|
+ *   [ 4k superblock ][ activity log ][  Bitmap  ]
+ *   | al_offset == 8 |
+ *   | bm_offset = al_offset + X      |
+ *  ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ * internal:
+ *            |----------- md_size_sect ------------------|
+ * [data.....][  Bitmap  ][ activity log ][ 4k superblock ]
+ *                        | al_offset < 0 |
+ *            | bm_offset = al_offset - Y |
+ *  ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ *  The activity log size is taken from md.al_size_4k
+ *  (it used to be fixed at 32kB).
+ */
+static void drbd_md_set_sector_offsets(struct drbd_device *device,
+				       struct drbd_backing_dev *bdev)
+{
+	sector_t md_size_sect = 0;
+	unsigned int al_size_sect = bdev->md.al_size_4k * 8; /* 4k blocks -> 512 byte sectors */
+
+	bdev->md.md_offset = drbd_md_ss(bdev);
+
+	switch (bdev->md.meta_dev_idx) {
+	default:
+		/* v07 style fixed size indexed meta data */
+		bdev->md.md_size_sect = MD_128MB_SECT;
+		bdev->md.al_offset = MD_4kB_SECT;
+		bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
+		break;
+	case DRBD_MD_INDEX_FLEX_EXT:
+		/* just occupy the full device; unit: sectors */
+		bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
+		bdev->md.al_offset = MD_4kB_SECT;
+		bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
+		break;
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		/* the al ends where the superblock starts, hence the negative offset */
+		bdev->md.al_offset = -al_size_sect;
+		/* we need (slightly less than) ~ this much bitmap sectors: */
+		md_size_sect = drbd_get_capacity(bdev->backing_bdev);
+		md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
+		md_size_sect = BM_SECT_TO_EXT(md_size_sect);
+		md_size_sect = ALIGN(md_size_sect, 8);
+
+		/* plus the "drbd meta data super block",
+		 * and the activity log; */
+		md_size_sect += MD_4kB_SECT + al_size_sect;
+
+		bdev->md.md_size_sect = md_size_sect;
+		/* bitmap offset is adjusted by 'super' block size */
+		bdev->md.bm_offset   = -md_size_sect + MD_4kB_SECT;
+		break;
+	}
+}
+
+/* Pretty-print @size (expected to be in KB) into @buf as "<number> <unit>B"; returns @buf. */
+char *ppsize(char *buf, unsigned long long size)
+{
+	/* Needs 9 bytes at max including trailing NUL:
+	 * -1ULL ==> "16384 EB" */
+	static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
+	int base = 0; /* index into units[] */
+	while (size >= 10000 && base < sizeof(units)-1) { /* stop at 4 digits or at the last unit */
+		/* divide by 1024; add one if bit 9 (the "half") was set */
+		size = (size >> 10) + !!(size & (1<<9));
+		base++;
+	}
+	sprintf(buf, "%u %cB", (unsigned)size, units[base]);
+
+	return buf;
+}
+
+/* there is still a theoretical deadlock when called from receiver
+ * on an D_INCONSISTENT R_PRIMARY:
+ *  remote READ does inc_ap_bio, receiver would need to receive answer
+ *  packet from remote to dec_ap_bio again.
+ *  receiver receive_sizes(), comes here,
+ *  waits for ap_bio_cnt == 0. -> deadlock.
+ * but this cannot happen, actually, because:
+ *  R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
+ *  (not connected, or bad/no disk on peer):
+ *  see drbd_fail_request_early, ap_bio_cnt is zero.
+ *  R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
+ *  peer may not initiate a resize.
+ */
+/* Note these are not to be confused with
+ * drbd_adm_suspend_io/drbd_adm_resume_io,
+ * which are (sub) state changes triggered by admin (drbdsetup),
+ * and can be long lived.
+ * This sets the SUSPEND_IO bit in device->flags, is triggered by drbd
+ * internals, and should be short-lived. */
+void drbd_suspend_io(struct drbd_device *device)
+{
+	set_bit(SUSPEND_IO, &device->flags);
+	if (drbd_suspended(device)) /* in that case return without waiting */
+		return;
+	wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt)); /* until ap_bio_cnt reads zero */
+}
+/* Counterpart of drbd_suspend_io(): clears SUSPEND_IO and wakes up waiters on misc_wait. */
+void drbd_resume_io(struct drbd_device *device)
+{
+	clear_bit(SUSPEND_IO, &device->flags);
+	wake_up(&device->misc_wait);
+}
+
+/**
+ * drbd_determine_dev_size() -  Sets the right device size obeying all constraints
+ * @device:	DRBD device.  @flags: DDSF_* bits.  @rs: new AL layout to apply, or NULL.
+ *
+ * Returns a DS_* code; values <= DS_ERROR (and DS_ERROR_SHRINK/_SPACE_MD) are errors.
+ * You should call drbd_md_sync() after calling this function.
+ */
+enum determine_dev_size
+drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
+{
+	sector_t prev_first_sect, prev_size; /* previous meta location */
+	sector_t la_size_sect, u_size; /* md.la_size_sect and disk_conf->disk_size */
+	struct drbd_md *md = &device->ldev->md;
+	u32 prev_al_stripe_size_4k; /* only set and used when rs != NULL */
+	u32 prev_al_stripes; /* only set and used when rs != NULL */
+	sector_t size; /* new device size, in sectors throughout this function */
+	char ppb[10]; /* output buffer for ppsize() */
+	void *buffer;
+
+	int md_moved, la_size_changed;
+	enum determine_dev_size rv = DS_UNCHANGED;
+
+	/* race:
+	 * application request passes inc_ap_bio,
+	 * but then cannot get an AL-reference.
+	 * this function later may wait on ap_bio_cnt == 0. -> deadlock.
+	 *
+	 * to avoid that:
+	 * Suspend IO right here.
+	 * still lock the act_log to not trigger ASSERTs there.
+	 */
+	drbd_suspend_io(device);
+	buffer = drbd_md_get_buffer(device); /* Lock meta-data IO */
+	if (!buffer) {
+		drbd_resume_io(device);
+		return DS_ERROR;
+	}
+
+	/* no wait necessary anymore, actually we could assert that */
+	wait_event(device->al_wait, lc_try_lock(device->act_log));
+
+	prev_first_sect = drbd_md_first_sector(device->ldev);
+	prev_size = device->ldev->md.md_size_sect;
+	la_size_sect = device->ldev->md.la_size_sect;
+
+	if (rs) {
+		/* rs is non NULL if we should change the AL layout only */
+
+		prev_al_stripes = md->al_stripes; /* remembered for the rollback at err_out */
+		prev_al_stripe_size_4k = md->al_stripe_size_4k;
+
+		md->al_stripes = rs->al_stripes;
+		md->al_stripe_size_4k = rs->al_stripe_size / 4;
+		md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
+	}
+
+	drbd_md_set_sector_offsets(device, device->ldev);
+
+	rcu_read_lock();
+	u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
+	rcu_read_unlock();
+	size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED);
+
+	if (size < la_size_sect) {
+		if (rs && u_size == 0) {
+			/* Remove "rs &&" later. This check should always be active, but
+			   right now the receiver expects the permissive behavior */
+			drbd_warn(device, "Implicit shrink not allowed. "
+				 "Use --size=%llus for explicit shrink.\n",
+				 (unsigned long long)size);
+			rv = DS_ERROR_SHRINK;
+		}
+		if (u_size > size) /* checked last, so it overrides DS_ERROR_SHRINK */
+			rv = DS_ERROR_SPACE_MD;
+		if (rv != DS_UNCHANGED)
+			goto err_out;
+	}
+
+	if (drbd_get_capacity(device->this_bdev) != size ||
+	    drbd_bm_capacity(device) != size) {
+		int err;
+		err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC));
+		if (unlikely(err)) {
+			/* currently there is only one error: ENOMEM!  Fall back to the bitmap's capacity, kept in sectors. */
+			size = drbd_bm_capacity(device);
+			if (size == 0) {
+				drbd_err(device, "OUT OF MEMORY! "
+				    "Could not allocate bitmap!\n");
+			} else {
+				drbd_err(device, "BM resizing failed. "
+				    "Leaving size unchanged at size = %lu KB\n",
+				    (unsigned long)(size >> 1)); /* sectors -> KB for the message only */
+			}
+			rv = DS_ERROR;
+		}
+		/* racy, see comments above. */
+		drbd_set_my_capacity(device, size);
+		device->ldev->md.la_size_sect = size;
+		drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
+		     (unsigned long long)size>>1);
+	}
+	if (rv <= DS_ERROR)
+		goto err_out;
+
+	la_size_changed = (la_size_sect != device->ldev->md.la_size_sect);
+
+	md_moved = prev_first_sect != drbd_md_first_sector(device->ldev)
+		|| prev_size	   != device->ldev->md.md_size_sect;
+
+	if (la_size_changed || md_moved || rs) {
+		u32 prev_flags;
+
+		drbd_al_shrink(device); /* All extents inactive. */
+
+		prev_flags = md->flags; /* MDF_PRIMARY_IND is cleared while rewriting, restored below */
+		md->flags &= ~MDF_PRIMARY_IND;
+		drbd_md_write(device, buffer);
+
+		drbd_info(device, "Writing the whole bitmap, %s\n",
+			 la_size_changed && md_moved ? "size changed and md moved" :
+			 la_size_changed ? "size changed" : "md moved"); /* "md moved" is also printed for an rs-only change */
+		/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
+		drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
+			       "size changed", BM_LOCKED_MASK);
+		drbd_initialize_al(device, buffer);
+
+		md->flags = prev_flags;
+		drbd_md_write(device, buffer);
+
+		if (rs)
+			drbd_info(device, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n",
+				  md->al_stripes, md->al_stripe_size_4k * 4);
+	}
+
+	if (size > la_size_sect)
+		rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO;
+	if (size < la_size_sect)
+		rv = DS_SHRUNK;
+
+	if (0) { /* this body is entered only through "goto err_out" */
+	err_out:
+		if (rs) {
+			md->al_stripes = prev_al_stripes; /* roll back the AL layout change made above */
+			md->al_stripe_size_4k = prev_al_stripe_size_4k;
+			md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k;
+
+			drbd_md_set_sector_offsets(device, device->ldev);
+		}
+	}
+	lc_unlock(device->act_log); /* common exit: undo lock, buffer and IO suspension in reverse order */
+	wake_up(&device->al_wait);
+	drbd_md_put_buffer(device);
+	drbd_resume_io(device);
+
+	return rv;
+}
+/* Choose the device size (sectors) from the partner's, our own, the last agreed and the requested (@u_size) size. */
+sector_t
+drbd_new_dev_size(struct drbd_device *device, struct drbd_backing_dev *bdev,
+		  sector_t u_size, int assume_peer_has_space)
+{
+	sector_t p_size = device->p_size;   /* partner's disk size. */
+	sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */
+	sector_t m_size; /* my size */
+	sector_t size = 0; /* stays 0 when no size is known at all */
+
+	m_size = drbd_get_max_capacity(bdev);
+
+	if (device->state.conn < C_CONNECTED && assume_peer_has_space) {
+		drbd_warn(device, "Resize while not connected was forced by the user!\n");
+		p_size = m_size; /* pretend the peer has as much space as we do */
+	}
+
+	if (p_size && m_size) {
+		size = min_t(sector_t, p_size, m_size); /* both known: the smaller one wins */
+	} else {
+		if (la_size_sect) {
+			size = la_size_sect; /* start from the last agreed size, capped by any known size */
+			if (m_size && m_size < size)
+				size = m_size;
+			if (p_size && p_size < size)
+				size = p_size;
+		} else {
+			if (m_size) /* at most one of m_size/p_size is non-zero in this branch */
+				size = m_size;
+			if (p_size)
+				size = p_size;
+		}
+	}
+
+	if (size == 0)
+		drbd_err(device, "Both nodes diskless!\n");
+
+	if (u_size) {
+		if (u_size > size) /* too big: only log it, size stays as computed */
+			drbd_err(device, "Requested disk size is too big (%lu > %lu)\n",
+			    (unsigned long)u_size>>1, (unsigned long)size>>1);
+		else
+			size = u_size;
+	}
+
+	return size;
+}
+
+/**
+ * drbd_check_al_size() - Ensures that the AL is of the right size
+ * @device:	DRBD device.
+ * @dc:		disk_conf whose al_extents gives the wanted number of AL elements.
+ *
+ * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
+ * failed, and 0 on success. You should call drbd_md_sync() afterwards.
+ */
+static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
+{
+	struct lru_cache *n, *t; /* n: newly created cache, t: the current one (may be NULL) */
+	struct lc_element *e;
+	unsigned int in_use; /* sum of the refcnt of all elements of t */
+	int i;
+
+	if (device->act_log &&
+	    device->act_log->nr_elements == dc->al_extents)
+		return 0; /* already has the requested number of elements */
+
+	in_use = 0;
+	t = device->act_log;
+	n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION,
+		dc->al_extents, sizeof(struct lc_element), 0);
+
+	if (n == NULL) {
+		drbd_err(device, "Cannot allocate act_log lru!\n");
+		return -ENOMEM;
+	}
+	spin_lock_irq(&device->al_lock);
+	if (t) {
+		for (i = 0; i < t->nr_elements; i++) {
+			e = lc_element_by_index(t, i);
+			if (e->refcnt)
+				drbd_err(device, "refcnt(%d)==%d\n",
+				    e->lc_number, e->refcnt);
+			in_use += e->refcnt;
+		}
+	}
+	if (!in_use)
+		device->act_log = n; /* switch over while still holding al_lock */
+	spin_unlock_irq(&device->al_lock);
+	if (in_use) {
+		drbd_err(device, "Activity log still in use!\n");
+		lc_destroy(n); /* the old cache stays in place */
+		return -EBUSY;
+	} else {
+		if (t)
+			lc_destroy(t);
+	}
+	drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elements */
+	return 0;
+}
+/* Apply @max_bio_size (bytes) and, if a local disk is at least D_ATTACHING, its queue limits to our request queue. */
+static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_bio_size)
+{
+	struct request_queue * const q = device->rq_queue;
+	unsigned int max_hw_sectors = max_bio_size >> 9; /* bytes -> 512 byte sectors */
+	unsigned int max_segments = 0; /* 0: fall back to BLK_MAX_SEGMENTS below */
+	struct request_queue *b = NULL; /* backing device queue; non-NULL also means we hold an ldev reference */
+
+	if (get_ldev_if_state(device, D_ATTACHING)) {
+		b = device->ldev->backing_bdev->bd_disk->queue;
+
+		max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
+		rcu_read_lock();
+		max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs;
+		rcu_read_unlock();
+
+		blk_set_stacking_limits(&q->limits);
+		blk_queue_max_write_same_sectors(q, 0);
+	}
+
+	blk_queue_logical_block_size(q, 512);
+	blk_queue_max_hw_sectors(q, max_hw_sectors);
+	/* This is the workaround for "bio would need to, but cannot, be split" */
+	blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
+	blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
+
+	if (b) {
+		struct drbd_connection *connection = first_peer_device(device)->connection;
+
+		if (blk_queue_discard(b) && /* discard only if not connected, or the peer agreed to FF_TRIM */
+		    (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) {
+			/* For now, don't allow more than one activity log extent worth of data
+			 * to be discarded in one go. We may need to rework drbd_al_begin_io()
+			 * to allow for even larger discard ranges */
+			q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS;
+
+			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+			/* REALLY? Is stacking secdiscard "legal"? */
+			if (blk_queue_secdiscard(b))
+				queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q);
+		} else {
+			q->limits.max_discard_sectors = 0;
+			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
+			queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q);
+		}
+
+		blk_queue_stack_limits(q, b);
+
+		if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
+			drbd_info(device, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
+				 q->backing_dev_info.ra_pages,
+				 b->backing_dev_info.ra_pages);
+			q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
+		}
+		put_ldev(device); /* pairs with get_ldev_if_state() at the top */
+	}
+}
+/* Recompute the max BIO size as min(local limit, peer limit) and apply it via drbd_setup_queue_param(). */
+void drbd_reconsider_max_bio_size(struct drbd_device *device)
+{
+	unsigned int now, new, local, peer; /* all in bytes */
+
+	now = queue_max_hw_sectors(device->rq_queue) << 9; /* sectors -> bytes */
+	local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */
+	peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */
+
+	if (get_ldev_if_state(device, D_ATTACHING)) {
+		local = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
+		device->local_max_bio_size = local; /* remember it for when no ldev is available */
+		put_ldev(device);
+	}
+	local = min(local, DRBD_MAX_BIO_SIZE);
+
+	/* We may ignore peer limits if the peer is modern enough.
+	   Because new from 8.3.8 onwards the peer can use multiple
+	   BIOs for a single peer_request */
+	if (device->state.conn >= C_WF_REPORT_PARAMS) {
+		if (first_peer_device(device)->connection->agreed_pro_version < 94)
+			peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+			/* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
+		else if (first_peer_device(device)->connection->agreed_pro_version == 94)
+			peer = DRBD_MAX_SIZE_H80_PACKET;
+		else if (first_peer_device(device)->connection->agreed_pro_version < 100)
+			peer = DRBD_MAX_BIO_SIZE_P95;  /* drbd 8.3.8 onwards, before 8.4.0 */
+		else
+			peer = DRBD_MAX_BIO_SIZE;
+
+		/* We may later detach and re-attach on a disconnected Primary.
+		 * Avoid this setting to jump back in that case.
+		 * We want to store what we know the peer DRBD can handle,
+		 * not what the peer IO backend can handle. */
+		if (peer > device->peer_max_bio_size)
+			device->peer_max_bio_size = peer;
+	}
+	new = min(local, peer);
+
+	if (device->state.role == R_PRIMARY && new < now) /* only logged; the smaller value is applied anyway */
+		drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", new, now);
+
+	if (new != now)
+		drbd_info(device, "max BIO size = %u\n", new);
+
+	drbd_setup_queue_param(device, new);
+}
+
+/* Starts the worker thread, then flushes the connection's sender_work queue. */
+static void conn_reconfig_start(struct drbd_connection *connection)
+{
+	drbd_thread_start(&connection->worker);
+	drbd_flush_workqueue(&connection->sender_work);
+}
+
+/* If still unconfigured (all volumes unconfigured and cstate C_STANDALONE), stops receiver and worker again. */
+static void conn_reconfig_done(struct drbd_connection *connection)
+{
+	bool stop_threads;
+	spin_lock_irq(&connection->resource->req_lock); /* both conditions are sampled under req_lock */
+	stop_threads = conn_all_vols_unconf(connection) &&
+		connection->cstate == C_STANDALONE;
+	spin_unlock_irq(&connection->resource->req_lock);
+	if (stop_threads) {
+		/* asender is implicitly stopped by receiver
+		 * in conn_disconnect() */
+		drbd_thread_stop(&connection->receiver);
+		drbd_thread_stop(&connection->worker);
+	}
+}
+
+/* Sets AL_SUSPENDED if not connected.  Make sure IO is suspended before calling this function(). */
+static void drbd_suspend_al(struct drbd_device *device)
+{
+	int s = 0; /* becomes 1 only if this call newly set AL_SUSPENDED */
+
+	if (!lc_try_lock(device->act_log)) {
+		drbd_warn(device, "Failed to lock al in drbd_suspend_al()\n");
+		return;
+	}
+
+	drbd_al_shrink(device);
+	spin_lock_irq(&device->resource->req_lock);
+	if (device->state.conn < C_CONNECTED) /* the flag is only set while not connected */
+		s = !test_and_set_bit(AL_SUSPENDED, &device->flags);
+	spin_unlock_irq(&device->resource->req_lock);
+	lc_unlock(device->act_log);
+
+	if (s)
+		drbd_info(device, "Suspended AL updates\n");
+}
+
+/* True if the request's drbd_genlmsghdr has DRBD_GENL_F_SET_DEFAULTS set in its flags. */
+static bool should_set_defaults(struct genl_info *info)
+{
+	unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags;
+	return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
+}
+/* Upper bound for al_extents, given the on-disk activity log size of @bdev (md.al_size_4k). */
+static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
+{
+	/* This is limited by 16 bit "slot" numbers,
+	 * and by available on-disk context storage.
+	 *
+	 * Also (u16)~0 is special (denotes a "free" extent).
+	 *
+	 * One transaction occupies one 4kB on-disk block,
+	 * we have n such blocks in the on disk ring buffer,
+	 * the "current" transaction may fail (n-1),
+	 * and there is 919 slot numbers context information per transaction.
+	 *
+	 * 72 transaction blocks amounts to more than 2**16 context slots,
+	 * so cap there first.
+	 */
+	const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
+	const unsigned int sufficient_on_disk = /* max_al_nr / AL_CONTEXT_PER_TRANSACTION, rounded up */
+		(max_al_nr + AL_CONTEXT_PER_TRANSACTION -1)
+		/AL_CONTEXT_PER_TRANSACTION;
+
+	unsigned int al_size_4k = bdev->md.al_size_4k;
+
+	if (al_size_4k > sufficient_on_disk)
+		return max_al_nr;
+
+	return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION; /* the "(n-1)" from the comment above */
+}
+/* Netlink handler: change the disk options (disk_conf) of an attached device; reports retcode via drbd_adm_finish(). */
+int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+	struct drbd_device *device;
+	struct disk_conf *new_disk_conf, *old_disk_conf;
+	struct fifo_buffer *old_plan = NULL, *new_plan = NULL; /* new_plan stays NULL if the fifo size is unchanged */
+	int err, fifo_size;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb) /* no reply skb: return without calling drbd_adm_finish() */
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto finish;
+
+	device = adm_ctx.device;
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+
+	/* we also need a disk
+	 * to change the options on */
+	if (!get_ldev(device)) {
+		retcode = ERR_NO_DISK;
+		goto out;
+	}
+
+	new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
+	if (!new_disk_conf) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	mutex_lock(&device->resource->conf_update);
+	old_disk_conf = device->ldev->disk_conf;
+	*new_disk_conf = *old_disk_conf; /* start from a copy of the current settings */
+	if (should_set_defaults(info))
+		set_disk_conf_defaults(new_disk_conf);
+
+	err = disk_conf_from_attrs_for_change(new_disk_conf, info);
+	if (err && err != -ENOMSG) { /* -ENOMSG is tolerated here */
+		retcode = ERR_MANDATORY_TAG;
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+		goto fail_unlock;
+	}
+
+	if (!expect(new_disk_conf->resync_rate >= 1))
+		new_disk_conf->resync_rate = 1;
+
+	if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) /* clamp al_extents into the supported range */
+		new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+	if (new_disk_conf->al_extents > drbd_al_extents_max(device->ldev))
+		new_disk_conf->al_extents = drbd_al_extents_max(device->ldev);
+
+	if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+		new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
+
+	fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+	if (fifo_size != device->rs_plan_s->size) { /* a new plan buffer is only needed if its size changes */
+		new_plan = fifo_alloc(fifo_size);
+		if (!new_plan) {
+			drbd_err(device, "kmalloc of fifo_buffer failed\n");
+			retcode = ERR_NOMEM;
+			goto fail_unlock;
+		}
+	}
+
+	drbd_suspend_io(device); /* resize the activity log with IO suspended and the AL locked */
+	wait_event(device->al_wait, lc_try_lock(device->act_log));
+	drbd_al_shrink(device);
+	err = drbd_check_al_size(device, new_disk_conf);
+	lc_unlock(device->act_log);
+	wake_up(&device->al_wait);
+	drbd_resume_io(device);
+
+	if (err) { /* note: -EBUSY from drbd_check_al_size() is reported as ERR_NOMEM as well */
+		retcode = ERR_NOMEM;
+		goto fail_unlock;
+	}
+
+	write_lock_irq(&global_state_lock);
+	retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
+	if (retcode == NO_ERROR) {
+		rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); /* publish the new configuration */
+		drbd_resync_after_changed(device);
+	}
+	write_unlock_irq(&global_state_lock);
+
+	if (retcode != NO_ERROR)
+		goto fail_unlock;
+
+	if (new_plan) {
+		old_plan = device->rs_plan_s; /* freed below, after synchronize_rcu() */
+		rcu_assign_pointer(device->rs_plan_s, new_plan);
+	}
+
+	mutex_unlock(&device->resource->conf_update);
+
+	if (new_disk_conf->al_updates)
+		device->ldev->md.flags &= ~MDF_AL_DISABLED;
+	else
+		device->ldev->md.flags |= MDF_AL_DISABLED;
+
+	if (new_disk_conf->md_flushes)
+		clear_bit(MD_NO_FUA, &device->flags);
+	else
+		set_bit(MD_NO_FUA, &device->flags);
+
+	drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
+
+	drbd_md_sync(device);
+
+	if (device->state.conn >= C_CONNECTED) {
+		struct drbd_peer_device *peer_device;
+
+		for_each_peer_device(peer_device, device)
+			drbd_send_sync_param(peer_device);
+	}
+
+	synchronize_rcu(); /* wait for RCU readers before freeing the replaced objects */
+	kfree(old_disk_conf);
+	kfree(old_plan);
+	mod_timer(&device->request_timer, jiffies + HZ);
+	goto success; /* skip the error-path frees: new_disk_conf and new_plan are in use now */
+
+fail_unlock:
+	mutex_unlock(&device->resource->conf_update);
+ fail:
+	kfree(new_disk_conf);
+	kfree(new_plan);
+success:
+	put_ldev(device); /* pairs with get_ldev() above */
+ out:
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0; /* retcode has been handed to drbd_adm_finish() instead */
+}
+
+int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct drbd_device *device;
+	int err;
+	enum drbd_ret_code retcode;
+	enum determine_dev_size dd;
+	sector_t max_possible_sectors;
+	sector_t min_md_device_sectors;
+	struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
+	struct disk_conf *new_disk_conf = NULL;
+	struct block_device *bdev;
+	struct lru_cache *resync_lru = NULL;
+	struct fifo_buffer *new_plan = NULL;
+	union drbd_state ns, os;
+	enum drbd_state_rv rv;
+	struct net_conf *nc;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto finish;
+
+	device = adm_ctx.device;
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	conn_reconfig_start(first_peer_device(device)->connection);
+
+	/* if you want to reconfigure, please tear down first */
+	if (device->state.disk > D_DISKLESS) {
+		retcode = ERR_DISK_CONFIGURED;
+		goto fail;
+	}
+	/* It may just now have detached because of IO error.  Make sure
+	 * drbd_ldev_destroy is done already, we may end up here very fast,
+	 * e.g. if someone calls attach from the on-io-error handler,
+	 * to realize a "hot spare" feature (not that I'd recommend that) */
+	wait_event(device->misc_wait, !atomic_read(&device->local_cnt));
+
+	/* make sure there is no leftover from previous force-detach attempts */
+	clear_bit(FORCE_DETACH, &device->flags);
+	clear_bit(WAS_IO_ERROR, &device->flags);
+	clear_bit(WAS_READ_ERROR, &device->flags);
+
+	/* and no leftover from previously aborted resync or verify, either */
+	device->rs_total = 0;
+	device->rs_failed = 0;
+	atomic_set(&device->rs_pending_cnt, 0);
+
+	/* allocation not in the IO path, drbdsetup context */
+	nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
+	if (!nbc) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+	spin_lock_init(&nbc->md.uuid_lock);
+
+	new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+	if (!new_disk_conf) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+	nbc->disk_conf = new_disk_conf;
+
+	set_disk_conf_defaults(new_disk_conf);
+	err = disk_conf_from_attrs(new_disk_conf, info);
+	if (err) {
+		retcode = ERR_MANDATORY_TAG;
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+		goto fail;
+	}
+
+	if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+		new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
+
+	new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
+	if (!new_plan) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
+		retcode = ERR_MD_IDX_INVALID;
+		goto fail;
+	}
+
+	write_lock_irq(&global_state_lock);
+	retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
+	write_unlock_irq(&global_state_lock);
+	if (retcode != NO_ERROR)
+		goto fail;
+
+	rcu_read_lock();
+	nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+	if (nc) {
+		if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
+			rcu_read_unlock();
+			retcode = ERR_STONITH_AND_PROT_A;
+			goto fail;
+		}
+	}
+	rcu_read_unlock();
+
+	bdev = blkdev_get_by_path(new_disk_conf->backing_dev,
+				  FMODE_READ | FMODE_WRITE | FMODE_EXCL, device);
+	if (IS_ERR(bdev)) {
+		drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
+			PTR_ERR(bdev));
+		retcode = ERR_OPEN_DISK;
+		goto fail;
+	}
+	nbc->backing_bdev = bdev;
+
+	/*
+	 * meta_dev_idx >= 0: external fixed size, possibly multiple
+	 * drbd sharing one meta device.  TODO in that case, paranoia
+	 * check that [md_bdev, meta_dev_idx] is not yet used by some
+	 * other drbd minor!  (if you use drbd.conf + drbdadm, that
+	 * should check it for you already; but if you don't, or
+	 * someone fooled it, we need to double check here)
+	 */
+	bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
+				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
+				  (new_disk_conf->meta_dev_idx < 0) ?
+				  (void *)device : (void *)drbd_m_holder);
+	if (IS_ERR(bdev)) {
+		drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
+			PTR_ERR(bdev));
+		retcode = ERR_OPEN_MD_DISK;
+		goto fail;
+	}
+	nbc->md_bdev = bdev;
+
+	if ((nbc->backing_bdev == nbc->md_bdev) !=
+	    (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+	     new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
+		retcode = ERR_MD_IDX_INVALID;
+		goto fail;
+	}
+
+	resync_lru = lc_create("resync", drbd_bm_ext_cache,
+			1, 61, sizeof(struct bm_extent),
+			offsetof(struct bm_extent, lce));
+	if (!resync_lru) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	/* Read our meta data super block early.
+	 * This also sets other on-disk offsets. */
+	retcode = drbd_md_read(device, nbc);
+	if (retcode != NO_ERROR)
+		goto fail;
+
+	if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
+		new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+	if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
+		new_disk_conf->al_extents = drbd_al_extents_max(nbc);
+
+	if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
+		drbd_err(device, "max capacity %llu smaller than disk size %llu\n",
+			(unsigned long long) drbd_get_max_capacity(nbc),
+			(unsigned long long) new_disk_conf->disk_size);
+		retcode = ERR_DISK_TOO_SMALL;
+		goto fail;
+	}
+
+	if (new_disk_conf->meta_dev_idx < 0) {
+		max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
+		/* at least one MB, otherwise it does not make sense */
+		min_md_device_sectors = (2<<10);
+	} else {
+		max_possible_sectors = DRBD_MAX_SECTORS;
+		min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
+	}
+
+	if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
+		retcode = ERR_MD_DISK_TOO_SMALL;
+		drbd_warn(device, "refusing attach: md-device too small, "
+		     "at least %llu sectors needed for this meta-disk type\n",
+		     (unsigned long long) min_md_device_sectors);
+		goto fail;
+	}
+
+	/* Make sure the new disk is big enough
+	 * (we may currently be R_PRIMARY with no local disk...) */
+	if (drbd_get_max_capacity(nbc) <
+	    drbd_get_capacity(device->this_bdev)) {
+		retcode = ERR_DISK_TOO_SMALL;
+		goto fail;
+	}
+
+	nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
+
+	if (nbc->known_size > max_possible_sectors) {
+		drbd_warn(device, "==> truncating very big lower level device "
+			"to currently maximum possible %llu sectors <==\n",
+			(unsigned long long) max_possible_sectors);
+		if (new_disk_conf->meta_dev_idx >= 0)
+			drbd_warn(device, "==>> using internal or flexible "
+				      "meta data may help <<==\n");
+	}
+
+	drbd_suspend_io(device);
+	/* also wait for the last barrier ack. */
+	/* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171
+	 * We need a way to either ignore barrier acks for barriers sent before a device
+	 * was attached, or a way to wait for all pending barrier acks to come in.
+	 * As barriers are counted per resource,
+	 * we'd need to suspend io on all devices of a resource.
+	 */
+	wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device));
+	/* and for any other previously queued work */
+	drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
+
+	rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE);
+	retcode = rv;  /* FIXME: Type mismatch. */
+	drbd_resume_io(device);
+	if (rv < SS_SUCCESS)
+		goto fail;
+
+	if (!get_ldev_if_state(device, D_ATTACHING))
+		goto force_diskless;
+
+	if (!device->bitmap) {
+		if (drbd_bm_init(device)) {
+			retcode = ERR_NOMEM;
+			goto force_diskless_dec;
+		}
+	}
+
+	if (device->state.conn < C_CONNECTED &&
+	    device->state.role == R_PRIMARY && device->ed_uuid &&
+	    (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
+		drbd_err(device, "Can only attach to data with current UUID=%016llX\n",
+		    (unsigned long long)device->ed_uuid);
+		retcode = ERR_DATA_NOT_CURRENT;
+		goto force_diskless_dec;
+	}
+
+	/* Since we are diskless, fix the activity log first... */
+	if (drbd_check_al_size(device, new_disk_conf)) {
+		retcode = ERR_NOMEM;
+		goto force_diskless_dec;
+	}
+
+	/* Prevent shrinking of consistent devices ! */
+	if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
+	    drbd_new_dev_size(device, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) {
+		drbd_warn(device, "refusing to truncate a consistent device\n");
+		retcode = ERR_DISK_TOO_SMALL;
+		goto force_diskless_dec;
+	}
+
+	/* Reset the "barriers don't work" bits here, then force meta data to
+	 * be written, to ensure we determine if barriers are supported. */
+	if (new_disk_conf->md_flushes)
+		clear_bit(MD_NO_FUA, &device->flags);
+	else
+		set_bit(MD_NO_FUA, &device->flags);
+
+	/* Point of no return reached.
+	 * Devices and memory are no longer released by error cleanup below.
+	 * now device takes over responsibility, and the state engine should
+	 * clean it up somewhere.  */
+	D_ASSERT(device, device->ldev == NULL);
+	device->ldev = nbc;
+	device->resync = resync_lru;
+	device->rs_plan_s = new_plan;
+	nbc = NULL;
+	resync_lru = NULL;
+	new_disk_conf = NULL;
+	new_plan = NULL;
+
+	drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
+
+	if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
+		set_bit(CRASHED_PRIMARY, &device->flags);
+	else
+		clear_bit(CRASHED_PRIMARY, &device->flags);
+
+	if (drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
+	    !(device->state.role == R_PRIMARY && device->resource->susp_nod))
+		set_bit(CRASHED_PRIMARY, &device->flags);
+
+	device->send_cnt = 0;
+	device->recv_cnt = 0;
+	device->read_cnt = 0;
+	device->writ_cnt = 0;
+
+	drbd_reconsider_max_bio_size(device);
+
+	/* If I am currently not R_PRIMARY,
+	 * but meta data primary indicator is set,
+	 * I just now recover from a hard crash,
+	 * and have been R_PRIMARY before that crash.
+	 *
+	 * Now, if I had no connection before that crash
+	 * (have been degraded R_PRIMARY), chances are that
+	 * I won't find my peer now either.
+	 *
+	 * In that case, and _only_ in that case,
+	 * we use the degr-wfc-timeout instead of the default,
+	 * so we can automatically recover from a crash of a
+	 * degraded but active "cluster" after a certain timeout.
+	 */
+	clear_bit(USE_DEGR_WFC_T, &device->flags);
+	if (device->state.role != R_PRIMARY &&
+	     drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
+	    !drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND))
+		set_bit(USE_DEGR_WFC_T, &device->flags);
+
+	dd = drbd_determine_dev_size(device, 0, NULL);
+	if (dd <= DS_ERROR) {
+		retcode = ERR_NOMEM_BITMAP;
+		goto force_diskless_dec;
+	} else if (dd == DS_GREW)
+		set_bit(RESYNC_AFTER_NEG, &device->flags);
+
+	if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ||
+	    (test_bit(CRASHED_PRIMARY, &device->flags) &&
+	     drbd_md_test_flag(device->ldev, MDF_AL_DISABLED))) {
+		drbd_info(device, "Assuming that all blocks are out of sync "
+		     "(aka FullSync)\n");
+		if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
+			"set_n_write from attaching", BM_LOCKED_MASK)) {
+			retcode = ERR_IO_MD_DISK;
+			goto force_diskless_dec;
+		}
+	} else {
+		if (drbd_bitmap_io(device, &drbd_bm_read,
+			"read from attaching", BM_LOCKED_MASK)) {
+			retcode = ERR_IO_MD_DISK;
+			goto force_diskless_dec;
+		}
+	}
+
+	if (_drbd_bm_total_weight(device) == drbd_bm_bits(device))
+		drbd_suspend_al(device); /* IO is still suspended here... */
+
+	spin_lock_irq(&device->resource->req_lock);
+	os = drbd_read_state(device);
+	ns = os;
+	/* If MDF_CONSISTENT is not set go into inconsistent state,
+	   otherwise investigate MDF_WasUpToDate...
+	   If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
+	   otherwise into D_CONSISTENT state.
+	*/
+	if (drbd_md_test_flag(device->ldev, MDF_CONSISTENT)) {
+		if (drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE))
+			ns.disk = D_CONSISTENT;
+		else
+			ns.disk = D_OUTDATED;
+	} else {
+		ns.disk = D_INCONSISTENT;
+	}
+
+	if (drbd_md_test_flag(device->ldev, MDF_PEER_OUT_DATED))
+		ns.pdsk = D_OUTDATED;
+
+	rcu_read_lock();
+	if (ns.disk == D_CONSISTENT &&
+	    (ns.pdsk == D_OUTDATED || rcu_dereference(device->ldev->disk_conf)->fencing == FP_DONT_CARE))
+		ns.disk = D_UP_TO_DATE;
+
+	/* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
+	   MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
+	   this point, because drbd_request_state() modifies these
+	   flags. */
+
+	if (rcu_dereference(device->ldev->disk_conf)->al_updates)
+		device->ldev->md.flags &= ~MDF_AL_DISABLED;
+	else
+		device->ldev->md.flags |= MDF_AL_DISABLED;
+
+	rcu_read_unlock();
+
+	/* In case we are C_CONNECTED postpone any decision on the new disk
+	   state after the negotiation phase. */
+	if (device->state.conn == C_CONNECTED) {
+		device->new_state_tmp.i = ns.i;
+		ns.i = os.i;
+		ns.disk = D_NEGOTIATING;
+
+		/* We expect to receive up-to-date UUIDs soon.
+		   To avoid a race in receive_state, free p_uuid while
+		   holding req_lock. I.e. atomic with the state change */
+		kfree(device->p_uuid);
+		device->p_uuid = NULL;
+	}
+
+	rv = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	if (rv < SS_SUCCESS)
+		goto force_diskless_dec;
+
+	mod_timer(&device->request_timer, jiffies + HZ);
+
+	if (device->state.role == R_PRIMARY)
+		device->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
+	else
+		device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
+
+	drbd_md_mark_dirty(device);
+	drbd_md_sync(device);
+
+	kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
+	put_ldev(device);
+	conn_reconfig_done(first_peer_device(device)->connection);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+
+ force_diskless_dec:
+	put_ldev(device);
+ force_diskless:
+	drbd_force_state(device, NS(disk, D_DISKLESS));
+	drbd_md_sync(device);
+ fail:
+	conn_reconfig_done(first_peer_device(device)->connection);
+	if (nbc) {
+		if (nbc->backing_bdev)
+			blkdev_put(nbc->backing_bdev,
+				   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+		if (nbc->md_bdev)
+			blkdev_put(nbc->md_bdev,
+				   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+		kfree(nbc);
+	}
+	kfree(new_disk_conf);
+	lc_destroy(resync_lru);
+	kfree(new_plan);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+/* Take the local backing device of @device out of service.
+ *
+ * With @force set, FORCE_DETACH is flagged and the disk is forced to
+ * D_FAILED; SS_SUCCESS is returned without waiting for anything.
+ * Otherwise IO is suspended, D_FAILED is requested while the meta-data
+ * buffer is held, and we sleep (interruptibly) until the disk state is
+ * no longer D_FAILED.
+ *
+ * Returns the state change result, with SS_IS_DISKLESS mapped to
+ * SS_NOTHING_TO_DO, or ERR_INTR if the wait was interrupted. */
+static int adm_detach(struct drbd_device *device, int force)
+{
+	enum drbd_state_rv rv;
+	int interrupted;
+
+	if (force) {
+		set_bit(FORCE_DETACH, &device->flags);
+		drbd_force_state(device, NS(disk, D_FAILED));
+		return SS_SUCCESS;
+	}
+
+	/* so no-one is stuck in drbd_al_begin_io */
+	drbd_suspend_io(device);
+	/* make sure there is no in-flight meta-data IO */
+	drbd_md_get_buffer(device);
+	rv = drbd_request_state(device, NS(disk, D_FAILED));
+	drbd_md_put_buffer(device);
+
+	/* D_FAILED will transition to DISKLESS. */
+	interrupted = wait_event_interruptible(device->misc_wait,
+					       device->state.disk != D_FAILED);
+	drbd_resume_io(device);
+
+	/* An interrupted wait takes precedence over the state result. */
+	if (interrupted)
+		return ERR_INTR;
+	if ((int)rv == (int)SS_IS_DISKLESS)
+		return SS_NOTHING_TO_DO;
+	return rv;
+}
+
+/* Netlink handler for detaching the backing disk of a minor.
+ *
+ * Detaching happens in stages: application IO, in-flight IO and IO stuck
+ * in drbd_al_begin_io have to be locked out first.  Then we transition to
+ * D_DISKLESS and wait for put_ldev() to return all internal references.
+ * Only after that the detach is complete.
+ *
+ * The optional DRBD_NLA_DETACH_PARMS attribute carries the force_detach
+ * flag.  The actual work is done by adm_detach() under the resource's
+ * adm_mutex; the outcome is reported through drbd_adm_finish(). */
+int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct detach_parms parms = { };
+	enum drbd_ret_code retcode;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+
+	if (retcode == NO_ERROR && info->attrs[DRBD_NLA_DETACH_PARMS]) {
+		int err = detach_parms_from_attrs(&parms, info);
+
+		if (err) {
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+			retcode = ERR_MANDATORY_TAG;
+		}
+	}
+
+	if (retcode == NO_ERROR) {
+		mutex_lock(&adm_ctx.resource->adm_mutex);
+		retcode = adm_detach(adm_ctx.device, parms.force_detach);
+		mutex_unlock(&adm_ctx.resource->adm_mutex);
+	}
+
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+/* Returns true if at least one volume of @connection has a connection
+ * state of C_SYNC_SOURCE, C_SYNC_TARGET, C_PAUSED_SYNC_S or
+ * C_PAUSED_SYNC_T.  The peer device idr is walked under rcu_read_lock(). */
+static bool conn_resync_running(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	bool found = false;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		switch (peer_device->device->state.conn) {
+		case C_SYNC_SOURCE:
+		case C_SYNC_TARGET:
+		case C_PAUSED_SYNC_S:
+		case C_PAUSED_SYNC_T:
+			found = true;
+			break;
+		default:
+			break;
+		}
+		if (found)
+			break;
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+/* Returns true if at least one volume of @connection has a connection
+ * state of C_VERIFY_S or C_VERIFY_T.  The peer device idr is walked
+ * under rcu_read_lock(). */
+static bool conn_ov_running(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	bool found = false;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		int cstate = peer_device->device->state.conn;
+
+		if (cstate != C_VERIFY_S && cstate != C_VERIFY_T)
+			continue;
+		found = true;
+		break;
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+/* Check whether @new_net_conf may be applied to @connection.
+ *
+ * @old_net_conf is the currently active configuration, or NULL if there
+ * is none.  The checks run in a fixed order and the code of the first
+ * violated rule is returned; NO_ERROR means everything is acceptable.
+ * The disk_conf of attached volumes is read with rcu_dereference(), so
+ * the caller is expected to hold rcu_read_lock() (check_net_options()
+ * does). */
+static enum drbd_ret_code
+_check_net_options(struct drbd_connection *connection, struct net_conf *old_net_conf, struct net_conf *new_net_conf)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	/* While waiting for the peer's parameters with an agreed protocol
+	 * version below 100, these three settings must stay as they are. */
+	if (old_net_conf &&
+	    connection->cstate == C_WF_REPORT_PARAMS &&
+	    connection->agreed_pro_version < 100 &&
+	    (new_net_conf->wire_protocol != old_net_conf->wire_protocol ||
+	     new_net_conf->two_primaries != old_net_conf->two_primaries ||
+	     strcmp(new_net_conf->integrity_alg, old_net_conf->integrity_alg) != 0))
+		return ERR_NEED_APV_100;
+
+	if (new_net_conf->two_primaries) {
+		if (new_net_conf->wire_protocol != DRBD_PROT_C)
+			return ERR_NOT_PROTO_C;
+	} else if (conn_highest_role(connection) == R_PRIMARY &&
+		   conn_highest_peer(connection) == R_PRIMARY) {
+		return ERR_NEED_ALLOW_TWO_PRI;
+	}
+
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+
+		if (get_ldev(device)) {
+			enum drbd_fencing_p fencing =
+				rcu_dereference(device->ldev->disk_conf)->fencing;
+
+			/* drop the ldev reference before a possible return */
+			put_ldev(device);
+			if (fencing == FP_STONITH &&
+			    new_net_conf->wire_protocol == DRBD_PROT_A)
+				return ERR_STONITH_AND_PROT_A;
+		}
+		if (new_net_conf->discard_my_data &&
+		    device->state.role == R_PRIMARY)
+			return ERR_DISCARD_IMPOSSIBLE;
+	}
+
+	if (new_net_conf->wire_protocol != DRBD_PROT_A &&
+	    new_net_conf->on_congestion != OC_BLOCK)
+		return ERR_CONG_NOT_PROTO_A;
+
+	return NO_ERROR;
+}
+
+/* Validate @new_net_conf for @connection and make sure that every volume
+ * of the connection has a bitmap.
+ *
+ * The option checks are done by _check_net_options() against the current
+ * connection->net_conf under rcu_read_lock().  Afterwards drbd_bm_init()
+ * is called for each device that has no bitmap yet; this happens even if
+ * the option check failed.
+ *
+ * Returns ERR_NOMEM if a bitmap could not be set up, otherwise the result
+ * of _check_net_options(). */
+static enum drbd_ret_code
+check_net_options(struct drbd_connection *connection, struct net_conf *new_net_conf)
+{
+	/* Deliberately an automatic variable: with static storage this would
+	 * be a single result slot shared by all callers. */
+	enum drbd_ret_code rv;
+	struct drbd_peer_device *peer_device;
+	int i;
+
+	rcu_read_lock();
+	rv = _check_net_options(connection, rcu_dereference(connection->net_conf), new_net_conf);
+	rcu_read_unlock();
+
+	/* connection->peer_devices protected by genl_lock() here */
+	idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+		struct drbd_device *device = peer_device->device;
+		if (!device->bitmap) {
+			if (drbd_bm_init(device))
+				return ERR_NOMEM;
+		}
+	}
+
+	return rv;
+}
+
+struct crypto {	/*
+	 * Set of hash transforms that alloc_crypto() prepares for a new
+	 * net_conf.  Members stay NULL when the corresponding algorithm
+	 * name is empty or the allocation failed; free_crypto() releases
+	 * all four.
+	 */
+	struct crypto_hash *verify_tfm;		/* from net_conf->verify_alg */
+	struct crypto_hash *csums_tfm;		/* from net_conf->csums_alg */
+	struct crypto_hash *cram_hmac_tfm;	/* "hmac(<net_conf->cram_hmac_alg>)" */
+	struct crypto_hash *integrity_tfm;	/* from net_conf->integrity_alg */
+};
+
+/* Allocate the hash transform named @tfm_name and store it in *@tfm.
+ *
+ * An empty name means "not configured": *@tfm is left untouched and
+ * NO_ERROR is returned.  If the allocation fails, *@tfm is set to NULL
+ * and @err_alg is returned. */
+static int
+alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg)
+{
+	struct crypto_hash *hash;
+
+	if (tfm_name[0] == '\0')
+		return NO_ERROR;
+
+	hash = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(hash)) {
+		*tfm = NULL;
+		return err_alg;
+	}
+
+	*tfm = hash;
+	return NO_ERROR;
+}
+
+/* Allocate all hash transforms that @new_net_conf asks for into @crypto.
+ *
+ * The transforms are set up in the order csums, verify, integrity and
+ * finally the HMAC built as "hmac(<cram_hmac_alg>)".  The first failure
+ * stops the sequence and its error code is returned; transforms that
+ * were allocated before that point stay in @crypto, so the caller has to
+ * clean up with free_crypto(). */
+static enum drbd_ret_code
+alloc_crypto(struct crypto *crypto, struct net_conf *new_net_conf)
+{
+	char hmac_name[CRYPTO_MAX_ALG_NAME];
+	enum drbd_ret_code rv;
+
+	rv = alloc_hash(&crypto->csums_tfm, new_net_conf->csums_alg,
+			ERR_CSUMS_ALG);
+	if (rv == NO_ERROR)
+		rv = alloc_hash(&crypto->verify_tfm, new_net_conf->verify_alg,
+				ERR_VERIFY_ALG);
+	if (rv == NO_ERROR)
+		rv = alloc_hash(&crypto->integrity_tfm,
+				new_net_conf->integrity_alg, ERR_INTEGRITY_ALG);
+
+	/* Done if something failed or no cram-hmac algorithm is configured. */
+	if (rv != NO_ERROR || new_net_conf->cram_hmac_alg[0] == 0)
+		return rv;
+
+	snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
+		 new_net_conf->cram_hmac_alg);
+	return alloc_hash(&crypto->cram_hmac_tfm, hmac_name, ERR_AUTH_ALG);
+}
+
+/* Release every transform held in @crypto (cram_hmac, integrity, csums,
+ * verify, in that order).  The pointers in @crypto are not reset. */
+static void free_crypto(struct crypto *crypto)
+{
+	struct crypto_hash *tfms[] = {
+		crypto->cram_hmac_tfm,
+		crypto->integrity_tfm,
+		crypto->csums_tfm,
+		crypto->verify_tfm,
+	};
+	unsigned int idx;
+
+	for (idx = 0; idx < sizeof(tfms) / sizeof(tfms[0]); idx++)
+		crypto_free_hash(tfms[idx]);
+}
+
+/* Netlink handler: change the network options of a configured connection.
+ *
+ * A copy of the current net_conf is updated from the netlink attributes
+ * (after optionally resetting it to defaults), validated with
+ * check_net_options() and then published with rcu_assign_pointer().  The
+ * previous net_conf is freed after synchronize_rcu().
+ *
+ * Changing csums_alg while a resync is running, or verify_alg while an
+ * online verify is running, is refused.  While either is running the
+ * corresponding transform of the connection is left in place as well.
+ *
+ * Returns the drbd_adm_prepare() result if no reply skb is available,
+ * otherwise 0; the outcome is reported through drbd_adm_finish(). */
+int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+	struct drbd_connection *connection;
+	struct net_conf *old_net_conf, *new_net_conf = NULL;
+	int err;
+	int ovr; /* online verify running */
+	int rsr; /* re-sync running */
+	struct crypto crypto = { };
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto finish;
+
+	connection = adm_ctx.connection;
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+
+	new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
+	if (!new_net_conf) {
+		retcode = ERR_NOMEM;
+		/* conn_reconfig_start() was not called yet: skip "done" */
+		goto out;
+	}
+
+	conn_reconfig_start(connection);
+
+	mutex_lock(&connection->data.mutex);
+	mutex_lock(&connection->resource->conf_update);
+	old_net_conf = connection->net_conf;
+
+	if (!old_net_conf) {
+		drbd_msg_put_info(adm_ctx.reply_skb, "net conf missing, try connect");
+		retcode = ERR_INVALID_REQUEST;
+		goto fail;
+	}
+
+	/* Start from the active configuration, then apply the changes. */
+	*new_net_conf = *old_net_conf;
+	if (should_set_defaults(info))
+		set_net_conf_defaults(new_net_conf);
+
+	err = net_conf_from_attrs_for_change(new_net_conf, info);
+	if (err && err != -ENOMSG) {
+		retcode = ERR_MANDATORY_TAG;
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+		goto fail;
+	}
+
+	retcode = check_net_options(connection, new_net_conf);
+	if (retcode != NO_ERROR)
+		goto fail;
+
+	/* re-sync running */
+	rsr = conn_resync_running(connection);
+	if (rsr && strcmp(new_net_conf->csums_alg, old_net_conf->csums_alg)) {
+		retcode = ERR_CSUMS_RESYNC_RUNNING;
+		goto fail;
+	}
+
+	/* online verify running */
+	ovr = conn_ov_running(connection);
+	if (ovr && strcmp(new_net_conf->verify_alg, old_net_conf->verify_alg)) {
+		retcode = ERR_VERIFY_RUNNING;
+		goto fail;
+	}
+
+	retcode = alloc_crypto(&crypto, new_net_conf);
+	if (retcode != NO_ERROR)
+		goto fail;
+
+	/* Nothing below can fail any more: publish the new configuration. */
+	rcu_assign_pointer(connection->net_conf, new_net_conf);
+
+	if (!rsr) {
+		crypto_free_hash(connection->csums_tfm);
+		connection->csums_tfm = crypto.csums_tfm;
+		crypto.csums_tfm = NULL;
+	}
+	if (!ovr) {
+		crypto_free_hash(connection->verify_tfm);
+		connection->verify_tfm = crypto.verify_tfm;
+		crypto.verify_tfm = NULL;
+	}
+
+	/* While a resync or online verify is running, the connection keeps
+	 * its transform and the freshly allocated one stays in "crypto".
+	 * The success path never reaches free_crypto(), so release those
+	 * leftovers here.  Installed transforms were set to NULL above; a
+	 * NULL pointer is passed to crypto_free_hash() by free_crypto() on
+	 * the error paths as well. */
+	crypto_free_hash(crypto.csums_tfm);
+	crypto.csums_tfm = NULL;
+	crypto_free_hash(crypto.verify_tfm);
+	crypto.verify_tfm = NULL;
+
+	crypto_free_hash(connection->integrity_tfm);
+	connection->integrity_tfm = crypto.integrity_tfm;
+	if (connection->cstate >= C_WF_REPORT_PARAMS && connection->agreed_pro_version >= 100)
+		/* Do this without trying to take connection->data.mutex again.  */
+		__drbd_send_protocol(connection, P_PROTOCOL_UPDATE);
+
+	crypto_free_hash(connection->cram_hmac_tfm);
+	connection->cram_hmac_tfm = crypto.cram_hmac_tfm;
+
+	mutex_unlock(&connection->resource->conf_update);
+	mutex_unlock(&connection->data.mutex);
+	/* Wait for readers of the old net_conf before freeing it. */
+	synchronize_rcu();
+	kfree(old_net_conf);
+
+	if (connection->cstate >= C_WF_REPORT_PARAMS) {
+		struct drbd_peer_device *peer_device;
+		int vnr;
+
+		idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+			drbd_send_sync_param(peer_device);
+	}
+
+	goto done;
+
+ fail:
+	mutex_unlock(&connection->resource->conf_update);
+	mutex_unlock(&connection->data.mutex);
+	free_crypto(&crypto);
+	kfree(new_net_conf);
+ done:
+	conn_reconfig_done(connection);
+ out:
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
+{	/*
+	 * Netlink handler: set up the network configuration of a resource
+	 * and request C_UNCONNECTED for its first connection.
+	 *
+	 * Both endpoint addresses must be given and must not be in use by
+	 * any existing connection.  A new net_conf is built from defaults
+	 * plus the netlink attributes, validated, and installed together
+	 * with the hash transforms and the addresses.
+	 *
+	 * Returns the drbd_adm_prepare() result if no reply skb is
+	 * available, otherwise 0; the outcome is reported through
+	 * drbd_adm_finish().
+	 */
+	struct drbd_config_context adm_ctx;
+	struct drbd_peer_device *peer_device;
+	struct net_conf *old_net_conf, *new_net_conf = NULL;
+	struct crypto crypto = { };
+	struct drbd_resource *resource;
+	struct drbd_connection *connection;
+	enum drbd_ret_code retcode;
+	int i;
+	int err;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+	if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) {
+		drbd_msg_put_info(adm_ctx.reply_skb, "connection endpoint(s) missing");
+		retcode = ERR_INVALID_REQUEST;
+		goto out;
+	}
+
+	/* No need for _rcu here. All reconfiguration is
+	 * strictly serialized on genl_lock(). We are protected against
+	 * concurrent reconfiguration/addition/deletion.
+	 * Refuse addresses that any connection of any resource already uses. */
+	for_each_resource(resource, &drbd_resources) {
+		for_each_connection(connection, resource) {
+			if (nla_len(adm_ctx.my_addr) == connection->my_addr_len &&
+			    !memcmp(nla_data(adm_ctx.my_addr), &connection->my_addr,
+				    connection->my_addr_len)) {
+				retcode = ERR_LOCAL_ADDR;
+				goto out;
+			}
+
+			if (nla_len(adm_ctx.peer_addr) == connection->peer_addr_len &&
+			    !memcmp(nla_data(adm_ctx.peer_addr), &connection->peer_addr,
+				    connection->peer_addr_len)) {
+				retcode = ERR_PEER_ADDR;
+				goto out;
+			}
+		}
+	}
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);	/* released on the success path and at "fail" */
+	connection = first_connection(adm_ctx.resource);
+	conn_reconfig_start(connection);
+
+	if (connection->cstate > C_STANDALONE) {	/* network side is already set up */
+		retcode = ERR_NET_CONFIGURED;
+		goto fail;
+	}
+
+	/* allocation not in the IO path, drbdsetup / netlink process context */
+	new_net_conf = kzalloc(sizeof(*new_net_conf), GFP_KERNEL);
+	if (!new_net_conf) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	set_net_conf_defaults(new_net_conf);
+
+	err = net_conf_from_attrs(new_net_conf, info);
+	if (err && err != -ENOMSG) {	/* -ENOMSG is not treated as an error here */
+		retcode = ERR_MANDATORY_TAG;
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+		goto fail;
+	}
+
+	retcode = check_net_options(connection, new_net_conf);
+	if (retcode != NO_ERROR)
+		goto fail;
+
+	retcode = alloc_crypto(&crypto, new_net_conf);
+	if (retcode != NO_ERROR)
+		goto fail;
+
+	((char *)new_net_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;	/* force NUL termination */
+
+	drbd_flush_workqueue(&connection->sender_work);
+
+	mutex_lock(&adm_ctx.resource->conf_update);
+	old_net_conf = connection->net_conf;
+	if (old_net_conf) {	/* a net_conf is already installed: leave it alone */
+		retcode = ERR_NET_CONFIGURED;
+		mutex_unlock(&adm_ctx.resource->conf_update);
+		goto fail;
+	}
+	rcu_assign_pointer(connection->net_conf, new_net_conf);
+
+	conn_free_crypto(connection);	/* NOTE(review): presumably releases the connection's previous transforms - confirm */
+	connection->cram_hmac_tfm = crypto.cram_hmac_tfm;	/* from here on the connection holds the transforms; */
+	connection->integrity_tfm = crypto.integrity_tfm;	/* the success path does not call free_crypto() */
+	connection->csums_tfm = crypto.csums_tfm;
+	connection->verify_tfm = crypto.verify_tfm;
+
+	connection->my_addr_len = nla_len(adm_ctx.my_addr);
+	memcpy(&connection->my_addr, nla_data(adm_ctx.my_addr), connection->my_addr_len);
+	connection->peer_addr_len = nla_len(adm_ctx.peer_addr);
+	memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len);
+
+	mutex_unlock(&adm_ctx.resource->conf_update);
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, i) {	/* reset the per-device traffic counters */
+		struct drbd_device *device = peer_device->device;
+		device->send_cnt = 0;
+		device->recv_cnt = 0;
+	}
+	rcu_read_unlock();
+
+	retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
+
+	conn_reconfig_done(connection);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+
+fail:	/* nothing was installed: drop what was allocated so far */
+	free_crypto(&crypto);
+	kfree(new_net_conf);
+
+	conn_reconfig_done(connection);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:	/* reached directly only before adm_mutex was taken */
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection, bool force)
+{	/*
+	 * Request C_DISCONNECTING for @connection, with CS_HARD if @force
+	 * is set, and retry with adjusted requests for some results (see
+	 * the cases below).  If the final result is a success, the receiver
+	 * thread is stopped and C_STANDALONE is requested.
+	 *
+	 * Returns the result of the last C_DISCONNECTING request, except
+	 * that SS_ALREADY_STANDALONE is returned as SS_SUCCESS right away.
+	 */
+	enum drbd_state_rv rv;
+
+	rv = conn_request_state(connection, NS(conn, C_DISCONNECTING),
+			force ? CS_HARD : 0);
+
+	switch (rv) {
+	case SS_NOTHING_TO_DO:
+		break;
+	case SS_ALREADY_STANDALONE:	/* returns before the receiver stop below */
+		return SS_SUCCESS;
+	case SS_PRIMARY_NOP:
+		/* Our state checking code wants to see the peer outdated. */
+		rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0);
+
+		if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */
+			rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_VERBOSE);
+
+		break;
+	case SS_CW_FAILED_BY_PEER:
+		/* The peer probably wants to see us outdated. */
+		rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING,
+							disk, D_OUTDATED), 0);
+		if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) {	/* our disk cannot be outdated: retry with CS_HARD */
+			rv = conn_request_state(connection, NS(conn, C_DISCONNECTING),
+					CS_HARD);
+		}
+		break;
+	default:;
+		/* no special handling necessary */
+	}
+
+	if (rv >= SS_SUCCESS) {
+		enum drbd_state_rv rv2;
+		/* No one else can reconfigure the network while I am here.
+		 * The state handling only uses drbd_thread_stop_nowait(),
+		 * we want to really wait here until the receiver is no more.
+		 */
+		drbd_thread_stop(&connection->receiver);
+
+		/* Race breaker.  This additional state change request may be
+		 * necessary, if this was a forced disconnect during a receiver
+		 * restart.  We may have "killed" the receiver thread just
+		 * after drbd_receiver() returned.  Typically, we should be
+		 * C_STANDALONE already, now, and this becomes a no-op.
+		 */
+		rv2 = conn_request_state(connection, NS(conn, C_STANDALONE),
+				CS_VERBOSE | CS_HARD);
+		if (rv2 < SS_SUCCESS)	/* only logged; the caller still gets rv */
+			drbd_err(connection,
+				"unexpected rv2=%d in conn_try_disconnect()\n",
+				rv2);
+	}
+	return rv;
+}
+
+/* Netlink handler: tear down the network connection.
+ *
+ * The optional DRBD_NLA_DISCONNECT_PARMS attribute carries the
+ * force_disconnect flag.  conn_try_disconnect() does the work under the
+ * resource's adm_mutex.  A failing state change is reported as is, any
+ * success is reported as NO_ERROR.
+ *
+ * Returns the drbd_adm_prepare() result if no reply skb is available,
+ * otherwise 0; the outcome is reported through drbd_adm_finish(). */
+int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct disconnect_parms parms;
+	enum drbd_state_rv state_rv;
+	enum drbd_ret_code retcode;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto finish;
+
+	memset(&parms, 0, sizeof(parms));
+	if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) {
+		int err = disconnect_parms_from_attrs(&parms, info);
+
+		if (err) {
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+			retcode = ERR_MANDATORY_TAG;
+			goto finish;
+		}
+	}
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	state_rv = conn_try_disconnect(adm_ctx.connection, parms.force_disconnect);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+
+	retcode = NO_ERROR;
+	if (state_rv < SS_SUCCESS)
+		retcode = (enum drbd_ret_code)state_rv;  /* FIXME: Type mismatch. */
+
+ finish:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+/* Start the resync of newly added storage after an online grow.
+ *
+ * If local and peer role differ, the node with role R_PRIMARY becomes
+ * sync source.  With equal roles the RESOLVE_CONFLICTS flag of the
+ * connection decides.  The sync source starts the resync itself, the
+ * other side requests C_WF_SYNC_UUID. */
+void resync_after_online_grow(struct drbd_device *device)
+{
+	bool sync_source;
+
+	drbd_info(device, "Resync of new storage after online grow\n");
+
+	if (device->state.role == device->state.peer)
+		sync_source = test_bit(RESOLVE_CONFLICTS,
+				       &first_peer_device(device)->connection->flags);
+	else
+		sync_source = (device->state.role == R_PRIMARY);
+
+	if (!sync_source) {
+		_drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
+		return;
+	}
+
+	drbd_start_resync(device, C_SYNC_SOURCE);
+}
+
+/* Netlink handler: resize a device and/or change its activity log layout.
+ *
+ * Requires an attached disk.  The request is refused while the connection
+ * state is above C_CONNECTED, while both nodes are R_SECONDARY, or when
+ * no_resync is asked for with an agreed protocol version below 93.  If
+ * the requested size differs from the configured disk_size, a new
+ * disk_conf carrying the new size is installed.  A changed activity log
+ * layout is range checked and only accepted while connected, unless
+ * resize_force is set.  The size is then re-evaluated with
+ * drbd_determine_dev_size(); when connected, UUIDs and sizes are sent to
+ * the peer.
+ *
+ * Returns the drbd_adm_prepare() result if no reply skb is available,
+ * otherwise 0; the outcome is reported through drbd_adm_finish(). */
+int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
+	struct resize_parms rs;
+	struct drbd_device *device;
+	enum drbd_ret_code retcode;
+	enum determine_dev_size dd;
+	bool change_al_layout = false;
+	enum dds_flags ddsf;
+	sector_t u_size;
+	int err;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto finish;
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	device = adm_ctx.device;
+	if (!get_ldev(device)) {
+		retcode = ERR_NO_DISK;
+		goto fail;
+	}
+
+	/* Default to the current activity log layout, so that a request
+	 * without these attributes does not count as a layout change. */
+	memset(&rs, 0, sizeof(struct resize_parms));
+	rs.al_stripes = device->ldev->md.al_stripes;
+	rs.al_stripe_size = device->ldev->md.al_stripe_size_4k * 4;
+	if (info->attrs[DRBD_NLA_RESIZE_PARMS]) {
+		err = resize_parms_from_attrs(&rs, info);
+		if (err) {
+			retcode = ERR_MANDATORY_TAG;
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+			goto fail_ldev;
+		}
+	}
+
+	if (device->state.conn > C_CONNECTED) {
+		retcode = ERR_RESIZE_RESYNC;
+		goto fail_ldev;
+	}
+
+	if (device->state.role == R_SECONDARY &&
+	    device->state.peer == R_SECONDARY) {
+		retcode = ERR_NO_PRIMARY;
+		goto fail_ldev;
+	}
+
+	if (rs.no_resync && first_peer_device(device)->connection->agreed_pro_version < 93) {
+		retcode = ERR_NEED_APV_93;
+		goto fail_ldev;
+	}
+
+	rcu_read_lock();
+	u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
+	rcu_read_unlock();
+	if (u_size != (sector_t)rs.resize_size) {
+		/* Only allocated here; filled in and installed further down. */
+		new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
+		if (!new_disk_conf) {
+			retcode = ERR_NOMEM;
+			goto fail_ldev;
+		}
+	}
+
+	if (device->ldev->md.al_stripes != rs.al_stripes ||
+	    device->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) {
+		u32 al_size_k = rs.al_stripes * rs.al_stripe_size;
+
+		if (al_size_k > (16 * 1024 * 1024)) {
+			retcode = ERR_MD_LAYOUT_TOO_BIG;
+			goto fail_ldev;
+		}
+
+		if (al_size_k < MD_32kB_SECT/2) {
+			retcode = ERR_MD_LAYOUT_TOO_SMALL;
+			goto fail_ldev;
+		}
+
+		if (device->state.conn != C_CONNECTED && !rs.resize_force) {
+			retcode = ERR_MD_LAYOUT_CONNECTED;
+			goto fail_ldev;
+		}
+
+		change_al_layout = true;
+	}
+
+	if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev))
+		device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
+
+	if (new_disk_conf) {
+		mutex_lock(&device->resource->conf_update);
+		old_disk_conf = device->ldev->disk_conf;
+		*new_disk_conf = *old_disk_conf;
+		new_disk_conf->disk_size = (sector_t)rs.resize_size;
+		rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+		mutex_unlock(&device->resource->conf_update);
+		/* Wait for readers of the old disk_conf before freeing it. */
+		synchronize_rcu();
+		kfree(old_disk_conf);
+		/* Now owned by device->ldev; must not be freed by us any more. */
+		new_disk_conf = NULL;
+	}
+
+	ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
+	dd = drbd_determine_dev_size(device, ddsf, change_al_layout ? &rs : NULL);
+	drbd_md_sync(device);
+	put_ldev(device);
+	if (dd == DS_ERROR) {
+		retcode = ERR_NOMEM_BITMAP;
+		goto fail;
+	} else if (dd == DS_ERROR_SPACE_MD) {
+		retcode = ERR_MD_LAYOUT_NO_FIT;
+		goto fail;
+	} else if (dd == DS_ERROR_SHRINK) {
+		retcode = ERR_IMPLICIT_SHRINK;
+		goto fail;
+	}
+
+	if (device->state.conn == C_CONNECTED) {
+		if (dd == DS_GREW)
+			set_bit(RESIZE_PENDING, &device->flags);
+
+		drbd_send_uuids(first_peer_device(device));
+		drbd_send_sizes(first_peer_device(device), 1, ddsf);
+	}
+
+ fail:
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+
+ fail_ldev:
+	put_ldev(device);
+	/* The activity log layout checks bail out after new_disk_conf was
+	 * allocated but before it was installed; free it here.  It is NULL
+	 * on the other paths, and kfree(NULL) is a no-op. */
+	kfree(new_disk_conf);
+	goto fail;
+}
+
+/* Netlink handler: change the options of a resource.
+ *
+ * Works on a copy of the current res_opts, optionally reset to defaults,
+ * updates it from the netlink attributes and applies it with
+ * set_resource_options() under the resource's adm_mutex.  -ENOMEM from
+ * there is reported as ERR_NOMEM, any other error as ERR_INVALID_REQUEST.
+ *
+ * Returns the drbd_adm_prepare() result if no reply skb is available,
+ * otherwise 0; the outcome is reported through drbd_adm_finish(). */
+int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct res_opts res_opts;
+	enum drbd_ret_code retcode;
+	int err;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto finish;
+
+	res_opts = adm_ctx.resource->res_opts;
+	if (should_set_defaults(info))
+		set_res_opts_defaults(&res_opts);
+
+	err = res_opts_from_attrs(&res_opts, info);
+	if (err && err != -ENOMSG) {
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+		retcode = ERR_MANDATORY_TAG;
+		goto finish;
+	}
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	err = set_resource_options(adm_ctx.resource, &res_opts);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+
+	switch (err) {
+	case 0:
+		break;
+	case -ENOMEM:
+		retcode = ERR_NOMEM;
+		break;
+	default:
+		retcode = ERR_INVALID_REQUEST;
+		break;
+	}
+
+finish:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
+{	/*
+	 * Netlink handler: invalidate the local data of a minor.
+	 *
+	 * When C_STANDALONE and R_SECONDARY, the disk is set to
+	 * D_INCONSISTENT and drbd_bmio_set_n_write() is run on the bitmap.
+	 * Otherwise C_STARTING_SYNC_T is requested.  All of this happens
+	 * with IO suspended and under the resource's adm_mutex.
+	 *
+	 * Returns the drbd_adm_prepare() result if no reply skb is
+	 * available, otherwise 0; the outcome is reported through
+	 * drbd_adm_finish().
+	 */
+	struct drbd_config_context adm_ctx;
+	struct drbd_device *device;
+	int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	device = adm_ctx.device;
+
+	/* If there is still bitmap IO pending, probably because of a previous
+	 * resync just being finished, wait for it before requesting a new resync.
+	 * Also wait for its after_state_ch(). */
+	drbd_suspend_io(device);
+	wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+	drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
+
+	/* If we happen to be C_STANDALONE R_SECONDARY, just change to
+	 * D_INCONSISTENT, and set all bits in the bitmap.  Otherwise,
+	 * try to start a resync handshake as sync target for full sync.
+	 */
+	if (device->state.conn == C_STANDALONE && device->state.role == R_SECONDARY) {
+		retcode = drbd_request_state(device, NS(disk, D_INCONSISTENT));
+		if (retcode >= SS_SUCCESS) {	/* touch the bitmap only if the state change was accepted */
+			if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
+				"set_n_write from invalidate", BM_LOCKED_MASK))
+				retcode = ERR_IO_MD_DISK;
+		}
+	} else
+		retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T));
+	drbd_resume_io(device);	/* pairs with drbd_suspend_io() above */
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+/* Common body for netlink handlers that only need one state change on a
+ * minor: request (@mask, @val) via drbd_request_state() under the
+ * resource's adm_mutex and report the result through drbd_adm_finish().
+ *
+ * Returns the drbd_adm_prepare() result if no reply skb is available,
+ * otherwise 0. */
+static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info,
+		union drbd_state mask, union drbd_state val)
+{
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+
+	if (retcode == NO_ERROR) {
+		mutex_lock(&adm_ctx.resource->adm_mutex);
+		retcode = drbd_request_state(adm_ctx.device, mask, val);
+		mutex_unlock(&adm_ctx.resource->adm_mutex);
+	}
+
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+/* Bitmap IO callback: run drbd_bmio_set_n_write(), then suspend the
+ * activity log regardless of the outcome.  Returns the result of
+ * drbd_bmio_set_n_write(). */
+static int drbd_bmio_set_susp_al(struct drbd_device *device)
+{
+	const int err = drbd_bmio_set_n_write(device);
+
+	drbd_suspend_al(device);
+	return err;
+}
+
+int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	int retcode; /* drbd_ret_code, drbd_state_rv */
+	struct drbd_device *device;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	device = adm_ctx.device;
+
+	/* If there is still bitmap IO pending, probably because of a previous
+	 * resync just being finished, wait for it before requesting a new resync.
+	 * Also wait for its after_state_ch(). */
+	drbd_suspend_io(device);
+	wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+	drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
+
+	/* If we happen to be C_STANDALONE R_PRIMARY, just mark the peer disk
+	 * D_INCONSISTENT and set all bits in the bitmap.  Otherwise, try to
+	 * start a resync handshake as sync source for full sync.
+	 */
+	if (device->state.conn == C_STANDALONE && device->state.role == R_PRIMARY) {
+		/* The peer will get a resync upon connect anyways. Just make that
+		   into a full resync. */
+		retcode = drbd_request_state(device, NS(pdsk, D_INCONSISTENT));
+		if (retcode >= SS_SUCCESS) {
+			if (drbd_bitmap_io(device, &drbd_bmio_set_susp_al,
+				"set_n_write from invalidate_peer",
+				BM_LOCKED_SET_ALLOWED))
+				retcode = ERR_IO_MD_DISK;	/* non-zero result from drbd_bitmap_io() */
+		}
+	} else
+		retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S));
+	drbd_resume_io(device);	/* pairs with drbd_suspend_io() above */
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);	/* held across the user_isp = 1 request */
+	if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
+		retcode = ERR_PAUSE_IS_SET;	/* any other result leaves retcode at NO_ERROR */
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	union drbd_dev_state s;	/* state snapshot, used only to pick an error code */
+	enum drbd_ret_code retcode;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
+		s = adm_ctx.device->state;
+		if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
+			retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP :	/* aftr_isp takes precedence */
+				  s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR;
+		} else {
+			retcode = ERR_PAUSE_IS_CLEAR;	/* not in a paused-sync connection state */
+		}
+	}	/* any other result leaves retcode at NO_ERROR */
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info)
+{
+	return drbd_adm_simple_request_state(skb, info, NS(susp, 1)); /* NS() supplies mask and val for susp = 1 */
+}
+
+int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct drbd_device *device;
+	int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	device = adm_ctx.device;
+	if (test_bit(NEW_CUR_UUID, &device->flags)) {	/* a new current UUID is still pending */
+		drbd_uuid_new_current(device);
+		clear_bit(NEW_CUR_UUID, &device->flags);
+	}
+	drbd_suspend_io(device);
+	retcode = drbd_request_state(device, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); /* clear all three suspend flags */
+	if (retcode == SS_SUCCESS) {
+		if (device->state.conn < C_CONNECTED)
+			tl_clear(first_peer_device(device)->connection);
+		if (device->state.disk == D_DISKLESS || device->state.disk == D_FAILED)
+			tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO);
+	}
+	drbd_resume_io(device);	/* pairs with drbd_suspend_io() above */
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info)
+{
+	return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED)); /* request disk = D_OUTDATED */
+}
+
+static int nla_put_drbd_cfg_context(struct sk_buff *skb,
+				    struct drbd_resource *resource,
+				    struct drbd_connection *connection,
+				    struct drbd_device *device)
+{
+	struct nlattr *nla;
+	nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT);	/* everything below is nested in here */
+	if (!nla)
+		goto nla_put_failure;
+	if (device &&	/* device may be NULL: then no volume attribute is emitted */
+	    nla_put_u32(skb, T_ctx_volume, device->vnr))
+		goto nla_put_failure;
+	if (nla_put_string(skb, T_ctx_resource_name, resource->name))
+		goto nla_put_failure;
+	if (connection) {	/* connection may be NULL: then no addresses are emitted */
+		if (connection->my_addr_len &&
+		    nla_put(skb, T_ctx_my_addr, connection->my_addr_len, &connection->my_addr))
+			goto nla_put_failure;
+		if (connection->peer_addr_len &&
+		    nla_put(skb, T_ctx_peer_addr, connection->peer_addr_len, &connection->peer_addr))
+			goto nla_put_failure;
+	}
+	nla_nest_end(skb, nla);
+	return 0;
+
+nla_put_failure:
+	if (nla)	/* NULL only when nla_nest_start() itself failed */
+		nla_nest_cancel(skb, nla);
+	return -EMSGSIZE;	/* the only error this function reports */
+}
+
+/*
+ * If @resource has exactly one connection, return it; otherwise return NULL.
+ */
+static struct drbd_connection *the_only_connection(struct drbd_resource *resource)
+{
+	struct list_head *head = &resource->connections;
+
+	if (!list_empty(head) && head->next->next == head)
+		return list_first_entry(head, struct drbd_connection, connections);
+	return NULL;
+}
+
+int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
+		const struct sib_info *sib)
+{
+	struct drbd_resource *resource = device->resource;
+	struct state_info *si = NULL; /* for sizeof(si->member); */
+	struct nlattr *nla;
+	int got_ldev;
+	int err = 0;
+	int exclude_sensitive;
+
+	/* If sib != NULL, this is drbd_bcast_event, which anyone can listen
+	 * to.  So we better exclude_sensitive information.
+	 *
+	 * If sib == NULL, this is drbd_adm_get_status, executed synchronously
+	 * in the context of the requesting user process. Exclude sensitive
+	 * information, unless current has superuser.
+	 *
+	 * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and
+	 * relies on the current implementation of netlink_dump(), which
+	 * executes the dump callback successively from netlink_recvmsg(),
+	 * always in the context of the receiving process */
+	exclude_sensitive = sib || !capable(CAP_SYS_ADMIN);
+
+	got_ldev = get_ldev(device);	/* dropped again by put_ldev() on every exit path */
+
+	/* We need to add connection name and volume number information still.
+	 * Minor number is in drbd_genlmsghdr. */
+	if (nla_put_drbd_cfg_context(skb, resource, the_only_connection(resource), device))
+		goto nla_put_failure;
+
+	if (res_opts_to_skb(skb, &device->resource->res_opts, exclude_sensitive))
+		goto nla_put_failure;
+
+	rcu_read_lock();	/* disk_conf and net_conf are read via rcu_dereference() */
+	if (got_ldev) {
+		struct disk_conf *disk_conf;
+
+		disk_conf = rcu_dereference(device->ldev->disk_conf);
+		err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive);
+	}
+	if (!err) {
+		struct net_conf *nc;
+
+		nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+		if (nc)	/* no net_conf: nothing to add, not an error */
+			err = net_conf_to_skb(skb, nc, exclude_sensitive);
+	}
+	rcu_read_unlock();
+	if (err)
+		goto nla_put_failure;
+
+	nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO);
+	if (!nla)
+		goto nla_put_failure;
+	if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) ||
+	    nla_put_u32(skb, T_current_state, device->state.i) ||
+	    nla_put_u64(skb, T_ed_uuid, device->ed_uuid) ||
+	    nla_put_u64(skb, T_capacity, drbd_get_capacity(device->this_bdev)) ||
+	    nla_put_u64(skb, T_send_cnt, device->send_cnt) ||
+	    nla_put_u64(skb, T_recv_cnt, device->recv_cnt) ||
+	    nla_put_u64(skb, T_read_cnt, device->read_cnt) ||
+	    nla_put_u64(skb, T_writ_cnt, device->writ_cnt) ||
+	    nla_put_u64(skb, T_al_writ_cnt, device->al_writ_cnt) ||
+	    nla_put_u64(skb, T_bm_writ_cnt, device->bm_writ_cnt) ||
+	    nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&device->ap_bio_cnt)) ||
+	    nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&device->ap_pending_cnt)) ||
+	    nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&device->rs_pending_cnt)))
+		goto nla_put_failure;
+
+	if (got_ldev) {
+		/* Reuse the function-level err (0 at this point) rather than
+		 * shadowing it; the UUID array is copied under uuid_lock. */
+		spin_lock_irq(&device->ldev->md.uuid_lock);
+		err = nla_put(skb, T_uuids, sizeof(si->uuids), device->ldev->md.uuid);
+		spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+		if (err)
+			goto nla_put_failure;
+
+		if (nla_put_u32(skb, T_disk_flags, device->ldev->md.flags) ||
+		    nla_put_u64(skb, T_bits_total, drbd_bm_bits(device)) ||
+		    nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(device)))
+			goto nla_put_failure;
+		if (C_SYNC_SOURCE <= device->state.conn &&
+		    C_PAUSED_SYNC_T >= device->state.conn) {	/* resync counters only in this conn range */
+			if (nla_put_u64(skb, T_bits_rs_total, device->rs_total) ||
+			    nla_put_u64(skb, T_bits_rs_failed, device->rs_failed))
+				goto nla_put_failure;
+		}
+	}
+
+	if (sib) {	/* broadcast events carry reason-specific extras */
+		switch(sib->sib_reason) {
+		case SIB_SYNC_PROGRESS:
+		case SIB_GET_STATUS_REPLY:
+			break;
+		case SIB_STATE_CHANGE:
+			if (nla_put_u32(skb, T_prev_state, sib->os.i) ||
+			    nla_put_u32(skb, T_new_state, sib->ns.i))
+				goto nla_put_failure;
+			break;
+		case SIB_HELPER_POST:
+			if (nla_put_u32(skb, T_helper_exit_code,
+					sib->helper_exit_code))
+				goto nla_put_failure;
+			/* fall through */
+		case SIB_HELPER_PRE:
+			if (nla_put_string(skb, T_helper, sib->helper_name))
+				goto nla_put_failure;
+			break;
+		}
+	}
+	nla_nest_end(skb, nla);
+
+	if (0)	/* the assignment below is reachable only through the label */
+nla_put_failure:
+		err = -EMSGSIZE;	/* every failure is reported as -EMSGSIZE */
+	if (got_ldev)
+		put_ldev(device);
+	return err;
+}
+
+int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+	int err;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.device, NULL); /* sib == NULL: plain status reply */
+	if (err) {
+		nlmsg_free(adm_ctx.reply_skb);	/* reply is dropped, not sent */
+		return err;	/* NOTE(review): bypasses drbd_adm_finish(); confirm nothing from drbd_adm_prepare() stays held */
+	}
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+static int get_one_status(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct drbd_device *device;
+	struct drbd_genlmsghdr *dh;
+	struct drbd_resource *pos = (struct drbd_resource *)cb->args[0];
+	struct drbd_resource *resource = NULL;
+	struct drbd_resource *tmp;
+	unsigned volume = cb->args[1];
+
+	/* Open coded, deferred, iteration:
+	 * for_each_resource_safe(resource, tmp, &drbd_resources) {
+	 *      connection = "first connection of resource or undefined";
+	 *	idr_for_each_entry(&resource->devices, device, i) {
+	 *	  ...
+	 *	}
+	 * }
+	 * where resource is cb->args[0];
+	 * and i is cb->args[1];
+	 *
+	 * cb->args[2] indicates if we shall loop over all resources,
+	 * or just dump all volumes of a single resource.
+	 *
+	 * This may miss entries inserted after this dump started,
+	 * or entries deleted before they are reached.
+	 *
+	 * We need to make sure the device won't disappear while
+	 * we are looking at it, and revalidate our iterators
+	 * on each iteration.
+	 */
+
+	/* synchronize with conn_create()/drbd_destroy_connection() */
+	rcu_read_lock();
+	/* revalidate iterator position: pos is trusted only if still on the list */
+	for_each_resource_rcu(tmp, &drbd_resources) {
+		if (pos == NULL) {
+			/* first iteration */
+			pos = tmp;
+			resource = pos;
+			break;
+		}
+		if (tmp == pos) {
+			resource = pos;
+			break;
+		}
+	}
+	if (resource) {	/* stays NULL if pos was not found: nothing gets dumped */
+next_resource:
+		device = idr_get_next(&resource->devices, &volume);
+		if (!device) {
+			/* No more volumes to dump on this resource.
+			 * Advance resource iterator. */
+			pos = list_entry_rcu(resource->resources.next,
+					     struct drbd_resource, resources);
+			/* Did we dump any volume of this resource yet? */
+			if (volume != 0) {
+				/* If we reached the end of the list,
+				 * or only a single resource dump was requested,
+				 * we are done. */
+				if (&pos->resources == &drbd_resources || cb->args[2])
+					goto out;
+				volume = 0;
+				resource = pos;
+				goto next_resource;
+			}
+		}
+
+		dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+				cb->nlh->nlmsg_seq, &drbd_genl_family,
+				NLM_F_MULTI, DRBD_ADM_GET_STATUS);
+		if (!dh)
+			goto out;
+
+		if (!device) {
+			/* This is a resource without a single volume.
+			 * Surprisingly enough, it may have a network
+			 * configuration. */
+			struct drbd_connection *connection;
+
+			dh->minor = -1U;	/* there is no device, hence no minor to report */
+			dh->ret_code = NO_ERROR;
+			connection = the_only_connection(resource);
+			if (nla_put_drbd_cfg_context(skb, resource, connection, NULL))
+				goto cancel;
+			if (connection) {
+				struct net_conf *nc;
+
+				nc = rcu_dereference(connection->net_conf);
+				if (nc && net_conf_to_skb(skb, nc, 1) != 0)
+					goto cancel;
+			}
+			goto done;
+		}
+
+		D_ASSERT(device, device->vnr == volume);
+		D_ASSERT(device, device->resource == resource);
+
+		dh->minor = device_to_minor(device);
+		dh->ret_code = NO_ERROR;
+
+		if (nla_put_status_info(skb, device, NULL)) {
+cancel:
+			genlmsg_cancel(skb, dh);	/* drop the partially built message */
+			goto out;
+		}
+done:
+		genlmsg_end(skb, dh);
+	}
+
+out:
+	rcu_read_unlock();
+	/* where to start the next iteration */
+	cb->args[0] = (long)pos;
+	cb->args[1] = (pos == resource) ? volume + 1 : 0;	/* next volume, or 0 after moving on */
+
+	/* No more resources/volumes/minors found results in an empty skb.
+	 * Which will terminate the dump. */
+        return skb->len;
+}
+
+/*
+ * Request status of all resources, or of all volumes within a single resource.
+ *
+ * This is a dump, as the answer may not fit in a single reply skb otherwise.
+ * Which means we cannot use the family->attrbuf or other such members, because
+ * dump is NOT protected by the genl_lock().  During dump, we only have access
+ * to the incoming skb, and need to opencode "parsing" of the nlattr payload.
+ *
+ * Once things are setup properly, we call into get_one_status().
+ */
+int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
+	struct nlattr *nla;
+	const char *resource_name;
+	struct drbd_resource *resource;
+	int maxtype;
+
+	/* Is this a followup call? */
+	if (cb->args[0]) {
+		/* ... of a single resource dump,
+		 * and the resource iterator has been advanced already? */
+		if (cb->args[2] && cb->args[2] != cb->args[0])
+			return 0; /* DONE. */
+		goto dump;
+	}
+
+	/* First call (from netlink_dump_start).  We need to figure out
+	 * which resource(s) the user wants us to dump. */
+	nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen),
+			nlmsg_attrlen(cb->nlh, hdrlen),
+			DRBD_NLA_CFG_CONTEXT);
+
+	/* No explicit context given.  Dump all. */
+	if (!nla)
+		goto dump;
+	maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
+	nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name));
+	if (IS_ERR(nla))
+		return PTR_ERR(nla);	/* error from drbd_nla_find_nested() is passed through */
+	/* context given, but no name present? */
+	if (!nla)
+		return -EINVAL;
+	resource_name = nla_data(nla);
+	if (!*resource_name)	/* empty name */
+		return -ENODEV;
+	resource = drbd_find_resource(resource_name);
+	if (!resource)
+		return -ENODEV;
+
+	kref_put(&resource->kref, drbd_destroy_resource); /* get_one_status() revalidates the resource */
+
+	/* prime iterators, and set "filter" mode mark:
+	 * only dump this resource. */
+	cb->args[0] = (long)resource;
+	/* cb->args[1] = 0; passed in this way. */
+	cb->args[2] = (long)resource;
+
+dump:
+	return get_one_status(skb, cb);
+}
+
+int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+	struct timeout_parms tp;
+	int err;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	tp.timeout_type =	/* first match wins: outdated peer, then degraded, then default */
+		adm_ctx.device->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
+		test_bit(USE_DEGR_WFC_T, &adm_ctx.device->flags) ? UT_DEGRADED :
+		UT_DEFAULT;
+
+	err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp);
+	if (err) {
+		nlmsg_free(adm_ctx.reply_skb);	/* reply is dropped, not sent */
+		return err;	/* NOTE(review): bypasses drbd_adm_finish(); confirm nothing from drbd_adm_prepare() stays held */
+	}
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct drbd_device *device;
+	int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+	struct start_ov_parms parms;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	device = adm_ctx.device;
+
+	/* resume from last known position, if possible */
+	parms.ov_start_sector = device->ov_start_sector;
+	parms.ov_stop_sector = ULLONG_MAX;
+	if (info->attrs[DRBD_NLA_START_OV_PARMS]) {	/* caller-supplied values override the defaults above */
+		int err = start_ov_parms_from_attrs(&parms, info);
+		if (err) {
+			retcode = ERR_MANDATORY_TAG;
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+			goto out;
+		}
+	}
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+
+	/* w_make_ov_request expects position to be aligned */
+	device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1);
+	device->ov_stop_sector = parms.ov_stop_sector;
+
+	/* If there is still bitmap IO pending, e.g. previous resync or verify
+	 * just being finished, wait for it before requesting a new resync. */
+	drbd_suspend_io(device);
+	wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+	retcode = drbd_request_state(device, NS(conn, C_VERIFY_S)); /* state request result is the reply code */
+	drbd_resume_io(device);
+
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+
+int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct drbd_device *device;
+	enum drbd_ret_code retcode;
+	int skip_initial_sync = 0;
+	int err;
+	struct new_c_uuid_parms args;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out_nolock;
+
+	device = adm_ctx.device;
+	memset(&args, 0, sizeof(args));	/* without the attribute, args stays zeroed (clear_bm == 0) */
+	if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) {
+		err = new_c_uuid_parms_from_attrs(&args, info);
+		if (err) {
+			retcode = ERR_MANDATORY_TAG;
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+			goto out_nolock;
+		}
+	}
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */
+
+	if (!get_ldev(device)) {
+		retcode = ERR_NO_DISK;
+		goto out;	/* no ldev reference was taken, so skip put_ldev() */
+	}
+
+	/* this is "skip initial sync", assume to be clean */
+	if (device->state.conn == C_CONNECTED &&
+	    first_peer_device(device)->connection->agreed_pro_version >= 90 &&
+	    device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
+		drbd_info(device, "Preparing to skip initial sync\n");
+		skip_initial_sync = 1;
+	} else if (device->state.conn != C_STANDALONE) {	/* otherwise only allowed while C_STANDALONE */
+		retcode = ERR_CONNECTED;
+		goto out_dec;
+	}
+
+	drbd_uuid_set(device, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
+	drbd_uuid_new_current(device); /* New current, previous to UI_BITMAP */
+
+	if (args.clear_bm) {
+		err = drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
+			"clear_n_write from new_c_uuid", BM_LOCKED_MASK);
+		if (err) {
+			drbd_err(device, "Writing bitmap failed with %d\n", err);
+			retcode = ERR_IO_MD_DISK;	/* recorded; the code below still runs */
+		}
+		if (skip_initial_sync) {
+			drbd_send_uuids_skip_initial_sync(first_peer_device(device));
+			_drbd_uuid_set(device, UI_BITMAP, 0);
+			drbd_print_uuids(device, "cleared bitmap UUID");
+			spin_lock_irq(&device->resource->req_lock);	/* _drbd_set_state() is called under req_lock */
+			_drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+					CS_VERBOSE, NULL);
+			spin_unlock_irq(&device->resource->req_lock);
+		}
+	}
+
+	drbd_md_sync(device);
+out_dec:
+	put_ldev(device);	/* pairs with the successful get_ldev() above */
+out:
+	mutex_unlock(device->state_mutex);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out_nolock:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+static enum drbd_ret_code
+drbd_check_resource_name(struct drbd_config_context *adm_ctx)
+{
+	const char *res_name = adm_ctx->resource_name;
+	const int missing = !res_name || res_name[0] == '\0';
+
+	/* A name must be present, and it must not contain a slash:
+	 * if we want to use these in sysfs/configfs/debugfs some day,
+	 * slashes cannot be allowed. */
+	if (!missing && strchr(res_name, '/') == NULL)
+		return NO_ERROR;
+
+	drbd_msg_put_info(adm_ctx->reply_skb,
+			  missing ? "resource name missing" : "invalid resource name");
+	return missing ? ERR_MANDATORY_TAG : ERR_INVALID_REQUEST;
+}
+
+int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+	struct res_opts res_opts;
+	int err;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, 0);	/* flags 0: no DRBD_ADM_NEED_* bit is passed */
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	set_res_opts_defaults(&res_opts);
+	err = res_opts_from_attrs(&res_opts, info);
+	if (err && err != -ENOMSG) {	/* -ENOMSG is not treated as an error here */
+		retcode = ERR_MANDATORY_TAG;
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+		goto out;
+	}
+
+	retcode = drbd_check_resource_name(&adm_ctx);
+	if (retcode != NO_ERROR)
+		goto out;
+
+	if (adm_ctx.resource) {	/* a resource was already found for this request */
+		if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) {
+			retcode = ERR_INVALID_REQUEST;
+			drbd_msg_put_info(adm_ctx.reply_skb, "resource exists");
+		}
+		/* else: still NO_ERROR */
+		goto out;
+	}
+
+	/* not yet safe for genl_family.parallel_ops */
+	if (!conn_create(adm_ctx.resource_name, &res_opts))
+		retcode = ERR_NOMEM;	/* every conn_create() failure is reported as ERR_NOMEM */
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct drbd_genlmsghdr *dh = info->userhdr;	/* carries the requested minor */
+	enum drbd_ret_code retcode;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	if (dh->minor > MINORMASK) {
+		drbd_msg_put_info(adm_ctx.reply_skb, "requested minor out of range");
+		retcode = ERR_INVALID_REQUEST;
+		goto out;
+	}
+	if (adm_ctx.volume > DRBD_VOLUME_MAX) {
+		drbd_msg_put_info(adm_ctx.reply_skb, "requested volume id out of range");
+		retcode = ERR_INVALID_REQUEST;
+		goto out;
+	}
+
+	/* drbd_adm_prepare made sure already
+	 * that first_peer_device(device)->connection and device->vnr match the request. */
+	if (adm_ctx.device) {	/* the device already exists */
+		if (info->nlhdr->nlmsg_flags & NLM_F_EXCL)
+			retcode = ERR_MINOR_EXISTS;
+		/* else: still NO_ERROR */
+		goto out;
+	}
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);	/* held across device creation */
+	retcode = drbd_create_device(&adm_ctx, dh->minor);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+static enum drbd_ret_code adm_del_minor(struct drbd_device *device)
+{
+	/* Only a diskless Secondary may be deleted.  The connection state is
+	 * deliberately not required to be C_STANDALONE: we may want to
+	 * delete a minor from a live replication group. */
+	if (device->state.disk != D_DISKLESS ||
+	    device->state.role != R_SECONDARY)
+		return ERR_MINOR_CONFIGURED;
+
+	_drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS),
+			    CS_VERBOSE + CS_WAIT_COMPLETE);
+	drbd_delete_device(device);
+	return NO_ERROR;
+}
+
+int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	retcode = adm_del_minor(adm_ctx.device);	/* NO_ERROR or ERR_MINOR_CONFIGURED */
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct drbd_resource *resource;
+	struct drbd_connection *connection;
+	struct drbd_device *device;
+	int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+	unsigned i;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto finish;
+
+	resource = adm_ctx.resource;
+	mutex_lock(&resource->adm_mutex);
+	/* demote every volume, then disconnect, one connection at a time */
+	for_each_connection(connection, resource) {
+		struct drbd_peer_device *peer_device;
+
+		idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+			retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0);
+			if (retcode < SS_SUCCESS) {
+				drbd_msg_put_info(adm_ctx.reply_skb, "failed to demote");
+				goto out;
+			}
+		}
+
+		retcode = conn_try_disconnect(connection, 0);
+		if (retcode < SS_SUCCESS) {
+			drbd_msg_put_info(adm_ctx.reply_skb, "failed to disconnect");
+			goto out;
+		}
+	}
+
+	/* detach */
+	idr_for_each_entry(&resource->devices, device, i) {
+		retcode = adm_detach(device, 0);
+		if (retcode < SS_SUCCESS || retcode > NO_ERROR) {	/* fails on either kind of error code */
+			drbd_msg_put_info(adm_ctx.reply_skb, "failed to detach");
+			goto out;
+		}
+	}
+
+	/* If we reach this, all volumes (of this connection) are Secondary,
+	 * Disconnected, Diskless, aka Unconfigured. Make sure all threads have
+	 * actually stopped, state handling only does drbd_thread_stop_nowait(). */
+	for_each_connection(connection, resource)
+		drbd_thread_stop(&connection->worker);
+
+	/* Now, nothing can fail anymore */
+
+	/* delete volumes */
+	idr_for_each_entry(&resource->devices, device, i) {
+		retcode = adm_del_minor(device);
+		if (retcode != NO_ERROR) {
+			/* "can not happen" */
+			drbd_msg_put_info(adm_ctx.reply_skb, "failed to delete volume");
+			goto out;
+		}
+	}
+
+	list_del_rcu(&resource->resources);	/* unlink from the list of resources */
+	synchronize_rcu();	/* wait for RCU readers before freeing */
+	drbd_free_resource(resource);
+	retcode = NO_ERROR;
+out:
+	mutex_unlock(&resource->adm_mutex);	/* NOTE(review): also reached after drbd_free_resource(); presumably adm_ctx still holds a reference -- confirm */
+finish:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct drbd_resource *resource;
+	struct drbd_connection *connection;
+	enum drbd_ret_code retcode;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto finish;
+
+	resource = adm_ctx.resource;
+	mutex_lock(&resource->adm_mutex);
+	for_each_connection(connection, resource) {
+		if (connection->cstate > C_STANDALONE) {	/* refuse while any connection is beyond C_STANDALONE */
+			retcode = ERR_NET_CONFIGURED;
+			goto out;
+		}
+	}
+	if (!idr_is_empty(&resource->devices)) {	/* refuse while the resource still has devices */
+		retcode = ERR_RES_IN_USE;
+		goto out;
+	}
+
+	list_del_rcu(&resource->resources);	/* unlink from the list of resources */
+	for_each_connection(connection, resource)
+		drbd_thread_stop(&connection->worker);
+	synchronize_rcu();	/* wait for RCU readers before freeing */
+	drbd_free_resource(resource);
+	retcode = NO_ERROR;
+out:
+	mutex_unlock(&resource->adm_mutex);	/* NOTE(review): also reached after drbd_free_resource(); presumably adm_ctx still holds a reference -- confirm */
+finish:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
+{
+	static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
+	struct sk_buff *msg;
+	struct drbd_genlmsghdr *d_out;
+	unsigned seq;
+	int err = -ENOMEM;	/* reported if genlmsg_new() fails */
+
+	if (sib->sib_reason == SIB_SYNC_PROGRESS) {	/* progress events: at most one per HZ jiffies */
+		if (time_after(jiffies, device->rs_last_bcast + HZ))
+			device->rs_last_bcast = jiffies;
+		else
+			return;
+	}
+
+	seq = atomic_inc_return(&drbd_genl_seq);
+	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+	if (!msg)
+		goto failed;
+
+	err = -EMSGSIZE;	/* reported if building the message fails */
+	d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT);
+	if (!d_out) /* cannot happen, but anyways. */
+		goto nla_put_failure;
+	d_out->minor = device_to_minor(device);
+	d_out->ret_code = NO_ERROR;
+
+	if (nla_put_status_info(msg, device, sib))
+		goto nla_put_failure;
+	genlmsg_end(msg, d_out);
+	err = drbd_genl_multicast_events(msg, 0);
+	/* msg has been consumed or freed in netlink_broadcast() */
+	if (err && err != -ESRCH)	/* -ESRCH is not logged as a failure */
+		goto failed;
+
+	return;
+
+nla_put_failure:
+	nlmsg_free(msg);	/* msg was never handed to the multicast call */
+failed:
+	drbd_err(device, "Error %d while broadcasting event. "
+			"Event seq:%u sib_reason:%u\n",
+			err, seq, sib->sib_reason);
+}
diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c
new file mode 100644
index 00000000000..b2d4791498a
--- /dev/null
+++ b/drivers/block/drbd/drbd_nla.c
@@ -0,0 +1,54 @@
+#include <linux/kernel.h>
+#include <net/netlink.h>
+#include <linux/drbd_genl_api.h>
+#include "drbd_nla.h"
+
+static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla)
+{
+	struct nlattr *head = nla_data(nla);	/* first attribute nested inside @nla */
+	int len = nla_len(nla);
+	int rem;
+
+	/*
+	 * validate_nla (called from nla_parse_nested) ignores attributes
+	 * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag.
+	 * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY
+	 * flag set also, check and remove that flag before calling
+	 * nla_parse_nested.
+	 */
+
+	nla_for_each_attr(nla, head, len, rem) {	/* the nla parameter is reused as the cursor */
+		if (nla->nla_type & DRBD_GENLA_F_MANDATORY) {
+			nla->nla_type &= ~DRBD_GENLA_F_MANDATORY;	/* modifies the message in place */
+			if (nla_type(nla) > maxtype)
+				return -EOPNOTSUPP;	/* flagged mandatory, but beyond maxtype */
+		}
+	}
+	return 0;
+}
+
+int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+			  const struct nla_policy *policy)
+{
+	/* The DRBD_GENLA_F_MANDATORY check runs before the generic parser. */
+	const int mandatory_err = drbd_nla_check_mandatory(maxtype, nla);
+
+	if (mandatory_err)
+		return mandatory_err;
+
+	return nla_parse_nested(tb, maxtype, nla, policy);
+}
+
+struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype)
+{
+	int err;
+	/*
+	 * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and
+	 * we don't know about that attribute, reject all the nested
+	 * attributes.
+	 */
+	err = drbd_nla_check_mandatory(maxtype, nla);
+	if (err)
+		return ERR_PTR(err);	/* callers must test the result with IS_ERR() */
+	return nla_find_nested(nla, attrtype);	/* result of the generic lookup, returned as is */
+}
diff --git a/drivers/block/drbd/drbd_nla.h b/drivers/block/drbd/drbd_nla.h
new file mode 100644
index 00000000000..679c2d5b453
--- /dev/null
+++ b/drivers/block/drbd/drbd_nla.h
@@ -0,0 +1,8 @@
+#ifndef __DRBD_NLA_H
+#define __DRBD_NLA_H
+
+extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+				 const struct nla_policy *policy);	/* mandatory-flag check, then nla_parse_nested() */
+extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype); /* may return ERR_PTR() */
+
+#endif  /* __DRBD_NLA_H */
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
new file mode 100644
index 00000000000..89736bdbbc7
--- /dev/null
+++ b/drivers/block/drbd/drbd_proc.c
@@ -0,0 +1,333 @@
+/*
+   drbd_proc.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+
+static int drbd_proc_open(struct inode *inode, struct file *file);
+static int drbd_proc_release(struct inode *inode, struct file *file);
+
+/* proc entry and its file operations: a seq_file that is read via seq_read() */
+struct proc_dir_entry *drbd_proc;
+const struct file_operations drbd_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= drbd_proc_open,	/* single_open() of drbd_seq_show() */
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= drbd_proc_release,	/* single_release() plus module_put() */
+};
+
+static void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
+{
+	/* v is a rate in kB/sec; TiByte/sec is not expected yet. */
+	if (v < 1000) {
+		seq_printf(seq, "%ld", v);
+	} else if (likely(v < 1000000)) {
+		seq_printf(seq, "%ld,%03ld", v / 1000, v % 1000);
+	} else {
+		/* cool: at least a GiByte/s, print three groups */
+		seq_printf(seq, "%ld,", v / 1000000);
+		seq_printf(seq, "%03ld,%03ld", v % 1000000 / 1000, v % 1000);
+	}
+}
+
+/*lge
+ * Print resync/verify progress of @device to @seq.
+ * progress bars shamelessly adapted from drivers/md/md.c; output looks like
+ *	[=====>..............] sync'ed: 33.5% (23456/123456)K
+ *	finish: 2:20:20 speed: 6,345 (6,456) K/sec
+ */
+static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq)
+{
+	unsigned long db, dt, dbdt, rt, rs_left;
+	unsigned int res;	/* printed as res/10 "." res%10 percent, i.e. per mille */
+	int i, x, y, stalled = 0;
+
+	drbd_get_syncer_progress(device, &rs_left, &res);
+
+	x = res/50;
+	y = 20-x;
+	seq_printf(seq, "\t[");
+	for (i = 1; i < x; i++)
+		seq_printf(seq, "=");
+	seq_printf(seq, ">");
+	for (i = 0; i < y; i++)
+		seq_printf(seq, ".");
+	seq_printf(seq, "] ");
+
+	if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+		seq_printf(seq, "verified:");
+	else
+		seq_printf(seq, "sync'ed:");
+	seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
+
+	/* if more than a few GB, display in MB */
+	if (device->rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
+		seq_printf(seq, "(%lu/%lu)M",
+			    (unsigned long) Bit2KB(rs_left >> 10),
+			    (unsigned long) Bit2KB(device->rs_total >> 10));
+	else
+		seq_printf(seq, "(%lu/%lu)K",
+			    (unsigned long) Bit2KB(rs_left),
+			    (unsigned long) Bit2KB(device->rs_total));
+	seq_printf(seq, "\n\t");	/* "finish:" starts a new line for both units */
+
+	/* see drivers/md/md.c
+	 * We do not want to overflow, so the order of operands and
+	 * the * 100 / 100 trick are important. We do a +1 to be
+	 * safe against division by zero. We only estimate anyway.
+	 *
+	 * dt: time from mark until now
+	 * db: blocks written from mark until now
+	 * rt: remaining time
+	 */
+	/* Rolling marks. last_mark+1 may just now be modified.  last_mark+2 is
+	 * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at
+	 * least DRBD_SYNC_MARK_STEP time before it will be modified. */
+	/* ------------------------ ~18s average ------------------------ */
+	i = (device->rs_last_mark + 2) % DRBD_SYNC_MARKS;
+	dt = (jiffies - device->rs_mark_time[i]) / HZ;
+	if (dt > 180)
+		stalled = 1;
+
+	if (!dt)
+		dt++;
+	db = device->rs_mark_left[i] - rs_left;
+	rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
+
+	seq_printf(seq, "finish: %lu:%02lu:%02lu",
+		rt / 3600, (rt % 3600) / 60, rt % 60);
+
+	dbdt = Bit2KB(db/dt);
+	seq_printf(seq, " speed: ");
+	seq_printf_with_thousands_grouping(seq, dbdt);
+	seq_printf(seq, " (");
+	/* ------------------------- ~3s average ------------------------ */
+	if (proc_details >= 1) {
+		/* this is what drbd_rs_should_slow_down() uses */
+		i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
+		dt = (jiffies - device->rs_mark_time[i]) / HZ;
+		if (!dt)
+			dt++;
+		db = device->rs_mark_left[i] - rs_left;
+		dbdt = Bit2KB(db/dt);
+		seq_printf_with_thousands_grouping(seq, dbdt);
+		seq_printf(seq, " -- ");
+	}
+
+	/* --------------------- long term average ---------------------- */
+	/* mean speed since syncer started
+	 * we do account for PausedSync periods */
+	dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
+	if (dt == 0)
+		dt = 1;
+	db = device->rs_total - rs_left;
+	dbdt = Bit2KB(db/dt);
+	seq_printf_with_thousands_grouping(seq, dbdt);
+	seq_printf(seq, ")");
+
+	if (device->state.conn == C_SYNC_TARGET ||
+	    device->state.conn == C_VERIFY_S) {
+		seq_printf(seq, " want: ");
+		seq_printf_with_thousands_grouping(seq, device->c_sync_rate);
+	}
+	seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
+
+	if (proc_details >= 1) {
+		/* 64 bit:
+		 * we convert to sectors in the display below. */
+		unsigned long bm_bits = drbd_bm_bits(device);
+		unsigned long bit_pos;
+		unsigned long long stop_sector = 0;
+		if (device->state.conn == C_VERIFY_S ||
+		    device->state.conn == C_VERIFY_T) {
+			bit_pos = bm_bits - device->ov_left;
+			if (verify_can_do_stop_sector(device))
+				stop_sector = device->ov_stop_sector;
+		} else
+			bit_pos = device->bm_resync_fo;
+		/* Total sectors may be slightly off for oddly
+		 * sized devices. So what. */
+		seq_printf(seq,
+			"\t%3d%% sector pos: %llu/%llu",
+			(int)(bit_pos / (bm_bits/100+1)),
+			(unsigned long long)bit_pos * BM_SECT_PER_BIT,
+			(unsigned long long)bm_bits * BM_SECT_PER_BIT);
+		if (stop_sector != 0 && stop_sector != ULLONG_MAX)
+			seq_printf(seq, " stop sector: %llu", stop_sector);
+		seq_printf(seq, "\n");
+	}
+}
+
+static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
+{
+	struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
+	const char *no_writes = ext->flags & BME_NO_WRITES ? "NO_WRITES" : "---------";
+	const char *locked = ext->flags & BME_LOCKED ? "LOCKED" : "------";
+
+	/* one line per extent: its rs_left, then the two flag columns */
+	seq_printf(seq, "%5d %s %s\n", ext->rs_left, no_writes, locked);
+}
+
+static int drbd_seq_show(struct seq_file *seq, void *v)
+{
+	int i, prev_i = -1;
+	const char *sn;
+	struct drbd_device *device;
+	struct net_conf *nc;
+	char wp;	/* wire protocol as a letter; ' ' if there is no net_conf */
+
+	static char write_ordering_chars[] = {
+		[WO_none] = 'n',
+		[WO_drain_io] = 'd',
+		[WO_bdev_flush] = 'f',
+	};
+
+	seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
+		   API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag());
+
+	/* After this banner: one entry per device in drbd_devices.  Legend:
+	  cs .. connection state
+	  ro .. node role (local/remote)
+	  ds .. disk state (local/remote)
+	     protocol, then various flags
+	  ns .. network send
+	  nr .. network receive
+	  dw .. disk write
+	  dr .. disk read
+	  al .. activity log write count
+	  bm .. bitmap update write count
+	  lo .. device->local_cnt
+	  pe .. pending (waiting for ack or data reply)
+	  ua .. unack'd (still need to send ack or data reply)
+	  ap .. application requests accepted, but not yet completed
+	  ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending
+	  wo .. write ordering mode currently in use
+	 oos .. known out-of-sync kB
+	*/
+
+	rcu_read_lock();
+	idr_for_each_entry(&drbd_devices, device, i) {
+		if (prev_i != i - 1)
+			seq_printf(seq, "\n");	/* gap in the device ids */
+		prev_i = i;
+
+		sn = drbd_conn_str(device->state.conn);
+
+		if (device->state.conn == C_STANDALONE &&
+		    device->state.disk == D_DISKLESS &&
+		    device->state.role == R_SECONDARY) {
+			seq_printf(seq, "%2d: cs:Unconfigured\n", i);
+		} else {
+			/* reset device->congestion_reason */
+			bdi_rw_congested(&device->rq_queue->backing_dev_info);
+
+			nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+			wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' ';
+			seq_printf(seq,
+			   "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
+			   "    ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
+			   "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
+			   i, sn,
+			   drbd_role_str(device->state.role),
+			   drbd_role_str(device->state.peer),
+			   drbd_disk_str(device->state.disk),
+			   drbd_disk_str(device->state.pdsk),
+			   wp,
+			   drbd_suspended(device) ? 's' : 'r',
+			   device->state.aftr_isp ? 'a' : '-',
+			   device->state.peer_isp ? 'p' : '-',
+			   device->state.user_isp ? 'u' : '-',
+			   device->congestion_reason ?: '-',
+			   test_bit(AL_SUSPENDED, &device->flags) ? 's' : '-',
+			   device->send_cnt/2,	/* NOTE(review): /2 presumably sectors -> kB; confirm */
+			   device->recv_cnt/2,
+			   device->writ_cnt/2,
+			   device->read_cnt/2,
+			   device->al_writ_cnt,
+			   device->bm_writ_cnt,
+			   atomic_read(&device->local_cnt),
+			   atomic_read(&device->ap_pending_cnt) +
+			   atomic_read(&device->rs_pending_cnt),
+			   atomic_read(&device->unacked_cnt),
+			   atomic_read(&device->ap_bio_cnt),
+			   first_peer_device(device)->connection->epochs,
+			   write_ordering_chars[first_peer_device(device)->connection->write_ordering]
+			);
+			seq_printf(seq, " oos:%llu\n",
+				   Bit2KB((unsigned long long)
+					   drbd_bm_total_weight(device)));
+		}
+		if (device->state.conn == C_SYNC_SOURCE ||
+		    device->state.conn == C_SYNC_TARGET ||
+		    device->state.conn == C_VERIFY_S ||
+		    device->state.conn == C_VERIFY_T)
+			drbd_syncer_progress(device, seq);
+
+		if (proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) {
+			lc_seq_printf_stats(seq, device->resync);
+			lc_seq_printf_stats(seq, device->act_log);
+			put_ldev(device);	/* pairs with get_ldev_if_state() above */
+		}
+
+		if (proc_details >= 2) {
+			if (device->resync) {
+				lc_seq_dump_details(seq, device->resync, "rs_left",
+					resync_dump_detail);
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+static int drbd_proc_open(struct inode *inode, struct file *file)
+{
+	int rc;
+
+	/* hold a module reference for as long as the file is open */
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+	rc = single_open(file, drbd_seq_show, PDE_DATA(inode));
+	if (rc)
+		module_put(THIS_MODULE);	/* open failed: drop it again */
+	return rc;
+}
+
+static int drbd_proc_release(struct inode *inode, struct file *file)
+{
+	module_put(THIS_MODULE);	/* pairs with try_module_get() in drbd_proc_open() */
+	return single_release(inode, file);
+}
+
+/* PROC FS stuff end */
diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h
new file mode 100644
index 00000000000..2da9104a385
--- /dev/null
+++ b/drivers/block/drbd/drbd_protocol.h
@@ -0,0 +1,307 @@
+#ifndef __DRBD_PROTOCOL_H
+#define __DRBD_PROTOCOL_H
+
+enum drbd_packet {
+	/* receiver (data socket) */
+	P_DATA		      = 0x00,
+	P_DATA_REPLY	      = 0x01, /* Response to P_DATA_REQUEST */
+	P_RS_DATA_REPLY	      = 0x02, /* Response to P_RS_DATA_REQUEST */
+	P_BARRIER	      = 0x03,
+	P_BITMAP	      = 0x04,
+	P_BECOME_SYNC_TARGET  = 0x05,
+	P_BECOME_SYNC_SOURCE  = 0x06,
+	P_UNPLUG_REMOTE	      = 0x07, /* Used at various times to hint the peer */
+	P_DATA_REQUEST	      = 0x08, /* Used to ask for a data block */
+	P_RS_DATA_REQUEST     = 0x09, /* Used to ask for a data block for resync */
+	P_SYNC_PARAM	      = 0x0a,
+	P_PROTOCOL	      = 0x0b,
+	P_UUIDS		      = 0x0c,
+	P_SIZES		      = 0x0d,
+	P_STATE		      = 0x0e,
+	P_SYNC_UUID	      = 0x0f,
+	P_AUTH_CHALLENGE      = 0x10,
+	P_AUTH_RESPONSE	      = 0x11,
+	P_STATE_CHG_REQ	      = 0x12,
+
+	/* asender (meta socket) */
+	P_PING		      = 0x13,
+	P_PING_ACK	      = 0x14,
+	P_RECV_ACK	      = 0x15, /* Used in protocol B */
+	P_WRITE_ACK	      = 0x16, /* Used in protocol C */
+	P_RS_WRITE_ACK	      = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
+	P_SUPERSEDED	      = 0x18, /* Used in proto C, two-primaries conflict detection */
+	P_NEG_ACK	      = 0x19, /* Sent if local disk is unusable */
+	P_NEG_DREPLY	      = 0x1a, /* Local disk is broken... */
+	P_NEG_RS_DREPLY	      = 0x1b, /* Local disk is broken... */
+	P_BARRIER_ACK	      = 0x1c,
+	P_STATE_CHG_REPLY     = 0x1d,
+
+	/* "new" commands, no longer fitting into the ordering scheme above */
+
+	P_OV_REQUEST	      = 0x1e, /* data socket */
+	P_OV_REPLY	      = 0x1f,
+	P_OV_RESULT	      = 0x20, /* meta socket */
+	P_CSUM_RS_REQUEST     = 0x21, /* data socket */
+	P_RS_IS_IN_SYNC	      = 0x22, /* meta socket */
+	P_SYNC_PARAM89	      = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
+	P_COMPRESSED_BITMAP   = 0x24, /* compressed or otherwise encoded bitmap transfer */
+	/* P_CKPT_FENCE_REQ      = 0x25, * currently reserved for protocol D */
+	/* P_CKPT_DISABLE_REQ    = 0x26, * currently reserved for protocol D */
+	P_DELAY_PROBE         = 0x27, /* is used on BOTH sockets */
+	P_OUT_OF_SYNC         = 0x28, /* Mark as out of sync (Outrunning), data socket */
+	P_RS_CANCEL           = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */
+	P_CONN_ST_CHG_REQ     = 0x2a, /* data sock: Connection wide state request */
+	P_CONN_ST_CHG_REPLY   = 0x2b, /* meta sock: Connection wide state req reply */
+	P_RETRY_WRITE	      = 0x2c, /* Protocol C: retry conflicting write request */
+	P_PROTOCOL_UPDATE     = 0x2d, /* data sock: is used in established connections */
+	/* 0x2e to 0x30 reserved, used in drbd 9 */
+
+	/* REQ_DISCARD. We used "discard" in different contexts before,
+	 * which is why I chose TRIM here, to disambiguate. */
+	P_TRIM                = 0x31,
+
+	P_MAY_IGNORE	      = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
+	P_MAX_OPT_CMD	      = 0x101,
+
+	/* special command ids for handshake */
+
+	P_INITIAL_META	      = 0xfff1, /* First Packet on the MetaSock */
+	P_INITIAL_DATA	      = 0xfff2, /* First Packet on the Socket */
+
+	P_CONNECTION_FEATURES = 0xfffe	/* FIXED for the next century! */
+};
+
+#ifndef __packed
+#define __packed __attribute__((packed))
+#endif
+
+/* This is the layout for a packet on the wire.
+ * The byte order is the network byte order.
+ *     (except block_id and barrier fields.
+ *	these are pointers to local structs
+ *	and have no relevance for the partner,
+ *	which just echoes them as received.)
+ *
+ * NOTE that the payload starts at a long aligned offset,
+ * regardless of 32 or 64 bit arch!
+ */
+struct p_header80 {
+	u32	  magic;
+	u16	  command;	/* presumably an enum drbd_packet value -- TODO confirm */
+	u16	  length;	/* bytes of data after this header */
+} __packed;
+
+/* Header for big packets, used for data packets exceeding 64kB */
+struct p_header95 {
+	u16	  magic;	/* use DRBD_MAGIC_BIG here */
+	u16	  command;
+	u32	  length;	/* 32 bit here, where p_header80 has only 16 */
+} __packed;
+
+struct p_header100 {
+	u32	  magic;
+	u16	  volume;	/* no such field in p_header80 and p_header95 */
+	u16	  command;
+	u32	  length;
+	u32	  pad;		/* brings the packed header to 16 bytes */
+} __packed;
+
+/* these defines must not be changed without changing the protocol version */
+#define DP_HARDBARRIER	      1 /* deprecated */
+#define DP_RW_SYNC	      2 /* equals REQ_SYNC    */
+#define DP_MAY_SET_IN_SYNC    4
+#define DP_UNPLUG             8 /* not used anymore   */
+#define DP_FUA               16 /* equals REQ_FUA     */
+#define DP_FLUSH             32 /* equals REQ_FLUSH   */
+#define DP_DISCARD           64 /* equals REQ_DISCARD */
+#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */
+#define DP_SEND_WRITE_ACK   256 /* This is a proto C write request */
+
+struct p_data {
+	u64	    sector;    /* 64 bits sector number */
+	u64	    block_id;  /* to identify the request in protocol B&C */
+	u32	    seq_num;
+	u32	    dp_flags;
+} __packed;
+
+struct p_trim {
+	struct p_data p_data;
+	u32	    size;	/* == bio->bi_size */
+} __packed;
+
+/*
+ * commands which share a struct:
+ *  p_block_ack:
+ *   P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
+ *   P_SUPERSEDED (proto C, two-primaries conflict detection)
+ *  p_block_req:
+ *   P_DATA_REQUEST, P_RS_DATA_REQUEST
+ */
+struct p_block_ack {
+	u64	    sector;
+	u64	    block_id;
+	u32	    blksize;
+	u32	    seq_num;
+} __packed;
+
+struct p_block_req {
+	u64 sector;
+	u64 block_id;
+	u32 blksize;
+	u32 pad;	/* to multiple of 8 Byte */
+} __packed;
+
+/*
+ * commands with their own struct for additional fields:
+ *   P_CONNECTION_FEATURES
+ *   P_BARRIER
+ *   P_BARRIER_ACK
+ *   P_SYNC_PARAM
+ *   ReportParams
+ */
+
+#define FF_TRIM      1	/* NOTE(review): presumably a feature_flags bit -- confirm */
+
+struct p_connection_features {
+	u32 protocol_min;
+	u32 feature_flags;
+	u32 protocol_max;
+
+	/* should be more than enough for future enhancements
+	 * for now, the reserved array shall be zero (NOTE(review): and
+	 * feature_flags as well?  FF_TRIM above suggests otherwise) */
+
+	u32 _pad;
+	u64 reserved[7];
+} __packed;
+
+struct p_barrier {
+	u32 barrier;	/* barrier number _handle_ only */
+	u32 pad;	/* to multiple of 8 Byte */
+} __packed;
+
+struct p_barrier_ack {
+	u32 barrier;
+	u32 set_size;
+} __packed;
+
+struct p_rs_param {
+	u32 resync_rate;
+
+	      /* Since protocol version 88 and higher. */
+	char verify_alg[0];
+} __packed;
+
+struct p_rs_param_89 {
+	u32 resync_rate;
+	/* protocol version 89: */
+	char verify_alg[SHARED_SECRET_MAX];
+	char csums_alg[SHARED_SECRET_MAX];
+} __packed;
+
+struct p_rs_param_95 {
+	u32 resync_rate;
+	char verify_alg[SHARED_SECRET_MAX];
+	char csums_alg[SHARED_SECRET_MAX];
+	u32 c_plan_ahead;
+	u32 c_delay_target;
+	u32 c_fill_target;
+	u32 c_max_rate;
+} __packed;
+
+enum drbd_conn_flags {
+	CF_DISCARD_MY_DATA = 1,
+	CF_DRY_RUN = 2,
+};
+
+struct p_protocol {
+	u32 protocol;
+	u32 after_sb_0p;
+	u32 after_sb_1p;
+	u32 after_sb_2p;
+	u32 conn_flags;
+	u32 two_primaries;
+
+	/* Since protocol version 87 and higher. */
+	char integrity_alg[0];
+
+} __packed;
+
+struct p_uuids {
+	u64 uuid[UI_EXTENDED_SIZE];
+} __packed;
+
+struct p_rs_uuid {
+	u64	    uuid;
+} __packed;
+
+struct p_sizes {
+	u64	    d_size;  /* size of disk */
+	u64	    u_size;  /* user requested size */
+	u64	    c_size;  /* current exported size */
+	u32	    max_bio_size;  /* Maximal size of a BIO */
+	u16	    queue_order_type;  /* not yet implemented in DRBD*/
+	u16	    dds_flags; /* use enum dds_flags here. */
+} __packed;
+
+struct p_state {
+	u32	    state;
+} __packed;
+
+struct p_req_state {
+	u32	    mask;
+	u32	    val;
+} __packed;
+
+struct p_req_state_reply {
+	u32	    retcode;
+} __packed;
+
+struct p_drbd06_param {
+	u64	  size;
+	u32	  state;
+	u32	  blksize;
+	u32	  protocol;
+	u32	  version;
+	u32	  gen_cnt[5];
+	u32	  bit_map_gen[5];
+} __packed;
+
+struct p_block_desc {
+	u64 sector;
+	u32 blksize;
+	u32 pad;	/* to multiple of 8 Byte */
+} __packed;
+
+/* Valid values for the encoding field.
+ * Bump proto version when changing this. */
+enum drbd_bitmap_code {
+	/* RLE_VLI_Bytes = 0,
+	 * and other bit variants had been defined during
+	 * algorithm evaluation. */
+	RLE_VLI_Bits = 2,
+};
+
+struct p_compressed_bm {
+	/* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
+	 * (encoding & 0x80): polarity (set/unset) of first runlength
+	 * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
+	 * used to pad up to head.length bytes
+	 */
+	u8 encoding;
+
+	u8 code[0];
+} __packed;
+
+struct p_delay_probe93 {
+	u32     seq_num; /* sequence number to match the two probe packets */
+	u32     offset;  /* usecs the probe got sent after the reference time point */
+} __packed;
+
+/*
+ * Bitmap packets need to fit within a single page on the sender and receiver,
+ * so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger).
+ */
+#define DRBD_SOCKET_BUFFER_SIZE 4096
+
+#endif  /* __DRBD_PROTOCOL_H */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
new file mode 100644
index 00000000000..5b17ec88ea0
--- /dev/null
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -0,0 +1,5549 @@
+/*
+   drbd_receiver.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <net/sock.h>
+
+#include <linux/drbd.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/pkt_sched.h>
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+#include <linux/vmalloc.h>
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+#include "drbd_vli.h"
+
+#define PRO_FEATURES (FF_TRIM)
+
+struct packet_info {
+	enum drbd_packet cmd;
+	unsigned int size;
+	unsigned int vnr;
+	void *data;
+};
+
+enum finish_epoch {
+	FE_STILL_LIVE,
+	FE_DESTROYED,
+	FE_RECYCLED,
+};
+
+static int drbd_do_features(struct drbd_connection *connection);
+static int drbd_do_auth(struct drbd_connection *connection);
+static int drbd_disconnected(struct drbd_peer_device *);
+static void conn_wait_active_ee_empty(struct drbd_connection *connection);
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
+static int e_end_block(struct drbd_work *, int);
+
+
+#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
+
+/*
+ * some helper functions to deal with single linked page lists,
+ * page->private being our "next" pointer.
+ */
+
+/* Detach the first n pages from the chain at *head and return them as a
+ * chain of their own.  If fewer than n pages are linked there, *head is
+ * left untouched and NULL is returned.  The caller provides the locking.
+ */
+static struct page *page_chain_del(struct page **head, int n)
+{
+	struct page *first;
+	struct page *last;
+	struct page *rest;
+
+	BUG_ON(!n);
+	BUG_ON(!head);
+
+	first = *head;
+
+	if (!first)
+		return NULL;
+
+	/* advance to the n-th page, remembering what follows it;
+	 * give up without touching anything if the chain ends early */
+	for (last = first; ; last = rest) {
+		rest = page_chain_next(last);
+		if (--n == 0)
+			break;
+		if (rest == NULL)
+			return NULL;
+	}
+
+	/* terminate the detached chain after its n-th page ... */
+	set_page_private(last, 0);
+	/* ... and let the remainder become the new head */
+	*head = rest;
+	return first;
+}
+
+/* Find the last page of a (usually short) "private" chain, and optionally its
+ * length; needs no lock, so use it before page_chain_add() under a spinlock. */
+static struct page *page_chain_tail(struct page *page, int *len)
+{
+	struct page *next;
+	int count;
+
+	for (count = 1; (next = page_chain_next(page)); count++)
+		page = next;
+	if (len)
+		*len = count;
+	return page;
+}
+
+static int page_chain_free(struct page *page)
+{
+	struct page *next;
+	int freed = 0;
+	page_chain_for_each_safe(page, next) {
+		put_page(page);
+		freed++;	/* one put_page() per page in the chain */
+	}
+	return freed;
+}
+
+static void page_chain_add(struct page **head,
+		struct page *chain_first, struct page *chain_last)
+{
+#if 1
+	/* sanity check: chain_last has to be the tail of the chain
+	 * that starts at chain_first */
+	BUG_ON(page_chain_tail(chain_first, NULL) != chain_last);
+#endif
+
+	/* splice in front: the old head follows chain_last from now on */
+	set_page_private(chain_last, (unsigned long)*head);
+	*head = chain_first;
+}
+
+static struct page *__drbd_alloc_pages(struct drbd_device *device,
+				       unsigned int number)
+{
+	struct page *page = NULL;	/* head of the chain built so far */
+	struct page *tmp = NULL;
+	unsigned int i = 0;
+
+	/* First try the pool.  Yes, testing drbd_pp_vacant outside the lock is racy.
+	 * So what. It saves a spin_lock. */
+	if (drbd_pp_vacant >= number) {
+		spin_lock(&drbd_pp_lock);
+		page = page_chain_del(&drbd_pp_pool, number);
+		if (page)
+			drbd_pp_vacant -= number;
+		spin_unlock(&drbd_pp_lock);
+		if (page)
+			return page;
+	}
+
+	/* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
+	 * "criss-cross" setup, that might cause write-out on some other DRBD,
+	 * which in turn might block on the other node at this very place.  */
+	for (i = 0; i < number; i++) {
+		tmp = alloc_page(GFP_TRY);
+		if (!tmp)
+			break;
+		set_page_private(tmp, (unsigned long)page);	/* link in front */
+		page = tmp;
+	}
+
+	if (i == number)	/* got all of them */
+		return page;
+
+	/* Not enough pages immediately available this time.
+	 * No need to jump around here, drbd_alloc_pages will retry this
+	 * function "soon".  The i pages we did get go to drbd_pp_pool. */
+	if (page) {
+		tmp = page_chain_tail(page, NULL);
+		spin_lock(&drbd_pp_lock);
+		page_chain_add(&drbd_pp_pool, page, tmp);
+		drbd_pp_vacant += i;
+		spin_unlock(&drbd_pp_lock);
+	}
+	return NULL;
+}
+
+static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
+					   struct list_head *to_be_freed)
+{
+	struct drbd_peer_request *peer_req, *next;
+
+	/* net_ee is kept in send order (entries are only ever appended),
+	 * and what is sent in order over the wire finishes in order.
+	 * So everything in front of the first entry that still has an
+	 * active page is done and can be moved over for freeing. */
+
+	list_for_each_entry_safe(peer_req, next, &device->net_ee, w.list) {
+		if (drbd_peer_req_has_active_page(peer_req))
+			return;
+		list_move(&peer_req->w.list, to_be_freed);
+	}
+}
+
+static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
+{
+	struct drbd_peer_request *peer_req, *next;
+	LIST_HEAD(finished);
+
+	spin_lock_irq(&device->resource->req_lock);
+	reclaim_finished_net_peer_reqs(device, &finished);	/* collect under req_lock */
+	spin_unlock_irq(&device->resource->req_lock);
+
+	list_for_each_entry_safe(peer_req, next, &finished, w.list)
+		drbd_free_net_peer_req(device, peer_req);	/* free after dropping it */
+}
+
+/**
+ * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
+ * @peer_device: its device is charged (pp_in_use), its connection's net_conf gives max_buffers
+ * @number:	number of pages requested
+ * @retry:	whether to retry, if not enough pages are available right now
+ *
+ * Tries to allocate number pages, first from our own page pool, then from
+ * the kernel.
+ * Possibly retry until DRBD frees sufficient pages somewhere else.
+ *
+ * If this allocation would exceed the max_buffers setting, we throttle
+ * allocation (schedule_timeout) to give the system some room to breathe.
+ *
+ * We do not use max-buffers as hard limit, because it could lead to
+ * congestion and further to a distributed deadlock during online-verify or
+ * (checksum based) resync, if the max-buffers, socket buffer sizes and
+ * resync-rate settings are mis-configured.
+ *
+ * Returns a page chain linked via page->private, or NULL (no @retry, or signal).
+ */
+struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
+			      bool retry)
+{
+	struct drbd_device *device = peer_device->device;
+	struct page *page = NULL;
+	struct net_conf *nc;
+	DEFINE_WAIT(wait);
+	unsigned int mxb;	/* max_buffers limit; 1000000 without a net_conf */
+
+	rcu_read_lock();
+	nc = rcu_dereference(peer_device->connection->net_conf);
+	mxb = nc ? nc->max_buffers : 1000000;
+	rcu_read_unlock();
+
+	if (atomic_read(&device->pp_in_use) < mxb)
+		page = __drbd_alloc_pages(device, number);
+
+	while (page == NULL) {
+		prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
+
+		drbd_kick_lo_and_reclaim_net(device);
+
+		if (atomic_read(&device->pp_in_use) < mxb) {
+			page = __drbd_alloc_pages(device, number);
+			if (page)
+				break;
+		}
+
+		if (!retry)
+			break;
+
+		if (signal_pending(current)) {
+			drbd_warn(device, "drbd_alloc_pages interrupted!\n");
+			break;
+		}
+
+		if (schedule_timeout(HZ/10) == 0)
+			mxb = UINT_MAX;	/* timed out: stop honoring max_buffers */
+	}
+	finish_wait(&drbd_pp_wait, &wait);
+
+	if (page)
+		atomic_add(number, &device->pp_in_use);
+	return page;
+}
+
+/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
+ * Is also used from inside another spin_lock_irq(&resource->req_lock);
+ * Either links the page chain back to the global pool,
+ * or returns all pages to the system. */
+static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
+{
+	atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
+	int i;	/* first the length of the chain, then the new value of *a */
+
+	if (page == NULL)
+		return;
+
+	if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count)
+		i = page_chain_free(page);
+	else {
+		struct page *tmp;
+		tmp = page_chain_tail(page, &i);
+		spin_lock(&drbd_pp_lock);
+		page_chain_add(&drbd_pp_pool, page, tmp);
+		drbd_pp_vacant += i;
+		spin_unlock(&drbd_pp_lock);
+	}
+	i = atomic_sub_return(i, a);
+	if (i < 0)
+		drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
+			is_net ? "pp_in_use_by_net" : "pp_in_use", i);
+	wake_up(&drbd_pp_wait);	/* drbd_alloc_pages() waits on this queue */
+}
+
+/*
+You need to hold the req_lock:
+ _drbd_wait_ee_list_empty()
+
+You must not have the req_lock:
+ drbd_free_peer_req()
+ drbd_alloc_peer_req()
+ drbd_free_peer_reqs()
+ drbd_ee_fix_bhs()
+ drbd_finish_peer_reqs()
+ drbd_clear_done_ee()
+ drbd_wait_ee_list_empty()
+*/
+
+struct drbd_peer_request *
+drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
+		    unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local)
+{
+	struct drbd_device *device = peer_device->device;
+	unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
+	struct drbd_peer_request *peer_req;
+	struct page *page = NULL;
+
+	if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
+		return NULL;
+
+	peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
+	if (!peer_req) {
+		if (!(gfp_mask & __GFP_NOWARN))
+			drbd_err(device, "%s: allocation failed\n", __func__);
+		return NULL;
+	}
+
+	/* payload pages come from drbd_alloc_pages(), which only retries
+	 * if the caller's gfp_mask contains __GFP_WAIT */
+	if (has_payload && data_size) {
+		page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT));
+		if (!page) {
+			mempool_free(peer_req, drbd_ee_mempool);
+			return NULL;
+		}
+	}
+
+	/*
+	 * The block_id is opaque to the receiver.  It is not endianness
+	 * converted, and sent back to the sender unchanged.
+	 */
+	peer_req->block_id = id;
+	peer_req->peer_device = peer_device;
+	peer_req->pages = page;
+	peer_req->epoch = NULL;
+	peer_req->flags = 0;
+	atomic_set(&peer_req->pending_bios, 0);
+
+	drbd_clear_interval(&peer_req->i);
+	peer_req->i.size = data_size;
+	peer_req->i.sector = sector;
+	peer_req->i.local = false;
+	peer_req->i.waiting = false;
+
+	return peer_req;
+}
+
+void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
+		       int is_net)
+{
+	if (peer_req->flags & EE_HAS_DIGEST)
+		kfree(peer_req->digest);
+	drbd_free_pages(device, peer_req->pages, is_net);	/* is_net picks the counter */
+	D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
+	D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+	mempool_free(peer_req, drbd_ee_mempool);	/* allocated in drbd_alloc_peer_req() */
+}
+
+int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
+{
+	struct drbd_peer_request *peer_req, *next;
+	int is_net = (list == &device->net_ee);
+	int freed = 0;
+	LIST_HEAD(detached);
+
+	spin_lock_irq(&device->resource->req_lock);
+	list_splice_init(list, &detached);	/* take the entries private */
+	spin_unlock_irq(&device->resource->req_lock);
+
+	list_for_each_entry_safe(peer_req, next, &detached, w.list) {
+		__drbd_free_peer_req(device, peer_req, is_net);
+		freed++;
+	}
+	return freed;
+}
+
+/* Run the callbacks of all peer requests on done_ee, then free them.
+ * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
+ */
+static int drbd_finish_peer_reqs(struct drbd_device *device)
+{
+	LIST_HEAD(work_list);
+	LIST_HEAD(reclaimed);
+	struct drbd_peer_request *peer_req, *t;
+	int err = 0;	/* first non-zero callback result, returned to the caller */
+
+	spin_lock_irq(&device->resource->req_lock);
+	reclaim_finished_net_peer_reqs(device, &reclaimed);
+	list_splice_init(&device->done_ee, &work_list);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
+		drbd_free_net_peer_req(device, peer_req);
+
+	/* possible callbacks here:
+	 * e_end_block, and e_end_resync_block, e_send_superseded.
+	 * all ignore the last argument.
+	 */
+	list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
+		int err2;
+
+		/* list_del not necessary, next/prev members not touched */
+		err2 = peer_req->w.cb(&peer_req->w, !!err);
+		if (!err)
+			err = err2;	/* keep the first error only */
+		drbd_free_peer_req(device, peer_req);
+	}
+	wake_up(&device->ee_wait);
+
+	return err;
+}
+
+static void _drbd_wait_ee_list_empty(struct drbd_device *device,
+				     struct list_head *head)
+{
+	DEFINE_WAIT(wait);
+
+	/* req_lock is held on entry and exit, dropped only while sleeping;
+	 * an already empty list skips prepare_to_wait and the unlock/lock */
+	while (!list_empty(head)) {
+		prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock_irq(&device->resource->req_lock);
+		io_schedule();	/* sleep until someone wakes ee_wait */
+		finish_wait(&device->ee_wait, &wait);
+		spin_lock_irq(&device->resource->req_lock);	/* re-check @head under the lock */
+	}
+}
+
+static void drbd_wait_ee_list_empty(struct drbd_device *device,
+				    struct list_head *head)
+{
+	spin_lock_irq(&device->resource->req_lock);
+	_drbd_wait_ee_list_empty(device, head);	/* drops and retakes req_lock while sleeping */
+	spin_unlock_irq(&device->resource->req_lock);
+}
+
+/* Receive into @buf with one kernel_recvmsg() call of at most @size bytes.
+ * Passing @flags == 0 selects the default, MSG_WAITALL | MSG_NOSIGNAL.
+ * The return value of kernel_recvmsg() is handed back unchanged. */
+static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
+{
+	const int recv_flags = flags ? flags : MSG_WAITALL | MSG_NOSIGNAL;
+	struct msghdr msg = { .msg_flags = recv_flags };
+	struct kvec iov = { .iov_base = buf, .iov_len = size };
+
+	return kernel_recvmsg(sock, &msg, &iov, 1, size, recv_flags);
+}
+
+static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size)
+{
+	int rv = drbd_recv_short(connection->data.socket, buf, size, 0);
+
+	if (rv == 0) {
+		/* Nothing received.  With DISCONNECT_SENT set, wait up to
+		 * ping_timeo for the connection state to drop below
+		 * C_WF_REPORT_PARAMS; if it does in time, return quietly. */
+		if (test_bit(DISCONNECT_SENT, &connection->flags)) {
+			long left;
+
+			rcu_read_lock();
+			left = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
+			rcu_read_unlock();
+			left = wait_event_timeout(connection->ping_wait,
+					connection->cstate < C_WF_REPORT_PARAMS, left);
+			if (left)
+				return rv;
+		}
+		drbd_info(connection, "sock was shut down by peer\n");
+	} else if (rv == -ECONNRESET) {
+		drbd_info(connection, "sock was reset by peer\n");
+	} else if (rv < 0 && rv != -ERESTARTSYS) {
+		drbd_err(connection, "sock_recvmsg returned %d\n", rv);
+	}
+
+	/* anything but a complete read breaks the connection */
+	if (rv != size)
+		conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
+
+	return rv;
+}
+
+static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size)
+{
+	/* Number of bytes received, or a negative errno from drbd_recv(). */
+	int received = drbd_recv(connection, buf, size);
+
+	if (received == size)
+		return 0;
+	/* Short read: keep a real error code, otherwise report -EIO. */
+	if (received < 0)
+		return received;
+	return -EIO;
+}
+
+static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size)
+{
+	const int err = drbd_recv_all(connection, buf, size);
+
+	/* Complain about a failed read, unless a signal is pending. */
+	if (err != 0 && !signal_pending(current))
+		drbd_warn(connection, "short read (expected size %d)\n", (int)size);
+	return err;
+}
+
+/* tcp(7): on individual connections, the socket buffer size must be set
+ * prior to the listen(2) or connect(2) calls in order to have it take effect.
+ * This wrapper sets the sizes; a size of 0 leaves that buffer alone.
+ */
+static void drbd_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv)
+{
+	struct sock *sk = sock->sk;
+
+	/* open coded SO_SNDBUF, SO_RCVBUF */
+	if (snd) {
+		sk->sk_sndbuf = snd;
+		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+	}
+	if (rcv) {
+		sk->sk_rcvbuf = rcv;
+		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+	}
+}
+
+/* Try one outgoing connect; returns the connected socket, or NULL on failure. */
+static struct socket *drbd_try_connect(struct drbd_connection *connection)
+{
+	const char *what;
+	struct socket *sock;
+	struct sockaddr_in6 src_in6;
+	struct sockaddr_in6 peer_in6;
+	struct net_conf *nc;
+	int err, peer_addr_len, my_addr_len;
+	int sndbuf_size, rcvbuf_size, connect_int;
+	int disconnect_on_error = 1;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	if (!nc) {
+		rcu_read_unlock();
+		return NULL;
+	}
+	sndbuf_size = nc->sndbuf_size;
+	rcvbuf_size = nc->rcvbuf_size;
+	connect_int = nc->connect_int;
+	rcu_read_unlock();
+
+	my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6));
+	memcpy(&src_in6, &connection->my_addr, my_addr_len);
+
+	if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6)
+		src_in6.sin6_port = 0;
+	else
+		((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
+
+	peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(peer_in6));
+	memcpy(&peer_in6, &connection->peer_addr, peer_addr_len);
+
+	what = "sock_create_kern";
+	err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family,
+			       SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (err < 0) {
+		sock = NULL;
+		goto out;
+	}
+
+	sock->sk->sk_rcvtimeo = sock->sk->sk_sndtimeo = connect_int * HZ;
+	drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
+
+	/* explicitly bind to the configured IP as source IP
+	*  for the outgoing connections.
+	*  This is needed for multihomed hosts and to be
+	*  able to use lo: interfaces for drbd.
+	* Make sure to use 0 as port number, so linux selects
+	*  a free one dynamically.
+	*/
+	what = "bind before connect";
+	err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
+	if (err < 0)
+		goto out;
+
+	/* connect may fail, peer not yet available.
+	 * stay C_WF_CONNECTION, don't go Disconnecting! */
+	disconnect_on_error = 0;
+	what = "connect";
+	err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
+
+out:
+	if (err < 0) {
+		if (sock) {
+			sock_release(sock);
+			sock = NULL;
+		}
+		switch (-err) {
+			/* timeout, busy, signal pending */
+		case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
+		case EINTR: case ERESTARTSYS:
+			/* peer not (yet) available, network problem */
+		case ECONNREFUSED: case ENETUNREACH:
+		case EHOSTDOWN:    case EHOSTUNREACH:
+			disconnect_on_error = 0;
+			break;
+		default:
+			drbd_err(connection, "%s failed, err = %d\n", what, err);
+		}
+		if (disconnect_on_error)
+			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+	}
+
+	return sock;
+}
+
+struct accept_wait_data {
+	struct drbd_connection *connection;
+	struct socket *s_listen;	/* listen socket, set by prepare_listen_socket() */
+	struct completion door_bell;	/* completed by drbd_incoming_connection() */
+	void (*original_sk_state_change)(struct sock *sk);	/* saved sk_state_change */
+
+};
+
+static void drbd_incoming_connection(struct sock *sk)
+{
+	struct accept_wait_data *ad = sk->sk_user_data;
+	/* read the saved callback before completing door_bell */
+	void (*chained)(struct sock *sk) = ad->original_sk_state_change;
+
+	if (sk->sk_state == TCP_ESTABLISHED)
+		complete(&ad->door_bell);
+	chained(sk);
+}
+
+static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad)
+{
+	int err, sndbuf_size, rcvbuf_size, my_addr_len;
+	struct sockaddr_in6 my_addr;
+	struct socket *s_listen;
+	struct net_conf *nc;
+	const char *what;	/* name of the step in progress, for the error message */
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	if (!nc) {
+		rcu_read_unlock();
+		return -EIO;	/* no network configuration */
+	}
+	sndbuf_size = nc->sndbuf_size;
+	rcvbuf_size = nc->rcvbuf_size;
+	rcu_read_unlock();
+
+	my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6));
+	memcpy(&my_addr, &connection->my_addr, my_addr_len);
+
+	what = "sock_create_kern";
+	err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family,
+			       SOCK_STREAM, IPPROTO_TCP, &s_listen);
+	if (err) {
+		s_listen = NULL;
+		goto out;
+	}
+
+	s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+	drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
+
+	what = "bind before listen";
+	err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
+	if (err < 0)
+		goto out;
+
+	ad->s_listen = s_listen;
+	write_lock_bh(&s_listen->sk->sk_callback_lock);
+	ad->original_sk_state_change = s_listen->sk->sk_state_change;	/* kept for chaining/restoring */
+	s_listen->sk->sk_state_change = drbd_incoming_connection;
+	s_listen->sk->sk_user_data = ad;	/* read back by drbd_incoming_connection() */
+	write_unlock_bh(&s_listen->sk->sk_callback_lock);
+
+	what = "listen";
+	err = s_listen->ops->listen(s_listen, 5);
+	if (err < 0)
+		goto out;
+
+	return 0;
+out:
+	if (s_listen)
+		sock_release(s_listen);
+	if (err < 0) {
+		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
+			drbd_err(connection, "%s failed, err = %d\n", what, err);
+			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+		}
+	}
+
+	return -EIO;	/* every failure is reported as -EIO */
+}
+
+static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
+{
+	write_lock_bh(&sk->sk_callback_lock);
+	sk->sk_state_change = ad->original_sk_state_change;	/* put the saved callback back */
+	sk->sk_user_data = NULL;	/* @sk no longer refers to @ad */
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad)
+{
+	int timeo, connect_int, err = 0;
+	struct socket *s_estab = NULL;
+	struct net_conf *nc;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	if (!nc) {
+		rcu_read_unlock();
+		return NULL;
+	}
+	connect_int = nc->connect_int;
+	rcu_read_unlock();
+
+	timeo = connect_int * HZ;
+	/* random jitter of +/- 1/7 of the timeout, i.e. a spread of about 28.5% */
+	timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7;
+
+	err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
+	if (err <= 0)	/* timed out or interrupted */
+		return NULL;
+
+	err = kernel_accept(ad->s_listen, &s_estab, 0);
+	if (err < 0) {
+		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
+			drbd_err(connection, "accept failed, err = %d\n", err);
+			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+		}
+	}
+
+	if (s_estab)
+		unregister_state_change(s_estab->sk, ad);	/* detach the accepted socket from @ad */
+
+	return s_estab;
+}
+
+static int decode_header(struct drbd_connection *, void *, struct packet_info *);
+
+static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock,
+			     enum drbd_packet cmd)
+{
+	if (conn_prepare_command(connection, sock))
+		return conn_send_command(connection, sock, cmd, 0, NULL, 0);
+	return -EIO;	/* could not prepare the command */
+}
+
+/* Read and decode one header from @sock; returns its command or a negative error. */
+static int receive_first_packet(struct drbd_connection *connection, struct socket *sock)
+{
+	const unsigned int want = drbd_header_size(connection);
+	void *hdr = connection->data.rbuf;
+	struct packet_info pi;
+	int rc;
+
+	/* Anything but a complete header is a failure. */
+	rc = drbd_recv_short(sock, hdr, want, 0);
+	if (rc != want)
+		return rc < 0 ? rc : -EIO;
+
+	rc = decode_header(connection, hdr, &pi);
+	/* On success hand back the command code of the packet. */
+	return rc ? rc : pi.cmd;
+}
+
+/**
+ * drbd_socket_okay() - Check a socket, releasing it if it is not usable
+ * @sock:	address of the socket pointer; set to NULL when released.
+ * Returns true if a non-blocking peek finds data or would block.
+ */
+static int drbd_socket_okay(struct socket **sock)
+{
+	char peek_buf[4];
+	int peeked;
+
+	if (*sock == NULL)
+		return false;
+
+	peeked = drbd_recv_short(*sock, peek_buf, sizeof(peek_buf),
+				 MSG_DONTWAIT | MSG_PEEK);
+	if (peeked > 0 || peeked == -EAGAIN)
+		return true;
+
+	sock_release(*sock);
+	*sock = NULL;
+	return false;
+}
+/* Called once a connection is established, and also when a new minor gets
+ * created in a connection. */
+int drbd_connected(struct drbd_peer_device *peer_device)
+{
+	struct drbd_connection *connection = peer_device->connection;
+	struct drbd_device *device = peer_device->device;
+	int err;
+
+	atomic_set(&device->packet_seq, 0);
+	device->peer_seq = 0;
+
+	/* below protocol 100 the connection-wide cstate_mutex is used */
+	if (connection->agreed_pro_version < 100)
+		device->state_mutex = &connection->cstate_mutex;
+	else
+		device->state_mutex = &device->own_state_mutex;
+
+	err = drbd_send_sync_param(peer_device);
+	err = err ? err : drbd_send_sizes(peer_device, 0, 0);
+	err = err ? err : drbd_send_uuids(peer_device);
+	err = err ? err : drbd_send_current_state(peer_device);
+	clear_bit(USE_DEGR_WFC_T, &device->flags);
+	clear_bit(RESIZE_PENDING, &device->flags);
+	atomic_set(&device->ap_in_flight, 0);
+	mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */
+	return err;
+}
+
+/*
+ * return values:
+ *   1 yes, we have a valid connection
+ *   0 oops, did not work out, please try again
+ *  -1 peer talks different language,
+ *     no point in trying again, please go standalone.
+ *  -2 We do not have a network config...
+ */
+static int conn_connect(struct drbd_connection *connection)
+{
+	struct drbd_socket sock, msock;
+	struct drbd_peer_device *peer_device;
+	struct net_conf *nc;
+	int vnr, timeout, h, ok;
+	bool discard_my_data;
+	enum drbd_state_rv rv;
+	struct accept_wait_data ad = {
+		.connection = connection,
+		.door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
+	};
+
+	clear_bit(DISCONNECT_SENT, &connection->flags);
+	if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
+		return -2;
+
+	mutex_init(&sock.mutex);
+	sock.sbuf = connection->data.sbuf;
+	sock.rbuf = connection->data.rbuf;
+	sock.socket = NULL;
+	mutex_init(&msock.mutex);
+	msock.sbuf = connection->meta.sbuf;
+	msock.rbuf = connection->meta.rbuf;
+	msock.socket = NULL;
+
+	/* Assume that the peer only understands protocol 80 until we know better.  */
+	connection->agreed_pro_version = 80;
+
+	if (prepare_listen_socket(connection, &ad))
+		return 0;
+
+	do {
+		struct socket *s;
+
+		s = drbd_try_connect(connection);
+		if (s) {
+			if (!sock.socket) {
+				sock.socket = s;
+				send_first_packet(connection, &sock, P_INITIAL_DATA);
+			} else if (!msock.socket) {
+				clear_bit(RESOLVE_CONFLICTS, &connection->flags);
+				msock.socket = s;
+				send_first_packet(connection, &msock, P_INITIAL_META);
+			} else {
+				drbd_err(connection, "Logic error in conn_connect()\n");
+				sock_release(s);	/* owned by neither sock nor msock */
+				goto out_release_sockets;
+			}
+		}
+
+		if (sock.socket && msock.socket) {
+			rcu_read_lock();
+			nc = rcu_dereference(connection->net_conf);
+			timeout = nc->ping_timeo * HZ / 10;
+			rcu_read_unlock();
+			schedule_timeout_interruptible(timeout);
+			ok = drbd_socket_okay(&sock.socket);
+			ok = drbd_socket_okay(&msock.socket) && ok;
+			if (ok)
+				break;
+		}
+
+retry:
+		s = drbd_wait_for_connect(connection, &ad);
+		if (s) {
+			int fp = receive_first_packet(connection, s);
+			drbd_socket_okay(&sock.socket);
+			drbd_socket_okay(&msock.socket);
+			switch (fp) {
+			case P_INITIAL_DATA:
+				if (sock.socket) {
+					drbd_warn(connection, "initial packet S crossed\n");
+					sock_release(sock.socket);
+					sock.socket = s;
+					goto randomize;
+				}
+				sock.socket = s;
+				break;
+			case P_INITIAL_META:
+				set_bit(RESOLVE_CONFLICTS, &connection->flags);
+				if (msock.socket) {
+					drbd_warn(connection, "initial packet M crossed\n");
+					sock_release(msock.socket);
+					msock.socket = s;
+					goto randomize;
+				}
+				msock.socket = s;
+				break;
+			default:
+				drbd_warn(connection, "Error receiving initial packet\n");
+				sock_release(s);
+randomize:
+				if (prandom_u32() & 1)
+					goto retry;
+			}
+		}
+
+		if (connection->cstate <= C_DISCONNECTING)
+			goto out_release_sockets;
+		if (signal_pending(current)) {
+			flush_signals(current);
+			smp_rmb();
+			if (get_t_state(&connection->receiver) == EXITING)
+				goto out_release_sockets;
+		}
+
+		ok = drbd_socket_okay(&sock.socket);
+		ok = drbd_socket_okay(&msock.socket) && ok;
+	} while (!ok);
+
+	if (ad.s_listen)
+		sock_release(ad.s_listen);
+
+	sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+	msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+
+	sock.socket->sk->sk_allocation = GFP_NOIO;
+	msock.socket->sk->sk_allocation = GFP_NOIO;
+
+	sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
+	msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
+
+	/* NOT YET ...
+	 * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10;
+	 * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+	 * first set it to the P_CONNECTION_FEATURES timeout,
+	 * which we set to 4x the configured ping_timeout. */
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+
+	sock.socket->sk->sk_sndtimeo =
+	sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
+
+	msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
+	timeout = nc->timeout * HZ / 10;
+	discard_my_data = nc->discard_my_data;
+	rcu_read_unlock();
+
+	msock.socket->sk->sk_sndtimeo = timeout;
+
+	/* we don't want delays; we use TCP_CORK where appropriate, though */
+	drbd_tcp_nodelay(sock.socket);
+	drbd_tcp_nodelay(msock.socket);
+
+	connection->data.socket = sock.socket;
+	connection->meta.socket = msock.socket;
+	connection->last_received = jiffies;
+
+	h = drbd_do_features(connection);
+	if (h <= 0)
+		return h;
+
+	if (connection->cram_hmac_tfm) {
+		/* drbd_request_state(device, NS(conn, WFAuth)); */
+		switch (drbd_do_auth(connection)) {
+		case -1:
+			drbd_err(connection, "Authentication of peer failed\n");
+			return -1;
+		case 0:
+			drbd_err(connection, "Authentication of peer failed, trying again.\n");
+			return 0;
+		}
+	}
+
+	connection->data.socket->sk->sk_sndtimeo = timeout;
+	connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+
+	if (drbd_send_protocol(connection) == -EOPNOTSUPP)
+		return -1;
+
+	/* Prevent a race between resync-handshake and
+	 * being promoted to Primary.
+	 *
+	 * Grab and release the state mutex, so we know that any current
+	 * drbd_set_role() is finished, and any incoming drbd_set_role
+	 * will see the STATE_SENT flag, and wait for it to be cleared.
+	 */
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+		mutex_lock(peer_device->device->state_mutex);
+
+	set_bit(STATE_SENT, &connection->flags);
+
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+		mutex_unlock(peer_device->device->state_mutex);
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		kref_get(&device->kref);
+		rcu_read_unlock();
+
+		if (discard_my_data)
+			set_bit(DISCARD_MY_DATA, &device->flags);
+		else
+			clear_bit(DISCARD_MY_DATA, &device->flags);
+
+		drbd_connected(peer_device);
+		kref_put(&device->kref, drbd_destroy_device);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+
+	rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
+	if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) {
+		clear_bit(STATE_SENT, &connection->flags);
+		return 0;
+	}
+
+	drbd_thread_start(&connection->asender);
+
+	mutex_lock(&connection->resource->conf_update);
+	/* The discard_my_data flag is a single-shot modifier to the next
+	 * connection attempt, the handshake of which is now well underway.
+	 * No need for rcu style copying of the whole struct
+	 * just to clear a single value. */
+	connection->net_conf->discard_my_data = 0;
+	mutex_unlock(&connection->resource->conf_update);
+
+	return h;
+
+out_release_sockets:
+	if (ad.s_listen)
+		sock_release(ad.s_listen);
+	if (sock.socket)
+		sock_release(sock.socket);
+	if (msock.socket)
+		sock_release(msock.socket);
+	return -1;
+}
+
+static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi)
+{
+	unsigned int header_size = drbd_header_size(connection);
+
+	if (header_size == sizeof(struct p_header100) &&
+	    *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
+		struct p_header100 *h = header;
+		if (h->pad != 0) {
+			drbd_err(connection, "Header padding is not zero\n");
+			return -EINVAL;
+		}
+		pi->vnr = be16_to_cpu(h->volume);
+		pi->cmd = be16_to_cpu(h->command);
+		pi->size = be32_to_cpu(h->length);
+	} else if (header_size == sizeof(struct p_header95) &&
+		   *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {	/* 16 bit magic here */
+		struct p_header95 *h = header;
+		pi->cmd = be16_to_cpu(h->command);
+		pi->size = be32_to_cpu(h->length);
+		pi->vnr = 0;	/* no volume taken from this header */
+	} else if (header_size == sizeof(struct p_header80) &&
+		   *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
+		struct p_header80 *h = header;
+		pi->cmd = be16_to_cpu(h->command);
+		pi->size = be16_to_cpu(h->length);	/* 16 bit length here */
+		pi->vnr = 0;	/* no volume taken from this header */
+	} else {
+		drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n",
+			 be32_to_cpu(*(__be32 *)header),
+			 connection->agreed_pro_version);
+		return -EINVAL;
+	}
+	pi->data = header + header_size;	/* first byte after the header */
+	return 0;
+}
+
+/* Read one packet header from the data socket and decode it into @pi. */
+static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi)
+{
+	void *hdr = connection->data.rbuf;
+	int rc = drbd_recv_all_warn(connection, hdr, drbd_header_size(connection));
+
+	if (rc == 0) {
+		rc = decode_header(connection, hdr, pi);
+		/* stamped even when decoding failed */
+		connection->last_received = jiffies;
+	}
+
+	return rc;
+}
+
+static void drbd_flush(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr, rv;
+
+	/* only relevant while write ordering relies on backing device flushes */
+	if (connection->write_ordering < WO_bdev_flush)
+		return;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+
+		if (!get_ldev(device))
+			continue;
+		kref_get(&device->kref);
+		rcu_read_unlock();
+
+		rv = blkdev_issue_flush(device->ldev->backing_bdev, GFP_NOIO, NULL);
+		if (rv) {
+			drbd_info(device, "local disk flush failed with status %d\n", rv);
+			/* Checking for EOPNOTSUPP would be nicer, but is not
+			 * reliable: ANY failure makes us stop flushing and
+			 * fall back to draining. */
+			drbd_bump_write_ordering(connection, WO_drain_io);
+		}
+		put_ldev(device);
+		kref_put(&device->kref, drbd_destroy_device);
+
+		rcu_read_lock();
+		if (rv)
+			break;
+	}
+	rcu_read_unlock();
+}
+
+/**
+ * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
+ * @connection:	DRBD connection.
+ * @epoch:	Epoch object.
+ * @ev:		Epoch event.
+ */
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection,
+					       struct drbd_epoch *epoch,
+					       enum epoch_event ev)
+{
+	int epoch_size;
+	struct drbd_epoch *next_epoch;
+	enum finish_epoch rv = FE_STILL_LIVE;
+
+	spin_lock(&connection->epoch_lock);
+	do {
+		next_epoch = NULL;
+
+		epoch_size = atomic_read(&epoch->epoch_size);
+
+		switch (ev & ~EV_CLEANUP) {
+		case EV_PUT:
+			atomic_dec(&epoch->active);
+			break;
+		case EV_GOT_BARRIER_NR:
+			set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
+			break;
+		case EV_BECAME_LAST:
+			/* nothing to do*/
+			break;
+		}
+
+		if (epoch_size != 0 &&
+		    atomic_read(&epoch->active) == 0 &&
+		    (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
+			if (!(ev & EV_CLEANUP)) {
+				spin_unlock(&connection->epoch_lock);	/* not held while sending */
+				drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size);
+				spin_lock(&connection->epoch_lock);
+			}
+#if 0
+			/* FIXME: dec unacked on connection, once we have
+			 * something to count pending connection packets in. */
+			if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
+				dec_unacked(epoch->connection);
+#endif
+
+			if (connection->current_epoch != epoch) {
+				next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
+				list_del(&epoch->list);
+				ev = EV_BECAME_LAST | (ev & EV_CLEANUP);	/* event for next_epoch */
+				connection->epochs--;
+				kfree(epoch);
+
+				if (rv == FE_STILL_LIVE)
+					rv = FE_DESTROYED;
+			} else {
+				epoch->flags = 0;
+				atomic_set(&epoch->epoch_size, 0);
+				/* atomic_set(&epoch->active, 0); is already zero */
+				if (rv == FE_STILL_LIVE)
+					rv = FE_RECYCLED;
+			}
+		}
+
+		if (!next_epoch)
+			break;
+
+		epoch = next_epoch;
+	} while (1);
+
+	spin_unlock(&connection->epoch_lock);
+
+	return rv;
+}
+
+/**
+ * drbd_bump_write_ordering() - Fall back to another write ordering method
+ * @connection:	DRBD connection.
+ * @wo:		Write ordering method to try; never raised above the current one.
+ */
+void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo)
+{
+	static char *write_ordering_str[] = {
+		[WO_none] = "none",
+		[WO_drain_io] = "drain",
+		[WO_bdev_flush] = "flush",
+	};
+	const enum write_ordering_e previous = connection->write_ordering;
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	if (wo > previous)
+		wo = previous;
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		struct disk_conf *dc;
+
+		if (!get_ldev_if_state(device, D_ATTACHING))
+			continue;
+		dc = rcu_dereference(device->ldev->disk_conf);
+		/* step down as far as this volume's disk_conf demands */
+		if (wo == WO_bdev_flush && !dc->disk_flushes)
+			wo = WO_drain_io;
+		if (wo == WO_drain_io && !dc->disk_drain)
+			wo = WO_none;
+		put_ldev(device);
+	}
+	rcu_read_unlock();
+	connection->write_ordering = wo;
+	if (wo != previous || wo == WO_bdev_flush)
+		drbd_info(connection, "Method to ensure write ordering: %s\n", write_ordering_str[connection->write_ordering]);
+}
+
+/**
+ * drbd_submit_peer_request()
+ * @device:	DRBD device.
+ * @peer_req:	peer request
+ * @rw:		flag field, see bio->bi_rw
+ * @fault_type:	handed to drbd_generic_make_request() along with each bio
+ *
+ * May spread the pages to multiple bios, depending on bio_add_page limits.
+ *
+ * Returns 0 if all bios have been submitted,
+ * -ENOMEM if we could not allocate enough bios,
+ * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
+ *  single page to an empty bio (which should never happen and likely indicates
+ *  that the lower level IO stack is in some way broken). This has been observed
+ *  on certain Xen deployments.
+ */
+/* TODO allocate from our own bio_set. */
+int drbd_submit_peer_request(struct drbd_device *device,
+			     struct drbd_peer_request *peer_req,
+			     const unsigned rw, const int fault_type)
+{
+	struct bio *bios = NULL;	/* bios built so far, chained through bi_next */
+	struct bio *bio;
+	struct page *page = peer_req->pages;
+	sector_t sector = peer_req->i.sector;
+	unsigned ds = peer_req->i.size;	/* bytes not yet placed into a bio */
+	unsigned n_bios = 0;
+	unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
+	int err = -ENOMEM;
+
+	if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
+		/* wait for all pending IO completions, before we start
+		 * zeroing things out. */
+		conn_wait_active_ee_empty(first_peer_device(device)->connection);
+		if (blkdev_issue_zeroout(device->ldev->backing_bdev,
+			sector, ds >> 9, GFP_NOIO))
+			peer_req->flags |= EE_WAS_ERROR;
+		drbd_endio_write_sec_final(peer_req);
+		return 0;	/* a zeroout failure is recorded in the flags, not returned */
+	}
+
+	/* Discards don't have any payload.
+	 * But the scsi layer still expects a bio_vec it can use internally,
+	 * see sd_setup_discard_cmnd() and blk_add_request_payload(). */
+	if (peer_req->flags & EE_IS_TRIM)
+		nr_pages = 1;
+
+	/* In most cases, we will only need one bio.  But in case the lower
+	 * level restrictions happen to be different at this offset on this
+	 * side than those of the sending peer, we may need to submit the
+	 * request in more than one bio.
+	 *
+	 * Plain bio_alloc is good enough here, this is no DRBD internally
+	 * generated bio, but a bio allocated on behalf of the peer.
+	 */
+next_bio:
+	bio = bio_alloc(GFP_NOIO, nr_pages);
+	if (!bio) {
+		drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages);
+		goto fail;
+	}
+	/* > peer_req->i.sector, unless this is the first bio */
+	bio->bi_iter.bi_sector = sector;
+	bio->bi_bdev = device->ldev->backing_bdev;
+	bio->bi_rw = rw;
+	bio->bi_private = peer_req;
+	bio->bi_end_io = drbd_peer_request_endio;
+
+	bio->bi_next = bios;
+	bios = bio;
+	++n_bios;
+
+	if (rw & REQ_DISCARD) {
+		bio->bi_iter.bi_size = ds;
+		goto submit;
+	}
+
+	page_chain_for_each(page) {
+		unsigned len = min_t(unsigned, ds, PAGE_SIZE);
+		if (!bio_add_page(bio, page, len, 0)) {
+			/* A single page must always be possible!
+			 * But in case it fails anyways,
+			 * we deal with it, and complain (below). */
+			if (bio->bi_vcnt == 0) {
+				drbd_err(device,
+					"bio_add_page failed for len=%u, "
+					"bi_vcnt=0 (bi_sector=%llu)\n",
+					len, (uint64_t)bio->bi_iter.bi_sector);
+				err = -ENOSPC;
+				goto fail;
+			}
+			goto next_bio;	/* this bio is full, continue with a fresh one */
+		}
+		ds -= len;
+		sector += len >> 9;
+		--nr_pages;
+	}
+	D_ASSERT(device, ds == 0);
+submit:
+	D_ASSERT(device, page == NULL);
+
+	atomic_set(&peer_req->pending_bios, n_bios);	/* set before the first bio is submitted */
+	do {
+		bio = bios;
+		bios = bios->bi_next;
+		bio->bi_next = NULL;
+
+		drbd_generic_make_request(device, fault_type, bio);
+	} while (bios);
+	return 0;
+
+fail:
+	while (bios) {	/* drop every bio built so far; none was submitted */
+		bio = bios;
+		bios = bios->bi_next;
+		bio_put(bio);
+	}
+	return err;
+}
+
+/* Take @peer_req's interval out of device->write_requests and wake up
+ * whoever is flagged as waiting on it. */
+static void drbd_remove_epoch_entry_interval(struct drbd_device *device,
+					     struct drbd_peer_request *peer_req)
+{
+	drbd_remove_interval(&device->write_requests, &peer_req->i);
+	drbd_clear_interval(&peer_req->i);
+
+	/* ->waiting is checked only after the interval has been cleared */
+	if (peer_req->i.waiting)
+		wake_up(&device->misc_wait);
+}
+
+static void conn_wait_active_ee_empty(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+
+		kref_get(&device->kref);	/* reference held across the unlocked wait */
+		rcu_read_unlock();	/* the wait below may sleep */
+		drbd_wait_ee_list_empty(device, &device->active_ee);
+		kref_put(&device->kref, drbd_destroy_device);
+		rcu_read_lock();	/* retaken before the iteration continues */
+	}
+	rcu_read_unlock();
+}
+
+static struct drbd_peer_device *
+conn_peer_device(struct drbd_connection *connection, int volume_number)
+{
+	return idr_find(&connection->peer_devices, volume_number);	/* NULL if not found */
+}
+
+static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
+{
+	int rv;
+	struct p_barrier *p = pi->data;
+	struct drbd_epoch *epoch;
+
+	/* FIXME these are unacked on connection,
+	 * not a specific (peer)device.
+	 */
+	connection->current_epoch->barrier_nr = p->barrier;
+	connection->current_epoch->connection = connection;
+	rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR);
+
+	/* P_BARRIER_ACK may imply that the corresponding extent is dropped from
+	 * the activity log, which means it would not be resynced in case the
+	 * R_PRIMARY crashes now.
+	 * Therefore we must send the barrier_ack after the barrier request was
+	 * completed. */
+	switch (connection->write_ordering) {
+	case WO_none:
+		if (rv == FE_RECYCLED)
+			return 0;
+
+		/* receiver context, in the writeout path of the other node.
+		 * avoid potential distributed deadlock */
+		epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+		if (epoch)
+			break;
+		else
+			drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
+			/* Fall through: without a new epoch, wait and flush like the cases below */
+
+	case WO_bdev_flush:
+	case WO_drain_io:
+		conn_wait_active_ee_empty(connection);
+		drbd_flush(connection);
+
+		if (atomic_read(&connection->current_epoch->epoch_size)) {
+			epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+			if (epoch)
+				break;
+		}
+
+		return 0;	/* current epoch is empty, or the allocation failed */
+	default:
+		drbd_err(connection, "Strangeness in connection->write_ordering %d\n", connection->write_ordering);
+		return -EIO;
+	}
+
+	epoch->flags = 0;
+	atomic_set(&epoch->epoch_size, 0);
+	atomic_set(&epoch->active, 0);
+
+	spin_lock(&connection->epoch_lock);
+	if (atomic_read(&connection->current_epoch->epoch_size)) {
+		list_add(&epoch->list, &connection->current_epoch->list);
+		connection->current_epoch = epoch;	/* the new epoch becomes the current one */
+		connection->epochs++;
+	} else {
+		/* The current_epoch got recycled while we allocated this one... */
+		kfree(epoch);
+	}
+	spin_unlock(&connection->epoch_lock);
+
+	return 0;
+}
+
+/* Receive the payload described by @pi (none for trims) into a new peer request;
+ * NULL on error.  Used from receive_RSDataReply (recv_resync_read) and receive_Data */
+static struct drbd_peer_request *
+read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
+	      struct packet_info *pi) __must_hold(local)
+{
+	struct drbd_device *device = peer_device->device;
+	const sector_t capacity = drbd_get_capacity(device->this_bdev);
+	struct drbd_peer_request *peer_req;
+	struct page *page;
+	int dgs, ds, err;
+	int data_size = pi->size;
+	void *dig_in = peer_device->connection->int_dig_in;
+	void *dig_vv = peer_device->connection->int_dig_vv;
+	unsigned long *data;
+	struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
+
+	dgs = 0;
+	if (!trim && peer_device->connection->peer_integrity_tfm) {
+		dgs = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
+		/*
+		 * FIXME: Receive the incoming digest into the receive buffer
+		 *	  here, together with its struct p_data?
+		 */
+		err = drbd_recv_all_warn(peer_device->connection, dig_in, dgs);
+		if (err)
+			return NULL;
+		data_size -= dgs;
+	}
+
+	if (trim) {
+		D_ASSERT(peer_device, data_size == 0);
+		data_size = be32_to_cpu(trim->size);
+	}
+
+	/* sizes come off the wire as u32 (or shrink by dgs): reject negatives */
+	if (!expect(data_size >= 0 && IS_ALIGNED(data_size, 512)))
+		return NULL;
+	if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE)) /* trims may be larger */
+		return NULL;
+
+	/* even though we trust our peer,
+	 * we sometimes have to double check. */
+	if (sector + (data_size>>9) > capacity) {
+		drbd_err(device, "request from peer beyond end of local disk: "
+			"capacity: %llus < sector: %llus + size: %u\n",
+			(unsigned long long)capacity,
+			(unsigned long long)sector, data_size);
+		return NULL;
+	}
+
+	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
+	 * "criss-cross" setup, that might cause write-out on some other DRBD,
+	 * which in turn might block on the other node at this very place.  */
+	peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO);
+	if (!peer_req)
+		return NULL;
+
+	if (trim)
+		return peer_req;
+
+	ds = data_size;
+	page = peer_req->pages;
+	page_chain_for_each(page) {
+		unsigned len = min_t(int, ds, PAGE_SIZE);
+		data = kmap(page);
+		err = drbd_recv_all_warn(peer_device->connection, data, len);
+		if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
+			drbd_err(device, "Fault injection: Corrupting data on receive\n");
+			data[0] = data[0] ^ (unsigned long)-1;
+		}
+		kunmap(page);
+		if (err) {
+			drbd_free_peer_req(device, peer_req);
+			return NULL;
+		}
+		ds -= len;
+	}
+
+	if (dgs) {
+		drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv);
+		if (memcmp(dig_in, dig_vv, dgs)) {
+			drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
+				(unsigned long long)sector, data_size);
+			drbd_free_peer_req(device, peer_req);
+			return NULL;
+		}
+	}
+	device->recv_cnt += data_size>>9;
+	return peer_req;
+}
+
+/* drbd_drain_block() just takes a data block of data_size bytes out of the
+ * socket input buffer, one page-sized piece at a time, and discards it.
+ * Returns 0 on success, -EINVAL for a negative size, else the receive error. */
+static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
+{
+	struct page *page;
+	int err = 0;
+	void *data;
+
+	if (!data_size)
+		return 0;
+	if (data_size < 0)	/* would turn into a huge unsigned len below and overrun the page */
+		return -EINVAL;
+	page = drbd_alloc_pages(peer_device, 1, 1);	/* NOTE(review): result is used without a NULL check */
+	data = kmap(page);
+	while (data_size) {
+		unsigned int len = min_t(int, data_size, PAGE_SIZE);
+
+		err = drbd_recv_all_warn(peer_device->connection, data, len);
+		if (err)
+			break;
+		data_size -= len;
+	}
+	kunmap(page);
+	drbd_free_pages(peer_device->device, page, 0);
+	return err;
+}
+
+/* Receive the data of a read reply straight into the pages of req->master_bio,
+ * and check it against the peer's digest when an integrity tfm is configured. */
+static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req,
+			   sector_t sector, int data_size)
+{
+	struct drbd_connection *connection = peer_device->connection;
+	void *dig_in = connection->int_dig_in;
+	void *dig_vv = connection->int_dig_vv;
+	struct bvec_iter iter;
+	struct bio_vec bvec;
+	struct bio *bio;
+	int dgs = 0, err;
+
+	if (connection->peer_integrity_tfm) {
+		dgs = crypto_hash_digestsize(connection->peer_integrity_tfm);
+		err = drbd_recv_all_warn(connection, dig_in, dgs);
+		if (err)
+			return err;
+		data_size -= dgs;
+	}
+
+	/* optimistic recv_cnt update: should receiving fail below, we disconnect and counters get reset */
+	peer_device->device->recv_cnt += data_size>>9;
+	bio = req->master_bio;
+	D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector);
+
+	bio_for_each_segment(bvec, bio, iter) {
+		void *mapped = kmap(bvec.bv_page) + bvec.bv_offset;
+		int chunk = min_t(int, data_size, bvec.bv_len);
+		err = drbd_recv_all_warn(connection, mapped, chunk);
+		kunmap(bvec.bv_page);
+		if (err)
+			return err;
+		data_size -= chunk;
+	}
+
+	if (dgs) {
+		drbd_csum_bio(connection->peer_integrity_tfm, bio, dig_vv);
+		if (memcmp(dig_in, dig_vv, dgs)) {
+			drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n");
+			return -EINVAL;
+		}
+	}
+
+	D_ASSERT(peer_device->device, data_size == 0);
+	return 0;
+}
+
+/*
+ * Completion callback for resync writes (installed by recv_resync_read());
+ * called in asender context via drbd_finish_peer_reqs().
+ */
+static int e_end_resync_block(struct drbd_work *w, int unused)
+{
+	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	const bool failed = (peer_req->flags & EE_WAS_ERROR) != 0;
+	int ack_err;
+
+	D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+
+	if (unlikely(failed)) {
+		/* Record failure to sync, and tell the peer */
+		drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size);
+		ack_err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
+	} else {
+		drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size);
+		ack_err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req);
+	}
+	/* pairs with the inc_unacked() done in recv_resync_read(),
+	 * no matter which ack was sent or whether sending it worked */
+	dec_unacked(device);
+
+	return ack_err;
+}
+
+/* Receive one block of resync data and submit it as a write to the local disk.
+ * The caller holds a local-disk reference; it is put here on any failure, and
+ * by the completion path (see receive_RSDataReply) once the write was submitted. */
+static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
+			    struct packet_info *pi) __releases(local)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_peer_request *peer_req;
+
+	peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
+	if (!peer_req) {
+		put_ldev(device);
+		return -EIO;
+	}
+
+	dec_rs_pending(device);
+	/* matching dec_unacked() is in e_end_resync_block(), respectively _drbd_clear_done_ee */
+	inc_unacked(device);
+	peer_req->w.cb = e_end_resync_block;
+
+	spin_lock_irq(&device->resource->req_lock);
+	list_add(&peer_req->w.list, &device->sync_ee);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	atomic_add(pi->size >> 9, &device->rs_sect_ev);
+	if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
+		return 0;
+
+	/* whatever the reason was: take the request off sync_ee again and give up */
+	drbd_err(device, "submit failed, triggering re-connect\n");
+	spin_lock_irq(&device->resource->req_lock);
+	list_del(&peer_req->w.list);
+	spin_unlock_irq(&device->resource->req_lock);
+	drbd_free_peer_req(device, peer_req);
+	put_ldev(device);
+	return -EIO;
+}
+
+static struct drbd_request *
+find_request(struct drbd_device *device, struct rb_root *root, u64 id,
+	     sector_t sector, bool missing_ok, const char *func)
+{
+	/* The peer-supplied id is taken to be the address of one of our request
+	 * objects; it is only returned if the tree lookup below confirms it. */
+	struct drbd_request *candidate = (struct drbd_request *)(unsigned long)id;
+
+	if (drbd_contains_interval(root, sector, &candidate->i) && candidate->i.local)
+		return candidate;
+
+	if (!missing_ok)
+		drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func,
+			(unsigned long)id, (unsigned long long)sector);
+	return NULL;
+}
+
+/* Handle the peer's reply to one of our read requests: look the request up in
+ * read_requests and receive the data into it via recv_dless_read(). */
+static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device = conn_peer_device(connection, pi->vnr);
+	struct p_data *p = pi->data;
+	struct drbd_device *device;
+	struct drbd_request *req;
+	sector_t sector;
+	int err;
+
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+	sector = be64_to_cpu(p->sector);
+
+	spin_lock_irq(&device->resource->req_lock);
+	req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__);
+	spin_unlock_irq(&device->resource->req_lock);
+	if (unlikely(!req))
+		return -EIO;
+
+	/* hlist_del(&req->collision) is left to _req_may_be_done, which avoids
+	 * special casing it there for the various failure cases.
+	 * Still no race with drbd_fail_pending_reads. */
+	err = recv_dless_read(peer_device, req, sector, pi->size);
+	if (!err)
+		req_mod(req, DATA_RECEIVED);
+	/* On error: nothing to do here, that is handled from drbd_disconnect...
+	 * I don't think we may complete this just yet
+	 * in case we are "on-disconnect: freeze" */
+
+	return err;
+}
+
+/* Handle a block of resync data from the peer: write it to the local disk if we
+ * have one, otherwise drain it from the socket and send a negative ack. */
+static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device = conn_peer_device(connection, pi->vnr);
+	struct p_data *p = pi->data;
+	struct drbd_device *device;
+	sector_t sector;
+	int err;
+
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+	sector = be64_to_cpu(p->sector);
+	D_ASSERT(device, p->block_id == ID_SYNCER);
+
+	if (!get_ldev(device)) {
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_err(device, "Can not write resync data to local disk.\n");
+
+		err = drbd_drain_block(peer_device, pi->size);
+		drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
+	} else {
+		/* The data is submitted to disk within recv_resync_read().
+		 * The ldev reference taken above is put in there on error,
+		 * or in drbd_peer_request_endio. */
+		err = recv_resync_read(peer_device, sector, pi);
+	}
+
+	/* accounted as incoming resync traffic on either path */
+	atomic_add(pi->size >> 9, &device->rs_sect_in);
+
+	return err;
+}
+
+/* For every local write request overlapping the given range that is postponed
+ * and no longer pending locally, signal CONFLICT_RESOLVED. */
+static void restart_conflicting_writes(struct drbd_device *device,
+				       sector_t sector, int size)
+{
+	struct drbd_interval *ival;
+
+	drbd_for_each_overlap(ival, &device->write_requests, sector, size) {
+		struct drbd_request *req;
+
+		if (!ival->local)
+			continue;
+		req = container_of(ival, struct drbd_request, i);
+		/* as it is RQ_POSTPONED, this will cause it to be queued on the retry workqueue. */
+		if (!(req->rq_state & RQ_LOCAL_PENDING) && (req->rq_state & RQ_POSTPONED))
+			__req_mod(req, CONFLICT_RESOLVED, NULL);
+	}
+}
+
+/*
+ * e_end_block() is called in asender context via drbd_finish_peer_reqs().
+ * Sends the write ack if one was requested, takes the peer request out of the
+ * interval tree, and signals EV_PUT for its epoch.  Returns the ack send result.
+ */
+static int e_end_block(struct drbd_work *w, int cancel)
+{
+	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	sector_t sector = peer_req->i.sector;
+	int err = 0, pcmd;
+
+	if (peer_req->flags & EE_SEND_WRITE_ACK) {
+		if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+			pcmd = (device->state.conn >= C_SYNC_SOURCE &&
+				device->state.conn <= C_PAUSED_SYNC_T &&
+				peer_req->flags & EE_MAY_SET_IN_SYNC) ?
+				P_RS_WRITE_ACK : P_WRITE_ACK;
+			err = drbd_send_ack(peer_device, pcmd, peer_req);
+			if (pcmd == P_RS_WRITE_ACK)
+				drbd_set_in_sync(device, sector, peer_req->i.size);
+		} else {
+			err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
+			/* we expect it to be marked out of sync anyways... maybe assert this?  */
+		}
+		dec_unacked(device);
+	}
+	/* we delete from the conflict detection hash _after_ we sent out the
+	 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
+	if (peer_req->flags & EE_IN_INTERVAL_TREE) {
+		spin_lock_irq(&device->resource->req_lock);
+		D_ASSERT(device, !drbd_interval_empty(&peer_req->i));
+		drbd_remove_epoch_entry_interval(device, peer_req);
+		if (peer_req->flags & EE_RESTART_REQUESTS)
+			restart_conflicting_writes(device, sector, peer_req->i.size);
+		spin_unlock_irq(&device->resource->req_lock);
+	} else
+		D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+
+	drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+
+	return err;
+}
+
+/* Send @ack for the peer request that embeds @w; returns the send result. */
+static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
+{
+	struct drbd_peer_request *request = container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *sender = request->peer_device;
+	const int rv = drbd_send_ack(sender, ack, request);
+
+	/* the unacked count is dropped whether or not the ack could be sent */
+	dec_unacked(sender->device);
+
+	return rv;
+}
+
+static int e_send_superseded(struct drbd_work *w, int unused)
+{
+	return e_send_ack(w, P_SUPERSEDED); /* w.cb installed by handle_write_conflicts() */
+}
+
+/* Answer a conflicting peer write with P_RETRY_WRITE; connections that agreed
+ * on a protocol version below 100 get P_SUPERSEDED instead. */
+static int e_send_retry_write(struct drbd_work *w, int unused)
+{
+	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+	const bool old_peer = peer_req->peer_device->connection->agreed_pro_version < 100;
+
+	return e_send_ack(w, old_peer ? P_SUPERSEDED : P_RETRY_WRITE);
+}
+
+/* Is sequence number @a newer than @b?  We assume 32-bit wrap-around here, so
+ * the answer is only meaningful while the two are less than 2^31 apart. */
+static bool seq_greater(u32 a, u32 b)
+{
+	/* Subtract as unsigned, where wrapping is well defined, and only then
+	 * look at the sign; subtracting two s32 values may overflow.
+	 * For 24-bit wrap-around, we would have to shift:  a <<= 8; b <<= 8; */
+	return (s32)(a - b) > 0;
+}
+
+static u32 seq_max(u32 a, u32 b)
+{
+	return seq_greater(a, b) ? a : b; /* the newer of the two, as decided by seq_greater() */
+}
+
+/* Raise device->peer_seq to @peer_seq if newer; wake seq_wait if @peer_seq is the newest. */
+static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq)
+{
+	struct drbd_device *device = peer_device->device;
+	unsigned int latest;
+
+	if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags))
+		return;
+	spin_lock(&device->peer_seq_lock);
+	latest = seq_max(device->peer_seq, peer_seq);
+	device->peer_seq = latest;
+	spin_unlock(&device->peer_seq_lock);
+	if (peer_seq == latest)
+		wake_up(&device->seq_wait);
+}
+
+static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
+{
+	return (s1 + (l1>>9) > s2) && (s1 < s2 + (l2>>9)); /* l1/l2 are shifted by 9 to get sectors */
+}
+
+/* Does @peer_req overlap any request currently queued on device->sync_ee?
+ * (maybe change sync_ee into interval trees as well?) */
+static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
+{
+	struct drbd_peer_request *pending;
+	bool found = false;
+
+	spin_lock_irq(&device->resource->req_lock);
+	list_for_each_entry(pending, &device->sync_ee, w.list) {
+		found = overlaps(peer_req->i.sector, peer_req->i.size,
+				 pending->i.sector, pending->i.size);
+		if (found)
+			break;
+	}
+	spin_unlock_irq(&device->resource->req_lock);
+
+	return found;
+}
+
+/* Called from receive_Data.
+ * Synchronize packets on sock with packets on msock.
+ *
+ * This is here so even when a P_DATA packet traveling via sock overtook an Ack
+ * packet traveling on msock, they are still processed in the order they have
+ * been sent.
+ *
+ * Note: we don't care for Ack packets overtaking P_DATA packets.
+ *
+ * In case packet_seq is larger than device->peer_seq number, there are
+ * outstanding packets on the msock. We wait for them to arrive.
+ * In case we are the logically next packet, we update device->peer_seq
+ * ourselves. Correctly handles 32bit wrap around.
+ *
+ * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
+ * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
+ * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
+ * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
+ *
+ * returns 0 if we may process the packet, -ETIMEDOUT if the missing acks did
+ * not arrive in time, -ERESTARTSYS if we were interrupted (by disconnect signal). */
+static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq)
+{
+	struct drbd_device *device = peer_device->device;
+	DEFINE_WAIT(wait);
+	long timeout;
+	int ret = 0, tp;
+
+	if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags))
+		return 0;
+
+	spin_lock(&device->peer_seq_lock);
+	for (;;) {
+		if (!seq_greater(peer_seq - 1, device->peer_seq)) {
+			device->peer_seq = seq_max(device->peer_seq, peer_seq);
+			break;
+		}
+
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		rcu_read_lock();
+		tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
+		rcu_read_unlock();
+
+		if (!tp)
+			break;
+
+		/* Only need to wait if two_primaries is enabled */
+		prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE);
+		spin_unlock(&device->peer_seq_lock);
+		rcu_read_lock();
+		timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10;
+		rcu_read_unlock();
+		timeout = schedule_timeout(timeout);
+		spin_lock(&device->peer_seq_lock);
+		if (!timeout) {
+			ret = -ETIMEDOUT;
+			drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n");
+			break;
+		}
+	}
+	spin_unlock(&device->peer_seq_lock);
+	finish_wait(&device->seq_wait, &wait);
+	return ret;
+}
+
+/* see also bio_flags_to_wire(): translate DP_* data packet flags back into REQ_* bio flags.
+ * The mapping is semantic, flag by flag, because we may replicate to other kernel versions. */
+static unsigned long wire_flags_to_bio(u32 dpf)
+{
+	unsigned long bio_flags = (dpf & DP_RW_SYNC) ? REQ_SYNC : 0;
+	bio_flags |= (dpf & DP_FUA) ? REQ_FUA : 0;
+	bio_flags |= (dpf & DP_FLUSH) ? REQ_FLUSH : 0;
+	bio_flags |= (dpf & DP_DISCARD) ? REQ_DISCARD : 0;
+	return bio_flags;
+}
+
+/* With req_lock held: fail (NEG_ACKED) all postponed local writes overlapping the range. */
+static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
+				    unsigned int size)
+{
+	struct drbd_interval *ival;
+	struct drbd_request *req;
+	struct bio_and_error m;
+
+    restart:
+	drbd_for_each_overlap(ival, &device->write_requests, sector, size) {
+		if (!ival->local)
+			continue;
+		req = container_of(ival, struct drbd_request, i);
+		if (!(req->rq_state & RQ_POSTPONED))
+			continue;
+		req->rq_state &= ~RQ_POSTPONED;
+		__req_mod(req, NEG_ACKED, &m);
+		spin_unlock_irq(&device->resource->req_lock);
+		if (m.bio)
+			complete_master_bio(device, &m);
+		spin_lock_irq(&device->resource->req_lock);
+		goto restart;	/* the lock was dropped in between: walk the overlaps afresh */
+	}
+}
+
+static int handle_write_conflicts(struct drbd_device *device,
+				  struct drbd_peer_request *peer_req)
+{
+	struct drbd_connection *connection = peer_req->peer_device->connection;
+	bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags);
+	sector_t sector = peer_req->i.sector;
+	const unsigned int size = peer_req->i.size;
+	struct drbd_interval *i;
+	bool equal;
+	int err;
+
+	/* Resolve overlaps between this peer write and the requests already in the
+	 * write_requests tree.  Inserting the peer request into the tree first will
+	 * prevent new conflicting local requests from being added.  Returns 0 to go
+	 * ahead, -ENOENT if the request was queued on done_ee instead, or a wait error. */
+	drbd_insert_interval(&device->write_requests, &peer_req->i);
+
+    repeat:
+	drbd_for_each_overlap(i, &device->write_requests, sector, size) {
+		if (i == &peer_req->i)
+			continue;
+
+		if (!i->local) {
+			/*
+			 * Our peer has sent a conflicting remote request; this
+			 * should not happen in a two-node setup.  Wait for the
+			 * earlier peer request to complete.
+			 */
+			err = drbd_wait_misc(device, i);
+			if (err)
+				goto out;
+			goto repeat;
+		}
+
+		equal = i->sector == sector && i->size == size;
+		if (resolve_conflicts) {
+			/*
+			 * If the peer request is fully contained within the
+			 * overlapping request, it can be considered overwritten
+			 * and thus superseded; otherwise, it will be retried
+			 * once all overlapping requests have completed.
+			 */
+			bool superseded = i->sector <= sector && i->sector +
+				       (i->size >> 9) >= sector + (size >> 9);
+
+			if (!equal)
+				drbd_alert(device, "Concurrent writes detected: "
+					       "local=%llus +%u, remote=%llus +%u, "
+					       "assuming %s came first\n",
+					  (unsigned long long)i->sector, i->size,
+					  (unsigned long long)sector, size,
+					  superseded ? "local" : "remote");
+
+			inc_unacked(device);
+			peer_req->w.cb = superseded ? e_send_superseded :
+						   e_send_retry_write;
+			list_add_tail(&peer_req->w.list, &device->done_ee);
+			wake_asender(connection);
+
+			err = -ENOENT;
+			goto out;
+		} else {
+			struct drbd_request *req =
+				container_of(i, struct drbd_request, i);
+
+			if (!equal)
+				drbd_alert(device, "Concurrent writes detected: "
+					       "local=%llus +%u, remote=%llus +%u\n",
+					  (unsigned long long)i->sector, i->size,
+					  (unsigned long long)sector, size);
+
+			if (req->rq_state & RQ_LOCAL_PENDING ||
+			    !(req->rq_state & RQ_POSTPONED)) {
+				/*
+				 * Wait for the node with the discard flag to
+				 * decide if this request has been superseded
+				 * or needs to be retried.
+				 * Requests that have been superseded will
+				 * disappear from the write_requests tree.
+				 *
+				 * In addition, wait for the conflicting
+				 * request to finish locally before submitting
+				 * the conflicting peer request.
+				 */
+				err = drbd_wait_misc(device, &req->i);
+				if (err) {
+					_conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
+					fail_postponed_requests(device, sector, size);
+					goto out;
+				}
+				goto repeat;
+			}
+			/*
+			 * Remember to restart the conflicting requests after
+			 * the new peer request has completed.
+			 */
+			peer_req->flags |= EE_RESTART_REQUESTS;
+		}
+	}
+	err = 0;
+
+    out:
+	if (err)
+		drbd_remove_epoch_entry_interval(device, peer_req); /* take back the insert done on entry */
+	return err;
+}
+
+/* mirrored write: receive a P_DATA (or P_TRIM) packet and submit it to the local disk */
+static int receive_Data(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	sector_t sector;
+	struct drbd_peer_request *peer_req;
+	struct p_data *p = pi->data;
+	u32 peer_seq = be32_to_cpu(p->seq_num);
+	int rw = WRITE;
+	u32 dp_flags;
+	int err, tp;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	if (!get_ldev(device)) {
+		int err2;
+
+		err = wait_for_and_update_peer_seq(peer_device, peer_seq);
+		drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
+		atomic_inc(&connection->current_epoch->epoch_size);
+		err2 = drbd_drain_block(peer_device, pi->size);
+		if (!err)
+			err = err2;
+		return err;
+	}
+
+	/*
+	 * Corresponding put_ldev done either below (on various errors), or in
+	 * drbd_peer_request_endio, if we successfully submit the data at the
+	 * end of this function.
+	 */
+
+	sector = be64_to_cpu(p->sector);
+	peer_req = read_in_block(peer_device, p->block_id, sector, pi);
+	if (!peer_req) {
+		put_ldev(device);
+		return -EIO;
+	}
+
+	peer_req->w.cb = e_end_block;
+
+	dp_flags = be32_to_cpu(p->dp_flags);
+	rw |= wire_flags_to_bio(dp_flags);
+	if (pi->cmd == P_TRIM) {
+		struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
+		peer_req->flags |= EE_IS_TRIM;
+		if (!blk_queue_discard(q))
+			peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
+		D_ASSERT(peer_device, peer_req->i.size > 0);
+		D_ASSERT(peer_device, rw & REQ_DISCARD);
+		D_ASSERT(peer_device, peer_req->pages == NULL);
+	} else if (peer_req->pages == NULL) {
+		D_ASSERT(device, peer_req->i.size == 0);
+		D_ASSERT(device, dp_flags & DP_FLUSH);
+	}
+
+	if (dp_flags & DP_MAY_SET_IN_SYNC)
+		peer_req->flags |= EE_MAY_SET_IN_SYNC;
+
+	spin_lock(&connection->epoch_lock);
+	peer_req->epoch = connection->current_epoch;
+	atomic_inc(&peer_req->epoch->epoch_size);
+	atomic_inc(&peer_req->epoch->active);
+	spin_unlock(&connection->epoch_lock);
+
+	rcu_read_lock();
+	tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
+	rcu_read_unlock();
+	if (tp) {
+		peer_req->flags |= EE_IN_INTERVAL_TREE;
+		err = wait_for_and_update_peer_seq(peer_device, peer_seq);
+		if (err)
+			goto out_interrupted;
+		spin_lock_irq(&device->resource->req_lock);
+		err = handle_write_conflicts(device, peer_req);
+		if (err) {
+			spin_unlock_irq(&device->resource->req_lock);
+			if (err == -ENOENT) {
+				put_ldev(device);
+				return 0;
+			}
+			goto out_interrupted;
+		}
+	} else {
+		update_peer_seq(peer_device, peer_seq);
+		spin_lock_irq(&device->resource->req_lock);
+	}
+	/* if we use the zeroout fallback code, we process synchronously
+	 * and we wait for all pending requests, respectively wait for
+	 * active_ee to become empty in drbd_submit_peer_request();
+	 * better not add ourselves here. */
+	if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
+		list_add(&peer_req->w.list, &device->active_ee);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	if (device->state.conn == C_SYNC_TARGET)
+		wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
+
+	if (peer_device->connection->agreed_pro_version < 100) {
+		rcu_read_lock();
+		switch (rcu_dereference(peer_device->connection->net_conf)->wire_protocol) {
+		case DRBD_PROT_C:
+			dp_flags |= DP_SEND_WRITE_ACK;
+			break;
+		case DRBD_PROT_B:
+			dp_flags |= DP_SEND_RECEIVE_ACK;
+			break;
+		}
+		rcu_read_unlock();
+	}
+
+	if (dp_flags & DP_SEND_WRITE_ACK) {
+		peer_req->flags |= EE_SEND_WRITE_ACK;
+		inc_unacked(device);
+		/* corresponding dec_unacked() in e_end_block()
+		 * respective _drbd_clear_done_ee */
+	}
+
+	if (dp_flags & DP_SEND_RECEIVE_ACK) {
+		/* I really don't like it that the receiver thread
+		 * sends on the msock, but anyways */
+		drbd_send_ack(peer_device, P_RECV_ACK, peer_req);
+	}
+
+	if (device->state.pdsk < D_INCONSISTENT) {
+		/* In case we have the only disk of the cluster, */
+		drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
+		peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
+		peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
+		drbd_al_begin_io(device, &peer_req->i, true);
+	}
+
+	err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR);
+	if (!err)
+		return 0;
+
+	/* don't care for the reason here */
+	drbd_err(device, "submit failed, triggering re-connect\n");
+	spin_lock_irq(&device->resource->req_lock);
+	list_del(&peer_req->w.list); /* NOTE(review): w.list was not put on active_ee in the EE_IS_TRIM_USE_ZEROOUT case */
+	drbd_remove_epoch_entry_interval(device, peer_req);
+	spin_unlock_irq(&device->resource->req_lock);
+	if (peer_req->flags & EE_CALL_AL_COMPLETE_IO)
+		drbd_al_complete_io(device, &peer_req->i);
+
+out_interrupted:
+	drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP);
+	put_ldev(device);
+	drbd_free_peer_req(device, peer_req);
+	return err;
+}
+
+/* We may throttle resync, if the lower device seems to be busy,
+ * and current sync rate is above c_min_rate.
+ *
+ * To decide whether or not the lower device is busy, we use a scheme similar
+ * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
+ * (more than 64 sectors) of activity we cannot account for with our own resync
+ * activity, it obviously is "busy".
+ *
+ * The current sync rate used here uses only the most recent two step marks,
+ * to have a short time average so we can react faster.
+ * Returns true if resync I/O for @sector should be slowed down; an extent that
+ * is found in device->resync with BME_PRIORITY set is never throttled.
+ */
+bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
+{
+	struct lc_element *found;
+	bool app_io_waiting = false;
+
+	if (!drbd_rs_c_min_rate_throttle(device))
+		return false;
+
+	/* Do not slow down if app IO is already waiting for this extent */
+	spin_lock_irq(&device->al_lock);
+	found = lc_find(device->resync, BM_SECT_TO_EXT(sector));
+	if (found)
+		app_io_waiting = test_bit(BME_PRIORITY,
+					  &lc_entry(found, struct bm_extent, lce)->flags);
+	spin_unlock_irq(&device->al_lock);
+
+	return !app_io_waiting;
+}
+
+/* Returns true if the backing disk saw more than 64 sectors of I/O that is not
+ * our own resync traffic, and the recent resync rate is above c_min_rate. */
+bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
+{
+	struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
+	unsigned long db, dt, dbdt;
+	unsigned long rs_left;
+	unsigned int c_min_rate;
+	int curr_events, i;
+
+	rcu_read_lock();
+	c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
+	rcu_read_unlock();
+
+	/* feature disabled? */
+	if (c_min_rate == 0)
+		return false;
+
+	curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
+		      (int)part_stat_read(&disk->part0, sectors[1]) -
+			atomic_read(&device->rs_sect_ev);
+	/* only look closer on the first call, or after more than 64 unaccounted sectors */
+	if (device->rs_last_events && curr_events - device->rs_last_events <= 64)
+		return false;
+
+	device->rs_last_events = curr_events;
+
+	/* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
+	 * approx. */
+	i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
+
+	if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+		rs_left = device->ov_left;
+	else
+		rs_left = drbd_bm_total_weight(device) - device->rs_failed;
+
+	dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ;
+	if (!dt)
+		dt++;
+	db = device->rs_mark_left[i] - rs_left;
+	dbdt = Bit2KB(db/dt);
+
+	return dbdt > c_min_rate;
+}
+
+static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	sector_t sector;
+	sector_t capacity;
+	struct drbd_peer_request *peer_req;
+	struct digest_info *di = NULL;
+	int size, verb;
+	unsigned int fault_type;
+	struct p_block_req *p =	pi->data;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+	capacity = drbd_get_capacity(device->this_bdev);
+
+	sector = be64_to_cpu(p->sector);
+	size   = be32_to_cpu(p->blksize);
+
+	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
+		drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+				(unsigned long long)sector, size);
+		return -EINVAL;
+	}
+	if (sector + (size>>9) > capacity) {
+		drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+				(unsigned long long)sector, size);
+		return -EINVAL;
+	}
+
+	if (!get_ldev_if_state(device, D_UP_TO_DATE)) {
+		verb = 1;
+		switch (pi->cmd) {
+		case P_DATA_REQUEST:
+			drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
+			break;
+		case P_RS_DATA_REQUEST:
+		case P_CSUM_RS_REQUEST:
+		case P_OV_REQUEST:
+			drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p);
+			break;
+		case P_OV_REPLY:
+			verb = 0;
+			dec_rs_pending(device);
+			drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC);
+			break;
+		default:
+			BUG();
+		}
+		if (verb && __ratelimit(&drbd_ratelimit_state))
+			drbd_err(device, "Can not satisfy peer's read request, "
+			    "no local data.\n");
+
+		/* drain possibly payload */
+		return drbd_drain_block(peer_device, pi->size);
+	}
+
+	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
+	 * "criss-cross" setup, that might cause write-out on some other DRBD,
+	 * which in turn might block on the other node at this very place.  */
+	peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
+			true /* has real payload */, GFP_NOIO);
+	if (!peer_req) {
+		put_ldev(device);
+		return -ENOMEM;
+	}
+
+	switch (pi->cmd) {
+	case P_DATA_REQUEST:
+		peer_req->w.cb = w_e_end_data_req;
+		fault_type = DRBD_FAULT_DT_RD;
+		/* application IO, don't drbd_rs_begin_io */
+		goto submit;
+
+	case P_RS_DATA_REQUEST:
+		peer_req->w.cb = w_e_end_rsdata_req;
+		fault_type = DRBD_FAULT_RS_RD;
+		/* used in the sector offset progress display */
+		device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+		break;
+
+	case P_OV_REPLY:
+	case P_CSUM_RS_REQUEST:
+		fault_type = DRBD_FAULT_RS_RD;
+		di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
+		if (!di)
+			goto out_free_e;
+
+		di->digest_size = pi->size;
+		di->digest = (((char *)di)+sizeof(struct digest_info));
+
+		peer_req->digest = di;
+		peer_req->flags |= EE_HAS_DIGEST;
+
+		if (drbd_recv_all(peer_device->connection, di->digest, pi->size))
+			goto out_free_e;
+
+		if (pi->cmd == P_CSUM_RS_REQUEST) {
+			D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
+			peer_req->w.cb = w_e_end_csum_rs_req;
+			/* used in the sector offset progress display */
+			device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+		} else if (pi->cmd == P_OV_REPLY) {
+			/* track progress, we may need to throttle */
+			atomic_add(size >> 9, &device->rs_sect_in);
+			peer_req->w.cb = w_e_end_ov_reply;
+			dec_rs_pending(device);
+			/* drbd_rs_begin_io done when we sent this request,
+			 * but accounting still needs to be done. */
+			goto submit_for_resync;
+		}
+		break;
+
+	case P_OV_REQUEST:
+		if (device->ov_start_sector == ~(sector_t)0 &&
+		    peer_device->connection->agreed_pro_version >= 90) {
+			unsigned long now = jiffies;
+			int i;
+			device->ov_start_sector = sector;
+			device->ov_position = sector;
+			device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector);
+			device->rs_total = device->ov_left;
+			for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+				device->rs_mark_left[i] = device->ov_left;
+				device->rs_mark_time[i] = now;
+			}
+			drbd_info(device, "Online Verify start sector: %llu\n",
+					(unsigned long long)sector);
+		}
+		peer_req->w.cb = w_e_end_ov_req;
+		fault_type = DRBD_FAULT_RS_RD;
+		break;
+
+	default:
+		BUG();
+	}
+
+	/* Throttle, drbd_rs_begin_io and submit should become asynchronous
+	 * wrt the receiver, but it is not as straightforward as it may seem.
+	 * Various places in the resync start and stop logic assume resync
+	 * requests are processed in order, requeuing this on the worker thread
+	 * introduces a bunch of new code for synchronization between threads.
+	 *
+	 * Unlimited throttling before drbd_rs_begin_io may stall the resync
+	 * "forever", throttling after drbd_rs_begin_io will lock that extent
+	 * for application writes for the same time.  For now, just throttle
+	 * here, where the rest of the code expects the receiver to sleep for
+	 * a while, anyways.
+	 */
+
+	/* Throttle before drbd_rs_begin_io, as that locks out application IO;
+	 * this defers syncer requests for some time, before letting at least
+	 * one request through.  The resync controller on the receiving side
+	 * will adapt to the incoming rate accordingly.
+	 *
+	 * We cannot throttle here if remote is Primary/SyncTarget:
+	 * we would also throttle its application reads.
+	 * In that case, throttling is done on the SyncTarget only.
+	 */
+	if (device->state.peer != R_PRIMARY && drbd_rs_should_slow_down(device, sector))
+		schedule_timeout_uninterruptible(HZ/10);
+	if (drbd_rs_begin_io(device, sector))
+		goto out_free_e;
+
+submit_for_resync:
+	atomic_add(size >> 9, &device->rs_sect_ev);
+
+submit:
+	inc_unacked(device);
+	spin_lock_irq(&device->resource->req_lock);
+	list_add_tail(&peer_req->w.list, &device->read_ee);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0)
+		return 0;
+
+	/* don't care for the reason here */
+	drbd_err(device, "submit failed, triggering re-connect\n");
+	spin_lock_irq(&device->resource->req_lock);
+	list_del(&peer_req->w.list);
+	spin_unlock_irq(&device->resource->req_lock);
+	/* no drbd_rs_complete_io(), we are dropping the connection anyways */
+
+out_free_e:
+	put_ldev(device);
+	drbd_free_peer_req(device, peer_req);
+	return -EIO;
+}
+
+/**
+ * drbd_asb_recover_0p  -  Recover after split-brain with no remaining primaries
+ */
+static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local)
+{
+	struct drbd_device *device = peer_device->device;
+	int self, peer, rv = -100;	/* rv stays -100 when no policy reaches a decision */
+	unsigned long ch_self, ch_peer;
+	enum drbd_after_sb_p after_sb_0p;
+
+	self = device->ldev->md.uuid[UI_BITMAP] & 1;	/* lowest bit of our bitmap UUID */
+	peer = device->p_uuid[UI_BITMAP] & 1;		/* lowest bit of the peer's bitmap UUID */
+
+	ch_peer = device->p_uuid[UI_SIZE];	/* presumably the amount of changes per side; */
+	ch_self = device->comm_bm_set;		/* drbd_sync_handshake() logs both as "bits" */
+
+	rcu_read_lock();
+	after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p;
+	rcu_read_unlock();
+	switch (after_sb_0p) {
+	case ASB_CONSENSUS:
+	case ASB_DISCARD_SECONDARY:
+	case ASB_CALL_HELPER:
+	case ASB_VIOLENTLY:
+		drbd_err(device, "Configuration error.\n");	/* policies not handled here */
+		break;
+	case ASB_DISCONNECT:
+		break;
+	case ASB_DISCARD_YOUNGER_PRI:
+		if (self == 0 && peer == 1) {
+			rv = -1;
+			break;
+		}
+		if (self == 1 && peer == 0) {
+			rv =  1;
+			break;
+		}
+		/* No decision: fall through to the ASB_DISCARD_OLDER_PRI rule. */
+	case ASB_DISCARD_OLDER_PRI:
+		if (self == 0 && peer == 1) {
+			rv = 1;
+			break;
+		}
+		if (self == 1 && peer == 0) {
+			rv = -1;
+			break;
+		}
+		/* No decision: warn, then fall through to the strategies below. */
+		drbd_warn(device, "Discard younger/older primary did not find a decision\n"
+		     "Using discard-least-changes instead\n");
+	case ASB_DISCARD_ZERO_CHG:	/* also entered by fall-through from above */
+		if (ch_peer == 0 && ch_self == 0) {
+			rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
+				? -1 : 1;
+			break;
+		} else {
+			if (ch_peer == 0) { rv =  1; break; }
+			if (ch_self == 0) { rv = -1; break; }
+		}
+		if (after_sb_0p == ASB_DISCARD_ZERO_CHG)	/* only stop if this was the configured policy */
+			break;
+	case ASB_DISCARD_LEAST_CHG:	/* also entered by fall-through from above */
+		if	(ch_self < ch_peer)
+			rv = -1;
+		else if (ch_self > ch_peer)
+			rv =  1;
+		else /* ( ch_self == ch_peer ) */
+		     /* Well, then use something else. */
+			rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
+				? -1 : 1;
+		break;
+	case ASB_DISCARD_LOCAL:
+		rv = -1;
+		break;
+	case ASB_DISCARD_REMOTE:
+		rv =  1;
+	}
+
+	return rv;	/* drbd_sync_handshake(): <0 sync from peer, >0 sync from this node */
+}
+
+/**
+ * drbd_asb_recover_1p  -  Split-brain recovery policy with one remaining primary
+ */
+static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local)
+{
+	struct drbd_device *device = peer_device->device;
+	enum drbd_after_sb_p policy;
+	enum drbd_state_rv demote_rv;
+	int verdict, result = -100;	/* -100 is returned when nothing below decides */
+
+	rcu_read_lock();
+	policy = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p;
+	rcu_read_unlock();
+	switch (policy) {
+	case ASB_DISCONNECT:
+		break;
+	case ASB_DISCARD_SECONDARY:
+		return device->state.role == R_PRIMARY ? 1 : -1;
+	case ASB_VIOLENTLY:
+		result = drbd_asb_recover_0p(peer_device);
+		break;
+	case ASB_CONSENSUS:
+		/* take the 0p verdict only when it matches our current role */
+		verdict = drbd_asb_recover_0p(peer_device);
+		if ((verdict == -1 && device->state.role == R_SECONDARY) ||
+		    (verdict == 1 && device->state.role == R_PRIMARY))
+			result = verdict;
+		break;
+	case ASB_CALL_HELPER:
+		verdict = drbd_asb_recover_0p(peer_device);
+		if (verdict == -1 && device->state.role == R_PRIMARY) {
+			/* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
+			 * we might be here in C_WF_REPORT_PARAMS which is transient.
+			 * we do not need to wait for the after state change work either. */
+			demote_rv = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
+			if (demote_rv != SS_SUCCESS) {
+				drbd_khelper(device, "pri-lost-after-sb");
+				break;
+			}
+			drbd_warn(device, "Successfully gave up primary role.\n");
+		}
+		result = verdict;
+		break;
+	/* the remaining policies are rejected by this function */
+	case ASB_DISCARD_YOUNGER_PRI:
+	case ASB_DISCARD_OLDER_PRI:
+	case ASB_DISCARD_LEAST_CHG:
+	case ASB_DISCARD_LOCAL:
+	case ASB_DISCARD_REMOTE:
+	case ASB_DISCARD_ZERO_CHG:
+		drbd_err(device, "Configuration error.\n");
+		break;
+	}
+
+	return result;
+}
+
+/**
+ * drbd_asb_recover_2p  -  Split-brain recovery policy with two remaining primaries
+ */
+static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local)
+{
+	struct drbd_device *device = peer_device->device;
+	enum drbd_after_sb_p policy;
+	enum drbd_state_rv demote_rv;
+	int verdict, result = -100;	/* -100 is returned when nothing below decides */
+
+	rcu_read_lock();
+	policy = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p;
+	rcu_read_unlock();
+	switch (policy) {
+	case ASB_DISCONNECT:
+		break;
+	case ASB_VIOLENTLY:
+		result = drbd_asb_recover_0p(peer_device);
+		break;
+	case ASB_CALL_HELPER:
+		verdict = drbd_asb_recover_0p(peer_device);
+		if (verdict == -1) {
+			/* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
+			 * we might be here in C_WF_REPORT_PARAMS which is transient.
+			 * we do not need to wait for the after state change work either. */
+			demote_rv = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
+			if (demote_rv != SS_SUCCESS) {
+				drbd_khelper(device, "pri-lost-after-sb");
+				break;
+			}
+			drbd_warn(device, "Successfully gave up primary role.\n");
+		}
+		result = verdict;
+		break;
+	/* the remaining policies are rejected by this function */
+	case ASB_DISCARD_YOUNGER_PRI:
+	case ASB_DISCARD_OLDER_PRI:
+	case ASB_DISCARD_LEAST_CHG:
+	case ASB_DISCARD_LOCAL:
+	case ASB_DISCARD_REMOTE:
+	case ASB_CONSENSUS:
+	case ASB_DISCARD_SECONDARY:
+	case ASB_DISCARD_ZERO_CHG:
+		drbd_err(device, "Configuration error.\n");
+		break;
+	}
+
+	return result;
+}
+
+/* Log four UUID slots of @uuid plus @bits and @flags, prefixed with @text. */
+static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
+			   u64 bits, u64 flags)
+{
+	if (uuid) {
+		drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
+			  text,
+			  (unsigned long long)uuid[UI_CURRENT],
+			  (unsigned long long)uuid[UI_BITMAP],
+			  (unsigned long long)uuid[UI_HISTORY_START],
+			  (unsigned long long)uuid[UI_HISTORY_END],
+			  (unsigned long long)bits, (unsigned long long)flags);
+	} else {
+		drbd_info(device, "%s uuid info vanished while I was looking!\n", text);
+	}
+}
+
+/* Result codes of drbd_uuid_compare(); *rule_nr reports which rule decided:
+  100	after split brain try auto recover
+    2	C_SYNC_SOURCE set BitMap
+    1	C_SYNC_SOURCE use BitMap
+    0	no Sync
+   -1	C_SYNC_TARGET use BitMap
+   -2	C_SYNC_TARGET set BitMap
+ -100	after split brain, disconnect
+-1000	unrelated data
+-1091   requires proto 91
+-1096   requires proto 96
+ */
+static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_hold(local)
+{
+	u64 self, peer;
+	int i, j;
+
+	self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);	/* UUIDs are compared with bit 0 masked */
+	peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
+
+	*rule_nr = 10;	/* both current UUIDs are UUID_JUST_CREATED */
+	if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
+		return 0;
+
+	*rule_nr = 20;	/* ours is just-created or 0, the peer's is not just-created */
+	if ((self == UUID_JUST_CREATED || self == (u64)0) &&
+	     peer != UUID_JUST_CREATED)
+		return -2;
+
+	*rule_nr = 30;	/* mirror image of rule 20 */
+	if (self != UUID_JUST_CREATED &&
+	    (peer == UUID_JUST_CREATED || peer == (u64)0))
+		return 2;
+
+	if (self == peer) {	/* identical current UUIDs: rules 34..40 */
+		int rct, dc; /* roles at crash time */
+
+		if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {
+
+			if (first_peer_device(device)->connection->agreed_pro_version < 91)
+				return -1091;
+
+			if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
+			    (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
+				drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n");
+				drbd_uuid_move_history(device);
+				device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
+				device->ldev->md.uuid[UI_BITMAP] = 0;
+
+				drbd_uuid_dump(device, "self", device->ldev->md.uuid,
+					       device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
+				*rule_nr = 34;
+			} else {
+				drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n");
+				*rule_nr = 36;
+			}
+
+			return 1;
+		}
+
+		if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {
+
+			if (first_peer_device(device)->connection->agreed_pro_version < 91)
+				return -1091;
+
+			if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
+			    (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
+				drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
+
+				device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START];
+				device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP];
+				device->p_uuid[UI_BITMAP] = 0UL;
+
+				drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
+				*rule_nr = 35;
+			} else {
+				drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n");
+				*rule_nr = 37;
+			}
+
+			return -1;
+		}
+
+		/* Common power [off|failure] */
+		rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) +
+			(device->p_uuid[UI_FLAGS] & 2);
+		/* lowest bit is set when we were primary,
+		 * next bit (weight 2) is set when peer was primary */
+		*rule_nr = 40;
+
+		switch (rct) {
+		case 0: /* !self_pri && !peer_pri */ return 0;
+		case 1: /*  self_pri && !peer_pri */ return 1;
+		case 2: /* !self_pri &&  peer_pri */ return -1;
+		case 3: /*  self_pri &&  peer_pri */
+			dc = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags);
+			return dc ? -1 : 1;
+		}
+	}
+
+	*rule_nr = 50;	/* our current UUID equals the peer's bitmap UUID */
+	peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
+	if (self == peer)
+		return -1;
+
+	*rule_nr = 51;	/* our current UUID equals the peer's newest history UUID */
+	peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
+	if (self == peer) {
+		if (first_peer_device(device)->connection->agreed_pro_version < 96 ?
+		    (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
+		    (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
+		    peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
+			/* The last P_SYNC_UUID did not get through. Undo the last start of
+			   resync as sync source modifications of the peer's UUIDs. */
+
+			if (first_peer_device(device)->connection->agreed_pro_version < 91)
+				return -1091;
+
+			device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
+			device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1];
+
+			drbd_info(device, "Lost last syncUUID packet, corrected:\n");
+			drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
+
+			return -1;
+		}
+	}
+
+	*rule_nr = 60;	/* our current UUID appears in the peer's history */
+	self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
+	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+		peer = device->p_uuid[i] & ~((u64)1);
+		if (self == peer)
+			return -2;
+	}
+
+	*rule_nr = 70;	/* our bitmap UUID equals the peer's current UUID */
+	self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+	peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
+	if (self == peer)
+		return 1;
+
+	*rule_nr = 71;	/* our newest history UUID equals the peer's current UUID */
+	self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
+	if (self == peer) {
+		if (first_peer_device(device)->connection->agreed_pro_version < 96 ?
+		    (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
+		    (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
+		    self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
+			/* The last P_SYNC_UUID did not get through. Undo the last start of
+			   resync as sync source modifications of our UUIDs. */
+
+			if (first_peer_device(device)->connection->agreed_pro_version < 91)
+				return -1091;
+
+			__drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
+			__drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]);
+
+			drbd_info(device, "Last syncUUID did not get through, corrected:\n");
+			drbd_uuid_dump(device, "self", device->ldev->md.uuid,
+				       device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
+
+			return 1;
+		}
+	}
+
+
+	*rule_nr = 80;	/* the peer's current UUID appears in our history */
+	peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
+	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+		self = device->ldev->md.uuid[i] & ~((u64)1);
+		if (self == peer)
+			return 2;
+	}
+
+	*rule_nr = 90;	/* both bitmap UUIDs are equal and non-zero */
+	self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+	peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
+	if (self == peer && self != ((u64)0))
+		return 100;
+
+	*rule_nr = 100;	/* any pair of history UUIDs matches */
+	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+		self = device->ldev->md.uuid[i] & ~((u64)1);
+		for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
+			peer = device->p_uuid[j] & ~((u64)1);
+			if (self == peer)
+				return -100;
+		}
+	}
+
+	return -1000;	/* no rule matched; *rule_nr is left at 100 */
+}
+
+/* drbd_sync_handshake() returns the new conn state on success, or
+   C_MASK on failure (a tentative / dry-run connect also returns C_MASK).
+ */
+static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
+					   enum drbd_role peer_role,
+					   enum drbd_disk_state peer_disk) __must_hold(local)
+{
+	struct drbd_device *device = peer_device->device;
+	enum drbd_conns rv = C_MASK;
+	enum drbd_disk_state mydisk;
+	struct net_conf *nc;
+	int hg, rule_nr, rr_conflict, tentative;
+
+	mydisk = device->state.disk;
+	if (mydisk == D_NEGOTIATING)
+		mydisk = device->new_state_tmp.disk;
+
+	drbd_info(device, "drbd_sync_handshake:\n");
+
+	spin_lock_irq(&device->ldev->md.uuid_lock);
+	drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0);
+	drbd_uuid_dump(device, "peer", device->p_uuid,
+		       device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
+
+	hg = drbd_uuid_compare(device, &rule_nr);
+	spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+	drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
+
+	if (hg == -1000) {
+		drbd_alert(device, "Unrelated data, aborting!\n");
+		return C_MASK;
+	}
+	if (hg < -1000) {	/* -1091 / -1096: encodes the required protocol version */
+		drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
+		return C_MASK;
+	}
+
+	if    ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
+	    (peer_disk == D_INCONSISTENT && mydisk    > D_INCONSISTENT)) {
+		int f = (hg == -100) || abs(hg) == 2;
+		hg = mydisk > D_INCONSISTENT ? 1 : -1;
+		if (f)
+			hg = hg*2;	/* magnitude 2 requests a full sync, see abs(hg) >= 2 below */
+		drbd_info(device, "Becoming sync %s due to disk states.\n",
+		     hg > 0 ? "source" : "target");
+	}
+
+	if (abs(hg) == 100)
+		drbd_khelper(device, "initial-split-brain");
+
+	rcu_read_lock();
+	nc = rcu_dereference(peer_device->connection->net_conf);
+
+	if (hg == 100 || (hg == -100 && nc->always_asbp)) {
+		int pcount = (device->state.role == R_PRIMARY)
+			   + (peer_role == R_PRIMARY);
+		int forced = (hg == -100);
+
+		switch (pcount) {	/* pick the policy by the number of primaries */
+		case 0:
+			hg = drbd_asb_recover_0p(peer_device);
+			break;
+		case 1:
+			hg = drbd_asb_recover_1p(peer_device);
+			break;
+		case 2:
+			hg = drbd_asb_recover_2p(peer_device);
+			break;
+		}
+		if (abs(hg) < 100) {
+			drbd_warn(device, "Split-Brain detected, %d primaries, "
+			     "automatically solved. Sync from %s node\n",
+			     pcount, (hg < 0) ? "peer" : "this");
+			if (forced) {
+				drbd_warn(device, "Doing a full sync, since"
+				     " UUIDs where ambiguous.\n");
+				hg = hg*2;
+			}
+		}
+	}
+
+	if (hg == -100) {	/* still undecided: manual resolution needs exactly one side to discard */
+		if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1))
+			hg = -1;
+		if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1))
+			hg = 1;
+
+		if (abs(hg) < 100)
+			drbd_warn(device, "Split-Brain detected, manually solved. "
+			     "Sync from %s node\n",
+			     (hg < 0) ? "peer" : "this");
+	}
+	rr_conflict = nc->rr_conflict;	/* copy what is needed after rcu_read_unlock() */
+	tentative = nc->tentative;
+	rcu_read_unlock();
+
+	if (hg == -100) {
+		/* FIXME this log message is not correct if we end up here
+		 * after an attempted attach on a diskless node.
+		 * We just refuse to attach -- well, we drop the "connection"
+		 * to that disk, in a way... */
+		drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n");
+		drbd_khelper(device, "split-brain");
+		return C_MASK;
+	}
+
+	if (hg > 0 && mydisk <= D_INCONSISTENT) {
+		drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n");
+		return C_MASK;
+	}
+
+	if (hg < 0 && /* by intention we do not use mydisk here. */
+	    device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) {
+		switch (rr_conflict) {
+		case ASB_CALL_HELPER:
+			drbd_khelper(device, "pri-lost");
+			/* fall through */
+		case ASB_DISCONNECT:
+			drbd_err(device, "I shall become SyncTarget, but I am primary!\n");
+			return C_MASK;
+		case ASB_VIOLENTLY:
+			drbd_warn(device, "Becoming SyncTarget, violating the stable-data "
+			     "assumption\n");
+		}
+	}
+
+	if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) {
+		if (hg == 0)
+			drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n");
+		else
+			drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.\n",
+				 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
+				 abs(hg) >= 2 ? "full" : "bit-map based");
+		return C_MASK;
+	}
+
+	if (abs(hg) >= 2) {
+		drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
+		if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
+					BM_LOCKED_SET_ALLOWED))
+			return C_MASK;
+	}
+
+	if (hg > 0) { /* become sync source. */
+		rv = C_WF_BITMAP_S;
+	} else if (hg < 0) { /* become sync target */
+		rv = C_WF_BITMAP_T;
+	} else {
+		rv = C_CONNECTED;
+		if (drbd_bm_total_weight(device)) {
+			drbd_info(device, "No resync, but %lu bits in bitmap!\n",
+			     drbd_bm_total_weight(device));
+		}
+	}
+
+	return rv;
+}
+
+/* Translate the peer's after-sb policy into our view: "local" and "remote" swap. */
+static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
+{
+	switch (peer) {
+	case ASB_DISCARD_REMOTE:
+		return ASB_DISCARD_LOCAL;
+	case ASB_DISCARD_LOCAL:
+		return ASB_DISCARD_REMOTE;
+	default:
+		/* every other policy has to be the same value on both sides */
+		return peer;
+	}
+}
+
+/* receive_protocol() - unless the packet is a P_PROTOCOL_UPDATE, verify that the peer's settings
+ * are compatible with ours; then install a net_conf and integrity hash matching the peer's values. */
+static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_protocol *p = pi->data;
+	enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
+	int p_proto, p_discard_my_data, p_two_primaries, cf;
+	struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
+	char integrity_alg[SHARED_SECRET_MAX] = "";
+	struct crypto_hash *peer_integrity_tfm = NULL;
+	void *int_dig_in = NULL, *int_dig_vv = NULL;
+
+	p_proto		= be32_to_cpu(p->protocol);
+	p_after_sb_0p	= be32_to_cpu(p->after_sb_0p);
+	p_after_sb_1p	= be32_to_cpu(p->after_sb_1p);
+	p_after_sb_2p	= be32_to_cpu(p->after_sb_2p);
+	p_two_primaries = be32_to_cpu(p->two_primaries);
+	cf		= be32_to_cpu(p->conn_flags);
+	p_discard_my_data = cf & CF_DISCARD_MY_DATA;
+
+	if (connection->agreed_pro_version >= 87) {
+		int err;
+
+		/* the packet payload is the peer's integrity algorithm name */
+		if (pi->size > sizeof(integrity_alg))
+			return -EIO;
+		err = drbd_recv_all(connection, integrity_alg, pi->size);
+		if (err)
+			return err;
+		integrity_alg[SHARED_SECRET_MAX - 1] = 0;	/* force NUL termination */
+	}
+
+	if (pi->cmd != P_PROTOCOL_UPDATE) {
+		clear_bit(CONN_DRY_RUN, &connection->flags);
+
+		if (cf & CF_DRY_RUN)
+			set_bit(CONN_DRY_RUN, &connection->flags);
+
+		rcu_read_lock();
+		nc = rcu_dereference(connection->net_conf);
+
+		if (p_proto != nc->wire_protocol) {
+			drbd_err(connection, "incompatible %s settings\n", "protocol");
+			goto disconnect_rcu_unlock;
+		}
+
+		if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
+			drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri");
+			goto disconnect_rcu_unlock;
+		}
+
+		if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
+			drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri");
+			goto disconnect_rcu_unlock;
+		}
+
+		if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
+			drbd_err(connection, "incompatible %s settings\n", "after-sb-2pri");
+			goto disconnect_rcu_unlock;
+		}
+
+		if (p_discard_my_data && nc->discard_my_data) {
+			drbd_err(connection, "incompatible %s settings\n", "discard-my-data");
+			goto disconnect_rcu_unlock;
+		}
+
+		if (p_two_primaries != nc->two_primaries) {
+			drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries");
+			goto disconnect_rcu_unlock;
+		}
+
+		if (strcmp(integrity_alg, nc->integrity_alg)) {
+			drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg");
+			goto disconnect_rcu_unlock;
+		}
+
+		rcu_read_unlock();
+	}
+
+	if (integrity_alg[0]) {
+		int hash_size;
+
+		/*
+		 * We can only change the peer data integrity algorithm
+		 * here.  Changing our own data integrity algorithm
+		 * requires that we send a P_PROTOCOL_UPDATE packet at
+		 * the same time; otherwise, the peer has no way to
+		 * tell between which packets the algorithm should
+		 * change.
+		 */
+		peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(peer_integrity_tfm)) {
+			peer_integrity_tfm = NULL;	/* never hand an ERR_PTR to crypto_free_hash() */
+			drbd_err(connection, "peer data-integrity-alg %s not supported\n", integrity_alg);
+			goto disconnect;
+		}
+
+		hash_size = crypto_hash_digestsize(peer_integrity_tfm);
+		int_dig_in = kmalloc(hash_size, GFP_KERNEL);
+		int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
+		if (!(int_dig_in && int_dig_vv)) {
+			drbd_err(connection, "Allocation of buffers for data integrity checking failed\n");
+			goto disconnect;
+		}
+	}
+
+	new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
+	if (!new_net_conf) {
+		drbd_err(connection, "Allocation of new net_conf failed\n");
+		goto disconnect;
+	}
+
+	mutex_lock(&connection->data.mutex);
+	mutex_lock(&connection->resource->conf_update);
+	old_net_conf = connection->net_conf;
+	*new_net_conf = *old_net_conf;
+
+	new_net_conf->wire_protocol = p_proto;
+	new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
+	new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
+	new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
+	new_net_conf->two_primaries = p_two_primaries;
+
+	rcu_assign_pointer(connection->net_conf, new_net_conf);
+	mutex_unlock(&connection->resource->conf_update);
+	mutex_unlock(&connection->data.mutex);
+
+	/* swap in the new integrity hash and digest buffers (all NULL if no algorithm was named) */
+	crypto_free_hash(connection->peer_integrity_tfm);
+	kfree(connection->int_dig_in);
+	kfree(connection->int_dig_vv);
+	connection->peer_integrity_tfm = peer_integrity_tfm;
+	connection->int_dig_in = int_dig_in;
+	connection->int_dig_vv = int_dig_vv;
+
+	if (strcmp(old_net_conf->integrity_alg, integrity_alg))
+		drbd_info(connection, "peer data-integrity-alg: %s\n", integrity_alg[0] ? integrity_alg : "(none)");
+	synchronize_rcu();	/* wait for RCU readers before freeing the old net_conf */
+	kfree(old_net_conf);
+	return 0;
+
+disconnect_rcu_unlock:
+	rcu_read_unlock();
+disconnect:
+	crypto_free_hash(peer_integrity_tfm);
+	kfree(int_dig_in);
+	kfree(int_dig_vv);
+	conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+	return -EIO;
+}
+
+/*
+ * Allocate the hash transform named @alg for the feature called @name.
+ * An empty @alg yields NULL.  On failure the error is logged and the
+ * ERR_PTR() value from crypto_alloc_hash() is handed to the caller.
+ */
+static
+struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
+						  const char *alg, const char *name)
+{
+	struct crypto_hash *hash;
+
+	if (alg[0] == '\0')
+		return NULL;
+
+	hash = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
+	if (!IS_ERR(hash))
+		return hash;
+
+	drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n",
+		 alg, name, PTR_ERR(hash));
+	return hash;
+}
+
+/* Receive and throw away the pi->size bytes that remain of the current packet. */
+static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi)
+{
+	void *scratch = connection->data.rbuf;
+	int remaining;
+
+	for (remaining = pi->size; remaining; ) {
+		int got = drbd_recv(connection, scratch,
+				    min_t(int, remaining, DRBD_SOCKET_BUFFER_SIZE));
+
+		if (got < 0)
+			return got;
+		if (got == 0)
+			return -EIO;	/* drbd_recv() returned 0 with data still outstanding */
+		remaining -= got;
+	}
+
+	return 0;
+}
+
+/*
+ * config_unknown_volume  -  device configuration command for unknown volume
+ *
+ * A device that gets added to an existing connection exists on one node
+ * before the other.  The node that has it first sends configuration commands
+ * the peer cannot match to a volume yet; the peer warns and discards them.
+ * When the device is later added on the peer, the same commands are sent in
+ * the opposite direction.  A misconfigured drbd can also lead here.
+ */
+static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi)
+{
+	const char *cmd = cmdname(pi->cmd);
+
+	drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n",
+		  cmd, pi->vnr);
+	return ignore_remaining_packet(connection, pi);
+}
+
+/* receive_SyncParam() - take over the settings the peer sent: resync_rate, the verify/csums
+ * algorithms (apv >= 88 / 89) and, for apv > 94, the c_* values that size the rs_plan_s fifo. */
+static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_rs_param_95 *p;
+	unsigned int header_size, data_size, exp_max_sz;
+	struct crypto_hash *verify_tfm = NULL;
+	struct crypto_hash *csums_tfm = NULL;
+	struct net_conf *old_net_conf, *new_net_conf = NULL;
+	struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
+	const int apv = connection->agreed_pro_version;
+	struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
+	int fifo_size = 0;
+	int err;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return config_unknown_volume(connection, pi);
+	device = peer_device->device;
+
+	exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
+		    : apv == 88 ? sizeof(struct p_rs_param)
+					+ SHARED_SECRET_MAX
+		    : apv <= 94 ? sizeof(struct p_rs_param_89)
+		    : /* apv >= 95 */ sizeof(struct p_rs_param_95);
+
+	if (pi->size > exp_max_sz) {
+		drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n",
+		    pi->size, exp_max_sz);
+		return -EIO;
+	}
+
+	if (apv <= 88) {
+		header_size = sizeof(struct p_rs_param);
+		data_size = pi->size - header_size;
+	} else if (apv <= 94) {
+		header_size = sizeof(struct p_rs_param_89);
+		data_size = pi->size - header_size;
+		D_ASSERT(device, data_size == 0);
+	} else {
+		header_size = sizeof(struct p_rs_param_95);
+		data_size = pi->size - header_size;
+		D_ASSERT(device, data_size == 0);
+	}
+
+	/* initialize verify_alg and csums_alg */
+	p = pi->data;
+	memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+
+	err = drbd_recv_all(peer_device->connection, p, header_size);
+	if (err)
+		return err;
+
+	mutex_lock(&connection->resource->conf_update);
+	old_net_conf = peer_device->connection->net_conf;
+	if (get_ldev(device)) {
+		/* this ldev reference is dropped once on every exit path below */
+		new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+		if (!new_disk_conf) {
+			put_ldev(device);
+			mutex_unlock(&connection->resource->conf_update);
+			drbd_err(device, "Allocation of new disk_conf failed\n");
+			return -ENOMEM;
+		}
+
+		old_disk_conf = device->ldev->disk_conf;
+		*new_disk_conf = *old_disk_conf;
+
+		new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
+	}
+
+	if (apv >= 88) {
+		if (apv == 88) {
+			if (data_size > SHARED_SECRET_MAX || data_size == 0) {
+				drbd_err(device, "verify-alg of wrong size, "
+					"peer wants %u, accepting only up to %u byte\n",
+					data_size, SHARED_SECRET_MAX);
+				err = -EIO;
+				goto reconnect;
+			}
+
+			err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size);
+			if (err)
+				goto reconnect;
+			/* we expect NUL terminated string */
+			/* but just in case someone tries to be evil */
+			D_ASSERT(device, p->verify_alg[data_size-1] == 0);
+			p->verify_alg[data_size-1] = 0;
+
+		} else /* apv >= 89 */ {
+			/* we still expect NUL terminated strings */
+			/* but just in case someone tries to be evil */
+			D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0);
+			D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0);
+			p->verify_alg[SHARED_SECRET_MAX-1] = 0;
+			p->csums_alg[SHARED_SECRET_MAX-1] = 0;
+		}
+
+		if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
+			if (device->state.conn == C_WF_REPORT_PARAMS) {
+				drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
+				    old_net_conf->verify_alg, p->verify_alg);
+				goto disconnect;
+			}
+			verify_tfm = drbd_crypto_alloc_digest_safe(device,
+					p->verify_alg, "verify-alg");
+			if (IS_ERR(verify_tfm)) {
+				verify_tfm = NULL;
+				goto disconnect;
+			}
+		}
+
+		if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
+			if (device->state.conn == C_WF_REPORT_PARAMS) {
+				drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
+				    old_net_conf->csums_alg, p->csums_alg);
+				goto disconnect;
+			}
+			csums_tfm = drbd_crypto_alloc_digest_safe(device,
+					p->csums_alg, "csums-alg");
+			if (IS_ERR(csums_tfm)) {
+				csums_tfm = NULL;
+				goto disconnect;
+			}
+		}
+
+		if (apv > 94 && new_disk_conf) {
+			new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
+			new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
+			new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
+			new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
+
+			fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+			if (fifo_size != device->rs_plan_s->size) {
+				new_plan = fifo_alloc(fifo_size);
+				if (!new_plan) {
+					drbd_err(device, "kmalloc of fifo_buffer failed\n");
+					goto disconnect;	/* which does the one put_ldev() */
+				}
+			}
+		}
+
+		if (verify_tfm || csums_tfm) {
+			new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
+			if (!new_net_conf) {
+				drbd_err(device, "Allocation of new net_conf failed\n");
+				goto disconnect;
+			}
+
+			*new_net_conf = *old_net_conf;
+
+			if (verify_tfm) {
+				strcpy(new_net_conf->verify_alg, p->verify_alg);
+				new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
+				crypto_free_hash(peer_device->connection->verify_tfm);
+				peer_device->connection->verify_tfm = verify_tfm;
+				drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg);
+			}
+			if (csums_tfm) {
+				strcpy(new_net_conf->csums_alg, p->csums_alg);
+				new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
+				crypto_free_hash(peer_device->connection->csums_tfm);
+				peer_device->connection->csums_tfm = csums_tfm;
+				drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg);
+			}
+			rcu_assign_pointer(connection->net_conf, new_net_conf);
+		}
+	}
+
+	if (new_disk_conf) {
+		rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+		put_ldev(device);
+	}
+
+	if (new_plan) {
+		old_plan = device->rs_plan_s;
+		rcu_assign_pointer(device->rs_plan_s, new_plan);
+	}
+
+	mutex_unlock(&connection->resource->conf_update);
+	synchronize_rcu();	/* wait for RCU readers before freeing what was replaced */
+	if (new_net_conf)
+		kfree(old_net_conf);
+	kfree(old_disk_conf);
+	kfree(old_plan);
+
+	return 0;
+
+reconnect:
+	if (new_disk_conf) {
+		put_ldev(device);
+		kfree(new_disk_conf);
+	}
+	mutex_unlock(&connection->resource->conf_update);
+	return -EIO;
+
+disconnect:
+	kfree(new_plan);
+	if (new_disk_conf) {
+		put_ldev(device);
+		kfree(new_disk_conf);
+	}
+	mutex_unlock(&connection->resource->conf_update);
+	/* drop transforms that were allocated above but never installed */
+	crypto_free_hash(csums_tfm);
+	/* either pointer may still be NULL at this point */
+	crypto_free_hash(verify_tfm);
+	conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+	return -EIO;
+}
+
+/* Warn when two non-zero sizes differ by more than 12.5% (one eighth) of either value. */
+static void warn_if_differ_considerably(struct drbd_device *device,
+	const char *s, sector_t a, sector_t b)
+{
+	sector_t delta = (a > b) ? (a - b) : (b - a);
+
+	if (!a || !b)
+		return;
+	if (delta > (a >> 3) || delta > (b >> 3))
+		drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s,
+			  (unsigned long long)a, (unsigned long long)b);
+}
+
+static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_sizes *p = pi->data;
+	enum determine_dev_size dd = DS_UNCHANGED;
+	sector_t p_size, p_usize, my_usize;
+	int ldsc = 0; /* local disk size changed */
+	enum dds_flags ddsf;
+	/* P_SIZES: record the peer's sizes, adopt the agreed u_size, then re-evaluate our own size. */
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return config_unknown_volume(connection, pi);
+	device = peer_device->device;
+
+	p_size = be64_to_cpu(p->d_size);
+	p_usize = be64_to_cpu(p->u_size);
+
+	/* just store the peer's disk size for now.
+	 * we still need to figure out whether we accept that. */
+	device->p_size = p_size;
+
+	if (get_ldev(device)) {
+		rcu_read_lock();
+		my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
+		rcu_read_unlock();
+
+		warn_if_differ_considerably(device, "lower level device sizes",
+			   p_size, drbd_get_max_capacity(device->ldev));
+		warn_if_differ_considerably(device, "user requested size",
+					    p_usize, my_usize);
+
+		/* if this is the first connect, or an otherwise expected
+		 * param exchange, choose the minimum */
+		if (device->state.conn == C_WF_REPORT_PARAMS)
+			p_usize = min_not_zero(my_usize, p_usize);
+
+		/* Never shrink a device with usable data during connect.
+		   But allow online shrinking if we are connected. */
+		if (drbd_new_dev_size(device, device->ldev, p_usize, 0) <
+		    drbd_get_capacity(device->this_bdev) &&
+		    device->state.disk >= D_OUTDATED &&
+		    device->state.conn < C_CONNECTED) {
+			drbd_err(device, "The peer's disk size is too small!\n");
+			conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+			put_ldev(device);
+			return -EIO;
+		}
+
+		if (my_usize != p_usize) {
+			struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
+
+			new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+			if (!new_disk_conf) {
+				drbd_err(device, "Allocation of new disk_conf failed\n");
+				put_ldev(device);
+				return -ENOMEM;
+			}
+
+			mutex_lock(&connection->resource->conf_update);
+			old_disk_conf = device->ldev->disk_conf;
+			*new_disk_conf = *old_disk_conf;
+			new_disk_conf->disk_size = p_usize;
+
+			rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+			mutex_unlock(&connection->resource->conf_update);
+			synchronize_rcu();	/* let readers of the old disk_conf finish before freeing it */
+			kfree(old_disk_conf);
+
+			drbd_info(device, "Peer sets u_size to %llu sectors (old: %llu)\n",
+				 (unsigned long long)p_usize, (unsigned long long)my_usize);
+		}
+
+		put_ldev(device);
+	}
+
+	device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
+	drbd_reconsider_max_bio_size(device);
+	/* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size().
+	   In case we cleared the QUEUE_FLAG_DISCARD from our queue in
+	   drbd_reconsider_max_bio_size(), we can be sure that after
+	   drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
+
+	ddsf = be16_to_cpu(p->dds_flags);
+	if (get_ldev(device)) {
+		dd = drbd_determine_dev_size(device, ddsf, NULL);
+		put_ldev(device);
+		if (dd == DS_ERROR)
+			return -EIO;
+		drbd_md_sync(device);
+	} else {
+		/* I am diskless, need to accept the peer's size. */
+		drbd_set_my_capacity(device, p_size);
+	}
+
+	if (get_ldev(device)) {
+		if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
+			device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
+			ldsc = 1;
+		}
+
+		put_ldev(device);
+	}
+
+	if (device->state.conn > C_WF_REPORT_PARAMS) {
+		if (be64_to_cpu(p->c_size) !=
+		    drbd_get_capacity(device->this_bdev) || ldsc) {
+			/* we have different sizes, probably peer
+			 * needs to know my new size... */
+			drbd_send_sizes(peer_device, 0, ddsf);
+		}
+		if (test_and_clear_bit(RESIZE_PENDING, &device->flags) ||
+		    (dd == DS_GREW && device->state.conn == C_CONNECTED)) {
+			if (device->state.pdsk >= D_INCONSISTENT &&
+			    device->state.disk >= D_INCONSISTENT) {
+				if (ddsf & DDSF_NO_RESYNC)
+					drbd_info(device, "Resync of new storage suppressed with --assume-clean\n");
+				else
+					resync_after_online_grow(device);
+			} else
+				set_bit(RESYNC_AFTER_NEG, &device->flags);
+		}
+	}
+
+	return 0;
+}
+
+static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_uuids *p = pi->data;
+	u64 *p_uuid;
+	int i, updated_uuids = 0;
+	/* P_UUIDS: keep a CPU-endian copy of the peer's UUID set in device->p_uuid. Returns 0 or -errno. */
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return config_unknown_volume(connection, pi);
+	device = peer_device->device;
+
+	p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
+	if (!p_uuid) {
+		drbd_err(device, "kmalloc of p_uuid failed\n");
+		return -ENOMEM;	/* must be non-zero: drbdd() takes 0 for success */
+	}
+
+	for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
+		p_uuid[i] = be64_to_cpu(p->uuid[i]);
+
+	kfree(device->p_uuid);	/* the device owns the new array from here on */
+	device->p_uuid = p_uuid;
+
+	if (device->state.conn < C_CONNECTED &&
+	    device->state.disk < D_INCONSISTENT &&
+	    device->state.role == R_PRIMARY &&
+	    (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
+		drbd_err(device, "Can only connect to data with current UUID=%016llX\n",
+		    (unsigned long long)device->ed_uuid);
+		conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+		return -EIO;
+	}
+
+	if (get_ldev(device)) {
+		int skip_initial_sync =
+			device->state.conn == C_CONNECTED &&
+			peer_device->connection->agreed_pro_version >= 90 &&
+			device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
+			(p_uuid[UI_FLAGS] & 8);
+		if (skip_initial_sync) {
+			drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n");
+			drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
+					"clear_n_write from receive_uuids",
+					BM_LOCKED_TEST_ALLOWED);
+			_drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]);
+			_drbd_uuid_set(device, UI_BITMAP, 0);
+			_drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+					CS_VERBOSE, NULL);
+			drbd_md_sync(device);
+			updated_uuids = 1;
+		}
+		put_ldev(device);
+	} else if (device->state.disk < D_INCONSISTENT &&
+		   device->state.role == R_PRIMARY) {
+		/* I am a diskless primary, the peer just created a new current UUID
+		   for me. */
+		updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
+	}
+
+	/* Before we test for the disk state, we should wait until an eventually
+	   ongoing cluster wide state change is finished. That is important if
+	   we are primary and are detaching from our disk. We need to see the
+	   new disk state... */
+	mutex_lock(device->state_mutex);
+	mutex_unlock(device->state_mutex);
+	if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT)
+		updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
+
+	if (updated_uuids)
+		drbd_print_uuids(device, "receiver updated UUIDs to");
+
+	return 0;
+}
+
+/**
+ * convert_state() - Mirror a state word received from the peer into our own point of view
+ * @ps:		The state word exactly as the peer reported it.
+ */
+static union drbd_state convert_state(union drbd_state ps)
+{
+	/* Connection states as they read from the other end of the link;
+	 * entries not listed here are implicitly zero. */
+	static enum drbd_conns c_tab[] = {
+		[C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
+		[C_CONNECTED] = C_CONNECTED,
+
+		[C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
+		[C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
+		[C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
+		[C_VERIFY_S]       = C_VERIFY_T,
+		[C_MASK]   = C_MASK,
+	};
+	union drbd_state ms = { .i = ps.i };
+
+	/* what the peer calls "mine" is "the peer's" for us, and vice versa */
+	ms.role = ps.peer;
+	ms.peer = ps.role;
+	ms.disk = ps.pdsk;
+	ms.pdsk = ps.disk;
+	ms.peer_isp = (ps.aftr_isp | ps.user_isp);
+	ms.conn = c_tab[ps.conn];
+
+	return ms;
+}
+
+static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device = conn_peer_device(connection, pi->vnr);
+	struct p_req_state *req = pi->data;
+	union drbd_state req_mask, req_val;
+	struct drbd_device *device;
+	enum drbd_state_rv reply;
+
+	/* Serve the peer's request to change the state of one volume and report the outcome. */
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	/* With RESOLVE_CONFLICTS set and state_mutex currently held, turn the request down. */
+	if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) &&
+	    mutex_is_locked(device->state_mutex)) {
+		drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG);
+		return 0;
+	}
+
+	/* The wire carries the peer's point of view; flip mask and value to ours. */
+	req_mask.i = be32_to_cpu(req->mask);
+	req_val.i = be32_to_cpu(req->val);
+	req_mask = convert_state(req_mask);
+	req_val = convert_state(req_val);
+
+	reply = drbd_change_state(device, CS_VERBOSE, req_mask, req_val);
+	drbd_send_sr_reply(peer_device, reply);
+	drbd_md_sync(device);
+
+	return 0;
+}
+
+static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_req_state *req = pi->data;
+	union drbd_state req_mask, req_val;
+	enum drbd_state_rv reply;
+
+	/* Connection-level state change request from the peer: try it, then send the result back. */
+	if (test_bit(RESOLVE_CONFLICTS, &connection->flags) &&
+	    mutex_is_locked(&connection->cstate_mutex)) {
+		conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG);
+		return 0;
+	}
+
+	/* translate the peer's view of mask and value into ours */
+	req_mask.i = be32_to_cpu(req->mask);
+	req_val.i = be32_to_cpu(req->val);
+	req_mask = convert_state(req_mask);
+	req_val = convert_state(req_val);
+
+	reply = conn_request_state(connection, req_mask, req_val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
+	conn_send_sr_reply(connection, reply);
+	return 0;
+}
+
+static int receive_state(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_state *p = pi->data;
+	union drbd_state os, ns, peer_state;
+	enum drbd_disk_state real_peer_disk;
+	enum chg_state_flags cs_flags;
+	int rv;
+	/* P_STATE handler: fold the state the peer reports into ours; returns 0 or a negative errno. */
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return config_unknown_volume(connection, pi);
+	device = peer_device->device;
+
+	peer_state.i = be32_to_cpu(p->state);
+
+	real_peer_disk = peer_state.disk;
+	if (peer_state.disk == D_NEGOTIATING) {	/* NOTE(review): p_uuid is used unchecked here but NULL-checked further down - confirm it cannot be NULL */
+		real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
+		drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
+	}
+
+	spin_lock_irq(&device->resource->req_lock);
+ retry:	/* entered with req_lock held, also from the "goto retry" below */
+	os = ns = drbd_read_state(device);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	/* If some other part of the code (asender thread, timeout)
+	 * already decided to close the connection again,
+	 * we must not "re-establish" it here. */
+	if (os.conn <= C_TEAR_DOWN)
+		return -ECONNRESET;
+
+	/* If this is the "end of sync" confirmation, usually the peer disk
+	 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
+	 * set) resync started in PausedSyncT, or if the timing of pause-/
+	 * unpause-sync events has been "just right", the peer disk may
+	 * transition from D_CONSISTENT to D_UP_TO_DATE as well.
+	 */
+	if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
+	    real_peer_disk == D_UP_TO_DATE &&
+	    os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
+		/* If we are (becoming) SyncSource, but peer is still in sync
+		 * preparation, ignore its uptodate-ness to avoid flapping, it
+		 * will change to inconsistent once the peer reaches active
+		 * syncing states.
+		 * It may have changed syncer-paused flags, however, so we
+		 * cannot ignore this completely. */
+		if (peer_state.conn > C_CONNECTED &&
+		    peer_state.conn < C_SYNC_SOURCE)
+			real_peer_disk = D_INCONSISTENT;
+
+		/* if peer_state changes to connected at the same time,
+		 * it explicitly notifies us that it finished resync.
+		 * Maybe we should finish it up, too? */
+		else if (os.conn >= C_SYNC_SOURCE &&
+			 peer_state.conn == C_CONNECTED) {
+			if (drbd_bm_total_weight(device) <= device->rs_failed)
+				drbd_resync_finished(device);
+			return 0;
+		}
+	}
+
+	/* explicit verify finished notification, stop sector reached. */
+	if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
+	    peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
+		ov_out_of_sync_print(device);
+		drbd_resync_finished(device);
+		return 0;
+	}
+
+	/* peer says his disk is inconsistent, while we think it is uptodate,
+	 * and this happens while the peer still thinks we have a sync going on,
+	 * but we think we are already done with the sync.
+	 * We ignore this to avoid flapping pdsk.
+	 * This should not happen, if the peer is a recent version of drbd. */
+	if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
+	    os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
+		real_peer_disk = D_UP_TO_DATE;
+
+	if (ns.conn == C_WF_REPORT_PARAMS)
+		ns.conn = C_CONNECTED;
+
+	if (peer_state.conn == C_AHEAD)
+		ns.conn = C_BEHIND;
+
+	if (device->p_uuid && peer_state.disk >= D_NEGOTIATING &&
+	    get_ldev_if_state(device, D_NEGOTIATING)) {
+		int cr; /* consider resync */
+
+		/* if we established a new connection */
+		cr  = (os.conn < C_CONNECTED);
+		/* if we had an established connection
+		 * and one of the nodes newly attaches a disk */
+		cr |= (os.conn == C_CONNECTED &&
+		       (peer_state.disk == D_NEGOTIATING ||
+			os.disk == D_NEGOTIATING));
+		/* if we have both been inconsistent, and the peer has been
+		 * forced to be UpToDate with --overwrite-data */
+		cr |= test_bit(CONSIDER_RESYNC, &device->flags);
+		/* if we had been plain connected, and the admin requested to
+		 * start a sync by "invalidate" or "invalidate-remote" */
+		cr |= (os.conn == C_CONNECTED &&
+				(peer_state.conn >= C_STARTING_SYNC_S &&
+				 peer_state.conn <= C_WF_BITMAP_T));
+
+		if (cr)
+			ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk);
+
+		put_ldev(device);
+		if (ns.conn == C_MASK) {	/* C_MASK here means no usable connection state resulted */
+			ns.conn = C_CONNECTED;
+			if (device->state.disk == D_NEGOTIATING) {
+				drbd_force_state(device, NS(disk, D_FAILED));
+			} else if (peer_state.disk == D_NEGOTIATING) {
+				drbd_err(device, "Disk attach process on the peer node was aborted.\n");
+				peer_state.disk = D_DISKLESS;
+				real_peer_disk = D_DISKLESS;
+			} else {
+				if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags))
+					return -EIO;
+				D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS);
+				conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+				return -EIO;
+			}
+		}
+	}
+
+	spin_lock_irq(&device->resource->req_lock);
+	if (os.i != drbd_read_state(device).i)	/* state changed while unlocked: redo the evaluation */
+		goto retry;
+	clear_bit(CONSIDER_RESYNC, &device->flags);
+	ns.peer = peer_state.role;
+	ns.pdsk = real_peer_disk;
+	ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
+	if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
+		ns.disk = device->new_state_tmp.disk;
+	cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
+	if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
+	    test_bit(NEW_CUR_UUID, &device->flags)) {
+		/* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
+		   for temporal network outages! */
+		spin_unlock_irq(&device->resource->req_lock);
+		drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
+		tl_clear(peer_device->connection);
+		drbd_uuid_new_current(device);
+		clear_bit(NEW_CUR_UUID, &device->flags);
+		conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
+		return -EIO;
+	}
+	rv = _drbd_set_state(device, ns, cs_flags, NULL);
+	ns = drbd_read_state(device);	/* re-read: what follows works on the state actually in effect */
+	spin_unlock_irq(&device->resource->req_lock);
+
+	if (rv < SS_SUCCESS) {
+		conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+		return -EIO;
+	}
+
+	if (os.conn > C_WF_REPORT_PARAMS) {
+		if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
+		    peer_state.disk != D_NEGOTIATING ) {
+			/* we want resync, peer has not yet decided to sync... */
+			/* Nowadays only used when forcing a node into primary role and
+			   setting its disk to UpToDate with that */
+			drbd_send_uuids(peer_device);
+			drbd_send_current_state(peer_device);
+		}
+	}
+
+	clear_bit(DISCARD_MY_DATA, &device->flags);
+
+	drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */
+
+	return 0;
+}
+
+static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device = conn_peer_device(connection, pi->vnr);
+	struct p_rs_uuid *pkt = pi->data;
+	struct drbd_device *device;
+
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	/* Sleep until the device reaches a state in which the packet can be judged. */
+	wait_event(device->misc_wait,
+		   device->state.conn == C_WF_SYNC_UUID ||
+		   device->state.conn == C_BEHIND ||
+		   device->state.conn < C_CONNECTED ||
+		   device->state.disk < D_NEGOTIATING);
+
+	/* D_ASSERT(device,  device->state.conn == C_WF_SYNC_UUID ); */
+
+	if (!get_ldev_if_state(device, D_NEGOTIATING)) {
+		drbd_err(device, "Ignoring SyncUUID packet!\n");
+		return 0;
+	}
+
+	/* The _drbd_uuid_ variants are the right ones here: the current
+	   UUID must _not_ be rotated into the history. */
+	_drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(pkt->uuid));
+	_drbd_uuid_set(device, UI_BITMAP, 0UL);
+	drbd_print_uuids(device, "updated sync uuid");
+	drbd_start_resync(device, C_SYNC_TARGET);
+	put_ldev(device);
+
+	return 0;
+}
+
+/**
+ * receive_bitmap_plain - receive one packet of uncompressed bitmap words
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size,
+		     unsigned long *p, struct bm_xfer_ctx *c)
+{
+	unsigned int payload_max = DRBD_SOCKET_BUFFER_SIZE -
+				   drbd_header_size(peer_device->connection);
+	unsigned int words = min_t(size_t, payload_max / sizeof(*p),
+				   c->bm_words - c->word_offset);
+	unsigned int expected = words * sizeof(*p);
+	int err;
+
+	/* the packet must carry exactly the number of bytes we compute for this step */
+	if (expected != size) {
+		drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, expected, size);
+		return -EIO;
+	}
+	if (!expected)
+		return 0;
+	err = drbd_recv_all(peer_device->connection, p, expected);
+	if (err)
+		return err;
+	drbd_bm_merge_lel(peer_device->device, c->word_offset, words, p);
+
+	/* advance the transfer context; the bit offset never exceeds bm_bits */
+	c->word_offset += words;
+	c->bit_offset = c->word_offset * BITS_PER_LONG;
+	if (c->bit_offset > c->bm_bits)
+		c->bit_offset = c->bm_bits;
+	return 1;
+}
+
+static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
+{	/* the low four bits of the encoding field select the bitmap code */
+	return (enum drbd_bitmap_code)(p->encoding & 0xf);
+}
+
+static int dcbp_get_start(struct p_compressed_bm *p)
+{	/* 1 if bit 7 (0x80) of the encoding field is set, 0 otherwise */
+	return !!(p->encoding & 0x80);
+}
+
+static int dcbp_get_pad_bits(struct p_compressed_bm *p)
+{	/* bits 4..6 of the encoding field, as a value 0..7 */
+	return (p->encoding & 0x70) >> 4;
+}
+
+/**
+ * recv_bm_rle_bits - decode one packet of run-length (VLI) encoded bitmap data
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+recv_bm_rle_bits(struct drbd_peer_device *peer_device,
+		struct p_compressed_bm *p,
+		 struct bm_xfer_ctx *c,
+		 unsigned int len)
+{
+	struct bitstream bs;
+	u64 look_ahead;	/* bits fetched from the stream but not yet decoded */
+	u64 rl;	/* length of the run just decoded */
+	u64 tmp;
+	unsigned long s = c->bit_offset;	/* first bit of the current run */
+	unsigned long e;	/* last bit of the current run */
+	int toggle = dcbp_get_start(p);	/* non-zero: the current run is a run of set bits */
+	int have;	/* number of valid bits in look_ahead */
+	int bits;
+
+	bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
+
+	bits = bitstream_get_bits(&bs, &look_ahead, 64);
+	if (bits < 0)
+		return -EIO;
+
+	for (have = bits; have > 0; s += rl, toggle = !toggle) {	/* runs alternate between skipped and set */
+		bits = vli_decode_bits(&rl, look_ahead);
+		if (bits <= 0)
+			return -EIO;
+
+		if (toggle) {
+			e = s + rl -1;	/* NOTE(review): assumes rl != 0 and that s + rl does not wrap - confirm */
+			if (e >= c->bm_bits) {
+				drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
+				return -EIO;
+			}
+			_drbd_bm_set_bits(peer_device->device, s, e);
+		}
+
+		if (have < bits) {	/* the code word claims more bits than look_ahead holds */
+			drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
+				have, bits, look_ahead,
+				(unsigned int)(bs.cur.b - p->code),
+				(unsigned int)bs.buf_len);
+			return -EIO;
+		}
+		/* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
+		if (likely(bits < 64))
+			look_ahead >>= bits;
+		else
+			look_ahead = 0;
+		have -= bits;
+
+		bits = bitstream_get_bits(&bs, &tmp, 64 - have);	/* top look_ahead up again */
+		if (bits < 0)
+			return -EIO;
+		look_ahead |= tmp << have;
+		have += bits;
+	}
+
+	c->bit_offset = s;
+	bm_xfer_ctx_bit_to_word_offset(c);
+
+	return (s != c->bm_bits);	/* 1 as long as the end of the bitmap has not been reached */
+}
+
+/**
+ * decode_bitmap_c - dispatch one compressed bitmap packet to its decoder
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+decode_bitmap_c(struct drbd_peer_device *peer_device,
+		struct p_compressed_bm *p,
+		struct bm_xfer_ctx *c,
+		unsigned int len)
+{
+	if (dcbp_get_code(p) != RLE_VLI_Bits) {
+		/* other variants had been implemented for evaluation, but have been
+		 * dropped as RLE_VLI_Bits turned out to be "best" during all our tests. */
+		drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
+		conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
+		return -EIO;
+	}
+
+	/* the decoder gets the length without the p_compressed_bm header */
+	return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p));
+}
+
+void INFO_bm_xfer_stats(struct drbd_device *device,
+		const char *direction, struct bm_xfer_ctx *c)
+{
+	/* estimate of what a "plaintext" transfer would have taken, headers included */
+	unsigned int hdr = drbd_header_size(first_peer_device(device)->connection);
+	unsigned int payload = DRBD_SOCKET_BUFFER_SIZE - hdr;
+	unsigned int plain =
+		hdr * (DIV_ROUND_UP(c->bm_words, payload) + 1) +
+		c->bm_words * sizeof(unsigned long);
+	unsigned int total = c->bytes[0] + c->bytes[1];
+	unsigned int permille;
+
+	/* Stay silent if nothing was counted (should not happen, but just in
+	 * case) or if the transfer was not smaller than the plain estimate. */
+	if (total == 0 || total >= plain)
+		return;
+
+	/* total < plain from here on; still, pick the form that cannot overflow */
+	if (total > UINT_MAX/1000)
+		permille = total / (plain/1000);
+	else
+		permille = 1000 * total / plain;
+
+	if (permille > 1000)
+		permille = 1000;
+
+	/* report the share that was saved, in tenths of a percent */
+	permille = 1000 - permille;
+	drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
+	     "total %u; compression: %u.%u%%\n",
+			direction,
+			c->bytes[1], c->packets[1],
+			c->bytes[0], c->packets[0],
+			total, permille/10, permille % 10);
+}
+
+/* Since we are processing the bitfield from lower addresses to higher,
+   it does not matter if we process it in 32 bit chunks or 64 bit
+   chunks as long as it is little endian. (Understand it as byte stream,
+   beginning with the lowest byte...) If we would use big endian
+   we would need to process it from the highest address to the lowest,
+   in order to be agnostic to the 32 vs 64 bits issue.
+
+   Returns 0 once the whole bitmap was received, a negative error code otherwise. */
+static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct bm_xfer_ctx c;
+	int err;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED);
+	/* you are supposed to send additional out-of-sync information
+	 * if you actually set bits during this phase */
+
+	c = (struct bm_xfer_ctx) {
+		.bm_bits = drbd_bm_bits(device),
+		.bm_words = drbd_bm_words(device),
+	};
+
+	for(;;) {	/* one iteration per bitmap packet; the decoders return 1 while more is expected */
+		if (pi->cmd == P_BITMAP)
+			err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c);
+		else if (pi->cmd == P_COMPRESSED_BITMAP) {
+			/* MAYBE: sanity check that we speak proto >= 90,
+			 * and the feature is enabled! */
+			struct p_compressed_bm *p = pi->data;
+
+			if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) {
+				drbd_err(device, "ReportCBitmap packet too large\n");
+				err = -EIO;
+				goto out;
+			}
+			if (pi->size <= sizeof(*p)) {
+				drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size);
+				err = -EIO;
+				goto out;
+			}
+			err = drbd_recv_all(peer_device->connection, p, pi->size);
+			if (err)
+				goto out;
+			err = decode_bitmap_c(peer_device, p, &c, pi->size);
+		} else {
+			drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)\n", pi->cmd);
+			err = -EIO;
+			goto out;
+		}
+
+		c.packets[pi->cmd == P_BITMAP]++;	/* index 1 counts plain packets, index 0 compressed ones */
+		c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size;
+
+		if (err <= 0) {
+			if (err < 0)
+				goto out;
+			break;
+		}
+		err = drbd_recv_header(peer_device->connection, pi);
+		if (err)
+			goto out;
+	}
+
+	INFO_bm_xfer_stats(device, "receive", &c);
+
+	if (device->state.conn == C_WF_BITMAP_T) {
+		enum drbd_state_rv rv;
+
+		err = drbd_send_bitmap(device);
+		if (err)
+			goto out;
+		/* Omit CS_ORDERED with this state transition to avoid deadlocks. */
+		rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+		D_ASSERT(device, rv == SS_SUCCESS);
+	} else if (device->state.conn != C_WF_BITMAP_S) {
+		/* admin may have requested C_DISCONNECTING,
+		 * other threads may have noticed network errors */
+		drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n",
+		    drbd_conn_str(device->state.conn));
+	}
+	err = 0;
+
+ out:
+	drbd_bm_unlock(device);
+	if (!err && device->state.conn == C_WF_BITMAP_S)
+		drbd_start_resync(device, C_SYNC_SOURCE);
+	return err;
+}
+
+static int receive_skip(struct drbd_connection *connection, struct packet_info *pi)
+{
+	/* Log the packet we are not going to interpret, then let ignore_remaining_packet() deal with it. */
+	drbd_warn(connection,
+		  "skipping unknown optional packet type %d, l: %d!\n", pi->cmd, pi->size);
+	return ignore_remaining_packet(connection, pi);
+}
+
+static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi)
+{
+	/* Make sure we've acked all the TCP data associated with the data
+	 * requests being unplugged. @pi is not looked at; always returns 0. */
+	drbd_tcp_quickack(connection->data.socket);
+
+	return 0;
+}
+
+static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device = conn_peer_device(connection, pi->vnr);
+	struct p_block_desc *desc = pi->data;
+	struct drbd_device *device;
+	enum drbd_conns cstate;
+
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	/* The packet is only expected in three connection states. In any other
+	 * state we complain, but the range is marked out of sync all the same. */
+	cstate = device->state.conn;
+	if (cstate != C_WF_SYNC_UUID &&
+	    cstate != C_WF_BITMAP_T &&
+	    cstate != C_BEHIND)
+		drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
+				drbd_conn_str(device->state.conn));
+
+	drbd_set_out_of_sync(device, be64_to_cpu(desc->sector),
+			     be32_to_cpu(desc->blksize));
+
+	return 0;
+}
+
+struct data_cmd {
+	int expect_payload;	/* non-zero: the packet may be larger than pkt_size */
+	size_t pkt_size;	/* size of the fixed sub header drbdd() receives before calling fn */
+	int (*fn)(struct drbd_connection *, struct packet_info *);	/* handler; a non-zero return is treated as an error */
+};
+
+static struct data_cmd drbd_cmd_handler[] = {	/* indexed by packet type: { expect_payload, pkt_size, fn } */
+	[P_DATA]	    = { 1, sizeof(struct p_data), receive_Data },
+	[P_DATA_REPLY]	    = { 1, sizeof(struct p_data), receive_DataReply },
+	[P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply } ,
+	[P_BARRIER]	    = { 0, sizeof(struct p_barrier), receive_Barrier } ,
+	[P_BITMAP]	    = { 1, 0, receive_bitmap } ,
+	[P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
+	[P_UNPLUG_REMOTE]   = { 0, 0, receive_UnplugRemote },
+	[P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_DataRequest },
+	[P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+	[P_SYNC_PARAM]	    = { 1, 0, receive_SyncParam },
+	[P_SYNC_PARAM89]    = { 1, 0, receive_SyncParam },
+	[P_PROTOCOL]        = { 1, sizeof(struct p_protocol), receive_protocol },
+	[P_UUIDS]	    = { 0, sizeof(struct p_uuids), receive_uuids },
+	[P_SIZES]	    = { 0, sizeof(struct p_sizes), receive_sizes },
+	[P_STATE]	    = { 0, sizeof(struct p_state), receive_state },
+	[P_STATE_CHG_REQ]   = { 0, sizeof(struct p_req_state), receive_req_state },
+	[P_SYNC_UUID]       = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
+	[P_OV_REQUEST]      = { 0, sizeof(struct p_block_req), receive_DataRequest },
+	[P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_DataRequest },
+	[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
+	[P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
+	[P_OUT_OF_SYNC]     = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
+	[P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
+	[P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
+	[P_TRIM]	    = { 0, sizeof(struct p_trim), receive_Data },
+};
+
+static void drbdd(struct drbd_connection *connection)
+{
+	struct packet_info pi;
+	size_t shs; /* sub header size */
+	int err;
+	/* Receiver loop: read a header, validate it against drbd_cmd_handler[], run the handler. */
+	while (get_t_state(&connection->receiver) == RUNNING) {
+		struct data_cmd *cmd;
+
+		drbd_thread_current_set_cpu(&connection->receiver);
+		if (drbd_recv_header(connection, &pi))
+			goto err_out;
+		/* range-check the packet type before touching the table */
+		cmd = pi.cmd < ARRAY_SIZE(drbd_cmd_handler) ? &drbd_cmd_handler[pi.cmd] : NULL;
+		if (unlikely(!cmd || !cmd->fn)) {
+			drbd_err(connection, "Unexpected data packet %s (0x%04x)\n",
+				 cmdname(pi.cmd), pi.cmd);
+			goto err_out;
+		}
+		/* a packet shorter than its sub header would over-read and make pi.size wrap below */
+		shs = cmd->pkt_size;
+		if (pi.size < shs || (pi.size > shs && !cmd->expect_payload)) {
+			drbd_err(connection, "%s %s l:%d\n", pi.size < shs ? "Packet too short" :
+				 "No payload expected", cmdname(pi.cmd), pi.size);
+			goto err_out;
+		}
+
+		if (shs) {
+			err = drbd_recv_all_warn(connection, pi.data, shs);
+			if (err)
+				goto err_out;
+			pi.size -= shs;	/* handlers see only the payload length */
+		}
+
+		err = cmd->fn(connection, &pi);
+		if (err) {
+			drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
+				 cmdname(pi.cmd), err, pi.size);
+			goto err_out;
+		}
+	}
+	return;
+
+    err_out:
+	conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
+}
+
+static void conn_disconnect(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	enum drbd_conns oc;
+	int vnr;
+	/* Clean up after a connection ends: stop the asender, free the sockets, run drbd_disconnected() per volume. */
+	if (connection->cstate == C_STANDALONE)
+		return;
+
+	/* We are about to start the cleanup after connection loss.
+	 * Make sure drbd_make_request knows about that.
+	 * Usually we should be in some network failure state already,
+	 * but just in case we are not, we fix it up here.
+	 */
+	conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+
+	/* asender does not clean up anything. it must not interfere, either */
+	drbd_thread_stop(&connection->asender);
+	drbd_free_sock(connection);
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		kref_get(&device->kref);	/* hold a reference while the RCU read lock is dropped */
+		rcu_read_unlock();
+		drbd_disconnected(peer_device);
+		kref_put(&device->kref, drbd_destroy_device);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+
+	if (!list_empty(&connection->current_epoch->list))
+		drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n");
+	/* ok, no more ee's on the fly, it is safe to reset the epoch_size */
+	atomic_set(&connection->current_epoch->epoch_size, 0);
+	connection->send.seen_any_write_yet = false;
+
+	drbd_info(connection, "Connection closed\n");
+
+	if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN)
+		conn_try_outdate_peer_async(connection);
+
+	spin_lock_irq(&connection->resource->req_lock);
+	oc = connection->cstate;	/* sampled under req_lock; also decides the step after unlocking */
+	if (oc >= C_UNCONNECTED)
+		_conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
+
+	spin_unlock_irq(&connection->resource->req_lock);
+
+	if (oc == C_DISCONNECTING)
+		conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
+}
+
+static int drbd_disconnected(struct drbd_peer_device *peer_device)
+{
+	struct drbd_device *device = peer_device->device;
+	unsigned int i;
+
+	/* Per-volume cleanup after connection loss: first wait for current activity to cease. */
+	spin_lock_irq(&device->resource->req_lock);
+	_drbd_wait_ee_list_empty(device, &device->active_ee);
+	_drbd_wait_ee_list_empty(device, &device->sync_ee);
+	_drbd_wait_ee_list_empty(device, &device->read_ee);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	/* We do not have data structures that would allow us to
+	 * get the rs_pending_cnt down to 0 again.
+	 *  * On C_SYNC_TARGET we do not have any data structures describing
+	 *    the pending RSDataRequest's we have sent.
+	 *  * On C_SYNC_SOURCE there is no data structure that tracks
+	 *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
+	 *  And no, it is not the sum of the reference counts in the
+	 *  resync_LRU. The resync_LRU tracks the whole operation including
+	 *  the disk-IO, while the rs_pending_cnt only tracks the blocks
+	 *  on the fly. */
+	drbd_rs_cancel_all(device);
+	device->rs_total = 0;
+	device->rs_failed = 0;
+	atomic_set(&device->rs_pending_cnt, 0);
+	wake_up(&device->misc_wait);
+	/* stop a pending resync timer, then run its handler once by hand */
+	del_timer_sync(&device->resync_timer);
+	resync_timer_fn((unsigned long)device);
+
+	/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
+	 * w_make_resync_request etc. which may still be on the worker queue
+	 * to be "canceled" */
+	drbd_flush_workqueue(&peer_device->connection->sender_work);
+
+	drbd_finish_peer_reqs(device);
+
+	/* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
+	   might have issued a work again. The one before drbd_finish_peer_reqs() is
+	   necessary to reclaim net_ee in drbd_finish_peer_reqs(). */
+	drbd_flush_workqueue(&peer_device->connection->sender_work);
+
+	/* need to do it again, drbd_finish_peer_reqs() may have populated it
+	 * again via drbd_try_clear_on_disk_bm(). */
+	drbd_rs_cancel_all(device);
+
+	kfree(device->p_uuid);
+	device->p_uuid = NULL;
+
+	if (!drbd_suspended(device))
+		tl_clear(peer_device->connection);
+
+	drbd_md_sync(device);
+
+	/* serialize with bitmap writeout triggered by the state change,
+	 * if any. */
+	wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+
+	/* tcp_close and release of sendpage pages can be deferred.  I don't
+	 * want to use SO_LINGER, because apparently it can be deferred for
+	 * more than 20 seconds (longest time I checked).
+	 *
+	 * Actually we don't care for exactly when the network stack does its
+	 * put_page(), but release our reference on these pages right here.
+	 */
+	i = drbd_free_peer_reqs(device, &device->net_ee);
+	if (i)
+		drbd_info(device, "net_ee not empty, killed %u entries\n", i);
+	i = atomic_read(&device->pp_in_use_by_net);
+	if (i)
+		drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
+	i = atomic_read(&device->pp_in_use);
+	if (i)
+		drbd_info(device, "pp_in_use = %d, expected 0\n", i);
+	/* every peer request list is expected to be drained by now */
+	D_ASSERT(device, list_empty(&device->read_ee));
+	D_ASSERT(device, list_empty(&device->active_ee));
+	D_ASSERT(device, list_empty(&device->sync_ee));
+	D_ASSERT(device, list_empty(&device->done_ee));
+
+	return 0;
+}
+
+/*
+ * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
+ * we can agree on is stored in agreed_pro_version.
+ *
+ * feature flags and the reserved array should be enough room for future
+ * enhancements of the handshake protocol, and possible plugins...
+ *
+ * for now, they are expected to be zero, but ignored.
+ */
+static int drbd_send_features(struct drbd_connection *connection)
+{
+	struct drbd_socket *data_sock = &connection->data;
+	struct p_connection_features *feat;
+
+	/* Offer our protocol range and feature flags; all other fields stay zero. */
+	feat = conn_prepare_command(connection, data_sock);
+	if (feat == NULL)
+		return -EIO;
+	memset(feat, 0, sizeof(*feat));
+	feat->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
+	feat->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
+	feat->feature_flags = cpu_to_be32(PRO_FEATURES);
+	return conn_send_command(connection, data_sock, P_CONNECTION_FEATURES, sizeof(*feat), NULL, 0);
+}
+
+/*
+ * return values:
+ *   1 yes, we have a valid connection
+ *   0 oops, did not work out, please try again
+ *  -1 peer talks different language,
+ *     no point in trying again, please go standalone.
+ */
+static int drbd_do_features(struct drbd_connection *connection)
+{
+	/* ASSERT current == connection->receiver ... */
+	struct p_connection_features *p;
+	const int expect = sizeof(struct p_connection_features);
+	struct packet_info pi;
+	int err;
+
+	err = drbd_send_features(connection);
+	if (err)
+		return 0;
+
+	err = drbd_recv_header(connection, &pi);
+	if (err)
+		return 0;
+
+	if (pi.cmd != P_CONNECTION_FEATURES) {
+		drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
+			 cmdname(pi.cmd), pi.cmd);
+		return -1;
+	}
+
+	if (pi.size != expect) {
+		drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n",
+		     expect, pi.size);
+		return -1;
+	}
+
+	p = pi.data;
+	err = drbd_recv_all_warn(connection, p, expect);
+	if (err)
+		return 0;
+	/* Convert the version range in place; feature_flags stays in wire order. */
+	p->protocol_min = be32_to_cpu(p->protocol_min);
+	p->protocol_max = be32_to_cpu(p->protocol_max);
+	if (p->protocol_max == 0)
+		p->protocol_max = p->protocol_min;
+
+	if (PRO_VERSION_MAX < p->protocol_min ||
+	    PRO_VERSION_MIN > p->protocol_max)
+		goto incompat;
+	/* Compare as u32: cast to int, a huge peer protocol_max would turn negative and win. */
+	connection->agreed_pro_version = min_t(u32, PRO_VERSION_MAX, p->protocol_max);
+	connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);
+
+	drbd_info(connection, "Handshake successful: "
+	     "Agreed network protocol version %d\n", connection->agreed_pro_version);
+
+	drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n",
+		  connection->agreed_features & FF_TRIM ? " " : " not ");
+
+	return 1;
+
+ incompat:
+	drbd_err(connection, "incompatible DRBD dialects: "
+	    "I support %d-%d, peer supports %d-%d\n",
+	    PRO_VERSION_MIN, PRO_VERSION_MAX,
+	    p->protocol_min, p->protocol_max);
+	return -1;
+}
+
+#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
+static int drbd_do_auth(struct drbd_connection *connection)
+{
+	drbd_err(connection, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
+	drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
+	return -1;	/* stub used when the kernel has no HMAC support: always fails */
+}
+#else
+#define CHALLENGE_LEN 64
+
+/* Return value:
+	1 - auth succeeded,
+	0 - failed, try again (network error),
+	-1 - auth failed, don't try again.
+*/
+
+static int drbd_do_auth(struct drbd_connection *connection)
+{
+	struct drbd_socket *sock;
+	char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
+	struct scatterlist sg;
+	char *response = NULL;
+	char *right_response = NULL;
+	char *peers_ch = NULL;
+	unsigned int key_len;
+	char secret[SHARED_SECRET_MAX]; /* 64 byte */
+	unsigned int resp_size;
+	struct hash_desc desc;
+	struct packet_info pi;
+	struct net_conf *nc;
+	int err, rv;
+
+	/* FIXME: Put the challenge/response into the preallocated socket buffer.  */
+	/* Copy the shared secret out of net_conf while inside the RCU read section. */
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	key_len = strlen(nc->shared_secret);
+	memcpy(secret, nc->shared_secret, key_len);
+	rcu_read_unlock();
+
+	desc.tfm = connection->cram_hmac_tfm;
+	desc.flags = 0;
+
+	rv = crypto_hash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len);
+	if (rv) {
+		drbd_err(connection, "crypto_hash_setkey() failed with %d\n", rv);
+		rv = -1;
+		goto fail;
+	}
+	/* Step 1: send our random challenge to the peer. */
+	get_random_bytes(my_challenge, CHALLENGE_LEN);
+
+	sock = &connection->data;
+	if (!conn_prepare_command(connection, sock)) {
+		rv = 0;
+		goto fail;
+	}
+	rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0,
+				my_challenge, CHALLENGE_LEN);
+	if (!rv)	/* send failed: rv is 0 here, i.e. "try again" */
+		goto fail;
+	/* Step 2: receive the peer's challenge; CHALLENGE_LEN..2*CHALLENGE_LEN bytes are accepted. */
+	err = drbd_recv_header(connection, &pi);
+	if (err) {
+		rv = 0;
+		goto fail;
+	}
+
+	if (pi.cmd != P_AUTH_CHALLENGE) {
+		drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n",
+			 cmdname(pi.cmd), pi.cmd);
+		rv = 0;
+		goto fail;
+	}
+
+	if (pi.size > CHALLENGE_LEN * 2) {
+		drbd_err(connection, "expected AuthChallenge payload too big.\n");
+		rv = -1;
+		goto fail;
+	}
+
+	if (pi.size < CHALLENGE_LEN) {
+		drbd_err(connection, "AuthChallenge payload too small.\n");
+		rv = -1;
+		goto fail;
+	}
+
+	peers_ch = kmalloc(pi.size, GFP_NOIO);
+	if (peers_ch == NULL) {
+		drbd_err(connection, "kmalloc of peers_ch failed\n");
+		rv = -1;
+		goto fail;
+	}
+
+	err = drbd_recv_all_warn(connection, peers_ch, pi.size);
+	if (err) {
+		rv = 0;
+		goto fail;
+	}
+
+	if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
+		drbd_err(connection, "Peer presented the same challenge!\n");
+		rv = -1;
+		goto fail;
+	}
+
+	resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm);
+	response = kmalloc(resp_size, GFP_NOIO);
+	if (response == NULL) {
+		drbd_err(connection, "kmalloc of response failed\n");
+		rv = -1;
+		goto fail;
+	}
+	/* Step 3: answer with the keyed digest of the peer's challenge. */
+	sg_init_table(&sg, 1);
+	sg_set_buf(&sg, peers_ch, pi.size);
+
+	rv = crypto_hash_digest(&desc, &sg, sg.length, response);
+	if (rv) {
+		drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
+		rv = -1;
+		goto fail;
+	}
+
+	if (!conn_prepare_command(connection, sock)) {
+		rv = 0;
+		goto fail;
+	}
+	rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0,
+				response, resp_size);
+	if (!rv)
+		goto fail;
+	/* Step 4: receive the peer's response to our challenge (reuses the response buffer). */
+	err = drbd_recv_header(connection, &pi);
+	if (err) {
+		rv = 0;
+		goto fail;
+	}
+
+	if (pi.cmd != P_AUTH_RESPONSE) {
+		drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n",
+			 cmdname(pi.cmd), pi.cmd);
+		rv = 0;
+		goto fail;
+	}
+
+	if (pi.size != resp_size) {
+		drbd_err(connection, "expected AuthResponse payload of wrong size\n");
+		rv = 0;
+		goto fail;
+	}
+
+	err = drbd_recv_all_warn(connection, response , resp_size);
+	if (err) {
+		rv = 0;
+		goto fail;
+	}
+	/* Step 5: compute the response we expect for our challenge and compare. */
+	right_response = kmalloc(resp_size, GFP_NOIO);
+	if (right_response == NULL) {
+		drbd_err(connection, "kmalloc of right_response failed\n");
+		rv = -1;
+		goto fail;
+	}
+
+	sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
+
+	rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
+	if (rv) {
+		drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
+		rv = -1;
+		goto fail;
+	}
+
+	rv = !memcmp(response, right_response, resp_size);
+
+	if (rv)
+		drbd_info(connection, "Peer authenticated using %d bytes HMAC\n",
+		     resp_size);
+	else
+		rv = -1;
+
+ fail:
+	kfree(peers_ch);
+	kfree(response);
+	kfree(right_response);
+
+	return rv;
+}
+#endif
+
+int drbd_receiver(struct drbd_thread *thi)
+{
+	struct drbd_connection *connection = thi->connection;
+	int rc;
+
+	drbd_info(connection, "receiver (re)started\n");
+
+	/* While conn_connect() returns 0: clean up, pause up to HZ jiffies, retry. */
+	for (;;) {
+		rc = conn_connect(connection);
+		if (rc != 0)
+			break;
+		conn_disconnect(connection);
+		schedule_timeout_interruptible(HZ);
+	}
+	if (rc > 0) {
+		drbdd(connection);
+	} else if (rc == -1) {
+		drbd_warn(connection, "Discarding network configuration.\n");
+		conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+	}
+
+	conn_disconnect(connection);
+
+	drbd_info(connection, "receiver terminated\n");
+	return 0;
+}
+
+/* ********* acknowledge sender ******** */
+
+static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_req_state_reply *reply = pi->data;
+	int rc = be32_to_cpu(reply->retcode);
+
+	/* Record the peer's verdict in the connection flags, then wake ping_wait. */
+	if (rc < SS_SUCCESS) {
+		set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags);
+		drbd_err(connection, "Requested state change failed by peer: %s (%d)\n",
+			 drbd_set_st_err_str(rc), rc);
+	} else {
+		set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags);
+	}
+	wake_up(&connection->ping_wait);
+	return 0;
+}
+
+static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_req_state_reply *reply = pi->data;
+	int rc = be32_to_cpu(reply->retcode);
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (peer_device == NULL)
+		return -EIO;
+	device = peer_device->device;
+
+	/* With CONN_WD_ST_CHG_REQ set, the reply is handled by got_conn_RqSReply(). */
+	if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) {
+		D_ASSERT(device, connection->agreed_pro_version < 100);
+		return got_conn_RqSReply(connection, pi);
+	}
+
+	if (rc < SS_SUCCESS) {
+		set_bit(CL_ST_CHG_FAIL, &device->flags);
+		drbd_err(device, "Requested state change failed by peer: %s (%d)\n",
+			drbd_set_st_err_str(rc), rc);
+	} else {
+		set_bit(CL_ST_CHG_SUCCESS, &device->flags);
+	}
+	wake_up(&device->state_wait);
+	return 0;
+}
+
+static int got_Ping(struct drbd_connection *connection, struct packet_info *pi)
+{
+	/* Answer the peer's ping with a ping ack; pi is not looked at. */
+	return drbd_send_ping_ack(connection);
+}
+
+static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+	rcu_read_lock();	/* net_conf is RCU protected; restore idle timeout */
+	connection->meta.socket->sk->sk_rcvtimeo = rcu_dereference(connection->net_conf)->ping_int*HZ;
+	rcu_read_unlock();
+	if (!test_and_set_bit(GOT_PING_ACK, &connection->flags))
+		wake_up(&connection->ping_wait);
+	return 0;
+}
+
+static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_block_ack *ack = pi->data;
+	sector_t sector = be64_to_cpu(ack->sector);
+	int size = be32_to_cpu(ack->blksize);
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (peer_device == NULL)
+		return -EIO;
+	device = peer_device->device;
+
+	D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
+
+	update_peer_seq(peer_device, be32_to_cpu(ack->seq_num));
+
+	if (get_ldev(device)) {
+		drbd_rs_complete_io(device, sector);
+		drbd_set_in_sync(device, sector, size);
+		/* rs_same_csum counts in units of BM_BLOCK_SIZE, hence the shift */
+		device->rs_same_csum += (size >> BM_BLOCK_SHIFT);
+		put_ldev(device);
+	}
+	dec_rs_pending(device);
+	atomic_add(size >> 9, &device->rs_sect_in);
+
+	return 0;
+}
+
+static int
+validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector,
+			      struct rb_root *root, const char *func,
+			      enum drbd_req_event what, bool missing_ok)
+{
+	struct bio_and_error m;
+	struct drbd_request *req;
+
+	/* Look the request up and apply the event while holding req_lock. */
+	spin_lock_irq(&device->resource->req_lock);
+	req = find_request(device, root, id, sector, missing_ok, func);
+	if (likely(req))
+		__req_mod(req, what, &m);
+	spin_unlock_irq(&device->resource->req_lock);
+	if (unlikely(!req))
+		return -EIO;
+	/* The master bio, if any, is completed only after the lock is dropped. */
+	if (m.bio)
+		complete_master_bio(device, &m);
+	return 0;
+}
+
+static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_block_ack *ack = pi->data;
+	sector_t sector = be64_to_cpu(ack->sector);
+	int size = be32_to_cpu(ack->blksize);
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	enum drbd_req_event event;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (peer_device == NULL)
+		return -EIO;
+	device = peer_device->device;
+
+	update_peer_seq(peer_device, be32_to_cpu(ack->seq_num));
+	/* ID_SYNCER acks only update resync state; no request lookup follows. */
+	if (ack->block_id == ID_SYNCER) {
+		drbd_set_in_sync(device, sector, size);
+		dec_rs_pending(device);
+		return 0;
+	}
+	switch (pi->cmd) {
+	case P_RS_WRITE_ACK:
+		event = WRITE_ACKED_BY_PEER_AND_SIS;
+		break;
+	case P_WRITE_ACK:
+		event = WRITE_ACKED_BY_PEER;
+		break;
+	case P_RECV_ACK:
+		event = RECV_ACKED_BY_PEER;
+		break;
+	case P_SUPERSEDED:
+		event = CONFLICT_RESOLVED;
+		break;
+	case P_RETRY_WRITE:
+		event = POSTPONE_WRITE;
+		break;
+	default:
+		BUG();
+	}
+
+	return validate_req_change_req_state(device, ack->block_id, sector,
+					     &device->write_requests, __func__,
+					     event, false);
+}
+
+static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_block_ack *ack = pi->data;
+	sector_t sector = be64_to_cpu(ack->sector);
+	int size = be32_to_cpu(ack->blksize);
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (peer_device == NULL)
+		return -EIO;
+	device = peer_device->device;
+
+	update_peer_seq(peer_device, be32_to_cpu(ack->seq_num));
+
+	if (ack->block_id == ID_SYNCER) {
+		dec_rs_pending(device);
+		drbd_rs_failed_io(device, sector, size);
+		return 0;
+	}
+
+	/* missing_ok is true: a request that is already gone is not an error here. */
+	if (validate_req_change_req_state(device, ack->block_id, sector,
+					  &device->write_requests, __func__,
+					  NEG_ACKED, true)) {
+		/* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
+		   The master bio might already be completed, therefore the
+		   request is no longer in the collision hash. */
+		/* In Protocol B we might already have got a P_RECV_ACK
+		   but then get a P_NEG_ACK afterwards. */
+		drbd_set_out_of_sync(device, sector, size);
+	}
+
+	return 0;
+}
+
+static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_block_ack *ack = pi->data;
+	sector_t sector = be64_to_cpu(ack->sector);
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (peer_device == NULL)
+		return -EIO;
+	device = peer_device->device;
+
+	update_peer_seq(peer_device, be32_to_cpu(ack->seq_num));
+
+	drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n",
+	    (unsigned long long)sector, be32_to_cpu(ack->blksize));
+	/* The failed read must still be tracked in read_requests (missing_ok is false). */
+	return validate_req_change_req_state(device, ack->block_id, sector,
+					     &device->read_requests, __func__,
+					     NEG_ACKED, false);
+}
+
+static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_block_ack *ack = pi->data;
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	sector_t sector;
+	int size;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (peer_device == NULL)
+		return -EIO;
+	device = peer_device->device;
+
+	sector = be64_to_cpu(ack->sector);
+	size = be32_to_cpu(ack->blksize);
+
+	update_peer_seq(peer_device, be32_to_cpu(ack->seq_num));
+	/* either packet type ends one pending resync request */
+	dec_rs_pending(device);
+
+	if (get_ldev_if_state(device, D_FAILED)) {
+		drbd_rs_complete_io(device, sector);
+		switch (pi->cmd) {
+		case P_NEG_RS_DREPLY:
+			drbd_rs_failed_io(device, sector, size);
+			break;
+		case P_RS_CANCEL:
+			break;
+		default:
+			BUG();
+		}
+		put_ldev(device);
+	}
+	return 0;
+}
+
+static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_barrier_ack *ack = pi->data;
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	tl_release(connection, ack->barrier, be32_to_cpu(ack->set_size));
+	/* Arm start_resync_timer once per C_AHEAD volume with nothing left in flight. */
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+
+		if (device->state.conn != C_AHEAD ||
+		    atomic_read(&device->ap_in_flight) != 0 ||
+		    test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags))
+			continue;
+		device->start_resync_timer.expires = jiffies + HZ;
+		add_timer(&device->start_resync_timer);
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_block_ack *p = pi->data;
+	struct drbd_device_work *dw;
+	sector_t sector;
+	int size;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	sector = be64_to_cpu(p->sector);
+	size = be32_to_cpu(p->blksize);
+
+	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+	/* block_id carries the verify verdict for this block */
+	if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
+		drbd_ov_out_of_sync_found(device, sector, size);
+	else
+		ov_out_of_sync_print(device);
+
+	if (!get_ldev(device))
+		return 0;
+
+	drbd_rs_complete_io(device, sector);
+	dec_rs_pending(device);
+
+	--device->ov_left;
+
+	/* let's advance progress step marks only for every other megabyte */
+	if ((device->ov_left & 0x200) == 0x200)
+		drbd_advance_rs_marks(device, device->ov_left);
+	/* last block: queue w_ov_finished on sender_work, or finish inline if kmalloc fails */
+	if (device->ov_left == 0) {
+		dw = kmalloc(sizeof(*dw), GFP_NOIO);
+		if (dw) {
+			dw->w.cb = w_ov_finished;
+			dw->device = device;
+			drbd_queue_work(&peer_device->connection->sender_work, &dw->w);
+		} else {
+			drbd_err(device, "kmalloc(dw) failed.\n");
+			ov_out_of_sync_print(device);
+			drbd_resync_finished(device);
+		}
+	}
+	put_ldev(device);
+	return 0;
+}
+
+static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
+{
+	return 0;	/* nothing to do: the packet is accepted and ignored */
+}
+
+static int connection_finish_peer_reqs(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr, not_empty = 0;
+
+	do {
+		clear_bit(SIGNAL_ASENDER, &connection->flags);
+		flush_signals(current);
+		/* Run drbd_finish_peer_reqs() on every volume; returns 1 on the first failure. */
+		rcu_read_lock();
+		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+			struct drbd_device *device = peer_device->device;
+			kref_get(&device->kref);
+			rcu_read_unlock();
+			if (drbd_finish_peer_reqs(device)) {
+				kref_put(&device->kref, drbd_destroy_device);
+				return 1;	/* RCU read lock is already dropped here */
+			}
+			kref_put(&device->kref, drbd_destroy_device);
+			rcu_read_lock();
+		}
+		set_bit(SIGNAL_ASENDER, &connection->flags);
+		/* Still inside the RCU read section: repeat if any done_ee list is non-empty. */
+		spin_lock_irq(&connection->resource->req_lock);
+		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+			struct drbd_device *device = peer_device->device;
+			not_empty = !list_empty(&device->done_ee);
+			if (not_empty)
+				break;
+		}
+		spin_unlock_irq(&connection->resource->req_lock);
+		rcu_read_unlock();
+	} while (not_empty);
+
+	return 0;
+}
+
+struct asender_cmd {	/* one meta-socket packet type as dispatched by drbd_asender() */
+	size_t pkt_size;	/* payload size expected after the header */
+	int (*fn)(struct drbd_connection *connection, struct packet_info *);	/* handler; a nonzero return makes drbd_asender() reconnect */
+};
+
+static struct asender_cmd asender_tbl[] = {	/* indexed by packet command; unlisted slots have fn == NULL */
+	[P_PING]	    = { 0, got_Ping },
+	[P_PING_ACK]	    = { 0, got_PingAck },
+	[P_RECV_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_WRITE_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_SUPERSEDED]   = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_NEG_ACK]	    = { sizeof(struct p_block_ack), got_NegAck },
+	[P_NEG_DREPLY]	    = { sizeof(struct p_block_ack), got_NegDReply },
+	[P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply },
+	[P_OV_RESULT]	    = { sizeof(struct p_block_ack), got_OVResult },
+	[P_BARRIER_ACK]	    = { sizeof(struct p_barrier_ack), got_BarrierAck },
+	[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
+	[P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
+	[P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_skip },
+	[P_RS_CANCEL]       = { sizeof(struct p_block_ack), got_NegRSDReply },
+	[P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply },
+	[P_RETRY_WRITE]	    = { sizeof(struct p_block_ack), got_BlockAck },
+};
+
+int drbd_asender(struct drbd_thread *thi)
+{
+	struct drbd_connection *connection = thi->connection;
+	struct asender_cmd *cmd = NULL;
+	struct packet_info pi;
+	int rv;
+	void *buf    = connection->meta.rbuf;
+	int received = 0;
+	unsigned int header_size = drbd_header_size(connection);
+	int expect   = header_size;
+	bool ping_timeout_active = false;
+	struct net_conf *nc;
+	int ping_timeo, tcp_cork, ping_int;
+	struct sched_param param = { .sched_priority = 2 };
+
+	rv = sched_setscheduler(current, SCHED_RR, &param);
+	if (rv < 0)
+		drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv);
+	/* Main loop: finish peer requests, then receive and dispatch one meta packet at a time. */
+	while (get_t_state(thi) == RUNNING) {
+		drbd_thread_current_set_cpu(thi);
+
+		rcu_read_lock();
+		nc = rcu_dereference(connection->net_conf);
+		ping_timeo = nc->ping_timeo;
+		tcp_cork = nc->tcp_cork;
+		ping_int = nc->ping_int;
+		rcu_read_unlock();
+
+		if (test_and_clear_bit(SEND_PING, &connection->flags)) {
+			if (drbd_send_ping(connection)) {
+				drbd_err(connection, "drbd_send_ping has failed\n");
+				goto reconnect;
+			}
+			connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10;
+			ping_timeout_active = true;
+		}
+
+		/* TODO: conditionally cork; it may hurt latency if we cork without
+		   much to send */
+		if (tcp_cork)
+			drbd_tcp_cork(connection->meta.socket);
+		if (connection_finish_peer_reqs(connection)) {
+			drbd_err(connection, "connection_finish_peer_reqs() failed\n");
+			goto reconnect;
+		}
+		/* but unconditionally uncork unless disabled */
+		if (tcp_cork)
+			drbd_tcp_uncork(connection->meta.socket);
+
+		/* short circuit, recv_msg would return EINTR anyways. */
+		if (signal_pending(current))
+			continue;
+
+		rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
+		clear_bit(SIGNAL_ASENDER, &connection->flags);
+
+		flush_signals(current);
+
+		/* Note:
+		 * -EINTR	 (on meta) we got a signal
+		 * -EAGAIN	 (on meta) rcvtimeo expired
+		 * -ECONNRESET	 other side closed the connection
+		 * -ERESTARTSYS  (on data) we got a signal
+		 * rv <  0	 other than above: unexpected error!
+		 * rv == expected: full header or command
+		 * rv <  expected: "woken" by signal during receive
+		 * rv == 0	 : "connection shut down by peer"
+		 */
+		if (likely(rv > 0)) {
+			received += rv;
+			buf	 += rv;
+		} else if (rv == 0) {
+			if (test_bit(DISCONNECT_SENT, &connection->flags)) {
+				long t;
+				rcu_read_lock();
+				t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
+				rcu_read_unlock();
+
+				t = wait_event_timeout(connection->ping_wait,
+						       connection->cstate < C_WF_REPORT_PARAMS,
+						       t);
+				if (t)
+					break;
+			}
+			drbd_err(connection, "meta connection shut down by peer.\n");
+			goto reconnect;
+		} else if (rv == -EAGAIN) {
+			/* If the data socket received something meanwhile,
+			 * that is good enough: peer is still alive. */
+			if (time_after(connection->last_received,
+				jiffies - connection->meta.socket->sk->sk_rcvtimeo))
+				continue;
+			if (ping_timeout_active) {
+				drbd_err(connection, "PingAck did not arrive in time.\n");
+				goto reconnect;
+			}
+			set_bit(SEND_PING, &connection->flags);
+			continue;
+		} else if (rv == -EINTR) {
+			continue;
+		} else {
+			drbd_err(connection, "sock_recvmsg returned %d\n", rv);
+			goto reconnect;
+		}
+		/* Full header received and no handler chosen yet: decode it and look one up. */
+		if (received == expect && cmd == NULL) {
+			if (decode_header(connection, connection->meta.rbuf, &pi))
+				goto reconnect;
+			if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !asender_tbl[pi.cmd].fn) {
+				drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
+					 cmdname(pi.cmd), pi.cmd);
+				goto disconnect;
+			}
+			cmd = &asender_tbl[pi.cmd];	/* index was validated just above */
+			expect = header_size + cmd->pkt_size;
+			if (pi.size != expect - header_size) {
+				drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n",
+					pi.cmd, pi.size);
+				goto reconnect;
+			}
+		}
+		if (received == expect) {
+			bool err;
+
+			err = cmd->fn(connection, &pi);
+			if (err) {
+				drbd_err(connection, "%pf failed\n", cmd->fn);
+				goto reconnect;
+			}
+
+			connection->last_received = jiffies;
+
+			if (cmd == &asender_tbl[P_PING_ACK]) {
+				/* restore idle timeout */
+				connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ;
+				ping_timeout_active = false;
+			}
+
+			buf	 = connection->meta.rbuf;
+			received = 0;
+			expect	 = header_size;
+			cmd	 = NULL;
+		}
+	}
+	/* The if (0) bodies below run only when entered through their goto labels. */
+	if (0) {
+reconnect:
+		conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+		conn_md_sync(connection);
+	}
+	if (0) {
+disconnect:
+		conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+	}
+	clear_bit(SIGNAL_ASENDER, &connection->flags);
+
+	drbd_info(connection, "asender terminated\n");
+
+	return 0;
+}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
new file mode 100644
index 00000000000..09803d0d520
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.c
@@ -0,0 +1,1453 @@
+/*
+   drbd_req.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_req.h"
+
+
+static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size);
+
+/* Account for a request entering the I/O path in the disk statistics. */
+static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request *req)
+{
+	const int dir = bio_data_dir(req->master_bio);
+	int cpu = part_stat_lock();
+
+	part_round_stats(cpu, &device->vdisk->part0);
+	part_stat_inc(cpu, &device->vdisk->part0, ios[dir]);
+	part_stat_add(cpu, &device->vdisk->part0, sectors[dir], req->i.size >> 9);
+	/* cpu is only wanted by the macro invocations above; silence "set but unused" */
+	(void) cpu;
+	part_inc_in_flight(&device->vdisk->part0, dir);
+	part_stat_unlock();
+}
+
+/* Account for a request being completed upwards in the disk statistics. */
+static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req)
+{
+	const int dir = bio_data_dir(req->master_bio);
+	const unsigned long elapsed = jiffies - req->start_time;
+	int cpu = part_stat_lock();
+
+	part_stat_add(cpu, &device->vdisk->part0, ticks[dir], elapsed);
+	part_round_stats(cpu, &device->vdisk->part0);
+	part_dec_in_flight(&device->vdisk->part0, dir);
+	part_stat_unlock();
+}
+
+/* Allocate a drbd_request for bio_src, and put it into its initial state. */
+static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio *bio_src)
+{
+	struct drbd_request *req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
+
+	if (req == NULL)
+		return NULL;
+
+	drbd_req_make_private_bio(req, bio_src);
+	req->device = device;
+	req->master_bio = bio_src;
+	req->epoch = 0;
+	req->rq_state = (bio_data_dir(bio_src) == WRITE) ? RQ_WRITE : 0;
+
+	drbd_clear_interval(&req->i);
+	req->i.sector = bio_src->bi_iter.bi_sector;
+	req->i.size = bio_src->bi_iter.bi_size;
+	req->i.local = true;
+	req->i.waiting = false;
+
+	INIT_LIST_HEAD(&req->tl_requests);
+	INIT_LIST_HEAD(&req->w.list);
+
+	/* completion_ref starts out at one, to be put by __drbd_make_request;
+	 * the kref holds one reference as long as completion_ref > 0 */
+	atomic_set(&req->completion_ref, 1);
+	kref_init(&req->kref);
+
+	return req;
+}
+
+/* kref release function for a drbd_request: unlink it from the transfer log,
+ * settle bitmap and activity log for writes, and free it to its mempool.
+ * If the state is not final, only log the "Logic BUG" and do not free. */
+void drbd_req_destroy(struct kref *kref)
+{
+	struct drbd_request *req = container_of(kref, struct drbd_request, kref);
+	struct drbd_device *device = req->device;
+	const unsigned s = req->rq_state;
+
+	if ((req->master_bio && !(s & RQ_POSTPONED)) ||
+		atomic_read(&req->completion_ref) ||
+		(s & RQ_LOCAL_PENDING) ||
+		((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) {
+		drbd_err(device, "drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d\n",
+				s, atomic_read(&req->completion_ref));
+		return;
+	}
+
+	/* Remove it from the transfer log.  If it never was on it (local
+	 * only, or conflicting and never sent), tl_requests is still "empty"
+	 * as initialized in drbd_req_new(), so we can list_del_init() it
+	 * here unconditionally. */
+	list_del_init(&req->tl_requests);
+
+	/* if it was a write, we may have to set the corresponding
+	 * bit(s) out-of-sync first. If it had a local part, we need to
+	 * release the reference to the activity log. */
+	if (s & RQ_WRITE) {
+		/* Set out-of-sync unless both OK flags are set
+		 * (local only or remote failed).
+		 * Other places where we set out-of-sync:
+		 * READ with local io-error */
+
+		/* There is a special case:
+		 * we may notice late that IO was suspended,
+		 * and postpone, or schedule for retry, a write,
+		 * before it even was submitted or sent.
+		 * In that case we do not want to touch the bitmap at all.
+		 */
+		if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) {
+			if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
+				drbd_set_out_of_sync(device, req->i.sector, req->i.size);
+
+			if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
+				drbd_set_in_sync(device, req->i.sector, req->i.size);
+		}
+
+		/* One might be tempted to move the drbd_al_complete_io()
+		 * to the local io completion callback drbd_request_endio.
+		 * But for a mirrored write we may only drbd_al_complete_io()
+		 * once it is RQ_NET_DONE; otherwise the extent could be
+		 * dropped from the activity log before the data has actually
+		 * been written on the peer.  If we crashed before our peer knew
+		 * about the request, but after the extent had been dropped,
+		 * we would forget to resync the corresponding extent.
+		 */
+		if (s & RQ_IN_ACT_LOG) {
+			if (get_ldev_if_state(device, D_FAILED)) {
+				drbd_al_complete_io(device, &req->i);
+				put_ldev(device);
+			} else if (__ratelimit(&drbd_ratelimit_state)) {
+				drbd_warn(device, "Should have called drbd_al_complete_io(, %llu, %u), "
+					 "but my Disk seems to have failed :(\n",
+					 (unsigned long long) req->i.sector, req->i.size);
+			}
+		}
+	}
+
+	mempool_free(req, drbd_request_mempool);
+}
+
+static void wake_all_senders(struct drbd_connection *connection)
+{
+	wake_up(&connection->sender_work.q_wait); /* waiters on the sender work queue */
+}
+
+/* Close the current transfer log epoch, unless it is still empty.
+ * Caller must hold resource->req_lock. */
+void start_new_tl_epoch(struct drbd_connection *connection)
+{
+	/* an epoch without any writes in it is not worth closing */
+	if (connection->current_tle_writes != 0) {
+		connection->current_tle_writes = 0;
+		atomic_inc(&connection->current_tle_nr);
+		wake_all_senders(connection);
+	}
+}
+
+void complete_master_bio(struct drbd_device *device,
+		struct bio_and_error *m)
+{
+	bio_endio(m->bio, m->error);
+	dec_ap_bio(device);
+}
+
+
+/* Take the interval of req out of the tree at root, and notify any waiters. */
+static void drbd_remove_request_interval(struct rb_root *root,
+					 struct drbd_request *req)
+{
+	drbd_remove_interval(root, &req->i);
+
+	/* Only if somebody flagged this interval as being waited for,
+	 * wake up the processes sleeping on misc_wait. */
+	if (!req->i.waiting)
+		return;
+	wake_up(&req->device->misc_wait);
+}
+
+/* Helper for __req_mod(), called once completion_ref dropped to zero.
+ * Set m->bio to the master bio and m->error to the status to be returned,
+ * if it is fit to be completed; or leave m->bio alone (it is initialized
+ * to NULL in __req_mod), if it has already been completed, or is flagged
+ * RQ_POSTPONED and thus going to be retried instead of being completed.
+ */
+static
+void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
+{
+	const unsigned s = req->rq_state;
+	struct drbd_device *device = req->device;
+	int rw;
+	int error, ok;
+
+	/* we must not complete the master bio, while it is
+	 *	still being processed by _drbd_send_zc_bio (drbd_send_dblock)
+	 *	not yet acknowledged by the peer
+	 *	not yet completed by the local io subsystem
+	 * these flags may get cleared in any order by
+	 *	the worker,
+	 *	the receiver,
+	 *	the bio_endio completion callbacks.
+	 */
+	if ((s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) ||
+	    (s & RQ_NET_QUEUED) || (s & RQ_NET_PENDING) ||
+	    (s & RQ_COMPLETION_SUSP)) {
+		drbd_err(device, "drbd_req_complete: Logic BUG rq_state = 0x%x\n", s);
+		return;
+	}
+
+	if (!req->master_bio) {
+		drbd_err(device, "drbd_req_complete: Logic BUG, master_bio == NULL!\n");
+		return;
+	}
+
+	rw = bio_rw(req->master_bio);
+
+	/*
+	 * figure out whether to report success or failure.
+	 *
+	 * report success when at least one of the operations succeeded.
+	 * or, to put the other way,
+	 * only report failure, when both operations failed.
+	 *
+	 * what to do about the failures is handled elsewhere.
+	 * what we need to do here is just: complete the master_bio.
+	 *
+	 * local completion error, if any, has been stored as ERR_PTR
+	 * in private_bio within drbd_request_endio.
+	 */
+	ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
+	error = PTR_ERR(req->private_bio);
+
+	/* remove the request from the interval tree used for conflict
+	 * detection (writes), respectively block_id verification (reads) */
+	if (!drbd_interval_empty(&req->i)) {
+		struct rb_root *root;
+
+		if (rw == WRITE)
+			root = &device->write_requests;
+		else
+			root = &device->read_requests;
+		drbd_remove_request_interval(root, req);
+	}
+
+	/* Before we can signal completion to the upper layers,
+	 * we may need to close the current transfer log epoch.
+	 * We are within the request lock, so we can simply compare
+	 * the request epoch number with the current transfer log
+	 * epoch number.  If they match, start_new_tl_epoch() increases
+	 * current_tle_nr, and resets current_tle_writes.
+	 */
+	if (rw == WRITE &&
+	    req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr))
+		start_new_tl_epoch(first_peer_device(device)->connection);
+
+	/* Update disk stats */
+	_drbd_end_io_acct(device, req);
+
+	/* If READ failed,
+	 * have it be pushed back to the retry work queue,
+	 * so it will re-enter __drbd_make_request(),
+	 * and be re-assigned to a suitable local or remote path,
+	 * or failed if we do not have access to good data anymore.
+	 *
+	 * Unless it was failed early by __drbd_make_request(),
+	 * because no path was available, in which case
+	 * it was not even added to the transfer_log.
+	 *
+	 * READA may fail, and will not be retried.
+	 *
+	 * WRITE should have used all available paths already.
+	 */
+	if (!ok && rw == READ && !list_empty(&req->tl_requests))
+		req->rq_state |= RQ_POSTPONED;
+
+	if (!(req->rq_state & RQ_POSTPONED)) {
+		m->error = ok ? 0 : (error ?: -EIO); /* -EIO if no local error was stored */
+		m->bio = req->master_bio;
+		req->master_bio = NULL;
+	}
+}
+
+/* Drop "put" completion references.  Returns 1 if the request got completed,
+ * so the caller should put a kref for it as well; 0 if references remain, or
+ * if the request was postponed and queued for retry instead. */
+static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put)
+{
+	struct drbd_device *device = req->device;
+
+	D_ASSERT(device, m || (req->rq_state & RQ_POSTPONED));
+
+	if (atomic_sub_and_test(put, &req->completion_ref)) {
+		drbd_req_complete(req, m);
+		if (!(req->rq_state & RQ_POSTPONED))
+			return 1;
+		/* keep the req object around, and queue it for retry */
+		drbd_restart_request(req);
+	}
+
+	return 0;
+}
+
+/* Apply the "clear" and "set" masks to req->rq_state, and adjust completion_ref
+ * and kref accordingly.  Meant to be the only place that manipulates those. */
+static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
+		int clear, int set)
+{
+	struct drbd_device *device = req->device;
+	unsigned s = req->rq_state;
+	int c_put = 0; /* completion_ref references to put */
+	int k_put = 0; /* kref references to put */
+
+	if (drbd_suspended(device) && !((s | clear) & RQ_COMPLETION_SUSP))
+		set |= RQ_COMPLETION_SUSP;
+
+	/* apply: clear first, then set */
+
+	req->rq_state &= ~clear;
+	req->rq_state |= set;
+
+	/* no change? */
+	if (req->rq_state == s)
+		return;
+
+	/* intent: get references for bits that become newly set */
+
+	if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING))
+		atomic_inc(&req->completion_ref);
+
+	if (!(s & RQ_NET_PENDING) && (set & RQ_NET_PENDING)) {
+		inc_ap_pending(device);
+		atomic_inc(&req->completion_ref);
+	}
+
+	if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED))
+		atomic_inc(&req->completion_ref);
+
+	if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK))
+		kref_get(&req->kref); /* wait for the DONE */
+
+	if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT))
+		atomic_add(req->i.size >> 9, &device->ap_in_flight);
+
+	if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP))
+		atomic_inc(&req->completion_ref);
+
+	/* progress: count references to put; s is the state before this change */
+
+	if ((s & RQ_COMPLETION_SUSP) && (clear & RQ_COMPLETION_SUSP))
+		++c_put;
+
+	if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) {
+		D_ASSERT(device, req->rq_state & RQ_LOCAL_PENDING);
+		/* local completion may still come in later,
+		 * we need to keep the req object around. */
+		kref_get(&req->kref);
+		++c_put;
+	}
+
+	if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) {
+		if (req->rq_state & RQ_LOCAL_ABORTED)
+			++k_put;
+		else
+			++c_put;
+	}
+
+	if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) {
+		dec_ap_pending(device);
+		++c_put;
+	}
+
+	if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED))
+		++c_put;
+
+	if ((s & RQ_EXP_BARR_ACK) && !(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
+		if (req->rq_state & RQ_NET_SENT)
+			atomic_sub(req->i.size >> 9, &device->ap_in_flight);
+		++k_put;
+	}
+
+	/* potentially complete and destroy */
+
+	if (k_put || c_put) {
+		/* Completion does its own kref_put.  If we are going to
+		 * kref_sub below, we need req to be still around then. */
+		int at_least = k_put + !!c_put;
+		int refcount = atomic_read(&req->kref.refcount);
+		if (refcount < at_least)
+			drbd_err(device,
+				"mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
+				s, req->rq_state, refcount, at_least);
+	}
+
+	/* If we made progress, retry conflicting peer requests, if any. */
+	if (req->i.waiting)
+		wake_up(&device->misc_wait);
+
+	if (c_put)
+		k_put += drbd_req_put_completion_ref(req, m, c_put);
+	if (k_put)
+		kref_sub(&req->kref, k_put, drbd_req_destroy);
+}
+
+/* Rate-limited warning about a failed local READ or WRITE of this request. */
+static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req)
+{
+	const char *dir = (req->rq_state & RQ_WRITE) ? "WRITE" : "READ";
+	char name[BDEVNAME_SIZE];
+
+	if (__ratelimit(&drbd_ratelimit_state)) {
+		drbd_warn(device, "local %s IO error sector %llu+%u on %s\n",
+			  dir, (unsigned long long)req->i.sector,
+			  req->i.size >> 9,
+			  bdevname(device->ldev->backing_bdev, name));
+	}
+}
+
+/* Apply the event "what" to the request "req", all in this one place.
+ *
+ * This could be coded as many single functions instead of one huge switch,
+ * or by putting the code directly in the respective locations (as it has
+ * been before).  But having it this way makes it easier to audit, makes it
+ * obvious that whatever "event" "happens" to a request should happen
+ * "atomically" within the req_lock, and enforces that we think in a very
+ * structured manner about the "events" in the life time of a request.
+ *
+ * m, if not NULL, has m->bio reset to NULL first; see drbd_req_complete().
+ * Returns 0, or MR_READ/MR_WRITE if the request was queued for resend/restart.
+ */
+int __req_mod(struct drbd_request *req, enum drbd_req_event what,
+		struct bio_and_error *m)
+{
+	struct drbd_device *device = req->device;
+	struct net_conf *nc;
+	int p, rv = 0;
+
+	if (m)
+		m->bio = NULL;
+
+	switch (what) {
+	default:
+		drbd_err(device, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
+		break;
+
+	/* does not happen...
+	 * initialization done in drbd_req_new
+	case CREATED:
+		break;
+		*/
+
+	case TO_BE_SENT: /* via network */
+		/* reached via __drbd_make_request
+		 * and from w_read_retry_remote */
+		D_ASSERT(device, !(req->rq_state & RQ_NET_MASK));
+		rcu_read_lock();
+		nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+		p = nc->wire_protocol;
+		rcu_read_unlock();
+		req->rq_state |=
+			p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK :
+			p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0;
+		mod_rq_state(req, m, 0, RQ_NET_PENDING);
+		break;
+
+	case TO_BE_SUBMITTED: /* locally */
+		/* reached via __drbd_make_request */
+		D_ASSERT(device, !(req->rq_state & RQ_LOCAL_MASK));
+		mod_rq_state(req, m, 0, RQ_LOCAL_PENDING);
+		break;
+
+	case COMPLETED_OK:
+		if (req->rq_state & RQ_WRITE)
+			device->writ_cnt += req->i.size >> 9;
+		else
+			device->read_cnt += req->i.size >> 9;
+
+		mod_rq_state(req, m, RQ_LOCAL_PENDING,
+				RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
+		break;
+
+	case ABORT_DISK_IO:
+		mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED);
+		break;
+
+	case WRITE_COMPLETED_WITH_ERROR:
+		drbd_report_io_error(device, req);
+		__drbd_chk_io_error(device, DRBD_WRITE_ERROR);
+		mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+		break;
+
+	case READ_COMPLETED_WITH_ERROR:
+		drbd_set_out_of_sync(device, req->i.sector, req->i.size);
+		drbd_report_io_error(device, req);
+		__drbd_chk_io_error(device, DRBD_READ_ERROR);
+		/* fall through. */
+	case READ_AHEAD_COMPLETED_WITH_ERROR:
+		/* it is legal to fail READA, no __drbd_chk_io_error in that case. */
+		mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+		break;
+
+	case DISCARD_COMPLETED_NOTSUPP:
+	case DISCARD_COMPLETED_WITH_ERROR:
+		/* I'd rather not detach from local disk just because it
+		 * failed a REQ_DISCARD. */
+		mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+		break;
+
+	case QUEUE_FOR_NET_READ:
+		/* READ or READA, and
+		 * no local disk,
+		 * or target area marked as invalid,
+		 * or just got an io-error. */
+		/* from __drbd_make_request
+		 * or from bio_endio during read io-error recovery */
+
+		/* So we can verify the handle in the answer packet.
+		 * Corresponding drbd_remove_request_interval is in
+		 * drbd_req_complete() */
+		D_ASSERT(device, drbd_interval_empty(&req->i));
+		drbd_insert_interval(&device->read_requests, &req->i);
+
+		set_bit(UNPLUG_REMOTE, &device->flags);
+
+		D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+		D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0);
+		mod_rq_state(req, m, 0, RQ_NET_QUEUED);
+		req->w.cb = w_send_read_req;
+		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+				&req->w);
+		break;
+
+	case QUEUE_FOR_NET_WRITE:
+		/* assert something? */
+		/* from __drbd_make_request only */
+
+		/* Corresponding drbd_remove_request_interval is in
+		 * drbd_req_complete() */
+		D_ASSERT(device, drbd_interval_empty(&req->i));
+		drbd_insert_interval(&device->write_requests, &req->i);
+
+		/* NOTE
+		 * In case the req ended up on the transfer log before being
+		 * queued on the worker, it could lead to this request being
+		 * missed during cleanup after connection loss.
+		 * So we have to do both operations here,
+		 * within the same lock that protects the transfer log.
+		 *
+		 * _req_add_to_epoch(req); this has to be after the
+		 * _maybe_start_new_epoch(req); which happened in
+		 * __drbd_make_request, because we now may set the bit
+		 * again ourselves to close the current epoch.
+		 *
+		 * Add req to the (now) current epoch (barrier). */
+
+		/* otherwise we may lose an unplug, which may cause some remote
+		 * io-scheduler timeout to expire, increasing maximum latency,
+		 * hurting performance. */
+		set_bit(UNPLUG_REMOTE, &device->flags);
+
+		/* queue work item to send data */
+		D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+		mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK);
+		req->w.cb =  w_send_dblock;
+		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+				&req->w);
+
+		/* close the epoch, in case it outgrew the limit */
+		rcu_read_lock();
+		nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+		p = nc->max_epoch_size;
+		rcu_read_unlock();
+		if (first_peer_device(device)->connection->current_tle_writes >= p)
+			start_new_tl_epoch(first_peer_device(device)->connection);
+
+		break;
+
+	case QUEUE_FOR_SEND_OOS:
+		mod_rq_state(req, m, 0, RQ_NET_QUEUED);
+		req->w.cb =  w_send_out_of_sync;
+		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+				&req->w);
+		break;
+
+	case READ_RETRY_REMOTE_CANCELED:
+	case SEND_CANCELED:
+	case SEND_FAILED:
+		/* real cleanup will be done from tl_clear.  just update flags
+		 * so it is no longer marked as on the worker queue */
+		mod_rq_state(req, m, RQ_NET_QUEUED, 0);
+		break;
+
+	case HANDED_OVER_TO_NETWORK:
+		/* assert something? */
+		if (bio_data_dir(req->master_bio) == WRITE &&
+		    !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) {
+			/* this is what is dangerous about protocol A:
+			 * pretend it was successfully written on the peer. */
+			if (req->rq_state & RQ_NET_PENDING)
+				mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
+			/* else: neg-ack was faster... */
+			/* it is still not yet RQ_NET_DONE until the
+			 * corresponding epoch barrier got acked as well,
+			 * so we know what to dirty on connection loss */
+		}
+		mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
+		break;
+
+	case OOS_HANDED_TO_NETWORK:
+		/* Was not set PENDING, no longer QUEUED, so is now DONE
+		 * as far as this connection is concerned. */
+		mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_DONE);
+		break;
+
+	case CONNECTION_LOST_WHILE_PENDING:
+		/* transfer log cleanup after connection loss */
+		mod_rq_state(req, m,
+				RQ_NET_OK|RQ_NET_PENDING|RQ_COMPLETION_SUSP,
+				RQ_NET_DONE);
+		break;
+
+	case CONFLICT_RESOLVED:
+		/* for superseded conflicting writes of multiple primaries,
+		 * there is no need to keep anything in the tl, potential
+		 * node crashes are covered by the activity log.
+		 *
+		 * If this request had been marked as RQ_POSTPONED before,
+		 * it will actually not be completed, but "restarted",
+		 * resubmitted from the retry worker context. */
+		D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+		D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
+		mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_DONE|RQ_NET_OK);
+		break;
+
+	case WRITE_ACKED_BY_PEER_AND_SIS:
+		req->rq_state |= RQ_NET_SIS; /* fall through */
+	case WRITE_ACKED_BY_PEER:
+		D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
+		/* protocol C; successfully written on peer.
+		 * Nothing more to do here.
+		 * We want to keep the tl in place for all protocols, to cater
+		 * for volatile write-back caches on lower level devices. */
+
+		goto ack_common;
+	case RECV_ACKED_BY_PEER:
+		D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK);
+		/* protocol B; pretends to be successfully written on peer.
+		 * see also notes above in HANDED_OVER_TO_NETWORK about
+		 * protocol != C */
+	ack_common:
+		D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+		mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
+		break;
+
+	case POSTPONE_WRITE:
+		D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
+		/* If this node has already detected the write conflict, the
+		 * worker will be waiting on misc_wait.  Wake it up once this
+		 * request has completed locally.
+		 */
+		D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+		req->rq_state |= RQ_POSTPONED;
+		if (req->i.waiting)
+			wake_up(&device->misc_wait);
+		/* Do not clear RQ_NET_PENDING. This request will make further
+		 * progress via restart_conflicting_writes() or
+		 * fail_postponed_requests(). Hopefully. */
+		break;
+
+	case NEG_ACKED:
+		mod_rq_state(req, m, RQ_NET_OK|RQ_NET_PENDING, 0);
+		break;
+
+	case FAIL_FROZEN_DISK_IO:
+		if (!(req->rq_state & RQ_LOCAL_COMPLETED))
+			break;
+		mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
+		break;
+
+	case RESTART_FROZEN_DISK_IO:
+		if (!(req->rq_state & RQ_LOCAL_COMPLETED))
+			break;
+
+		mod_rq_state(req, m,
+				RQ_COMPLETION_SUSP|RQ_LOCAL_COMPLETED,
+				RQ_LOCAL_PENDING);
+
+		rv = MR_READ;
+		if (bio_data_dir(req->master_bio) == WRITE)
+			rv = MR_WRITE;
+
+		get_ldev(device); /* always succeeds in this call path */
+		req->w.cb = w_restart_disk_io;
+		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+				&req->w);
+		break;
+
+	case RESEND:
+		/* Simply complete (local only) READs. */
+		if (!(req->rq_state & RQ_WRITE) && !req->w.cb) {
+			mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
+			break;
+		}
+
+		/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
+		   before the connection loss (B&C only); only P_BARRIER_ACK
+		   (or the local completion?) was missing when we suspended.
+		   Throwing them out of the TL here by pretending we got a BARRIER_ACK.
+		   During connection handshake, we ensure that the peer was not rebooted. */
+		if (!(req->rq_state & RQ_NET_OK)) {
+			/* FIXME could this possibly be a req->w.cb == w_send_out_of_sync?
+			 * in that case we must not set RQ_NET_PENDING. */
+
+			mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING);
+			if (req->w.cb) {
+				drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+						&req->w);
+				rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
+			} /* else: FIXME can this happen? */
+			break;
+		}
+		/* else, fall through to BARRIER_ACKED */
+
+	case BARRIER_ACKED:
+		/* barrier ack for READ requests does not make sense */
+		if (!(req->rq_state & RQ_WRITE))
+			break;
+
+		if (req->rq_state & RQ_NET_PENDING) {
+			/* barrier came in before all requests were acked.
+			 * this is bad, because if the connection is lost now,
+			 * we won't be able to clean them up... */
+			drbd_err(device, "FIXME (BARRIER_ACKED but pending)\n");
+		}
+		/* Allowed to complete requests, even while suspended.
+		 * As this is called for all requests within a matching epoch,
+		 * we need to filter, and only set RQ_NET_DONE for those that
+		 * have actually been on the wire. */
+		mod_rq_state(req, m, RQ_COMPLETION_SUSP,
+				(req->rq_state & RQ_NET_MASK) ? RQ_NET_DONE : 0);
+		break;
+
+	case DATA_RECEIVED:
+		D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+		mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE);
+		break;
+
+	case QUEUE_AS_DRBD_BARRIER:
+		start_new_tl_epoch(first_peer_device(device)->connection);
+		mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE);
+		break;
+	};
+
+	return rv;
+}
+
+/* A local read is allowed if
+ * - the local disk is D_UP_TO_DATE, or
+ * - the local disk is D_INCONSISTENT, but no bit covering the requested
+ *   area is set in the bitmap, i.e. this area is still/already in sync.
+ *   As size may exceed BM_BLOCK_SIZE, a range of bits has to be checked.
+ */
+static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size)
+{
+	const sector_t esector = sector + (size >> 9) - 1;
+	sector_t nr_sectors;
+
+	switch (device->state.disk) {
+	case D_UP_TO_DATE:
+		return true;
+	case D_INCONSISTENT:
+		break;
+	default:
+		return false;
+	}
+	nr_sectors = drbd_get_capacity(device->this_bdev);
+	D_ASSERT(device, sector  < nr_sectors);
+	D_ASSERT(device, esector < nr_sectors);
+
+	return drbd_bm_count_bits(device, BM_SECT_TO_BIT(sector),
+				  BM_SECT_TO_BIT(esector)) == 0;
+}
+
+/* Should this read go to the peer (true), according to the policy rbm? */
+static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t sector,
+		enum drbd_read_balancing rbm)
+{
+	int local, pending;
+
+	switch (rbm) {
+	case RB_PREFER_REMOTE:
+		return true;
+	case RB_ROUND_ROBIN:
+		return test_and_change_bit(READ_BALANCE_RR, &device->flags);
+	case RB_CONGESTED_REMOTE:
+		return bdi_read_congested(
+			&device->ldev->backing_bdev->bd_disk->queue->backing_dev_info);
+	case RB_LEAST_PENDING:
+		local = atomic_read(&device->local_cnt);
+		pending = atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt);
+		return local > pending;
+	case RB_32K_STRIPING:  /* stripe shift 15 */
+	case RB_64K_STRIPING:
+	case RB_128K_STRIPING:
+	case RB_256K_STRIPING:
+	case RB_512K_STRIPING:
+	case RB_1M_STRIPING:   /* stripe shift 20 */
+		return (sector >> (rbm - RB_32K_STRIPING + 15 - 9)) & 1;
+	case RB_PREFER_LOCAL:
+	default:
+		return false;
+	}
+}
+
+/*
+ * complete_conflicting_writes  -  wait for any conflicting write requests
+ *
+ * The write_requests tree contains all active write requests which we
+ * currently know about.  Sleep on misc_wait until no interval in it overlaps
+ * req any more; the only way out is removal of the conflicting intervals.
+ *
+ * Drops resource->req_lock around schedule(), and re-takes it afterwards.
+ */
+static void complete_conflicting_writes(struct drbd_request *req)
+{
+	DEFINE_WAIT(wait);
+	struct drbd_device *device = req->device;
+	struct drbd_interval *i;
+	sector_t sector = req->i.sector;
+	int size = req->i.size;
+
+	i = drbd_find_overlap(&device->write_requests, sector, size);
+	if (!i)
+		return;
+
+	for (;;) {
+		prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
+		i = drbd_find_overlap(&device->write_requests, sector, size);
+		if (!i)
+			break;
+		/* Indicate to wake up device->misc_wait on progress.  */
+		i->waiting = true;
+		spin_unlock_irq(&device->resource->req_lock);
+		schedule();
+		spin_lock_irq(&device->resource->req_lock);
+	}
+	finish_wait(&device->misc_wait, &wait);
+}
+
+/* Check the congestion policy, and possibly pull ahead of (or disconnect
+ * from) the peer.  Called within req_lock and rcu_read_lock(). */
+static void maybe_pull_ahead(struct drbd_device *device)
+{
+	struct drbd_connection *connection = first_peer_device(device)->connection;
+	enum drbd_on_congestion policy = OC_BLOCK;
+	bool fill_reached, extents_reached;
+	struct net_conf *nc;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	if (nc)
+		policy = nc->on_congestion;
+	rcu_read_unlock();
+	if (policy == OC_BLOCK || connection->agreed_pro_version < 96)
+		return;
+
+	/* Without good local storage we can not reasonably try to pull ahead
+	 * of the peer.  The local reference also makes sure device->act_log
+	 * is there. */
+	if (!get_ldev_if_state(device, D_UP_TO_DATE))
+		return;
+
+	fill_reached = nc->cong_fill &&
+		atomic_read(&device->ap_in_flight) >= nc->cong_fill;
+	if (fill_reached)
+		drbd_info(device, "Congestion-fill threshold reached\n");
+
+	extents_reached = device->act_log->used >= nc->cong_extents;
+	if (extents_reached)
+		drbd_info(device, "Congestion-extents threshold reached\n");
+
+	if (fill_reached || extents_reached) {
+		/* start a new epoch for non-mirrored writes */
+		start_new_tl_epoch(connection);
+
+		/* anything but OC_PULL_AHEAD is OC_DISCONNECT at this point */
+		if (policy == OC_PULL_AHEAD)
+			_drbd_set_state(_NS(device, conn, C_AHEAD), 0, NULL);
+		else
+			_drbd_set_state(_NS(device, conn, C_DISCONNECTING), 0, NULL);
+	}
+
+	put_ldev(device);
+}
+
+/* Decide whether a read request is to be served by the peer.
+ *
+ * Returns true if it should be read remotely; req->private_bio, if any,
+ * has been destroyed in that case.
+ *
+ * Returns false, with req->private_bio still set, if it should be
+ * submitted locally; returns false with req->private_bio not set,
+ * if we do not have access to good data :(
+ */
+static bool do_remote_read(struct drbd_request *req)
+{
+	struct drbd_device *device = req->device;
+	enum drbd_read_balancing rbm;
+	bool remote;
+
+	/* no local read, if drbd_may_do_local_read() refuses this area */
+	if (req->private_bio &&
+	    !drbd_may_do_local_read(device, req->i.sector, req->i.size)) {
+		bio_put(req->private_bio);
+		req->private_bio = NULL;
+		put_ldev(device);
+	}
+
+	/* peer disk is not up to date: local, or nothing at all */
+	if (device->state.pdsk != D_UP_TO_DATE)
+		return false;
+
+	/* no local bio (left): the peer is the only choice */
+	if (!req->private_bio)
+		return true;
+
+	/* TODO: improve read balancing decisions, take into account drbd
+	 * protocol, pending requests etc. */
+
+	/* Both are possible, req->private_bio is known to be set from here on:
+	 * let the configured read balancing policy decide. */
+	rcu_read_lock();
+	rbm = rcu_dereference(device->ldev->disk_conf)->read_balancing;
+	rcu_read_unlock();
+
+	remote = rbm != RB_PREFER_LOCAL &&
+		remote_due_to_read_balancing(device, req->i.sector, rbm);
+	if (remote) {
+		bio_put(req->private_bio);
+		req->private_bio = NULL;
+		put_ldev(device);
+	}
+
+	return remote;
+}
+
+/* Feed a write request into the replication path, where applicable.
+ * Returns the number of connections (== 1, for drbd 8.4) expected to actually
+ * write this data, which does NOT include those that we are L_AHEAD for. */
+static int drbd_process_write_request(struct drbd_request *req)
+{
+	struct drbd_device *device = req->device;
+	const int remote = drbd_should_do_remote(device->state);
+	const int send_oos = drbd_should_send_out_of_sync(device->state);
+
+	/* Writes need to be replicated.  Unless it is an empty flush,
+	 * which is better mapped to a DRBD P_BARRIER packet,
+	 * also for drbd wire protocol compatibility reasons.
+	 * For a flush, just start a new epoch.
+	 * Unless the current epoch was empty anyways, or we are not currently
+	 * replicating, in which case there is no point. */
+	if (unlikely(req->i.size == 0)) {
+		/* The only size==0 bios we expect are empty flushes. */
+		D_ASSERT(device, req->master_bio->bi_rw & REQ_FLUSH);
+		if (remote)
+			_req_mod(req, QUEUE_AS_DRBD_BARRIER);
+		return remote;
+	}
+
+	if (remote || send_oos) {
+		D_ASSERT(device, !(remote && send_oos));
+
+		/* either send the data itself, or set it out-of-sync; in the
+		 * latter case QUEUE_FOR_SEND_OOS only on a non-zero return */
+		if (remote) {
+			_req_mod(req, TO_BE_SENT);
+			_req_mod(req, QUEUE_FOR_NET_WRITE);
+		} else if (drbd_set_out_of_sync(device, req->i.sector, req->i.size)) {
+			_req_mod(req, QUEUE_FOR_SEND_OOS);
+		}
+	}
+
+	return remote;
+}
+
+static void
+drbd_submit_req_private_bio(struct drbd_request *req)
+{
+	struct drbd_device *device = req->device;
+	struct bio *bio = req->private_bio;
+	const int rw = bio_rw(bio); /* only used to pick the fault injection type below */
+
+	bio->bi_bdev = device->ldev->backing_bdev; /* direct the private bio at the backing device */
+
+	/* State may have changed since we grabbed our reference on the
+	 * ->ldev member. Double check, and short-circuit to endio.
+	 * In case the last activity log transaction failed to get on
+	 * stable storage, and this is a WRITE, we may not even submit
+	 * this bio. */
+	if (get_ldev(device)) {
+		if (drbd_insert_fault(device,
+				      rw == WRITE ? DRBD_FAULT_DT_WR
+				    : rw == READ  ? DRBD_FAULT_DT_RD
+				    :               DRBD_FAULT_DT_RA))
+			bio_endio(bio, -EIO); /* injected fault: fail the bio instead of submitting it */
+		else
+			generic_make_request(bio);
+		put_ldev(device); /* drops only the reference taken by the get_ldev() above */
+	} else
+		bio_endio(bio, -EIO); /* get_ldev() failed: complete with an error, do not submit */
+}
+
+static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req)
+{
+	spin_lock(&device->submit.lock); /* protects device->submit.writes */
+	list_add_tail(&req->tl_requests, &device->submit.writes); /* FIFO; tl_requests is reused for the transfer log later */
+	spin_unlock(&device->submit.lock);
+	queue_work(device->submit.wq, &device->submit.worker); /* handler is presumably do_submit() -- set up elsewhere */
+}
+
+/* Returns the new drbd_request pointer, if the caller is expected to
+ * drbd_send_and_submit() it (to save latency), or NULL if we queued the
+ * request via drbd_queue_write() for the submit work.  Returns
+ * ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request; @bio was ended then.
+ */
+static struct drbd_request *
+drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_time)
+{
+	const int rw = bio_data_dir(bio);
+	struct drbd_request *req;
+
+	/* allocate outside of all locks; */
+	req = drbd_req_new(device, bio);
+	if (!req) {
+		dec_ap_bio(device); /* balances the inc_ap_bio() in drbd_make_request() */
+		/* only pass the error to the upper layers.
+		 * if user cannot handle io errors, that's not our business. */
+		drbd_err(device, "could not kmalloc() req\n");
+		bio_endio(bio, -ENOMEM);
+		return ERR_PTR(-ENOMEM);
+	}
+	req->start_time = start_time;
+
+	if (!get_ldev(device)) { /* no local disk reference: this request gets no private bio */
+		bio_put(req->private_bio);
+		req->private_bio = NULL;
+	}
+
+	/* Update disk stats */
+	_drbd_start_io_acct(device, req);
+
+	if (rw == WRITE && req->private_bio && req->i.size
+	&& !test_bit(AL_SUSPENDED, &device->flags)) {
+		if (!drbd_al_begin_io_fastpath(device, &req->i)) {
+			drbd_queue_write(device, req); /* fast path failed: defer to the submit work */
+			return NULL;
+		}
+		req->rq_state |= RQ_IN_ACT_LOG;
+	}
+
+	return req;
+}
+
+static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req)
+{
+	const int rw = bio_rw(req->master_bio);
+	struct bio_and_error m = { NULL, }; /* m.bio is set if the master bio is to be completed */
+	bool no_remote = false;
+
+	spin_lock_irq(&device->resource->req_lock);
+	if (rw == WRITE) {
+		/* This may temporarily give up the req_lock,
+		 * but will re-acquire it before it returns here.
+		 * Needs to be before the check on drbd_suspended() */
+		complete_conflicting_writes(req);
+		/* no more giving up req_lock from now on! */
+
+		/* check for congestion, and potentially stop sending
+		 * full data updates, but start sending "dirty bits" only. */
+		maybe_pull_ahead(device);
+	}
+
+
+	if (drbd_suspended(device)) {
+		/* push back and retry: */
+		req->rq_state |= RQ_POSTPONED;
+		if (req->private_bio) {
+			bio_put(req->private_bio);
+			req->private_bio = NULL;
+			put_ldev(device);
+		}
+		goto out;
+	}
+
+	/* We fail READ/READA early, if we can not serve it.
+	 * We must do this before req is registered on any lists.
+	 * Otherwise, drbd_req_complete() will queue failed READ for retry. */
+	if (rw != WRITE) {
+		if (!do_remote_read(req) && !req->private_bio)
+			goto nodata; /* no local bio, and the peer disk is not up to date */
+	}
+
+	/* which transfer log epoch does this belong to? */
+	req->epoch = atomic_read(&first_peer_device(device)->connection->current_tle_nr);
+
+	/* no point in adding empty flushes to the transfer log,
+	 * they are mapped to drbd barriers already. */
+	if (likely(req->i.size!=0)) {
+		if (rw == WRITE)
+			first_peer_device(device)->connection->current_tle_writes++;
+
+		list_add_tail(&req->tl_requests, &first_peer_device(device)->connection->transfer_log);
+	}
+
+	if (rw == WRITE) {
+		if (!drbd_process_write_request(req))
+			no_remote = true;
+	} else {
+		/* We either have a private_bio, or we can read from remote.
+		 * Otherwise we had done the goto nodata above. */
+		if (req->private_bio == NULL) {
+			_req_mod(req, TO_BE_SENT);
+			_req_mod(req, QUEUE_FOR_NET_READ);
+		} else
+			no_remote = true;
+	}
+
+	if (req->private_bio) {
+		/* needs to be marked within the same spinlock */
+		_req_mod(req, TO_BE_SUBMITTED);
+		/* but we need to give up the spinlock to submit */
+		spin_unlock_irq(&device->resource->req_lock);
+		drbd_submit_req_private_bio(req);
+		spin_lock_irq(&device->resource->req_lock);
+	} else if (no_remote) {
+nodata:
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_err(device, "IO ERROR: neither local nor remote data, sector %llu+%u\n",
+					(unsigned long long)req->i.sector, req->i.size >> 9);
+		/* A write may have been queued for send_oos, however.
+		 * So we can not simply free it, we must go through drbd_req_put_completion_ref() */
+	}
+
+out:
+	if (drbd_req_put_completion_ref(req, &m, 1))
+		kref_put(&req->kref, drbd_req_destroy);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	if (m.bio) /* the master bio is completed only after the req_lock was dropped */
+		complete_master_bio(device, &m);
+}
+
+void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_time)
+{
+	struct drbd_request *prepared = drbd_request_prepare(device, bio, start_time);
+
+	if (!IS_ERR_OR_NULL(prepared)) /* NULL: queued for the submit work; ERR_PTR: bio already ended */
+		drbd_send_and_submit(device, prepared);
+}
+
+static void submit_fast_path(struct drbd_device *device, struct list_head *incoming)
+{
+	struct drbd_request *req, *tmp; /* _safe iteration: entries are unlinked while walking */
+	list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
+		const int rw = bio_data_dir(req->master_bio);
+
+		if (rw == WRITE /* rw != WRITE should not even end up here! */
+		&& req->private_bio && req->i.size
+		&& !test_bit(AL_SUSPENDED, &device->flags)) {
+			if (!drbd_al_begin_io_fastpath(device, &req->i))
+				continue; /* fast path failed: the request stays on @incoming */
+
+			req->rq_state |= RQ_IN_ACT_LOG;
+		}
+
+		list_del_init(&req->tl_requests); /* must be off @incoming before it goes to the transfer log */
+		drbd_send_and_submit(device, req);
+	}
+}
+
+static bool prepare_al_transaction_nonblock(struct drbd_device *device,
+					    struct list_head *incoming,
+					    struct list_head *pending)
+{
+	struct drbd_request *req, *tmp;
+	int wake = 0; /* set if any request hit -EBUSY */
+	int err;
+
+	spin_lock_irq(&device->al_lock);
+	list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
+		err = drbd_al_begin_io_nonblock(device, &req->i);
+		if (err == -EBUSY)
+			wake = 1;
+		if (err)
+			continue; /* any error: the request stays on @incoming */
+		req->rq_state |= RQ_IN_ACT_LOG;
+		list_move_tail(&req->tl_requests, pending);
+	}
+	spin_unlock_irq(&device->al_lock);
+	if (wake)
+		wake_up(&device->al_wait); /* done only after al_lock was released */
+
+	return !list_empty(pending); /* also true if @pending was non-empty on entry */
+}
+
+void do_submit(struct work_struct *ws)
+{
+	struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker);
+	LIST_HEAD(incoming); /* requests still waiting for the activity log */
+	LIST_HEAD(pending); /* requests prepared for the next activity log commit */
+	struct drbd_request *req, *tmp;
+
+	for (;;) {
+		spin_lock(&device->submit.lock);
+		list_splice_tail_init(&device->submit.writes, &incoming);
+		spin_unlock(&device->submit.lock);
+
+		submit_fast_path(device, &incoming);
+		if (list_empty(&incoming))
+			break; /* the fast path handled everything: done */
+
+skip_fast_path:
+		wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending));
+		/* Maybe more was queued, while we prepared the transaction?
+		 * Try to stuff them into this transaction as well.
+		 * Be strictly non-blocking here, no wait_event, we already
+		 * have something to commit.
+		 * Stop if we don't make any more progress.
+		 */
+		for (;;) {
+			LIST_HEAD(more_pending);
+			LIST_HEAD(more_incoming);
+			bool made_progress;
+
+			/* It is ok to look outside the lock,
+			 * it's only an optimization anyways */
+			if (list_empty(&device->submit.writes))
+				break;
+
+			spin_lock(&device->submit.lock);
+			list_splice_tail_init(&device->submit.writes, &more_incoming);
+			spin_unlock(&device->submit.lock);
+
+			if (list_empty(&more_incoming))
+				break;
+
+			made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending);
+
+			list_splice_tail_init(&more_pending, &pending);
+			list_splice_tail_init(&more_incoming, &incoming);
+
+			if (!made_progress)
+				break;
+		}
+		drbd_al_begin_io_commit(device, false); /* presumably commits what was prepared above -- defined elsewhere */
+
+		list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
+			list_del_init(&req->tl_requests);
+			drbd_send_and_submit(device, req);
+		}
+
+		/* If all currently hot activity log extents are kept busy by
+		 * incoming requests, we still must not totally starve new
+		 * requests to cold extents. In that case, prepare one request
+		 * in blocking mode. */
+		list_for_each_entry_safe(req, tmp, &incoming, tl_requests) {
+			list_del_init(&req->tl_requests);
+			req->rq_state |= RQ_IN_ACT_LOG;
+			if (!drbd_al_begin_io_prepare(device, &req->i)) {
+				/* Corresponding extent was hot after all? */
+				drbd_send_and_submit(device, req);
+			} else {
+				/* Found a request to a cold extent.
+				 * Put on "pending" list,
+				 * and try to cumulate with more. */
+				list_add(&req->tl_requests, &pending);
+				goto skip_fast_path;
+			}
+		}
+	}
+}
+
+void drbd_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct drbd_device *device = (struct drbd_device *) q->queuedata;
+	unsigned long start_time;
+
+	start_time = jiffies; /* ends up in req->start_time, which request_timer_fn() compares against */
+
+	/*
+	 * What we "blindly" assume: the bio size is a multiple of 512 bytes.
+	 */
+	D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512));
+
+	inc_ap_bio(device); /* undone by dec_ap_bio() in drbd_request_prepare() if allocation fails */
+	__drbd_make_request(device, bio, start_time);
+}
+
+/* This is called by bio_add_page().
+ *
+ * q->max_hw_sectors and other global limits are already enforced there.
+ *
+ * We need to call down to our lower level device,
+ * in case it has special restrictions.
+ *
+ * We also may need to enforce configured max-bio-bvecs limits.
+ *
+ * As long as the BIO is empty we have to allow at least one bvec,
+ * regardless of size and offset, so no need to ask lower levels.
+ */
+int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
+{
+	struct drbd_device *device = (struct drbd_device *) q->queuedata;
+	unsigned int bio_size = bvm->bi_size;
+	int limit = DRBD_MAX_BIO_SIZE; /* returned unchanged for an empty bio or without local disk */
+	int backing_limit;
+
+	if (bio_size && get_ldev(device)) {
+		unsigned int max_hw_sectors = queue_max_hw_sectors(q);
+		struct request_queue * const b =
+			device->ldev->backing_bdev->bd_disk->queue;
+		if (b->merge_bvec_fn) {
+			backing_limit = b->merge_bvec_fn(b, bvm, bvec); /* ask the backing device's queue */
+			limit = min(limit, backing_limit);
+		}
+		put_ldev(device);
+		if ((limit >> 9) > max_hw_sectors) /* compare in 512 byte sectors */
+			limit = max_hw_sectors << 9;
+	}
+	return limit;
+}
+
+static void find_oldest_requests(
+		struct drbd_connection *connection,
+		struct drbd_device *device,
+		struct drbd_request **oldest_req_waiting_for_peer,
+		struct drbd_request **oldest_req_waiting_for_disk)
+{
+	struct drbd_request *peer = NULL, *disk = NULL, *cur;
+
+	/* Walk the transfer log from its head; keep the first hit of each kind. */
+	list_for_each_entry(cur, &connection->transfer_log, tl_requests) {
+		const unsigned state = cur->rq_state;
+
+		if (!peer && (state & RQ_NET_MASK) && !(state & RQ_NET_DONE))
+			peer = cur; /* has a network part that is not done yet */
+		if (!disk && (state & RQ_LOCAL_PENDING) && cur->device == device)
+			disk = cur; /* local IO of this very device is still pending */
+		if (peer && disk)
+			break;
+	}
+
+	*oldest_req_waiting_for_peer = peer;
+	*oldest_req_waiting_for_disk = disk;
+}
+
+void request_timer_fn(unsigned long data)
+{
+	struct drbd_device *device = (struct drbd_device *) data; /* timer argument is the device */
+	struct drbd_connection *connection = first_peer_device(device)->connection;
+	struct drbd_request *req_disk, *req_peer; /* oldest request waiting for disk resp. peer */
+	struct net_conf *nc;
+	unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
+	unsigned long now;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	if (nc && device->state.conn >= C_WF_REPORT_PARAMS)
+		ent = nc->timeout * HZ/10 * nc->ko_count; /* effective network timeout, in jiffies */
+
+	if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */
+		dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10; /* disk timeout, in jiffies */
+		put_ldev(device);
+	}
+	rcu_read_unlock();
+
+	et = min_not_zero(dt, ent); /* the smaller of the timeouts that are actually configured */
+
+	if (!et)
+		return; /* Recurring timer stopped */
+
+	now = jiffies;
+
+	spin_lock_irq(&device->resource->req_lock);
+	find_oldest_requests(connection, device, &req_peer, &req_disk);
+	if (req_peer == NULL && req_disk == NULL) {
+		spin_unlock_irq(&device->resource->req_lock);
+		mod_timer(&device->request_timer, now + et); /* nothing pending: just look again later */
+		return;
+	}
+
+	/* The request is considered timed out, if
+	 * - we have some effective timeout from the configuration,
+	 *   with above state restrictions applied,
+	 * - the oldest request is waiting for a response from the network
+	 *   resp. the local disk,
+	 * - the oldest request is in fact older than the effective timeout,
+	 * - the connection was established (resp. disk was attached)
+	 *   for longer than the timeout already.
+	 * Note that for 32bit jiffies and very stable connections/disks,
+	 * we may have a wrap around, which is caught by
+	 *   !time_in_range(now, last_..._jif, last_..._jif + timeout).
+	 *
+	 * Side effect: once per 32bit wrap-around interval, which means every
+	 * ~198 days with 250 HZ, we have a window where the timeout would need
+	 * to expire twice (worst case) to become effective. Good enough.
+	 */
+	if (ent && req_peer &&
+		 time_after(now, req_peer->start_time + ent) &&
+		!time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
+		drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
+		_drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
+	}
+	if (dt && req_disk &&
+		 time_after(now, req_disk->start_time + dt) &&
+		!time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
+		drbd_warn(device, "Local backing device failed to meet the disk-timeout\n");
+		__drbd_chk_io_error(device, DRBD_FORCE_DETACH);
+	}
+
+	/* Reschedule timer for the nearest not already expired timeout.
+	 * Fallback to now + min(effective network timeout, disk timeout). */
+	ent = (ent && req_peer && time_before(now, req_peer->start_time + ent))
+		? req_peer->start_time + ent : now + et;
+	dt = (dt && req_disk && time_before(now, req_disk->start_time + dt))
+		? req_disk->start_time + dt : now + et;
+	nt = time_before(ent, dt) ? ent : dt; /* from here on ent/dt are absolute expiry times */
+	spin_unlock_irq(&device->resource->req_lock);
+	mod_timer(&device->request_timer, nt);
+}
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
new file mode 100644
index 00000000000..8566cd5866b
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.h
@@ -0,0 +1,350 @@
+/*
+   drbd_req.h
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2006-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+   Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+
+   DRBD is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   DRBD is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DRBD_REQ_H
+#define _DRBD_REQ_H
+
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+
+/* The request callbacks will be called in irq context by the IDE drivers,
+   and in Softirqs/Tasklets/BH context by the SCSI drivers,
+   and by the receiver and worker in kernel-thread context.
+   Try to get the locking right :) */
+
+/*
+ * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are
+ * associated with IO requests originating from the block layer above us.
+ *
+ * There are quite a few things that may happen to a drbd request
+ * during its lifetime.
+ *
+ *  It will be created.
+ *  It will be marked with the intention to be
+ *    submitted to local disk and/or
+ *    sent via the network.
+ *
+ *  It has to be placed on the transfer log and other housekeeping lists,
+ *  in case we have a network connection.
+ *
+ *  It may be identified as a concurrent (write) request
+ *    and be handled accordingly.
+ *
+ *  It may be handed over to the local disk subsystem.
+ *  It may be completed by the local disk subsystem,
+ *    either successfully or with io-error.
+ *  In case it is a READ request, and it failed locally,
+ *    it may be retried remotely.
+ *
+ *  It may be queued for sending.
+ *  It may be handed over to the network stack,
+ *    which may fail.
+ *  It may be acknowledged by the "peer" according to the wire_protocol in use.
+ *    this may be a negative ack.
+ *  It may receive a faked ack when the network connection is lost and the
+ *  transfer log is cleaned up.
+ *  Sending may be canceled due to network connection loss.
+ *  When it finally has outlived its time,
+ *    corresponding dirty bits in the resync-bitmap may be cleared or set,
+ *    it will be destroyed,
+ *    and completion will be signalled to the originator,
+ *      with or without "success".
+ */
+
+enum drbd_req_event {
+	CREATED,
+	TO_BE_SENT, /* applied right before QUEUE_FOR_NET_WRITE / QUEUE_FOR_NET_READ */
+	TO_BE_SUBMITTED, /* applied under the req_lock, right before the private bio is submitted */
+
+	/* XXX naming is inconsistent here:
+	 * the QUEUE_* values below are not "events" but "actions".
+	 * Oh, well... */
+	QUEUE_FOR_NET_WRITE,
+	QUEUE_FOR_NET_READ,
+	QUEUE_FOR_SEND_OOS,
+
+	/* An empty flush is queued as P_BARRIER,
+	 * which will cause it to complete "successfully",
+	 * even if the local disk flush failed.
+	 *
+	 * Just like "real" requests, empty flushes (blkdev_issue_flush()) will
+	 * only see an error if neither local nor remote data is reachable. */
+	QUEUE_AS_DRBD_BARRIER,
+
+	SEND_CANCELED,
+	SEND_FAILED,
+	HANDED_OVER_TO_NETWORK,
+	OOS_HANDED_TO_NETWORK,
+	CONNECTION_LOST_WHILE_PENDING,
+	READ_RETRY_REMOTE_CANCELED,
+	RECV_ACKED_BY_PEER,
+	WRITE_ACKED_BY_PEER,
+	WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */
+	CONFLICT_RESOLVED,
+	POSTPONE_WRITE,
+	NEG_ACKED,
+	BARRIER_ACKED, /* in protocol A and B */
+	DATA_RECEIVED, /* (remote read) */
+
+	COMPLETED_OK,
+	READ_COMPLETED_WITH_ERROR,
+	READ_AHEAD_COMPLETED_WITH_ERROR,
+	WRITE_COMPLETED_WITH_ERROR,
+	DISCARD_COMPLETED_NOTSUPP,
+	DISCARD_COMPLETED_WITH_ERROR,
+
+	ABORT_DISK_IO,
+	RESEND,
+	FAIL_FROZEN_DISK_IO,
+	RESTART_FROZEN_DISK_IO,
+	NOTHING,
+};
+
+/* Encoding of request states for now.  We don't actually need that many bits.
+ * We don't need to do atomic bit operations either, since most of the time we
+ * need to look at the connection state and/or manipulate some lists at the
+ * same time, so we should hold the request lock anyways.
+ */
+enum drbd_req_state_bits {
+	/* 3210
+	 * 0000: no local possible
+	 * 0001: to be submitted
+	 *    UNUSED, we could map: 011: submitted, completion still pending
+	 * 0110: completed ok
+	 * 0010: completed with error
+	 * 1001: Aborted (before completion)
+	 * 1x10: Aborted and completed -> free
+	 */
+	__RQ_LOCAL_PENDING,
+	__RQ_LOCAL_COMPLETED,
+	__RQ_LOCAL_OK,
+	__RQ_LOCAL_ABORTED,
+
+	/* 87654
+	 * 00000: no network possible
+	 * 00001: to be sent
+	 * 00011: to be sent, on worker queue
+	 * 00101: sent, expecting recv_ack (B) or write_ack (C)
+	 * 11101: sent,
+	 *        recv_ack (B) or implicit "ack" (A),
+	 *        still waiting for the barrier ack.
+	 *        master_bio may already be completed and invalidated.
+	 * 11100: write acked (C),
+	 *        data received (for remote read, any protocol)
+	 *        or finally the barrier ack has arrived (B,A)...
+	 *        request can be freed
+	 * 01100: neg-acked (write, protocol C)
+	 *        or neg-d-acked (read, any protocol)
+	 *        or killed from the transfer log
+	 *        during cleanup after connection loss
+	 *        request can be freed
+	 * 01000: canceled or send failed...
+	 *        request can be freed
+	 */
+	/* NOTE(review): the diagram above shows five bits, but the net flags below occupy bits 4..9 -- verify */
+	/* if "SENT" is not set, yet, this can still fail or be canceled.
+	 * if "SENT" is set already, we still wait for an Ack packet.
+	 * when cleared, the master_bio may be completed.
+	 * in (B,A) the request object may still linger on the transaction log
+	 * until the corresponding barrier ack comes in */
+	__RQ_NET_PENDING,
+
+	/* If it is QUEUED, and it is a WRITE, it is also registered in the
+	 * transfer log. Currently we need this flag to avoid conflicts between
+	 * worker canceling the request and tl_clear_barrier killing it from
+	 * transfer log.  We should restructure the code so this conflict does
+	 * no longer occur. */
+	__RQ_NET_QUEUED,
+
+	/* well, actually only "handed over to the network stack".
+	 *
+	 * TODO can potentially be dropped because of the similar meaning
+	 * of RQ_NET_SENT and ~RQ_NET_QUEUED.
+	 * however it is not exactly the same. before we drop it
+	 * we must ensure that we can tell a request with network part
+	 * from a request without, regardless of what happens to it. */
+	__RQ_NET_SENT,
+
+	/* when set, the request may be freed (if RQ_NET_QUEUED is clear).
+	 * basically this means the corresponding P_BARRIER_ACK was received */
+	__RQ_NET_DONE,
+
+	/* whether or not we know (C) or pretend (B,A) that the write
+	 * was successfully written on the peer.
+	 */
+	__RQ_NET_OK,
+
+	/* peer called drbd_set_in_sync() for this write */
+	__RQ_NET_SIS,
+
+	/* keep this last, it's for the RQ_NET_MASK */
+	__RQ_NET_MAX,
+
+	/* Set when this is a write, clear for a read */
+	__RQ_WRITE,
+
+	/* Should call drbd_al_complete_io() for this request... */
+	__RQ_IN_ACT_LOG,
+
+	/* The peer has sent a retry ACK */
+	__RQ_POSTPONED,
+
+	/* would have been completed,
+	 * but was not, because of drbd_suspended() */
+	__RQ_COMPLETION_SUSP,
+
+	/* We expect a receive ACK (wire proto B) */
+	__RQ_EXP_RECEIVE_ACK,
+
+	/* We expect a write ACK (wire proto C) */
+	__RQ_EXP_WRITE_ACK,
+
+	/* waiting for a barrier ack, did an extra kref_get */
+	__RQ_EXP_BARR_ACK,
+};
+
+#define RQ_LOCAL_PENDING   (1UL << __RQ_LOCAL_PENDING)
+#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
+#define RQ_LOCAL_OK        (1UL << __RQ_LOCAL_OK)
+#define RQ_LOCAL_ABORTED   (1UL << __RQ_LOCAL_ABORTED)
+
+#define RQ_LOCAL_MASK      ((RQ_LOCAL_ABORTED << 1)-1) /* 0x00f: bits 0..3 */
+
+#define RQ_NET_PENDING     (1UL << __RQ_NET_PENDING)
+#define RQ_NET_QUEUED      (1UL << __RQ_NET_QUEUED)
+#define RQ_NET_SENT        (1UL << __RQ_NET_SENT)
+#define RQ_NET_DONE        (1UL << __RQ_NET_DONE)
+#define RQ_NET_OK          (1UL << __RQ_NET_OK)
+#define RQ_NET_SIS         (1UL << __RQ_NET_SIS)
+
+/* 0x3f0: bits 4..9, i.e. everything below __RQ_NET_MAX that is not a local bit */
+#define RQ_NET_MASK        (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
+
+#define RQ_WRITE           (1UL << __RQ_WRITE)
+#define RQ_IN_ACT_LOG      (1UL << __RQ_IN_ACT_LOG)
+#define RQ_POSTPONED	   (1UL << __RQ_POSTPONED)
+#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP)
+#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK)
+#define RQ_EXP_WRITE_ACK   (1UL << __RQ_EXP_WRITE_ACK)
+#define RQ_EXP_BARR_ACK    (1UL << __RQ_EXP_BARR_ACK)
+
+/* For waking up the frozen transfer log, mod_req() has to return whether the
+   request should be counted in the epoch object. */
+#define MR_WRITE       1
+#define MR_READ        2
+
+static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
+{
+	struct bio *bio;
+	bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? NOTE(review): used below without a NULL check */
+
+	req->private_bio = bio; /* the request's private bio is a clone of @bio_src */
+
+	bio->bi_private  = req; /* back pointer from the clone to its request */
+	bio->bi_end_io   = drbd_request_endio; /* completion callback of the clone */
+	bio->bi_next     = NULL;
+}
+
+/* Short lived temporary struct on the stack.
+ * We could squirrel the error to be returned into
+ * bio->bi_iter.bi_size, or similar. But that would be too ugly. */
+struct bio_and_error {
+	struct bio *bio; /* non-NULL: callers hand the struct to complete_master_bio() */
+	int error; /* the error to be returned along with @bio */
+};
+
+extern void start_new_tl_epoch(struct drbd_connection *connection);
+extern void drbd_req_destroy(struct kref *kref);
+extern void _req_may_be_done(struct drbd_request *req,
+		struct bio_and_error *m);
+extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
+		struct bio_and_error *m);
+extern void complete_master_bio(struct drbd_device *device,
+		struct bio_and_error *m);
+extern void request_timer_fn(unsigned long data);
+extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
+extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
+
+/* this is in drbd_main.c */
+extern void drbd_restart_request(struct drbd_request *req);
+
+/* Use this if you don't want to deal with calling complete_master_bio()
+ * outside the spinlock, e.g. when walking some list on cleanup. */
+static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
+{
+	struct drbd_device *device = req->device; /* fetched up front: req may be gone after __req_mod() */
+	struct bio_and_error m; /* NOTE(review): not initialized here; relies on __req_mod() setting m.bio -- confirm */
+	int rv;
+
+	/* __req_mod possibly frees req, do not touch req after that! */
+	rv = __req_mod(req, what, &m);
+	if (m.bio)
+		complete_master_bio(device, &m);
+
+	return rv; /* whatever __req_mod() returned */
+}
+
+/* Locked variant of _req_mod(): takes the req_lock around __req_mod();
+ * completion of master bio is outside of our spinlock.
+ * We still may or may not be inside some irqs disabled section
+ * of the lower level driver completion callback, so we need to
+ * spin_lock_irqsave here. */
+static inline int req_mod(struct drbd_request *req,
+		enum drbd_req_event what)
+{
+	unsigned long flags;
+	struct drbd_device *device = req->device; /* fetched before __req_mod() is called on req */
+	struct bio_and_error m; /* NOTE(review): not initialized here; relies on __req_mod() setting m.bio -- confirm */
+	int rv;
+	spin_lock_irqsave(&device->resource->req_lock, flags);
+	rv = __req_mod(req, what, &m);
+	spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+	if (m.bio)
+		complete_master_bio(device, &m);
+
+	return rv;
+}
+
+static inline bool drbd_should_do_remote(union drbd_dev_state s)
+{
+	/* Before proto 96 the lower bound was >= CONNECTED instead of >= C_WF_BITMAP_T.
+	 * That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* states. */
+	if (s.pdsk == D_UP_TO_DATE)
+		return true;
+	if (s.pdsk < D_INCONSISTENT)
+		return false;
+	return s.conn >= C_WF_BITMAP_T && s.conn < C_AHEAD;
+}
+static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s)
+{
+	if (s.conn == C_AHEAD) /* no protocol 96 check needed: C_AHEAD is only entered if proto >= 96 */
+		return true;
+	return s.conn == C_WF_BITMAP_S; /* in both states pdsk = D_INCONSISTENT as a consequence */
+}
+
+#endif
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
new file mode 100644
index 00000000000..a5d8aae00e0
--- /dev/null
+++ b/drivers/block/drbd/drbd_state.c
@@ -0,0 +1,1884 @@
+/*
+   drbd_state.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
+   from Logicworks, Inc. for making SDP replication support possible.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/drbd_limits.h>
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+
+struct after_state_chg_work {
+	struct drbd_work w;
+	struct drbd_device *device;
+	union drbd_state os;		/* old state */
+	union drbd_state ns;		/* new state */
+	enum chg_state_flags flags;
+	struct completion *done;	/* presumably the optional @done of __drbd_set_state() - confirm */
+};
+
+enum sanitize_state_warnings {
+	NO_WARNING,			/* sanitize_state() had nothing to report */
+	ABORTED_ONLINE_VERIFY,		/* a disk failed/detached during online verify */
+	ABORTED_RESYNC,			/* a disk failed/detached during resync */
+	CONNECTION_LOST_NEGOTIATING,	/* connection lost while disk was D_NEGOTIATING */
+	IMPLICITLY_UPGRADED_DISK,	/* disk was raised to the minimum for the conn state */
+	IMPLICITLY_UPGRADED_PDSK,	/* pdsk was raised to the minimum for the conn state */
+};
+
+static int w_after_state_ch(struct drbd_work *w, int unused);
+static void after_state_ch(struct drbd_device *device, union drbd_state os,
+			   union drbd_state ns, enum chg_state_flags flags);
+static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state);
+static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *);
+static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns);
+static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
+				       union drbd_state ns, enum sanitize_state_warnings *warn);
+
+static inline bool is_susp(union drbd_state s)
+{
+	return s.susp_fen || s.susp_nod || s.susp;	/* any suspend bit set? */
+}
+
+/* True iff every volume of @connection is unconfigured, i.e. diskless,
+ * standalone and secondary; trivially true without any volume. */
+bool conn_all_vols_unconf(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *pd;
+	bool all_unconf = true;
+	int id;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, pd, id) {
+		const union drbd_dev_state *st = &pd->device->state;
+
+		all_unconf = st->disk == D_DISKLESS &&
+			     st->conn == C_STANDALONE && st->role == R_SECONDARY;
+		if (!all_unconf)
+			break;
+	}
+	rcu_read_unlock();
+	return all_unconf;
+}
+
+/* Unfortunately the role values were not correctly ordered when they
+ * were defined, therefore max_t() cannot be used here.
+ * Order applied: R_UNKNOWN < R_SECONDARY < R_PRIMARY. */
+static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2)
+{
+	bool any_primary = role1 == R_PRIMARY || role2 == R_PRIMARY;
+	bool any_secondary = role1 == R_SECONDARY || role2 == R_SECONDARY;
+
+	return any_primary ? R_PRIMARY : any_secondary ? R_SECONDARY : R_UNKNOWN;
+}
+/* Lower of two roles, with R_UNKNOWN < R_SECONDARY < R_PRIMARY. */
+static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2)
+{
+	bool any_unknown = role1 == R_UNKNOWN || role2 == R_UNKNOWN;
+	bool any_secondary = role1 == R_SECONDARY || role2 == R_SECONDARY;
+
+	return any_unknown ? R_UNKNOWN : any_secondary ? R_SECONDARY : R_PRIMARY;
+}
+
+/* Highest local role found among the volumes of @connection; stays
+ * R_UNKNOWN when the connection has no volumes. */
+enum drbd_role conn_highest_role(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *pd;
+	enum drbd_role highest = R_UNKNOWN;
+	int id;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, pd, id)
+		highest = max_role(highest, pd->device->state.role);
+	rcu_read_unlock();
+
+	return highest;
+}
+
+/* Highest peer role found among the volumes of @connection; stays
+ * R_UNKNOWN when the connection has no volumes. */
+enum drbd_role conn_highest_peer(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *pd;
+	enum drbd_role highest = R_UNKNOWN;
+	int id;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, pd, id)
+		highest = max_role(highest, pd->device->state.peer);
+	rcu_read_unlock();
+
+	return highest;
+}
+
+/* Highest local disk state among the volumes of @connection; stays
+ * D_DISKLESS when the connection has no volumes. */
+enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *pd;
+	enum drbd_disk_state highest = D_DISKLESS;
+	int id;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, pd, id)
+		highest = max_t(enum drbd_disk_state, highest,
+				pd->device->state.disk);
+	rcu_read_unlock();
+	return highest;
+}
+
+/* Lowest local disk state among the volumes of @connection; stays
+ * D_MASK when the connection has no volumes. */
+enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *pd;
+	enum drbd_disk_state lowest = D_MASK;
+	int id;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, pd, id)
+		lowest = min_t(enum drbd_disk_state, lowest,
+			       pd->device->state.disk);
+	rcu_read_unlock();
+	return lowest;
+}
+
+/* Highest peer disk state among the volumes of @connection; stays
+ * D_DISKLESS when the connection has no volumes. */
+enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *pd;
+	enum drbd_disk_state highest = D_DISKLESS;
+	int id;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, pd, id)
+		highest = max_t(enum drbd_disk_state, highest,
+				pd->device->state.pdsk);
+	rcu_read_unlock();
+	return highest;
+}
+
+/* Lowest connection state among the volumes of @connection; stays
+ * C_MASK when the connection has no volumes. */
+enum drbd_conns conn_lowest_conn(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *pd;
+	enum drbd_conns lowest = C_MASK;
+	int id;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, pd, id)
+		lowest = min_t(enum drbd_conns, lowest,
+			       pd->device->state.conn);
+	rcu_read_unlock();
+	return lowest;
+}
+
+/* True iff no volume of @connection is in C_WF_REPORT_PARAMS. */
+static bool no_peer_wf_report_params(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *pd;
+	bool none_waiting = true;
+	int id;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, pd, id) {
+		none_waiting = pd->device->state.conn != C_WF_REPORT_PARAMS;
+		if (!none_waiting)
+			break;
+	}
+	rcu_read_unlock();
+	return none_waiting;
+}
+
+
+/**
+ * cl_wide_st_chg() - non-zero if the state change is a cluster wide one
+ * @device:	DRBD device (not consulted by the checks below).
+ * @os:		old (current) state.
+ * @ns:		new (wanted) state.
+ */
+static int cl_wide_st_chg(struct drbd_device *device,
+			  union drbd_state os, union drbd_state ns)
+{
+	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
+		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
+		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
+		  (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
+		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
+		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S) ||
+		(os.conn == C_CONNECTED && ns.conn == C_WF_REPORT_PARAMS);
+}
+
+/* Return @os with the bits selected by @mask cleared and @val or-ed in. */
+static union drbd_state
+apply_mask_val(union drbd_state os, union drbd_state mask, union drbd_state val)
+{
+	os.i = (os.i & ~mask.i) | val.i;
+	return os;
+}
+
+/* Apply @mask/@val to the current state and set the result with flags @f,
+ * all under req_lock.  No completion is handed to _drbd_set_state(). */
+enum drbd_state_rv
+drbd_change_state(struct drbd_device *device, enum chg_state_flags f,
+		  union drbd_state mask, union drbd_state val)
+{
+	enum drbd_state_rv result;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&device->resource->req_lock, irq_flags);
+	result = _drbd_set_state(device,
+			apply_mask_val(drbd_read_state(device), mask, val), f, NULL);
+	spin_unlock_irqrestore(&device->resource->req_lock, irq_flags);
+	return result;
+}
+
+/**
+ * drbd_force_state() - Impose a change which happens outside our control on our state
+ * @device:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits (applied via drbd_change_state() with CS_HARD).
+ */
+void drbd_force_state(struct drbd_device *device,
+	union drbd_state mask, union drbd_state val)
+{
+	drbd_change_state(device, CS_HARD, mask, val);
+}
+
+static enum drbd_state_rv
+_req_st_cond(struct drbd_device *device, union drbd_state mask,
+	     union drbd_state val)
+{
+	union drbd_state os, ns;
+	unsigned long flags;
+	enum drbd_state_rv rv;
+	/* Used as the wait_event() condition in drbd_req_state(). */
+	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &device->flags))
+		return SS_CW_SUCCESS;
+
+	if (test_and_clear_bit(CL_ST_CHG_FAIL, &device->flags))
+		return SS_CW_FAILED_BY_PEER;
+
+	spin_lock_irqsave(&device->resource->req_lock, flags);
+	os = drbd_read_state(device);
+	ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
+	rv = is_valid_transition(os, ns);
+	if (rv >= SS_SUCCESS)
+		rv = SS_UNKNOWN_ERROR;  /* still valid: continue waiting; otherwise fail. */
+	/* The change is not (or no longer) a cluster wide one. */
+	if (!cl_wide_st_chg(device, os, ns))
+		rv = SS_CW_NO_NEED;
+	if (rv == SS_UNKNOWN_ERROR) {
+		rv = is_valid_state(device, ns);
+		if (rv >= SS_SUCCESS) {
+			rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+			if (rv >= SS_SUCCESS)
+				rv = SS_UNKNOWN_ERROR; /* still valid: continue waiting; otherwise fail. */
+		}
+	}
+	spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+	return rv;
+}
+
+/**
+ * drbd_req_state() - Perform a possibly cluster wide state change
+ * @device:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ * @f:		flags; CS_SERIALIZE, CS_VERBOSE and CS_WAIT_COMPLETE are evaluated here.
+ *
+ * Should not be called directly, use drbd_request_state() or
+ * _drbd_request_state().
+ */
+static enum drbd_state_rv
+drbd_req_state(struct drbd_device *device, union drbd_state mask,
+	       union drbd_state val, enum chg_state_flags f)
+{
+	struct completion done;
+	unsigned long flags;
+	union drbd_state os, ns;
+	enum drbd_state_rv rv;
+
+	init_completion(&done);
+
+	if (f & CS_SERIALIZE)
+		mutex_lock(device->state_mutex);
+
+	spin_lock_irqsave(&device->resource->req_lock, flags);
+	os = drbd_read_state(device);
+	ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
+	rv = is_valid_transition(os, ns);
+	if (rv < SS_SUCCESS) {
+		spin_unlock_irqrestore(&device->resource->req_lock, flags);
+		goto abort;
+	}
+	/* A cluster wide change is first sent to the peer as a request. */
+	if (cl_wide_st_chg(device, os, ns)) {
+		rv = is_valid_state(device, ns);
+		if (rv == SS_SUCCESS)
+			rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+		spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+		if (rv < SS_SUCCESS) {
+			if (f & CS_VERBOSE)
+				print_st_err(device, os, ns, rv);
+			goto abort;
+		}
+
+		if (drbd_send_state_req(first_peer_device(device), mask, val)) {
+			rv = SS_CW_FAILED_BY_PEER;
+			if (f & CS_VERBOSE)
+				print_st_err(device, os, ns, rv);
+			goto abort;
+		}
+		/* Sleep until _req_st_cond() reports an outcome. */
+		wait_event(device->state_wait,
+			(rv = _req_st_cond(device, mask, val)));
+
+		if (rv < SS_SUCCESS) {
+			if (f & CS_VERBOSE)
+				print_st_err(device, os, ns, rv);
+			goto abort;
+		}
+		spin_lock_irqsave(&device->resource->req_lock, flags);
+		ns = apply_mask_val(drbd_read_state(device), mask, val);
+		rv = _drbd_set_state(device, ns, f, &done);
+	} else {
+		rv = _drbd_set_state(device, ns, f, &done);
+	}
+	/* req_lock is held here on both paths. */
+	spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
+		D_ASSERT(device, current != first_peer_device(device)->connection->worker.task);
+		wait_for_completion(&done);
+	}
+
+abort:
+	if (f & CS_SERIALIZE)
+		mutex_unlock(device->state_mutex);
+
+	return rv;
+}
+
+/**
+ * _drbd_request_state() - Request a state change (with flags)
+ * @device:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ * @f:		flags
+ *
+ * Cousin of drbd_request_state() for callers that pick their own flags (e.g.
+ * CS_WAIT_COMPLETE, or no logging); retried while SS_IN_TRANSIENT_STATE.
+ */
+enum drbd_state_rv
+_drbd_request_state(struct drbd_device *device, union drbd_state mask,
+		    union drbd_state val, enum chg_state_flags f)
+{
+	enum drbd_state_rv rv;
+
+	wait_event(device->state_wait,
+		   (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE);
+
+	return rv;
+}
+
+static void print_st(struct drbd_device *device, char *name, union drbd_state ns)
+{
+	drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n",
+	    name,
+	    drbd_conn_str(ns.conn),
+	    drbd_role_str(ns.role),
+	    drbd_role_str(ns.peer),
+	    drbd_disk_str(ns.disk),
+	    drbd_disk_str(ns.pdsk),
+	    is_susp(ns) ? 's' : 'r',
+	    ns.aftr_isp ? 'a' : '-',
+	    ns.peer_isp ? 'p' : '-',
+	    ns.user_isp ? 'u' : '-',
+	    ns.susp_fen ? 'F' : '-',
+	    ns.susp_nod ? 'N' : '-'
+	    );
+}
+
+void print_st_err(struct drbd_device *device, union drbd_state os,
+		  union drbd_state ns, enum drbd_state_rv err)
+{
+	if (err != SS_IN_TRANSIENT_STATE) {	/* transient refusals stay silent */
+		drbd_err(device, "State change failed: %s\n", drbd_set_st_err_str(err));
+		print_st(device, " state", os);
+		print_st(device, "wanted", ns);
+	}
+}
+
+/* Format the differences between @os and @ns into @pb, one
+ * "field( old -> new ) " item per changed field whose CS_DC_* bit is set
+ * in @flags.  The sprintf() calls are unbounded, so @pb has to be large
+ * enough for all five items.
+ * Returns the number of characters written, not counting the final NUL. */
+static long print_state_change(char *pb, union drbd_state os, union drbd_state ns,
+			       enum chg_state_flags flags)
+{
+	char *out = pb;
+
+	*out = 0;
+
+	if (flags & CS_DC_ROLE && ns.role != os.role)
+		out += sprintf(out, "role( %s -> %s ) ",
+			       drbd_role_str(os.role), drbd_role_str(ns.role));
+	if (flags & CS_DC_PEER && ns.peer != os.peer)
+		out += sprintf(out, "peer( %s -> %s ) ",
+			       drbd_role_str(os.peer), drbd_role_str(ns.peer));
+	if (flags & CS_DC_CONN && ns.conn != os.conn)
+		out += sprintf(out, "conn( %s -> %s ) ",
+			       drbd_conn_str(os.conn), drbd_conn_str(ns.conn));
+	if (flags & CS_DC_DISK && ns.disk != os.disk)
+		out += sprintf(out, "disk( %s -> %s ) ",
+			       drbd_disk_str(os.disk), drbd_disk_str(ns.disk));
+	if (flags & CS_DC_PDSK && ns.pdsk != os.pdsk)
+		out += sprintf(out, "pdsk( %s -> %s ) ",
+			       drbd_disk_str(os.pdsk), drbd_disk_str(ns.pdsk));
+
+	return out - pb;
+}
+
+/* Log, at info level, what changed for @device between @os and @ns.
+ * The CS_DC_MASK bits of @flags are flipped before print_state_change()
+ * sees them, so a CS_DC_* bit set by the caller (and covered by that mask)
+ * suppresses the field here.  Changes of the *_isp bits are always reported. */
+static void drbd_pr_state_change(struct drbd_device *device, union drbd_state os, union drbd_state ns,
+				 enum chg_state_flags flags)
+{
+	char pb[300];
+	char *end = pb + print_state_change(pb, os, ns, flags ^ CS_DC_MASK);
+
+	if (ns.aftr_isp != os.aftr_isp)
+		end += sprintf(end, "aftr_isp( %d -> %d ) ",
+			       os.aftr_isp, ns.aftr_isp);
+	if (ns.peer_isp != os.peer_isp)
+		end += sprintf(end, "peer_isp( %d -> %d ) ",
+			       os.peer_isp, ns.peer_isp);
+	if (ns.user_isp != os.user_isp)
+		end += sprintf(end, "user_isp( %d -> %d ) ",
+			       os.user_isp, ns.user_isp);
+
+	if (end == pb)
+		return;		/* nothing to report */
+	drbd_info(device, "%s\n", pb);
+}
+
+/* Log, at info level on @connection, the fields of @os -> @ns selected by
+ * the CS_DC_* bits of @flags; with CS_DC_SUSP also a change of is_susp(). */
+static void conn_pr_state_change(struct drbd_connection *connection, union drbd_state os, union drbd_state ns,
+				 enum chg_state_flags flags)
+{
+	char pb[300];
+	char *end = pb + print_state_change(pb, os, ns, flags);
+
+	if (flags & CS_DC_SUSP && is_susp(ns) != is_susp(os))
+		end += sprintf(end, "susp( %d -> %d ) ",
+			       is_susp(os), is_susp(ns));
+
+	if (end == pb)
+		return;
+	drbd_info(connection, "%s\n", pb);
+}
+
+
+/**
+ * is_valid_state() - Returns SS_SUCCESS, or an SS_ error code if ns is not valid
+ * @device:	DRBD device.
+ * @ns:		State to consider.
+ */
+static enum drbd_state_rv
+is_valid_state(struct drbd_device *device, union drbd_state ns)
+{
+	/* See drbd_state_sw_errors in drbd_strings.c */
+
+	enum drbd_fencing_p fp;
+	enum drbd_state_rv rv = SS_SUCCESS;
+	struct net_conf *nc;
+
+	rcu_read_lock();
+	fp = FP_DONT_CARE;
+	if (get_ldev(device)) {
+		fp = rcu_dereference(device->ldev->disk_conf)->fencing;
+		put_ldev(device);
+	}
+
+	nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+	if (nc) {
+		if (!nc->two_primaries && ns.role == R_PRIMARY) {
+			if (ns.peer == R_PRIMARY)
+				rv = SS_TWO_PRIMARIES;
+			else if (conn_highest_peer(first_peer_device(device)->connection) == R_PRIMARY)
+				rv = SS_O_VOL_PEER_PRI;
+		}
+	}
+
+	if (rv <= 0)
+		/* already found a reason to abort */;
+	else if (ns.role == R_SECONDARY && device->open_cnt)
+		rv = SS_DEVICE_IN_USE;
+
+	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
+		rv = SS_NO_UP_TO_DATE_DISK;
+
+	else if (fp >= FP_RESOURCE &&
+		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
+		rv = SS_PRIMARY_NOP;
+
+	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
+		rv = SS_NO_UP_TO_DATE_DISK;
+
+	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
+		rv = SS_NO_LOCAL_DISK;
+
+	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
+		rv = SS_NO_REMOTE_DISK;
+
+	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
+		rv = SS_NO_UP_TO_DATE_DISK;
+
+	else if ((ns.conn == C_CONNECTED ||
+		  ns.conn == C_WF_BITMAP_S ||
+		  ns.conn == C_SYNC_SOURCE ||
+		  ns.conn == C_PAUSED_SYNC_S) &&
+		  ns.disk == D_OUTDATED)
+		rv = SS_CONNECTED_OUTDATES;
+	/* nc may be NULL (see above); no net_conf means no verify algorithm */
+	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+		 (!nc || nc->verify_alg[0] == 0))
+		rv = SS_NO_VERIFY_ALG;
+
+	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+		  first_peer_device(device)->connection->agreed_pro_version < 88)
+		rv = SS_NOT_SUPPORTED;
+
+	else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
+		rv = SS_NO_UP_TO_DATE_DISK;
+
+	else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
+		 ns.pdsk == D_UNKNOWN)
+		rv = SS_NEED_CONNECTION;
+
+	else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
+		rv = SS_CONNECTED_OUTDATES;
+
+	rcu_read_unlock();
+
+	return rv;
+}
+
+/**
+ * is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible
+ * This function limits state transitions that may be declined by DRBD, i.e.
+ * user requests (aka soft transitions).  If several checks fail, the last one wins.
+ * @os:		old state.
+ * @ns:		new state.
+ * @connection:	DRBD connection; only its STATE_SENT flag is tested.
+ */
+static enum drbd_state_rv
+is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_connection *connection)
+{
+	enum drbd_state_rv rv = SS_SUCCESS;
+
+	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
+	    os.conn > C_CONNECTED)
+		rv = SS_RESYNC_RUNNING;
+
+	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
+		rv = SS_ALREADY_STANDALONE;
+
+	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
+		rv = SS_IS_DISKLESS;
+
+	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
+		rv = SS_NO_NET_CONFIG;
+
+	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
+		rv = SS_LOWER_THAN_OUTDATED;
+
+	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
+		rv = SS_IN_TRANSIENT_STATE;
+
+	/* if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
+	   rv = SS_IN_TRANSIENT_STATE; */
+
+	/* While establishing a connection only allow cstate to change.
+	   Delay/refuse role changes, detach attach etc... */
+	if (test_bit(STATE_SENT, &connection->flags) &&
+	    !(os.conn == C_WF_REPORT_PARAMS ||
+	      (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
+		rv = SS_IN_TRANSIENT_STATE;
+
+	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
+		rv = SS_NEED_CONNECTION;
+
+	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+	    ns.conn != os.conn && os.conn > C_CONNECTED)
+		rv = SS_RESYNC_RUNNING;
+
+	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
+	    os.conn < C_CONNECTED)
+		rv = SS_NEED_CONNECTION;
+
+	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
+	    && os.conn < C_WF_REPORT_PARAMS)
+		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
+
+	if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED &&
+	    os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)
+		rv = SS_OUTDATE_WO_CONN;
+
+	return rv;
+}
+
+/* Judge the connection state change @oc -> @nc on its own: SS_SUCCESS,
+ * SS_NOTHING_TO_DO, or the SS_ error code of the first rule it violates. */
+static enum drbd_state_rv
+is_valid_conn_transition(enum drbd_conns oc, enum drbd_conns nc)
+{
+	/* no change -> nothing to do, at least for the connection part */
+	if (oc == nc)
+		return SS_NOTHING_TO_DO;
+
+	if (oc == C_STANDALONE) {
+		/* disconnect of an unconfigured connection does not make sense */
+		if (nc == C_DISCONNECTING)
+			return SS_ALREADY_STANDALONE;
+		/* from C_STANDALONE, we start with C_UNCONNECTED */
+		if (nc != C_UNCONNECTED)
+			return SS_NEED_CONNECTION;
+	}
+
+	/* When establishing a connection we need to go through WF_REPORT_PARAMS!
+	   Necessary to do the right thing upon invalidate-remote on a disconnected resource */
+	if (oc < C_WF_REPORT_PARAMS && nc >= C_CONNECTED)
+		return SS_NEED_CONNECTION;
+
+	/* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow.
+	 * After C_DISCONNECTING only C_STANDALONE may follow. */
+	if ((oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING) ||
+	    (oc == C_DISCONNECTING && nc != C_STANDALONE))
+		return SS_IN_TRANSIENT_STATE;
+	return SS_SUCCESS;
+}
+
+
+/**
+ * is_valid_transition() - Returns an SS_ error code if the state transition is not possible
+ * This limits hard state transitions. Hard state transitions are facts that are
+ * imposed on DRBD by the environment. E.g. disk broke or network broke down.
+ * But those hard state transitions are still not allowed to do everything.
+ * @os:		old state.
+ * @ns:		new state.
+ *
+ * Going to D_FAILED while already D_DISKLESS yields SS_IS_DISKLESS; anything
+ * else is judged by is_valid_conn_transition() alone.
+ */
+static enum drbd_state_rv
+is_valid_transition(union drbd_state os, union drbd_state ns)
+{
+	/* we cannot fail (again) if we already detached; this check takes
+	 * precedence over the verdict on the connection part */
+	if (os.disk == D_DISKLESS && ns.disk == D_FAILED)
+		return SS_IS_DISKLESS;
+
+	return is_valid_conn_transition(os.conn, ns.conn);
+}
+
+/* Log, at warning level, the message that belongs to @warn; NO_WARNING is silent. */
+static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_state_warnings warn)
+{
+	/* pointers and strings are both read-only: "const char * const" */
+	static const char * const msg_table[] = {
+		[NO_WARNING] = "",
+		[ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
+		[ABORTED_RESYNC] = "Resync aborted.",
+		[CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
+		[IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
+		[IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
+	};
+	if (warn != NO_WARNING)
+		drbd_warn(device, "%s\n", msg_table[warn]);
+}
+
+/**
+ * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
+ * @device:	DRBD device.
+ * @os:		old state.
+ * @ns:		new state.
+ * @warn:	optional (may be NULL); receives the warning to print, the last applicable one wins.
+ *
+ * When we lose connection, we have to set the state of the peers disk (pdsk)
+ * to D_UNKNOWN. This rule and many more along those lines are in this function.
+ */
+static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
+				       union drbd_state ns, enum sanitize_state_warnings *warn)
+{
+	enum drbd_fencing_p fp;
+	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
+
+	if (warn)
+		*warn = NO_WARNING;
+
+	fp = FP_DONT_CARE;
+	if (get_ldev(device)) {
+		rcu_read_lock();
+		fp = rcu_dereference(device->ldev->disk_conf)->fencing;
+		rcu_read_unlock();
+		put_ldev(device);
+	}
+
+	/* Implications from connection to peer and peer_isp */
+	if (ns.conn < C_CONNECTED) {
+		ns.peer_isp = 0;
+		ns.peer = R_UNKNOWN;
+		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
+			ns.pdsk = D_UNKNOWN;
+	}
+
+	/* Clear the aftr_isp when becoming unconfigured */
+	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
+		ns.aftr_isp = 0;
+
+	/* An implication of the disk states onto the connection state */
+	/* Abort resync if a disk fails/detaches */
+	if (ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
+		if (warn)
+			*warn = ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T ?
+				ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
+		ns.conn = C_CONNECTED;
+	}
+
+	/* Connection breaks down before we finished "Negotiating" */
+	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
+	    get_ldev_if_state(device, D_NEGOTIATING)) {
+		if (device->ed_uuid == device->ldev->md.uuid[UI_CURRENT]) {
+			ns.disk = device->new_state_tmp.disk;
+			ns.pdsk = device->new_state_tmp.pdsk;
+		} else {
+			if (warn)
+				*warn = CONNECTION_LOST_NEGOTIATING;
+			ns.disk = D_DISKLESS;
+			ns.pdsk = D_UNKNOWN;
+		}
+		put_ldev(device);
+	}
+
+	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
+	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
+		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
+			ns.disk = D_UP_TO_DATE;
+		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
+			ns.pdsk = D_UP_TO_DATE;
+	}
+
+	/* Implications of the connection state on the disk states: allowed min/max of disk and pdsk */
+	disk_min = D_DISKLESS;
+	disk_max = D_UP_TO_DATE;
+	pdsk_min = D_INCONSISTENT;
+	pdsk_max = D_UNKNOWN;
+	switch ((enum drbd_conns)ns.conn) {
+	case C_WF_BITMAP_T:
+	case C_PAUSED_SYNC_T:
+	case C_STARTING_SYNC_T:
+	case C_WF_SYNC_UUID:
+	case C_BEHIND:
+		disk_min = D_INCONSISTENT;
+		disk_max = D_OUTDATED;
+		pdsk_min = D_UP_TO_DATE;
+		pdsk_max = D_UP_TO_DATE;
+		break;
+	case C_VERIFY_S:
+	case C_VERIFY_T:
+		disk_min = D_UP_TO_DATE;
+		disk_max = D_UP_TO_DATE;
+		pdsk_min = D_UP_TO_DATE;
+		pdsk_max = D_UP_TO_DATE;
+		break;
+	case C_CONNECTED:
+		disk_min = D_DISKLESS;
+		disk_max = D_UP_TO_DATE;
+		pdsk_min = D_DISKLESS;
+		pdsk_max = D_UP_TO_DATE;
+		break;
+	case C_WF_BITMAP_S:
+	case C_PAUSED_SYNC_S:
+	case C_STARTING_SYNC_S:
+	case C_AHEAD:
+		disk_min = D_UP_TO_DATE;
+		disk_max = D_UP_TO_DATE;
+		pdsk_min = D_INCONSISTENT;
+		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
+		break;
+	case C_SYNC_TARGET:
+		disk_min = D_INCONSISTENT;
+		disk_max = D_INCONSISTENT;
+		pdsk_min = D_UP_TO_DATE;
+		pdsk_max = D_UP_TO_DATE;
+		break;
+	case C_SYNC_SOURCE:
+		disk_min = D_UP_TO_DATE;
+		disk_max = D_UP_TO_DATE;
+		pdsk_min = D_INCONSISTENT;
+		pdsk_max = D_INCONSISTENT;
+		break;
+	case C_STANDALONE:
+	case C_DISCONNECTING:
+	case C_UNCONNECTED:
+	case C_TIMEOUT:
+	case C_BROKEN_PIPE:
+	case C_NETWORK_FAILURE:
+	case C_PROTOCOL_ERROR:
+	case C_TEAR_DOWN:
+	case C_WF_CONNECTION:
+	case C_WF_REPORT_PARAMS:
+	case C_MASK:
+		break;
+	}
+	if (ns.disk > disk_max)
+		ns.disk = disk_max;
+
+	if (ns.disk < disk_min) {
+		if (warn)
+			*warn = IMPLICITLY_UPGRADED_DISK;
+		ns.disk = disk_min;
+	}
+	if (ns.pdsk > pdsk_max)
+		ns.pdsk = pdsk_max;
+
+	if (ns.pdsk < pdsk_min) {
+		if (warn)
+			*warn = IMPLICITLY_UPGRADED_PDSK;
+		ns.pdsk = pdsk_min;
+	}
+
+	if (fp == FP_STONITH &&
+	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
+	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
+		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
+
+	if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO &&
+	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
+	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
+		ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
+
+	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
+		if (ns.conn == C_SYNC_SOURCE)
+			ns.conn = C_PAUSED_SYNC_S;
+		if (ns.conn == C_SYNC_TARGET)
+			ns.conn = C_PAUSED_SYNC_T;
+	} else {
+		if (ns.conn == C_PAUSED_SYNC_S)
+			ns.conn = C_SYNC_SOURCE;
+		if (ns.conn == C_PAUSED_SYNC_T)
+			ns.conn = C_SYNC_TARGET;
+	}
+
+	return ns;
+}
+
+void drbd_resume_al(struct drbd_device *device)
+{
+	if (test_and_clear_bit(AL_SUSPENDED, &device->flags))	/* log only if the bit was set */
+		drbd_info(device, "Resumed AL updates\n");
+}
+
+/* helper for __drbd_set_state: sets ov_start_sector, ov_position, rs_total and ov_left for entering @cs */
+static void set_ov_position(struct drbd_device *device, enum drbd_conns cs)
+{
+	if (first_peer_device(device)->connection->agreed_pro_version < 90)
+		device->ov_start_sector = 0;	/* below protocol 90: always start at sector 0 */
+	device->rs_total = drbd_bm_bits(device);
+	device->ov_position = 0;
+	if (cs == C_VERIFY_T) {
+		/* starting online verify from an arbitrary position
+		 * does not fit well into the existing protocol.
+		 * on C_VERIFY_T, we initialize ov_left and friends
+		 * implicitly in receive_DataRequest once the
+		 * first P_OV_REQUEST is received */
+		device->ov_start_sector = ~(sector_t)0;
+	} else {
+		unsigned long bit = BM_SECT_TO_BIT(device->ov_start_sector);
+		if (bit >= device->rs_total) {
+			device->ov_start_sector =
+				BM_BIT_TO_SECT(device->rs_total - 1);
+			device->rs_total = 1;	/* start was beyond the bitmap: clamp to its last bit */
+		} else
+			device->rs_total -= bit;	/* only the bits from the start position on */
+		device->ov_position = device->ov_start_sector;
+	}
+	device->ov_left = device->rs_total;
+}
+
+/**
+ * __drbd_set_state() - Set a new DRBD state
+ * @device:	DRBD device.
+ * @ns:		new state.
+ * @flags:	Flags
+ * @done:	Optional completion, that will get completed after the after_state_ch() finished
+ *
+ * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
+ */
+enum drbd_state_rv
+__drbd_set_state(struct drbd_device *device, union drbd_state ns,
+	         enum chg_state_flags flags, struct completion *done)
+{
+	union drbd_state os;
+	enum drbd_state_rv rv = SS_SUCCESS;
+	enum sanitize_state_warnings ssw;
+	struct after_state_chg_work *ascw;
+	bool did_remote, should_do_remote;
+
+	os = drbd_read_state(device);
+
+	ns = sanitize_state(device, os, ns, &ssw);
+	if (ns.i == os.i)
+		return SS_NOTHING_TO_DO;
+
+	rv = is_valid_transition(os, ns);
+	if (rv < SS_SUCCESS)
+		return rv;
+
+	if (!(flags & CS_HARD)) {
+		/*  pre-state-change checks ; only look at ns  */
+		/* See drbd_state_sw_errors in drbd_strings.c */
+
+		rv = is_valid_state(device, ns);
+		if (rv < SS_SUCCESS) {
+			/* If the old state was illegal as well, then let
+			   this happen...*/
+
+			if (is_valid_state(device, os) == rv)
+				rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+		} else
+			rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+	}
+
+	if (rv < SS_SUCCESS) {
+		if (flags & CS_VERBOSE)
+			print_st_err(device, os, ns, rv);
+		return rv;
+	}
+
+	print_sanitize_warnings(device, ssw);
+
+	drbd_pr_state_change(device, os, ns, flags);
+
+	/* Display changes to the susp* flags that where caused by the call to
+	   sanitize_state(). Only display it here if we where not called from
+	   _conn_request_state() */
+	if (!(flags & CS_DC_SUSP))
+		conn_pr_state_change(first_peer_device(device)->connection, os, ns,
+				     (flags & ~CS_DC_MASK) | CS_DC_SUSP);
+
+	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
+	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
+	 * drbd_ldev_destroy() won't happen before our corresponding
+	 * after_state_ch works run, where we put_ldev again. */
+	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
+	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
+		atomic_inc(&device->local_cnt);
+
+	did_remote = drbd_should_do_remote(device->state);
+	device->state.i = ns.i;
+	should_do_remote = drbd_should_do_remote(device->state);
+	device->resource->susp = ns.susp;
+	device->resource->susp_nod = ns.susp_nod;
+	device->resource->susp_fen = ns.susp_fen;
+
+	/* put replicated vs not-replicated requests in seperate epochs */
+	if (did_remote != should_do_remote)
+		start_new_tl_epoch(first_peer_device(device)->connection);
+
+	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
+		drbd_print_uuids(device, "attached to UUIDs");
+
+	/* Wake up role changes, that were delayed because of connection establishing */
+	if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS &&
+	    no_peer_wf_report_params(first_peer_device(device)->connection))
+		clear_bit(STATE_SENT, &first_peer_device(device)->connection->flags);
+
+	wake_up(&device->misc_wait);
+	wake_up(&device->state_wait);
+	wake_up(&first_peer_device(device)->connection->ping_wait);
+
+	/* Aborted verify run, or we reached the stop sector.
+	 * Log the last position, unless end-of-device. */
+	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
+	    ns.conn <= C_CONNECTED) {
+		device->ov_start_sector =
+			BM_BIT_TO_SECT(drbd_bm_bits(device) - device->ov_left);
+		if (device->ov_left)
+			drbd_info(device, "Online Verify reached sector %llu\n",
+				(unsigned long long)device->ov_start_sector);
+	}
+
+	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
+	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
+		drbd_info(device, "Syncer continues.\n");
+		device->rs_paused += (long)jiffies
+				  -(long)device->rs_mark_time[device->rs_last_mark];
+		if (ns.conn == C_SYNC_TARGET)
+			mod_timer(&device->resync_timer, jiffies);
+	}
+
+	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
+	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
+		drbd_info(device, "Resync suspended\n");
+		device->rs_mark_time[device->rs_last_mark] = jiffies;
+	}
+
+	if (os.conn == C_CONNECTED &&
+	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
+		unsigned long now = jiffies;
+		int i;
+
+		set_ov_position(device, ns.conn);
+		device->rs_start = now;
+		device->rs_last_events = 0;
+		device->rs_last_sect_ev = 0;
+		device->ov_last_oos_size = 0;
+		device->ov_last_oos_start = 0;
+
+		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+			device->rs_mark_left[i] = device->ov_left;
+			device->rs_mark_time[i] = now;
+		}
+
+		drbd_rs_controller_reset(device);
+
+		if (ns.conn == C_VERIFY_S) {
+			drbd_info(device, "Starting Online Verify from sector %llu\n",
+					(unsigned long long)device->ov_position);
+			mod_timer(&device->resync_timer, jiffies);
+		}
+	}
+
+	if (get_ldev(device)) {
+		u32 mdf = device->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
+						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
+						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
+
+		mdf &= ~MDF_AL_CLEAN;
+		if (test_bit(CRASHED_PRIMARY, &device->flags))
+			mdf |= MDF_CRASHED_PRIMARY;
+		if (device->state.role == R_PRIMARY ||
+		    (device->state.pdsk < D_INCONSISTENT && device->state.peer == R_PRIMARY))
+			mdf |= MDF_PRIMARY_IND;
+		if (device->state.conn > C_WF_REPORT_PARAMS)
+			mdf |= MDF_CONNECTED_IND;
+		if (device->state.disk > D_INCONSISTENT)
+			mdf |= MDF_CONSISTENT;
+		if (device->state.disk > D_OUTDATED)
+			mdf |= MDF_WAS_UP_TO_DATE;
+		if (device->state.pdsk <= D_OUTDATED && device->state.pdsk >= D_INCONSISTENT)
+			mdf |= MDF_PEER_OUT_DATED;
+		if (mdf != device->ldev->md.flags) {
+			device->ldev->md.flags = mdf;
+			drbd_md_mark_dirty(device);
+		}
+		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
+			drbd_set_ed_uuid(device, device->ldev->md.uuid[UI_CURRENT]);
+		put_ldev(device);
+	}
+
+	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
+	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
+	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
+		set_bit(CONSIDER_RESYNC, &device->flags);
+
+	/* Receiver should clean up itself */
+	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
+		drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver);
+
+	/* Now the receiver finished cleaning up itself, it should die */
+	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
+		drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver);
+
+	/* Upon network failure, we need to restart the receiver. */
+	if (os.conn > C_WF_CONNECTION &&
+	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
+		drbd_thread_restart_nowait(&first_peer_device(device)->connection->receiver);
+
+	/* Resume AL writing if we get a connection */
+	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
+		drbd_resume_al(device);
+		first_peer_device(device)->connection->connect_cnt++;
+	}
+
+	/* remember last attach time so request_timer_fn() won't
+	 * kill newly established sessions while we are still trying to thaw
+	 * previously frozen IO */
+	if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+	    ns.disk > D_NEGOTIATING)
+		device->last_reattach_jif = jiffies;
+
+	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
+	if (ascw) {
+		ascw->os = os;
+		ascw->ns = ns;
+		ascw->flags = flags;
+		ascw->w.cb = w_after_state_ch;
+		ascw->device = device;
+		ascw->done = done;
+		drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+				&ascw->w);
+	} else {
+		drbd_err(device, "Could not kmalloc an ascw\n");
+	}
+
+	return rv;
+}
+
+/* Worker callback: run after_state_ch() for one queued device state change. */
+static int w_after_state_ch(struct drbd_work *w, int unused)
+{
+	struct after_state_chg_work *work;
+
+	work = container_of(w, struct after_state_chg_work, w);
+	after_state_ch(work->device, work->os, work->ns, work->flags);
+	/* wake the requester that asked to wait for completion */
+	if (work->flags & CS_WAIT_COMPLETE)
+		complete(work->done);
+	kfree(work);
+	return 0;
+}
+
+/* Bitmap-IO completion for the StartingSync{S,T} states; rv != 0 is failure. */
+static void abw_start_sync(struct drbd_device *device, int rv)
+{
+	enum drbd_conns cstate;
+
+	if (rv) {
+		drbd_err(device, "Writing the bitmap failed not starting resync.\n");
+		_drbd_request_state(device, NS(conn, C_CONNECTED), CS_VERBOSE);
+		return;
+	}
+
+	cstate = device->state.conn;
+	if (cstate == C_STARTING_SYNC_T)
+		_drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+	else if (cstate == C_STARTING_SYNC_S)
+		drbd_start_resync(device, C_SYNC_SOURCE);
+}
+
+/* Run io_fn under the bitmap lock with IO suspended; worker context only. */
+int drbd_bitmap_io_from_worker(struct drbd_device *device,
+			       int (*io_fn)(struct drbd_device *),
+			       char *why, enum bm_flag flags)
+{
+	int err;
+
+	D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
+
+	/* open-coded, non-blocking variant of drbd_suspend_io(device) */
+	set_bit(SUSPEND_IO, &device->flags);
+
+	drbd_bm_lock(device, why, flags);
+	err = io_fn(device);
+	drbd_bm_unlock(device);
+	drbd_resume_io(device);
+
+	return err;
+}
+
+/**
+ * after_state_ch() - Perform after state change actions that may sleep
+ * @device:	DRBD device.
+ * @os:		old state.
+ * @ns:		new state.
+ * @flags:	flags of the state change (enum chg_state_flags).
+ */
+static void after_state_ch(struct drbd_device *device, union drbd_state os,
+			   union drbd_state ns, enum chg_state_flags flags)
+{
+	struct drbd_resource *resource = device->resource;
+	struct sib_info sib;
+
+	sib.sib_reason = SIB_STATE_CHANGE;
+	sib.os = os;
+	sib.ns = ns;
+
+	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
+		clear_bit(CRASHED_PRIMARY, &device->flags);
+		if (device->p_uuid)
+			device->p_uuid[UI_FLAGS] &= ~((u64)2);
+	}
+
+	/* Inform userspace about the change... */
+	drbd_bcast_event(device, &sib);
+	/* Newly Primary with neither local nor peer disk UpToDate: run helper */
+	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
+	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
+		drbd_khelper(device, "pri-on-incon-degr");
+
+	/* Here we have the actions that are performed after a
+	   state change. This function might sleep */
+
+	if (ns.susp_nod) {
+		struct drbd_connection *connection = first_peer_device(device)->connection;
+		enum drbd_req_event what = NOTHING;
+
+		spin_lock_irq(&device->resource->req_lock);
+		if (os.conn < C_CONNECTED && conn_lowest_conn(connection) >= C_CONNECTED)
+			what = RESEND;
+
+		if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+		    conn_lowest_disk(connection) > D_NEGOTIATING)
+			what = RESTART_FROZEN_DISK_IO;
+
+		if (resource->susp_nod && what != NOTHING) {
+			_tl_restart(connection, what);
+			_conn_request_state(connection,
+					    (union drbd_state) { { .susp_nod = 1 } },
+					    (union drbd_state) { { .susp_nod = 0 } },
+					    CS_VERBOSE);
+		}
+		spin_unlock_irq(&device->resource->req_lock);
+	}
+
+	if (ns.susp_fen) {
+		struct drbd_connection *connection = first_peer_device(device)->connection;
+
+		spin_lock_irq(&device->resource->req_lock);
+		if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) {
+			/* case2: The connection was established again: */
+			struct drbd_peer_device *peer_device;
+			int vnr;
+
+			rcu_read_lock();
+			idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+				clear_bit(NEW_CUR_UUID, &peer_device->device->flags);
+			rcu_read_unlock();
+			_tl_restart(connection, RESEND);
+			_conn_request_state(connection,
+					    (union drbd_state) { { .susp_fen = 1 } },
+					    (union drbd_state) { { .susp_fen = 0 } },
+					    CS_VERBOSE);
+		}
+		spin_unlock_irq(&device->resource->req_lock);
+	}
+
+	/* Became sync source.  With protocol >= 96, we still need to send out
+	 * the sync uuid now. Need to do that before any drbd_send_state, or
+	 * the other side may go "paused sync" before receiving the sync uuids,
+	 * which is unexpected. */
+	if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
+	    (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
+	    first_peer_device(device)->connection->agreed_pro_version >= 96 && get_ldev(device)) {
+		drbd_gen_and_send_sync_uuid(first_peer_device(device));
+		put_ldev(device);
+	}
+
+	/* Do not change the order of the if above and the two below... */
+	if (os.pdsk == D_DISKLESS &&
+	    ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) {      /* attach on the peer */
+		/* we probably will start a resync soon.
+		 * make sure those things are properly reset. */
+		device->rs_total = 0;
+		device->rs_failed = 0;
+		atomic_set(&device->rs_pending_cnt, 0);
+		drbd_rs_cancel_all(device);
+
+		drbd_send_uuids(first_peer_device(device));
+		drbd_send_state(first_peer_device(device), ns);
+	}
+	/* No point in queuing send_bitmap if we don't have a connection
+	 * anymore, so check also the _current_ state, not only the new state
+	 * at the time this work was queued. */
+	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
+	    device->state.conn == C_WF_BITMAP_S)
+		drbd_queue_bitmap_io(device, &drbd_send_bitmap, NULL,
+				"send_bitmap (WFBitMapS)",
+				BM_LOCKED_TEST_ALLOWED);
+
+	/* Lost contact to peer's copy of the data */
+	if ((os.pdsk >= D_INCONSISTENT &&
+	     os.pdsk != D_UNKNOWN &&
+	     os.pdsk != D_OUTDATED)
+	&&  (ns.pdsk < D_INCONSISTENT ||
+	     ns.pdsk == D_UNKNOWN ||
+	     ns.pdsk == D_OUTDATED)) {
+		if (get_ldev(device)) {
+			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
+			    device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+				if (drbd_suspended(device)) {
+					set_bit(NEW_CUR_UUID, &device->flags);
+				} else {
+					drbd_uuid_new_current(device);
+					drbd_send_uuids(first_peer_device(device));
+				}
+			}
+			put_ldev(device);
+		}
+	}
+
+	if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) {
+		if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
+		    device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+			drbd_uuid_new_current(device);
+			drbd_send_uuids(first_peer_device(device));
+		}
+		/* D_DISKLESS Peer becomes secondary */
+		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
+			/* We may still be Primary ourselves.
+			 * No harm done if the bitmap still changes,
+			 * redirtied pages will follow later. */
+			drbd_bitmap_io_from_worker(device, &drbd_bm_write,
+				"demote diskless peer", BM_LOCKED_SET_ALLOWED);
+		put_ldev(device);
+	}
+
+	/* Write out all changed bits on demote.
+	 * Though, no need to do that just yet
+	 * if there is a resync going on still */
+	if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
+		device->state.conn <= C_CONNECTED && get_ldev(device)) {
+		/* No changes to the bitmap expected this time, so assert that,
+		 * even though no harm was done if it did change. */
+		drbd_bitmap_io_from_worker(device, &drbd_bm_write,
+				"demote", BM_LOCKED_TEST_ALLOWED);
+		put_ldev(device);
+	}
+
+	/* Last part of the attaching process ... */
+	if (ns.conn >= C_CONNECTED &&
+	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
+		drbd_send_sizes(first_peer_device(device), 0, 0);  /* to start sync... */
+		drbd_send_uuids(first_peer_device(device));
+		drbd_send_state(first_peer_device(device), ns);
+	}
+
+	/* We want to pause/continue resync, tell peer. */
+	if (ns.conn >= C_CONNECTED &&
+	     ((os.aftr_isp != ns.aftr_isp) ||
+	      (os.user_isp != ns.user_isp)))
+		drbd_send_state(first_peer_device(device), ns);
+
+	/* In case one of the isp bits got set, suspend other devices. */
+	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
+	    (ns.aftr_isp || ns.peer_isp || ns.user_isp))
+		suspend_other_sg(device);
+
+	/* Make sure the peer gets informed about eventual state
+	   changes (ISP bits) while we were in WFReportParams. */
+	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
+		drbd_send_state(first_peer_device(device), ns);
+
+	if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
+		drbd_send_state(first_peer_device(device), ns);
+
+	/* We are in the process of starting a full sync... */
+	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+	    (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
+		/* no other bitmap changes expected during this phase */
+		drbd_queue_bitmap_io(device,
+			&drbd_bmio_set_n_write, &abw_start_sync,
+			"set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
+
+	/* first half of local IO error, failure to attach,
+	 * or administrative detach */
+	if (os.disk != D_FAILED && ns.disk == D_FAILED) {
+		enum drbd_io_error_p eh = EP_PASS_ON;
+		int was_io_error = 0;
+		/* corresponding get_ldev was in __drbd_set_state, to serialize
+		 * our cleanup here with the transition to D_DISKLESS.
+		 * But it is still not safe to dereference ldev here, since
+		 * we might come from a failed Attach before ldev was set. */
+		if (device->ldev) {
+			rcu_read_lock();
+			eh = rcu_dereference(device->ldev->disk_conf)->on_io_error;
+			rcu_read_unlock();
+
+			was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags);
+
+			if (was_io_error && eh == EP_CALL_HELPER)
+				drbd_khelper(device, "local-io-error");
+
+			/* Immediately allow completion of all application IO,
+			 * that waits for completion from the local disk,
+			 * if this was a force-detach due to disk_timeout
+			 * or administrator request (drbdsetup detach --force).
+			 * Do NOT abort otherwise.
+			 * Aborting local requests may cause serious problems,
+			 * if requests are completed to upper layers already,
+			 * and then later the already submitted local bio completes.
+			 * This can cause DMA into former bio pages that meanwhile
+			 * have been re-used for other things.
+			 * So aborting local requests may cause crashes,
+			 * or even worse, silent data corruption.
+			 */
+			if (test_and_clear_bit(FORCE_DETACH, &device->flags))
+				tl_abort_disk_io(device);
+
+			/* current state still has to be D_FAILED,
+			 * there is only one way out: to D_DISKLESS,
+			 * and that may only happen after our put_ldev below. */
+			if (device->state.disk != D_FAILED)
+				drbd_err(device,
+					"ASSERT FAILED: disk is %s during detach\n",
+					drbd_disk_str(device->state.disk));
+
+			if (ns.conn >= C_CONNECTED)
+				drbd_send_state(first_peer_device(device), ns);
+
+			drbd_rs_cancel_all(device);
+
+			/* In case we want to get something to stable storage still,
+			 * this may be the last chance.
+			 * Following put_ldev may transition to D_DISKLESS. */
+			drbd_md_sync(device);
+		}
+		put_ldev(device);
+	}
+
+	/* second half of local IO error, failure to attach,
+	 * or administrative detach,
+	 * after local_cnt references have reached zero again */
+	if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
+		/* We must still be diskless,
+		 * re-attach has to be serialized with this! */
+		if (device->state.disk != D_DISKLESS)
+			drbd_err(device,
+				 "ASSERT FAILED: disk is %s while going diskless\n",
+				 drbd_disk_str(device->state.disk));
+
+		if (ns.conn >= C_CONNECTED)
+			drbd_send_state(first_peer_device(device), ns);
+		/* corresponding get_ldev in __drbd_set_state
+		 * this may finally trigger drbd_ldev_destroy. */
+		put_ldev(device);
+	}
+
+	/* Notify peer that I had a local IO error, and did not detach. */
+	if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
+		drbd_send_state(first_peer_device(device), ns);
+
+	/* Disks got bigger while they were detached */
+	if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
+	    test_and_clear_bit(RESYNC_AFTER_NEG, &device->flags)) {
+		if (ns.conn == C_CONNECTED)
+			resync_after_online_grow(device);
+	}
+
+	/* A resync finished or aborted, wake paused devices... */
+	if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
+	    (os.peer_isp && !ns.peer_isp) ||
+	    (os.user_isp && !ns.user_isp))
+		resume_next_sg(device);
+
+	/* sync target done with resync.  Explicitly notify peer, even though
+	 * it should (at least for non-empty resyncs) already know itself. */
+	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
+		drbd_send_state(first_peer_device(device), ns);
+
+	/* Verify finished, or reached stop sector.  Peer did not know about
+	 * the stop sector, and we may even have changed the stop sector during
+	 * verify to interrupt/stop early.  Send the new state. */
+	if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
+	&& verify_can_do_stop_sector(device))
+		drbd_send_state(first_peer_device(device), ns);
+
+	/* This triggers bitmap writeout of potentially still unwritten pages
+	 * if the resync finished cleanly, or aborted because of peer disk
+	 * failure, or because of connection loss.
+	 * For resync aborted because of local disk failure, we cannot do
+	 * any bitmap writeout anymore.
+	 * No harm done if some bits change during this phase.
+	 */
+	if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(device)) {
+		drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL,
+			"write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
+		put_ldev(device);
+	}
+
+	if (ns.disk == D_DISKLESS &&
+	    ns.conn == C_STANDALONE &&
+	    ns.role == R_SECONDARY) {
+		if (os.aftr_isp != ns.aftr_isp)
+			resume_next_sg(device);
+	}
+
+	drbd_md_sync(device);
+}
+
+struct after_conn_state_chg_work {
+	struct drbd_work w; /* queued on connection->sender_work */
+	enum drbd_conns oc; /* old connection state */
+	union drbd_state ns_min; /* new, min state, over all devices */
+	union drbd_state ns_max; /* new, max state, over all devices */
+	enum chg_state_flags flags; /* flags of the state change */
+	struct drbd_connection *connection; /* kref held until the work ran */
+};
+
+/* Worker callback: sleeping follow-up actions of a connection state change. */
+static int w_after_conn_state_ch(struct drbd_work *w, int unused)
+{
+	struct after_conn_state_chg_work *acscw =
+		container_of(w, struct after_conn_state_chg_work, w);
+	struct drbd_connection *connection = acscw->connection;
+	enum drbd_conns oc = acscw->oc;
+	union drbd_state ns_max = acscw->ns_max;
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	kfree(acscw);
+	/* Upon network configuration, we need to start the receiver */
+	if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED)
+		drbd_thread_start(&connection->receiver);
+
+	if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) {
+		struct net_conf *old_conf;
+
+		mutex_lock(&connection->resource->conf_update);
+		old_conf = connection->net_conf;
+		connection->my_addr_len = 0;
+		connection->peer_addr_len = 0;
+		rcu_assign_pointer(connection->net_conf, NULL);
+		conn_free_crypto(connection);
+		mutex_unlock(&connection->resource->conf_update);
+		/* wait for RCU readers of net_conf before freeing it */
+		synchronize_rcu();
+		kfree(old_conf);
+	}
+
+	if (ns_max.susp_fen) {
+		/* case1: The outdate peer handler is successful: */
+		if (ns_max.pdsk <= D_OUTDATED) {
+			rcu_read_lock();
+			idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+				struct drbd_device *device = peer_device->device;
+				if (test_bit(NEW_CUR_UUID, &device->flags)) {
+					drbd_uuid_new_current(device);
+					clear_bit(NEW_CUR_UUID, &device->flags);
+				}
+			}
+			rcu_read_unlock();
+			spin_lock_irq(&connection->resource->req_lock);
+			_tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
+			_conn_request_state(connection,
+					    (union drbd_state) { { .susp_fen = 1 } },
+					    (union drbd_state) { { .susp_fen = 0 } },
+					    CS_VERBOSE);
+			spin_unlock_irq(&connection->resource->req_lock);
+		}
+	}
+
+	conn_md_sync(connection);
+	/* Drop the reference taken when queued; must be the last use. */
+	kref_put(&connection->kref, drbd_destroy_connection);
+	return 0;
+}
+
+/* Find the old state common to all volumes and flag the uniform fields. */
+void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf)
+{
+	enum chg_state_flags uniform = ~0;
+	struct drbd_peer_device *peer_device;
+	int vnr, have_ref = 0;
+	union drbd_dev_state cur, common = {
+		{ .role = R_SECONDARY,
+		  .peer = R_UNKNOWN,
+		  .conn = connection->cstate,
+		  .disk = D_DISKLESS,
+		  .pdsk = D_UNKNOWN,
+		} };
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		cur = peer_device->device->state;
+
+		/* the first volume becomes the reference for all others */
+		if (!have_ref) {
+			common = cur;
+			have_ref = 1;
+			continue;
+		}
+
+		/* each field that differs loses its CS_DC_* bit */
+		if (common.role != cur.role)
+			uniform &= ~CS_DC_ROLE;
+		if (common.peer != cur.peer)
+			uniform &= ~CS_DC_PEER;
+		if (common.conn != cur.conn)
+			uniform &= ~CS_DC_CONN;
+		if (common.disk != cur.disk)
+			uniform &= ~CS_DC_DISK;
+		if (common.pdsk != cur.pdsk)
+			uniform &= ~CS_DC_PDSK;
+	}
+	rcu_read_unlock();
+
+	/* set all CS_DC_* bits in *pf, then drop those of differing fields */
+	*pf |= CS_DC_MASK;
+	*pf &= uniform;
+
+	pcs->i = common.i;
+}
+
+/* Check the requested change against every volume; stop at the first failure. */
+static enum drbd_state_rv
+conn_is_valid_transition(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+			 enum chg_state_flags flags)
+{
+	enum drbd_state_rv rv = SS_SUCCESS;
+	struct drbd_peer_device *peer_device;
+	union drbd_state os, ns;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+
+		os = drbd_read_state(device);
+		ns = apply_mask_val(os, mask, val);
+		ns = sanitize_state(device, os, ns, NULL);
+		if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
+			ns.disk = os.disk;
+
+		/* nothing changes for this volume */
+		if (ns.i == os.i)
+			continue;
+
+		rv = is_valid_transition(os, ns);
+		if (rv >= SS_SUCCESS && !(flags & CS_HARD)) {
+			rv = is_valid_state(device, ns);
+			/* soft check also runs if os was invalid the same way as ns */
+			if (rv >= SS_SUCCESS || is_valid_state(device, os) == rv)
+				rv = is_valid_soft_transition(os, ns, connection);
+		}
+
+		if (rv < SS_SUCCESS) {
+			if (flags & CS_VERBOSE)
+				print_st_err(device, os, ns, rv);
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return rv;
+}
+
+/* Apply mask/val to every volume; hand back the min and max new state. */
+void
+conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+	       union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags)
+{
+	union drbd_state ns, os, ns_max = { };
+	union drbd_state ns_min = {
+		{ .role = R_MASK,
+		  .peer = R_MASK,
+		  .conn = val.conn,
+		  .disk = D_MASK,
+		  .pdsk = D_MASK
+		} };
+	struct drbd_peer_device *peer_device;
+	enum drbd_state_rv rv;
+	int vnr, nr_volumes = 0;
+
+	if (mask.conn == C_MASK) {
+		/* remember last connect time so request_timer_fn() won't
+		 * kill newly established sessions while we are still trying to thaw
+		 * previously frozen IO */
+		if (val.conn == C_WF_REPORT_PARAMS && connection->cstate != C_WF_REPORT_PARAMS)
+			connection->last_reconnect_jif = jiffies;
+
+		connection->cstate = val.conn;
+	}
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+
+		nr_volumes++;
+		os = drbd_read_state(device);
+		ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
+		if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
+			ns.disk = os.disk;
+
+		rv = __drbd_set_state(device, ns, flags, NULL);
+		BUG_ON(rv < SS_SUCCESS);
+
+		/* fold the state the device ended up in into min and max */
+		ns.i = device->state.i;
+		ns_max.role = max_role(ns.role, ns_max.role);
+		ns_min.role = min_role(ns.role, ns_min.role);
+		ns_max.peer = max_role(ns.peer, ns_max.peer);
+		ns_min.peer = min_role(ns.peer, ns_min.peer);
+		ns_max.conn = max_t(enum drbd_conns, ns.conn, ns_max.conn);
+		ns_min.conn = min_t(enum drbd_conns, ns.conn, ns_min.conn);
+		ns_max.disk = max_t(enum drbd_disk_state, ns.disk, ns_max.disk);
+		ns_min.disk = min_t(enum drbd_disk_state, ns.disk, ns_min.disk);
+		ns_max.pdsk = max_t(enum drbd_disk_state, ns.pdsk, ns_max.pdsk);
+		ns_min.pdsk = min_t(enum drbd_disk_state, ns.pdsk, ns_min.pdsk);
+	}
+	rcu_read_unlock();
+
+	/* without any volume there is nothing to fold; use fixed values */
+	if (!nr_volumes) {
+		ns_min = ns_max = (union drbd_state) { {
+				.role = R_SECONDARY,
+				.peer = R_UNKNOWN,
+				.conn = val.conn,
+				.disk = D_DISKLESS,
+				.pdsk = D_UNKNOWN
+			} };
+	}
+
+	ns_min.susp = ns_max.susp = connection->resource->susp;
+	ns_min.susp_nod = ns_max.susp_nod = connection->resource->susp_nod;
+	ns_min.susp_fen = ns_max.susp_fen = connection->resource->susp_fen;
+
+	*pns_min = ns_min;
+	*pns_max = ns_max;
+}
+
+/* Wait condition of the cluster-wide state change in _conn_request_state(). */
+static enum drbd_state_rv
+_conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val)
+{
+	enum drbd_state_rv peer_reply = SS_UNKNOWN_ERROR; /* continue waiting */
+	enum drbd_state_rv local;
+
+	if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags))
+		peer_reply = SS_CW_SUCCESS;
+	if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags))
+		peer_reply = SS_CW_FAILED_BY_PEER;
+
+	local = conn_is_valid_transition(connection, mask, val, 0);
+	if (local != SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS)
+		return local;
+	return peer_reply;
+}
+
+enum drbd_state_rv
+_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+		    enum chg_state_flags flags)
+{
+	enum drbd_state_rv rv = SS_SUCCESS;
+	struct after_conn_state_chg_work *acscw;
+	enum drbd_conns oc = connection->cstate;
+	union drbd_state ns_max, ns_min, os;
+	bool have_mutex = false; /* true while cstate_mutex is held */
+	/* Called with req_lock held; it is dropped and re-taken below. */
+	if (mask.conn) {
+		rv = is_valid_conn_transition(oc, val.conn);
+		if (rv < SS_SUCCESS)
+			goto abort;
+	}
+
+	rv = conn_is_valid_transition(connection, mask, val, flags);
+	if (rv < SS_SUCCESS)
+		goto abort;
+
+	if (oc == C_WF_REPORT_PARAMS && val.conn == C_DISCONNECTING &&
+	    !(flags & (CS_LOCAL_ONLY | CS_HARD))) {
+
+		/* This will be a cluster-wide state change.
+		 * Need to give up the spinlock, grab the mutex,
+		 * then send the state change request, ... */
+		spin_unlock_irq(&connection->resource->req_lock);
+		mutex_lock(&connection->cstate_mutex);
+		have_mutex = true;
+
+		set_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
+		if (conn_send_state_req(connection, mask, val)) {
+			/* sending failed. */
+			clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
+			rv = SS_CW_FAILED_BY_PEER;
+			/* need to re-acquire the spin lock, though */
+			goto abort_unlocked;
+		}
+
+		if (val.conn == C_DISCONNECTING)
+			set_bit(DISCONNECT_SENT, &connection->flags);
+
+		/* ... and re-acquire the spinlock.
+		 * If _conn_rq_cond() returned >= SS_SUCCESS, we must call
+		 * conn_set_state() within the same spinlock. */
+		spin_lock_irq(&connection->resource->req_lock);
+		wait_event_lock_irq(connection->ping_wait,
+				(rv = _conn_rq_cond(connection, mask, val)),
+				connection->resource->req_lock);
+		clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
+		if (rv < SS_SUCCESS)
+			goto abort;
+	}
+
+	conn_old_common_state(connection, &os, &flags);
+	flags |= CS_DC_SUSP;
+	conn_set_state(connection, mask, val, &ns_min, &ns_max, flags);
+	conn_pr_state_change(connection, os, ns_max, flags);
+
+	acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC);
+	if (acscw) {
+		acscw->oc = os.conn;
+		acscw->ns_min = ns_min;
+		acscw->ns_max = ns_max;
+		acscw->flags = flags;
+		acscw->w.cb = w_after_conn_state_ch;
+		kref_get(&connection->kref);
+		acscw->connection = connection;
+		drbd_queue_work(&connection->sender_work, &acscw->w);
+	} else {
+		drbd_err(connection, "Could not kmalloc an acscw\n");
+	}
+
+ abort:
+	if (have_mutex) {
+		/* mutex_unlock() "... must not be used in interrupt context.",
+		 * so give up the spinlock, then re-acquire it */
+		spin_unlock_irq(&connection->resource->req_lock);
+ abort_unlocked:
+		mutex_unlock(&connection->cstate_mutex);
+		spin_lock_irq(&connection->resource->req_lock);
+	}
+	if (rv < SS_SUCCESS && flags & CS_VERBOSE) {
+		drbd_err(connection, "State change failed: %s\n", drbd_set_st_err_str(rv));
+		drbd_err(connection, " mask = 0x%x val = 0x%x\n", mask.i, val.i);
+		drbd_err(connection, " old_conn:%s wanted_conn:%s\n", drbd_conn_str(oc), drbd_conn_str(val.conn));
+	}
+	return rv;
+}
+
+enum drbd_state_rv
+conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+		   enum chg_state_flags flags)
+{
+	spinlock_t *req_lock = &connection->resource->req_lock;
+	enum drbd_state_rv result;
+
+	spin_lock_irq(req_lock); /* the _conn_ variant runs under req_lock */
+	result = _conn_request_state(connection, mask, val, flags);
+	spin_unlock_irq(req_lock);
+	return result;
+}
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h
new file mode 100644
index 00000000000..cc41605ba21
--- /dev/null
+++ b/drivers/block/drbd/drbd_state.h
@@ -0,0 +1,161 @@
+#ifndef DRBD_STATE_H
+#define DRBD_STATE_H
+
+struct drbd_device;
+struct drbd_connection;
+
+/**
+ * DOC: DRBD State macros
+ *
+ * These macros are used to express state changes in easily readable form.
+ *
+ * The NS macros expand to a mask and a value, that can be bit ored onto the
+ * current state as soon as the spinlock (req_lock) was taken.
+ *
+ * The _NS macros are used for state functions that get called with the
+ * spinlock. These macros expand directly to the new state value.
+ *
+ * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
+ * to express state changes that affect more than one aspect of the state.
+ *
+ * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
+ * Means that the network connection was established and that the peer
+ * is in secondary role.
+ */
+#define role_MASK R_MASK
+#define peer_MASK R_MASK
+#define disk_MASK D_MASK
+#define pdsk_MASK D_MASK
+#define conn_MASK C_MASK
+#define susp_MASK 1 /* single-bit fields use mask 1 */
+#define user_isp_MASK 1
+#define aftr_isp_MASK 1
+#define susp_nod_MASK 1
+#define susp_fen_MASK 1
+/* NS(), NS2(), NS3(): expand to two arguments, a mask and a value. */
+#define NS(T, S) \
+	({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
+	({ union drbd_state val; val.i = 0; val.T = (S); val; })
+#define NS2(T1, S1, T2, S2) \
+	({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+	  mask.T2 = T2##_MASK; mask; }), \
+	({ union drbd_state val; val.i = 0; val.T1 = (S1); \
+	  val.T2 = (S2); val; })
+#define NS3(T1, S1, T2, S2, T3, S3) \
+	({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+	  mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
+	({ union drbd_state val;  val.i = 0; val.T1 = (S1); \
+	  val.T2 = (S2); val.T3 = (S3); val; })
+/* _NS(), _NS2(), _NS3(): expand to the device and its complete new state. */
+#define _NS(D, T, S) \
+	D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; })
+#define _NS2(D, T1, S1, T2, S2) \
+	D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
+	__ns.T2 = (S2); __ns; })
+#define _NS3(D, T1, S1, T2, S2, T3, S3) \
+	D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
+	__ns.T2 = (S2); __ns.T3 = (S3); __ns; })
+
+enum chg_state_flags {
+	CS_HARD	         = 1 << 0, /* skip the is_valid_state()/soft transition checks */
+	CS_VERBOSE       = 1 << 1, /* log state changes that fail */
+	CS_WAIT_COMPLETE = 1 << 2, /* complete() the waiter once after_state_ch() ran */
+	CS_SERIALIZE     = 1 << 3,
+	CS_ORDERED       = CS_WAIT_COMPLETE + CS_SERIALIZE,
+	CS_LOCAL_ONLY    = 1 << 4, /* Do not consider a device pair wide state change */
+	CS_DC_ROLE       = 1 << 5, /* DC = display as connection state change */
+	CS_DC_PEER       = 1 << 6,
+	CS_DC_CONN       = 1 << 7,
+	CS_DC_DISK       = 1 << 8,
+	CS_DC_PDSK       = 1 << 9,
+	CS_DC_SUSP       = 1 << 10,
+	CS_DC_MASK       = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK,
+	CS_IGN_OUTD_FAIL = 1 << 11, /* do not raise a lower disk state to D_OUTDATED */
+};
+
+/* drbd_dev_state and drbd_state are different types. This is to stress the
+   small difference. There is no suspended flag (.susp), and no suspended
+   while fence handler runs flag (susp_fen). */
+union drbd_dev_state {
+	struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+		unsigned role:2 ;   /* 3/4	 primary/secondary/unknown */
+		unsigned peer:2 ;   /* 3/4	 primary/secondary/unknown */
+		unsigned conn:5 ;   /* 17/32	 cstates */
+		unsigned disk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
+		unsigned pdsk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
+		unsigned _unused:1 ;
+		unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+		unsigned peer_isp:1 ;
+		unsigned user_isp:1 ;
+		unsigned _pad:11;   /* 0	 unused */
+#elif defined(__BIG_ENDIAN_BITFIELD)
+		unsigned _pad:11;
+		unsigned user_isp:1 ;
+		unsigned peer_isp:1 ;
+		unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+		unsigned _unused:1 ;
+		unsigned pdsk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
+		unsigned disk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
+		unsigned conn:5 ;   /* 17/32	 cstates */
+		unsigned peer:2 ;   /* 3/4	 primary/secondary/unknown */
+		unsigned role:2 ;   /* 3/4	 primary/secondary/unknown */
+#else
+# error "this endianess is not supported"
+#endif
+	};
+	unsigned int i;
+};
+
+extern enum drbd_state_rv drbd_change_state(struct drbd_device *device,
+					    enum chg_state_flags f,
+					    union drbd_state mask,
+					    union drbd_state val);
+extern void drbd_force_state(struct drbd_device *, union drbd_state,
+			union drbd_state);
+extern enum drbd_state_rv _drbd_request_state(struct drbd_device *,
+					      union drbd_state,
+					      union drbd_state,
+					      enum chg_state_flags);
+extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state,
+					   enum chg_state_flags,
+					   struct completion *done);
+extern void print_st_err(struct drbd_device *, union drbd_state,
+			union drbd_state, int);
+
+enum drbd_state_rv
+_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+		    enum chg_state_flags flags);
+
+enum drbd_state_rv
+conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+		   enum chg_state_flags flags);
+
+extern void drbd_resume_al(struct drbd_device *device);
+extern bool conn_all_vols_unconf(struct drbd_connection *connection);
+
+/**
+ * drbd_request_state() - Request a state change
+ * @device:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ *
+ * This is the most graceful way of requesting a state change. It passes
+ * CS_VERBOSE and CS_ORDERED, so it is quite verbose in case the state change
+ * is not possible, and all those state changes are globally serialized.
+ */
+static inline int drbd_request_state(struct drbd_device *device,
+				     union drbd_state mask,
+				     union drbd_state val)
+{
+	return _drbd_request_state(device, mask, val, CS_VERBOSE | CS_ORDERED);
+}
+
+enum drbd_role conn_highest_role(struct drbd_connection *connection);
+enum drbd_role conn_highest_peer(struct drbd_connection *connection);
+enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection);
+enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection);
+enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection);
+enum drbd_conns conn_lowest_conn(struct drbd_connection *connection);
+
+#endif
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
new file mode 100644
index 00000000000..80b0f63c707
--- /dev/null
+++ b/drivers/block/drbd/drbd_strings.c
@@ -0,0 +1,118 @@
+/*
+  drbd_strings.c
+
+  This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+  Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+  Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+  Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+  drbd is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2, or (at your option)
+  any later version.
+
+  drbd is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with drbd; see the file COPYING.  If not, write to
+  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include <linux/drbd.h>
+#include "drbd_strings.h"
+
+static const char * const drbd_conn_s_names[] = {	/* indexed by enum drbd_conns */
+	[C_STANDALONE]       = "StandAlone",
+	[C_DISCONNECTING]    = "Disconnecting",
+	[C_UNCONNECTED]      = "Unconnected",
+	[C_TIMEOUT]          = "Timeout",
+	[C_BROKEN_PIPE]      = "BrokenPipe",
+	[C_NETWORK_FAILURE]  = "NetworkFailure",
+	[C_PROTOCOL_ERROR]   = "ProtocolError",
+	[C_WF_CONNECTION]    = "WFConnection",
+	[C_WF_REPORT_PARAMS] = "WFReportParams",
+	[C_TEAR_DOWN]        = "TearDown",
+	[C_CONNECTED]        = "Connected",
+	[C_STARTING_SYNC_S]  = "StartingSyncS",
+	[C_STARTING_SYNC_T]  = "StartingSyncT",
+	[C_WF_BITMAP_S]      = "WFBitMapS",
+	[C_WF_BITMAP_T]      = "WFBitMapT",
+	[C_WF_SYNC_UUID]     = "WFSyncUUID",
+	[C_SYNC_SOURCE]      = "SyncSource",
+	[C_SYNC_TARGET]      = "SyncTarget",
+	[C_PAUSED_SYNC_S]    = "PausedSyncS",
+	[C_PAUSED_SYNC_T]    = "PausedSyncT",
+	[C_VERIFY_S]         = "VerifyS",
+	[C_VERIFY_T]         = "VerifyT",
+	[C_AHEAD]            = "Ahead",
+	[C_BEHIND]           = "Behind",
+};
+
+static const char * const drbd_role_s_names[] = {	/* indexed by enum drbd_role */
+	[R_PRIMARY]   = "Primary",
+	[R_SECONDARY] = "Secondary",
+	[R_UNKNOWN]   = "Unknown",
+};
+
+static const char * const drbd_disk_s_names[] = {	/* indexed by enum drbd_disk_state */
+	[D_DISKLESS]     = "Diskless",
+	[D_ATTACHING]    = "Attaching",
+	[D_FAILED]       = "Failed",
+	[D_NEGOTIATING]  = "Negotiating",
+	[D_INCONSISTENT] = "Inconsistent",
+	[D_OUTDATED]     = "Outdated",
+	[D_UNKNOWN]      = "DUnknown",
+	[D_CONSISTENT]   = "Consistent",
+	[D_UP_TO_DATE]   = "UpToDate",
+};
+
+static const char *drbd_state_sw_errors[] = {
+	[-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
+	[-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data",
+	[-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
+	[-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
+	[-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
+	[-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated",
+	[-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active",
+	[-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device",
+	[-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node",
+	[-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk",
+	[-SS_DEVICE_IN_USE] = "Device is held open by someone",
+	[-SS_NO_NET_CONFIG] = "Have no net/connection configuration",
+	[-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify",
+	[-SS_NEED_CONNECTION] = "Need a connection to start verify or resync",
+	[-SS_NOT_SUPPORTED] = "Peer does not support protocol",
+	[-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
+	[-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
+	[-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
+	[-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer",
+	[-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config",
+};
+
+const char *drbd_conn_str(enum drbd_conns s)
+{
+	/* only the upper bound is checked: enums are unsigned... */
+	return s <= C_BEHIND ? drbd_conn_s_names[s] : "TOO_LARGE";
+}
+
+const char *drbd_role_str(enum drbd_role s)
+{
+	return s <= R_SECONDARY ? drbd_role_s_names[s] : "TOO_LARGE";	/* name, or marker if out of range */
+}
+
+const char *drbd_disk_str(enum drbd_disk_state s)
+{
+	return s <= D_UP_TO_DATE ? drbd_disk_s_names[s] : "TOO_LARGE";	/* name, or marker if out of range */
+}
+
+const char *drbd_set_st_err_str(enum drbd_state_rv err)
+{
+	if (err <= SS_AFTER_LAST_ERROR)	/* below the range of known error codes */
+		return "TOO_SMALL";
+	return err > SS_TWO_PRIMARIES ? "TOO_LARGE" : drbd_state_sw_errors[-err];
+}
diff --git a/drivers/block/drbd/drbd_strings.h b/drivers/block/drbd/drbd_strings.h
new file mode 100644
index 00000000000..f9923cc88af
--- /dev/null
+++ b/drivers/block/drbd/drbd_strings.h
@@ -0,0 +1,9 @@
+#ifndef __DRBD_STRINGS_H
+#define __DRBD_STRINGS_H
+
+extern const char *drbd_conn_str(enum drbd_conns);
+extern const char *drbd_role_str(enum drbd_role);
+extern const char *drbd_disk_str(enum drbd_disk_state);
+extern const char *drbd_set_st_err_str(enum drbd_state_rv);
+
+#endif  /* __DRBD_STRINGS_H */
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h
new file mode 100644
index 00000000000..8cb1532a381
--- /dev/null
+++ b/drivers/block/drbd/drbd_vli.h
@@ -0,0 +1,351 @@
+/*
+-*- linux-c -*-
+   drbd_vli.h
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DRBD_VLI_H
+#define _DRBD_VLI_H
+
+/*
+ * At a granularity of 4KiB storage represented per bit,
+ * and storage sizes of several TiB,
+ * and possibly small-bandwidth replication,
+ * the bitmap transfer time can take much too long,
+ * if transmitted in plain text.
+ *
+ * We try to reduce the transferred bitmap information
+ * by encoding runlengths of bit polarity.
+ *
+ * We never actually need to encode a "zero" (runlengths are positive).
+ * But then we have to store the value of the first bit.
+ * The first bit of information thus shall encode if the first runlength
+ * gives the number of set or unset bits.
+ *
+ * We assume that large areas are either completely set or unset,
+ * which gives good compression with any runlength method,
+ * even when encoding the runlength as fixed size 32bit/64bit integers.
+ *
+ * Still, there may be areas where the polarity flips every few bits,
+ * and encoding the runlength sequence of those areas with fix size
+ * integers would be much worse than plaintext.
+ *
+ * We want to encode small runlength values with minimum code length,
+ * while still being able to encode a Huge run of all zeros.
+ *
+ * Thus we need a Variable Length Integer encoding, VLI.
+ *
+ * For some cases, we produce more code bits than plaintext input.
+ * We need to send incompressible chunks as plaintext, skip over them
+ * and then see if the next chunk compresses better.
+ *
+ * We don't care too much about "excellent" compression ratio for large
+ * runlengths (all set/all clear): whether we achieve a factor of 100
+ * or 1000 is not that much of an issue.
+ * We do not want to waste too much on short runlengths in the "noisy"
+ * parts of the bitmap, though.
+ *
+ * There are endless variants of VLI, we experimented with:
+ *  * simple byte-based
+ *  * various bit based with different code word length.
+ *
+ * To avoid yet an other configuration parameter (choice of bitmap compression
+ * algorithm) which was difficult to explain and tune, we just chose the one
+ * variant that turned out best in all test cases.
+ * Based on real world usage patterns, with device sizes ranging from a few GiB
+ * to several TiB, file server/mailserver/webserver/mysql/postgres,
+ * mostly idle to really busy, the all time winner (though sometimes only
+ * marginally better) is:
+ */
+
+/*
+ * encoding is "visualised" as
+ * __little endian__ bitstream, least significant bit first (left most)
+ *
+ * this particular encoding is chosen so that the prefix code
+ * starts as unary encoding the level, then modified so that
+ * 10 levels can be described in 8bit, with minimal overhead
+ * for the smaller levels.
+ *
+ * Number of data bits follow fibonacci sequence, with the exception of the
+ * last level (+1 data bit, so it makes 64bit total).  The only worse code when
+ * encoding bit polarity runlength is 1 plain bits => 2 code bits.
+prefix    data bits                                    max val  Nº data bits
+0 x                                                         0x2            1
+10 x                                                        0x4            1
+110 xx                                                      0x8            2
+1110 xxx                                                   0x10            3
+11110 xxx xx                                               0x30            5
+111110 xx xxxxxx                                          0x130            8
+11111100  xxxxxxxx xxxxx                                 0x2130           13
+11111110  xxxxxxxx xxxxxxxx xxxxx                      0x202130           21
+11111101  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xx   0x400202130           34
+11111111  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
+ * maximum encodable value: 0x100000400202130 == 2**56 + some */
+
+/* compression "table":
+ transmitted   x                                0.29
+ as plaintext x                                  ........................
+             x                                   ........................
+            x                                    ........................
+           x    0.59                         0.21........................
+          x      ........................................................
+         x       .. c ...................................................
+        x    0.44.. o ...................................................
+       x .......... d ...................................................
+      x  .......... e ...................................................
+     X.............   ...................................................
+    x.............. b ...................................................
+2.0x............... i ...................................................
+ #X................ t ...................................................
+ #................. s ...........................  plain bits  ..........
+-+-----------------------------------------------------------------------
+ 1             16              32                              64
+*/
+
+/* LEVEL: (total bits, prefix bits, prefix value),
+ * sorted ascending by number of total bits.
+ * The rest of the code table is calculated at compile time from this. */
+
+/* fibonacci data 1, 1, ...; LEVEL(t,b,v) must be #defined where this is expanded */
+#define VLI_L_1_1() do { \
+	LEVEL( 2, 1, 0x00); \
+	LEVEL( 3, 2, 0x01); \
+	LEVEL( 5, 3, 0x03); \
+	LEVEL( 7, 4, 0x07); \
+	LEVEL(10, 5, 0x0f); \
+	LEVEL(14, 6, 0x1f); \
+	LEVEL(21, 8, 0x3f); \
+	LEVEL(29, 8, 0x7f); \
+	LEVEL(42, 8, 0xbf); \
+	LEVEL(64, 8, 0xff); \
+	} while (0)
+
+/* Tries the levels of the code table, in ascending order, against the least
+ * significant bits of @in.  On the first prefix match, stores the decoded
+ * value in *out and returns the number of bits consumed.
+ * BUG() for bad input, as that would mean a buggy code table. */
+static inline int vli_decode_bits(u64 *out, const u64 in)
+{
+	u64 base = 1;	/* smallest value of the level currently tried */
+
+#define LEVEL(t,b,v)							\
+	do {								\
+		if ((in & ((1ULL << (b)) - 1)) == (v)) {		\
+			*out = ((in & (~0ULL >> (64 - (t)))) >> (b)) + base; \
+			return (t);					\
+		}							\
+		base += 1ULL << ((t) - (b));				\
+	} while (0)
+
+	VLI_L_1_1();
+
+	/* NOT REACHED, if VLI_LEVELS code table is defined properly */
+	BUG();
+#undef LEVEL
+}
+
+/* return number of code bits needed (code word stored in *out unless @out is
+ * NULL), -EINVAL if @in is zero, or -EOVERFLOW if no level can hold @in */
+static inline int __vli_encode_bits(u64 *out, const u64 in)
+{
+	u64 limit = 0;	/* largest value encodable by the levels tried so far */
+	u64 base = 1;	/* smallest value of the level currently tried */
+
+	if (!in)
+		return -EINVAL;
+
+#define LEVEL(t,b,v) do {					\
+		limit += 1ULL << ((t) - (b));			\
+		if (in <= limit) {				\
+			if (out)				\
+				*out = ((in - base) << (b)) | (v);	\
+			return (t);				\
+		}						\
+		base = limit + 1;				\
+	} while (0)
+
+	VLI_L_1_1();
+
+	return -EOVERFLOW;
+#undef LEVEL
+}
+
+#undef VLI_L_1_1
+
+/* code from here down is independent of the actually used bit code */
+
+/*
+ * Code length is determined by some unique (e.g. unary) prefix.
+ * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
+ * not a byte stream.
+ */
+
+/* for the bitstream, we need a cursor: a byte pointer plus a bit offset */
+struct bitstream_cursor {
+	/* the current byte */
+	u8 *b;
+	/* the current bit within *b, normalized: 0..7 */
+	unsigned int bit;
+};
+
+/* point the cursor at bit 0 of the first byte of the stream starting at @s */
+static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
+{
+	cur->bit = 0;
+	cur->b = s;
+}
+
+/* advance cursor by @bits bits; expected to be at most 64, but may be more */
+static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
+{
+	const unsigned int pos = cur->bit + bits;	/* bit offset relative to *cur->b */
+
+	cur->b += pos >> 3;
+	cur->bit = pos & 7;
+}
+
+/* the bitstream itself knows its length */
+struct bitstream {
+	struct bitstream_cursor cur;	/* current read/write position */
+	unsigned char *buf;		/* start of the backing buffer */
+	size_t buf_len;		/* in bytes */
+
+	/* for input stream:
+	 * number of trailing 0 bits for padding
+	 * total number of valid bits in stream: buf_len * 8 - pad_bits */
+	unsigned int pad_bits;
+};
+
+static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
+{
+	bs->pad_bits = pad_bits;	/* trailing padding bits, for input streams */
+	bs->buf_len = len;		/* in bytes */
+	bs->buf = s;
+	bitstream_cursor_reset(&bs->cur, s);	/* start at the first bit of @s */
+}
+
+static inline void bitstream_rewind(struct bitstream *bs)
+{
+	memset(bs->buf, 0, bs->buf_len);		/* wipe the whole buffer ... */
+	bitstream_cursor_reset(&bs->cur, bs->buf);	/* ... and start over at its first bit */
+}
+
+/* Put (at most 64) least significant bits of val into bitstream, and advance
+ * cursor.  Ignores "pad_bits".
+ * Returns zero if bits == 0 (nothing to do), the number of bits used on
+ * success, or -ENOBUFS (bitstream left unchanged) if there is not enough
+ * room left in the bitstream.
+ */
+static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
+{
+	unsigned char *dst = bs->cur.b;
+	unsigned int done;	/* number of bits of val already stored */
+
+	if (!bits)
+		return 0;
+
+	if ((bs->cur.b + ((bs->cur.bit + bits - 1) >> 3)) - bs->buf >= bs->buf_len)
+		return -ENOBUFS;
+
+	/* paranoia: strip off hi bits; they should not be set anyways. */
+	if (bits < 64)
+		val &= ~0ULL >> (64 - bits);
+
+	*dst++ |= (val & 0xff) << bs->cur.bit;
+	done = 8 - bs->cur.bit;
+	while (done < bits) {
+		*dst++ |= (val >> done) & 0xff;
+		done += 8;
+	}
+
+	bitstream_cursor_advance(&bs->cur, bits);
+	return bits;
+}
+
+/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
+ *
+ * If more than 64 bits are requested, returns -EINVAL and leaves *out
+ * unchanged.  Negative @bits are not checked for.
+ * If there are fewer than the requested number of valid (non-padding) bits
+ * left in the bitstream, still fetches all available bits.
+ *
+ * Returns number of actually fetched bits; if that is 0, *out is set to 0.
+ */
+static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
+{
+	u64 val;
+	unsigned int n;
+
+	if (bits > 64)
+		return -EINVAL;
+
+	if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
+		bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
+			- bs->cur.bit - bs->pad_bits;
+
+	if (bits == 0) {
+		*out = 0;
+		return 0;
+	}
+
+	/* get the high bits: everything beyond the current byte */
+	val = 0;
+	n = (bs->cur.bit + bits + 7) >> 3;
+	/* n may be at most 9, if cur.bit + bits > 64 */
+	/* which means this copies at most 8 bytes */
+	if (n) {
+		memcpy(&val, bs->cur.b+1, n - 1);
+		val = le64_to_cpu(val) << (8 - bs->cur.bit);
+	}
+
+	/* we still need the low bits, taken from the current byte */
+	val |= bs->cur.b[0] >> bs->cur.bit;
+
+	/* and mask out bits we don't want */
+	val &= ~0ULL >> (64 - bits);
+
+	bitstream_cursor_advance(&bs->cur, bits);
+	*out = val;
+
+	return bits;
+}
+
+/* encodes @in as vli into @bs;
+ *
+ * return values
+ *  > 0: number of bits successfully stored in bitstream
+ * -ENOBUFS @bs is full
+ * -EINVAL input zero (invalid)
+ * -EOVERFLOW input too large for this vli code (invalid)
+ */
+static inline int vli_encode_bits(struct bitstream *bs, u64 in)
+{
+	u64 code = 0;	/* overwritten by __vli_encode_bits() whenever it succeeds */
+	int bits = __vli_encode_bits(&code, in);
+
+	if (bits <= 0)
+		return bits;
+
+	return bitstream_put_bits(bs, code, bits);
+}
+
+#endif
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
new file mode 100644
index 00000000000..d8f57b6305c
--- /dev/null
+++ b/drivers/block/drbd/drbd_worker.c
@@ -0,0 +1,1958 @@
+/*
+   drbd_worker.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+
+static int make_ov_request(struct drbd_device *, int);
+static int make_resync_request(struct drbd_device *, int);
+
+/* endio handlers:
+ *   drbd_md_io_complete (defined here)
+ *   drbd_request_endio (defined here)
+ *   drbd_peer_request_endio (defined here)
+ *   bm_async_io_complete (defined in drbd_bitmap.c)
+ *
+ * For all these callbacks, note the following:
+ * The callbacks will be called in irq context by the IDE drivers,
+ * and in Softirqs/Tasklets/BH context by the SCSI drivers.
+ * Try to get the locking right :)
+ *
+ */
+
+
+/* About the global_state_lock
+   Each state transition on a device holds a read lock. In case we have
+   to evaluate the resync after dependencies, we grab a write lock, because
+   we need stable states on all devices for that.  */
+rwlock_t global_state_lock;
+
+/* Completion handler for synchronous meta data and bitmap IO
+ * submitted by drbd_md_sync_page_io().
+ *
+ * Records @error in the struct drbd_md_io found in bio->bi_private, marks
+ * it done, wakes up device->misc_wait and drops the bio.
+ */
+void drbd_md_io_complete(struct bio *bio, int error)
+{
+	struct drbd_md_io *md_io = bio->bi_private;
+	struct drbd_device *device = container_of(md_io, struct drbd_device, md_io);
+
+	md_io->error = error;
+
+	/* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
+	 * to timeout on the lower level device, and eventually detach from it.
+	 * If this io completion runs after that timeout expired, this
+	 * drbd_md_put_buffer() may allow us to finally try and re-attach.
+	 * During normal operation, this only puts that extra reference
+	 * down to 1 again.
+	 * Make sure we first drop the reference, and only then signal
+	 * completion, or we may (in drbd_al_read_log()) cycle so fast into the
+	 * next drbd_md_sync_page_io(), that we trigger the
+	 * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
+	 */
+	drbd_md_put_buffer(device);
+	md_io->done = 1;
+	wake_up(&device->misc_wait);
+	bio_put(bio);
+	if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
+		put_ldev(device);
+}
+
+/* reads on behalf of the partner, "submitted" by the receiver: final stage.
+ * Accounts the sectors read, unlinks peer_req from its list and hands its
+ * work item to the connection's sender_work queue. */
+static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
+{
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	unsigned long irq_flags = 0;
+
+	spin_lock_irqsave(&device->resource->req_lock, irq_flags);
+	device->read_cnt += peer_req->i.size >> 9;
+	list_del(&peer_req->w.list);
+	if (list_empty(&device->read_ee))
+		wake_up(&device->ee_wait);
+	if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
+		__drbd_chk_io_error(device, DRBD_READ_ERROR);
+	spin_unlock_irqrestore(&device->resource->req_lock, irq_flags);
+
+	drbd_queue_work(&peer_device->connection->sender_work, &peer_req->w);
+	put_ldev(device);
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver, final stage: moves peer_req to done_ee,
+ * does the completion bookkeeping and wakes the asender.  */
+void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
+{
+	/* after we moved peer_req to done_ee,
+	 * we may no longer access it,
+	 * it may be freed/reused already!
+	 * (as soon as we release the req_lock): take copies up front */
+	struct drbd_interval i = peer_req->i;
+	const int do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
+	const u64 block_id = peer_req->block_id;
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	unsigned long flags = 0;
+	int do_wake;
+
+	spin_lock_irqsave(&device->resource->req_lock, flags);
+	device->writ_cnt += peer_req->i.size >> 9;
+	list_move_tail(&peer_req->w.list, &device->done_ee);
+
+	/*
+	 * Do not remove from the write_requests tree here: we did not send the
+	 * Ack yet and did not wake possibly waiting conflicting requests.
+	 * Removed from the tree from "drbd_process_done_ee" within the
+	 * appropriate w.cb (e_end_block/e_end_resync_block) or from
+	 * _drbd_clear_done_ee.
+	 */
+
+	if (block_id == ID_SYNCER)
+		do_wake = list_empty(&device->sync_ee);
+	else
+		do_wake = list_empty(&device->active_ee);
+
+	/* FIXME do we want to detach for failed REQ_DISCARD?
+	 * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
+	if (peer_req->flags & EE_WAS_ERROR)
+		__drbd_chk_io_error(device, DRBD_WRITE_ERROR);
+	spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+	if (block_id == ID_SYNCER)
+		drbd_rs_complete_io(device, i.sector);
+
+	if (do_wake)
+		wake_up(&device->ee_wait);
+
+	if (do_al_complete_io)
+		drbd_al_complete_io(device, &i);
+
+	wake_asender(peer_device->connection);
+	put_ldev(device);
+}
+
+/* bio completion for reads/writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver.  Once the last pending bio of the peer request
+ * completed, hands over to the read or write final stage. */
+void drbd_peer_request_endio(struct bio *bio, int error)
+{
+	struct drbd_peer_request *peer_req = bio->bi_private;
+	struct drbd_device *device = peer_req->peer_device->device;
+	const int is_discard = !!(bio->bi_rw & REQ_DISCARD);
+	const int is_write = bio_data_dir(bio) == WRITE;
+	const int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+	if (error) {
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_warn(device, "%s: error=%d s=%llus\n",
+					is_write ? (is_discard ? "discard" : "write") : "read",
+					error, (unsigned long long)peer_req->i.sector);
+	} else if (!uptodate) {
+		/* strange behavior of some lower level drivers...
+		 * fail the request by clearing the uptodate flag,
+		 * but do not return any error?! */
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_warn(device, "%s: setting error to -EIO s=%llus\n",
+					is_write ? "write" : "read",
+					(unsigned long long)peer_req->i.sector);
+		error = -EIO;
+	}
+
+	if (error)
+		set_bit(__EE_WAS_ERROR, &peer_req->flags);
+
+	bio_put(bio); /* no need for the bio anymore */
+	if (!atomic_dec_and_test(&peer_req->pending_bios))
+		return;
+	if (is_write)
+		drbd_endio_write_sec_final(peer_req);
+	else
+		drbd_endio_read_sec_final(peer_req);
+}
+
+/* Completion of read, readA or write requests on R_PRIMARY coming from
+ * drbd_make_request: maps the result to a request event for __req_mod(). */
+void drbd_request_endio(struct bio *bio, int error)
+{
+	struct drbd_request *req = bio->bi_private;
+	struct drbd_device *device = req->device;
+	int uptodate = bio_flagged(bio, BIO_UPTODATE);
+	enum drbd_req_event what;
+	struct bio_and_error m;
+	unsigned long flags;
+
+	if (!error && !uptodate) {
+		drbd_warn(device, "p %s: setting error to -EIO\n",
+			 bio_data_dir(bio) == WRITE ? "write" : "read");
+		/* strange behavior of some lower level drivers...
+		 * fail the request by clearing the uptodate flag,
+		 * but do not return any error?! */
+		error = -EIO;
+	}
+
+
+	/* If this request was aborted locally before,
+	 * but now was completed "successfully",
+	 * chances are that this caused arbitrary data corruption.
+	 *
+	 * "aborting" requests, or force-detaching the disk, is intended for
+	 * completely blocked/hung local backing devices which do no longer
+	 * complete requests at all, not even do error completions.  In this
+	 * situation, usually a hard-reset and failover is the only way out.
+	 *
+	 * By "aborting", basically faking a local error-completion,
+	 * we allow for a more graceful switchover by cleanly migrating services.
+	 * Still the affected node has to be rebooted "soon".
+	 *
+	 * By completing these requests, we allow the upper layers to re-use
+	 * the associated data pages.
+	 *
+	 * If later the local backing device "recovers", and now DMAs some data
+	 * from disk into the original request pages, in the best case it will
+	 * just put random data into unused pages; but typically it will corrupt
+	 * meanwhile completely unrelated data, causing all sorts of damage.
+	 *
+	 * Which means delayed successful completion,
+	 * especially for READ requests,
+	 * is a reason to panic().
+	 *
+	 * We assume that a delayed *error* completion is OK,
+	 * though we still will complain noisily about it.
+	 */
+	if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
+
+		if (!error)
+			panic("possible random memory corruption caused by delayed completion of aborted local request\n");
+	}
+
+	/* to avoid recursion in __req_mod */
+	if (unlikely(error)) {
+		if (bio->bi_rw & REQ_DISCARD)
+			what = (error == -EOPNOTSUPP) ? DISCARD_COMPLETED_NOTSUPP
+						      : DISCARD_COMPLETED_WITH_ERROR;
+		else if (bio_data_dir(bio) == WRITE)
+			what = WRITE_COMPLETED_WITH_ERROR;
+		else if (bio_rw(bio) == READ)
+			what = READ_COMPLETED_WITH_ERROR;
+		else
+			what = READ_AHEAD_COMPLETED_WITH_ERROR;
+	} else {
+		what = COMPLETED_OK;
+	}
+
+	bio_put(req->private_bio);
+	req->private_bio = ERR_PTR(error);
+
+	/* not req_mod(), we need irqsave here! */
+	spin_lock_irqsave(&device->resource->req_lock, flags);
+	__req_mod(req, what, &m);
+	spin_unlock_irqrestore(&device->resource->req_lock, flags);
+	put_ldev(device);
+
+	if (m.bio)
+		complete_master_bio(device, &m);
+}
+
+/* Compute the digest of the data pages chained to @peer_req using @tfm,
+ * and store it in @digest. */
+void drbd_csum_ee(struct crypto_hash *tfm, struct drbd_peer_request *peer_req, void *digest)
+{
+	struct hash_desc desc;
+	struct scatterlist sg;
+	struct page *page, *next;
+	unsigned len;
+
+	desc.tfm = tfm;
+	desc.flags = 0;
+
+	sg_init_table(&sg, 1);
+	crypto_hash_init(&desc);
+
+	for (page = peer_req->pages; (next = page_chain_next(page)); page = next) {
+		/* all but the last page will be fully used */
+		sg_set_page(&sg, page, PAGE_SIZE, 0);
+		crypto_hash_update(&desc, &sg, sg.length);
+	}
+	/* and now the last, possibly only partially used page */
+	len = peer_req->i.size & (PAGE_SIZE - 1);
+	sg_set_page(&sg, page, len ? len : PAGE_SIZE, 0);
+	crypto_hash_update(&desc, &sg, sg.length);
+	crypto_hash_final(&desc, digest);
+}
+
+/* Compute the digest of all segments of @bio using @tfm; store it in @digest. */
+void drbd_csum_bio(struct crypto_hash *tfm, struct bio *bio, void *digest)
+{
+	struct bvec_iter iter;
+	struct bio_vec bvec;
+	struct scatterlist sg;
+	struct hash_desc desc;
+
+	desc.flags = 0;
+	desc.tfm = tfm;
+	sg_init_table(&sg, 1);
+	crypto_hash_init(&desc);
+
+	bio_for_each_segment(bvec, bio, iter) {
+		sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
+		crypto_hash_update(&desc, &sg, sg.length);
+	}
+	crypto_hash_final(&desc, digest);
+}
+
+/* MAYBE merge common code with w_e_end_ov_req */
+static int w_e_send_csum(struct drbd_work *w, int cancel)
+{
+	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	int digest_size;
+	void *digest;
+	int err = 0;
+	sector_t sector;
+	unsigned int size;
+
+	if (unlikely(cancel) || unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
+		goto out;
+
+	digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm);
+	digest = kmalloc(digest_size, GFP_NOIO);
+	if (!digest) {
+		drbd_err(device, "kmalloc() of digest failed.\n");
+		err = -ENOMEM;
+		goto out;
+	}
+
+	sector = peer_req->i.sector;
+	size = peer_req->i.size;
+	drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
+	/* Free peer_req and pages before send.
+	 * In case we block on congestion, we could otherwise run into
+	 * some distributed deadlock, if the other side blocks on
+	 * congestion as well, because our receiver blocks in
+	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
+	drbd_free_peer_req(device, peer_req);
+	peer_req = NULL;
+	inc_rs_pending(device);
+	err = drbd_send_drequest_csum(peer_device, sector, size,
+				      digest, digest_size,
+				      P_CSUM_RS_REQUEST);
+	kfree(digest);
+
+out:
+	if (peer_req)
+		drbd_free_peer_req(device, peer_req);
+
+	if (unlikely(err))
+		drbd_err(device, "drbd_send_drequest(..., csum) failed\n");
+	return err;
+}
+
+#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
+
+/* Submit a local read of @size bytes at @sector, with w_e_send_csum() as the
+ * peer request's work callback.  Returns 0 if the read was submitted, -EIO
+ * if get_ldev() fails, or -EAGAIN if the request is deferred. */
+static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, int size)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_peer_request *peer_req;
+
+	if (!get_ldev(device))
+		return -EIO;
+	if (drbd_rs_should_slow_down(device, sector))
+		goto defer;
+
+	/* GFP_TRY, because if there is no memory available right now, this may
+	 * be rescheduled for later. It is "only" background resync, after all. */
+	peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
+				       size, true /* has real payload */, GFP_TRY);
+	if (!peer_req)
+		goto defer;
+
+	peer_req->w.cb = w_e_send_csum;
+	spin_lock_irq(&device->resource->req_lock);
+	list_add(&peer_req->w.list, &device->read_ee);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	atomic_add(size >> 9, &device->rs_sect_ev);
+	if (!drbd_submit_peer_request(device, peer_req, READ, DRBD_FAULT_RS_RD))
+		return 0;
+
+	/* If it failed because of ENOMEM, retry should help.  If it failed
+	 * because bio_add_page failed (probably broken lower level driver),
+	 * retry may or may not help; then you may need to force disconnect. */
+	spin_lock_irq(&device->resource->req_lock);
+	list_del(&peer_req->w.list);
+	spin_unlock_irq(&device->resource->req_lock);
+	drbd_free_peer_req(device, peer_req);
+defer:
+	put_ldev(device);
+	return -EAGAIN;
+}
+
+int w_resync_timer(struct drbd_work *w, int cancel)
+{
+	struct drbd_device *device = container_of(w, struct drbd_device, resync_work);
+
+	/* only these two connection states issue requests here; always returns 0 */
+	switch (device->state.conn) {
+	case C_SYNC_TARGET:
+		make_resync_request(device, cancel);
+		break;
+	case C_VERIFY_S:
+		make_ov_request(device, cancel);
+		break;
+	}
+
+	return 0;
+}
+
+void resync_timer_fn(unsigned long data)
+{
+	struct drbd_device *dev = (struct drbd_device *) data;
+
+	if (!list_empty(&dev->resync_work.list))
+		return;		/* resync_work is still linked on a list: do not queue it again */
+	drbd_queue_work(&first_peer_device(dev)->connection->sender_work, &dev->resync_work);
+}
+
+static void fifo_set(struct fifo_buffer *fb, int value)
+{
+	int idx = 0;		/* overwrite every slot of the fifo with @value */
+
+	while (idx < fb->size)
+		fb->values[idx++] = value;
+}
+
+static int fifo_push(struct fifo_buffer *fb, int value)
+{
+	/* Store @value in the head slot and hand back what it replaced. */
+	const int displaced = fb->values[fb->head_index];
+
+	fb->values[fb->head_index] = value;
+	fb->head_index++;
+	if (fb->head_index >= fb->size)
+		fb->head_index = 0;	/* wrap around to the first slot */
+
+	return displaced;
+}
+
+static void fifo_add_val(struct fifo_buffer *fb, int value)
+{
+	int idx = 0;		/* add @value to every slot of the fifo */
+
+	while (idx < fb->size)
+		fb->values[idx++] += value;
+}
+
+struct fifo_buffer *fifo_alloc(int fifo_size)
+{
+	/* One allocation: the header followed by @fifo_size int slots. */
+	struct fifo_buffer *fb =
+		kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO);
+
+	if (fb) {
+		fb->head_index = 0;
+		fb->size = fifo_size;
+		fb->total = 0;
+	}
+
+	return fb;	/* NULL when the allocation failed */
+}
+
+static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
+{	/* Returns the number of sectors to request this turn, clamped to 0..max_sect. */
+	struct disk_conf *dc;
+	unsigned int want;     /* The number of sectors we want in the proxy */
+	int req_sect; /* Number of sectors to request in this turn */
+	int correction; /* Number of sectors more we need in the proxy*/
+	int cps; /* correction per invocation of drbd_rs_controller() */
+	int steps; /* Number of time steps to plan ahead */
+	int curr_corr; /* value popped from the plan fifo for the current step */
+	int max_sect; /* upper bound for req_sect, derived from dc->c_max_rate */
+	struct fifo_buffer *plan;
+
+	dc = rcu_dereference(device->ldev->disk_conf);	/* the caller visible here holds rcu_read_lock() */
+	plan = rcu_dereference(device->rs_plan_s);
+
+	steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; used as a divisor below */
+
+	if (device->rs_in_flight + sect_in == 0) { /* At start of resync */
+		want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
+	} else { /* normal path */
+		want = dc->c_fill_target ? dc->c_fill_target :
+			sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
+	}
+
+	correction = want - device->rs_in_flight - plan->total;
+
+	/* Plan ahead: spread the correction evenly over all planned steps */
+	cps = correction / steps;
+	fifo_add_val(plan, cps);
+	plan->total += cps * steps;
+
+	/* What we do in this step: take the head slot and replace it with 0 */
+	curr_corr = fifo_push(plan, 0);
+	plan->total -= curr_corr;
+
+	req_sect = sect_in + curr_corr;
+	if (req_sect < 0)
+		req_sect = 0;
+
+	max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
+	if (req_sect > max_sect)
+		req_sect = max_sect;
+
+	/*
+	drbd_warn(device, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
+		 sect_in, device->rs_in_flight, want, correction,
+		 steps, cps, device->rs_planed, curr_corr, req_sect);
+	*/
+
+	return req_sect;
+}
+
+static int drbd_rs_number_requests(struct drbd_device *device)
+{	/* Returns how many requests to issue this turn; the result may be <= 0. */
+	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
+	int number, mxb;
+
+	sect_in = atomic_xchg(&device->rs_sect_in, 0);	/* read and reset in one step */
+	device->rs_in_flight -= sect_in;
+
+	rcu_read_lock();
+	mxb = drbd_get_max_buffers(device) / 2;
+	if (rcu_dereference(device->rs_plan_s)->size) {	/* non-empty plan: use the controller */
+		number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9);	/* sectors -> bitmap blocks */
+		device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
+	} else {	/* empty plan: use the configured resync_rate directly */
+		device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
+		number = SLEEP_TIME * device->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
+	}
+	rcu_read_unlock();
+
+	/* Don't have more than "max-buffers"/2 in-flight.
+	 * Otherwise we may cause the remote site to stall on drbd_alloc_pages(),
+	 * potentially causing a distributed deadlock on congestion during
+	 * online-verify or (checksum-based) resync, if max-buffers,
+	 * socket buffer sizes and resync rate settings are mis-configured. */
+	if (mxb - device->rs_in_flight < number)
+		number = mxb - device->rs_in_flight;
+
+	return number;
+}
+
+static int make_resync_request(struct drbd_device *device, int cancel)
+{	/* Issue up to drbd_rs_number_requests() resync requests; the requeue path re-arms resync_timer. */
+	unsigned long bit;
+	sector_t sector;
+	const sector_t capacity = drbd_get_capacity(device->this_bdev);
+	int max_bio_size;
+	int number, rollback_i, size;
+	int align, queued, sndbuf;
+	int i = 0;
+
+	if (unlikely(cancel))
+		return 0;
+
+	if (device->rs_total == 0) {
+		/* empty resync? */
+		drbd_resync_finished(device);
+		return 0;
+	}
+
+	if (!get_ldev(device)) {
+		/* Since we only need to access device->rsync a
+		   get_ldev_if_state(device,D_FAILED) would be sufficient, but
+		   to continue resync with a broken disk makes no sense at
+		   all */
+		drbd_err(device, "Disk broke down during resync!\n");
+		return 0;
+	}
+
+	max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;	/* sectors -> bytes */
+	number = drbd_rs_number_requests(device);
+	if (number <= 0)
+		goto requeue;
+
+	for (i = 0; i < number; i++) {
+		/* Stop generating RS requests, when half of the send buffer is filled */
+		mutex_lock(&first_peer_device(device)->connection->data.mutex);
+		if (first_peer_device(device)->connection->data.socket) {
+			queued = first_peer_device(device)->connection->data.socket->sk->sk_wmem_queued;
+			sndbuf = first_peer_device(device)->connection->data.socket->sk->sk_sndbuf;
+		} else {	/* no socket: values chosen so that the check below requeues */
+			queued = 1;
+			sndbuf = 0;
+		}
+		mutex_unlock(&first_peer_device(device)->connection->data.mutex);
+		if (queued > sndbuf / 2)
+			goto requeue;
+
+next_sector:
+		size = BM_BLOCK_SIZE;
+		bit  = drbd_bm_find_next(device, device->bm_resync_fo);
+
+		if (bit == DRBD_END_OF_BITMAP) {
+			device->bm_resync_fo = drbd_bm_bits(device);
+			put_ldev(device);
+			return 0;
+		}
+
+		sector = BM_BIT_TO_SECT(bit);
+
+		if (drbd_rs_should_slow_down(device, sector) ||
+		    drbd_try_rs_begin_io(device, sector)) {
+			device->bm_resync_fo = bit;	/* retry this same bit on the next turn */
+			goto requeue;
+		}
+		device->bm_resync_fo = bit + 1;
+
+		if (unlikely(drbd_bm_test_bit(device, bit) == 0)) {
+			drbd_rs_complete_io(device, sector);
+			goto next_sector;
+		}
+
+		rollback_i = i;	/* set outside the #if so the -EAGAIN path below never reads an unset value */
+#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
+		/* try to find some adjacent bits.
+		 * we stop if we have already the maximum req size.
+		 *
+		 * Additionally always align bigger requests, in order to
+		 * be prepared for all stripe sizes of software RAIDs.
+		 */
+		align = 1;
+		while (i < number) {
+			if (size + BM_BLOCK_SIZE > max_bio_size)
+				break;
+
+			/* Be always aligned */
+			if (sector & ((1<<(align+3))-1))
+				break;
+
+			/* do not cross extent boundaries */
+			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
+				break;
+			/* now, is it actually dirty, after all?
+			 * caution, drbd_bm_test_bit is tri-state for some
+			 * obscure reason; ( b == 0 ) would get the out-of-band
+			 * only accidentally right because of the "oddly sized"
+			 * adjustment below */
+			if (drbd_bm_test_bit(device, bit+1) != 1)
+				break;
+			bit++;
+			size += BM_BLOCK_SIZE;
+			if ((BM_BLOCK_SIZE << align) <= size)
+				align++;
+			i++;	/* every merged block counts against this turn's budget */
+		}
+		/* if we merged some,
+		 * reset the offset to start the next drbd_bm_find_next from */
+		if (size > BM_BLOCK_SIZE)
+			device->bm_resync_fo = bit + 1;
+#endif
+
+		/* adjust very last sectors, in case we are oddly sized */
+		if (sector + (size>>9) > capacity)
+			size = (capacity-sector)<<9;
+		if (first_peer_device(device)->connection->agreed_pro_version >= 89 &&
+		    first_peer_device(device)->connection->csums_tfm) {
+			switch (read_for_csum(first_peer_device(device), sector, size)) {
+			case -EIO: /* Disk failure */
+				put_ldev(device);
+				return -EIO;
+			case -EAGAIN: /* allocation failed, or ldev busy */
+				drbd_rs_complete_io(device, sector);
+				device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+				i = rollback_i;	/* un-count the blocks merged into this request */
+				goto requeue;
+			case 0:
+				/* everything ok */
+				break;
+			default:
+				BUG();
+			}
+		} else {
+			int err;
+
+			inc_rs_pending(device);
+			err = drbd_send_drequest(first_peer_device(device), P_RS_DATA_REQUEST,
+						 sector, size, ID_SYNCER);
+			if (err) {
+				drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
+				dec_rs_pending(device);	/* undo inc_rs_pending() above */
+				put_ldev(device);
+				return err;
+			}
+		}
+	}
+
+	if (device->bm_resync_fo >= drbd_bm_bits(device)) {
+		/* last syncer _request_ was sent,
+		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
+		 * next sync group will resume), as soon as we receive the last
+		 * resync data block, and the last bit is cleared.
+		 * until then resync "work" is "inactive" ...
+		 */
+		put_ldev(device);
+		return 0;
+	}
+
+ requeue:
+	device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));	/* blocks -> sectors */
+	mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
+	put_ldev(device);
+	return 0;
+}
+
+static int make_ov_request(struct drbd_device *device, int cancel)
+{	/* Send up to drbd_rs_number_requests() online-verify requests starting at ov_position. */
+	int number, i, size;
+	sector_t sector;
+	const sector_t capacity = drbd_get_capacity(device->this_bdev);
+	bool stop_sector_reached = false;
+
+	if (unlikely(cancel))
+		return 1;
+
+	number = drbd_rs_number_requests(device);
+
+	sector = device->ov_position;
+	for (i = 0; i < number; i++) {
+		if (sector >= capacity)
+			return 1;	/* returns without updating ov_position or re-arming the timer */
+
+		/* We check for "finished" only in the reply path:
+		 * w_e_end_ov_reply().
+		 * We need to send at least one request out. */
+		stop_sector_reached = i > 0
+			&& verify_can_do_stop_sector(device)
+			&& sector >= device->ov_stop_sector;
+		if (stop_sector_reached)
+			break;
+
+		size = BM_BLOCK_SIZE;
+
+		if (drbd_rs_should_slow_down(device, sector) ||
+		    drbd_try_rs_begin_io(device, sector)) {
+			device->ov_position = sector;	/* resume from this sector on the next turn */
+			goto requeue;
+		}
+
+		if (sector + (size>>9) > capacity)	/* clamp the last request to the device end */
+			size = (capacity-sector)<<9;
+
+		inc_rs_pending(device);
+		if (drbd_send_ov_request(first_peer_device(device), sector, size)) {
+			dec_rs_pending(device);	/* undo inc_rs_pending() above */
+			return 0;
+		}
+		sector += BM_SECT_PER_BIT;
+	}
+	device->ov_position = sector;
+
+ requeue:
+	device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));	/* blocks -> sectors */
+	if (i == 0 || !stop_sector_reached)
+		mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
+	return 1;
+}
+
+int w_ov_finished(struct drbd_work *w, int cancel)
+{
+	struct drbd_device_work *item = container_of(w, struct drbd_device_work, w);
+	struct drbd_device *dev = item->device;
+
+	kfree(item);	/* the device pointer was copied out before the work item is freed */
+	ov_out_of_sync_print(dev);
+	drbd_resync_finished(dev);
+
+	return 0;
+}
+
+static int w_resync_finished(struct drbd_work *w, int cancel)
+{
+	struct drbd_device_work *item = container_of(w, struct drbd_device_work, w);
+	struct drbd_device *dev = item->device;
+
+	/* Free the work item first, then run the finish step for its device. */
+	kfree(item);
+	drbd_resync_finished(dev);
+
+	return 0;
+}
+
+static void ping_peer(struct drbd_device *device)
+{	/* Request a ping, then wait for GOT_PING_ACK or for the state to drop below C_CONNECTED. */
+	struct drbd_connection *connection = first_peer_device(device)->connection;
+
+	clear_bit(GOT_PING_ACK, &connection->flags);	/* cleared before the new ping is requested */
+	request_ping(connection);
+	wait_event(connection->ping_wait,
+		   test_bit(GOT_PING_ACK, &connection->flags) || device->state.conn < C_CONNECTED);
+}
+
+int drbd_resync_finished(struct drbd_device *device)
+{	/* Conclude a resync or online-verify run; every path through this function returns 1. */
+	unsigned long db, dt, dbdt;
+	unsigned long n_oos;
+	union drbd_state os, ns;
+	struct drbd_device_work *dw;
+	char *khelper_cmd = NULL;
+	int verify_done = 0;
+
+	/* Remove all elements from the resync LRU. Since future actions
+	 * might set bits in the (main) bitmap, then the entries in the
+	 * resync LRU would be wrong. */
+	if (drbd_rs_del_all(device)) {
+		/* In case this is not possible now, most probably because
+		 * there are P_RS_DATA_REPLY Packets lingering on the worker's
+		 * queue (or even the read operations for those packets
+		 * is not finished by now).   Retry in 100ms. */
+
+		schedule_timeout_interruptible(HZ / 10);
+		dw = kmalloc(sizeof(struct drbd_device_work), GFP_ATOMIC);
+		if (dw) {
+			dw->w.cb = w_resync_finished;	/* the callback frees dw and calls back in here */
+			dw->device = device;
+			drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+					&dw->w);
+			return 1;
+		}
+		drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n");
+	}
+
+	dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
+	if (dt == 0)	/* dt is unsigned: only zero needs the guard before the division below */
+		dt = 1;
+
+	db = device->rs_total;
+	/* adjust for verify start and stop sectors, respective reached position */
+	if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+		db -= device->ov_left;
+
+	dbdt = Bit2KB(db/dt);
+	device->rs_paused /= HZ;	/* from here on rs_paused is in seconds */
+
+	if (!get_ldev(device))
+		goto out;
+
+	ping_peer(device);
+
+	spin_lock_irq(&device->resource->req_lock);
+	os = drbd_read_state(device);
+
+	verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);
+
+	/* This protects us against multiple calls (that can happen in the presence
+	   of application IO), and against connectivity loss just before we arrive here. */
+	if (os.conn <= C_CONNECTED)
+		goto out_unlock;
+
+	ns = os;
+	ns.conn = C_CONNECTED;
+
+	drbd_info(device, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
+	     verify_done ? "Online verify" : "Resync",
+	     dt + device->rs_paused, device->rs_paused, dbdt);
+
+	n_oos = drbd_bm_total_weight(device);
+
+	if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
+		if (n_oos) {
+			drbd_alert(device, "Online verify found %lu %dk block out of sync!\n",
+			      n_oos, Bit2KB(1));
+			khelper_cmd = "out-of-sync";
+		}
+	} else {
+		D_ASSERT(device, (n_oos - device->rs_failed) == 0);
+
+		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
+			khelper_cmd = "after-resync-target";
+
+		if (first_peer_device(device)->connection->csums_tfm && device->rs_total) {
+			const unsigned long s = device->rs_same_csum;
+			const unsigned long t = device->rs_total;
+			const unsigned int ratio =	/* unsigned, so that it matches the %u below */
+				(t == 0)     ? 0 :
+				(t < 100000) ? ((s*100)/t) : (s/(t/100));
+			drbd_info(device, "%u %% had equal checksums, eliminated: %luK; "
+			     "transferred %luK total %luK\n",
+			     ratio,
+			     Bit2KB(device->rs_same_csum),
+			     Bit2KB(device->rs_total - device->rs_same_csum),
+			     Bit2KB(device->rs_total));
+		}
+	}
+
+	if (device->rs_failed) {
+		drbd_info(device, "            %lu failed blocks\n", device->rs_failed);
+
+		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
+			ns.disk = D_INCONSISTENT;
+			ns.pdsk = D_UP_TO_DATE;
+		} else {
+			ns.disk = D_UP_TO_DATE;
+			ns.pdsk = D_INCONSISTENT;
+		}
+	} else {
+		ns.disk = D_UP_TO_DATE;
+		ns.pdsk = D_UP_TO_DATE;
+
+		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
+			if (device->p_uuid) {
+				int i;
+				for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
+					_drbd_uuid_set(device, i, device->p_uuid[i]);
+				drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_CURRENT]);
+				_drbd_uuid_set(device, UI_CURRENT, device->p_uuid[UI_CURRENT]);
+			} else {
+				drbd_err(device, "device->p_uuid is NULL! BUG\n");
+			}
+		}
+
+		if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
+			/* for verify runs, we don't update uuids here,
+			 * so there would be nothing to report. */
+			drbd_uuid_set_bm(device, 0UL);
+			drbd_print_uuids(device, "updated UUIDs");
+			if (device->p_uuid) {
+				/* Now the two UUID sets are equal, update what we
+				 * know of the peer. */
+				int i;
+				for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
+					device->p_uuid[i] = device->ldev->md.uuid[i];
+			}
+		}
+	}
+
+	_drbd_set_state(device, ns, CS_VERBOSE, NULL);
+out_unlock:
+	spin_unlock_irq(&device->resource->req_lock);
+	put_ldev(device);	/* pairs with get_ldev() above; the "out" label below skips it */
+out:
+	device->rs_total  = 0;
+	device->rs_failed = 0;
+	device->rs_paused = 0;
+
+	/* reset start sector, if we reached end of device */
+	if (verify_done && device->ov_left == 0)
+		device->ov_start_sector = 0;
+
+	drbd_md_sync(device);
+
+	if (khelper_cmd)
+		drbd_khelper(device, khelper_cmd);
+
+	return 1;
+}
+
+/* helper: queue peer_req on net_ee while it still has an active page, otherwise free it now */
+static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req)
+{
+	if (drbd_peer_req_has_active_page(peer_req)) {
+		/* This might happen if sendpage() has not finished */
+		int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT;	/* page count, rounded up */
+		atomic_add(i, &device->pp_in_use_by_net);	/* move the pages from pp_in_use ... */
+		atomic_sub(i, &device->pp_in_use);		/* ... to pp_in_use_by_net */
+		spin_lock_irq(&device->resource->req_lock);
+		list_add_tail(&peer_req->w.list, &device->net_ee);
+		spin_unlock_irq(&device->resource->req_lock);
+		wake_up(&drbd_pp_wait);
+	} else
+		drbd_free_peer_req(device, peer_req);
+}
+
+/**
+ * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways
+ * Return:	0 when cancelled, otherwise the result of the send call.
+ */
+int w_e_end_data_req(struct drbd_work *w, int cancel)
+{
+	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	int err;
+
+	if (unlikely(cancel)) {
+		drbd_free_peer_req(device, peer_req);
+		dec_unacked(device);
+		return 0;
+	}
+
+	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+		err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req);
+	} else {	/* EE_WAS_ERROR is set: answer with a negative reply instead of data */
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_err(device, "Sending NegDReply. sector=%llus.\n",
+			    (unsigned long long)peer_req->i.sector);
+
+		err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req);
+	}
+
+	dec_unacked(device);
+
+	move_to_net_ee_or_free(device, peer_req);
+
+	if (unlikely(err))	/* the same message is logged for a failed drbd_send_ack() */
+		drbd_err(device, "drbd_send_block() failed\n");
+	return err;
+}
+
+/**
+ * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
+ * @w:		work object (embedded in a struct drbd_peer_request).
+ * @cancel:	The connection will be closed anyways
+ */
+int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
+{
+	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	int err;
+
+	if (unlikely(cancel)) {
+		drbd_free_peer_req(device, peer_req);
+		dec_unacked(device);
+		return 0;
+	}
+
+	if (get_ldev_if_state(device, D_FAILED)) {
+		drbd_rs_complete_io(device, peer_req->i.sector);
+		put_ldev(device);
+	}
+
+	if (device->state.conn == C_AHEAD) {	/* in C_AHEAD: send P_RS_CANCEL instead of data */
+		err = drbd_send_ack(peer_device, P_RS_CANCEL, peer_req);
+	} else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+		if (likely(device->state.pdsk >= D_INCONSISTENT)) {
+			inc_rs_pending(device);
+			err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
+		} else {	/* pdsk below D_INCONSISTENT: nothing is sent */
+			if (__ratelimit(&drbd_ratelimit_state))
+				drbd_err(device, "Not sending RSDataReply, "
+				    "partner DISKLESS!\n");
+			err = 0;
+		}
+	} else {	/* EE_WAS_ERROR is set */
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_err(device, "Sending NegRSDReply. sector %llus.\n",
+			    (unsigned long long)peer_req->i.sector);
+
+		err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
+
+		/* update resync data with failure */
+		drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size);
+	}
+
+	dec_unacked(device);
+
+	move_to_net_ee_or_free(device, peer_req);
+
+	if (unlikely(err))	/* the same message is logged for a failed drbd_send_ack() */
+		drbd_err(device, "drbd_send_block() failed\n");
+	return err;
+}
+
+int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
+{	/* Compare the local checksum with the digest on peer_req; ack "in sync" or send the block. */
+	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	struct digest_info *di;
+	int digest_size;	/* only assigned when csums_tfm is set; only read when digest != NULL */
+	void *digest = NULL;
+	int err, eq = 0;
+
+	if (unlikely(cancel)) {
+		drbd_free_peer_req(device, peer_req);
+		dec_unacked(device);
+		return 0;
+	}
+
+	if (get_ldev(device)) {
+		drbd_rs_complete_io(device, peer_req->i.sector);
+		put_ldev(device);
+	}
+
+	di = peer_req->digest;
+
+	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+		/* quick hack to try to avoid a race against reconfiguration.
+		 * a real fix would be much more involved,
+		 * introducing more locking mechanisms */
+		if (peer_device->connection->csums_tfm) {
+			digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm);
+			D_ASSERT(device, digest_size == di->digest_size);
+			digest = kmalloc(digest_size, GFP_NOIO);
+		}
+		if (digest) {	/* skipped when csums_tfm is NULL or kmalloc() failed; eq then stays 0 */
+			drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
+			eq = !memcmp(digest, di->digest, digest_size);
+			kfree(digest);
+		}
+
+		if (eq) {
+			drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size);
+			/* rs_same_csums unit is BM_BLOCK_SIZE */
+			device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
+			err = drbd_send_ack(peer_device, P_RS_IS_IN_SYNC, peer_req);
+		} else {
+			inc_rs_pending(device);
+			peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
+			peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
+			kfree(di);
+			err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
+		}
+	} else {	/* EE_WAS_ERROR is set */
+		err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
+	}
+
+	dec_unacked(device);
+	move_to_net_ee_or_free(device, peer_req);
+
+	if (unlikely(err))
+		drbd_err(device, "drbd_send_block/ack() failed\n");
+	return err;
+}
+
+int w_e_end_ov_req(struct drbd_work *w, int cancel)
+{	/* Checksum the data read for an online-verify request and send it as P_OV_REPLY. */
+	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	sector_t sector = peer_req->i.sector;	/* copied now: peer_req is freed before the send */
+	unsigned int size = peer_req->i.size;
+	int digest_size;
+	void *digest;
+	int err = 0;
+
+	if (unlikely(cancel))
+		goto out;
+
+	digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm);
+	digest = kmalloc(digest_size, GFP_NOIO);
+	if (!digest) {
+		err = 1;	/* terminate the connection in case the allocation failed */
+		goto out;
+	}
+
+	if (likely(!(peer_req->flags & EE_WAS_ERROR)))
+		drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
+	else	/* EE_WAS_ERROR is set: send an all-zero digest */
+		memset(digest, 0, digest_size);
+
+	/* Free e and pages before send.
+	 * In case we block on congestion, we could otherwise run into
+	 * some distributed deadlock, if the other side blocks on
+	 * congestion as well, because our receiver blocks in
+	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
+	drbd_free_peer_req(device, peer_req);
+	peer_req = NULL;	/* so the "out" path does not free it a second time */
+	inc_rs_pending(device);
+	err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_OV_REPLY);
+	if (err)
+		dec_rs_pending(device);	/* undo inc_rs_pending() above */
+	kfree(digest);
+
+out:
+	if (peer_req)
+		drbd_free_peer_req(device, peer_req);
+	dec_unacked(device);
+	return err;
+}
+
+void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size)
+{
+	if (device->ov_last_oos_start + device->ov_last_oos_size != sector) {
+		device->ov_last_oos_start = sector;	/* not adjacent: start a new run */
+		device->ov_last_oos_size = size >> 9;
+	} else {
+		device->ov_last_oos_size += size >> 9;	/* adjacent: extend the current run */
+	}
+	drbd_set_out_of_sync(device, sector, size);
+}
+
+int w_e_end_ov_reply(struct drbd_work *w, int cancel)
+{	/* Compare the local checksum with the received digest and report it as P_OV_RESULT. */
+	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	struct digest_info *di;
+	void *digest;
+	sector_t sector = peer_req->i.sector;	/* copied now: peer_req is freed before the send */
+	unsigned int size = peer_req->i.size;
+	int digest_size;
+	int err, eq = 0;
+	bool stop_sector_reached = false;
+
+	if (unlikely(cancel)) {
+		drbd_free_peer_req(device, peer_req);
+		dec_unacked(device);
+		return 0;
+	}
+
+	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
+	 * the resync lru has been cleaned up already */
+	if (get_ldev(device)) {
+		drbd_rs_complete_io(device, peer_req->i.sector);
+		put_ldev(device);
+	}
+
+	di = peer_req->digest;
+
+	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+		digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm);
+		digest = kmalloc(digest_size, GFP_NOIO);
+		if (digest) {	/* if kmalloc() failed, eq stays 0 and the block is reported out of sync */
+			drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
+
+			D_ASSERT(device, digest_size == di->digest_size);
+			eq = !memcmp(digest, di->digest, digest_size);
+			kfree(digest);
+		}
+	}
+
+	/* Free peer_req and pages before send.
+	 * In case we block on congestion, we could otherwise run into
+	 * some distributed deadlock, if the other side blocks on
+	 * congestion as well, because our receiver blocks in
+	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
+	drbd_free_peer_req(device, peer_req);
+	if (!eq)
+		drbd_ov_out_of_sync_found(device, sector, size);
+	else
+		ov_out_of_sync_print(device);
+
+	err = drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size,
+			       eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
+
+	dec_unacked(device);
+
+	--device->ov_left;	/* one fewer block left to verify */
+
+	/* let's advance progress step marks only for every other megabyte */
+	if ((device->ov_left & 0x200) == 0x200)
+		drbd_advance_rs_marks(device, device->ov_left);
+
+	stop_sector_reached = verify_can_do_stop_sector(device) &&
+		(sector + (size>>9)) >= device->ov_stop_sector;
+
+	if (device->ov_left == 0 || stop_sector_reached) {
+		ov_out_of_sync_print(device);
+		drbd_resync_finished(device);
+	}
+
+	return err;
+}
+
+/* FIXME
+ * We need to track the number of pending barrier acks,
+ * and to be able to wait for them.
+ * See also comment in drbd_adm_attach before drbd_suspend_io.
+ */
+static int drbd_send_barrier(struct drbd_connection *connection)
+{	/* Send P_BARRIER for send.current_epoch_nr on the data socket. */
+	struct p_barrier *p;
+	struct drbd_socket *sock;
+
+	sock = &connection->data;
+	p = conn_prepare_command(connection, sock);
+	if (!p)
+		return -EIO;
+	p->barrier = connection->send.current_epoch_nr;
+	p->pad = 0;
+	connection->send.current_epoch_writes = 0;	/* reset before the send is attempted */
+
+	return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
+}
+
+int w_send_write_hint(struct drbd_work *w, int cancel)
+{	/* Worker callback for device->unplug_work: send P_UNPLUG_REMOTE on the data socket. */
+	struct drbd_device *device =
+		container_of(w, struct drbd_device, unplug_work);
+	struct drbd_socket *sock;
+
+	if (cancel)
+		return 0;
+	sock = &first_peer_device(device)->connection->data;
+	if (!drbd_prepare_command(first_peer_device(device), sock))
+		return -EIO;
+	return drbd_send_command(first_peer_device(device), sock, P_UNPLUG_REMOTE, 0, NULL, 0);	/* header size 0, no payload */
+}
+
+static void re_init_if_first_write(struct drbd_connection *connection, unsigned int epoch)
+{
+	if (connection->send.seen_any_write_yet)
+		return;		/* not the first write: leave the epoch tracking untouched */
+	connection->send.seen_any_write_yet = true;
+	connection->send.current_epoch_nr = epoch;
+	connection->send.current_epoch_writes = 0;
+}
+
+static void maybe_send_barrier(struct drbd_connection *connection, unsigned int epoch)
+{
+	/* no write seen on this connection yet: there is no epoch to close */
+	if (!connection->send.seen_any_write_yet)
+		return;
+	if (connection->send.current_epoch_nr != epoch) {
+		if (connection->send.current_epoch_writes)	/* barrier only if the old epoch had writes */
+			drbd_send_barrier(connection);	/* return value is ignored */
+		connection->send.current_epoch_nr = epoch;
+	}
+}
+
+int w_send_out_of_sync(struct drbd_work *w, int cancel)
+{	/* Worker callback: send an out-of-sync notification for @w's request. */
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+	struct drbd_device *device = req->device;
+	struct drbd_connection *connection = first_peer_device(device)->connection;
+	int err;
+
+	if (unlikely(cancel)) {
+		req_mod(req, SEND_CANCELED);
+		return 0;
+	}
+
+	/* this time, no connection->send.current_epoch_writes++;
+	 * If it was sent, it was the closing barrier for the last
+	 * replicated epoch, before we went into AHEAD mode.
+	 * No more barriers will be sent, until we leave AHEAD mode again. */
+	maybe_send_barrier(connection, req->epoch);
+
+	err = drbd_send_out_of_sync(first_peer_device(device), req);
+	req_mod(req, OOS_HANDED_TO_NETWORK);	/* applied whether or not the send failed */
+
+	return err;
+}
+
+/**
+ * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
+ * @w:		work object (embedded in a struct drbd_request).
+ * @cancel:	The connection will be closed anyways
+ */
+int w_send_dblock(struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+	struct drbd_device *device = req->device;
+	struct drbd_connection *connection = first_peer_device(device)->connection;
+	int err;
+
+	if (unlikely(cancel)) {
+		req_mod(req, SEND_CANCELED);
+		return 0;
+	}
+
+	re_init_if_first_write(connection, req->epoch);
+	maybe_send_barrier(connection, req->epoch);
+	connection->send.current_epoch_writes++;	/* counted before the send is attempted */
+
+	err = drbd_send_dblock(first_peer_device(device), req);
+	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
+
+	return err;
+}
+
+/**
+ * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
+ * @w:		work object (embedded in a struct drbd_request).
+ * @cancel:	The connection will be closed anyways
+ */
+int w_send_read_req(struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+	struct drbd_device *device = req->device;
+	struct drbd_connection *connection = first_peer_device(device)->connection;
+	int err;
+
+	if (unlikely(cancel)) {
+		req_mod(req, SEND_CANCELED);
+		return 0;
+	}
+
+	/* Even read requests may close a write epoch,
+	 * if there was any yet. */
+	maybe_send_barrier(connection, req->epoch);
+
+	err = drbd_send_drequest(first_peer_device(device), P_DATA_REQUEST, req->i.sector, req->i.size,
+				 (unsigned long)req);	/* the request pointer is passed as the block id */
+
+	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
+
+	return err;
+}
+
+int w_restart_disk_io(struct drbd_work *w, int cancel)
+{	/* Re-issue req->master_bio to the backing device; @cancel is not looked at. */
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+	struct drbd_device *device = req->device;
+
+	if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
+		drbd_al_begin_io(device, &req->i, false);
+
+	drbd_req_make_private_bio(req, req->master_bio);
+	req->private_bio->bi_bdev = device->ldev->backing_bdev;	/* device->ldev is used without get_ldev() here */
+	generic_make_request(req->private_bio);
+
+	return 0;
+}
+
+static int _drbd_may_sync_now(struct drbd_device *device)
+{	/* Follow the resync_after chain from @device: 0 if a device on it blocks us, else 1. */
+	struct drbd_device *odev = device;
+	int resync_after;
+
+	while (1) {
+		if (!odev->ldev || odev->state.disk == D_DISKLESS)
+			return 1;	/* the chain cannot be followed past a device without a disk */
+		rcu_read_lock();
+		resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
+		rcu_read_unlock();
+		if (resync_after == -1)
+			return 1;	/* -1: end of the chain */
+		odev = minor_to_device(resync_after);
+		if (!odev)
+			return 1;	/* that minor does not exist */
+		if ((odev->state.conn >= C_SYNC_SOURCE &&
+		     odev->state.conn <= C_PAUSED_SYNC_T) ||
+		    odev->state.aftr_isp || odev->state.peer_isp ||
+		    odev->state.user_isp)
+			return 0;	/* a device we depend on is syncing or has a pause flag set */
+	}
+}
+
+/**
+ * _drbd_pause_after() - Pause resync on all devices that may not resync now
+ * @device:	DRBD device (not used; every entry of drbd_devices is examined).
+ *
+ * Called from process context only (admin command and after_state_ch).
+ */
+static int _drbd_pause_after(struct drbd_device *device)
+{
+	struct drbd_device *odev;
+	int i, rv = 0;
+
+	rcu_read_lock();
+	idr_for_each_entry(&drbd_devices, odev, i) {
+		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
+			continue;
+		if (!_drbd_may_sync_now(odev))
+			rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
+			       != SS_NOTHING_TO_DO);
+	}
+	rcu_read_unlock();
+
+	return rv;	/* 1 if any call returned something other than SS_NOTHING_TO_DO, else 0 */
+}
+
+/**
+ * _drbd_resume_next() - Resume resync on all devices that may resync now
+ * @device:	DRBD device (not used; every entry of drbd_devices is examined).
+ *
+ * Called from process context only (admin command and worker).
+ */
+static int _drbd_resume_next(struct drbd_device *device)
+{
+	struct drbd_device *odev;
+	int i, rv = 0;
+
+	rcu_read_lock();
+	idr_for_each_entry(&drbd_devices, odev, i) {
+		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
+			continue;
+		if (odev->state.aftr_isp) {	/* only devices that currently have aftr_isp set */
+			if (_drbd_may_sync_now(odev))
+				rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
+							CS_HARD, NULL)
+				       != SS_NOTHING_TO_DO) ;
+		}
+	}
+	rcu_read_unlock();
+	return rv;	/* 1 if any call returned something other than SS_NOTHING_TO_DO, else 0 */
+}
+
+void resume_next_sg(struct drbd_device *device)
+{	/* _drbd_resume_next() run under the global_state_lock write lock; its result is dropped. */
+	write_lock_irq(&global_state_lock);
+	_drbd_resume_next(device);
+	write_unlock_irq(&global_state_lock);
+}
+
+void suspend_other_sg(struct drbd_device *device)
+{	/* _drbd_pause_after() run under the global_state_lock write lock; its result is dropped. */
+	write_lock_irq(&global_state_lock);
+	_drbd_pause_after(device);
+	write_unlock_irq(&global_state_lock);
+}
+
+/* caller must hold global_state_lock; checks whether @device may resync after minor @o_minor */
+enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
+{
+	struct drbd_device *odev;
+	int resync_after;
+
+	if (o_minor == -1)
+		return NO_ERROR;	/* -1 means "no dependency" */
+	if (o_minor < -1 || o_minor > MINORMASK)
+		return ERR_RESYNC_AFTER;	/* outside the valid minor range */
+
+	/* check for loops */
+	odev = minor_to_device(o_minor);
+	while (1) {
+		if (odev == device)
+			return ERR_RESYNC_AFTER_CYCLE;	/* the chain leads back to @device */
+
+		/* You are free to depend on diskless, non-existing,
+		 * or not yet/no longer existing minors.
+		 * We only reject dependency loops.
+		 * We cannot follow the dependency chain beyond a detached or
+		 * missing minor.
+		 */
+		if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
+			return NO_ERROR;
+
+		rcu_read_lock();
+		resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
+		rcu_read_unlock();
+		/* dependency chain ends here, no cycles. */
+		if (resync_after == -1)
+			return NO_ERROR;
+
+		/* follow the dependency chain */
+		odev = minor_to_device(resync_after);
+	}
+}
+
+/* caller must hold global_state_lock; @device is only passed through to the two helpers */
+void drbd_resync_after_changed(struct drbd_device *device)
+{
+	int changes;
+
+	do {	/* repeat pause + resume passes until neither reports a change */
+		changes  = _drbd_pause_after(device);
+		changes |= _drbd_resume_next(device);
+	} while (changes);
+}
+
+void drbd_rs_controller_reset(struct drbd_device *device)
+{
+	struct fifo_buffer *plan;
+
+	atomic_set(&device->rs_sect_in, 0);	/* zero the resync sector counters */
+	atomic_set(&device->rs_sect_ev, 0);
+	device->rs_in_flight = 0;
+
+	/* Updating the RCU protected object in place is necessary since
+	   this function gets called from atomic context.
+	   It is valid since all other updates also lead to a completely
+	   empty fifo */
+	rcu_read_lock();
+	plan = rcu_dereference(device->rs_plan_s);
+	plan->total = 0;
+	fifo_set(plan, 0);	/* presumably sets every fifo slot to 0 -- see fifo_set() */
+	rcu_read_unlock();
+}
+
+void start_resync_timer_fn(unsigned long data)	/* @data carries the struct drbd_device pointer */
+{
+	struct drbd_device *device = (struct drbd_device *) data;
+
+	drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+			&device->start_resync_work);	/* hand start_resync_work to the sender_work queue */
+}
+
+int w_start_resync(struct drbd_work *w, int cancel)	/* @cancel is not examined here */
+{
+	struct drbd_device *device =
+		container_of(w, struct drbd_device, start_resync_work);
+
+	if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
+		drbd_warn(device, "w_start_resync later...\n");
+		device->start_resync_timer.expires = jiffies + HZ/10;	/* either counter non-zero: retry later */
+		add_timer(&device->start_resync_timer);
+		return 0;
+	}
+
+	drbd_start_resync(device, C_SYNC_SOURCE);	/* always started as sync source from here */
+	clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
+	return 0;	/* always 0; drbd_worker() treats a non-zero callback result as failure */
+}
+
+/**
+ * drbd_start_resync() - Start the resync process
+ * @device:	DRBD device.
+ * @side:	Either C_SYNC_SOURCE or C_SYNC_TARGET
+ *
+ * Runs the before-resync-* helper, takes state_mutex (the worker only tries and
+ * retries via timer) and may go directly into one of the C_PAUSED_SYNC_* states.
+ */
+void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
+{
+	union drbd_state ns;
+	int r;
+
+	if (device->state.conn >= C_SYNC_SOURCE && device->state.conn < C_AHEAD) {
+		drbd_err(device, "Resync already running!\n");
+		return;
+	}
+
+	if (!test_bit(B_RS_H_DONE, &device->flags)) {	/* set below on retry, so the helper runs once */
+		if (side == C_SYNC_TARGET) {
+			/* Since application IO was locked out during C_WF_BITMAP_T and
+			   C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET,
+			   where we might make the data inconsistent, ask the handler. */
+			r = drbd_khelper(device, "before-resync-target");
+			r = (r >> 8) & 0xff;	/* presumably the helper's exit code -- TODO confirm */
+			if (r > 0) {
+				drbd_info(device, "before-resync-target handler returned %d, "
+					 "dropping connection.\n", r);
+				conn_request_state(first_peer_device(device)->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+				return;
+			}
+		} else /* C_SYNC_SOURCE */ {
+			r = drbd_khelper(device, "before-resync-source");
+			r = (r >> 8) & 0xff;	/* presumably the helper's exit code -- TODO confirm */
+			if (r > 0) {
+				if (r == 3) {
+					drbd_info(device, "before-resync-source handler returned %d, "
+						 "ignoring. Old userland tools?\n", r);
+				} else {
+					drbd_info(device, "before-resync-source handler returned %d, "
+						 "dropping connection.\n", r);
+					conn_request_state(first_peer_device(device)->connection,
+							   NS(conn, C_DISCONNECTING), CS_HARD);
+					return;
+				}
+			}
+		}
+	}
+
+	if (current == first_peer_device(device)->connection->worker.task) {
+		/* The worker should not sleep waiting for state_mutex,
+		   that can take long */
+		if (!mutex_trylock(device->state_mutex)) {
+			set_bit(B_RS_H_DONE, &device->flags);
+			device->start_resync_timer.expires = jiffies + HZ/5;
+			add_timer(&device->start_resync_timer);
+			return;
+		}
+	} else {
+		mutex_lock(device->state_mutex);
+	}
+	clear_bit(B_RS_H_DONE, &device->flags);
+
+	/* req_lock: serialize with drbd_send_and_submit() and others
+	 * global_state_lock: for stable sync-after dependencies */
+	spin_lock_irq(&device->resource->req_lock);
+	write_lock(&global_state_lock);
+	/* Did some connection breakage or IO error race with us? */
+	if (device->state.conn < C_CONNECTED
+	|| !get_ldev_if_state(device, D_NEGOTIATING)) {
+		write_unlock(&global_state_lock);
+		spin_unlock_irq(&device->resource->req_lock);
+		mutex_unlock(device->state_mutex);
+		return;
+	}
+
+	ns = drbd_read_state(device);
+
+	ns.aftr_isp = !_drbd_may_sync_now(device);
+
+	ns.conn = side;
+
+	if (side == C_SYNC_TARGET)
+		ns.disk = D_INCONSISTENT;
+	else /* side == C_SYNC_SOURCE */
+		ns.pdsk = D_INCONSISTENT;
+
+	r = __drbd_set_state(device, ns, CS_VERBOSE, NULL);
+	ns = drbd_read_state(device);	/* re-read: the resulting state may differ from the request */
+
+	if (ns.conn < C_CONNECTED)
+		r = SS_UNKNOWN_ERROR;
+
+	if (r == SS_SUCCESS) {
+		unsigned long tw = drbd_bm_total_weight(device);
+		unsigned long now = jiffies;
+		int i;
+
+		device->rs_failed    = 0;
+		device->rs_paused    = 0;
+		device->rs_same_csum = 0;
+		device->rs_last_events = 0;
+		device->rs_last_sect_ev = 0;
+		device->rs_total     = tw;
+		device->rs_start     = now;
+		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+			device->rs_mark_left[i] = tw;
+			device->rs_mark_time[i] = now;
+		}
+		_drbd_pause_after(device);
+	}
+	write_unlock(&global_state_lock);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	if (r == SS_SUCCESS) {
+		/* reset rs_last_bcast when a resync or verify is started,
+		 * to deal with potential jiffies wrap. */
+		device->rs_last_bcast = jiffies - HZ;
+
+		drbd_info(device, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
+		     drbd_conn_str(ns.conn),
+		     (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
+		     (unsigned long) device->rs_total);
+		if (side == C_SYNC_TARGET)
+			device->bm_resync_fo = 0;
+
+		/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
+		 * with w_send_oos, or the sync target will get confused as to
+		 * how many bits to resync.  We cannot do that always, because for an
+		 * empty resync and protocol < 95, we need to do it here, as we call
+		 * drbd_resync_finished from here in that case.
+		 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
+		 * and from after_state_ch otherwise. */
+		if (side == C_SYNC_SOURCE &&
+		    first_peer_device(device)->connection->agreed_pro_version < 96)
+			drbd_gen_and_send_sync_uuid(first_peer_device(device));
+
+		if (first_peer_device(device)->connection->agreed_pro_version < 95 &&
+		    device->rs_total == 0) {
+			/* This still has a race (about when exactly the peers
+			 * detect connection loss) that can lead to a full sync
+			 * on next handshake. In 8.3.9 we fixed this with explicit
+			 * resync-finished notifications, but the fix
+			 * introduces a protocol change.  Sleeping for some
+			 * time longer than the ping interval + timeout on the
+			 * SyncSource, to give the SyncTarget the chance to
+			 * detect connection loss, then waiting for a ping
+			 * response (implicit in drbd_resync_finished) reduces
+			 * the race considerably, but does not solve it. */
+			if (side == C_SYNC_SOURCE) {
+				struct net_conf *nc;
+				int timeo;
+
+				rcu_read_lock();
+				nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+				timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
+				rcu_read_unlock();
+				schedule_timeout_interruptible(timeo);
+			}
+			drbd_resync_finished(device);
+		}
+
+		drbd_rs_controller_reset(device);
+		/* ns.conn may already be != device->state.conn,
+		 * we may have been paused in between, or become paused until
+		 * the timer triggers.
+		 * No matter, that is handled in resync_timer_fn() */
+		if (ns.conn == C_SYNC_TARGET)
+			mod_timer(&device->resync_timer, jiffies);
+
+		drbd_md_sync(device);
+	}
+	put_ldev(device);	/* pairs with the successful get_ldev_if_state() above */
+	mutex_unlock(device->state_mutex);
+}
+
+static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
+{
+	spin_lock_irq(&queue->q_lock);
+	list_splice_init(&queue->q, work_list);	/* move everything queued onto work_list at once */
+	spin_unlock_irq(&queue->q_lock);
+	return !list_empty(work_list);	/* true if work_list now holds anything, old or new */
+}
+
+static bool dequeue_work_item(struct drbd_work_queue *queue, struct list_head *work_list)
+{
+	spin_lock_irq(&queue->q_lock);
+	if (!list_empty(&queue->q))	/* take at most one item: the head of the queue */
+		list_move(queue->q.next, work_list);
+	spin_unlock_irq(&queue->q_lock);
+	return !list_empty(work_list);	/* true if work_list now holds anything, old or new */
+}
+
+static void wait_for_work(struct drbd_connection *connection, struct list_head *work_list)
+{
+	DEFINE_WAIT(wait);
+	struct net_conf *nc;
+	int uncork, cork;
+
+	dequeue_work_item(&connection->sender_work, work_list);
+	if (!list_empty(work_list))	/* fast path: something to do, no need to sleep */
+		return;
+
+	/* Still nothing to do?
+	 * Maybe we still need to close the current epoch,
+	 * even if no new requests are queued yet.
+	 *
+	 * Also, poke TCP, just in case.
+	 * Then wait for new work (or signal). */
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	uncork = nc ? nc->tcp_cork : 0;	/* 0 as well when there is no net_conf */
+	rcu_read_unlock();
+	if (uncork) {
+		mutex_lock(&connection->data.mutex);
+		if (connection->data.socket)
+			drbd_tcp_uncork(connection->data.socket);
+		mutex_unlock(&connection->data.mutex);
+	}
+
+	for (;;) {
+		int send_barrier;
+		prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
+		spin_lock_irq(&connection->resource->req_lock);
+		spin_lock(&connection->sender_work.q_lock);	/* FIXME get rid of this one? */
+		/* dequeue single item only,
+		 * we still use drbd_queue_work_front() in some places */
+		if (!list_empty(&connection->sender_work.q))
+			list_move(connection->sender_work.q.next, work_list);
+		spin_unlock(&connection->sender_work.q_lock);	/* FIXME get rid of this one? */
+		if (!list_empty(work_list) || signal_pending(current)) {
+			spin_unlock_irq(&connection->resource->req_lock);
+			break;
+		}
+
+		/* We found nothing new to do, no to-be-communicated request,
+		 * no other work item.  We may still need to close the last
+		 * epoch.  Next incoming request epoch will be connection ->
+		 * current transfer log epoch number.  If that is different
+		 * from the epoch of the last request we communicated, it is
+		 * safe to send the epoch separating barrier now.
+		 */
+		send_barrier =
+			atomic_read(&connection->current_tle_nr) !=
+			connection->send.current_epoch_nr;
+		spin_unlock_irq(&connection->resource->req_lock);
+
+		if (send_barrier)
+			maybe_send_barrier(connection,
+					connection->send.current_epoch_nr + 1);
+		schedule();
+		/* may also be woken up for things other than new work,
+		 * e.g. if the current epoch got closed.
+		 * In which case we send the barrier above. */
+	}
+	finish_wait(&connection->sender_work.q_wait, &wait);
+
+	/* someone may have changed the config while we have been waiting above. */
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	cork = nc ? nc->tcp_cork : 0;
+	rcu_read_unlock();
+	mutex_lock(&connection->data.mutex);
+	if (connection->data.socket) {
+		if (cork)
+			drbd_tcp_cork(connection->data.socket);
+		else if (!uncork)	/* no uncork was attempted before the wait loop */
+			drbd_tcp_uncork(connection->data.socket);
+	}
+	mutex_unlock(&connection->data.mutex);
+}
+
+int drbd_worker(struct drbd_thread *thi)	/* per-connection worker loop; always returns 0 */
+{
+	struct drbd_connection *connection = thi->connection;
+	struct drbd_work *w = NULL;
+	struct drbd_peer_device *peer_device;
+	LIST_HEAD(work_list);
+	int vnr;
+
+	while (get_t_state(thi) == RUNNING) {
+		drbd_thread_current_set_cpu(thi);
+
+		/* as long as we use drbd_queue_work_front(),
+		 * we may only dequeue single work items here, not batches. */
+		if (list_empty(&work_list))
+			wait_for_work(connection, &work_list);
+
+		if (signal_pending(current)) {
+			flush_signals(current);
+			if (get_t_state(thi) == RUNNING) {	/* signal without a state change */
+				drbd_warn(connection, "Worker got an unexpected signal\n");
+				continue;
+			}
+			break;
+		}
+
+		if (get_t_state(thi) != RUNNING)
+			break;
+
+		while (!list_empty(&work_list)) {
+			w = list_first_entry(&work_list, struct drbd_work, list);
+			list_del_init(&w->list);
+			if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
+				continue;	/* callback returned 0: next item */
+			if (connection->cstate >= C_WF_REPORT_PARAMS)	/* non-zero result: force C_NETWORK_FAILURE */
+				conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+		}
+	}
+
+	do {	/* no longer RUNNING: drain whatever is left, in batches */
+		while (!list_empty(&work_list)) {
+			w = list_first_entry(&work_list, struct drbd_work, list);
+			list_del_init(&w->list);
+			w->cb(w, 1);	/* 1: presumably the "cancel" argument, cf. w_start_resync() */
+		}
+		dequeue_work_batch(&connection->sender_work, &work_list);
+	} while (!list_empty(&work_list));
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		D_ASSERT(device, device->state.disk == D_DISKLESS && device->state.conn == C_STANDALONE);
+		kref_get(&device->kref);	/* hold a reference across the unlocked section */
+		rcu_read_unlock();
+		drbd_device_cleanup(device);	/* called outside the RCU read-side section */
+		kref_put(&device->kref, drbd_destroy_device);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 862b40c9018..56d46ffb08e 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -144,13 +144,23 @@
  * Better audit of register_blkdev.
  */
 
-#define FLOPPY_SANITY_CHECK
 #undef  FLOPPY_SILENT_DCL_CLEAR
 
 #define REALLY_SLOW_IO
 
 #define DEBUGT 2
-#define DCL_DEBUG	/* debug disk change line */
+
+#define DPRINT(format, args...) \
+	pr_info("floppy%d: " format, current_drive, ##args)
+
+#define DCL_DEBUG		/* debug disk change line */
+#ifdef DCL_DEBUG
+#define debug_dcl(test, fmt, args...) \
+	do { if ((test) & FD_DEBUG) DPRINT(fmt, ##args); } while (0)
+#else
+#define debug_dcl(test, fmt, args...) \
+	do { if (0) DPRINT(fmt, ##args); } while (0)
+#endif
 
 /* do print messages for unexpected interrupts */
 static int print_unex = 1;
@@ -178,21 +188,21 @@ static int print_unex = 1;
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/mod_devicetable.h>
-#include <linux/buffer_head.h>	/* for invalidate_buffers() */
 #include <linux/mutex.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
+#include <linux/async.h>
 
 /*
  * PS/2 floppies have much slower step rates than regular floppies.
  * It's been recommended that take about 1/4 of the default speed
  * in some more extreme cases.
  */
+static DEFINE_MUTEX(floppy_mutex);
 static int slow_floppy;
 
 #include <asm/dma.h>
 #include <asm/irq.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/uaccess.h>
 
 static int FLOPPY_IRQ = 6;
 static int FLOPPY_DMA = 2;
@@ -241,16 +251,14 @@ static int allowed_drive_mask = 0x33;
 
 static int irqdma_allocated;
 
-#define DEVICE_NAME "floppy"
-
 #include <linux/blkdev.h>
 #include <linux/blkpg.h>
 #include <linux/cdrom.h>	/* for the compatibility eject ioctl */
 #include <linux/completion.h>
 
 static struct request *current_req;
-static struct request_queue *floppy_queue;
-static void do_fd_request(struct request_queue * q);
+static void do_fd_request(struct request_queue *q);
+static int set_next_request(void);
 
 #ifndef fd_get_dma_residue
 #define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA)
@@ -263,7 +271,7 @@ static void do_fd_request(struct request_queue * q);
 #endif
 
 #ifndef fd_dma_mem_alloc
-#define fd_dma_mem_alloc(size) __get_dma_pages(GFP_KERNEL,get_order(size))
+#define fd_dma_mem_alloc(size) __get_dma_pages(GFP_KERNEL, get_order(size))
 #endif
 
 static inline void fallback_on_nodma_alloc(char **addr, size_t l)
@@ -273,7 +281,7 @@ static inline void fallback_on_nodma_alloc(char **addr, size_t l)
 		return;		/* we have the memory */
 	if (can_use_virtual_dma != 2)
 		return;		/* no fallback allowed */
-	printk("DMA memory shortage. Temporarily falling back on virtual DMA\n");
+	pr_info("DMA memory shortage. Temporarily falling back on virtual DMA\n");
 	*addr = (char *)nodma_mem_alloc(l);
 #else
 	return;
@@ -283,59 +291,50 @@ static inline void fallback_on_nodma_alloc(char **addr, size_t l)
 /* End dma memory related stuff */
 
 static unsigned long fake_change;
-static int initialising = 1;
+static bool initialized;
 
-#define ITYPE(x) (((x)>>2) & 0x1f)
-#define TOMINOR(x) ((x & 3) | ((x & 4) << 5))
-#define UNIT(x) ((x) & 0x03)	/* drive on fdc */
-#define FDC(x) (((x) & 0x04) >> 2)	/* fdc of drive */
+#define ITYPE(x)	(((x) >> 2) & 0x1f)
+#define TOMINOR(x)	((x & 3) | ((x & 4) << 5))
+#define UNIT(x)		((x) & 0x03)		/* drive on fdc */
+#define FDC(x)		(((x) & 0x04) >> 2)	/* fdc of drive */
 	/* reverse mapping from unit and fdc to drive */
 #define REVDRIVE(fdc, unit) ((unit) + ((fdc) << 2))
-#define DP (&drive_params[current_drive])
-#define DRS (&drive_state[current_drive])
-#define DRWE (&write_errors[current_drive])
-#define FDCS (&fdc_state[fdc])
-#define CLEARF(x) clear_bit(x##_BIT, &DRS->flags)
-#define SETF(x) set_bit(x##_BIT, &DRS->flags)
-#define TESTF(x) test_bit(x##_BIT, &DRS->flags)
-
-#define UDP (&drive_params[drive])
-#define UDRS (&drive_state[drive])
-#define UDRWE (&write_errors[drive])
-#define UFDCS (&fdc_state[FDC(drive)])
-#define UCLEARF(x) clear_bit(x##_BIT, &UDRS->flags)
-#define USETF(x) set_bit(x##_BIT, &UDRS->flags)
-#define UTESTF(x) test_bit(x##_BIT, &UDRS->flags)
 
-#define DPRINT(format, args...) printk(DEVICE_NAME "%d: " format, current_drive , ## args)
+#define DP	(&drive_params[current_drive])
+#define DRS	(&drive_state[current_drive])
+#define DRWE	(&write_errors[current_drive])
+#define FDCS	(&fdc_state[fdc])
 
-#define PH_HEAD(floppy,head) (((((floppy)->stretch & 2) >>1) ^ head) << 2)
-#define STRETCH(floppy) ((floppy)->stretch & FD_STRETCH)
+#define UDP	(&drive_params[drive])
+#define UDRS	(&drive_state[drive])
+#define UDRWE	(&write_errors[drive])
+#define UFDCS	(&fdc_state[FDC(drive)])
 
-#define CLEARSTRUCT(x) memset((x), 0, sizeof(*(x)))
+#define PH_HEAD(floppy, head) (((((floppy)->stretch & 2) >> 1) ^ head) << 2)
+#define STRETCH(floppy)	((floppy)->stretch & FD_STRETCH)
 
 /* read/write */
-#define COMMAND raw_cmd->cmd[0]
-#define DR_SELECT raw_cmd->cmd[1]
-#define TRACK raw_cmd->cmd[2]
-#define HEAD raw_cmd->cmd[3]
-#define SECTOR raw_cmd->cmd[4]
-#define SIZECODE raw_cmd->cmd[5]
-#define SECT_PER_TRACK raw_cmd->cmd[6]
-#define GAP raw_cmd->cmd[7]
-#define SIZECODE2 raw_cmd->cmd[8]
+#define COMMAND		(raw_cmd->cmd[0])
+#define DR_SELECT	(raw_cmd->cmd[1])
+#define TRACK		(raw_cmd->cmd[2])
+#define HEAD		(raw_cmd->cmd[3])
+#define SECTOR		(raw_cmd->cmd[4])
+#define SIZECODE	(raw_cmd->cmd[5])
+#define SECT_PER_TRACK	(raw_cmd->cmd[6])
+#define GAP		(raw_cmd->cmd[7])
+#define SIZECODE2	(raw_cmd->cmd[8])
 #define NR_RW 9
 
 /* format */
-#define F_SIZECODE raw_cmd->cmd[2]
-#define F_SECT_PER_TRACK raw_cmd->cmd[3]
-#define F_GAP raw_cmd->cmd[4]
-#define F_FILL raw_cmd->cmd[5]
+#define F_SIZECODE	(raw_cmd->cmd[2])
+#define F_SECT_PER_TRACK (raw_cmd->cmd[3])
+#define F_GAP		(raw_cmd->cmd[4])
+#define F_FILL		(raw_cmd->cmd[5])
 #define NR_F 6
 
 /*
- * Maximum disk size (in kilobytes). This default is used whenever the
- * current disk size is unknown.
+ * Maximum disk size (in kilobytes).
+ * This default is used whenever the current disk size is unknown.
  * [Now it is rather a minimum]
  */
 #define MAX_DISK_SIZE 4		/* 3984 */
@@ -345,16 +344,17 @@ static int initialising = 1;
  */
 #define MAX_REPLIES 16
 static unsigned char reply_buffer[MAX_REPLIES];
-static int inr;			/* size of reply buffer, when called from interrupt */
-#define ST0 (reply_buffer[0])
-#define ST1 (reply_buffer[1])
-#define ST2 (reply_buffer[2])
-#define ST3 (reply_buffer[0])	/* result of GETSTATUS */
-#define R_TRACK (reply_buffer[3])
-#define R_HEAD (reply_buffer[4])
-#define R_SECTOR (reply_buffer[5])
-#define R_SIZECODE (reply_buffer[6])
-#define SEL_DLY (2*HZ/100)
+static int inr;		/* size of reply buffer, when called from interrupt */
+#define ST0		(reply_buffer[0])
+#define ST1		(reply_buffer[1])
+#define ST2		(reply_buffer[2])
+#define ST3		(reply_buffer[0])	/* result of GETSTATUS */
+#define R_TRACK		(reply_buffer[3])
+#define R_HEAD		(reply_buffer[4])
+#define R_SECTOR	(reply_buffer[5])
+#define R_SIZECODE	(reply_buffer[6])
+
+#define SEL_DLY		(2 * HZ / 100)
 
 /*
  * this struct defines the different floppy drive types.
@@ -412,6 +412,7 @@ static struct gendisk *disks[N_DRIVE];
 static struct block_device *opened_bdev[N_DRIVE];
 static DEFINE_MUTEX(open_lock);
 static struct floppy_raw_cmd *raw_cmd, default_raw_cmd;
+static int fdc_queue;
 
 /*
  * This struct defines the different floppy types.
@@ -505,22 +506,15 @@ static char floppy_device_name[] = "floppy";
 static int probing;
 
 /* Synchronization of FDC access. */
-#define FD_COMMAND_NONE -1
-#define FD_COMMAND_ERROR 2
-#define FD_COMMAND_OKAY 3
+#define FD_COMMAND_NONE		-1
+#define FD_COMMAND_ERROR	2
+#define FD_COMMAND_OKAY		3
 
 static volatile int command_status = FD_COMMAND_NONE;
 static unsigned long fdc_busy;
 static DECLARE_WAIT_QUEUE_HEAD(fdc_wait);
 static DECLARE_WAIT_QUEUE_HEAD(command_done);
 
-#define NO_SIGNAL (!interruptible || !signal_pending(current))
-#define CALL(x) if ((x) == -EINTR) return -EINTR
-#define ECALL(x) if ((ret = (x))) return ret;
-#define _WAIT(x,i) CALL(ret=wait_til_done((x),i))
-#define WAIT(x) _WAIT((x),interruptible)
-#define IWAIT(x) _WAIT((x),1)
-
 /* Errors during formatting are counted here. */
 static int format_errors;
 
@@ -544,9 +538,10 @@ static int max_buffer_sectors;
 
 static int *errors;
 typedef void (*done_f)(int);
-static struct cont_t {
-	void (*interrupt)(void);	/* this is called after the interrupt of the
-					 * main command */
+static const struct cont_t {
+	void (*interrupt)(void);
+				/* this is called after the interrupt of the
+				 * main command */
 	void (*redo)(void);	/* this is called to retry the operation */
 	void (*error)(void);	/* this is called to tally an error */
 	done_f done;		/* this is called to say if the operation has
@@ -557,7 +552,7 @@ static void floppy_ready(void);
 static void floppy_start(void);
 static void process_fd_request(void);
 static void recalibrate_floppy(void);
-static void floppy_shutdown(unsigned long);
+static void floppy_shutdown(struct work_struct *);
 
 static int floppy_request_regions(int);
 static void floppy_release_regions(int);
@@ -571,7 +566,6 @@ static void floppy_release_irq_and_dma(void);
  * reset doesn't need to be tested before sending commands, because
  * output_byte is automatically disabled when reset is set.
  */
-#define CHECK_RESET { if (FDCS->reset){ reset_fdc(); return; } }
 static void reset_fdc(void);
 
 /*
@@ -579,11 +573,11 @@ static void reset_fdc(void);
  * information to interrupts. They are the data used for the current
  * request.
  */
-#define NO_TRACK -1
-#define NEED_1_RECAL -2
-#define NEED_2_RECAL -3
+#define NO_TRACK	-1
+#define NEED_1_RECAL	-2
+#define NEED_2_RECAL	-3
 
-static int usage_count;
+static atomic_t usage_count = ATOMIC_INIT(0);
 
 /* buffer related variables */
 static int buffer_track = -1;
@@ -595,6 +589,8 @@ static int buffer_max = -1;
 static struct floppy_fdc_state fdc_state[N_FDC];
 static int fdc;			/* current fdc */
 
+static struct workqueue_struct *floppy_wq;
+
 static struct floppy_struct *_floppy = floppy_type;
 static unsigned char current_drive;
 static long current_count_sectors;
@@ -602,6 +598,11 @@ static unsigned char fsector_t;	/* sector in track */
 static unsigned char in_sector_offset;	/* offset within physical sector,
 					 * expressed in units of 512 bytes */
 
+static inline bool drive_no_geom(int drive)
+{
+	return !(current_type[drive] || ITYPE(UDRS->fd_device)); /* neither current_type[] nor ITYPE(fd_device) set */
+}
+
 #ifndef fd_eject
 static inline int fd_eject(int drive)
 {
@@ -621,39 +622,34 @@ static inline void set_debugt(void)
 	debugtimer = jiffies;
 }
 
-static inline void debugt(const char *message)
+static inline void debugt(const char *func, const char *msg)
 {
 	if (DP->flags & DEBUGT)
-		printk("%s dtime=%lu\n", message, jiffies - debugtimer);
+		pr_info("%s:%s dtime=%lu\n", func, msg, jiffies - debugtimer);
 }
 #else
 static inline void set_debugt(void) { }
-static inline void debugt(const char *message) { }
+static inline void debugt(const char *func, const char *msg) { }
 #endif /* DEBUGT */
 
-typedef void (*timeout_fn) (unsigned long);
-static DEFINE_TIMER(fd_timeout, floppy_shutdown, 0, 0);
 
+static DECLARE_DELAYED_WORK(fd_timeout, floppy_shutdown);
 static const char *timeout_message;
 
-#ifdef FLOPPY_SANITY_CHECK
-static void is_alive(const char *message)
+static void is_alive(const char *func, const char *message)
 {
 	/* this routine checks whether the floppy driver is "alive" */
-	if (test_bit(0, &fdc_busy) && command_status < 2
-	    && !timer_pending(&fd_timeout)) {
-		DPRINT("timeout handler died: %s\n", message);
+	if (test_bit(0, &fdc_busy) && command_status < 2 &&
+	    !delayed_work_pending(&fd_timeout)) {
+		DPRINT("%s: timeout handler died.  %s\n", func, message);
 	}
 }
-#endif
 
-static void (*do_floppy) (void) = NULL;
-
-#ifdef FLOPPY_SANITY_CHECK
+static void (*do_floppy)(void) = NULL;
 
 #define OLOGSIZE 20
 
-static void (*lasthandler) (void);
+static void (*lasthandler)(void);
 static unsigned long interruptjiffies;
 static unsigned long resultjiffies;
 static int resultsize;
@@ -666,41 +662,40 @@ static struct output_log {
 } output_log[OLOGSIZE];
 
 static int output_log_pos;
-#endif
 
 #define current_reqD -1
 #define MAXTIMEOUT -2
 
-static void __reschedule_timeout(int drive, const char *message, int marg)
+static void __reschedule_timeout(int drive, const char *message)
 {
+	unsigned long delay;
+
 	if (drive == current_reqD)
 		drive = current_drive;
-	del_timer(&fd_timeout);
+
 	if (drive < 0 || drive >= N_DRIVE) {
-		fd_timeout.expires = jiffies + 20UL * HZ;
+		delay = 20UL * HZ;
 		drive = 0;
 	} else
-		fd_timeout.expires = jiffies + UDP->timeout;
-	add_timer(&fd_timeout);
-	if (UDP->flags & FD_DEBUG) {
-		DPRINT("reschedule timeout ");
-		printk(message, marg);
-		printk("\n");
-	}
+		delay = UDP->timeout;
+
+	mod_delayed_work(floppy_wq, &fd_timeout, delay);
+	if (UDP->flags & FD_DEBUG)
+		DPRINT("reschedule timeout %s\n", message);
 	timeout_message = message;
 }
 
-static void reschedule_timeout(int drive, const char *message, int marg)
+static void reschedule_timeout(int drive, const char *message)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&floppy_lock, flags);
-	__reschedule_timeout(drive, message, marg);
+	__reschedule_timeout(drive, message);
 	spin_unlock_irqrestore(&floppy_lock, flags);
 }
 
-#define INFBOUND(a,b) (a)=max_t(int, a, b)
-#define SUPBOUND(a,b) (a)=min_t(int, a, b)
+#define INFBOUND(a, b) (a) = max_t(int, a, b)
+#define SUPBOUND(a, b) (a) = min_t(int, a, b)
 
 /*
  * Bottom half floppy driver.
@@ -739,7 +734,6 @@ static int disk_change(int drive)
 {
 	int fdc = FDC(drive);
 
-#ifdef FLOPPY_SANITY_CHECK
 	if (time_before(jiffies, UDRS->select_date + UDP->select_delay))
 		DPRINT("WARNING disk change called early\n");
 	if (!(FDCS->dor & (0x10 << UNIT(drive))) ||
@@ -748,31 +742,27 @@ static int disk_change(int drive)
 		DPRINT("drive=%d fdc=%d dor=%x\n", drive, FDC(drive),
 		       (unsigned int)FDCS->dor);
 	}
-#endif
 
-#ifdef DCL_DEBUG
-	if (UDP->flags & FD_DEBUG) {
-		DPRINT("checking disk change line for drive %d\n", drive);
-		DPRINT("jiffies=%lu\n", jiffies);
-		DPRINT("disk change line=%x\n", fd_inb(FD_DIR) & 0x80);
-		DPRINT("flags=%lx\n", UDRS->flags);
-	}
-#endif
+	debug_dcl(UDP->flags,
+		  "checking disk change line for drive %d\n", drive);
+	debug_dcl(UDP->flags, "jiffies=%lu\n", jiffies);
+	debug_dcl(UDP->flags, "disk change line=%x\n", fd_inb(FD_DIR) & 0x80);
+	debug_dcl(UDP->flags, "flags=%lx\n", UDRS->flags);
+
 	if (UDP->flags & FD_BROKEN_DCL)
-		return UTESTF(FD_DISK_CHANGED);
+		return test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
 	if ((fd_inb(FD_DIR) ^ UDP->flags) & 0x80) {
-		USETF(FD_VERIFY);	/* verify write protection */
-		if (UDRS->maxblock) {
-			/* mark it changed */
-			USETF(FD_DISK_CHANGED);
-		}
+		set_bit(FD_VERIFY_BIT, &UDRS->flags);
+					/* verify write protection */
+
+		if (UDRS->maxblock)	/* mark it changed */
+			set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
 
 		/* invalidate its geometry */
 		if (UDRS->keep_data >= 0) {
 			if ((UDP->flags & FTD_MSG) &&
 			    current_type[drive] != NULL)
-				DPRINT("Disk type is undefined after "
-				       "disk change\n");
+				DPRINT("Disk type is undefined after disk change\n");
 			current_type[drive] = NULL;
 			floppy_sizes[TOMINOR(drive)] = MAX_DISK_SIZE << 1;
 		}
@@ -780,7 +770,7 @@ static int disk_change(int drive)
 		return 1;
 	} else {
 		UDRS->last_checked = jiffies;
-		UCLEARF(FD_DISK_NEWCHANGE);
+		clear_bit(FD_DISK_NEWCHANGE_BIT, &UDRS->flags);
 	}
 	return 0;
 }
@@ -790,6 +780,12 @@ static inline int is_selected(int dor, int unit)
 	return ((dor & (0x10 << unit)) && (dor & 3) == unit);
 }
 
+static bool is_ready_state(int status)
+{
+	const int mask = STATUS_READY | STATUS_DIR | STATUS_DMA;	/* the three bits examined */
+	return (status & mask) == STATUS_READY;	/* READY set, DIR and DMA both clear */
+}
+
 static int set_dor(int fdc, char mask, char data)
 {
 	unsigned char unit;
@@ -806,11 +802,8 @@ static int set_dor(int fdc, char mask, char data)
 		unit = olddor & 0x3;
 		if (is_selected(olddor, unit) && !is_selected(newdor, unit)) {
 			drive = REVDRIVE(fdc, unit);
-#ifdef DCL_DEBUG
-			if (UDP->flags & FD_DEBUG) {
-				DPRINT("calling disk change from set_dor\n");
-			}
-#endif
+			debug_dcl(UDP->flags,
+				  "calling disk change from set_dor\n");
 			disk_change(drive);
 		}
 		FDCS->dor = newdor;
@@ -834,8 +827,10 @@ static void twaddle(void)
 	DRS->select_date = jiffies;
 }
 
-/* reset all driver information about the current fdc. This is needed after
- * a reset, and after a raw command. */
+/*
+ * Reset all driver information about the current fdc.
+ * This is needed after a reset, and after a raw command.
+ */
 static void reset_fdc_info(int mode)
 {
 	int drive;
@@ -857,7 +852,7 @@ static void set_fdc(int drive)
 		current_drive = drive;
 	}
 	if (fdc != 1 && fdc != 0) {
-		printk("bad fdc value\n");
+		pr_info("bad fdc value\n");
 		return;
 	}
 	set_dor(fdc, ~0, 8);
@@ -871,69 +866,34 @@ static void set_fdc(int drive)
 }
 
 /* locks the driver */
-static int _lock_fdc(int drive, int interruptible, int line)
+static int lock_fdc(int drive, bool interruptible)
 {
-	if (!usage_count) {
-		printk(KERN_ERR
-		       "Trying to lock fdc while usage count=0 at line %d\n",
-		       line);
+	if (WARN(atomic_read(&usage_count) == 0,
+		 "Trying to lock fdc while usage count=0\n"))
 		return -1;
-	}
-
-	if (test_and_set_bit(0, &fdc_busy)) {
-		DECLARE_WAITQUEUE(wait, current);
-		add_wait_queue(&fdc_wait, &wait);
-
-		for (;;) {
-			set_current_state(TASK_INTERRUPTIBLE);
 
-			if (!test_and_set_bit(0, &fdc_busy))
-				break;
-
-			schedule();
-
-			if (!NO_SIGNAL) {
-				remove_wait_queue(&fdc_wait, &wait);
-				return -EINTR;
-			}
-		}
+	if (wait_event_interruptible(fdc_wait, !test_and_set_bit(0, &fdc_busy)))
+		return -EINTR;
 
-		set_current_state(TASK_RUNNING);
-		remove_wait_queue(&fdc_wait, &wait);
-		flush_scheduled_work();
-	}
 	command_status = FD_COMMAND_NONE;
 
-	__reschedule_timeout(drive, "lock fdc", 0);
+	reschedule_timeout(drive, "lock fdc");
 	set_fdc(drive);
 	return 0;
 }
 
-#define lock_fdc(drive,interruptible) _lock_fdc(drive,interruptible, __LINE__)
-
-#define LOCK_FDC(drive,interruptible) \
-if (lock_fdc(drive,interruptible)) return -EINTR;
-
 /* unlocks the driver */
-static inline void unlock_fdc(void)
+static void unlock_fdc(void)
 {
-	unsigned long flags;
-
-	raw_cmd = NULL;
 	if (!test_bit(0, &fdc_busy))
 		DPRINT("FDC access conflict!\n");
 
-	if (do_floppy)
-		DPRINT("device interrupt still active at FDC release: %p!\n",
-		       do_floppy);
+	raw_cmd = NULL;
 	command_status = FD_COMMAND_NONE;
-	spin_lock_irqsave(&floppy_lock, flags);
-	del_timer(&fd_timeout);
+	cancel_delayed_work(&fd_timeout);
+	do_floppy = NULL;
 	cont = NULL;
 	clear_bit(0, &fdc_busy);
-	if (current_req || blk_peek_request(floppy_queue))
-		do_fd_request(floppy_queue);
-	spin_unlock_irqrestore(&floppy_lock, flags);
 	wake_up(&fdc_wait);
 }
 
@@ -1001,36 +961,44 @@ static void empty(void)
 {
 }
 
-static DECLARE_WORK(floppy_work, NULL);
+static void (*floppy_work_fn)(void);
 
-static void schedule_bh(void (*handler) (void))
+static void floppy_work_workfn(struct work_struct *work)
 {
-	PREPARE_WORK(&floppy_work, (work_func_t)handler);
-	schedule_work(&floppy_work);
+	floppy_work_fn();
 }
 
-static DEFINE_TIMER(fd_timer, NULL, 0, 0);
+static DECLARE_WORK(floppy_work, floppy_work_workfn);
 
-static void cancel_activity(void)
+static void schedule_bh(void (*handler)(void))
 {
-	unsigned long flags;
+	WARN_ON(work_pending(&floppy_work));
 
-	spin_lock_irqsave(&floppy_lock, flags);
+	floppy_work_fn = handler;
+	queue_work(floppy_wq, &floppy_work);
+}
+
+static void (*fd_timer_fn)(void) = NULL;
+
+static void fd_timer_workfn(struct work_struct *work)
+{
+	fd_timer_fn();
+}
+
+static DECLARE_DELAYED_WORK(fd_timer, fd_timer_workfn);
+
+static void cancel_activity(void)
+{
 	do_floppy = NULL;
-	PREPARE_WORK(&floppy_work, (work_func_t)empty);
-	del_timer(&fd_timer);
-	spin_unlock_irqrestore(&floppy_lock, flags);
+	cancel_delayed_work_sync(&fd_timer);
+	cancel_work_sync(&floppy_work);
 }
 
 /* this function makes sure that the disk stays in the drive during the
  * transfer */
 static void fd_watchdog(void)
 {
-#ifdef DCL_DEBUG
-	if (DP->flags & FD_DEBUG) {
-		DPRINT("calling disk change from watchdog\n");
-	}
-#endif
+	debug_dcl(DP->flags, "calling disk change from watchdog\n");
 
 	if (disk_change(current_drive)) {
 		DPRINT("disk removed during i/o\n");
@@ -1038,21 +1006,21 @@ static void fd_watchdog(void)
 		cont->done(0);
 		reset_fdc();
 	} else {
-		del_timer(&fd_timer);
-		fd_timer.function = (timeout_fn) fd_watchdog;
-		fd_timer.expires = jiffies + HZ / 10;
-		add_timer(&fd_timer);
+		cancel_delayed_work(&fd_timer);
+		fd_timer_fn = fd_watchdog;
+		queue_delayed_work(floppy_wq, &fd_timer, HZ / 10);
 	}
 }
 
 static void main_command_interrupt(void)
 {
-	del_timer(&fd_timer);
+	cancel_delayed_work(&fd_timer);
 	cont->interrupt();
 }
 
 /* waits for a delay (spinup or select) to pass */
-static int fd_wait_for_completion(unsigned long delay, timeout_fn function)
+static int fd_wait_for_completion(unsigned long expires,
+				  void (*function)(void))
 {
 	if (FDCS->reset) {
 		reset_fdc();	/* do the reset during sleep to win time
@@ -1061,69 +1029,36 @@ static int fd_wait_for_completion(unsigned long delay, timeout_fn function)
 		return 1;
 	}
 
-	if (time_before(jiffies, delay)) {
-		del_timer(&fd_timer);
-		fd_timer.function = function;
-		fd_timer.expires = delay;
-		add_timer(&fd_timer);
+	if (time_before(jiffies, expires)) {
+		cancel_delayed_work(&fd_timer);
+		fd_timer_fn = function;
+		queue_delayed_work(floppy_wq, &fd_timer, expires - jiffies);
 		return 1;
 	}
 	return 0;
 }
 
-static DEFINE_SPINLOCK(floppy_hlt_lock);
-static int hlt_disabled;
-static void floppy_disable_hlt(void)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&floppy_hlt_lock, flags);
-	if (!hlt_disabled) {
-		hlt_disabled = 1;
-#ifdef HAVE_DISABLE_HLT
-		disable_hlt();
-#endif
-	}
-	spin_unlock_irqrestore(&floppy_hlt_lock, flags);
-}
-
-static void floppy_enable_hlt(void)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&floppy_hlt_lock, flags);
-	if (hlt_disabled) {
-		hlt_disabled = 0;
-#ifdef HAVE_DISABLE_HLT
-		enable_hlt();
-#endif
-	}
-	spin_unlock_irqrestore(&floppy_hlt_lock, flags);
-}
-
 static void setup_DMA(void)
 {
 	unsigned long f;
 
-#ifdef FLOPPY_SANITY_CHECK
 	if (raw_cmd->length == 0) {
 		int i;
 
-		printk("zero dma transfer size:");
+		pr_info("zero dma transfer size:");
 		for (i = 0; i < raw_cmd->cmd_count; i++)
-			printk("%x,", raw_cmd->cmd[i]);
-		printk("\n");
+			pr_cont("%x,", raw_cmd->cmd[i]);
+		pr_cont("\n");
 		cont->done(0);
 		FDCS->reset = 1;
 		return;
 	}
 	if (((unsigned long)raw_cmd->kernel_data) % 512) {
-		printk("non aligned address: %p\n", raw_cmd->kernel_data);
+		pr_info("non aligned address: %p\n", raw_cmd->kernel_data);
 		cont->done(0);
 		FDCS->reset = 1;
 		return;
 	}
-#endif
 	f = claim_dma_lock();
 	fd_disable_dma();
 #ifdef fd_dma_setup
@@ -1147,7 +1082,6 @@ static void setup_DMA(void)
 	fd_enable_dma();
 	release_dma_lock(f);
 #endif
-	floppy_disable_hlt();
 }
 
 static void show_floppy(void);
@@ -1165,7 +1099,7 @@ static int wait_til_ready(void)
 		if (status & STATUS_READY)
 			return status;
 	}
-	if (!initialising) {
+	if (initialized) {
 		DPRINT("Getstatus times out (%x) on fdc %d\n", status, fdc);
 		show_floppy();
 	}
@@ -1176,22 +1110,21 @@ static int wait_til_ready(void)
 /* sends a command byte to the fdc */
 static int output_byte(char byte)
 {
-	int status;
+	int status = wait_til_ready();
 
-	if ((status = wait_til_ready()) < 0)
+	if (status < 0)
 		return -1;
-	if ((status & (STATUS_READY | STATUS_DIR | STATUS_DMA)) == STATUS_READY) {
+
+	if (is_ready_state(status)) {
 		fd_outb(byte, FD_DATA);
-#ifdef FLOPPY_SANITY_CHECK
 		output_log[output_log_pos].data = byte;
 		output_log[output_log_pos].status = status;
 		output_log[output_log_pos].jiffies = jiffies;
 		output_log_pos = (output_log_pos + 1) % OLOGSIZE;
-#endif
 		return 0;
 	}
 	FDCS->reset = 1;
-	if (!initialising) {
+	if (initialized) {
 		DPRINT("Unable to send byte %x to FDC. Fdc=%x Status=%x\n",
 		       byte, fdc, status);
 		show_floppy();
@@ -1199,8 +1132,6 @@ static int output_byte(char byte)
 	return -1;
 }
 
-#define LAST_OUT(x) if (output_byte(x)<0){ reset_fdc();return;}
-
 /* gets the response from the fdc */
 static int result(void)
 {
@@ -1208,14 +1139,13 @@ static int result(void)
 	int status = 0;
 
 	for (i = 0; i < MAX_REPLIES; i++) {
-		if ((status = wait_til_ready()) < 0)
+		status = wait_til_ready();
+		if (status < 0)
 			break;
 		status &= STATUS_DIR | STATUS_READY | STATUS_BUSY | STATUS_DMA;
 		if ((status & ~STATUS_BUSY) == STATUS_READY) {
-#ifdef FLOPPY_SANITY_CHECK
 			resultjiffies = jiffies;
 			resultsize = i;
-#endif
 			return i;
 		}
 		if (status == (STATUS_DIR | STATUS_READY | STATUS_BUSY))
@@ -1223,10 +1153,9 @@ static int result(void)
 		else
 			break;
 	}
-	if (!initialising) {
-		DPRINT
-		    ("get result error. Fdc=%d Last status=%x Read bytes=%d\n",
-		     fdc, status, i);
+	if (initialized) {
+		DPRINT("get result error. Fdc=%d Last status=%x Read bytes=%d\n",
+		       fdc, status, i);
 		show_floppy();
 	}
 	FDCS->reset = 1;
@@ -1237,19 +1166,21 @@ static int result(void)
 /* does the fdc need more output? */
 static int need_more_output(void)
 {
-	int status;
+	int status = wait_til_ready();
 
-	if ((status = wait_til_ready()) < 0)
+	if (status < 0)
 		return -1;
-	if ((status & (STATUS_READY | STATUS_DIR | STATUS_DMA)) == STATUS_READY)
+
+	if (is_ready_state(status))
 		return MORE_OUTPUT;
+
 	return result();
 }
 
 /* Set perpendicular mode as required, based on data rate, if supported.
  * 82077 Now tested. 1Mbps data rate only possible with 82077-1.
  */
-static inline void perpendicular_mode(void)
+static void perpendicular_mode(void)
 {
 	unsigned char perp_mode;
 
@@ -1264,9 +1195,12 @@ static inline void perpendicular_mode(void)
 		default:
 			DPRINT("Invalid data rate for perpendicular mode!\n");
 			cont->done(0);
-			FDCS->reset = 1;	/* convenient way to return to
-						 * redo without to much hassle (deep
-						 * stack et al. */
+			FDCS->reset = 1;
+					/*
+					 * convenient way to return to
+					 * redo without too much hassle
+					 * (deep stack et al.)
+					 */
 			return;
 		}
 	} else
@@ -1366,9 +1300,9 @@ static void fdc_specify(void)
 
 	/* Convert step rate from microseconds to milliseconds and 4 bits */
 	srt = 16 - DIV_ROUND_UP(DP->srt * scale_dtr / 1000, NOMINAL_DTR);
-	if (slow_floppy) {
+	if (slow_floppy)
 		srt = srt / 4;
-	}
+
 	SUPBOUND(srt, 0xf);
 	INFBOUND(srt, 0);
 
@@ -1415,16 +1349,45 @@ static int fdc_dtr(void)
 	 * Pause 5 msec to avoid trouble. (Needs to be 2 jiffies)
 	 */
 	FDCS->dtr = raw_cmd->rate & 3;
-	return (fd_wait_for_completion(jiffies + 2UL * HZ / 100,
-				       (timeout_fn) floppy_ready));
+	return fd_wait_for_completion(jiffies + 2UL * HZ / 100, floppy_ready);
 }				/* fdc_dtr */
 
 static void tell_sector(void)
 {
-	printk(": track %d, head %d, sector %d, size %d",
-	       R_TRACK, R_HEAD, R_SECTOR, R_SIZECODE);
+	pr_cont(": track %d, head %d, sector %d, size %d",
+		R_TRACK, R_HEAD, R_SECTOR, R_SIZECODE);
 }				/* tell_sector */
 
+static void print_errors(void)
+{
+	DPRINT("");
+	if (ST0 & ST0_ECE) {
+		pr_cont("Recalibrate failed!");
+	} else if (ST2 & ST2_CRC) {
+		pr_cont("data CRC error");
+		tell_sector();
+	} else if (ST1 & ST1_CRC) {
+		pr_cont("CRC error");
+		tell_sector();
+	} else if ((ST1 & (ST1_MAM | ST1_ND)) ||
+		   (ST2 & ST2_MAM)) {
+		if (!probing) {
+			pr_cont("sector not found");
+			tell_sector();
+		} else
+			pr_cont("probe failed...");
+	} else if (ST2 & ST2_WC) {	/* seek error */
+		pr_cont("wrong cylinder");
+	} else if (ST2 & ST2_BC) {	/* cylinder marked as bad */
+		pr_cont("bad cylinder");
+	} else {
+		pr_cont("unknown error. ST[0..2] are: 0x%x 0x%x 0x%x",
+			ST0, ST1, ST2);
+		tell_sector();
+	}
+	pr_cont("\n");
+}
+
 /*
  * OK, this error interpreting routine is called after a
  * DMA read/write has succeeded
@@ -1437,7 +1400,7 @@ static int interpret_errors(void)
 	char bad;
 
 	if (inr != 7) {
-		DPRINT("-- FDC reply error");
+		DPRINT("-- FDC reply error\n");
 		FDCS->reset = 1;
 		return 1;
 	}
@@ -1450,43 +1413,17 @@ static int interpret_errors(void)
 		bad = 1;
 		if (ST1 & ST1_WP) {
 			DPRINT("Drive is write protected\n");
-			CLEARF(FD_DISK_WRITABLE);
+			clear_bit(FD_DISK_WRITABLE_BIT, &DRS->flags);
 			cont->done(0);
 			bad = 2;
 		} else if (ST1 & ST1_ND) {
-			SETF(FD_NEED_TWADDLE);
+			set_bit(FD_NEED_TWADDLE_BIT, &DRS->flags);
 		} else if (ST1 & ST1_OR) {
 			if (DP->flags & FTD_MSG)
 				DPRINT("Over/Underrun - retrying\n");
 			bad = 0;
 		} else if (*errors >= DP->max_errors.reporting) {
-			DPRINT("");
-			if (ST0 & ST0_ECE) {
-				printk("Recalibrate failed!");
-			} else if (ST2 & ST2_CRC) {
-				printk("data CRC error");
-				tell_sector();
-			} else if (ST1 & ST1_CRC) {
-				printk("CRC error");
-				tell_sector();
-			} else if ((ST1 & (ST1_MAM | ST1_ND))
-				   || (ST2 & ST2_MAM)) {
-				if (!probing) {
-					printk("sector not found");
-					tell_sector();
-				} else
-					printk("probe failed...");
-			} else if (ST2 & ST2_WC) {	/* seek error */
-				printk("wrong cylinder");
-			} else if (ST2 & ST2_BC) {	/* cylinder marked as bad */
-				printk("bad cylinder");
-			} else {
-				printk
-				    ("unknown error. ST[0..2] are: 0x%x 0x%x 0x%x",
-				     ST0, ST1, ST2);
-				tell_sector();
-			}
-			printk("\n");
+			print_errors();
 		}
 		if (ST2 & ST2_WC || ST2 & ST2_BC)
 			/* wrong cylinder => recal */
@@ -1517,7 +1454,7 @@ static void setup_rw_floppy(void)
 	int flags;
 	int dflags;
 	unsigned long ready_date;
-	timeout_fn function;
+	void (*function)(void);
 
 	flags = raw_cmd->flags;
 	if (flags & (FD_RAW_READ | FD_RAW_WRITE))
@@ -1531,9 +1468,9 @@ static void setup_rw_floppy(void)
 		 */
 		if (time_after(ready_date, jiffies + DP->select_delay)) {
 			ready_date -= DP->select_delay;
-			function = (timeout_fn) floppy_start;
+			function = floppy_start;
 		} else
-			function = (timeout_fn) setup_rw_floppy;
+			function = setup_rw_floppy;
 
 		/* wait until the floppy is spinning fast enough */
 		if (fd_wait_for_completion(ready_date, function))
@@ -1551,7 +1488,7 @@ static void setup_rw_floppy(void)
 	for (i = 0; i < raw_cmd->cmd_count; i++)
 		r |= output_byte(raw_cmd->cmd[i]);
 
-	debugt("rw_command: ");
+	debugt(__func__, "rw_command");
 
 	if (r) {
 		cont->error();
@@ -1574,7 +1511,7 @@ static int blind_seek;
  */
 static void seek_interrupt(void)
 {
-	debugt("seek interrupt:");
+	debugt(__func__, "");
 	if (inr != 2 || (ST0 & 0xF8) != 0x20) {
 		DPRINT("seek failed\n");
 		DRS->track = NEED_2_RECAL;
@@ -1583,14 +1520,11 @@ static void seek_interrupt(void)
 		return;
 	}
 	if (DRS->track >= 0 && DRS->track != ST1 && !blind_seek) {
-#ifdef DCL_DEBUG
-		if (DP->flags & FD_DEBUG) {
-			DPRINT
-			    ("clearing NEWCHANGE flag because of effective seek\n");
-			DPRINT("jiffies=%lu\n", jiffies);
-		}
-#endif
-		CLEARF(FD_DISK_NEWCHANGE);	/* effective seek */
+		debug_dcl(DP->flags,
+			  "clearing NEWCHANGE flag because of effective seek\n");
+		debug_dcl(DP->flags, "jiffies=%lu\n", jiffies);
+		clear_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags);
+					/* effective seek */
 		DRS->select_date = jiffies;
 	}
 	DRS->track = ST1;
@@ -1599,26 +1533,23 @@ static void seek_interrupt(void)
 
 static void check_wp(void)
 {
-	if (TESTF(FD_VERIFY)) {
-		/* check write protection */
+	if (test_bit(FD_VERIFY_BIT, &DRS->flags)) {
+					/* check write protection */
 		output_byte(FD_GETSTATUS);
 		output_byte(UNIT(current_drive));
 		if (result() != 1) {
 			FDCS->reset = 1;
 			return;
 		}
-		CLEARF(FD_VERIFY);
-		CLEARF(FD_NEED_TWADDLE);
-#ifdef DCL_DEBUG
-		if (DP->flags & FD_DEBUG) {
-			DPRINT("checking whether disk is write protected\n");
-			DPRINT("wp=%x\n", ST3 & 0x40);
-		}
-#endif
+		clear_bit(FD_VERIFY_BIT, &DRS->flags);
+		clear_bit(FD_NEED_TWADDLE_BIT, &DRS->flags);
+		debug_dcl(DP->flags,
+			  "checking whether disk is write protected\n");
+		debug_dcl(DP->flags, "wp=%x\n", ST3 & 0x40);
 		if (!(ST3 & 0x40))
-			SETF(FD_DISK_WRITABLE);
+			set_bit(FD_DISK_WRITABLE_BIT, &DRS->flags);
 		else
-			CLEARF(FD_DISK_WRITABLE);
+			clear_bit(FD_DISK_WRITABLE_BIT, &DRS->flags);
 	}
 }
 
@@ -1628,19 +1559,15 @@ static void seek_floppy(void)
 
 	blind_seek = 0;
 
-#ifdef DCL_DEBUG
-	if (DP->flags & FD_DEBUG) {
-		DPRINT("calling disk change from seek\n");
-	}
-#endif
+	debug_dcl(DP->flags, "calling disk change from %s\n", __func__);
 
-	if (!TESTF(FD_DISK_NEWCHANGE) &&
+	if (!test_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags) &&
 	    disk_change(current_drive) && (raw_cmd->flags & FD_RAW_NEED_DISK)) {
 		/* the media changed flag should be cleared after the seek.
 		 * If it isn't, this means that there is really no disk in
 		 * the drive.
 		 */
-		SETF(FD_DISK_CHANGED);
+		set_bit(FD_DISK_CHANGED_BIT, &DRS->flags);
 		cont->done(0);
 		cont->redo();
 		return;
@@ -1648,7 +1575,7 @@ static void seek_floppy(void)
 	if (DRS->track <= NEED_1_RECAL) {
 		recalibrate_floppy();
 		return;
-	} else if (TESTF(FD_DISK_NEWCHANGE) &&
+	} else if (test_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags) &&
 		   (raw_cmd->flags & FD_RAW_NEED_DISK) &&
 		   (DRS->track <= NO_TRACK || DRS->track == raw_cmd->track)) {
 		/* we seek to clear the media-changed condition. Does anybody
@@ -1677,19 +1604,22 @@ static void seek_floppy(void)
 	do_floppy = seek_interrupt;
 	output_byte(FD_SEEK);
 	output_byte(UNIT(current_drive));
-	LAST_OUT(track);
-	debugt("seek command:");
+	if (output_byte(track) < 0) {
+		reset_fdc();
+		return;
+	}
+	debugt(__func__, "");
 }
 
 static void recal_interrupt(void)
 {
-	debugt("recal interrupt:");
+	debugt(__func__, "");
 	if (inr != 2)
 		FDCS->reset = 1;
 	else if (ST0 & ST0_ECE) {
 		switch (DRS->track) {
 		case NEED_1_RECAL:
-			debugt("recal interrupt need 1 recal:");
+			debugt(__func__, "need 1 recal");
 			/* after a second recalibrate, we still haven't
 			 * reached track 0. Probably no drive. Raise an
 			 * error, as failing immediately might upset
@@ -1698,25 +1628,21 @@ static void recal_interrupt(void)
 			cont->redo();
 			return;
 		case NEED_2_RECAL:
-			debugt("recal interrupt need 2 recal:");
+			debugt(__func__, "need 2 recal");
 			/* If we already did a recalibrate,
 			 * and we are not at track 0, this
 			 * means we have moved. (The only way
 			 * not to move at recalibration is to
 			 * be already at track 0.) Clear the
 			 * new change flag */
-#ifdef DCL_DEBUG
-			if (DP->flags & FD_DEBUG) {
-				DPRINT
-				    ("clearing NEWCHANGE flag because of second recalibrate\n");
-			}
-#endif
+			debug_dcl(DP->flags,
+				  "clearing NEWCHANGE flag because of second recalibrate\n");
 
-			CLEARF(FD_DISK_NEWCHANGE);
+			clear_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags);
 			DRS->select_date = jiffies;
 			/* fall through */
 		default:
-			debugt("recal interrupt default:");
+			debugt(__func__, "default");
 			/* Recalibrate moves the head by at
 			 * most 80 steps. If after one
 			 * recalibrate we don't have reached
@@ -1738,8 +1664,8 @@ static void print_result(char *message, int inr)
 	DPRINT("%s ", message);
 	if (inr >= 0)
 		for (i = 0; i < inr; i++)
-			printk("repl[%d]=%x ", i, reply_buffer[i]);
-	printk("\n");
+			pr_cont("repl[%d]=%x ", i, reply_buffer[i]);
+	pr_cont("\n");
 }
 
 /* interrupt handler. Note that this can be called externally on the Sparc */
@@ -1756,14 +1682,13 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id)
 	fd_disable_dma();
 	release_dma_lock(f);
 
-	floppy_enable_hlt();
 	do_floppy = NULL;
 	if (fdc >= N_FDC || FDCS->address == -1) {
 		/* we don't even know which FDC is the culprit */
-		printk("DOR0=%x\n", fdc_state[0].dor);
-		printk("floppy interrupt on bizarre fdc %d\n", fdc);
-		printk("handler=%p\n", handler);
-		is_alive("bizarre fdc");
+		pr_info("DOR0=%x\n", fdc_state[0].dor);
+		pr_info("floppy interrupt on bizarre fdc %d\n", fdc);
+		pr_info("handler=%pf\n", handler);
+		is_alive(__func__, "bizarre fdc");
 		return IRQ_NONE;
 	}
 
@@ -1777,7 +1702,7 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id)
 	 * activity.
 	 */
 
-	do_print = !handler && print_unex && !initialising;
+	do_print = !handler && print_unex && initialized;
 
 	inr = result();
 	if (do_print)
@@ -1790,15 +1715,15 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id)
 			if (do_print)
 				print_result("sensei", inr);
 			max_sensei--;
-		} while ((ST0 & 0x83) != UNIT(current_drive) && inr == 2
-			 && max_sensei);
+		} while ((ST0 & 0x83) != UNIT(current_drive) &&
+			 inr == 2 && max_sensei);
 	}
 	if (!handler) {
 		FDCS->reset = 1;
 		return IRQ_NONE;
 	}
 	schedule_bh(handler);
-	is_alive("normal interrupt end");
+	is_alive(__func__, "normal interrupt end");
 
 	/* FIXME! Was it really for us? */
 	return IRQ_HANDLED;
@@ -1806,10 +1731,11 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id)
 
 static void recalibrate_floppy(void)
 {
-	debugt("recalibrate floppy:");
+	debugt(__func__, "");
 	do_floppy = recal_interrupt;
 	output_byte(FD_RECALIBRATE);
-	LAST_OUT(UNIT(current_drive));
+	if (output_byte(UNIT(current_drive)) < 0)
+		reset_fdc();
 }
 
 /*
@@ -1817,10 +1743,10 @@ static void recalibrate_floppy(void)
  */
 static void reset_interrupt(void)
 {
-	debugt("reset interrupt:");
+	debugt(__func__, "");
 	result();		/* get the status ready for set_fdc */
 	if (FDCS->reset) {
-		printk("reset set in interrupt, calling %p\n", cont->error);
+		pr_info("reset set in interrupt, calling %pf\n", cont->error);
 		cont->error();	/* a reset just after a reset. BAD! */
 	}
 	cont->redo();
@@ -1858,75 +1784,71 @@ static void show_floppy(void)
 {
 	int i;
 
-	printk("\n");
-	printk("floppy driver state\n");
-	printk("-------------------\n");
-	printk("now=%lu last interrupt=%lu diff=%lu last called handler=%p\n",
-	       jiffies, interruptjiffies, jiffies - interruptjiffies,
-	       lasthandler);
+	pr_info("\n");
+	pr_info("floppy driver state\n");
+	pr_info("-------------------\n");
+	pr_info("now=%lu last interrupt=%lu diff=%lu last called handler=%pf\n",
+		jiffies, interruptjiffies, jiffies - interruptjiffies,
+		lasthandler);
 
-#ifdef FLOPPY_SANITY_CHECK
-	printk("timeout_message=%s\n", timeout_message);
-	printk("last output bytes:\n");
+	pr_info("timeout_message=%s\n", timeout_message);
+	pr_info("last output bytes:\n");
 	for (i = 0; i < OLOGSIZE; i++)
-		printk("%2x %2x %lu\n",
-		       output_log[(i + output_log_pos) % OLOGSIZE].data,
-		       output_log[(i + output_log_pos) % OLOGSIZE].status,
-		       output_log[(i + output_log_pos) % OLOGSIZE].jiffies);
-	printk("last result at %lu\n", resultjiffies);
-	printk("last redo_fd_request at %lu\n", lastredo);
-	for (i = 0; i < resultsize; i++) {
-		printk("%2x ", reply_buffer[i]);
-	}
-	printk("\n");
-#endif
-
-	printk("status=%x\n", fd_inb(FD_STATUS));
-	printk("fdc_busy=%lu\n", fdc_busy);
+		pr_info("%2x %2x %lu\n",
+			output_log[(i + output_log_pos) % OLOGSIZE].data,
+			output_log[(i + output_log_pos) % OLOGSIZE].status,
+			output_log[(i + output_log_pos) % OLOGSIZE].jiffies);
+	pr_info("last result at %lu\n", resultjiffies);
+	pr_info("last redo_fd_request at %lu\n", lastredo);
+	print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 16, 1,
+		       reply_buffer, resultsize, true);
+
+	pr_info("status=%x\n", fd_inb(FD_STATUS));
+	pr_info("fdc_busy=%lu\n", fdc_busy);
 	if (do_floppy)
-		printk("do_floppy=%p\n", do_floppy);
+		pr_info("do_floppy=%pf\n", do_floppy);
 	if (work_pending(&floppy_work))
-		printk("floppy_work.func=%p\n", floppy_work.func);
-	if (timer_pending(&fd_timer))
-		printk("fd_timer.function=%p\n", fd_timer.function);
-	if (timer_pending(&fd_timeout)) {
-		printk("timer_function=%p\n", fd_timeout.function);
-		printk("expires=%lu\n", fd_timeout.expires - jiffies);
-		printk("now=%lu\n", jiffies);
-	}
-	printk("cont=%p\n", cont);
-	printk("current_req=%p\n", current_req);
-	printk("command_status=%d\n", command_status);
-	printk("\n");
+		pr_info("floppy_work.func=%pf\n", floppy_work.func);
+	if (delayed_work_pending(&fd_timer))
+		pr_info("delayed work.function=%p expires=%ld\n",
+		       fd_timer.work.func,
+		       fd_timer.timer.expires - jiffies);
+	if (delayed_work_pending(&fd_timeout))
+		pr_info("timer_function=%p expires=%ld\n",
+		       fd_timeout.work.func,
+		       fd_timeout.timer.expires - jiffies);
+
+	pr_info("cont=%p\n", cont);
+	pr_info("current_req=%p\n", current_req);
+	pr_info("command_status=%d\n", command_status);
+	pr_info("\n");
 }
 
-static void floppy_shutdown(unsigned long data)
+static void floppy_shutdown(struct work_struct *arg)
 {
 	unsigned long flags;
 
-	if (!initialising)
+	if (initialized)
 		show_floppy();
 	cancel_activity();
 
-	floppy_enable_hlt();
-
 	flags = claim_dma_lock();
 	fd_disable_dma();
 	release_dma_lock(flags);
 
 	/* avoid dma going to a random drive after shutdown */
 
-	if (!initialising)
+	if (initialized)
 		DPRINT("floppy timeout called\n");
 	FDCS->reset = 1;
 	if (cont) {
 		cont->done(0);
 		cont->redo();	/* this will recall reset when needed */
 	} else {
-		printk("no cont in shutdown!\n");
+		pr_info("no cont in shutdown!\n");
 		process_fd_request();
 	}
-	is_alive("floppy shutdown");
+	is_alive(__func__, "");
 }
 
 /* start motor, check media-changed condition and write protection */
@@ -1954,27 +1876,26 @@ static int start_motor(void (*function)(void))
 	set_dor(fdc, mask, data);
 
 	/* wait_for_completion also schedules reset if needed. */
-	return (fd_wait_for_completion(DRS->select_date + DP->select_delay,
-				       (timeout_fn) function));
+	return fd_wait_for_completion(DRS->select_date + DP->select_delay,
+				      function);
 }
 
 static void floppy_ready(void)
 {
-	CHECK_RESET;
+	if (FDCS->reset) {
+		reset_fdc();
+		return;
+	}
 	if (start_motor(floppy_ready))
 		return;
 	if (fdc_dtr())
 		return;
 
-#ifdef DCL_DEBUG
-	if (DP->flags & FD_DEBUG) {
-		DPRINT("calling disk change from floppy_ready\n");
-	}
-#endif
+	debug_dcl(DP->flags, "calling disk change from floppy_ready\n");
 	if (!(raw_cmd->flags & FD_RAW_NO_MOTOR) &&
 	    disk_change(current_drive) && !DP->select_delay)
-		twaddle();	/* this clears the dcl on certain drive/controller
-				 * combinations */
+		twaddle();	/* this clears the dcl on certain
+				 * drive/controller combinations */
 
 #ifdef fd_chose_dma_mode
 	if ((raw_cmd->flags & FD_RAW_READ) || (raw_cmd->flags & FD_RAW_WRITE)) {
@@ -1998,15 +1919,11 @@ static void floppy_ready(void)
 
 static void floppy_start(void)
 {
-	reschedule_timeout(current_reqD, "floppy start", 0);
+	reschedule_timeout(current_reqD, "floppy start");
 
 	scandrives();
-#ifdef DCL_DEBUG
-	if (DP->flags & FD_DEBUG) {
-		DPRINT("setting NEWCHANGE in floppy_start\n");
-	}
-#endif
-	SETF(FD_DISK_NEWCHANGE);
+	debug_dcl(DP->flags, "setting NEWCHANGE in floppy_start\n");
+	set_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags);
 	floppy_ready();
 }
 
@@ -2026,51 +1943,36 @@ static void floppy_start(void)
 
 static void do_wakeup(void)
 {
-	reschedule_timeout(MAXTIMEOUT, "do wakeup", 0);
+	reschedule_timeout(MAXTIMEOUT, "do wakeup");
 	cont = NULL;
 	command_status += 2;
 	wake_up(&command_done);
 }
 
-static struct cont_t wakeup_cont = {
+static const struct cont_t wakeup_cont = {
 	.interrupt	= empty,
 	.redo		= do_wakeup,
 	.error		= empty,
 	.done		= (done_f)empty
 };
 
-static struct cont_t intr_cont = {
+static const struct cont_t intr_cont = {
 	.interrupt	= empty,
 	.redo		= process_fd_request,
 	.error		= empty,
 	.done		= (done_f)empty
 };
 
-static int wait_til_done(void (*handler)(void), int interruptible)
+static int wait_til_done(void (*handler)(void), bool interruptible)
 {
 	int ret;
 
 	schedule_bh(handler);
 
-	if (command_status < 2 && NO_SIGNAL) {
-		DECLARE_WAITQUEUE(wait, current);
-
-		add_wait_queue(&command_done, &wait);
-		for (;;) {
-			set_current_state(interruptible ?
-					  TASK_INTERRUPTIBLE :
-					  TASK_UNINTERRUPTIBLE);
-
-			if (command_status >= 2 || !NO_SIGNAL)
-				break;
-
-			is_alive("wait_til_done");
-			schedule();
-		}
-
-		set_current_state(TASK_RUNNING);
-		remove_wait_queue(&command_done, &wait);
-	}
+	if (interruptible)
+		wait_event_interruptible(command_done, command_status >= 2);
+	else
+		wait_event(command_done, command_status >= 2);
 
 	if (command_status < 2) {
 		cancel_activity();
@@ -2180,9 +2082,9 @@ static void format_interrupt(void)
 	cont->redo();
 }
 
-#define CODE2SIZE (ssize = ((1 << SIZECODE) + 3) >> 2)
-#define FM_MODE(x,y) ((y) & ~(((x)->rate & 0x80) >>1))
+#define FM_MODE(x, y) ((y) & ~(((x)->rate & 0x80) >> 1))
 #define CT(x) ((x) | 0xc0)
+
 static void setup_format_params(int track)
 {
 	int n;
@@ -2197,8 +2099,8 @@ static void setup_format_params(int track)
 	raw_cmd = &default_raw_cmd;
 	raw_cmd->track = track;
 
-	raw_cmd->flags = FD_RAW_WRITE | FD_RAW_INTR | FD_RAW_SPIN |
-	    FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK;
+	raw_cmd->flags = (FD_RAW_WRITE | FD_RAW_INTR | FD_RAW_SPIN |
+			  FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK);
 	raw_cmd->rate = _floppy->rate & 0x43;
 	raw_cmd->cmd_count = NR_F;
 	COMMAND = FM_MODE(_floppy, FD_FORMAT);
@@ -2257,10 +2159,10 @@ static void redo_format(void)
 	buffer_track = -1;
 	setup_format_params(format_req.track << STRETCH(_floppy));
 	floppy_start();
-	debugt("queue format request");
+	debugt(__func__, "queue format request");
 }
 
-static struct cont_t format_cont = {
+static const struct cont_t format_cont = {
 	.interrupt	= format_interrupt,
 	.redo		= redo_format,
 	.error		= bad_flp_intr,
@@ -2271,7 +2173,9 @@ static int do_format(int drive, struct format_descr *tmp_format_req)
 {
 	int ret;
 
-	LOCK_FDC(drive, 1);
+	if (lock_fdc(drive, true))
+		return -EINTR;
+
 	set_floppy(drive);
 	if (!_floppy ||
 	    _floppy->track > DP->tracks ||
@@ -2286,7 +2190,9 @@ static int do_format(int drive, struct format_descr *tmp_format_req)
 	format_errors = 0;
 	cont = &format_cont;
 	errors = &format_errors;
-	IWAIT(redo_format);
+	ret = wait_til_done(redo_format, true);
+	if (ret == -EINTR)
+		return -EINTR;
 	process_fd_request();
 	return ret;
 }
@@ -2316,19 +2222,23 @@ static void floppy_end_request(struct request *req, int error)
  * logical buffer */
 static void request_done(int uptodate)
 {
-	struct request_queue *q = floppy_queue;
 	struct request *req = current_req;
+	struct request_queue *q;
 	unsigned long flags;
 	int block;
+	char msg[sizeof("request done ") + sizeof(int) * 3];
 
 	probing = 0;
-	reschedule_timeout(MAXTIMEOUT, "request done %d", uptodate);
+	snprintf(msg, sizeof(msg), "request done %d", uptodate);
+	reschedule_timeout(MAXTIMEOUT, msg);
 
 	if (!req) {
-		printk("floppy.c: no request in request_done\n");
+		pr_info("floppy.c: no request in request_done\n");
 		return;
 	}
 
+	q = req->q;
+
 	if (uptodate) {
 		/* maintain values for invalidation on geometry
 		 * change */
@@ -2377,7 +2287,7 @@ static void rw_interrupt(void)
 		DRS->first_read_date = jiffies;
 
 	nr_sectors = 0;
-	CODE2SIZE;
+	ssize = DIV_ROUND_UP(1 << SIZECODE, 4);
 
 	if (ST1 & ST1_EOC)
 		eoc = 1;
@@ -2393,20 +2303,18 @@ static void rw_interrupt(void)
 		       R_HEAD - HEAD) * SECT_PER_TRACK +
 		      R_SECTOR - SECTOR + eoc) << SIZECODE >> 2;
 
-#ifdef FLOPPY_SANITY_CHECK
 	if (nr_sectors / ssize >
 	    DIV_ROUND_UP(in_sector_offset + current_count_sectors, ssize)) {
 		DPRINT("long rw: %x instead of %lx\n",
 		       nr_sectors, current_count_sectors);
-		printk("rs=%d s=%d\n", R_SECTOR, SECTOR);
-		printk("rh=%d h=%d\n", R_HEAD, HEAD);
-		printk("rt=%d t=%d\n", R_TRACK, TRACK);
-		printk("heads=%d eoc=%d\n", heads, eoc);
-		printk("spt=%d st=%d ss=%d\n", SECT_PER_TRACK,
-		       fsector_t, ssize);
-		printk("in_sector_offset=%d\n", in_sector_offset);
+		pr_info("rs=%d s=%d\n", R_SECTOR, SECTOR);
+		pr_info("rh=%d h=%d\n", R_HEAD, HEAD);
+		pr_info("rt=%d t=%d\n", R_TRACK, TRACK);
+		pr_info("heads=%d eoc=%d\n", heads, eoc);
+		pr_info("spt=%d st=%d ss=%d\n",
+			SECT_PER_TRACK, fsector_t, ssize);
+		pr_info("in_sector_offset=%d\n", in_sector_offset);
 	}
-#endif
 
 	nr_sectors -= in_sector_offset;
 	INFBOUND(nr_sectors, 0);
@@ -2443,7 +2351,7 @@ static void rw_interrupt(void)
 	}
 
 	if (CT(COMMAND) != FD_READ ||
-	    raw_cmd->kernel_data == current_req->buffer) {
+	    raw_cmd->kernel_data == bio_data(current_req->bio)) {
 		/* transfer directly from buffer */
 		cont->done(1);
 	} else if (CT(COMMAND) == FD_READ) {
@@ -2457,7 +2365,7 @@ static void rw_interrupt(void)
 /* Compute maximal contiguous buffer size. */
 static int buffer_chain_size(void)
 {
-	struct bio_vec *bv;
+	struct bio_vec bv;
 	int size;
 	struct req_iterator iter;
 	char *base;
@@ -2466,10 +2374,10 @@ static int buffer_chain_size(void)
 	size = 0;
 
 	rq_for_each_segment(bv, current_req, iter) {
-		if (page_address(bv->bv_page) + bv->bv_offset != base + size)
+		if (page_address(bv.bv_page) + bv.bv_offset != base + size)
 			break;
 
-		size += bv->bv_len;
+		size += bv.bv_len;
 	}
 
 	return size >> 9;
@@ -2495,7 +2403,7 @@ static int transfer_size(int ssize, int max_sector, int max_size)
 static void copy_buffer(int ssize, int max_sector, int max_sector_2)
 {
 	int remaining;		/* number of transferred 512-byte sectors */
-	struct bio_vec *bv;
+	struct bio_vec bv;
 	char *buffer;
 	char *dma_buffer;
 	int size;
@@ -2511,19 +2419,17 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2)
 					      blk_rq_sectors(current_req));
 
 	remaining = current_count_sectors << 9;
-#ifdef FLOPPY_SANITY_CHECK
 	if (remaining > blk_rq_bytes(current_req) && CT(COMMAND) == FD_WRITE) {
 		DPRINT("in copy buffer\n");
-		printk("current_count_sectors=%ld\n", current_count_sectors);
-		printk("remaining=%d\n", remaining >> 9);
-		printk("current_req->nr_sectors=%u\n",
-		       blk_rq_sectors(current_req));
-		printk("current_req->current_nr_sectors=%u\n",
-		       blk_rq_cur_sectors(current_req));
-		printk("max_sector=%d\n", max_sector);
-		printk("ssize=%d\n", ssize);
+		pr_info("current_count_sectors=%ld\n", current_count_sectors);
+		pr_info("remaining=%d\n", remaining >> 9);
+		pr_info("current_req->nr_sectors=%u\n",
+			blk_rq_sectors(current_req));
+		pr_info("current_req->current_nr_sectors=%u\n",
+			blk_rq_cur_sectors(current_req));
+		pr_info("max_sector=%d\n", max_sector);
+		pr_info("ssize=%d\n", ssize);
 	}
-#endif
 
 	buffer_max = max(max_sector, buffer_max);
 
@@ -2535,30 +2441,28 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2)
 		if (!remaining)
 			break;
 
-		size = bv->bv_len;
+		size = bv.bv_len;
 		SUPBOUND(size, remaining);
 
-		buffer = page_address(bv->bv_page) + bv->bv_offset;
-#ifdef FLOPPY_SANITY_CHECK
+		buffer = page_address(bv.bv_page) + bv.bv_offset;
 		if (dma_buffer + size >
 		    floppy_track_buffer + (max_buffer_sectors << 10) ||
 		    dma_buffer < floppy_track_buffer) {
 			DPRINT("buffer overrun in copy buffer %d\n",
-			       (int)((floppy_track_buffer -
-				      dma_buffer) >> 9));
-			printk("fsector_t=%d buffer_min=%d\n",
-			       fsector_t, buffer_min);
-			printk("current_count_sectors=%ld\n",
-			       current_count_sectors);
+			       (int)((floppy_track_buffer - dma_buffer) >> 9));
+			pr_info("fsector_t=%d buffer_min=%d\n",
+				fsector_t, buffer_min);
+			pr_info("current_count_sectors=%ld\n",
+				current_count_sectors);
 			if (CT(COMMAND) == FD_READ)
-				printk("read\n");
+				pr_info("read\n");
 			if (CT(COMMAND) == FD_WRITE)
-				printk("write\n");
+				pr_info("write\n");
 			break;
 		}
 		if (((unsigned long)buffer) % 512)
 			DPRINT("%p buffer not aligned\n", buffer);
-#endif
+
 		if (CT(COMMAND) == FD_READ)
 			memcpy(buffer, dma_buffer, size);
 		else
@@ -2567,13 +2471,11 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2)
 		remaining -= size;
 		dma_buffer += size;
 	}
-#ifdef FLOPPY_SANITY_CHECK
 	if (remaining) {
 		if (remaining > 0)
 			max_sector -= remaining >> 9;
 		DPRINT("weirdness: remaining %d\n", remaining >> 9);
 	}
-#endif
 }
 
 /* work around a bug in pseudo DMA
@@ -2593,15 +2495,14 @@ static void virtualdmabug_workaround(void)
 
 		hard_sectors = raw_cmd->length >> (7 + SIZECODE);
 		end_sector = SECTOR + hard_sectors - 1;
-#ifdef FLOPPY_SANITY_CHECK
 		if (end_sector > SECT_PER_TRACK) {
-			printk("too many sectors %d > %d\n",
-			       end_sector, SECT_PER_TRACK);
+			pr_info("too many sectors %d > %d\n",
+				end_sector, SECT_PER_TRACK);
 			return;
 		}
-#endif
-		SECT_PER_TRACK = end_sector;	/* make sure SECT_PER_TRACK points
-						 * to end of transfer */
+		SECT_PER_TRACK = end_sector;
+					/* make sure SECT_PER_TRACK
+					 * points to end of transfer */
 	}
 }
 
@@ -2623,16 +2524,13 @@ static int make_raw_rw_request(void)
 	int tracksize;
 	int ssize;
 
-	if (max_buffer_sectors == 0) {
-		printk("VFS: Block I/O scheduled on unopened device\n");
+	if (WARN(max_buffer_sectors == 0, "VFS: Block I/O scheduled on unopened device\n"))
 		return 0;
-	}
 
 	set_fdc((long)current_req->rq_disk->private_data);
 
 	raw_cmd = &default_raw_cmd;
-	raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_DISK |
-	    FD_RAW_NEED_SEEK;
+	raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK;
 	raw_cmd->cmd_count = NR_RW;
 	if (rq_data_dir(current_req) == READ) {
 		raw_cmd->flags |= FD_RAW_READ;
@@ -2641,7 +2539,7 @@ static int make_raw_rw_request(void)
 		raw_cmd->flags |= FD_RAW_WRITE;
 		COMMAND = FM_MODE(_floppy, FD_WRITE);
 	} else {
-		DPRINT("make_raw_rw_request: unknown command\n");
+		DPRINT("%s: unknown command\n", __func__);
 		return 0;
 	}
 
@@ -2659,7 +2557,8 @@ static int make_raw_rw_request(void)
 	HEAD = fsector_t / _floppy->sect;
 
 	if (((_floppy->stretch & (FD_SWAPSIDES | FD_SECTBASEMASK)) ||
-	     TESTF(FD_NEED_TWADDLE)) && fsector_t < _floppy->sect)
+	     test_bit(FD_NEED_TWADDLE_BIT, &DRS->flags)) &&
+	    fsector_t < _floppy->sect)
 		max_sector = _floppy->sect;
 
 	/* 2M disks have phantom sectors on the first track */
@@ -2685,7 +2584,7 @@ static int make_raw_rw_request(void)
 	raw_cmd->track = TRACK << STRETCH(_floppy);
 	DR_SELECT = UNIT(current_drive) + PH_HEAD(_floppy, HEAD);
 	GAP = _floppy->gap;
-	CODE2SIZE;
+	ssize = DIV_ROUND_UP(1 << SIZECODE, 4);
 	SECT_PER_TRACK = _floppy->sect << 2 >> SIZECODE;
 	SECTOR = ((fsector_t % _floppy->sect) << 2 >> SIZECODE) +
 	    FD_SECTBASE(_floppy);
@@ -2730,8 +2629,10 @@ static int make_raw_rw_request(void)
 		}
 	} else if (in_sector_offset || blk_rq_sectors(current_req) < ssize) {
 		if (CT(COMMAND) == FD_WRITE) {
-			if (fsector_t + blk_rq_sectors(current_req) > ssize &&
-			    fsector_t + blk_rq_sectors(current_req) < ssize + ssize)
+			unsigned int sectors;
+
+			sectors = fsector_t + blk_rq_sectors(current_req);
+			if (sectors > ssize && sectors < ssize + ssize)
 				max_size = ssize + ssize;
 			else
 				max_size = ssize;
@@ -2739,7 +2640,7 @@ static int make_raw_rw_request(void)
 		raw_cmd->flags &= ~FD_RAW_WRITE;
 		raw_cmd->flags |= FD_RAW_READ;
 		COMMAND = FM_MODE(_floppy, FD_READ);
-	} else if ((unsigned long)current_req->buffer < MAX_DMA_ADDRESS) {
+	} else if ((unsigned long)bio_data(current_req->bio) < MAX_DMA_ADDRESS) {
 		unsigned long dma_limit;
 		int direct, indirect;
 
@@ -2752,16 +2653,14 @@ static int make_raw_rw_request(void)
 		 * on a 64 bit machine!
 		 */
 		max_size = buffer_chain_size();
-		dma_limit =
-		    (MAX_DMA_ADDRESS -
-		     ((unsigned long)current_req->buffer)) >> 9;
-		if ((unsigned long)max_size > dma_limit) {
+		dma_limit = (MAX_DMA_ADDRESS -
+			     ((unsigned long)bio_data(current_req->bio))) >> 9;
+		if ((unsigned long)max_size > dma_limit)
 			max_size = dma_limit;
-		}
 		/* 64 kb boundaries */
-		if (CROSS_64KB(current_req->buffer, max_size << 9))
+		if (CROSS_64KB(bio_data(current_req->bio), max_size << 9))
 			max_size = (K_64 -
-				    ((unsigned long)current_req->buffer) %
+				    ((unsigned long)bio_data(current_req->bio)) %
 				    K_64) >> 9;
 		direct = transfer_size(ssize, max_sector, max_size) - fsector_t;
 		/*
@@ -2773,16 +2672,16 @@ static int make_raw_rw_request(void)
 		 */
 		if (!direct ||
 		    (indirect * 2 > direct * 3 &&
-		     *errors < DP->max_errors.read_track && ((!probing
-		       || (DP->read_track & (1 << DRS->probed_format)))))) {
+		     *errors < DP->max_errors.read_track &&
+		     ((!probing ||
+		       (DP->read_track & (1 << DRS->probed_format)))))) {
 			max_size = blk_rq_sectors(current_req);
 		} else {
-			raw_cmd->kernel_data = current_req->buffer;
+			raw_cmd->kernel_data = bio_data(current_req->bio);
 			raw_cmd->length = current_count_sectors << 9;
 			if (raw_cmd->length == 0) {
-				DPRINT
-				    ("zero dma transfer attempted from make_raw_request\n");
-				DPRINT("indirect=%d direct=%d fsector_t=%d",
+				DPRINT("%s: zero dma transfer attempted\n", __func__);
+				DPRINT("indirect=%d direct=%d fsector_t=%d\n",
 				       indirect, direct, fsector_t);
 				return 0;
 			}
@@ -2802,25 +2701,22 @@ static int make_raw_rw_request(void)
 	    ((CT(COMMAND) == FD_READ ||
 	      (!in_sector_offset && blk_rq_sectors(current_req) >= ssize)) &&
 	     max_sector > 2 * max_buffer_sectors + buffer_min &&
-	     max_size + fsector_t > 2 * max_buffer_sectors + buffer_min)
-	    /* not enough space */
-	    ) {
+	     max_size + fsector_t > 2 * max_buffer_sectors + buffer_min)) {
+		/* not enough space */
 		buffer_track = -1;
 		buffer_drive = current_drive;
 		buffer_max = buffer_min = aligned_sector_t;
 	}
 	raw_cmd->kernel_data = floppy_track_buffer +
-	    ((aligned_sector_t - buffer_min) << 9);
+		((aligned_sector_t - buffer_min) << 9);
 
 	if (CT(COMMAND) == FD_WRITE) {
 		/* copy write buffer to track buffer.
 		 * if we get here, we know that the write
 		 * is either aligned or the data already in the buffer
 		 * (buffer will be overwritten) */
-#ifdef FLOPPY_SANITY_CHECK
 		if (in_sector_offset && buffer_track == -1)
 			DPRINT("internal error offset !=0 on write\n");
-#endif
 		buffer_track = raw_cmd->track;
 		buffer_drive = current_drive;
 		copy_buffer(ssize, max_sector,
@@ -2834,9 +2730,8 @@ static int make_raw_rw_request(void)
 	raw_cmd->length = in_sector_offset + current_count_sectors;
 	raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1;
 	raw_cmd->length <<= 9;
-#ifdef FLOPPY_SANITY_CHECK
 	if ((raw_cmd->length < current_count_sectors << 9) ||
-	    (raw_cmd->kernel_data != current_req->buffer &&
+	    (raw_cmd->kernel_data != bio_data(current_req->bio) &&
 	     CT(COMMAND) == FD_WRITE &&
 	     (aligned_sector_t + (raw_cmd->length >> 9) > buffer_max ||
 	      aligned_sector_t < buffer_min)) ||
@@ -2844,38 +2739,38 @@ static int make_raw_rw_request(void)
 	    raw_cmd->length <= 0 || current_count_sectors <= 0) {
 		DPRINT("fractionary current count b=%lx s=%lx\n",
 		       raw_cmd->length, current_count_sectors);
-		if (raw_cmd->kernel_data != current_req->buffer)
-			printk("addr=%d, length=%ld\n",
-			       (int)((raw_cmd->kernel_data -
-				      floppy_track_buffer) >> 9),
-			       current_count_sectors);
-		printk("st=%d ast=%d mse=%d msi=%d\n",
-		       fsector_t, aligned_sector_t, max_sector, max_size);
-		printk("ssize=%x SIZECODE=%d\n", ssize, SIZECODE);
-		printk("command=%x SECTOR=%d HEAD=%d, TRACK=%d\n",
-		       COMMAND, SECTOR, HEAD, TRACK);
-		printk("buffer drive=%d\n", buffer_drive);
-		printk("buffer track=%d\n", buffer_track);
-		printk("buffer_min=%d\n", buffer_min);
-		printk("buffer_max=%d\n", buffer_max);
+		if (raw_cmd->kernel_data != bio_data(current_req->bio))
+			pr_info("addr=%d, length=%ld\n",
+				(int)((raw_cmd->kernel_data -
+				       floppy_track_buffer) >> 9),
+				current_count_sectors);
+		pr_info("st=%d ast=%d mse=%d msi=%d\n",
+			fsector_t, aligned_sector_t, max_sector, max_size);
+		pr_info("ssize=%x SIZECODE=%d\n", ssize, SIZECODE);
+		pr_info("command=%x SECTOR=%d HEAD=%d, TRACK=%d\n",
+			COMMAND, SECTOR, HEAD, TRACK);
+		pr_info("buffer drive=%d\n", buffer_drive);
+		pr_info("buffer track=%d\n", buffer_track);
+		pr_info("buffer_min=%d\n", buffer_min);
+		pr_info("buffer_max=%d\n", buffer_max);
 		return 0;
 	}
 
-	if (raw_cmd->kernel_data != current_req->buffer) {
+	if (raw_cmd->kernel_data != bio_data(current_req->bio)) {
 		if (raw_cmd->kernel_data < floppy_track_buffer ||
 		    current_count_sectors < 0 ||
 		    raw_cmd->length < 0 ||
 		    raw_cmd->kernel_data + raw_cmd->length >
 		    floppy_track_buffer + (max_buffer_sectors << 10)) {
 			DPRINT("buffer overrun in schedule dma\n");
-			printk("fsector_t=%d buffer_min=%d current_count=%ld\n",
-			       fsector_t, buffer_min, raw_cmd->length >> 9);
-			printk("current_count_sectors=%ld\n",
-			       current_count_sectors);
+			pr_info("fsector_t=%d buffer_min=%d current_count=%ld\n",
+				fsector_t, buffer_min, raw_cmd->length >> 9);
+			pr_info("current_count_sectors=%ld\n",
+				current_count_sectors);
 			if (CT(COMMAND) == FD_READ)
-				printk("read\n");
+				pr_info("read\n");
 			if (CT(COMMAND) == FD_WRITE)
-				printk("write\n");
+				pr_info("write\n");
 			return 0;
 		}
 	} else if (raw_cmd->length > blk_rq_bytes(current_req) ||
@@ -2884,22 +2779,42 @@ static int make_raw_rw_request(void)
 		return 0;
 	} else if (raw_cmd->length < current_count_sectors << 9) {
 		DPRINT("more sectors than bytes\n");
-		printk("bytes=%ld\n", raw_cmd->length >> 9);
-		printk("sectors=%ld\n", current_count_sectors);
+		pr_info("bytes=%ld\n", raw_cmd->length >> 9);
+		pr_info("sectors=%ld\n", current_count_sectors);
 	}
 	if (raw_cmd->length == 0) {
 		DPRINT("zero dma transfer attempted from make_raw_request\n");
 		return 0;
 	}
-#endif
 
 	virtualdmabug_workaround();
 	return 2;
 }
 
+/*
+ * Round-robin between our available drives, doing one request from each
+ */
+static int set_next_request(void)
+{
+	struct request_queue *q;
+	int old_pos = fdc_queue;
+
+	do {
+		q = disks[fdc_queue]->queue;
+		if (++fdc_queue == N_DRIVE)
+			fdc_queue = 0;
+		if (q) {
+			current_req = blk_fetch_request(q);
+			if (current_req)
+				break;
+		}
+	} while (fdc_queue != old_pos);
+
+	return current_req != NULL;
+}
+
 static void redo_fd_request(void)
 {
-#define REPEAT {request_done(0); continue; }
 	int drive;
 	int tmp;
 
@@ -2907,66 +2822,65 @@ static void redo_fd_request(void)
 	if (current_drive < N_DRIVE)
 		floppy_off(current_drive);
 
-	for (;;) {
-		if (!current_req) {
-			struct request *req;
-
-			spin_lock_irq(floppy_queue->queue_lock);
-			req = blk_fetch_request(floppy_queue);
-			spin_unlock_irq(floppy_queue->queue_lock);
-			if (!req) {
-				do_floppy = NULL;
-				unlock_fdc();
-				return;
-			}
-			current_req = req;
-		}
-		drive = (long)current_req->rq_disk->private_data;
-		set_fdc(drive);
-		reschedule_timeout(current_reqD, "redo fd request", 0);
+do_request:
+	if (!current_req) {
+		int pending;
 
-		set_floppy(drive);
-		raw_cmd = &default_raw_cmd;
-		raw_cmd->flags = 0;
-		if (start_motor(redo_fd_request))
+		spin_lock_irq(&floppy_lock);
+		pending = set_next_request();
+		spin_unlock_irq(&floppy_lock);
+		if (!pending) {
+			do_floppy = NULL;
+			unlock_fdc();
 			return;
-		disk_change(current_drive);
-		if (test_bit(current_drive, &fake_change) ||
-		    TESTF(FD_DISK_CHANGED)) {
-			DPRINT("disk absent or changed during operation\n");
-			REPEAT;
-		}
-		if (!_floppy) {	/* Autodetection */
-			if (!probing) {
-				DRS->probed_format = 0;
-				if (next_valid_format()) {
-					DPRINT("no autodetectable formats\n");
-					_floppy = NULL;
-					REPEAT;
-				}
-			}
-			probing = 1;
-			_floppy =
-			    floppy_type + DP->autodetect[DRS->probed_format];
-		} else
-			probing = 0;
-		errors = &(current_req->errors);
-		tmp = make_raw_rw_request();
-		if (tmp < 2) {
-			request_done(tmp);
-			continue;
 		}
+	}
+	drive = (long)current_req->rq_disk->private_data;
+	set_fdc(drive);
+	reschedule_timeout(current_reqD, "redo fd request");
 
-		if (TESTF(FD_NEED_TWADDLE))
-			twaddle();
-		schedule_bh(floppy_start);
-		debugt("queue fd request");
+	set_floppy(drive);
+	raw_cmd = &default_raw_cmd;
+	raw_cmd->flags = 0;
+	if (start_motor(redo_fd_request))
 		return;
+
+	disk_change(current_drive);
+	if (test_bit(current_drive, &fake_change) ||
+	    test_bit(FD_DISK_CHANGED_BIT, &DRS->flags)) {
+		DPRINT("disk absent or changed during operation\n");
+		request_done(0);
+		goto do_request;
+	}
+	if (!_floppy) {	/* Autodetection */
+		if (!probing) {
+			DRS->probed_format = 0;
+			if (next_valid_format()) {
+				DPRINT("no autodetectable formats\n");
+				_floppy = NULL;
+				request_done(0);
+				goto do_request;
+			}
+		}
+		probing = 1;
+		_floppy = floppy_type + DP->autodetect[DRS->probed_format];
+	} else
+		probing = 0;
+	errors = &(current_req->errors);
+	tmp = make_raw_rw_request();
+	if (tmp < 2) {
+		request_done(tmp);
+		goto do_request;
 	}
-#undef REPEAT
+
+	if (test_bit(FD_NEED_TWADDLE_BIT, &DRS->flags))
+		twaddle();
+	schedule_bh(floppy_start);
+	debugt(__func__, "queue fd request");
+	return;
 }
 
-static struct cont_t rw_cont = {
+static const struct cont_t rw_cont = {
 	.interrupt	= rw_interrupt,
 	.redo		= redo_fd_request,
 	.error		= bad_flp_intr,
@@ -2979,57 +2893,50 @@ static void process_fd_request(void)
 	schedule_bh(redo_fd_request);
 }
 
-static void do_fd_request(struct request_queue * q)
+static void do_fd_request(struct request_queue *q)
 {
-	if (max_buffer_sectors == 0) {
-		printk("VFS: do_fd_request called on non-open device\n");
+	if (WARN(max_buffer_sectors == 0,
+		 "VFS: %s called on non-open device\n", __func__))
 		return;
-	}
 
-	if (usage_count == 0) {
-		printk("warning: usage count=0, current_req=%p exiting\n",
-		       current_req);
-		printk("sect=%ld type=%x flags=%x\n",
-		       (long)blk_rq_pos(current_req), current_req->cmd_type,
-		       current_req->cmd_flags);
+	if (WARN(atomic_read(&usage_count) == 0,
+		 "warning: usage count=0, current_req=%p sect=%ld type=%x flags=%llx\n",
+		 current_req, (long)blk_rq_pos(current_req), current_req->cmd_type,
+		 (unsigned long long) current_req->cmd_flags))
 		return;
-	}
-	if (test_bit(0, &fdc_busy)) {
+
+	if (test_and_set_bit(0, &fdc_busy)) {
 		/* fdc busy, this new request will be treated when the
 		   current one is done */
-		is_alive("do fd request, old request running");
+		is_alive(__func__, "old request running");
 		return;
 	}
-	lock_fdc(MAXTIMEOUT, 0);
+	command_status = FD_COMMAND_NONE;
+	__reschedule_timeout(MAXTIMEOUT, "fd_request");
+	set_fdc(0);
 	process_fd_request();
-	is_alive("do fd request");
+	is_alive(__func__, "");
 }
 
-static struct cont_t poll_cont = {
+static const struct cont_t poll_cont = {
 	.interrupt	= success_and_wakeup,
 	.redo		= floppy_ready,
 	.error		= generic_failure,
 	.done		= generic_done
 };
 
-static int poll_drive(int interruptible, int flag)
+static int poll_drive(bool interruptible, int flag)
 {
-	int ret;
-
 	/* no auto-sense, just clear dcl */
 	raw_cmd = &default_raw_cmd;
 	raw_cmd->flags = flag;
 	raw_cmd->track = 0;
 	raw_cmd->cmd_count = 0;
 	cont = &poll_cont;
-#ifdef DCL_DEBUG
-	if (DP->flags & FD_DEBUG) {
-		DPRINT("setting NEWCHANGE in poll_drive\n");
-	}
-#endif
-	SETF(FD_DISK_NEWCHANGE);
-	WAIT(floppy_ready);
-	return ret;
+	debug_dcl(DP->flags, "setting NEWCHANGE in poll_drive\n");
+	set_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags);
+
+	return wait_til_done(floppy_ready, interruptible);
 }
 
 /*
@@ -3039,30 +2946,33 @@ static int poll_drive(int interruptible, int flag)
 
 static void reset_intr(void)
 {
-	printk("weird, reset interrupt called\n");
+	pr_info("weird, reset interrupt called\n");
 }
 
-static struct cont_t reset_cont = {
+static const struct cont_t reset_cont = {
 	.interrupt	= reset_intr,
 	.redo		= success_and_wakeup,
 	.error		= generic_failure,
 	.done		= generic_done
 };
 
-static int user_reset_fdc(int drive, int arg, int interruptible)
+static int user_reset_fdc(int drive, int arg, bool interruptible)
 {
 	int ret;
 
-	ret = 0;
-	LOCK_FDC(drive, interruptible);
+	if (lock_fdc(drive, interruptible))
+		return -EINTR;
+
 	if (arg == FD_RESET_ALWAYS)
 		FDCS->reset = 1;
 	if (FDCS->reset) {
 		cont = &reset_cont;
-		WAIT(reset_fdc);
+		ret = wait_til_done(reset_fdc, interruptible);
+		if (ret == -EINTR)
+			return -EINTR;
 	}
 	process_fd_request();
-	return ret;
+	return 0;
 }
 
 /*
@@ -3075,18 +2985,13 @@ static inline int fd_copyout(void __user *param, const void *address,
 	return copy_to_user(param, address, size) ? -EFAULT : 0;
 }
 
-static inline int fd_copyin(void __user *param, void *address, unsigned long size)
+static inline int fd_copyin(void __user *param, void *address,
+			    unsigned long size)
 {
 	return copy_from_user(address, param, size) ? -EFAULT : 0;
 }
 
-#define _COPYOUT(x) (copy_to_user((void __user *)param, &(x), sizeof(x)) ? -EFAULT : 0)
-#define _COPYIN(x) (copy_from_user(&(x), (void __user *)param, sizeof(x)) ? -EFAULT : 0)
-
-#define COPYOUT(x) ECALL(_COPYOUT(x))
-#define COPYIN(x) ECALL(_COPYIN(x))
-
-static inline const char *drive_name(int type, int drive)
+static const char *drive_name(int type, int drive)
 {
 	struct floppy_struct *floppy;
 
@@ -3149,30 +3054,39 @@ static void raw_cmd_done(int flag)
 	generic_done(flag);
 }
 
-static struct cont_t raw_cmd_cont = {
+static const struct cont_t raw_cmd_cont = {
 	.interrupt	= success_and_wakeup,
 	.redo		= floppy_start,
 	.error		= generic_failure,
 	.done		= raw_cmd_done
 };
 
-static inline int raw_cmd_copyout(int cmd, char __user *param,
+static int raw_cmd_copyout(int cmd, void __user *param,
 				  struct floppy_raw_cmd *ptr)
 {
 	int ret;
 
 	while (ptr) {
-		COPYOUT(*ptr);
+		struct floppy_raw_cmd cmd = *ptr;
+		cmd.next = NULL;
+		cmd.kernel_data = NULL;
+		ret = copy_to_user(param, &cmd, sizeof(cmd));
+		if (ret)
+			return -EFAULT;
 		param += sizeof(struct floppy_raw_cmd);
 		if ((ptr->flags & FD_RAW_READ) && ptr->buffer_length) {
-			if (ptr->length >= 0
-			    && ptr->length <= ptr->buffer_length)
-				ECALL(fd_copyout
-				      (ptr->data, ptr->kernel_data,
-				       ptr->buffer_length - ptr->length));
+			if (ptr->length >= 0 &&
+			    ptr->length <= ptr->buffer_length) {
+				long length = ptr->buffer_length - ptr->length;
+				ret = fd_copyout(ptr->data, ptr->kernel_data,
+						 length);
+				if (ret)
+					return ret;
+			}
 		}
 		ptr = ptr->next;
 	}
+
 	return 0;
 }
 
@@ -3195,7 +3109,7 @@ static void raw_cmd_free(struct floppy_raw_cmd **ptr)
 	}
 }
 
-static inline int raw_cmd_copyin(int cmd, char __user *param,
+static int raw_cmd_copyin(int cmd, void __user *param,
 				 struct floppy_raw_cmd **rcmd)
 {
 	struct floppy_raw_cmd *ptr;
@@ -3203,17 +3117,20 @@ static inline int raw_cmd_copyin(int cmd, char __user *param,
 	int i;
 
 	*rcmd = NULL;
-	while (1) {
-		ptr = (struct floppy_raw_cmd *)
-		    kmalloc(sizeof(struct floppy_raw_cmd), GFP_USER);
-		if (!ptr)
-			return -ENOMEM;
-		*rcmd = ptr;
-		COPYIN(*ptr);
-		ptr->next = NULL;
-		ptr->buffer_length = 0;
-		param += sizeof(struct floppy_raw_cmd);
-		if (ptr->cmd_count > 33)
+
+loop:
+	ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_USER);
+	if (!ptr)
+		return -ENOMEM;
+	*rcmd = ptr;
+	ret = copy_from_user(ptr, param, sizeof(*ptr));
+	ptr->next = NULL;
+	ptr->buffer_length = 0;
+	ptr->kernel_data = NULL;
+	if (ret)
+		return -EFAULT;
+	param += sizeof(struct floppy_raw_cmd);
+	if (ptr->cmd_count > 33)
 			/* the command may now also take up the space
 			 * initially intended for the reply & the
 			 * reply count. Needed for long 82078 commands
@@ -3222,31 +3139,34 @@ static inline int raw_cmd_copyin(int cmd, char __user *param,
 			 * 16 bytes for a structure, you'll one day
 			 * discover that you really need 17...
 			 */
+		return -EINVAL;
+
+	for (i = 0; i < 16; i++)
+		ptr->reply[i] = 0;
+	ptr->resultcode = 0;
+
+	if (ptr->flags & (FD_RAW_READ | FD_RAW_WRITE)) {
+		if (ptr->length <= 0)
 			return -EINVAL;
+		ptr->kernel_data = (char *)fd_dma_mem_alloc(ptr->length);
+		fallback_on_nodma_alloc(&ptr->kernel_data, ptr->length);
+		if (!ptr->kernel_data)
+			return -ENOMEM;
+		ptr->buffer_length = ptr->length;
+	}
+	if (ptr->flags & FD_RAW_WRITE) {
+		ret = fd_copyin(ptr->data, ptr->kernel_data, ptr->length);
+		if (ret)
+			return ret;
+	}
 
-		for (i = 0; i < 16; i++)
-			ptr->reply[i] = 0;
-		ptr->resultcode = 0;
-		ptr->kernel_data = NULL;
-
-		if (ptr->flags & (FD_RAW_READ | FD_RAW_WRITE)) {
-			if (ptr->length <= 0)
-				return -EINVAL;
-			ptr->kernel_data =
-			    (char *)fd_dma_mem_alloc(ptr->length);
-			fallback_on_nodma_alloc(&ptr->kernel_data, ptr->length);
-			if (!ptr->kernel_data)
-				return -ENOMEM;
-			ptr->buffer_length = ptr->length;
-		}
-		if (ptr->flags & FD_RAW_WRITE)
-			ECALL(fd_copyin(ptr->data, ptr->kernel_data,
-					ptr->length));
+	if (ptr->flags & FD_RAW_MORE) {
 		rcmd = &(ptr->next);
-		if (!(ptr->flags & FD_RAW_MORE))
-			return 0;
 		ptr->rate &= 0x43;
+		goto loop;
 	}
+
+	return 0;
 }
 
 static int raw_cmd_ioctl(int cmd, void __user *param)
@@ -3283,12 +3203,8 @@ static int raw_cmd_ioctl(int cmd, void __user *param)
 
 	raw_cmd = my_raw_cmd;
 	cont = &raw_cmd_cont;
-	ret = wait_til_done(floppy_start, 1);
-#ifdef DCL_DEBUG
-	if (DP->flags & FD_DEBUG) {
-		DPRINT("calling disk change from raw_cmd ioctl\n");
-	}
-#endif
+	ret = wait_til_done(floppy_start, true);
+	debug_dcl(DP->flags, "calling disk change from raw_cmd ioctl\n");
 
 	if (ret != -EINTR && FDCS->reset)
 		ret = -EIO;
@@ -3311,7 +3227,7 @@ static int invalidate_drive(struct block_device *bdev)
 	return 0;
 }
 
-static inline int set_geometry(unsigned int cmd, struct floppy_struct *g,
+static int set_geometry(unsigned int cmd, struct floppy_struct *g,
 			       int drive, int type, struct block_device *bdev)
 {
 	int cnt;
@@ -3327,7 +3243,10 @@ static inline int set_geometry(unsigned int cmd, struct floppy_struct *g,
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 		mutex_lock(&open_lock);
-		LOCK_FDC(drive, 1);
+		if (lock_fdc(drive, true)) {
+			mutex_unlock(&open_lock);
+			return -EINTR;
+		}
 		floppy_type[type] = *g;
 		floppy_type[type].name = "user format";
 		for (cnt = type << 2; cnt < (type << 2) + 4; cnt++)
@@ -3338,16 +3257,20 @@ static inline int set_geometry(unsigned int cmd, struct floppy_struct *g,
 			struct block_device *bdev = opened_bdev[cnt];
 			if (!bdev || ITYPE(drive_state[cnt].fd_device) != type)
 				continue;
-			__invalidate_device(bdev);
+			__invalidate_device(bdev, true);
 		}
 		mutex_unlock(&open_lock);
 	} else {
 		int oldStretch;
-		LOCK_FDC(drive, 1);
-		if (cmd != FDDEFPRM)
+
+		if (lock_fdc(drive, true))
+			return -EINTR;
+		if (cmd != FDDEFPRM) {
 			/* notice a disk change immediately, else
 			 * we lose our settings immediately*/
-			CALL(poll_drive(1, FD_RAW_NEED_DISK));
+			if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR)
+				return -EINTR;
+		}
 		oldStretch = g->stretch;
 		user_params[drive] = *g;
 		if (buffer_drive == drive)
@@ -3375,7 +3298,7 @@ static inline int set_geometry(unsigned int cmd, struct floppy_struct *g,
 }
 
 /* handle obsolete ioctl's */
-static int ioctl_table[] = {
+static unsigned int ioctl_table[] = {
 	FDCLRPRM,
 	FDSETPRM,
 	FDDEFPRM,
@@ -3403,7 +3326,7 @@ static int ioctl_table[] = {
 	FDTWADDLE
 };
 
-static inline int normalize_ioctl(int *cmd, int *size)
+static int normalize_ioctl(unsigned int *cmd, int *size)
 {
 	int i;
 
@@ -3412,7 +3335,7 @@ static inline int normalize_ioctl(int *cmd, int *size)
 			*size = _IOC_SIZE(*cmd);
 			*cmd = ioctl_table[i];
 			if (*size > _IOC_SIZE(*cmd)) {
-				printk("ioctl not yet supported\n");
+				pr_info("ioctl not yet supported\n");
 				return -EFAULT;
 			}
 			return 0;
@@ -3426,8 +3349,10 @@ static int get_floppy_geometry(int drive, int type, struct floppy_struct **g)
 	if (type)
 		*g = &floppy_type[type];
 	else {
-		LOCK_FDC(drive, 0);
-		CALL(poll_drive(0, 0));
+		if (lock_fdc(drive, false))
+			return -EINTR;
+		if (poll_drive(false, 0) == -EINTR)
+			return -EINTR;
 		process_fd_request();
 		*g = current_type[drive];
 	}
@@ -3453,13 +3378,9 @@ static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static int fd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
+static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
 		    unsigned long param)
 {
-#define FD_IOCTL_ALLOWED (mode & (FMODE_WRITE|FMODE_WRITE_IOCTL))
-#define OUT(c,x) case c: outparam = (const char *) (x); break
-#define IN(c,x,tag) case c: *(x) = inparam. tag ; return 0
-
 	int drive = (long)bdev->bd_disk->private_data;
 	int type = ITYPE(UDRS->fd_device);
 	int i;
@@ -3471,153 +3392,183 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
 		struct floppy_max_errors max_errors;
 		struct floppy_drive_params dp;
 	} inparam;		/* parameters coming from user space */
-	const char *outparam;	/* parameters passed back to user space */
+	const void *outparam;	/* parameters passed back to user space */
 
 	/* convert compatibility eject ioctls into floppy eject ioctl.
 	 * We do this in order to provide a means to eject floppy disks before
 	 * installing the new fdutils package */
 	if (cmd == CDROMEJECT ||	/* CD-ROM eject */
-	    cmd == 0x6470 /* SunOS floppy eject */ ) {
+	    cmd == 0x6470) {		/* SunOS floppy eject */
 		DPRINT("obsolete eject ioctl\n");
 		DPRINT("please use floppycontrol --eject\n");
 		cmd = FDEJECT;
 	}
 
-	/* convert the old style command into a new style command */
-	if ((cmd & 0xff00) == 0x0200) {
-		ECALL(normalize_ioctl(&cmd, &size));
-	} else
+	if (!((cmd & 0xff00) == 0x0200))
 		return -EINVAL;
 
+	/* convert the old style command into a new style command */
+	ret = normalize_ioctl(&cmd, &size);
+	if (ret)
+		return ret;
+
 	/* permission checks */
-	if (((cmd & 0x40) && !FD_IOCTL_ALLOWED) ||
+	if (((cmd & 0x40) && !(mode & (FMODE_WRITE | FMODE_WRITE_IOCTL))) ||
 	    ((cmd & 0x80) && !capable(CAP_SYS_ADMIN)))
 		return -EPERM;
 
+	if (WARN_ON(size < 0 || size > sizeof(inparam)))
+		return -EINVAL;
+
 	/* copyin */
-	CLEARSTRUCT(&inparam);
-	if (_IOC_DIR(cmd) & _IOC_WRITE)
-	    ECALL(fd_copyin((void __user *)param, &inparam, size))
-
-		switch (cmd) {
-		case FDEJECT:
-			if (UDRS->fd_ref != 1)
-				/* somebody else has this drive open */
-				return -EBUSY;
-			LOCK_FDC(drive, 1);
-
-			/* do the actual eject. Fails on
-			 * non-Sparc architectures */
-			ret = fd_eject(UNIT(drive));
-
-			USETF(FD_DISK_CHANGED);
-			USETF(FD_VERIFY);
-			process_fd_request();
+	memset(&inparam, 0, sizeof(inparam));
+	if (_IOC_DIR(cmd) & _IOC_WRITE) {
+		ret = fd_copyin((void __user *)param, &inparam, size);
+		if (ret)
 			return ret;
-		case FDCLRPRM:
-			LOCK_FDC(drive, 1);
-			current_type[drive] = NULL;
-			floppy_sizes[drive] = MAX_DISK_SIZE << 1;
-			UDRS->keep_data = 0;
-			return invalidate_drive(bdev);
-		case FDSETPRM:
-		case FDDEFPRM:
-			return set_geometry(cmd, &inparam.g,
-					    drive, type, bdev);
-		case FDGETPRM:
-			ECALL(get_floppy_geometry(drive, type,
-						  (struct floppy_struct **)
-						  &outparam));
-			break;
-
-		case FDMSGON:
-			UDP->flags |= FTD_MSG;
-			return 0;
-		case FDMSGOFF:
-			UDP->flags &= ~FTD_MSG;
-			return 0;
-
-		case FDFMTBEG:
-			LOCK_FDC(drive, 1);
-			CALL(poll_drive(1, FD_RAW_NEED_DISK));
-			ret = UDRS->flags;
-			process_fd_request();
-			if (ret & FD_VERIFY)
-				return -ENODEV;
-			if (!(ret & FD_DISK_WRITABLE))
-				return -EROFS;
-			return 0;
-		case FDFMTTRK:
-			if (UDRS->fd_ref != 1)
-				return -EBUSY;
-			return do_format(drive, &inparam.f);
-		case FDFMTEND:
-		case FDFLUSH:
-			LOCK_FDC(drive, 1);
-			return invalidate_drive(bdev);
-
-		case FDSETEMSGTRESH:
-			UDP->max_errors.reporting =
-			    (unsigned short)(param & 0x0f);
-			return 0;
-			OUT(FDGETMAXERRS, &UDP->max_errors);
-			IN(FDSETMAXERRS, &UDP->max_errors, max_errors);
-
-		case FDGETDRVTYP:
-			outparam = drive_name(type, drive);
-			SUPBOUND(size, strlen(outparam) + 1);
-			break;
+	}
 
-			IN(FDSETDRVPRM, UDP, dp);
-			OUT(FDGETDRVPRM, UDP);
+	switch (cmd) {
+	case FDEJECT:
+		if (UDRS->fd_ref != 1)
+			/* somebody else has this drive open */
+			return -EBUSY;
+		if (lock_fdc(drive, true))
+			return -EINTR;
 
-		case FDPOLLDRVSTAT:
-			LOCK_FDC(drive, 1);
-			CALL(poll_drive(1, FD_RAW_NEED_DISK));
-			process_fd_request();
-			/* fall through */
-			OUT(FDGETDRVSTAT, UDRS);
+		/* do the actual eject. Fails on
+		 * non-Sparc architectures */
+		ret = fd_eject(UNIT(drive));
 
-		case FDRESET:
-			return user_reset_fdc(drive, (int)param, 1);
+		set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
+		set_bit(FD_VERIFY_BIT, &UDRS->flags);
+		process_fd_request();
+		return ret;
+	case FDCLRPRM:
+		if (lock_fdc(drive, true))
+			return -EINTR;
+		current_type[drive] = NULL;
+		floppy_sizes[drive] = MAX_DISK_SIZE << 1;
+		UDRS->keep_data = 0;
+		return invalidate_drive(bdev);
+	case FDSETPRM:
+	case FDDEFPRM:
+		return set_geometry(cmd, &inparam.g, drive, type, bdev);
+	case FDGETPRM:
+		ret = get_floppy_geometry(drive, type,
+					  (struct floppy_struct **)&outparam);
+		if (ret)
+			return ret;
+		break;
+	case FDMSGON:
+		UDP->flags |= FTD_MSG;
+		return 0;
+	case FDMSGOFF:
+		UDP->flags &= ~FTD_MSG;
+		return 0;
+	case FDFMTBEG:
+		if (lock_fdc(drive, true))
+			return -EINTR;
+		if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR)
+			return -EINTR;
+		ret = UDRS->flags;
+		process_fd_request();
+		if (ret & FD_VERIFY)
+			return -ENODEV;
+		if (!(ret & FD_DISK_WRITABLE))
+			return -EROFS;
+		return 0;
+	case FDFMTTRK:
+		if (UDRS->fd_ref != 1)
+			return -EBUSY;
+		return do_format(drive, &inparam.f);
+	case FDFMTEND:
+	case FDFLUSH:
+		if (lock_fdc(drive, true))
+			return -EINTR;
+		return invalidate_drive(bdev);
+	case FDSETEMSGTRESH:
+		UDP->max_errors.reporting = (unsigned short)(param & 0x0f);
+		return 0;
+	case FDGETMAXERRS:
+		outparam = &UDP->max_errors;
+		break;
+	case FDSETMAXERRS:
+		UDP->max_errors = inparam.max_errors;
+		break;
+	case FDGETDRVTYP:
+		outparam = drive_name(type, drive);
+		SUPBOUND(size, strlen((const char *)outparam) + 1);
+		break;
+	case FDSETDRVPRM:
+		*UDP = inparam.dp;
+		break;
+	case FDGETDRVPRM:
+		outparam = UDP;
+		break;
+	case FDPOLLDRVSTAT:
+		if (lock_fdc(drive, true))
+			return -EINTR;
+		if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR)
+			return -EINTR;
+		process_fd_request();
+		/* fall through */
+	case FDGETDRVSTAT:
+		outparam = UDRS;
+		break;
+	case FDRESET:
+		return user_reset_fdc(drive, (int)param, true);
+	case FDGETFDCSTAT:
+		outparam = UFDCS;
+		break;
+	case FDWERRORCLR:
+		memset(UDRWE, 0, sizeof(*UDRWE));
+		return 0;
+	case FDWERRORGET:
+		outparam = UDRWE;
+		break;
+	case FDRAWCMD:
+		if (type)
+			return -EINVAL;
+		if (lock_fdc(drive, true))
+			return -EINTR;
+		set_floppy(drive);
+		i = raw_cmd_ioctl(cmd, (void __user *)param);
+		if (i == -EINTR)
+			return -EINTR;
+		process_fd_request();
+		return i;
+	case FDTWADDLE:
+		if (lock_fdc(drive, true))
+			return -EINTR;
+		twaddle();
+		process_fd_request();
+		return 0;
+	default:
+		return -EINVAL;
+	}
 
-			OUT(FDGETFDCSTAT, UFDCS);
+	if (_IOC_DIR(cmd) & _IOC_READ)
+		return fd_copyout((void __user *)param, outparam, size);
 
-		case FDWERRORCLR:
-			CLEARSTRUCT(UDRWE);
-			return 0;
-			OUT(FDWERRORGET, UDRWE);
-
-		case FDRAWCMD:
-			if (type)
-				return -EINVAL;
-			LOCK_FDC(drive, 1);
-			set_floppy(drive);
-			CALL(i = raw_cmd_ioctl(cmd, (void __user *)param));
-			process_fd_request();
-			return i;
+	return 0;
+}
 
-		case FDTWADDLE:
-			LOCK_FDC(drive, 1);
-			twaddle();
-			process_fd_request();
-			return 0;
+static int fd_ioctl(struct block_device *bdev, fmode_t mode,
+			     unsigned int cmd, unsigned long param)
+{
+	int ret;
 
-		default:
-			return -EINVAL;
-		}
+	mutex_lock(&floppy_mutex);
+	ret = fd_locked_ioctl(bdev, mode, cmd, param);
+	mutex_unlock(&floppy_mutex);
 
-	if (_IOC_DIR(cmd) & _IOC_READ)
-		return fd_copyout((void __user *)param, outparam, size);
-	else
-		return 0;
-#undef OUT
-#undef IN
+	return ret;
 }
 
 static void __init config_types(void)
 {
-	int first = 1;
+	bool has_drive = false;
 	int drive;
 
 	/* read drive info out of physical CMOS */
@@ -3649,35 +3600,38 @@ static void __init config_types(void)
 			name = temparea;
 		}
 		if (name) {
-			const char *prepend = ",";
-			if (first) {
-				prepend = KERN_INFO "Floppy drive(s):";
-				first = 0;
+			const char *prepend;
+			if (!has_drive) {
+				prepend = "";
+				has_drive = true;
+				pr_info("Floppy drive(s):");
+			} else {
+				prepend = ",";
 			}
-			printk("%s fd%d is %s", prepend, drive, name);
+
+			pr_cont("%s fd%d is %s", prepend, drive, name);
 		}
 		*UDP = *params;
 	}
-	if (!first)
-		printk("\n");
+
+	if (has_drive)
+		pr_cont("\n");
 }
 
-static int floppy_release(struct gendisk *disk, fmode_t mode)
+static void floppy_release(struct gendisk *disk, fmode_t mode)
 {
 	int drive = (long)disk->private_data;
 
+	mutex_lock(&floppy_mutex);
 	mutex_lock(&open_lock);
-	if (UDRS->fd_ref < 0)
-		UDRS->fd_ref = 0;
-	else if (!UDRS->fd_ref--) {
+	if (!UDRS->fd_ref--) {
 		DPRINT("floppy_release with fd_ref == 0");
 		UDRS->fd_ref = 0;
 	}
 	if (!UDRS->fd_ref)
 		opened_bdev[drive] = NULL;
 	mutex_unlock(&open_lock);
-
-	return 0;
+	mutex_unlock(&floppy_mutex);
 }
 
 /*
@@ -3693,23 +3647,18 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 	int res = -EBUSY;
 	char *tmp;
 
+	mutex_lock(&floppy_mutex);
 	mutex_lock(&open_lock);
 	old_dev = UDRS->fd_device;
 	if (opened_bdev[drive] && opened_bdev[drive] != bdev)
 		goto out2;
 
 	if (!UDRS->fd_ref && (UDP->flags & FD_BROKEN_DCL)) {
-		USETF(FD_DISK_CHANGED);
-		USETF(FD_VERIFY);
+		set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
+		set_bit(FD_VERIFY_BIT, &UDRS->flags);
 	}
 
-	if (UDRS->fd_ref == -1 || (UDRS->fd_ref && (mode & FMODE_EXCL)))
-		goto out2;
-
-	if (mode & FMODE_EXCL)
-		UDRS->fd_ref = -1;
-	else
-		UDRS->fd_ref++;
+	UDRS->fd_ref++;
 
 	opened_bdev[drive] = bdev;
 
@@ -3729,9 +3678,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 			INFBOUND(try, 16);
 			tmp = (char *)fd_dma_mem_alloc(1024 * try);
 		}
-		if (!tmp && !floppy_track_buffer) {
+		if (!tmp && !floppy_track_buffer)
 			fallback_on_nodma_alloc(&tmp, 2048 * try);
-		}
 		if (!tmp && !floppy_track_buffer) {
 			DPRINT("Unable to allocate DMA memory\n");
 			goto out;
@@ -3760,49 +3708,55 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 	if (!(mode & FMODE_NDELAY)) {
 		if (mode & (FMODE_READ|FMODE_WRITE)) {
 			UDRS->last_checked = 0;
+			clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
 			check_disk_change(bdev);
-			if (UTESTF(FD_DISK_CHANGED))
+			if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags))
+				goto out;
+			if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags))
 				goto out;
 		}
 		res = -EROFS;
-		if ((mode & FMODE_WRITE) && !(UTESTF(FD_DISK_WRITABLE)))
+		if ((mode & FMODE_WRITE) &&
+		    !test_bit(FD_DISK_WRITABLE_BIT, &UDRS->flags))
 			goto out;
 	}
 	mutex_unlock(&open_lock);
+	mutex_unlock(&floppy_mutex);
 	return 0;
 out:
-	if (UDRS->fd_ref < 0)
-		UDRS->fd_ref = 0;
-	else
-		UDRS->fd_ref--;
+	UDRS->fd_ref--;
+
 	if (!UDRS->fd_ref)
 		opened_bdev[drive] = NULL;
 out2:
 	mutex_unlock(&open_lock);
+	mutex_unlock(&floppy_mutex);
 	return res;
 }
 
 /*
  * Check if the disk has been changed or if a change has been faked.
  */
-static int check_floppy_change(struct gendisk *disk)
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing)
 {
 	int drive = (long)disk->private_data;
 
-	if (UTESTF(FD_DISK_CHANGED) || UTESTF(FD_VERIFY))
-		return 1;
+	if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
+	    test_bit(FD_VERIFY_BIT, &UDRS->flags))
+		return DISK_EVENT_MEDIA_CHANGE;
 
 	if (time_after(jiffies, UDRS->last_checked + UDP->checkfreq)) {
-		lock_fdc(drive, 0);
-		poll_drive(0, 0);
+		lock_fdc(drive, false);
+		poll_drive(false, 0);
 		process_fd_request();
 	}
 
-	if (UTESTF(FD_DISK_CHANGED) ||
-	    UTESTF(FD_VERIFY) ||
+	if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
+	    test_bit(FD_VERIFY_BIT, &UDRS->flags) ||
 	    test_bit(drive, &fake_change) ||
-	    (!ITYPE(UDRS->fd_device) && !current_type[drive]))
-		return 1;
+	    drive_no_geom(drive))
+		return DISK_EVENT_MEDIA_CHANGE;
 	return 0;
 }
 
@@ -3812,18 +3766,29 @@ static int check_floppy_change(struct gendisk *disk)
  * a disk in the drive, and whether that disk is writable.
  */
 
-static void floppy_rb0_complete(struct bio *bio,
-			       int err)
+struct rb0_cbdata {
+	int drive;
+	struct completion complete;
+};
+
+static void floppy_rb0_cb(struct bio *bio, int err)
 {
-	complete((struct completion *)bio->bi_private);
+	struct rb0_cbdata *cbdata = (struct rb0_cbdata *)bio->bi_private;
+	int drive = cbdata->drive;
+
+	if (err) {
+		pr_info("floppy: error %d while reading block 0\n", err);
+		set_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
+	}
+	complete(&cbdata->complete);
 }
 
-static int __floppy_read_block_0(struct block_device *bdev)
+static int __floppy_read_block_0(struct block_device *bdev, int drive)
 {
 	struct bio bio;
 	struct bio_vec bio_vec;
-	struct completion complete;
 	struct page *page;
+	struct rb0_cbdata cbdata;
 	size_t size;
 
 	page = alloc_page(GFP_NOIO);
@@ -3836,24 +3801,26 @@ static int __floppy_read_block_0(struct block_device *bdev)
 	if (!size)
 		size = 1024;
 
+	cbdata.drive = drive;
+
 	bio_init(&bio);
 	bio.bi_io_vec = &bio_vec;
 	bio_vec.bv_page = page;
 	bio_vec.bv_len = size;
 	bio_vec.bv_offset = 0;
 	bio.bi_vcnt = 1;
-	bio.bi_idx = 0;
-	bio.bi_size = size;
+	bio.bi_iter.bi_size = size;
 	bio.bi_bdev = bdev;
-	bio.bi_sector = 0;
-	init_completion(&complete);
-	bio.bi_private = &complete;
-	bio.bi_end_io = floppy_rb0_complete;
+	bio.bi_iter.bi_sector = 0;
+	bio.bi_flags |= (1 << BIO_QUIET);
+	/* init before submit_bio(): floppy_rb0_cb() may complete() at once */
+	init_completion(&cbdata.complete);
+	bio.bi_private = &cbdata;
+	bio.bi_end_io = floppy_rb0_cb;
 
 	submit_bio(READ, &bio);
-	generic_unplug_device(bdev_get_queue(bdev));
 	process_fd_request();
-	wait_for_completion(&complete);
+	wait_for_completion(&cbdata.complete);
 
 	__free_page(page);
 
@@ -3867,19 +3834,21 @@ static int __floppy_read_block_0(struct block_device *bdev)
 static int floppy_revalidate(struct gendisk *disk)
 {
 	int drive = (long)disk->private_data;
-#define NO_GEOM (!current_type[drive] && !ITYPE(UDRS->fd_device))
 	int cf;
 	int res = 0;
 
-	if (UTESTF(FD_DISK_CHANGED) ||
-	    UTESTF(FD_VERIFY) || test_bit(drive, &fake_change) || NO_GEOM) {
-		if (usage_count == 0) {
-			printk("VFS: revalidate called on non-open device.\n");
+	if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
+	    test_bit(FD_VERIFY_BIT, &UDRS->flags) ||
+	    test_bit(drive, &fake_change) ||
+	    drive_no_geom(drive)) {
+		if (WARN(atomic_read(&usage_count) == 0,
+			 "VFS: revalidate called on non-open device.\n"))
 			return -EFAULT;
-		}
-		lock_fdc(drive, 0);
-		cf = UTESTF(FD_DISK_CHANGED) || UTESTF(FD_VERIFY);
-		if (!(cf || test_bit(drive, &fake_change) || NO_GEOM)) {
+
+		lock_fdc(drive, false);
+		cf = (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
+		      test_bit(FD_VERIFY_BIT, &UDRS->flags));
+		if (!(cf || test_bit(drive, &fake_change) || drive_no_geom(drive))) {
 			process_fd_request();	/*already done by another thread */
 			return 0;
 		}
@@ -3888,15 +3857,15 @@ static int floppy_revalidate(struct gendisk *disk)
 		if (buffer_drive == drive)
 			buffer_track = -1;
 		clear_bit(drive, &fake_change);
-		UCLEARF(FD_DISK_CHANGED);
+		clear_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
 		if (cf)
 			UDRS->generation++;
-		if (NO_GEOM) {
+		if (drive_no_geom(drive)) {
 			/* auto-sensing */
-			res = __floppy_read_block_0(opened_bdev[drive]);
+			res = __floppy_read_block_0(opened_bdev[drive], drive);
 		} else {
 			if (cf)
-				poll_drive(0, FD_RAW_NEED_DISK);
+				poll_drive(false, FD_RAW_NEED_DISK);
 			process_fd_request();
 		}
 	}
@@ -3904,13 +3873,13 @@ static int floppy_revalidate(struct gendisk *disk)
 	return res;
 }
 
-static struct block_device_operations floppy_fops = {
+static const struct block_device_operations floppy_fops = {
 	.owner			= THIS_MODULE,
 	.open			= floppy_open,
 	.release		= floppy_release,
-	.locked_ioctl		= fd_ioctl,
+	.ioctl			= fd_ioctl,
 	.getgeo			= fd_getgeo,
-	.media_changed		= check_floppy_change,
+	.check_events		= floppy_check_events,
 	.revalidate_disk	= floppy_revalidate,
 };
 
@@ -3928,21 +3897,21 @@ static char __init get_fdc_version(void)
 	output_byte(FD_DUMPREGS);	/* 82072 and better know DUMPREGS */
 	if (FDCS->reset)
 		return FDC_NONE;
-	if ((r = result()) <= 0x00)
+	r = result();
+	if (r <= 0x00)
 		return FDC_NONE;	/* No FDC present ??? */
 	if ((r == 1) && (reply_buffer[0] == 0x80)) {
-		printk(KERN_INFO "FDC %d is an 8272A\n", fdc);
+		pr_info("FDC %d is an 8272A\n", fdc);
 		return FDC_8272A;	/* 8272a/765 don't know DUMPREGS */
 	}
 	if (r != 10) {
-		printk
-		    ("FDC %d init: DUMPREGS: unexpected return of %d bytes.\n",
-		     fdc, r);
+		pr_info("FDC %d init: DUMPREGS: unexpected return of %d bytes.\n",
+			fdc, r);
 		return FDC_UNKNOWN;
 	}
 
 	if (!fdc_configure()) {
-		printk(KERN_INFO "FDC %d is an 82072\n", fdc);
+		pr_info("FDC %d is an 82072\n", fdc);
 		return FDC_82072;	/* 82072 doesn't know CONFIGURE */
 	}
 
@@ -3950,52 +3919,50 @@ static char __init get_fdc_version(void)
 	if (need_more_output() == MORE_OUTPUT) {
 		output_byte(0);
 	} else {
-		printk(KERN_INFO "FDC %d is an 82072A\n", fdc);
+		pr_info("FDC %d is an 82072A\n", fdc);
 		return FDC_82072A;	/* 82072A as found on Sparcs. */
 	}
 
 	output_byte(FD_UNLOCK);
 	r = result();
 	if ((r == 1) && (reply_buffer[0] == 0x80)) {
-		printk(KERN_INFO "FDC %d is a pre-1991 82077\n", fdc);
-		return FDC_82077_ORIG;	/* Pre-1991 82077, doesn't know 
+		pr_info("FDC %d is a pre-1991 82077\n", fdc);
+		return FDC_82077_ORIG;	/* Pre-1991 82077, doesn't know
 					 * LOCK/UNLOCK */
 	}
 	if ((r != 1) || (reply_buffer[0] != 0x00)) {
-		printk("FDC %d init: UNLOCK: unexpected return of %d bytes.\n",
-		       fdc, r);
+		pr_info("FDC %d init: UNLOCK: unexpected return of %d bytes.\n",
+			fdc, r);
 		return FDC_UNKNOWN;
 	}
 	output_byte(FD_PARTID);
 	r = result();
 	if (r != 1) {
-		printk("FDC %d init: PARTID: unexpected return of %d bytes.\n",
-		       fdc, r);
+		pr_info("FDC %d init: PARTID: unexpected return of %d bytes.\n",
+			fdc, r);
 		return FDC_UNKNOWN;
 	}
 	if (reply_buffer[0] == 0x80) {
-		printk(KERN_INFO "FDC %d is a post-1991 82077\n", fdc);
+		pr_info("FDC %d is a post-1991 82077\n", fdc);
 		return FDC_82077;	/* Revised 82077AA passes all the tests */
 	}
 	switch (reply_buffer[0] >> 5) {
 	case 0x0:
 		/* Either a 82078-1 or a 82078SL running at 5Volt */
-		printk(KERN_INFO "FDC %d is an 82078.\n", fdc);
+		pr_info("FDC %d is an 82078.\n", fdc);
 		return FDC_82078;
 	case 0x1:
-		printk(KERN_INFO "FDC %d is a 44pin 82078\n", fdc);
+		pr_info("FDC %d is a 44pin 82078\n", fdc);
 		return FDC_82078;
 	case 0x2:
-		printk(KERN_INFO "FDC %d is a S82078B\n", fdc);
+		pr_info("FDC %d is a S82078B\n", fdc);
 		return FDC_S82078B;
 	case 0x3:
-		printk(KERN_INFO "FDC %d is a National Semiconductor PC87306\n",
-		       fdc);
+		pr_info("FDC %d is a National Semiconductor PC87306\n", fdc);
 		return FDC_87306;
 	default:
-		printk(KERN_INFO
-		       "FDC %d init: 82078 variant with unknown PARTID=%d.\n",
-		       fdc, reply_buffer[0] >> 5);
+		pr_info("FDC %d init: 82078 variant with unknown PARTID=%d.\n",
+			fdc, reply_buffer[0] >> 5);
 		return FDC_82078_UNKN;
 	}
 }				/* get_fdc_version */
@@ -4107,9 +4074,9 @@ static int __init floppy_setup(char *str)
 				else
 					param = config_params[i].def_param;
 				if (config_params[i].fn)
-					config_params[i].
-					    fn(ints, param,
-					       config_params[i].param2);
+					config_params[i].fn(ints, param,
+							    config_params[i].
+							    param2);
 				if (config_params[i].var) {
 					DPRINT("%s=%d\n", str, param);
 					*config_params[i].var = param;
@@ -4123,8 +4090,8 @@ static int __init floppy_setup(char *str)
 
 		DPRINT("allowed options are:");
 		for (i = 0; i < ARRAY_SIZE(config_params); i++)
-			printk(" %s", config_params[i].name);
-		printk("\n");
+			pr_cont(" %s", config_params[i].name);
+		pr_cont("\n");
 	} else
 		DPRINT("botched floppy option\n");
 	DPRINT("Read Documentation/blockdev/floppy.txt\n");
@@ -4142,38 +4109,51 @@ static ssize_t floppy_cmos_show(struct device *dev,
 	drive = p->id;
 	return sprintf(buf, "%X\n", UDP->cmos);
 }
-DEVICE_ATTR(cmos,S_IRUGO,floppy_cmos_show,NULL);
+
+static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL);
 
 static void floppy_device_release(struct device *dev)
 {
 }
 
-static int floppy_resume(struct platform_device *dev)
+static int floppy_resume(struct device *dev)
 {
 	int fdc;
 
 	for (fdc = 0; fdc < N_FDC; fdc++)
 		if (FDCS->address != -1)
-			user_reset_fdc(-1, FD_RESET_ALWAYS, 0);
+			user_reset_fdc(-1, FD_RESET_ALWAYS, false);
 
 	return 0;
 }
 
-static struct platform_driver floppy_driver = {
+static const struct dev_pm_ops floppy_pm_ops = {
 	.resume = floppy_resume,
+	.restore = floppy_resume,
+};
+
+static struct platform_driver floppy_driver = {
 	.driver = {
-		.name = "floppy",
+		   .name = "floppy",
+		   .pm = &floppy_pm_ops,
 	},
 };
 
 static struct platform_device floppy_device[N_DRIVE];
 
+static bool floppy_available(int drive)
+{
+	if (!(allowed_drive_mask & (1 << drive)))
+		return false;
+	if (fdc_state[FDC(drive)].version == FDC_NONE)
+		return false;
+	return true;
+}
+
 static struct kobject *floppy_find(dev_t dev, int *part, void *data)
 {
 	int drive = (*part & 3) | ((*part & 0x80) >> 5);
-	if (drive >= N_DRIVE ||
-	    !(allowed_drive_mask & (1 << drive)) ||
-	    fdc_state[FDC(drive)].version == FDC_NONE)
+	if (drive >= N_DRIVE || !floppy_available(drive))
 		return NULL;
 	if (((*part >> 2) & 0x1f) >= ARRAY_SIZE(floppy_type))
 		return NULL;
@@ -4181,10 +4161,12 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data)
 	return get_disk(disks[drive]);
 }
 
-static int __init floppy_init(void)
+static int __init do_floppy_init(void)
 {
-	int i, unit, drive;
-	int err, dr;
+	int i, unit, drive, err;
+
+	set_debugt();
+	interruptjiffies = resultjiffies = jiffies;
 
 #if defined(CONFIG_PPC)
 	if (check_legacy_ioport(FDC1))
@@ -4193,21 +4175,32 @@ static int __init floppy_init(void)
 
 	raw_cmd = NULL;
 
-	for (dr = 0; dr < N_DRIVE; dr++) {
-		disks[dr] = alloc_disk(1);
-		if (!disks[dr]) {
+	floppy_wq = alloc_ordered_workqueue("floppy", 0);
+	if (!floppy_wq)
+		return -ENOMEM;
+
+	for (drive = 0; drive < N_DRIVE; drive++) {
+		disks[drive] = alloc_disk(1);
+		if (!disks[drive]) {
 			err = -ENOMEM;
 			goto out_put_disk;
 		}
 
-		disks[dr]->major = FLOPPY_MAJOR;
-		disks[dr]->first_minor = TOMINOR(dr);
-		disks[dr]->fops = &floppy_fops;
-		sprintf(disks[dr]->disk_name, "fd%d", dr);
+		disks[drive]->queue = blk_init_queue(do_fd_request, &floppy_lock);
+		if (!disks[drive]->queue) {
+			err = -ENOMEM;
+			goto out_put_disk;
+		}
+
+		blk_queue_max_hw_sectors(disks[drive]->queue, 64);
+		disks[drive]->major = FLOPPY_MAJOR;
+		disks[drive]->first_minor = TOMINOR(drive);
+		disks[drive]->fops = &floppy_fops;
+		sprintf(disks[drive]->disk_name, "fd%d", drive);
 
-		init_timer(&motor_off_timer[dr]);
-		motor_off_timer[dr].data = dr;
-		motor_off_timer[dr].function = motor_off_callback;
+		init_timer(&motor_off_timer[drive]);
+		motor_off_timer[drive].data = drive;
+		motor_off_timer[drive].function = motor_off_callback;
 	}
 
 	err = register_blkdev(FLOPPY_MAJOR, "fd");
@@ -4218,13 +4211,6 @@ static int __init floppy_init(void)
 	if (err)
 		goto out_unreg_blkdev;
 
-	floppy_queue = blk_init_queue(do_fd_request, &floppy_lock);
-	if (!floppy_queue) {
-		err = -ENOMEM;
-		goto out_unreg_driver;
-	}
-	blk_queue_max_sectors(floppy_queue, 64);
-
 	blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE,
 			    floppy_find, NULL, NULL);
 
@@ -4234,16 +4220,16 @@ static int __init floppy_init(void)
 		else
 			floppy_sizes[i] = MAX_DISK_SIZE << 1;
 
-	reschedule_timeout(MAXTIMEOUT, "floppy init", MAXTIMEOUT);
+	reschedule_timeout(MAXTIMEOUT, "floppy init");
 	config_types();
 
 	for (i = 0; i < N_FDC; i++) {
 		fdc = i;
-		CLEARSTRUCT(FDCS);
+		memset(FDCS, 0, sizeof(*FDCS));
 		FDCS->dtr = -1;
 		FDCS->dor = 0x4;
 #if defined(__sparc__) || defined(__mc68000__)
-		/*sparcs/sun3x don't have a DOR reset which we can fall back on to */
+	/*sparcs/sun3x don't have a DOR reset which we can fall back on to */
 #ifdef __mc68000__
 		if (MACH_IS_SUN3X)
 #endif
@@ -4254,7 +4240,7 @@ static int __init floppy_init(void)
 	use_virtual_dma = can_use_virtual_dma & 1;
 	fdc_state[0].address = FDC1;
 	if (fdc_state[0].address == -1) {
-		del_timer(&fd_timeout);
+		cancel_delayed_work(&fd_timeout);
 		err = -ENODEV;
 		goto out_unreg_region;
 	}
@@ -4265,18 +4251,18 @@ static int __init floppy_init(void)
 	fdc = 0;		/* reset fdc in case of unexpected interrupt */
 	err = floppy_grab_irq_and_dma();
 	if (err) {
-		del_timer(&fd_timeout);
+		cancel_delayed_work(&fd_timeout);
 		err = -EBUSY;
 		goto out_unreg_region;
 	}
 
 	/* initialise drive state */
 	for (drive = 0; drive < N_DRIVE; drive++) {
-		CLEARSTRUCT(UDRS);
-		CLEARSTRUCT(UDRWE);
-		USETF(FD_DISK_NEWCHANGE);
-		USETF(FD_DISK_CHANGED);
-		USETF(FD_VERIFY);
+		memset(UDRS, 0, sizeof(*UDRS));
+		memset(UDRWE, 0, sizeof(*UDRWE));
+		set_bit(FD_DISK_NEWCHANGE_BIT, &UDRS->flags);
+		set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
+		set_bit(FD_VERIFY_BIT, &UDRS->flags);
 		UDRS->fd_device = -1;
 		floppy_track_buffer = NULL;
 		max_buffer_sectors = 0;
@@ -4296,7 +4282,7 @@ static int __init floppy_init(void)
 		if (FDCS->address == -1)
 			continue;
 		FDCS->rawcmd = 2;
-		if (user_reset_fdc(-1, FD_RESET_ALWAYS, 0)) {
+		if (user_reset_fdc(-1, FD_RESET_ALWAYS, false)) {
 			/* free ioports reserved by floppy_grab_irq_and_dma() */
 			floppy_release_regions(fdc);
 			FDCS->address = -1;
@@ -4319,22 +4305,20 @@ static int __init floppy_init(void)
 		 * properly, so force a reset for the standard FDC clones,
 		 * to avoid interrupt garbage.
 		 */
-		user_reset_fdc(-1, FD_RESET_ALWAYS, 0);
+		user_reset_fdc(-1, FD_RESET_ALWAYS, false);
 	}
 	fdc = 0;
-	del_timer(&fd_timeout);
+	cancel_delayed_work(&fd_timeout);
 	current_drive = 0;
-	initialising = 0;
+	initialized = true;
 	if (have_no_fdc) {
 		DPRINT("no floppy controllers found\n");
 		err = have_no_fdc;
-		goto out_flush_work;
+		goto out_release_dma;
 	}
 
 	for (drive = 0; drive < N_DRIVE; drive++) {
-		if (!(allowed_drive_mask & (1 << drive)))
-			continue;
-		if (fdc_state[FDC(drive)].version == FDC_NONE)
+		if (!floppy_available(drive))
 			continue;
 
 		floppy_device[drive].name = floppy_device_name;
@@ -4343,15 +4327,15 @@ static int __init floppy_init(void)
 
 		err = platform_device_register(&floppy_device[drive]);
 		if (err)
-			goto out_flush_work;
+			goto out_remove_drives;
 
-		err = device_create_file(&floppy_device[drive].dev,&dev_attr_cmos);
+		err = device_create_file(&floppy_device[drive].dev,
+					 &dev_attr_cmos);
 		if (err)
 			goto out_unreg_platform_dev;
 
 		/* to be cleaned up... */
 		disks[drive]->private_data = (void *)(long)drive;
-		disks[drive]->queue = floppy_queue;
 		disks[drive]->flags |= GENHD_FL_REMOVABLE;
 		disks[drive]->driverfs_dev = &floppy_device[drive].dev;
 		add_disk(disks[drive]);
@@ -4361,26 +4345,54 @@ static int __init floppy_init(void)
 
 out_unreg_platform_dev:
 	platform_device_unregister(&floppy_device[drive]);
-out_flush_work:
-	flush_scheduled_work();
-	if (usage_count)
+out_remove_drives:
+	while (drive--) {
+		if (floppy_available(drive)) {
+			del_gendisk(disks[drive]);
+			device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
+			platform_device_unregister(&floppy_device[drive]);
+		}
+	}
+out_release_dma:
+	if (atomic_read(&usage_count))
 		floppy_release_irq_and_dma();
 out_unreg_region:
 	blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
-	blk_cleanup_queue(floppy_queue);
-out_unreg_driver:
 	platform_driver_unregister(&floppy_driver);
 out_unreg_blkdev:
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
 out_put_disk:
-	while (dr--) {
-		del_timer(&motor_off_timer[dr]);
-		put_disk(disks[dr]);
+	destroy_workqueue(floppy_wq);
+	for (drive = 0; drive < N_DRIVE; drive++) {
+		if (!disks[drive])
+			break;
+		if (disks[drive]->queue) {
+			del_timer_sync(&motor_off_timer[drive]);
+			blk_cleanup_queue(disks[drive]->queue);
+			disks[drive]->queue = NULL;
+		}
+		put_disk(disks[drive]);
 	}
 	return err;
 }
 
-static DEFINE_SPINLOCK(floppy_usage_lock);
+#ifndef MODULE
+static __init void floppy_async_init(void *data, async_cookie_t cookie)
+{
+	do_floppy_init();
+}
+#endif
+
+static int __init floppy_init(void)
+{
+#ifdef MODULE
+	return do_floppy_init();
+#else
+	/* Don't hold up the bootup by the floppy initialization */
+	async_schedule(floppy_async_init, NULL);
+	return 0;
+#endif
+}
 
 static const struct io_region {
 	int offset;
@@ -4409,8 +4421,10 @@ static int floppy_request_regions(int fdc)
 	const struct io_region *p;
 
 	for (p = io_regions; p < ARRAY_END(io_regions); p++) {
-		if (!request_region(FDCS->address + p->offset, p->size, "floppy")) {
-			DPRINT("Floppy io-port 0x%04lx in use\n", FDCS->address + p->offset);
+		if (!request_region(FDCS->address + p->offset,
+				    p->size, "floppy")) {
+			DPRINT("Floppy io-port 0x%04lx in use\n",
+			       FDCS->address + p->offset);
 			floppy_release_allocated_regions(fdc, p);
 			return -EBUSY;
 		}
@@ -4425,27 +4439,19 @@ static void floppy_release_regions(int fdc)
 
 static int floppy_grab_irq_and_dma(void)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&floppy_usage_lock, flags);
-	if (usage_count++) {
-		spin_unlock_irqrestore(&floppy_usage_lock, flags);
+	if (atomic_inc_return(&usage_count) > 1)
 		return 0;
-	}
-	spin_unlock_irqrestore(&floppy_usage_lock, flags);
 
 	/*
 	 * We might have scheduled a free_irq(), wait it to
 	 * drain first:
 	 */
-	flush_scheduled_work();
+	flush_workqueue(floppy_wq);
 
 	if (fd_request_irq()) {
 		DPRINT("Unable to grab IRQ%d for the floppy driver\n",
 		       FLOPPY_IRQ);
-		spin_lock_irqsave(&floppy_usage_lock, flags);
-		usage_count--;
-		spin_unlock_irqrestore(&floppy_usage_lock, flags);
+		atomic_dec(&usage_count);
 		return -1;
 	}
 	if (fd_request_dma()) {
@@ -4455,9 +4461,7 @@ static int floppy_grab_irq_and_dma(void)
 			use_virtual_dma = can_use_virtual_dma = 1;
 		if (!(can_use_virtual_dma & 1)) {
 			fd_free_irq();
-			spin_lock_irqsave(&floppy_usage_lock, flags);
-			usage_count--;
-			spin_unlock_irqrestore(&floppy_usage_lock, flags);
+			atomic_dec(&usage_count);
 			return -1;
 		}
 	}
@@ -4492,30 +4496,22 @@ cleanup:
 	fd_free_dma();
 	while (--fdc >= 0)
 		floppy_release_regions(fdc);
-	spin_lock_irqsave(&floppy_usage_lock, flags);
-	usage_count--;
-	spin_unlock_irqrestore(&floppy_usage_lock, flags);
+	atomic_dec(&usage_count);
 	return -1;
 }
 
 static void floppy_release_irq_and_dma(void)
 {
 	int old_fdc;
-#ifdef FLOPPY_SANITY_CHECK
 #ifndef __sparc__
 	int drive;
 #endif
-#endif
 	long tmpsize;
 	unsigned long tmpaddr;
-	unsigned long flags;
 
-	spin_lock_irqsave(&floppy_usage_lock, flags);
-	if (--usage_count) {
-		spin_unlock_irqrestore(&floppy_usage_lock, flags);
+	if (!atomic_dec_and_test(&usage_count))
 		return;
-	}
-	spin_unlock_irqrestore(&floppy_usage_lock, flags);
+
 	if (irqdma_allocated) {
 		fd_disable_dma();
 		fd_free_dma();
@@ -4526,7 +4522,6 @@ static void floppy_release_irq_and_dma(void)
 #if N_FDC > 1
 	set_dor(1, ~8, 0);
 #endif
-	floppy_enable_hlt();
 
 	if (floppy_track_buffer && max_buffer_sectors) {
 		tmpsize = max_buffer_sectors * 1024;
@@ -4536,20 +4531,18 @@ static void floppy_release_irq_and_dma(void)
 		buffer_min = buffer_max = -1;
 		fd_dma_mem_free(tmpaddr, tmpsize);
 	}
-#ifdef FLOPPY_SANITY_CHECK
 #ifndef __sparc__
 	for (drive = 0; drive < N_FDC * 4; drive++)
 		if (timer_pending(motor_off_timer + drive))
-			printk("motor off timer %d still active\n", drive);
+			pr_info("motor off timer %d still active\n", drive);
 #endif
 
-	if (timer_pending(&fd_timeout))
-		printk("floppy timer still active:%s\n", timeout_message);
-	if (timer_pending(&fd_timer))
-		printk("auxiliary floppy timer still active\n");
+	if (delayed_work_pending(&fd_timeout))
+		pr_info("floppy timer still active:%s\n", timeout_message);
+	if (delayed_work_pending(&fd_timer))
+		pr_info("auxiliary floppy timer still active\n");
 	if (work_pending(&floppy_work))
-		printk("work still pending\n");
-#endif
+		pr_info("work still pending\n");
 	old_fdc = fdc;
 	for (fdc = 0; fdc < N_FDC; fdc++)
 		if (FDCS->address != -1)
@@ -4566,7 +4559,9 @@ static void __init parse_floppy_cfg_string(char *cfg)
 	char *ptr;
 
 	while (*cfg) {
-		for (ptr = cfg; *cfg && *cfg != ' ' && *cfg != '\t'; cfg++) ;
+		ptr = cfg;
+		while (*cfg && *cfg != ' ' && *cfg != '\t')
+			cfg++;
 		if (*cfg) {
 			*cfg = '\0';
 			cfg++;
@@ -4592,28 +4587,39 @@ static void __exit floppy_module_exit(void)
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
 	platform_driver_unregister(&floppy_driver);
 
+	destroy_workqueue(floppy_wq);
+
 	for (drive = 0; drive < N_DRIVE; drive++) {
 		del_timer_sync(&motor_off_timer[drive]);
 
-		if ((allowed_drive_mask & (1 << drive)) &&
-		    fdc_state[FDC(drive)].version != FDC_NONE) {
+		if (floppy_available(drive)) {
 			del_gendisk(disks[drive]);
 			device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
 			platform_device_unregister(&floppy_device[drive]);
 		}
+		blk_cleanup_queue(disks[drive]->queue);
+
+		/*
+		 * These disks have not called add_disk().  Don't put down
+		 * queue reference in put_disk().
+		 */
+		if (!(allowed_drive_mask & (1 << drive)) ||
+		    fdc_state[FDC(drive)].version == FDC_NONE)
+			disks[drive]->queue = NULL;
+
 		put_disk(disks[drive]);
 	}
 
-	del_timer_sync(&fd_timeout);
-	del_timer_sync(&fd_timer);
-	blk_cleanup_queue(floppy_queue);
+	cancel_delayed_work_sync(&fd_timeout);
+	cancel_delayed_work_sync(&fd_timer);
 
-	if (usage_count)
+	if (atomic_read(&usage_count))
 		floppy_release_irq_and_dma();
 
 	/* eject disk, if any */
 	fd_eject(0);
 }
+
 module_exit(floppy_module_exit);
 
 module_param(floppy, charp, 0);
@@ -4625,9 +4631,10 @@ MODULE_LICENSE("GPL");
 
 /* This doesn't actually get used other than for module information */
 static const struct pnp_device_id floppy_pnpids[] = {
-	{ "PNP0700", 0 },
-	{ }
+	{"PNP0700", 0},
+	{}
 };
+
 MODULE_DEVICE_TABLE(pnp, floppy_pnpids);
 
 #else
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index f65b3f369eb..8a290c08262 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -34,7 +34,6 @@
 #include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/genhd.h>
-#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/ioport.h>
 #include <linux/init.h>
@@ -45,7 +44,6 @@
 #define HD_IRQ 14
 
 #define REALLY_SLOW_IO
-#include <asm/system.h>
 #include <asm/io.h>
 #include <asm/uaccess.h>
 
@@ -100,8 +98,6 @@ static DEFINE_SPINLOCK(hd_lock);
 static struct request_queue *hd_queue;
 static struct request *hd_req;
 
-#define MAJOR_NR HD_MAJOR
-
 #define TIMEOUT_VALUE	(6*HZ)
 #define	HD_DELAY	0
 
@@ -158,7 +154,7 @@ else \
 
 #if (HD_DELAY > 0)
 
-#include <asm/i8253.h>
+#include <linux/i8253.h>
 
 unsigned long last_req;
 
@@ -167,12 +163,12 @@ unsigned long read_timer(void)
 	unsigned long t, flags;
 	int i;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	t = jiffies * 11932;
 	outb_p(0, 0x43);
 	i = inb_p(0x40);
 	i |= inb(0x40) << 8;
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 	return(t - i);
 }
 #endif
@@ -468,11 +464,11 @@ static void read_intr(void)
 
 ok_to_read:
 	req = hd_req;
-	insw(HD_DATA, req->buffer, 256);
+	insw(HD_DATA, bio_data(req->bio), 256);
 #ifdef DEBUG
 	printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
 	       req->rq_disk->disk_name, blk_rq_pos(req) + 1,
-	       blk_rq_sectors(req) - 1, req->buffer+512);
+	       blk_rq_sectors(req) - 1, bio_data(req->bio)+512);
 #endif
 	if (hd_end_request(0, 512)) {
 		SET_HANDLER(&read_intr);
@@ -509,7 +505,7 @@ static void write_intr(void)
 ok_to_write:
 	if (hd_end_request(0, 512)) {
 		SET_HANDLER(&write_intr);
-		outsw(HD_DATA, req->buffer, 256);
+		outsw(HD_DATA, bio_data(req->bio), 256);
 		return;
 	}
 
@@ -628,9 +624,9 @@ repeat:
 	printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n",
 		req->rq_disk->disk_name,
 		req_data_dir(req) == READ ? "read" : "writ",
-		cyl, head, sec, nsect, req->buffer);
+		cyl, head, sec, nsect, bio_data(req->bio));
 #endif
-	if (blk_fs_request(req)) {
+	if (req->cmd_type == REQ_TYPE_FS) {
 		switch (rq_data_dir(req)) {
 		case READ:
 			hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_READ,
@@ -647,7 +643,7 @@ repeat:
 				bad_rw_intr();
 				goto repeat;
 			}
-			outsw(HD_DATA, req->buffer, 256);
+			outsw(HD_DATA, bio_data(req->bio), 256);
 			break;
 		default:
 			printk("unknown hd-command\n");
@@ -694,7 +690,7 @@ static irqreturn_t hd_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static struct block_device_operations hd_fops = {
+static const struct block_device_operations hd_fops = {
 	.getgeo =	hd_getgeo,
 };
 
@@ -712,16 +708,16 @@ static int __init hd_init(void)
 {
 	int drive;
 
-	if (register_blkdev(MAJOR_NR, "hd"))
+	if (register_blkdev(HD_MAJOR, "hd"))
 		return -1;
 
 	hd_queue = blk_init_queue(do_hd_request, &hd_lock);
 	if (!hd_queue) {
-		unregister_blkdev(MAJOR_NR, "hd");
+		unregister_blkdev(HD_MAJOR, "hd");
 		return -ENOMEM;
 	}
 
-	blk_queue_max_sectors(hd_queue, 255);
+	blk_queue_max_hw_sectors(hd_queue, 255);
 	init_timer(&device_timer);
 	device_timer.function = hd_times_out;
 	blk_queue_logical_block_size(hd_queue, 512);
@@ -736,7 +732,7 @@ static int __init hd_init(void)
 		 * the BIOS or CMOS.  This doesn't work all that well,
 		 * since this assumes that this is a primary or secondary
 		 * drive, and if we're using this legacy driver, it's
-		 * probably an auxilliary controller added to recover
+		 * probably an auxiliary controller added to recover
 		 * legacy data off an ST-506 drive.  Either way, it's
 		 * definitely safest to have the user explicitly specify
 		 * the information.
@@ -751,7 +747,7 @@ static int __init hd_init(void)
 		struct hd_i_struct *p = &hd_info[drive];
 		if (!disk)
 			goto Enomem;
-		disk->major = MAJOR_NR;
+		disk->major = HD_MAJOR;
 		disk->first_minor = drive << 6;
 		disk->fops = &hd_fops;
 		sprintf(disk->disk_name, "hd%c", 'a'+drive);
@@ -795,7 +791,7 @@ out1:
 	NR_HD = 0;
 out:
 	del_timer(&device_timer);
-	unregister_blkdev(MAJOR_NR, "hd");
+	unregister_blkdev(HD_MAJOR, "hd");
 	blk_cleanup_queue(hd_queue);
 	return -1;
 Enomem:
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 801f4ab8330..6cb1beb47c2 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -61,25 +61,26 @@
 #include <linux/blkdev.h>
 #include <linux/blkpg.h>
 #include <linux/init.h>
-#include <linux/smp_lock.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
-#include <linux/loop.h>
 #include <linux/compat.h>
 #include <linux/suspend.h>
 #include <linux/freezer.h>
+#include <linux/mutex.h>
 #include <linux/writeback.h>
-#include <linux/buffer_head.h>		/* for invalidate_bdev() */
 #include <linux/completion.h>
 #include <linux/highmem.h>
-#include <linux/gfp.h>
 #include <linux/kthread.h>
 #include <linux/splice.h>
+#include <linux/sysfs.h>
+#include <linux/miscdevice.h>
+#include <linux/falloc.h>
+#include "loop.h"
 
 #include <asm/uaccess.h>
 
-static LIST_HEAD(loop_devices);
-static DEFINE_MUTEX(loop_devices_mutex);
+static DEFINE_IDR(loop_index_idr);
+static DEFINE_MUTEX(loop_index_mutex);
 
 static int max_part;
 static int part_shift;
@@ -92,16 +93,16 @@ static int transfer_none(struct loop_device *lo, int cmd,
 			 struct page *loop_page, unsigned loop_off,
 			 int size, sector_t real_block)
 {
-	char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
-	char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;
+	char *raw_buf = kmap_atomic(raw_page) + raw_off;
+	char *loop_buf = kmap_atomic(loop_page) + loop_off;
 
 	if (cmd == READ)
 		memcpy(loop_buf, raw_buf, size);
 	else
 		memcpy(raw_buf, loop_buf, size);
 
-	kunmap_atomic(raw_buf, KM_USER0);
-	kunmap_atomic(loop_buf, KM_USER1);
+	kunmap_atomic(loop_buf);
+	kunmap_atomic(raw_buf);
 	cond_resched();
 	return 0;
 }
@@ -111,8 +112,8 @@ static int transfer_xor(struct loop_device *lo, int cmd,
 			struct page *loop_page, unsigned loop_off,
 			int size, sector_t real_block)
 {
-	char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
-	char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;
+	char *raw_buf = kmap_atomic(raw_page) + raw_off;
+	char *loop_buf = kmap_atomic(loop_page) + loop_off;
 	char *in, *out, *key;
 	int i, keysize;
 
@@ -129,8 +130,8 @@ static int transfer_xor(struct loop_device *lo, int cmd,
 	for (i = 0; i < size; i++)
 		*out++ = *in++ ^ key[(i & 511) % keysize];
 
-	kunmap_atomic(raw_buf, KM_USER0);
-	kunmap_atomic(loop_buf, KM_USER1);
+	kunmap_atomic(loop_buf);
+	kunmap_atomic(raw_buf);
 	cond_resched();
 	return 0;
 }
@@ -159,17 +160,20 @@ static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
 	&xor_funcs
 };
 
-static loff_t get_loop_size(struct loop_device *lo, struct file *file)
+static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file)
 {
-	loff_t size, offset, loopsize;
+	loff_t loopsize;
 
 	/* Compute loopsize in bytes */
-	size = i_size_read(file->f_mapping->host);
-	offset = lo->lo_offset;
-	loopsize = size - offset;
-	if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize)
-		loopsize = lo->lo_sizelimit;
+	loopsize = i_size_read(file->f_mapping->host);
+	if (offset > 0)
+		loopsize -= offset;
+	/* offset is beyond i_size, weird but possible */
+	if (loopsize < 0)
+		return 0;
 
+	if (sizelimit > 0 && sizelimit < loopsize)
+		loopsize = sizelimit;
 	/*
 	 * Unfortunately, if we want to do I/O on the device,
 	 * the number of 512-byte sectors has to fit into a sector_t.
@@ -177,17 +181,29 @@ static loff_t get_loop_size(struct loop_device *lo, struct file *file)
 	return loopsize >> 9;
 }
 
+static loff_t get_loop_size(struct loop_device *lo, struct file *file)
+{
+	return get_size(lo->lo_offset, lo->lo_sizelimit, file);
+}
+
 static int
-figure_loop_size(struct loop_device *lo)
+figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit)
 {
-	loff_t size = get_loop_size(lo, lo->lo_backing_file);
+	loff_t size = get_size(offset, sizelimit, lo->lo_backing_file);
 	sector_t x = (sector_t)size;
+	struct block_device *bdev = lo->lo_device;
 
 	if (unlikely((loff_t)x != size))
 		return -EFBIG;
-
+	if (lo->lo_offset != offset)
+		lo->lo_offset = offset;
+	if (lo->lo_sizelimit != sizelimit)
+		lo->lo_sizelimit = sizelimit;
 	set_capacity(lo->lo_disk, x);
-	return 0;					
+	bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9);
+	/* let user-space know about the new size */
+	kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
+	return 0;
 }
 
 static inline int
@@ -203,72 +219,6 @@ lo_do_transfer(struct loop_device *lo, int cmd,
 }
 
 /**
- * do_lo_send_aops - helper for writing data to a loop device
- *
- * This is the fast version for backing filesystems which implement the address
- * space operations write_begin and write_end.
- */
-static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
-		loff_t pos, struct page *unused)
-{
-	struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
-	struct address_space *mapping = file->f_mapping;
-	pgoff_t index;
-	unsigned offset, bv_offs;
-	int len, ret;
-
-	mutex_lock(&mapping->host->i_mutex);
-	index = pos >> PAGE_CACHE_SHIFT;
-	offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1);
-	bv_offs = bvec->bv_offset;
-	len = bvec->bv_len;
-	while (len > 0) {
-		sector_t IV;
-		unsigned size, copied;
-		int transfer_result;
-		struct page *page;
-		void *fsdata;
-
-		IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
-		size = PAGE_CACHE_SIZE - offset;
-		if (size > len)
-			size = len;
-
-		ret = pagecache_write_begin(file, mapping, pos, size, 0,
-							&page, &fsdata);
-		if (ret)
-			goto fail;
-
-		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
-				bvec->bv_page, bv_offs, size, IV);
-		copied = size;
-		if (unlikely(transfer_result))
-			copied = 0;
-
-		ret = pagecache_write_end(file, mapping, pos, size, copied,
-							page, fsdata);
-		if (ret < 0 || ret != copied)
-			goto fail;
-
-		if (unlikely(transfer_result))
-			goto fail;
-
-		bv_offs += copied;
-		len -= copied;
-		offset = 0;
-		index++;
-		pos += copied;
-	}
-	ret = 0;
-out:
-	mutex_unlock(&mapping->host->i_mutex);
-	return ret;
-fail:
-	ret = -1;
-	goto out;
-}
-
-/**
  * __do_lo_send_write - helper for writing data to a loop device
  *
  * This helper just factors out common code between do_lo_send_direct_write()
@@ -280,12 +230,14 @@ static int __do_lo_send_write(struct file *file,
 	ssize_t bw;
 	mm_segment_t old_fs = get_fs();
 
+	file_start_write(file);
 	set_fs(get_ds());
 	bw = file->f_op->write(file, buf, len, &pos);
 	set_fs(old_fs);
+	file_end_write(file);
 	if (likely(bw == len))
 		return 0;
-	printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n",
+	printk_ratelimited(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n",
 			(unsigned long long)pos, len);
 	if (bw >= 0)
 		bw = -EIO;
@@ -295,10 +247,8 @@ static int __do_lo_send_write(struct file *file,
 /**
  * do_lo_send_direct_write - helper for writing data to a loop device
  *
- * This is the fast, non-transforming version for backing filesystems which do
- * not implement the address space operations write_begin and write_end.
- * It uses the write file operation which should be present on all writeable
- * filesystems.
+ * This is the fast, non-transforming version that does not need double
+ * buffering.
  */
 static int do_lo_send_direct_write(struct loop_device *lo,
 		struct bio_vec *bvec, loff_t pos, struct page *page)
@@ -314,15 +264,9 @@ static int do_lo_send_direct_write(struct loop_device *lo,
 /**
  * do_lo_send_write - helper for writing data to a loop device
  *
- * This is the slow, transforming version for filesystems which do not
- * implement the address space operations write_begin and write_end.  It
- * uses the write file operation which should be present on all writeable
- * filesystems.
- *
- * Using fops->write is slower than using aops->{prepare,commit}_write in the
- * transforming case because we need to double buffer the data as we cannot do
- * the transformations in place as we do not have direct access to the
- * destination pages of the backing file.
+ * This is the slow, transforming version that needs to double buffer the
+ * data as it cannot do the transformations in place without having direct
+ * access to the destination pages of the backing file.
  */
 static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
 		loff_t pos, struct page *page)
@@ -333,7 +277,7 @@ static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
 		return __do_lo_send_write(lo->lo_backing_file,
 				page_address(page), bvec->bv_len,
 				pos);
-	printk(KERN_ERR "loop: Transfer error at byte offset %llu, "
+	printk_ratelimited(KERN_ERR "loop: Transfer error at byte offset %llu, "
 			"length %i.\n", (unsigned long long)pos, bvec->bv_len);
 	if (ret > 0)
 		ret = -EIO;
@@ -344,26 +288,26 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
 {
 	int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t,
 			struct page *page);
-	struct bio_vec *bvec;
+	struct bio_vec bvec;
+	struct bvec_iter iter;
 	struct page *page = NULL;
-	int i, ret = 0;
+	int ret = 0;
 
-	do_lo_send = do_lo_send_aops;
-	if (!(lo->lo_flags & LO_FLAGS_USE_AOPS)) {
+	if (lo->transfer != transfer_none) {
+		page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
+		if (unlikely(!page))
+			goto fail;
+		kmap(page);
+		do_lo_send = do_lo_send_write;
+	} else {
 		do_lo_send = do_lo_send_direct_write;
-		if (lo->transfer != transfer_none) {
-			page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
-			if (unlikely(!page))
-				goto fail;
-			kmap(page);
-			do_lo_send = do_lo_send_write;
-		}
 	}
-	bio_for_each_segment(bvec, bio, i) {
-		ret = do_lo_send(lo, bvec, pos, page);
+
+	bio_for_each_segment(bvec, bio, iter) {
+		ret = do_lo_send(lo, &bvec, pos, page);
 		if (ret < 0)
 			break;
-		pos += bvec->bv_len;
+		pos += bvec.bv_len;
 	}
 	if (page) {
 		kunmap(page);
@@ -372,7 +316,7 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
 out:
 	return ret;
 fail:
-	printk(KERN_ERR "loop: Failed to allocate temporary page for write.\n");
+	printk_ratelimited(KERN_ERR "loop: Failed to allocate temporary page for write.\n");
 	ret = -ENOMEM;
 	goto out;
 }
@@ -392,11 +336,7 @@ lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	struct loop_device *lo = p->lo;
 	struct page *page = buf->page;
 	sector_t IV;
-	int size, ret;
-
-	ret = buf->ops->confirm(pipe, buf);
-	if (unlikely(ret))
-		return ret;
+	int size;
 
 	IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9)) +
 							(buf->offset >> 9);
@@ -405,7 +345,7 @@ lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 		size = p->bsize;
 
 	if (lo_do_transfer(lo, READ, page, buf->offset, p->page, p->offset, size, IV)) {
-		printk(KERN_ERR "loop: transfer error block %ld\n",
+		printk_ratelimited(KERN_ERR "loop: transfer error block %ld\n",
 		       page->index);
 		size = -EINVAL;
 	}
@@ -424,14 +364,14 @@ lo_direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd)
 	return __splice_from_pipe(pipe, sd, lo_splice_actor);
 }
 
-static int
+static ssize_t
 do_lo_receive(struct loop_device *lo,
 	      struct bio_vec *bvec, int bsize, loff_t pos)
 {
 	struct lo_read_data cookie;
 	struct splice_desc sd;
 	struct file *file;
-	long retval;
+	ssize_t retval;
 
 	cookie.lo = lo;
 	cookie.page = bvec->bv_page;
@@ -447,25 +387,28 @@ do_lo_receive(struct loop_device *lo,
 	file = lo->lo_backing_file;
 	retval = splice_direct_to_actor(file, &sd, lo_direct_splice_actor);
 
-	if (retval < 0)
-		return retval;
-
-	return 0;
+	return retval;
 }
 
 static int
 lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
 {
-	struct bio_vec *bvec;
-	int i, ret = 0;
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+	ssize_t s;
 
-	bio_for_each_segment(bvec, bio, i) {
-		ret = do_lo_receive(lo, bvec, bsize, pos);
-		if (ret < 0)
+	bio_for_each_segment(bvec, bio, iter) {
+		s = do_lo_receive(lo, &bvec, bsize, pos);
+		if (s < 0)
+			return s;
+
+		if (s != bvec.bv_len) {
+			zero_fill_bio(bio);
 			break;
-		pos += bvec->bv_len;
+		}
+		pos += bvec.bv_len;
 	}
-	return ret;
+	return 0;
 }
 
 static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
@@ -473,30 +416,47 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
 	loff_t pos;
 	int ret;
 
-	pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
+	pos = ((loff_t) bio->bi_iter.bi_sector << 9) + lo->lo_offset;
 
 	if (bio_rw(bio) == WRITE) {
-		int barrier = bio_barrier(bio);
 		struct file *file = lo->lo_backing_file;
 
-		if (barrier) {
-			if (unlikely(!file->f_op->fsync)) {
-				ret = -EOPNOTSUPP;
+		if (bio->bi_rw & REQ_FLUSH) {
+			ret = vfs_fsync(file, 0);
+			if (unlikely(ret && ret != -EINVAL)) {
+				ret = -EIO;
 				goto out;
 			}
+		}
 
-			ret = vfs_fsync(file, file->f_path.dentry, 0);
-			if (unlikely(ret)) {
-				ret = -EIO;
+		/*
+		 * We use punch hole to reclaim the free space used by the
+		 * image a.k.a. discard. However we do not support discard if
+		 * encryption is enabled, because it may give an attacker
+		 * useful information.
+		 */
+		if (bio->bi_rw & REQ_DISCARD) {
+			struct file *file = lo->lo_backing_file;
+			int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+
+			if ((!file->f_op->fallocate) ||
+			    lo->lo_encrypt_key_size) {
+				ret = -EOPNOTSUPP;
 				goto out;
 			}
+			ret = file->f_op->fallocate(file, mode, pos,
+						    bio->bi_iter.bi_size);
+			if (unlikely(ret && ret != -EINVAL &&
+				     ret != -EOPNOTSUPP))
+				ret = -EIO;
+			goto out;
 		}
 
 		ret = lo_send(lo, bio, pos);
 
-		if (barrier && !ret) {
-			ret = vfs_fsync(file, file->f_path.dentry, 0);
-			if (unlikely(ret))
+		if ((bio->bi_rw & REQ_FUA) && !ret) {
+			ret = vfs_fsync(file, 0);
+			if (unlikely(ret && ret != -EINVAL))
 				ret = -EIO;
 		}
 	} else
@@ -511,6 +471,7 @@ out:
  */
 static void loop_add_bio(struct loop_device *lo, struct bio *bio)
 {
+	lo->lo_bio_count++;
 	bio_list_add(&lo->lo_bio_list, bio);
 }
 
@@ -519,10 +480,11 @@ static void loop_add_bio(struct loop_device *lo, struct bio *bio)
  */
 static struct bio *loop_get_bio(struct loop_device *lo)
 {
+	lo->lo_bio_count--;
 	return bio_list_pop(&lo->lo_bio_list);
 }
 
-static int loop_make_request(struct request_queue *q, struct bio *old_bio)
+static void loop_make_request(struct request_queue *q, struct bio *old_bio)
 {
 	struct loop_device *lo = q->queuedata;
 	int rw = bio_rw(old_bio);
@@ -537,26 +499,18 @@ static int loop_make_request(struct request_queue *q, struct bio *old_bio)
 		goto out;
 	if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
 		goto out;
+	if (lo->lo_bio_count >= q->nr_congestion_on)
+		wait_event_lock_irq(lo->lo_req_wait,
+				    lo->lo_bio_count < q->nr_congestion_off,
+				    lo->lo_lock);
 	loop_add_bio(lo, old_bio);
 	wake_up(&lo->lo_event);
 	spin_unlock_irq(&lo->lo_lock);
-	return 0;
+	return;
 
 out:
 	spin_unlock_irq(&lo->lo_lock);
 	bio_io_error(old_bio);
-	return 0;
-}
-
-/*
- * kick off io on the underlying address space
- */
-static void loop_unplug(struct request_queue *q)
-{
-	struct loop_device *lo = q->queuedata;
-
-	queue_flag_clear_unlocked(QUEUE_FLAG_PLUGGED, q);
-	blk_run_address_space(lo->lo_backing_file->f_mapping);
 }
 
 struct switch_request {
@@ -594,7 +548,7 @@ static int loop_thread(void *data)
 	struct loop_device *lo = data;
 	struct bio *bio;
 
-	set_user_nice(current, -20);
+	set_user_nice(current, MIN_NICE);
 
 	while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) {
 
@@ -606,6 +560,8 @@ static int loop_thread(void *data)
 			continue;
 		spin_lock_irq(&lo->lo_lock);
 		bio = loop_get_bio(lo);
+		if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off)
+			wake_up(&lo->lo_req_wait);
 		spin_unlock_irq(&lo->lo_lock);
 
 		BUG_ON(!bio);
@@ -719,7 +675,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 		goto out_putf;
 
 	fput(old_file);
-	if (max_part > 0)
+	if (lo->lo_flags & LO_FLAGS_PARTSCAN)
 		ioctl_by_bdev(bdev, BLKRRPART, 0);
 	return 0;
 
@@ -736,6 +692,134 @@ static inline int is_loop_device(struct file *file)
 	return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
 }
 
+/* loop sysfs attributes */
+
+static ssize_t loop_attr_show(struct device *dev, char *page,
+			      ssize_t (*callback)(struct loop_device *, char *))
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct loop_device *lo = disk->private_data;
+
+	return callback(lo, page);
+}
+
+#define LOOP_ATTR_RO(_name)						\
+static ssize_t loop_attr_##_name##_show(struct loop_device *, char *);	\
+static ssize_t loop_attr_do_show_##_name(struct device *d,		\
+				struct device_attribute *attr, char *b)	\
+{									\
+	return loop_attr_show(d, b, loop_attr_##_name##_show);		\
+}									\
+static struct device_attribute loop_attr_##_name =			\
+	__ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL);
+
+static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
+{
+	ssize_t ret;
+	char *p = NULL;
+
+	spin_lock_irq(&lo->lo_lock);
+	if (lo->lo_backing_file)
+		p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
+	spin_unlock_irq(&lo->lo_lock);
+
+	if (IS_ERR_OR_NULL(p))
+		ret = PTR_ERR(p);
+	else {
+		ret = strlen(p);
+		memmove(buf, p, ret);
+		buf[ret++] = '\n';
+		buf[ret] = 0;
+	}
+
+	return ret;
+}
+
+static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf)
+{
+	return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset);
+}
+
+static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf)
+{
+	return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit);
+}
+
+static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf)
+{
+	int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR);
+
+	return sprintf(buf, "%s\n", autoclear ? "1" : "0");
+}
+
+static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf)
+{
+	int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN);
+
+	return sprintf(buf, "%s\n", partscan ? "1" : "0");
+}
+
+LOOP_ATTR_RO(backing_file);
+LOOP_ATTR_RO(offset);
+LOOP_ATTR_RO(sizelimit);
+LOOP_ATTR_RO(autoclear);
+LOOP_ATTR_RO(partscan);
+
+static struct attribute *loop_attrs[] = {
+	&loop_attr_backing_file.attr,
+	&loop_attr_offset.attr,
+	&loop_attr_sizelimit.attr,
+	&loop_attr_autoclear.attr,
+	&loop_attr_partscan.attr,
+	NULL,
+};
+
+static struct attribute_group loop_attribute_group = {
+	.name = "loop",
+	.attrs= loop_attrs,
+};
+
+static int loop_sysfs_init(struct loop_device *lo)
+{
+	return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
+				  &loop_attribute_group);
+}
+
+static void loop_sysfs_exit(struct loop_device *lo)
+{
+	sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
+			   &loop_attribute_group);
+}
+
+static void loop_config_discard(struct loop_device *lo)
+{
+	struct file *file = lo->lo_backing_file;
+	struct inode *inode = file->f_mapping->host;
+	struct request_queue *q = lo->lo_queue;
+
+	/*
+	 * We use punch hole to reclaim the free space used by the
+	 * image a.k.a. discard. However we do not support discard if
+	 * encryption is enabled, because it may give an attacker
+	 * useful information.
+	 */
+	if ((!file->f_op->fallocate) ||
+	    lo->lo_encrypt_key_size) {
+		q->limits.discard_granularity = 0;
+		q->limits.discard_alignment = 0;
+		q->limits.max_discard_sectors = 0;
+		q->limits.discard_zeroes_data = 0;
+		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
+		return;
+	}
+
+	q->limits.discard_granularity = inode->i_sb->s_blocksize;
+	q->limits.discard_alignment = 0;
+	q->limits.max_discard_sectors = UINT_MAX >> 9;
+	q->limits.discard_zeroes_data = 1;
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+}
+
 static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 		       struct block_device *bdev, unsigned int arg)
 {
@@ -778,35 +862,23 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	mapping = file->f_mapping;
 	inode = mapping->host;
 
-	if (!(file->f_mode & FMODE_WRITE))
-		lo_flags |= LO_FLAGS_READ_ONLY;
-
 	error = -EINVAL;
-	if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) {
-		const struct address_space_operations *aops = mapping->a_ops;
-
-		if (aops->write_begin)
-			lo_flags |= LO_FLAGS_USE_AOPS;
-		if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
-			lo_flags |= LO_FLAGS_READ_ONLY;
+	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+		goto out_putf;
 
-		lo_blocksize = S_ISBLK(inode->i_mode) ?
-			inode->i_bdev->bd_block_size : PAGE_SIZE;
+	if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) ||
+	    !file->f_op->write)
+		lo_flags |= LO_FLAGS_READ_ONLY;
 
-		error = 0;
-	} else {
-		goto out_putf;
-	}
+	lo_blocksize = S_ISBLK(inode->i_mode) ?
+		inode->i_bdev->bd_block_size : PAGE_SIZE;
 
+	error = -EFBIG;
 	size = get_loop_size(lo, file);
-
-	if ((loff_t)(sector_t)size != size) {
-		error = -EFBIG;
+	if ((loff_t)(sector_t)size != size)
 		goto out_putf;
-	}
 
-	if (!(mode & FMODE_WRITE))
-		lo_flags |= LO_FLAGS_READ_ONLY;
+	error = 0;
 
 	set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
 
@@ -817,24 +889,20 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	lo->transfer = transfer_none;
 	lo->ioctl = NULL;
 	lo->lo_sizelimit = 0;
+	lo->lo_bio_count = 0;
 	lo->old_gfp_mask = mapping_gfp_mask(mapping);
 	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
 
 	bio_list_init(&lo->lo_bio_list);
 
-	/*
-	 * set queue make_request_fn, and add limits based on lower level
-	 * device
-	 */
-	blk_queue_make_request(lo->lo_queue, loop_make_request);
-	lo->lo_queue->queuedata = lo;
-	lo->lo_queue->unplug_fn = loop_unplug;
-
 	if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
-		blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN, NULL);
+		blk_queue_flush(lo->lo_queue, REQ_FLUSH);
 
 	set_capacity(lo->lo_disk, size);
 	bd_set_size(bdev, size << 9);
+	loop_sysfs_init(lo);
+	/* let user-space know about the new size */
+	kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
 
 	set_blocksize(bdev, lo_blocksize);
 
@@ -846,11 +914,19 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	}
 	lo->lo_state = Lo_bound;
 	wake_up_process(lo->lo_thread);
-	if (max_part > 0)
+	if (part_shift)
+		lo->lo_flags |= LO_FLAGS_PARTSCAN;
+	if (lo->lo_flags & LO_FLAGS_PARTSCAN)
 		ioctl_by_bdev(bdev, BLKRRPART, 0);
+
+	/* Grab the block_device to prevent its destruction after we
+	 * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev).
+	 */
+	bdgrab(bdev);
 	return 0;
 
 out_clr:
+	loop_sysfs_exit(lo);
 	lo->lo_thread = NULL;
 	lo->lo_device = NULL;
 	lo->lo_backing_file = NULL;
@@ -858,6 +934,7 @@ out_clr:
 	set_capacity(lo->lo_disk, 0);
 	invalidate_bdev(bdev);
 	bd_set_size(bdev, 0);
+	kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
 	mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
 	lo->lo_state = Lo_unbound;
  out_putf:
@@ -905,16 +982,30 @@ loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
 	return err;
 }
 
-static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
+static int loop_clr_fd(struct loop_device *lo)
 {
 	struct file *filp = lo->lo_backing_file;
 	gfp_t gfp = lo->old_gfp_mask;
+	struct block_device *bdev = lo->lo_device;
 
 	if (lo->lo_state != Lo_bound)
 		return -ENXIO;
 
-	if (lo->lo_refcnt > 1)	/* we needed one fd for the ioctl */
-		return -EBUSY;
+	/*
+	 * If we've explicitly asked to tear down the loop device,
+	 * and it has an elevated reference count, set it for auto-teardown when
+	 * the last reference goes away. This stops $!~#$@ udev from
+	 * preventing teardown because it decided that it needs to run blkid on
+	 * the loopback device whenever it appears. xfstests is notorious for
+	 * failing tests because blkid via udev races with a losetup
+	 * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
+	 * command to fail with EBUSY.
+	 */
+	if (lo->lo_refcnt > 1) {
+		lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
+		mutex_unlock(&lo->lo_ctl_mutex);
+		return 0;
+	}
 
 	if (filp == NULL)
 		return -EINVAL;
@@ -925,8 +1016,9 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 
 	kthread_stop(lo->lo_thread);
 
-	lo->lo_queue->unplug_fn = NULL;
+	spin_lock_irq(&lo->lo_lock);
 	lo->lo_backing_file = NULL;
+	spin_unlock_irq(&lo->lo_lock);
 
 	loop_release_xfer(lo);
 	lo->transfer = NULL;
@@ -936,22 +1028,30 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 	lo->lo_offset = 0;
 	lo->lo_sizelimit = 0;
 	lo->lo_encrypt_key_size = 0;
-	lo->lo_flags = 0;
 	lo->lo_thread = NULL;
 	memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
 	memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
 	memset(lo->lo_file_name, 0, LO_NAME_SIZE);
-	if (bdev)
+	if (bdev) {
+		bdput(bdev);
 		invalidate_bdev(bdev);
+	}
 	set_capacity(lo->lo_disk, 0);
-	if (bdev)
+	loop_sysfs_exit(lo);
+	if (bdev) {
 		bd_set_size(bdev, 0);
+		/* let user-space know about this change */
+		kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
+	}
 	mapping_set_gfp_mask(filp->f_mapping, gfp);
 	lo->lo_state = Lo_unbound;
 	/* This is safe: open() is still holding a reference. */
 	module_put(THIS_MODULE);
-	if (max_part > 0)
+	if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev)
 		ioctl_by_bdev(bdev, BLKRRPART, 0);
+	lo->lo_flags = 0;
+	if (!part_shift)
+		lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
 	mutex_unlock(&lo->lo_ctl_mutex);
 	/*
 	 * Need not hold lo_ctl_mutex to fput backing file.
@@ -968,10 +1068,10 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 {
 	int err;
 	struct loop_func_table *xfer;
-	uid_t uid = current_uid();
+	kuid_t uid = current_uid();
 
 	if (lo->lo_encrypt_key_size &&
-	    lo->lo_key_owner != uid &&
+	    !uid_eq(lo->lo_key_owner, uid) &&
 	    !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	if (lo->lo_state != Lo_bound)
@@ -999,12 +1099,11 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 		return err;
 
 	if (lo->lo_offset != info->lo_offset ||
-	    lo->lo_sizelimit != info->lo_sizelimit) {
-		lo->lo_offset = info->lo_offset;
-		lo->lo_sizelimit = info->lo_sizelimit;
-		if (figure_loop_size(lo))
+	    lo->lo_sizelimit != info->lo_sizelimit)
+		if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit))
 			return -EFBIG;
-	}
+
+	loop_config_discard(lo);
 
 	memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
 	memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE);
@@ -1020,6 +1119,13 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	     (info->lo_flags & LO_FLAGS_AUTOCLEAR))
 		lo->lo_flags ^= LO_FLAGS_AUTOCLEAR;
 
+	if ((info->lo_flags & LO_FLAGS_PARTSCAN) &&
+	     !(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
+		lo->lo_flags |= LO_FLAGS_PARTSCAN;
+		lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
+		ioctl_by_bdev(lo->lo_device, BLKRRPART, 0);
+	}
+
 	lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
 	lo->lo_init[0] = info->lo_init[0];
 	lo->lo_init[1] = info->lo_init[1];
@@ -1041,7 +1147,7 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info)
 
 	if (lo->lo_state != Lo_bound)
 		return -ENXIO;
-	error = vfs_getattr(file->f_path.mnt, file->f_path.dentry, &stat);
+	error = vfs_getattr(&file->f_path, &stat);
 	if (error)
 		return error;
 	memset(info, 0, sizeof(*info));
@@ -1173,26 +1279,10 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) {
 
 static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev)
 {
-	int err;
-	sector_t sec;
-	loff_t sz;
-
-	err = -ENXIO;
 	if (unlikely(lo->lo_state != Lo_bound))
-		goto out;
-	err = figure_loop_size(lo);
-	if (unlikely(err))
-		goto out;
-	sec = get_capacity(lo->lo_disk);
-	/* the width of sector_t may be narrow for bit-shift */
-	sz = sec;
-	sz <<= 9;
-	mutex_lock(&bdev->bd_mutex);
-	bd_set_size(bdev, sz);
-	mutex_unlock(&bdev->bd_mutex);
+		return -ENXIO;
 
- out:
-	return err;
+	return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit);
 }
 
 static int lo_ioctl(struct block_device *bdev, fmode_t mode,
@@ -1211,18 +1301,24 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 		break;
 	case LOOP_CLR_FD:
 		/* loop_clr_fd would have unlocked lo_ctl_mutex on success */
-		err = loop_clr_fd(lo, bdev);
+		err = loop_clr_fd(lo);
 		if (!err)
 			goto out_unlocked;
 		break;
 	case LOOP_SET_STATUS:
-		err = loop_set_status_old(lo, (struct loop_info __user *) arg);
+		err = -EPERM;
+		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+			err = loop_set_status_old(lo,
+					(struct loop_info __user *)arg);
 		break;
 	case LOOP_GET_STATUS:
 		err = loop_get_status_old(lo, (struct loop_info __user *) arg);
 		break;
 	case LOOP_SET_STATUS64:
-		err = loop_set_status64(lo, (struct loop_info64 __user *) arg);
+		err = -EPERM;
+		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+			err = loop_set_status64(lo,
+					(struct loop_info64 __user *) arg);
 		break;
 	case LOOP_GET_STATUS64:
 		err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
@@ -1398,16 +1494,25 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
 
 static int lo_open(struct block_device *bdev, fmode_t mode)
 {
-	struct loop_device *lo = bdev->bd_disk->private_data;
+	struct loop_device *lo;
+	int err = 0;
+
+	mutex_lock(&loop_index_mutex);
+	lo = bdev->bd_disk->private_data;
+	if (!lo) {
+		err = -ENXIO;
+		goto out;
+	}
 
 	mutex_lock(&lo->lo_ctl_mutex);
 	lo->lo_refcnt++;
 	mutex_unlock(&lo->lo_ctl_mutex);
-
-	return 0;
+out:
+	mutex_unlock(&loop_index_mutex);
+	return err;
 }
 
-static int lo_release(struct gendisk *disk, fmode_t mode)
+static void lo_release(struct gendisk *disk, fmode_t mode)
 {
 	struct loop_device *lo = disk->private_data;
 	int err;
@@ -1422,9 +1527,9 @@ static int lo_release(struct gendisk *disk, fmode_t mode)
 		 * In autoclear mode, stop the loop thread
 		 * and remove configuration after last close.
 		 */
-		err = loop_clr_fd(lo, NULL);
+		err = loop_clr_fd(lo);
 		if (!err)
-			goto out_unlocked;
+			return;
 	} else {
 		/*
 		 * Otherwise keep thread (if running) and config,
@@ -1435,11 +1540,9 @@ static int lo_release(struct gendisk *disk, fmode_t mode)
 
 out:
 	mutex_unlock(&lo->lo_ctl_mutex);
-out_unlocked:
-	return 0;
 }
 
-static struct block_device_operations lo_fops = {
+static const struct block_device_operations lo_fops = {
 	.owner =	THIS_MODULE,
 	.open =		lo_open,
 	.release =	lo_release,
@@ -1453,9 +1556,9 @@ static struct block_device_operations lo_fops = {
  * And now the modules code and kernel interface.
  */
 static int max_loop;
-module_param(max_loop, int, 0);
+module_param(max_loop, int, S_IRUGO);
 MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
-module_param(max_part, int, 0);
+module_param(max_part, int, S_IRUGO);
 MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);
@@ -1470,53 +1573,100 @@ int loop_register_transfer(struct loop_func_table *funcs)
 	return 0;
 }
 
+static int unregister_transfer_cb(int id, void *ptr, void *data)
+{
+	struct loop_device *lo = ptr;
+	struct loop_func_table *xfer = data;
+
+	mutex_lock(&lo->lo_ctl_mutex);
+	if (lo->lo_encryption == xfer)
+		loop_release_xfer(lo);
+	mutex_unlock(&lo->lo_ctl_mutex);
+	return 0;
+}
+
 int loop_unregister_transfer(int number)
 {
 	unsigned int n = number;
-	struct loop_device *lo;
 	struct loop_func_table *xfer;
 
 	if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL)
 		return -EINVAL;
 
 	xfer_funcs[n] = NULL;
-
-	list_for_each_entry(lo, &loop_devices, lo_list) {
-		mutex_lock(&lo->lo_ctl_mutex);
-
-		if (lo->lo_encryption == xfer)
-			loop_release_xfer(lo);
-
-		mutex_unlock(&lo->lo_ctl_mutex);
-	}
-
+	idr_for_each(&loop_index_idr, &unregister_transfer_cb, xfer);
 	return 0;
 }
 
 EXPORT_SYMBOL(loop_register_transfer);
 EXPORT_SYMBOL(loop_unregister_transfer);
 
-static struct loop_device *loop_alloc(int i)
+static int loop_add(struct loop_device **l, int i)
 {
 	struct loop_device *lo;
 	struct gendisk *disk;
+	int err;
 
+	err = -ENOMEM;
 	lo = kzalloc(sizeof(*lo), GFP_KERNEL);
 	if (!lo)
 		goto out;
 
+	lo->lo_state = Lo_unbound;
+
+	/* allocate id, if @i >= 0, we're requesting that specific id */
+	if (i >= 0) {
+		err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL);
+		if (err == -ENOSPC)
+			err = -EEXIST;
+	} else {
+		err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL);
+	}
+	if (err < 0)
+		goto out_free_dev;
+	i = err;
+
+	err = -ENOMEM;
 	lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
 	if (!lo->lo_queue)
-		goto out_free_dev;
+		goto out_free_idr;
+
+	/*
+	 * set queue make_request_fn
+	 */
+	blk_queue_make_request(lo->lo_queue, loop_make_request);
+	lo->lo_queue->queuedata = lo;
 
 	disk = lo->lo_disk = alloc_disk(1 << part_shift);
 	if (!disk)
 		goto out_free_queue;
 
+	/*
+	 * Disable partition scanning by default. The in-kernel partition
+	 * scanning can be requested individually per-device during its
+	 * setup. Userspace can always add and remove partitions from all
+	 * devices. The needed partition minors are allocated from the
+	 * extended minor space, the main loop device numbers will continue
+	 * to match the loop minors, regardless of the number of partitions
+	 * used.
+	 *
+	 * If max_part is given, partition scanning is globally enabled for
+	 * all loop devices. The minors for the main loop devices will be
+	 * multiples of max_part.
+	 *
+	 * Note: Global-for-all-devices, set-only-at-init, read-only module
+	 * parameters like 'max_loop' and 'max_part' make things needlessly
+	 * complicated, are too static, inflexible and may surprise
+	 * userspace tools. Parameters like this in general should be avoided.
+	 */
+	if (!part_shift)
+		disk->flags |= GENHD_FL_NO_PART_SCAN;
+	disk->flags |= GENHD_FL_EXT_DEVT;
 	mutex_init(&lo->lo_ctl_mutex);
 	lo->lo_number		= i;
 	lo->lo_thread		= NULL;
 	init_waitqueue_head(&lo->lo_event);
+	init_waitqueue_head(&lo->lo_req_wait);
 	spin_lock_init(&lo->lo_lock);
 	disk->major		= LOOP_MAJOR;
 	disk->first_minor	= i << part_shift;
@@ -1524,140 +1674,245 @@ static struct loop_device *loop_alloc(int i)
 	disk->private_data	= lo;
 	disk->queue		= lo->lo_queue;
 	sprintf(disk->disk_name, "loop%d", i);
-	return lo;
+	add_disk(disk);
+	*l = lo;
+	return lo->lo_number;
 
 out_free_queue:
 	blk_cleanup_queue(lo->lo_queue);
+out_free_idr:
+	idr_remove(&loop_index_idr, i);
 out_free_dev:
 	kfree(lo);
 out:
-	return NULL;
+	return err;
 }
 
-static void loop_free(struct loop_device *lo)
+static void loop_remove(struct loop_device *lo)
 {
+	del_gendisk(lo->lo_disk);
 	blk_cleanup_queue(lo->lo_queue);
 	put_disk(lo->lo_disk);
-	list_del(&lo->lo_list);
 	kfree(lo);
 }
 
-static struct loop_device *loop_init_one(int i)
+static int find_free_cb(int id, void *ptr, void *data)
+{
+	struct loop_device *lo = ptr;
+	struct loop_device **l = data;
+
+	if (lo->lo_state == Lo_unbound) {
+		*l = lo;
+		return 1;
+	}
+	return 0;
+}
+
+static int loop_lookup(struct loop_device **l, int i)
 {
 	struct loop_device *lo;
+	int ret = -ENODEV;
+
+	if (i < 0) {
+		int err;
 
-	list_for_each_entry(lo, &loop_devices, lo_list) {
-		if (lo->lo_number == i)
-			return lo;
+		err = idr_for_each(&loop_index_idr, &find_free_cb, &lo);
+		if (err == 1) {
+			*l = lo;
+			ret = lo->lo_number;
+		}
+		goto out;
 	}
 
-	lo = loop_alloc(i);
+	/* lookup and return a specific i */
+	lo = idr_find(&loop_index_idr, i);
 	if (lo) {
-		add_disk(lo->lo_disk);
-		list_add_tail(&lo->lo_list, &loop_devices);
+		*l = lo;
+		ret = lo->lo_number;
 	}
-	return lo;
-}
-
-static void loop_del_one(struct loop_device *lo)
-{
-	del_gendisk(lo->lo_disk);
-	loop_free(lo);
+out:
+	return ret;
 }
 
 static struct kobject *loop_probe(dev_t dev, int *part, void *data)
 {
 	struct loop_device *lo;
 	struct kobject *kobj;
+	int err;
 
-	mutex_lock(&loop_devices_mutex);
-	lo = loop_init_one(dev & MINORMASK);
-	kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM);
-	mutex_unlock(&loop_devices_mutex);
+	mutex_lock(&loop_index_mutex);
+	err = loop_lookup(&lo, MINOR(dev) >> part_shift);
+	if (err < 0)
+		err = loop_add(&lo, MINOR(dev) >> part_shift);
+	if (err < 0)
+		kobj = NULL;
+	else
+		kobj = get_disk(lo->lo_disk);
+	mutex_unlock(&loop_index_mutex);
 
 	*part = 0;
 	return kobj;
 }
 
+static long loop_control_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long parm)
+{
+	struct loop_device *lo;
+	int ret = -ENOSYS;
+
+	mutex_lock(&loop_index_mutex);
+	switch (cmd) {
+	case LOOP_CTL_ADD:
+		ret = loop_lookup(&lo, parm);
+		if (ret >= 0) {
+			ret = -EEXIST;
+			break;
+		}
+		ret = loop_add(&lo, parm);
+		break;
+	case LOOP_CTL_REMOVE:
+		ret = loop_lookup(&lo, parm);
+		if (ret < 0)
+			break;
+		mutex_lock(&lo->lo_ctl_mutex);
+		if (lo->lo_state != Lo_unbound) {
+			ret = -EBUSY;
+			mutex_unlock(&lo->lo_ctl_mutex);
+			break;
+		}
+		if (lo->lo_refcnt > 0) {
+			ret = -EBUSY;
+			mutex_unlock(&lo->lo_ctl_mutex);
+			break;
+		}
+		lo->lo_disk->private_data = NULL;
+		mutex_unlock(&lo->lo_ctl_mutex);
+		idr_remove(&loop_index_idr, lo->lo_number);
+		loop_remove(lo);
+		break;
+	case LOOP_CTL_GET_FREE:
+		ret = loop_lookup(&lo, -1);
+		if (ret >= 0)
+			break;
+		ret = loop_add(&lo, -1);
+	}
+	mutex_unlock(&loop_index_mutex);
+
+	return ret;
+}
+
+static const struct file_operations loop_ctl_fops = {
+	.open		= nonseekable_open,
+	.unlocked_ioctl	= loop_control_ioctl,
+	.compat_ioctl	= loop_control_ioctl,
+	.owner		= THIS_MODULE,
+	.llseek		= noop_llseek,
+};
+
+static struct miscdevice loop_misc = {
+	.minor		= LOOP_CTRL_MINOR,
+	.name		= "loop-control",
+	.fops		= &loop_ctl_fops,
+};
+
+MODULE_ALIAS_MISCDEV(LOOP_CTRL_MINOR);
+MODULE_ALIAS("devname:loop-control");
+
 static int __init loop_init(void)
 {
 	int i, nr;
 	unsigned long range;
-	struct loop_device *lo, *next;
+	struct loop_device *lo;
+	int err;
 
-	/*
-	 * loop module now has a feature to instantiate underlying device
-	 * structure on-demand, provided that there is an access dev node.
-	 * However, this will not work well with user space tool that doesn't
-	 * know about such "feature".  In order to not break any existing
-	 * tool, we do the following:
-	 *
-	 * (1) if max_loop is specified, create that many upfront, and this
-	 *     also becomes a hard limit.
-	 * (2) if max_loop is not specified, create 8 loop device on module
-	 *     load, user can further extend loop device by create dev node
-	 *     themselves and have kernel automatically instantiate actual
-	 *     device on-demand.
-	 */
+	err = misc_register(&loop_misc);
+	if (err < 0)
+		return err;
 
 	part_shift = 0;
-	if (max_part > 0)
+	if (max_part > 0) {
 		part_shift = fls(max_part);
 
-	if (max_loop > 1UL << (MINORBITS - part_shift))
-		return -EINVAL;
+		/*
+		 * Adjust max_part according to part_shift as it is exported
+		 * to user space so that the user can decide the correct minor
+		 * number if [s]he wants to create more devices.
+		 *
+		 * Note that -1 is required because partition 0 is reserved
+		 * for the whole disk.
+		 */
+		max_part = (1UL << part_shift) - 1;
+	}
 
+	if ((1UL << part_shift) > DISK_MAX_PARTS) {
+		err = -EINVAL;
+		goto misc_out;
+	}
+
+	if (max_loop > 1UL << (MINORBITS - part_shift)) {
+		err = -EINVAL;
+		goto misc_out;
+	}
+
+	/*
+	 * If max_loop is specified, create that many devices upfront.
+	 * This also becomes a hard limit. If max_loop is not specified,
+	 * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
+	 * init time. Loop devices can be requested on-demand with the
+	 * /dev/loop-control interface, or be instantiated by accessing
+	 * a 'dead' device node.
+	 */
 	if (max_loop) {
 		nr = max_loop;
-		range = max_loop;
+		range = max_loop << part_shift;
 	} else {
-		nr = 8;
-		range = 1UL << (MINORBITS - part_shift);
+		nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
+		range = 1UL << MINORBITS;
 	}
 
-	if (register_blkdev(LOOP_MAJOR, "loop"))
-		return -EIO;
-
-	for (i = 0; i < nr; i++) {
-		lo = loop_alloc(i);
-		if (!lo)
-			goto Enomem;
-		list_add_tail(&lo->lo_list, &loop_devices);
+	if (register_blkdev(LOOP_MAJOR, "loop")) {
+		err = -EIO;
+		goto misc_out;
 	}
 
-	/* point of no return */
-
-	list_for_each_entry(lo, &loop_devices, lo_list)
-		add_disk(lo->lo_disk);
-
 	blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
 				  THIS_MODULE, loop_probe, NULL, NULL);
 
+	/* pre-create number of devices given by config or max_loop */
+	mutex_lock(&loop_index_mutex);
+	for (i = 0; i < nr; i++)
+		loop_add(&lo, i);
+	mutex_unlock(&loop_index_mutex);
+
 	printk(KERN_INFO "loop: module loaded\n");
 	return 0;
 
-Enomem:
-	printk(KERN_INFO "loop: out of memory\n");
+misc_out:
+	misc_deregister(&loop_misc);
+	return err;
+}
 
-	list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
-		loop_free(lo);
+static int loop_exit_cb(int id, void *ptr, void *data)
+{
+	struct loop_device *lo = ptr;
 
-	unregister_blkdev(LOOP_MAJOR, "loop");
-	return -ENOMEM;
+	loop_remove(lo);
+	return 0;
 }
 
 static void __exit loop_exit(void)
 {
 	unsigned long range;
-	struct loop_device *lo, *next;
 
-	range = max_loop ? max_loop :  1UL << (MINORBITS - part_shift);
+	range = max_loop ? max_loop << part_shift : 1UL << MINORBITS;
 
-	list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
-		loop_del_one(lo);
+	idr_for_each(&loop_index_idr, &loop_exit_cb, NULL);
+	idr_destroy(&loop_index_idr);
 
 	blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
 	unregister_blkdev(LOOP_MAJOR, "loop");
+
+	misc_deregister(&loop_misc);
 }
 
 module_init(loop_init);
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
new file mode 100644
index 00000000000..90df5d6485b
--- /dev/null
+++ b/drivers/block/loop.h
@@ -0,0 +1,85 @@
+/*
+ * loop.h
+ *
+ * Written by Theodore Ts'o, 3/29/93.
+ *
+ * Copyright 1993 by Theodore Ts'o.  Redistribution of this file is
+ * permitted under the GNU General Public License.
+ */
+#ifndef _LINUX_LOOP_H
+#define _LINUX_LOOP_H
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <uapi/linux/loop.h>
+
+/* Possible states of device */
+enum {
+	Lo_unbound,
+	Lo_bound,
+	Lo_rundown,
+};
+
+struct loop_func_table;
+
+struct loop_device {
+	int		lo_number;
+	int		lo_refcnt;
+	loff_t		lo_offset;
+	loff_t		lo_sizelimit;
+	int		lo_flags;
+	int		(*transfer)(struct loop_device *, int cmd,
+				    struct page *raw_page, unsigned raw_off,
+				    struct page *loop_page, unsigned loop_off,
+				    int size, sector_t real_block);
+	char		lo_file_name[LO_NAME_SIZE];
+	char		lo_crypt_name[LO_NAME_SIZE];
+	char		lo_encrypt_key[LO_KEY_SIZE];
+	int		lo_encrypt_key_size;
+	struct loop_func_table *lo_encryption;
+	__u32           lo_init[2];
+	kuid_t		lo_key_owner;	/* Who set the key */
+	int		(*ioctl)(struct loop_device *, int cmd, 
+				 unsigned long arg); 
+
+	struct file *	lo_backing_file;
+	struct block_device *lo_device;
+	unsigned	lo_blocksize;
+	void		*key_data; 
+
+	gfp_t		old_gfp_mask;
+
+	spinlock_t		lo_lock;
+	struct bio_list		lo_bio_list;
+	unsigned int		lo_bio_count;
+	int			lo_state;
+	struct mutex		lo_ctl_mutex;
+	struct task_struct	*lo_thread;
+	wait_queue_head_t	lo_event;
+	/* wait queue for incoming requests */
+	wait_queue_head_t	lo_req_wait;
+
+	struct request_queue	*lo_queue;
+	struct gendisk		*lo_disk;
+};
+
+/* Support for loadable transfer modules */
+struct loop_func_table {
+	int number;	/* filter type */ 
+	int (*transfer)(struct loop_device *lo, int cmd,
+			struct page *raw_page, unsigned raw_off,
+			struct page *loop_page, unsigned loop_off,
+			int size, sector_t real_block);
+	int (*init)(struct loop_device *, const struct loop_info64 *); 
+	/* release is called from loop_unregister_transfer or clr_fd */
+	int (*release)(struct loop_device *); 
+	int (*ioctl)(struct loop_device *, int cmd, unsigned long arg);
+	struct module *owner;
+}; 
+
+int loop_register_transfer(struct loop_func_table *funcs);
+int loop_unregister_transfer(int number); 
+
+#endif
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 60de5a01e71..e352cac707e 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -22,13 +22,13 @@
 #include <linux/delay.h>
 #include <linux/platform_device.h>
 #include <linux/gpio.h>
+#include <linux/mg_disk.h>
+#include <linux/slab.h>
 
 #define MG_RES_SEC (CONFIG_MG_DISK_RES << 1)
 
 /* name for block device */
 #define MG_DISK_NAME "mgd"
-/* name for platform device */
-#define MG_DEV_NAME "mg_disk"
 
 #define MG_DISK_MAJ 0
 #define MG_DISK_MAX_PART 16
@@ -37,7 +37,6 @@
 
 /* Register offsets */
 #define MG_BUFF_OFFSET			0x8000
-#define MG_STORAGE_BUFFER_SIZE		0x200
 #define MG_REG_OFFSET			0xC000
 #define MG_REG_FEATURE			(MG_REG_OFFSET + 2)	/* write case */
 #define MG_REG_ERROR			(MG_REG_OFFSET + 2)	/* read case */
@@ -103,33 +102,8 @@
 #define MG_TMAX_SWRST_TO_RDY	500
 #define MG_TMAX_RSTOUT		3000
 
-/* device attribution */
-/* use mflash as boot device */
-#define MG_BOOT_DEV		(1 << 0)
-/* use mflash as storage device */
-#define MG_STORAGE_DEV		(1 << 1)
-/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */
-#define MG_STORAGE_DEV_SKIP_RST	(1 << 2)
-
 #define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST)
 
-/* names of GPIO resource */
-#define MG_RST_PIN	"mg_rst"
-/* except MG_BOOT_DEV, reset-out pin should be assigned */
-#define MG_RSTOUT_PIN	"mg_rstout"
-
-/* private driver data */
-struct mg_drv_data {
-	/* disk resource */
-	u32 use_polling;
-
-	/* device attribution */
-	u32 dev_attr;
-
-	/* internally used */
-	struct mg_host *host;
-};
-
 /* main structure for mflash driver */
 struct mg_host {
 	struct device *dev;
@@ -245,6 +219,16 @@ static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec)
 	host->error = MG_ERR_NONE;
 	expire = jiffies + msecs_to_jiffies(msec);
 
+	/* These two dummy status reads prevent reading an invalid
+	 * status. A very short time (3 mflash operating clock periods)
+	 * is required for the busy bit to be set. Use dummy reads
+	 * instead of a busy wait, because mflash's PLL is machine dependent.
+	 */
+	if (prv_data->use_polling) {
+		status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
+		status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
+	}
+
 	status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
 
 	do {
@@ -271,8 +255,6 @@ static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec)
 			mg_dump_status("not ready", status, host);
 			return MG_ERR_INV_STAT;
 		}
-		if (prv_data->use_polling)
-			msleep(1);
 
 		status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
 	} while (time_before(cur_jiffies, expire));
@@ -495,9 +477,18 @@ static unsigned int mg_out(struct mg_host *host,
 	return MG_ERR_NONE;
 }
 
+static void mg_read_one(struct mg_host *host, struct request *req)
+{
+	u16 *buff = (u16 *)bio_data(req->bio);
+	u32 i;
+
+	for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
+		*buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET +
+			      (i << 1));
+}
+
 static void mg_read(struct request *req)
 {
-	u32 j;
 	struct mg_host *host = req->rq_disk->private_data;
 
 	if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req),
@@ -505,52 +496,68 @@ static void mg_read(struct request *req)
 		mg_bad_rw_intr(host);
 
 	MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-	       blk_rq_sectors(req), blk_rq_pos(req), req->buffer);
+	       blk_rq_sectors(req), blk_rq_pos(req), bio_data(req->bio));
 
 	do {
-		u16 *buff = (u16 *)req->buffer;
-
 		if (mg_wait(host, ATA_DRQ,
 			    MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) {
 			mg_bad_rw_intr(host);
 			return;
 		}
-		for (j = 0; j < MG_SECTOR_SIZE >> 1; j++)
-			*buff++ = inw((unsigned long)host->dev_base +
-				      MG_BUFF_OFFSET + (j << 1));
+
+		mg_read_one(host, req);
 
 		outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base +
 				MG_REG_COMMAND);
 	} while (mg_end_request(host, 0, MG_SECTOR_SIZE));
 }
 
+static void mg_write_one(struct mg_host *host, struct request *req)
+{
+	u16 *buff = (u16 *)bio_data(req->bio);
+	u32 i;
+
+	for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
+		outw(*buff++, (unsigned long)host->dev_base + MG_BUFF_OFFSET +
+		     (i << 1));
+}
+
 static void mg_write(struct request *req)
 {
-	u32 j;
 	struct mg_host *host = req->rq_disk->private_data;
+	unsigned int rem = blk_rq_sectors(req);
 
-	if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req),
+	if (mg_out(host, blk_rq_pos(req), rem,
 		   MG_CMD_WR, NULL) != MG_ERR_NONE) {
 		mg_bad_rw_intr(host);
 		return;
 	}
 
 	MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-	       blk_rq_sectors(req), blk_rq_pos(req), req->buffer);
+	       rem, blk_rq_pos(req), bio_data(req->bio));
+
+	if (mg_wait(host, ATA_DRQ,
+		    MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
+		mg_bad_rw_intr(host);
+		return;
+	}
 
 	do {
-		u16 *buff = (u16 *)req->buffer;
+		mg_write_one(host, req);
+
+		outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
+				MG_REG_COMMAND);
 
-	if (mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
+		rem--;
+		if (rem > 1 && mg_wait(host, ATA_DRQ,
+					MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
+			mg_bad_rw_intr(host);
+			return;
+		} else if (mg_wait(host, MG_STAT_READY,
+					MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
 			mg_bad_rw_intr(host);
 			return;
 		}
-		for (j = 0; j < MG_SECTOR_SIZE >> 1; j++)
-			outw(*buff++, (unsigned long)host->dev_base +
-				      MG_BUFF_OFFSET + (j << 1));
-
-		outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
-				MG_REG_COMMAND);
 	} while (mg_end_request(host, 0, MG_SECTOR_SIZE));
 }
 
@@ -558,7 +565,6 @@ static void mg_read_intr(struct mg_host *host)
 {
 	struct request *req = host->req;
 	u32 i;
-	u16 *buff;
 
 	/* check status */
 	do {
@@ -576,16 +582,10 @@ static void mg_read_intr(struct mg_host *host)
 	return;
 
 ok_to_read:
-	/* get current segment of request */
-	buff = (u16 *)req->buffer;
-
-	/* read 1 sector */
-	for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
-		*buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET +
-			      (i << 1));
+	mg_read_one(host, req);
 
 	MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
-	       blk_rq_pos(req), blk_rq_sectors(req) - 1, req->buffer);
+	       blk_rq_pos(req), blk_rq_sectors(req) - 1, bio_data(req->bio));
 
 	/* send read confirm */
 	outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
@@ -601,8 +601,7 @@ ok_to_read:
 static void mg_write_intr(struct mg_host *host)
 {
 	struct request *req = host->req;
-	u32 i, j;
-	u16 *buff;
+	u32 i;
 	bool rem;
 
 	/* check status */
@@ -623,14 +622,9 @@ static void mg_write_intr(struct mg_host *host)
 ok_to_write:
 	if ((rem = mg_end_request(host, 0, MG_SECTOR_SIZE))) {
 		/* write 1 sector and set handler if remains */
-		buff = (u16 *)req->buffer;
-		for (j = 0; j < MG_STORAGE_BUFFER_SIZE >> 1; j++) {
-			outw(*buff, (unsigned long)host->dev_base +
-					MG_BUFF_OFFSET + (j << 1));
-			buff++;
-		}
+		mg_write_one(host, req);
 		MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
-		       blk_rq_pos(req), blk_rq_sectors(req), req->buffer);
+		       blk_rq_pos(req), blk_rq_sectors(req), bio_data(req->bio));
 		host->mg_do_intr = mg_write_intr;
 		mod_timer(&host->timer, jiffies + 3 * HZ);
 	}
@@ -642,7 +636,7 @@ ok_to_write:
 		mg_request(host->breq);
 }
 
-void mg_times_out(unsigned long data)
+static void mg_times_out(unsigned long data)
 {
 	struct mg_host *host = (struct mg_host *)data;
 	char *name;
@@ -676,7 +670,7 @@ static void mg_request_poll(struct request_queue *q)
 				break;
 		}
 
-		if (unlikely(!blk_fs_request(host->req))) {
+		if (unlikely(host->req->cmd_type != REQ_TYPE_FS)) {
 			mg_end_request_cur(host, -EIO);
 			continue;
 		}
@@ -693,9 +687,6 @@ static unsigned int mg_issue_req(struct request *req,
 		unsigned int sect_num,
 		unsigned int sect_cnt)
 {
-	u16 *buff;
-	u32 i;
-
 	switch (rq_data_dir(req)) {
 	case READ:
 		if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr)
@@ -719,12 +710,7 @@ static unsigned int mg_issue_req(struct request *req,
 			mg_bad_rw_intr(host);
 			return host->error;
 		}
-		buff = (u16 *)req->buffer;
-		for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) {
-			outw(*buff, (unsigned long)host->dev_base +
-					MG_BUFF_OFFSET + (i << 1));
-			buff++;
-		}
+		mg_write_one(host, req);
 		mod_timer(&host->timer, jiffies + 3 * HZ);
 		outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
 				MG_REG_COMMAND);
@@ -770,7 +756,7 @@ static void mg_request(struct request_queue *q)
 			continue;
 		}
 
-		if (unlikely(!blk_fs_request(req))) {
+		if (unlikely(req->cmd_type != REQ_TYPE_FS)) {
 			mg_end_request_cur(host, -EIO);
 			continue;
 		}
@@ -790,13 +776,14 @@ static int mg_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static struct block_device_operations mg_disk_ops = {
+static const struct block_device_operations mg_disk_ops = {
 	.getgeo = mg_getgeo
 };
 
-static int mg_suspend(struct platform_device *plat_dev, pm_message_t state)
+#ifdef CONFIG_PM_SLEEP
+static int mg_suspend(struct device *dev)
 {
-	struct mg_drv_data *prv_data = plat_dev->dev.platform_data;
+	struct mg_drv_data *prv_data = dev->platform_data;
 	struct mg_host *host = prv_data->host;
 
 	if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
@@ -818,9 +805,9 @@ static int mg_suspend(struct platform_device *plat_dev, pm_message_t state)
 	return 0;
 }
 
-static int mg_resume(struct platform_device *plat_dev)
+static int mg_resume(struct device *dev)
 {
-	struct mg_drv_data *prv_data = plat_dev->dev.platform_data;
+	struct mg_drv_data *prv_data = dev->platform_data;
 	struct mg_host *host = prv_data->host;
 
 	if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
@@ -838,6 +825,9 @@ static int mg_resume(struct platform_device *plat_dev)
 
 	return 0;
 }
+#endif
+
+static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume);
 
 static int mg_probe(struct platform_device *plat_dev)
 {
@@ -875,7 +865,7 @@ static int mg_probe(struct platform_device *plat_dev)
 		err = -EINVAL;
 		goto probe_err_2;
 	}
-	host->dev_base = ioremap(rsc->start , rsc->end + 1);
+	host->dev_base = ioremap(rsc->start, resource_size(rsc));
 	if (!host->dev_base) {
 		printk(KERN_ERR "%s:%d ioremap fail\n",
 				__func__, __LINE__);
@@ -902,8 +892,10 @@ static int mg_probe(struct platform_device *plat_dev)
 	gpio_direction_output(host->rst, 1);
 
 	/* reset out pin */
-	if (!(prv_data->dev_attr & MG_DEV_MASK))
+	if (!(prv_data->dev_attr & MG_DEV_MASK)) {
+		err = -EINVAL;
 		goto probe_err_3a;
+	}
 
 	if (prv_data->dev_attr != MG_BOOT_DEV) {
 		rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO,
@@ -923,7 +915,7 @@ static int mg_probe(struct platform_device *plat_dev)
 
 	/* disk reset */
 	if (prv_data->dev_attr == MG_STORAGE_DEV) {
-		/* If POR seq. not yet finised, wait */
+		/* If POR seq. not yet finished, wait */
 		err = mg_wait_rstout(host->rstout, MG_TMAX_RSTOUT);
 		if (err)
 			goto probe_err_3b;
@@ -944,7 +936,7 @@ static int mg_probe(struct platform_device *plat_dev)
 			goto probe_err_3b;
 		}
 		err = request_irq(host->irq, mg_irq,
-				IRQF_DISABLED | IRQF_TRIGGER_RISING,
+				IRQF_TRIGGER_RISING,
 				MG_DEV_NAME, host);
 		if (err) {
 			printk(KERN_ERR "%s:%d fail (request_irq err=%d)\n",
@@ -988,14 +980,13 @@ static int mg_probe(struct platform_device *plat_dev)
 	host->breq->queuedata = host;
 
 	/* mflash is random device, thanx for the noop */
-	elevator_exit(host->breq->elevator);
-	err = elevator_init(host->breq, "noop");
+	err = elevator_change(host->breq, "noop");
 	if (err) {
 		printk(KERN_ERR "%s:%d (elevator_init) fail\n",
 				__func__, __LINE__);
 		goto probe_err_6;
 	}
-	blk_queue_max_sectors(host->breq, MG_MAX_SECTS);
+	blk_queue_max_hw_sectors(host->breq, MG_MAX_SECTS);
 	blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE);
 
 	init_timer(&host->timer);
@@ -1089,11 +1080,10 @@ static int mg_remove(struct platform_device *plat_dev)
 static struct platform_driver mg_disk_driver = {
 	.probe = mg_probe,
 	.remove = mg_remove,
-	.suspend = mg_suspend,
-	.resume = mg_resume,
 	.driver = {
 		.name = MG_DEV_NAME,
 		.owner = THIS_MODULE,
+		.pm = &mg_pm,
 	}
 };
 
diff --git a/drivers/block/mtip32xx/Kconfig b/drivers/block/mtip32xx/Kconfig
new file mode 100644
index 00000000000..0ba837fc62a
--- /dev/null
+++ b/drivers/block/mtip32xx/Kconfig
@@ -0,0 +1,9 @@
+#
+# mtip32xx device driver configuration
+#
+
+config BLK_DEV_PCIESSD_MTIP32XX
+	tristate "Block Device Driver for Micron PCIe SSDs"
+	depends on PCI
+	help
+          This enables the block driver for Micron PCIe SSDs.
diff --git a/drivers/block/mtip32xx/Makefile b/drivers/block/mtip32xx/Makefile
new file mode 100644
index 00000000000..4fbef8c8329
--- /dev/null
+++ b/drivers/block/mtip32xx/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for  Block device driver for Micron PCIe SSD
+#
+
+obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx.o
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
new file mode 100644
index 00000000000..295f3afbbef
--- /dev/null
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -0,0 +1,4741 @@
+/*
+ * Driver for the Micron P320 SSD
+ *   Copyright (C) 2011 Micron Technology, Inc.
+ *
+ * Portions of this code were derived from works subjected to the
+ * following copyright:
+ *    Copyright (C) 2009 Integrated Device Technology, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/ata.h>
+#include <linux/delay.h>
+#include <linux/hdreg.h>
+#include <linux/uaccess.h>
+#include <linux/random.h>
+#include <linux/smp.h>
+#include <linux/compat.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/bio.h>
+#include <linux/dma-mapping.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <../drivers/ata/ahci.h>
+#include <linux/export.h>
+#include <linux/debugfs.h>
+#include <linux/prefetch.h>
+#include "mtip32xx.h"
+
+#define HW_CMD_SLOT_SZ		(MTIP_MAX_COMMAND_SLOTS * 32)
+
+/* DMA region containing RX Fis, Identify, RLE10, and SMART buffers */
+#define AHCI_RX_FIS_SZ          0x100
+#define AHCI_RX_FIS_OFFSET      0x0
+#define AHCI_IDFY_SZ            ATA_SECT_SIZE
+#define AHCI_IDFY_OFFSET        0x400
+#define AHCI_SECTBUF_SZ         ATA_SECT_SIZE
+#define AHCI_SECTBUF_OFFSET     0x800
+#define AHCI_SMARTBUF_SZ        ATA_SECT_SIZE
+#define AHCI_SMARTBUF_OFFSET    0xC00
+/* 0x100 + 0x200 + 0x200 + 0x200 is smaller than 4k but we pad it out */
+#define BLOCK_DMA_ALLOC_SZ      4096
+
+/* DMA region containing command table (should be 8192 bytes) */
+#define AHCI_CMD_SLOT_SZ        sizeof(struct mtip_cmd_hdr)
+#define AHCI_CMD_TBL_SZ         (MTIP_MAX_COMMAND_SLOTS * AHCI_CMD_SLOT_SZ)
+#define AHCI_CMD_TBL_OFFSET     0x0
+
+/* DMA region per command (contains header and SGL) */
+#define AHCI_CMD_TBL_HDR_SZ     0x80
+#define AHCI_CMD_TBL_HDR_OFFSET 0x0
+#define AHCI_CMD_TBL_SGL_SZ     (MTIP_MAX_SG * sizeof(struct mtip_cmd_sg))
+#define AHCI_CMD_TBL_SGL_OFFSET AHCI_CMD_TBL_HDR_SZ
+#define CMD_DMA_ALLOC_SZ        (AHCI_CMD_TBL_SGL_SZ + AHCI_CMD_TBL_HDR_SZ)
+
+
+#define HOST_CAP_NZDMA		(1 << 19)
+#define HOST_HSORG		0xFC
+#define HSORG_DISABLE_SLOTGRP_INTR (1<<24)
+#define HSORG_DISABLE_SLOTGRP_PXIS (1<<16)
+#define HSORG_HWREV		0xFF00
+#define HSORG_STYLE		0x8
+#define HSORG_SLOTGROUPS	0x7
+
+#define PORT_COMMAND_ISSUE	0x38
+#define PORT_SDBV		0x7C
+
+#define PORT_OFFSET		0x100
+#define PORT_MEM_SIZE		0x80
+
+#define PORT_IRQ_ERR \
+	(PORT_IRQ_HBUS_ERR | PORT_IRQ_IF_ERR | PORT_IRQ_CONNECT | \
+	 PORT_IRQ_PHYRDY | PORT_IRQ_UNK_FIS | PORT_IRQ_BAD_PMP | \
+	 PORT_IRQ_TF_ERR | PORT_IRQ_HBUS_DATA_ERR | PORT_IRQ_IF_NONFATAL | \
+	 PORT_IRQ_OVERFLOW)
+#define PORT_IRQ_LEGACY \
+	(PORT_IRQ_PIOS_FIS | PORT_IRQ_D2H_REG_FIS)
+#define PORT_IRQ_HANDLED \
+	(PORT_IRQ_SDB_FIS | PORT_IRQ_LEGACY | \
+	 PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR | \
+	 PORT_IRQ_CONNECT | PORT_IRQ_PHYRDY)
+#define DEF_PORT_IRQ \
+	(PORT_IRQ_ERR | PORT_IRQ_LEGACY | PORT_IRQ_SDB_FIS)
+
+/* product numbers */
+#define MTIP_PRODUCT_UNKNOWN	0x00
+#define MTIP_PRODUCT_ASICFPGA	0x11
+
+/* Device instance number, incremented each time a device is probed. */
+static int instance;
+
+struct list_head online_list;
+struct list_head removing_list;
+spinlock_t dev_lock;
+
+/*
+ * Global variable used to hold the major block device number
+ * allocated in mtip_init().
+ */
+static int mtip_major;
+static struct dentry *dfs_parent;
+static struct dentry *dfs_device_status;
+
+static u32 cpu_use[NR_CPUS];
+
+static DEFINE_SPINLOCK(rssd_index_lock);
+static DEFINE_IDA(rssd_index_ida);
+
+static int mtip_block_initialize(struct driver_data *dd);
+
+#ifdef CONFIG_COMPAT
+struct mtip_compat_ide_task_request_s {
+	__u8		io_ports[8];
+	__u8		hob_ports[8];
+	ide_reg_valid_t	out_flags;
+	ide_reg_valid_t	in_flags;
+	int		data_phase;
+	int		req_cmd;
+	compat_ulong_t	out_size;
+	compat_ulong_t	in_size;
+};
+#endif
+
+/*
+ * mtip_check_surprise_removal() checks whether the card has
+ * been removed from the system. It does so by reading
+ * the vendor ID from the PCI configuration space.
+ *
+ * @pdev Pointer to the pci_dev structure.
+ *
+ * return value
+ *	 true if device removed, else false
+ */
+static bool mtip_check_surprise_removal(struct pci_dev *pdev)
+{
+	u16 vendor_id = 0;
+	struct driver_data *dd = pci_get_drvdata(pdev);
+
+	if (dd->sr)
+		return true;
+
+	/* Read the vendor ID from the configuration space */
+	pci_read_config_word(pdev, 0x00, &vendor_id);
+	if (vendor_id == 0xFFFF) {
+		dd->sr = true;
+		if (dd->queue)
+			set_bit(QUEUE_FLAG_DEAD, &dd->queue->queue_flags);
+		else
+			dev_warn(&dd->pdev->dev,
+				"%s: dd->queue is NULL\n", __func__);
+		if (dd->port) {
+			set_bit(MTIP_PF_SR_CLEANUP_BIT, &dd->port->flags);
+			wake_up_interruptible(&dd->port->svc_wait);
+		} else
+			dev_warn(&dd->pdev->dev,
+				"%s: dd->port is NULL\n", __func__);
+		return true; /* device removed */
+	}
+
+	return false; /* device present */
+}
+
+static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
+{
+	struct request *rq;
+
+	rq = blk_mq_alloc_request(dd->queue, 0, __GFP_WAIT, true);
+	return blk_mq_rq_to_pdu(rq);
+}
+
+static void mtip_put_int_command(struct driver_data *dd, struct mtip_cmd *cmd)
+{
+	blk_put_request(blk_mq_rq_from_pdu(cmd));
+}
+
+/*
+ * Once we add support for one hctx per mtip group, this will change a bit
+ */
+static struct request *mtip_rq_from_tag(struct driver_data *dd,
+					unsigned int tag)
+{
+	struct blk_mq_hw_ctx *hctx = dd->queue->queue_hw_ctx[0];
+
+	return blk_mq_tag_to_rq(hctx->tags, tag);
+}
+
+static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd,
+					  unsigned int tag)
+{
+	struct request *rq = mtip_rq_from_tag(dd, tag);
+
+	return blk_mq_rq_to_pdu(rq);
+}
+
+/*
+ * IO completion function.
+ *
+ * This completion function is called by the driver ISR when a
+ * command that was issued by the kernel completes. It unmaps the
+ * scatter list associated with the completed command, releases the
+ * unaligned-command slot semaphore if the command was flagged as
+ * unaligned, and finally completes the request in the block layer
+ * via blk_mq_end_io().
+ *
+ * @port   Pointer to the port data structure.
+ * @tag    Tag of the command.
+ * @cmd    Pointer to the completed command.
+ * @status Completion status.
+ *
+ * return value
+ *	None
+ */
+static void mtip_async_complete(struct mtip_port *port,
+				int tag, struct mtip_cmd *cmd, int status)
+{
+	/* Only dereference port once it is known to be non-NULL. */
+	struct driver_data *dd = port ? port->dd : NULL;
+	struct request *rq;
+
+	if (unlikely(!dd))
+		return;
+
+	if (unlikely(status == PORT_IRQ_TF_ERR))
+		dev_warn(&dd->pdev->dev,
+			"Command tag %d failed due to TFE\n", tag);
+
+	/* Unmap the DMA scatter list entries */
+	dma_unmap_sg(&dd->pdev->dev, cmd->sg, cmd->scatter_ents, cmd->direction);
+
+	rq = mtip_rq_from_tag(dd, tag);
+
+	if (unlikely(cmd->unaligned))
+		up(&port->cmd_slot_unal);
+
+	blk_mq_end_io(rq, status ? -EIO : 0);
+}
+
+/*
+ * Reset the HBA (without sleeping)
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0	The reset was successful.
+ *	-1	The HBA Reset bit did not clear, or removal is pending.
+ */
+static int mtip_hba_reset(struct driver_data *dd)
+{
+	unsigned long timeout;
+
+	/* Set the reset bit */
+	writel(HOST_RESET, dd->mmio + HOST_CTL);
+
+	/* Flush */
+	readl(dd->mmio + HOST_CTL);
+
+	/* Spin for up to 2 seconds, waiting for reset acknowledgement */
+	timeout = jiffies + msecs_to_jiffies(2000);
+	do {
+		mdelay(10);
+		if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))
+			return -1;
+
+	} while ((readl(dd->mmio + HOST_CTL) & HOST_RESET)
+		 && time_before(jiffies, timeout));
+
+	if (readl(dd->mmio + HOST_CTL) & HOST_RESET)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Issue a command to the hardware.
+ *
+ * Set the appropriate bit in the s_active and Command Issue hardware
+ * registers, causing hardware command processing to begin.
+ *
+ * @port Pointer to the port structure.
+ * @tag  The tag of the command to be issued.
+ *
+ * return value
+ *      None
+ */
+static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag)
+{
+	int group = tag >> 5;
+
+	/* guard SACT and CI registers */
+	spin_lock(&port->cmd_issue_lock[group]);
+	writel((1 << MTIP_TAG_BIT(tag)),
+			port->s_active[MTIP_TAG_INDEX(tag)]);
+	writel((1 << MTIP_TAG_BIT(tag)),
+			port->cmd_issue[MTIP_TAG_INDEX(tag)]);
+	spin_unlock(&port->cmd_issue_lock[group]);
+}
+
+/*
+ * Turn FIS reception on or off for a port.
+ *
+ * @port   Port whose PORT_CMD register is updated
+ * @enable Non-zero sets PORT_CMD_FIS_RX, zero clears it
+ *
+ * return value
+ *	1 if PORT_CMD_FIS_RX was set before the update, otherwise 0
+ */
+static int mtip_enable_fis(struct mtip_port *port, int enable)
+{
+	u32 prev, next;
+
+	/* Sample PORT_CMD and derive the new FIS receive setting */
+	prev = readl(port->mmio + PORT_CMD);
+	next = enable ? (prev | PORT_CMD_FIS_RX) : (prev & ~PORT_CMD_FIS_RX);
+	writel(next, port->mmio + PORT_CMD);
+
+	/* Flush */
+	readl(port->mmio + PORT_CMD);
+
+	if ((prev & PORT_CMD_FIS_RX) == PORT_CMD_FIS_RX)
+		return 1;
+	return 0;
+}
+
+/*
+ * Start or stop the port DMA engine.
+ *
+ * @port   Port whose PORT_CMD register is updated
+ * @enable Non-zero sets PORT_CMD_START, zero clears it
+ *
+ * return value
+ *	1 if PORT_CMD_START was set before the update, otherwise 0
+ */
+static int mtip_enable_engine(struct mtip_port *port, int enable)
+{
+	u32 prev, next;
+
+	/* Sample PORT_CMD and derive the new DMA engine setting */
+	prev = readl(port->mmio + PORT_CMD);
+	next = enable ? (prev | PORT_CMD_START) : (prev & ~PORT_CMD_START);
+	writel(next, port->mmio + PORT_CMD);
+
+	/* Read the register back after the write */
+	readl(port->mmio + PORT_CMD);
+	/* Report the state found before the update */
+	return (prev & PORT_CMD_START) == PORT_CMD_START;
+}
+
+/*
+ * Enables FIS reception first, then the port DMA engine.
+ * @port Pointer to the port structure.
+ * return value
+ *	None
+ */
+static inline void mtip_start_port(struct mtip_port *port)
+{
+	/* Enable FIS reception */
+	mtip_enable_fis(port, 1);
+
+	/* Enable the DMA engine */
+	mtip_enable_engine(port, 1);
+}
+
+/*
+ * Deinitialize a port by disabling port interrupts, the DMA engine,
+ * and FIS reception, in that order.
+ *
+ * @port Pointer to the port structure
+ *
+ * return value
+ *	None
+ */
+static inline void mtip_deinit_port(struct mtip_port *port)
+{
+	/* Disable interrupts on this port by zeroing its interrupt mask */
+	writel(0, port->mmio + PORT_IRQ_MASK);
+
+	/* Disable the DMA engine */
+	mtip_enable_engine(port, 0);
+
+	/* Disable FIS reception */
+	mtip_enable_fis(port, 0);
+}
+
+/*
+ * Initialize a port.
+ *
+ * This function deinitializes the port by calling mtip_deinit_port() and
+ * then initializes it by setting the command header and RX FIS addresses,
+ * clearing the SError register and any pending port interrupts before
+ * re-enabling the default set of port interrupts.
+ * The DMA engine and FIS reception are not re-enabled here.
+ * @port Pointer to the port structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_init_port(struct mtip_port *port)
+{
+	int i;
+	mtip_deinit_port(port);
+
+	/* Program the list and FIS base addresses; high halves need HOST_CAP_64 */
+	if (readl(port->dd->mmio + HOST_CAP) & HOST_CAP_64) {
+		writel((port->command_list_dma >> 16) >> 16,
+			 port->mmio + PORT_LST_ADDR_HI);
+		writel((port->rxfis_dma >> 16) >> 16,
+			 port->mmio + PORT_FIS_ADDR_HI);
+	}
+
+	writel(port->command_list_dma & 0xFFFFFFFF,
+			port->mmio + PORT_LST_ADDR);
+	writel(port->rxfis_dma & 0xFFFFFFFF, port->mmio + PORT_FIS_ADDR);
+
+	/* Clear SError by writing back the value just read */
+	writel(readl(port->mmio + PORT_SCR_ERR), port->mmio + PORT_SCR_ERR);
+
+	/* Write all-ones to the completed register of every slot group */
+	for (i = 0; i < port->dd->slot_groups; i++)
+		writel(0xFFFFFFFF, port->completed[i]);
+
+	/* Clear any pending interrupts for this port */
+	writel(readl(port->mmio + PORT_IRQ_STAT), port->mmio + PORT_IRQ_STAT);
+
+	/* Clear any pending interrupts on the HBA. */
+	writel(readl(port->dd->mmio + HOST_IRQ_STAT),
+					port->dd->mmio + HOST_IRQ_STAT);
+
+	/* Enable the default set of port interrupts (DEF_PORT_IRQ) */
+	writel(DEF_PORT_IRQ, port->mmio + PORT_IRQ_MASK);
+}
+
+/*
+ * Restart a port: stop the DMA engine, issue a COM reset, then re-init.
+ * Returns early whenever MTIP_DDF_REMOVE_PENDING_BIT is seen at a checkpoint.
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_restart_port(struct mtip_port *port)
+{
+	unsigned long timeout;
+
+	/* Disable the DMA engine */
+	mtip_enable_engine(port, 0);
+
+	/* Chip quirk: busy-wait up to 500ms for PxCMD.CR == 0 */
+	timeout = jiffies + msecs_to_jiffies(500);
+	while ((readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON)
+		 && time_before(jiffies, timeout))
+		;
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+		return;
+
+	/*
+	 * Chip quirk: escalate to hba reset if
+	 * PxCMD.CR not clear after 500 ms
+	 */
+	if (readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON) {
+		dev_warn(&port->dd->pdev->dev,
+			"PxCMD.CR not clear, escalating reset\n");
+
+		if (mtip_hba_reset(port->dd))
+			dev_err(&port->dd->pdev->dev,
+				"HBA reset escalation failed.\n");
+
+		/* 30 ms delay before com reset to quiesce chip */
+		mdelay(30);
+	}
+
+	dev_warn(&port->dd->pdev->dev, "Issuing COM reset\n");
+
+	/* Set PxSCTL.DET (bit 0), then read back */
+	writel(readl(port->mmio + PORT_SCR_CTL) |
+			 1, port->mmio + PORT_SCR_CTL);
+	readl(port->mmio + PORT_SCR_CTL);
+
+	/* Busy-wait 1 ms to quiesce chip function */
+	timeout = jiffies + msecs_to_jiffies(1);
+	while (time_before(jiffies, timeout))
+		;
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+		return;
+
+	/* Clear PxSCTL.DET, then read back */
+	writel(readl(port->mmio + PORT_SCR_CTL) & ~1,
+			 port->mmio + PORT_SCR_CTL);
+	readl(port->mmio + PORT_SCR_CTL);
+
+	/* Busy-wait up to 500 ms for bit 0 of PORT_SCR_STAT to be set */
+	timeout = jiffies + msecs_to_jiffies(500);
+	while (((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0)
+			 && time_before(jiffies, timeout))
+		;
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+		return;
+
+	if ((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0) /* warn only, keep going */
+		dev_warn(&port->dd->pdev->dev,
+			"COM reset failed\n");
+
+	mtip_init_port(port);
+	mtip_start_port(port);
+
+}
+/* Reset the HBA, then re-init and restart the port; returns -EFAULT if the HBA reset failed, else 0. */
+static int mtip_device_reset(struct driver_data *dd)
+{
+	int rv = 0;
+
+	if (mtip_check_surprise_removal(dd->pdev))
+		return 0; /* nothing is touched in this case */
+
+	if (mtip_hba_reset(dd) < 0)
+		rv = -EFAULT; /* remembered; the port is still re-initialized below */
+
+	mdelay(1);
+	mtip_init_port(dd->port);
+	mtip_start_port(dd->port);
+
+	/* Enable interrupts on the HBA. */
+	writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
+					dd->mmio + HOST_CTL);
+	return rv;
+}
+
+/*
+ * Log @cnt and a hex dump of @tagbits (highest long first) labelled @msg.
+ * The dump is built with scnprintf() so it cannot overrun tagmap[].
+ */
+static void print_tags(struct driver_data *dd,
+			char *msg,
+			unsigned long *tagbits, int cnt)
+{
+	char tagmap[128];
+	int group, tagmap_len = 0;
+
+	memset(tagmap, 0, sizeof(tagmap));
+	for (group = SLOTBITS_IN_LONGS; group > 0; group--)
+		tagmap_len += scnprintf(tagmap + tagmap_len,
+			sizeof(tagmap) - tagmap_len, "%016lX ", tagbits[group-1]);
+	dev_warn(&dd->pdev->dev,
+			"%d command(s) %s: tagmap [%s]", cnt, msg, tagmap);
+}
+
+/*
+ * Internal command completion callback function.
+ *
+ * This function is normally called by the driver ISR when an internal
+ * command completed. This function signals the command completion by
+ * calling complete().
+ *
+ * @port    Pointer to the port data structure.
+ * @tag     Tag of the command that has completed.
+ * @command Command whose comp_data points to the completion structure.
+ * @status  Completion status; PORT_IRQ_TF_ERR is logged as a warning.
+ *
+ * return value
+ *	None
+ */
+static void mtip_completion(struct mtip_port *port,
+			    int tag, struct mtip_cmd *command, int status)
+{
+	struct completion *waiting = command->comp_data;
+	if (unlikely(status == PORT_IRQ_TF_ERR))
+		dev_warn(&port->dd->pdev->dev,
+			"Internal command %d completed with TFE\n", tag);
+
+	complete(waiting); /* signalled whatever @status is */
+}
+/* Completion callback that does nothing; installed for polled internal commands. */
+static void mtip_null_completion(struct mtip_port *port,
+			    int tag, struct mtip_cmd *command, int status)
+{
+}
+/* Forward declarations; both functions are defined further down in this file. */
+static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
+				dma_addr_t buffer_dma, unsigned int sectors);
+static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
+						struct smart_attr *attrib);
+/*
+ * Handle a taskfile error: complete what finished, restart the port,
+ * read the NCQ error log, then reissue or fail the remaining tags.
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_handle_tfe(struct driver_data *dd)
+{
+	int group, tag, bit, reissue, rv;
+	struct mtip_port *port;
+	struct mtip_cmd  *cmd;
+	u32 completed;
+	struct host_to_dev_fis *fis;
+	unsigned long tagaccum[SLOTBITS_IN_LONGS];
+	unsigned int cmd_cnt = 0;
+	unsigned char *buf;
+	char *fail_reason = NULL;
+	int fail_all_ncq_write = 0, fail_all_ncq_cmds = 0;
+
+	dev_warn(&dd->pdev->dev, "Taskfile error\n");
+
+	port = dd->port;
+
+	set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+
+	if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) &&
+			test_bit(MTIP_TAG_INTERNAL, port->allocated)) {
+		cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL);
+		dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n");
+
+		if (cmd->comp_data && cmd->comp_func) {
+			cmd->comp_func(port, MTIP_TAG_INTERNAL,
+					cmd, PORT_IRQ_TF_ERR);
+		}
+		goto handle_tfe_exit;
+	}
+
+	/* clear the tag accumulator */
+	memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+
+	/* Loop through all the groups */
+	for (group = 0; group < dd->slot_groups; group++) {
+		completed = readl(port->completed[group]);
+
+		dev_warn(&dd->pdev->dev, "g=%u, comp=%x\n", group, completed);
+
+		/* clear completed status register in the hardware.*/
+		writel(completed, port->completed[group]);
+
+		/* Process successfully completed commands */
+		for (bit = 0; bit < 32 && completed; bit++) {
+			if (!(completed & (1<<bit)))
+				continue;
+			tag = (group << 5) + bit;
+
+			/* Skip the internal command slot */
+			if (tag == MTIP_TAG_INTERNAL)
+				continue;
+
+			cmd = mtip_cmd_from_tag(dd, tag);
+			if (likely(cmd->comp_func)) {
+				set_bit(tag, tagaccum);
+				cmd_cnt++;
+				cmd->comp_func(port, tag, cmd, 0);
+			} else {
+				dev_err(&port->dd->pdev->dev,
+					"Missing completion func for tag %d",
+					tag);
+				if (mtip_check_surprise_removal(dd->pdev)) {
+					/* don't proceed further (EH_ACTIVE stays set) */
+					return;
+				}
+			}
+		}
+	}
+
+	print_tags(dd, "completed (TFE)", tagaccum, cmd_cnt);
+
+	/* Restart the port */
+	mdelay(20);
+	mtip_restart_port(port);
+
+	/* Trying to determine the cause of the error */
+	rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
+				dd->port->log_buf,
+				dd->port->log_buf_dma, 1);
+	if (rv) {
+		dev_warn(&dd->pdev->dev,
+			"Error in READ LOG EXT (10h) command\n");
+		/* non-critical error: carry on without a failure reason */
+	} else {
+		buf = (unsigned char *)dd->port->log_buf;
+		if (buf[259] & 0x1) {
+			dev_info(&dd->pdev->dev,
+				"Write protect bit is set.\n");
+			set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
+			fail_all_ncq_write = 1;
+			fail_reason = "write protect";
+		}
+		if (buf[288] == 0xF7) {
+			dev_info(&dd->pdev->dev,
+				"Exceeded Tmax, drive in thermal shutdown.\n");
+			set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
+			fail_all_ncq_cmds = 1;
+			fail_reason = "thermal shutdown";
+		}
+		if (buf[288] == 0xBF) {
+			set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag);
+			dev_info(&dd->pdev->dev,
+				"Drive indicates rebuild has failed. Secure erase required.\n");
+			fail_all_ncq_cmds = 1;
+			fail_reason = "rebuild failed";
+		}
+	}
+
+	/* clear the tag accumulator */
+	memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+	cmd_cnt = 0; /* restart the count so the second log line reports reissues */
+	/* Loop through all the groups */
+	for (group = 0; group < dd->slot_groups; group++) {
+		for (bit = 0; bit < 32; bit++) {
+			reissue = 1;
+			tag = (group << 5) + bit;
+			cmd = mtip_cmd_from_tag(dd, tag);
+
+			fis = (struct host_to_dev_fis *)cmd->command;
+
+			/* Should re-issue? */
+			if (tag == MTIP_TAG_INTERNAL ||
+			    fis->command == ATA_CMD_SET_FEATURES)
+				reissue = 0;
+			else {
+				if (fail_all_ncq_cmds ||
+					(fail_all_ncq_write &&
+					fis->command == ATA_CMD_FPDMA_WRITE)) {
+					dev_warn(&dd->pdev->dev,
+					"  Fail: %s w/tag %d [%s].\n",
+					fis->command == ATA_CMD_FPDMA_WRITE ?
+						"write" : "read",
+					tag,
+					fail_reason != NULL ?
+						fail_reason : "unknown");
+					if (cmd->comp_func) {
+						cmd->comp_func(port, tag,
+							cmd, -ENODATA);
+					}
+					continue;
+				}
+			}
+
+			/*
+			 * First check if this command has
+			 *  exceeded its retries.
+			 */
+			if (reissue && (cmd->retries-- > 0)) {
+				/* Record the tag for the "reissued" log line. */
+				set_bit(tag, tagaccum);
+				cmd_cnt++;
+				/* Re-issue the command. */
+				mtip_issue_ncq_command(port, tag);
+
+				continue;
+			}
+
+			/* Retire a command that will not be reissued */
+			dev_warn(&port->dd->pdev->dev,
+				"retiring tag %d\n", tag);
+
+			if (cmd->comp_func)
+				cmd->comp_func(port, tag, cmd, PORT_IRQ_TF_ERR);
+			else
+				dev_warn(&port->dd->pdev->dev,
+					"Bad completion for tag %d\n",
+					tag);
+		}
+	}
+	print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt);
+
+handle_tfe_exit:
+	/* clear eh_active */
+	clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+	wake_up_interruptible(&port->svc_wait);
+}
+
+/*
+ * Handle a set device bits interrupt: complete every tag set in @completed
+ * for slot group @group, then drop this worker's irq_workers_active count.
+ */
+static inline void mtip_workq_sdbfx(struct mtip_port *port, int group,
+							u32 completed)
+{
+	struct driver_data *dd = port->dd;
+	int tag, bit;
+	struct mtip_cmd *command;
+
+	if (!completed) {
+		WARN_ON_ONCE(!completed);
+		return;
+	}
+	/* clear completed status register in the hardware.*/
+	writel(completed, port->completed[group]);
+
+	/* Process completed commands; shift in the loop step so continue keeps bit in sync. */
+	for (bit = 0; (bit < 32) && completed;
+	     bit++, completed >>= 1) {
+		if (completed & 0x01) {
+			tag = (group << 5) | bit;
+
+			/* skip internal command slot. */
+			if (unlikely(tag == MTIP_TAG_INTERNAL))
+				continue;
+
+			command = mtip_cmd_from_tag(dd, tag);
+			if (likely(command->comp_func))
+				command->comp_func(port, tag, command, 0);
+			else {
+				dev_dbg(&dd->pdev->dev,
+					"Null completion for tag %d",
+					tag);
+				if (mtip_check_surprise_removal(
+					dd->pdev)) {
+					return;
+				}
+			}
+		}
+	}
+
+	/* If last, re-enable interrupts */
+	if (atomic_dec_return(&dd->irq_workers_active) == 0)
+		writel(0xffffffff, dd->mmio + HOST_IRQ_STAT);
+}
+
+/*
+ * Legacy (PIO / D2H) interrupt: finish the internal command, if one is done.
+ */
+static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat)
+{
+	struct mtip_port *port = dd->port;
+	struct mtip_cmd *int_cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL);
+
+	/* Only relevant while an internal command is marked active. */
+	if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) || int_cmd == NULL)
+		return;
+
+	/* Bail out while the internal tag's bit is still set in cmd_issue. */
+	if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) & (1 << MTIP_TAG_INTERNAL))
+		return;
+
+	if (int_cmd->comp_func)
+		int_cmd->comp_func(port, MTIP_TAG_INTERNAL, int_cmd, 0);
+}
+
+/*
+ * Demux and handle errors: clear SError DIAG bits, flag EH and wake svc_wait.
+ */
+static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat)
+{
+
+	if (unlikely(port_stat & PORT_IRQ_CONNECT)) {
+		dev_warn(&dd->pdev->dev,
+			"Clearing PxSERR.DIAG.x\n");
+		writel((1 << 26), dd->port->mmio + PORT_SCR_ERR); /* bit 26 */
+	}
+
+	if (unlikely(port_stat & PORT_IRQ_PHYRDY)) {
+		dev_warn(&dd->pdev->dev,
+			"Clearing PxSERR.DIAG.n\n");
+		writel((1 << 16), dd->port->mmio + PORT_SCR_ERR); /* bit 16 */
+	}
+
+	if (unlikely(port_stat & ~PORT_IRQ_HANDLED)) { /* bits outside PORT_IRQ_HANDLED */
+		dev_warn(&dd->pdev->dev,
+			"Port stat errors %x unhandled\n",
+			(port_stat & ~PORT_IRQ_HANDLED));
+		if (mtip_check_surprise_removal(dd->pdev))
+			return;
+	}
+	if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR))) {
+		set_bit(MTIP_PF_EH_ACTIVE_BIT, &dd->port->flags); /* mark EH active */
+		wake_up_interruptible(&dd->port->svc_wait);
+	}
+}
+/* Read and demux the HBA/port interrupt status; returns IRQ_HANDLED if HOST_IRQ_STAT was non-zero. */
+static inline irqreturn_t mtip_handle_irq(struct driver_data *data)
+{
+	struct driver_data *dd = (struct driver_data *) data;
+	struct mtip_port *port = dd->port;
+	u32 hba_stat, port_stat;
+	int rv = IRQ_NONE;
+	int do_irq_enable = 1, i, workers;
+	struct mtip_work *twork;
+
+	hba_stat = readl(dd->mmio + HOST_IRQ_STAT);
+	if (hba_stat) {
+		rv = IRQ_HANDLED;
+
+		/* Acknowledge the interrupt status on the port.*/
+		port_stat = readl(port->mmio + PORT_IRQ_STAT);
+		writel(port_stat, port->mmio + PORT_IRQ_STAT);
+
+		/* Demux port status */
+		if (likely(port_stat & PORT_IRQ_SDB_FIS)) {
+			do_irq_enable = 0;
+			WARN_ON_ONCE(atomic_read(&dd->irq_workers_active) != 0);
+
+			/* Snapshot completed[] for every group and count the non-empty ones */
+			for (i = 0, workers = 0; i < MTIP_MAX_SLOT_GROUPS;
+									i++) {
+				twork = &dd->work[i];
+				twork->completed = readl(port->completed[i]);
+				if (twork->completed)
+					workers++;
+			}
+
+			atomic_set(&dd->irq_workers_active, workers);
+			if (workers) {
+				for (i = 1; i < MTIP_MAX_SLOT_GROUPS; i++) { /* groups 1..N go to the workqueue */
+					twork = &dd->work[i];
+					if (twork->completed)
+						queue_work_on(
+							twork->cpu_binding,
+							dd->isr_workq,
+							&twork->work);
+				}
+
+				if (likely(dd->work[0].completed)) /* group 0 is handled inline */
+					mtip_workq_sdbfx(port, 0,
+							dd->work[0].completed);
+
+			} else {
+				/*
+				 * Chip quirk: SDB interrupt but nothing
+				 * to complete
+				 */
+				do_irq_enable = 1;
+			}
+		}
+
+		if (unlikely(port_stat & PORT_IRQ_ERR)) {
+			if (unlikely(mtip_check_surprise_removal(dd->pdev))) {
+				/* don't proceed further */
+				return IRQ_HANDLED;
+			}
+			if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+							&dd->dd_flag))
+				return rv; /* HOST_IRQ_STAT is not written on this path */
+
+			mtip_process_errors(dd, port_stat & PORT_IRQ_ERR);
+		}
+
+		if (unlikely(port_stat & PORT_IRQ_LEGACY))
+			mtip_process_legacy(dd, port_stat & PORT_IRQ_LEGACY);
+	}
+
+	/* acknowledge interrupt, unless SDB completions were dispatched above */
+	if (unlikely(do_irq_enable))
+		writel(hba_stat, dd->mmio + HOST_IRQ_STAT);
+
+	return rv;
+}
+
+/*
+ * HBA interrupt subroutine.
+ * Thin wrapper that forwards @instance to mtip_handle_irq().
+ * @irq		IRQ number.
+ * @instance	Pointer to the driver data structure.
+ *
+ * return value
+ *	IRQ_HANDLED	A HBA interrupt was pending and handled.
+ *	IRQ_NONE	This interrupt was not for the HBA.
+ */
+static irqreturn_t mtip_irq_handler(int irq, void *instance)
+{
+	struct driver_data *dd = instance;
+
+	return mtip_handle_irq(dd);
+}
+/* Issue a non-NCQ command: set only the tag's bit in cmd_issue (s_active is not written, no lock taken). */
+static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag)
+{
+	writel(1 << MTIP_TAG_BIT(tag),
+		port->cmd_issue[MTIP_TAG_INDEX(tag)]);
+}
+
+/*
+ * After internal command @fis has run, decide whether NCQ stays paused.
+ * Returns true (the caller then leaves MTIP_PF_IC_ACTIVE_BIT set) for an
+ * accepted SECURITY ERASE PREPARE or DOWNLOAD MICROCODE; false otherwise.
+ */
+static bool mtip_pause_ncq(struct mtip_port *port,
+				struct host_to_dev_fis *fis)
+{
+	unsigned long task_file_data = readl(port->mmio+PORT_TFDATA);
+
+	if (fis->command == ATA_CMD_SEC_ERASE_UNIT)
+		clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+
+	if ((task_file_data & 1))	/* bit 0 set: never pause */
+		return false;
+
+	if (fis->command == ATA_CMD_SEC_ERASE_PREP) {
+		set_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+		set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+		port->ic_pause_timer = jiffies;
+		return true;
+	} else if ((fis->command == ATA_CMD_DOWNLOAD_MICRO) &&
+					(fis->features == 0x03)) {
+		set_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
+		port->ic_pause_timer = jiffies;
+		return true;
+	} else if ((fis->command == ATA_CMD_SEC_ERASE_UNIT) ||
+		((fis->command == 0xFC) &&
+			(fis->features == 0x27 || fis->features == 0x72 ||
+			 fis->features == 0x62 || fis->features == 0x26))) {
+		/* Com reset after secure erase or lowlevel format */
+		mtip_restart_port(port);
+	}
+
+	return false;
+}
+
+/*
+ * Wait for port to quiesce
+ * Hardware queues are stopped while polling s_active and restarted on exit.
+ * @port    Pointer to port data structure
+ * @timeout Max duration to wait (ms)
+ * return value
+ *	0	Success
+ *	-EBUSY  Commands still active
+ *	-EFAULT Surprise removal detected or removal pending
+ */
+static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
+{
+	unsigned long to;
+	unsigned int n;
+	unsigned int active = 1; /* stays 1 if s_active is never sampled */
+
+	blk_mq_stop_hw_queues(port->dd->queue);
+
+	to = jiffies + msecs_to_jiffies(timeout);
+	do {
+		if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) &&
+			test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
+			msleep(20);
+			continue; /* svc thd is actively issuing commands */
+		}
+
+		msleep(100);
+		if (mtip_check_surprise_removal(port->dd->pdev))
+			goto err_fault;
+		if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+			goto err_fault;
+
+		/*
+		 * Ignore s_active bit 0 of array element 0.
+		 * This bit will always be set
+		 */
+		active = readl(port->s_active[0]) & 0xFFFFFFFE;
+		for (n = 1; n < port->dd->slot_groups; n++)
+			active |= readl(port->s_active[n]);
+
+		if (!active)
+			break;
+	} while (time_before(jiffies, to));
+
+	blk_mq_start_stopped_hw_queues(port->dd->queue, true);
+	return active ? -EBUSY : 0;
+err_fault:
+	blk_mq_start_stopped_hw_queues(port->dd->queue, true);
+	return -EFAULT;
+}
+
+/*
+ * Execute an internal command and wait for the completion.
+ * @port    Pointer to the port data structure.
+ * @fis     Pointer to the FIS that describes the command.
+ * @fis_len  Length in WORDS of the FIS.
+ * @buffer  DMA accessible for command data.
+ * @buf_len  Length, in bytes, of the data buffer.
+ * @opts    Command header options, excluding the FIS length
+ *             and the number of PRD entries.
+ * @atomic  GFP_KERNEL to sleep on a completion; any other value polls.
+ * @timeout Time in ms to wait for the command to complete.
+ * return value
+ *	0	 Command completed successfully.
+ *	-EFAULT  The buffer address is not correctly aligned.
+ *	-EBUSY   Failed to quiesce outstanding IO.
+ *	-EAGAIN  Time out waiting for command to complete.
+ *	-EINTR / -ENXIO / -EIO  Interrupted / device being removed / port error.
+ */
+static int mtip_exec_internal_command(struct mtip_port *port,
+					struct host_to_dev_fis *fis,
+					int fis_len,
+					dma_addr_t buffer,
+					int buf_len,
+					u32 opts,
+					gfp_t atomic,
+					unsigned long timeout)
+{
+	struct mtip_cmd_sg *command_sg;
+	DECLARE_COMPLETION_ONSTACK(wait);
+	struct mtip_cmd *int_cmd;
+	struct driver_data *dd = port->dd;
+	int rv = 0;
+
+	/* Make sure the buffer is 8 byte aligned. This is asic specific. */
+	if (buffer & 0x00000007) {
+		dev_err(&dd->pdev->dev, "SG buffer is not 8 byte aligned\n");
+		return -EFAULT;
+	}
+
+	int_cmd = mtip_get_int_command(dd);
+
+	set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+	port->ic_pause_timer = 0;
+
+	clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+	clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
+
+	if (atomic == GFP_KERNEL) {
+		if (fis->command != ATA_CMD_STANDBYNOW1) {
+			/* wait for io to complete if non atomic */
+			if (mtip_quiesce_io(port,
+					MTIP_QUIESCE_IO_TIMEOUT_MS) < 0) {
+				dev_warn(&dd->pdev->dev,
+					"Failed to quiesce IO\n");
+				mtip_put_int_command(dd, int_cmd);
+				clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+				wake_up_interruptible(&port->svc_wait);
+				return -EBUSY;
+			}
+		}
+
+		/* Set the completion function and data for the command. */
+		int_cmd->comp_data = &wait;
+		int_cmd->comp_func = mtip_completion;
+
+	} else {
+		/* Clear completion - we're going to poll */
+		int_cmd->comp_data = NULL;
+		int_cmd->comp_func = mtip_null_completion;
+	}
+
+	/* Copy the command to the command table */
+	memcpy(int_cmd->command, fis, fis_len*4);
+
+	/* Populate the SG list */
+	int_cmd->command_header->opts =
+		 __force_bit2int cpu_to_le32(opts | fis_len);
+	if (buf_len) {
+		command_sg = int_cmd->command + AHCI_CMD_TBL_HDR_SZ;
+
+		command_sg->info =
+			__force_bit2int cpu_to_le32((buf_len-1) & 0x3FFFFF);
+		command_sg->dba	=
+			__force_bit2int cpu_to_le32(buffer & 0xFFFFFFFF);
+		command_sg->dba_upper =
+			__force_bit2int cpu_to_le32((buffer >> 16) >> 16);
+
+		int_cmd->command_header->opts |=
+			__force_bit2int cpu_to_le32((1 << 16));
+	}
+
+	/* Populate the command header */
+	int_cmd->command_header->byte_count = 0;
+
+	/* Issue the command to the hardware */
+	mtip_issue_non_ncq_command(port, MTIP_TAG_INTERNAL);
+
+	if (atomic == GFP_KERNEL) {
+		/* Wait for the command to complete or timeout. */
+		if ((rv = wait_for_completion_interruptible_timeout(
+				&wait, msecs_to_jiffies(timeout))) <= 0) {
+			if (rv == -ERESTARTSYS) { /* interrupted */
+				dev_err(&dd->pdev->dev,
+					"Internal command [%02X] was interrupted after %lu ms\n",
+					fis->command, timeout);
+				rv = -EINTR;
+				goto exec_ic_exit;
+			} else if (rv == 0) /* timeout */
+				dev_err(&dd->pdev->dev,
+					"Internal command did not complete [%02X] within timeout of  %lu ms\n",
+					fis->command, timeout);
+			else
+				dev_err(&dd->pdev->dev,
+					"Internal command [%02X] wait returned code [%d] after %lu ms - unhandled\n",
+					fis->command, rv, timeout);
+
+			if (mtip_check_surprise_removal(dd->pdev) ||
+				test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+						&dd->dd_flag)) {
+				dev_err(&dd->pdev->dev,
+					"Internal command [%02X] wait returned due to SR\n",
+					fis->command);
+				rv = -ENXIO;
+				goto exec_ic_exit;
+			}
+			mtip_device_reset(dd); /* recover from timeout issue */
+			rv = -EAGAIN;
+			goto exec_ic_exit;
+		}
+		rv = 0; /* completed: do not leak the remaining-jiffies count */
+	} else {
+		u32 hba_stat, port_stat;
+
+		/* Spin for <timeout> checking if command still outstanding */
+		timeout = jiffies + msecs_to_jiffies(timeout);
+		while ((readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+				& (1 << MTIP_TAG_INTERNAL))
+				&& time_before(jiffies, timeout)) {
+			if (mtip_check_surprise_removal(dd->pdev)) {
+				rv = -ENXIO;
+				goto exec_ic_exit;
+			}
+			if ((fis->command != ATA_CMD_STANDBYNOW1) &&
+				test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+						&dd->dd_flag)) {
+				rv = -ENXIO;
+				goto exec_ic_exit;
+			}
+			port_stat = readl(port->mmio + PORT_IRQ_STAT);
+			if (!port_stat)
+				continue;
+
+			if (port_stat & PORT_IRQ_ERR) {
+				dev_err(&dd->pdev->dev,
+					"Internal command [%02X] failed\n",
+					fis->command);
+				mtip_device_reset(dd);
+				rv = -EIO;
+				goto exec_ic_exit;
+			} else {
+				writel(port_stat, port->mmio + PORT_IRQ_STAT);
+				hba_stat = readl(dd->mmio + HOST_IRQ_STAT);
+				if (hba_stat)
+					writel(hba_stat,
+						dd->mmio + HOST_IRQ_STAT);
+			}
+			break;
+		}
+	}
+
+	if (readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+			& (1 << MTIP_TAG_INTERNAL)) {
+		rv = -ENXIO;
+		if (!test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) {
+			mtip_device_reset(dd);
+			rv = -EAGAIN;
+		}
+	}
+exec_ic_exit:
+	/* Clear the allocated and active bits for the internal command. */
+	mtip_put_int_command(dd, int_cmd);
+	if (rv >= 0 && mtip_pause_ncq(port, fis)) {
+		/* NCQ paused */
+		return rv;
+	}
+	clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+	wake_up_interruptible(&port->svc_wait);
+
+	return rv;
+}
+
+/*
+ * Convert an ATA ID string in place.
+ *
+ * Strings in the identify data are stored as 16-bit words; each of the
+ * first @len / 2 words of @buf is passed through be16_to_cpus() so the
+ * result can be read as a C string.
+ *
+ * @buf First 16-bit word of the string
+ * @len String length in bytes (len / 2 words are converted)
+ *
+ * return value
+ *	None
+ */
+static inline void ata_swap_string(u16 *buf, unsigned int len)
+{
+	u16 *end = buf + (len / 2);
+	for (; buf != end; buf++)
+		be16_to_cpus(buf);
+}
+/* Store in *@timeout the wait time, in ms, to use for the command carried by @fis. */
+static void mtip_set_timeout(struct driver_data *dd,
+					struct host_to_dev_fis *fis,
+					unsigned int *timeout, u8 erasemode)
+{
+	switch (fis->command) {
+	case ATA_CMD_DOWNLOAD_MICRO:
+	case ATA_CMD_STANDBYNOW1:
+		/* 2 minutes */
+		*timeout = 120000;
+		return;
+	case ATA_CMD_SEC_ERASE_UNIT:
+	case 0xFC:
+		/* Scaled from the identify data: word 90 when
+		 * @erasemode is non-zero, word 89 otherwise. */
+		*timeout = dd->port->identify[erasemode ? 90 : 89] * 2 * 60000;
+		return;
+	case 0xF7:
+	case 0xFA:
+		/* 60 seconds */
+		*timeout = 60000;
+		return;
+	case ATA_CMD_SMART:
+		/* 15 seconds */
+		*timeout = 15000;
+		return;
+	default:
+		*timeout = MTIP_IOCTL_CMD_TIMEOUT_MS;
+		return;
+	}
+}
+
+/*
+ * Request the device identity information.
+ *
+ * If a user space buffer is not specified, i.e. is NULL, the
+ * identify information is still read from the drive and placed
+ * into the identify data buffer (@e port->identify) in the
+ * port data structure.
+ * When the identify buffer contains valid identify information @e
+ * port->identify_valid is non-zero.
+ *
+ * @port	 Pointer to the port structure.
+ * @user_buffer  A user space buffer where the identify data should be
+ *                    copied.
+ *
+ * return value
+ *	0	Command completed successfully.
+ *	-EFAULT Removal is pending, or copying to the user buffer failed.
+ *	-1	Command failed.
+ */
+static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
+{
+	int rv = 0;
+	struct host_to_dev_fis fis;
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+		return -EFAULT;
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= ATA_CMD_ID_ATA;
+
+	/* Set the identify information as invalid. */
+	port->identify_valid = 0;
+
+	/* Clear the identify information. */
+	memset(port->identify, 0, sizeof(u16) * ATA_ID_WORDS);
+
+	/* Execute the command (GFP_KERNEL: sleeps on a completion). */
+	if (mtip_exec_internal_command(port,
+				&fis,
+				5,
+				port->identify_dma,
+				sizeof(u16) * ATA_ID_WORDS,
+				0,
+				GFP_KERNEL,
+				MTIP_INT_CMD_TIMEOUT_MS)
+				< 0) {
+		rv = -1; /* identify_valid stays 0 on this path */
+		goto out;
+	}
+
+	/*
+	 * Perform any necessary byte-swapping.  Yes, the kernel does in fact
+	 * perform field-sensitive swapping on the string fields.
+	 * See the kernel use of ata_id_string() for proof of this.
+	 */
+#ifdef __LITTLE_ENDIAN
+	ata_swap_string(port->identify + 27, 40);  /* model string*/
+	ata_swap_string(port->identify + 23, 8);   /* firmware string*/
+	ata_swap_string(port->identify + 10, 20);  /* serial# string*/
+#else
+	{
+		int i;
+		for (i = 0; i < ATA_ID_WORDS; i++)
+			port->identify[i] = le16_to_cpu(port->identify[i]);
+	}
+#endif
+
+	/* Check security locked state (bit 2 of identify word 128) */
+	if (port->identify[128] & 0x4)
+		set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+	else
+		clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+
+#ifdef MTIP_TRIM /* Disabling TRIM support temporarily */
+	/* Demux ID.DRAT & ID.RZAT to determine trim support */
+	if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5))
+		port->dd->trim_supp = true;
+	else
+#endif
+		port->dd->trim_supp = false; /* unconditional unless MTIP_TRIM is defined */
+
+	/* Set the identify buffer as valid. */
+	port->identify_valid = 1;
+
+	if (user_buffer) {
+		if (copy_to_user(
+			user_buffer,
+			port->identify,
+			ATA_ID_WORDS * sizeof(u16))) {
+			rv = -EFAULT;
+			goto out;
+		}
+	}
+
+out:
+	return rv;
+}
+
+/*
+ * Issue a standby immediate command to the device.
+ *
+ * @port Pointer to the port structure.
+ *
+ * return value
+ *	0	Command was executed successfully.
+ *	other	Value returned by mtip_exec_internal_command() on failure.
+ */
+static int mtip_standby_immediate(struct mtip_port *port)
+{
+	int rv;
+	struct host_to_dev_fis	fis;
+	unsigned long start;
+	unsigned int timeout;
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= ATA_CMD_STANDBYNOW1;
+
+	mtip_set_timeout(port->dd, &fis, &timeout, 0);
+
+	start = jiffies; /* only used for the debug timing message below */
+	rv = mtip_exec_internal_command(port,
+					&fis,
+					5,
+					0,
+					0,
+					0,
+					GFP_ATOMIC, /* not GFP_KERNEL: completion is polled */
+					timeout);
+	dbg_printk(MTIP_DRV_NAME "Time taken to complete standby cmd: %d ms\n",
+			jiffies_to_msecs(jiffies - start));
+	if (rv)
+		dev_warn(&port->dd->pdev->dev,
+			"STANDBY IMMEDIATE command failed.\n");
+
+	return rv;
+}
+
+/*
+ * Issue a READ LOG EXT command to the device.
+ * @buffer is zeroed (sectors * ATA_SECT_SIZE bytes) before the command runs.
+ * @port	pointer to the port structure.
+ * @page	page number to fetch
+ * @buffer	pointer to buffer
+ * @buffer_dma	dma address corresponding to @buffer
+ * @sectors	page length to fetch, in sectors
+ *
+ * return value
+ *	@rv	return value from mtip_exec_internal_command()
+ */
+static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
+				dma_addr_t buffer_dma, unsigned int sectors)
+{
+	struct host_to_dev_fis fis;
+
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= ATA_CMD_READ_LOG_EXT;
+	fis.sect_count	= sectors & 0xFF; /* low byte of the sector count */
+	fis.sect_cnt_ex	= (sectors >> 8) & 0xFF; /* high byte of the sector count */
+	fis.lba_low	= page;
+	fis.lba_mid	= 0;
+	fis.device	= ATA_DEVICE_OBS;
+
+	memset(buffer, 0, sectors * ATA_SECT_SIZE);
+
+	return mtip_exec_internal_command(port,
+					&fis,
+					5,
+					buffer_dma,
+					sectors * ATA_SECT_SIZE,
+					0,
+					GFP_ATOMIC, /* not GFP_KERNEL: completion is polled */
+					MTIP_INT_CMD_TIMEOUT_MS);
+}
+
+/*
+ * Issue a SMART READ DATA command to the device.
+ * One sector (ATA_SECT_SIZE bytes) is transferred to @buffer_dma.
+ * @port	pointer to the port structure.
+ * @buffer	pointer to buffer (not accessed by this function itself)
+ * @buffer_dma	dma address corresponding to @buffer
+ *
+ * return value
+ *	@rv	return value from mtip_exec_internal_command()
+ */
+static int mtip_get_smart_data(struct mtip_port *port, u8 *buffer,
+					dma_addr_t buffer_dma)
+{
+	struct host_to_dev_fis fis;
+
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= ATA_CMD_SMART;
+	fis.features	= 0xD0;
+	fis.sect_count	= 1;
+	fis.lba_mid	= 0x4F;
+	fis.lba_hi	= 0xC2;
+	fis.device	= ATA_DEVICE_OBS;
+
+	return mtip_exec_internal_command(port,
+					&fis,
+					5,
+					buffer_dma,
+					ATA_SECT_SIZE,
+					0,
+					GFP_ATOMIC, /* not GFP_KERNEL: completion is polled */
+					15000); /* timeout in ms */
+}
+
+/*
+ * Get the value of a smart attribute
+ * Reads the SMART data into port->smart_buf and scans its 29 entries.
+ * @port	pointer to the port structure
+ * @id		attribute number
+ * @attrib	pointer to return attrib information corresponding to @id
+ * return value
+ *	0	Attribute found and copied to @attrib.
+ *	-EINVAL	NULL buffer passed or unsupported attribute @id.
+ *	-EPERM	Identify data not valid, SMART not supported or not enabled
+ */
+static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
+						struct smart_attr *attrib)
+{
+	int rv, i;
+	struct smart_attr *pattr;
+
+	if (!attrib)
+		return -EINVAL;
+
+	if (!port->identify_valid) {
+		dev_warn(&port->dd->pdev->dev, "IDENTIFY DATA not valid\n");
+		return -EPERM;
+	}
+	if (!(port->identify[82] & 0x1)) {
+		dev_warn(&port->dd->pdev->dev, "SMART not supported\n");
+		return -EPERM;
+	}
+	if (!(port->identify[85] & 0x1)) {
+		dev_warn(&port->dd->pdev->dev, "SMART not enabled\n");
+		return -EPERM;
+	}
+
+	memset(port->smart_buf, 0, ATA_SECT_SIZE);
+	rv = mtip_get_smart_data(port, port->smart_buf, port->smart_buf_dma);
+	if (rv) { /* pass the executor's error code straight through */
+		dev_warn(&port->dd->pdev->dev, "Failed to get SMART data\n");
+		return rv;
+	}
+
+	pattr = (struct smart_attr *)(port->smart_buf + 2); /* entries start at offset 2 */
+	for (i = 0; i < 29; i++, pattr++)
+		if (pattr->attr_id == id) {
+			memcpy(attrib, pattr, sizeof(struct smart_attr));
+			break;
+		}
+
+	if (i == 29) { /* loop ran to the end without a match */
+		dev_warn(&port->dd->pdev->dev,
+			"Query for invalid SMART attribute ID\n");
+		rv = -EINVAL;
+	}
+
+	return rv;
+}
+
+/*
+ * Trim unused sectors
+ *
+ * @dd		pointer to driver_data structure
+ * @lba		starting lba
+ * @len		# of 512b sectors to trim
+ *
+ * return value (0 when the trim command was executed)
+ *      -ENOMEM		Out of dma memory
+ *      -EINVAL		@len is 0, or trim not supported (dd->trim_supp)
+ *      -EIO		Error submitting trim request to hw
+ */
+static int mtip_send_trim(struct driver_data *dd, unsigned int lba,
+				unsigned int len)
+{
+	int i, rv = 0;
+	u64 tlba, tlen, sect_left;
+	struct mtip_trim_entry *buf;
+	dma_addr_t dma_addr;
+	struct host_to_dev_fis fis;
+
+	if (!len || dd->trim_supp == false)
+		return -EINVAL;
+
+	/* Trim request too big (warn only; the request is still attempted) */
+	WARN_ON(len > (MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES));
+
+	/* Trim request not aligned on 4k boundary (8 x 512b sectors) */
+	WARN_ON(len % 8 != 0);
+
+	/* Warn if vu_trim structure is too big for the one-sector buffer */
+	WARN_ON(sizeof(struct mtip_trim) > ATA_SECT_SIZE);
+
+	/* Allocate a one-sector DMA buffer for the trim structure */
+	buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
+								GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	memset(buf, 0, ATA_SECT_SIZE);
+
+	for (i = 0, sect_left = len, tlba = lba;
+			i < MTIP_MAX_TRIM_ENTRIES && sect_left;
+			i++) {
+		tlen = (sect_left >= MTIP_MAX_TRIM_ENTRY_LEN ?
+					MTIP_MAX_TRIM_ENTRY_LEN :
+					sect_left);	/* clamp each range */
+		buf[i].lba = __force_bit2int cpu_to_le32(tlba);
+		buf[i].range = __force_bit2int cpu_to_le16(tlen);
+		tlba += tlen;
+		sect_left -= tlen;
+	}
+	WARN_ON(sect_left != 0);	/* sectors left over after all entries */
+
+	/* Build the fis: command 0xfb, features 0x60, one sector of payload */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type       = 0x27;
+	fis.opts       = 1 << 7;
+	fis.command    = 0xfb;
+	fis.features   = 0x60;
+	fis.sect_count = 1;
+	fis.device     = ATA_DEVICE_OBS;
+
+	if (mtip_exec_internal_command(dd->port,
+					&fis,
+					5,
+					dma_addr,
+					ATA_SECT_SIZE,
+					0,
+					GFP_KERNEL,
+					MTIP_TRIM_TIMEOUT_MS) < 0)
+		rv = -EIO;	/* any negative result is reported as -EIO */
+
+	dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
+	return rv;
+}
+
+/*
+ * Get the drive capacity.
+ *
+ * @dd      Pointer to the device data structure.
+ * @sectors Receives the sector count; written even when 0 is returned.
+ *
+ * return value
+ *	1 The identify data is valid, so *sectors is meaningful.
+ *	0 The identify information is invalid.
+ */
+static bool mtip_hw_get_capacity(struct driver_data *dd, sector_t *sectors)
+{
+	struct mtip_port *port = dd->port;
+	u64 total = 0;
+	int word;
+
+	/* Identify words 100..103 hold the count, least significant first. */
+	for (word = 3; word >= 0; word--)
+		total = (total << 16) | port->identify[100 + word];
+	*sectors = total;
+	return port->identify_valid ? true : false;
+}
+
+/*
+ * Log the identify data (serial, firmware, model, security word, capacity)
+ * and the card type via dev_info(). Does nothing if identify is not valid.
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value: None
+ */
+static void mtip_dump_identify(struct mtip_port *port)
+{
+	sector_t sectors;
+	unsigned short revid;
+	char cbuf[42];	/* scratch for each string; largest copy below is 41 */
+
+	if (!port->identify_valid)
+		return;
+
+	strlcpy(cbuf, (char *)(port->identify+10), 21);
+	dev_info(&port->dd->pdev->dev,
+		"Serial No.: %s\n", cbuf);
+
+	strlcpy(cbuf, (char *)(port->identify+23), 9);
+	dev_info(&port->dd->pdev->dev,
+		"Firmware Ver.: %s\n", cbuf);
+
+	strlcpy(cbuf, (char *)(port->identify+27), 41);
+	dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf);
+
+	dev_info(&port->dd->pdev->dev, "Security: %04x %s\n",
+		port->identify[128],
+		port->identify[128] & 0x4 ? "(LOCKED)" : "");	/* bit 2 */
+
+	if (mtip_hw_get_capacity(port->dd, &sectors))
+		dev_info(&port->dd->pdev->dev,
+			"Capacity: %llu sectors (%llu MB)\n",
+			 (u64)sectors,
+			 ((u64)sectors) * ATA_SECT_SIZE >> 20);	/* bytes >> 20 */
+
+	pci_read_config_word(port->dd->pdev, PCI_REVISION_ID, &revid);
+	switch (revid & 0xFF) {	/* only the low byte is examined */
+	case 0x1:
+		strlcpy(cbuf, "A0", 3);
+		break;
+	case 0x3:
+		strlcpy(cbuf, "A2", 3);
+		break;
+	default:
+		strlcpy(cbuf, "?", 2);
+		break;
+	}
+	dev_info(&port->dd->pdev->dev,
+		"Card Type: %s\n", cbuf);
+}
+
+/*
+ * Map the commands scatter list into the command table.
+ *
+ * @dd      Pointer to the driver data structure (used for error logging).
+ * @command Pointer to the command.
+ * @nents   Number of scatter list entries to copy into the table.
+ *
+ * return value: None
+ */
+static inline void fill_command_sg(struct driver_data *dd,
+				struct mtip_cmd *command,
+				int nents)
+{
+	int n;
+	unsigned int dma_len;
+	struct mtip_cmd_sg *command_sg;
+	struct scatterlist *sg;
+
+	command_sg = command->command + AHCI_CMD_TBL_HDR_SZ;
+
+	/* for_each_sg() steps with sg_next(), so chained tables are followed. */
+	for_each_sg(command->sg, sg, nents, n) {
+		dma_len = sg_dma_len(sg);
+		if (dma_len > 0x400000)
+			dev_err(&dd->pdev->dev,
+				"DMA segment length truncated\n");
+		command_sg->info = __force_bit2int
+			cpu_to_le32((dma_len-1) & 0x3FFFFF);
+		command_sg->dba	= __force_bit2int
+			cpu_to_le32(sg_dma_address(sg));
+		command_sg->dba_upper = __force_bit2int
+			cpu_to_le32((sg_dma_address(sg) >> 16) >> 16);
+		command_sg++;
+	}
+}
+
+/*
+ * @brief Execute a non-data drive command given as a 7-byte register set.
+ * @param port Pointer to the port data structure.
+ * @param command In: cmd,feat,nsect,sect,lcyl,hcyl,sel. Out: [0,1,4,5] reply.
+ * return value 0 The command completed successfully.
+ * return value -1 An error occurred while executing the command.
+ */
+static int exec_drive_task(struct mtip_port *port, u8 *command)
+{
+	struct host_to_dev_fis	fis;
+	struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG);
+	unsigned int to;
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= command[0];
+	fis.features	= command[1];
+	fis.sect_count	= command[2];
+	fis.sector	= command[3];
+	fis.cyl_low	= command[4];
+	fis.cyl_hi	= command[5];
+	fis.device	= command[6] & ~0x10; /* Clear the dev bit*/
+
+	mtip_set_timeout(port->dd, &fis, &to, 0);	/* fills in 'to' */
+
+	dbg_printk(MTIP_DRV_NAME " %s: User Command: cmd %x, feat %x, nsect %x, sect %x, lcyl %x, hcyl %x, sel %x\n",
+		__func__,
+		command[0],
+		command[1],
+		command[2],
+		command[3],
+		command[4],
+		command[5],
+		command[6]);
+
+	/* Execute the command with no data buffer (address and length 0). */
+	if (mtip_exec_internal_command(port,
+				 &fis,
+				 5,
+				 0,
+				 0,
+				 0,
+				 GFP_KERNEL,
+				 to) < 0) {
+		return -1;
+	}
+
+	command[0] = reply->command; /* Status*/
+	command[1] = reply->features; /* Error*/
+	command[4] = reply->cyl_low;
+	command[5] = reply->cyl_hi;
+
+	dbg_printk(MTIP_DRV_NAME " %s: Completion Status: stat %x, err %x , cyl_lo %x cyl_hi %x\n",
+		__func__,
+		command[0],
+		command[1],
+		command[4],
+		command[5]);
+
+	return 0;
+}
+
+/*
+ * @brief Execute a drive command.
+ *
+ * @param port Pointer to the port data structure.
+ * @param command Pointer to the user specified command parameters.
+ * @param user_buffer Pointer to the user space buffer where read sector
+ *                   data should be copied.
+ *
+ * return value 0 The command completed successfully.
+ * return value -EFAULT The command failed, user_buffer was NULL, or the
+ *                 copy to the user space buffer failed.
+ * return value -ENOMEM The DMA buffer could not be allocated.
+ */
+static int exec_drive_command(struct mtip_port *port, u8 *command,
+				void __user *user_buffer)
+{
+	struct host_to_dev_fis	fis;
+	struct host_to_dev_fis *reply;
+	u8 *buf = NULL;
+	dma_addr_t dma_addr = 0;
+	int rv = 0, xfer_sz = command[3];	/* sectors to transfer */
+	unsigned int to;
+
+	if (xfer_sz) {
+		if (!user_buffer)
+			return -EFAULT;
+
+		buf = dmam_alloc_coherent(&port->dd->pdev->dev,
+				ATA_SECT_SIZE * xfer_sz,
+				&dma_addr,
+				GFP_KERNEL);
+		if (!buf) {
+			dev_err(&port->dd->pdev->dev,
+				"Memory allocation failed (%d bytes)\n",
+				ATA_SECT_SIZE * xfer_sz);
+			return -ENOMEM;
+		}
+		memset(buf, 0, ATA_SECT_SIZE * xfer_sz);
+	}
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= command[0];
+	fis.features	= command[2];
+	fis.sect_count	= command[3];
+	if (fis.command == ATA_CMD_SMART) {	/* SMART: fixed cyl 0x4F/0xC2 */
+		fis.sector	= command[1];
+		fis.cyl_low	= 0x4F;
+		fis.cyl_hi	= 0xC2;
+	}
+
+	mtip_set_timeout(port->dd, &fis, &to, 0);	/* fills in 'to' */
+
+	if (xfer_sz)	/* data commands read status from the PIO setup FIS */
+		reply = (port->rxfis + RX_FIS_PIO_SETUP);
+	else
+		reply = (port->rxfis + RX_FIS_D2H_REG);
+
+	dbg_printk(MTIP_DRV_NAME
+		" %s: User Command: cmd %x, sect %x, "
+		"feat %x, sectcnt %x\n",
+		__func__,
+		command[0],
+		command[1],
+		command[2],
+		command[3]);
+
+	/* Execute the command; any negative result becomes -EFAULT. */
+	if (mtip_exec_internal_command(port,
+				&fis,
+				 5,
+				 (xfer_sz ? dma_addr : 0),
+				 (xfer_sz ? ATA_SECT_SIZE * xfer_sz : 0),
+				 0,
+				 GFP_KERNEL,
+				 to)
+				 < 0) {
+		rv = -EFAULT;
+		goto exit_drive_command;
+	}
+
+	/* Collect the completion status. command[3] is left untouched. */
+	command[0] = reply->command; /* Status*/
+	command[1] = reply->features; /* Error*/
+	command[2] = reply->sect_count;
+
+	dbg_printk(MTIP_DRV_NAME
+		" %s: Completion Status: stat %x, "
+		"err %x, nsect %x\n",
+		__func__,
+		command[0],
+		command[1],
+		command[2]);
+
+	if (xfer_sz) {
+		if (copy_to_user(user_buffer,
+				 buf,
+				 ATA_SECT_SIZE * command[3])) {
+			rv = -EFAULT;
+			goto exit_drive_command;
+		}
+	}
+exit_drive_command:
+	if (buf)	/* only allocated when xfer_sz was non-zero */
+		dmam_free_coherent(&port->dd->pdev->dev,
+				ATA_SECT_SIZE * xfer_sz, buf, dma_addr);
+	return rv;
+}
+
+/*
+ *  Indicates whether a command has a single sector payload.
+ *
+ *  @command passed to the device to perform the certain event.
+ *  @features passed to the device to perform the certain event.
+ *
+ *  return value
+ *	1	command is one that always has a single sector payload,
+ *		regardless of the value in the Sector Count field.
+ *      0       otherwise
+ *
+ */
+static unsigned int implicit_sector(unsigned char command,
+				    unsigned char features)
+{
+	/* Each case answers directly; anything unlisted falls out as 0. */
+	switch (command) {
+	/* Unconditionally single-sector commands. */
+	case ATA_CMD_SEC_SET_PASS:
+	case ATA_CMD_SEC_UNLOCK:
+	case ATA_CMD_SEC_ERASE_PREP:
+	case ATA_CMD_SEC_ERASE_UNIT:
+	case ATA_CMD_SEC_FREEZE_LOCK:
+	case ATA_CMD_SEC_DISABLE_PASS:
+	case ATA_CMD_PMP_READ:
+	case ATA_CMD_PMP_WRITE:
+		return 1;
+	/* Single-sector only for particular feature codes. */
+	case ATA_CMD_SET_MAX:
+		return features == ATA_SET_MAX_UNLOCK ? 1 : 0;
+	/* SMART: READ_VALUES or READ_THRESHOLDS feature only. */
+	case ATA_CMD_SMART:
+		if (features == ATA_SMART_READ_VALUES)
+			return 1;
+		return features == ATA_SMART_READ_THRESHOLDS ? 1 : 0;
+	/* DCO: IDENTIFY or SET feature only. */
+	case ATA_CMD_CONF_OVERLAY:
+		if (features == ATA_DCO_IDENTIFY)
+			return 1;
+		return features == ATA_DCO_SET ? 1 : 0;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Executes a taskfile, staging user data in kzalloc'd, DMA-mapped buffers.
+ * See ide_taskfile_ioctl() for derivation
+ */
+static int exec_drive_taskfile(struct driver_data *dd,
+			       void __user *buf,
+			       ide_task_request_t *req_task,
+			       int outtotal)
+{
+	struct host_to_dev_fis	fis;
+	struct host_to_dev_fis *reply;
+	u8 *outbuf = NULL;
+	u8 *inbuf = NULL;
+	dma_addr_t outbuf_dma = 0;
+	dma_addr_t inbuf_dma = 0;
+	dma_addr_t dma_buffer = 0;
+	int err = 0;
+	unsigned int taskin = 0;
+	unsigned int taskout = 0;
+	u8 nsect = 0;
+	unsigned int timeout;
+	unsigned int force_single_sector;
+	unsigned int transfer_size;
+	unsigned long task_file_data;
+	int intotal = outtotal + req_task->out_size;	/* in data follows out */
+	int erasemode = 0;
+
+	taskout = req_task->out_size;
+	taskin = req_task->in_size;
+	/* 130560 = 512 * 0xFF, i.e. at most 255 sectors per buffer */
+	if (taskin > 130560 || taskout > 130560) {
+		err = -EINVAL;
+		goto abort;
+	}
+
+	if (taskout) {
+		outbuf = kzalloc(taskout, GFP_KERNEL);
+		if (outbuf == NULL) {
+			err = -ENOMEM;
+			goto abort;
+		}
+		if (copy_from_user(outbuf, buf + outtotal, taskout)) {
+			err = -EFAULT;
+			goto abort;
+		}
+		outbuf_dma = pci_map_single(dd->pdev,
+					 outbuf,
+					 taskout,
+					 DMA_TO_DEVICE);
+		if (outbuf_dma == 0) { /* NOTE(review): 0 taken as map failure */
+			err = -ENOMEM;
+			goto abort;
+		}
+		dma_buffer = outbuf_dma;
+	}
+
+	if (taskin) {
+		inbuf = kzalloc(taskin, GFP_KERNEL);
+		if (inbuf == NULL) {
+			err = -ENOMEM;
+			goto abort;
+		}
+
+		if (copy_from_user(inbuf, buf + intotal, taskin)) {
+			err = -EFAULT;
+			goto abort;
+		}
+		inbuf_dma = pci_map_single(dd->pdev,
+					 inbuf,
+					 taskin, DMA_FROM_DEVICE);
+		if (inbuf_dma == 0) { /* NOTE(review): 0 taken as map failure */
+			err = -ENOMEM;
+			goto abort;
+		}
+		dma_buffer = inbuf_dma;	/* wins over outbuf_dma if both set */
+	}
+
+	/* PIO and non-data only; reply set here is re-chosen after completion. */
+	switch (req_task->data_phase) {
+	case TASKFILE_OUT:
+		nsect = taskout / ATA_SECT_SIZE;
+		reply = (dd->port->rxfis + RX_FIS_PIO_SETUP);
+		break;
+	case TASKFILE_IN:
+		reply = (dd->port->rxfis + RX_FIS_PIO_SETUP);
+		break;
+	case TASKFILE_NO_DATA:
+		reply = (dd->port->rxfis + RX_FIS_D2H_REG);
+		break;
+	default:
+		err = -EINVAL;
+		goto abort;
+	}
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= req_task->io_ports[7];
+	fis.features	= req_task->io_ports[1];
+	fis.sect_count	= req_task->io_ports[2];
+	fis.lba_low	= req_task->io_ports[3];
+	fis.lba_mid	= req_task->io_ports[4];
+	fis.lba_hi	= req_task->io_ports[5];
+	 /* Clear the dev bit*/
+	fis.device	= req_task->io_ports[6] & ~0x10;
+
+	if ((req_task->in_flags.all == 0) && (req_task->out_flags.all & 1)) {
+		req_task->in_flags.all	=
+			IDE_TASKFILE_STD_IN_FLAGS |
+			(IDE_HOB_STD_IN_FLAGS << 8);
+		fis.lba_low_ex		= req_task->hob_ports[3];
+		fis.lba_mid_ex		= req_task->hob_ports[4];
+		fis.lba_hi_ex		= req_task->hob_ports[5];
+		fis.features_ex		= req_task->hob_ports[1];
+		fis.sect_cnt_ex		= req_task->hob_ports[2];
+
+	} else {
+		req_task->in_flags.all = IDE_TASKFILE_STD_IN_FLAGS;
+	}
+
+	force_single_sector = implicit_sector(fis.command, fis.features);
+
+	if ((taskin || taskout) && (!fis.sect_count)) {
+		if (nsect)	/* derived from taskout for TASKFILE_OUT only */
+			fis.sect_count = nsect;
+		else {
+			if (!force_single_sector) {
+				dev_warn(&dd->pdev->dev,
+					"data movement but "
+					"sect_count is 0\n");
+					err = -EINVAL;
+					goto abort;
+			}
+		}
+	}
+
+	dbg_printk(MTIP_DRV_NAME
+		" %s: cmd %x, feat %x, nsect %x,"
+		" sect/lbal %x, lcyl/lbam %x, hcyl/lbah %x,"
+		" head/dev %x\n",
+		__func__,
+		fis.command,
+		fis.features,
+		fis.sect_count,
+		fis.lba_low,
+		fis.lba_mid,
+		fis.lba_hi,
+		fis.device);
+
+	/* check for erase mode support during secure erase.*/
+	if ((fis.command == ATA_CMD_SEC_ERASE_UNIT) && outbuf &&
+					(outbuf[0] & MTIP_SEC_ERASE_MODE)) {
+		erasemode = 1;
+	}
+
+	mtip_set_timeout(dd, &fis, &timeout, erasemode);
+
+	/* Transfer size. NOTE(review): not checked against taskin/taskout. */
+	if (force_single_sector)
+		transfer_size = ATA_SECT_SIZE;
+	else
+		transfer_size = ATA_SECT_SIZE * fis.sect_count;
+
+	/* Execute the command.*/
+	if (mtip_exec_internal_command(dd->port,
+				 &fis,
+				 5,
+				 dma_buffer,
+				 transfer_size,
+				 0,
+				 GFP_KERNEL,
+				 timeout) < 0) {
+		err = -EIO;
+		goto abort;
+	}
+
+	task_file_data = readl(dd->port->mmio+PORT_TFDATA);
+
+	if ((req_task->data_phase == TASKFILE_IN) && !(task_file_data & 1)) {
+		reply = dd->port->rxfis + RX_FIS_PIO_SETUP;
+		req_task->io_ports[7] = reply->control;
+	} else {
+		reply = dd->port->rxfis + RX_FIS_D2H_REG;
+		req_task->io_ports[7] = reply->command;
+	}
+
+	/* reclaim the DMA buffers; handles are zeroed so abort: skips them */
+	if (inbuf_dma)
+		pci_unmap_single(dd->pdev, inbuf_dma,
+			taskin, DMA_FROM_DEVICE);
+	if (outbuf_dma)
+		pci_unmap_single(dd->pdev, outbuf_dma,
+			taskout, DMA_TO_DEVICE);
+	inbuf_dma  = 0;
+	outbuf_dma = 0;
+
+	/* return the ATA registers to the caller.*/
+	req_task->io_ports[1] = reply->features;
+	req_task->io_ports[2] = reply->sect_count;
+	req_task->io_ports[3] = reply->lba_low;
+	req_task->io_ports[4] = reply->lba_mid;
+	req_task->io_ports[5] = reply->lba_hi;
+	req_task->io_ports[6] = reply->device;
+
+	if (req_task->out_flags.all & 1)  {
+
+		req_task->hob_ports[3] = reply->lba_low_ex;
+		req_task->hob_ports[4] = reply->lba_mid_ex;
+		req_task->hob_ports[5] = reply->lba_hi_ex;
+		req_task->hob_ports[1] = reply->features_ex;
+		req_task->hob_ports[2] = reply->sect_cnt_ex;
+	}
+	dbg_printk(MTIP_DRV_NAME
+		" %s: Completion: stat %x,"
+		"err %x, sect_cnt %x, lbalo %x,"
+		"lbamid %x, lbahi %x, dev %x\n",
+		__func__,
+		req_task->io_ports[7],
+		req_task->io_ports[1],
+		req_task->io_ports[2],
+		req_task->io_ports[3],
+		req_task->io_ports[4],
+		req_task->io_ports[5],
+		req_task->io_ports[6]);
+
+	if (taskout) {
+		if (copy_to_user(buf + outtotal, outbuf, taskout)) {
+			err = -EFAULT;
+			goto abort;
+		}
+	}
+	if (taskin) {
+		if (copy_to_user(buf + intotal, inbuf, taskin)) {
+			err = -EFAULT;
+			goto abort;
+		}
+	}
+abort:
+	if (inbuf_dma)	/* still mapped only when an earlier step bailed out */
+		pci_unmap_single(dd->pdev, inbuf_dma,
+					taskin, DMA_FROM_DEVICE);
+	if (outbuf_dma)
+		pci_unmap_single(dd->pdev, outbuf_dma,
+					taskout, DMA_TO_DEVICE);
+	kfree(outbuf);
+	kfree(inbuf);
+
+	return err;
+}
+
+/*
+ * Handle IOCTL calls from the Block Layer.
+ *
+ * This function is called by the Block Layer when it receives an IOCTL
+ * command that it does not understand. If the IOCTL command is not supported
+ * this function returns -EINVAL.
+ *
+ * @dd  Pointer to the driver data structure.
+ * @cmd IOCTL command passed from the Block Layer.
+ * @arg IOCTL argument passed from the Block Layer.
+ *
+ * return value
+ *	0	The IOCTL completed successfully.
+ *	-EINVAL The specified command is not supported.
+ *	-EFAULT An error occurred copying data to or from user space.
+ *	-EIO	An error occurred while executing the command.
+ */
+static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
+			 unsigned long arg)
+{
+	switch (cmd) {
+	case HDIO_GET_IDENTITY:
+	{
+		if (copy_to_user((void __user *)arg, dd->port->identify,
+						sizeof(u16) * ATA_ID_WORDS))
+			return -EFAULT;
+		break;
+	}
+	case HDIO_DRIVE_CMD:
+	{
+		u8 drive_command[4];
+
+		/* Copy the user command info to our buffer. */
+		if (copy_from_user(drive_command,
+					 (void __user *) arg,
+					 sizeof(drive_command)))
+			return -EFAULT;
+
+		/* Execute it; data lands after the 4 command bytes (arg+4). */
+		if (exec_drive_command(dd->port,
+					 drive_command,
+					 (void __user *) (arg+4)))
+			return -EIO;
+
+		/* Copy the status back to the users buffer. */
+		if (copy_to_user((void __user *) arg,
+					 drive_command,
+					 sizeof(drive_command)))
+			return -EFAULT;
+
+		break;
+	}
+	case HDIO_DRIVE_TASK:
+	{
+		u8 drive_command[7];
+
+		/* Copy the user command info to our buffer. */
+		if (copy_from_user(drive_command,
+					 (void __user *) arg,
+					 sizeof(drive_command)))
+			return -EFAULT;
+
+		/* Execute the drive command. */
+		if (exec_drive_task(dd->port, drive_command))
+			return -EIO;
+
+		/* Copy the status back to the users buffer. */
+		if (copy_to_user((void __user *) arg,
+					 drive_command,
+					 sizeof(drive_command)))
+			return -EFAULT;
+
+		break;
+	}
+	case HDIO_DRIVE_TASKFILE: {
+		ide_task_request_t req_task;
+		int ret, outtotal;
+
+		if (copy_from_user(&req_task, (void __user *) arg,
+					sizeof(req_task)))
+			return -EFAULT;
+
+		outtotal = sizeof(req_task);	/* data follows the request */
+
+		ret = exec_drive_taskfile(dd, (void __user *) arg,
+						&req_task, outtotal);
+
+		if (copy_to_user((void __user *) arg, &req_task,
+							sizeof(req_task)))
+			return -EFAULT;	/* copied back even when ret != 0 */
+
+		return ret;
+	}
+
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Submit an IO to the hw
+ *
+ * Builds an FPDMA read/write FIS for @rq, DMA-maps the scatter list and
+ * issues the command; while MTIP_PF_PAUSE_IO is set the tag is only
+ * recorded in cmds_to_issue. Completion goes to mtip_async_complete().
+ *
+ * @dd       Pointer to the driver data structure.
+ * @rq       Block request; supplies direction, start sector, sector
+ *	     count and the tag used for this command.
+ * @command  Driver command whose FIS, scatter-gather table and
+ *	     command header are filled in here.
+ * @nents    Scatter list entry count passed to dma_map_sg(); the
+ *	     value it returns is what gets programmed.
+ * @hctx     blk-mq hardware context; not referenced in this body.
+ *
+ * NOTE(review): the dma_map_sg() result is not checked for failure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
+			      struct mtip_cmd *command, int nents,
+			      struct blk_mq_hw_ctx *hctx)
+{
+	struct host_to_dev_fis	*fis;
+	struct mtip_port *port = dd->port;
+	int dma_dir = rq_data_dir(rq) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+	u64 start = blk_rq_pos(rq);
+	unsigned int nsect = blk_rq_sectors(rq);
+
+	/* Map the scatter list for DMA access */
+	nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir);
+
+	prefetch(&port->flags);
+
+	command->scatter_ents = nents;
+
+	/*
+	 * The number of retries for this command before it is
+	 * reported as a failure to the upper layers.
+	 */
+	command->retries = MTIP_MAX_RETRIES;
+
+	/* Fill out fis */
+	fis = command->command;
+	fis->type        = 0x27;
+	fis->opts        = 1 << 7;
+	if (dma_dir == DMA_FROM_DEVICE)
+		fis->command = ATA_CMD_FPDMA_READ;
+	else
+		fis->command = ATA_CMD_FPDMA_WRITE;
+	fis->lba_low     = start & 0xFF;
+	fis->lba_mid     = (start >> 8) & 0xFF;
+	fis->lba_hi      = (start >> 16) & 0xFF;
+	fis->lba_low_ex  = (start >> 24) & 0xFF;
+	fis->lba_mid_ex  = (start >> 32) & 0xFF;
+	fis->lba_hi_ex   = (start >> 40) & 0xFF;
+	fis->device	 = 1 << 6;
+	fis->features    = nsect & 0xFF;	/* sector count, low byte */
+	fis->features_ex = (nsect >> 8) & 0xFF;	/* sector count, high byte */
+	fis->sect_count  = ((rq->tag << 3) | (rq->tag >> 5));	/* tag field */
+	fis->sect_cnt_ex = 0;
+	fis->control     = 0;
+	fis->res2        = 0;
+	fis->res3        = 0;
+	fill_command_sg(dd, command, nents);
+
+	if (unlikely(command->unaligned))
+		fis->device |= 1 << 7;
+
+	/* Populate the command header */
+	command->command_header->opts =
+			__force_bit2int cpu_to_le32(
+				(nents << 16) | 5 | AHCI_CMD_PREFETCH);
+	command->command_header->byte_count = 0;
+
+	/*
+	 * Set the completion function and data for the command
+	 * within this layer.
+	 */
+	command->comp_data = dd;
+	command->comp_func = mtip_async_complete;
+	command->direction = dma_dir;
+
+	/*
+	 * To prevent this command from being issued
+	 * if an internal command is in progress or error handling is active.
+	 */
+	if (unlikely(port->flags & MTIP_PF_PAUSE_IO)) {
+		set_bit(rq->tag, port->cmds_to_issue);
+		set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
+		return;
+	}
+
+	/* Issue the command to the hardware */
+	mtip_issue_ncq_command(port, rq->tag);
+}
+
+/*
+ * Sysfs status dump.
+ *
+ * @dev  Pointer to the device structure, passed by the kernel.
+ * @attr Pointer to the device_attribute structure passed by the kernel.
+ * @buf  Pointer to the char buffer that will receive the status string.
+ *
+ * return value
+ *	The size, in bytes, of the data copied into buf.
+ */
+static ssize_t mtip_hw_show_status(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	struct driver_data *dd = dev_to_disk(dev)->private_data;
+	const char *state = "online\n";
+
+	/* Over-temperature takes precedence over write-protect. */
+	if (test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))
+		state = "thermal_shutdown\n";
+	else if (test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag))
+		state = "write_protect\n";
+
+	/* sprintf() returns the number of characters written to buf. */
+	return sprintf(buf, "%s", state);
+}
+
+static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL);	/* read-only: no store handler */
+
+/* debugfs entries */
+
+/*
+ * Append the status line for one device to @buf.
+ *
+ * @dd   device to describe; nothing is appended when it has no pdev
+ * @buf  output buffer of at least MTIP_DFS_MAX_BUF_SIZE bytes
+ * @size number of bytes of @buf already used
+ *
+ * return value
+ *	Bytes of @buf in use after the append. scnprintf() truncates rather
+ *	than writing past MTIP_DFS_MAX_BUF_SIZE.
+ */
+static int mtip_show_one_device(struct driver_data *dd, char *buf, int size)
+{
+	char id_buf[42];
+	u16 status = 0;
+
+	if (!dd->pdev)
+		return size;
+
+	/* Serial string and rebuild percentage come from the identify data. */
+	if (dd->port && dd->port->identify && dd->port->identify_valid) {
+		strlcpy(id_buf, (char *) (dd->port->identify + 10), 21);
+		status = *(dd->port->identify + 141);
+	} else {
+		memset(id_buf, 0, 42);
+	}
+
+	if (dd->port && test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags)) {
+		size += scnprintf(&buf[size], MTIP_DFS_MAX_BUF_SIZE - size,
+				  " device %s %s (ftl rebuild %d %%)\n",
+				  dev_name(&dd->pdev->dev), id_buf, status);
+	} else {
+		size += scnprintf(&buf[size], MTIP_DFS_MAX_BUF_SIZE - size,
+				  " device %s %s\n",
+				  dev_name(&dd->pdev->dev), id_buf);
+	}
+
+	return size;
+}
+
+/*
+ * Build the "Devices Present" / "Devices Being Removed" report.
+ *
+ * @drv	unused
+ * @buf	output buffer; must hold at least MTIP_DFS_MAX_BUF_SIZE bytes
+ *
+ * return value
+ *	Number of bytes written to @buf (output is truncated if it would
+ *	not fit, rather than overrunning the buffer).
+ */
+static ssize_t show_device_status(struct device_driver *drv, char *buf)
+{
+	int size = 0;
+	struct driver_data *dd;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev_lock, flags);
+
+	size += scnprintf(&buf[size], MTIP_DFS_MAX_BUF_SIZE - size,
+			  "Devices Present:\n");
+	list_for_each_entry(dd, &online_list, online_list)
+		size = mtip_show_one_device(dd, buf, size);
+
+	size += scnprintf(&buf[size], MTIP_DFS_MAX_BUF_SIZE - size,
+			  "Devices Being Removed:\n");
+	list_for_each_entry(dd, &removing_list, remove_list)
+		size = mtip_show_one_device(dd, buf, size);
+
+	spin_unlock_irqrestore(&dev_lock, flags);
+
+	return size;
+}
+
+static ssize_t mtip_hw_read_device_status(struct file *f, char __user *ubuf,
+						size_t len, loff_t *offset)
+{
+	struct driver_data *dd =  (struct driver_data *)f->private_data;
+	int size = *offset;
+	char *buf;
+	int rv = 0;
+
+	if (!len || *offset)	/* single-shot read: a non-zero offset is EOF */
+		return 0;
+
+	buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL);
+	if (!buf) {
+		dev_err(&dd->pdev->dev,
+			"Memory allocation: status buffer\n");
+		return -ENOMEM;
+	}
+
+	size += show_device_status(NULL, buf);	/* formats the report into buf */
+
+	*offset = size <= len ? size : len;	/* bytes to return, capped at len */
+	size = copy_to_user(ubuf, buf, *offset);	/* now: bytes NOT copied */
+	if (size)
+		rv = -EFAULT;
+
+	kfree(buf);
+	return rv ? rv : *offset;
+}
+
+static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf,
+				  size_t len, loff_t *offset)
+{
+	struct driver_data *dd =  (struct driver_data *)f->private_data;
+	char *buf;
+	u32 group_allocated;
+	int size = *offset;
+	int n, rv = 0;
+
+	if (!len || size)	/* single-shot read: a non-zero offset is EOF */
+		return 0;
+
+	buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL);
+	if (!buf) {
+		dev_err(&dd->pdev->dev,
+			"Memory allocation: register buffer\n");
+		return -ENOMEM;
+	}
+
+	size += sprintf(&buf[size], "H/ S ACTive      : [ 0x");
+
+	for (n = dd->slot_groups-1; n >= 0; n--)	/* highest group first */
+		size += sprintf(&buf[size], "%08X ",
+					 readl(dd->port->s_active[n]));
+
+	size += sprintf(&buf[size], "]\n");
+	size += sprintf(&buf[size], "H/ Command Issue : [ 0x");
+
+	for (n = dd->slot_groups-1; n >= 0; n--)
+		size += sprintf(&buf[size], "%08X ",
+					readl(dd->port->cmd_issue[n]));
+
+	size += sprintf(&buf[size], "]\n");
+	size += sprintf(&buf[size], "H/ Completed     : [ 0x");
+
+	for (n = dd->slot_groups-1; n >= 0; n--)
+		size += sprintf(&buf[size], "%08X ",
+				readl(dd->port->completed[n]));
+
+	size += sprintf(&buf[size], "]\n");
+	size += sprintf(&buf[size], "H/ PORT IRQ STAT : [ 0x%08X ]\n",
+				readl(dd->port->mmio + PORT_IRQ_STAT));
+	size += sprintf(&buf[size], "H/ HOST IRQ STAT : [ 0x%08X ]\n",
+				readl(dd->mmio + HOST_IRQ_STAT));
+	size += sprintf(&buf[size], "\n");
+
+	size += sprintf(&buf[size], "L/ Allocated     : [ 0x");
+
+	for (n = dd->slot_groups-1; n >= 0; n--) {
+		if (sizeof(long) > sizeof(u32))	/* two 32-bit groups per long */
+			group_allocated =
+				dd->port->allocated[n/2] >> (32*(n&1));
+		else
+			group_allocated = dd->port->allocated[n];
+		size += sprintf(&buf[size], "%08X ", group_allocated);
+	}
+	size += sprintf(&buf[size], "]\n");
+
+	size += sprintf(&buf[size], "L/ Commands in Q : [ 0x");
+
+	for (n = dd->slot_groups-1; n >= 0; n--) {
+		if (sizeof(long) > sizeof(u32))	/* two 32-bit groups per long */
+			group_allocated =
+				dd->port->cmds_to_issue[n/2] >> (32*(n&1));
+		else
+			group_allocated = dd->port->cmds_to_issue[n];
+		size += sprintf(&buf[size], "%08X ", group_allocated);
+	}
+	size += sprintf(&buf[size], "]\n");
+
+	*offset = size <= len ? size : len;	/* bytes to return, capped at len */
+	size = copy_to_user(ubuf, buf, *offset);	/* now: bytes NOT copied */
+	if (size)
+		rv = -EFAULT;
+
+	kfree(buf);
+	return rv ? rv : *offset;
+}
+
+static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf,
+				  size_t len, loff_t *offset)
+{
+	struct driver_data *dd =  (struct driver_data *)f->private_data;
+	char *buf;
+	int size = *offset;
+	int rv = 0;
+
+	if (!len || size)	/* single-shot read: a non-zero offset is EOF */
+		return 0;
+
+	buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL);
+	if (!buf) {
+		dev_err(&dd->pdev->dev,
+			"Memory allocation: flag buffer\n");
+		return -ENOMEM;
+	}
+
+	size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n",
+							dd->port->flags);
+	size += sprintf(&buf[size], "Flag-dd   : [ %08lX ]\n",
+							dd->dd_flag);
+
+	*offset = size <= len ? size : len;	/* bytes to return, capped at len */
+	size = copy_to_user(ubuf, buf, *offset);	/* now: bytes NOT copied */
+	if (size)
+		rv = -EFAULT;
+
+	kfree(buf);
+	return rv ? rv : *offset;
+}
+
+static const struct file_operations mtip_device_status_fops = {	/* device list report */
+	.owner  = THIS_MODULE,
+	.open   = simple_open,
+	.read   = mtip_hw_read_device_status,
+	.llseek = no_llseek,
+};
+
+static const struct file_operations mtip_regs_fops = {	/* debugfs "registers" file */
+	.owner  = THIS_MODULE,
+	.open   = simple_open,
+	.read   = mtip_hw_read_registers,
+	.llseek = no_llseek,
+};
+
+static const struct file_operations mtip_flags_fops = {	/* debugfs "flags" file */
+	.owner  = THIS_MODULE,
+	.open   = simple_open,
+	.read   = mtip_hw_read_flags,
+	.llseek = no_llseek,
+};
+
+/*
+ * Create the sysfs related attributes.
+ *
+ * @dd   Pointer to the driver data structure.
+ * @kobj Pointer to the kobj for the block device.
+ *
+ * return value
+ *	0	Arguments were valid (a creation failure is only logged).
+ *	-EINVAL Invalid parameter.
+ */
+static int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj)
+{
+	if (!dd || !kobj)
+		return -EINVAL;
+	/* A failure to create the file is only logged, never returned. */
+	if (sysfs_create_file(kobj, &dev_attr_status.attr) != 0)
+		dev_warn(&dd->pdev->dev,
+			"Error creating 'status' sysfs entry\n");
+	return 0;
+}
+
+/*
+ * Remove the sysfs related attributes.
+ *
+ * @dd   Pointer to the driver data structure.
+ * @kobj Pointer to the kobj for the block device.
+ *
+ * return value
+ *	0	The 'status' attribute file was removed from @kobj.
+ *	-EINVAL Invalid parameter.
+ */
+static int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj)
+{
+	if (dd && kobj) {
+		sysfs_remove_file(kobj, &dev_attr_status.attr);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static int mtip_hw_debugfs_init(struct driver_data *dd)
+{
+	if (!dfs_parent)	/* no parent directory to attach to */
+		return -1;
+
+	dd->dfs_node = debugfs_create_dir(dd->disk->disk_name, dfs_parent);
+	if (IS_ERR_OR_NULL(dd->dfs_node)) {
+		dev_warn(&dd->pdev->dev,
+			"Error creating node %s under debugfs\n",
+						dd->disk->disk_name);
+		dd->dfs_node = NULL;	/* so mtip_hw_debugfs_exit() skips it */
+		return -1;
+	}
+
+	debugfs_create_file("flags", S_IRUGO, dd->dfs_node, dd,
+							&mtip_flags_fops);	/* result ignored */
+	debugfs_create_file("registers", S_IRUGO, dd->dfs_node, dd,
+							&mtip_regs_fops);	/* result ignored */
+
+	return 0;
+}
+
+static void mtip_hw_debugfs_exit(struct driver_data *dd)
+{
+	if (dd->dfs_node != NULL)	/* nothing to remove otherwise */
+		debugfs_remove_recursive(dd->dfs_node);
+}
+
+static int mtip_free_orphan(struct driver_data *dd)
+{
+	struct kobject *kobj;
+
+	if (dd->bdev) {
+		if (dd->bdev->bd_holders >= 1)	/* still held: do not free */
+			return -2;
+
+		bdput(dd->bdev);
+		dd->bdev = NULL;
+	}
+
+	mtip_hw_debugfs_exit(dd);
+
+	spin_lock(&rssd_index_lock);
+	ida_remove(&rssd_index_ida, dd->index);	/* release the index */
+	spin_unlock(&rssd_index_lock);
+
+	if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag) &&
+			test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) {
+		put_disk(dd->disk);	/* init never finished after failed rebuild */
+	} else {
+		if (dd->disk) {
+			kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+			if (kobj) {
+				mtip_hw_sysfs_exit(dd, kobj);
+				kobject_put(kobj);
+			}
+			del_gendisk(dd->disk);
+			dd->disk = NULL;
+		}
+		if (dd->queue) {
+			dd->queue->queuedata = NULL;
+			blk_cleanup_queue(dd->queue);
+			blk_mq_free_tag_set(&dd->tags);
+			dd->queue = NULL;
+		}
+	}
+	kfree(dd);	/* dd is gone once 0 is returned */
+	return 0;
+}
+
+/*
+ * Hardware setup shared by init and resume: updates the HSORG register.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	None
+ */
+static inline void hba_setup(struct driver_data *dd)
+{
+	u32 hsorg = readl(dd->mmio + HOST_HSORG);
+
+	/*
+	 * Interrupt bug workaround: use only 1 IS bit, by setting the two
+	 * HSORG_DISABLE_SLOTGRP_* bits on top of the current register value.
+	 */
+	hsorg |= HSORG_DISABLE_SLOTGRP_INTR | HSORG_DISABLE_SLOTGRP_PXIS;
+	writel(hsorg, dd->mmio + HOST_HSORG);
+}
+
+static int mtip_device_unaligned_constrained(struct driver_data *dd)
+{
+	return dd->pdev->device == P420M_DEVICE_ID;	/* 1 only for the P420M */
+}
+
+/*
+ * Read the vendor-specific interface register and record what it says
+ * about the product in the driver data: product type and number of slot
+ * groups (the interface revision is only logged).
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_detect_product(struct driver_data *dd)
+{
+	unsigned int hs_rev, groups;
+	u32 hsorg;
+
+	/*
+	 * HBA base + 0xFC [15:0] - vendor-specific hardware interface
+	 * info register:
+	 * [15:8] hardware/software interface rev#
+	 * [   3] asic-style interface
+	 * [ 2:0] number of slot groups, minus 1 (only valid for asic-style).
+	 */
+	hsorg = readl(dd->mmio + HOST_HSORG);
+
+	/* Defaults that stay in place when the product is not recognized. */
+	dd->product_type = MTIP_PRODUCT_UNKNOWN;
+	dd->slot_groups = 1;
+
+	if (!(hsorg & 0x8)) {
+		dev_warn(&dd->pdev->dev, "Unrecognized product id\n");
+		return;
+	}
+
+	dd->product_type = MTIP_PRODUCT_ASICFPGA;
+	hs_rev = (hsorg & HSORG_HWREV) >> 8;
+	groups = (hsorg & HSORG_SLOTGROUPS) + 1;
+	dev_info(&dd->pdev->dev,
+		"ASIC-FPGA design, HS rev 0x%x, "
+		"%i slot groups [%i slots]\n",
+		 hs_rev, groups, groups * 32);
+
+	if (groups > MTIP_MAX_SLOT_GROUPS) {
+		dev_warn(&dd->pdev->dev,
+			"Warning: driver only supports "
+			"%i slot groups.\n", MTIP_MAX_SLOT_GROUPS);
+		groups = MTIP_MAX_SLOT_GROUPS;
+	}
+
+	dd->slot_groups = groups;
+}
+
+/*
+ * Blocking wait for FTL rebuild to complete: re-reads the identify data
+ * until the rebuild marker is gone, then runs mtip_block_initialize().
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ *	0	FTL rebuild completed successfully
+ *	-EFAULT FTL rebuild error/timeout/interruption
+ */
+static int mtip_ftl_rebuild_poll(struct driver_data *dd)
+{
+	unsigned long timeout, cnt = 0, start;
+
+	dev_warn(&dd->pdev->dev,
+		"FTL rebuild in progress. Polling for completion.\n");
+
+	start = jiffies;
+	timeout = jiffies + msecs_to_jiffies(MTIP_FTL_REBUILD_TIMEOUT_MS);
+
+	do {
+		if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+				&dd->dd_flag)))
+			return -EFAULT;
+		if (mtip_check_surprise_removal(dd->pdev))
+			return -EFAULT;
+
+		if (mtip_get_identify(dd->port, NULL) < 0)
+			return -EFAULT;
+
+		if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
+			MTIP_FTL_REBUILD_MAGIC) {
+			ssleep(1);
+			/* Warn about every 180 passes (1 s + 10 s each) */
+			if (cnt++ >= 180) {
+				dev_warn(&dd->pdev->dev,
+				"FTL rebuild in progress (%d secs).\n",
+				jiffies_to_msecs(jiffies - start) / 1000);
+				cnt = 0;
+			}
+		} else {
+			dev_warn(&dd->pdev->dev,
+				"FTL rebuild complete (%d secs).\n",
+			jiffies_to_msecs(jiffies - start) / 1000);
+			mtip_block_initialize(dd);	/* result is not checked */
+			return 0;
+		}
+		ssleep(10);
+	} while (time_before(jiffies, timeout));
+
+	/* The loop only falls through once the timeout has expired */
+	dev_err(&dd->pdev->dev,
+		"Timed out waiting for FTL rebuild to complete (%d secs).\n",
+		jiffies_to_msecs(jiffies - start) / 1000);
+	return -EFAULT;
+}
+
+/*
+ * Service thread: runs error handling, issues deferred commands and polls
+ * FTL rebuilds; when MTIP_PF_SR_CLEANUP_BIT is set it frees the orphaned dd.
+ * @data Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+
+static int mtip_service_thread(void *data)
+{
+	struct driver_data *dd = (struct driver_data *)data;
+	unsigned long slot, slot_start, slot_wrap;
+	unsigned int num_cmd_slots = dd->slot_groups * 32;
+	struct mtip_port *port = dd->port;
+	int ret;
+
+	while (1) {
+		if (kthread_should_stop() ||
+			test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
+			goto st_out;
+		clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
+
+		/*
+		 * Sleep until some port flag is set and none of the
+		 * MTIP_PF_PAUSE_IO bits is among them.
+		 */
+		wait_event_interruptible(port->svc_wait, (port->flags) &&
+			!(port->flags & MTIP_PF_PAUSE_IO));
+
+		set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
+
+		if (kthread_should_stop() ||
+			test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
+			goto st_out;
+
+		/* If I am an orphan, start self cleanup */
+		if (test_bit(MTIP_PF_SR_CLEANUP_BIT, &port->flags))
+			break;	/* on to the orphan cleanup below */
+
+		if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+				&dd->dd_flag)))
+			goto st_out;
+
+restart_eh:
+		/* Demux bits: start with error handling */
+		if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags)) {
+			mtip_handle_tfe(dd);
+			clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+		}
+
+		if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags))
+			goto restart_eh;
+
+		if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
+			slot = 1;	/* scanning starts at bit 1 */
+			/* used to restrict the loop to one iteration */
+			slot_start = num_cmd_slots;
+			slot_wrap = 0;
+			while (1) {
+				slot = find_next_bit(port->cmds_to_issue,
+						num_cmd_slots, slot);
+				if (slot_wrap == 1) {
+					if ((slot_start >= slot) ||
+						(slot >= num_cmd_slots))
+						break;
+				}
+				if (unlikely(slot_start == num_cmd_slots))
+					slot_start = slot;
+
+				if (unlikely(slot == num_cmd_slots)) {
+					slot = 1;
+					slot_wrap = 1;
+					continue;
+				}
+
+				/* Issue the command to the hardware */
+				mtip_issue_ncq_command(port, slot);
+
+				clear_bit(slot, port->cmds_to_issue);
+			}
+
+			clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
+		}
+
+		if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) {
+			if (mtip_ftl_rebuild_poll(dd) < 0)
+				set_bit(MTIP_DDF_REBUILD_FAILED_BIT,
+							&dd->dd_flag);
+			clear_bit(MTIP_PF_REBUILD_BIT, &port->flags);
+		}
+	}
+
+	/* wait for pci remove to exit */
+	while (1) {
+		if (test_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag))
+			break;
+		msleep_interruptible(1000);
+		if (kthread_should_stop())
+			goto st_out;
+	}
+
+	while (1) {
+		ret = mtip_free_orphan(dd);	/* non-zero: retry in 1 s */
+		if (!ret) {
+			/* NOTE: All data structures are invalid, do not
+			 * access any here */
+			return 0;
+		}
+		msleep_interruptible(1000);
+		if (kthread_should_stop())
+			goto st_out;
+	}
+st_out:
+	return 0;
+}
+
+/*
+ * Release whichever of the port's two DMA regions are allocated.
+ *
+ * @dd Pointer to driver_data structure
+ *
+ * return value
+ *      None
+ */
+static void mtip_dma_free(struct driver_data *dd)
+{
+	struct device *dev = &dd->pdev->dev;
+	struct mtip_port *p = dd->port;
+
+	if (p->block1)
+		dmam_free_coherent(dev, BLOCK_DMA_ALLOC_SZ,
+					p->block1, p->block1_dma);
+
+	if (p->command_list)
+		dmam_free_coherent(dev, AHCI_CMD_TBL_SZ,
+				p->command_list, p->command_list_dma);
+}
+
+/*
+ * Allocate and zero the port's two DMA regions and set up the sub-buffers.
+ *
+ * @dd Pointer to driver_data structure
+ *
+ * return value
+ *      0 on success, -ENOMEM if either region could not be allocated
+ */
+static int mtip_dma_alloc(struct driver_data *dd)
+{
+	struct device *dev = &dd->pdev->dev;
+	struct mtip_port *p = dd->port;
+
+	/* Region 1: RX FIS, identify data, sector buffer and SMART buffer. */
+	p->block1 = dmam_alloc_coherent(dev, BLOCK_DMA_ALLOC_SZ,
+					&p->block1_dma, GFP_KERNEL);
+	if (!p->block1)
+		return -ENOMEM;
+	memset(p->block1, 0, BLOCK_DMA_ALLOC_SZ);
+
+	/* Region 2: the command list. */
+	p->command_list = dmam_alloc_coherent(dev, AHCI_CMD_TBL_SZ,
+					&p->command_list_dma, GFP_KERNEL);
+	if (!p->command_list) {
+		/* Give region 1 back and clear its bookkeeping. */
+		dmam_free_coherent(dev, BLOCK_DMA_ALLOC_SZ,
+					p->block1, p->block1_dma);
+		p->block1 = NULL;
+		p->block1_dma = 0;
+		return -ENOMEM;
+	}
+	memset(p->command_list, 0, AHCI_CMD_TBL_SZ);
+
+	/* CPU pointers, then bus addresses, of the buffers inside region 1. */
+	p->rxfis         = p->block1 + AHCI_RX_FIS_OFFSET;
+	p->identify      = p->block1 + AHCI_IDFY_OFFSET;
+	p->log_buf       = p->block1 + AHCI_SECTBUF_OFFSET;
+	p->smart_buf     = p->block1 + AHCI_SMARTBUF_OFFSET;
+	p->rxfis_dma     = p->block1_dma + AHCI_RX_FIS_OFFSET;
+	p->identify_dma  = p->block1_dma + AHCI_IDFY_OFFSET;
+	p->log_buf_dma   = p->block1_dma + AHCI_SECTBUF_OFFSET;
+	p->smart_buf_dma = p->block1_dma + AHCI_SMARTBUF_OFFSET;
+
+	return 0;
+}
+
+/* Identify the drive, latch its status flags and log write-protect progress.
+ * Returns 0, MTIP_FTL_REBUILD_MAGIC (rebuild pending) or -EFAULT.
+ */
+static int mtip_hw_get_identify(struct driver_data *dd)
+{
+	struct smart_attr attr242;
+	unsigned char *buf;
+	int rv;
+
+	if (mtip_get_identify(dd->port, NULL) < 0)
+		return -EFAULT;
+
+	if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
+		MTIP_FTL_REBUILD_MAGIC) {
+		set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags);
+		return MTIP_FTL_REBUILD_MAGIC;
+	}
+	mtip_dump_identify(dd->port);
+
+	/* check write protect, over temp and rebuild statuses */
+	rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
+				dd->port->log_buf, dd->port->log_buf_dma, 1);
+	if (rv) {
+		dev_warn(&dd->pdev->dev,
+			"Error in READ LOG EXT (10h) command\n");
+		rv = 0; /* non-critical error, don't fail the load */
+	} else {
+		buf = (unsigned char *)dd->port->log_buf;
+		if (buf[259] & 0x1) {
+			dev_info(&dd->pdev->dev,
+				"Write protect bit is set.\n");
+			set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
+		}
+		if (buf[288] == 0xF7) {
+			dev_info(&dd->pdev->dev,
+				"Exceeded Tmax, drive in thermal shutdown.\n");
+			set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
+		}
+		if (buf[288] == 0xBF)	/* TODO: only reported so far */
+			dev_info(&dd->pdev->dev,
+				"Drive indicates rebuild has failed.\n");
+	}
+
+	/* get write protect progress */
+	memset(&attr242, 0, sizeof(struct smart_attr));
+	if (mtip_get_smart_attr(dd->port, 242, &attr242))
+		dev_warn(&dd->pdev->dev,
+				"Unable to check write protect progress\n");
+	else
+		dev_info(&dd->pdev->dev,
+				"Write protect progress: %u%% (%u blocks)\n",
+				attr242.cur, le32_to_cpu(attr242.data));
+
+	return rv;
+}
+
+/*
+ * Called once for each card: sets up the port structure and DMA regions,
+ * waits on PORT_SCR_STAT, then starts the port and enables its interrupt.
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0 on success, else an error code.
+ */
+static int mtip_hw_init(struct driver_data *dd)
+{
+	int i;
+	int rv;
+	unsigned int num_command_slots;
+	unsigned long timeout, timetaken;
+
+	dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR];
+
+	mtip_detect_product(dd);
+	if (dd->product_type == MTIP_PRODUCT_UNKNOWN) {
+		rv = -EIO;
+		goto out1;
+	}
+	num_command_slots = dd->slot_groups * 32;	/* never read below */
+
+	hba_setup(dd);
+
+	dd->port = kzalloc_node(sizeof(struct mtip_port), GFP_KERNEL,
+				dd->numa_node);
+	if (!dd->port) {
+		dev_err(&dd->pdev->dev,
+			"Memory allocation: port structure\n");
+		return -ENOMEM;
+	}
+
+	/* Continue workqueue setup */
+	for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
+		dd->work[i].port = dd->port;
+
+	/* Enable unaligned IO constraints for some devices */
+	if (mtip_device_unaligned_constrained(dd))
+		dd->unal_qdepth = MTIP_MAX_UNALIGNED_SLOTS;
+	else
+		dd->unal_qdepth = 0;
+
+	sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth);
+
+	/* Spinlock to prevent concurrent issue */
+	for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
+		spin_lock_init(&dd->port->cmd_issue_lock[i]);
+
+	/* Set the port mmio base address. */
+	dd->port->mmio	= dd->mmio + PORT_OFFSET;
+	dd->port->dd	= dd;
+
+	/* DMA allocations */
+	rv = mtip_dma_alloc(dd);
+	if (rv < 0)
+		goto out1;
+
+	/* Setup the pointers to the extended s_active and CI registers. */
+	for (i = 0; i < dd->slot_groups; i++) {
+		dd->port->s_active[i] =
+			dd->port->mmio + i*0x80 + PORT_SCR_ACT;
+		dd->port->cmd_issue[i] =
+			dd->port->mmio + i*0x80 + PORT_COMMAND_ISSUE;
+		dd->port->completed[i] =
+			dd->port->mmio + i*0x80 + PORT_SDBV;
+	}
+	/* Poll PORT_SCR_STAT for up to 30 s until its low nibble reads 0x03. */
+	timetaken = jiffies;
+	timeout = jiffies + msecs_to_jiffies(30000);
+	while (((readl(dd->port->mmio + PORT_SCR_STAT) & 0x0F) != 0x03) &&
+		 time_before(jiffies, timeout)) {
+		mdelay(100);	/* busy-waits 100 ms per poll */
+	}
+	if (unlikely(mtip_check_surprise_removal(dd->pdev))) {
+		timetaken = jiffies - timetaken;
+		dev_warn(&dd->pdev->dev,
+			"Surprise removal detected at %u ms\n",
+			jiffies_to_msecs(timetaken));
+		rv = -ENODEV;
+		goto out2 ;
+	}
+	if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) {
+		timetaken = jiffies - timetaken;
+		dev_warn(&dd->pdev->dev,
+			"Removal detected at %u ms\n",
+			jiffies_to_msecs(timetaken));
+		rv = -EFAULT;
+		goto out2;
+	}
+
+	/* Conditionally reset the HBA. */
+	if (!(readl(dd->mmio + HOST_CAP) & HOST_CAP_NZDMA)) {
+		if (mtip_hba_reset(dd) < 0) {
+			dev_err(&dd->pdev->dev,
+				"Card did not reset within timeout\n");
+			rv = -EIO;
+			goto out2;
+		}
+	} else {
+		/* Clear any pending interrupts on the HBA */
+		writel(readl(dd->mmio + HOST_IRQ_STAT),
+			dd->mmio + HOST_IRQ_STAT);
+	}
+
+	mtip_init_port(dd->port);
+	mtip_start_port(dd->port);
+
+	/* Setup the ISR and enable interrupts. */
+	rv = devm_request_irq(&dd->pdev->dev,
+				dd->pdev->irq,
+				mtip_irq_handler,
+				IRQF_SHARED,
+				dev_driver_string(&dd->pdev->dev),
+				dd);
+
+	if (rv) {
+		dev_err(&dd->pdev->dev,
+			"Unable to allocate IRQ %d\n", dd->pdev->irq);
+		goto out2;
+	}
+	irq_set_affinity_hint(dd->pdev->irq, get_cpu_mask(dd->isr_binding));
+
+	/* Enable interrupts on the HBA. */
+	writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
+					dd->mmio + HOST_CTL);
+
+	init_waitqueue_head(&dd->port->svc_wait);
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) {
+		rv = -EFAULT;
+		goto out3;
+	}
+
+	return rv;	/* 0 here: devm_request_irq() succeeded */
+
+out3:
+	/* Disable interrupts on the HBA. */
+	writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+			dd->mmio + HOST_CTL);
+
+	/* Release the IRQ. */
+	irq_set_affinity_hint(dd->pdev->irq, NULL);
+	devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+
+out2:
+	mtip_deinit_port(dd->port);
+	mtip_dma_free(dd);
+
+out1:
+	/* Free the port structure; not yet allocated on the -EIO path. */
+	kfree(dd->port);
+
+	return rv;
+}
+
+static void mtip_standby_drive(struct driver_data *dd)
+{
+	if (dd->sr)
+		return;
+
+	/* Skipped while the REBUILD or SEC_LOCK flag is set. */
+	if (test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) ||
+	    test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))
+		return;
+
+	/* Standby immediate (E0h) makes the drive save its state. */
+	if (mtip_standby_immediate(dd->port))
+		dev_warn(&dd->pdev->dev,
+			"STANDBY IMMEDIATE failed\n");
+}
+
+/*
+ * Called to deinitialize an interface.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+static int mtip_hw_exit(struct driver_data *dd)
+{
+	/*
+	 * Only touch the hardware (port de-init, interrupt disable) when
+	 * this is not a surprise removal, i.e. dd->sr is clear.
+	 */
+	if (!dd->sr) {
+		/* de-initialize the port. */
+		mtip_deinit_port(dd->port);
+
+		/* Disable interrupts on the HBA. */
+		writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+				dd->mmio + HOST_CTL);
+	}
+
+	/* Release the IRQ. */
+	irq_set_affinity_hint(dd->pdev->irq, NULL);
+	devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+
+	/* Free dma regions */
+	mtip_dma_free(dd);
+
+	/* Free the memory allocated for the port structure. */
+	kfree(dd->port);
+	dd->port = NULL;
+
+	return 0;
+}
+
+/*
+ * Issue a Standby Immediate command to the device.
+ *
+ * This function is called by the Block Layer just before the
+ * system powers off during a shutdown.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0 (the result of the command itself is not reported)
+ */
+static int mtip_hw_shutdown(struct driver_data *dd)
+{
+	/* Nothing to send after a surprise removal or without a port. */
+	if (dd->sr || !dd->port)
+		return 0;
+
+	/* Standby immediate (E0h) makes the drive save its state. */
+	mtip_standby_immediate(dd->port);
+
+	return 0;
+}
+
+/*
+ * Suspend function
+ *
+ * This function is called by the Block Layer just before the
+ * system hibernates.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0	Suspend was successful
+ *	-EFAULT The standby-immediate command failed
+ */
+static int mtip_hw_suspend(struct driver_data *dd)
+{
+	u32 host_ctl;
+
+	/* Standby immediate (E0h) lets the drive save its state first. */
+	if (mtip_standby_immediate(dd->port)) {
+		dev_err(&dd->pdev->dev,
+			"Failed standby-immediate command\n");
+		return -EFAULT;
+	}
+
+	/* Disable interrupts on the HBA, then shut the port down. */
+	host_ctl = readl(dd->mmio + HOST_CTL);
+	host_ctl &= ~HOST_IRQ_EN;
+	writel(host_ctl, dd->mmio + HOST_CTL);
+
+	mtip_deinit_port(dd->port);
+	return 0;
+}
+
+/*
+ * Resume function
+ *
+ * This function is called by the Block Layer as the
+ * system resumes.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0	Resume was successful
+ *      -EFAULT The HBA could not be reset
+ */
+static int mtip_hw_resume(struct driver_data *dd)
+{
+	u32 host_ctl;
+
+	/* Perform any needed hardware setup steps before the reset. */
+	hba_setup(dd);
+
+	/* A failed HBA reset aborts the resume. */
+	if (mtip_hba_reset(dd)) {
+		dev_err(&dd->pdev->dev,
+			"Unable to reset the HBA\n");
+		return -EFAULT;
+	}
+
+	/* Bring the port, its DMA engine and FIS reception back up. */
+	mtip_init_port(dd->port);
+	mtip_start_port(dd->port);
+
+	/* Enable interrupts on the HBA. */
+	host_ctl = readl(dd->mmio + HOST_CTL);
+	host_ctl |= HOST_IRQ_EN;
+	writel(host_ctl, dd->mmio + HOST_CTL);
+
+	return 0;
+}
+
+/*
+ * Build "<prefix><letters>" in buf, where index 0..25 maps to a..z, 26 to
+ * aa, and so on; returns -EINVAL when the letters do not fit in buflen.
+ */
+static int rssd_disk_name_format(char *prefix,
+				 int index,
+				 char *buf,
+				 int buflen)
+{
+	const int letters = 'z' - 'a' + 1;
+	const size_t prefix_len = strlen(prefix);
+	char *const first = buf + prefix_len;
+	char *const limit = buf + buflen;
+	char *cur = limit - 1;
+
+	/* Build the suffix right-to-left at the tail of the buffer. */
+	*cur = '\0';
+	do {
+		if (cur == first)
+			return -EINVAL;
+		*--cur = 'a' + (index % letters);
+		index = (index / letters) - 1;
+	} while (index >= 0);
+
+	/* Slide the suffix (with its NUL) up behind the prefix slot. */
+	memmove(first, cur, limit - cur);
+	memcpy(buf, prefix, prefix_len);
+
+	return 0;
+}
+
+/*
+ * Block layer IOCTL handler.
+ *
+ * @dev Pointer to the block_device structure.
+ * @mode ignored
+ * @cmd IOCTL command passed from the user application.
+ * @arg Argument passed from the user application.
+ *
+ * return value
+ *	-EACCES  Caller lacks CAP_SYS_ADMIN.
+ *	-ENOTTY  BLKFLSBUF, missing driver data, or removal pending.
+ *	Otherwise whatever mtip_hw_ioctl() returns.
+ */
+static int mtip_block_ioctl(struct block_device *dev,
+			    fmode_t mode,
+			    unsigned cmd,
+			    unsigned long arg)
+{
+	struct driver_data *dd = dev->bd_disk->private_data;
+
+	/* Every ioctl on this device requires CAP_SYS_ADMIN. */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	/* No driver data, or removal already pending: refuse. */
+	if (!dd)
+		return -ENOTTY;
+	if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)))
+		return -ENOTTY;
+
+	/* BLKFLSBUF is rejected; everything else goes to mtip_hw_ioctl(). */
+	if (cmd == BLKFLSBUF)
+		return -ENOTTY;
+
+	return mtip_hw_ioctl(dd, cmd, arg);
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Block layer compat IOCTL handler. HDIO_DRIVE_TASKFILE is converted from
+ * the compat request layout here; other commands go to mtip_hw_ioctl().
+ * @dev Pointer to the block_device structure.
+ * @mode ignored
+ * @cmd IOCTL command passed from the user application.
+ * @arg Argument passed from the user application.
+ *
+ * return value
+ *	0        IOCTL completed successfully.
+ *	-ENOTTY  IOCTL not supported, no driver data, or removal pending.
+ *	-EACCES  No CAP_SYS_ADMIN.  -EFAULT  A user-space copy failed.
+ */
+static int mtip_block_compat_ioctl(struct block_device *dev,
+			    fmode_t mode,
+			    unsigned cmd,
+			    unsigned long arg)
+{
+	struct driver_data *dd = dev->bd_disk->private_data;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (!dd)
+		return -ENOTTY;
+
+	if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)))
+		return -ENOTTY;
+
+	switch (cmd) {
+	case BLKFLSBUF:
+		return -ENOTTY;
+	case HDIO_DRIVE_TASKFILE: {
+		struct mtip_compat_ide_task_request_s __user *compat_req_task;
+		ide_task_request_t req_task;
+		int compat_tasksize, outtotal, ret;
+
+		compat_tasksize =
+			sizeof(struct mtip_compat_ide_task_request_s);
+
+		compat_req_task =
+			(struct mtip_compat_ide_task_request_s __user *) arg;
+		/* Bulk-copy all but the last two compat_long_t of the request. */
+		if (copy_from_user(&req_task, (void __user *) arg,
+			compat_tasksize - (2 * sizeof(compat_long_t))))
+			return -EFAULT;
+		/* out_size and in_size are fetched one field at a time. */
+		if (get_user(req_task.out_size, &compat_req_task->out_size))
+			return -EFAULT;
+
+		if (get_user(req_task.in_size, &compat_req_task->in_size))
+			return -EFAULT;
+
+		outtotal = sizeof(struct mtip_compat_ide_task_request_s);
+
+		ret = exec_drive_taskfile(dd, (void __user *) arg,
+						&req_task, outtotal);
+		/* The request is copied back whatever ret is. */
+		if (copy_to_user((void __user *) arg, &req_task,
+				compat_tasksize -
+				(2 * sizeof(compat_long_t))))
+			return -EFAULT;
+
+		if (put_user(req_task.out_size, &compat_req_task->out_size))
+			return -EFAULT;
+
+		if (put_user(req_task.in_size, &compat_req_task->in_size))
+			return -EFAULT;
+
+		return ret;
+	}
+	default:
+		return mtip_hw_ioctl(dd, cmd, arg);
+	}
+}
+#endif
+
+/*
+ * Obtain the geometry of the device.
+ *
+ * You may think that this function is obsolete, but some applications,
+ * fdisk for example, still use CHS values. This function describes the
+ * device as having 224 heads and 56 sectors per cylinder. These values are
+ * chosen so that each cylinder is aligned on a 4KB boundary. Since a
+ * partition is described in terms of a start and end cylinder this means
+ * that each partition is also 4KB aligned. Non-aligned partitions adversely
+ * affect performance.
+ *
+ * @dev Pointer to the block_device structure.
+ * @geo Pointer to a hd_geometry structure.
+ *
+ * return value
+ *	0       Operation completed successfully.
+ *	-ENOTTY No driver data, or the drive capacity could not be read.
+ */
+static int mtip_block_getgeo(struct block_device *dev,
+				struct hd_geometry *geo)
+{
+	struct driver_data *dd = dev->bd_disk->private_data;
+	sector_t cyls;
+
+	if (!dd)
+		return -ENOTTY;
+	if (!mtip_hw_get_capacity(dd, &cyls)) {
+		dev_warn(&dd->pdev->dev,
+			"Could not get drive capacity.\n");
+		return -ENOTTY;
+	}
+
+	geo->heads = 224;
+	geo->sectors = 56;
+	/* sector_div() leaves the quotient, the cylinder count, in cyls. */
+	sector_div(cyls, (geo->heads * geo->sectors));
+	geo->cylinders = cyls;
+	return 0;
+}
+
+/*
+ * Block device operations.
+ *
+ * Entry points the block layer uses for this disk: the ioctl handlers
+ * (native and, with CONFIG_COMPAT, compat) and the geometry query.
+ */
+static const struct block_device_operations mtip_block_ops = {
+	.owner		= THIS_MODULE,
+	.ioctl		= mtip_block_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= mtip_block_compat_ioctl,
+#endif
+	.getgeo		= mtip_block_getgeo,
+};
+
+/*
+ * Submit one block layer request to the P320 device.
+ *
+ * Discards are sent as a trim and completed here; other requests are
+ * mapped to the command's scatter list and issued to the hardware.
+ *
+ * @hctx Hardware queue context; used to reach the driver data structure.
+ * @rq   Pointer to the request.
+ *
+ * Returns 0, or -ENXIO / -ENODATA when a stop-IO condition is flagged.
+ */
+static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+	struct driver_data *dd = hctx->queue->queuedata;
+	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	unsigned int nents;
+
+	if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) {
+		/* A pending removal is checked before everything else. */
+		if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+							&dd->dd_flag)))
+			return -ENXIO;
+
+		/* Over-temp, SEC_LOCK, or a write while write-protected. */
+		if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag)) ||
+		    unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT,
+							&dd->dd_flag) &&
+				rq_data_dir(rq)) ||
+		    unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)))
+			return -ENODATA;
+
+		if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag))
+			return -ENXIO;
+	}
+
+	/* Discards are sent as a trim and completed right here. */
+	if (rq->cmd_flags & REQ_DISCARD) {
+		int err = mtip_send_trim(dd, blk_rq_pos(rq),
+					 blk_rq_sectors(rq));
+
+		blk_mq_end_io(rq, err);
+		return 0;
+	}
+
+	/* Map the request onto the command's scatter list ... */
+	nents = blk_rq_map_sg(hctx->queue, rq, cmd->sg);
+
+	/* ... and issue the read/write. */
+	mtip_hw_submit_io(dd, rq, cmd, nents, hctx);
+	return 0;
+}
+
+static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx,
+				  struct request *rq)
+{
+	struct driver_data *dd = hctx->queue->queuedata;
+	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	/* Reads, and controllers without the limit, are never held back. */
+	if (!dd->unal_qdepth || rq_data_dir(rq) == READ)
+		return false;
+
+	/*
+	 * Short IOs (<= 64 sectors) whose start or length is not a multiple
+	 * of 8 sectors (4k) are marked unaligned.
+	 */
+	if (blk_rq_sectors(rq) <= 64 &&
+	    ((blk_rq_pos(rq) | blk_rq_sectors(rq)) & 7))
+		cmd->unaligned = 1;
+
+	/* Unaligned IO must win the semaphore; true means none was free. */
+	if (!cmd->unaligned)
+		return false;
+	return down_trylock(&dd->port->cmd_slot_unal) != 0;
+}
+
+static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+	int err;
+
+	/* Unaligned-IO limit reached: report the queue as busy. */
+	if (unlikely(mtip_check_unal_depth(hctx, rq)))
+		return BLK_MQ_RQ_QUEUE_BUSY;
+	err = mtip_submit_request(hctx, rq);
+	if (unlikely(err)) {
+		rq->errors = err;
+		return BLK_MQ_RQ_QUEUE_ERROR;
+	}
+	return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static void mtip_free_cmd(void *data, struct request *rq,
+			  unsigned int hctx_idx, unsigned int request_idx)
+{
+	struct driver_data *dd = data;
+	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	/* Release the per-request command table, if one is recorded; */
+	/* cmd->command is NULL when its allocation did not succeed. */
+	if (cmd->command)
+		dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
+					cmd->command, cmd->command_dma);
+}
+
+static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx,
+			 unsigned int request_idx, unsigned int numa_node)
+{
+	struct driver_data *dd = data;
+	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	const size_t hdr_off = sizeof(struct mtip_cmd_hdr) * request_idx;
+	u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64;
+
+	/* Per-request command table, zeroed before first use. */
+	cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
+			&cmd->command_dma, GFP_KERNEL);
+	if (!cmd->command)
+		return -ENOMEM;
+	memset(cmd->command, 0, CMD_DMA_ALLOC_SZ);
+
+	/* Entry request_idx of the port's command list is this header. */
+	cmd->command_header = dd->port->command_list + hdr_off;
+	cmd->command_header_dma = dd->port->command_list_dma + hdr_off;
+
+	/* The upper address half is only written when HOST_CAP_64 is set. */
+	if (host_cap_64)
+		cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16);
+
+	cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
+
+	sg_init_table(cmd->sg, MTIP_MAX_SG);
+	return 0;
+}
+
+static struct blk_mq_ops mtip_mq_ops = {
+	.map_queue	= blk_mq_map_queue,
+	.init_request	= mtip_init_cmd,	/* allocates the command table */
+	.exit_request	= mtip_free_cmd,	/* releases it again */
+	.queue_rq	= mtip_queue_rq,
+};
+
+/*
+ * Block layer initialization function.
+ *
+ * Called by the PCI layer for each P320 device, and called again by
+ * mtip_ftl_rebuild_poll() once a pending FTL rebuild has completed.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0 on success, MTIP_FTL_REBUILD_MAGIC while rebuilding, else an error.
+ */
+static int mtip_block_initialize(struct driver_data *dd)
+{
+	int rv = 0, wait_for_rebuild = 0;
+	sector_t capacity;
+	unsigned int index = dd->index; /* valid when dd->disk exists */
+	struct kobject *kobj;
+	unsigned char thd_name[16];
+
+	if (dd->disk)
+		goto skip_create_disk; /* hw init done, before rebuild */
+
+	if (mtip_hw_init(dd)) {
+		rv = -EINVAL;
+		goto protocol_init_error;
+	}
+
+	dd->disk = alloc_disk_node(MTIP_MAX_MINORS, dd->numa_node);
+	if (dd->disk  == NULL) {
+		dev_err(&dd->pdev->dev,
+			"Unable to allocate gendisk structure\n");
+		rv = -EINVAL;
+		goto alloc_disk_error;
+	}
+
+	/* Generate the disk name, implemented same as in sd.c */
+	do {
+		if (!ida_pre_get(&rssd_index_ida, GFP_KERNEL)) {
+			rv = -ENOMEM; /* rv could still be 0 here */
+			goto ida_get_error;
+		}
+		spin_lock(&rssd_index_lock);
+		rv = ida_get_new(&rssd_index_ida, &index);
+		spin_unlock(&rssd_index_lock);
+	} while (rv == -EAGAIN);
+	if (rv)
+		goto ida_get_error;
+
+	rv = rssd_disk_name_format("rssd",
+				index,
+				dd->disk->disk_name,
+				DISK_NAME_LEN);
+	if (rv)
+		goto disk_index_error;
+
+	dd->disk->driverfs_dev	= &dd->pdev->dev;
+	dd->disk->major		= dd->major;
+	dd->disk->first_minor	= dd->instance * MTIP_MAX_MINORS;
+	dd->disk->fops		= &mtip_block_ops;
+	dd->disk->private_data	= dd;
+	dd->index		= index;
+
+	mtip_hw_debugfs_init(dd);
+
+skip_create_disk:
+	memset(&dd->tags, 0, sizeof(dd->tags));
+	dd->tags.ops = &mtip_mq_ops;
+	dd->tags.nr_hw_queues = 1;
+	dd->tags.queue_depth = MTIP_MAX_COMMAND_SLOTS;
+	dd->tags.reserved_tags = 1;
+	dd->tags.cmd_size = sizeof(struct mtip_cmd);
+	dd->tags.numa_node = dd->numa_node;
+	dd->tags.flags = BLK_MQ_F_SHOULD_MERGE;
+	dd->tags.driver_data = dd;
+
+	rv = blk_mq_alloc_tag_set(&dd->tags);
+	if (rv) {
+		dev_err(&dd->pdev->dev,
+			"Unable to allocate request queue\n");
+		rv = -ENOMEM;
+		goto block_queue_alloc_tag_error;
+	}
+
+	/* Allocate the request queue. */
+	dd->queue = blk_mq_init_queue(&dd->tags);
+	if (IS_ERR(dd->queue)) {
+		dev_err(&dd->pdev->dev,
+			"Unable to allocate request queue\n");
+		rv = -ENOMEM;
+		goto block_queue_alloc_init_error;
+	}
+
+	dd->disk->queue		= dd->queue;
+	dd->queue->queuedata	= dd;
+
+	/* Initialize the protocol layer. */
+	wait_for_rebuild = mtip_hw_get_identify(dd);
+	if (wait_for_rebuild < 0) {
+		dev_err(&dd->pdev->dev,
+			"Protocol layer initialization failed\n");
+		rv = -EINVAL;
+		goto init_hw_cmds_error;
+	}
+
+	/*
+	 * if rebuild pending, start the service thread, and delay the block
+	 * queue creation and add_disk()
+	 */
+	if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
+		goto start_service_thread;
+
+	/* Set device limits. */
+	set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags);
+	blk_queue_max_segments(dd->queue, MTIP_MAX_SG);
+	blk_queue_physical_block_size(dd->queue, 4096);
+	blk_queue_max_hw_sectors(dd->queue, 0xffff);
+	blk_queue_max_segment_size(dd->queue, 0x400000);
+	blk_queue_io_min(dd->queue, 4096);
+	blk_queue_bounce_limit(dd->queue, dd->pdev->dma_mask);
+
+	/*
+	 * write back cache is not supported in the device. FUA depends on
+	 * write back cache support, hence setting flush support to zero.
+	 */
+	blk_queue_flush(dd->queue, 0);
+
+	/* Signal trim support */
+	if (dd->trim_supp == true) {
+		set_bit(QUEUE_FLAG_DISCARD, &dd->queue->queue_flags);
+		dd->queue->limits.discard_granularity = 4096;
+		blk_queue_max_discard_sectors(dd->queue,
+			MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES);
+		dd->queue->limits.discard_zeroes_data = 0;
+	}
+
+	/* Set the capacity of the device in 512 byte sectors. */
+	if (!(mtip_hw_get_capacity(dd, &capacity))) {
+		dev_warn(&dd->pdev->dev,
+			"Could not read drive capacity\n");
+		rv = -EIO;
+		goto read_capacity_error;
+	}
+	set_capacity(dd->disk, capacity);
+
+	/* Enable the block device and add it to /dev */
+	add_disk(dd->disk);
+
+	dd->bdev = bdget_disk(dd->disk, 0);
+	/*
+	 * Now that the disk is active, initialize any sysfs attributes
+	 * managed by the protocol layer.
+	 */
+	kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+	if (kobj) {
+		mtip_hw_sysfs_init(dd, kobj);
+		kobject_put(kobj);
+	}
+
+	if (dd->mtip_svc_handler) {
+		set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
+		return rv; /* service thread created for handling rebuild */
+	}
+
+start_service_thread:
+	snprintf(thd_name, sizeof(thd_name), "mtip_svc_thd_%02d", index);
+	dd->mtip_svc_handler = kthread_create_on_node(mtip_service_thread,
+						dd, dd->numa_node, "%s",
+						thd_name);
+
+	if (IS_ERR(dd->mtip_svc_handler)) {
+		dev_err(&dd->pdev->dev, "service thread failed to start\n");
+		dd->mtip_svc_handler = NULL;
+		rv = -EFAULT;
+		goto kthread_run_error;
+	}
+	wake_up_process(dd->mtip_svc_handler);
+	if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
+		rv = wait_for_rebuild;
+
+	return rv;
+
+kthread_run_error:
+	bdput(dd->bdev);	/* NOTE(review): bdev is NULL if rebuilding */
+	dd->bdev = NULL;
+
+	/* Delete our gendisk. This also removes the device from /dev */
+	del_gendisk(dd->disk);
+
+read_capacity_error:
+init_hw_cmds_error:
+	blk_cleanup_queue(dd->queue);
+block_queue_alloc_init_error:
+	blk_mq_free_tag_set(&dd->tags);
+block_queue_alloc_tag_error:
+	mtip_hw_debugfs_exit(dd);
+disk_index_error:
+	spin_lock(&rssd_index_lock);
+	ida_remove(&rssd_index_ida, index);
+	spin_unlock(&rssd_index_lock);
+ida_get_error:
+	put_disk(dd->disk);
+
+alloc_disk_error:
+	mtip_hw_exit(dd); /* De-initialize the protocol layer. */
+
+protocol_init_error:
+	return rv;
+}
+
+/*
+ * Block layer deinitialization function.
+ *
+ * Called by the PCI layer as each P320 device is removed. With dd->sr set
+ * (surprise removal) the disk teardown is skipped; mtip_hw_exit() still runs.
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+static int mtip_block_remove(struct driver_data *dd)
+{
+	struct kobject *kobj;
+
+	if (!dd->sr) {
+		mtip_hw_debugfs_exit(dd);
+
+		if (dd->mtip_svc_handler) {
+			set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags);
+			wake_up_interruptible(&dd->port->svc_wait);
+			kthread_stop(dd->mtip_svc_handler);
+		}
+
+		/* Clean up the sysfs attributes, if created */
+		if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) {
+			kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+			if (kobj) {
+				mtip_hw_sysfs_exit(dd, kobj);
+				kobject_put(kobj);
+			}
+		}
+
+		mtip_standby_drive(dd);
+
+		/*
+		 * Drop the bdev reference, then delete our gendisk structure.
+		 * Deleting the gendisk also removes the device from /dev
+		 */
+		if (dd->bdev) {
+			bdput(dd->bdev);
+			dd->bdev = NULL;
+		}
+		if (dd->disk) {
+			if (dd->disk->queue) {
+				del_gendisk(dd->disk);
+				blk_cleanup_queue(dd->queue);
+				blk_mq_free_tag_set(&dd->tags);
+				dd->queue = NULL;
+			} else
+				put_disk(dd->disk);	/* no queue was attached */
+		}
+		dd->disk  = NULL;
+
+		spin_lock(&rssd_index_lock);
+		ida_remove(&rssd_index_ida, dd->index);
+		spin_unlock(&rssd_index_lock);
+	} else {
+		dev_info(&dd->pdev->dev, "device %s surprise removal\n",
+						dd->disk->disk_name);
+	}
+
+	/* De-initialize the protocol layer. */
+	mtip_hw_exit(dd);
+
+	return 0;
+}
+
+/*
+ * Function called by the PCI layer just before the
+ * machine shuts down.
+ *
+ * If a protocol layer shutdown function is present it will be called
+ * by this function.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+static int mtip_block_shutdown(struct driver_data *dd)
+{
+	mtip_hw_shutdown(dd);
+
+	/* Delete our gendisk structure, and cleanup the blk queue. */
+	if (dd->disk) {
+		dev_info(&dd->pdev->dev,
+			"Shutting down %s ...\n", dd->disk->disk_name);
+
+		if (dd->disk->queue) {
+			del_gendisk(dd->disk);
+			blk_cleanup_queue(dd->queue);
+			blk_mq_free_tag_set(&dd->tags);
+		} else
+			put_disk(dd->disk);
+		dd->disk  = NULL;
+		dd->queue = NULL;
+	}
+
+	spin_lock(&rssd_index_lock);
+	ida_remove(&rssd_index_ida, dd->index);
+	spin_unlock(&rssd_index_lock);
+	return 0;
+}
+
+/* Log the suspend, hand off to mtip_hw_suspend(); always returns 0. */
+static int mtip_block_suspend(struct driver_data *dd)
+{
+	dev_info(&dd->pdev->dev, "Suspending %s ...\n", dd->disk->disk_name);
+	mtip_hw_suspend(dd);
+	return 0;
+}
+
+/* Log the resume, hand off to mtip_hw_resume(); always returns 0. */
+static int mtip_block_resume(struct driver_data *dd)
+{
+	dev_info(&dd->pdev->dev, "Resuming %s ...\n", dd->disk->disk_name);
+	mtip_hw_resume(dd);
+	return 0;
+}
+
+static void drop_cpu(int cpu)
+{
+	cpu_use[cpu]--;
+}
+
+/* Pick the CPU on @node with the lowest cpu_use[] count and claim it. */
+static int get_least_used_cpu_on_node(int node)
+{
+	const struct cpumask *node_mask = cpumask_of_node(node);
+	int cpu, least_used_cpu, least_cnt;
+
+	/* NOTE(review): assumes @node has at least one CPU in its mask. */
+	least_used_cpu = cpumask_first(node_mask);
+	least_cnt = cpu_use[least_used_cpu];
+
+	for_each_cpu(cpu, node_mask) {
+		if (cpu_use[cpu] < least_cnt) {
+			least_used_cpu = cpu;
+			least_cnt = cpu_use[cpu];
+		}
+	}
+	cpu_use[least_used_cpu]++;
+	return least_used_cpu;
+}
+
+/*
+ * Round-robin NUMA node picker: the first call yields first_online_node,
+ * later calls step to the next online node and wrap at MAX_NUMNODES.
+ */
+static inline int mtip_get_next_rr_node(void)
+{
+	static int next_node = -1;	/* -1: nothing handed out yet */
+
+	if (next_node != -1)
+		next_node = next_online_node(next_node);
+	if (next_node == -1 || next_node == MAX_NUMNODES)
+		next_node = first_online_node;
+
+	return next_node;
+}
+
+static DEFINE_HANDLER(0);
+static DEFINE_HANDLER(1);
+static DEFINE_HANDLER(2);
+static DEFINE_HANDLER(3);
+static DEFINE_HANDLER(4);
+static DEFINE_HANDLER(5);
+static DEFINE_HANDLER(6);
+static DEFINE_HANDLER(7);
+
+/* Turn off Relaxed Ordering and No-Snoop in @pdev's PCIe Device Control. */
+static void mtip_disable_link_opts(struct driver_data *dd, struct pci_dev *pdev)
+{
+	int pos;
+	unsigned short pcie_dev_ctrl;
+
+	pos = pci_find_capability(pdev, PCI_CAP_ID_EXP);
+	if (!pos)
+		return;
+
+	pci_read_config_word(pdev, pos + PCI_EXP_DEVCTL, &pcie_dev_ctrl);
+	/* Write back only when at least one of the two bits is set. */
+	if (pcie_dev_ctrl & (PCI_EXP_DEVCTL_NOSNOOP_EN |
+			     PCI_EXP_DEVCTL_RELAX_EN)) {
+		dev_info(&dd->pdev->dev,
+			"Disabling ERO/No-Snoop on bridge device %04x:%04x\n",
+				pdev->vendor, pdev->device);
+		pcie_dev_ctrl &= ~(PCI_EXP_DEVCTL_NOSNOOP_EN |
+					PCI_EXP_DEVCTL_RELAX_EN);
+		pci_write_config_word(pdev, pos + PCI_EXP_DEVCTL,
+					pcie_dev_ctrl);
+	}
+}
+
+static void mtip_fix_ero_nosnoop(struct driver_data *dd, struct pci_dev *pdev)
+{
+	/*
+	 * This workaround is specific to AMD/ATI chipset with a PCI upstream
+	 * device with device id 0x5aXX
+	 */
+	if (pdev->bus && pdev->bus->self) {
+		if (pdev->bus->self->vendor == PCI_VENDOR_ID_ATI &&
+		    ((pdev->bus->self->device & 0xff00) == 0x5a00)) {
+			mtip_disable_link_opts(dd, pdev->bus->self);
+		} else {
+			/* Check further up the topology */
+			struct pci_dev *parent_dev = pdev->bus->self;
+			if (parent_dev->bus &&
+				parent_dev->bus->parent &&
+				parent_dev->bus->parent->self &&
+				parent_dev->bus->parent->self->vendor ==
+					 PCI_VENDOR_ID_ATI &&
+				(parent_dev->bus->parent->self->device &
+					0xff00) == 0x5a00) {
+				mtip_disable_link_opts(dd,
+					parent_dev->bus->parent->self);
+			}
+		}
+	}
+}
+
+/*
+ * Called for each supported PCI device detected.
+ *
+ * This function allocates the private data structure, enables the
+ * PCI device and then calls the block layer initialization function.
+ *
+ * return value
+ *	0 on success else an error code.
+ */
+static int mtip_pci_probe(struct pci_dev *pdev,
+			const struct pci_device_id *ent)
+{
+	int rv = 0;
+	struct driver_data *dd = NULL;
+	char cpu_list[256];
+	const struct cpumask *node_mask;
+	int cpu, i = 0, j = 0;
+	int my_node = NUMA_NO_NODE;
+	unsigned long flags;
+
+	/* Allocate memory for this devices private data. */
+	my_node = pcibus_to_node(pdev->bus);
+	if (my_node != NUMA_NO_NODE) {
+		if (!node_online(my_node))
+			my_node = mtip_get_next_rr_node();
+	} else {
+		dev_info(&pdev->dev, "Kernel not reporting proximity, choosing a node\n");
+		my_node = mtip_get_next_rr_node();
+	}
+	dev_info(&pdev->dev, "NUMA node %d (closest: %d,%d, probe on %d:%d)\n",
+		my_node, pcibus_to_node(pdev->bus), dev_to_node(&pdev->dev),
+		cpu_to_node(raw_smp_processor_id()), raw_smp_processor_id());
+
+	dd = kzalloc_node(sizeof(struct driver_data), GFP_KERNEL, my_node);
+	if (dd == NULL) {
+		dev_err(&pdev->dev,
+			"Unable to allocate memory for driver data\n");
+		return -ENOMEM;
+	}
+
+	/* Attach the private data to this PCI device.  */
+	pci_set_drvdata(pdev, dd);
+
+	rv = pcim_enable_device(pdev);
+	if (rv < 0) {
+		dev_err(&pdev->dev, "Unable to enable device\n");
+		goto iomap_err;
+	}
+
+	/* Map BAR5 to memory. */
+	rv = pcim_iomap_regions(pdev, 1 << MTIP_ABAR, MTIP_DRV_NAME);
+	if (rv < 0) {
+		dev_err(&pdev->dev, "Unable to map regions\n");
+		goto iomap_err;
+	}
+
+	if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
+		rv = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+
+		if (rv) {
+			rv = pci_set_consistent_dma_mask(pdev,
+						DMA_BIT_MASK(32));
+			if (rv) {
+				dev_warn(&pdev->dev,
+					"64-bit DMA enable failed\n");
+				goto setmask_err;
+			}
+		}
+	}
+
+	/* Copy the info we may need later into the private data structure. */
+	dd->major	= mtip_major;
+	dd->instance	= instance;
+	dd->pdev	= pdev;
+	dd->numa_node	= my_node;
+
+	INIT_LIST_HEAD(&dd->online_list);
+	INIT_LIST_HEAD(&dd->remove_list);
+
+	memset(dd->workq_name, 0, 32);
+	snprintf(dd->workq_name, 31, "mtipq%d", dd->instance);
+
+	dd->isr_workq = create_workqueue(dd->workq_name);
+	if (!dd->isr_workq) {
+		dev_warn(&pdev->dev, "Can't create wq %d\n", dd->instance);
+		rv = -ENOMEM;
+		goto block_initialize_err;
+	}
+
+	memset(cpu_list, 0, sizeof(cpu_list));
+
+	node_mask = cpumask_of_node(dd->numa_node);
+	if (!cpumask_empty(node_mask)) {
+		for_each_cpu(cpu, node_mask)
+		{
+			snprintf(&cpu_list[j], 256 - j, "%d ", cpu);
+			j = strlen(cpu_list);
+		}
+
+		dev_info(&pdev->dev, "Node %d on package %d has %d cpu(s): %s\n",
+			dd->numa_node,
+			topology_physical_package_id(cpumask_first(node_mask)),
+			nr_cpus_node(dd->numa_node),
+			cpu_list);
+	} else
+		dev_dbg(&pdev->dev, "mtip32xx: node_mask empty\n");
+
+	dd->isr_binding = get_least_used_cpu_on_node(dd->numa_node);
+	dev_info(&pdev->dev, "Initial IRQ binding node:cpu %d:%d\n",
+		cpu_to_node(dd->isr_binding), dd->isr_binding);
+
+	/* first worker context always runs in ISR */
+	dd->work[0].cpu_binding = dd->isr_binding;
+	dd->work[1].cpu_binding = get_least_used_cpu_on_node(dd->numa_node);
+	dd->work[2].cpu_binding = get_least_used_cpu_on_node(dd->numa_node);
+	dd->work[3].cpu_binding = dd->work[0].cpu_binding;
+	dd->work[4].cpu_binding = dd->work[1].cpu_binding;
+	dd->work[5].cpu_binding = dd->work[2].cpu_binding;
+	dd->work[6].cpu_binding = dd->work[2].cpu_binding;
+	dd->work[7].cpu_binding = dd->work[1].cpu_binding;
+
+	/* Log the bindings */
+	for_each_present_cpu(cpu) {
+		memset(cpu_list, 0, sizeof(cpu_list));
+		for (i = 0, j = 0; i < MTIP_MAX_SLOT_GROUPS; i++) {
+			if (dd->work[i].cpu_binding == cpu) {
+				snprintf(&cpu_list[j], 256 - j, "%d ", i);
+				j = strlen(cpu_list);
+			}
+		}
+		if (j)
+			dev_info(&pdev->dev, "CPU %d: WQs %s\n", cpu, cpu_list);
+	}
+
+	INIT_WORK(&dd->work[0].work, mtip_workq_sdbf0);
+	INIT_WORK(&dd->work[1].work, mtip_workq_sdbf1);
+	INIT_WORK(&dd->work[2].work, mtip_workq_sdbf2);
+	INIT_WORK(&dd->work[3].work, mtip_workq_sdbf3);
+	INIT_WORK(&dd->work[4].work, mtip_workq_sdbf4);
+	INIT_WORK(&dd->work[5].work, mtip_workq_sdbf5);
+	INIT_WORK(&dd->work[6].work, mtip_workq_sdbf6);
+	INIT_WORK(&dd->work[7].work, mtip_workq_sdbf7);
+
+	pci_set_master(pdev);
+	rv = pci_enable_msi(pdev);
+	if (rv) {
+		dev_warn(&pdev->dev,
+			"Unable to enable MSI interrupt.\n");
+		goto msi_initialize_err;
+	}
+
+	mtip_fix_ero_nosnoop(dd, pdev);
+
+	/* Initialize the block layer. */
+	rv = mtip_block_initialize(dd);
+	if (rv < 0) {
+		dev_err(&pdev->dev,
+			"Unable to initialize block layer\n");
+		goto block_initialize_err;
+	}
+
+	/*
+	 * Increment the instance count so that each device has a unique
+	 * instance number.
+	 */
+	instance++;
+	if (rv != MTIP_FTL_REBUILD_MAGIC)
+		set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
+	else
+		rv = 0; /* device in rebuild state, return 0 from probe */
+
+	/* Add to online list even if in ftl rebuild */
+	spin_lock_irqsave(&dev_lock, flags);
+	list_add(&dd->online_list, &online_list);
+	spin_unlock_irqrestore(&dev_lock, flags);
+
+	goto done;
+
+block_initialize_err:
+	pci_disable_msi(pdev);
+
+msi_initialize_err:
+	if (dd->isr_workq) {
+		flush_workqueue(dd->isr_workq);
+		destroy_workqueue(dd->isr_workq);
+		drop_cpu(dd->work[0].cpu_binding);
+		drop_cpu(dd->work[1].cpu_binding);
+		drop_cpu(dd->work[2].cpu_binding);
+	}
+setmask_err:
+	pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
+
+iomap_err:
+	kfree(dd);
+	pci_set_drvdata(pdev, NULL);
+	return rv;
+done:
+	return rv;
+}
+
+/*
+ * Called for each probed device when the device is removed or the
+ * driver is unloaded.
+ *
+ * return value
+ *	None
+ */
+static void mtip_pci_remove(struct pci_dev *pdev)
+{
+	struct driver_data *dd = pci_get_drvdata(pdev);
+	unsigned long flags, to;
+
+	set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag);
+
+	spin_lock_irqsave(&dev_lock, flags);
+	list_del_init(&dd->online_list);
+	list_add(&dd->remove_list, &removing_list);
+	spin_unlock_irqrestore(&dev_lock, flags);
+
+	mtip_check_surprise_removal(pdev);
+	synchronize_irq(dd->pdev->irq);
+
+	/* Spin until workers are done */
+	to = jiffies + msecs_to_jiffies(4000);
+	do {
+		msleep(20);
+	} while (atomic_read(&dd->irq_workers_active) != 0 &&
+		time_before(jiffies, to));
+
+	if (atomic_read(&dd->irq_workers_active) != 0) {
+		dev_warn(&dd->pdev->dev,
+			"Completion workers still active!\n");
+	}
+
+	/* Clean up the block layer. */
+	mtip_block_remove(dd);
+
+	if (dd->isr_workq) {
+		flush_workqueue(dd->isr_workq);
+		destroy_workqueue(dd->isr_workq);
+		drop_cpu(dd->work[0].cpu_binding);
+		drop_cpu(dd->work[1].cpu_binding);
+		drop_cpu(dd->work[2].cpu_binding);
+	}
+
+	pci_disable_msi(pdev);
+
+	spin_lock_irqsave(&dev_lock, flags);
+	list_del_init(&dd->remove_list);
+	spin_unlock_irqrestore(&dev_lock, flags);
+
+	if (!dd->sr)
+		kfree(dd);
+	else
+		set_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag);
+
+	pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
+	pci_set_drvdata(pdev, NULL);
+}
+
+/*
+ * Called for each probed device when the device is suspended.
+ *
+ * return value
+ *	0  Success
+ *	<0 Error
+ */
+static int mtip_pci_suspend(struct pci_dev *pdev, pm_message_t mesg)
+{
+	int rv = 0;
+	struct driver_data *dd = pci_get_drvdata(pdev);
+
+	if (!dd) {
+		dev_err(&pdev->dev,
+			"Driver private datastructure is NULL\n");
+		return -EFAULT;
+	}
+
+	set_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag);
+
+	/* Disable ports & interrupts then send standby immediate */
+	rv = mtip_block_suspend(dd);
+	if (rv < 0) {
+		dev_err(&pdev->dev,
+			"Failed to suspend controller\n");
+		return rv;
+	}
+
+	/*
+	 * Save the pci config space to pdev structure &
+	 * disable the device
+	 */
+	pci_save_state(pdev);
+	pci_disable_device(pdev);
+
+	/* Move to Low power state*/
+	pci_set_power_state(pdev, PCI_D3hot);
+
+	return rv;
+}
+
+/*
+ * Called for each probed device when the device is resumed.
+ *
+ * return value
+ *      0  Success
+ *      <0 Error
+ */
+static int mtip_pci_resume(struct pci_dev *pdev)
+{
+	int rv = 0;
+	struct driver_data *dd;
+
+	dd = pci_get_drvdata(pdev);
+	if (!dd) {
+		dev_err(&pdev->dev,
+			"Driver private datastructure is NULL\n");
+		return -EFAULT;
+	}
+
+	/* Move the device to active State */
+	pci_set_power_state(pdev, PCI_D0);
+
+	/* Restore PCI configuration space */
+	pci_restore_state(pdev);
+
+	/* Enable the PCI device*/
+	rv = pcim_enable_device(pdev);
+	if (rv < 0) {
+		dev_err(&pdev->dev,
+			"Failed to enable card during resume\n");
+		goto err;
+	}
+	pci_set_master(pdev);
+
+	/*
+	 * Calls hbaReset, initPort, & startPort function
+	 * then enables interrupts
+	 */
+	rv = mtip_block_resume(dd);
+	if (rv < 0)
+		dev_err(&pdev->dev, "Unable to resume\n");
+
+err:
+	clear_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag);
+
+	return rv;
+}
+
+/*
+ * PCI .shutdown callback: run mtip_block_shutdown() on the driver data
+ * attached to @pdev, or do nothing when none is attached.
+ */
+static void mtip_pci_shutdown(struct pci_dev *pdev)
+{
+	struct driver_data *drv = pci_get_drvdata(pdev);
+
+	if (!drv)
+		return;
+	mtip_block_shutdown(drv);
+}
+
+/* Table of device ids supported by this driver. */
+static DEFINE_PCI_DEVICE_TABLE(mtip_pci_tbl) = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320H_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320M_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320S_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P325M_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P420H_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P420M_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P425M_DEVICE_ID) },
+	{ 0 }
+};
+
+/* Structure that describes the PCI driver functions. */
+static struct pci_driver mtip_pci_driver = {
+	.name			= MTIP_DRV_NAME,
+	.id_table		= mtip_pci_tbl,
+	.probe			= mtip_pci_probe,
+	.remove			= mtip_pci_remove,
+	.suspend		= mtip_pci_suspend,
+	.resume			= mtip_pci_resume,
+	.shutdown		= mtip_pci_shutdown,
+};
+
+MODULE_DEVICE_TABLE(pci, mtip_pci_tbl);
+
+/*
+ * Module initialization function.
+ *
+ * Called once when the module is loaded. This function allocates a major
+ * block device number to the Cyclone devices and registers the PCI layer
+ * of the driver.
+ *
+ * Return value
+ *      0 on success else error code.
+ */
+static int __init mtip_init(void)
+{
+	int error;
+
+	pr_info(MTIP_DRV_NAME " Version " MTIP_DRV_VERSION "\n");
+
+	spin_lock_init(&dev_lock);
+
+	INIT_LIST_HEAD(&online_list);
+	INIT_LIST_HEAD(&removing_list);
+
+	/* Allocate a major block device number to use with this driver. */
+	error = register_blkdev(0, MTIP_DRV_NAME);
+	if (error <= 0) {
+		pr_err("Unable to register block device (%d)\n", error);
+		return -EBUSY;
+	}
+	mtip_major = error;
+
+	dfs_parent = debugfs_create_dir("rssd", NULL);
+	if (IS_ERR_OR_NULL(dfs_parent)) {
+		pr_warn("Error creating debugfs parent\n");
+		dfs_parent = NULL;
+	}
+	if (dfs_parent) {
+		dfs_device_status = debugfs_create_file("device_status",
+					S_IRUGO, dfs_parent, NULL,
+					&mtip_device_status_fops);
+		if (IS_ERR_OR_NULL(dfs_device_status)) {
+			pr_err("Error creating device_status node\n");
+			dfs_device_status = NULL;
+		}
+	}
+
+	/* Register our PCI operations. */
+	error = pci_register_driver(&mtip_pci_driver);
+	if (error) {
+		/* dfs_parent may already hold device_status: remove the tree */
+		debugfs_remove_recursive(dfs_parent);
+		unregister_blkdev(mtip_major, MTIP_DRV_NAME);
+	}
+
+	return error;
+}
+
+/*
+ * Module de-initialization function.
+ *
+ * Called once when the module is unloaded. This function deallocates
+ * the major block device number allocated by mtip_init() and
+ * unregisters the PCI layer of the driver.
+ *
+ * Return value
+ *      none
+ */
+static void __exit mtip_exit(void)
+{
+	/* Release the allocated major block device number. */
+	unregister_blkdev(mtip_major, MTIP_DRV_NAME);
+
+	/* Unregister the PCI driver. */
+	pci_unregister_driver(&mtip_pci_driver);
+
+	debugfs_remove_recursive(dfs_parent);
+}
+
+MODULE_AUTHOR("Micron Technology, Inc");
+MODULE_DESCRIPTION("Micron RealSSD PCIe Block Driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(MTIP_DRV_VERSION);
+
+module_init(mtip_init);
+module_exit(mtip_exit);
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
new file mode 100644
index 00000000000..ba1b31ee22e
--- /dev/null
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -0,0 +1,511 @@
+/*
+ * mtip32xx.h - Header file for the P320 SSD Block Driver
+ *   Copyright (C) 2011 Micron Technology, Inc.
+ *
+ * Portions of this code were derived from works subjected to the
+ * following copyright:
+ *    Copyright (C) 2009 Integrated Device Technology, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __MTIP32XX_H__
+#define __MTIP32XX_H__
+
+#include <linux/spinlock.h>
+#include <linux/rwsem.h>
+#include <linux/ata.h>
+#include <linux/interrupt.h>
+#include <linux/genhd.h>
+
+/* Offset of Subsystem Device ID in pci configuration space */
+#define PCI_SUBSYSTEM_DEVICEID	0x2E
+
+/* offset of Device Control register in PCIe extended capabilities space */
+#define PCIE_CONFIG_EXT_DEVICE_CONTROL_OFFSET	0x48
+
+/* check for erase mode support during secure erase */
+#define MTIP_SEC_ERASE_MODE     0x2
+
+/* # of times to retry timed out/failed IOs */
+#define MTIP_MAX_RETRIES	2
+
+/* Various timeout values in ms */
+#define MTIP_NCQ_CMD_TIMEOUT_MS      15000
+#define MTIP_IOCTL_CMD_TIMEOUT_MS    5000
+#define MTIP_INT_CMD_TIMEOUT_MS      5000
+#define MTIP_QUIESCE_IO_TIMEOUT_MS   (MTIP_NCQ_CMD_TIMEOUT_MS * \
+				     (MTIP_MAX_RETRIES + 1))
+
+/* check for timeouts every 500ms */
+#define MTIP_TIMEOUT_CHECK_PERIOD	500
+
+/* ftl rebuild */
+#define MTIP_FTL_REBUILD_OFFSET		142
+#define MTIP_FTL_REBUILD_MAGIC		0xED51
+#define MTIP_FTL_REBUILD_TIMEOUT_MS	2400000
+
+/* unaligned IO handling */
+#define MTIP_MAX_UNALIGNED_SLOTS	2
+
+/* Macro to extract the tag bit number (0-31) from a tag value. */
+#define MTIP_TAG_BIT(tag)	((tag) & 0x1F)
+
+/*
+ * Macro to extract the tag index from a tag value. The index
+ * is used to access the correct s_active/Command Issue register based
+ * on the tag value.
+ */
+#define MTIP_TAG_INDEX(tag)	((tag) >> 5)
+
+/*
+ * Maximum number of scatter gather entries
+ * a single command may have.
+ */
+#define MTIP_MAX_SG		504
+
+/*
+ * Maximum number of slot groups (Command Issue & s_active registers)
+ * NOTE: This is the driver maximum; check dd->slot_groups for actual value.
+ */
+#define MTIP_MAX_SLOT_GROUPS	8
+
+/* Internal command tag. */
+#define MTIP_TAG_INTERNAL	0
+
+/* Micron Vendor ID & P320x SSD Device ID */
+#define PCI_VENDOR_ID_MICRON    0x1344
+#define P320H_DEVICE_ID		0x5150
+#define P320M_DEVICE_ID		0x5151
+#define P320S_DEVICE_ID		0x5152
+#define P325M_DEVICE_ID		0x5153
+#define P420H_DEVICE_ID		0x5160
+#define P420M_DEVICE_ID		0x5161
+#define P425M_DEVICE_ID		0x5163
+
+/* Driver name and version strings */
+#define MTIP_DRV_NAME		"mtip32xx"
+#define MTIP_DRV_VERSION	"1.3.1"
+
+/* Maximum number of minor device numbers per device. */
+#define MTIP_MAX_MINORS		16
+
+/* Maximum number of supported command slots. */
+#define MTIP_MAX_COMMAND_SLOTS	(MTIP_MAX_SLOT_GROUPS * 32)
+
+/*
+ * Per-tag bitfield size in longs.
+ * Linux bit manipulation functions
+ * (i.e. test_and_set_bit, find_next_zero_bit)
+ * manipulate memory in longs, so we try to make the math work.
+ * take the slot groups and find the number of longs, rounding up.
+ * Careful! i386 and x86_64 use different size longs!
+ */
+#define U32_PER_LONG	(sizeof(long) / sizeof(u32))
+#define SLOTBITS_IN_LONGS ((MTIP_MAX_SLOT_GROUPS + \
+					(U32_PER_LONG-1))/U32_PER_LONG)
+
+/* BAR number used to access the HBA registers. */
+#define MTIP_ABAR		5
+
+#ifdef DEBUG	/* expansion has no ';' - the caller supplies it */
+ #define dbg_printk(format, arg...)	\
+	printk(pr_fmt(format), ##arg)
+#else		/* !DEBUG: dbg_printk() expands to nothing */
+ #define dbg_printk(format, arg...)
+#endif
+
+#define MTIP_DFS_MAX_BUF_SIZE 1024
+
+#define __force_bit2int (unsigned int __force)
+
+enum {
+	/* below are bit numbers in 'flags' defined in mtip_port */
+	MTIP_PF_IC_ACTIVE_BIT       = 0, /* pio/ioctl */
+	MTIP_PF_EH_ACTIVE_BIT       = 1, /* error handling */
+	MTIP_PF_SE_ACTIVE_BIT       = 2, /* secure erase */
+	MTIP_PF_DM_ACTIVE_BIT       = 3, /* download microcode */
+	MTIP_PF_PAUSE_IO      =	((1 << MTIP_PF_IC_ACTIVE_BIT) |
+				(1 << MTIP_PF_EH_ACTIVE_BIT) |
+				(1 << MTIP_PF_SE_ACTIVE_BIT) |
+				(1 << MTIP_PF_DM_ACTIVE_BIT)),
+
+	MTIP_PF_SVC_THD_ACTIVE_BIT  = 4,
+	MTIP_PF_ISSUE_CMDS_BIT      = 5,
+	MTIP_PF_REBUILD_BIT         = 6,
+	MTIP_PF_SR_CLEANUP_BIT      = 7,
+	MTIP_PF_SVC_THD_STOP_BIT    = 8,
+
+	/* below are bit numbers in 'dd_flag' defined in driver_data */
+	MTIP_DDF_SEC_LOCK_BIT	    = 0,
+	MTIP_DDF_REMOVE_PENDING_BIT = 1,
+	MTIP_DDF_OVER_TEMP_BIT      = 2,
+	MTIP_DDF_WRITE_PROTECT_BIT  = 3,
+	MTIP_DDF_REMOVE_DONE_BIT    = 4,
+	MTIP_DDF_CLEANUP_BIT        = 5,
+	MTIP_DDF_RESUME_BIT         = 6,
+	MTIP_DDF_INIT_DONE_BIT      = 7,
+	MTIP_DDF_REBUILD_FAILED_BIT = 8,
+
+	MTIP_DDF_STOP_IO      = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) |
+				(1 << MTIP_DDF_SEC_LOCK_BIT) |
+				(1 << MTIP_DDF_OVER_TEMP_BIT) |
+				(1 << MTIP_DDF_WRITE_PROTECT_BIT) |
+				(1 << MTIP_DDF_REBUILD_FAILED_BIT)),
+
+};
+
+struct smart_attr {
+	u8 attr_id;
+	u16 flags;
+	u8 cur;
+	u8 worst;
+	u32 data;
+	u8 res[3];
+} __packed;
+
+struct mtip_work {
+	struct work_struct work;
+	void *port;
+	int cpu_binding;
+	u32 completed;
+} ____cacheline_aligned_in_smp;
+
+#define DEFINE_HANDLER(group) /* emits work handler mtip_workq_sdbf<group> */ \
+	void mtip_workq_sdbf##group(struct work_struct *work)       \
+	{                                                      \
+		struct mtip_work *w = container_of(work, struct mtip_work, work); \
+		mtip_workq_sdbfx(w->port, group, w->completed);     \
+	}
+
+#define MTIP_TRIM_TIMEOUT_MS		240000
+#define MTIP_MAX_TRIM_ENTRIES		8
+#define MTIP_MAX_TRIM_ENTRY_LEN		0xfff8
+
+struct mtip_trim_entry {
+	u32 lba;   /* starting lba of region */
+	u16 rsvd;  /* unused */
+	u16 range; /* # of 512b blocks to trim */
+} __packed;
+
+struct mtip_trim {
+	/* Array of regions to trim */
+	struct mtip_trim_entry entry[MTIP_MAX_TRIM_ENTRIES];
+} __packed;
+
+/* Register Frame Information Structure (FIS), host to device. */
+struct host_to_dev_fis {
+	/*
+	 * FIS type.
+	 * - 27h Register FIS, host to device.
+	 * - 34h Register FIS, device to host.
+	 * - 39h DMA Activate FIS, device to host.
+	 * - 41h DMA Setup FIS, bi-directional.
+	 * - 46h Data FIS, bi-directional.
+	 * - 58h BIST Activate FIS, bi-directional.
+	 * - 5Fh PIO Setup FIS, device to host.
+	 * - A1h Set Device Bits FIS, device to host.
+	 */
+	unsigned char type;
+	unsigned char opts;
+	unsigned char command;
+	unsigned char features;
+
+	union {
+		unsigned char lba_low;
+		unsigned char sector;
+	};
+	union {
+		unsigned char lba_mid;
+		unsigned char cyl_low;
+	};
+	union {
+		unsigned char lba_hi;
+		unsigned char cyl_hi;
+	};
+	union {
+		unsigned char device;
+		unsigned char head;
+	};
+
+	union {
+		unsigned char lba_low_ex;
+		unsigned char sector_ex;
+	};
+	union {
+		unsigned char lba_mid_ex;
+		unsigned char cyl_low_ex;
+	};
+	union {
+		unsigned char lba_hi_ex;
+		unsigned char cyl_hi_ex;
+	};
+	unsigned char features_ex;
+
+	unsigned char sect_count;
+	unsigned char sect_cnt_ex;
+	unsigned char res2;
+	unsigned char control;
+
+	unsigned int res3;
+};
+
+/* Command header structure. */
+struct mtip_cmd_hdr {
+	/*
+	 * Command options.
+	 * - Bits 31:16 Number of PRD entries.
+	 * - Bits 15:8 Unused in this implementation.
+	 * - Bit 7 Prefetch bit, informs the drive to prefetch PRD entries.
+	 * - Bit 6 Write bit, should be set when writing data to the device.
+	 * - Bit 5 Unused in this implementation.
+	 * - Bits 4:0 Length of the command FIS in DWords (DWord = 4 bytes).
+	 */
+	unsigned int opts;
+	/* This field is unused when using NCQ. */
+	union {
+		unsigned int byte_count;
+		unsigned int status;
+	};
+	/*
+	 * Lower 32 bits of the command table address associated with this
+	 * header. The command table addresses must be 128 byte aligned.
+	 */
+	unsigned int ctba;
+	/*
+	 * If 64 bit addressing is used this field is the upper 32 bits
+	 * of the command table address associated with this command.
+	 */
+	unsigned int ctbau;
+	/* Reserved and unused. */
+	unsigned int res[4];
+};
+
+/* Command scatter gather structure (PRD). */
+struct mtip_cmd_sg {
+	/*
+	 * Low 32 bits of the data buffer address. For P320 this
+	 * address must be 8 byte aligned signified by bits 2:0 being
+	 * set to 0.
+	 */
+	unsigned int dba;
+	/*
+	 * When 64 bit addressing is used this field is the upper
+	 * 32 bits of the data buffer address.
+	 */
+	unsigned int dba_upper;
+	/* Unused. */
+	unsigned int reserved;
+	/*
+	 * Bit 31: interrupt when this data block has been transferred.
+	 * Bits 30..22: reserved
+	 * Bits 21..0: byte count (minus 1).  For P320 the byte count must be
+	 * 8 byte aligned signified by bits 2:0 being set to 1.
+	 */
+	unsigned int info;
+};
+struct mtip_port;
+
+/* Structure used to describe a command. */
+struct mtip_cmd {
+
+	struct mtip_cmd_hdr *command_header; /* ptr to command header entry */
+
+	dma_addr_t command_header_dma; /* corresponding physical address */
+
+	void *command; /* ptr to command table entry */
+
+	dma_addr_t command_dma; /* corresponding physical address */
+
+	void *comp_data; /* data passed to completion function comp_func() */
+	/*
+	 * Completion function called by the ISR upon completion of
+	 * a command.
+	 */
+	void (*comp_func)(struct mtip_port *port,
+				int tag,
+				struct mtip_cmd *cmd,
+				int status);
+
+	int scatter_ents; /* Number of scatter list entries used */
+
+	int unaligned; /* command is unaligned on 4k boundary */
+
+	struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */
+
+	int retries; /* The number of retries left for this command. */
+
+	int direction; /* Data transfer direction */
+};
+
+/* Structure used to describe a port. */
+struct mtip_port {
+	/* Pointer back to the driver data for this port. */
+	struct driver_data *dd;
+	/*
+	 * Used to determine if the data pointed to by the
+	 * identify field is valid.
+	 */
+	unsigned long identify_valid;
+	/* Base address of the memory mapped IO for the port. */
+	void __iomem *mmio;
+	/* Array of pointers to the memory mapped s_active registers. */
+	void __iomem *s_active[MTIP_MAX_SLOT_GROUPS];
+	/* Array of pointers to the memory mapped completed registers. */
+	void __iomem *completed[MTIP_MAX_SLOT_GROUPS];
+	/* Array of pointers to the memory mapped Command Issue registers. */
+	void __iomem *cmd_issue[MTIP_MAX_SLOT_GROUPS];
+	/*
+	 * Pointer to the beginning of the command header memory as used
+	 * by the driver.
+	 */
+	void *command_list;
+	/*
+	 * Pointer to the beginning of the command header memory as used
+	 * by the DMA.
+	 */
+	dma_addr_t command_list_dma;
+	/*
+	 * Pointer to the beginning of the RX FIS memory as used
+	 * by the driver.
+	 */
+	void *rxfis;
+	/*
+	 * Pointer to the beginning of the RX FIS memory as used
+	 * by the DMA.
+	 */
+	dma_addr_t rxfis_dma;
+	/*
+	 * Pointer to the DMA region for RX Fis, Identify, RLE10, and SMART
+	 */
+	void *block1;
+	/*
+	 * DMA address of region for RX Fis, Identify, RLE10, and SMART
+	 */
+	dma_addr_t block1_dma;
+	/*
+	 * Pointer to the beginning of the identify data memory as used
+	 * by the driver.
+	 */
+	u16 *identify;
+	/*
+	 * Pointer to the beginning of the identify data memory as used
+	 * by the DMA.
+	 */
+	dma_addr_t identify_dma;
+	/*
+	 * Pointer to the beginning of a sector buffer that is used
+	 * by the driver when issuing internal commands.
+	 */
+	u16 *sector_buffer;
+	/*
+	 * Pointer to the beginning of a sector buffer that is used
+	 * by the DMA when the driver issues internal commands.
+	 */
+	dma_addr_t sector_buffer_dma;
+
+	u16 *log_buf;
+	dma_addr_t log_buf_dma;
+
+	u8 *smart_buf;
+	dma_addr_t smart_buf_dma;
+
+	/*
+	 * Bit significant, used to determine if a command slot has been
+	 * allocated, i.e. the slot is in use.  Bits are cleared when the
+	 * command slot and all associated data structures are no longer needed.
+	 */
+	unsigned long allocated[SLOTBITS_IN_LONGS];
+	/*
+	 * used to queue commands when an internal command is in progress
+	 * or error handling is active
+	 */
+	unsigned long cmds_to_issue[SLOTBITS_IN_LONGS];
+	/* Used by mtip_service_thread to wait for an event */
+	wait_queue_head_t svc_wait;
+	/*
+	 * indicates the state of the port. Also, helps the service thread
+	 * to determine its action on wake up.
+	 */
+	unsigned long flags;
+	/*
+	 * Timer used to complete commands that have been active for too long.
+	 */
+	unsigned long ic_pause_timer;
+
+	/* Semaphore to control queue depth of unaligned IOs */
+	struct semaphore cmd_slot_unal;
+
+	/* Spinlock for working around command-issue bug. */
+	spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS];
+};
+
+/*
+ * Driver private data structure.
+ *
+ * One structure is allocated per probed device.
+ */
+struct driver_data {
+	void __iomem *mmio; /* Base address of the HBA registers. */
+
+	int major; /* Major device number. */
+
+	int instance; /* Instance number. First device probed is 0, ... */
+
+	struct gendisk *disk; /* Pointer to our gendisk structure. */
+
+	struct pci_dev *pdev; /* Pointer to the PCI device structure. */
+
+	struct request_queue *queue; /* Our request queue. */
+
+	struct blk_mq_tag_set tags; /* blk_mq tags */
+
+	struct mtip_port *port; /* Pointer to the port data structure. */
+
+	unsigned product_type; /* magic value declaring the product type */
+
+	unsigned slot_groups; /* number of slot groups the product supports */
+
+	unsigned long index; /* Index to determine the disk name */
+
+	unsigned long dd_flag; /* NOTE: use atomic bit operations on this */
+
+	struct task_struct *mtip_svc_handler; /* task_struct of svc thd */
+
+	struct dentry *dfs_node;
+
+	bool trim_supp; /* flag indicating trim support */
+
+	bool sr;
+
+	int numa_node; /* NUMA support */
+
+	char workq_name[32];
+
+	struct workqueue_struct *isr_workq;
+
+	atomic_t irq_workers_active;
+
+	struct mtip_work work[MTIP_MAX_SLOT_GROUPS];
+
+	int isr_binding;
+
+	struct block_device *bdev;
+
+	struct list_head online_list; /* linkage for online list */
+
+	struct list_head remove_list; /* linkage for removing list */
+
+	int unal_qdepth; /* qdepth of unaligned IO queue */
+};
+
+#endif
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 5d23ffad7c7..fb31b8ee437 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -4,7 +4,7 @@
  * Note that you can not swap over this thing, yet. Seems to work but
  * deadlocks sometimes - you can not swap over TCP in general.
  * 
- * Copyright 1997-2000, 2008 Pavel Machek <pavel@suse.cz>
+ * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
  * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
  *
  * This file is released under GPLv2 or later.
@@ -24,20 +24,21 @@
 #include <linux/errno.h>
 #include <linux/file.h>
 #include <linux/ioctl.h>
+#include <linux/mutex.h>
 #include <linux/compiler.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include <net/sock.h>
 #include <linux/net.h>
 #include <linux/kthread.h>
 
 #include <asm/uaccess.h>
-#include <asm/system.h>
 #include <asm/types.h>
 
 #include <linux/nbd.h>
 
-#define LO_MAGIC 0x68797548
+#define NBD_MAGIC 0x68797548
 
 #ifdef NDEBUG
 #define dprintk(flags, fmt...)
@@ -77,6 +78,8 @@ static const char *ioctl_cmd_to_ascii(int cmd)
 	case NBD_SET_SOCK: return "set-sock";
 	case NBD_SET_BLKSIZE: return "set-blksize";
 	case NBD_SET_SIZE: return "set-size";
+	case NBD_SET_TIMEOUT: return "set-timeout";
+	case NBD_SET_FLAGS: return "set-flags";
 	case NBD_DO_IT: return "do-it";
 	case NBD_CLEAR_SOCK: return "clear-sock";
 	case NBD_CLEAR_QUE: return "clear-que";
@@ -95,6 +98,8 @@ static const char *nbdcmd_to_ascii(int cmd)
 	case  NBD_CMD_READ: return "read";
 	case NBD_CMD_WRITE: return "write";
 	case  NBD_CMD_DISC: return "disconnect";
+	case NBD_CMD_FLUSH: return "flush";
+	case  NBD_CMD_TRIM: return "trim/discard";
 	}
 	return "invalid";
 }
@@ -114,7 +119,7 @@ static void nbd_end_request(struct request *req)
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void sock_shutdown(struct nbd_device *lo, int lock)
+static void sock_shutdown(struct nbd_device *nbd, int lock)
 {
 	/* Forcibly shutdown the socket causing all listeners
 	 * to error
@@ -123,15 +128,14 @@ static void sock_shutdown(struct nbd_device *lo, int lock)
 	 * there should be a more generic interface rather than
 	 * calling socket ops directly here */
 	if (lock)
-		mutex_lock(&lo->tx_lock);
-	if (lo->sock) {
-		printk(KERN_WARNING "%s: shutting down socket\n",
-			lo->disk->disk_name);
-		kernel_sock_shutdown(lo->sock, SHUT_RDWR);
-		lo->sock = NULL;
+		mutex_lock(&nbd->tx_lock);
+	if (nbd->sock) {
+		dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
+		kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
+		nbd->sock = NULL;
 	}
 	if (lock)
-		mutex_unlock(&lo->tx_lock);
+		mutex_unlock(&nbd->tx_lock);
 }
 
 static void nbd_xmit_timeout(unsigned long arg)
@@ -146,18 +150,20 @@ static void nbd_xmit_timeout(unsigned long arg)
 /*
  *  Send or receive packet.
  */
-static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
+static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
 		int msg_flags)
 {
-	struct socket *sock = lo->sock;
+	struct socket *sock = nbd->sock;
 	int result;
 	struct msghdr msg;
 	struct kvec iov;
 	sigset_t blocked, oldset;
+	unsigned long pflags = current->flags;
 
 	if (unlikely(!sock)) {
-		printk(KERN_ERR "%s: Attempted %s on closed socket in sock_xmit\n",
-		       lo->disk->disk_name, (send ? "send" : "recv"));
+		dev_err(disk_to_dev(nbd->disk),
+			"Attempted %s on closed socket in sock_xmit\n",
+			(send ? "send" : "recv"));
 		return -EINVAL;
 	}
 
@@ -166,8 +172,9 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 	siginitsetinv(&blocked, sigmask(SIGKILL));
 	sigprocmask(SIG_SETMASK, &blocked, &oldset);
 
+	current->flags |= PF_MEMALLOC;
 	do {
-		sock->sk->sk_allocation = GFP_NOIO;
+		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
 		iov.iov_base = buf;
 		iov.iov_len = size;
 		msg.msg_name = NULL;
@@ -179,18 +186,19 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 		if (send) {
 			struct timer_list ti;
 
-			if (lo->xmit_timeout) {
+			if (nbd->xmit_timeout) {
 				init_timer(&ti);
 				ti.function = nbd_xmit_timeout;
 				ti.data = (unsigned long)current;
-				ti.expires = jiffies + lo->xmit_timeout;
+				ti.expires = jiffies + nbd->xmit_timeout;
 				add_timer(&ti);
 			}
 			result = kernel_sendmsg(sock, &msg, &iov, 1, size);
-			if (lo->xmit_timeout)
+			if (nbd->xmit_timeout)
 				del_timer_sync(&ti);
 		} else
-			result = kernel_recvmsg(sock, &msg, &iov, 1, size, 0);
+			result = kernel_recvmsg(sock, &msg, &iov, 1, size,
+						msg.msg_flags);
 
 		if (signal_pending(current)) {
 			siginfo_t info;
@@ -198,7 +206,7 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 				task_pid_nr(current), current->comm,
 				dequeue_signal_lock(current, &current->blocked, &info));
 			result = -EINTR;
-			sock_shutdown(lo, !send);
+			sock_shutdown(nbd, !send);
 			break;
 		}
 
@@ -212,63 +220,70 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 	} while (size > 0);
 
 	sigprocmask(SIG_SETMASK, &oldset, NULL);
+	tsk_restore_flags(current, pflags, PF_MEMALLOC);
 
 	return result;
 }
 
-static inline int sock_send_bvec(struct nbd_device *lo, struct bio_vec *bvec,
+static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
 		int flags)
 {
 	int result;
 	void *kaddr = kmap(bvec->bv_page);
-	result = sock_xmit(lo, 1, kaddr + bvec->bv_offset, bvec->bv_len, flags);
+	result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
+			   bvec->bv_len, flags);
 	kunmap(bvec->bv_page);
 	return result;
 }
 
 /* always call with the tx_lock held */
-static int nbd_send_req(struct nbd_device *lo, struct request *req)
+static int nbd_send_req(struct nbd_device *nbd, struct request *req)
 {
 	int result, flags;
 	struct nbd_request request;
 	unsigned long size = blk_rq_bytes(req);
 
+	memset(&request, 0, sizeof(request));
 	request.magic = htonl(NBD_REQUEST_MAGIC);
 	request.type = htonl(nbd_cmd(req));
-	request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
-	request.len = htonl(size);
+
+	if (nbd_cmd(req) != NBD_CMD_FLUSH && nbd_cmd(req) != NBD_CMD_DISC) {
+		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
+		request.len = htonl(size);
+	}
 	memcpy(request.handle, &req, sizeof(req));
 
 	dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n",
-			lo->disk->disk_name, req,
+			nbd->disk->disk_name, req,
 			nbdcmd_to_ascii(nbd_cmd(req)),
 			(unsigned long long)blk_rq_pos(req) << 9,
 			blk_rq_bytes(req));
-	result = sock_xmit(lo, 1, &request, sizeof(request),
+	result = sock_xmit(nbd, 1, &request, sizeof(request),
 			(nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0);
 	if (result <= 0) {
-		printk(KERN_ERR "%s: Send control failed (result %d)\n",
-				lo->disk->disk_name, result);
+		dev_err(disk_to_dev(nbd->disk),
+			"Send control failed (result %d)\n", result);
 		goto error_out;
 	}
 
 	if (nbd_cmd(req) == NBD_CMD_WRITE) {
 		struct req_iterator iter;
-		struct bio_vec *bvec;
+		struct bio_vec bvec;
 		/*
 		 * we are really probing at internals to determine
 		 * whether to set MSG_MORE or not...
 		 */
 		rq_for_each_segment(bvec, req, iter) {
 			flags = 0;
-			if (!rq_iter_last(req, iter))
+			if (!rq_iter_last(bvec, iter))
 				flags = MSG_MORE;
 			dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n",
-					lo->disk->disk_name, req, bvec->bv_len);
-			result = sock_send_bvec(lo, bvec, flags);
+					nbd->disk->disk_name, req, bvec.bv_len);
+			result = sock_send_bvec(nbd, &bvec, flags);
 			if (result <= 0) {
-				printk(KERN_ERR "%s: Send data failed (result %d)\n",
-						lo->disk->disk_name, result);
+				dev_err(disk_to_dev(nbd->disk),
+					"Send data failed (result %d)\n",
+					result);
 				goto error_out;
 			}
 		}
@@ -279,25 +294,25 @@ error_out:
 	return -EIO;
 }
 
-static struct request *nbd_find_request(struct nbd_device *lo,
+static struct request *nbd_find_request(struct nbd_device *nbd,
 					struct request *xreq)
 {
 	struct request *req, *tmp;
 	int err;
 
-	err = wait_event_interruptible(lo->active_wq, lo->active_req != xreq);
+	err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq);
 	if (unlikely(err))
 		goto out;
 
-	spin_lock(&lo->queue_lock);
-	list_for_each_entry_safe(req, tmp, &lo->queue_head, queuelist) {
+	spin_lock(&nbd->queue_lock);
+	list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) {
 		if (req != xreq)
 			continue;
 		list_del_init(&req->queuelist);
-		spin_unlock(&lo->queue_lock);
+		spin_unlock(&nbd->queue_lock);
 		return req;
 	}
-	spin_unlock(&lo->queue_lock);
+	spin_unlock(&nbd->queue_lock);
 
 	err = -ENOENT;
 
@@ -305,79 +320,78 @@ out:
 	return ERR_PTR(err);
 }
 
-static inline int sock_recv_bvec(struct nbd_device *lo, struct bio_vec *bvec)
+static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
 {
 	int result;
 	void *kaddr = kmap(bvec->bv_page);
-	result = sock_xmit(lo, 0, kaddr + bvec->bv_offset, bvec->bv_len,
+	result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
 			MSG_WAITALL);
 	kunmap(bvec->bv_page);
 	return result;
 }
 
 /* NULL returned = something went wrong, inform userspace */
-static struct request *nbd_read_stat(struct nbd_device *lo)
+static struct request *nbd_read_stat(struct nbd_device *nbd)
 {
 	int result;
 	struct nbd_reply reply;
 	struct request *req;
 
 	reply.magic = 0;
-	result = sock_xmit(lo, 0, &reply, sizeof(reply), MSG_WAITALL);
+	result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
 	if (result <= 0) {
-		printk(KERN_ERR "%s: Receive control failed (result %d)\n",
-				lo->disk->disk_name, result);
+		dev_err(disk_to_dev(nbd->disk),
+			"Receive control failed (result %d)\n", result);
 		goto harderror;
 	}
 
 	if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
-		printk(KERN_ERR "%s: Wrong magic (0x%lx)\n",
-				lo->disk->disk_name,
+		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
 				(unsigned long)ntohl(reply.magic));
 		result = -EPROTO;
 		goto harderror;
 	}
 
-	req = nbd_find_request(lo, *(struct request **)reply.handle);
+	req = nbd_find_request(nbd, *(struct request **)reply.handle);
 	if (IS_ERR(req)) {
 		result = PTR_ERR(req);
 		if (result != -ENOENT)
 			goto harderror;
 
-		printk(KERN_ERR "%s: Unexpected reply (%p)\n",
-				lo->disk->disk_name, reply.handle);
+		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
+			reply.handle);
 		result = -EBADR;
 		goto harderror;
 	}
 
 	if (ntohl(reply.error)) {
-		printk(KERN_ERR "%s: Other side returned error (%d)\n",
-				lo->disk->disk_name, ntohl(reply.error));
+		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
+			ntohl(reply.error));
 		req->errors++;
 		return req;
 	}
 
 	dprintk(DBG_RX, "%s: request %p: got reply\n",
-			lo->disk->disk_name, req);
+			nbd->disk->disk_name, req);
 	if (nbd_cmd(req) == NBD_CMD_READ) {
 		struct req_iterator iter;
-		struct bio_vec *bvec;
+		struct bio_vec bvec;
 
 		rq_for_each_segment(bvec, req, iter) {
-			result = sock_recv_bvec(lo, bvec);
+			result = sock_recv_bvec(nbd, &bvec);
 			if (result <= 0) {
-				printk(KERN_ERR "%s: Receive data failed (result %d)\n",
-						lo->disk->disk_name, result);
+				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
+					result);
 				req->errors++;
 				return req;
 			}
 			dprintk(DBG_RX, "%s: request %p: got %d bytes data\n",
-				lo->disk->disk_name, req, bvec->bv_len);
+				nbd->disk->disk_name, req, bvec.bv_len);
 		}
 	}
 	return req;
 harderror:
-	lo->harderror = result;
+	nbd->harderror = result;
 	return NULL;
 }
 
@@ -395,48 +409,57 @@ static struct device_attribute pid_attr = {
 	.show = pid_show,
 };
 
-static int nbd_do_it(struct nbd_device *lo)
+static int nbd_do_it(struct nbd_device *nbd)
 {
 	struct request *req;
 	int ret;
 
-	BUG_ON(lo->magic != LO_MAGIC);
+	BUG_ON(nbd->magic != NBD_MAGIC);
 
-	lo->pid = current->pid;
-	ret = sysfs_create_file(&disk_to_dev(lo->disk)->kobj, &pid_attr.attr);
+	sk_set_memalloc(nbd->sock->sk);
+	nbd->pid = task_pid_nr(current);
+	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
 	if (ret) {
-		printk(KERN_ERR "nbd: sysfs_create_file failed!");
-		lo->pid = 0;
+		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
+		nbd->pid = 0;
 		return ret;
 	}
 
-	while ((req = nbd_read_stat(lo)) != NULL)
+	while ((req = nbd_read_stat(nbd)) != NULL)
 		nbd_end_request(req);
 
-	sysfs_remove_file(&disk_to_dev(lo->disk)->kobj, &pid_attr.attr);
-	lo->pid = 0;
+	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
+	nbd->pid = 0;
 	return 0;
 }
 
-static void nbd_clear_que(struct nbd_device *lo)
+static void nbd_clear_que(struct nbd_device *nbd)
 {
 	struct request *req;
 
-	BUG_ON(lo->magic != LO_MAGIC);
+	BUG_ON(nbd->magic != NBD_MAGIC);
 
 	/*
-	 * Because we have set lo->sock to NULL under the tx_lock, all
+	 * Because we have set nbd->sock to NULL under the tx_lock, all
 	 * modifications to the list must have completed by now.  For
 	 * the same reason, the active_req must be NULL.
 	 *
 	 * As a consequence, we don't need to take the spin lock while
 	 * purging the list here.
 	 */
-	BUG_ON(lo->sock);
-	BUG_ON(lo->active_req);
+	BUG_ON(nbd->sock);
+	BUG_ON(nbd->active_req);
+
+	while (!list_empty(&nbd->queue_head)) {
+		req = list_entry(nbd->queue_head.next, struct request,
+				 queuelist);
+		list_del_init(&req->queuelist);
+		req->errors++;
+		nbd_end_request(req);
+	}
 
-	while (!list_empty(&lo->queue_head)) {
-		req = list_entry(lo->queue_head.next, struct request,
+	while (!list_empty(&nbd->waiting_queue)) {
+		req = list_entry(nbd->waiting_queue.next, struct request,
 				 queuelist);
 		list_del_init(&req->queuelist);
 		req->errors++;
@@ -445,47 +468,55 @@ static void nbd_clear_que(struct nbd_device *lo)
 }
 
 
-static void nbd_handle_req(struct nbd_device *lo, struct request *req)
+static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
 {
-	if (!blk_fs_request(req))
+	if (req->cmd_type != REQ_TYPE_FS)
 		goto error_out;
 
 	nbd_cmd(req) = NBD_CMD_READ;
 	if (rq_data_dir(req) == WRITE) {
-		nbd_cmd(req) = NBD_CMD_WRITE;
-		if (lo->flags & NBD_READ_ONLY) {
-			printk(KERN_ERR "%s: Write on read-only\n",
-					lo->disk->disk_name);
+		if ((req->cmd_flags & REQ_DISCARD)) {
+			WARN_ON(!(nbd->flags & NBD_FLAG_SEND_TRIM));
+			nbd_cmd(req) = NBD_CMD_TRIM;
+		} else
+			nbd_cmd(req) = NBD_CMD_WRITE;
+		if (nbd->flags & NBD_FLAG_READ_ONLY) {
+			dev_err(disk_to_dev(nbd->disk),
+				"Write on read-only\n");
 			goto error_out;
 		}
 	}
 
+	if (req->cmd_flags & REQ_FLUSH) {
+		BUG_ON(unlikely(blk_rq_sectors(req)));
+		nbd_cmd(req) = NBD_CMD_FLUSH;
+	}
+
 	req->errors = 0;
 
-	mutex_lock(&lo->tx_lock);
-	if (unlikely(!lo->sock)) {
-		mutex_unlock(&lo->tx_lock);
-		printk(KERN_ERR "%s: Attempted send on closed socket\n",
-		       lo->disk->disk_name);
+	mutex_lock(&nbd->tx_lock);
+	if (unlikely(!nbd->sock)) {
+		mutex_unlock(&nbd->tx_lock);
+		dev_err(disk_to_dev(nbd->disk),
+			"Attempted send on closed socket\n");
 		goto error_out;
 	}
 
-	lo->active_req = req;
+	nbd->active_req = req;
 
-	if (nbd_send_req(lo, req) != 0) {
-		printk(KERN_ERR "%s: Request send failed\n",
-				lo->disk->disk_name);
+	if (nbd_send_req(nbd, req) != 0) {
+		dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
 		req->errors++;
 		nbd_end_request(req);
 	} else {
-		spin_lock(&lo->queue_lock);
-		list_add(&req->queuelist, &lo->queue_head);
-		spin_unlock(&lo->queue_lock);
+		spin_lock(&nbd->queue_lock);
+		list_add_tail(&req->queuelist, &nbd->queue_head);
+		spin_unlock(&nbd->queue_lock);
 	}
 
-	lo->active_req = NULL;
-	mutex_unlock(&lo->tx_lock);
-	wake_up_all(&lo->active_wq);
+	nbd->active_req = NULL;
+	mutex_unlock(&nbd->tx_lock);
+	wake_up_all(&nbd->active_wq);
 
 	return;
 
@@ -496,28 +527,28 @@ error_out:
 
 static int nbd_thread(void *data)
 {
-	struct nbd_device *lo = data;
+	struct nbd_device *nbd = data;
 	struct request *req;
 
-	set_user_nice(current, -20);
-	while (!kthread_should_stop() || !list_empty(&lo->waiting_queue)) {
+	set_user_nice(current, MIN_NICE);
+	while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
 		/* wait for something to do */
-		wait_event_interruptible(lo->waiting_wq,
+		wait_event_interruptible(nbd->waiting_wq,
 					 kthread_should_stop() ||
-					 !list_empty(&lo->waiting_queue));
+					 !list_empty(&nbd->waiting_queue));
 
 		/* extract request */
-		if (list_empty(&lo->waiting_queue))
+		if (list_empty(&nbd->waiting_queue))
 			continue;
 
-		spin_lock_irq(&lo->queue_lock);
-		req = list_entry(lo->waiting_queue.next, struct request,
+		spin_lock_irq(&nbd->queue_lock);
+		req = list_entry(nbd->waiting_queue.next, struct request,
 				 queuelist);
 		list_del_init(&req->queuelist);
-		spin_unlock_irq(&lo->queue_lock);
+		spin_unlock_irq(&nbd->queue_lock);
 
 		/* handle request */
-		nbd_handle_req(lo, req);
+		nbd_handle_req(nbd, req);
 	}
 	return 0;
 }
@@ -525,40 +556,41 @@ static int nbd_thread(void *data)
 /*
  * We always wait for result of write, for now. It would be nice to make it optional
  * in future
- * if ((rq_data_dir(req) == WRITE) && (lo->flags & NBD_WRITE_NOCHK))
+ * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK))
  *   { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
  */
 
 static void do_nbd_request(struct request_queue *q)
+		__releases(q->queue_lock) __acquires(q->queue_lock)
 {
 	struct request *req;
 	
 	while ((req = blk_fetch_request(q)) != NULL) {
-		struct nbd_device *lo;
+		struct nbd_device *nbd;
 
 		spin_unlock_irq(q->queue_lock);
 
 		dprintk(DBG_BLKDEV, "%s: request %p: dequeued (flags=%x)\n",
 				req->rq_disk->disk_name, req, req->cmd_type);
 
-		lo = req->rq_disk->private_data;
+		nbd = req->rq_disk->private_data;
 
-		BUG_ON(lo->magic != LO_MAGIC);
+		BUG_ON(nbd->magic != NBD_MAGIC);
 
-		if (unlikely(!lo->sock)) {
-			printk(KERN_ERR "%s: Attempted send on closed socket\n",
-				lo->disk->disk_name);
+		if (unlikely(!nbd->sock)) {
+			dev_err(disk_to_dev(nbd->disk),
+				"Attempted send on closed socket\n");
 			req->errors++;
 			nbd_end_request(req);
 			spin_lock_irq(q->queue_lock);
 			continue;
 		}
 
-		spin_lock_irq(&lo->queue_lock);
-		list_add_tail(&req->queuelist, &lo->waiting_queue);
-		spin_unlock_irq(&lo->queue_lock);
+		spin_lock_irq(&nbd->queue_lock);
+		list_add_tail(&req->queuelist, &nbd->waiting_queue);
+		spin_unlock_irq(&nbd->queue_lock);
 
-		wake_up(&lo->waiting_wq);
+		wake_up(&nbd->waiting_wq);
 
 		spin_lock_irq(q->queue_lock);
 	}
@@ -566,120 +598,146 @@ static void do_nbd_request(struct request_queue *q)
 
 /* Must be called with tx_lock held */
 
-static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
+static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		       unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
 	case NBD_DISCONNECT: {
 		struct request sreq;
 
-	        printk(KERN_INFO "%s: NBD_DISCONNECT\n", lo->disk->disk_name);
+		dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
+		if (!nbd->sock)
+			return -EINVAL;
 
+		mutex_unlock(&nbd->tx_lock);
+		fsync_bdev(bdev);
+		mutex_lock(&nbd->tx_lock);
 		blk_rq_init(NULL, &sreq);
 		sreq.cmd_type = REQ_TYPE_SPECIAL;
 		nbd_cmd(&sreq) = NBD_CMD_DISC;
-		if (!lo->sock)
+
+		/* Check again after getting mutex back.  */
+		if (!nbd->sock)
 			return -EINVAL;
-		nbd_send_req(lo, &sreq);
-                return 0;
+
+		nbd->disconnect = 1;
+
+		nbd_send_req(nbd, &sreq);
+		return 0;
 	}
  
 	case NBD_CLEAR_SOCK: {
-		struct file *file;
-
-		lo->sock = NULL;
-		file = lo->file;
-		lo->file = NULL;
-		nbd_clear_que(lo);
-		BUG_ON(!list_empty(&lo->queue_head));
-		if (file)
-			fput(file);
+		struct socket *sock = nbd->sock;
+		nbd->sock = NULL;
+		nbd_clear_que(nbd);
+		BUG_ON(!list_empty(&nbd->queue_head));
+		BUG_ON(!list_empty(&nbd->waiting_queue));
+		kill_bdev(bdev);
+		if (sock)
+			sockfd_put(sock);
 		return 0;
 	}
 
 	case NBD_SET_SOCK: {
-		struct file *file;
-		if (lo->file)
+		struct socket *sock;
+		int err;
+		if (nbd->sock)
 			return -EBUSY;
-		file = fget(arg);
-		if (file) {
-			struct inode *inode = file->f_path.dentry->d_inode;
-			if (S_ISSOCK(inode->i_mode)) {
-				lo->file = file;
-				lo->sock = SOCKET_I(inode);
-				if (max_part > 0)
-					bdev->bd_invalidated = 1;
-				return 0;
-			} else {
-				fput(file);
-			}
+		sock = sockfd_lookup(arg, &err);
+		if (sock) {
+			nbd->sock = sock;
+			if (max_part > 0)
+				bdev->bd_invalidated = 1;
+			nbd->disconnect = 0; /* we're connected now */
+			return 0;
 		}
 		return -EINVAL;
 	}
 
 	case NBD_SET_BLKSIZE:
-		lo->blksize = arg;
-		lo->bytesize &= ~(lo->blksize-1);
-		bdev->bd_inode->i_size = lo->bytesize;
-		set_blocksize(bdev, lo->blksize);
-		set_capacity(lo->disk, lo->bytesize >> 9);
+		nbd->blksize = arg;
+		nbd->bytesize &= ~(nbd->blksize-1);
+		bdev->bd_inode->i_size = nbd->bytesize;
+		set_blocksize(bdev, nbd->blksize);
+		set_capacity(nbd->disk, nbd->bytesize >> 9);
 		return 0;
 
 	case NBD_SET_SIZE:
-		lo->bytesize = arg & ~(lo->blksize-1);
-		bdev->bd_inode->i_size = lo->bytesize;
-		set_blocksize(bdev, lo->blksize);
-		set_capacity(lo->disk, lo->bytesize >> 9);
+		nbd->bytesize = arg & ~(nbd->blksize-1);
+		bdev->bd_inode->i_size = nbd->bytesize;
+		set_blocksize(bdev, nbd->blksize);
+		set_capacity(nbd->disk, nbd->bytesize >> 9);
 		return 0;
 
 	case NBD_SET_TIMEOUT:
-		lo->xmit_timeout = arg * HZ;
+		nbd->xmit_timeout = arg * HZ;
+		return 0;
+
+	case NBD_SET_FLAGS:
+		nbd->flags = arg;
 		return 0;
 
 	case NBD_SET_SIZE_BLOCKS:
-		lo->bytesize = ((u64) arg) * lo->blksize;
-		bdev->bd_inode->i_size = lo->bytesize;
-		set_blocksize(bdev, lo->blksize);
-		set_capacity(lo->disk, lo->bytesize >> 9);
+		nbd->bytesize = ((u64) arg) * nbd->blksize;
+		bdev->bd_inode->i_size = nbd->bytesize;
+		set_blocksize(bdev, nbd->blksize);
+		set_capacity(nbd->disk, nbd->bytesize >> 9);
 		return 0;
 
 	case NBD_DO_IT: {
 		struct task_struct *thread;
-		struct file *file;
+		struct socket *sock;
 		int error;
 
-		if (lo->pid)
+		if (nbd->pid)
 			return -EBUSY;
-		if (!lo->file)
+		if (!nbd->sock)
 			return -EINVAL;
 
-		mutex_unlock(&lo->tx_lock);
+		mutex_unlock(&nbd->tx_lock);
+
+		if (nbd->flags & NBD_FLAG_READ_ONLY)
+			set_device_ro(bdev, true);
+		if (nbd->flags & NBD_FLAG_SEND_TRIM)
+			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+				nbd->disk->queue);
+		if (nbd->flags & NBD_FLAG_SEND_FLUSH)
+			blk_queue_flush(nbd->disk->queue, REQ_FLUSH);
+		else
+			blk_queue_flush(nbd->disk->queue, 0);
 
-		thread = kthread_create(nbd_thread, lo, lo->disk->disk_name);
+		thread = kthread_create(nbd_thread, nbd, "%s",
+					nbd->disk->disk_name);
 		if (IS_ERR(thread)) {
-			mutex_lock(&lo->tx_lock);
+			mutex_lock(&nbd->tx_lock);
 			return PTR_ERR(thread);
 		}
 		wake_up_process(thread);
-		error = nbd_do_it(lo);
+		error = nbd_do_it(nbd);
 		kthread_stop(thread);
 
-		mutex_lock(&lo->tx_lock);
+		mutex_lock(&nbd->tx_lock);
 		if (error)
 			return error;
-		sock_shutdown(lo, 0);
-		file = lo->file;
-		lo->file = NULL;
-		nbd_clear_que(lo);
-		printk(KERN_WARNING "%s: queue cleared\n", lo->disk->disk_name);
-		if (file)
-			fput(file);
-		lo->bytesize = 0;
+		sock_shutdown(nbd, 0);
+		sock = nbd->sock;
+		nbd->sock = NULL;
+		nbd_clear_que(nbd);
+		dev_warn(disk_to_dev(nbd->disk), "queue cleared\n");
+		kill_bdev(bdev);
+		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
+		set_device_ro(bdev, false);
+		if (sock)
+			sockfd_put(sock);
+		nbd->flags = 0;
+		nbd->bytesize = 0;
 		bdev->bd_inode->i_size = 0;
-		set_capacity(lo->disk, 0);
+		set_capacity(nbd->disk, 0);
 		if (max_part > 0)
 			ioctl_by_bdev(bdev, BLKRRPART, 0);
-		return lo->harderror;
+		if (nbd->disconnect) /* user requested, ignore socket errors */
+			return 0;
+		return nbd->harderror;
 	}
 
 	case NBD_CLEAR_QUE:
@@ -687,14 +745,13 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 		 * This is for compatibility only.  The queue is always cleared
 		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
 		 */
-		BUG_ON(!lo->sock && !list_empty(&lo->queue_head));
 		return 0;
 
 	case NBD_PRINT_DEBUG:
-		printk(KERN_INFO "%s: next = %p, prev = %p, head = %p\n",
-			bdev->bd_disk->disk_name,
-			lo->queue_head.next, lo->queue_head.prev,
-			&lo->queue_head);
+		dev_info(disk_to_dev(nbd->disk),
+			"next = %p, prev = %p, head = %p\n",
+			nbd->queue_head.next, nbd->queue_head.prev,
+			&nbd->queue_head);
 		return 0;
 	}
 	return -ENOTTY;
@@ -703,29 +760,29 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
 		     unsigned int cmd, unsigned long arg)
 {
-	struct nbd_device *lo = bdev->bd_disk->private_data;
+	struct nbd_device *nbd = bdev->bd_disk->private_data;
 	int error;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	BUG_ON(lo->magic != LO_MAGIC);
+	BUG_ON(nbd->magic != NBD_MAGIC);
 
 	/* Anyone capable of this syscall can do *real bad* things */
 	dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n",
-			lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
+		nbd->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
 
-	mutex_lock(&lo->tx_lock);
-	error = __nbd_ioctl(bdev, lo, cmd, arg);
-	mutex_unlock(&lo->tx_lock);
+	mutex_lock(&nbd->tx_lock);
+	error = __nbd_ioctl(bdev, nbd, cmd, arg);
+	mutex_unlock(&nbd->tx_lock);
 
 	return error;
 }
 
-static struct block_device_operations nbd_fops =
+static const struct block_device_operations nbd_fops =
 {
 	.owner =	THIS_MODULE,
-	.locked_ioctl =	nbd_ioctl,
+	.ioctl =	nbd_ioctl,
 };
 
 /*
@@ -742,7 +799,7 @@ static int __init nbd_init(void)
 	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
 
 	if (max_part < 0) {
-		printk(KERN_CRIT "nbd: max_part must be >= 0\n");
+		printk(KERN_ERR "nbd: max_part must be >= 0\n");
 		return -EINVAL;
 	}
 
@@ -751,9 +808,26 @@ static int __init nbd_init(void)
 		return -ENOMEM;
 
 	part_shift = 0;
-	if (max_part > 0)
+	if (max_part > 0) {
 		part_shift = fls(max_part);
 
+		/*
+		 * Adjust max_part according to part_shift as it is exported
+		 * to user space so that users can know the maximum number of
+		 * partitions the kernel should be able to manage.
+		 *
+		 * Note that -1 is required because partition 0 is reserved
+		 * for the whole disk.
+		 */
+		max_part = (1UL << part_shift) - 1;
+	}
+
+	if ((1UL << part_shift) > DISK_MAX_PARTS)
+		return -EINVAL;
+
+	if (nbds_max > 1UL << (MINORBITS - part_shift))
+		return -EINVAL;
+
 	for (i = 0; i < nbds_max; i++) {
 		struct gendisk *disk = alloc_disk(1 << part_shift);
 		if (!disk)
@@ -773,6 +847,11 @@ static int __init nbd_init(void)
 		 * Tell the block layer that we are not a rotational device
 		 */
 		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
+		disk->queue->limits.discard_granularity = 512;
+		disk->queue->limits.max_discard_sectors = UINT_MAX;
+		disk->queue->limits.discard_zeroes_data = 0;
+		blk_queue_max_hw_sectors(disk->queue, 65536);
+		disk->queue->limits.max_sectors = 256;
 	}
 
 	if (register_blkdev(NBD_MAJOR, "nbd")) {
@@ -785,9 +864,7 @@ static int __init nbd_init(void)
 
 	for (i = 0; i < nbds_max; i++) {
 		struct gendisk *disk = nbd_dev[i].disk;
-		nbd_dev[i].file = NULL;
-		nbd_dev[i].magic = LO_MAGIC;
-		nbd_dev[i].flags = 0;
+		nbd_dev[i].magic = NBD_MAGIC;
 		INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
 		spin_lock_init(&nbd_dev[i].queue_lock);
 		INIT_LIST_HEAD(&nbd_dev[i].queue_head);
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
new file mode 100644
index 00000000000..a3b042c4d44
--- /dev/null
+++ b/drivers/block/null_blk.c
@@ -0,0 +1,619 @@
+#include <linux/module.h>
+
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/blk-mq.h>
+#include <linux/hrtimer.h>
+
+struct nullb_cmd {
+	struct list_head list;
+	struct llist_node ll_list;
+	struct call_single_data csd;
+	struct request *rq;
+	struct bio *bio;
+	unsigned int tag;
+	struct nullb_queue *nq;
+};
+
+struct nullb_queue {
+	unsigned long *tag_map;
+	wait_queue_head_t wait;
+	unsigned int queue_depth;
+
+	struct nullb_cmd *cmds;
+};
+
+struct nullb {
+	struct list_head list;
+	unsigned int index;
+	struct request_queue *q;
+	struct gendisk *disk;
+	struct blk_mq_tag_set tag_set;
+	struct hrtimer timer;
+	unsigned int queue_depth;
+	spinlock_t lock;
+
+	struct nullb_queue *queues;
+	unsigned int nr_queues;
+};
+
+static LIST_HEAD(nullb_list);
+static struct mutex lock;
+static int null_major;
+static int nullb_indexes;
+
+struct completion_queue {
+	struct llist_head list;
+	struct hrtimer timer;
+};
+
+/*
+ * These are per-cpu for now, they will need to be configured by the
+ * complete_queues parameter and appropriately mapped.
+ */
+static DEFINE_PER_CPU(struct completion_queue, completion_queues);
+
+enum {
+	NULL_IRQ_NONE		= 0,
+	NULL_IRQ_SOFTIRQ	= 1,
+	NULL_IRQ_TIMER		= 2,
+};
+
+enum {
+	NULL_Q_BIO		= 0,
+	NULL_Q_RQ		= 1,
+	NULL_Q_MQ		= 2,
+};
+
+static int submit_queues;
+module_param(submit_queues, int, S_IRUGO);
+MODULE_PARM_DESC(submit_queues, "Number of submission queues");
+
+static int home_node = NUMA_NO_NODE;
+module_param(home_node, int, S_IRUGO);
+MODULE_PARM_DESC(home_node, "Home node for the device");
+
+static int queue_mode = NULL_Q_MQ;
+module_param(queue_mode, int, S_IRUGO);
+MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");
+
+static int gb = 250;
+module_param(gb, int, S_IRUGO);
+MODULE_PARM_DESC(gb, "Size in GB");
+
+static int bs = 512;
+module_param(bs, int, S_IRUGO);
+MODULE_PARM_DESC(bs, "Block size (in bytes)");
+
+static int nr_devices = 2;
+module_param(nr_devices, int, S_IRUGO);
+MODULE_PARM_DESC(nr_devices, "Number of devices to register");
+
+static int irqmode = NULL_IRQ_SOFTIRQ;
+module_param(irqmode, int, S_IRUGO);
+MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");
+
+static int completion_nsec = 10000;
+module_param(completion_nsec, int, S_IRUGO);
+MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");
+
+static int hw_queue_depth = 64;
+module_param(hw_queue_depth, int, S_IRUGO);
+MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");
+
+static bool use_per_node_hctx = false;
+module_param(use_per_node_hctx, bool, S_IRUGO);
+MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
+
+static void put_tag(struct nullb_queue *nq, unsigned int tag)
+{
+	clear_bit_unlock(tag, nq->tag_map);
+
+	if (waitqueue_active(&nq->wait))
+		wake_up(&nq->wait);
+}
+
+static unsigned int get_tag(struct nullb_queue *nq)
+{
+	unsigned int tag;
+
+	do {
+		tag = find_first_zero_bit(nq->tag_map, nq->queue_depth);
+		if (tag >= nq->queue_depth)
+			return -1U;
+	} while (test_and_set_bit_lock(tag, nq->tag_map));
+
+	return tag;
+}
+
+static void free_cmd(struct nullb_cmd *cmd)
+{
+	put_tag(cmd->nq, cmd->tag);
+}
+
+static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
+{
+	struct nullb_cmd *cmd;
+	unsigned int tag;
+
+	tag = get_tag(nq);
+	if (tag != -1U) {
+		cmd = &nq->cmds[tag];
+		cmd->tag = tag;
+		cmd->nq = nq;
+		return cmd;
+	}
+
+	return NULL;
+}
+
+static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait)
+{
+	struct nullb_cmd *cmd;
+	DEFINE_WAIT(wait);
+
+	cmd = __alloc_cmd(nq);
+	if (cmd || !can_wait)
+		return cmd;
+
+	do {
+		prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE);
+		cmd = __alloc_cmd(nq);
+		if (cmd)
+			break;
+
+		io_schedule();
+	} while (1);
+
+	finish_wait(&nq->wait, &wait);
+	return cmd;
+}
+
+static void end_cmd(struct nullb_cmd *cmd)
+{
+	switch (queue_mode)  {
+	case NULL_Q_MQ:
+		blk_mq_end_io(cmd->rq, 0);
+		return;
+	case NULL_Q_RQ:
+		INIT_LIST_HEAD(&cmd->rq->queuelist);
+		blk_end_request_all(cmd->rq, 0);
+		break;
+	case NULL_Q_BIO:
+		bio_endio(cmd->bio, 0);
+		break;
+	}
+
+	free_cmd(cmd);
+}
+
+static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
+{
+	struct completion_queue *cq;
+	struct llist_node *entry;
+	struct nullb_cmd *cmd;
+
+	cq = &per_cpu(completion_queues, smp_processor_id());
+
+	while ((entry = llist_del_all(&cq->list)) != NULL) {
+		entry = llist_reverse_order(entry);
+		do {
+			cmd = container_of(entry, struct nullb_cmd, ll_list);
+			entry = entry->next;
+			end_cmd(cmd);
+		} while (entry);
+	}
+
+	return HRTIMER_NORESTART;
+}
+
+static void null_cmd_end_timer(struct nullb_cmd *cmd)
+{
+	struct completion_queue *cq = &per_cpu(completion_queues, get_cpu());
+
+	cmd->ll_list.next = NULL;
+	if (llist_add(&cmd->ll_list, &cq->list)) {
+		ktime_t kt = ktime_set(0, completion_nsec);
+
+		hrtimer_start(&cq->timer, kt, HRTIMER_MODE_REL);
+	}
+
+	put_cpu();
+}
+
+static void null_softirq_done_fn(struct request *rq)
+{
+	if (queue_mode == NULL_Q_MQ)
+		end_cmd(blk_mq_rq_to_pdu(rq));
+	else
+		end_cmd(rq->special);
+}
+
+static inline void null_handle_cmd(struct nullb_cmd *cmd)
+{
+	/* Complete IO by inline, softirq or timer */
+	switch (irqmode) {
+	case NULL_IRQ_SOFTIRQ:
+		switch (queue_mode)  {
+		case NULL_Q_MQ:
+			blk_mq_complete_request(cmd->rq);
+			break;
+		case NULL_Q_RQ:
+			blk_complete_request(cmd->rq);
+			break;
+		case NULL_Q_BIO:
+			/*
+			 * XXX: no proper submitting cpu information available.
+			 */
+			end_cmd(cmd);
+			break;
+		}
+		break;
+	case NULL_IRQ_NONE:
+		end_cmd(cmd);
+		break;
+	case NULL_IRQ_TIMER:
+		null_cmd_end_timer(cmd);
+		break;
+	}
+}
+
+static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
+{
+	int index = 0;
+
+	if (nullb->nr_queues != 1)
+		index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues);
+
+	return &nullb->queues[index];
+}
+
+static void null_queue_bio(struct request_queue *q, struct bio *bio)
+{
+	struct nullb *nullb = q->queuedata;
+	struct nullb_queue *nq = nullb_to_queue(nullb);
+	struct nullb_cmd *cmd;
+
+	cmd = alloc_cmd(nq, 1);
+	cmd->bio = bio;
+
+	null_handle_cmd(cmd);
+}
+
+static int null_rq_prep_fn(struct request_queue *q, struct request *req)
+{
+	struct nullb *nullb = q->queuedata;
+	struct nullb_queue *nq = nullb_to_queue(nullb);
+	struct nullb_cmd *cmd;
+
+	cmd = alloc_cmd(nq, 0);
+	if (cmd) {
+		cmd->rq = req;
+		req->special = cmd;
+		return BLKPREP_OK;
+	}
+
+	return BLKPREP_DEFER;
+}
+
+static void null_request_fn(struct request_queue *q)
+{
+	struct request *rq;
+
+	while ((rq = blk_fetch_request(q)) != NULL) {
+		struct nullb_cmd *cmd = rq->special;
+
+		spin_unlock_irq(q->queue_lock);
+		null_handle_cmd(cmd);
+		spin_lock_irq(q->queue_lock);
+	}
+}
+
+static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	cmd->rq = rq;
+	cmd->nq = hctx->driver_data;
+
+	null_handle_cmd(cmd);
+	return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
+{
+	BUG_ON(!nullb);
+	BUG_ON(!nq);
+
+	init_waitqueue_head(&nq->wait);
+	nq->queue_depth = nullb->queue_depth;
+}
+
+static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+			  unsigned int index)
+{
+	struct nullb *nullb = data;
+	struct nullb_queue *nq = &nullb->queues[index];
+
+	hctx->driver_data = nq;
+	null_init_queue(nullb, nq);
+	nullb->nr_queues++;
+
+	return 0;
+}
+
+static struct blk_mq_ops null_mq_ops = {
+	.queue_rq       = null_queue_rq,
+	.map_queue      = blk_mq_map_queue,
+	.init_hctx	= null_init_hctx,
+	.complete	= null_softirq_done_fn,
+};
+
+static void null_del_dev(struct nullb *nullb)
+{
+	list_del_init(&nullb->list);
+	/* NOTE(review): nothing below frees nullb->queues; confirm it is released */
+	del_gendisk(nullb->disk);
+	blk_cleanup_queue(nullb->q);
+	if (queue_mode == NULL_Q_MQ)
+		blk_mq_free_tag_set(&nullb->tag_set);
+	put_disk(nullb->disk);
+	kfree(nullb);
+}
+
+static int null_open(struct block_device *bdev, fmode_t mode)
+{
+	return 0;
+}
+
+static void null_release(struct gendisk *disk, fmode_t mode)
+{
+}
+
+static const struct block_device_operations null_fops = {
+	.owner =	THIS_MODULE,
+	.open =		null_open,
+	.release =	null_release,
+};
+
+static int setup_commands(struct nullb_queue *nq)
+{
+	struct nullb_cmd *cmd;
+	unsigned int i, tag_size;
+
+	nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL);
+	if (!nq->cmds)
+		return -ENOMEM;
+	/* one tag bit per command, rounded up to whole longs */
+	tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG;
+	nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL);
+	if (!nq->tag_map) {
+		kfree(nq->cmds);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < nq->queue_depth; i++) {
+		cmd = &nq->cmds[i];
+		INIT_LIST_HEAD(&cmd->list);
+		cmd->ll_list.next = NULL;
+		cmd->tag = -1U;
+	}
+
+	return 0;
+}
+
+static void cleanup_queue(struct nullb_queue *nq)
+{
+	kfree(nq->tag_map);
+	kfree(nq->cmds);
+}
+
+static void cleanup_queues(struct nullb *nullb)
+{
+	int i;
+
+	for (i = 0; i < nullb->nr_queues; i++)
+		cleanup_queue(&nullb->queues[i]);
+
+	kfree(nullb->queues);
+}
+
+static int setup_queues(struct nullb *nullb)
+{
+	nullb->queues = kcalloc(submit_queues, sizeof(struct nullb_queue),
+				GFP_KERNEL);	/* overflow-checked, zeroed */
+	if (!nullb->queues)
+		return -ENOMEM;
+
+	nullb->nr_queues = 0;
+	nullb->queue_depth = hw_queue_depth;
+
+	return 0;
+}
+
+static int init_driver_queues(struct nullb *nullb)
+{
+	struct nullb_queue *nq;
+	int i, ret = 0;
+
+	for (i = 0; i < submit_queues; i++) {
+		nq = &nullb->queues[i];
+
+		null_init_queue(nullb, nq);
+
+		ret = setup_commands(nq);
+		if (ret)
+			goto err_queue;
+		nullb->nr_queues++;
+	}
+
+	return 0;
+err_queue:
+	cleanup_queues(nullb);
+	return ret;
+}
+
+static int null_add_dev(void)
+{
+	struct gendisk *disk;
+	struct nullb *nullb;
+	sector_t size;
+
+	nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node);
+	if (!nullb)
+		goto out;
+
+	spin_lock_init(&nullb->lock);
+
+	if (queue_mode == NULL_Q_MQ && use_per_node_hctx)
+		submit_queues = nr_online_nodes;
+
+	if (setup_queues(nullb))
+		goto out_free_nullb;
+
+	if (queue_mode == NULL_Q_MQ) {
+		nullb->tag_set.ops = &null_mq_ops;
+		nullb->tag_set.nr_hw_queues = submit_queues;
+		nullb->tag_set.queue_depth = hw_queue_depth;
+		nullb->tag_set.numa_node = home_node;
+		nullb->tag_set.cmd_size	= sizeof(struct nullb_cmd);
+		nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+		nullb->tag_set.driver_data = nullb;
+
+		if (blk_mq_alloc_tag_set(&nullb->tag_set))
+			goto out_cleanup_queues;
+
+		nullb->q = blk_mq_init_queue(&nullb->tag_set);
+		if (!nullb->q)
+			goto out_cleanup_tags;
+	} else if (queue_mode == NULL_Q_BIO) {
+		nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
+		if (!nullb->q)
+			goto out_cleanup_queues;
+		blk_queue_make_request(nullb->q, null_queue_bio);
+		init_driver_queues(nullb);	/* NOTE(review): return value ignored */
+	} else {
+		nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
+		if (!nullb->q)
+			goto out_cleanup_queues;
+		blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
+		blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
+		init_driver_queues(nullb);	/* NOTE(review): return value ignored */
+	}
+
+	nullb->q->queuedata = nullb;
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);
+
+	disk = nullb->disk = alloc_disk_node(1, home_node);
+	if (!disk)
+		goto out_cleanup_blk_queue;
+
+	mutex_lock(&lock);
+	list_add_tail(&nullb->list, &nullb_list);
+	nullb->index = nullb_indexes++;
+	mutex_unlock(&lock);
+
+	blk_queue_logical_block_size(nullb->q, bs);
+	blk_queue_physical_block_size(nullb->q, bs);
+
+	/* set_capacity() takes 512-byte sectors, whatever the block size */
+	size = (gb * 1024ULL * 1024 * 1024) >> 9;
+	set_capacity(disk, size);
+
+	disk->flags |= GENHD_FL_EXT_DEVT;
+	disk->major		= null_major;
+	disk->first_minor	= nullb->index;
+	disk->fops		= &null_fops;
+	disk->private_data	= nullb;
+	disk->queue		= nullb->q;
+	sprintf(disk->disk_name, "nullb%d", nullb->index);
+	add_disk(disk);
+	return 0;
+
+out_cleanup_blk_queue:
+	blk_cleanup_queue(nullb->q);
+out_cleanup_tags:
+	if (queue_mode == NULL_Q_MQ)
+		blk_mq_free_tag_set(&nullb->tag_set);
+out_cleanup_queues:
+	cleanup_queues(nullb);
+out_free_nullb:
+	kfree(nullb);
+out:
+	return -ENOMEM;
+}
+
+static int __init null_init(void)
+{
+	unsigned int i;
+
+	if (bs > PAGE_SIZE) {
+		pr_warn("null_blk: invalid block size\n");
+		pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE);
+		bs = PAGE_SIZE;
+	}
+
+	if (queue_mode == NULL_Q_MQ && use_per_node_hctx) {
+		if (submit_queues < nr_online_nodes) {
+			pr_warn("null_blk: submit_queues param is set to %u.\n",
+							nr_online_nodes);
+			submit_queues = nr_online_nodes;
+		}
+	} else if (submit_queues > nr_cpu_ids)
+		submit_queues = nr_cpu_ids;
+	else if (submit_queues <= 0)	/* zero or negative: fall back to one queue */
+		submit_queues = 1;
+
+	mutex_init(&lock);
+
+	/* Initialize a separate list for each CPU for issuing softirqs */
+	for_each_possible_cpu(i) {
+		struct completion_queue *cq = &per_cpu(completion_queues, i);
+
+		init_llist_head(&cq->list);
+
+		if (irqmode != NULL_IRQ_TIMER)
+			continue;
+
+		hrtimer_init(&cq->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+		cq->timer.function = null_cmd_timer_expired;
+	}
+
+	null_major = register_blkdev(0, "nullb");
+	if (null_major < 0)
+		return null_major;
+
+	for (i = 0; i < nr_devices; i++) {
+		if (null_add_dev()) {
+			unregister_blkdev(null_major, "nullb");
+			return -EINVAL;
+		}
+	}
+
+	pr_info("null: module loaded\n");
+	return 0;
+}
+
+static void __exit null_exit(void)
+{
+	struct nullb *nullb;
+
+	unregister_blkdev(null_major, "nullb");
+
+	mutex_lock(&lock);
+	while (!list_empty(&nullb_list)) {
+		nullb = list_entry(nullb_list.next, struct nullb, list);
+		null_del_dev(nullb);
+	}
+	mutex_unlock(&lock);
+}
+
+module_init(null_init);
+module_exit(null_exit);
+
+MODULE_AUTHOR("Jens Axboe <jaxboe@fusionio.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
new file mode 100644
index 00000000000..02351e21716
--- /dev/null
+++ b/drivers/block/nvme-core.c
@@ -0,0 +1,2977 @@
+/*
+ * NVM Express device driver
+ * Copyright (c) 2011-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/nvme.h>
+#include <linux/bio.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/hdreg.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kdev_t.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/percpu.h>
+#include <linux/poison.h>
+#include <linux/ptrace.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <scsi/sg.h>
+#include <asm-generic/io-64-nonatomic-lo-hi.h>
+
+#include <trace/events/block.h>
+
+#define NVME_Q_DEPTH		1024
+#define SQ_SIZE(depth)		((depth) * sizeof(struct nvme_command))
+#define CQ_SIZE(depth)		((depth) * sizeof(struct nvme_completion))
+#define ADMIN_TIMEOUT		(admin_timeout * HZ)	/* seconds -> jiffies */
+#define IOD_TIMEOUT		(retry_time * HZ)	/* seconds -> jiffies */
+
+static unsigned char admin_timeout = 60;
+module_param(admin_timeout, byte, 0644);
+MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
+
+unsigned char nvme_io_timeout = 30;
+module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
+MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
+
+static unsigned char retry_time = 30;
+module_param(retry_time, byte, 0644);
+MODULE_PARM_DESC(retry_time, "time in seconds to retry failed I/O");
+
+static int nvme_major;
+module_param(nvme_major, int, 0);
+
+static int use_threaded_interrupts;
+module_param(use_threaded_interrupts, int, 0);
+
+static DEFINE_SPINLOCK(dev_list_lock);
+static LIST_HEAD(dev_list);
+static struct task_struct *nvme_thread;
+static struct workqueue_struct *nvme_workq;
+static wait_queue_head_t nvme_kthread_wait;
+static struct notifier_block nvme_nb;
+
+static void nvme_reset_failed_dev(struct work_struct *ws);
+
+struct async_cmd_info {
+	struct kthread_work work;
+	struct kthread_worker *worker;
+	u32 result;
+	int status;
+	void *ctx;
+};
+
+/*
+ * An NVM Express queue.  Each device has at least two (one for admin
+ * commands and one for I/O commands).
+ */
+struct nvme_queue {
+	struct rcu_head r_head;
+	struct device *q_dmadev;
+	struct nvme_dev *dev;
+	char irqname[24];	/* nvme4294967295-65535\0 */
+	spinlock_t q_lock;
+	struct nvme_command *sq_cmds;
+	volatile struct nvme_completion *cqes;
+	dma_addr_t sq_dma_addr;
+	dma_addr_t cq_dma_addr;
+	wait_queue_head_t sq_full;
+	wait_queue_t sq_cong_wait;
+	struct bio_list sq_cong;
+	struct list_head iod_bio;
+	u32 __iomem *q_db;
+	u16 q_depth;
+	u16 cq_vector;
+	u16 sq_head;
+	u16 sq_tail;
+	u16 cq_head;
+	u16 qid;
+	u8 cq_phase;
+	u8 cqe_seen;
+	u8 q_suspended;
+	cpumask_var_t cpu_mask;
+	struct async_cmd_info cmdinfo;
+	unsigned long cmdid_data[];
+};
+
+/*
+ * Check we didn't inadvertently grow the command struct
+ */
+static inline void _nvme_check_size(void)
+{
+	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
+	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+}
+
+typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
+						struct nvme_completion *);
+
+struct nvme_cmd_info {
+	nvme_completion_fn fn;
+	void *ctx;
+	unsigned long timeout;
+	int aborted;
+};
+
+static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
+{
+	return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
+}
+/* Bytes for cmdid_data[]: the ID bitmap in whole longs, then one info per entry */
+static unsigned nvme_queue_extra(int depth)
+{
+	return BITS_TO_LONGS(depth) * sizeof(long) + depth * sizeof(struct nvme_cmd_info);
+}
+
+/**
+ * alloc_cmdid() - Allocate a Command ID
+ * @nvmeq: The queue that will be used for this command
+ * @ctx: A pointer that will be passed to the handler
+ * @handler: The function to call on completion
+ * @timeout: Added to jiffies to form the deadline stored with the command
+ *
+ * Allocate a Command ID for a queue and record @handler, @ctx and the
+ * deadline (jiffies + @timeout) in that ID's nvme_cmd_info slot.  Only
+ * q_depth - 1 IDs are handed out.  Returns the Command ID on success or
+ * -EBUSY when every ID is in use.
+ *
+ * May be called with local interrupts disabled and the q_lock held,
+ * or with interrupts enabled and no locks held.
+ */
+static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
+				nvme_completion_fn handler, unsigned timeout)
+{
+	int depth = nvmeq->q_depth - 1;
+	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+	int cmdid;
+
+	do {
+		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
+		if (cmdid >= depth)
+			return -EBUSY;
+	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));
+
+	info[cmdid].fn = handler;
+	info[cmdid].ctx = ctx;
+	info[cmdid].timeout = jiffies + timeout;
+	info[cmdid].aborted = 0;
+	return cmdid;
+}
+
+static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
+				nvme_completion_fn handler, unsigned timeout)
+{
+	int cmdid;
+	wait_event_killable(nvmeq->sq_full,
+		(cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
+	return (cmdid < 0) ? -EINTR : cmdid;
+}
+
+/* Special values must be less than 0x1000 */
+#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
+#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
+#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
+#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)
+#define CMD_CTX_ABORT		(0x318 + CMD_CTX_BASE)
+
+static void special_completion(struct nvme_queue *nvmeq, void *ctx,
+						struct nvme_completion *cqe)
+{
+	if (ctx == CMD_CTX_CANCELLED)
+		return;
+	if (ctx == CMD_CTX_ABORT) {
+		++nvmeq->dev->abort_limit;
+		return;
+	}
+	if (ctx == CMD_CTX_COMPLETED) {
+		dev_warn(nvmeq->q_dmadev,
+				"completed id %d twice on queue %d\n",
+				cqe->command_id, le16_to_cpup(&cqe->sq_id));
+		return;
+	}
+	if (ctx == CMD_CTX_INVALID) {
+		dev_warn(nvmeq->q_dmadev,
+				"invalid id %d completed on queue %d\n",
+				cqe->command_id, le16_to_cpup(&cqe->sq_id));
+		return;
+	}
+
+	dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
+}
+
+static void async_completion(struct nvme_queue *nvmeq, void *ctx,
+						struct nvme_completion *cqe)
+{
+	struct async_cmd_info *cmdinfo = ctx;
+	cmdinfo->result = le32_to_cpup(&cqe->result);
+	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
+	queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
+}
+
+/*
+ * Called with local interrupts disabled and the q_lock held.  May not sleep.
+ */
+static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
+						nvme_completion_fn *fn)
+{
+	void *ctx;
+	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+
+	if (cmdid >= nvmeq->q_depth || !info[cmdid].fn) {
+		if (fn)
+			*fn = special_completion;
+		return CMD_CTX_INVALID;
+	}
+	if (fn)
+		*fn = info[cmdid].fn;
+	ctx = info[cmdid].ctx;
+	info[cmdid].fn = special_completion;
+	info[cmdid].ctx = CMD_CTX_COMPLETED;
+	clear_bit(cmdid, nvmeq->cmdid_data);
+	wake_up(&nvmeq->sq_full);
+	return ctx;
+}
+
+static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
+						nvme_completion_fn *fn)
+{
+	void *ctx;
+	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+	if (fn)
+		*fn = info[cmdid].fn;
+	ctx = info[cmdid].ctx;
+	info[cmdid].fn = special_completion;
+	info[cmdid].ctx = CMD_CTX_CANCELLED;
+	return ctx;
+}
+
+static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid)
+{
+	return rcu_dereference_raw(dev->queues[qid]);
+}
+
+static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
+{
+	struct nvme_queue *nvmeq;
+	unsigned queue_id = get_cpu_var(*dev->io_queue);
+
+	rcu_read_lock();
+	nvmeq = rcu_dereference(dev->queues[queue_id]);
+	if (nvmeq)
+		return nvmeq;
+
+	rcu_read_unlock();
+	put_cpu_var(*dev->io_queue);
+	return NULL;
+}
+
+static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
+{
+	rcu_read_unlock();
+	put_cpu_var(nvmeq->dev->io_queue);
+}
+
+static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx)
+							__acquires(RCU)
+{
+	struct nvme_queue *nvmeq;
+
+	rcu_read_lock();
+	nvmeq = rcu_dereference(dev->queues[q_idx]);
+	if (nvmeq)
+		return nvmeq;
+
+	rcu_read_unlock();
+	return NULL;
+}
+
+static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+/**
+ * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
+ * @nvmeq: The queue to use
+ * @cmd: The command to send
+ *
+ * Safe to use from interrupt context
+ */
+static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
+{
+	unsigned long flags;
+	u16 tail;
+	spin_lock_irqsave(&nvmeq->q_lock, flags);
+	if (nvmeq->q_suspended) {
+		spin_unlock_irqrestore(&nvmeq->q_lock, flags);
+		return -EBUSY;
+	}
+	tail = nvmeq->sq_tail;
+	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+	if (++tail == nvmeq->q_depth)
+		tail = 0;
+	writel(tail, nvmeq->q_db);
+	nvmeq->sq_tail = tail;
+	spin_unlock_irqrestore(&nvmeq->q_lock, flags);
+
+	return 0;
+}
+
+static __le64 **iod_list(struct nvme_iod *iod)
+{
+	return ((void *)iod) + iod->offset;
+}
+
+/*
+ * Will slightly overestimate the number of pages needed.  This is OK
+ * as it only leads to a small amount of wasted memory for the lifetime of
+ * the I/O.
+ */
+static int nvme_npages(unsigned size)
+{
+	unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE);
+	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+}
+
+static struct nvme_iod *
+nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
+{
+	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
+				sizeof(__le64 *) * nvme_npages(nbytes) +
+				sizeof(struct scatterlist) * nseg, gfp);
+
+	if (iod) {
+		iod->offset = offsetof(struct nvme_iod, sg[nseg]);
+		iod->npages = -1;
+		iod->length = nbytes;
+		iod->nents = 0;
+		iod->first_dma = 0ULL;
+		iod->start_time = jiffies;
+	}
+
+	return iod;
+}
+
+void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
+{
+	const int last_prp = PAGE_SIZE / 8 - 1;
+	int i;
+	__le64 **list = iod_list(iod);
+	dma_addr_t prp_dma = iod->first_dma;
+
+	if (iod->npages == 0)
+		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
+	for (i = 0; i < iod->npages; i++) {
+		__le64 *prp_list = list[i];
+		dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
+		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
+		prp_dma = next_prp_dma;
+	}
+	kfree(iod);
+}
+
+static void nvme_start_io_acct(struct bio *bio)
+{
+	struct gendisk *disk = bio->bi_bdev->bd_disk;
+	if (blk_queue_io_stat(disk->queue)) {
+		const int rw = bio_data_dir(bio);
+		int cpu = part_stat_lock();
+		part_round_stats(cpu, &disk->part0);
+		part_stat_inc(cpu, &disk->part0, ios[rw]);
+		part_stat_add(cpu, &disk->part0, sectors[rw],
+							bio_sectors(bio));
+		part_inc_in_flight(&disk->part0, rw);
+		part_stat_unlock();
+	}
+}
+
+static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
+{
+	struct gendisk *disk = bio->bi_bdev->bd_disk;
+	if (blk_queue_io_stat(disk->queue)) {
+		const int rw = bio_data_dir(bio);
+		unsigned long duration = jiffies - start_time;
+		int cpu = part_stat_lock();
+		part_stat_add(cpu, &disk->part0, ticks[rw], duration);
+		part_round_stats(cpu, &disk->part0);
+		part_dec_in_flight(&disk->part0, rw);
+		part_stat_unlock();
+	}
+}
+
+static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
+						struct nvme_completion *cqe)
+{
+	struct nvme_iod *iod = ctx;
+	struct bio *bio = iod->private;
+	u16 status = le16_to_cpup(&cqe->status) >> 1;
+	int error = 0;
+
+	if (unlikely(status)) {
+		if (!(status & NVME_SC_DNR ||
+				bio->bi_rw & REQ_FAILFAST_MASK) &&
+				(jiffies - iod->start_time) < IOD_TIMEOUT) {
+			if (!waitqueue_active(&nvmeq->sq_full))
+				add_wait_queue(&nvmeq->sq_full,
+							&nvmeq->sq_cong_wait);
+			list_add_tail(&iod->node, &nvmeq->iod_bio);
+			wake_up(&nvmeq->sq_full);
+			return;
+		}
+		error = -EIO;
+	}
+	if (iod->nents) {
+		dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents,
+			bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		nvme_end_io_acct(bio, iod->start_time);
+	}
+	nvme_free_iod(nvmeq->dev, iod);
+
+	trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, error);
+	bio_endio(bio, error);
+}
+
+/* length is in bytes.  gfp flags indicates whether we may sleep. */
+int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
+								gfp_t gfp)
+{
+	struct dma_pool *pool;
+	int length = total_len;
+	struct scatterlist *sg = iod->sg;
+	int dma_len = sg_dma_len(sg);
+	u64 dma_addr = sg_dma_address(sg);
+	int offset = offset_in_page(dma_addr);
+	__le64 *prp_list;
+	__le64 **list = iod_list(iod);
+	dma_addr_t prp_dma;
+	int nprps, i;
+
+	length -= (PAGE_SIZE - offset);
+	if (length <= 0)
+		return total_len;
+
+	dma_len -= (PAGE_SIZE - offset);
+	if (dma_len) {
+		dma_addr += (PAGE_SIZE - offset);
+	} else {
+		sg = sg_next(sg);
+		dma_addr = sg_dma_address(sg);
+		dma_len = sg_dma_len(sg);
+	}
+
+	if (length <= PAGE_SIZE) {
+		iod->first_dma = dma_addr;
+		return total_len;
+	}
+
+	nprps = DIV_ROUND_UP(length, PAGE_SIZE);
+	if (nprps <= (256 / 8)) {
+		pool = dev->prp_small_pool;
+		iod->npages = 0;
+	} else {
+		pool = dev->prp_page_pool;
+		iod->npages = 1;
+	}
+
+	prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
+	if (!prp_list) {
+		iod->first_dma = dma_addr;
+		iod->npages = -1;
+		return (total_len - length) + PAGE_SIZE;
+	}
+	list[0] = prp_list;
+	iod->first_dma = prp_dma;
+	i = 0;
+	for (;;) {
+		if (i == PAGE_SIZE / 8) {
+			__le64 *old_prp_list = prp_list;
+			prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
+			if (!prp_list)
+				return total_len - length;
+			list[iod->npages++] = prp_list;
+			prp_list[0] = old_prp_list[i - 1];
+			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
+			i = 1;
+		}
+		prp_list[i++] = cpu_to_le64(dma_addr);
+		dma_len -= PAGE_SIZE;
+		dma_addr += PAGE_SIZE;
+		length -= PAGE_SIZE;
+		if (length <= 0)
+			break;
+		if (dma_len > 0)
+			continue;
+		BUG_ON(dma_len < 0);
+		sg = sg_next(sg);
+		dma_addr = sg_dma_address(sg);
+		dma_len = sg_dma_len(sg);
+	}
+
+	return total_len;
+}
+
+static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq,
+				 int len)
+{
+	struct bio *split = bio_split(bio, len >> 9, GFP_ATOMIC, NULL);
+	if (!split)
+		return -ENOMEM;
+
+	trace_block_split(bdev_get_queue(bio->bi_bdev), bio,
+					split->bi_iter.bi_sector);
+	bio_chain(split, bio);
+
+	if (!waitqueue_active(&nvmeq->sq_full))
+		add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
+	bio_list_add(&nvmeq->sq_cong, split);
+	bio_list_add(&nvmeq->sq_cong, bio);
+	wake_up(&nvmeq->sq_full);
+
+	return 0;
+}
+
+/* NVMe scatterlists require no holes in the virtual address */
+#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2)	((vec2)->bv_offset || \
+			(((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))
+
+static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod,
+		struct bio *bio, enum dma_data_direction dma_dir, int psegs)
+{
+	struct bio_vec bvec, bvprv;
+	struct bvec_iter iter;
+	struct scatterlist *sg = NULL;
+	int length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size;
+	int first = 1;
+
+	if (nvmeq->dev->stripe_size)
+		split_len = nvmeq->dev->stripe_size -
+			((bio->bi_iter.bi_sector << 9) &
+			 (nvmeq->dev->stripe_size - 1));
+
+	sg_init_table(iod->sg, psegs);
+	bio_for_each_segment(bvec, bio, iter) {
+		if (!first && BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec)) {
+			sg->length += bvec.bv_len;
+		} else {
+			if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec))
+				return nvme_split_and_submit(bio, nvmeq,
+							     length);
+
+			sg = sg ? sg + 1 : iod->sg;
+			sg_set_page(sg, bvec.bv_page,
+				    bvec.bv_len, bvec.bv_offset);
+			nsegs++;
+		}
+
+		if (split_len - length < bvec.bv_len)
+			return nvme_split_and_submit(bio, nvmeq, split_len);
+		length += bvec.bv_len;
+		bvprv = bvec;
+		first = 0;
+	}
+	iod->nents = nsegs;
+	sg_mark_end(sg);
+	if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0)
+		return -ENOMEM;
+
+	BUG_ON(length != bio->bi_iter.bi_size);
+	return length;
+}
+
+/*
+ * Place a DSM command (nvme_cmd_dsm with NVME_DSMGMT_AD) for a discard bio on
+ * @nvmeq's submission queue and ring the doorbell.
+ *
+ * The range descriptor is written into the buffer found at iod_list(iod)[0];
+ * the command's prp1 is set to iod->first_dma.  Always returns 0.
+ */
+static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+		struct bio *bio, struct nvme_iod *iod, int cmdid)
+{
+	struct nvme_command *sqe = &nvmeq->sq_cmds[nvmeq->sq_tail];
+	struct nvme_dsm_range *dsm_range;
+
+	/* Describe the LBA range covered by the bio. */
+	dsm_range = (struct nvme_dsm_range *)iod_list(iod)[0];
+	dsm_range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
+	dsm_range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift);
+	dsm_range->cattr = cpu_to_le32(0);
+
+	/* Fill in the submission queue entry. */
+	memset(sqe, 0, sizeof(*sqe));
+	sqe->dsm.opcode = nvme_cmd_dsm;
+	sqe->dsm.command_id = cmdid;
+	sqe->dsm.nsid = cpu_to_le32(ns->ns_id);
+	sqe->dsm.prp1 = cpu_to_le64(iod->first_dma);
+	sqe->dsm.nr = 0;
+	sqe->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
+
+	/* Advance the tail, wrapping at the queue depth, then notify. */
+	nvmeq->sq_tail++;
+	if (nvmeq->sq_tail == nvmeq->q_depth)
+		nvmeq->sq_tail = 0;
+	writel(nvmeq->sq_tail, nvmeq->q_db);
+
+	return 0;
+}
+
+/*
+ * Place a flush command for namespace @ns on @nvmeq's submission queue and
+ * ring the doorbell.  Always returns 0.
+ */
+static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+								int cmdid)
+{
+	struct nvme_command *sqe = &nvmeq->sq_cmds[nvmeq->sq_tail];
+
+	memset(sqe, 0, sizeof(*sqe));
+	sqe->common.nsid = cpu_to_le32(ns->ns_id);
+	sqe->common.command_id = cmdid;
+	sqe->common.opcode = nvme_cmd_flush;
+
+	/* Advance the tail, wrapping at the queue depth, then notify. */
+	nvmeq->sq_tail++;
+	if (nvmeq->sq_tail == nvmeq->q_depth)
+		nvmeq->sq_tail = 0;
+	writel(nvmeq->sq_tail, nvmeq->q_db);
+
+	return 0;
+}
+
+/*
+ * Allocate a command id for @iod and place the matching command on @nvmeq's
+ * submission queue.  Discard and flush bios are delegated to
+ * nvme_submit_discard() and nvme_submit_flush(); every other bio becomes a
+ * read or write command built from the iod's DMA mapping.
+ *
+ * Returns 0 on success, or the negative value from alloc_cmdid() when no
+ * command id could be allocated.
+ */
+static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod)
+{
+	struct bio *bio = iod->private;
+	struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
+	struct nvme_command *sqe;
+	u16 control = 0;
+	u32 dsmgmt = 0;
+	int cmdid;
+
+	cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
+	if (unlikely(cmdid < 0))
+		return cmdid;
+
+	if (bio->bi_rw & REQ_DISCARD)
+		return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
+	if (bio->bi_rw & REQ_FLUSH)
+		return nvme_submit_flush(nvmeq, ns, cmdid);
+
+	/* Translate block-layer request flags into command fields. */
+	if (bio->bi_rw & REQ_FUA)
+		control |= NVME_RW_FUA;
+	if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
+		control |= NVME_RW_LR;
+	if (bio->bi_rw & REQ_RAHEAD)
+		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
+
+	sqe = &nvmeq->sq_cmds[nvmeq->sq_tail];
+	memset(sqe, 0, sizeof(*sqe));
+
+	if (bio_data_dir(bio))
+		sqe->rw.opcode = nvme_cmd_write;
+	else
+		sqe->rw.opcode = nvme_cmd_read;
+	sqe->rw.command_id = cmdid;
+	sqe->rw.nsid = cpu_to_le32(ns->ns_id);
+	sqe->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+	sqe->rw.prp2 = cpu_to_le64(iod->first_dma);
+	sqe->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
+	/* The length field holds the block count minus one. */
+	sqe->rw.length =
+		cpu_to_le16((bio->bi_iter.bi_size >> ns->lba_shift) - 1);
+	sqe->rw.control = cpu_to_le16(control);
+	sqe->rw.dsmgmt = cpu_to_le32(dsmgmt);
+
+	/* Advance the tail, wrapping at the queue depth, then notify. */
+	nvmeq->sq_tail++;
+	if (nvmeq->sq_tail == nvmeq->q_depth)
+		nvmeq->sq_tail = 0;
+	writel(nvmeq->sq_tail, nvmeq->q_db);
+
+	return 0;
+}
+
+/*
+ * Turn a flush bio that also carries data into two bios queued on
+ * nvmeq->sq_cong: a zero-length clone (taken before REQ_FLUSH is cleared on
+ * the original), followed by the original bio with REQ_FLUSH cleared.  The
+ * clone is tied to the original with bio_chain().
+ *
+ * Returns 0 once both bios are queued, or -ENOMEM if the clone cannot be
+ * allocated.
+ */
+static int nvme_split_flush_data(struct nvme_queue *nvmeq, struct bio *bio)
+{
+	struct bio *split = bio_clone(bio, GFP_ATOMIC);
+	if (!split)
+		return -ENOMEM;
+
+	/* The clone carries no data ... */
+	split->bi_iter.bi_size = 0;
+	split->bi_phys_segments = 0;
+	/* ... and the original no longer carries the flush flag. */
+	bio->bi_rw &= ~REQ_FLUSH;
+	bio_chain(split, bio);
+
+	/* Queue both for the nvme thread; the clone goes first. */
+	if (!waitqueue_active(&nvmeq->sq_full))
+		add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
+	bio_list_add(&nvmeq->sq_cong, split);
+	bio_list_add(&nvmeq->sq_cong, bio);
+	wake_up_process(nvme_thread);
+
+	return 0;
+}
+
+/*
+ * Allocate an iod for @bio, prepare its data mapping and submit it on @nvmeq.
+ *
+ * Called with local interrupts disabled and the q_lock held.  May not sleep.
+ *
+ * A flush bio that carries data is handed to nvme_split_flush_data().  When
+ * nvme_submit_iod() fails, the prepared iod is parked on nvmeq->iod_bio and
+ * 0 is still returned.  When nvme_map_bio() returns 0 the iod is freed and 0
+ * is returned.
+ *
+ * Returns 0 in the cases above and on successful submission; a negative
+ * errno otherwise.
+ */
+static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+								struct bio *bio)
+{
+	struct nvme_iod *iod;
+	int psegs = bio_phys_segments(ns->queue, bio);
+	int result;
+
+	if ((bio->bi_rw & REQ_FLUSH) && psegs)
+		return nvme_split_flush_data(nvmeq, bio);
+
+	iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
+	if (!iod)
+		return -ENOMEM;
+
+	iod->private = bio;
+	if (bio->bi_rw & REQ_DISCARD) {
+		void *range;
+		/*
+		 * We reuse the small pool to allocate the 16-byte range here
+		 * as it is not worth having a special pool for these or
+		 * additional cases to handle freeing the iod.
+		 */
+		range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
+						GFP_ATOMIC,
+						&iod->first_dma);
+		if (!range) {
+			result = -ENOMEM;
+			goto free_iod;
+		}
+		iod_list(iod)[0] = (__le64 *)range;
+		iod->npages = 0;
+	} else if (psegs) {
+		result = nvme_map_bio(nvmeq, iod, bio,
+			bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
+			psegs);
+		/* error, or nothing left to submit for this iod */
+		if (result <= 0)
+			goto free_iod;
+		/* PRP setup must cover every mapped byte */
+		if (nvme_setup_prps(nvmeq->dev, iod, result, GFP_ATOMIC) !=
+								result) {
+			result = -ENOMEM;
+			goto free_iod;
+		}
+		nvme_start_io_acct(bio);
+	}
+	if (unlikely(nvme_submit_iod(nvmeq, iod))) {
+		/* keep the prepared iod on iod_bio for a later resubmit */
+		if (!waitqueue_active(&nvmeq->sq_full))
+			add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
+		list_add_tail(&iod->node, &nvmeq->iod_bio);
+	}
+	return 0;
+
+ free_iod:
+	nvme_free_iod(nvmeq->dev, iod);
+	return result;
+}
+
+/*
+ * Consume every completion queue entry whose phase bit matches the expected
+ * phase, invoking the completion handler registered for each command id.
+ *
+ * If anything was consumed, the new head is written to the completion
+ * doorbell, cq_head/cq_phase are updated and cqe_seen is set.
+ *
+ * Returns 1 if the head or phase advanced, 0 otherwise.  The callers visible
+ * in this file hold nvmeq->q_lock.
+ */
+static int nvme_process_cq(struct nvme_queue *nvmeq)
+{
+	u16 head, phase;
+
+	head = nvmeq->cq_head;
+	phase = nvmeq->cq_phase;
+
+	for (;;) {
+		void *ctx;
+		nvme_completion_fn fn;
+		/* work on a local copy of the entry */
+		struct nvme_completion cqe = nvmeq->cqes[head];
+		/* bit 0 of status is the phase tag; mismatch means no new entry */
+		if ((le16_to_cpu(cqe.status) & 1) != phase)
+			break;
+		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
+		/* wrap the head and flip the expected phase */
+		if (++head == nvmeq->q_depth) {
+			head = 0;
+			phase = !phase;
+		}
+
+		ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
+		fn(nvmeq, ctx, &cqe);
+	}
+
+	/* If the controller ignores the cq head doorbell and continuously
+	 * writes to the queue, it is theoretically possible to wrap around
+	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
+	 * requires that 0.1% of your interrupts are handled, so this isn't
+	 * a big problem.
+	 */
+	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
+		return 0;
+
+	writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
+	nvmeq->cq_head = head;
+	nvmeq->cq_phase = phase;
+
+	nvmeq->cqe_seen = 1;
+	return 1;
+}
+
+/*
+ * make_request entry point: try to submit @bio on the queue returned by
+ * get_nvmeq().  If the queue is suspended, other bios are already waiting on
+ * sq_cong, or submission fails, the bio is appended to sq_cong instead.
+ * Completions are reaped before the queue lock is dropped.  With no queue
+ * available the bio is ended with -EIO.
+ */
+static void nvme_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct nvme_ns *ns = q->queuedata;
+	struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
+	int result;
+
+	if (!nvmeq) {
+		bio_endio(bio, -EIO);
+		return;
+	}
+
+	spin_lock_irq(&nvmeq->q_lock);
+
+	/* Preserve ordering: never overtake bios already waiting. */
+	if (nvmeq->q_suspended || !bio_list_empty(&nvmeq->sq_cong))
+		result = -EBUSY;
+	else
+		result = nvme_submit_bio_queue(nvmeq, ns, bio);
+
+	if (unlikely(result)) {
+		if (!waitqueue_active(&nvmeq->sq_full))
+			add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
+		bio_list_add(&nvmeq->sq_cong, bio);
+	}
+
+	nvme_process_cq(nvmeq);
+
+	spin_unlock_irq(&nvmeq->q_lock);
+	put_nvmeq(nvmeq);
+}
+
+/*
+ * Interrupt handler: reap completions under the queue lock.  Reports
+ * IRQ_HANDLED if cqe_seen is set afterwards (and clears it), IRQ_NONE
+ * otherwise.
+ */
+static irqreturn_t nvme_irq(int irq, void *data)
+{
+	struct nvme_queue *nvmeq = data;
+	irqreturn_t ret = IRQ_NONE;
+
+	spin_lock(&nvmeq->q_lock);
+	nvme_process_cq(nvmeq);
+	if (nvmeq->cqe_seen)
+		ret = IRQ_HANDLED;
+	nvmeq->cqe_seen = 0;
+	spin_unlock(&nvmeq->q_lock);
+
+	return ret;
+}
+
+/*
+ * Primary handler used with threaded interrupts: peek at the entry at the
+ * current completion queue head and wake the handler thread only if its
+ * phase bit matches the expected phase.
+ */
+static irqreturn_t nvme_irq_check(int irq, void *data)
+{
+	struct nvme_queue *nvmeq = data;
+	struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
+
+	if ((le16_to_cpu(cqe.status) & 1) == nvmeq->cq_phase)
+		return IRQ_WAKE_THREAD;
+	return IRQ_NONE;
+}
+
+/*
+ * Cancel command id @cmdid on @nvmeq via cancel_cmdid(), taking the queue
+ * lock around the call.  The completion handler is not retrieved (NULL is
+ * passed for it).
+ */
+static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
+{
+	spin_lock_irq(&nvmeq->q_lock);
+	cancel_cmdid(nvmeq, cmdid, NULL);
+	spin_unlock_irq(&nvmeq->q_lock);
+}
+
+/* Per-call state shared by nvme_submit_sync_cmd() and sync_completion(). */
+struct sync_cmd_info {
+	struct task_struct *task;	/* submitter, woken by sync_completion() */
+	u32 result;			/* cqe->result in CPU byte order */
+	int status;			/* cqe->status >> 1; -EINTR until completed */
+};
+
+/*
+ * Completion handler for synchronous commands: copy the result and status
+ * out of @cqe into the sync_cmd_info passed as @ctx, then wake the task
+ * recorded in it.
+ */
+static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
+						struct nvme_completion *cqe)
+{
+	struct sync_cmd_info *cmdinfo = ctx;
+	cmdinfo->result = le32_to_cpup(&cqe->result);
+	/* drop bit 0 of the status word (the phase bit) */
+	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
+	wake_up_process(cmdinfo->task);
+}
+
+/*
+ * Submit @cmd on queue @q_idx of @dev and sleep until it completes or
+ * @timeout jiffies have passed.  If @result is non-NULL it receives the
+ * completion's result value.
+ *
+ * Returns 0 on success.  If the result is negative, it's a Linux error code;
+ * if the result is positive, it's an NVM Express status code.  -ENODEV is
+ * returned when the queue cannot be locked, and -EINTR when the sleep ends
+ * without the completion handler having run.
+ */
+static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx,
+						struct nvme_command *cmd,
+						u32 *result, unsigned timeout)
+{
+	int cmdid, ret;
+	struct sync_cmd_info cmdinfo;
+	struct nvme_queue *nvmeq;
+
+	nvmeq = lock_nvmeq(dev, q_idx);
+	if (!nvmeq)
+		return -ENODEV;
+
+	cmdinfo.task = current;
+	/* sentinel: only overwritten by sync_completion() */
+	cmdinfo.status = -EINTR;
+
+	cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout);
+	if (cmdid < 0) {
+		unlock_nvmeq(nvmeq);
+		return cmdid;
+	}
+	cmd->common.command_id = cmdid;
+
+	/*
+	 * The task state is set before the command is submitted, presumably
+	 * so that a completion arriving right away is not missed.
+	 */
+	set_current_state(TASK_KILLABLE);
+	ret = nvme_submit_cmd(nvmeq, cmd);
+	if (ret) {
+		free_cmdid(nvmeq, cmdid, NULL);
+		unlock_nvmeq(nvmeq);
+		set_current_state(TASK_RUNNING);
+		return ret;
+	}
+	unlock_nvmeq(nvmeq);
+	schedule_timeout(timeout);
+
+	/* Still the sentinel: no completion was seen, so cancel the cmdid. */
+	if (cmdinfo.status == -EINTR) {
+		nvmeq = lock_nvmeq(dev, q_idx);
+		if (nvmeq) {
+			nvme_abort_command(nvmeq, cmdid);
+			unlock_nvmeq(nvmeq);
+		}
+		return -EINTR;
+	}
+
+	if (result)
+		*result = cmdinfo.result;
+
+	return cmdinfo.status;
+}
+
+/*
+ * Submit @cmd on @nvmeq without waiting for it: a command id is allocated
+ * with async_completion as the handler and @cmdinfo as its context.
+ * cmdinfo->status is preset to -EINTR before the command is submitted.
+ *
+ * Returns the negative value from alloc_cmdid_killable() on failure,
+ * otherwise the return value of nvme_submit_cmd().
+ */
+static int nvme_submit_async_cmd(struct nvme_queue *nvmeq,
+			struct nvme_command *cmd,
+			struct async_cmd_info *cmdinfo, unsigned timeout)
+{
+	int cmdid;
+
+	cmdid = alloc_cmdid_killable(nvmeq, cmdinfo, async_completion, timeout);
+	if (cmdid < 0)
+		return cmdid;
+	cmdinfo->status = -EINTR;
+	cmd->common.command_id = cmdid;
+	return nvme_submit_cmd(nvmeq, cmd);
+}
+
+/*
+ * Synchronously submit @cmd on queue 0 (the admin queue) with ADMIN_TIMEOUT.
+ * Return value and @result follow nvme_submit_sync_cmd().
+ */
+int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
+								u32 *result)
+{
+	return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT);
+}
+
+/*
+ * Synchronously submit @cmd with NVME_IO_TIMEOUT on the queue whose index is
+ * the current CPU number plus one.  Return value and @result follow
+ * nvme_submit_sync_cmd().
+ */
+int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
+								u32 *result)
+{
+	return nvme_submit_sync_cmd(dev, smp_processor_id() + 1, cmd, result,
+							NVME_IO_TIMEOUT);
+}
+
+/*
+ * Asynchronously submit @cmd on queue 0 (the admin queue) with
+ * ADMIN_TIMEOUT; see nvme_submit_async_cmd() for the return value.
+ */
+static int nvme_submit_admin_cmd_async(struct nvme_dev *dev,
+		struct nvme_command *cmd, struct async_cmd_info *cmdinfo)
+{
+	return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo,
+								ADMIN_TIMEOUT);
+}
+
+/*
+ * Send the admin command @opcode to delete queue @id.
+ * Returns 0 on success and -EIO for any non-zero submission status.
+ */
+static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
+{
+	struct nvme_command cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.delete_queue.opcode = opcode;
+	cmd.delete_queue.qid = cpu_to_le16(id);
+
+	if (nvme_submit_admin_cmd(dev, &cmd, NULL))
+		return -EIO;
+	return 0;
+}
+
+/*
+ * Ask the adapter to create completion queue @qid backed by @nvmeq's
+ * completion ring, with interrupts enabled on nvmeq->cq_vector.
+ * Returns 0 on success and -EIO for any non-zero submission status.
+ */
+static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
+						struct nvme_queue *nvmeq)
+{
+	struct nvme_command cmd;
+	const int cq_flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.create_cq.opcode = nvme_admin_create_cq;
+	cmd.create_cq.cqid = cpu_to_le16(qid);
+	cmd.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
+	/* the size field carries the queue depth minus one */
+	cmd.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
+	cmd.create_cq.cq_flags = cpu_to_le16(cq_flags);
+	cmd.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
+
+	if (nvme_submit_admin_cmd(dev, &cmd, NULL))
+		return -EIO;
+	return 0;
+}
+
+/*
+ * Ask the adapter to create submission queue @qid backed by @nvmeq's
+ * submission ring and paired with the completion queue of the same id.
+ * Returns 0 on success and -EIO for any non-zero submission status.
+ */
+static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
+						struct nvme_queue *nvmeq)
+{
+	struct nvme_command cmd;
+	const int sq_flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.create_sq.opcode = nvme_admin_create_sq;
+	cmd.create_sq.sqid = cpu_to_le16(qid);
+	cmd.create_sq.cqid = cpu_to_le16(qid);
+	cmd.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
+	/* the size field carries the queue depth minus one */
+	cmd.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
+	cmd.create_sq.sq_flags = cpu_to_le16(sq_flags);
+
+	if (nvme_submit_admin_cmd(dev, &cmd, NULL))
+		return -EIO;
+	return 0;
+}
+
+/* Delete completion queue @cqid; see adapter_delete_queue() for returns. */
+static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
+{
+	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
+}
+
+/* Delete submission queue @sqid; see adapter_delete_queue() for returns. */
+static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
+{
+	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
+}
+
+/*
+ * Issue an Identify admin command for @nsid with the given @cns value; the
+ * data buffer is addressed by @dma_addr.
+ * Returns whatever nvme_submit_admin_cmd() returns.
+ */
+int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
+							dma_addr_t dma_addr)
+{
+	struct nvme_command cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.identify.opcode = nvme_admin_identify;
+	cmd.identify.cns = cpu_to_le32(cns);
+	cmd.identify.nsid = cpu_to_le32(nsid);
+	cmd.identify.prp1 = cpu_to_le64(dma_addr);
+
+	return nvme_submit_admin_cmd(dev, &cmd, NULL);
+}
+
+/*
+ * Issue a Get Features admin command for feature @fid on @nsid; the data
+ * buffer is addressed by @dma_addr and the completion result, if wanted, is
+ * stored through @result.
+ * Returns whatever nvme_submit_admin_cmd() returns.
+ */
+int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
+					dma_addr_t dma_addr, u32 *result)
+{
+	struct nvme_command cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.features.opcode = nvme_admin_get_features;
+	cmd.features.fid = cpu_to_le32(fid);
+	cmd.features.nsid = cpu_to_le32(nsid);
+	cmd.features.prp1 = cpu_to_le64(dma_addr);
+
+	return nvme_submit_admin_cmd(dev, &cmd, result);
+}
+
+/*
+ * Issue a Set Features admin command for feature @fid with @dword11 as the
+ * feature-specific value; the data buffer is addressed by @dma_addr and the
+ * completion result, if wanted, is stored through @result.
+ * Returns whatever nvme_submit_admin_cmd() returns.
+ */
+int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
+					dma_addr_t dma_addr, u32 *result)
+{
+	struct nvme_command cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.features.opcode = nvme_admin_set_features;
+	cmd.features.fid = cpu_to_le32(fid);
+	cmd.features.dword11 = cpu_to_le32(dword11);
+	cmd.features.prp1 = cpu_to_le64(dma_addr);
+
+	return nvme_submit_admin_cmd(dev, &cmd, result);
+}
+
+/**
+ * nvme_abort_cmd - Attempt aborting a command
+ * @cmdid: Command id of a timed out IO
+ * @nvmeq: The queue with timed out IO
+ *
+ * Schedule controller reset if the command was already aborted once before and
+ * still hasn't been returned to the driver, or if this is the admin queue.
+ * Otherwise submit an abort command on the admin queue, provided the device
+ * still has abort_limit budget and an admin command id can be allocated.
+ */
+static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
+{
+	int a_cmdid;
+	struct nvme_command cmd;
+	struct nvme_dev *dev = nvmeq->dev;
+	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+	struct nvme_queue *adminq;
+
+	/* qid 0 is the admin queue */
+	if (!nvmeq->qid || info[cmdid].aborted) {
+		/* a reset is already queued or running */
+		if (work_busy(&dev->reset_work))
+			return;
+		list_del_init(&dev->node);
+		dev_warn(&dev->pci_dev->dev,
+			"I/O %d QID %d timeout, reset controller\n", cmdid,
+								nvmeq->qid);
+		dev->reset_workfn = nvme_reset_failed_dev;
+		queue_work(nvme_workq, &dev->reset_work);
+		return;
+	}
+
+	/* no abort budget left */
+	if (!dev->abort_limit)
+		return;
+
+	adminq = rcu_dereference(dev->queues[0]);
+	a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion,
+								ADMIN_TIMEOUT);
+	if (a_cmdid < 0)
+		return;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.abort.opcode = nvme_admin_abort_cmd;
+	cmd.abort.cid = cmdid;
+	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
+	cmd.abort.command_id = a_cmdid;
+
+	--dev->abort_limit;
+	/* remember the abort and give the command a fresh deadline */
+	info[cmdid].aborted = 1;
+	info[cmdid].timeout = jiffies + ADMIN_TIMEOUT;
+
+	dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid,
+							nvmeq->qid);
+	nvme_submit_cmd(adminq, &cmd);
+}
+
+/**
+ * nvme_cancel_ios - Cancel outstanding I/Os
+ * @nvmeq: The queue to cancel I/Os on
+ * @timeout: True to only cancel I/Os which have timed out
+ *
+ * Walks the command ids marked in nvmeq->cmdid_data.  Entries already in the
+ * CMD_CTX_CANCELLED state are skipped.  When @timeout is true and the device
+ * is initialized, a timed-out command is passed to nvme_abort_cmd() instead
+ * of being cancelled.  Cancelled commands have their completion handler
+ * invoked with an NVME_SC_ABORT_REQ status.
+ */
+static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
+{
+	int depth = nvmeq->q_depth - 1;
+	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+	unsigned long now = jiffies;
+	int cmdid;
+
+	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
+		void *ctx;
+		nvme_completion_fn fn;
+		/* synthetic completion handed to the command's handler */
+		static struct nvme_completion cqe = {
+			.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
+		};
+
+		/* not yet past its deadline */
+		if (timeout && !time_after(now, info[cmdid].timeout))
+			continue;
+		if (info[cmdid].ctx == CMD_CTX_CANCELLED)
+			continue;
+		if (timeout && nvmeq->dev->initialized) {
+			nvme_abort_cmd(cmdid, nvmeq);
+			continue;
+		}
+		dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid,
+								nvmeq->qid);
+		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
+		fn(nvmeq, ctx, &cqe);
+	}
+}
+
+/*
+ * RCU callback (queued by nvme_free_queues()) that tears down one queue.
+ *
+ * Bios still waiting on sq_cong are ended with -EIO, and iods still parked on
+ * iod_bio are completed with an abort status that has NVME_SC_DNR set.  The
+ * completion and submission rings, the cpumask (I/O queues only) and the
+ * queue structure itself are then freed.
+ */
+static void nvme_free_queue(struct rcu_head *r)
+{
+	struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head);
+
+	spin_lock_irq(&nvmeq->q_lock);
+	while (bio_list_peek(&nvmeq->sq_cong)) {
+		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
+		bio_endio(bio, -EIO);
+	}
+	while (!list_empty(&nvmeq->iod_bio)) {
+		/* synthetic completion handed to bio_completion() */
+		static struct nvme_completion cqe = {
+			.status = cpu_to_le16(
+				(NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1),
+		};
+		struct nvme_iod *iod = list_first_entry(&nvmeq->iod_bio,
+							struct nvme_iod,
+							node);
+		list_del(&iod->node);
+		bio_completion(nvmeq, iod, &cqe);
+	}
+	spin_unlock_irq(&nvmeq->q_lock);
+
+	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
+				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
+	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+	/* a cpumask is only allocated for queues with a non-zero qid */
+	if (nvmeq->qid)
+		free_cpumask_var(nvmeq->cpu_mask);
+	kfree(nvmeq);
+}
+
+/*
+ * Release every queue of @dev with an index of @lowest or higher, starting
+ * from the highest.  Each queue pointer is cleared from dev->queues and the
+ * queue is freed through call_rcu() with nvme_free_queue() as the callback.
+ */
+static void nvme_free_queues(struct nvme_dev *dev, int lowest)
+{
+	int i;
+
+	for (i = dev->queue_count - 1; i >= lowest; i--) {
+		struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
+		/* unpublish the pointer before scheduling the free */
+		rcu_assign_pointer(dev->queues[i], NULL);
+		call_rcu(&nvmeq->r_head, nvme_free_queue);
+		dev->queue_count--;
+	}
+}
+
+/**
+ * nvme_suspend_queue - put queue into suspended state
+ * @nvmeq: queue to suspend
+ *
+ * Marks the queue suspended, decrements the device's online_queues count and
+ * releases the queue's interrupt.
+ *
+ * Returns 1 if already suspended, 0 otherwise.
+ */
+static int nvme_suspend_queue(struct nvme_queue *nvmeq)
+{
+	int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
+
+	spin_lock_irq(&nvmeq->q_lock);
+	if (nvmeq->q_suspended) {
+		spin_unlock_irq(&nvmeq->q_lock);
+		return 1;
+	}
+	nvmeq->q_suspended = 1;
+	nvmeq->dev->online_queues--;
+	spin_unlock_irq(&nvmeq->q_lock);
+
+	/* the interrupt is released after the queue lock is dropped */
+	irq_set_affinity_hint(vector, NULL);
+	free_irq(vector, nvmeq);
+
+	return 0;
+}
+
+/*
+ * Under the queue lock, reap any pending completions and then cancel every
+ * command that is still outstanding on @nvmeq.
+ */
+static void nvme_clear_queue(struct nvme_queue *nvmeq)
+{
+	spin_lock_irq(&nvmeq->q_lock);
+	nvme_process_cq(nvmeq);
+	nvme_cancel_ios(nvmeq, false);
+	spin_unlock_irq(&nvmeq->q_lock);
+}
+
+/*
+ * Suspend queue @qid of @dev, delete it on the adapter where appropriate and
+ * cancel whatever is still outstanding on it.  Does nothing if the queue does
+ * not exist or is already suspended.
+ */
+static void nvme_disable_queue(struct nvme_dev *dev, int qid)
+{
+	struct nvme_queue *nvmeq = raw_nvmeq(dev, qid);
+
+	if (!nvmeq || nvme_suspend_queue(nvmeq))
+		return;
+
+	/*
+	 * The admin queue is never deleted through the adapter, and a removed
+	 * adapter (CSTS reads back as all ones) is not asked to delete its IO
+	 * queues.
+	 */
+	if (qid != 0 && readl(&dev->bar->csts) != -1) {
+		adapter_delete_sq(dev, qid);
+		adapter_delete_cq(dev, qid);
+	}
+
+	nvme_clear_queue(nvmeq);
+}
+
+/*
+ * Allocate and initialise the host-side state for queue @qid of @dev: the
+ * queue structure (plus nvme_queue_extra(@depth) trailing bytes), the
+ * completion and submission rings and, for non-zero @qid, a cpumask.
+ *
+ * The new queue starts out suspended, is counted in dev->queue_count and is
+ * published in dev->queues[@qid].
+ *
+ * Returns the queue, or NULL if any allocation fails (everything allocated
+ * so far is released).
+ */
+static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
+							int depth, int vector)
+{
+	struct device *dmadev = &dev->pci_dev->dev;
+	unsigned extra = nvme_queue_extra(depth);
+	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
+	if (!nvmeq)
+		return NULL;
+
+	nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
+					&nvmeq->cq_dma_addr, GFP_KERNEL);
+	if (!nvmeq->cqes)
+		goto free_nvmeq;
+	memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));
+
+	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
+					&nvmeq->sq_dma_addr, GFP_KERNEL);
+	if (!nvmeq->sq_cmds)
+		goto free_cqdma;
+
+	/* only queues with a non-zero qid get a cpumask */
+	if (qid && !zalloc_cpumask_var(&nvmeq->cpu_mask, GFP_KERNEL))
+		goto free_sqdma;
+
+	nvmeq->q_dmadev = dmadev;
+	nvmeq->dev = dev;
+	snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
+			dev->instance, qid);
+	spin_lock_init(&nvmeq->q_lock);
+	nvmeq->cq_head = 0;
+	nvmeq->cq_phase = 1;
+	init_waitqueue_head(&nvmeq->sq_full);
+	init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
+	bio_list_init(&nvmeq->sq_cong);
+	INIT_LIST_HEAD(&nvmeq->iod_bio);
+	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
+	nvmeq->q_depth = depth;
+	nvmeq->cq_vector = vector;
+	nvmeq->qid = qid;
+	/* stays suspended until nvme_init_queue() runs */
+	nvmeq->q_suspended = 1;
+	dev->queue_count++;
+	rcu_assign_pointer(dev->queues[qid], nvmeq);
+
+	return nvmeq;
+
+ free_sqdma:
+	dma_free_coherent(dmadev, SQ_SIZE(depth), (void *)nvmeq->sq_cmds,
+							nvmeq->sq_dma_addr);
+ free_cqdma:
+	dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
+							nvmeq->cq_dma_addr);
+ free_nvmeq:
+	kfree(nvmeq);
+	return NULL;
+}
+
+/*
+ * Request the interrupt for @nvmeq's completion vector under @name.  With
+ * use_threaded_interrupts set, nvme_irq_check() is the primary handler and
+ * nvme_irq() the thread function; otherwise nvme_irq() handles the interrupt
+ * directly.  Returns the result of the request call.
+ */
+static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
+							const char *name)
+{
+	unsigned int irq = dev->entry[nvmeq->cq_vector].vector;
+
+	if (!use_threaded_interrupts)
+		return request_irq(irq, nvme_irq, IRQF_SHARED, name, nvmeq);
+
+	return request_threaded_irq(irq, nvme_irq_check, nvme_irq,
+					IRQF_SHARED, name, nvmeq);
+}
+
+/*
+ * Reset the host-side state of @nvmeq so it can be used as queue @qid: ring
+ * indices, phase, doorbell pointer, the command id bookkeeping area and the
+ * completion ring are reinitialised, then the queue is marked as not
+ * suspended and counted in dev->online_queues.  The callers visible in this
+ * file hold nvmeq->q_lock.
+ */
+static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
+{
+	struct nvme_dev *dev = nvmeq->dev;
+	unsigned extra = nvme_queue_extra(nvmeq->q_depth);
+
+	nvmeq->sq_tail = 0;
+	nvmeq->cq_head = 0;
+	nvmeq->cq_phase = 1;
+	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
+	/* wipe the trailing per-command area and the completion ring */
+	memset(nvmeq->cmdid_data, 0, extra);
+	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
+	nvme_cancel_ios(nvmeq, false);
+	nvmeq->q_suspended = 0;
+	dev->online_queues++;
+}
+
+/*
+ * Bring I/O queue @qid online: create the completion queue and then the
+ * submission queue on the adapter, request the interrupt and initialise the
+ * host-side state.
+ *
+ * Returns the (non-negative) result of the interrupt request on success.  On
+ * failure the adapter queues created so far are deleted again and the
+ * negative error is returned.
+ */
+static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
+{
+	struct nvme_dev *dev = nvmeq->dev;
+	int result;
+
+	result = adapter_alloc_cq(dev, qid, nvmeq);
+	if (result < 0)
+		return result;
+
+	result = adapter_alloc_sq(dev, qid, nvmeq);
+	if (result < 0)
+		goto release_cq;
+
+	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
+	if (result < 0)
+		goto release_sq;
+
+	spin_lock_irq(&nvmeq->q_lock);
+	nvme_init_queue(nvmeq, qid);
+	spin_unlock_irq(&nvmeq->q_lock);
+
+	return result;
+
+ release_sq:
+	adapter_delete_sq(dev, qid);
+ release_cq:
+	adapter_delete_cq(dev, qid);
+	return result;
+}
+
+/*
+ * Poll CSTS every 100ms until the ready bit matches @enabled.
+ *
+ * The deadline is derived from the timeout field of @cap.  Returns 0 once the
+ * bit has the wanted value, -EINTR if a fatal signal is pending, or -ENODEV
+ * when the deadline passes.
+ */
+static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
+{
+	u32 want = enabled ? NVME_CSTS_RDY : 0;
+	unsigned long deadline;
+
+	deadline = jiffies + ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2);
+
+	for (;;) {
+		if ((readl(&dev->bar->csts) & NVME_CSTS_RDY) == want)
+			return 0;
+
+		msleep(100);
+		if (fatal_signal_pending(current))
+			return -EINTR;
+		if (time_after(jiffies, deadline)) {
+			dev_err(&dev->pci_dev->dev,
+				"Device not ready; aborting %s\n",
+				enabled ? "initialisation" : "reset");
+			return -ENODEV;
+		}
+	}
+}
+
+/*
+ * Disable the controller and wait for it to report not-ready.
+ *
+ * A device handed to us already enabled only has its enable bit cleared.
+ * The spec asks for the 'shutdown notification bits' to be set as well, but
+ * doing that may make the device complete commands to the admin queue, and
+ * we have no idea what memory that queue might be pointing at.
+ */
+static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
+{
+	u32 ctrl_cfg = readl(&dev->bar->cc);
+
+	if (ctrl_cfg & NVME_CC_ENABLE) {
+		ctrl_cfg &= ~NVME_CC_ENABLE;
+		writel(ctrl_cfg, &dev->bar->cc);
+	}
+
+	return nvme_wait_ready(dev, cap, false);
+}
+
+/*
+ * Wait for the controller to report ready.  This function does not write the
+ * CC register itself; it only polls via nvme_wait_ready().
+ */
+static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
+{
+	return nvme_wait_ready(dev, cap, true);
+}
+
+/*
+ * Request a normal shutdown through CC and poll CSTS every 100ms, for up to
+ * two seconds, until the shutdown status reads complete.
+ *
+ * Returns 0 on completion, -EINTR if a fatal signal is pending, or -ENODEV
+ * when the two seconds have passed.
+ */
+static int nvme_shutdown_ctrl(struct nvme_dev *dev)
+{
+	unsigned long deadline;
+	u32 cc = readl(&dev->bar->cc);
+
+	cc &= ~NVME_CC_SHN_MASK;
+	cc |= NVME_CC_SHN_NORMAL;
+	writel(cc, &dev->bar->cc);
+
+	deadline = jiffies + 2 * HZ;
+	for (;;) {
+		u32 shst = readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK;
+
+		if (shst == NVME_CSTS_SHST_CMPLT)
+			return 0;
+
+		msleep(100);
+		if (fatal_signal_pending(current))
+			return -EINTR;
+		if (time_after(jiffies, deadline)) {
+			dev_err(&dev->pci_dev->dev,
+				"Device shutdown incomplete; abort shutdown\n");
+			return -ENODEV;
+		}
+	}
+}
+
+/*
+ * Disable the controller, set up the admin queue (allocating it with depth
+ * 64 on vector 0 if it does not exist yet), program the admin queue
+ * registers and controller configuration, wait for the controller to become
+ * ready, then request the admin interrupt and initialise the queue.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int nvme_configure_admin_queue(struct nvme_dev *dev)
+{
+	int result;
+	u32 aqa;
+	u64 cap = readq(&dev->bar->cap);
+	struct nvme_queue *nvmeq;
+
+	result = nvme_disable_ctrl(dev, cap);
+	if (result < 0)
+		return result;
+
+	nvmeq = raw_nvmeq(dev, 0);
+	if (!nvmeq) {
+		nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
+		if (!nvmeq)
+			return -ENOMEM;
+	}
+
+	/* depth minus one goes into both 16-bit halves of AQA */
+	aqa = nvmeq->q_depth - 1;
+	aqa |= aqa << 16;
+
+	dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
+	dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
+	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
+	dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
+
+	/* queue registers are programmed before CC enables the controller */
+	writel(aqa, &dev->bar->aqa);
+	writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
+	writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
+	writel(dev->ctrl_config, &dev->bar->cc);
+
+	result = nvme_enable_ctrl(dev, cap);
+	if (result)
+		return result;
+
+	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
+	if (result)
+		return result;
+
+	spin_lock_irq(&nvmeq->q_lock);
+	nvme_init_queue(nvmeq, 0);
+	spin_unlock_irq(&nvmeq->q_lock);
+	return result;
+}
+
+/*
+ * Pin the user pages backing [@addr, @addr + @length), describe them in a
+ * freshly allocated iod's scatterlist and DMA-map that list.  The DMA
+ * direction is DMA_TO_DEVICE when @write is non-zero, DMA_FROM_DEVICE
+ * otherwise.
+ *
+ * Returns the iod on success, or an ERR_PTR():
+ *   -EINVAL  @addr is not 4-byte aligned, or @length is zero or larger than
+ *            INT_MAX - PAGE_SIZE
+ *   -ENOMEM  an allocation failed or dma_map_sg() mapped nothing
+ *   -EFAULT  fewer pages than required could be pinned
+ * On failure any pages that were pinned are released again.
+ */
+struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
+				unsigned long addr, unsigned length)
+{
+	int i, err, count, nents, offset;
+	struct scatterlist *sg;
+	struct page **pages;
+	struct nvme_iod *iod;
+
+	if (addr & 3)
+		return ERR_PTR(-EINVAL);
+	if (!length || length > INT_MAX - PAGE_SIZE)
+		return ERR_PTR(-EINVAL);
+
+	/* number of pages touched, accounting for the offset into the first */
+	offset = offset_in_page(addr);
+	count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
+	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	err = get_user_pages_fast(addr, count, 1, pages);
+	if (err < count) {
+		/* only release what was actually pinned (none if err < 0) */
+		count = err;
+		err = -EFAULT;
+		goto put_pages;
+	}
+
+	err = -ENOMEM;
+	iod = nvme_alloc_iod(count, length, GFP_KERNEL);
+	if (!iod)
+		goto put_pages;
+
+	sg = iod->sg;
+	sg_init_table(sg, count);
+	for (i = 0; i < count; i++) {
+		/* one entry per page; only the first has a non-zero offset */
+		sg_set_page(&sg[i], pages[i],
+			    min_t(unsigned, length, PAGE_SIZE - offset),
+			    offset);
+		length -= (PAGE_SIZE - offset);
+		offset = 0;
+	}
+	sg_mark_end(&sg[i - 1]);
+	iod->nents = count;
+
+	nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
+				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+	if (!nents)
+		goto free_iod;
+
+	kfree(pages);
+	return iod;
+
+ free_iod:
+	kfree(iod);
+ put_pages:
+	for (i = 0; i < count; i++)
+		put_page(pages[i]);
+	kfree(pages);
+	return ERR_PTR(err);
+}
+
+/*
+ * Undo nvme_map_user_pages(): DMA-unmap the iod's scatterlist using the
+ * direction selected by @write, then drop the reference on every page in the
+ * list.  The iod itself is not freed here.
+ */
+void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
+			struct nvme_iod *iod)
+{
+	struct device *dmadev = &dev->pci_dev->dev;
+	enum dma_data_direction dir;
+	int i = 0;
+
+	dir = write ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+	dma_unmap_sg(dmadev, iod->sg, iod->nents, dir);
+
+	while (i < iod->nents) {
+		put_page(sg_page(&iod->sg[i]));
+		i++;
+	}
+}
+
+/*
+ * Handle NVME_IOCTL_SUBMIT_IO: run one user-described read, write or compare
+ * command against namespace @ns.
+ *
+ * The user data buffer is pinned and DMA-mapped directly.  Metadata, when the
+ * namespace has a non-zero ns->ms, is staged through a coherent bounce
+ * buffer: copied in from user pages before submission when bit 0 of the
+ * opcode is set, and copied back out after a successful command otherwise.
+ *
+ * Returns the status of the command (0 on success, a positive NVMe status or
+ * a negative errno), -EFAULT if @uio cannot be read, -EINVAL for an
+ * unsupported opcode or a missing/misaligned metadata pointer, or -ENOMEM.
+ */
+static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
+{
+	struct nvme_dev *dev = ns->dev;
+	struct nvme_user_io io;
+	struct nvme_command c;
+	unsigned length, meta_len;
+	int status, i;
+	struct nvme_iod *iod, *meta_iod = NULL;
+	dma_addr_t meta_dma_addr;
+	void *meta, *uninitialized_var(meta_mem);
+
+	if (copy_from_user(&io, uio, sizeof(io)))
+		return -EFAULT;
+	/* io.nblocks is the block count minus one */
+	length = (io.nblocks + 1) << ns->lba_shift;
+	meta_len = (io.nblocks + 1) * ns->ms;
+
+	/* metadata needs a non-NULL, 4-byte aligned user pointer */
+	if (meta_len && ((io.metadata & 3) || !io.metadata))
+		return -EINVAL;
+
+	switch (io.opcode) {
+	case nvme_cmd_write:
+	case nvme_cmd_read:
+	case nvme_cmd_compare:
+		/* bit 0 of the opcode selects the DMA direction */
+		iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (IS_ERR(iod))
+		return PTR_ERR(iod);
+
+	memset(&c, 0, sizeof(c));
+	c.rw.opcode = io.opcode;
+	c.rw.flags = io.flags;
+	c.rw.nsid = cpu_to_le32(ns->ns_id);
+	c.rw.slba = cpu_to_le64(io.slba);
+	c.rw.length = cpu_to_le16(io.nblocks);
+	c.rw.control = cpu_to_le16(io.control);
+	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
+	c.rw.reftag = cpu_to_le32(io.reftag);
+	c.rw.apptag = cpu_to_le16(io.apptag);
+	c.rw.appmask = cpu_to_le16(io.appmask);
+
+	if (meta_len) {
+		meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata,
+								meta_len);
+		if (IS_ERR(meta_iod)) {
+			status = PTR_ERR(meta_iod);
+			/* keep the cleanup below from touching an error pointer */
+			meta_iod = NULL;
+			goto unmap;
+		}
+
+		meta_mem = dma_alloc_coherent(&dev->pci_dev->dev, meta_len,
+						&meta_dma_addr, GFP_KERNEL);
+		if (!meta_mem) {
+			status = -ENOMEM;
+			goto unmap;
+		}
+
+		/* device-bound metadata: gather user pages into the bounce buffer */
+		if (io.opcode & 1) {
+			int meta_offset = 0;
+
+			for (i = 0; i < meta_iod->nents; i++) {
+				meta = kmap_atomic(sg_page(&meta_iod->sg[i])) +
+						meta_iod->sg[i].offset;
+				memcpy(meta_mem + meta_offset, meta,
+						meta_iod->sg[i].length);
+				kunmap_atomic(meta);
+				meta_offset += meta_iod->sg[i].length;
+			}
+		}
+
+		c.rw.metadata = cpu_to_le64(meta_dma_addr);
+	}
+
+	length = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
+	c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+	c.rw.prp2 = cpu_to_le64(iod->first_dma);
+
+	/* PRP setup must cover the whole transfer */
+	if (length != (io.nblocks + 1) << ns->lba_shift)
+		status = -ENOMEM;
+	else
+		status = nvme_submit_io_cmd(dev, &c, NULL);
+
+	if (meta_len) {
+		/* host-bound metadata: scatter the bounce buffer to user pages */
+		if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) {
+			int meta_offset = 0;
+
+			for (i = 0; i < meta_iod->nents; i++) {
+				meta = kmap_atomic(sg_page(&meta_iod->sg[i])) +
+						meta_iod->sg[i].offset;
+				memcpy(meta, meta_mem + meta_offset,
+						meta_iod->sg[i].length);
+				kunmap_atomic(meta);
+				meta_offset += meta_iod->sg[i].length;
+			}
+		}
+
+		dma_free_coherent(&dev->pci_dev->dev, meta_len, meta_mem,
+								meta_dma_addr);
+	}
+
+ unmap:
+	nvme_unmap_user_pages(dev, io.opcode & 1, iod);
+	nvme_free_iod(dev, iod);
+
+	if (meta_iod) {
+		nvme_unmap_user_pages(dev, io.opcode & 1, meta_iod);
+		nvme_free_iod(dev, meta_iod);
+	}
+
+	return status;
+}
+
+/*
+ * Handle NVME_IOCTL_ADMIN_CMD: pass a user-built admin command through to
+ * the admin queue of @dev.  Requires CAP_SYS_ADMIN.
+ *
+ * A user data buffer, if any, is pinned and DMA-mapped; bit 0 of the opcode
+ * selects the DMA direction.  The command's completion result is copied back
+ * to ucmd->result when the status is not negative.
+ *
+ * Returns the status from nvme_submit_sync_cmd() (0, a positive NVMe status
+ * or a negative errno), -EACCES without the capability, -EFAULT if user
+ * memory cannot be accessed, or -ENOMEM if PRP setup falls short.
+ */
+static int nvme_user_admin_cmd(struct nvme_dev *dev,
+					struct nvme_admin_cmd __user *ucmd)
+{
+	struct nvme_admin_cmd cmd;
+	struct nvme_command c;
+	int status, length;
+	struct nvme_iod *uninitialized_var(iod);
+	unsigned timeout;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
+		return -EFAULT;
+
+	memset(&c, 0, sizeof(c));
+	c.common.opcode = cmd.opcode;
+	c.common.flags = cmd.flags;
+	c.common.nsid = cpu_to_le32(cmd.nsid);
+	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
+	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
+	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
+	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
+	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
+	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
+	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
+	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
+
+	length = cmd.data_len;
+	if (cmd.data_len) {
+		iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
+								length);
+		if (IS_ERR(iod))
+			return PTR_ERR(iod);
+		length = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
+		c.common.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+		c.common.prp2 = cpu_to_le64(iod->first_dma);
+	}
+
+	/* a zero timeout_ms selects the default admin timeout */
+	timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
+								ADMIN_TIMEOUT;
+	/* PRP setup must cover the whole buffer */
+	if (length != cmd.data_len)
+		status = -ENOMEM;
+	else
+		status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout);
+
+	if (cmd.data_len) {
+		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
+		nvme_free_iod(dev, iod);
+	}
+
+	if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result,
+							sizeof(cmd.result)))
+		status = -EFAULT;
+
+	return status;
+}
+
+/*
+ * ioctl entry point for the block device.  Dispatches the NVMe-specific
+ * requests and the two SG requests handled by this driver; anything else
+ * yields -ENOTTY.
+ */
+static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
+							unsigned long arg)
+{
+	struct nvme_ns *ns = bdev->bd_disk->private_data;
+
+	switch (cmd) {
+	case NVME_IOCTL_ID:
+		/*
+		 * The namespace id is returned as the ioctl's value;
+		 * presumably the call below keeps a large id from being
+		 * taken for an error code -- TODO confirm.
+		 */
+		force_successful_syscall_return();
+		return ns->ns_id;
+	case NVME_IOCTL_ADMIN_CMD:
+		return nvme_user_admin_cmd(ns->dev, (void __user *)arg);
+	case NVME_IOCTL_SUBMIT_IO:
+		return nvme_submit_io(ns, (void __user *)arg);
+	case SG_GET_VERSION_NUM:
+		return nvme_sg_get_version_num((void __user *)arg);
+	case SG_IO:
+		return nvme_sg_io(ns, (void __user *)arg);
+	default:
+		return -ENOTTY;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat ioctl entry point: SG_IO is routed to nvme_sg_io32(); every other
+ * request is forwarded unchanged to nvme_ioctl().
+ */
+static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
+					unsigned int cmd, unsigned long arg)
+{
+	struct nvme_ns *ns;
+
+	if (cmd != SG_IO)
+		return nvme_ioctl(bdev, mode, cmd, arg);
+
+	ns = bdev->bd_disk->private_data;
+	return nvme_sg_io32(ns, arg);
+}
+#else
+#define nvme_compat_ioctl	NULL
+#endif
+
+/*
+ * Block device open: take a reference on the nvme_dev that owns the
+ * namespace.  Always returns 0.
+ */
+static int nvme_open(struct block_device *bdev, fmode_t mode)
+{
+	struct nvme_ns *ns = bdev->bd_disk->private_data;
+
+	kref_get(&ns->dev->kref);
+	return 0;
+}
+
+static void nvme_free_dev(struct kref *kref);
+
+/*
+ * Block device release: drop the reference taken in nvme_open();
+ * nvme_free_dev() runs when the last reference goes away.
+ */
+static void nvme_release(struct gendisk *disk, fmode_t mode)
+{
+	struct nvme_ns *ns = disk->private_data;
+
+	kref_put(&ns->dev->kref, nvme_free_dev);
+}
+
+/*
+ * Report a made-up geometry: 64 heads and 32 sectors per track, so one
+ * cylinder spans 2048 sectors and the cylinder count is the capacity shifted
+ * right by 11.  Always returns 0.
+ */
+static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo)
+{
+	geo->heads = 64;
+	geo->sectors = 32;
+	geo->cylinders = get_capacity(bd->bd_disk) >> 11;
+
+	return 0;
+}
+
+/*
+ * Block device operations table that wires up the nvme_* handlers defined
+ * above.  nvme_compat_ioctl is NULL when CONFIG_COMPAT is not set.
+ */
+static const struct block_device_operations nvme_fops = {
+	.owner		= THIS_MODULE,
+	.ioctl		= nvme_ioctl,
+	.compat_ioctl	= nvme_compat_ioctl,
+	.open		= nvme_open,
+	.release	= nvme_release,
+	.getgeo		= nvme_getgeo,
+};
+
+/*
+ * Retry the iods parked on nvmeq->iod_bio, in order, stopping at the first
+ * one that still cannot be submitted.  Once both iod_bio and sq_cong are
+ * empty the congestion wait entry is removed from sq_full.  The caller
+ * visible in this file (nvme_kthread) holds nvmeq->q_lock.
+ */
+static void nvme_resubmit_iods(struct nvme_queue *nvmeq)
+{
+	struct nvme_iod *iod, *next;
+
+	list_for_each_entry_safe(iod, next, &nvmeq->iod_bio, node) {
+		if (unlikely(nvme_submit_iod(nvmeq, iod)))
+			break;
+		/* submitted: take it off the parked list */
+		list_del(&iod->node);
+		if (bio_list_empty(&nvmeq->sq_cong) &&
+						list_empty(&nvmeq->iod_bio))
+			remove_wait_queue(&nvmeq->sq_full,
+						&nvmeq->sq_cong_wait);
+	}
+}
+
+/*
+ * Retry the bios waiting on nvmeq->sq_cong, in order.  A bio that still
+ * cannot be submitted is put back at the head of the list and the loop
+ * stops.  The congestion wait entry is removed from sq_full when a pop
+ * leaves both sq_cong and iod_bio empty, and re-added if the submission then
+ * fails.  The caller visible in this file (nvme_kthread) holds
+ * nvmeq->q_lock.
+ */
+static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
+{
+	while (bio_list_peek(&nvmeq->sq_cong)) {
+		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
+		struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
+
+		if (bio_list_empty(&nvmeq->sq_cong) &&
+						list_empty(&nvmeq->iod_bio))
+			remove_wait_queue(&nvmeq->sq_full,
+							&nvmeq->sq_cong_wait);
+		if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
+			if (!waitqueue_active(&nvmeq->sq_full))
+				add_wait_queue(&nvmeq->sq_full,
+							&nvmeq->sq_cong_wait);
+			/* keep ordering: this bio goes first next time */
+			bio_list_add_head(&nvmeq->sq_cong, bio);
+			break;
+		}
+	}
+}
+
+/*
+ * Body of the nvme polling thread.  About once a second (or earlier when
+ * woken) it walks dev_list under dev_list_lock and, for each device:
+ *  - if CSTS reports NVME_CSTS_CFS and the device is initialized, takes the
+ *    device off the list and queues a controller reset (unless reset_work is
+ *    already busy);
+ *  - otherwise, for every queue that is not suspended, reaps completions,
+ *    handles timed-out commands and retries waiting bios and parked iods.
+ *
+ * Returns 0 when kthread_should_stop() becomes true.
+ */
+static int nvme_kthread(void *data)
+{
+	struct nvme_dev *dev, *next;
+
+	while (!kthread_should_stop()) {
+		/* set before the scan; schedule_timeout() below does the sleep */
+		set_current_state(TASK_INTERRUPTIBLE);
+		spin_lock(&dev_list_lock);
+		list_for_each_entry_safe(dev, next, &dev_list, node) {
+			int i;
+			if (readl(&dev->bar->csts) & NVME_CSTS_CFS &&
+							dev->initialized) {
+				if (work_busy(&dev->reset_work))
+					continue;
+				list_del_init(&dev->node);
+				dev_warn(&dev->pci_dev->dev,
+					"Failed status, reset controller\n");
+				dev->reset_workfn = nvme_reset_failed_dev;
+				queue_work(nvme_workq, &dev->reset_work);
+				continue;
+			}
+			/* dev->queues[] entries are RCU-protected pointers */
+			rcu_read_lock();
+			for (i = 0; i < dev->queue_count; i++) {
+				struct nvme_queue *nvmeq =
+						rcu_dereference(dev->queues[i]);
+				if (!nvmeq)
+					continue;
+				spin_lock_irq(&nvmeq->q_lock);
+				if (nvmeq->q_suspended)
+					goto unlock;
+				nvme_process_cq(nvmeq);
+				nvme_cancel_ios(nvmeq, true);
+				nvme_resubmit_bios(nvmeq);
+				nvme_resubmit_iods(nvmeq);
+ unlock:
+				spin_unlock_irq(&nvmeq->q_lock);
+			}
+			rcu_read_unlock();
+		}
+		spin_unlock(&dev_list_lock);
+		schedule_timeout(round_jiffies_relative(HZ));
+	}
+	return 0;
+}
+
+/*
+ * Turn on discard for the namespace's request queue.  The logical block
+ * size is used as both discard alignment and granularity, and the maximum
+ * discard size is set to 0xffffffff sectors.
+ */
+static void nvme_config_discard(struct nvme_ns *ns)
+{
+	struct request_queue *q = ns->queue;
+	struct queue_limits *lim = &q->limits;
+	u32 logical_block_size = queue_logical_block_size(q);
+
+	lim->discard_zeroes_data = 0;
+	lim->discard_alignment = logical_block_size;
+	lim->discard_granularity = logical_block_size;
+	lim->max_discard_sectors = 0xffffffff;
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+}
+
+/*
+ * Allocate and initialise the nvme_ns, request queue and gendisk for
+ * namespace @nsid of @dev, using the Identify Namespace data in @id and
+ * the LBA range information in @rt.  The disk is configured but not
+ * registered here (this function never calls add_disk()).
+ *
+ * Returns the new namespace, or NULL if the range is marked hidden or an
+ * allocation fails.
+ */
+static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
+			struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
+{
+	struct nvme_ns *ns;
+	struct gendisk *disk;
+	int lbaf;
+
+	if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
+		return NULL;
+
+	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
+	if (!ns)
+		return NULL;
+	ns->queue = blk_alloc_queue(GFP_KERNEL);
+	if (!ns->queue)
+		goto out_free_ns;
+	ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
+	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
+	blk_queue_make_request(ns->queue, nvme_make_request);
+	ns->dev = dev;
+	ns->queue->queuedata = ns;
+
+	disk = alloc_disk(0);
+	if (!disk)
+		goto out_free_queue;
+	ns->ns_id = nsid;
+	ns->disk = disk;
+	/* The low nibble of FLBAS selects the LBA format in use. */
+	lbaf = id->flbas & 0xf;
+	ns->lba_shift = id->lbaf[lbaf].ds;
+	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
+	if (dev->max_hw_sectors)
+		blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
+	if (dev->vwc & NVME_CTRL_VWC_PRESENT)
+		blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
+
+	disk->major = nvme_major;
+	disk->first_minor = 0;
+	disk->fops = &nvme_fops;
+	disk->private_data = ns;
+	disk->queue = ns->queue;
+	disk->driverfs_dev = &dev->pci_dev->dev;
+	disk->flags = GENHD_FL_EXT_DEVT;
+	sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
+	/* NSZE counts logical blocks; convert to 512-byte sectors. */
+	set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
+
+	if (dev->oncs & NVME_CTRL_ONCS_DSM)
+		nvme_config_discard(ns);
+
+	return ns;
+
+ out_free_queue:
+	blk_cleanup_queue(ns->queue);
+ out_free_ns:
+	kfree(ns);
+	return NULL;
+}
+
+/*
+ * Return the online node other than @node with the smallest
+ * node_distance() from @node; on a tie the first node visited wins.
+ * Returns @node itself when no other online node qualifies.
+ */
+static int nvme_find_closest_node(int node)
+{
+	int candidate, closest = node, shortest = INT_MAX;
+
+	for_each_online_node(candidate) {
+		int dist;
+
+		if (candidate == node)
+			continue;
+		dist = node_distance(node, candidate);
+		if (dist >= shortest)
+			continue;
+		shortest = dist;
+		closest = candidate;
+	}
+	return closest;
+}
+
+/*
+ * Add cpus from @qmask to @nvmeq->cpu_mask until the queue owns @count
+ * cpus.  Each newly added cpu has its per-cpu io_queue entry pointed at
+ * this queue's qid.
+ */
+static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq,
+								int count)
+{
+	int cpu;
+
+	for_each_cpu(cpu, qmask) {
+		if (cpumask_weight(nvmeq->cpu_mask) >= count)
+			return;
+		/* Already owned by this queue: nothing to record. */
+		if (cpumask_test_and_set_cpu(cpu, nvmeq->cpu_mask))
+			continue;
+		*per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid;
+	}
+}
+
+/*
+ * Widen @mask with every cpu in @new_mask plus its thread siblings,
+ * restricted to @unassigned_cpus, and offer the result to @nvmeq after
+ * each step via nvme_set_queue_cpus().
+ */
+static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus,
+	const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue)
+{
+	int cpu;
+
+	for_each_cpu(cpu, new_mask) {
+		cpumask_or(mask, mask, get_cpu_mask(cpu));
+		cpumask_or(mask, mask, topology_thread_cpumask(cpu));
+		cpumask_and(mask, mask, unassigned_cpus);
+		nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue);
+	}
+}
+
+/*
+ * Allocate and then create I/O queues, each pass bounded by the number of
+ * online cpus.  Either pass stops at the first failure; no error is
+ * returned to the caller.
+ */
+static void nvme_create_io_queues(struct nvme_dev *dev)
+{
+	unsigned qid, limit;
+
+	/* Pass 1: allocate queues not yet allocated, up to max_qid. */
+	limit = min(dev->max_qid, num_online_cpus());
+	for (qid = dev->queue_count; qid <= limit; qid++) {
+		if (!nvme_alloc_queue(dev, qid, dev->q_depth, qid - 1))
+			break;
+	}
+
+	/* Pass 2: create the allocated queues that are not yet online. */
+	limit = min(dev->queue_count - 1, num_online_cpus());
+	for (qid = dev->online_queues; qid <= limit; qid++) {
+		if (nvme_create_queue(raw_nvmeq(dev, qid), qid))
+			break;
+	}
+}
+
+/*
+ * If there are fewer queues than online cpus, this will try to optimally
+ * assign a queue to multiple cpus by grouping cpus that are "close" together:
+ * thread siblings, core, socket, closest node, then whatever else is
+ * available.
+ *
+ * The I/O queues are created first via nvme_create_io_queues().  The
+ * function returns without assigning anything when no I/O queue is online
+ * or the scratch cpumask cannot be allocated.
+ */
+static void nvme_assign_io_queues(struct nvme_dev *dev)
+{
+	unsigned cpu, cpus_per_queue, queues, remainder, i;
+	cpumask_var_t unassigned_cpus;
+
+	nvme_create_io_queues(dev);
+
+	queues = min(dev->online_queues - 1, num_online_cpus());
+	if (!queues)
+		return;
+
+	cpus_per_queue = num_online_cpus() / queues;
+	/*
+	 * Number of queues that get the base share; once it counts down to
+	 * zero in the loop below, cpus_per_queue is bumped by one.
+	 */
+	remainder = queues - (num_online_cpus() - queues * cpus_per_queue);
+
+	if (!alloc_cpumask_var(&unassigned_cpus, GFP_KERNEL))
+		return;
+
+	cpumask_copy(unassigned_cpus, cpu_online_mask);
+	cpu = cpumask_first(unassigned_cpus);
+	for (i = 1; i <= queues; i++) {
+		struct nvme_queue *nvmeq = lock_nvmeq(dev, i);
+		cpumask_t mask;
+
+		cpumask_clear(nvmeq->cpu_mask);
+		if (!cpumask_weight(unassigned_cpus)) {
+			unlock_nvmeq(nvmeq);
+			break;
+		}
+
+		/* Seed with one cpu, then widen until the share is met. */
+		mask = *get_cpu_mask(cpu);
+		nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, unassigned_cpus,
+				topology_thread_cpumask(cpu),
+				nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, unassigned_cpus,
+				topology_core_cpumask(cpu),
+				nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, unassigned_cpus,
+				cpumask_of_node(cpu_to_node(cpu)),
+				nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, unassigned_cpus,
+				cpumask_of_node(
+					nvme_find_closest_node(
+						cpu_to_node(cpu))),
+				nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, unassigned_cpus,
+				unassigned_cpus,
+				nvmeq, cpus_per_queue);
+
+		WARN(cpumask_weight(nvmeq->cpu_mask) != cpus_per_queue,
+			"nvme%d qid:%d mis-matched queue-to-cpu assignment\n",
+			dev->instance, i);
+
+		irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
+							nvmeq->cpu_mask);
+		/* The cpus this queue took are no longer available. */
+		cpumask_andnot(unassigned_cpus, unassigned_cpus,
+						nvmeq->cpu_mask);
+		cpu = cpumask_next(cpu, unassigned_cpus);
+		if (remainder && !--remainder)
+			cpus_per_queue++;
+		unlock_nvmeq(nvmeq);
+	}
+	WARN(cpumask_weight(unassigned_cpus), "nvme%d unassigned online cpus\n",
+								dev->instance);
+	/* Possible-but-offline cpus are spread round-robin over the queues. */
+	i = 0;
+	cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask);
+	for_each_cpu(cpu, unassigned_cpus)
+		*per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1;
+	free_cpumask_var(unassigned_cpus);
+}
+
+/*
+ * Request @count queues through the NVME_FEAT_NUM_QUEUES feature.  The
+ * request and the reply both carry zero-based counts in the low and high
+ * 16 bits of a 32-bit word.
+ *
+ * Returns the smaller of the two granted counts plus one, a negative
+ * value passed through from nvme_set_features(), or -EBUSY when
+ * nvme_set_features() returns a positive status.
+ */
+static int set_queue_count(struct nvme_dev *dev, int count)
+{
+	u32 result;
+	u32 q_count = (count - 1) | ((count - 1) << 16);
+	int status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
+								&result);
+
+	if (status < 0)
+		return status;
+	if (status > 0) {
+		dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n",
+									status);
+		return -EBUSY;
+	}
+	return min(result & 0xffff, result >> 16) + 1;
+}
+
+/*
+ * Size in bytes of the BAR mapping needed for @nr_io_queues I/O queues
+ * plus one more queue: 4096 bytes ahead of the doorbells, then
+ * 8 * db_stride bytes per queue.
+ */
+static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
+{
+	return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
+}
+
+/*
+ * Work handler behind dev->cpu_work: redo the queue-to-cpu assignment,
+ * but only for a device that is marked initialized.
+ */
+static void nvme_cpu_workfn(struct work_struct *work)
+{
+	struct nvme_dev *dev = container_of(work, struct nvme_dev, cpu_work);
+
+	if (!dev->initialized)
+		return;
+	nvme_assign_io_queues(dev);
+}
+
+/*
+ * CPU hotplug notifier: on CPU_ONLINE or CPU_DEAD, schedule cpu_work for
+ * every device on dev_list.  All other actions are ignored.  Always
+ * returns NOTIFY_OK.
+ */
+static int nvme_cpu_notify(struct notifier_block *self,
+				unsigned long action, void *hcpu)
+{
+	struct nvme_dev *dev;
+
+	if (action != CPU_ONLINE && action != CPU_DEAD)
+		return NOTIFY_OK;
+
+	spin_lock(&dev_list_lock);
+	list_for_each_entry(dev, &dev_list, node)
+		schedule_work(&dev->cpu_work);
+	spin_unlock(&dev_list_lock);
+
+	return NOTIFY_OK;
+}
+
+/*
+ * Negotiate the I/O queue count with the controller, size the doorbell
+ * mapping accordingly, switch interrupts to MSI-X or MSI where available,
+ * and assign the resulting queues to cpus.
+ *
+ * Returns 0 on success, or an error from set_queue_count(), -ENOMEM if
+ * BAR 0 cannot be remapped, or the queue_request_irq() error.
+ */
+static int nvme_setup_io_queues(struct nvme_dev *dev)
+{
+	struct nvme_queue *adminq = raw_nvmeq(dev, 0);
+	struct pci_dev *pdev = dev->pci_dev;
+	int result, i, vecs, nr_io_queues, size;
+
+	/* Ask for one I/O queue per possible cpu; accept fewer if granted. */
+	nr_io_queues = num_possible_cpus();
+	result = set_queue_count(dev, nr_io_queues);
+	if (result < 0)
+		return result;
+	if (result < nr_io_queues)
+		nr_io_queues = result;
+
+	/*
+	 * Doorbells do not fit in an 8192-byte mapping: remap BAR 0, giving
+	 * up one queue at a time until ioremap() succeeds.
+	 */
+	size = db_bar_size(dev, nr_io_queues);
+	if (size > 8192) {
+		iounmap(dev->bar);
+		do {
+			dev->bar = ioremap(pci_resource_start(pdev, 0), size);
+			if (dev->bar)
+				break;
+			if (!--nr_io_queues)
+				return -ENOMEM;
+			size = db_bar_size(dev, nr_io_queues);
+		} while (1);
+		dev->dbs = ((void __iomem *)dev->bar) + 4096;
+		adminq->q_db = dev->dbs;
+	}
+
+	/* Deregister the admin queue's interrupt */
+	free_irq(dev->entry[0].vector, adminq);
+
+	/* Try MSI-X first, then MSI, else continue with a single vector. */
+	for (i = 0; i < nr_io_queues; i++)
+		dev->entry[i].entry = i;
+	vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues);
+	if (vecs < 0) {
+		vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32));
+		if (vecs < 0) {
+			vecs = 1;
+		} else {
+			for (i = 0; i < vecs; i++)
+				dev->entry[i].vector = i + pdev->irq;
+		}
+	}
+
+	/*
+	 * Should investigate if there's a performance win from allocating
+	 * more queues than interrupt vectors; it might allow the submission
+	 * path to scale better, even if the receive path is limited by the
+	 * number of interrupts.
+	 */
+	nr_io_queues = vecs;
+	dev->max_qid = nr_io_queues;
+
+	/* Re-register the admin queue interrupt released above. */
+	result = queue_request_irq(dev, adminq, adminq->irqname);
+	if (result) {
+		adminq->q_suspended = 1;
+		goto free_queues;
+	}
+
+	/* Free previously allocated queues that are no longer usable */
+	nvme_free_queues(dev, nr_io_queues + 1);
+	nvme_assign_io_queues(dev);
+
+	return 0;
+
+ free_queues:
+	nvme_free_queues(dev, 1);
+	return result;
+}
+
+/*
+ * Identify the controller, record its properties in @dev, then identify
+ * each namespace and register a disk for every namespace that could be
+ * allocated.
+ *
+ * Return: error value if an error occurred setting up the queues or calling
+ * Identify Device.  0 if these succeeded, even if adding some of the
+ * namespaces failed.  At the moment, these failures are silent.  TBD which
+ * failures should be reported.
+ */
+static int nvme_dev_add(struct nvme_dev *dev)
+{
+	struct pci_dev *pdev = dev->pci_dev;
+	int res;
+	unsigned nn, i;
+	struct nvme_ns *ns;
+	struct nvme_id_ctrl *ctrl;
+	struct nvme_id_ns *id_ns;
+	void *mem;
+	dma_addr_t dma_addr;
+	int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
+
+	/*
+	 * One 8192-byte DMA buffer: the first 4096 bytes receive Identify
+	 * data, the second 4096 bytes the LBA range feature data.
+	 */
+	mem = dma_alloc_coherent(&pdev->dev, 8192, &dma_addr, GFP_KERNEL);
+	if (!mem)
+		return -ENOMEM;
+
+	res = nvme_identify(dev, 0, 1, dma_addr);
+	if (res) {
+		dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res);
+		res = -EIO;
+		goto out;
+	}
+
+	ctrl = mem;
+	nn = le32_to_cpup(&ctrl->nn);
+	dev->oncs = le16_to_cpup(&ctrl->oncs);
+	dev->abort_limit = ctrl->acl + 1;
+	dev->vwc = ctrl->vwc;
+	memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
+	memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
+	memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
+	/* MDTS is a power of two in units of 2^shift bytes; store sectors. */
+	if (ctrl->mdts)
+		dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
+	/* Intel device 0x0953 supplies a stripe size in vendor byte 3. */
+	if ((pdev->vendor == PCI_VENDOR_ID_INTEL) &&
+			(pdev->device == 0x0953) && ctrl->vs[3])
+		dev->stripe_size = 1 << (ctrl->vs[3] + shift);
+
+	/* The controller data is no longer needed; reuse the buffer. */
+	id_ns = mem;
+	for (i = 1; i <= nn; i++) {
+		res = nvme_identify(dev, i, 0, dma_addr);
+		if (res)
+			continue;
+
+		if (id_ns->ncap == 0)
+			continue;
+
+		/* Without LBA range data, present an all-zero range. */
+		res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i,
+							dma_addr + 4096, NULL);
+		if (res)
+			memset(mem + 4096, 0, 4096);
+
+		ns = nvme_alloc_ns(dev, i, mem, mem + 4096);
+		if (ns)
+			list_add_tail(&ns->list, &dev->namespaces);
+	}
+	list_for_each_entry(ns, &dev->namespaces, list)
+		add_disk(ns->disk);
+	res = 0;
+
+ out:
+	dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr);
+	return res;
+}
+
+/*
+ * Enable the PCI device, claim its memory BARs, set a 64-bit DMA mask
+ * (falling back to 32-bit), map the first 8192 bytes of BAR 0 and read the
+ * queue depth and doorbell stride from the CAP register.
+ *
+ * Returns 0 on success, -ENODEV if CSTS reads back as all ones, and
+ * -ENOMEM for every other failure.  On failure everything acquired here
+ * is released again.
+ */
+static int nvme_dev_map(struct nvme_dev *dev)
+{
+	u64 cap;
+	int bars, result = -ENOMEM;
+	struct pci_dev *pdev = dev->pci_dev;
+
+	if (pci_enable_device_mem(pdev))
+		return result;
+
+	dev->entry[0].vector = pdev->irq;
+	pci_set_master(pdev);
+	bars = pci_select_bars(pdev, IORESOURCE_MEM);
+	if (pci_request_selected_regions(pdev, bars, "nvme"))
+		goto disable_pci;
+
+	if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) &&
+	    dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
+		goto disable;
+
+	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
+	if (!dev->bar)
+		goto disable;
+	if (readl(&dev->bar->csts) == -1) {
+		result = -ENODEV;
+		goto unmap;
+	}
+	cap = readq(&dev->bar->cap);
+	dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
+	dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
+	/* Doorbell registers start 4096 bytes into the BAR. */
+	dev->dbs = ((void __iomem *)dev->bar) + 4096;
+
+	return 0;
+
+ unmap:
+	iounmap(dev->bar);
+	dev->bar = NULL;
+ disable:
+	pci_release_regions(pdev);
+ disable_pci:
+	pci_disable_device(pdev);
+	return result;
+}
+
+/*
+ * Undo the PCI setup: disable MSI or MSI-X if enabled, unmap the BAR and
+ * release the regions if the BAR is mapped, then disable the device if it
+ * is still enabled.
+ */
+static void nvme_dev_unmap(struct nvme_dev *dev)
+{
+	struct pci_dev *pdev = dev->pci_dev;
+
+	if (pdev->msi_enabled)
+		pci_disable_msi(pdev);
+	else if (pdev->msix_enabled)
+		pci_disable_msix(pdev);
+
+	if (dev->bar) {
+		iounmap(dev->bar);
+		dev->bar = NULL;
+		pci_release_regions(pdev);
+	}
+
+	if (pci_is_enabled(pdev))
+		pci_disable_device(pdev);
+}
+
+/* Shared bookkeeping for deleting a controller's I/O queues asynchronously. */
+struct nvme_delq_ctx {
+	/* Task sleeping in nvme_wait_dq(); woken by nvme_put_dq(). */
+	struct task_struct *waiter;
+	/* kthread worker on which the per-queue deletion work is queued. */
+	struct kthread_worker *worker;
+	/* Outstanding deletions: raised by nvme_get_dq(), dropped by nvme_put_dq(). */
+	atomic_t refcount;
+};
+
+/*
+ * Sleep until dq->refcount reaches zero.  If a wait of ADMIN_TIMEOUT
+ * expires, or a fatal signal is pending, the controller and its admin
+ * queue are disabled, the worker task is sent SIGKILL and the worker is
+ * flushed before returning.
+ */
+static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
+{
+	/* Publish the waiter before the first refcount check. */
+	dq->waiter = current;
+	mb();
+
+	for (;;) {
+		set_current_state(TASK_KILLABLE);
+		if (!atomic_read(&dq->refcount))
+			break;
+		if (!schedule_timeout(ADMIN_TIMEOUT) ||
+					fatal_signal_pending(current)) {
+			set_current_state(TASK_RUNNING);
+
+			nvme_disable_ctrl(dev, readq(&dev->bar->cap));
+			nvme_disable_queue(dev, 0);
+
+			send_sig(SIGKILL, dq->worker->task, 1);
+			flush_kthread_worker(dq->worker);
+			return;
+		}
+	}
+	set_current_state(TASK_RUNNING);
+}
+
+/*
+ * Drop one reference on @dq and, if a waiter has been registered, wake it
+ * so it can re-check the count.
+ */
+static void nvme_put_dq(struct nvme_delq_ctx *dq)
+{
+	atomic_dec(&dq->refcount);
+	if (dq->waiter)
+		wake_up_process(dq->waiter);
+}
+
+/* Take one reference on @dq and return it for convenient assignment. */
+static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq)
+{
+	atomic_inc(&dq->refcount);
+	return dq;
+}
+
+/*
+ * Final step of deleting one queue: clear the queue and drop its
+ * reference on the deletion context stored in cmdinfo.ctx.
+ */
+static void nvme_del_queue_end(struct nvme_queue *nvmeq)
+{
+	/* Fetch the context before the queue is cleared. */
+	struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx;
+
+	nvme_clear_queue(nvmeq);
+	nvme_put_dq(dq);
+}
+
+/*
+ * Build a delete-queue admin command with @opcode for @nvmeq and submit it
+ * asynchronously.  @fn is installed as the kthread work function on
+ * nvmeq->cmdinfo.work before submission.
+ *
+ * Returns the result of nvme_submit_admin_cmd_async().
+ */
+static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
+						kthread_work_func_t fn)
+{
+	struct nvme_command c;
+
+	memset(&c, 0, sizeof(c));
+	c.delete_queue.opcode = opcode;
+	c.delete_queue.qid = cpu_to_le16(nvmeq->qid);
+
+	init_kthread_work(&nvmeq->cmdinfo.work, fn);
+	return nvme_submit_admin_cmd_async(nvmeq->dev, &c, &nvmeq->cmdinfo);
+}
+
+/*
+ * Work function installed by nvme_delete_cq(): finish the deletion of the
+ * queue that embeds @work.
+ */
+static void nvme_del_cq_work_handler(struct kthread_work *work)
+{
+	nvme_del_queue_end(container_of(work, struct nvme_queue,
+							cmdinfo.work));
+}
+
+/*
+ * Asynchronously submit a delete-completion-queue command for @nvmeq, with
+ * nvme_del_cq_work_handler() installed as the work function.
+ */
+static int nvme_delete_cq(struct nvme_queue *nvmeq)
+{
+	return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq,
+						nvme_del_cq_work_handler);
+}
+
+/*
+ * Work function installed by nvme_delete_sq().  If the recorded command
+ * status is zero, move on to deleting the completion queue.  If the status
+ * is non-zero, or the delete-cq submission fails, finish the queue's
+ * deletion here instead.
+ */
+static void nvme_del_sq_work_handler(struct kthread_work *work)
+{
+	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
+							cmdinfo.work);
+	int status = nvmeq->cmdinfo.status;
+
+	if (status || nvme_delete_cq(nvmeq))
+		nvme_del_queue_end(nvmeq);
+}
+
+/*
+ * Asynchronously submit a delete-submission-queue command for @nvmeq, with
+ * nvme_del_sq_work_handler() installed as the work function.
+ */
+static int nvme_delete_sq(struct nvme_queue *nvmeq)
+{
+	return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq,
+						nvme_del_sq_work_handler);
+}
+
+/*
+ * First step of deleting one queue, run on the deletion worker: start with
+ * the submission queue.  If that command cannot even be submitted, finish
+ * the queue's deletion immediately.
+ */
+static void nvme_del_queue_start(struct kthread_work *work)
+{
+	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
+							cmdinfo.work);
+	/* nvme_wait_dq() sends SIGKILL to this task when it gives up. */
+	allow_signal(SIGKILL);
+	if (nvme_delete_sq(nvmeq))
+		nvme_del_queue_end(nvmeq);
+}
+
+/*
+ * Delete all I/O queues (qid > 0), highest qid first.  Deletion work for
+ * each queue is handed to a temporary kthread worker, and this function
+ * waits in nvme_wait_dq() for all of it to finish.  If the worker thread
+ * cannot be created, each queue is disabled directly with
+ * nvme_disable_queue() instead.
+ */
+static void nvme_disable_io_queues(struct nvme_dev *dev)
+{
+	int i;
+	DEFINE_KTHREAD_WORKER_ONSTACK(worker);
+	struct nvme_delq_ctx dq;
+	struct task_struct *kworker_task = kthread_run(kthread_worker_fn,
+					&worker, "nvme%d", dev->instance);
+
+	if (IS_ERR(kworker_task)) {
+		dev_err(&dev->pci_dev->dev,
+			"Failed to create queue del task\n");
+		for (i = dev->queue_count - 1; i > 0; i--)
+			nvme_disable_queue(dev, i);
+		return;
+	}
+
+	dq.waiter = NULL;
+	atomic_set(&dq.refcount, 0);
+	dq.worker = &worker;
+	for (i = dev->queue_count - 1; i > 0; i--) {
+		struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
+
+		/* Non-zero from nvme_suspend_queue(): skip this queue. */
+		if (nvme_suspend_queue(nvmeq))
+			continue;
+		nvmeq->cmdinfo.ctx = nvme_get_dq(&dq);
+		nvmeq->cmdinfo.worker = dq.worker;
+		init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start);
+		queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work);
+	}
+	nvme_wait_dq(&dq, dev);
+	kthread_stop(kworker_task);
+}
+
+/*
+ * Remove the node from the device list and check
+ * for whether or not we need to stop the nvme_thread.
+ *
+ * The thread is stopped when the list becomes empty and a valid thread
+ * pointer exists.  The pointer is cleared under dev_list_lock, and
+ * kthread_stop() is called only after the lock has been dropped.
+ */
+static void nvme_dev_list_remove(struct nvme_dev *dev)
+{
+	struct task_struct *tmp = NULL;
+
+	spin_lock(&dev_list_lock);
+	list_del_init(&dev->node);
+	if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) {
+		tmp = nvme_thread;
+		nvme_thread = NULL;
+	}
+	spin_unlock(&dev_list_lock);
+
+	if (tmp)
+		kthread_stop(tmp);
+}
+
+/*
+ * Quiesce the controller and release its PCI resources.
+ *
+ * If the BAR is mapped and CSTS does not read back as all ones, the I/O
+ * queues are deleted, the controller is shut down and the admin queue is
+ * disabled.  Otherwise every queue is only suspended and cleared, without
+ * issuing any commands to the device.
+ */
+static void nvme_dev_shutdown(struct nvme_dev *dev)
+{
+	int qid;
+
+	dev->initialized = 0;
+	nvme_dev_list_remove(dev);
+
+	if (dev->bar && readl(&dev->bar->csts) != -1) {
+		nvme_disable_io_queues(dev);
+		nvme_shutdown_ctrl(dev);
+		nvme_disable_queue(dev, 0);
+	} else {
+		for (qid = dev->queue_count - 1; qid >= 0; qid--) {
+			struct nvme_queue *nvmeq = raw_nvmeq(dev, qid);
+
+			nvme_suspend_queue(nvmeq);
+			nvme_clear_queue(nvmeq);
+		}
+	}
+	nvme_dev_unmap(dev);
+}
+
+/*
+ * For every namespace of @dev: delete the gendisk if it is up, and clean
+ * up the request queue unless it is already marked dying.  The namespace
+ * structures themselves are not freed here.
+ */
+static void nvme_dev_remove(struct nvme_dev *dev)
+{
+	struct nvme_ns *ns;
+
+	list_for_each_entry(ns, &dev->namespaces, list) {
+		struct gendisk *disk = ns->disk;
+
+		if (disk->flags & GENHD_FL_UP)
+			del_gendisk(disk);
+		if (blk_queue_dying(ns->queue))
+			continue;
+		blk_cleanup_queue(ns->queue);
+	}
+}
+
+/*
+ * Create the two DMA pools used for PRP lists: one of PAGE_SIZE blocks and
+ * one of 256-byte blocks.  Returns 0 on success or -ENOMEM; on failure no
+ * pool is left allocated.
+ */
+static int nvme_setup_prp_pools(struct nvme_dev *dev)
+{
+	struct device *dmadev = &dev->pci_dev->dev;
+
+	dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
+						PAGE_SIZE, PAGE_SIZE, 0);
+	if (!dev->prp_page_pool)
+		return -ENOMEM;
+
+	/* Optimisation for I/Os between 4k and 128k */
+	dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
+						256, 256, 0);
+	if (dev->prp_small_pool)
+		return 0;
+
+	dma_pool_destroy(dev->prp_page_pool);
+	return -ENOMEM;
+}
+
+/* Destroy both PRP list DMA pools created by nvme_setup_prp_pools(). */
+static void nvme_release_prp_pools(struct nvme_dev *dev)
+{
+	dma_pool_destroy(dev->prp_page_pool);
+	dma_pool_destroy(dev->prp_small_pool);
+}
+
+static DEFINE_IDA(nvme_instance_ida);
+
+/*
+ * Allocate a controller instance number from nvme_instance_ida and store
+ * it in dev->instance.  The allocation is retried for as long as
+ * ida_get_new() reports -EAGAIN.
+ *
+ * Returns 0 on success and -ENODEV on any failure.
+ */
+static int nvme_set_instance(struct nvme_dev *dev)
+{
+	int instance, error;
+
+	for (;;) {
+		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
+			return -ENODEV;
+
+		spin_lock(&dev_list_lock);
+		error = ida_get_new(&nvme_instance_ida, &instance);
+		spin_unlock(&dev_list_lock);
+
+		if (error != -EAGAIN)
+			break;
+	}
+
+	if (error)
+		return -ENODEV;
+
+	dev->instance = instance;
+	return 0;
+}
+
+/*
+ * Return dev->instance to nvme_instance_ida, under the same dev_list_lock
+ * that nvme_set_instance() holds while allocating.
+ */
+static void nvme_release_instance(struct nvme_dev *dev)
+{
+	spin_lock(&dev_list_lock);
+	ida_remove(&nvme_instance_ida, dev->instance);
+	spin_unlock(&dev_list_lock);
+}
+
+/*
+ * Empty dev->namespaces: unlink each namespace, drop its gendisk
+ * reference and free the namespace structure.
+ */
+static void nvme_free_namespaces(struct nvme_dev *dev)
+{
+	struct nvme_ns *cur, *tmp;
+
+	list_for_each_entry_safe(cur, tmp, &dev->namespaces, list) {
+		list_del(&cur->list);
+		put_disk(cur->disk);
+		kfree(cur);
+	}
+}
+
+/*
+ * kref release callback for struct nvme_dev: free the namespaces, the
+ * per-cpu queue map, the queue and MSI-X entry arrays, and finally the
+ * device structure itself.
+ */
+static void nvme_free_dev(struct kref *kref)
+{
+	struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
+
+	nvme_free_namespaces(dev);
+	free_percpu(dev->io_queue);
+	kfree(dev->queues);
+	kfree(dev->entry);
+	kfree(dev);
+}
+
+/*
+ * Open handler for the controller's misc character device.  On entry
+ * f->private_data refers to the embedded miscdevice; it is replaced with
+ * the owning nvme_dev, on which a reference is taken.
+ */
+static int nvme_dev_open(struct inode *inode, struct file *f)
+{
+	struct miscdevice *misc = f->private_data;
+	struct nvme_dev *dev = container_of(misc, struct nvme_dev, miscdev);
+
+	kref_get(&dev->kref);
+	f->private_data = dev;
+	return 0;
+}
+
+/*
+ * Release handler for the controller's character device: drop the
+ * reference taken in nvme_dev_open().
+ */
+static int nvme_dev_release(struct inode *inode, struct file *f)
+{
+	struct nvme_dev *dev = f->private_data;
+	kref_put(&dev->kref, nvme_free_dev);
+	return 0;
+}
+
+/*
+ * ioctl handler for the controller's character device.  Only
+ * NVME_IOCTL_ADMIN_CMD is supported; any other command yields -ENOTTY.
+ */
+static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+	struct nvme_dev *dev = f->private_data;
+
+	if (cmd == NVME_IOCTL_ADMIN_CMD)
+		return nvme_user_admin_cmd(dev, (void __user *)arg);
+
+	return -ENOTTY;
+}
+
+/*
+ * File operations for the per-controller misc character device.  The same
+ * handler serves both native and compat ioctls.
+ */
+static const struct file_operations nvme_dev_fops = {
+	.owner		= THIS_MODULE,
+	.open		= nvme_dev_open,
+	.release	= nvme_dev_release,
+	.unlocked_ioctl	= nvme_dev_ioctl,
+	.compat_ioctl	= nvme_dev_ioctl,
+};
+
+/*
+ * Bring the controller up: map it, configure the admin queue, add it to
+ * dev_list (starting the polling thread if none exists) and set up the
+ * I/O queues.
+ *
+ * Returns 0 on success.  -EBUSY from nvme_setup_io_queues() is passed
+ * back without tearing the device down; any other error unwinds what was
+ * set up here.
+ */
+static int nvme_dev_start(struct nvme_dev *dev)
+{
+	int result;
+	bool start_thread = false;
+
+	result = nvme_dev_map(dev);
+	if (result)
+		return result;
+
+	result = nvme_configure_admin_queue(dev);
+	if (result)
+		goto unmap;
+
+	/* The first device on an empty list starts the polling thread. */
+	spin_lock(&dev_list_lock);
+	if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
+		start_thread = true;
+		nvme_thread = NULL;
+	}
+	list_add(&dev->node, &dev_list);
+	spin_unlock(&dev_list_lock);
+
+	if (start_thread) {
+		nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
+		wake_up(&nvme_kthread_wait);
+	} else
+		/* Another caller is starting the thread; wait for the result. */
+		wait_event_killable(nvme_kthread_wait, nvme_thread);
+
+	/* NULL here means the wait ended before the thread pointer was set. */
+	if (IS_ERR_OR_NULL(nvme_thread)) {
+		result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
+		goto disable;
+	}
+
+	result = nvme_setup_io_queues(dev);
+	if (result && result != -EBUSY)
+		goto disable;
+
+	return result;
+
+ disable:
+	nvme_disable_queue(dev, 0);
+	nvme_dev_list_remove(dev);
+ unmap:
+	nvme_dev_unmap(dev);
+	return result;
+}
+
+/*
+ * Thread function started by nvme_dev_reset() when a controller fails to
+ * resume: remove the PCI device if it still has driver data attached,
+ * then drop the reference the caller took for this thread.
+ */
+static int nvme_remove_dead_ctrl(void *arg)
+{
+	struct nvme_dev *dev = arg;
+	struct pci_dev *pdev = dev->pci_dev;
+
+	if (pci_get_drvdata(pdev))
+		pci_stop_and_remove_bus_device(pdev);
+	kref_put(&dev->kref, nvme_free_dev);
+	return 0;
+}
+
+/*
+ * Reset-work function used when nvme_dev_start() returns -EBUSY: remove
+ * the namespace disks and free every queue from qid 1 upwards.
+ */
+static void nvme_remove_disks(struct work_struct *ws)
+{
+	struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
+
+	nvme_dev_remove(dev);
+	nvme_free_queues(dev, 1);
+}
+
+/*
+ * Restart a controller and mark it initialized.  If nvme_dev_start()
+ * returns -EBUSY, work is queued to remove the disks and the resume still
+ * counts as successful.
+ *
+ * Returns 0 on success or any other nvme_dev_start() error.
+ */
+static int nvme_dev_resume(struct nvme_dev *dev)
+{
+	int ret = nvme_dev_start(dev);
+
+	if (ret == -EBUSY) {
+		spin_lock(&dev_list_lock);
+		dev->reset_workfn = nvme_remove_disks;
+		queue_work(nvme_workq, &dev->reset_work);
+		spin_unlock(&dev_list_lock);
+	} else if (ret) {
+		return ret;
+	}
+
+	dev->initialized = 1;
+	return 0;
+}
+
+/*
+ * Reset a controller by shutting it down and resuming it.  If the resume
+ * fails, a kernel thread is started to remove the device; the thread gets
+ * its own reference, which is dropped here if the thread cannot start.
+ */
+static void nvme_dev_reset(struct nvme_dev *dev)
+{
+	struct task_struct *remover;
+
+	nvme_dev_shutdown(dev);
+	if (!nvme_dev_resume(dev))
+		return;
+
+	dev_err(&dev->pci_dev->dev, "Device failed to resume\n");
+	kref_get(&dev->kref);
+	remover = kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
+							dev->instance);
+	if (!IS_ERR(remover))
+		return;
+
+	dev_err(&dev->pci_dev->dev,
+		"Failed to start controller remove task\n");
+	kref_put(&dev->kref, nvme_free_dev);
+}
+
+/* Reset-work function: reset the controller that embeds @ws. */
+static void nvme_reset_failed_dev(struct work_struct *ws)
+{
+	struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
+	nvme_dev_reset(dev);
+}
+
+/*
+ * Fixed entry point for dev->reset_work: dispatch to whichever function
+ * is currently stored in dev->reset_workfn.
+ */
+static void nvme_reset_workfn(struct work_struct *work)
+{
+	struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
+	dev->reset_workfn(work);
+}
+
+/*
+ * PCI probe: allocate and initialise the nvme_dev, start the controller,
+ * add its namespaces and register the misc character device.
+ *
+ * If nvme_dev_start() returns -EBUSY, namespace discovery is skipped but
+ * the character device is still created.  Returns 0 on success or a
+ * negative errno, with everything set up so far unwound.
+ */
+static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	int result = -ENOMEM;
+	struct nvme_dev *dev;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+	/* One interrupt entry per possible cpu. */
+	dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
+								GFP_KERNEL);
+	if (!dev->entry)
+		goto free;
+	/* One queue slot per possible cpu, plus one for the admin queue. */
+	dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
+								GFP_KERNEL);
+	if (!dev->queues)
+		goto free;
+	dev->io_queue = alloc_percpu(unsigned short);
+	if (!dev->io_queue)
+		goto free;
+
+	INIT_LIST_HEAD(&dev->namespaces);
+	dev->reset_workfn = nvme_reset_failed_dev;
+	INIT_WORK(&dev->reset_work, nvme_reset_workfn);
+	INIT_WORK(&dev->cpu_work, nvme_cpu_workfn);
+	dev->pci_dev = pdev;
+	pci_set_drvdata(pdev, dev);
+	result = nvme_set_instance(dev);
+	if (result)
+		goto free;
+
+	result = nvme_setup_prp_pools(dev);
+	if (result)
+		goto release;
+
+	kref_init(&dev->kref);
+	result = nvme_dev_start(dev);
+	if (result) {
+		/* -EBUSY: skip the namespaces but keep the char device. */
+		if (result == -EBUSY)
+			goto create_cdev;
+		goto release_pools;
+	}
+
+	result = nvme_dev_add(dev);
+	if (result)
+		goto shutdown;
+
+ create_cdev:
+	scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance);
+	dev->miscdev.minor = MISC_DYNAMIC_MINOR;
+	dev->miscdev.parent = &pdev->dev;
+	dev->miscdev.name = dev->name;
+	dev->miscdev.fops = &nvme_dev_fops;
+	result = misc_register(&dev->miscdev);
+	if (result)
+		goto remove;
+
+	dev->initialized = 1;
+	return 0;
+
+ remove:
+	nvme_dev_remove(dev);
+	nvme_free_namespaces(dev);
+ shutdown:
+	nvme_dev_shutdown(dev);
+ release_pools:
+	nvme_free_queues(dev, 0);
+	nvme_release_prp_pools(dev);
+ release:
+	nvme_release_instance(dev);
+ free:
+	free_percpu(dev->io_queue);
+	kfree(dev->queues);
+	kfree(dev->entry);
+	kfree(dev);
+	return result;
+}
+
+/*
+ * reset_notify callback of nvme_err_handler: shut the controller down
+ * when @prepare is true, resume it otherwise.  The callback returns void,
+ * so a failure from nvme_dev_resume() cannot be reported to the caller.
+ */
+static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
+{
+	struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+	if (prepare)
+		nvme_dev_shutdown(dev);
+	else
+		nvme_dev_resume(dev);
+}
+
+/* PCI shutdown callback: quiesce the controller bound to @pdev. */
+static void nvme_shutdown(struct pci_dev *pdev)
+{
+	nvme_dev_shutdown(pci_get_drvdata(pdev));
+}
+
+/*
+ * PCI remove callback: take the device off dev_list, wait for pending
+ * reset and cpu work, unregister the character device and the disks, shut
+ * the controller down, free its queues and pools, and drop the probe-time
+ * reference.  The nvme_dev itself is freed once the last reference is
+ * dropped.
+ */
+static void nvme_remove(struct pci_dev *pdev)
+{
+	struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+	spin_lock(&dev_list_lock);
+	list_del_init(&dev->node);
+	spin_unlock(&dev_list_lock);
+
+	/* nvme_remove_dead_ctrl() checks drvdata before removing the device. */
+	pci_set_drvdata(pdev, NULL);
+	flush_work(&dev->reset_work);
+	flush_work(&dev->cpu_work);
+	misc_deregister(&dev->miscdev);
+	nvme_dev_remove(dev);
+	nvme_dev_shutdown(dev);
+	nvme_free_queues(dev, 0);
+	/* Wait for outstanding RCU callbacks before releasing the rest. */
+	rcu_barrier();
+	nvme_release_instance(dev);
+	nvme_release_prp_pools(dev);
+	kref_put(&dev->kref, nvme_free_dev);
+}
+
+/*
+ * These functions are yet to be implemented.  Until they are, each name
+ * expands to NULL so that the pci_error_handlers table can reference them.
+ */
+#define nvme_error_detected NULL
+#define nvme_dump_registers NULL
+#define nvme_link_reset NULL
+#define nvme_slot_reset NULL
+#define nvme_error_resume NULL
+
+#ifdef CONFIG_PM_SLEEP
+/* System-sleep suspend callback: shut the controller down.  Returns 0. */
+static int nvme_suspend(struct device *dev)
+{
+	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
+
+	nvme_dev_shutdown(ndev);
+	return 0;
+}
+
+/*
+ * System-sleep resume callback: restart the controller.  If that fails
+ * and no reset work is already busy, queue a controller reset.  Always
+ * returns 0.
+ */
+static int nvme_resume(struct device *dev)
+{
+	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
+
+	if (!nvme_dev_resume(ndev))
+		return 0;
+	if (work_busy(&ndev->reset_work))
+		return 0;
+
+	ndev->reset_workfn = nvme_reset_failed_dev;
+	queue_work(nvme_workq, &ndev->reset_work);
+	return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);
+
+/*
+ * PCI error handlers.  Only reset_notify has an implementation; the other
+ * entries currently expand to NULL placeholders.
+ */
+static const struct pci_error_handlers nvme_err_handler = {
+	.error_detected	= nvme_error_detected,
+	.mmio_enabled	= nvme_dump_registers,
+	.link_reset	= nvme_link_reset,
+	.slot_reset	= nvme_slot_reset,
+	.resume		= nvme_error_resume,
+	.reset_notify	= nvme_reset_notify,
+};
+
+/* Move to pci_ids.h later */
+#define PCI_CLASS_STORAGE_EXPRESS	0x010802
+
+/*
+ * Match devices by PCI class code: all 24 bits must equal
+ * PCI_CLASS_STORAGE_EXPRESS.
+ */
+static const struct pci_device_id nvme_id_table[] = {
+	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, nvme_id_table);
+
+/*
+ * PCI driver descriptor: binds the probe, remove, shutdown, power
+ * management and error-handling entry points to the id table.
+ */
+static struct pci_driver nvme_driver = {
+	.name		= "nvme",
+	.id_table	= nvme_id_table,
+	.probe		= nvme_probe,
+	.remove		= nvme_remove,
+	.shutdown	= nvme_shutdown,
+	.driver		= {
+		.pm	= &nvme_dev_pm_ops,
+	},
+	.err_handler	= &nvme_err_handler,
+};
+
+/*
+ * Module init: create the driver workqueue, register the block major, the
+ * CPU hotplug notifier and the PCI driver, in that order.  On failure the
+ * steps already completed are undone in reverse.
+ */
+static int __init nvme_init(void)
+{
+	int result;
+
+	init_waitqueue_head(&nvme_kthread_wait);
+
+	nvme_workq = create_singlethread_workqueue("nvme");
+	if (!nvme_workq)
+		return -ENOMEM;
+
+	/* A positive return is adopted as the driver's major number. */
+	result = register_blkdev(nvme_major, "nvme");
+	if (result < 0)
+		goto kill_workq;
+	else if (result > 0)
+		nvme_major = result;
+
+	nvme_nb.notifier_call = &nvme_cpu_notify;
+	result = register_hotcpu_notifier(&nvme_nb);
+	if (result)
+		goto unregister_blkdev;
+
+	result = pci_register_driver(&nvme_driver);
+	if (result)
+		goto unregister_hotcpu;
+	return 0;
+
+ unregister_hotcpu:
+	unregister_hotcpu_notifier(&nvme_nb);
+ unregister_blkdev:
+	unregister_blkdev(nvme_major, "nvme");
+ kill_workq:
+	destroy_workqueue(nvme_workq);
+	return result;
+}
+
+/*
+ * Module exit: undo nvme_init() in reverse order, then assert that the
+ * polling thread is no longer running.
+ */
+static void __exit nvme_exit(void)
+{
+	pci_unregister_driver(&nvme_driver);
+	unregister_hotcpu_notifier(&nvme_nb);
+	unregister_blkdev(nvme_major, "nvme");
+	destroy_workqueue(nvme_workq);
+	/* A valid thread pointer at this point would be a driver bug. */
+	BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
+	_nvme_check_size();
+}
+
+MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("0.9");
+module_init(nvme_init);
+module_exit(nvme_exit);
diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c
new file mode 100644
index 00000000000..a4cd6d691c6
--- /dev/null
+++ b/drivers/block/nvme-scsi.c
@@ -0,0 +1,3168 @@
+/*
+ * NVM Express device driver
+ * Copyright (c) 2011-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+/*
+ * Refer to the SCSI-NVMe Translation spec for details on how
+ * each command is translated.
+ */
+
+#include <linux/nvme.h>
+#include <linux/bio.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/compat.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kdev_t.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/poison.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <scsi/sg.h>
+#include <scsi/scsi.h>
+
+
+static int sg_version_num = 30534;	/* 2 digits for each component, i.e. 3.05.34 */
+
+#define SNTI_TRANSLATION_SUCCESS			0	/* success value of the translation helpers */
+#define SNTI_INTERNAL_ERROR				1
+
+/* VPD Page Codes (same values as the INQ_*_PAGE defines further down) */
+#define VPD_SUPPORTED_PAGES				0x00
+#define VPD_SERIAL_NUMBER				0x80
+#define VPD_DEVICE_IDENTIFIERS				0x83
+#define VPD_EXTENDED_INQUIRY				0x86
+#define VPD_BLOCK_DEV_CHARACTERISTICS			0xB1
+
+/* CDB offsets */
+#define REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET		6
+#define REPORT_LUNS_SR_OFFSET				2
+#define READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET		10
+#define REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET		4
+#define REQUEST_SENSE_DESC_OFFSET			1
+#define REQUEST_SENSE_DESC_MASK				0x01
+#define DESCRIPTOR_FORMAT_SENSE_DATA_TYPE		1
+#define INQUIRY_EVPD_BYTE_OFFSET			1
+#define INQUIRY_PAGE_CODE_BYTE_OFFSET			2
+#define INQUIRY_EVPD_BIT_MASK				1
+#define INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET		3
+#define START_STOP_UNIT_CDB_IMMED_OFFSET		1
+#define START_STOP_UNIT_CDB_IMMED_MASK			0x1
+#define START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET	3
+#define START_STOP_UNIT_CDB_POWER_COND_MOD_MASK		0xF
+#define START_STOP_UNIT_CDB_POWER_COND_OFFSET		4
+#define START_STOP_UNIT_CDB_POWER_COND_MASK		0xF0
+#define START_STOP_UNIT_CDB_NO_FLUSH_OFFSET		4
+#define START_STOP_UNIT_CDB_NO_FLUSH_MASK		0x4
+#define START_STOP_UNIT_CDB_START_OFFSET		4
+#define START_STOP_UNIT_CDB_START_MASK			0x1
+#define WRITE_BUFFER_CDB_MODE_OFFSET			1
+#define WRITE_BUFFER_CDB_MODE_MASK			0x1F
+#define WRITE_BUFFER_CDB_BUFFER_ID_OFFSET		2
+#define WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET		3
+#define WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET	6
+#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET		1
+#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK		0xC0
+#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT		6
+#define FORMAT_UNIT_CDB_LONG_LIST_OFFSET		1
+#define FORMAT_UNIT_CDB_LONG_LIST_MASK			0x20
+#define FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET		1
+#define FORMAT_UNIT_CDB_FORMAT_DATA_MASK		0x10
+#define FORMAT_UNIT_SHORT_PARM_LIST_LEN			4
+#define FORMAT_UNIT_LONG_PARM_LIST_LEN			8
+#define FORMAT_UNIT_PROT_INT_OFFSET			3
+#define FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET		0
+#define FORMAT_UNIT_PROT_FIELD_USAGE_MASK		0x07
+#define UNMAP_CDB_PARAM_LIST_LENGTH_OFFSET		7
+
+/* Misc. defines */
+#define NIBBLE_SHIFT					4
+#define FIXED_SENSE_DATA				0x70
+#define DESC_FORMAT_SENSE_DATA				0x72
+#define FIXED_SENSE_DATA_ADD_LENGTH			10
+#define LUN_ENTRY_SIZE					8
+#define LUN_DATA_HEADER_SIZE				8
+#define ALL_LUNS_RETURNED				0x02
+#define ALL_WELL_KNOWN_LUNS_RETURNED			0x01
+#define RESTRICTED_LUNS_RETURNED			0x00
+#define NVME_POWER_STATE_START_VALID			0x00
+#define NVME_POWER_STATE_ACTIVE				0x01
+#define NVME_POWER_STATE_IDLE				0x02
+#define NVME_POWER_STATE_STANDBY			0x03
+#define NVME_POWER_STATE_LU_CONTROL			0x07
+#define POWER_STATE_0					0
+#define POWER_STATE_1					1
+#define POWER_STATE_2					2
+#define POWER_STATE_3					3
+#define DOWNLOAD_SAVE_ACTIVATE				0x05
+#define DOWNLOAD_SAVE_DEFER_ACTIVATE			0x0E
+#define ACTIVATE_DEFERRED_MICROCODE			0x0F
+#define FORMAT_UNIT_IMMED_MASK				0x2
+#define FORMAT_UNIT_IMMED_OFFSET			1
+#define KELVIN_TEMP_FACTOR				273
+#define FIXED_FMT_SENSE_DATA_SIZE			18
+#define DESC_FMT_SENSE_DATA_SIZE			8
+
+/* SCSI/NVMe defines and bit masks */
+#define INQ_STANDARD_INQUIRY_PAGE			0x00
+#define INQ_SUPPORTED_VPD_PAGES_PAGE			0x00
+#define INQ_UNIT_SERIAL_NUMBER_PAGE			0x80
+#define INQ_DEVICE_IDENTIFICATION_PAGE			0x83
+#define INQ_EXTENDED_INQUIRY_DATA_PAGE			0x86
+#define INQ_BDEV_CHARACTERISTICS_PAGE			0xB1
+#define INQ_SERIAL_NUMBER_LENGTH			0x14	/* bytes copied from dev->serial */
+#define INQ_NUM_SUPPORTED_VPD_PAGES			5	/* page codes listed in the Supported VPD Pages page */
+#define VERSION_SPC_4					0x06
+#define ACA_UNSUPPORTED					0
+#define STANDARD_INQUIRY_LENGTH				36	/* also bounds the VPD pages built in the caller's buffer */
+#define ADDITIONAL_STD_INQ_LENGTH			31	/* stored in byte 4 of the standard INQUIRY response */
+#define EXTENDED_INQUIRY_DATA_PAGE_LENGTH		0x3C	/* also used as the response buffer size */
+#define RESERVED_FIELD					0
+
+/* SCSI READ/WRITE Defines */
+#define IO_CDB_WP_MASK					0xE0
+#define IO_CDB_WP_SHIFT					5
+#define IO_CDB_FUA_MASK					0x8
+#define IO_6_CDB_LBA_OFFSET				0
+#define IO_6_CDB_LBA_MASK				0x001FFFFF
+#define IO_6_CDB_TX_LEN_OFFSET				4
+#define IO_6_DEFAULT_TX_LEN				256
+#define IO_10_CDB_LBA_OFFSET				2
+#define IO_10_CDB_TX_LEN_OFFSET				7
+#define IO_10_CDB_WP_OFFSET				1
+#define IO_10_CDB_FUA_OFFSET				1
+#define IO_12_CDB_LBA_OFFSET				2
+#define IO_12_CDB_TX_LEN_OFFSET				6
+#define IO_12_CDB_WP_OFFSET				1
+#define IO_12_CDB_FUA_OFFSET				1
+#define IO_16_CDB_FUA_OFFSET				1
+#define IO_16_CDB_WP_OFFSET				1
+#define IO_16_CDB_LBA_OFFSET				2
+#define IO_16_CDB_TX_LEN_OFFSET				10
+
+/* Mode Sense/Select defines */
+#define MODE_PAGE_INFO_EXCEP				0x1C
+#define MODE_PAGE_CACHING				0x08
+#define MODE_PAGE_CONTROL				0x0A
+#define MODE_PAGE_POWER_CONDITION			0x1A
+#define MODE_PAGE_RETURN_ALL				0x3F
+#define MODE_PAGE_BLK_DES_LEN				0x08
+#define MODE_PAGE_LLBAA_BLK_DES_LEN			0x10
+#define MODE_PAGE_CACHING_LEN				0x14
+#define MODE_PAGE_CONTROL_LEN				0x0C
+#define MODE_PAGE_POW_CND_LEN				0x28
+#define MODE_PAGE_INF_EXC_LEN				0x0C
+#define MODE_PAGE_ALL_LEN				0x54
+#define MODE_SENSE6_MPH_SIZE				4
+#define MODE_SENSE6_ALLOC_LEN_OFFSET			4
+#define MODE_SENSE_PAGE_CONTROL_OFFSET			2
+#define MODE_SENSE_PAGE_CONTROL_MASK			0xC0
+#define MODE_SENSE_PAGE_CODE_OFFSET			2
+#define MODE_SENSE_PAGE_CODE_MASK			0x3F
+#define MODE_SENSE_LLBAA_OFFSET				1
+#define MODE_SENSE_LLBAA_MASK				0x10
+#define MODE_SENSE_LLBAA_SHIFT				4
+#define MODE_SENSE_DBD_OFFSET				1
+#define MODE_SENSE_DBD_MASK				8
+#define MODE_SENSE_DBD_SHIFT				3
+#define MODE_SENSE10_MPH_SIZE				8
+#define MODE_SENSE10_ALLOC_LEN_OFFSET			7
+#define MODE_SELECT_CDB_PAGE_FORMAT_OFFSET		1
+#define MODE_SELECT_CDB_SAVE_PAGES_OFFSET		1
+#define MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET	4
+#define MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET	7
+#define MODE_SELECT_CDB_PAGE_FORMAT_MASK		0x10
+#define MODE_SELECT_CDB_SAVE_PAGES_MASK			0x1
+#define MODE_SELECT_6_BD_OFFSET				3
+#define MODE_SELECT_10_BD_OFFSET			6
+#define MODE_SELECT_10_LLBAA_OFFSET			4
+#define MODE_SELECT_10_LLBAA_MASK			1
+#define MODE_SELECT_6_MPH_SIZE				4
+#define MODE_SELECT_10_MPH_SIZE				8
+#define CACHING_MODE_PAGE_WCE_MASK			0x04
+#define MODE_SENSE_BLK_DESC_ENABLED			0
+#define MODE_SENSE_BLK_DESC_COUNT			1
+#define MODE_SELECT_PAGE_CODE_MASK			0x3F
+#define SHORT_DESC_BLOCK				8
+#define LONG_DESC_BLOCK					16
+#define MODE_PAGE_POW_CND_LEN_FIELD			0x26
+#define MODE_PAGE_INF_EXC_LEN_FIELD			0x0A
+#define MODE_PAGE_CACHING_LEN_FIELD			0x12
+#define MODE_PAGE_CONTROL_LEN_FIELD			0x0A
+#define MODE_SENSE_PC_CURRENT_VALUES			0
+
+/* Log Sense defines */
+#define LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE		0x00
+#define LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH		0x07	/* buffer size: 4 header bytes + 3 page codes */
+#define LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE		0x2F
+#define LOG_PAGE_TEMPERATURE_PAGE			0x0D
+#define LOG_SENSE_CDB_SP_OFFSET				1
+#define LOG_SENSE_CDB_SP_NOT_ENABLED			0
+#define LOG_SENSE_CDB_PC_OFFSET				2
+#define LOG_SENSE_CDB_PC_MASK				0xC0
+#define LOG_SENSE_CDB_PC_SHIFT				6
+#define LOG_SENSE_CDB_PC_CUMULATIVE_VALUES		1
+#define LOG_SENSE_CDB_PAGE_CODE_MASK			0x3F
+#define LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET		7
+#define REMAINING_INFO_EXCP_PAGE_LENGTH			0x8	/* value written to the page-length byte */
+#define LOG_INFO_EXCP_PAGE_LENGTH			0xC	/* response buffer size for the page */
+#define REMAINING_TEMP_PAGE_LENGTH			0xC
+#define LOG_TEMP_PAGE_LENGTH				0x10	/* response buffer size for the page */
+#define LOG_TEMP_UNKNOWN				0xFF	/* reported when the SMART log cannot be read */
+#define SUPPORTED_LOG_PAGES_PAGE_LENGTH			0x3	/* value written to the page-length byte */
+
+/* Read Capacity defines */
+#define READ_CAP_10_RESP_SIZE				8
+#define READ_CAP_16_RESP_SIZE				32
+
+/* NVMe Namespace and Command Defines */
+#define BYTES_TO_DWORDS					4
+#define NVME_MAX_FIRMWARE_SLOT				7
+
+/* Report LUNs defines */
+#define REPORT_LUNS_FIRST_LUN_OFFSET			8
+
+/* SCSI ADDITIONAL SENSE Codes */
+/* Codes that share a value are told apart by the qualifier paired with them. */
+#define SCSI_ASC_NO_SENSE				0x00
+#define SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT		0x03
+#define SCSI_ASC_LUN_NOT_READY				0x04
+#define SCSI_ASC_WARNING				0x0B
+#define SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED		0x10	/* with ASCQ 0x01 */
+#define SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED		0x10	/* with ASCQ 0x02 */
+#define SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED		0x10	/* with ASCQ 0x03 */
+#define SCSI_ASC_UNRECOVERED_READ_ERROR			0x11
+#define SCSI_ASC_MISCOMPARE_DURING_VERIFY		0x1D
+#define SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID		0x20	/* with SCSI_ASCQ_INVALID_LUN_ID */
+#define SCSI_ASC_ILLEGAL_COMMAND			0x20	/* with SCSI_ASCQ_CAUSE_NOT_REPORTABLE */
+#define SCSI_ASC_ILLEGAL_BLOCK				0x21
+#define SCSI_ASC_INVALID_CDB				0x24
+#define SCSI_ASC_INVALID_LUN				0x25
+#define SCSI_ASC_INVALID_PARAMETER			0x26
+#define SCSI_ASC_FORMAT_COMMAND_FAILED			0x31
+#define SCSI_ASC_INTERNAL_TARGET_FAILURE		0x44
+
+/* SCSI ADDITIONAL SENSE Code Qualifiers */
+/* Paired with the sense codes above by nvme_trans_status_code(). */
+#define SCSI_ASCQ_CAUSE_NOT_REPORTABLE			0x00
+#define SCSI_ASCQ_FORMAT_COMMAND_FAILED			0x01
+#define SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED		0x01
+#define SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED		0x02
+#define SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED		0x03
+#define SCSI_ASCQ_FORMAT_IN_PROGRESS			0x04
+#define SCSI_ASCQ_POWER_LOSS_EXPECTED			0x08
+#define SCSI_ASCQ_INVALID_LUN_ID			0x09
+
+/*
+ * DEVICE_SPECIFIC_PARAMETER in the mode parameter header (see sbc2r16); set it
+ * to 0x10 to advertise DPOFUA support.
+ */
+#define DEVICE_SPECIFIC_PARAMETER			0
+#define VPD_ID_DESCRIPTOR_LENGTH sizeof(VPD_IDENTIFICATION_DESCRIPTOR)
+
+/*
+ * MACROs to extract big-endian fields from a CDB (an array of u8).
+ * GET_U32_FROM_CDB widens each byte to u32 before shifting: a u8 promotes to
+ * int, so "<< 24" on a byte >= 0x80 produced a negative value that
+ * sign-extended when stored in a u64 (e.g. an LBA).  Arguments are
+ * parenthesized so expressions can be passed safely.
+ */
+#define GET_OPCODE(cdb)		((cdb)[0])
+
+#define GET_U8_FROM_CDB(cdb, index) ((cdb)[(index)] << 0)
+
+#define GET_U16_FROM_CDB(cdb, index) \
+(((cdb)[(index)] << 8) | ((cdb)[(index) + 1] << 0))
+
+#define GET_U24_FROM_CDB(cdb, index) (((cdb)[(index)] << 16) | \
+((cdb)[(index) + 1] <<  8) | ((cdb)[(index) + 2] <<  0))
+
+#define GET_U32_FROM_CDB(cdb, index) ((((u32)(cdb)[(index)]) << 24) | \
+(((u32)(cdb)[(index) + 1]) << 16) | (((u32)(cdb)[(index) + 2]) <<  8) | \
+(((u32)(cdb)[(index) + 3]) <<  0))
+#define GET_U64_FROM_CDB(cdb, index) ((((u64)(cdb)[(index)]) << 56) | \
+(((u64)(cdb)[(index) + 1]) << 48) | (((u64)(cdb)[(index) + 2]) << 40) | \
+(((u64)(cdb)[(index) + 3]) << 32) | (((u64)(cdb)[(index) + 4]) << 24) | \
+(((u64)(cdb)[(index) + 5]) << 16) | (((u64)(cdb)[(index) + 6]) <<  8) | \
+(((u64)(cdb)[(index) + 7]) <<  0))
+
+/* Inquiry Helper Macros: EVPD flag as 0/1, page code byte, 16-bit allocation length */
+#define GET_INQ_EVPD_BIT(cdb) \
+((GET_U8_FROM_CDB(cdb, INQUIRY_EVPD_BYTE_OFFSET) &		\
+INQUIRY_EVPD_BIT_MASK) ? 1 : 0)
+
+#define GET_INQ_PAGE_CODE(cdb)					\
+(GET_U8_FROM_CDB(cdb, INQUIRY_PAGE_CODE_BYTE_OFFSET))
+
+#define GET_INQ_ALLOC_LENGTH(cdb)				\
+(GET_U16_FROM_CDB(cdb, INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET))
+
+/* Report LUNs Helper Macros: 32-bit allocation length read from the CDB */
+#define GET_REPORT_LUNS_ALLOC_LENGTH(cdb)			\
+(GET_U32_FROM_CDB(cdb, REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET))
+
+/* Read Capacity Helper Macros: 32-bit allocation length; opcode + service action test (0/1) */
+#define GET_READ_CAP_16_ALLOC_LENGTH(cdb)			\
+(GET_U32_FROM_CDB(cdb, READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET))
+
+#define IS_READ_CAP_16(cdb)					\
+((cdb[0] == SERVICE_ACTION_IN && cdb[1] == SAI_READ_CAPACITY_16) ? 1 : 0)
+
+/* Request Sense Helper Macros: 8-bit allocation length read from the CDB */
+#define GET_REQUEST_SENSE_ALLOC_LENGTH(cdb)			\
+(GET_U8_FROM_CDB(cdb, REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET))
+
+/* Mode Sense Helper Macros: DBD and LLBAA flags shifted down to bit 0; header size by CDB size */
+#define GET_MODE_SENSE_DBD(cdb)					\
+((GET_U8_FROM_CDB(cdb, MODE_SENSE_DBD_OFFSET) & MODE_SENSE_DBD_MASK) >>	\
+MODE_SENSE_DBD_SHIFT)
+
+#define GET_MODE_SENSE_LLBAA(cdb)				\
+((GET_U8_FROM_CDB(cdb, MODE_SENSE_LLBAA_OFFSET) &		\
+MODE_SENSE_LLBAA_MASK) >> MODE_SENSE_LLBAA_SHIFT)
+
+#define GET_MODE_SENSE_MPH_SIZE(cdb10)				\
+(cdb10 ? MODE_SENSE10_MPH_SIZE : MODE_SENSE6_MPH_SIZE)
+
+
+/* Struct to gather data that needs to be extracted from a SCSI CDB.
+   Not conforming to any particular CDB variant, but compatible with all. */
+
+struct nvme_trans_io_cdb {
+	u8 fua;		/* NOTE(review): presumably the FUA flag (cf. IO_CDB_FUA_MASK) -- confirm */
+	u8 prot_info;	/* NOTE(review): presumably the protect bits (cf. IO_CDB_WP_MASK) -- confirm */
+	u64 lba;	/* 64 bits, so any of the GET_U*_FROM_CDB results fits */
+	u32 xfer_len;	/* NOTE(review): presumably the transfer length field -- confirm units */
+};
+
+
+/* Internal Helper Functions */
+
+
+/*
+ * Copy n bytes from the kernel buffer 'from' to the user memory described by
+ * hdr: either one flat buffer at hdr->dxferp, or, when hdr->iovec_count is
+ * greater than zero, an array of struct sg_iovec at hdr->dxferp that is
+ * filled entry by entry.  Returns SNTI_TRANSLATION_SUCCESS or -EFAULT.
+ */
+static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from,
+								unsigned long n)
+{
+	struct sg_iovec iov;
+	void *src = from;
+	size_t left = n;
+	size_t chunk;
+	int idx;
+
+	/* Flat buffer: a single copy does it. */
+	if (hdr->iovec_count <= 0) {
+		if (copy_to_user(hdr->dxferp, from, n))
+			return -EFAULT;
+		return SNTI_TRANSLATION_SUCCESS;
+	}
+
+	for (idx = 0; idx < hdr->iovec_count; idx++) {
+		/* Fetch the next iovec descriptor from user space. */
+		if (copy_from_user(&iov, hdr->dxferp +
+					idx * sizeof(struct sg_iovec),
+					sizeof(struct sg_iovec)))
+			return -EFAULT;
+
+		chunk = min(left, iov.iov_len);
+		if (copy_to_user(iov.iov_base, src, chunk))
+			return -EFAULT;
+
+		src += chunk;
+		left -= chunk;
+		if (left == 0)
+			break;
+	}
+	return SNTI_TRANSLATION_SUCCESS;
+}
+
+/*
+ * Copy n bytes from the user memory described by hdr into the kernel buffer
+ * 'to': either one flat buffer at hdr->dxferp, or, when hdr->iovec_count is
+ * greater than zero, an array of struct sg_iovec at hdr->dxferp that is
+ * drained entry by entry.  Returns SNTI_TRANSLATION_SUCCESS or -EFAULT.
+ */
+static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to,
+								unsigned long n)
+{
+	struct sg_iovec iov;
+	void *dst = to;
+	size_t left = n;
+	size_t chunk;
+	int idx;
+
+	/* Flat buffer: a single copy does it. */
+	if (hdr->iovec_count <= 0) {
+		if (copy_from_user(to, hdr->dxferp, n))
+			return -EFAULT;
+		return SNTI_TRANSLATION_SUCCESS;
+	}
+
+	for (idx = 0; idx < hdr->iovec_count; idx++) {
+		/* Fetch the next iovec descriptor from user space. */
+		if (copy_from_user(&iov, hdr->dxferp +
+					idx * sizeof(struct sg_iovec),
+					sizeof(struct sg_iovec)))
+			return -EFAULT;
+
+		chunk = min(left, iov.iov_len);
+		if (copy_from_user(dst, iov.iov_base, chunk))
+			return -EFAULT;
+
+		dst += chunk;
+		left -= chunk;
+		if (left == 0)
+			break;
+	}
+
+	return SNTI_TRANSLATION_SUCCESS;
+}
+
+/*
+ * Status/Sense Buffer Writeback: store the SCSI status in hdr and, for a
+ * non-good status, copy descriptor-format sense data out to hdr->sbp.
+ */
+static int nvme_trans_completion(struct sg_io_hdr *hdr, u8 status, u8 sense_key,
+				 u8 asc, u8 ascq)
+{
+	u8 sense[DESC_FMT_SENSE_DATA_SIZE];
+	u8 sense_len;
+
+	hdr->host_status = DID_OK;
+	hdr->driver_status = DRIVER_OK;
+
+	if (scsi_status_is_good(status)) {
+		hdr->status = SAM_STAT_GOOD;
+		hdr->masked_status = GOOD;
+		hdr->sb_len_wr = 0;
+		return SNTI_TRANSLATION_SUCCESS;
+	}
+
+	hdr->status = status;
+	hdr->masked_status = status >> 1;
+
+	/* Only the first four sense bytes are populated; the rest stay zero. */
+	memset(sense, 0, sizeof(sense));
+	sense[0] = DESC_FORMAT_SENSE_DATA;
+	sense[1] = sense_key;
+	sense[2] = asc;
+	sense[3] = ascq;
+	sense_len = min_t(u8, hdr->mx_sb_len, DESC_FMT_SENSE_DATA_SIZE);
+	hdr->sb_len_wr = sense_len;
+	if (copy_to_user(hdr->sbp, sense, sense_len) > 0)
+		return -EFAULT;
+	return SNTI_TRANSLATION_SUCCESS;
+}
+
+/*
+ * Translate an NVMe completion status into a SCSI status plus sense data and
+ * write the result back through nvme_trans_completion().
+ *
+ * A negative nvme_sc is a Linux error: it is returned unchanged and hdr is
+ * left untouched.  Otherwise the return value is whatever
+ * nvme_trans_completion() returns: 0, or -EFAULT when the sense data cannot
+ * be copied to user space.
+ */
+static int nvme_trans_status_code(struct sg_io_hdr *hdr, int nvme_sc)
+{
+	u8 sam_status, key, add_sense, qualifier;
+
+	/* For non-nvme (Linux) errors, simply return the error code */
+	if (nvme_sc < 0)
+		return nvme_sc;
+
+	/* Mask DNR, More, and reserved fields */
+	nvme_sc &= 0x7FF;
+
+	switch (nvme_sc) {
+	/* Generic Command Status */
+	case NVME_SC_SUCCESS:
+		sam_status = SAM_STAT_GOOD;
+		key = NO_SENSE;
+		add_sense = SCSI_ASC_NO_SENSE;
+		qualifier = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+		break;
+	case NVME_SC_INVALID_OPCODE:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = ILLEGAL_REQUEST;
+		add_sense = SCSI_ASC_ILLEGAL_COMMAND;
+		qualifier = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+		break;
+	case NVME_SC_INVALID_FIELD:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = ILLEGAL_REQUEST;
+		add_sense = SCSI_ASC_INVALID_CDB;
+		qualifier = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+		break;
+	case NVME_SC_DATA_XFER_ERROR:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = MEDIUM_ERROR;
+		add_sense = SCSI_ASC_NO_SENSE;
+		qualifier = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+		break;
+	case NVME_SC_POWER_LOSS:
+		sam_status = SAM_STAT_TASK_ABORTED;
+		key = ABORTED_COMMAND;
+		add_sense = SCSI_ASC_WARNING;
+		qualifier = SCSI_ASCQ_POWER_LOSS_EXPECTED;
+		break;
+	case NVME_SC_INTERNAL:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = HARDWARE_ERROR;
+		add_sense = SCSI_ASC_INTERNAL_TARGET_FAILURE;
+		qualifier = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+		break;
+	/* Aborts and fused-command failures: TASK ABORTED, no further sense */
+	case NVME_SC_ABORT_REQ:
+	case NVME_SC_ABORT_QUEUE:
+		sam_status = SAM_STAT_TASK_ABORTED;
+		key = ABORTED_COMMAND;
+		add_sense = SCSI_ASC_NO_SENSE;
+		qualifier = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+		break;
+	case NVME_SC_FUSED_FAIL:
+	case NVME_SC_FUSED_MISSING:
+		sam_status = SAM_STAT_TASK_ABORTED;
+		key = ABORTED_COMMAND;
+		add_sense = SCSI_ASC_NO_SENSE;
+		qualifier = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+		break;
+	case NVME_SC_INVALID_NS:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = ILLEGAL_REQUEST;
+		add_sense = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID;
+		qualifier = SCSI_ASCQ_INVALID_LUN_ID;
+		break;
+	case NVME_SC_LBA_RANGE:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = ILLEGAL_REQUEST;
+		add_sense = SCSI_ASC_ILLEGAL_BLOCK;
+		qualifier = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+		break;
+	case NVME_SC_CAP_EXCEEDED:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = MEDIUM_ERROR;
+		add_sense = SCSI_ASC_NO_SENSE;
+		qualifier = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+		break;
+	case NVME_SC_NS_NOT_READY:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = NOT_READY;
+		add_sense = SCSI_ASC_LUN_NOT_READY;
+		qualifier = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+		break;
+
+	/* Command Specific Status */
+	case NVME_SC_INVALID_FORMAT:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = ILLEGAL_REQUEST;
+		add_sense = SCSI_ASC_FORMAT_COMMAND_FAILED;
+		qualifier = SCSI_ASCQ_FORMAT_COMMAND_FAILED;
+		break;
+	case NVME_SC_BAD_ATTRIBUTES:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = ILLEGAL_REQUEST;
+		add_sense = SCSI_ASC_INVALID_CDB;
+		qualifier = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+		break;
+
+	/* Media Errors */
+	case NVME_SC_WRITE_FAULT:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = MEDIUM_ERROR;
+		add_sense = SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT;
+		qualifier = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+		break;
+	case NVME_SC_READ_ERROR:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = MEDIUM_ERROR;
+		add_sense = SCSI_ASC_UNRECOVERED_READ_ERROR;
+		qualifier = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+		break;
+	case NVME_SC_GUARD_CHECK:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = MEDIUM_ERROR;
+		add_sense = SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED;
+		qualifier = SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED;
+		break;
+	case NVME_SC_APPTAG_CHECK:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = MEDIUM_ERROR;
+		add_sense = SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED;
+		qualifier = SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED;
+		break;
+	case NVME_SC_REFTAG_CHECK:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = MEDIUM_ERROR;
+		add_sense = SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED;
+		qualifier = SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED;
+		break;
+	case NVME_SC_COMPARE_FAILED:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = MISCOMPARE;
+		add_sense = SCSI_ASC_MISCOMPARE_DURING_VERIFY;
+		qualifier = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+		break;
+	case NVME_SC_ACCESS_DENIED:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = ILLEGAL_REQUEST;
+		add_sense = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID;
+		qualifier = SCSI_ASCQ_INVALID_LUN_ID;
+		break;
+
+	/*
+	 * Unspecified/Default: the statuses below are spelled out for
+	 * reference; they share the translation used for any other value.
+	 */
+	case NVME_SC_CMDID_CONFLICT:
+	case NVME_SC_CMD_SEQ_ERROR:
+	case NVME_SC_CQ_INVALID:
+	case NVME_SC_QID_INVALID:
+	case NVME_SC_QUEUE_SIZE:
+	case NVME_SC_ABORT_LIMIT:
+	case NVME_SC_ABORT_MISSING:
+	case NVME_SC_ASYNC_LIMIT:
+	case NVME_SC_FIRMWARE_SLOT:
+	case NVME_SC_FIRMWARE_IMAGE:
+	case NVME_SC_INVALID_VECTOR:
+	case NVME_SC_INVALID_LOG_PAGE:
+	default:
+		sam_status = SAM_STAT_CHECK_CONDITION;
+		key = ILLEGAL_REQUEST;
+		add_sense = SCSI_ASC_NO_SENSE;
+		qualifier = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+		break;
+	}
+
+	return nvme_trans_completion(hdr, sam_status, key, add_sense, qualifier);
+}
+
+/* INQUIRY Helper Functions */
+
+/*
+ * Build the standard INQUIRY response and copy it to the user.  Identify
+ * Namespace is issued only to derive the PROTECT bit from DPS.
+ * Returns 0 on success, a negative errno, or the positive NVMe status of a
+ * failed identify.
+ */
+static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
+					struct sg_io_hdr *hdr, u8 *inq_response,
+					int alloc_len)
+{
+	struct nvme_dev *dev = ns->dev;
+	struct nvme_id_ns *id_ns;
+	dma_addr_t id_dma;
+	int res;
+	int nvme_sc;
+	int xfer_len;
+	u8 protect;
+	u8 fw_offset = sizeof(dev->firmware_rev);
+
+	id_ns = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(*id_ns),
+				&id_dma, GFP_KERNEL);
+	if (id_ns == NULL)
+		return -ENOMEM;
+
+	/* nvme ns identify - use DPS value for PROTECT field */
+	nvme_sc = nvme_identify(dev, ns->ns_id, 0, id_dma);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	/*
+	 * A negative nvme_sc (Linux error) comes back unchanged in res.  A
+	 * positive nvme_sc has been translated into sense data, leaving res
+	 * at 0 or a negative copy-out error; when it is 0 the NVMe status
+	 * itself is returned.
+	 */
+	if (res)
+		goto out_free;
+	if (nvme_sc) {
+		res = nvme_sc;
+		goto out_free;
+	}
+
+	protect = id_ns->dps ? 0x01 : 0;
+
+	memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
+	inq_response[2] = VERSION_SPC_4;
+	inq_response[3] = 0x02;		/* resp. data format | normaca=0 | hisup=0 */
+	inq_response[4] = ADDITIONAL_STD_INQ_LENGTH;
+	inq_response[5] = protect;	/* sccs=0 | acc=0 | tpgs=0 | pc3=0 */
+	inq_response[7] = 0x01 << 1;	/* cmdque | wbus16=0 | sync=0 | vs=0 */
+	strncpy(&inq_response[8], "NVMe    ", 8);
+	strncpy(&inq_response[16], dev->model, 16);
+
+	/* Trim trailing spaces (down to four chars) and report the last four. */
+	while (fw_offset > 4 && dev->firmware_rev[fw_offset - 1] == ' ')
+		fw_offset--;
+	fw_offset -= 4;
+	strncpy(&inq_response[32], dev->firmware_rev + fw_offset, 4);
+
+	xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
+	res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
+
+ out_free:
+	dma_free_coherent(&dev->pci_dev->dev, sizeof(*id_ns), id_ns,
+			  id_dma);
+	return res;
+}
+
+/* Build the Supported VPD Pages page (page code 00h) and copy it to the user. */
+static int nvme_trans_supported_vpd_pages(struct nvme_ns *ns,
+					struct sg_io_hdr *hdr, u8 *inq_response,
+					int alloc_len)
+{
+	static const u8 supported_pages[INQ_NUM_SUPPORTED_VPD_PAGES] = {
+		INQ_SUPPORTED_VPD_PAGES_PAGE,
+		INQ_UNIT_SERIAL_NUMBER_PAGE,
+		INQ_DEVICE_IDENTIFICATION_PAGE,
+		INQ_EXTENDED_INQUIRY_DATA_PAGE,
+		INQ_BDEV_CHARACTERISTICS_PAGE,
+	};
+	int copy_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
+
+	memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
+	inq_response[1] = INQ_SUPPORTED_VPD_PAGES_PAGE;   /* Page Code */
+	inq_response[3] = INQ_NUM_SUPPORTED_VPD_PAGES;    /* Page Length */
+	memcpy(&inq_response[4], supported_pages, sizeof(supported_pages));
+
+	return nvme_trans_copy_to_user(hdr, inq_response, copy_len);
+}
+
+/* Build the Unit Serial Number VPD page from dev->serial and copy it out. */
+static int nvme_trans_unit_serial_page(struct nvme_ns *ns,
+					struct sg_io_hdr *hdr, u8 *inq_response,
+					int alloc_len)
+{
+	int copy_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
+	struct nvme_dev *ctrl = ns->dev;
+
+	memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
+	inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE; /* Page Code */
+	inq_response[3] = INQ_SERIAL_NUMBER_LENGTH;    /* Page Length */
+
+	/* The serial number occupies the bytes after the 4-byte header. */
+	strncpy(&inq_response[4], ctrl->serial,
+		INQ_SERIAL_NUMBER_LENGTH);
+
+	return nvme_trans_copy_to_user(hdr, inq_response, copy_len);
+}
+
+/*
+ * Build the Device Identification VPD page (one NAA=6h designator made from
+ * the IEEE ID, PCI vendor ID, serial/model prefixes and the namespace ID).
+ * The DMA buffer is sized by struct nvme_id_ctrl, the structure read back.
+ * Returns 0, a negative errno, or the positive NVMe status of the identify.
+ */
+static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+					u8 *inq_response, int alloc_len)
+{
+	struct nvme_dev *dev = ns->dev;
+	struct nvme_id_ctrl *id_ctrl;
+	dma_addr_t dma_addr;
+	int res, nvme_sc;
+	u8 ieee[4];
+	int xfer_len;
+	__be32 tmp_id = cpu_to_be32(ns->ns_id);
+
+	id_ctrl = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(*id_ctrl),
+					&dma_addr, GFP_KERNEL);
+	if (id_ctrl == NULL)
+		return -ENOMEM;
+
+	/* nvme controller identify */
+	nvme_sc = nvme_identify(dev, 0, 1, dma_addr);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	if (res)
+		goto out_free;
+	if (nvme_sc) {
+		res = nvme_sc;
+		goto out_free;
+	}
+
+	/* Since SCSI tried to save 4 bits... [SPC-4(r34) Table 591] */
+	ieee[0] = id_ctrl->ieee[0] << 4;
+	ieee[1] = id_ctrl->ieee[0] >> 4 | id_ctrl->ieee[1] << 4;
+	ieee[2] = id_ctrl->ieee[1] >> 4 | id_ctrl->ieee[2] << 4;
+	ieee[3] = id_ctrl->ieee[2] >> 4;
+
+	memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
+	inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE;    /* Page Code */
+	inq_response[3] = 20;      /* Page Length */
+	/* Designation Descriptor start */
+	inq_response[4] = 0x01;    /* Proto ID=0h | Code set=1h */
+	inq_response[5] = 0x03;    /* PIV=0b | Asso=00b | Designator Type=3h */
+	inq_response[6] = 0x00;    /* Rsvd */
+	inq_response[7] = 16;      /* Designator Length */
+	/* Designator start */
+	inq_response[8] = 0x60 | ieee[3]; /* NAA=6h | IEEE ID MSB, High nibble*/
+	inq_response[9] = ieee[2];        /* IEEE ID */
+	inq_response[10] = ieee[1];       /* IEEE ID */
+	inq_response[11] = ieee[0];       /* IEEE ID| Vendor Specific ID... */
+	inq_response[12] = (dev->pci_dev->vendor & 0xFF00) >> 8;
+	inq_response[13] = (dev->pci_dev->vendor & 0x00FF);
+	inq_response[14] = dev->serial[0];
+	inq_response[15] = dev->serial[1];
+	inq_response[16] = dev->model[0];
+	inq_response[17] = dev->model[1];
+	memcpy(&inq_response[18], &tmp_id, sizeof(u32));
+	/* Last 2 bytes are zero */
+
+	xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
+	res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
+
+ out_free:
+	dma_free_coherent(&dev->pci_dev->dev, sizeof(*id_ctrl), id_ctrl,
+			  dma_addr);
+	return res;
+}
+
+/*
+ * Build the Extended INQUIRY Data VPD page and copy it to the user.
+ * Identify Namespace supplies the protection bits (SPT from DPC, the three
+ * check bits from DPS); Identify Controller supplies V_SUP from the vwc
+ * field.  Both identifies share one DMA buffer.
+ * Returns 0, a negative errno, or the positive NVMe status of a failed
+ * identify.
+ */
+static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+					int alloc_len)
+{
+	static const u8 spt_lut[8] = {0, 0, 2, 1, 4, 6, 5, 7};
+	struct nvme_dev *dev = ns->dev;
+	struct nvme_id_ctrl *id_ctrl;
+	struct nvme_id_ns *id_ns;
+	dma_addr_t id_dma;
+	void *id_buf;
+	u8 *page;
+	u8 prot_byte;
+	int res;
+	int nvme_sc;
+	int xfer_len;
+
+	page = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL);
+	if (page == NULL)
+		return -ENOMEM;
+
+	id_buf = dma_alloc_coherent(&dev->pci_dev->dev,
+				sizeof(struct nvme_id_ns), &id_dma, GFP_KERNEL);
+	if (id_buf == NULL) {
+		res = -ENOMEM;
+		goto out_page;
+	}
+
+	/* nvme ns identify */
+	nvme_sc = nvme_identify(dev, ns->ns_id, 0, id_dma);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	if (res)
+		goto out_free;
+	if (nvme_sc) {
+		res = nvme_sc;
+		goto out_free;
+	}
+	id_ns = id_buf;
+
+	/* microcode (80h) | SPT << 3 | GRD_CHK, APP_CHK, REF_CHK */
+	prot_byte = 0x80 | (spt_lut[id_ns->dpc & 0x07] << 3);
+	if (id_ns->dps)
+		prot_byte |= 0x07;
+
+	/* Namespace data has been consumed; the buffer is reused below. */
+	/* nvme controller identify */
+	nvme_sc = nvme_identify(dev, 0, 1, id_dma);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	if (res)
+		goto out_free;
+	if (nvme_sc) {
+		res = nvme_sc;
+		goto out_free;
+	}
+	id_ctrl = id_buf;
+
+	memset(page, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH);
+	page[1] = INQ_EXTENDED_INQUIRY_DATA_PAGE;    /* Page Code */
+	page[2] = 0x00;    /* Page Length MSB */
+	page[3] = 0x3C;    /* Page Length LSB */
+	page[4] = prot_byte;
+	page[5] = 0x20;    /* uask_sup */
+	page[6] = id_ctrl->vwc;    /* v_sup */
+	page[7] = 0x01;    /* luiclr */
+	page[8] = 0;
+	page[9] = 0;
+
+	xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH);
+	res = nvme_trans_copy_to_user(hdr, page, xfer_len);
+
+ out_free:
+	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), id_buf,
+			  id_dma);
+ out_page:
+	kfree(page);
+	return res;
+}
+
+/*
+ * Build the Block Device Characteristics VPD page in a zeroed scratch
+ * buffer and copy it to the user.  Returns 0, -ENOMEM or -EFAULT.
+ */
+static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+					int alloc_len)
+{
+	int copy_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH);
+	u8 *page;
+	int rc;
+
+	page = kzalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	page[1] = INQ_BDEV_CHARACTERISTICS_PAGE;    /* Page Code */
+	page[2] = 0x00;    /* Page Length MSB */
+	page[3] = 0x3C;    /* Page Length LSB */
+	page[4] = 0x00;    /* Medium Rotation Rate MSB */
+	page[5] = 0x01;    /* Medium Rotation Rate LSB */
+	page[6] = 0x00;    /* Form Factor */
+
+	rc = nvme_trans_copy_to_user(hdr, page, copy_len);
+
+	kfree(page);
+	return rc;
+}
+
+/* LOG SENSE Helper Functions */
+
+/*
+ * Build the Supported Log Pages log page in a zeroed scratch buffer and copy
+ * it to the user.  Returns 0, -ENOMEM or -EFAULT.
+ */
+static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+					int alloc_len)
+{
+	int copy_len = min(alloc_len, LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH);
+	u8 *page;
+	int rc;
+
+	page = kzalloc(LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH, GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	page[0] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE;
+	/* Subpage=0x00, Page Length MSB=0 */
+	page[3] = SUPPORTED_LOG_PAGES_PAGE_LENGTH;
+	/* List of supported page codes */
+	page[4] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE;
+	page[5] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE;
+	page[6] = LOG_PAGE_TEMPERATURE_PAGE;
+
+	rc = nvme_trans_copy_to_user(hdr, page, copy_len);
+	kfree(page);
+	return rc;
+}
+
+/*
+ * Build the Informational Exceptions log page and copy it to the user.
+ * The only variable field is the temperature byte, read from the NVMe SMART
+ * log (a 16-bit Kelvin value) and converted to Celsius.  Failure to fetch
+ * the SMART log is not an error for this command: LOG_TEMP_UNKNOWN is
+ * reported instead.
+ * Returns 0, -ENOMEM or -EFAULT.
+ */
+static int nvme_trans_log_info_exceptions(struct nvme_ns *ns,
+					struct sg_io_hdr *hdr, int alloc_len)
+{
+	struct nvme_dev *dev = ns->dev;
+	struct nvme_smart_log *smart;
+	struct nvme_command cmd;
+	dma_addr_t smart_dma;
+	u8 *page;
+	u8 temp_c = LOG_TEMP_UNKNOWN;
+	u16 temp_k;
+	int xfer_len;
+	int res;
+
+	page = kzalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL);
+	if (page == NULL)
+		return -ENOMEM;
+
+	smart = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(*smart),
+					&smart_dma, GFP_KERNEL);
+	if (smart == NULL) {
+		res = -ENOMEM;
+		goto out_page;
+	}
+
+	/* Get SMART Log Page */
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.common.opcode = nvme_admin_get_log_page;
+	cmd.common.nsid = cpu_to_le32(0xFFFFFFFF);
+	cmd.common.prp1 = cpu_to_le64(smart_dma);
+	cmd.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) /
+			BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART);
+	if (nvme_submit_admin_cmd(dev, &cmd, NULL) == NVME_SC_SUCCESS) {
+		/* temperature[] holds the value low byte first */
+		temp_k = (smart->temperature[1] << 8) +
+				(smart->temperature[0]);
+		temp_c = temp_k - KELVIN_TEMP_FACTOR;
+	}
+
+	page[0] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE;
+	/* Subpage=0x00, Page Length MSB=0 */
+	page[3] = REMAINING_INFO_EXCP_PAGE_LENGTH;
+	/* Informational Exceptions Log Parameter 1 Start */
+	/* Parameter Code=0x0000 bytes 4,5 */
+	page[6] = 0x23; /* DU=0, TSD=1, ETC=0, TMC=0, FMT_AND_LNK=11b */
+	page[7] = 0x04; /* PARAMETER LENGTH */
+	/* Add sense Code and qualifier = 0x00 each */
+	/* Use Temperature from NVMe Get Log Page, convert to C from K */
+	page[10] = temp_c;
+
+	xfer_len = min(alloc_len, LOG_INFO_EXCP_PAGE_LENGTH);
+	res = nvme_trans_copy_to_user(hdr, page, xfer_len);
+
+	dma_free_coherent(&dev->pci_dev->dev, sizeof(*smart), smart,
+			  smart_dma);
+ out_page:
+	kfree(page);
+	return res;
+}
+
+/*
+ * LOG SENSE: build the Temperature log page.
+ *
+ * Parameter 0x0000 carries the current temperature from the NVMe SMART
+ * log; parameter 0x0001 carries the temperature threshold obtained with
+ * Get Features.  Both NVMe values are in Kelvin and are converted to
+ * Celsius.  A value that cannot be obtained is reported as
+ * LOG_TEMP_UNKNOWN; the page is returned regardless.
+ *
+ * At most alloc_len bytes of the page are copied out through hdr.
+ * Returns -ENOMEM on allocation failure, otherwise whatever
+ * nvme_trans_copy_to_user() returns.
+ */
+static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+					int alloc_len)
+{
+	struct nvme_dev *dev = ns->dev;
+	struct nvme_command get_log;
+	struct nvme_smart_log *smart;
+	dma_addr_t smart_dma;
+	u8 *page;
+	u32 thresh_raw;
+	u8 cur_celsius, thresh_celsius;
+	u16 kelvin;
+	int copy_len;
+	int res;
+
+	page = kzalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL);
+	if (page == NULL)
+		return -ENOMEM;
+
+	smart = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(*smart),
+				   &smart_dma, GFP_KERNEL);
+	if (smart == NULL) {
+		res = -ENOMEM;
+		goto free_page;
+	}
+
+	/* Get Log Page (SMART), NSID 0xFFFFFFFF, whole log in one transfer */
+	memset(&get_log, 0, sizeof(get_log));
+	get_log.common.opcode = nvme_admin_get_log_page;
+	get_log.common.nsid = cpu_to_le32(0xFFFFFFFF);
+	get_log.common.prp1 = cpu_to_le64(smart_dma);
+	get_log.common.cdw10[0] = cpu_to_le32(
+			(((sizeof(*smart) / BYTES_TO_DWORDS) - 1) << 16) |
+			NVME_LOG_SMART);
+
+	if (nvme_submit_admin_cmd(dev, &get_log, NULL) == NVME_SC_SUCCESS) {
+		/* temperature[] is a little-endian 16 bit Kelvin value */
+		kelvin = (smart->temperature[1] << 8) +
+				(smart->temperature[0]);
+		cur_celsius = kelvin - KELVIN_TEMP_FACTOR;
+	} else {
+		cur_celsius = LOG_TEMP_UNKNOWN;
+	}
+
+	/* Temperature threshold: low 16 bits of the Get Features result */
+	if (nvme_get_features(dev, NVME_FEAT_TEMP_THRESH, 0, 0,
+			      &thresh_raw) == NVME_SC_SUCCESS)
+		thresh_celsius = (thresh_raw & 0xFFFF) - KELVIN_TEMP_FACTOR;
+	else
+		thresh_celsius = LOG_TEMP_UNKNOWN;
+
+	/* Page header: page code, subpage 0x00, page length MSB 0 */
+	page[0] = LOG_PAGE_TEMPERATURE_PAGE;
+	page[3] = REMAINING_TEMP_PAGE_LENGTH;
+
+	/* Parameter 1 (Temperature), parameter code 0x0000 */
+	page[6] = 0x01;		/* Format and Linking = 01b */
+	page[7] = 0x02;		/* Parameter Length */
+	page[9] = cur_celsius;
+
+	/* Parameter 2 (Reference Temperature), parameter code 0x0001 */
+	page[11] = 0x01;
+	page[12] = 0x01;	/* Format and Linking = 01b */
+	page[13] = 0x02;	/* Parameter Length */
+	page[15] = thresh_celsius;
+
+	copy_len = min(alloc_len, LOG_TEMP_PAGE_LENGTH);
+	res = nvme_trans_copy_to_user(hdr, page, copy_len);
+
+	dma_free_coherent(&dev->pci_dev->dev, sizeof(*smart), smart,
+			  smart_dma);
+ free_page:
+	kfree(page);
+	return res;
+}
+
+/* MODE SENSE Helper Functions */
+
+/*
+ * Write the MODE SENSE mode parameter header into resp.
+ *
+ * The 10 byte CDB variant uses an 8 byte header with 16 bit big-endian
+ * length fields; the 6 byte variant uses a 4 byte header with 8 bit
+ * fields.  Bytes that are not written here are left untouched, so the
+ * caller is expected to hand in a zeroed buffer.
+ *
+ * Returns SNTI_INTERNAL_ERROR if len is too small for the chosen header,
+ * SNTI_TRANSLATION_SUCCESS otherwise.
+ */
+static int nvme_trans_fill_mode_parm_hdr(u8 *resp, int len, u8 cdb10, u8 llbaa,
+					u16 mode_data_length, u16 blk_desc_len)
+{
+	if (!cdb10) {
+		/* MODE SENSE(6): 4 byte header */
+		if (len < 4)
+			return SNTI_INTERNAL_ERROR;
+
+		resp[0] = mode_data_length & 0xFF;
+		/* resp[1] and resp[2] stay zero */
+		resp[3] = blk_desc_len & 0xFF;
+		return SNTI_TRANSLATION_SUCCESS;
+	}
+
+	/* MODE SENSE(10): 8 byte header */
+	if (len < 8)
+		return SNTI_INTERNAL_ERROR;
+
+	resp[0] = mode_data_length >> 8;
+	resp[1] = mode_data_length & 0xFF;
+	/* resp[2] and resp[3] stay zero */
+	resp[4] = llbaa;
+	resp[5] = RESERVED_FIELD;
+	resp[6] = blk_desc_len >> 8;
+	resp[7] = blk_desc_len & 0xFF;
+
+	return SNTI_TRANSLATION_SUCCESS;
+}
+
+/*
+ * Fill a MODE SENSE block descriptor from the namespace's Identify data.
+ *
+ * llbaa == 0 selects the short descriptor (32 bit block count followed by
+ * a reserved byte and a 24 bit block length); otherwise the long LBA
+ * descriptor is written (64 bit block count, four reserved bytes, 32 bit
+ * block length).  All fields are stored big-endian.
+ *
+ * Returns SNTI_INTERNAL_ERROR if len cannot hold the descriptor, -ENOMEM
+ * if the Identify buffer cannot be allocated, the translated or raw NVMe
+ * status if Identify fails, and SNTI_TRANSLATION_SUCCESS otherwise.
+ */
+static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+				    u8 *resp, int len, u8 llbaa)
+{
+	struct nvme_dev *dev = ns->dev;
+	struct nvme_id_ns *id_ns;
+	dma_addr_t id_dma;
+	u32 block_size;
+	u8 fmt_idx;
+	int nvme_sc;
+	int res;
+
+	if (llbaa == 0 && len < MODE_PAGE_BLK_DES_LEN)
+		return SNTI_INTERNAL_ERROR;
+	if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN)
+		return SNTI_INTERNAL_ERROR;
+
+	id_ns = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(*id_ns),
+				   &id_dma, GFP_KERNEL);
+	if (id_ns == NULL)
+		return -ENOMEM;
+
+	/* Identify Namespace */
+	nvme_sc = nvme_identify(dev, ns->ns_id, 0, id_dma);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	if (!res)
+		res = nvme_sc;
+	if (res)
+		goto free_dma;
+
+	/* Block size of the currently selected LBA format */
+	fmt_idx = (id_ns->flbas) & 0x0F;
+	block_size = (1 << (id_ns->lbaf[fmt_idx].ds));
+
+	if (llbaa > 0) {
+		__be64 be_blocks = cpu_to_be64(le64_to_cpu(id_ns->ncap));
+		__be32 be_size = cpu_to_be32(block_size);
+
+		memcpy(resp, &be_blocks, sizeof(u64));
+		/* Bytes 8, 9, 10, 11 are reserved */
+		memcpy(&resp[12], &be_size, sizeof(u32));
+	} else {
+		__be32 be_blocks = cpu_to_be32(le64_to_cpu(id_ns->ncap));
+		/* Byte 4 is reserved, so only 24 bits of length are kept */
+		__be32 be_size = cpu_to_be32(block_size & 0x00FFFFFF);
+
+		memcpy(resp, &be_blocks, sizeof(u32));
+		memcpy(&resp[4], &be_size, sizeof(u32));
+	}
+
+ free_dma:
+	dma_free_coherent(&dev->pci_dev->dev, sizeof(*id_ns), id_ns, id_dma);
+	return res;
+}
+
+/*
+ * Fill the Control mode page with fixed values.
+ *
+ * Only the non-zero bytes are written; every other byte of the page is
+ * left as the caller provided it.  ns and hdr are not used.
+ *
+ * Returns SNTI_INTERNAL_ERROR if len is smaller than
+ * MODE_PAGE_CONTROL_LEN, SNTI_TRANSLATION_SUCCESS otherwise.
+ */
+static int nvme_trans_fill_control_page(struct nvme_ns *ns,
+					struct sg_io_hdr *hdr, u8 *resp,
+					int len)
+{
+	if (len < MODE_PAGE_CONTROL_LEN)
+		return SNTI_INTERNAL_ERROR;
+
+	/* Page code and page length */
+	resp[0] = MODE_PAGE_CONTROL;
+	resp[1] = MODE_PAGE_CONTROL_LEN_FIELD;
+
+	/* TST=000b, TMF_ONLY=0, DPICZ=1, D_SENSE=1, GLTSD=1, RLEC=0 */
+	resp[2] = 0x0E;
+	/* Q_ALGO_MODIFIER=1h, NUAR=0, QERR=01b */
+	resp[3] = 0x12;
+	/* Byte 4 (VS=0, RAC=0, UA_INT=0, SWP=0) is not written */
+	/* ATO=0, TAS=1, ATMPE=0, RWWP=0, AUTOLOAD=0 */
+	resp[5] = 0x40;
+	/* Bytes 6 and 7 are obsolete and not written */
+
+	/* Busy timeout period = 0xffff */
+	resp[8] = 0xFF;
+	resp[9] = 0xFF;
+	/* Bytes 10, 11 (extended selftest completion time) are not written */
+
+	return SNTI_TRANSLATION_SUCCESS;
+}
+
+/*
+ * Fill the Caching mode page.
+ *
+ * Bit 0 of the NVMe Volatile Write Cache feature is reported in bit 2 of
+ * byte 2 of the page.  Only bytes 0..2 are written.
+ *
+ * Returns SNTI_INTERNAL_ERROR if len is smaller than
+ * MODE_PAGE_CACHING_LEN, the translated or raw NVMe status if Get
+ * Features fails, and SNTI_TRANSLATION_SUCCESS otherwise.
+ */
+static int nvme_trans_fill_caching_page(struct nvme_ns *ns,
+					struct sg_io_hdr *hdr,
+					u8 *resp, int len)
+{
+	struct nvme_dev *dev = ns->dev;
+	u32 vwc_feature;
+	int nvme_sc;
+	int res;
+
+	if (len < MODE_PAGE_CACHING_LEN)
+		return SNTI_INTERNAL_ERROR;
+
+	nvme_sc = nvme_get_features(dev, NVME_FEAT_VOLATILE_WC, 0, 0,
+				    &vwc_feature);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	if (!res)
+		res = nvme_sc;
+	if (res)
+		return res;
+
+	resp[0] = MODE_PAGE_CACHING;
+	resp[1] = MODE_PAGE_CACHING_LEN_FIELD;
+	resp[2] = (u8)(vwc_feature & 0x00000001) << 2;
+
+	return res;
+}
+
+/*
+ * Fill the Power Condition mode page: only the page code and the page
+ * length are written, the remaining bytes are left as provided.
+ * ns and hdr are not used.
+ *
+ * Returns SNTI_INTERNAL_ERROR if len is smaller than
+ * MODE_PAGE_POW_CND_LEN, SNTI_TRANSLATION_SUCCESS otherwise.
+ */
+static int nvme_trans_fill_pow_cnd_page(struct nvme_ns *ns,
+					struct sg_io_hdr *hdr, u8 *resp,
+					int len)
+{
+	if (len < MODE_PAGE_POW_CND_LEN)
+		return SNTI_INTERNAL_ERROR;
+
+	resp[0] = MODE_PAGE_POWER_CONDITION;
+	resp[1] = MODE_PAGE_POW_CND_LEN_FIELD;
+
+	return SNTI_TRANSLATION_SUCCESS;
+}
+
+/*
+ * Fill the Informational Exceptions Control mode page: page code, page
+ * length and a fixed value of 0x88 in byte 2.  The remaining bytes are
+ * left as provided.  ns and hdr are not used.
+ *
+ * Returns SNTI_INTERNAL_ERROR if len is smaller than
+ * MODE_PAGE_INF_EXC_LEN, SNTI_TRANSLATION_SUCCESS otherwise.
+ */
+static int nvme_trans_fill_inf_exc_page(struct nvme_ns *ns,
+					struct sg_io_hdr *hdr, u8 *resp,
+					int len)
+{
+	if (len < MODE_PAGE_INF_EXC_LEN)
+		return SNTI_INTERNAL_ERROR;
+
+	resp[0] = MODE_PAGE_INFO_EXCEP;
+	resp[1] = MODE_PAGE_INF_EXC_LEN_FIELD;
+	resp[2] = 0x88;
+
+	return SNTI_TRANSLATION_SUCCESS;
+}
+
+/*
+ * Fill resp with all supported mode pages, back to back, in this order:
+ * Caching, Control, Power Condition, Informational Exceptions.
+ *
+ * Stops at, and returns the status of, the first page helper that does
+ * not report SNTI_TRANSLATION_SUCCESS.  The len argument is not
+ * consulted; each helper is given the fixed length of its own page.
+ */
+static int nvme_trans_fill_all_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+				     u8 *resp, int len)
+{
+	u8 *cursor = resp;
+	int res;
+
+	res = nvme_trans_fill_caching_page(ns, hdr, cursor,
+					   MODE_PAGE_CACHING_LEN);
+	if (res != SNTI_TRANSLATION_SUCCESS)
+		return res;
+	cursor += MODE_PAGE_CACHING_LEN;
+
+	res = nvme_trans_fill_control_page(ns, hdr, cursor,
+					   MODE_PAGE_CONTROL_LEN);
+	if (res != SNTI_TRANSLATION_SUCCESS)
+		return res;
+	cursor += MODE_PAGE_CONTROL_LEN;
+
+	res = nvme_trans_fill_pow_cnd_page(ns, hdr, cursor,
+					   MODE_PAGE_POW_CND_LEN);
+	if (res != SNTI_TRANSLATION_SUCCESS)
+		return res;
+	cursor += MODE_PAGE_POW_CND_LEN;
+
+	return nvme_trans_fill_inf_exc_page(ns, hdr, cursor,
+					    MODE_PAGE_INF_EXC_LEN);
+}
+
+/*
+ * Total length in bytes of the block descriptors returned by MODE SENSE:
+ * 0 unless dbd equals MODE_SENSE_BLK_DESC_ENABLED, otherwise 8 bytes per
+ * descriptor when llbaa is 0 and 16 bytes per descriptor when it is 1.
+ */
+static inline int nvme_trans_get_blk_desc_len(u8 dbd, u8 llbaa)
+{
+	if (dbd != MODE_SENSE_BLK_DESC_ENABLED)
+		return 0;
+
+	/* SPC-4: len = 8 x Num_of_descriptors if llbaa = 0, 16x if 1 */
+	return 8 * (llbaa + 1) * MODE_SENSE_BLK_DESC_COUNT;
+}
+
+/*
+ * Assemble a complete MODE SENSE response and copy it to user space.
+ *
+ * The response is laid out as: mode parameter header, optional block
+ * descriptor (depends on the DBD and LLBAA bits of the CDB), then
+ * mode_pages_tot_len bytes of mode page data produced by
+ * mode_page_fill_func.  At most alloc_len bytes are copied out.
+ *
+ * Returns -ENOMEM if the response buffer cannot be allocated, the status
+ * of the first helper that fails, or the result of
+ * nvme_trans_copy_to_user().
+ */
+static int nvme_trans_mode_page_create(struct nvme_ns *ns,
+					struct sg_io_hdr *hdr, u8 *cmd,
+					u16 alloc_len, u8 cdb10,
+					int (*mode_page_fill_func)
+					(struct nvme_ns *,
+					struct sg_io_hdr *hdr, u8 *, int),
+					u16 mode_pages_tot_len)
+{
+	u8 dbd = GET_MODE_SENSE_DBD(cmd);
+	u8 llbaa = GET_MODE_SENSE_LLBAA(cmd);
+	int hdr_len = GET_MODE_SENSE_MPH_SIZE(cdb10);
+	u16 bd_len = nvme_trans_get_blk_desc_len(dbd, llbaa);
+	u16 bd_off = hdr_len;
+	u16 pages_off = bd_off + bd_len;
+	u16 total_len = hdr_len + bd_len + mode_pages_tot_len;
+	/* Refer spc4r34 Table 440 for calculation of Mode data Length field */
+	u16 mode_data_length = 3 + (3 * cdb10) + bd_len + mode_pages_tot_len;
+	u8 *buf;
+	int copy_len;
+	int res;
+
+	buf = kzalloc(total_len, GFP_KERNEL);
+	if (buf == NULL)
+		return -ENOMEM;
+
+	res = nvme_trans_fill_mode_parm_hdr(&buf[0], hdr_len, cdb10, llbaa,
+					    mode_data_length, bd_len);
+	if (res != SNTI_TRANSLATION_SUCCESS)
+		goto free_buf;
+
+	if (bd_len > 0) {
+		res = nvme_trans_fill_blk_desc(ns, hdr, &buf[bd_off], bd_len,
+					       llbaa);
+		if (res != SNTI_TRANSLATION_SUCCESS)
+			goto free_buf;
+	}
+
+	res = mode_page_fill_func(ns, hdr, &buf[pages_off],
+				  mode_pages_tot_len);
+	if (res != SNTI_TRANSLATION_SUCCESS)
+		goto free_buf;
+
+	copy_len = min(alloc_len, total_len);
+	res = nvme_trans_copy_to_user(hdr, buf, copy_len);
+
+ free_buf:
+	kfree(buf);
+	return res;
+}
+
+/* Read Capacity Helper Functions */
+
+/*
+ * Fill a READ CAPACITY response from Identify Namespace data.
+ *
+ * READ CAPACITY(10) (cdb16 == 0): 32 bit last LBA, saturated at
+ * 0xFFFFFFFF, followed by the 32 bit block length.
+ * READ CAPACITY(16): 64 bit last LBA, 32 bit block length, and byte 12
+ * built from the namespace's data protection settings.
+ * All multi-byte fields are big-endian; bytes not written here are left
+ * as the caller provided them.
+ */
+static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns,
+								u8 cdb16)
+{
+	const u8 p_type_lut[4] = {0, 0, 1, 2};
+	u8 fmt_idx = (id_ns->flbas) & 0x0F;
+	u32 block_size = (1 << (id_ns->lbaf[fmt_idx].ds));
+	u64 last_lba = le64_to_cpup(&id_ns->nsze) - 1;
+	__be32 be_block_size = cpu_to_be32(block_size);
+
+	if (cdb16) {
+		__be64 be_last_lba = cpu_to_be64(last_lba);
+		u8 prot_en = id_ns->dps ? 0x01 : 0;
+
+		memcpy(response, &be_last_lba, sizeof(u64));
+		memcpy(&response[8], &be_block_size, sizeof(u32));
+		response[12] = (p_type_lut[id_ns->dps & 0x3] << 1) | prot_en;
+		/* P_I_Exponent = 0x0 | LBPPBE = 0x0 */
+		/* LBPME = 0 | LBPRZ = 0 | LALBA = 0x00 */
+		/* Bytes 16-31 - Reserved */
+	} else {
+		__be32 be_last_lba;
+
+		/* Capacity does not fit in 32 bits: report the maximum */
+		if (last_lba > 0xFFFFFFFF)
+			last_lba = 0xFFFFFFFF;
+		be_last_lba = cpu_to_be32(last_lba);
+
+		memcpy(response, &be_last_lba, sizeof(u32));
+		memcpy(&response[4], &be_block_size, sizeof(u32));
+	}
+}
+
+/* Start Stop Unit Helper Functions */
+
+/*
+ * START STOP UNIT helper: map a SCSI power condition onto an NVMe power
+ * state and apply it with Set Features (Power Management).
+ *
+ * @pc:    POWER CONDITION field of the CDB
+ * @pcmod: POWER CONDITION MODIFIER field of the CDB
+ * @start: START bit of the CDB
+ *
+ * The controller is identified first so that the lowest power state can
+ * be derived from its NPSS field.  Combinations for which the action is
+ * unspecified leave the desired state at POWER_STATE_0.
+ *
+ * An unsupported power condition completes the command with CHECK
+ * CONDITION / ILLEGAL REQUEST and does NOT touch the controller's power
+ * state; the result of nvme_trans_completion() is returned in that case.
+ *
+ * Returns -ENOMEM if the Identify buffer cannot be allocated, otherwise
+ * the translated or raw NVMe status of the failing command, or
+ * SNTI_TRANSLATION_SUCCESS.
+ */
+static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+						u8 pc, u8 pcmod, u8 start)
+{
+	int res = SNTI_TRANSLATION_SUCCESS;
+	int nvme_sc;
+	struct nvme_dev *dev = ns->dev;
+	dma_addr_t dma_addr;
+	void *mem;
+	struct nvme_id_ctrl *id_ctrl;
+	int lowest_pow_st;	/* max npss = lowest power consumption */
+	unsigned ps_desired = 0;
+
+	/* NVMe Controller Identify */
+	mem = dma_alloc_coherent(&dev->pci_dev->dev,
+				sizeof(struct nvme_id_ctrl),
+				&dma_addr, GFP_KERNEL);
+	if (mem == NULL) {
+		res = -ENOMEM;
+		goto out;
+	}
+	nvme_sc = nvme_identify(dev, 0, 1, dma_addr);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	if (res)
+		goto out_dma;
+	if (nvme_sc) {
+		res = nvme_sc;
+		goto out_dma;
+	}
+	id_ctrl = mem;
+	lowest_pow_st = max(POWER_STATE_0, (int)(id_ctrl->npss - 1));
+
+	switch (pc) {
+	case NVME_POWER_STATE_START_VALID:
+		/* Action unspecified if POWER CONDITION MODIFIER != 0 */
+		if (pcmod == 0 && start == 0x1)
+			ps_desired = POWER_STATE_0;
+		if (pcmod == 0 && start == 0x0)
+			ps_desired = lowest_pow_st;
+		break;
+	case NVME_POWER_STATE_ACTIVE:
+		/* Action unspecified if POWER CONDITION MODIFIER != 0 */
+		if (pcmod == 0)
+			ps_desired = POWER_STATE_0;
+		break;
+	case NVME_POWER_STATE_IDLE:
+		/* Action unspecified if POWER CONDITION MODIFIER != [0,1,2] */
+		if (pcmod == 0x0)
+			ps_desired = POWER_STATE_1;
+		else if (pcmod == 0x1)
+			ps_desired = POWER_STATE_2;
+		else if (pcmod == 0x2)
+			ps_desired = POWER_STATE_3;
+		break;
+	case NVME_POWER_STATE_STANDBY:
+		/* Action unspecified if POWER CONDITION MODIFIER != [0,1] */
+		if (pcmod == 0x0)
+			ps_desired = max(POWER_STATE_0, (lowest_pow_st - 2));
+		else if (pcmod == 0x1)
+			ps_desired = max(POWER_STATE_0, (lowest_pow_st - 1));
+		break;
+	case NVME_POWER_STATE_LU_CONTROL:
+	default:
+		/*
+		 * Unsupported power condition: report it and stop here.
+		 * Falling through to Set Features would both change the
+		 * power state to 0 and overwrite the error just recorded.
+		 */
+		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+				ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+				SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+		goto out_dma;
+	}
+	nvme_sc = nvme_set_features(dev, NVME_FEAT_POWER_MGMT, ps_desired, 0,
+				    NULL);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	if (res)
+		goto out_dma;
+	if (nvme_sc)
+		res = nvme_sc;
+ out_dma:
+	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem,
+			  dma_addr);
+ out:
+	return res;
+}
+
+/* Write Buffer Helper Functions */
+/* Also using this for Format Unit with hdr passed as NULL, and buffer_id, 0 */
+
+/*
+ * Build and submit an NVMe admin command selected by opcode.
+ *
+ * nvme_admin_download_fw: the user buffer at hdr->dxferp (tot_len bytes)
+ *   is mapped for DMA and sent with the given byte offset; a request that
+ *   uses an iovec list is rejected with CHECK CONDITION.
+ * nvme_admin_activate_fw: buffer_id is combined with
+ *   NVME_FWACT_REPL_ACTV in command dword 10.
+ * Any other opcode is submitted with only the opcode field set.
+ *
+ * Returns the error from mapping the user pages, -ENOMEM if the PRP setup
+ * is short, the result of nvme_trans_completion() for a rejected request,
+ * or the translated or raw NVMe status of the submitted command.
+ */
+static int nvme_trans_send_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+					u8 opcode, u32 tot_len, u32 offset,
+					u8 buffer_id)
+{
+	struct nvme_dev *dev = ns->dev;
+	struct nvme_iod *iod = NULL;
+	struct nvme_command c;
+	unsigned prp_len;
+	int nvme_sc;
+	int res;
+
+	memset(&c, 0, sizeof(c));
+	c.common.opcode = opcode;
+
+	if (opcode == nvme_admin_download_fw) {
+		/* Assuming SGL is not allowed for this command */
+		if (hdr->iovec_count > 0)
+			return nvme_trans_completion(hdr,
+						SAM_STAT_CHECK_CONDITION,
+						ILLEGAL_REQUEST,
+						SCSI_ASC_INVALID_CDB,
+						SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+
+		iod = nvme_map_user_pages(dev, DMA_TO_DEVICE,
+				(unsigned long)hdr->dxferp, tot_len);
+		if (IS_ERR(iod))
+			return PTR_ERR(iod);
+
+		prp_len = nvme_setup_prps(dev, iod, tot_len, GFP_KERNEL);
+		if (prp_len != tot_len) {
+			res = -ENOMEM;
+			goto unmap;
+		}
+
+		c.dlfw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+		c.dlfw.prp2 = cpu_to_le64(iod->first_dma);
+		/* Length and offset are expressed in dwords */
+		c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1);
+		c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS);
+	} else if (opcode == nvme_admin_activate_fw) {
+		c.common.cdw10[0] = cpu_to_le32(buffer_id |
+						NVME_FWACT_REPL_ACTV);
+	}
+
+	nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	if (!res)
+		res = nvme_sc;
+
+ unmap:
+	/* Only the download path mapped user pages */
+	if (opcode == nvme_admin_download_fw) {
+		nvme_unmap_user_pages(dev, DMA_TO_DEVICE, iod);
+		nvme_free_iod(dev, iod);
+	}
+	return res;
+}
+
+/* Mode Select Helper Functions */
+
+/*
+ * Extract the block descriptor length, and for MODE SELECT(10) the
+ * LONGLBA flag, from a mode parameter header.
+ *
+ * @parm_list: start of the mode parameter list; must hold at least the
+ *             mode parameter header for the selected CDB size
+ * @cdb10:     non-zero for MODE SELECT(10), zero for MODE SELECT(6)
+ * @bd_len:    receives the block descriptor length in bytes
+ * @llbaa:     receives 1 if the LONGLBA bit is set, else 0; the 6 byte
+ *             header has no such bit, so 0 is stored
+ */
+static inline void nvme_trans_modesel_get_bd_len(u8 *parm_list, u8 cdb10,
+						u16 *bd_len, u8 *llbaa)
+{
+	if (cdb10) {
+		/* 10 Byte CDB */
+		*bd_len = (parm_list[MODE_SELECT_10_BD_OFFSET] << 8) +
+			parm_list[MODE_SELECT_10_BD_OFFSET + 1];
+		/* Bitwise test: a logical && would flag any non-zero byte */
+		*llbaa = (parm_list[MODE_SELECT_10_LLBAA_OFFSET] &
+				MODE_SELECT_10_LLBAA_MASK) ? 1 : 0;
+	} else {
+		/* 6 Byte CDB */
+		*bd_len = parm_list[MODE_SELECT_6_BD_OFFSET];
+		*llbaa = 0;
+	}
+}
+
+/*
+ * Remember the block count and block length of the first block
+ * descriptor of a MODE SELECT parameter list in the namespace, so that a
+ * later FORMAT UNIT can use them.
+ *
+ * @parm_list: mode parameter list
+ * @idx:       offset of the first block descriptor within parm_list
+ * @bd_len:    total block descriptor length; not consulted, only the
+ *             first descriptor is stored
+ * @llbaa:     0 for the short (8 byte) descriptor, non-zero for the long
+ *             LBA (16 byte) descriptor
+ *
+ * The caller must guarantee that a full descriptor of the selected kind
+ * is present at parm_list[idx].
+ */
+static void nvme_trans_modesel_save_bd(struct nvme_ns *ns, u8 *parm_list,
+					u16 idx, u16 bd_len, u8 llbaa)
+{
+	/* Store block descriptor info if a FORMAT UNIT comes later */
+	/* TODO Saving 1st BD info; what to do if multiple BD received? */
+	if (llbaa == 0) {
+		/* Standard Block Descriptor - spc4r34 7.5.5.1 */
+		ns->mode_select_num_blocks =
+				(parm_list[idx + 1] << 16) +
+				(parm_list[idx + 2] << 8) +
+				(parm_list[idx + 3]);
+
+		ns->mode_select_block_len =
+				(parm_list[idx + 5] << 16) +
+				(parm_list[idx + 6] << 8) +
+				(parm_list[idx + 7]);
+	} else {
+		/* Long LBA Block Descriptor - sbc3r27 6.4.2.3 */
+		ns->mode_select_num_blocks =
+				(((u64)parm_list[idx + 0]) << 56) +
+				(((u64)parm_list[idx + 1]) << 48) +
+				(((u64)parm_list[idx + 2]) << 40) +
+				(((u64)parm_list[idx + 3]) << 32) +
+				(((u64)parm_list[idx + 4]) << 24) +
+				(((u64)parm_list[idx + 5]) << 16) +
+				(((u64)parm_list[idx + 6]) << 8) +
+				((u64)parm_list[idx + 7]);
+
+		/* Widen before shifting so bit 31 never lands in a signed int */
+		ns->mode_select_block_len =
+				(((u32)parm_list[idx + 12]) << 24) +
+				(((u32)parm_list[idx + 13]) << 16) +
+				(((u32)parm_list[idx + 14]) << 8) +
+				((u32)parm_list[idx + 15]);
+	}
+}
+
+/*
+ * Apply one mode page received through MODE SELECT.
+ *
+ * Caching page:          bit CACHING_MODE_PAGE_WCE_MASK of byte 2 is
+ *                        written to the NVMe Volatile Write Cache feature.
+ * Control page:          accepted, nothing is done.
+ * Power Condition page:  accepted only if bit 0 of byte 2 and the low
+ *                        nibble of byte 3 are clear.
+ * Any other page:        rejected.
+ *
+ * Rejections complete the command with CHECK CONDITION and always return
+ * a non-zero value (SNTI_INTERNAL_ERROR when nvme_trans_completion()
+ * itself returned zero).  Otherwise returns the translated or raw NVMe
+ * status, or SNTI_TRANSLATION_SUCCESS.
+ */
+static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+					u8 *mode_page, u8 page_code)
+{
+	struct nvme_dev *dev = ns->dev;
+	unsigned wce;
+	int nvme_sc;
+	int res;
+
+	if (page_code == MODE_PAGE_CONTROL)
+		return SNTI_TRANSLATION_SUCCESS;
+
+	if (page_code == MODE_PAGE_CACHING) {
+		wce = (mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0;
+		nvme_sc = nvme_set_features(dev, NVME_FEAT_VOLATILE_WC, wce,
+					    0, NULL);
+		res = nvme_trans_status_code(hdr, nvme_sc);
+		if (!res)
+			res = nvme_sc;
+		return res;
+	}
+
+	if (page_code == MODE_PAGE_POWER_CONDITION) {
+		/* Verify the OS is not trying to set timers */
+		if ((mode_page[2] & 0x01) == 0 && (mode_page[3] & 0x0F) == 0)
+			return SNTI_TRANSLATION_SUCCESS;
+
+		res = nvme_trans_completion(hdr,
+					SAM_STAT_CHECK_CONDITION,
+					ILLEGAL_REQUEST,
+					SCSI_ASC_INVALID_PARAMETER,
+					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+	} else {
+		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+	}
+
+	/* A rejected page must never be reported as success */
+	if (!res)
+		res = SNTI_INTERNAL_ERROR;
+	return res;
+}
+
+/*
+ * Parse the parameter list of a MODE SELECT command and apply it.
+ *
+ * The list is copied in from user space and consists of a mode parameter
+ * header, optional block descriptors and zero or more mode pages.  The
+ * first block descriptor is saved in the namespace for a later FORMAT
+ * UNIT.  The mode pages are walked twice: the first pass only validates
+ * them, the second pass issues the NVMe commands, so that nothing is
+ * changed when any page in the list is unacceptable.
+ *
+ * Everything in the list is supplied by user space, so every offset is
+ * checked against parm_list_len before it is dereferenced.  A list that
+ * is shorter than its header, whose block descriptors or pages extend
+ * past its end, or that contains a page shorter than 4 bytes is rejected
+ * with CHECK CONDITION / ILLEGAL REQUEST / INVALID PARAMETER.  A list
+ * without any mode page is accepted.
+ *
+ * cmd, pf and sp are not used.
+ *
+ * Returns -ENOMEM if the list buffer cannot be allocated, the status of
+ * nvme_trans_copy_from_user() or of the first page that fails, the result
+ * of nvme_trans_completion() for a rejected list, or
+ * SNTI_TRANSLATION_SUCCESS.
+ */
+static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+					u8 *cmd, u16 parm_list_len, u8 pf,
+					u8 sp, u8 cdb10)
+{
+	int res = SNTI_TRANSLATION_SUCCESS;
+	u8 *parm_list;
+	u16 bd_len;
+	u8 llbaa = 0;
+	u32 index, saved_index;	/* wider than the list so it cannot wrap */
+	u8 page_code;
+	u16 mp_size;
+
+	/* Get parm list from data-in/out buffer */
+	parm_list = kmalloc(parm_list_len, GFP_KERNEL);
+	if (parm_list == NULL) {
+		res = -ENOMEM;
+		goto out;
+	}
+
+	res = nvme_trans_copy_from_user(hdr, parm_list, parm_list_len);
+	if (res != SNTI_TRANSLATION_SUCCESS)
+		goto out_mem;
+
+	/* The mode parameter header must be complete before it is read */
+	index = (cdb10) ? (MODE_SELECT_10_MPH_SIZE) : (MODE_SELECT_6_MPH_SIZE);
+	if (parm_list_len < index)
+		goto out_invalid;
+
+	nvme_trans_modesel_get_bd_len(parm_list, cdb10, &bd_len, &llbaa);
+
+	if (bd_len != 0) {
+		/* Block Descriptors present, parse */
+		if (bd_len < ((llbaa == 0) ?
+				SHORT_DESC_BLOCK : LONG_DESC_BLOCK) ||
+		    index + bd_len > parm_list_len)
+			goto out_invalid;
+		nvme_trans_modesel_save_bd(ns, parm_list, index, bd_len, llbaa);
+		index += bd_len;
+	}
+	saved_index = index;
+
+	/* Multiple mode pages may be present; iterate through all */
+	/* In 1st Iteration, don't do NVME Command, only check for CDB errors */
+	while (index < parm_list_len) {
+		/* Page code and page length bytes must both be present */
+		if (parm_list_len - index < 2)
+			goto out_invalid;
+		page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK;
+		mp_size = parm_list[index + 1] + 2;
+		/*
+		 * The page handlers read bytes 2 and 3 of a page, and the
+		 * whole page has to lie inside the parameter list.
+		 */
+		if (mp_size < 4 || mp_size > parm_list_len - index)
+			goto out_invalid;
+		if ((page_code != MODE_PAGE_CACHING) &&
+		    (page_code != MODE_PAGE_CONTROL) &&
+		    (page_code != MODE_PAGE_POWER_CONDITION)) {
+			res = nvme_trans_completion(hdr,
+						SAM_STAT_CHECK_CONDITION,
+						ILLEGAL_REQUEST,
+						SCSI_ASC_INVALID_CDB,
+						SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+			goto out_mem;
+		}
+		index += mp_size;
+	}
+
+	/* In 2nd Iteration, do the NVME Commands (layout validated above) */
+	for (index = saved_index; index < parm_list_len; index += mp_size) {
+		page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK;
+		mp_size = parm_list[index + 1] + 2;
+		res = nvme_trans_modesel_get_mp(ns, hdr, &parm_list[index],
+								page_code);
+		if (res != SNTI_TRANSLATION_SUCCESS)
+			break;
+	}
+	goto out_mem;
+
+ out_invalid:
+	res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+				ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER,
+				SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ out_mem:
+	kfree(parm_list);
+ out:
+	return res;
+}
+
+/* Format Unit Helper Functions */
+
+/*
+ * Make sure the namespace has a block count and a block length recorded
+ * for an upcoming FORMAT UNIT.
+ *
+ * SCSI expects a MODE SELECT to have been issued before FORMAT UNIT, with
+ * the block size and number taken from its block descriptor.  Whichever
+ * of the two values is still zero is filled in from the current Identify
+ * Namespace data (ncap, and the data size of the active LBA format).
+ *
+ * Returns -ENOMEM if the Identify buffer cannot be allocated, the
+ * translated or raw NVMe status if Identify fails, and
+ * SNTI_TRANSLATION_SUCCESS otherwise.
+ */
+static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns,
+					     struct sg_io_hdr *hdr)
+{
+	struct nvme_dev *dev = ns->dev;
+	struct nvme_id_ns *id_ns;
+	dma_addr_t id_dma;
+	u8 fmt_idx;
+	int nvme_sc;
+	int res;
+
+	/* Both values already known: nothing to look up */
+	if (ns->mode_select_num_blocks != 0 && ns->mode_select_block_len != 0)
+		return SNTI_TRANSLATION_SUCCESS;
+
+	id_ns = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(*id_ns),
+				   &id_dma, GFP_KERNEL);
+	if (id_ns == NULL)
+		return -ENOMEM;
+
+	/* Identify Namespace */
+	nvme_sc = nvme_identify(dev, ns->ns_id, 0, id_dma);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	if (!res)
+		res = nvme_sc;
+	if (res)
+		goto free_dma;
+
+	if (ns->mode_select_num_blocks == 0)
+		ns->mode_select_num_blocks = le64_to_cpu(id_ns->ncap);
+	if (ns->mode_select_block_len == 0) {
+		fmt_idx = (id_ns->flbas) & 0x0F;
+		ns->mode_select_block_len = (1 << (id_ns->lbaf[fmt_idx].ds));
+	}
+
+ free_dma:
+	dma_free_coherent(&dev->pci_dev->dev, sizeof(*id_ns), id_ns, id_dma);
+	return res;
+}
+
+/*
+ * Read the FORMAT UNIT parameter list header from user space and derive
+ * the NVMe protection information code from it.
+ *
+ * @len:              number of header bytes to copy in
+ * @format_prot_info: FMTPINFO value from the CDB
+ * @nvme_pf_code:     receives the NVMe code on success
+ *
+ * The request is completed with CHECK CONDITION / ILLEGAL REQUEST when
+ * the IMMED bit is set, when a long header carries a non-zero low nibble
+ * at FORMAT_UNIT_PROT_INT_OFFSET, or when the combination of protection
+ * field usage and FMTPINFO is not one of the supported ones; the result
+ * of nvme_trans_completion() is returned in those cases and
+ * *nvme_pf_code is left untouched.
+ *
+ * Returns -ENOMEM if the header buffer cannot be allocated, or the status
+ * of nvme_trans_copy_from_user() if the copy fails.
+ */
+static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len,
+					u8 format_prot_info, u8 *nvme_pf_code)
+{
+	u8 *header;
+	u8 usage, combined;
+	int res;
+
+	header = kmalloc(len, GFP_KERNEL);
+	if (header == NULL)
+		return -ENOMEM;
+
+	res = nvme_trans_copy_from_user(hdr, header, len);
+	if (res != SNTI_TRANSLATION_SUCCESS)
+		goto free_header;
+
+	/* Immediate completion is not supported */
+	if (header[FORMAT_UNIT_IMMED_OFFSET] & FORMAT_UNIT_IMMED_MASK)
+		goto invalid_cdb;
+
+	if (len == FORMAT_UNIT_LONG_PARM_LIST_LEN &&
+	    (header[FORMAT_UNIT_PROT_INT_OFFSET] & 0x0F))
+		goto invalid_cdb;
+
+	usage = header[FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET] &
+			FORMAT_UNIT_PROT_FIELD_USAGE_MASK;
+	combined = (usage << 2) | format_prot_info;
+
+	/* Supported usage/FMTPINFO combinations and their NVMe codes */
+	if (combined == 0)
+		*nvme_pf_code = 0;
+	else if (combined == 2)
+		*nvme_pf_code = 1;
+	else if (combined == 3)
+		*nvme_pf_code = 2;
+	else if (combined == 7)
+		*nvme_pf_code = 3;
+	else
+		goto invalid_cdb;
+
+	goto free_header;
+
+ invalid_cdb:
+	res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+				ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+				SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ free_header:
+	kfree(header);
+	return res;
+}
+
+/*
+ * Issue the NVMe Format NVM command for a SCSI FORMAT UNIT.
+ *
+ * The LBA format is chosen by searching the namespace's LBA format table
+ * for an entry whose data size equals ns->mode_select_block_len.
+ * prot_info is placed in bits 5 and up of command dword 10, the selected
+ * LBA format index in bits 3:0.
+ *
+ * The command is NOT sent, and the request is completed with CHECK
+ * CONDITION / ILLEGAL REQUEST / INVALID PARAMETER, when no LBA format
+ * matches the requested block length or when ns->mode_select_num_blocks
+ * differs from the namespace capacity; the result of
+ * nvme_trans_completion() is returned in those cases.
+ *
+ * Returns -ENOMEM if the Identify buffer cannot be allocated, otherwise
+ * the translated or raw NVMe status of the failing command, or
+ * SNTI_TRANSLATION_SUCCESS.
+ */
+static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+				   u8 prot_info)
+{
+	int res = SNTI_TRANSLATION_SUCCESS;
+	int nvme_sc;
+	struct nvme_dev *dev = ns->dev;
+	dma_addr_t dma_addr;
+	void *mem;
+	struct nvme_id_ns *id_ns;
+	unsigned int i;
+	u8 nlbaf;
+	u8 selected_lbaf = 0xFF;
+	u32 cdw10 = 0;
+	struct nvme_command c;
+
+	/* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */
+	mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
+							&dma_addr, GFP_KERNEL);
+	if (mem == NULL) {
+		res = -ENOMEM;
+		goto out;
+	}
+	/* nvme ns identify */
+	nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	if (res)
+		goto out_dma;
+	if (nvme_sc) {
+		res = nvme_sc;
+		goto out_dma;
+	}
+	id_ns = mem;
+	nlbaf = id_ns->nlbaf;
+
+	/*
+	 * NLBAF is a zero-based value, so index nlbaf itself is a valid
+	 * format.  The index is also capped at 0x0F, the largest value the
+	 * 4 bit LBAF field of the command can carry.
+	 */
+	for (i = 0; i <= nlbaf && i <= 0x0F; i++) {
+		if (ns->mode_select_block_len == (1 << (id_ns->lbaf[i].ds))) {
+			selected_lbaf = i;
+			break;
+		}
+	}
+	/* Reject instead of formatting with a bogus LBA format */
+	if (selected_lbaf > 0x0F) {
+		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+				ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER,
+				SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+		goto out_dma;
+	}
+	/* Changing the number of blocks is not supported */
+	if (ns->mode_select_num_blocks != le64_to_cpu(id_ns->ncap)) {
+		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+				ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER,
+				SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+		goto out_dma;
+	}
+
+	cdw10 |= prot_info << 5;
+	cdw10 |= selected_lbaf & 0x0F;
+	memset(&c, 0, sizeof(c));
+	c.format.opcode = nvme_admin_format_nvm;
+	c.format.nsid = cpu_to_le32(ns->ns_id);
+	c.format.cdw10 = cpu_to_le32(cdw10);
+
+	nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	if (res)
+		goto out_dma;
+	if (nvme_sc)
+		res = nvme_sc;
+
+ out_dma:
+	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
+			  dma_addr);
+ out:
+	return res;
+}
+
+/* Read/Write Helper Functions */
+
+/*
+ * Decode a READ(6) / WRITE(6) CDB into cdb_info.  The 6 byte CDB carries
+ * neither FUA nor protection information, so both are reported as 0.
+ */
+static inline void nvme_trans_get_io_cdb6(u8 *cmd,
+					struct nvme_trans_io_cdb *cdb_info)
+{
+	u8 blocks = GET_U8_FROM_CDB(cmd, IO_6_CDB_TX_LEN_OFFSET);
+
+	cdb_info->fua = 0;
+	cdb_info->prot_info = 0;
+	cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_6_CDB_LBA_OFFSET) &
+					IO_6_CDB_LBA_MASK;
+
+	/* sbc3r27 sec 5.32 - TRANSFER LEN of 0 implies a 256 Block transfer */
+	cdb_info->xfer_len = blocks ? blocks : IO_6_DEFAULT_TX_LEN;
+}
+
+/*
+ * Decode a READ(10) / WRITE(10) CDB into cdb_info: FUA bit, protection
+ * information field, 32 bit LBA and 16 bit transfer length.
+ */
+static inline void nvme_trans_get_io_cdb10(u8 *cmd,
+					struct nvme_trans_io_cdb *cdb_info)
+{
+	cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_10_CDB_FUA_OFFSET) &
+					IO_CDB_FUA_MASK;
+	/* Mask first, then shift: '>>' binds tighter than '&' */
+	cdb_info->prot_info = (GET_U8_FROM_CDB(cmd, IO_10_CDB_WP_OFFSET) &
+					IO_CDB_WP_MASK) >> IO_CDB_WP_SHIFT;
+	cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_10_CDB_LBA_OFFSET);
+	cdb_info->xfer_len = GET_U16_FROM_CDB(cmd, IO_10_CDB_TX_LEN_OFFSET);
+}
+
+/*
+ * Decode a READ(12) / WRITE(12) CDB into cdb_info: FUA bit, protection
+ * information field, 32 bit LBA and 32 bit transfer length.
+ */
+static inline void nvme_trans_get_io_cdb12(u8 *cmd,
+					struct nvme_trans_io_cdb *cdb_info)
+{
+	cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_12_CDB_FUA_OFFSET) &
+					IO_CDB_FUA_MASK;
+	/* Mask first, then shift: '>>' binds tighter than '&' */
+	cdb_info->prot_info = (GET_U8_FROM_CDB(cmd, IO_12_CDB_WP_OFFSET) &
+					IO_CDB_WP_MASK) >> IO_CDB_WP_SHIFT;
+	cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_12_CDB_LBA_OFFSET);
+	cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_12_CDB_TX_LEN_OFFSET);
+}
+
+/*
+ * Decode a READ(16) / WRITE(16) CDB into cdb_info: FUA bit, protection
+ * information field, 64 bit LBA and 32 bit transfer length.
+ */
+static inline void nvme_trans_get_io_cdb16(u8 *cmd,
+					struct nvme_trans_io_cdb *cdb_info)
+{
+	cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_16_CDB_FUA_OFFSET) &
+					IO_CDB_FUA_MASK;
+	/* Mask first, then shift: '>>' binds tighter than '&' */
+	cdb_info->prot_info = (GET_U8_FROM_CDB(cmd, IO_16_CDB_WP_OFFSET) &
+					IO_CDB_WP_MASK) >> IO_CDB_WP_SHIFT;
+	cdb_info->lba = GET_U64_FROM_CDB(cmd, IO_16_CDB_LBA_OFFSET);
+	cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_16_CDB_TX_LEN_OFFSET);
+}
+
+/*
+ * Number of NVMe commands needed for one SCSI read/write: one per iovec
+ * entry when the request uses an iovec list, otherwise the transfer
+ * length divided by max_blocks, rounded up, and never less than one.
+ */
+static inline u32 nvme_trans_io_get_num_cmds(struct sg_io_hdr *hdr,
+					struct nvme_trans_io_cdb *cdb_info,
+					u32 max_blocks)
+{
+	/* If using iovecs, send one nvme command per vector */
+	if (hdr->iovec_count > 0)
+		return hdr->iovec_count;
+
+	/* Fits in a single command */
+	if (cdb_info->xfer_len <= max_blocks)
+		return 1;
+
+	return ((cdb_info->xfer_len - 1) / max_blocks) + 1;
+}
+
+/*
+ * Build the control field of an NVMe read/write command from the decoded
+ * CDB.  Only FUA is translated; ns is not used.
+ */
+static u16 nvme_trans_io_get_control(struct nvme_ns *ns,
+					struct nvme_trans_io_cdb *cdb_info)
+{
+	/* When Protection information support is added, implement here */
+
+	return (cdb_info->fua > 0) ? NVME_RW_FUA : 0;
+}
+
+/*
+ * Carry out a SCSI read or write as one or more NVMe I/O commands.
+ *
+ * Two layouts of the user buffer are handled:
+ *   - an iovec list: one NVMe command per entry, using iov_base as the
+ *     mapping address and iov_len as the transfer length;
+ *   - a single buffer: split into chunks of at most max_blocks blocks,
+ *     each command offset accordingly.
+ * In both cases the starting LBA of each command is the CDB's LBA plus
+ * the number of blocks already submitted.
+ *
+ * Returns -EFAULT if an iovec entry cannot be read, the error from
+ * mapping the user pages, -ENOMEM if the PRP setup is short, the
+ * translated NVMe status of the first command that fails, or the
+ * translation of NVME_SC_SUCCESS when all commands completed.
+ */
+static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+				struct nvme_trans_io_cdb *cdb_info, u8 is_write)
+{
+	struct nvme_dev *dev = ns->dev;
+	int dma_dir = (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+	u8 opcode = (is_write ? nvme_cmd_write : nvme_cmd_read);
+	u32 max_blocks = queue_max_hw_sectors(ns->queue);
+	u32 num_cmds = nvme_trans_io_get_num_cmds(hdr, cdb_info, max_blocks);
+	u64 blocks_done = 0;
+	u32 n;
+
+	for (n = 0; n < num_cmds; n++) {
+		struct nvme_command c;
+		struct nvme_iod *iod;
+		void __user *user_addr;
+		u64 chunk_bytes;
+		u64 chunk_blocks;
+		u32 prp_len;
+		int nvme_sc;
+
+		memset(&c, 0, sizeof(c));
+
+		if (hdr->iovec_count > 0) {
+			struct sg_iovec vec;
+
+			if (copy_from_user(&vec, hdr->dxferp +
+					n * sizeof(struct sg_iovec),
+					sizeof(struct sg_iovec)))
+				return -EFAULT;
+			chunk_bytes = vec.iov_len;
+			chunk_blocks = chunk_bytes >> ns->lba_shift;
+			user_addr = vec.iov_base;
+		} else {
+			chunk_blocks = min((u64)max_blocks,
+					(cdb_info->xfer_len - blocks_done));
+			chunk_bytes = chunk_blocks << ns->lba_shift;
+			user_addr = hdr->dxferp +
+					((1 << ns->lba_shift) * blocks_done);
+		}
+
+		c.rw.opcode = opcode;
+		c.rw.nsid = cpu_to_le32(ns->ns_id);
+		c.rw.slba = cpu_to_le64(cdb_info->lba + blocks_done);
+		/* The NVMe length field holds the block count minus one */
+		c.rw.length = cpu_to_le16(chunk_blocks - 1);
+		c.rw.control = cpu_to_le16(nvme_trans_io_get_control(ns,
+								cdb_info));
+
+		iod = nvme_map_user_pages(dev, dma_dir,
+				(unsigned long)user_addr, chunk_bytes);
+		if (IS_ERR(iod))
+			return PTR_ERR(iod);
+
+		prp_len = nvme_setup_prps(dev, iod, chunk_bytes, GFP_KERNEL);
+		if (prp_len != chunk_bytes) {
+			nvme_unmap_user_pages(dev, dma_dir, iod);
+			nvme_free_iod(dev, iod);
+			return -ENOMEM;
+		}
+		c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+		c.rw.prp2 = cpu_to_le64(iod->first_dma);
+
+		blocks_done += chunk_blocks;
+
+		nvme_sc = nvme_submit_io_cmd(dev, &c, NULL);
+
+		/* The mapping is released whether or not the command worked */
+		nvme_unmap_user_pages(dev, dma_dir, iod);
+		nvme_free_iod(dev, iod);
+
+		if (nvme_sc != NVME_SC_SUCCESS)
+			return nvme_trans_status_code(hdr, nvme_sc);
+	}
+
+	return nvme_trans_status_code(hdr, NVME_SC_SUCCESS);
+}
+
+
+/* SCSI Command Translation Functions */
+
+/*
+ * Translate a SCSI READ or WRITE (6, 10, 12 or 16 byte CDB) into NVMe I/O.
+ *
+ * The CDB is decoded, the size of the user buffer (hdr->dxfer_len, or
+ * the sum of the iovec lengths if that is smaller) is checked against
+ * the transfer length of the CDB, and the transfer is handed to
+ * nvme_trans_do_nvme_io().
+ *
+ * Returns:
+ *   SNTI_INTERNAL_ERROR  for an opcode that is not a READ or WRITE
+ *   -EFAULT              if an iovec entry cannot be read from user space
+ *   nvme_trans_completion() result (CHECK CONDITION) if an iovec length
+ *                        is not a multiple of the block size
+ *   -EINVAL              if the buffer size does not match the CDB's
+ *                        transfer length
+ *   SNTI_TRANSLATION_SUCCESS for a zero length transfer (nothing is sent)
+ *   otherwise the result of nvme_trans_do_nvme_io()
+ */
+static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write,
+							u8 *cmd)
+{
+	int res = SNTI_TRANSLATION_SUCCESS;
+	struct nvme_trans_io_cdb cdb_info;
+	u8 opcode = cmd[0];
+	u64 xfer_bytes;
+	u64 sum_iov_len = 0;
+	struct sg_iovec sgl;
+	int i;
+	size_t not_copied;
+
+	/* Extract Fields from CDB */
+	switch (opcode) {
+	case WRITE_6:
+	case READ_6:
+		nvme_trans_get_io_cdb6(cmd, &cdb_info);
+		break;
+	case WRITE_10:
+	case READ_10:
+		nvme_trans_get_io_cdb10(cmd, &cdb_info);
+		break;
+	case WRITE_12:
+	case READ_12:
+		nvme_trans_get_io_cdb12(cmd, &cdb_info);
+		break;
+	case WRITE_16:
+	case READ_16:
+		nvme_trans_get_io_cdb16(cmd, &cdb_info);
+		break;
+	default:
+		/* Will never really reach here */
+		res = SNTI_INTERNAL_ERROR;
+		goto out;
+	}
+
+	/* Calculate total length of transfer (in bytes) */
+	if (hdr->iovec_count > 0) {
+		for (i = 0; i < hdr->iovec_count; i++) {
+			not_copied = copy_from_user(&sgl, hdr->dxferp +
+						i * sizeof(struct sg_iovec),
+						sizeof(struct sg_iovec));
+			if (not_copied)
+				return -EFAULT;
+			sum_iov_len += sgl.iov_len;
+			/* IO vector sizes should be multiples of block size */
+			if (sgl.iov_len % (1 << ns->lba_shift) != 0) {
+				res = nvme_trans_completion(hdr,
+						SAM_STAT_CHECK_CONDITION,
+						ILLEGAL_REQUEST,
+						SCSI_ASC_INVALID_PARAMETER,
+						SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+				goto out;
+			}
+		}
+	} else {
+		sum_iov_len = hdr->dxfer_len;
+	}
+
+	/* As Per sg ioctl howto, if the lengths differ, use the lower one */
+	xfer_bytes = min(((u64)hdr->dxfer_len), sum_iov_len);
+
+	/*
+	 * If block count and actual data buffer size dont match, error out.
+	 * The byte count is computed in 64 bits: shifting the block count
+	 * in its own width could wrap and make a huge transfer length
+	 * compare equal to a small buffer.
+	 */
+	if (xfer_bytes != ((u64)cdb_info.xfer_len << ns->lba_shift)) {
+		res = -EINVAL;
+		goto out;
+	}
+
+	/* Check for 0 length transfer - it is not illegal */
+	if (cdb_info.xfer_len == 0)
+		goto out;
+
+	/* Send NVMe IO Command(s) */
+	res = nvme_trans_do_nvme_io(ns, hdr, &cdb_info, is_write);
+
+ out:
+	return res;
+}
+
+/*
+ * Translate SCSI INQUIRY.  With EVPD clear only the standard inquiry page is
+ * served; with EVPD set the page code selects one of the VPD page builders.
+ * Any other request completes with CHECK CONDITION / ILLEGAL REQUEST.
+ *
+ * Returns -ENOMEM if the scratch buffer cannot be allocated, otherwise the
+ * result of the page builder or completion helper that was invoked.
+ */
+static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+							u8 *cmd)
+{
+	u8 evpd = GET_INQ_EVPD_BIT(cmd);
+	u8 page_code = GET_INQ_PAGE_CODE(cmd);
+	int alloc_len = GET_INQ_ALLOC_LENGTH(cmd);
+	u8 *inq_response;
+	int res;
+
+	/* Scratch buffer passed to the page builders that take one */
+	inq_response = kmalloc(STANDARD_INQUIRY_LENGTH, GFP_KERNEL);
+	if (inq_response == NULL)
+		return -ENOMEM;
+
+	if (evpd != 0) {
+		switch (page_code) {
+		case VPD_SUPPORTED_PAGES:
+			res = nvme_trans_supported_vpd_pages(ns, hdr,
+						inq_response, alloc_len);
+			break;
+		case VPD_SERIAL_NUMBER:
+			res = nvme_trans_unit_serial_page(ns, hdr, inq_response,
+								alloc_len);
+			break;
+		case VPD_DEVICE_IDENTIFIERS:
+			res = nvme_trans_device_id_page(ns, hdr, inq_response,
+								alloc_len);
+			break;
+		case VPD_EXTENDED_INQUIRY:
+			res = nvme_trans_ext_inq_page(ns, hdr, alloc_len);
+			break;
+		case VPD_BLOCK_DEV_CHARACTERISTICS:
+			res = nvme_trans_bdev_char_page(ns, hdr, alloc_len);
+			break;
+		default:
+			res = nvme_trans_completion(hdr,
+						SAM_STAT_CHECK_CONDITION,
+						ILLEGAL_REQUEST,
+						SCSI_ASC_INVALID_CDB,
+						SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+			break;
+		}
+	} else if (page_code == INQ_STANDARD_INQUIRY_PAGE) {
+		res = nvme_trans_standard_inquiry_page(ns, hdr, inq_response,
+								alloc_len);
+	} else {
+		/* Non-standard page requested while EVPD is clear */
+		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+	}
+
+	kfree(inq_response);
+	return res;
+}
+
+/*
+ * Translate SCSI LOG SENSE.  Only requests with SP "not enabled" and page
+ * control "cumulative values" are accepted; the page code then selects one
+ * of the supported log page builders.  Everything else is completed with
+ * CHECK CONDITION / ILLEGAL REQUEST (invalid CDB).  The allocation length
+ * from the CDB is passed on to the page builder.
+ *
+ * Returns the result of the page builder or completion helper invoked.
+ */
+static int nvme_trans_log_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+							u8 *cmd)
+{
+	u16 alloc_len;
+	u8 sp, pc_byte, pc, page_code;
+
+	sp = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_SP_OFFSET);
+	if (sp != LOG_SENSE_CDB_SP_NOT_ENABLED)
+		goto invalid_cdb;
+
+	/* Page control and page code are packed into the same CDB byte */
+	pc_byte = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_PC_OFFSET);
+	page_code = pc_byte & LOG_SENSE_CDB_PAGE_CODE_MASK;
+	pc = (pc_byte & LOG_SENSE_CDB_PC_MASK) >> LOG_SENSE_CDB_PC_SHIFT;
+	if (pc != LOG_SENSE_CDB_PC_CUMULATIVE_VALUES)
+		goto invalid_cdb;
+
+	alloc_len = GET_U16_FROM_CDB(cmd, LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET);
+
+	switch (page_code) {
+	case LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE:
+		return nvme_trans_log_supp_pages(ns, hdr, alloc_len);
+	case LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE:
+		return nvme_trans_log_info_exceptions(ns, hdr, alloc_len);
+	case LOG_PAGE_TEMPERATURE_PAGE:
+		return nvme_trans_log_temperature(ns, hdr, alloc_len);
+	default:
+		/* Unsupported page: reported as an invalid CDB below */
+		break;
+	}
+
+	/* Common rejection path for all of the checks above */
+ invalid_cdb:
+	return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+}
+
+/*
+ * Translate SCSI MODE SELECT (6) and (10).  The page-format and save-pages
+ * bits and the parameter list length are taken from the CDB; a non-empty
+ * parameter list is then handed to nvme_trans_modesel_data().
+ */
+static int nvme_trans_mode_select(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+							u8 *cmd)
+{
+	u16 parm_list_len;
+	u8 page_format, save_pages, cdb10;
+
+	page_format = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_PAGE_FORMAT_OFFSET) &
+					MODE_SELECT_CDB_PAGE_FORMAT_MASK;
+	save_pages = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_SAVE_PAGES_OFFSET) &
+					MODE_SELECT_CDB_SAVE_PAGES_MASK;
+
+	/* List length is one byte in the 6-byte CDB, two in the 10-byte */
+	cdb10 = (GET_OPCODE(cmd) != MODE_SELECT);
+	if (cdb10)
+		parm_list_len = GET_U16_FROM_CDB(cmd,
+				MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET);
+	else
+		parm_list_len = GET_U8_FROM_CDB(cmd,
+				MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET);
+
+	/*
+	 * According to SPC-4 r24, a parameter list length field of 0
+	 * shall not be considered an error
+	 */
+	if (parm_list_len == 0)
+		return SNTI_TRANSLATION_SUCCESS;
+
+	return nvme_trans_modesel_data(ns, hdr, cmd, parm_list_len,
+						page_format, save_pages, cdb10);
+}
+
+/*
+ * Translate SCSI MODE SENSE (6) and (10).  Only page control "current
+ * values" is supported.  The page code selects the fill routine and page
+ * length passed to nvme_trans_mode_page_create(); an unsupported page
+ * control or page code is completed with CHECK CONDITION / ILLEGAL REQUEST
+ * (invalid CDB).
+ *
+ * Returns the result of nvme_trans_mode_page_create() or of the completion
+ * helper.
+ */
+static int nvme_trans_mode_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+							u8 *cmd)
+{
+	u16 alloc_len;
+	u8 cdb10, pc, page_code;
+
+	/* Allocation length is one byte in the 6-byte CDB, two in the 10 */
+	cdb10 = (GET_OPCODE(cmd) != MODE_SENSE);
+	if (cdb10)
+		alloc_len = GET_U16_FROM_CDB(cmd,
+						MODE_SENSE10_ALLOC_LEN_OFFSET);
+	else
+		alloc_len = GET_U8_FROM_CDB(cmd, MODE_SENSE6_ALLOC_LEN_OFFSET);
+
+	pc = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CONTROL_OFFSET) &
+						MODE_SENSE_PAGE_CONTROL_MASK;
+	if (pc != MODE_SENSE_PC_CURRENT_VALUES)
+		goto invalid_cdb;
+
+	page_code = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CODE_OFFSET) &
+					MODE_SENSE_PAGE_CODE_MASK;
+
+	switch (page_code) {
+	case MODE_PAGE_CACHING:
+		return nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len,
+						cdb10,
+						&nvme_trans_fill_caching_page,
+						MODE_PAGE_CACHING_LEN);
+	case MODE_PAGE_CONTROL:
+		return nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len,
+						cdb10,
+						&nvme_trans_fill_control_page,
+						MODE_PAGE_CONTROL_LEN);
+	case MODE_PAGE_POWER_CONDITION:
+		return nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len,
+						cdb10,
+						&nvme_trans_fill_pow_cnd_page,
+						MODE_PAGE_POW_CND_LEN);
+	case MODE_PAGE_INFO_EXCEP:
+		return nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len,
+						cdb10,
+						&nvme_trans_fill_inf_exc_page,
+						MODE_PAGE_INF_EXC_LEN);
+	case MODE_PAGE_RETURN_ALL:
+		return nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len,
+						cdb10,
+						&nvme_trans_fill_all_pages,
+						MODE_PAGE_ALL_LEN);
+	default:
+		/* Unsupported page: reported as an invalid CDB below */
+		break;
+	}
+
+	/* Common rejection path for page control and page code */
+ invalid_cdb:
+	return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+}
+
+/*
+ * Translate SCSI READ CAPACITY (10) and (16).  The namespace is identified
+ * into a DMA-coherent buffer, the response is built from that data by
+ * nvme_trans_fill_read_cap(), and min(allocation length, response size)
+ * bytes are copied to the user.  Only the 16-byte form takes its allocation
+ * length from the CDB.
+ *
+ * Returns -ENOMEM on allocation failure, a non-zero translated or raw NVMe
+ * status if the identify fails, else the result of the copy to user space.
+ */
+static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+							u8 *cmd)
+{
+	struct nvme_dev *dev = ns->dev;
+	u32 alloc_len = READ_CAP_10_RESP_SIZE;
+	u32 resp_size = READ_CAP_10_RESP_SIZE;
+	dma_addr_t dma_addr;
+	struct nvme_id_ns *id_ns;
+	u8 *response;
+	u8 cdb16;
+	int nvme_sc, res;
+
+	cdb16 = IS_READ_CAP_16(cmd);
+	if (cdb16) {
+		resp_size = READ_CAP_16_RESP_SIZE;
+		alloc_len = GET_READ_CAP_16_ALLOC_LENGTH(cmd);
+	}
+
+	id_ns = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(*id_ns),
+							&dma_addr, GFP_KERNEL);
+	if (id_ns == NULL)
+		return -ENOMEM;
+
+	/* nvme ns identify */
+	nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	if (res == 0)
+		res = nvme_sc;	/* untranslated failure, if any */
+	if (res)
+		goto free_id;
+
+	response = kzalloc(resp_size, GFP_KERNEL);
+	if (response == NULL) {
+		res = -ENOMEM;
+		goto free_id;
+	}
+	nvme_trans_fill_read_cap(response, id_ns, cdb16);
+	res = nvme_trans_copy_to_user(hdr, response,
+					min(alloc_len, resp_size));
+	kfree(response);
+
+ free_id:
+	dma_free_coherent(&dev->pci_dev->dev, sizeof(*id_ns), id_ns, dma_addr);
+	return res;
+}
+
+/*
+ * Translate SCSI REPORT LUNS.  The controller is identified to learn the
+ * namespace count; the response is the LUN list length followed by one
+ * entry per namespace, numbered from 0.  An unsupported SELECT REPORT value
+ * or an allocation length too small for the whole list completes with
+ * CHECK CONDITION / ILLEGAL REQUEST (invalid CDB).  Returns -ENOMEM on
+ * allocation failure and a non-zero status if the identify fails.
+ */
+static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+							u8 *cmd)
+{
+	int res, nvme_sc;
+	u32 alloc_len, resp_size;
+	u8 select_report;
+	u8 *response;
+	struct nvme_dev *dev = ns->dev;
+	dma_addr_t dma_addr;
+	void *mem;
+	struct nvme_id_ctrl *id_ctrl;
+	u32 ll_length, lun_id, nn;
+	/* Byte offset into response, so it must be as wide as resp_size */
+	u32 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET;
+	__be32 tmp_len;
+
+	alloc_len = GET_REPORT_LUNS_ALLOC_LENGTH(cmd);
+	select_report = GET_U8_FROM_CDB(cmd, REPORT_LUNS_SR_OFFSET);
+
+	if ((select_report != ALL_LUNS_RETURNED) &&
+	    (select_report != ALL_WELL_KNOWN_LUNS_RETURNED) &&
+	    (select_report != RESTRICTED_LUNS_RETURNED))
+		return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+
+	/* NVMe Controller Identify */
+	mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(*id_ctrl),
+							&dma_addr, GFP_KERNEL);
+	if (mem == NULL)
+		return -ENOMEM;
+
+	nvme_sc = nvme_identify(dev, 0, 1, dma_addr);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	if (!res)
+		res = nvme_sc;
+	if (res)
+		goto out_dma;
+	id_ctrl = mem;
+	nn = le32_to_cpu(id_ctrl->nn);
+
+	/*
+	 * The whole list must fit in the caller's buffer.  Compare in 64 bits
+	 * so a huge namespace count cannot wrap and pass with a short buffer.
+	 */
+	if ((u64)nn * LUN_ENTRY_SIZE + LUN_DATA_HEADER_SIZE > alloc_len) {
+		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+		goto out_dma;
+	}
+	ll_length = nn * LUN_ENTRY_SIZE;
+	resp_size = ll_length + LUN_DATA_HEADER_SIZE;
+
+	response = kzalloc(resp_size, GFP_KERNEL);
+	if (response == NULL) {
+		res = -ENOMEM;
+		goto out_dma;
+	}
+
+	/* The first LUN ID will always be 0 per the SAM spec */
+	for (lun_id = 0; lun_id < nn; lun_id++) {
+		/* Set the LUN Id, then step to the next entry */
+		__be64 tmp_id = cpu_to_be64(lun_id);
+		memcpy(&response[lun_id_offset], &tmp_id, sizeof(u64));
+		lun_id_offset += LUN_ENTRY_SIZE;
+	}
+	tmp_len = cpu_to_be32(ll_length);
+	memcpy(response, &tmp_len, sizeof(u32));
+
+	/* resp_size cannot exceed alloc_len after the check above */
+	res = nvme_trans_copy_to_user(hdr, response, resp_size);
+	kfree(response);
+ out_dma:
+	dma_free_coherent(&dev->pci_dev->dev, sizeof(*id_ctrl), mem, dma_addr);
+	return res;
+}
+
+/*
+ * Translate SCSI REQUEST SENSE.  The reply built here always reports NO
+ * SENSE, in descriptor or fixed format depending on the DESC bit of the
+ * CDB, and is truncated to the allocation length before being copied out.
+ * Returns -ENOMEM if the reply cannot be allocated, else the result of the
+ * copy to user space.
+ */
+static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+							u8 *cmd)
+{
+	u8 alloc_len, desc_format, resp_size;
+	u8 *response;
+	int res;
+
+	alloc_len = GET_REQUEST_SENSE_ALLOC_LENGTH(cmd);
+	desc_format = GET_U8_FROM_CDB(cmd, REQUEST_SENSE_DESC_OFFSET) &
+						REQUEST_SENSE_DESC_MASK;
+
+	if (desc_format)
+		resp_size = DESC_FMT_SENSE_DATA_SIZE;
+	else
+		resp_size = FIXED_FMT_SENSE_DATA_SIZE;
+
+	response = kzalloc(resp_size, GFP_KERNEL);
+	if (response == NULL)
+		return -ENOMEM;
+
+	if (desc_format != DESCRIPTOR_FORMAT_SENSE_DATA_TYPE) {
+		/* Fixed Format Sense Data; bytes not set here stay zero */
+		response[0] = FIXED_SENSE_DATA;
+		response[2] = NO_SENSE; /* FM, EOM, ILI, SDAT_OVFL = 0 */
+		response[7] = FIXED_SENSE_DATA_ADD_LENGTH;
+		response[12] = SCSI_ASC_NO_SENSE;
+		response[13] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+	} else {
+		/* Descriptor Format Sense Data */
+		response[0] = DESC_FORMAT_SENSE_DATA;
+		response[1] = NO_SENSE;
+		/* TODO How is LOW POWER CONDITION ON handled? (byte 2) */
+		response[2] = SCSI_ASC_NO_SENSE;
+		response[3] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+		/* SDAT_OVFL = 0 | Additional Sense Length = 0 */
+	}
+
+	res = nvme_trans_copy_to_user(hdr, response,
+					min(alloc_len, resp_size));
+	kfree(response);
+	return res;
+}
+
+/* SECURITY PROTOCOL IN/OUT are always completed as an illegal command. */
+static int nvme_trans_security_protocol(struct nvme_ns *ns,
+					struct sg_io_hdr *hdr, u8 *cmd)
+{
+	return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+				     ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND,
+				     SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+}
+
+/*
+ * Translate SCSI START STOP UNIT.  IMMED is rejected as an invalid CDB.
+ * Unless NO_FLUSH is set, an NVMe flush of the namespace is issued first;
+ * the requested power condition is then applied by
+ * nvme_trans_power_state().  A flush failure is returned without touching
+ * the power state.
+ */
+static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+							u8 *cmd)
+{
+	struct nvme_command flush;
+	u8 immed, pcmod, pc, no_flush, start;
+	int nvme_sc, res;
+
+	immed = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_IMMED_OFFSET);
+	immed &= START_STOP_UNIT_CDB_IMMED_MASK;
+	pcmod = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET);
+	pcmod &= START_STOP_UNIT_CDB_POWER_COND_MOD_MASK;
+	pc = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_OFFSET);
+	pc = (pc & START_STOP_UNIT_CDB_POWER_COND_MASK) >> NIBBLE_SHIFT;
+	no_flush = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_NO_FLUSH_OFFSET);
+	no_flush &= START_STOP_UNIT_CDB_NO_FLUSH_MASK;
+	start = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_START_OFFSET);
+	start &= START_STOP_UNIT_CDB_START_MASK;
+
+	if (immed != 0)
+		return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+
+	if (no_flush == 0) {
+		/* Issue NVME FLUSH command prior to START STOP UNIT */
+		memset(&flush, 0, sizeof(flush));
+		flush.common.opcode = nvme_cmd_flush;
+		flush.common.nsid = cpu_to_le32(ns->ns_id);
+
+		nvme_sc = nvme_submit_io_cmd(ns->dev, &flush, NULL);
+		res = nvme_trans_status_code(hdr, nvme_sc);
+		if (res)
+			return res;
+		if (nvme_sc)
+			return nvme_sc;
+	}
+
+	/* Setup the expected power state transition */
+	return nvme_trans_power_state(ns, hdr, pc, pcmod, start);
+}
+
+/*
+ * Translate SCSI SYNCHRONIZE CACHE into an NVMe flush of this namespace.
+ *
+ * Returns the translated status when it is non-zero, otherwise the raw NVMe
+ * status of the flush.
+ */
+static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
+					struct sg_io_hdr *hdr, u8 *cmd)
+{
+	struct nvme_command flush;
+	int nvme_sc, res;
+
+	memset(&flush, 0, sizeof(flush));
+	flush.common.opcode = nvme_cmd_flush;
+	flush.common.nsid = cpu_to_le32(ns->ns_id);
+
+	nvme_sc = nvme_submit_io_cmd(ns->dev, &flush, NULL);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+
+	/* A status the translation maps to zero is still passed up as-is */
+	return res ? res : nvme_sc;
+}
+
+/*
+ * Translate SCSI FORMAT UNIT.  The CDB is validated, the parameter list
+ * header is fetched when protection information is requested together with
+ * format data, an activation of previously downloaded firmware is
+ * attempted, and the namespace is then formatted.
+ *
+ * Returns the first failing helper's result, or that of the format command.
+ */
+static int nvme_trans_format_unit(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+							u8 *cmd)
+{
+	u8 format_prot_info, long_list, format_data;
+	u8 parm_hdr_len = 0;
+	u8 nvme_pf_code = 0;
+	int res;
+
+	format_prot_info = GET_U8_FROM_CDB(cmd,
+				FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET);
+	format_prot_info = (format_prot_info &
+				FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK) >>
+				FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT;
+	long_list = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_LONG_LIST_OFFSET);
+	long_list &= FORMAT_UNIT_CDB_LONG_LIST_MASK;
+	format_data = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET);
+	format_data &= FORMAT_UNIT_CDB_FORMAT_DATA_MASK;
+
+	if (format_data == 0) {
+		/* Protection info requested without format data */
+		if (format_prot_info != 0)
+			return nvme_trans_completion(hdr,
+					SAM_STAT_CHECK_CONDITION,
+					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+	} else if (format_prot_info != 0) {
+		if (long_list == 0)
+			parm_hdr_len = FORMAT_UNIT_SHORT_PARM_LIST_LEN;
+		else
+			parm_hdr_len = FORMAT_UNIT_LONG_PARM_LIST_LEN;
+	}
+
+	/* Only the header of the parameter list matters to the translation */
+	if (parm_hdr_len > 0) {
+		res = nvme_trans_fmt_get_parm_header(hdr, parm_hdr_len,
+					format_prot_info, &nvme_pf_code);
+		if (res != SNTI_TRANSLATION_SUCCESS)
+			return res;
+	}
+
+	/* Try to activate previously downloaded firmware; result unchecked */
+	res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, 0, 0, 0);
+
+	/* Determine Block size and count and send format command */
+	res = nvme_trans_fmt_set_blk_size_count(ns, hdr);
+	if (res != SNTI_TRANSLATION_SUCCESS)
+		return res;
+
+	return nvme_trans_fmt_send_cmd(ns, hdr, nvme_pf_code);
+}
+
+/*
+ * Translate SCSI TEST UNIT READY: GOOD if the controller status register has
+ * NVME_CSTS_RDY set, otherwise CHECK CONDITION / NOT READY.
+ */
+static int nvme_trans_test_unit_ready(struct nvme_ns *ns,
+					struct sg_io_hdr *hdr, u8 *cmd)
+{
+	u32 csts = readl(&ns->dev->bar->csts);
+
+	if (csts & NVME_CSTS_RDY)
+		return nvme_trans_completion(hdr, SAM_STAT_GOOD, NO_SENSE,
+					     0, 0);
+	return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+				     NOT_READY, SCSI_ASC_LUN_NOT_READY,
+				     SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+}
+
+/*
+ * Translate SCSI WRITE BUFFER into NVMe firmware download and/or activate
+ * commands, as selected by the MODE field.  The parameter list length must
+ * be a whole number of dwords and the buffer ID must not exceed
+ * NVME_MAX_FIRMWARE_SLOT; otherwise, or for an unsupported mode, the
+ * command completes with CHECK CONDITION / ILLEGAL REQUEST (invalid CDB).
+ * A failed download is returned without attempting the activation.
+ */
+static int nvme_trans_write_buffer(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+							u8 *cmd)
+{
+	u32 buffer_offset, parm_list_length;
+	u8 buffer_id, mode;
+	int res;
+
+	parm_list_length =
+		GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET);
+	/* NVMe expects Firmware file to be a whole number of DWORDS */
+	if (parm_list_length % BYTES_TO_DWORDS != 0)
+		goto invalid_cdb;
+
+	buffer_id = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_ID_OFFSET);
+	if (buffer_id > NVME_MAX_FIRMWARE_SLOT)
+		goto invalid_cdb;
+
+	mode = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_MODE_OFFSET) &
+						WRITE_BUFFER_CDB_MODE_MASK;
+	buffer_offset =
+		GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET);
+
+	switch (mode) {
+	case DOWNLOAD_SAVE_ACTIVATE:
+		res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw,
+						parm_list_length, buffer_offset,
+						buffer_id);
+		if (res != SNTI_TRANSLATION_SUCCESS)
+			return res;
+		return nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw,
+						parm_list_length, buffer_offset,
+						buffer_id);
+	case DOWNLOAD_SAVE_DEFER_ACTIVATE:
+		return nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw,
+						parm_list_length, buffer_offset,
+						buffer_id);
+	case ACTIVATE_DEFERRED_MICROCODE:
+		return nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw,
+						parm_list_length, buffer_offset,
+						buffer_id);
+	default:
+		/* Unsupported mode: reported as an invalid CDB below */
+		break;
+	}
+
+	/* Common rejection path for all of the checks above */
+ invalid_cdb:
+	return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+}
+
+struct scsi_unmap_blk_desc {	/* one UNMAP block descriptor, big-endian */
+	__be64	slba;	/* copied into the DSM range slba */
+	__be32	nlb;	/* copied into the DSM range nlb */
+	u32	resv;
+};
+
+struct scsi_unmap_parm_list {	/* UNMAP parameter list copied from the user */
+	__be16	unmap_data_len;
+	__be16	unmap_blk_desc_data_len;	/* >> 4 gives descriptor count */
+	u32	resv;
+	struct scsi_unmap_blk_desc desc[0];	/* descriptors follow the header */
+};
+
+/*
+ * Translate SCSI UNMAP into NVMe Dataset Management (deallocate): each block
+ * descriptor of the user's parameter list becomes one DSM range.
+ */
+static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+							u8 *cmd)
+{
+	struct nvme_dev *dev = ns->dev;
+	struct scsi_unmap_parm_list *plist;
+	struct nvme_dsm_range *range;
+	struct nvme_command c;
+	int i, res;
+	u16 ndesc, list_len;
+	dma_addr_t dma_addr;
+
+	list_len = GET_U16_FROM_CDB(cmd, UNMAP_CDB_PARAM_LIST_LENGTH_OFFSET);
+	if (list_len < sizeof(*plist))	/* too short for the header */
+		return -EINVAL;
+	plist = kmalloc(list_len, GFP_KERNEL);
+	if (!plist)
+		return -ENOMEM;
+	res = nvme_trans_copy_from_user(hdr, plist, list_len);
+	if (res != SNTI_TRANSLATION_SUCCESS)
+		goto out;
+
+	/* Every 16-byte descriptor must lie inside what was copied in */
+	res = -EINVAL;
+	ndesc = be16_to_cpu(plist->unmap_blk_desc_data_len) >> 4;
+	if (!ndesc || ndesc > 256 ||
+	    ndesc * sizeof(plist->desc[0]) > list_len - sizeof(*plist))
+		goto out;
+
+	res = -ENOMEM;	/* don't report a failed allocation as success */
+	range = dma_alloc_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range),
+							&dma_addr, GFP_KERNEL);
+	if (!range)
+		goto out;
+	for (i = 0; i < ndesc; i++) {
+		range[i].nlb = cpu_to_le32(be32_to_cpu(plist->desc[i].nlb));
+		range[i].slba = cpu_to_le64(be64_to_cpu(plist->desc[i].slba));
+		range[i].cattr = 0;
+	}
+
+	memset(&c, 0, sizeof(c));
+	c.dsm.opcode = nvme_cmd_dsm;
+	c.dsm.nsid = cpu_to_le32(ns->ns_id);
+	c.dsm.prp1 = cpu_to_le64(dma_addr);
+	c.dsm.nr = cpu_to_le32(ndesc - 1);
+	c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
+	res = nvme_trans_status_code(hdr, nvme_submit_io_cmd(dev, &c, NULL));
+	dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range),
+							range, dma_addr);
+ out:
+	kfree(plist);
+	return res;
+}
+
+/*
+ * Copy the CDB from user space and dispatch on its opcode.  The CDB buffer is
+ * zeroed first so that bytes past hdr->cmd_len are never read uninitialised.
+ */
+static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr)
+{
+	u8 cmd[BLK_MAX_CDB] = { 0 };
+	int retcode;
+
+	if (hdr->cmdp == NULL)
+		return -EMSGSIZE;
+	if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len))
+		return -EFAULT;
+
+	switch (cmd[0]) {
+	case READ_6:
+	case READ_10:
+	case READ_12:
+	case READ_16:
+		retcode = nvme_trans_io(ns, hdr, 0, cmd);
+		break;
+	case WRITE_6:
+	case WRITE_10:
+	case WRITE_12:
+	case WRITE_16:
+		retcode = nvme_trans_io(ns, hdr, 1, cmd);
+		break;
+	case INQUIRY:
+		retcode = nvme_trans_inquiry(ns, hdr, cmd);
+		break;
+	case LOG_SENSE:
+		retcode = nvme_trans_log_sense(ns, hdr, cmd);
+		break;
+	case MODE_SELECT:
+	case MODE_SELECT_10:
+		retcode = nvme_trans_mode_select(ns, hdr, cmd);
+		break;
+	case MODE_SENSE:
+	case MODE_SENSE_10:
+		retcode = nvme_trans_mode_sense(ns, hdr, cmd);
+		break;
+	case READ_CAPACITY:
+		retcode = nvme_trans_read_capacity(ns, hdr, cmd);
+		break;
+	case SERVICE_ACTION_IN:
+		if (!IS_READ_CAP_16(cmd))
+			goto out;
+		retcode = nvme_trans_read_capacity(ns, hdr, cmd);
+		break;
+	case REPORT_LUNS:
+		retcode = nvme_trans_report_luns(ns, hdr, cmd);
+		break;
+	case REQUEST_SENSE:
+		retcode = nvme_trans_request_sense(ns, hdr, cmd);
+		break;
+	case SECURITY_PROTOCOL_IN:
+	case SECURITY_PROTOCOL_OUT:
+		retcode = nvme_trans_security_protocol(ns, hdr, cmd);
+		break;
+	case START_STOP:
+		retcode = nvme_trans_start_stop(ns, hdr, cmd);
+		break;
+	case SYNCHRONIZE_CACHE:
+		retcode = nvme_trans_synchronize_cache(ns, hdr, cmd);
+		break;
+	case FORMAT_UNIT:
+		retcode = nvme_trans_format_unit(ns, hdr, cmd);
+		break;
+	case TEST_UNIT_READY:
+		retcode = nvme_trans_test_unit_ready(ns, hdr, cmd);
+		break;
+	case WRITE_BUFFER:
+		retcode = nvme_trans_write_buffer(ns, hdr, cmd);
+		break;
+	case UNMAP:
+		retcode = nvme_trans_unmap(ns, hdr, cmd);
+		break;
+	default:
+ out:
+		retcode = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+				ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND,
+				SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+		break;
+	}
+	return retcode;
+}
+
+/*
+ * SG_IO entry point (CAP_SYS_ADMIN only): copy in the user's sg_io_hdr, check
+ * it, translate the command and copy the header back out.
+ */
+int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr)
+{
+	struct sg_io_hdr hdr;
+	int translated;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	if (copy_from_user(&hdr, u_hdr, sizeof(hdr)))
+		return -EFAULT;
+	if (hdr.interface_id != 'S' || hdr.cmd_len > BLK_MAX_CDB)
+		return -EINVAL;
+
+	translated = nvme_scsi_translate(ns, &hdr);
+	if (translated < 0)
+		return translated;
+	if (copy_to_user(u_hdr, &hdr, sizeof(sg_io_hdr_t)) > 0)
+		return -EFAULT;
+	/* Positive translation results are folded into plain success */
+	return translated > 0 ? SNTI_TRANSLATION_SUCCESS : translated;
+}
+
+#ifdef CONFIG_COMPAT
+typedef struct sg_io_hdr32 {	/* compat view of sg_io_hdr, read by nvme_sg_io32() */
+	compat_int_t interface_id;	/* [i] 'S' for SCSI generic (required) */
+	compat_int_t dxfer_direction;	/* [i] data transfer direction  */
+	unsigned char cmd_len;		/* [i] SCSI command length ( <= 16 bytes) */
+	unsigned char mx_sb_len;		/* [i] max length to write to sbp */
+	unsigned short iovec_count;	/* [i] 0 implies no scatter gather */
+	compat_uint_t dxfer_len;		/* [i] byte count of data transfer */
+	compat_uint_t dxferp;		/* [i], [*io] points to data transfer memory
+					      or scatter gather list */
+	compat_uptr_t cmdp;		/* [i], [*i] points to command to perform */
+	compat_uptr_t sbp;		/* [i], [*o] points to sense_buffer memory */
+	compat_uint_t timeout;		/* [i] MAX_UINT->no timeout (unit: millisec) */
+	compat_uint_t flags;		/* [i] 0 -> default, see SG_FLAG... */
+	compat_int_t pack_id;		/* [i->o] unused internally (normally) */
+	compat_uptr_t usr_ptr;		/* [i->o] unused internally */
+	unsigned char status;		/* [o] scsi status */
+	unsigned char masked_status;	/* [o] shifted, masked scsi status */
+	unsigned char msg_status;		/* [o] messaging level data (optional) */
+	unsigned char sb_len_wr;		/* [o] byte count actually written to sbp */
+	unsigned short host_status;	/* [o] errors from host adapter */
+	unsigned short driver_status;	/* [o] errors from software driver */
+	compat_int_t resid;		/* [o] dxfer_len - actual_transferred */
+	compat_uint_t duration;		/* [o] time taken by cmd (unit: millisec) */
+	compat_uint_t info;		/* [o] auxiliary information */
+} sg_io_hdr32_t;  /* 64 bytes long (on sparc32) */
+
+typedef struct sg_iovec32 {	/* compat iovec entry, widened by sg_build_iovec() */
+	compat_uint_t iov_base;	/* user address, converted with compat_ptr() */
+	compat_uint_t iov_len;	/* copied through unchanged */
+} sg_iovec32_t;
+
+/*
+ * Widen the compat iovec array at dxferp into the space following *sgio.
+ */
+static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count)
+{
+	sg_iovec32_t __user *src = dxferp;
+	sg_iovec_t __user *dst = (sg_iovec_t __user *) (sgio + 1);
+	u32 base, len;
+	int n;
+
+	for (n = 0; n < iovec_count; n++) {
+		if (get_user(base, &src[n].iov_base) ||
+		    get_user(len, &src[n].iov_len) ||
+		    put_user(compat_ptr(base), &dst[n].iov_base) ||
+		    put_user(len, &dst[n].iov_len))
+			return -EFAULT;
+	}
+
+	return put_user(dst, &sgio->dxferp) ? -EFAULT : 0;
+}
+
+/*
+ * 32-bit compat entry for SG_IO.  A native sg_io_hdr (plus a widened iovec
+ * array, if any) is built in compat user space from the 32-bit header,
+ * nvme_sg_io() is run on it and the output fields are copied back.
+ */
+int nvme_sg_io32(struct nvme_ns *ns, unsigned long arg)
+{
+	sg_io_hdr32_t __user *sgio32 = (sg_io_hdr32_t __user *)arg;
+	sg_io_hdr_t __user *sgio;
+	unsigned char __user *cmdp;
+	unsigned char __user *sbp;
+	void __user *dxferp;
+	void __user *datap;
+	void __user *top;
+	u16 iovec_count;
+	u32 data;
+	int interface_id;
+	int err;
+
+	if (get_user(interface_id, &sgio32->interface_id))
+		return -EFAULT;
+	if (interface_id != 'S')
+		return -EINVAL;
+
+	if (get_user(iovec_count, &sgio32->iovec_count))
+		return -EFAULT;
+
+	/* Room for the native header followed by the widened iovec array */
+	top = compat_alloc_user_space(0);
+	sgio = compat_alloc_user_space(sizeof(sg_io_hdr_t) +
+				       (iovec_count * sizeof(sg_iovec_t)));
+	if ((void __user *)sgio > top)
+		return -EINVAL;
+
+	/* Fields from interface_id through dxfer_len are copied verbatim */
+	if (copy_in_user(&sgio->interface_id, &sgio32->interface_id,
+			 (2 * sizeof(int)) +
+			 (2 * sizeof(unsigned char)) +
+			 (1 * sizeof(unsigned short)) +
+			 (1 * sizeof(unsigned int))))
+		return -EFAULT;
+
+	/* dxferp is either the data buffer or a compat iovec array */
+	if (get_user(data, &sgio32->dxferp))
+		return -EFAULT;
+	dxferp = compat_ptr(data);
+	if (iovec_count) {
+		if (sg_build_iovec(sgio, dxferp, iovec_count))
+			return -EFAULT;
+	} else {
+		if (put_user(dxferp, &sgio->dxferp))
+			return -EFAULT;
+	}
+
+	/* Widen the command and sense buffer pointers */
+	if (get_user(data, &sgio32->cmdp))
+		return -EFAULT;
+	cmdp = compat_ptr(data);
+	if (get_user(data, &sgio32->sbp))
+		return -EFAULT;
+	sbp = compat_ptr(data);
+
+	if (put_user(cmdp, &sgio->cmdp) ||
+	    put_user(sbp, &sgio->sbp))
+		return -EFAULT;
+
+	if (copy_in_user(&sgio->timeout, &sgio32->timeout,
+			 3 * sizeof(int)))
+		return -EFAULT;
+
+	if (get_user(data, &sgio32->usr_ptr))
+		return -EFAULT;
+	if (put_user(compat_ptr(data), &sgio->usr_ptr))
+		return -EFAULT;
+
+	err = nvme_sg_io(ns, sgio);
+	if (err < 0)
+		return err;
+
+	/* Copy the output fields back into the 32-bit header */
+	if (copy_in_user(&sgio32->pack_id, &sgio->pack_id, sizeof(int)) ||
+	    get_user(datap, &sgio->usr_ptr) ||
+	    put_user((u32)(unsigned long)datap, &sgio32->usr_ptr) ||
+	    copy_in_user(&sgio32->status, &sgio->status,
+			 (4 * sizeof(unsigned char)) +
+			 (2 * sizeof(unsigned short)) +
+			 (3 * sizeof(int))))
+		return -EFAULT;
+
+	return err;
+}
+#endif
+
+int nvme_sg_get_version_num(int __user *ip)
+{
+	return put_user(sg_version_num, ip);	/* store sg_version_num at *ip */
+}
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
new file mode 100644
index 00000000000..79aa179305b
--- /dev/null
+++ b/drivers/block/osdblk.c
@@ -0,0 +1,699 @@
+
+/*
+   osdblk.c -- Export a single SCSI OSD object as a Linux block device
+
+
+   Copyright 2009 Red Hat, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+   Instructions for use
+   --------------------
+
+   1) Map a Linux block device to an existing OSD object.
+
+      In this example, we will use partition id 1234, object id 5678,
+      OSD device /dev/osd1.
+
+      $ echo "1234 5678 /dev/osd1" > /sys/class/osdblk/add
+
+
+   2) List all active blkdev<->object mappings.
+
+      In this example, we have performed step #1 twice, creating two blkdevs,
+      mapped to two separate OSD objects.
+
+      $ cat /sys/class/osdblk/list
+      0 174 1234 5678 /dev/osd1
+      1 179 1994 897123 /dev/osd0
+
+      The columns, in order, are:
+      - blkdev unique id
+      - blkdev assigned major
+      - OSD object partition id
+      - OSD object id
+      - OSD device
+
+
+   3) Remove an active blkdev<->object mapping.
+
+      In this example, we remove the mapping with blkdev unique id 1.
+
+      $ echo 1 > /sys/class/osdblk/remove
+
+
+   NOTE:  The actual creation and deletion of OSD objects is outside the scope
+   of this driver.
+
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <scsi/osd_initiator.h>
+#include <scsi/osd_attributes.h>
+#include <scsi/osd_sec.h>
+#include <scsi/scsi_device.h>
+
+#define DRV_NAME "osdblk"
+#define PFX DRV_NAME ": "
+
+/* #define _OSDBLK_DEBUG */
+#ifdef _OSDBLK_DEBUG
+#define OSDBLK_DEBUG(fmt, a...) \
+	printk(KERN_NOTICE "osdblk @%s:%d: " fmt, __func__, __LINE__, ##a)
+#else
+#define OSDBLK_DEBUG(fmt, a...) \
+	do { if (0) printk(fmt, ##a); } while (0)
+#endif
+
+MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
+MODULE_DESCRIPTION("block device inside an OSD object osdblk.ko");
+MODULE_LICENSE("GPL");
+
+struct osdblk_device;
+
+enum {
+	OSDBLK_MINORS_PER_MAJOR	= 256,		/* max minors per blkdev */
+	OSDBLK_MAX_REQ		= 32,		/* max parallel requests */
+	OSDBLK_OP_TIMEOUT	= 4 * 60,	/* sync OSD req timeout */
+};
+
+struct osdblk_request {
+	struct request		*rq;		/* blk layer request */
+	struct bio		*bio;		/* cloned bio */
+	struct osdblk_device	*osdev;		/* associated blkdev */
+};
+
+struct osdblk_device {
+	int			id;		/* blkdev unique id */
+
+	int			major;		/* blkdev assigned major */
+	struct gendisk		*disk;		/* blkdev's gendisk and rq */
+	struct request_queue	*q;		/* request queue of disk */
+
+	struct osd_dev		*osd;		/* associated OSD */
+
+	char			name[32];	/* blkdev name, e.g. osdblk34 */
+
+	spinlock_t		lock;		/* queue lock */
+
+	struct osd_obj_id	obj;		/* OSD partition, obj id */
+	uint8_t			obj_cred[OSD_CAP_LEN]; /* OSD cred */
+
+	struct osdblk_request	req[OSDBLK_MAX_REQ]; /* request table */
+
+	struct list_head	node;		/* link in osdblkdev_list */
+
+	char			osd_path[];	/* OSD device path; must be last */
+};
+
+static struct class *class_osdblk;		/* /sys/class/osdblk */
+static DEFINE_MUTEX(ctl_mutex);	/* Serialize open/close/setup/teardown */
+static LIST_HEAD(osdblkdev_list);
+
+static const struct block_device_operations osdblk_bd_ops = {
+	.owner		= THIS_MODULE,
+};
+
+static const struct osd_attr g_attr_logical_length = ATTR_DEF(
+	OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
+
+static void osdblk_make_credential(u8 cred_a[OSD_CAP_LEN],
+				   const struct osd_obj_id *obj)
+{	/* fill cred_a for obj; NOTE(review): confirm the false/true flags */
+	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
+}
+
+/*
+ * Perform a synchronous OSD operation: store @timeout in @or, finalize it
+ * with @credential and execute it.  Copied from exofs; move to libosd?
+ */
+static int osd_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
+{
+	int ret;
+
+	or->timeout = timeout;	/* NOTE(review): units not visible here */
+	ret = osd_finalize_request(or, 0, credential, NULL);
+	if (ret)
+		return ret;	/* finalize failed: nothing was executed */
+
+	ret = osd_execute_request(or);
+
+	/* sense data is not decoded here: osd_req_decode_sense(or, ret); */
+	return ret;
+}
+
+/*
+ * Finalize @or with @cred and submit it for asynchronous execution, passing
+ * @async_done and @caller_context along.  Copied from exofs; move to libosd?
+ */
+static int osd_async_op(struct osd_request *or, osd_req_done_fn *async_done,
+		   void *caller_context, u8 *cred)
+{
+	int err = osd_finalize_request(or, 0, cred, NULL);
+
+	/* a request that failed to finalize is never submitted */
+	if (err)
+		return err;
+
+	/* the submission result is returned to the caller unchanged */
+	return osd_execute_request_async(or, async_done, caller_context);
+}
+
+/* Find @attr (by page and id) in @or's reply.  Copied from exofs; libosd? */
+static int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
+{
+	struct osd_attr found = {.attr_page = 0}; /* start with zeros */
+	void *cursor = NULL;
+	int count;
+
+	do {
+		count = 1;	/* reset before every decode call */
+		osd_req_decode_get_attr_list(or, &found, &count, &cursor);
+		if (found.attr_page != attr->attr_page ||
+		    found.attr_id != attr->attr_id)
+			continue;
+		attr->len = found.len;
+		attr->val_ptr = found.val_ptr;
+		return 0;
+	} while (cursor);
+
+	return -EIO;
+}
+
+static int osdblk_get_obj_size(struct osdblk_device *osdev, u64 *size_out)
+{
+	struct osd_attr len_attr = g_attr_logical_length;
+	struct osd_request *or;
+	int err;
+
+	/*
+	 * Query the OSD for the logical-length attribute of the mapped
+	 * object and store it in *size_out.  *size_out is left untouched
+	 * when any step fails.
+	 */
+	or = osd_start_request(osdev->osd, GFP_KERNEL);
+	if (!or)
+		return -ENOMEM;
+
+	/* build a get-attributes request asking for the length attribute */
+	osd_req_get_attributes(or, &osdev->obj);
+	osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
+
+	/* run it synchronously ... */
+	err = osd_sync_op(or, OSDBLK_OP_TIMEOUT, osdev->obj_cred);
+
+	/* ... and pick the attribute out of the reply */
+	if (!err)
+		err = extract_attr_from_req(or, &len_attr);
+
+	/* the value is read as an unaligned big-endian 64-bit quantity */
+	if (!err)
+		*size_out = get_unaligned_be64(len_attr.val_ptr);
+
+	/* the request is released on success and failure alike */
+	osd_end_request(or);
+	return err;
+}
+
+static void osdblk_osd_complete(struct osd_request *or, void *private)
+{
+	struct osdblk_request *orq = private;
+	struct osd_sense_info osi;
+	int ret = osd_req_decode_sense(or, &osi);
+
+	if (ret) {
+		/* log the decoded error before it is collapsed into -EIO */
+		OSDBLK_DEBUG("osdblk_osd_complete with err=%d\n", ret);
+		ret = -EIO;
+	}
+
+	/* release the OSD request, then complete the block-layer request */
+	osd_end_request(or);
+	/* NOTE(review): the __ variant expects the queue lock held - confirm */
+	__blk_end_request_all(orq->rq, ret);
+}
+
+static void bio_chain_put(struct bio *chain)
+{
+	struct bio *next;
+
+	/* Drop one ref per bio in the bi_next-linked chain; NULL is a no-op. */
+	for (; chain; chain = next) {
+		/* read the link first: bio_put() may free the current bio */
+		next = chain->bi_next;
+		bio_put(chain);
+	}
+}
+
+static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask)
+{
+	struct bio *copy, *new_chain = NULL;
+	struct bio **link = &new_chain;	/* slot the next clone is hooked into */
+
+	/* Clone every bio of @old_chain, in order, into a fresh chain. */
+	for (; old_chain; old_chain = old_chain->bi_next) {
+		copy = bio_clone_kmalloc(old_chain, gfpmask);
+		if (!copy)
+			goto err_out;
+
+		copy->bi_bdev = NULL;
+		copy->bi_next = NULL;
+
+		/* __GFP_WAIT, if set, is honoured for the first clone only */
+		gfpmask &= ~__GFP_WAIT;
+
+		/* append at the tail and advance the slot */
+		*link = copy;
+		link = &copy->bi_next;
+	}
+
+	return new_chain;
+
+err_out:
+	/* drop the clones made so far; the caller only sees NULL */
+	OSDBLK_DEBUG("bio_chain_clone with err\n");
+	bio_chain_put(new_chain);
+	return NULL;
+}
+
+static void osdblk_rq_fn(struct request_queue *q)
+{
+	struct osdblk_device *osdev = q->queuedata;
+
+	/* request_fn: turn each queued block request into an async OSD op */
+	while (1) {
+		struct request *rq;
+		struct osdblk_request *orq;
+		struct osd_request *or;
+		struct bio *bio;
+		bool do_write, do_flush;
+
+		/* peek at request from block layer */
+		rq = blk_fetch_request(q);
+		if (!rq)
+			break;
+
+		/* end requests we don't understand; queue lock is held here */
+		if (rq->cmd_type != REQ_TYPE_FS) {
+			__blk_end_request_all(rq, 0);
+			continue;
+		}
+
+		/* deduce our operation (read, write, flush).  I wish the block
+		 * layer simplified cmd_type/cmd_flags/cmd[] into a clearly
+		 * defined set of RPC commands: read, write, flush, scsi
+		 * command, power mgmt req, driver-specific, etc.
+		 */
+		do_flush = rq->cmd_flags & REQ_FLUSH;
+		do_write = (rq_data_dir(rq) == WRITE);
+
+		if (!do_flush) { /* osd_flush does not use a bio */
+			/* clone the bios for the OSD request; requeue rq on failure */
+			bio = bio_chain_clone(rq->bio, GFP_ATOMIC);
+			if (!bio) {
+				blk_requeue_request(q, rq);
+				break;
+			}
+		} else
+			bio = NULL;
+
+		/* alloc internal OSD request; rq is requeued if that fails */
+		or = osd_start_request(osdev->osd, GFP_ATOMIC);
+		if (!or) {
+			blk_requeue_request(q, rq);
+			bio_chain_put(bio);
+			OSDBLK_DEBUG("osd_start_request with err\n");
+			break;
+		}
+
+		orq = &osdev->req[rq->tag];
+		orq->rq = rq;
+		orq->bio = bio;
+		orq->osdev = osdev;
+
+		/* init OSD command: flush, write or read */
+		if (do_flush)
+			osd_req_flush_object(or, &osdev->obj,
+					     OSD_CDB_FLUSH_ALL, 0, 0);
+		else if (do_write)
+			osd_req_write(or, &osdev->obj, blk_rq_pos(rq) * 512ULL,
+				      bio, blk_rq_bytes(rq));
+		else
+			osd_req_read(or, &osdev->obj, blk_rq_pos(rq) * 512ULL,
+				     bio, blk_rq_bytes(rq));
+
+		OSDBLK_DEBUG("%s 0x%x bytes at 0x%llx\n",
+			do_flush ? "flush" : do_write ?
+				"write" : "read", blk_rq_bytes(rq),
+			blk_rq_pos(rq) * 512ULL);
+
+		/* begin OSD command execution */
+		if (osd_async_op(or, osdblk_osd_complete, orq,
+				 osdev->obj_cred)) {
+			osd_end_request(or);
+			blk_requeue_request(q, rq);
+			bio_chain_put(bio);
+			OSDBLK_DEBUG("osd_execute_request_async with err\n");
+			break;
+		}
+
+		/* drop the 'flush' marker now that the command is executing */
+		rq->special = NULL;
+	}
+}
+
+static void osdblk_free_disk(struct osdblk_device *osdev)
+{
+	struct gendisk *gd = osdev->disk;
+
+	/* tolerate devices whose disk was never set up */
+	if (gd) {
+		if (gd->flags & GENHD_FL_UP)	/* only when flagged up */
+			del_gendisk(gd);
+		if (gd->queue)
+			blk_cleanup_queue(gd->queue);
+		put_disk(gd);
+	}
+}
+
+static int osdblk_init_disk(struct osdblk_device *osdev)
+{
+	struct gendisk *disk;
+	struct request_queue *q;
+	int rc;
+	u64 obj_size = 0;
+
+	/* build and announce osdev's disk; start by asking the OSD for its size */
+	rc = osdblk_get_obj_size(osdev, &obj_size);
+	if (rc)
+		return rc;
+
+	/* create gendisk info; from here on error paths must put_disk() */
+	disk = alloc_disk(OSDBLK_MINORS_PER_MAJOR);
+	if (!disk)
+		return -ENOMEM;
+
+	sprintf(disk->disk_name, DRV_NAME "%d", osdev->id);
+	disk->major = osdev->major;
+	disk->first_minor = 0;
+	disk->fops = &osdblk_bd_ops;
+	disk->private_data = osdev;
+
+	/* init request queue; osdev->lock is handed over as its lock */
+	q = blk_init_queue(osdblk_rq_fn, &osdev->lock);
+	if (!q) {
+		put_disk(disk);
+		return -ENOMEM;
+	}
+
+	/* switch queue to TCQ mode; tags index osdev->req[] in osdblk_rq_fn() */
+	rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL);
+	if (rc) {
+		blk_cleanup_queue(q);
+		put_disk(disk);
+		return rc;
+	}
+
+	/* Set our limits to the lower device limits, because osdblk cannot
+	 * sleep when allocating a lower-request and therefore cannot be
+	 * bouncing.
+	 */
+	blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
+
+	blk_queue_prep_rq(q, blk_queue_start_tag);
+	blk_queue_flush(q, REQ_FLUSH);
+
+	disk->queue = q;
+	/* osdblk_rq_fn() reads the device back from q->queuedata */
+	q->queuedata = osdev;
+
+	osdev->disk = disk;
+	osdev->q = q;
+
+	/* finally, announce the disk to the world; capacity is obj_size / 512 */
+	set_capacity(disk, obj_size / 512ULL);
+	add_disk(disk);
+
+	printk(KERN_INFO "%s: Added of size 0x%llx\n",
+		disk->disk_name, (unsigned long long)obj_size);
+
+	return 0;
+}
+
+/********************************************************************
+ * /sys/class/osdblk/
+ *                   add	map OSD object to blkdev
+ *                   remove	unmap OSD object
+ *                   list	show mappings
+ *******************************************************************/
+
+static void class_osdblk_release(struct class *cls)
+{
+	kfree(cls);	/* the class is kzalloc()ed in osdblk_sysfs_init() */
+}
+
+static ssize_t class_osdblk_list(struct class *c,
+				struct class_attribute *attr,
+				char *data)
+{
+	int n = 0;
+	struct list_head *tmp;
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+	/* one output line per mapped device, in list order */
+	list_for_each(tmp, &osdblkdev_list) {
+		struct osdblk_device *osdev;
+
+		osdev = list_entry(tmp, struct osdblk_device, node);
+		/* sysfs gives show() one page: never format past its end */
+		n += scnprintf(data + n, PAGE_SIZE - n, "%d %d %llu %llu %s\n",
+			osdev->id,
+			osdev->major,
+			osdev->obj.partition,
+			osdev->obj.id,
+			osdev->osd_path);
+	}
+
+	mutex_unlock(&ctl_mutex);
+	return n;
+}
+
+static ssize_t class_osdblk_add(struct class *c,
+				struct class_attribute *attr,
+				const char *buf, size_t count)
+{
+	struct osdblk_device *osdev;
+	ssize_t rc;
+	int irc, new_id = 0;
+	struct list_head *tmp;
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	/* map "<partition> <obj id> <OSD path>": start with a new device */
+	osdev = kzalloc(sizeof(*osdev) + strlen(buf) + 1, GFP_KERNEL);
+	if (!osdev) {
+		rc = -ENOMEM;
+		goto err_out_mod;
+	}
+
+	/* static osdblk_device initialization */
+	spin_lock_init(&osdev->lock);
+	INIT_LIST_HEAD(&osdev->node);
+
+	/* generate unique id: highest id in use plus one.  The test must be
+	 * >= so that a list holding only id 0 yields 1 rather than another 0.
+	 */
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+	list_for_each(tmp, &osdblkdev_list) {
+		struct osdblk_device *cur;
+
+		cur = list_entry(tmp, struct osdblk_device, node);
+		if (cur->id >= new_id)
+			new_id = cur->id + 1;
+	}
+
+	osdev->id = new_id;
+
+	/* add to global list */
+	list_add_tail(&osdev->node, &osdblkdev_list);
+
+	mutex_unlock(&ctl_mutex);
+
+	/* parse add command; osd_path was sized from buf, so %s cannot overrun */
+	if (sscanf(buf, "%llu %llu %s", &osdev->obj.partition, &osdev->obj.id,
+		   osdev->osd_path) != 3) {
+		rc = -EINVAL;
+		goto err_out_slot;
+	}
+
+	/* initialize rest of new object */
+	sprintf(osdev->name, DRV_NAME "%d", osdev->id);
+
+	/* contact requested OSD */
+	osdev->osd = osduld_path_lookup(osdev->osd_path);
+	if (IS_ERR(osdev->osd)) {
+		rc = PTR_ERR(osdev->osd);
+		goto err_out_slot;
+	}
+
+	/* build OSD credential */
+	osdblk_make_credential(osdev->obj_cred, &osdev->obj);
+
+	/* register our block device */
+	irc = register_blkdev(0, osdev->name);
+	if (irc < 0) {
+		rc = irc;
+		goto err_out_osd;
+	}
+
+	osdev->major = irc;
+
+	/* set up and announce blkdev mapping */
+	rc = osdblk_init_disk(osdev);
+	if (rc)
+		goto err_out_blkdev;
+
+	return count;
+
+err_out_blkdev:
+	unregister_blkdev(osdev->major, osdev->name);
+err_out_osd:
+	osduld_put_device(osdev->osd);
+err_out_slot:
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+	list_del_init(&osdev->node);
+	mutex_unlock(&ctl_mutex);
+
+	kfree(osdev);
+err_out_mod:
+	OSDBLK_DEBUG("Error adding device %s\n", buf);
+	module_put(THIS_MODULE);
+	return rc;
+}
+
+static ssize_t class_osdblk_remove(struct class *c,
+					struct class_attribute *attr,
+					const char *buf,
+					size_t count)
+{
+	struct osdblk_device *victim = NULL;
+	struct list_head *pos;
+	unsigned long parsed;
+	int wanted, err;
+
+	err = kstrtoul(buf, 10, &parsed);
+	if (err)
+		return err;
+
+	/* ids are ints: reject input that does not survive the narrowing */
+	wanted = (int) parsed;
+	if (wanted != parsed)
+		return -EINVAL;
+
+	/* unlink the matching device under ctl_mutex, tear it down after */
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+	list_for_each(pos, &osdblkdev_list) {
+		struct osdblk_device *cur;
+
+		cur = list_entry(pos, struct osdblk_device, node);
+		if (cur->id != wanted)
+			continue;
+		list_del_init(&cur->node);
+		victim = cur;
+		break;
+	}
+	mutex_unlock(&ctl_mutex);
+
+	if (!victim)
+		return -ENOENT;
+
+	/* free the disk, the major, the OSD reference and the device itself */
+	osdblk_free_disk(victim);
+	unregister_blkdev(victim->major, victim->name);
+	osduld_put_device(victim->osd);
+	kfree(victim);
+
+	/* drop the module reference taken when the device was added */
+	module_put(THIS_MODULE);
+
+	return count;
+}
+
+static struct class_attribute class_osdblk_attrs[] = {
+	__ATTR(add,	0200, NULL, class_osdblk_add),
+	__ATTR(remove,	0200, NULL, class_osdblk_remove),
+	__ATTR(list,	0444, class_osdblk_list, NULL),
+	__ATTR_NULL
+};
+
+static int osdblk_sysfs_init(void)
+{
+	struct class *cls;
+	int err;
+
+	/* the class object carries the /sys/class/osdblk control files */
+	cls = kzalloc(sizeof(*cls), GFP_KERNEL);
+	class_osdblk = cls;
+	if (!cls)
+		return -ENOMEM;
+
+	cls->name = DRV_NAME;
+	cls->owner = THIS_MODULE;
+	cls->class_release = class_osdblk_release;
+	cls->class_attrs = class_osdblk_attrs;
+
+	err = class_register(cls);
+	if (!err)
+		return 0;
+
+	/* registration failed: free the object and forget the pointer */
+	kfree(cls);
+	class_osdblk = NULL;
+	printk(PFX "failed to create class osdblk\n");
+
+	return err;
+}
+
+static void osdblk_sysfs_cleanup(void)
+{
+	if (class_osdblk)	/* NULL when sysfs init never succeeded */
+		class_destroy(class_osdblk);
+	class_osdblk = NULL;
+}
+
+static int __init osdblk_init(void)
+{
+	/*
+	 * Loading the module only creates the /sys/class/osdblk control
+	 * files; mappings are set up later through the "add" file.  The
+	 * result of the sysfs setup is the result of module init: 0 or
+	 * a negative errno.
+	 */
+	return osdblk_sysfs_init();
+}
+
+static void __exit osdblk_exit(void)
+{
+	osdblk_sysfs_cleanup();	/* module exit only tears down the sysfs class */
+}
+
+module_init(osdblk_init);
+module_exit(osdblk_exit);
+
diff --git a/drivers/block/paride/Kconfig b/drivers/block/paride/Kconfig
index 28cf3082d44..efefb5ac300 100644
--- a/drivers/block/paride/Kconfig
+++ b/drivers/block/paride/Kconfig
@@ -205,8 +205,8 @@ config PARIDE_EPAT
 	  support.
 
 config PARIDE_EPATC8
-	bool "Support c7/c8 chips (EXPERIMENTAL)"
-	depends on PARIDE_EPAT && EXPERIMENTAL
+	bool "Support c7/c8 chips"
+	depends on PARIDE_EPAT
 	help
 	  This option enables support for the newer Shuttle EP1284 (aka c7 and
 	  c8) chip. You need this if you are using any recent Imation SuperDisk
diff --git a/drivers/block/paride/bpck6.c b/drivers/block/paride/bpck6.c
index ad124525ac2..ec64e7f5d1c 100644
--- a/drivers/block/paride/bpck6.c
+++ b/drivers/block/paride/bpck6.c
@@ -20,9 +20,6 @@
 */
 
 
-/* PARAMETERS */
-static int verbose; /* set this to 1 to see debugging messages and whatnot */
-
 #define BACKPACK_VERSION "2.0.2"
 
 #include <linux/module.h>
@@ -36,6 +33,8 @@ static int verbose; /* set this to 1 to see debugging messages and whatnot */
 #include "ppc6lnx.c"
 #include "paride.h"
 
+/* PARAMETERS */
+static bool verbose; /* set this to 1 to see debugging messages and whatnot */
  
 
 #define PPCSTRUCT(pi) ((Interface *)(pi->private))
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 911dfd98d81..719cb1bc164 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -138,11 +138,13 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
 #include <linux/cdrom.h>
 #include <linux/spinlock.h>
 #include <linux/blkdev.h>
+#include <linux/mutex.h>
 #include <asm/uaccess.h>
 
+static DEFINE_MUTEX(pcd_mutex);
 static DEFINE_SPINLOCK(pcd_lock);
 
-module_param(verbose, bool, 0644);
+module_param(verbose, int, 0644);
 module_param(major, int, 0);
 module_param(name, charp, 0);
 module_param(nice, int, 0);
@@ -170,7 +172,8 @@ module_param_array(drive3, int, NULL, 0);
 static int pcd_open(struct cdrom_device_info *cdi, int purpose);
 static void pcd_release(struct cdrom_device_info *cdi);
 static int pcd_drive_status(struct cdrom_device_info *cdi, int slot_nr);
-static int pcd_media_changed(struct cdrom_device_info *cdi, int slot_nr);
+static unsigned int pcd_check_events(struct cdrom_device_info *cdi,
+				     unsigned int clearing, int slot_nr);
 static int pcd_tray_move(struct cdrom_device_info *cdi, int position);
 static int pcd_lock_door(struct cdrom_device_info *cdi, int lock);
 static int pcd_drive_reset(struct cdrom_device_info *cdi);
@@ -219,49 +222,61 @@ static int pcd_sector;		/* address of next requested sector */
 static int pcd_count;		/* number of blocks still to do */
 static char *pcd_buf;		/* buffer for request in progress */
 
-static int pcd_warned;		/* Have we logged a phase warning ? */
-
 /* kernel glue structures */
 
 static int pcd_block_open(struct block_device *bdev, fmode_t mode)
 {
 	struct pcd_unit *cd = bdev->bd_disk->private_data;
-	return cdrom_open(&cd->info, bdev, mode);
+	int ret;
+
+	mutex_lock(&pcd_mutex);
+	ret = cdrom_open(&cd->info, bdev, mode);
+	mutex_unlock(&pcd_mutex);
+
+	return ret;
 }
 
-static int pcd_block_release(struct gendisk *disk, fmode_t mode)
+static void pcd_block_release(struct gendisk *disk, fmode_t mode)
 {
 	struct pcd_unit *cd = disk->private_data;
+	mutex_lock(&pcd_mutex);
 	cdrom_release(&cd->info, mode);
-	return 0;
+	mutex_unlock(&pcd_mutex);
 }
 
 static int pcd_block_ioctl(struct block_device *bdev, fmode_t mode,
 				unsigned cmd, unsigned long arg)
 {
 	struct pcd_unit *cd = bdev->bd_disk->private_data;
-	return cdrom_ioctl(&cd->info, bdev, mode, cmd, arg);
+	int ret;
+
+	mutex_lock(&pcd_mutex);
+	ret = cdrom_ioctl(&cd->info, bdev, mode, cmd, arg);
+	mutex_unlock(&pcd_mutex);
+
+	return ret;
 }
 
-static int pcd_block_media_changed(struct gendisk *disk)
+static unsigned int pcd_block_check_events(struct gendisk *disk,
+					   unsigned int clearing)
 {
 	struct pcd_unit *cd = disk->private_data;
-	return cdrom_media_changed(&cd->info);
+	return cdrom_check_events(&cd->info, clearing);
 }
 
-static struct block_device_operations pcd_bdops = {
+static const struct block_device_operations pcd_bdops = {
 	.owner		= THIS_MODULE,
 	.open		= pcd_block_open,
 	.release	= pcd_block_release,
-	.locked_ioctl	= pcd_block_ioctl,
-	.media_changed	= pcd_block_media_changed,
+	.ioctl		= pcd_block_ioctl,
+	.check_events	= pcd_block_check_events,
 };
 
 static struct cdrom_device_ops pcd_dops = {
 	.open		= pcd_open,
 	.release	= pcd_release,
 	.drive_status	= pcd_drive_status,
-	.media_changed	= pcd_media_changed,
+	.check_events	= pcd_check_events,
 	.tray_move	= pcd_tray_move,
 	.lock_door	= pcd_lock_door,
 	.get_mcn	= pcd_get_mcn,
@@ -304,6 +319,7 @@ static void pcd_init_units(void)
 		disk->first_minor = unit;
 		strcpy(disk->disk_name, cd->name);	/* umm... */
 		disk->fops = &pcd_bdops;
+		disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
 	}
 }
 
@@ -343,11 +359,11 @@ static int pcd_wait(struct pcd_unit *cd, int go, int stop, char *fun, char *msg)
 	       && (j++ < PCD_SPIN))
 		udelay(PCD_DELAY);
 
-	if ((r & (IDE_ERR & stop)) || (j >= PCD_SPIN)) {
+	if ((r & (IDE_ERR & stop)) || (j > PCD_SPIN)) {
 		s = read_reg(cd, 7);
 		e = read_reg(cd, 1);
 		p = read_reg(cd, 2);
-		if (j >= PCD_SPIN)
+		if (j > PCD_SPIN)
 			e |= 0x100;
 		if (fun)
 			printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x"
@@ -417,12 +433,10 @@ static int pcd_completion(struct pcd_unit *cd, char *buf, char *fun)
 					printk
 					    ("%s: %s: Unexpected phase %d, d=%d, k=%d\n",
 					     cd->name, fun, p, d, k);
-				if ((verbose < 2) && !pcd_warned) {
-					pcd_warned = 1;
-					printk
-					    ("%s: WARNING: ATAPI phase errors\n",
-					     cd->name);
-				}
+				if (verbose < 2)
+					printk_once(
+					    "%s: WARNING: ATAPI phase errors\n",
+					    cd->name);
 				mdelay(1);
 			}
 			if (k++ > PCD_TMO) {
@@ -490,13 +504,14 @@ static int pcd_packet(struct cdrom_device_info *cdi, struct packet_command *cgc)
 
 #define DBMSG(msg)	((verbose>1)?(msg):NULL)
 
-static int pcd_media_changed(struct cdrom_device_info *cdi, int slot_nr)
+static unsigned int pcd_check_events(struct cdrom_device_info *cdi,
+				     unsigned int clearing, int slot_nr)
 {
 	struct pcd_unit *cd = cdi->handle;
 	int res = cd->changed;
 	if (res)
 		cd->changed = 0;
-	return res;
+	return res ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static int pcd_lock_door(struct cdrom_device_info *cdi, int lock)
@@ -732,7 +747,7 @@ static void do_pcd_request(struct request_queue * q)
 			pcd_current = cd;
 			pcd_sector = blk_rq_pos(pcd_req);
 			pcd_count = blk_rq_cur_sectors(pcd_req);
-			pcd_buf = pcd_req->buffer;
+			pcd_buf = bio_data(pcd_req->bio);
 			pcd_busy = 1;
 			ps_set_intr(do_pcd_read, NULL, 0, nice);
 			return;
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index bf5955b3d87..fea7e76a00d 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -124,8 +124,9 @@
    by default.
 
 */
+#include <linux/types.h>
 
-static int verbose = 0;
+static bool verbose = 0;
 static int major = PD_MAJOR;
 static char *name = PD_NAME;
 static int cluster = 64;
@@ -145,6 +146,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_GEO, D_SBY, D_DLY, D_SLV};
 
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/delay.h>
 #include <linux/hdreg.h>
@@ -152,9 +154,11 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_GEO, D_SBY, D_DLY, D_SLV};
 #include <linux/blkdev.h>
 #include <linux/blkpg.h>
 #include <linux/kernel.h>
+#include <linux/mutex.h>
 #include <asm/uaccess.h>
 #include <linux/workqueue.h>
 
+static DEFINE_MUTEX(pd_mutex);
 static DEFINE_SPINLOCK(pd_lock);
 
 module_param(verbose, bool, 0);
@@ -438,7 +442,7 @@ static char *pd_buf;		/* buffer for request in progress */
 
 static enum action do_pd_io_start(void)
 {
-	if (blk_special_request(pd_req)) {
+	if (pd_req->cmd_type == REQ_TYPE_SPECIAL) {
 		phase = pd_special;
 		return pd_special();
 	}
@@ -450,7 +454,7 @@ static enum action do_pd_io_start(void)
 		if (pd_block + pd_count > get_capacity(pd_req->rq_disk))
 			return Fail;
 		pd_run = blk_rq_sectors(pd_req);
-		pd_buf = pd_req->buffer;
+		pd_buf = bio_data(pd_req->bio);
 		pd_retries = 0;
 		if (pd_cmd == READ)
 			return do_pd_read_start();
@@ -481,7 +485,7 @@ static int pd_next_buf(void)
 	spin_lock_irqsave(&pd_lock, saved_flags);
 	__blk_end_request_cur(pd_req, 0);
 	pd_count = blk_rq_cur_sectors(pd_req);
-	pd_buf = pd_req->buffer;
+	pd_buf = bio_data(pd_req->bio);
 	spin_unlock_irqrestore(&pd_lock, saved_flags);
 	return 0;
 }
@@ -734,12 +738,14 @@ static int pd_open(struct block_device *bdev, fmode_t mode)
 {
 	struct pd_unit *disk = bdev->bd_disk->private_data;
 
+	mutex_lock(&pd_mutex);
 	disk->access++;
 
 	if (disk->removable) {
 		pd_special_command(disk, pd_media_check);
 		pd_special_command(disk, pd_door_lock);
 	}
+	mutex_unlock(&pd_mutex);
 	return 0;
 }
 
@@ -767,25 +773,27 @@ static int pd_ioctl(struct block_device *bdev, fmode_t mode,
 
 	switch (cmd) {
 	case CDROMEJECT:
+		mutex_lock(&pd_mutex);
 		if (disk->access == 1)
 			pd_special_command(disk, pd_eject);
+		mutex_unlock(&pd_mutex);
 		return 0;
 	default:
 		return -EINVAL;
 	}
 }
 
-static int pd_release(struct gendisk *p, fmode_t mode)
+static void pd_release(struct gendisk *p, fmode_t mode)
 {
 	struct pd_unit *disk = p->private_data;
 
+	mutex_lock(&pd_mutex);
 	if (!--disk->access && disk->removable)
 		pd_special_command(disk, pd_door_unlock);
-
-	return 0;
+	mutex_unlock(&pd_mutex);
 }
 
-static int pd_check_media(struct gendisk *p)
+static unsigned int pd_check_events(struct gendisk *p, unsigned int clearing)
 {
 	struct pd_unit *disk = p->private_data;
 	int r;
@@ -794,7 +802,7 @@ static int pd_check_media(struct gendisk *p)
 	pd_special_command(disk, pd_media_check);
 	r = disk->changed;
 	disk->changed = 0;
-	return r;
+	return r ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static int pd_revalidate(struct gendisk *p)
@@ -807,13 +815,13 @@ static int pd_revalidate(struct gendisk *p)
 	return 0;
 }
 
-static struct block_device_operations pd_fops = {
+static const struct block_device_operations pd_fops = {
 	.owner		= THIS_MODULE,
 	.open		= pd_open,
 	.release	= pd_release,
-	.locked_ioctl	= pd_ioctl,
+	.ioctl		= pd_ioctl,
 	.getgeo		= pd_getgeo,
-	.media_changed	= pd_check_media,
+	.check_events	= pd_check_events,
 	.revalidate_disk= pd_revalidate
 };
 
@@ -906,7 +914,7 @@ static int __init pd_init(void)
 	if (!pd_queue)
 		goto out1;
 
-	blk_queue_max_sectors(pd_queue, cluster);
+	blk_queue_max_hw_sectors(pd_queue, cluster);
 
 	if (register_blkdev(major, name))
 		goto out2;
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 68a90834e99..9a15fd3c934 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -118,13 +118,15 @@
 #define PF_NAME		"pf"
 #define PF_UNITS	4
 
+#include <linux/types.h>
+
 /* Here are things one can override from the insmod command.
    Most are autoprobed by paride unless set here.  Verbose is off
    by default.
 
 */
 
-static int verbose = 0;
+static bool verbose = 0;
 static int major = PF_MAJOR;
 static char *name = PF_NAME;
 static int cluster = 64;
@@ -152,8 +154,10 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_LUN, D_DLY};
 #include <linux/spinlock.h>
 #include <linux/blkdev.h>
 #include <linux/blkpg.h>
+#include <linux/mutex.h>
 #include <asm/uaccess.h>
 
+static DEFINE_MUTEX(pf_mutex);
 static DEFINE_SPINLOCK(pf_spin_lock);
 
 module_param(verbose, bool, 0644);
@@ -207,7 +211,7 @@ static int pf_ioctl(struct block_device *bdev, fmode_t mode,
 		    unsigned int cmd, unsigned long arg);
 static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
-static int pf_release(struct gendisk *disk, fmode_t mode);
+static void pf_release(struct gendisk *disk, fmode_t mode);
 
 static int pf_detect(void);
 static void do_pf_read(void);
@@ -241,7 +245,8 @@ static struct pf_unit units[PF_UNITS];
 static int pf_identify(struct pf_unit *pf);
 static void pf_lock(struct pf_unit *pf, int func);
 static void pf_eject(struct pf_unit *pf);
-static int pf_check_media(struct gendisk *disk);
+static unsigned int pf_check_events(struct gendisk *disk,
+				    unsigned int clearing);
 
 static char pf_scratch[512];	/* scratch block buffer */
 
@@ -262,13 +267,13 @@ static char *pf_buf;		/* buffer for request in progress */
 
 /* kernel glue structures */
 
-static struct block_device_operations pf_fops = {
+static const struct block_device_operations pf_fops = {
 	.owner		= THIS_MODULE,
 	.open		= pf_open,
 	.release	= pf_release,
-	.locked_ioctl	= pf_ioctl,
+	.ioctl		= pf_ioctl,
 	.getgeo		= pf_getgeo,
-	.media_changed	= pf_check_media,
+	.check_events	= pf_check_events,
 };
 
 static void __init pf_init_units(void)
@@ -299,20 +304,26 @@ static void __init pf_init_units(void)
 static int pf_open(struct block_device *bdev, fmode_t mode)
 {
 	struct pf_unit *pf = bdev->bd_disk->private_data;
+	int ret;
 
+	mutex_lock(&pf_mutex);
 	pf_identify(pf);
 
+	ret = -ENODEV;
 	if (pf->media_status == PF_NM)
-		return -ENODEV;
+		goto out;
 
+	ret = -EROFS;
 	if ((pf->media_status == PF_RO) && (mode & FMODE_WRITE))
-		return -EROFS;
+		goto out;
 
+	ret = 0;
 	pf->access++;
 	if (pf->removable)
 		pf_lock(pf, 1);
-
-	return 0;
+out:
+	mutex_unlock(&pf_mutex);
+	return ret;
 }
 
 static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo)
@@ -342,29 +353,35 @@ static int pf_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, u
 
 	if (pf->access != 1)
 		return -EBUSY;
+	mutex_lock(&pf_mutex);
 	pf_eject(pf);
+	mutex_unlock(&pf_mutex);
+
 	return 0;
 }
 
-static int pf_release(struct gendisk *disk, fmode_t mode)
+static void pf_release(struct gendisk *disk, fmode_t mode)
 {
 	struct pf_unit *pf = disk->private_data;
 
-	if (pf->access <= 0)
-		return -EINVAL;
+	mutex_lock(&pf_mutex);
+	if (pf->access <= 0) {
+		mutex_unlock(&pf_mutex);
+		WARN_ON(1);
+		return;
+	}
 
 	pf->access--;
 
 	if (!pf->access && pf->removable)
 		pf_lock(pf, 0);
 
-	return 0;
-
+	mutex_unlock(&pf_mutex);
 }
 
-static int pf_check_media(struct gendisk *disk)
+static unsigned int pf_check_events(struct gendisk *disk, unsigned int clearing)
 {
-	return 1;
+	return DISK_EVENT_MEDIA_CHANGE;
 }
 
 static inline int status_reg(struct pf_unit *pf)
@@ -391,11 +408,11 @@ static int pf_wait(struct pf_unit *pf, int go, int stop, char *fun, char *msg)
 	       && (j++ < PF_SPIN))
 		udelay(PF_SPIN_DEL);
 
-	if ((r & (STAT_ERR & stop)) || (j >= PF_SPIN)) {
+	if ((r & (STAT_ERR & stop)) || (j > PF_SPIN)) {
 		s = read_reg(pf, 7);
 		e = read_reg(pf, 1);
 		p = read_reg(pf, 2);
-		if (j >= PF_SPIN)
+		if (j > PF_SPIN)
 			e |= 0x100;
 		if (fun)
 			printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x"
@@ -778,7 +795,7 @@ repeat:
 	}
 
 	pf_cmd = rq_data_dir(pf_req);
-	pf_buf = pf_req->buffer;
+	pf_buf = bio_data(pf_req->bio);
 	pf_retries = 0;
 
 	pf_busy = 1;
@@ -810,7 +827,7 @@ static int pf_next_buf(void)
 		if (!pf_req)
 			return 1;
 		pf_count = blk_rq_cur_sectors(pf_req);
-		pf_buf = pf_req->buffer;
+		pf_buf = bio_data(pf_req->bio);
 	}
 	return 0;
 }
@@ -956,8 +973,7 @@ static int __init pf_init(void)
 		return -ENOMEM;
 	}
 
-	blk_queue_max_phys_segments(pf_queue, cluster);
-	blk_queue_max_hw_segments(pf_queue, cluster);
+	blk_queue_max_segments(pf_queue, cluster);
 
 	for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
 		struct gendisk *disk = pf->disk;
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index c397b3ddba9..2ce3dfd7e6b 100644
--- a/drivers/block/paride/pg.c
+++ b/drivers/block/paride/pg.c
@@ -130,13 +130,14 @@
 #define PI_PG	4
 #endif
 
+#include <linux/types.h>
 /* Here are things one can override from the insmod command.
    Most are autoprobed by paride unless set here.  Verbose is 0
    by default.
 
 */
 
-static int verbose = 0;
+static bool verbose = 0;
 static int major = PG_MAJOR;
 static char *name = PG_NAME;
 static int disable = 0;
@@ -162,7 +163,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
 #include <linux/pg.h>
 #include <linux/device.h>
 #include <linux/sched.h>	/* current, TASK_* */
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include <linux/jiffies.h>
 
 #include <asm/uaccess.h>
@@ -193,6 +194,7 @@ module_param_array(drive3, int, NULL, 0);
 
 #define ATAPI_IDENTIFY		0x12
 
+static DEFINE_MUTEX(pg_mutex);
 static int pg_open(struct inode *inode, struct file *file);
 static int pg_release(struct inode *inode, struct file *file);
 static ssize_t pg_read(struct file *filp, char __user *buf,
@@ -234,6 +236,7 @@ static const struct file_operations pg_fops = {
 	.write = pg_write,
 	.open = pg_open,
 	.release = pg_release,
+	.llseek = noop_llseek,
 };
 
 static void pg_init_units(void)
@@ -518,7 +521,7 @@ static int pg_open(struct inode *inode, struct file *file)
 	struct pg *dev = &devices[unit];
 	int ret = 0;
 
-	lock_kernel();
+	mutex_lock(&pg_mutex);
 	if ((unit >= PG_UNITS) || (!dev->present)) {
 		ret = -ENODEV;
 		goto out;
@@ -547,7 +550,7 @@ static int pg_open(struct inode *inode, struct file *file)
 	file->private_data = dev;
 
 out:
-	unlock_kernel();
+	mutex_unlock(&pg_mutex);
 	return ret;
 }
 
@@ -578,7 +581,7 @@ static ssize_t pg_write(struct file *filp, const char __user *buf, size_t count,
 
 	if (hdr.magic != PG_MAGIC)
 		return -EINVAL;
-	if (hdr.dlen > PG_MAX_DATA)
+	if (hdr.dlen < 0 || hdr.dlen > PG_MAX_DATA)
 		return -EINVAL;
 	if ((count - hs) > PG_MAX_DATA)
 		return -EINVAL;
@@ -628,6 +631,7 @@ static ssize_t pg_read(struct file *filp, char __user *buf, size_t count, loff_t
 		if (dev->status & 0x10)
 			return -ETIME;
 
+	memset(&hdr, 0, sizeof(hdr));
 	hdr.magic = PG_MAGIC;
 	hdr.dlen = dev->dlen;
 	copy = 0;
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index 1e4006e18f0..2596042eb98 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -109,13 +109,15 @@
 #define PT_NAME		"pt"
 #define PT_UNITS	4
 
+#include <linux/types.h>
+
 /* Here are things one can override from the insmod command.
    Most are autoprobed by paride unless set here.  Verbose is on
    by default.
 
 */
 
-static int verbose = 0;
+static bool verbose = 0;
 static int major = PT_MAJOR;
 static char *name = PT_NAME;
 static int disable = 0;
@@ -146,7 +148,7 @@ static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3};
 #include <linux/mtio.h>
 #include <linux/device.h>
 #include <linux/sched.h>	/* current, TASK_*, schedule_timeout() */
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 
 #include <asm/uaccess.h>
 
@@ -189,6 +191,7 @@ module_param_array(drive3, int, NULL, 0);
 #define ATAPI_MODE_SENSE	0x1a
 #define ATAPI_LOG_SENSE		0x4d
 
+static DEFINE_MUTEX(pt_mutex);
 static int pt_open(struct inode *inode, struct file *file);
 static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 static int pt_release(struct inode *inode, struct file *file);
@@ -239,6 +242,7 @@ static const struct file_operations pt_fops = {
 	.unlocked_ioctl = pt_ioctl,
 	.open = pt_open,
 	.release = pt_release,
+	.llseek = noop_llseek,
 };
 
 /* sysfs class support */
@@ -274,11 +278,11 @@ static int pt_wait(struct pt_unit *tape, int go, int stop, char *fun, char *msg)
 	       && (j++ < PT_SPIN))
 		udelay(PT_SPIN_DEL);
 
-	if ((r & (STAT_ERR & stop)) || (j >= PT_SPIN)) {
+	if ((r & (STAT_ERR & stop)) || (j > PT_SPIN)) {
 		s = read_reg(pi, 7);
 		e = read_reg(pi, 1);
 		p = read_reg(pi, 2);
-		if (j >= PT_SPIN)
+		if (j > PT_SPIN)
 			e |= 0x100;
 		if (fun)
 			printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x"
@@ -650,9 +654,9 @@ static int pt_open(struct inode *inode, struct file *file)
 	struct pt_unit *tape = pt + unit;
 	int err;
 
-	lock_kernel();
+	mutex_lock(&pt_mutex);
 	if (unit >= PT_UNITS || (!tape->present)) {
-		unlock_kernel();
+		mutex_unlock(&pt_mutex);
 		return -ENODEV;
 	}
 
@@ -681,12 +685,12 @@ static int pt_open(struct inode *inode, struct file *file)
 	}
 
 	file->private_data = tape;
-	unlock_kernel();
+	mutex_unlock(&pt_mutex);
 	return 0;
 
 out:
 	atomic_inc(&tape->available);
-	unlock_kernel();
+	mutex_unlock(&pt_mutex);
 	return err;
 }
 
@@ -704,15 +708,15 @@ static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		switch (mtop.mt_op) {
 
 		case MTREW:
-			lock_kernel();
+			mutex_lock(&pt_mutex);
 			pt_rewind(tape);
-			unlock_kernel();
+			mutex_unlock(&pt_mutex);
 			return 0;
 
 		case MTWEOF:
-			lock_kernel();
+			mutex_lock(&pt_mutex);
 			pt_write_fm(tape);
-			unlock_kernel();
+			mutex_unlock(&pt_mutex);
 			return 0;
 
 		default:
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index d57f1175948..758ac442c5b 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -44,10 +44,13 @@
  *
  *************************************************************************/
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/pktcdvd.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
+#include <linux/compat.h>
 #include <linux/kthread.h>
 #include <linux/errno.h>
 #include <linux/spinlock.h>
@@ -57,6 +60,7 @@
 #include <linux/miscdevice.h>
 #include <linux/freezer.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_ioctl.h>
 #include <scsi/scsi.h>
@@ -67,22 +71,25 @@
 
 #define DRIVER_NAME	"pktcdvd"
 
-#if PACKET_DEBUG
-#define DPRINTK(fmt, args...) printk(KERN_NOTICE fmt, ##args)
-#else
-#define DPRINTK(fmt, args...)
-#endif
-
-#if PACKET_DEBUG > 1
-#define VPRINTK(fmt, args...) printk(KERN_NOTICE fmt, ##args)
-#else
-#define VPRINTK(fmt, args...)
-#endif
+#define pkt_err(pd, fmt, ...)						\
+	pr_err("%s: " fmt, (pd)->name, ##__VA_ARGS__)
+#define pkt_notice(pd, fmt, ...)					\
+	pr_notice("%s: " fmt, (pd)->name, ##__VA_ARGS__)
+#define pkt_info(pd, fmt, ...)						\
+	pr_info("%s: " fmt, (pd)->name, ##__VA_ARGS__)
+
+#define pkt_dbg(level, pd, fmt, ...)					\
+do {									\
+	if ((level) == 2 && PACKET_DEBUG >= 2)				\
+		pr_notice("%s: %s():" fmt,				\
+			  (pd)->name, __func__, ##__VA_ARGS__);		\
+	else if ((level) == 1 && PACKET_DEBUG >= 1)			\
+		pr_notice("%s: " fmt, (pd)->name, ##__VA_ARGS__);	\
+} while (0)
 
 #define MAX_SPEED 0xffff
 
-#define ZONE(sector, pd) (((sector) + (pd)->offset) & ~((pd)->settings.size - 1))
-
+static DEFINE_MUTEX(pktcdvd_mutex);
 static struct pktcdvd_device *pkt_devs[MAX_WRITERS];
 static struct proc_dir_entry *pkt_proc;
 static int pktdev_major;
@@ -92,14 +99,17 @@ static struct mutex ctl_mutex;	/* Serialize open/close/setup/teardown */
 static mempool_t *psd_pool;
 
 static struct class	*class_pktcdvd = NULL;    /* /sys/class/pktcdvd */
-static struct dentry	*pkt_debugfs_root = NULL; /* /debug/pktcdvd */
+static struct dentry	*pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */
 
 /* forward declaration */
 static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev);
 static int pkt_remove_dev(dev_t pkt_dev);
 static int pkt_seq_show(struct seq_file *m, void *p);
 
-
+static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd)
+{
+	return (sector + pd->offset) & ~(sector_t)(pd->settings.size - 1);
+}
 
 /*
  * create and register a pktcdvd kernel object.
@@ -284,7 +294,7 @@ static ssize_t kobj_pkt_store(struct kobject *kobj,
 	return len;
 }
 
-static struct sysfs_ops kobj_pkt_ops = {
+static const struct sysfs_ops kobj_pkt_ops = {
 	.show = kobj_pkt_show,
 	.store = kobj_pkt_store
 };
@@ -322,7 +332,7 @@ static void pkt_sysfs_dev_remove(struct pktcdvd_device *pd)
 	pkt_kobj_remove(pd->kobj_stat);
 	pkt_kobj_remove(pd->kobj_wqueue);
 	if (class_pktcdvd)
-		device_destroy(class_pktcdvd, pd->pkt_dev);
+		device_unregister(pd->dev);
 }
 
 
@@ -337,7 +347,9 @@ static void class_pktcdvd_release(struct class *cls)
 {
 	kfree(cls);
 }
-static ssize_t class_pktcdvd_show_map(struct class *c, char *data)
+static ssize_t class_pktcdvd_show_map(struct class *c,
+					struct class_attribute *attr,
+					char *data)
 {
 	int n = 0;
 	int idx;
@@ -356,7 +368,9 @@ static ssize_t class_pktcdvd_show_map(struct class *c, char *data)
 	return n;
 }
 
-static ssize_t class_pktcdvd_store_add(struct class *c, const char *buf,
+static ssize_t class_pktcdvd_store_add(struct class *c,
+					struct class_attribute *attr,
+					const char *buf,
 					size_t count)
 {
 	unsigned int major, minor;
@@ -376,7 +390,9 @@ static ssize_t class_pktcdvd_store_add(struct class *c, const char *buf,
 	return -EINVAL;
 }
 
-static ssize_t class_pktcdvd_store_remove(struct class *c, const char *buf,
+static ssize_t class_pktcdvd_store_remove(struct class *c,
+					  struct class_attribute *attr,
+					  const char *buf,
 					size_t count)
 {
 	unsigned int major, minor;
@@ -414,7 +430,7 @@ static int pkt_sysfs_init(void)
 	if (ret) {
 		kfree(class_pktcdvd);
 		class_pktcdvd = NULL;
-		printk(DRIVER_NAME": failed to create class pktcdvd\n");
+		pr_err("failed to create class pktcdvd\n");
 		return ret;
 	}
 	return 0;
@@ -430,7 +446,7 @@ static void pkt_sysfs_cleanup(void)
 /********************************************************************
   entries in debugfs
 
-  /debugfs/pktcdvd[0-7]/
+  /sys/kernel/debug/pktcdvd[0-7]/
 			info
 
  *******************************************************************/
@@ -457,45 +473,31 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd)
 {
 	if (!pkt_debugfs_root)
 		return;
-	pd->dfs_f_info = NULL;
 	pd->dfs_d_root = debugfs_create_dir(pd->name, pkt_debugfs_root);
-	if (IS_ERR(pd->dfs_d_root)) {
-		pd->dfs_d_root = NULL;
+	if (!pd->dfs_d_root)
 		return;
-	}
+
 	pd->dfs_f_info = debugfs_create_file("info", S_IRUGO,
 				pd->dfs_d_root, pd, &debug_fops);
-	if (IS_ERR(pd->dfs_f_info)) {
-		pd->dfs_f_info = NULL;
-		return;
-	}
 }
 
 static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd)
 {
 	if (!pkt_debugfs_root)
 		return;
-	if (pd->dfs_f_info)
-		debugfs_remove(pd->dfs_f_info);
+	debugfs_remove(pd->dfs_f_info);
+	debugfs_remove(pd->dfs_d_root);
 	pd->dfs_f_info = NULL;
-	if (pd->dfs_d_root)
-		debugfs_remove(pd->dfs_d_root);
 	pd->dfs_d_root = NULL;
 }
 
 static void pkt_debugfs_init(void)
 {
 	pkt_debugfs_root = debugfs_create_dir(DRIVER_NAME, NULL);
-	if (IS_ERR(pkt_debugfs_root)) {
-		pkt_debugfs_root = NULL;
-		return;
-	}
 }
 
 static void pkt_debugfs_cleanup(void)
 {
-	if (!pkt_debugfs_root)
-		return;
 	debugfs_remove(pkt_debugfs_root);
 	pkt_debugfs_root = NULL;
 }
@@ -507,44 +509,12 @@ static void pkt_bio_finished(struct pktcdvd_device *pd)
 {
 	BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0);
 	if (atomic_dec_and_test(&pd->cdrw.pending_bios)) {
-		VPRINTK(DRIVER_NAME": queue empty\n");
+		pkt_dbg(2, pd, "queue empty\n");
 		atomic_set(&pd->iosched.attention, 1);
 		wake_up(&pd->wqueue);
 	}
 }
 
-static void pkt_bio_destructor(struct bio *bio)
-{
-	kfree(bio->bi_io_vec);
-	kfree(bio);
-}
-
-static struct bio *pkt_bio_alloc(int nr_iovecs)
-{
-	struct bio_vec *bvl = NULL;
-	struct bio *bio;
-
-	bio = kmalloc(sizeof(struct bio), GFP_KERNEL);
-	if (!bio)
-		goto no_bio;
-	bio_init(bio);
-
-	bvl = kcalloc(nr_iovecs, sizeof(struct bio_vec), GFP_KERNEL);
-	if (!bvl)
-		goto no_bvl;
-
-	bio->bi_max_vecs = nr_iovecs;
-	bio->bi_io_vec = bvl;
-	bio->bi_destructor = pkt_bio_destructor;
-
-	return bio;
-
- no_bvl:
-	kfree(bio);
- no_bio:
-	return NULL;
-}
-
 /*
  * Allocate a packet_data struct
  */
@@ -558,7 +528,7 @@ static struct packet_data *pkt_alloc_packet_data(int frames)
 		goto no_pkt;
 
 	pkt->frames = frames;
-	pkt->w_bio = pkt_bio_alloc(frames);
+	pkt->w_bio = bio_kmalloc(GFP_KERNEL, frames);
 	if (!pkt->w_bio)
 		goto no_bio;
 
@@ -569,11 +539,13 @@ static struct packet_data *pkt_alloc_packet_data(int frames)
 	}
 
 	spin_lock_init(&pkt->lock);
+	bio_list_init(&pkt->orig_bios);
 
 	for (i = 0; i < frames; i++) {
-		struct bio *bio = pkt_bio_alloc(1);
+		struct bio *bio = bio_kmalloc(GFP_KERNEL, 1);
 		if (!bio)
 			goto no_rd_bio;
+
 		pkt->r_bios[i] = bio;
 	}
 
@@ -679,7 +651,7 @@ static struct pkt_rb_node *pkt_rbtree_find(struct pktcdvd_device *pd, sector_t s
 
 	for (;;) {
 		tmp = rb_entry(n, struct pkt_rb_node, rb_node);
-		if (s <= tmp->bio->bi_sector)
+		if (s <= tmp->bio->bi_iter.bi_sector)
 			next = n->rb_left;
 		else
 			next = n->rb_right;
@@ -688,12 +660,12 @@ static struct pkt_rb_node *pkt_rbtree_find(struct pktcdvd_device *pd, sector_t s
 		n = next;
 	}
 
-	if (s > tmp->bio->bi_sector) {
+	if (s > tmp->bio->bi_iter.bi_sector) {
 		tmp = pkt_rbtree_next(tmp);
 		if (!tmp)
 			return NULL;
 	}
-	BUG_ON(s > tmp->bio->bi_sector);
+	BUG_ON(s > tmp->bio->bi_iter.bi_sector);
 	return tmp;
 }
 
@@ -704,13 +676,13 @@ static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *nod
 {
 	struct rb_node **p = &pd->bio_queue.rb_node;
 	struct rb_node *parent = NULL;
-	sector_t s = node->bio->bi_sector;
+	sector_t s = node->bio->bi_iter.bi_sector;
 	struct pkt_rb_node *tmp;
 
 	while (*p) {
 		parent = *p;
 		tmp = rb_entry(parent, struct pkt_rb_node, rb_node);
-		if (s < tmp->bio->bi_sector)
+		if (s < tmp->bio->bi_iter.bi_sector)
 			p = &(*p)->rb_left;
 		else
 			p = &(*p)->rb_right;
@@ -721,43 +693,6 @@ static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *nod
 }
 
 /*
- * Add a bio to a single linked list defined by its head and tail pointers.
- */
-static void pkt_add_list_last(struct bio *bio, struct bio **list_head, struct bio **list_tail)
-{
-	bio->bi_next = NULL;
-	if (*list_tail) {
-		BUG_ON((*list_head) == NULL);
-		(*list_tail)->bi_next = bio;
-		(*list_tail) = bio;
-	} else {
-		BUG_ON((*list_head) != NULL);
-		(*list_head) = bio;
-		(*list_tail) = bio;
-	}
-}
-
-/*
- * Remove and return the first bio from a single linked list defined by its
- * head and tail pointers.
- */
-static inline struct bio *pkt_get_list_first(struct bio **list_head, struct bio **list_tail)
-{
-	struct bio *bio;
-
-	if (*list_head == NULL)
-		return NULL;
-
-	bio = *list_head;
-	*list_head = bio->bi_next;
-	if (*list_head == NULL)
-		*list_tail = NULL;
-
-	bio->bi_next = NULL;
-	return bio;
-}
-
-/*
  * Send a packet_command to the underlying block device and
  * wait for completion.
  */
@@ -769,9 +704,12 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
 
 	rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
 			     WRITE : READ, __GFP_WAIT);
+	blk_rq_set_block_pc(rq);
 
 	if (cgc->buflen) {
-		if (blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, __GFP_WAIT))
+		ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
+				      __GFP_WAIT);
+		if (ret)
 			goto out;
 	}
 
@@ -779,8 +717,6 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
 	memcpy(rq->cmd, cgc->cmd, CDROM_PACKET_SIZE);
 
 	rq->timeout = 60*HZ;
-	rq->cmd_type = REQ_TYPE_BLOCK_PC;
-	rq->cmd_flags |= REQ_HARDBARRIER;
 	if (cgc->quiet)
 		rq->cmd_flags |= REQ_QUIET;
 
@@ -792,36 +728,33 @@ out:
 	return ret;
 }
 
+static const char *sense_key_string(__u8 index)
+{
+	static const char * const info[] = {
+		"No sense", "Recovered error", "Not ready",
+		"Medium error", "Hardware error", "Illegal request",
+		"Unit attention", "Data protect", "Blank check",
+	};
+
+	return index < ARRAY_SIZE(info) ? info[index] : "INVALID";
+}
+
 /*
  * A generic sense dump / resolve mechanism should be implemented across
  * all ATAPI + SCSI devices.
  */
-static void pkt_dump_sense(struct packet_command *cgc)
+static void pkt_dump_sense(struct pktcdvd_device *pd,
+			   struct packet_command *cgc)
 {
-	static char *info[9] = { "No sense", "Recovered error", "Not ready",
-				 "Medium error", "Hardware error", "Illegal request",
-				 "Unit attention", "Data protect", "Blank check" };
-	int i;
 	struct request_sense *sense = cgc->sense;
 
-	printk(DRIVER_NAME":");
-	for (i = 0; i < CDROM_PACKET_SIZE; i++)
-		printk(" %02x", cgc->cmd[i]);
-	printk(" - ");
-
-	if (sense == NULL) {
-		printk("no sense\n");
-		return;
-	}
-
-	printk("sense %02x.%02x.%02x", sense->sense_key, sense->asc, sense->ascq);
-
-	if (sense->sense_key > 8) {
-		printk(" (INVALID)\n");
-		return;
-	}
-
-	printk(" (%s)\n", info[sense->sense_key]);
+	if (sense)
+		pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n",
+			CDROM_PACKET_SIZE, cgc->cmd,
+			sense->sense_key, sense->asc, sense->ascq,
+			sense_key_string(sense->sense_key));
+	else
+		pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd);
 }
 
 /*
@@ -864,7 +797,7 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd,
 	cgc.cmd[5] = write_speed & 0xff;
 
 	if ((ret = pkt_generic_packet(pd, &cgc)))
-		pkt_dump_sense(&cgc);
+		pkt_dump_sense(pd, &cgc);
 
 	return ret;
 }
@@ -876,13 +809,10 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd,
 static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio)
 {
 	spin_lock(&pd->iosched.lock);
-	if (bio_data_dir(bio) == READ) {
-		pkt_add_list_last(bio, &pd->iosched.read_queue,
-				  &pd->iosched.read_queue_tail);
-	} else {
-		pkt_add_list_last(bio, &pd->iosched.write_queue,
-				  &pd->iosched.write_queue_tail);
-	}
+	if (bio_data_dir(bio) == READ)
+		bio_list_add(&pd->iosched.read_queue, bio);
+	else
+		bio_list_add(&pd->iosched.write_queue, bio);
 	spin_unlock(&pd->iosched.lock);
 
 	atomic_set(&pd->iosched.attention, 1);
@@ -917,8 +847,8 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
 		int reads_queued, writes_queued;
 
 		spin_lock(&pd->iosched.lock);
-		reads_queued = (pd->iosched.read_queue != NULL);
-		writes_queued = (pd->iosched.write_queue != NULL);
+		reads_queued = !bio_list_empty(&pd->iosched.read_queue);
+		writes_queued = !bio_list_empty(&pd->iosched.write_queue);
 		spin_unlock(&pd->iosched.lock);
 
 		if (!reads_queued && !writes_queued)
@@ -927,13 +857,14 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
 		if (pd->iosched.writing) {
 			int need_write_seek = 1;
 			spin_lock(&pd->iosched.lock);
-			bio = pd->iosched.write_queue;
+			bio = bio_list_peek(&pd->iosched.write_queue);
 			spin_unlock(&pd->iosched.lock);
-			if (bio && (bio->bi_sector == pd->iosched.last_write))
+			if (bio && (bio->bi_iter.bi_sector ==
+				    pd->iosched.last_write))
 				need_write_seek = 0;
 			if (need_write_seek && reads_queued) {
 				if (atomic_read(&pd->cdrw.pending_bios) > 0) {
-					VPRINTK(DRIVER_NAME": write, waiting\n");
+					pkt_dbg(2, pd, "write, waiting\n");
 					break;
 				}
 				pkt_flush_cache(pd);
@@ -942,7 +873,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
 		} else {
 			if (!reads_queued && writes_queued) {
 				if (atomic_read(&pd->cdrw.pending_bios) > 0) {
-					VPRINTK(DRIVER_NAME": read, waiting\n");
+					pkt_dbg(2, pd, "read, waiting\n");
 					break;
 				}
 				pd->iosched.writing = 1;
@@ -950,23 +881,21 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
 		}
 
 		spin_lock(&pd->iosched.lock);
-		if (pd->iosched.writing) {
-			bio = pkt_get_list_first(&pd->iosched.write_queue,
-						 &pd->iosched.write_queue_tail);
-		} else {
-			bio = pkt_get_list_first(&pd->iosched.read_queue,
-						 &pd->iosched.read_queue_tail);
-		}
+		if (pd->iosched.writing)
+			bio = bio_list_pop(&pd->iosched.write_queue);
+		else
+			bio = bio_list_pop(&pd->iosched.read_queue);
 		spin_unlock(&pd->iosched.lock);
 
 		if (!bio)
 			continue;
 
 		if (bio_data_dir(bio) == READ)
-			pd->iosched.successive_reads += bio->bi_size >> 10;
+			pd->iosched.successive_reads +=
+				bio->bi_iter.bi_size >> 10;
 		else {
 			pd->iosched.successive_reads = 0;
-			pd->iosched.last_write = bio->bi_sector + bio_sectors(bio);
+			pd->iosched.last_write = bio_end_sector(bio);
 		}
 		if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) {
 			if (pd->read_speed == pd->write_speed) {
@@ -992,14 +921,14 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
 static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q)
 {
 	if ((pd->settings.size << 9) / CD_FRAMESIZE
-	    <= queue_max_phys_segments(q)) {
+	    <= queue_max_segments(q)) {
 		/*
 		 * The cdrom device can handle one segment/frame
 		 */
 		clear_bit(PACKET_MERGE_SEGS, &pd->flags);
 		return 0;
 	} else if ((pd->settings.size << 9) / PAGE_SIZE
-		   <= queue_max_phys_segments(q)) {
+		   <= queue_max_segments(q)) {
 		/*
 		 * We can handle this case at the expense of some extra memory
 		 * copies during write operations
@@ -1007,37 +936,12 @@ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_que
 		set_bit(PACKET_MERGE_SEGS, &pd->flags);
 		return 0;
 	} else {
-		printk(DRIVER_NAME": cdrom max_phys_segments too small\n");
+		pkt_err(pd, "cdrom max_phys_segments too small\n");
 		return -EIO;
 	}
 }
 
 /*
- * Copy CD_FRAMESIZE bytes from src_bio into a destination page
- */
-static void pkt_copy_bio_data(struct bio *src_bio, int seg, int offs, struct page *dst_page, int dst_offs)
-{
-	unsigned int copy_size = CD_FRAMESIZE;
-
-	while (copy_size > 0) {
-		struct bio_vec *src_bvl = bio_iovec_idx(src_bio, seg);
-		void *vfrom = kmap_atomic(src_bvl->bv_page, KM_USER0) +
-			src_bvl->bv_offset + offs;
-		void *vto = page_address(dst_page) + dst_offs;
-		int len = min_t(int, copy_size, src_bvl->bv_len - offs);
-
-		BUG_ON(len < 0);
-		memcpy(vto, vfrom, len);
-		kunmap_atomic(vfrom, KM_USER0);
-
-		seg++;
-		offs = 0;
-		dst_offs += len;
-		copy_size -= len;
-	}
-}
-
-/*
  * Copy all data for this packet to pkt->pages[], so that
  * a) The number of required segments for the write bio is minimized, which
  *    is necessary for some scsi controllers.
@@ -1053,10 +957,10 @@ static void pkt_make_local_copy(struct packet_data *pkt, struct bio_vec *bvec)
 	offs = 0;
 	for (f = 0; f < pkt->frames; f++) {
 		if (bvec[f].bv_page != pkt->pages[p]) {
-			void *vfrom = kmap_atomic(bvec[f].bv_page, KM_USER0) + bvec[f].bv_offset;
+			void *vfrom = kmap_atomic(bvec[f].bv_page) + bvec[f].bv_offset;
 			void *vto = page_address(pkt->pages[p]) + offs;
 			memcpy(vto, vfrom, CD_FRAMESIZE);
-			kunmap_atomic(vfrom, KM_USER0);
+			kunmap_atomic(vfrom);
 			bvec[f].bv_page = pkt->pages[p];
 			bvec[f].bv_offset = offs;
 		} else {
@@ -1076,8 +980,9 @@ static void pkt_end_io_read(struct bio *bio, int err)
 	struct pktcdvd_device *pd = pkt->pd;
 	BUG_ON(!pd);
 
-	VPRINTK("pkt_end_io_read: bio=%p sec0=%llx sec=%llx err=%d\n", bio,
-		(unsigned long long)pkt->sector, (unsigned long long)bio->bi_sector, err);
+	pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n",
+		bio, (unsigned long long)pkt->sector,
+		(unsigned long long)bio->bi_iter.bi_sector, err);
 
 	if (err)
 		atomic_inc(&pkt->io_errors);
@@ -1094,7 +999,7 @@ static void pkt_end_io_packet_write(struct bio *bio, int err)
 	struct pktcdvd_device *pd = pkt->pd;
 	BUG_ON(!pd);
 
-	VPRINTK("pkt_end_io_packet_write: id=%d, err=%d\n", pkt->id, err);
+	pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, err);
 
 	pd->stats.pkt_ended++;
 
@@ -1114,7 +1019,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
 	int f;
 	char written[PACKET_MAX_SIZE];
 
-	BUG_ON(!pkt->orig_bios);
+	BUG_ON(bio_list_empty(&pkt->orig_bios));
 
 	atomic_set(&pkt->io_wait, 0);
 	atomic_set(&pkt->io_errors, 0);
@@ -1124,9 +1029,10 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
 	 */
 	memset(written, 0, sizeof(written));
 	spin_lock(&pkt->lock);
-	for (bio = pkt->orig_bios; bio; bio = bio->bi_next) {
-		int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9);
-		int num_frames = bio->bi_size / CD_FRAMESIZE;
+	bio_list_for_each(bio, &pkt->orig_bios) {
+		int first_frame = (bio->bi_iter.bi_sector - pkt->sector) /
+			(CD_FRAMESIZE >> 9);
+		int num_frames = bio->bi_iter.bi_size / CD_FRAMESIZE;
 		pd->stats.secs_w += num_frames * (CD_FRAMESIZE >> 9);
 		BUG_ON(first_frame < 0);
 		BUG_ON(first_frame + num_frames > pkt->frames);
@@ -1136,7 +1042,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
 	spin_unlock(&pkt->lock);
 
 	if (pkt->cache_valid) {
-		VPRINTK("pkt_gather_data: zone %llx cached\n",
+		pkt_dbg(2, pd, "zone %llx cached\n",
 			(unsigned long long)pkt->sector);
 		goto out_account;
 	}
@@ -1145,25 +1051,21 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
 	 * Schedule reads for missing parts of the packet.
 	 */
 	for (f = 0; f < pkt->frames; f++) {
-		struct bio_vec *vec;
-
 		int p, offset;
+
 		if (written[f])
 			continue;
+
 		bio = pkt->r_bios[f];
-		vec = bio->bi_io_vec;
-		bio_init(bio);
-		bio->bi_max_vecs = 1;
-		bio->bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9);
+		bio_reset(bio);
+		bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9);
 		bio->bi_bdev = pd->bdev;
 		bio->bi_end_io = pkt_end_io_read;
 		bio->bi_private = pkt;
-		bio->bi_io_vec = vec;
-		bio->bi_destructor = pkt_bio_destructor;
 
 		p = (f * CD_FRAMESIZE) / PAGE_SIZE;
 		offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
-		VPRINTK("pkt_gather_data: Adding frame %d, page:%p offs:%d\n",
+		pkt_dbg(2, pd, "Adding frame %d, page:%p offs:%d\n",
 			f, pkt->pages[p], offset);
 		if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset))
 			BUG();
@@ -1175,7 +1077,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
 	}
 
 out_account:
-	VPRINTK("pkt_gather_data: need %d frames for zone %llx\n",
+	pkt_dbg(2, pd, "need %d frames for zone %llx\n",
 		frames_read, (unsigned long long)pkt->sector);
 	pd->stats.pkt_started++;
 	pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9);
@@ -1240,7 +1142,7 @@ static int pkt_start_recovery(struct packet_data *pkt)
 	if (!sb)
 		return 0;
 
-	if (!sb->s_op || !sb->s_op->relocate_blocks)
+	if (!sb->s_op->relocate_blocks)
 		goto out;
 
 	old_block = pkt->sector / (CD_FRAMESIZE >> 9);
@@ -1250,16 +1152,15 @@ static int pkt_start_recovery(struct packet_data *pkt)
 	new_sector = new_block * (CD_FRAMESIZE >> 9);
 	pkt->sector = new_sector;
 
-	pkt->bio->bi_sector = new_sector;
-	pkt->bio->bi_next = NULL;
-	pkt->bio->bi_flags = 1 << BIO_UPTODATE;
-	pkt->bio->bi_idx = 0;
+	bio_reset(pkt->bio);
+	pkt->bio->bi_bdev = pd->bdev;
+	pkt->bio->bi_rw = REQ_WRITE;
+	pkt->bio->bi_iter.bi_sector = new_sector;
+	pkt->bio->bi_iter.bi_size = pkt->frames * CD_FRAMESIZE;
+	pkt->bio->bi_vcnt = pkt->frames;
 
-	BUG_ON(pkt->bio->bi_rw != (1 << BIO_RW));
-	BUG_ON(pkt->bio->bi_vcnt != pkt->frames);
-	BUG_ON(pkt->bio->bi_size != pkt->frames * CD_FRAMESIZE);
-	BUG_ON(pkt->bio->bi_end_io != pkt_end_io_packet_write);
-	BUG_ON(pkt->bio->bi_private != pkt);
+	pkt->bio->bi_end_io = pkt_end_io_packet_write;
+	pkt->bio->bi_private = pkt;
 
 	drop_super(sb);
 	return 1;
@@ -1277,7 +1178,8 @@ static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state
 		"IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED"
 	};
 	enum packet_data_state old_state = pkt->state;
-	VPRINTK("pkt %2d : s=%6llx %s -> %s\n", pkt->id, (unsigned long long)pkt->sector,
+	pkt_dbg(2, pkt->pd, "pkt %2d : s=%6llx %s -> %s\n",
+		pkt->id, (unsigned long long)pkt->sector,
 		state_name[old_state], state_name[state]);
 #endif
 	pkt->state = state;
@@ -1296,12 +1198,10 @@ static int pkt_handle_queue(struct pktcdvd_device *pd)
 	struct rb_node *n;
 	int wakeup;
 
-	VPRINTK("handle_queue\n");
-
 	atomic_set(&pd->scan_queue, 0);
 
 	if (list_empty(&pd->cdrw.pkt_free_list)) {
-		VPRINTK("handle_queue: no pkt\n");
+		pkt_dbg(2, pd, "no pkt\n");
 		return 0;
 	}
 
@@ -1318,7 +1218,7 @@ static int pkt_handle_queue(struct pktcdvd_device *pd)
 	node = first_node;
 	while (node) {
 		bio = node->bio;
-		zone = ZONE(bio->bi_sector, pd);
+		zone = get_zone(bio->bi_iter.bi_sector, pd);
 		list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) {
 			if (p->sector == zone) {
 				bio = NULL;
@@ -1338,7 +1238,7 @@ try_next_bio:
 	}
 	spin_unlock(&pd->lock);
 	if (!bio) {
-		VPRINTK("handle_queue: no bio\n");
+		pkt_dbg(2, pd, "no bio\n");
 		return 0;
 	}
 
@@ -1354,17 +1254,17 @@ try_next_bio:
 	 * to this packet.
 	 */
 	spin_lock(&pd->lock);
-	VPRINTK("pkt_handle_queue: looking for zone %llx\n", (unsigned long long)zone);
+	pkt_dbg(2, pd, "looking for zone %llx\n", (unsigned long long)zone);
 	while ((node = pkt_rbtree_find(pd, zone)) != NULL) {
 		bio = node->bio;
-		VPRINTK("pkt_handle_queue: found zone=%llx\n",
-			(unsigned long long)ZONE(bio->bi_sector, pd));
-		if (ZONE(bio->bi_sector, pd) != zone)
+		pkt_dbg(2, pd, "found zone=%llx\n", (unsigned long long)
+			get_zone(bio->bi_iter.bi_sector, pd));
+		if (get_zone(bio->bi_iter.bi_sector, pd) != zone)
 			break;
 		pkt_rbtree_erase(pd, node);
 		spin_lock(&pkt->lock);
-		pkt_add_list_last(bio, &pkt->orig_bios, &pkt->orig_bios_tail);
-		pkt->write_size += bio->bi_size / CD_FRAMESIZE;
+		bio_list_add(&pkt->orig_bios, bio);
+		pkt->write_size += bio->bi_iter.bi_size / CD_FRAMESIZE;
 		spin_unlock(&pkt->lock);
 	}
 	/* check write congestion marks, and if bio_queue_size is
@@ -1372,8 +1272,10 @@ try_next_bio:
 	wakeup = (pd->write_congestion_on > 0
 	 		&& pd->bio_queue_size <= pd->write_congestion_off);
 	spin_unlock(&pd->lock);
-	if (wakeup)
-		clear_bdi_congested(&pd->disk->queue->backing_dev_info, WRITE);
+	if (wakeup) {
+		clear_bdi_congested(&pd->disk->queue->backing_dev_info,
+					BLK_RW_ASYNC);
+	}
 
 	pkt->sleep_time = max(PACKET_WAIT_TIME, 1);
 	pkt_set_state(pkt, PACKET_WAITING_STATE);
@@ -1392,55 +1294,35 @@ try_next_bio:
  */
 static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
 {
-	struct bio *bio;
 	int f;
-	int frames_write;
 	struct bio_vec *bvec = pkt->w_bio->bi_io_vec;
 
+	bio_reset(pkt->w_bio);
+	pkt->w_bio->bi_iter.bi_sector = pkt->sector;
+	pkt->w_bio->bi_bdev = pd->bdev;
+	pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
+	pkt->w_bio->bi_private = pkt;
+
+	/* XXX: locking? */
 	for (f = 0; f < pkt->frames; f++) {
 		bvec[f].bv_page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE];
 		bvec[f].bv_offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
+		if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
+			BUG();
 	}
+	pkt_dbg(2, pd, "vcnt=%d\n", pkt->w_bio->bi_vcnt);
 
 	/*
 	 * Fill-in bvec with data from orig_bios.
 	 */
-	frames_write = 0;
 	spin_lock(&pkt->lock);
-	for (bio = pkt->orig_bios; bio; bio = bio->bi_next) {
-		int segment = bio->bi_idx;
-		int src_offs = 0;
-		int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9);
-		int num_frames = bio->bi_size / CD_FRAMESIZE;
-		BUG_ON(first_frame < 0);
-		BUG_ON(first_frame + num_frames > pkt->frames);
-		for (f = first_frame; f < first_frame + num_frames; f++) {
-			struct bio_vec *src_bvl = bio_iovec_idx(bio, segment);
-
-			while (src_offs >= src_bvl->bv_len) {
-				src_offs -= src_bvl->bv_len;
-				segment++;
-				BUG_ON(segment >= bio->bi_vcnt);
-				src_bvl = bio_iovec_idx(bio, segment);
-			}
+	bio_copy_data(pkt->w_bio, pkt->orig_bios.head);
 
-			if (src_bvl->bv_len - src_offs >= CD_FRAMESIZE) {
-				bvec[f].bv_page = src_bvl->bv_page;
-				bvec[f].bv_offset = src_bvl->bv_offset + src_offs;
-			} else {
-				pkt_copy_bio_data(bio, segment, src_offs,
-						  bvec[f].bv_page, bvec[f].bv_offset);
-			}
-			src_offs += CD_FRAMESIZE;
-			frames_write++;
-		}
-	}
 	pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE);
 	spin_unlock(&pkt->lock);
 
-	VPRINTK("pkt_start_write: Writing %d frames for zone %llx\n",
-		frames_write, (unsigned long long)pkt->sector);
-	BUG_ON(frames_write != pkt->write_size);
+	pkt_dbg(2, pd, "Writing %d frames for zone %llx\n",
+		pkt->write_size, (unsigned long long)pkt->sector);
 
 	if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) {
 		pkt_make_local_copy(pkt, bvec);
@@ -1450,19 +1332,6 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
 	}
 
 	/* Start the write request */
-	bio_init(pkt->w_bio);
-	pkt->w_bio->bi_max_vecs = PACKET_MAX_SIZE;
-	pkt->w_bio->bi_sector = pkt->sector;
-	pkt->w_bio->bi_bdev = pd->bdev;
-	pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
-	pkt->w_bio->bi_private = pkt;
-	pkt->w_bio->bi_io_vec = bvec;
-	pkt->w_bio->bi_destructor = pkt_bio_destructor;
-	for (f = 0; f < pkt->frames; f++)
-		if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
-			BUG();
-	VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt);
-
 	atomic_set(&pkt->io_wait, 1);
 	pkt->w_bio->bi_rw = WRITE;
 	pkt_queue_bio(pd, pkt->w_bio);
@@ -1470,27 +1339,21 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
 
 static void pkt_finish_packet(struct packet_data *pkt, int uptodate)
 {
-	struct bio *bio, *next;
+	struct bio *bio;
 
 	if (!uptodate)
 		pkt->cache_valid = 0;
 
 	/* Finish all bios corresponding to this packet */
-	bio = pkt->orig_bios;
-	while (bio) {
-		next = bio->bi_next;
-		bio->bi_next = NULL;
+	while ((bio = bio_list_pop(&pkt->orig_bios)))
 		bio_endio(bio, uptodate ? 0 : -EIO);
-		bio = next;
-	}
-	pkt->orig_bios = pkt->orig_bios_tail = NULL;
 }
 
 static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt)
 {
 	int uptodate;
 
-	VPRINTK("run_state_machine: pkt %d\n", pkt->id);
+	pkt_dbg(2, pd, "pkt %d\n", pkt->id);
 
 	for (;;) {
 		switch (pkt->state) {
@@ -1529,7 +1392,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data
 			if (pkt_start_recovery(pkt)) {
 				pkt_start_write(pd, pkt);
 			} else {
-				VPRINTK("No recovery possible\n");
+				pkt_dbg(2, pd, "No recovery possible\n");
 				pkt_set_state(pkt, PACKET_FINISHED_STATE);
 			}
 			break;
@@ -1550,8 +1413,6 @@ static void pkt_handle_packets(struct pktcdvd_device *pd)
 {
 	struct packet_data *pkt, *next;
 
-	VPRINTK("pkt_handle_packets\n");
-
 	/*
 	 * Run state machine for active packets
 	 */
@@ -1602,7 +1463,7 @@ static int kcdrwd(void *foobar)
 	struct packet_data *pkt;
 	long min_sleep_time, residue;
 
-	set_user_nice(current, -20);
+	set_user_nice(current, MIN_NICE);
 	set_freezable();
 
 	for (;;) {
@@ -1633,9 +1494,9 @@ static int kcdrwd(void *foobar)
 			if (PACKET_DEBUG > 1) {
 				int states[PACKET_NUM_STATES];
 				pkt_count_states(pd, states);
-				VPRINTK("kcdrwd: i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n",
-					states[0], states[1], states[2], states[3],
-					states[4], states[5]);
+				pkt_dbg(2, pd, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n",
+					states[0], states[1], states[2],
+					states[3], states[4], states[5]);
 			}
 
 			min_sleep_time = MAX_SCHEDULE_TIMEOUT;
@@ -1644,11 +1505,9 @@ static int kcdrwd(void *foobar)
 					min_sleep_time = pkt->sleep_time;
 			}
 
-			generic_unplug_device(bdev_get_queue(pd->bdev));
-
-			VPRINTK("kcdrwd: sleeping\n");
+			pkt_dbg(2, pd, "sleeping\n");
 			residue = schedule_timeout(min_sleep_time);
-			VPRINTK("kcdrwd: wake up\n");
+			pkt_dbg(2, pd, "wake up\n");
 
 			/* make swsusp happy with our thread */
 			try_to_freeze();
@@ -1696,9 +1555,10 @@ work_to_do:
 
 static void pkt_print_settings(struct pktcdvd_device *pd)
 {
-	printk(DRIVER_NAME": %s packets, ", pd->settings.fp ? "Fixed" : "Variable");
-	printk("%u blocks, ", pd->settings.size >> 2);
-	printk("Mode-%c disc\n", pd->settings.block_mode == 8 ? '1' : '2');
+	pkt_info(pd, "%s packets, %u blocks, Mode-%c disc\n",
+		 pd->settings.fp ? "Fixed" : "Variable",
+		 pd->settings.size >> 2,
+		 pd->settings.block_mode == 8 ? '1' : '2');
 }
 
 static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control)
@@ -1832,7 +1692,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
 	init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ);
 	cgc.sense = &sense;
 	if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) {
-		pkt_dump_sense(&cgc);
+		pkt_dump_sense(pd, &cgc);
 		return ret;
 	}
 
@@ -1847,7 +1707,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
 	init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ);
 	cgc.sense = &sense;
 	if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) {
-		pkt_dump_sense(&cgc);
+		pkt_dump_sense(pd, &cgc);
 		return ret;
 	}
 
@@ -1882,14 +1742,14 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
 		/*
 		 * paranoia
 		 */
-		printk(DRIVER_NAME": write mode wrong %d\n", wp->data_block_type);
+		pkt_err(pd, "write mode wrong %d\n", wp->data_block_type);
 		return 1;
 	}
 	wp->packet_size = cpu_to_be32(pd->settings.size >> 2);
 
 	cgc.buflen = cgc.cmd[8] = size;
 	if ((ret = pkt_mode_select(pd, &cgc))) {
-		pkt_dump_sense(&cgc);
+		pkt_dump_sense(pd, &cgc);
 		return ret;
 	}
 
@@ -1926,7 +1786,7 @@ static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti)
 	if (ti->rt == 1 && ti->blank == 0)
 		return 1;
 
-	printk(DRIVER_NAME": bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet);
+	pkt_err(pd, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet);
 	return 0;
 }
 
@@ -1944,7 +1804,8 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di)
 		case 0x12: /* DVD-RAM */
 			return 1;
 		default:
-			VPRINTK(DRIVER_NAME": Wrong disc profile (%x)\n", pd->mmc3_profile);
+			pkt_dbg(2, pd, "Wrong disc profile (%x)\n",
+				pd->mmc3_profile);
 			return 0;
 	}
 
@@ -1953,22 +1814,22 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di)
 	 * but i'm not sure, should we leave this to user apps? probably.
 	 */
 	if (di->disc_type == 0xff) {
-		printk(DRIVER_NAME": Unknown disc. No track?\n");
+		pkt_notice(pd, "unknown disc - no track?\n");
 		return 0;
 	}
 
 	if (di->disc_type != 0x20 && di->disc_type != 0) {
-		printk(DRIVER_NAME": Wrong disc type (%x)\n", di->disc_type);
+		pkt_err(pd, "wrong disc type (%x)\n", di->disc_type);
 		return 0;
 	}
 
 	if (di->erasable == 0) {
-		printk(DRIVER_NAME": Disc not erasable\n");
+		pkt_notice(pd, "disc not erasable\n");
 		return 0;
 	}
 
 	if (di->border_status == PACKET_SESSION_RESERVED) {
-		printk(DRIVER_NAME": Can't write to last track (reserved)\n");
+		pkt_err(pd, "can't write to last track (reserved)\n");
 		return 0;
 	}
 
@@ -1993,7 +1854,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
 	memset(&ti, 0, sizeof(track_information));
 
 	if ((ret = pkt_get_disc_info(pd, &di))) {
-		printk("failed get_disc\n");
+		pkt_err(pd, "failed get_disc\n");
 		return ret;
 	}
 
@@ -2004,12 +1865,12 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
 
 	track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */
 	if ((ret = pkt_get_track_info(pd, track, 1, &ti))) {
-		printk(DRIVER_NAME": failed get_track\n");
+		pkt_err(pd, "failed get_track\n");
 		return ret;
 	}
 
 	if (!pkt_writable_track(pd, &ti)) {
-		printk(DRIVER_NAME": can't write to this track\n");
+		pkt_err(pd, "can't write to this track\n");
 		return -EROFS;
 	}
 
@@ -2019,11 +1880,11 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
 	 */
 	pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2;
 	if (pd->settings.size == 0) {
-		printk(DRIVER_NAME": detected zero packet size!\n");
+		pkt_notice(pd, "detected zero packet size!\n");
 		return -ENXIO;
 	}
 	if (pd->settings.size > PACKET_MAX_SECTORS) {
-		printk(DRIVER_NAME": packet size is too big\n");
+		pkt_err(pd, "packet size is too big\n");
 		return -EROFS;
 	}
 	pd->settings.fp = ti.fp;
@@ -2065,7 +1926,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
 			pd->settings.block_mode = PACKET_BLOCK_MODE2;
 			break;
 		default:
-			printk(DRIVER_NAME": unknown data mode\n");
+			pkt_err(pd, "unknown data mode\n");
 			return -EROFS;
 	}
 	return 0;
@@ -2099,10 +1960,10 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd,
 	cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff));
 	ret = pkt_mode_select(pd, &cgc);
 	if (ret) {
-		printk(DRIVER_NAME": write caching control failed\n");
-		pkt_dump_sense(&cgc);
+		pkt_err(pd, "write caching control failed\n");
+		pkt_dump_sense(pd, &cgc);
 	} else if (!ret && set)
-		printk(DRIVER_NAME": enabled write caching on %s\n", pd->name);
+		pkt_notice(pd, "enabled write caching\n");
 	return ret;
 }
 
@@ -2138,7 +1999,7 @@ static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd,
 			     sizeof(struct mode_page_header);
 		ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
 		if (ret) {
-			pkt_dump_sense(&cgc);
+			pkt_dump_sense(pd, &cgc);
 			return ret;
 		}
 	}
@@ -2197,7 +2058,7 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
 	cgc.cmd[8] = 2;
 	ret = pkt_generic_packet(pd, &cgc);
 	if (ret) {
-		pkt_dump_sense(&cgc);
+		pkt_dump_sense(pd, &cgc);
 		return ret;
 	}
 	size = ((unsigned int) buf[0]<<8) + buf[1] + 2;
@@ -2212,16 +2073,16 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
 	cgc.cmd[8] = size;
 	ret = pkt_generic_packet(pd, &cgc);
 	if (ret) {
-		pkt_dump_sense(&cgc);
+		pkt_dump_sense(pd, &cgc);
 		return ret;
 	}
 
 	if (!(buf[6] & 0x40)) {
-		printk(DRIVER_NAME": Disc type is not CD-RW\n");
+		pkt_notice(pd, "disc type is not CD-RW\n");
 		return 1;
 	}
 	if (!(buf[6] & 0x4)) {
-		printk(DRIVER_NAME": A1 values on media are not valid, maybe not CDRW?\n");
+		pkt_notice(pd, "A1 values on media are not valid, maybe not CDRW?\n");
 		return 1;
 	}
 
@@ -2241,14 +2102,14 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
 			*speed = us_clv_to_speed[sp];
 			break;
 		default:
-			printk(DRIVER_NAME": Unknown disc sub-type %d\n",st);
+			pkt_notice(pd, "unknown disc sub-type %d\n", st);
 			return 1;
 	}
 	if (*speed) {
-		printk(DRIVER_NAME": Max. media speed: %d\n",*speed);
+		pkt_info(pd, "maximum media speed: %d\n", *speed);
 		return 0;
 	} else {
-		printk(DRIVER_NAME": Unknown speed %d for sub-type %d\n",sp,st);
+		pkt_notice(pd, "unknown speed %d for sub-type %d\n", sp, st);
 		return 1;
 	}
 }
@@ -2259,7 +2120,7 @@ static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd)
 	struct request_sense sense;
 	int ret;
 
-	VPRINTK(DRIVER_NAME": Performing OPC\n");
+	pkt_dbg(2, pd, "Performing OPC\n");
 
 	init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
 	cgc.sense = &sense;
@@ -2267,7 +2128,7 @@ static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd)
 	cgc.cmd[0] = GPCMD_SEND_OPC;
 	cgc.cmd[1] = 1;
 	if ((ret = pkt_generic_packet(pd, &cgc)))
-		pkt_dump_sense(&cgc);
+		pkt_dump_sense(pd, &cgc);
 	return ret;
 }
 
@@ -2277,12 +2138,12 @@ static int pkt_open_write(struct pktcdvd_device *pd)
 	unsigned int write_speed, media_write_speed, read_speed;
 
 	if ((ret = pkt_probe_settings(pd))) {
-		VPRINTK(DRIVER_NAME": %s failed probe\n", pd->name);
+		pkt_dbg(2, pd, "failed probe\n");
 		return ret;
 	}
 
 	if ((ret = pkt_set_write_settings(pd))) {
-		DPRINTK(DRIVER_NAME": %s failed saving write settings\n", pd->name);
+		pkt_dbg(1, pd, "failed saving write settings\n");
 		return -EIO;
 	}
 
@@ -2294,26 +2155,26 @@ static int pkt_open_write(struct pktcdvd_device *pd)
 		case 0x13: /* DVD-RW */
 		case 0x1a: /* DVD+RW */
 		case 0x12: /* DVD-RAM */
-			DPRINTK(DRIVER_NAME": write speed %ukB/s\n", write_speed);
+			pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed);
 			break;
 		default:
 			if ((ret = pkt_media_speed(pd, &media_write_speed)))
 				media_write_speed = 16;
 			write_speed = min(write_speed, media_write_speed * 177);
-			DPRINTK(DRIVER_NAME": write speed %ux\n", write_speed / 176);
+			pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176);
 			break;
 	}
 	read_speed = write_speed;
 
 	if ((ret = pkt_set_speed(pd, write_speed, read_speed))) {
-		DPRINTK(DRIVER_NAME": %s couldn't set write speed\n", pd->name);
+		pkt_dbg(1, pd, "couldn't set write speed\n");
 		return -EIO;
 	}
 	pd->write_speed = write_speed;
 	pd->read_speed = read_speed;
 
 	if ((ret = pkt_perform_opc(pd))) {
-		DPRINTK(DRIVER_NAME": %s Optimum Power Calibration failed\n", pd->name);
+		pkt_dbg(1, pd, "Optimum Power Calibration failed\n");
 	}
 
 	return 0;
@@ -2334,15 +2195,12 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
 	 * so bdget() can't fail.
 	 */
 	bdget(pd->bdev->bd_dev);
-	if ((ret = blkdev_get(pd->bdev, FMODE_READ)))
+	if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd)))
 		goto out;
 
-	if ((ret = bd_claim(pd->bdev, pd)))
-		goto out_putdev;
-
 	if ((ret = pkt_get_last_written(pd, &lba))) {
-		printk(DRIVER_NAME": pkt_get_last_written failed\n");
-		goto out_unclaim;
+		pkt_err(pd, "pkt_get_last_written failed\n");
+		goto out_putdev;
 	}
 
 	set_capacity(pd->disk, lba << 2);
@@ -2352,13 +2210,13 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
 	q = bdev_get_queue(pd->bdev);
 	if (write) {
 		if ((ret = pkt_open_write(pd)))
-			goto out_unclaim;
+			goto out_putdev;
 		/*
 		 * Some CDRW drives can not handle writes larger than one packet,
 		 * even if the size is a multiple of the packet size.
 		 */
 		spin_lock_irq(q->queue_lock);
-		blk_queue_max_sectors(q, pd->settings.size);
+		blk_queue_max_hw_sectors(q, pd->settings.size);
 		spin_unlock_irq(q->queue_lock);
 		set_bit(PACKET_WRITABLE, &pd->flags);
 	} else {
@@ -2367,23 +2225,21 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
 	}
 
 	if ((ret = pkt_set_segment_merging(pd, q)))
-		goto out_unclaim;
+		goto out_putdev;
 
 	if (write) {
 		if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) {
-			printk(DRIVER_NAME": not enough memory for buffers\n");
+			pkt_err(pd, "not enough memory for buffers\n");
 			ret = -ENOMEM;
-			goto out_unclaim;
+			goto out_putdev;
 		}
-		printk(DRIVER_NAME": %lukB available on disc\n", lba << 1);
+		pkt_info(pd, "%lukB available on disc\n", lba << 1);
 	}
 
 	return 0;
 
-out_unclaim:
-	bd_release(pd->bdev);
 out_putdev:
-	blkdev_put(pd->bdev, FMODE_READ);
+	blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
 out:
 	return ret;
 }
@@ -2395,18 +2251,17 @@ out:
 static void pkt_release_dev(struct pktcdvd_device *pd, int flush)
 {
 	if (flush && pkt_flush_cache(pd))
-		DPRINTK(DRIVER_NAME": %s not flushing cache\n", pd->name);
+		pkt_dbg(1, pd, "not flushing cache\n");
 
 	pkt_lock_door(pd, 0);
 
 	pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
-	bd_release(pd->bdev);
-	blkdev_put(pd->bdev, FMODE_READ);
+	blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
 
 	pkt_shrink_pktlist(pd);
 }
 
-static struct pktcdvd_device *pkt_find_dev_from_minor(int dev_minor)
+static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor)
 {
 	if (dev_minor >= MAX_WRITERS)
 		return NULL;
@@ -2418,8 +2273,7 @@ static int pkt_open(struct block_device *bdev, fmode_t mode)
 	struct pktcdvd_device *pd = NULL;
 	int ret;
 
-	VPRINTK(DRIVER_NAME": entering open\n");
-
+	mutex_lock(&pktcdvd_mutex);
 	mutex_lock(&ctl_mutex);
 	pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev));
 	if (!pd) {
@@ -2447,21 +2301,22 @@ static int pkt_open(struct block_device *bdev, fmode_t mode)
 	}
 
 	mutex_unlock(&ctl_mutex);
+	mutex_unlock(&pktcdvd_mutex);
 	return 0;
 
 out_dec:
 	pd->refcnt--;
 out:
-	VPRINTK(DRIVER_NAME": failed open (%d)\n", ret);
 	mutex_unlock(&ctl_mutex);
+	mutex_unlock(&pktcdvd_mutex);
 	return ret;
 }
 
-static int pkt_close(struct gendisk *disk, fmode_t mode)
+static void pkt_close(struct gendisk *disk, fmode_t mode)
 {
 	struct pktcdvd_device *pd = disk->private_data;
-	int ret = 0;
 
+	mutex_lock(&pktcdvd_mutex);
 	mutex_lock(&ctl_mutex);
 	pd->refcnt--;
 	BUG_ON(pd->refcnt < 0);
@@ -2470,7 +2325,7 @@ static int pkt_close(struct gendisk *disk, fmode_t mode)
 		pkt_release_dev(pd, flush);
 	}
 	mutex_unlock(&ctl_mutex);
-	return ret;
+	mutex_unlock(&pktcdvd_mutex);
 }
 
 
@@ -2485,74 +2340,29 @@ static void pkt_end_io_read_cloned(struct bio *bio, int err)
 	pkt_bio_finished(pd);
 }
 
-static int pkt_make_request(struct request_queue *q, struct bio *bio)
+static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio)
 {
-	struct pktcdvd_device *pd;
-	char b[BDEVNAME_SIZE];
+	struct bio *cloned_bio = bio_clone(bio, GFP_NOIO);
+	struct packet_stacked_data *psd = mempool_alloc(psd_pool, GFP_NOIO);
+
+	psd->pd = pd;
+	psd->bio = bio;
+	cloned_bio->bi_bdev = pd->bdev;
+	cloned_bio->bi_private = psd;
+	cloned_bio->bi_end_io = pkt_end_io_read_cloned;
+	pd->stats.secs_r += bio_sectors(bio);
+	pkt_queue_bio(pd, cloned_bio);
+}
+
+static void pkt_make_request_write(struct request_queue *q, struct bio *bio)
+{
+	struct pktcdvd_device *pd = q->queuedata;
 	sector_t zone;
 	struct packet_data *pkt;
 	int was_empty, blocked_bio;
 	struct pkt_rb_node *node;
 
-	pd = q->queuedata;
-	if (!pd) {
-		printk(DRIVER_NAME": %s incorrect request queue\n", bdevname(bio->bi_bdev, b));
-		goto end_io;
-	}
-
-	/*
-	 * Clone READ bios so we can have our own bi_end_io callback.
-	 */
-	if (bio_data_dir(bio) == READ) {
-		struct bio *cloned_bio = bio_clone(bio, GFP_NOIO);
-		struct packet_stacked_data *psd = mempool_alloc(psd_pool, GFP_NOIO);
-
-		psd->pd = pd;
-		psd->bio = bio;
-		cloned_bio->bi_bdev = pd->bdev;
-		cloned_bio->bi_private = psd;
-		cloned_bio->bi_end_io = pkt_end_io_read_cloned;
-		pd->stats.secs_r += bio->bi_size >> 9;
-		pkt_queue_bio(pd, cloned_bio);
-		return 0;
-	}
-
-	if (!test_bit(PACKET_WRITABLE, &pd->flags)) {
-		printk(DRIVER_NAME": WRITE for ro device %s (%llu)\n",
-			pd->name, (unsigned long long)bio->bi_sector);
-		goto end_io;
-	}
-
-	if (!bio->bi_size || (bio->bi_size % CD_FRAMESIZE)) {
-		printk(DRIVER_NAME": wrong bio size\n");
-		goto end_io;
-	}
-
-	blk_queue_bounce(q, &bio);
-
-	zone = ZONE(bio->bi_sector, pd);
-	VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n",
-		(unsigned long long)bio->bi_sector,
-		(unsigned long long)(bio->bi_sector + bio_sectors(bio)));
-
-	/* Check if we have to split the bio */
-	{
-		struct bio_pair *bp;
-		sector_t last_zone;
-		int first_sectors;
-
-		last_zone = ZONE(bio->bi_sector + bio_sectors(bio) - 1, pd);
-		if (last_zone != zone) {
-			BUG_ON(last_zone != zone + pd->settings.size);
-			first_sectors = last_zone - bio->bi_sector;
-			bp = bio_split(bio, first_sectors);
-			BUG_ON(!bp);
-			pkt_make_request(q, &bp->bio1);
-			pkt_make_request(q, &bp->bio2);
-			bio_pair_release(bp);
-			return 0;
-		}
-	}
+	zone = get_zone(bio->bi_iter.bi_sector, pd);
 
 	/*
 	 * If we find a matching packet in state WAITING or READ_WAIT, we can
@@ -2565,9 +2375,9 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
 			spin_lock(&pkt->lock);
 			if ((pkt->state == PACKET_WAITING_STATE) ||
 			    (pkt->state == PACKET_READ_WAIT_STATE)) {
-				pkt_add_list_last(bio, &pkt->orig_bios,
-						  &pkt->orig_bios_tail);
-				pkt->write_size += bio->bi_size / CD_FRAMESIZE;
+				bio_list_add(&pkt->orig_bios, bio);
+				pkt->write_size +=
+					bio->bi_iter.bi_size / CD_FRAMESIZE;
 				if ((pkt->write_size >= pkt->frames) &&
 				    (pkt->state == PACKET_WAITING_STATE)) {
 					atomic_inc(&pkt->run_sm);
@@ -2575,7 +2385,7 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
 				}
 				spin_unlock(&pkt->lock);
 				spin_unlock(&pd->cdrw.active_list_lock);
-				return 0;
+				return;
 			} else {
 				blocked_bio = 1;
 			}
@@ -2592,10 +2402,10 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
 	spin_lock(&pd->lock);
 	if (pd->write_congestion_on > 0
 	    && pd->bio_queue_size >= pd->write_congestion_on) {
-		set_bdi_congested(&q->backing_dev_info, WRITE);
+		set_bdi_congested(&q->backing_dev_info, BLK_RW_ASYNC);
 		do {
 			spin_unlock(&pd->lock);
-			congestion_wait(WRITE, HZ);
+			congestion_wait(BLK_RW_ASYNC, HZ);
 			spin_lock(&pd->lock);
 		} while(pd->bio_queue_size > pd->write_congestion_off);
 	}
@@ -2626,10 +2436,67 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
 		 */
 		wake_up(&pd->wqueue);
 	}
-	return 0;
+}
+
+static void pkt_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct pktcdvd_device *pd;
+	char b[BDEVNAME_SIZE];
+	struct bio *split;
+
+	pd = q->queuedata;
+	if (!pd) {
+		pr_err("%s incorrect request queue\n",
+		       bdevname(bio->bi_bdev, b));
+		goto end_io;
+	}
+
+	pkt_dbg(2, pd, "start = %6llx stop = %6llx\n",
+		(unsigned long long)bio->bi_iter.bi_sector,
+		(unsigned long long)bio_end_sector(bio));
+
+	/*
+	 * Clone READ bios so we can have our own bi_end_io callback.
+	 */
+	if (bio_data_dir(bio) == READ) {
+		pkt_make_request_read(pd, bio);
+		return;
+	}
+
+	if (!test_bit(PACKET_WRITABLE, &pd->flags)) {
+		pkt_notice(pd, "WRITE for ro device (%llu)\n",
+			   (unsigned long long)bio->bi_iter.bi_sector);
+		goto end_io;
+	}
+
+	if (!bio->bi_iter.bi_size || (bio->bi_iter.bi_size % CD_FRAMESIZE)) {
+		pkt_err(pd, "wrong bio size\n");
+		goto end_io;
+	}
+
+	blk_queue_bounce(q, &bio);
+
+	do {
+		sector_t zone = get_zone(bio->bi_iter.bi_sector, pd);
+		sector_t last_zone = get_zone(bio_end_sector(bio) - 1, pd);
+
+		if (last_zone != zone) {
+			BUG_ON(last_zone != zone + pd->settings.size);
+
+			split = bio_split(bio, last_zone -
+					  bio->bi_iter.bi_sector,
+					  GFP_NOIO, fs_bio_set);
+			bio_chain(split, bio);
+		} else {
+			split = bio;
+		}
+
+		pkt_make_request_write(q, split);
+	} while (split != bio);
+
+	return;
 end_io:
 	bio_io_error(bio);
-	return 0;
 }
 
 
@@ -2638,7 +2505,7 @@ static int pkt_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
 			  struct bio_vec *bvec)
 {
 	struct pktcdvd_device *pd = q->queuedata;
-	sector_t zone = ZONE(bmd->bi_sector, pd);
+	sector_t zone = get_zone(bmd->bi_sector, pd);
 	int used = ((bmd->bi_sector - zone) << 9) + bmd->bi_size;
 	int remaining = (pd->settings.size << 9) - used;
 	int remaining2;
@@ -2660,7 +2527,7 @@ static void pkt_init_queue(struct pktcdvd_device *pd)
 
 	blk_queue_make_request(q, pkt_make_request);
 	blk_queue_logical_block_size(q, CD_FRAMESIZE);
-	blk_queue_max_sectors(q, PACKET_MAX_SECTORS);
+	blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS);
 	blk_queue_merge_bvec(q, pkt_merge_bvec);
 	q->queuedata = pd;
 }
@@ -2729,7 +2596,7 @@ static int pkt_seq_show(struct seq_file *m, void *p)
 
 static int pkt_seq_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, pkt_seq_show, PDE(inode)->data);
+	return single_open(file, pkt_seq_show, PDE_DATA(inode));
 }
 
 static const struct file_operations pkt_proc_fops = {
@@ -2747,7 +2614,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 	struct block_device *bdev;
 
 	if (pd->pkt_dev == dev) {
-		printk(DRIVER_NAME": Recursive setup not allowed\n");
+		pkt_err(pd, "recursive setup not allowed\n");
 		return -EBUSY;
 	}
 	for (i = 0; i < MAX_WRITERS; i++) {
@@ -2755,11 +2622,12 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 		if (!pd2)
 			continue;
 		if (pd2->bdev->bd_dev == dev) {
-			printk(DRIVER_NAME": %s already setup\n", bdevname(pd2->bdev, b));
+			pkt_err(pd, "%s already setup\n",
+				bdevname(pd2->bdev, b));
 			return -EBUSY;
 		}
 		if (pd2->pkt_dev == dev) {
-			printk(DRIVER_NAME": Can't chain pktcdvd devices\n");
+			pkt_err(pd, "can't chain pktcdvd devices\n");
 			return -EBUSY;
 		}
 	}
@@ -2767,7 +2635,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 	bdev = bdget(dev);
 	if (!bdev)
 		return -ENOMEM;
-	ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY);
+	ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
 	if (ret)
 		return ret;
 
@@ -2782,13 +2650,13 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 	atomic_set(&pd->cdrw.pending_bios, 0);
 	pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name);
 	if (IS_ERR(pd->cdrw.thread)) {
-		printk(DRIVER_NAME": can't start kernel thread\n");
+		pkt_err(pd, "can't start kernel thread\n");
 		ret = -ENOMEM;
 		goto out_mem;
 	}
 
 	proc_create_data(pd->name, 0, pkt_proc, &pkt_proc_fops, pd);
-	DPRINTK(DRIVER_NAME": writer %s mapped to %s\n", pd->name, bdevname(bdev, b));
+	pkt_dbg(1, pd, "writer mapped to %s\n", bdevname(bdev, b));
 	return 0;
 
 out_mem:
@@ -2801,10 +2669,12 @@ out_mem:
 static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
 {
 	struct pktcdvd_device *pd = bdev->bd_disk->private_data;
+	int ret;
 
-	VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd,
-		MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
+	pkt_dbg(2, pd, "cmd %x, dev %d:%d\n",
+		cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
 
+	mutex_lock(&pktcdvd_mutex);
 	switch (cmd) {
 	case CDROMEJECT:
 		/*
@@ -2822,17 +2692,20 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
 	case CDROM_LAST_WRITTEN:
 	case CDROM_SEND_PACKET:
 	case SCSI_IOCTL_SEND_COMMAND:
-		return __blkdev_driver_ioctl(pd->bdev, mode, cmd, arg);
+		ret = __blkdev_driver_ioctl(pd->bdev, mode, cmd, arg);
+		break;
 
 	default:
-		VPRINTK(DRIVER_NAME": Unknown ioctl for %s (%x)\n", pd->name, cmd);
-		return -ENOTTY;
+		pkt_dbg(2, pd, "Unknown ioctl (%x)\n", cmd);
+		ret = -ENOTTY;
 	}
+	mutex_unlock(&pktcdvd_mutex);
 
-	return 0;
+	return ret;
 }
 
-static int pkt_media_changed(struct gendisk *disk)
+static unsigned int pkt_check_events(struct gendisk *disk,
+				     unsigned int clearing)
 {
 	struct pktcdvd_device *pd = disk->private_data;
 	struct gendisk *attached_disk;
@@ -2842,19 +2715,24 @@ static int pkt_media_changed(struct gendisk *disk)
 	if (!pd->bdev)
 		return 0;
 	attached_disk = pd->bdev->bd_disk;
-	if (!attached_disk)
+	if (!attached_disk || !attached_disk->fops->check_events)
 		return 0;
-	return attached_disk->fops->media_changed(attached_disk);
+	return attached_disk->fops->check_events(attached_disk, clearing);
 }
 
-static struct block_device_operations pktcdvd_ops = {
+static const struct block_device_operations pktcdvd_ops = {
 	.owner =		THIS_MODULE,
 	.open =			pkt_open,
 	.release =		pkt_close,
-	.locked_ioctl =		pkt_ioctl,
-	.media_changed =	pkt_media_changed,
+	.ioctl =		pkt_ioctl,
+	.check_events =		pkt_check_events,
 };
 
+static char *pktcdvd_devnode(struct gendisk *gd, umode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "pktcdvd/%s", gd->disk_name);
+}
+
 /*
  * Set up mapping from pktcdvd device to CD-ROM device.
  */
@@ -2871,7 +2749,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
 		if (!pkt_devs[idx])
 			break;
 	if (idx == MAX_WRITERS) {
-		printk(DRIVER_NAME": max %d writers supported\n", MAX_WRITERS);
+		pr_err("max %d writers supported\n", MAX_WRITERS);
 		ret = -EBUSY;
 		goto out_mutex;
 	}
@@ -2891,6 +2769,8 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
 
 	spin_lock_init(&pd->lock);
 	spin_lock_init(&pd->iosched.lock);
+	bio_list_init(&pd->iosched.read_queue);
+	bio_list_init(&pd->iosched.write_queue);
 	sprintf(pd->name, DRIVER_NAME"%d", idx);
 	init_waitqueue_head(&pd->wqueue);
 	pd->bio_queue = RB_ROOT;
@@ -2907,6 +2787,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
 	disk->fops = &pktcdvd_ops;
 	disk->flags = GENHD_FL_REMOVABLE;
 	strcpy(disk->disk_name, pd->name);
+	disk->devnode = pktcdvd_devnode;
 	disk->private_data = pd;
 	disk->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!disk->queue)
@@ -2917,6 +2798,10 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
 	if (ret)
 		goto out_new_dev;
 
+	/* inherit events of the host device */
+	disk->events = pd->bdev->bd_disk->events;
+	disk->async_events = pd->bdev->bd_disk->async_events;
+
 	add_disk(disk);
 
 	pkt_sysfs_dev_new(pd);
@@ -2939,7 +2824,7 @@ out_mem:
 	kfree(pd);
 out_mutex:
 	mutex_unlock(&ctl_mutex);
-	printk(DRIVER_NAME": setup of pktcdvd device failed\n");
+	pr_err("setup of pktcdvd device failed\n");
 	return ret;
 }
 
@@ -2960,7 +2845,7 @@ static int pkt_remove_dev(dev_t pkt_dev)
 			break;
 	}
 	if (idx == MAX_WRITERS) {
-		DPRINTK(DRIVER_NAME": dev not setup\n");
+		pr_debug("dev not setup\n");
 		ret = -ENXIO;
 		goto out;
 	}
@@ -2980,7 +2865,7 @@ static int pkt_remove_dev(dev_t pkt_dev)
 	blkdev_put(pd->bdev, FMODE_READ | FMODE_NDELAY);
 
 	remove_proc_entry(pd->name, pkt_proc);
-	DPRINTK(DRIVER_NAME": writer %s unmapped\n", pd->name);
+	pkt_dbg(1, pd, "writer unmapped\n");
 
 	del_gendisk(pd->disk);
 	blk_cleanup_queue(pd->disk->queue);
@@ -3016,7 +2901,7 @@ static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd)
 	mutex_unlock(&ctl_mutex);
 }
 
-static int pkt_ctl_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg)
+static long pkt_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	void __user *argp = (void __user *)arg;
 	struct pkt_ctrl_command ctrl_cmd;
@@ -3053,15 +2938,27 @@ static int pkt_ctl_ioctl(struct inode *inode, struct file *file, unsigned int cm
 	return ret;
 }
 
+#ifdef CONFIG_COMPAT
+static long pkt_ctl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	return pkt_ctl_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
 
 static const struct file_operations pkt_ctl_fops = {
-	.ioctl	 = pkt_ctl_ioctl,
-	.owner	 = THIS_MODULE,
+	.open		= nonseekable_open,
+	.unlocked_ioctl	= pkt_ctl_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= pkt_ctl_compat_ioctl,
+#endif
+	.owner		= THIS_MODULE,
+	.llseek		= no_llseek,
 };
 
 static struct miscdevice pkt_misc = {
 	.minor 		= MISC_DYNAMIC_MINOR,
 	.name  		= DRIVER_NAME,
+	.nodename	= "pktcdvd/control",
 	.fops  		= &pkt_ctl_fops
 };
 
@@ -3078,7 +2975,7 @@ static int __init pkt_init(void)
 
 	ret = register_blkdev(pktdev_major, DRIVER_NAME);
 	if (ret < 0) {
-		printk(DRIVER_NAME": Unable to register block device\n");
+		pr_err("unable to register block device\n");
 		goto out2;
 	}
 	if (!pktdev_major)
@@ -3092,7 +2989,7 @@ static int __init pkt_init(void)
 
 	ret = misc_register(&pkt_misc);
 	if (ret) {
-		printk(DRIVER_NAME": Unable to register misc device\n");
+		pr_err("unable to register misc device\n");
 		goto out_misc;
 	}
 
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index aaeeb544228..c120d70d3fb 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -20,6 +20,8 @@
 
 #include <linux/ata.h>
 #include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/module.h>
 
 #include <asm/lv1call.h>
 #include <asm/ps3stor.h>
@@ -82,7 +84,7 @@ enum lv1_ata_in_out {
 static int ps3disk_major;
 
 
-static struct block_device_operations ps3disk_fops = {
+static const struct block_device_operations ps3disk_fops = {
 	.owner		= THIS_MODULE,
 };
 
@@ -92,27 +94,26 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev,
 {
 	unsigned int offset = 0;
 	struct req_iterator iter;
-	struct bio_vec *bvec;
+	struct bio_vec bvec;
 	unsigned int i = 0;
 	size_t size;
 	void *buf;
 
 	rq_for_each_segment(bvec, req, iter) {
 		unsigned long flags;
-		dev_dbg(&dev->sbd.core,
-			"%s:%u: bio %u: %u segs %u sectors from %lu\n",
-			__func__, __LINE__, i, bio_segments(iter.bio),
-			bio_sectors(iter.bio), iter.bio->bi_sector);
+		dev_dbg(&dev->sbd.core, "%s:%u: bio %u: %u sectors from %lu\n",
+			__func__, __LINE__, i, bio_sectors(iter.bio),
+			iter.bio->bi_iter.bi_sector);
 
-		size = bvec->bv_len;
-		buf = bvec_kmap_irq(bvec, &flags);
+		size = bvec.bv_len;
+		buf = bvec_kmap_irq(&bvec, &flags);
 		if (gather)
 			memcpy(dev->bounce_buf+offset, buf, size);
 		else
 			memcpy(buf, dev->bounce_buf+offset, size);
 		offset += size;
-		flush_kernel_dcache_page(bvec->bv_page);
-		bvec_kunmap_irq(bvec, &flags);
+		flush_kernel_dcache_page(bvec.bv_page);
+		bvec_kunmap_irq(buf, &flags);
 		i++;
 	}
 }
@@ -120,7 +121,7 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev,
 static int ps3disk_submit_request_sg(struct ps3_storage_device *dev,
 				     struct request *req)
 {
-	struct ps3disk_private *priv = dev->sbd.core.driver_data;
+	struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
 	int write = rq_data_dir(req), res;
 	const char *op = write ? "write" : "read";
 	u64 start_sector, sectors;
@@ -128,7 +129,7 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev,
 
 #ifdef DEBUG
 	unsigned int n = 0;
-	struct bio_vec *bv;
+	struct bio_vec bv;
 	struct req_iterator iter;
 
 	rq_for_each_segment(bv, req, iter)
@@ -168,7 +169,7 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev,
 static int ps3disk_submit_flush_request(struct ps3_storage_device *dev,
 					struct request *req)
 {
-	struct ps3disk_private *priv = dev->sbd.core.driver_data;
+	struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
 	u64 res;
 
 	dev_dbg(&dev->sbd.core, "%s:%u: flush request\n", __func__, __LINE__);
@@ -195,13 +196,12 @@ static void ps3disk_do_request(struct ps3_storage_device *dev,
 	dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__);
 
 	while ((req = blk_fetch_request(q))) {
-		if (blk_fs_request(req)) {
-			if (ps3disk_submit_request_sg(dev, req))
-				break;
-		} else if (req->cmd_type == REQ_TYPE_LINUX_BLOCK &&
-			   req->cmd[0] == REQ_LB_OP_FLUSH) {
+		if (req->cmd_flags & REQ_FLUSH) {
 			if (ps3disk_submit_flush_request(dev, req))
 				break;
+		} else if (req->cmd_type == REQ_TYPE_FS) {
+			if (ps3disk_submit_request_sg(dev, req))
+				break;
 		} else {
 			blk_dump_rq_flags(req, DEVICE_NAME " bad request");
 			__blk_end_request_all(req, -EIO);
@@ -213,7 +213,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev,
 static void ps3disk_request(struct request_queue *q)
 {
 	struct ps3_storage_device *dev = q->queuedata;
-	struct ps3disk_private *priv = dev->sbd.core.driver_data;
+	struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
 
 	if (priv->req) {
 		dev_dbg(&dev->sbd.core, "%s:%u busy\n", __func__, __LINE__);
@@ -245,7 +245,7 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data)
 		return IRQ_HANDLED;
 	}
 
-	priv = dev->sbd.core.driver_data;
+	priv = ps3_system_bus_get_drvdata(&dev->sbd);
 	req = priv->req;
 	if (!req) {
 		dev_dbg(&dev->sbd.core,
@@ -256,8 +256,7 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data)
 		return IRQ_HANDLED;
 	}
 
-	if (req->cmd_type == REQ_TYPE_LINUX_BLOCK &&
-	    req->cmd[0] == REQ_LB_OP_FLUSH) {
+	if (req->cmd_flags & REQ_FLUSH) {
 		read = 0;
 		op = "flush";
 	} else {
@@ -364,7 +363,7 @@ static void ata_id_c_string(const u16 *id, unsigned char *s, unsigned int ofs,
 
 static int ps3disk_identify(struct ps3_storage_device *dev)
 {
-	struct ps3disk_private *priv = dev->sbd.core.driver_data;
+	struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
 	struct lv1_ata_cmnd_block ata_cmnd;
 	u16 *id = dev->bounce_buf;
 	u64 res;
@@ -397,21 +396,11 @@ static int ps3disk_identify(struct ps3_storage_device *dev)
 	return 0;
 }
 
-static void ps3disk_prepare_flush(struct request_queue *q, struct request *req)
-{
-	struct ps3_storage_device *dev = q->queuedata;
-
-	dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__);
-
-	req->cmd_type = REQ_TYPE_LINUX_BLOCK;
-	req->cmd[0] = REQ_LB_OP_FLUSH;
-}
-
 static unsigned long ps3disk_mask;
 
 static DEFINE_MUTEX(ps3disk_mask_mutex);
 
-static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
+static int ps3disk_probe(struct ps3_system_bus_device *_dev)
 {
 	struct ps3_storage_device *dev = to_ps3_storage_device(&_dev->core);
 	struct ps3disk_private *priv;
@@ -445,7 +434,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
 		goto fail;
 	}
 
-	dev->sbd.core.driver_data = priv;
+	ps3_system_bus_set_drvdata(_dev, priv);
 	spin_lock_init(&priv->lock);
 
 	dev->bounce_size = BOUNCE_SIZE;
@@ -474,16 +463,14 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
 
 	blk_queue_bounce_limit(queue, BLK_BOUNCE_HIGH);
 
-	blk_queue_max_sectors(queue, dev->bounce_size >> 9);
+	blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9);
 	blk_queue_segment_boundary(queue, -1UL);
 	blk_queue_dma_alignment(queue, dev->blk_size-1);
 	blk_queue_logical_block_size(queue, dev->blk_size);
 
-	blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH,
-			  ps3disk_prepare_flush);
+	blk_queue_flush(queue, REQ_FLUSH);
 
-	blk_queue_max_phys_segments(queue, -1);
-	blk_queue_max_hw_segments(queue, -1);
+	blk_queue_max_segments(queue, -1);
 	blk_queue_max_segment_size(queue, dev->bounce_size);
 
 	gendisk = alloc_disk(PS3DISK_MINORS);
@@ -523,7 +510,7 @@ fail_free_bounce:
 	kfree(dev->bounce_buf);
 fail_free_priv:
 	kfree(priv);
-	dev->sbd.core.driver_data = NULL;
+	ps3_system_bus_set_drvdata(_dev, NULL);
 fail:
 	mutex_lock(&ps3disk_mask_mutex);
 	__clear_bit(devidx, &ps3disk_mask);
@@ -534,7 +521,7 @@ fail:
 static int ps3disk_remove(struct ps3_system_bus_device *_dev)
 {
 	struct ps3_storage_device *dev = to_ps3_storage_device(&_dev->core);
-	struct ps3disk_private *priv = dev->sbd.core.driver_data;
+	struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
 
 	mutex_lock(&ps3disk_mask_mutex);
 	__clear_bit(MINOR(disk_devt(priv->gendisk)) / PS3DISK_MINORS,
@@ -548,7 +535,7 @@ static int ps3disk_remove(struct ps3_system_bus_device *_dev)
 	ps3stor_teardown(dev);
 	kfree(dev->bounce_buf);
 	kfree(priv);
-	dev->sbd.core.driver_data = NULL;
+	ps3_system_bus_set_drvdata(_dev, NULL);
 	return 0;
 }
 
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 8eddef373a9..ef45cfb98fd 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -10,12 +10,16 @@
 
 #include <linux/blkdev.h>
 #include <linux/delay.h>
+#include <linux/module.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 
+#include <asm/cell-regs.h>
 #include <asm/firmware.h>
 #include <asm/lv1call.h>
 #include <asm/ps3.h>
+#include <asm/ps3gpu.h>
 
 
 #define DEVICE_NAME		"ps3vram"
@@ -45,8 +49,6 @@
 #define NV_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN	0x0000030c
 #define NV_MEMORY_TO_MEMORY_FORMAT_NOTIFY	0x00000104
 
-#define L1GPU_CONTEXT_ATTRIBUTE_FB_BLIT 0x601
-
 #define CACHE_PAGE_PRESENT 1
 #define CACHE_PAGE_DIRTY   2
 
@@ -72,8 +74,7 @@ struct ps3vram_priv {
 	u64 memory_handle;
 	u64 context_handle;
 	u32 *ctrl;
-	u32 *reports;
-	u8 __iomem *ddr_base;
+	void *reports;
 	u8 *xdr_buf;
 
 	u32 *fifo_base;
@@ -81,15 +82,15 @@ struct ps3vram_priv {
 
 	struct ps3vram_cache cache;
 
-	/* Used to serialize cache/DMA operations */
-	struct mutex lock;
+	spinlock_t lock;	/* protecting list of bios */
+	struct bio_list list;
 };
 
 
 static int ps3vram_major;
 
 
-static struct block_device_operations ps3vram_fops = {
+static const struct block_device_operations ps3vram_fops = {
 	.owner		= THIS_MODULE,
 };
 
@@ -103,15 +104,15 @@ static char *size = "256M";
 module_param(size, charp, 0);
 MODULE_PARM_DESC(size, "memory size");
 
-static u32 *ps3vram_get_notifier(u32 *reports, int notifier)
+static u32 *ps3vram_get_notifier(void *reports, int notifier)
 {
-	return (void *)reports + DMA_NOTIFIER_OFFSET_BASE +
+	return reports + DMA_NOTIFIER_OFFSET_BASE +
 	       DMA_NOTIFIER_SIZE * notifier;
 }
 
 static void ps3vram_notifier_reset(struct ps3_system_bus_device *dev)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER);
 	int i;
 
@@ -122,9 +123,17 @@ static void ps3vram_notifier_reset(struct ps3_system_bus_device *dev)
 static int ps3vram_notifier_wait(struct ps3_system_bus_device *dev,
 				 unsigned int timeout_ms)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER);
-	unsigned long timeout = jiffies + msecs_to_jiffies(timeout_ms);
+	unsigned long timeout;
+
+	for (timeout = 20; timeout; timeout--) {
+		if (!notify[3])
+			return 0;
+		udelay(10);
+	}
+
+	timeout = jiffies + msecs_to_jiffies(timeout_ms);
 
 	do {
 		if (!notify[3])
@@ -137,7 +146,7 @@ static int ps3vram_notifier_wait(struct ps3_system_bus_device *dev,
 
 static void ps3vram_init_ring(struct ps3_system_bus_device *dev)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 
 	priv->ctrl[CTRL_PUT] = FIFO_BASE + FIFO_OFFSET;
 	priv->ctrl[CTRL_GET] = FIFO_BASE + FIFO_OFFSET;
@@ -146,7 +155,7 @@ static void ps3vram_init_ring(struct ps3_system_bus_device *dev)
 static int ps3vram_wait_ring(struct ps3_system_bus_device *dev,
 			     unsigned int timeout_ms)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	unsigned long timeout = jiffies + msecs_to_jiffies(timeout_ms);
 
 	do {
@@ -175,7 +184,7 @@ static void ps3vram_begin_ring(struct ps3vram_priv *priv, u32 chan, u32 tag,
 
 static void ps3vram_rewind_ring(struct ps3_system_bus_device *dev)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	int status;
 
 	ps3vram_out_ring(priv, 0x20000000 | (FIFO_BASE + FIFO_OFFSET));
@@ -183,20 +192,17 @@ static void ps3vram_rewind_ring(struct ps3_system_bus_device *dev)
 	priv->ctrl[CTRL_PUT] = FIFO_BASE + FIFO_OFFSET;
 
 	/* asking the HV for a blit will kick the FIFO */
-	status = lv1_gpu_context_attribute(priv->context_handle,
-					   L1GPU_CONTEXT_ATTRIBUTE_FB_BLIT, 0,
-					   0, 0, 0);
+	status = lv1_gpu_fb_blit(priv->context_handle, 0, 0, 0, 0);
 	if (status)
-		dev_err(&dev->core,
-			"%s: lv1_gpu_context_attribute failed %d\n", __func__,
-			status);
+		dev_err(&dev->core, "%s: lv1_gpu_fb_blit failed %d\n",
+			__func__, status);
 
 	priv->fifo_ptr = priv->fifo_base;
 }
 
 static void ps3vram_fire_ring(struct ps3_system_bus_device *dev)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	int status;
 
 	mutex_lock(&ps3_gpu_mutex);
@@ -205,13 +211,10 @@ static void ps3vram_fire_ring(struct ps3_system_bus_device *dev)
 			       (priv->fifo_ptr - priv->fifo_base) * sizeof(u32);
 
 	/* asking the HV for a blit will kick the FIFO */
-	status = lv1_gpu_context_attribute(priv->context_handle,
-					   L1GPU_CONTEXT_ATTRIBUTE_FB_BLIT, 0,
-					   0, 0, 0);
+	status = lv1_gpu_fb_blit(priv->context_handle, 0, 0, 0, 0);
 	if (status)
-		dev_err(&dev->core,
-			"%s: lv1_gpu_context_attribute failed %d\n", __func__,
-			status);
+		dev_err(&dev->core, "%s: lv1_gpu_fb_blit failed %d\n",
+			__func__, status);
 
 	if ((priv->fifo_ptr - priv->fifo_base) * sizeof(u32) >
 	    FIFO_SIZE - 1024) {
@@ -225,7 +228,7 @@ static void ps3vram_fire_ring(struct ps3_system_bus_device *dev)
 
 static void ps3vram_bind(struct ps3_system_bus_device *dev)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 
 	ps3vram_begin_ring(priv, UPLOAD_SUBCH, 0, 1);
 	ps3vram_out_ring(priv, 0x31337303);
@@ -248,7 +251,7 @@ static int ps3vram_upload(struct ps3_system_bus_device *dev,
 			  unsigned int src_offset, unsigned int dst_offset,
 			  int len, int count)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 
 	ps3vram_begin_ring(priv, UPLOAD_SUBCH,
 			   NV_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
@@ -280,7 +283,7 @@ static int ps3vram_download(struct ps3_system_bus_device *dev,
 			    unsigned int src_offset, unsigned int dst_offset,
 			    int len, int count)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 
 	ps3vram_begin_ring(priv, DOWNLOAD_SUBCH,
 			   NV_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
@@ -310,7 +313,7 @@ static int ps3vram_download(struct ps3_system_bus_device *dev,
 
 static void ps3vram_cache_evict(struct ps3_system_bus_device *dev, int entry)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	struct ps3vram_cache *cache = &priv->cache;
 
 	if (!(cache->tags[entry].flags & CACHE_PAGE_DIRTY))
@@ -332,7 +335,7 @@ static void ps3vram_cache_evict(struct ps3_system_bus_device *dev, int entry)
 static void ps3vram_cache_load(struct ps3_system_bus_device *dev, int entry,
 			       unsigned int address)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	struct ps3vram_cache *cache = &priv->cache;
 
 	dev_dbg(&dev->core, "Fetching %d: 0x%08x\n", entry, address);
@@ -352,7 +355,7 @@ static void ps3vram_cache_load(struct ps3_system_bus_device *dev, int entry,
 
 static void ps3vram_cache_flush(struct ps3_system_bus_device *dev)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	struct ps3vram_cache *cache = &priv->cache;
 	int i;
 
@@ -366,7 +369,7 @@ static void ps3vram_cache_flush(struct ps3_system_bus_device *dev)
 static unsigned int ps3vram_cache_match(struct ps3_system_bus_device *dev,
 					loff_t address)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	struct ps3vram_cache *cache = &priv->cache;
 	unsigned int base;
 	unsigned int offset;
@@ -400,7 +403,7 @@ static unsigned int ps3vram_cache_match(struct ps3_system_bus_device *dev,
 
 static int ps3vram_cache_init(struct ps3_system_bus_device *dev)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 
 	priv->cache.page_count = CACHE_PAGE_COUNT;
 	priv->cache.page_size = CACHE_PAGE_SIZE;
@@ -419,7 +422,7 @@ static int ps3vram_cache_init(struct ps3_system_bus_device *dev)
 
 static void ps3vram_cache_cleanup(struct ps3_system_bus_device *dev)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 
 	ps3vram_cache_flush(dev);
 	kfree(priv->cache.tags);
@@ -428,7 +431,7 @@ static void ps3vram_cache_cleanup(struct ps3_system_bus_device *dev)
 static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from,
 			size_t len, size_t *retlen, u_char *buf)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	unsigned int cached, count;
 
 	dev_dbg(&dev->core, "%s: from=0x%08x len=0x%zx\n", __func__,
@@ -449,8 +452,6 @@ static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from,
 		offset = (unsigned int) (from & (priv->cache.page_size - 1));
 		avail  = priv->cache.page_size - offset;
 
-		mutex_lock(&priv->lock);
-
 		entry = ps3vram_cache_match(dev, from);
 		cached = CACHE_OFFSET + entry * priv->cache.page_size + offset;
 
@@ -462,8 +463,6 @@ static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from,
 			avail = count;
 		memcpy(buf, priv->xdr_buf + cached, avail);
 
-		mutex_unlock(&priv->lock);
-
 		buf += avail;
 		count -= avail;
 		from += avail;
@@ -476,7 +475,7 @@ static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from,
 static int ps3vram_write(struct ps3_system_bus_device *dev, loff_t to,
 			 size_t len, size_t *retlen, const u_char *buf)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	unsigned int cached, count;
 
 	if (to >= priv->size)
@@ -494,8 +493,6 @@ static int ps3vram_write(struct ps3_system_bus_device *dev, loff_t to,
 		offset = (unsigned int) (to & (priv->cache.page_size - 1));
 		avail  = priv->cache.page_size - offset;
 
-		mutex_lock(&priv->lock);
-
 		entry = ps3vram_cache_match(dev, to);
 		cached = CACHE_OFFSET + entry * priv->cache.page_size + offset;
 
@@ -509,8 +506,6 @@ static int ps3vram_write(struct ps3_system_bus_device *dev, loff_t to,
 
 		priv->cache.tags[entry].flags |= CACHE_PAGE_DIRTY;
 
-		mutex_unlock(&priv->lock);
-
 		buf += avail;
 		count -= avail;
 		to += avail;
@@ -530,7 +525,7 @@ static int ps3vram_proc_show(struct seq_file *m, void *v)
 
 static int ps3vram_proc_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, ps3vram_proc_show, PDE(inode)->data);
+	return single_open(file, ps3vram_proc_show, PDE_DATA(inode));
 }
 
 static const struct file_operations ps3vram_proc_fops = {
@@ -541,35 +536,33 @@ static const struct file_operations ps3vram_proc_fops = {
 	.release	= single_release,
 };
 
-static void __devinit ps3vram_proc_init(struct ps3_system_bus_device *dev)
+static void ps3vram_proc_init(struct ps3_system_bus_device *dev)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	struct proc_dir_entry *pde;
 
-	pde = proc_create(DEVICE_NAME, 0444, NULL, &ps3vram_proc_fops);
-	if (!pde) {
+	pde = proc_create_data(DEVICE_NAME, 0444, NULL, &ps3vram_proc_fops,
+			       priv);
+	if (!pde)
 		dev_warn(&dev->core, "failed to create /proc entry\n");
-		return;
-	}
-	pde->data = priv;
 }
 
-static int ps3vram_make_request(struct request_queue *q, struct bio *bio)
+static struct bio *ps3vram_do_bio(struct ps3_system_bus_device *dev,
+				  struct bio *bio)
 {
-	struct ps3_system_bus_device *dev = q->queuedata;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	int write = bio_data_dir(bio) == WRITE;
 	const char *op = write ? "write" : "read";
-	loff_t offset = bio->bi_sector << 9;
+	loff_t offset = bio->bi_iter.bi_sector << 9;
 	int error = 0;
-	struct bio_vec *bvec;
-	unsigned int i;
-
-	dev_dbg(&dev->core, "%s\n", __func__);
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+	struct bio *next;
 
-	bio_for_each_segment(bvec, bio, i) {
+	bio_for_each_segment(bvec, bio, iter) {
 		/* PS3 is ppc64, so we don't handle highmem */
-		char *ptr = page_address(bvec->bv_page) + bvec->bv_offset;
-		size_t len = bvec->bv_len, retlen;
+		char *ptr = page_address(bvec.bv_page) + bvec.bv_offset;
+		size_t len = bvec.bv_len, retlen;
 
 		dev_dbg(&dev->core, "    %s %zu bytes at offset %llu\n", op,
 			len, offset);
@@ -585,6 +578,7 @@ static int ps3vram_make_request(struct request_queue *q, struct bio *bio)
 
 		if (retlen != len) {
 			dev_err(&dev->core, "Short %s\n", op);
+			error = -EIO;
 			goto out;
 		}
 
@@ -594,18 +588,44 @@ static int ps3vram_make_request(struct request_queue *q, struct bio *bio)
 	dev_dbg(&dev->core, "%s completed\n", op);
 
 out:
+	spin_lock_irq(&priv->lock);
+	bio_list_pop(&priv->list);
+	next = bio_list_peek(&priv->list);
+	spin_unlock_irq(&priv->lock);
+
 	bio_endio(bio, error);
-	return 0;
+	return next;
 }
 
-static int __devinit ps3vram_probe(struct ps3_system_bus_device *dev)
+static void ps3vram_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct ps3_system_bus_device *dev = q->queuedata;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+	int busy;
+
+	dev_dbg(&dev->core, "%s\n", __func__);
+	/* Queue the bio, remembering whether the list already held others */
+	spin_lock_irq(&priv->lock);
+	busy = !bio_list_empty(&priv->list);
+	bio_list_add(&priv->list, bio);
+	spin_unlock_irq(&priv->lock);
+	/* Not first in line: presumably the first submitter handles it */
+	if (busy)
+		return;
+	/* Process bios until ps3vram_do_bio() reports none is left */
+	do {
+		bio = ps3vram_do_bio(dev, bio);
+	} while (bio);
+}
+
+static int ps3vram_probe(struct ps3_system_bus_device *dev)
 {
 	struct ps3vram_priv *priv;
 	int error, status;
 	struct request_queue *queue;
 	struct gendisk *gendisk;
-	u64 ddr_lpar, ctrl_lpar, info_lpar, reports_lpar, ddr_size,
-	    reports_size;
+	u64 ddr_size, ddr_lpar, ctrl_lpar, info_lpar, reports_lpar,
+	    reports_size, xdr_lpar;
 	char *rest;
 
 	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
@@ -614,10 +634,9 @@ static int __devinit ps3vram_probe(struct ps3_system_bus_device *dev)
 		goto fail;
 	}
 
-	mutex_init(&priv->lock);
-	dev->core.driver_data = priv;
-
-	priv = dev->core.driver_data;
+	spin_lock_init(&priv->lock);
+	bio_list_init(&priv->list);
+	ps3_system_bus_set_drvdata(dev, priv);
 
 	/* Allocate XDR buffer (1MiB aligned) */
 	priv->xdr_buf = (void *)__get_free_pages(GFP_KERNEL,
@@ -636,7 +655,7 @@ static int __devinit ps3vram_probe(struct ps3_system_bus_device *dev)
 	if (ps3_open_hv_device(dev)) {
 		dev_err(&dev->core, "ps3_open_hv_device failed\n");
 		error = -EAGAIN;
-		goto out_close_gpu;
+		goto out_free_xdr_buf;
 	}
 
 	/* Request memory */
@@ -660,7 +679,7 @@ static int __devinit ps3vram_probe(struct ps3_system_bus_device *dev)
 		dev_err(&dev->core, "lv1_gpu_memory_allocate failed %d\n",
 			status);
 		error = -ENOMEM;
-		goto out_free_xdr_buf;
+		goto out_close_gpu;
 	}
 
 	/* Request context */
@@ -676,9 +695,11 @@ static int __devinit ps3vram_probe(struct ps3_system_bus_device *dev)
 	}
 
 	/* Map XDR buffer to RSX */
+	xdr_lpar = ps3_mm_phys_to_lpar(__pa(priv->xdr_buf));
 	status = lv1_gpu_context_iomap(priv->context_handle, XDR_IOIF,
-				       ps3_mm_phys_to_lpar(__pa(priv->xdr_buf)),
-				       XDR_BUF_SIZE, 0);
+				       xdr_lpar, XDR_BUF_SIZE,
+				       CBE_IOPTE_PP_W | CBE_IOPTE_PP_R |
+				       CBE_IOPTE_M);
 	if (status) {
 		dev_err(&dev->core, "lv1_gpu_context_iomap failed %d\n",
 			status);
@@ -686,19 +707,11 @@ static int __devinit ps3vram_probe(struct ps3_system_bus_device *dev)
 		goto out_free_context;
 	}
 
-	priv->ddr_base = ioremap_flags(ddr_lpar, ddr_size, _PAGE_NO_CACHE);
-
-	if (!priv->ddr_base) {
-		dev_err(&dev->core, "ioremap DDR failed\n");
-		error = -ENOMEM;
-		goto out_free_context;
-	}
-
 	priv->ctrl = ioremap(ctrl_lpar, 64 * 1024);
 	if (!priv->ctrl) {
 		dev_err(&dev->core, "ioremap CTRL failed\n");
 		error = -ENOMEM;
-		goto out_unmap_vram;
+		goto out_unmap_context;
 	}
 
 	priv->reports = ioremap(reports_lpar, reports_size);
@@ -738,10 +751,9 @@ static int __devinit ps3vram_probe(struct ps3_system_bus_device *dev)
 	priv->queue = queue;
 	queue->queuedata = dev;
 	blk_queue_make_request(queue, ps3vram_make_request);
-	blk_queue_max_phys_segments(queue, MAX_PHYS_SEGMENTS);
-	blk_queue_max_hw_segments(queue, MAX_HW_SEGMENTS);
-	blk_queue_max_segment_size(queue, MAX_SEGMENT_SIZE);
-	blk_queue_max_sectors(queue, SAFE_MAX_SECTORS);
+	blk_queue_max_segments(queue, BLK_MAX_SEGMENTS);
+	blk_queue_max_segment_size(queue, BLK_MAX_SEGMENT_SIZE);
+	blk_queue_max_hw_sectors(queue, BLK_SAFE_MAX_SECTORS);
 
 	gendisk = alloc_disk(1);
 	if (!gendisk) {
@@ -775,8 +787,9 @@ out_unmap_reports:
 	iounmap(priv->reports);
 out_unmap_ctrl:
 	iounmap(priv->ctrl);
-out_unmap_vram:
-	iounmap(priv->ddr_base);
+out_unmap_context:
+	lv1_gpu_context_iomap(priv->context_handle, XDR_IOIF, xdr_lpar,
+			      XDR_BUF_SIZE, CBE_IOPTE_M);
 out_free_context:
 	lv1_gpu_context_free(priv->context_handle);
 out_free_memory:
@@ -787,14 +800,14 @@ out_free_xdr_buf:
 	free_pages((unsigned long) priv->xdr_buf, get_order(XDR_BUF_SIZE));
 fail_free_priv:
 	kfree(priv);
-	dev->core.driver_data = NULL;
+	ps3_system_bus_set_drvdata(dev, NULL);
 fail:
 	return error;
 }
 
 static int ps3vram_remove(struct ps3_system_bus_device *dev)
 {
-	struct ps3vram_priv *priv = dev->core.driver_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 
 	del_gendisk(priv->gendisk);
 	put_disk(priv->gendisk);
@@ -803,13 +816,15 @@ static int ps3vram_remove(struct ps3_system_bus_device *dev)
 	ps3vram_cache_cleanup(dev);
 	iounmap(priv->reports);
 	iounmap(priv->ctrl);
-	iounmap(priv->ddr_base);
+	lv1_gpu_context_iomap(priv->context_handle, XDR_IOIF,
+			      ps3_mm_phys_to_lpar(__pa(priv->xdr_buf)),
+			      XDR_BUF_SIZE, CBE_IOPTE_M);
 	lv1_gpu_context_free(priv->context_handle);
 	lv1_gpu_memory_free(priv->memory_handle);
 	ps3_close_hv_device(dev);
 	free_pages((unsigned long) priv->xdr_buf, get_order(XDR_BUF_SIZE));
 	kfree(priv);
-	dev->core.driver_data = NULL;
+	ps3_system_bus_set_drvdata(dev, NULL);
 	return 0;
 }
 
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
new file mode 100644
index 00000000000..b2c98c1bc03
--- /dev/null
+++ b/drivers/block/rbd.c
@@ -0,0 +1,5566 @@
+
+/*
+   rbd.c -- Export ceph rados objects as a Linux block device
+
+
+   based on drivers/block/osdblk.c:
+
+   Copyright 2009 Red Hat, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+
+   For usage instructions, please refer to:
+
+                 Documentation/ABI/testing/sysfs-bus-rbd
+
+ */
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/decode.h>
+#include <linux/parser.h>
+#include <linux/bsearch.h>
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+
+#include "rbd_types.h"
+
+#define RBD_DEBUG	/* Activate rbd_assert() calls */
+
+/*
+ * The basic unit of block I/O is a sector.  It is interpreted in a
+ * number of contexts in Linux (blk, bio, genhd), but the default is
+ * universally 512 bytes.  These symbols are just slightly more
+ * meaningful than the bare numbers they represent.
+ */
+#define	SECTOR_SHIFT	9
+#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
+
+/*
+ * Increment the given counter unless it is 0, returning the value
+ * obtained from __atomic_add_unless() (NOTE(review): presumably the
+ * value before the increment -- confirm).  When that value exceeds
+ * INT_MAX as an unsigned int, undo the increment and return -EINVAL.
+ */
+static int atomic_inc_return_safe(atomic_t *v)
+{
+	unsigned int counter;
+
+	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
+	if (counter <= (unsigned int)INT_MAX)
+		return (int)counter;
+	/* Out of range: take the increment back before failing */
+	atomic_dec(v);
+
+	return -EINVAL;
+}
+
+/* Decrement the counter; restore it and return -EINVAL if it went negative */
+static int atomic_dec_return_safe(atomic_t *v)
+{
+	int remaining = atomic_dec_return(v);
+
+	if (remaining < 0) {
+		/* Went below zero: put the count back */
+		atomic_inc(v);
+		return -EINVAL;
+	}
+
+	return remaining;
+}
+
+#define RBD_DRV_NAME "rbd"
+
+#define RBD_MINORS_PER_MAJOR		256
+#define RBD_SINGLE_MAJOR_PART_SHIFT	4
+
+#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
+#define RBD_MAX_SNAP_NAME_LEN	\
+			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
+
+#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
+
+#define RBD_SNAP_HEAD_NAME	"-"
+
+#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */
+
+/* This allows a single page to hold an image name sent by OSD */
+#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
+#define RBD_IMAGE_ID_LEN_MAX	64
+
+#define RBD_OBJ_PREFIX_LEN_MAX	64
+
+/* Feature bits */
+
+#define RBD_FEATURE_LAYERING	(1<<0)
+#define RBD_FEATURE_STRIPINGV2	(1<<1)
+#define RBD_FEATURES_ALL \
+	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
+
+/* Features supported by this (client software) implementation. */
+
+#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
+
+/*
+ * An RBD device name will be "rbd#", where the "rbd" comes from
+ * RBD_DRV_NAME above, and # is a unique integer identifier.
+ * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
+ * enough to hold all possible device names.
+ */
+#define DEV_NAME_LEN		32
+#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
+
+/*
+ * Block device image metadata (in-memory version)
+ */
+struct rbd_image_header {
+	/* These seven fields never change for a given rbd image */
+	char *object_prefix;
+	__u8 obj_order;
+	__u8 crypt_type;
+	__u8 comp_type;
+	u64 stripe_unit;
+	u64 stripe_count;
+	u64 features;		/* Might be changeable someday? */
+
+	/* The remaining fields need to be updated occasionally */
+	u64 image_size;
+	struct ceph_snap_context *snapc;
+	char *snap_names;	/* format 1 only */
+	u64 *snap_sizes;	/* format 1 only */
+};
+
+/*
+ * An rbd image specification.
+ *
+ * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
+ * identify an image.  Each rbd_dev structure includes a pointer to
+ * an rbd_spec structure that encapsulates this identity.
+ *
+ * Each of the id's in an rbd_spec has an associated name.  For a
+ * user-mapped image, the names are supplied and the id's associated
+ * with them are looked up.  For a layered image, a parent image is
+ * defined by the tuple, and the names are looked up.
+ *
+ * An rbd_dev structure contains a parent_spec pointer which is
+ * non-null if the image it represents is a child in a layered
+ * image.  This pointer will refer to the rbd_spec structure used
+ * by the parent rbd_dev for its own identity (i.e., the structure
+ * is shared between the parent and child).
+ *
+ * Since these structures are populated once, during the discovery
+ * phase of image construction, they are effectively immutable so
+ * we make no effort to synchronize access to them.
+ *
+ * Note that code herein does not assume the image name is known (it
+ * could be a null pointer).
+ */
+struct rbd_spec {
+	u64		pool_id;
+	const char	*pool_name;
+
+	const char	*image_id;
+	const char	*image_name;
+
+	u64		snap_id;
+	const char	*snap_name;
+
+	struct kref	kref;
+};
+
+/*
+ * an instance of the client.  multiple devices may share an rbd client.
+ */
+struct rbd_client {
+	struct ceph_client	*client;
+	struct kref		kref;
+	struct list_head	node;
+};
+
+struct rbd_img_request;
+typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
+
+#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */
+
+struct rbd_obj_request;
+typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
+
+enum obj_request_type {
+	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
+};
+
+enum obj_req_flags {
+	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
+	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
+	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
+	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
+};
+
+struct rbd_obj_request {
+	const char		*object_name;
+	u64			offset;		/* object start byte */
+	u64			length;		/* bytes from offset */
+	unsigned long		flags;
+
+	/*
+	 * An object request associated with an image will have its
+	 * img_data flag set; a standalone object request will not.
+	 *
+	 * A standalone object request will have which == BAD_WHICH
+	 * and a null obj_request pointer.
+	 *
+	 * An object request initiated in support of a layered image
+	 * object (to check for its existence before a write) will
+	 * have which == BAD_WHICH and a non-null obj_request pointer.
+	 *
+	 * Finally, an object request for rbd image data will have
+	 * which != BAD_WHICH, and will have a non-null img_request
+	 * pointer.  The value of which will be in the range
+	 * 0..(img_request->obj_request_count-1).
+	 */
+	union {
+		struct rbd_obj_request	*obj_request;	/* STAT op */
+		struct {
+			struct rbd_img_request	*img_request;
+			u64			img_offset;
+			/* links for img_request->obj_requests list */
+			struct list_head	links;
+		};
+	};
+	u32			which;		/* posn image request list */
+
+	enum obj_request_type	type;
+	union {
+		struct bio	*bio_list;
+		struct {
+			struct page	**pages;
+			u32		page_count;
+		};
+	};
+	struct page		**copyup_pages;
+	u32			copyup_page_count;
+
+	struct ceph_osd_request	*osd_req;
+
+	u64			xferred;	/* bytes transferred */
+	int			result;
+
+	rbd_obj_callback_t	callback;
+	struct completion	completion;
+
+	struct kref		kref;
+};
+
+enum img_req_flags {
+	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
+	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
+	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
+};
+
+/*
+ * An image request: one I/O against an rbd image, carried out by the
+ * object requests linked on its obj_requests list.
+ *
+ * The two anonymous unions overlay fields of which only one is
+ * meaningful for a given request: snap_id (reads) vs. snapc (writes),
+ * and rq (request came from the block layer) vs. obj_request (request
+ * was initiated on behalf of an object request).
+ */
+struct rbd_img_request {
+	struct rbd_device	*rbd_dev;
+	u64			offset;	/* starting image byte offset */
+	u64			length;	/* byte count from offset */
+	unsigned long		flags;
+	union {
+		u64			snap_id;	/* for reads */
+		struct ceph_snap_context *snapc;	/* for writes */
+	};
+	union {
+		struct request		*rq;		/* block request */
+		struct rbd_obj_request	*obj_request;	/* obj req initiator */
+	};
+	struct page		**copyup_pages;
+	u32			copyup_page_count;
+	spinlock_t		completion_lock;/* protects next_completion */
+	u32			next_completion;
+	rbd_img_callback_t	callback;
+	u64			xferred;/* aggregate bytes transferred */
+	int			result;	/* first nonzero obj_request result */
+
+	/* number of entries on obj_requests; see rbd_img_obj_request_add() */
+	u32			obj_request_count;
+	struct list_head	obj_requests;	/* rbd_obj_request structs */
+
+	/* reference count; see rbd_img_request_get()/rbd_img_request_put() */
+	struct kref		kref;
+};
+
+/*
+ * Iterators over the object requests of an image request (ireq),
+ * using oreq as the cursor:
+ *
+ *   for_each_obj_request()       walks the whole list, front to back
+ *   for_each_obj_request_from()  continues forward from the current oreq
+ *   for_each_obj_request_safe()  walks the list in REVERSE (last entry
+ *                                first) and tolerates removal of oreq
+ *                                during the walk; n is scratch storage
+ */
+#define for_each_obj_request(ireq, oreq) \
+	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
+#define for_each_obj_request_from(ireq, oreq) \
+	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
+#define for_each_obj_request_safe(ireq, oreq, n) \
+	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
+
+/*
+ * Properties of what an rbd device currently maps.  size and features
+ * are filled in by rbd_dev_mapping_set() and zeroed by
+ * rbd_dev_mapping_clear(); read_only is checked by rbd_open() and
+ * changed by the BLKROSET ioctl.
+ */
+struct rbd_mapping {
+	u64                     size;
+	u64                     features;
+	bool			read_only;
+};
+
+/*
+ * a single device
+ *
+ * The spinlock "lock" covers open_count and, where atomicity is
+ * needed, flags (see enum rbd_dev_flags).
+ */
+struct rbd_device {
+	int			dev_id;		/* blkdev unique id */
+
+	int			major;		/* blkdev assigned major */
+	int			minor;
+	struct gendisk		*disk;		/* blkdev's gendisk and rq */
+
+	u32			image_format;	/* Either 1 or 2 */
+	struct rbd_client	*rbd_client;
+
+	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
+
+	spinlock_t		lock;		/* queue, flags, open_count */
+
+	struct rbd_image_header	header;
+	unsigned long		flags;		/* possibly lock protected */
+	struct rbd_spec		*spec;
+
+	char			*header_name;
+
+	struct ceph_file_layout	layout;
+
+	struct ceph_osd_event   *watch_event;
+	struct rbd_obj_request	*watch_request;
+
+	struct rbd_spec		*parent_spec;
+	u64			parent_overlap;
+	atomic_t		parent_ref;
+	struct rbd_device	*parent;
+
+	/* protects updating the header */
+	struct rw_semaphore     header_rwsem;
+
+	struct rbd_mapping	mapping;	/* size, features, read_only */
+
+	struct list_head	node;	/* presumably on rbd_dev_list - confirm */
+
+	/* sysfs related */
+	struct device		dev;
+	unsigned long		open_count;	/* protected by lock */
+};
+
+/*
+ * Flag bits for rbd_dev->flags.  If atomicity is required,
+ * rbd_dev->lock is used to protect access.
+ *
+ * Currently, only the "removing" flag (which is coupled with the
+ * "open_count" field) requires atomic access: rbd_open() tests it
+ * and bumps open_count under the same hold of rbd_dev->lock.
+ */
+enum rbd_dev_flags {
+	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
+	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
+};
+
+/* Held by rbd_get_client() across the find-or-create of a client */
+static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
+
+static LIST_HEAD(rbd_dev_list);    /* devices */
+static DEFINE_SPINLOCK(rbd_dev_list_lock);
+
+/* rbd_client_list is modified and searched under rbd_client_list_lock */
+static LIST_HEAD(rbd_client_list);		/* clients */
+static DEFINE_SPINLOCK(rbd_client_list_lock);
+
+/* Slab caches for frequently-allocated structures */
+
+static struct kmem_cache	*rbd_img_request_cache;
+static struct kmem_cache	*rbd_obj_request_cache;
+/* backs the object names built by rbd_segment_name() */
+static struct kmem_cache	*rbd_segment_name_cache;
+
+static int rbd_major;
+static DEFINE_IDA(rbd_dev_id_ida);
+
+/*
+ * Default to false for now, as single-major requires >= 0.75 version of
+ * userspace rbd utility.
+ *
+ * When false, the add_single_major/remove_single_major bus attributes
+ * are hidden (see rbd_bus_is_visible()).
+ */
+static bool single_major = false;
+module_param(single_major, bool, S_IRUGO);
+MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
+
+static int rbd_img_request_submit(struct rbd_img_request *img_request);
+
+static void rbd_dev_device_release(struct device *dev);
+
+static ssize_t rbd_add(struct bus_type *bus, const char *buf,
+		       size_t count);
+static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
+			  size_t count);
+static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
+				    size_t count);
+static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
+				       size_t count);
+static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
+static void rbd_spec_put(struct rbd_spec *spec);
+
+/*
+ * Convert an rbd device id into a minor number by shifting it left by
+ * RBD_SINGLE_MAJOR_PART_SHIFT bits.
+ */
+static int rbd_dev_id_to_minor(int dev_id)
+{
+	int minor = dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
+
+	return minor;
+}
+
+/*
+ * Inverse of rbd_dev_id_to_minor(): recover the device id by dropping
+ * the low RBD_SINGLE_MAJOR_PART_SHIFT bits of a minor number.
+ */
+static int minor_to_rbd_dev_id(int minor)
+{
+	int dev_id = minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
+
+	return dev_id;
+}
+
+/*
+ * Write-only (S_IWUSR) bus attributes.  Each has no show handler and
+ * routes writes to the rbd_add*()/rbd_remove*() store handler named in
+ * its definition.
+ */
+static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
+static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
+static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
+static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
+
+/* NULL-terminated list of all bus attributes; filtered by rbd_bus_is_visible() */
+static struct attribute *rbd_bus_attrs[] = {
+	&bus_attr_add.attr,
+	&bus_attr_remove.attr,
+	&bus_attr_add_single_major.attr,
+	&bus_attr_remove_single_major.attr,
+	NULL,
+};
+
+/*
+ * Visibility callback for the rbd bus attribute group.
+ *
+ * The two *_single_major attributes are hidden (mode 0) unless the
+ * single_major module parameter is set; every other attribute keeps
+ * its declared mode.
+ */
+static umode_t rbd_bus_is_visible(struct kobject *kobj,
+				  struct attribute *attr, int index)
+{
+	bool single_major_attr;
+
+	single_major_attr = attr == &bus_attr_add_single_major.attr ||
+			    attr == &bus_attr_remove_single_major.attr;
+	if (single_major_attr && !single_major)
+		return 0;
+
+	return attr->mode;
+}
+
+/* Attribute group for the rbd bus; visibility is decided per attribute */
+static const struct attribute_group rbd_bus_group = {
+	.attrs = rbd_bus_attrs,
+	.is_visible = rbd_bus_is_visible,
+};
+__ATTRIBUTE_GROUPS(rbd_bus);
+
+/* The "rbd" bus, carrying the attribute groups defined above */
+static struct bus_type rbd_bus_type = {
+	.name		= "rbd",
+	.bus_groups	= rbd_bus_groups,
+};
+
+/*
+ * Release callback for rbd_root_dev.  Intentionally empty: the device
+ * below is a static object, so there is nothing to free.
+ */
+static void rbd_root_dev_release(struct device *dev)
+{
+}
+
+/* Statically allocated device named "rbd" */
+static struct device rbd_root_dev = {
+	.init_name =    "rbd",
+	.release =      rbd_root_dev_release,
+};
+
+/*
+ * Log a printf-style warning at KERN_WARNING level.
+ *
+ * The message is prefixed with RBD_DRV_NAME and the most specific
+ * identification available for rbd_dev, tried in this order: disk name,
+ * image name, image id, and finally the rbd_dev pointer value.  rbd_dev
+ * may be NULL, in which case only the driver name is used.
+ *
+ * A newline is appended to the formatted message here, so fmt should
+ * not end with one.
+ */
+static __printf(2, 3)
+void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	/* %pV prints the caller's format + arguments via struct va_format */
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	if (!rbd_dev)
+		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
+	else if (rbd_dev->disk)
+		printk(KERN_WARNING "%s: %s: %pV\n",
+			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
+	else if (rbd_dev->spec && rbd_dev->spec->image_name)
+		printk(KERN_WARNING "%s: image %s: %pV\n",
+			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
+	else if (rbd_dev->spec && rbd_dev->spec->image_id)
+		printk(KERN_WARNING "%s: id %s: %pV\n",
+			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
+	else	/* punt */
+		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
+			RBD_DRV_NAME, rbd_dev, &vaf);
+	va_end(args);
+}
+
+#ifdef RBD_DEBUG
+#define rbd_assert(expr)						\
+		if (unlikely(!(expr))) {				\
+			printk(KERN_ERR "\nAssertion failure in %s() "	\
+						"at line %d:\n\n"	\
+					"\trbd_assert(%s);\n\n",	\
+					__func__, __LINE__, #expr);	\
+			BUG();						\
+		}
+#else /* !RBD_DEBUG */
+#  define rbd_assert(expr)	((void) 0)
+#endif /* !RBD_DEBUG */
+
+static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
+static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
+static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
+
+static int rbd_dev_refresh(struct rbd_device *rbd_dev);
+static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
+static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev);
+static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
+					u64 snap_id);
+static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
+				u8 *order, u64 *snap_size);
+static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
+		u64 *snap_features);
+static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
+
+/*
+ * Block device open handler.
+ *
+ * Returns -EROFS for a write open of a read-only mapping and -ENOENT
+ * if the mapping is being removed.  Otherwise counts the opener in
+ * open_count (under rbd_dev->lock, together with the REMOVING check)
+ * and takes a reference on the device, which rbd_release() drops.
+ */
+static int rbd_open(struct block_device *bdev, fmode_t mode)
+{
+	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
+	bool being_removed;
+
+	if (rbd_dev->mapping.read_only && (mode & FMODE_WRITE))
+		return -EROFS;
+
+	spin_lock_irq(&rbd_dev->lock);
+	being_removed = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
+	if (!being_removed)
+		rbd_dev->open_count++;
+	spin_unlock_irq(&rbd_dev->lock);
+
+	if (being_removed)
+		return -ENOENT;
+
+	(void) get_device(&rbd_dev->dev);
+
+	return 0;
+}
+
+/*
+ * Block device release handler: undo rbd_open() by decrementing
+ * open_count under rbd_dev->lock and dropping the device reference.
+ */
+static void rbd_release(struct gendisk *disk, fmode_t mode)
+{
+	struct rbd_device *rbd_dev = disk->private_data;
+	unsigned long open_count_before;
+
+	spin_lock_irq(&rbd_dev->lock);
+	open_count_before = rbd_dev->open_count;
+	rbd_dev->open_count--;
+	spin_unlock_irq(&rbd_dev->lock);
+
+	/* A release must always pair with an earlier successful open */
+	rbd_assert(open_count_before > 0);
+
+	put_device(&rbd_dev->dev);
+}
+
+/*
+ * Handle BLKROSET: set the mapping read-only (nonzero) or read-write
+ * (zero) according to the int that the user pointer arg refers to.
+ *
+ * Returns 0 on success, -EFAULT if the value cannot be read from user
+ * space, -EROFS when asked to make a mapped snapshot writable, or
+ * -EBUSY if the device is open more than once.
+ */
+static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
+{
+	int ret = 0;
+	int val;
+	bool ro;
+	bool ro_changed = false;
+
+	/* get_user() may sleep, so call it before taking rbd_dev->lock */
+	if (get_user(val, (int __user *)(arg)))
+		return -EFAULT;
+
+	ro = val != 0;
+
+	/* A snapshot mapping can never be made writable */
+	if (!ro && rbd_dev->spec->snap_id != CEPH_NOSNAP)
+		return -EROFS;
+
+	spin_lock_irq(&rbd_dev->lock);
+	if (rbd_dev->open_count > 1) {
+		/* refuse while anyone besides the caller has it open */
+		ret = -EBUSY;
+	} else if (rbd_dev->mapping.read_only != ro) {
+		rbd_dev->mapping.read_only = ro;
+		ro_changed = true;
+	}
+	spin_unlock_irq(&rbd_dev->lock);
+
+	/* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
+	if (ro_changed)
+		set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
+
+	return ret;
+}
+
+/*
+ * Block device ioctl handler.  Only BLKROSET is implemented; any other
+ * command yields -ENOTTY.
+ */
+static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
+			unsigned int cmd, unsigned long arg)
+{
+	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
+
+	switch (cmd) {
+	case BLKROSET:
+		return rbd_ioctl_set_ro(rbd_dev, arg);
+	default:
+		return -ENOTTY;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat ioctl entry point: forwards unchanged to rbd_ioctl(), with no
+ * translation of cmd or arg.
+ */
+static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
+				unsigned int cmd, unsigned long arg)
+{
+	return rbd_ioctl(bdev, mode, cmd, arg);
+}
+#endif /* CONFIG_COMPAT */
+
+/* Block device operations for rbd disks */
+static const struct block_device_operations rbd_bd_ops = {
+	.owner			= THIS_MODULE,
+	.open			= rbd_open,
+	.release		= rbd_release,
+	.ioctl			= rbd_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl		= rbd_compat_ioctl,
+#endif
+};
+
+/*
+ * Initialize an rbd client instance.  Success or not, this function
+ * consumes ceph_opts.  Caller holds client_mutex.
+ *
+ * On success the new client holds one reference (from kref_init()),
+ * has an open ceph session and has been added to rbd_client_list.
+ *
+ * Returns the new rbd_client, or an ERR_PTR() on failure.
+ * NOTE(review): if ceph_create_client() fails, the error encoded in
+ * its return value is not propagated; ret is still -ENOMEM there.
+ */
+static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
+{
+	struct rbd_client *rbdc;
+	int ret = -ENOMEM;
+
+	dout("%s:\n", __func__);
+	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
+	if (!rbdc)
+		goto out_opt;
+
+	kref_init(&rbdc->kref);
+	INIT_LIST_HEAD(&rbdc->node);
+
+	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
+	if (IS_ERR(rbdc->client))
+		goto out_rbdc;
+	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
+
+	ret = ceph_open_session(rbdc->client);
+	if (ret < 0)
+		goto out_client;
+
+	/* Publish the client so rbd_client_find() can share it */
+	spin_lock(&rbd_client_list_lock);
+	list_add_tail(&rbdc->node, &rbd_client_list);
+	spin_unlock(&rbd_client_list_lock);
+
+	dout("%s: rbdc %p\n", __func__, rbdc);
+
+	return rbdc;
+out_client:
+	ceph_destroy_client(rbdc->client);
+out_rbdc:
+	kfree(rbdc);
+out_opt:
+	/* ceph_opts is NULL here once the ceph client has taken it over */
+	if (ceph_opts)
+		ceph_destroy_options(ceph_opts);
+	dout("%s: error %d\n", __func__, ret);
+
+	return ERR_PTR(ret);
+}
+
+/*
+ * Take an additional reference on an rbd client and return it.
+ * Dropped again with rbd_put_client().
+ */
+static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
+{
+	kref_get(&rbdc->kref);
+
+	return rbdc;
+}
+
+/*
+ * Look through rbd_client_list for a client whose ceph options compare
+ * equal to ceph_opts.  On a match, a reference is taken on the client
+ * before it is returned; NULL is returned if there is no match or if
+ * ceph_opts carries CEPH_OPT_NOSHARE.
+ */
+static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
+{
+	struct rbd_client *rbdc;
+	struct rbd_client *match = NULL;
+
+	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
+		return NULL;
+
+	spin_lock(&rbd_client_list_lock);
+	list_for_each_entry(rbdc, &rbd_client_list, node) {
+		/* ceph_compare_options() returns 0 when the options match */
+		if (ceph_compare_options(ceph_opts, rbdc->client))
+			continue;
+
+		__rbd_get_client(rbdc);
+		match = rbdc;
+		break;
+	}
+	spin_unlock(&rbd_client_list_lock);
+
+	return match;
+}
+
+/*
+ * mount options
+ *
+ * The Opt_last_* values are range markers, not real options:
+ * parse_rbd_opts_token() classifies a token as int, string or Boolean
+ * by comparing it against them.  No int or string options are
+ * currently defined, so those ranges are empty.
+ */
+enum {
+	Opt_last_int,
+	/* int args above */
+	Opt_last_string,
+	/* string args above */
+	Opt_read_only,
+	Opt_read_write,
+	/* Boolean args above */
+	Opt_last_bool,
+};
+
+/* Option spellings accepted by match_token(); terminated by {-1, NULL} */
+static match_table_t rbd_opts_tokens = {
+	/* int args above */
+	/* string args above */
+	{Opt_read_only, "read_only"},
+	{Opt_read_only, "ro"},		/* Alternate spelling */
+	{Opt_read_write, "read_write"},
+	{Opt_read_write, "rw"},		/* Alternate spelling */
+	/* Boolean args above */
+	{-1, NULL}
+};
+
+/* Parsed rbd-specific options, filled in by parse_rbd_opts_token() */
+struct rbd_options {
+	bool	read_only;
+};
+
+#define RBD_READ_ONLY_DEFAULT	false
+
+/*
+ * Parse a single rbd option string c and record its effect in the
+ * struct rbd_options that private points to.
+ *
+ * Returns 0 on success, -EINVAL if the token is not recognized, or the
+ * match_int() error for a malformed integer argument.
+ */
+static int parse_rbd_opts_token(char *c, void *private)
+{
+	struct rbd_options *rbd_opts = private;
+	substring_t argstr[MAX_OPT_ARGS];
+	int token, intval, ret;
+
+	token = match_token(c, rbd_opts_tokens, argstr);
+	if (token < 0)
+		return -EINVAL;
+
+	/*
+	 * Classify the token by range.  With the current table only the
+	 * Boolean range is populated; the int and string branches are
+	 * kept for options that may be added later.
+	 */
+	if (token < Opt_last_int) {
+		ret = match_int(&argstr[0], &intval);
+		if (ret < 0) {
+			pr_err("bad mount option arg (not int) "
+			       "at '%s'\n", c);
+			return ret;
+		}
+		dout("got int token %d val %d\n", token, intval);
+	} else if (token > Opt_last_int && token < Opt_last_string) {
+		dout("got string token %d val %s\n", token,
+		     argstr[0].from);
+	} else if (token > Opt_last_string && token < Opt_last_bool) {
+		dout("got Boolean token %d\n", token);
+	} else {
+		dout("got token %d\n", token);
+	}
+
+	switch (token) {
+	case Opt_read_only:
+		rbd_opts->read_only = true;
+		break;
+	case Opt_read_write:
+		rbd_opts->read_only = false;
+		break;
+	default:
+		/* every token in rbd_opts_tokens is handled above */
+		rbd_assert(false);
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Return an rbd client matching ceph_opts: an existing one from
+ * rbd_client_list (with a new reference) if there is a match,
+ * otherwise a freshly created one.  The lookup and the creation are
+ * done under client_mutex.
+ *
+ * ceph_opts is consumed in either case.  The result may be an
+ * ERR_PTR() if a new client had to be created and that failed.
+ */
+static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
+{
+	struct rbd_client *rbdc;
+
+	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
+	rbdc = rbd_client_find(ceph_opts);
+	if (!rbdc)
+		rbdc = rbd_client_create(ceph_opts);
+	else
+		ceph_destroy_options(ceph_opts);	/* reusing a client */
+	mutex_unlock(&client_mutex);
+
+	return rbdc;
+}
+
+/*
+ * Destroy ceph client
+ *
+ * kref release callback for struct rbd_client (see rbd_put_client()).
+ * This function takes rbd_client_list_lock itself in order to unlink
+ * the client, so the caller must NOT already hold that lock.
+ */
+static void rbd_client_release(struct kref *kref)
+{
+	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
+
+	dout("%s: rbdc %p\n", __func__, rbdc);
+	spin_lock(&rbd_client_list_lock);
+	list_del(&rbdc->node);
+	spin_unlock(&rbd_client_list_lock);
+
+	ceph_destroy_client(rbdc->client);
+	kfree(rbdc);
+}
+
+/*
+ * Drop a reference to an rbd client; the client is released through
+ * rbd_client_release() when the last reference goes away.  A NULL
+ * rbdc is accepted and ignored.
+ */
+static void rbd_put_client(struct rbd_client *rbdc)
+{
+	if (!rbdc)
+		return;
+
+	kref_put(&rbdc->kref, rbd_client_release);
+}
+
+/* Only image formats 1 and 2 are recognized */
+static bool rbd_image_format_valid(u32 image_format)
+{
+	switch (image_format) {
+	case 1:
+	case 2:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * Sanity-check a format 1 on-disk image header.
+ *
+ * Returns true only if the header starts with the rbd magic text, its
+ * object order lies in [SECTOR_SHIFT, bits-in-an-int - 1], and the
+ * snapshot count and snapshot name length are small enough that the
+ * sizes derived from them cannot overflow a size_t.
+ */
+static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
+{
+	size_t size;
+	u32 snap_count;
+
+	/* The header has to start with the magic rbd header text */
+	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
+		return false;
+
+	/* The bio layer requires at least sector-sized I/O */
+
+	if (ondisk->options.order < SECTOR_SHIFT)
+		return false;
+
+	/* If we use u64 in a few spots we may be able to loosen this */
+
+	if (ondisk->options.order > 8 * sizeof (int) - 1)
+		return false;
+
+	/*
+	 * The size of a snapshot header has to fit in a size_t, and
+	 * that limits the number of snapshots.
+	 */
+	snap_count = le32_to_cpu(ondisk->snap_count);
+	size = SIZE_MAX - sizeof (struct ceph_snap_context);
+	if (snap_count > size / sizeof (__le64))
+		return false;
+
+	/*
+	 * Not only that, but the size of the entire the snapshot
+	 * header must also be representable in a size_t.
+	 */
+	/* cannot underflow: snap_count was bounded just above */
+	size -= snap_count * sizeof (__le64);
+	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
+		return false;
+
+	return true;
+}
+
+/*
+ * Fill an rbd image header with information from the given format 1
+ * on-disk header.
+ *
+ * A NULL header->object_prefix marks the first call for this device;
+ * only then are the object prefix and the order/crypt/comp options
+ * recorded.  On later calls (refresh) the previous snapshot context,
+ * snapshot names and snapshot sizes are released and replaced.
+ *
+ * All allocations are made before the header is touched, so on failure
+ * the existing header is left unmodified.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or -EIO if the
+ * snapshot name length does not fit in a size_t.
+ */
+static int rbd_header_from_disk(struct rbd_device *rbd_dev,
+				 struct rbd_image_header_ondisk *ondisk)
+{
+	struct rbd_image_header *header = &rbd_dev->header;
+	bool first_time = header->object_prefix == NULL;
+	struct ceph_snap_context *snapc;
+	char *object_prefix = NULL;
+	char *snap_names = NULL;
+	u64 *snap_sizes = NULL;
+	u32 snap_count;
+	size_t size;
+	int ret = -ENOMEM;
+	u32 i;
+
+	/* Allocate this now to avoid having to handle failure below */
+
+	if (first_time) {
+		size_t len;
+
+		/* the on-disk prefix is bounded by its field size */
+		len = strnlen(ondisk->object_prefix,
+				sizeof (ondisk->object_prefix));
+		object_prefix = kmalloc(len + 1, GFP_KERNEL);
+		if (!object_prefix)
+			return -ENOMEM;
+		memcpy(object_prefix, ondisk->object_prefix, len);
+		object_prefix[len] = '\0';
+	}
+
+	/* Allocate the snapshot context and fill it in */
+
+	snap_count = le32_to_cpu(ondisk->snap_count);
+	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
+	if (!snapc)
+		goto out_err;
+	snapc->seq = le64_to_cpu(ondisk->snap_seq);
+	if (snap_count) {
+		struct rbd_image_snap_ondisk *snaps;
+		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
+
+		/* We'll keep a copy of the snapshot names... */
+
+		if (snap_names_len > (u64)SIZE_MAX)
+			goto out_2big;
+		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
+		if (!snap_names)
+			goto out_err;
+
+		/* ...as well as the array of their sizes. */
+
+		size = snap_count * sizeof (*header->snap_sizes);
+		snap_sizes = kmalloc(size, GFP_KERNEL);
+		if (!snap_sizes)
+			goto out_err;
+
+		/*
+		 * Copy the names, and fill in each snapshot's id
+		 * and size.
+		 *
+		 * Note that rbd_dev_v1_header_info() guarantees the
+		 * ondisk buffer we're working with has
+		 * snap_names_len bytes beyond the end of the
+		 * snapshot id array, this memcpy() is safe.
+		 */
+		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
+		snaps = ondisk->snaps;
+		for (i = 0; i < snap_count; i++) {
+			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
+			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
+		}
+	}
+
+	/* We won't fail any more, fill in the header */
+
+	if (first_time) {
+		header->object_prefix = object_prefix;
+		header->obj_order = ondisk->options.order;
+		header->crypt_type = ondisk->options.crypt_type;
+		header->comp_type = ondisk->options.comp_type;
+		/* The rest aren't used for format 1 images */
+		header->stripe_unit = 0;
+		header->stripe_count = 0;
+		header->features = 0;
+	} else {
+		/* refresh: release the snapshot data being replaced */
+		ceph_put_snap_context(header->snapc);
+		kfree(header->snap_names);
+		kfree(header->snap_sizes);
+	}
+
+	/* The remaining fields always get updated (when we refresh) */
+
+	header->image_size = le64_to_cpu(ondisk->image_size);
+	header->snapc = snapc;
+	header->snap_names = snap_names;
+	header->snap_sizes = snap_sizes;
+
+	/* Make sure mapping size is consistent with header info */
+
+	if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time)
+		if (rbd_dev->mapping.size != header->image_size)
+			rbd_dev->mapping.size = header->image_size;
+
+	return 0;
+out_2big:
+	ret = -EIO;
+out_err:
+	/* each pointer is either NULL or owned by us at this point */
+	kfree(snap_sizes);
+	kfree(snap_names);
+	ceph_put_snap_context(snapc);
+	kfree(object_prefix);
+
+	return ret;
+}
+
+/*
+ * Return a kstrdup()'d copy of the name of snapshot number "which" of
+ * a format 1 image, or NULL if the copy cannot be allocated.
+ *
+ * header.snap_names holds the names back to back, each terminated by a
+ * NUL byte, so the wanted name is reached by stepping over the "which"
+ * names that precede it.
+ */
+static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
+{
+	const char *snap_name = rbd_dev->header.snap_names;
+	u32 i;
+
+	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
+
+	for (i = 0; i < which; i++)
+		snap_name += strlen(snap_name) + 1;
+
+	return kstrdup(snap_name, GFP_KERNEL);
+}
+
+/*
+ * Comparison callback over u64 snapshot ids for arrays kept in
+ * *descending* order: the result is positive when the first id is the
+ * smaller one, negative when it is the larger one, and 0 when equal.
+ */
+static int snapid_compare_reverse(const void *s1, const void *s2)
+{
+	u64 lhs = *(u64 *)s1;
+	u64 rhs = *(u64 *)s2;
+
+	if (lhs == rhs)
+		return 0;
+
+	return lhs < rhs ? 1 : -1;
+}
+
+/*
+ * Binary-search the device's snapshot context for the given snapshot
+ * id.
+ *
+ * Returns the position of the id within the snaps array, or
+ * BAD_SNAP_INDEX if it is not present.
+ *
+ * Note: the snapshot array is kept sorted (by the osd) in reverse
+ * order, highest snapshot id first, hence the reverse comparator.
+ */
+static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
+{
+	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
+	u64 *found;
+
+	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
+				sizeof (snap_id), snapid_compare_reverse);
+	if (!found)
+		return BAD_SNAP_INDEX;
+
+	return (u32)(found - &snapc->snaps[0]);
+}
+
+/*
+ * Look up the name of the snapshot with the given id in a format 1
+ * image.
+ *
+ * Returns a newly allocated copy of the name, ERR_PTR(-ENOENT) if the
+ * id is not in the snapshot context, or ERR_PTR(-ENOMEM) if the copy
+ * could not be allocated.
+ */
+static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
+					u64 snap_id)
+{
+	const char *snap_name;
+	u32 which = rbd_dev_snap_index(rbd_dev, snap_id);
+
+	if (which == BAD_SNAP_INDEX)
+		return ERR_PTR(-ENOENT);
+
+	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
+	if (!snap_name)
+		return ERR_PTR(-ENOMEM);
+
+	return snap_name;
+}
+
+/*
+ * Return the name for a snapshot id: the constant RBD_SNAP_HEAD_NAME
+ * for CEPH_NOSNAP, otherwise whatever the format-specific lookup
+ * (v1 or v2) returns.
+ */
+static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
+{
+	if (snap_id == CEPH_NOSNAP)
+		return RBD_SNAP_HEAD_NAME;
+
+	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+
+	return rbd_dev->image_format == 1 ?
+			rbd_dev_v1_snap_name(rbd_dev, snap_id) :
+			rbd_dev_v2_snap_name(rbd_dev, snap_id);
+}
+
+/*
+ * Store in *snap_size the image size for the given snapshot id.
+ *
+ * CEPH_NOSNAP yields the current image size from the header.  For a
+ * format 1 image the size comes from header.snap_sizes; for format 2
+ * it is obtained through _rbd_dev_v2_snap_size().
+ *
+ * Returns 0 on success, -ENOENT if a format 1 snapshot id is unknown,
+ * or the error from the format 2 query.  *snap_size is written only
+ * on success.
+ */
+static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
+				u64 *snap_size)
+{
+	u64 size = 0;
+	u32 which;
+	int ret;
+
+	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+
+	if (snap_id == CEPH_NOSNAP) {
+		*snap_size = rbd_dev->header.image_size;
+		return 0;
+	}
+
+	if (rbd_dev->image_format == 1) {
+		which = rbd_dev_snap_index(rbd_dev, snap_id);
+		if (which == BAD_SNAP_INDEX)
+			return -ENOENT;
+
+		*snap_size = rbd_dev->header.snap_sizes[which];
+		return 0;
+	}
+
+	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
+	if (ret)
+		return ret;
+
+	*snap_size = size;
+	return 0;
+}
+
+/*
+ * Store in *snap_features the feature bits for the given snapshot id.
+ *
+ * CEPH_NOSNAP yields the header's features; format 1 images have no
+ * features (0); for format 2 the value is obtained through
+ * _rbd_dev_v2_snap_features().
+ *
+ * Returns 0 on success or the error from the format 2 query, in which
+ * case *snap_features is left untouched.
+ */
+static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
+			u64 *snap_features)
+{
+	u64 features = 0;
+	int ret;
+
+	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+
+	if (snap_id == CEPH_NOSNAP) {
+		*snap_features = rbd_dev->header.features;
+		return 0;
+	}
+
+	if (rbd_dev->image_format == 1) {
+		*snap_features = 0;	/* No features for format 1 */
+		return 0;
+	}
+
+	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
+	if (ret)
+		return ret;
+
+	*snap_features = features;
+	return 0;
+}
+
+/*
+ * Set the mapping's size and features from the snapshot (or head)
+ * identified by rbd_dev->spec->snap_id.
+ *
+ * Returns 0 on success; on error the mapping is left unchanged and the
+ * error from rbd_snap_size() or rbd_snap_features() is returned.
+ */
+static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
+{
+	u64 snap_id = rbd_dev->spec->snap_id;
+	u64 features = 0;
+	u64 size = 0;
+	int ret;
+
+	ret = rbd_snap_size(rbd_dev, snap_id, &size);
+	if (!ret)
+		ret = rbd_snap_features(rbd_dev, snap_id, &features);
+	if (ret)
+		return ret;
+
+	/* Both lookups succeeded; commit them together */
+	rbd_dev->mapping.size = size;
+	rbd_dev->mapping.features = features;
+
+	return 0;
+}
+
+/*
+ * Reset the mapping's size and features to 0.  The read_only setting
+ * is not touched.
+ */
+static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
+{
+	rbd_dev->mapping.size = 0;
+	rbd_dev->mapping.features = 0;
+}
+
+/*
+ * Build the name of the object backing the image segment that
+ * contains the given image byte offset.
+ *
+ * The segment number is offset >> obj_order.  It is appended to the
+ * image's object prefix as zero-padded hex: at least 12 digits, or at
+ * least 16 for format 2 images.
+ *
+ * Returns a name allocated from rbd_segment_name_cache (release it
+ * with rbd_segment_name_free()), or NULL if the allocation fails or
+ * the name does not fit in CEPH_MAX_OID_NAME_LEN characters.
+ */
+static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
+{
+	char *name;
+	u64 segment;
+	int ret;
+	char *name_format;
+
+	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
+	if (!name)
+		return NULL;
+	segment = offset >> rbd_dev->header.obj_order;
+	name_format = "%s.%012llx";
+	if (rbd_dev->image_format == 2)
+		name_format = "%s.%016llx";
+	ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
+			rbd_dev->header.object_prefix, segment);
+	if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
+		pr_err("error formatting segment name for #%llu (%d)\n",
+			segment, ret);
+		/*
+		 * The buffer came from a dedicated slab cache, so it has
+		 * to go back with kmem_cache_free(), not kfree().
+		 */
+		kmem_cache_free(rbd_segment_name_cache, name);
+		name = NULL;
+	}
+
+	return name;
+}
+
+/*
+ * Return a name obtained from rbd_segment_name() to
+ * rbd_segment_name_cache.
+ */
+static void rbd_segment_name_free(const char *name)
+{
+	/* The explicit cast here is needed to drop the const qualifier */
+
+	kmem_cache_free(rbd_segment_name_cache, (void *)name);
+}
+
+/*
+ * Return the byte offset within its segment (object) of the given
+ * image byte offset.  Segments are 2^obj_order bytes, so this is the
+ * low obj_order bits of the offset.
+ */
+static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
+{
+	u64 segment_mask = ((u64) 1 << rbd_dev->header.obj_order) - 1;
+
+	return offset & segment_mask;
+}
+
+/*
+ * Return how many of the "length" bytes starting at image byte
+ * "offset" lie within the segment that contains offset: length itself
+ * if the range ends inside that segment, otherwise the distance from
+ * offset to the end of the segment.
+ */
+static u64 rbd_segment_length(struct rbd_device *rbd_dev,
+				u64 offset, u64 length)
+{
+	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
+	u64 segment_off = offset & (segment_size - 1);
+
+	/* guard the addition below against wrapping */
+	rbd_assert(length <= U64_MAX - segment_off);
+	if (segment_off + length > segment_size)
+		return segment_size - segment_off;
+
+	return length;
+}
+
+/*
+ * returns the size of an object in the image
+ *
+ * The shift is carried out in 64 bits.  A plain int "1 << obj_order"
+ * overflows for obj_order == 31 (an order rbd_dev_ondisk_valid()
+ * accepts) and would then be sign-extended into the u64 result.
+ */
+static u64 rbd_obj_bytes(struct rbd_image_header *header)
+{
+	return (u64) 1 << header->obj_order;
+}
+
+/*
+ * bio helpers
+ */
+
+/*
+ * Drop one reference (bio_put()) on every bio of a chain linked
+ * through bi_next.  A NULL chain is fine.
+ */
+static void bio_chain_put(struct bio *chain)
+{
+	while (chain) {
+		/* fetch the link first; bio_put() may free the bio */
+		struct bio *next = chain->bi_next;
+
+		bio_put(chain);
+		chain = next;
+	}
+}
+
+/*
+ * zeros a bio chain, starting at specific offset
+ *
+ * start_ofs is a byte offset counted from the beginning of the first
+ * bio in the chain; every byte from there to the end of the chain is
+ * cleared.
+ */
+static void zero_bio_chain(struct bio *chain, int start_ofs)
+{
+	struct bio_vec bv;
+	struct bvec_iter iter;
+	unsigned long flags;
+	void *buf;
+	int pos = 0;	/* chain byte offset of the current segment */
+
+	while (chain) {
+		bio_for_each_segment(bv, chain, iter) {
+			/* skip segments that end at or before start_ofs */
+			if (pos + bv.bv_len > start_ofs) {
+				/*
+				 * Nonzero only for the segment that
+				 * straddles start_ofs; later segments
+				 * are zeroed in full.
+				 */
+				int remainder = max(start_ofs - pos, 0);
+				buf = bvec_kmap_irq(&bv, &flags);
+				memset(buf + remainder, 0,
+				       bv.bv_len - remainder);
+				flush_dcache_page(bv.bv_page);
+				bvec_kunmap_irq(buf, &flags);
+			}
+			pos += bv.bv_len;
+		}
+
+		chain = chain->bi_next;
+	}
+}
+
+/*
+ * similar to zero_bio_chain(), zeros data defined by a page array,
+ * starting at the given byte offset from the start of the array and
+ * continuing up to the given end offset.  The pages array is
+ * assumed to be big enough to hold all bytes up to the end.
+ *
+ * The byte at "end" itself is not zeroed (the range is [offset, end)).
+ */
+static void zero_pages(struct page **pages, u64 offset, u64 end)
+{
+	/* first page touched by the range */
+	struct page **page = &pages[offset >> PAGE_SHIFT];
+
+	rbd_assert(end > offset);
+	rbd_assert(end - offset <= (u64)SIZE_MAX);
+	while (offset < end) {
+		size_t page_offset;
+		size_t length;
+		unsigned long flags;
+		void *kaddr;
+
+		/* zero to the end of this page or to "end", whichever is first */
+		page_offset = offset & ~PAGE_MASK;
+		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
+		local_irq_save(flags);
+		kaddr = kmap_atomic(*page);
+		memset(kaddr + page_offset, 0, length);
+		flush_dcache_page(*page);
+		kunmap_atomic(kaddr);
+		local_irq_restore(flags);
+
+		offset += length;
+		page++;
+	}
+}
+
+/*
+ * Clone a portion of a bio, starting at the given byte offset
+ * and continuing for the number of bytes indicated.
+ *
+ * Returns the clone, or NULL if bio_clone() fails.  The caller is
+ * expected to pass an offset/len that lies within bio_src; that is
+ * not checked here.
+ */
+static struct bio *bio_clone_range(struct bio *bio_src,
+					unsigned int offset,
+					unsigned int len,
+					gfp_t gfpmask)
+{
+	struct bio *bio;
+
+	bio = bio_clone(bio_src, gfpmask);
+	if (!bio)
+		return NULL;	/* ENOMEM */
+
+	/* skip the first "offset" bytes, then cap the size at len */
+	bio_advance(bio, offset);
+	bio->bi_iter.bi_size = len;
+
+	return bio;
+}
+
+/*
+ * Clone a portion of a bio chain, starting at the given byte offset
+ * into the first bio in the source chain and continuing for the
+ * number of bytes indicated.  The result is another bio chain of
+ * exactly the given length, or a null pointer on error.
+ *
+ * The bio_src and offset parameters are both in-out.  On entry they
+ * refer to the first source bio and the offset into that bio where
+ * the start of data to be cloned is located.
+ *
+ * On return, bio_src is updated to refer to the bio in the source
+ * chain that contains first un-cloned byte, and *offset will
+ * contain the offset of that byte within that bio.
+ *
+ * On error (or when there is nothing to clone) *bio_src and *offset
+ * are left unmodified and any clones already made are released.
+ */
+static struct bio *bio_chain_clone_range(struct bio **bio_src,
+					unsigned int *offset,
+					unsigned int len,
+					gfp_t gfpmask)
+{
+	struct bio *bi = *bio_src;
+	unsigned int off = *offset;
+	struct bio *chain = NULL;
+	struct bio **end;	/* where the next clone gets linked in */
+
+	/* Build up a chain of clone bios up to the limit */
+
+	if (!bi || off >= bi->bi_iter.bi_size || !len)
+		return NULL;		/* Nothing to clone */
+
+	end = &chain;
+	while (len) {
+		unsigned int bi_size;
+		struct bio *bio;
+
+		if (!bi) {
+			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
+			goto out_err;	/* EINVAL; ran out of bio's */
+		}
+		/* clone what is left of this bio, but no more than len */
+		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
+		bio = bio_clone_range(bi, off, bi_size, gfpmask);
+		if (!bio)
+			goto out_err;	/* ENOMEM */
+
+		*end = bio;
+		end = &bio->bi_next;
+
+		off += bi_size;
+		/* source bio fully consumed: move on to the next one */
+		if (off == bi->bi_iter.bi_size) {
+			bi = bi->bi_next;
+			off = 0;
+		}
+		len -= bi_size;
+	}
+	*bio_src = bi;
+	*offset = off;
+
+	return chain;
+out_err:
+	bio_chain_put(chain);
+
+	return NULL;
+}
+
+/*
+ * The default/initial value for all object request flags is 0.  For
+ * each flag, once its value is set to 1 it is never reset to 0
+ * again.
+ */
+
+/*
+ * Mark an object request as carrying image data.  Setting the flag a
+ * second time is unexpected and is reported with a warning.
+ */
+static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
+{
+	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
+		struct rbd_device *rbd_dev;
+
+		rbd_dev = obj_request->img_request->rbd_dev;
+		/* no trailing newline: rbd_warn() appends one itself */
+		rbd_warn(rbd_dev, "obj_request %p already marked img_data",
+			obj_request);
+	}
+}
+
+/*
+ * Return true if the object request has its IMG_DATA flag set.  A
+ * full memory barrier is issued before the flag is read.
+ */
+static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
+{
+	smp_mb();
+	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
+}
+
+/*
+ * Mark an object request as done.  Setting the flag a second time is
+ * unexpected and is reported with a warning; the warning names the
+ * rbd device only when the request is image data, since only then is
+ * its img_request pointer valid.
+ */
+static void obj_request_done_set(struct rbd_obj_request *obj_request)
+{
+	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
+		struct rbd_device *rbd_dev = NULL;
+
+		if (obj_request_img_data_test(obj_request))
+			rbd_dev = obj_request->img_request->rbd_dev;
+		/* no trailing newline: rbd_warn() appends one itself */
+		rbd_warn(rbd_dev, "obj_request %p already marked done",
+			obj_request);
+	}
+}
+
+/*
+ * Return true if the object request has its DONE flag set.  A full
+ * memory barrier is issued before the flag is read.
+ */
+static bool obj_request_done_test(struct rbd_obj_request *obj_request)
+{
+	smp_mb();
+	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
+}
+
+/*
+ * This sets the KNOWN flag after (possibly) setting the EXISTS
+ * flag.  The latter is set based on the "exists" value provided.
+ *
+ * Note that for our purposes once an object exists it never goes
+ * away again.  It's possible that the response from two existence
+ * checks are separated by the creation of the target object, and
+ * the first ("doesn't exist") response arrives *after* the second
+ * ("does exist").  In that case the late "doesn't exist" response
+ * has no effect: EXISTS is only ever set here, never cleared.
+ */
+static void obj_request_existence_set(struct rbd_obj_request *obj_request,
+				bool exists)
+{
+	if (exists)
+		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
+	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
+	smp_mb();
+}
+
+/*
+ * Return true if the KNOWN flag is set, i.e. existence of the target
+ * object has been recorded via obj_request_existence_set().  A full
+ * memory barrier is issued before the flag is read.
+ */
+static bool obj_request_known_test(struct rbd_obj_request *obj_request)
+{
+	smp_mb();
+	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
+}
+
+/*
+ * Return true if the EXISTS flag is set.  A full memory barrier is
+ * issued before the flag is read.
+ */
+static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
+{
+	smp_mb();
+	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
+}
+
+/*
+ * Return true if the object request's image offset lies below the
+ * device's parent overlap, after rounding the overlap up to a whole
+ * number of objects.
+ */
+static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
+{
+	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
+	u64 overlap_end;
+
+	overlap_end = round_up(rbd_dev->parent_overlap,
+				rbd_obj_bytes(&rbd_dev->header));
+
+	return obj_request->img_offset < overlap_end;
+}
+
+/*
+ * Take a reference on an object request.  The debug message shows the
+ * reference count before the increment.
+ */
+static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
+{
+	dout("%s: obj %p (was %d)\n", __func__, obj_request,
+		atomic_read(&obj_request->kref.refcount));
+	kref_get(&obj_request->kref);
+}
+
+static void rbd_obj_request_destroy(struct kref *kref);
+/*
+ * Drop a reference on an object request; rbd_obj_request_destroy()
+ * runs when the last reference is dropped.  obj_request must not be
+ * NULL.
+ */
+static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
+{
+	rbd_assert(obj_request != NULL);
+	dout("%s: obj %p (was %d)\n", __func__, obj_request,
+		atomic_read(&obj_request->kref.refcount));
+	kref_put(&obj_request->kref, rbd_obj_request_destroy);
+}
+
+/*
+ * Take a reference on an image request.  The debug message shows the
+ * reference count before the increment.
+ */
+static void rbd_img_request_get(struct rbd_img_request *img_request)
+{
+	dout("%s: img %p (was %d)\n", __func__, img_request,
+	     atomic_read(&img_request->kref.refcount));
+	kref_get(&img_request->kref);
+}
+
+static bool img_request_child_test(struct rbd_img_request *img_request);
+static void rbd_parent_request_destroy(struct kref *kref);
+static void rbd_img_request_destroy(struct kref *kref);
+/*
+ * Drop a reference on an image request.  When the last reference goes
+ * away, a child image request is torn down by
+ * rbd_parent_request_destroy() and any other image request by
+ * rbd_img_request_destroy().  img_request must not be NULL.
+ */
+static void rbd_img_request_put(struct rbd_img_request *img_request)
+{
+	void (*release)(struct kref *kref);
+
+	rbd_assert(img_request != NULL);
+	dout("%s: img %p (was %d)\n", __func__, img_request,
+		atomic_read(&img_request->kref.refcount));
+
+	release = rbd_img_request_destroy;
+	if (img_request_child_test(img_request))
+		release = rbd_parent_request_destroy;
+
+	kref_put(&img_request->kref, release);
+}
+
+/*
+ * Attach an object request to an image request: it is appended to the
+ * image request's obj_requests list, marked as image data, and given
+ * the next position ("which") in the list.  No new reference is taken;
+ * the image request takes over the object request's original one.
+ */
+static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
+					struct rbd_obj_request *obj_request)
+{
+	rbd_assert(obj_request->img_request == NULL);
+
+	/* Image request now owns object's original reference */
+	obj_request->img_request = img_request;
+	obj_request->which = img_request->obj_request_count;
+	rbd_assert(!obj_request_img_data_test(obj_request));
+	obj_request_img_data_set(obj_request);
+	rbd_assert(obj_request->which != BAD_WHICH);
+	img_request->obj_request_count++;
+	list_add_tail(&obj_request->links, &img_request->obj_requests);
+	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
+		obj_request->which);
+}
+
+/*
+ * Detach an object request from its image request and drop one
+ * reference on it.  The assertions require that the request being
+ * removed holds the highest index: after the count is decremented,
+ * its "which" value must equal the new count.  The request's
+ * index, image request pointer and callback are all reset before
+ * the reference is dropped.
+ */
+static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
+					struct rbd_obj_request *obj_request)
+{
+	rbd_assert(obj_request->which != BAD_WHICH);
+
+	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
+		obj_request->which);
+	list_del(&obj_request->links);
+	rbd_assert(img_request->obj_request_count > 0);
+	img_request->obj_request_count--;
+	rbd_assert(obj_request->which == img_request->obj_request_count);
+	obj_request->which = BAD_WHICH;
+	rbd_assert(obj_request_img_data_test(obj_request));
+	rbd_assert(obj_request->img_request == img_request);
+	/* Cleared so rbd_obj_request_destroy()'s assertions hold */
+	obj_request->img_request = NULL;
+	obj_request->callback = NULL;
+	rbd_obj_request_put(obj_request);
+}
+
+/*
+ * Report whether "type" is one of the known object request types
+ * (no data, bio list, or page array).
+ */
+static bool obj_request_type_valid(enum obj_request_type type)
+{
+	return type == OBJ_REQUEST_NODATA ||
+	       type == OBJ_REQUEST_BIO ||
+	       type == OBJ_REQUEST_PAGES;
+}
+
+/*
+ * Hand an object request's osd request to the osd client.  Returns
+ * whatever ceph_osdc_start_request() returns; callers in this file
+ * treat 0 as success.
+ */
+static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
+				struct rbd_obj_request *obj_request)
+{
+	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
+
+	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
+}
+
+/*
+ * Finish an image request.  On success the per-object transfer
+ * counts are summed into the image request; on error the image
+ * request's xferred field is left untouched.  Then either the
+ * image request's callback is invoked, or, if there is none, a
+ * reference on the image request is dropped.
+ */
+static void rbd_img_request_complete(struct rbd_img_request *img_request)
+{
+	dout("%s: img %p\n", __func__, img_request);
+
+	/*
+	 * The aggregate transfer count is only computed when no
+	 * error was recorded.  Updating it incrementally with
+	 * atomic64_cmpxchg() as each object request completes
+	 * would be an alternative; it is not obvious which is
+	 * preferable.
+	 */
+	if (img_request->result == 0) {
+		struct rbd_obj_request *obj_request;
+		u64 total = 0;
+
+		for_each_obj_request(img_request, obj_request)
+			total += obj_request->xferred;
+		img_request->xferred = total;
+	}
+
+	if (!img_request->callback) {
+		rbd_img_request_put(img_request);
+		return;
+	}
+
+	img_request->callback(img_request);
+}
+
+/*
+ * Block until the object request's completion is signalled, or the
+ * wait is interrupted.  Returns the value produced by
+ * wait_for_completion_interruptible().
+ *
+ * Caller is responsible for rbd_obj_request_destroy(obj_request)
+ */
+
+static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
+{
+	dout("%s: obj %p\n", __func__, obj_request);
+
+	return wait_for_completion_interruptible(&obj_request->completion);
+}
+
+/*
+ * The default/initial value for all image request flags is 0.  Each
+ * is conditionally set to 1 at image request initialization time
+ * and currently never changes thereafter.
+ */
+/* Set the IMG_REQ_WRITE flag bit, followed by a full memory barrier. */
+static void img_request_write_set(struct rbd_img_request *img_request)
+{
+	set_bit(IMG_REQ_WRITE, &img_request->flags);
+	smp_mb();
+}
+
+/*
+ * Return true if the IMG_REQ_WRITE flag bit is set.  A full memory
+ * barrier is issued before the bit is read.
+ */
+static bool img_request_write_test(struct rbd_img_request *img_request)
+{
+	smp_mb();
+	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
+}
+
+/* Set the IMG_REQ_CHILD flag bit, followed by a full memory barrier. */
+static void img_request_child_set(struct rbd_img_request *img_request)
+{
+	set_bit(IMG_REQ_CHILD, &img_request->flags);
+	smp_mb();
+}
+
+/* Clear the IMG_REQ_CHILD flag bit, followed by a full memory barrier. */
+static void img_request_child_clear(struct rbd_img_request *img_request)
+{
+	clear_bit(IMG_REQ_CHILD, &img_request->flags);
+	smp_mb();
+}
+
+/*
+ * Return true if the IMG_REQ_CHILD flag bit is set.  A full memory
+ * barrier is issued before the bit is read.
+ */
+static bool img_request_child_test(struct rbd_img_request *img_request)
+{
+	smp_mb();
+	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
+}
+
+/* Set the IMG_REQ_LAYERED flag bit, followed by a full memory barrier. */
+static void img_request_layered_set(struct rbd_img_request *img_request)
+{
+	set_bit(IMG_REQ_LAYERED, &img_request->flags);
+	smp_mb();
+}
+
+/* Clear the IMG_REQ_LAYERED flag bit, followed by a full memory barrier. */
+static void img_request_layered_clear(struct rbd_img_request *img_request)
+{
+	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
+	smp_mb();
+}
+
+/*
+ * Return true if the IMG_REQ_LAYERED flag bit is set.  A full memory
+ * barrier is issued before the bit is read.
+ */
+static bool img_request_layered_test(struct rbd_img_request *img_request)
+{
+	smp_mb();
+	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
+}
+
+/*
+ * Post-process a completed read that belongs to an image request.
+ *
+ * Three outcomes are distinguished:
+ *  - -ENOENT: the object is a hole in the image; the whole data
+ *    buffer is zero-filled and the result is turned into success.
+ *  - success but short: the buffer is zero-filled from the end of
+ *    the data that was returned.
+ *  - any other error: the buffer is left alone and the error stands.
+ *
+ * In every case the transfer count is set to the full request
+ * length, so the whole request is reported finished to the block
+ * layer, and the object request is marked done.
+ */
+static void
+rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
+{
+	u64 xferred = obj_request->xferred;
+	u64 length = obj_request->length;
+	bool hole = obj_request->result == -ENOENT;
+	bool zero_fill = false;
+	u64 zero_from = 0;
+
+	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
+		obj_request, obj_request->img_request, obj_request->result,
+		xferred, length);
+
+	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
+
+	if (hole) {
+		zero_fill = true;
+	} else if (!obj_request->result && xferred < length) {
+		zero_fill = true;
+		zero_from = xferred;
+	}
+
+	if (zero_fill) {
+		if (obj_request->type == OBJ_REQUEST_BIO)
+			zero_bio_chain(obj_request->bio_list, zero_from);
+		else
+			zero_pages(obj_request->pages, zero_from, length);
+	}
+	if (hole)
+		obj_request->result = 0;
+
+	obj_request->xferred = length;
+	obj_request_done_set(obj_request);
+}
+
+/*
+ * Signal that an object request has finished.  A request with a
+ * callback has that callback invoked; one without wakes everything
+ * waiting on its completion instead.
+ */
+static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
+{
+	dout("%s: obj %p cb %p\n", __func__, obj_request,
+		obj_request->callback);
+
+	if (!obj_request->callback) {
+		complete_all(&obj_request->completion);
+		return;
+	}
+
+	obj_request->callback(obj_request);
+}
+
+/*
+ * Completion handling for ops that need no post-processing: just
+ * mark the object request done.
+ */
+static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
+{
+	dout("%s: obj %p\n", __func__, obj_request);
+	obj_request_done_set(obj_request);
+}
+
+/*
+ * Handle completion of an osd READ op.
+ *
+ * For a read that is part of a layered image request, -ENOENT on an
+ * object whose image offset lies inside the parent overlap is
+ * redirected to the parent image via rbd_img_parent_read().  Any
+ * other read belonging to an image request goes through the normal
+ * image-object read post-processing.  A read with no image request
+ * is simply marked done.
+ */
+static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
+{
+	struct rbd_img_request *img_request = NULL;
+	struct rbd_device *rbd_dev = NULL;
+	bool layered = false;
+
+	if (obj_request_img_data_test(obj_request))
+		img_request = obj_request->img_request;
+
+	/*
+	 * Only look inside the image request once we know we have
+	 * one; "layered" can only become true together with a valid
+	 * rbd_dev, so the overlap check below is safe.
+	 */
+	if (img_request) {
+		layered = img_request_layered_test(img_request);
+		rbd_dev = img_request->rbd_dev;
+	}
+
+	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
+		obj_request, img_request, obj_request->result,
+		obj_request->xferred, obj_request->length);
+	if (layered && obj_request->result == -ENOENT &&
+			obj_request->img_offset < rbd_dev->parent_overlap)
+		rbd_img_parent_read(obj_request);
+	else if (img_request)
+		rbd_img_obj_request_read_callback(obj_request);
+	else
+		obj_request_done_set(obj_request);
+}
+
+/*
+ * Handle completion of an osd WRITE op: report the full requested
+ * length as transferred and mark the object request done.  The
+ * request's result field is not modified here.
+ */
+static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
+{
+	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
+		obj_request->result, obj_request->length);
+	/*
+	 * There is no such thing as a successful short write.  Set
+	 * it to our originally-requested length.
+	 */
+	obj_request->xferred = obj_request->length;
+	obj_request_done_set(obj_request);
+}
+
+/*
+ * Handle completion of an osd STAT op by marking the object
+ * request done.
+ *
+ * For a simple stat call there's nothing to do.  We'll do more if
+ * this is part of a write sequence for a layered image.
+ */
+static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
+{
+	dout("%s: obj %p\n", __func__, obj_request);
+	obj_request_done_set(obj_request);
+}
+
+/*
+ * Completion callback installed as r_callback on the osd requests
+ * built in this file.  The owning object request is recovered from
+ * r_priv.
+ *
+ * A negative osd result is copied into the object request, the
+ * reply length of the first op becomes its transfer count, and the
+ * opcode of the first op selects the op-specific handler.  If that
+ * handler marked the object request done, it is completed here;
+ * otherwise completion is left to whoever marks it done later.
+ */
+static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
+				struct ceph_msg *msg)
+{
+	struct rbd_obj_request *obj_request = osd_req->r_priv;
+	u16 opcode;
+
+	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
+	rbd_assert(osd_req == obj_request->osd_req);
+	/* Image-data requests carry a valid index; others must not */
+	if (obj_request_img_data_test(obj_request)) {
+		rbd_assert(obj_request->img_request);
+		rbd_assert(obj_request->which != BAD_WHICH);
+	} else {
+		rbd_assert(obj_request->which == BAD_WHICH);
+	}
+
+	/* Only errors are recorded; a non-negative result is ignored */
+	if (osd_req->r_result < 0)
+		obj_request->result = osd_req->r_result;
+
+	rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);
+
+	/*
+	 * We support a 64-bit length, but ultimately it has to be
+	 * passed to blk_end_request(), which takes an unsigned int.
+	 */
+	obj_request->xferred = osd_req->r_reply_op_len[0];
+	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
+
+	opcode = osd_req->r_ops[0].op;
+	switch (opcode) {
+	case CEPH_OSD_OP_READ:
+		rbd_osd_read_callback(obj_request);
+		break;
+	case CEPH_OSD_OP_SETALLOCHINT:
+		/* A hint op is only expected in front of a write op */
+		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE);
+		/* fall through */
+	case CEPH_OSD_OP_WRITE:
+		rbd_osd_write_callback(obj_request);
+		break;
+	case CEPH_OSD_OP_STAT:
+		rbd_osd_stat_callback(obj_request);
+		break;
+	case CEPH_OSD_OP_CALL:
+	case CEPH_OSD_OP_NOTIFY_ACK:
+	case CEPH_OSD_OP_WATCH:
+		rbd_osd_trivial_callback(obj_request);
+		break;
+	default:
+		/* Unknown op: warn only; the request is not marked done */
+		rbd_warn(NULL, "%s: unsupported op %hu\n",
+			obj_request->object_name, (unsigned short) opcode);
+		break;
+	}
+
+	if (obj_request_done_test(obj_request))
+		rbd_obj_request_complete(obj_request);
+}
+
+/*
+ * Build the osd request message for a read.  The snapshot id comes
+ * from the owning image request when there is one, and is
+ * CEPH_NOSNAP otherwise.  No snapshot context or mtime is supplied.
+ */
+static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
+{
+	struct ceph_osd_request *osd_req = obj_request->osd_req;
+	u64 snap_id = CEPH_NOSNAP;
+
+	rbd_assert(osd_req != NULL);
+
+	if (obj_request->img_request)
+		snap_id = obj_request->img_request->snap_id;
+
+	ceph_osdc_build_request(osd_req, obj_request->offset,
+			NULL, snap_id, NULL);
+}
+
+/*
+ * Build the osd request message for a write.  The snapshot context
+ * comes from the owning image request when there is one, and is
+ * NULL otherwise.  The snapshot id is always CEPH_NOSNAP and the
+ * modification time is the current time.
+ */
+static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
+{
+	struct ceph_osd_request *osd_req = obj_request->osd_req;
+	struct ceph_snap_context *snapc = NULL;
+	struct timespec mtime = CURRENT_TIME;
+
+	rbd_assert(osd_req != NULL);
+
+	if (obj_request->img_request)
+		snapc = obj_request->img_request->snapc;
+
+	ceph_osdc_build_request(osd_req, obj_request->offset,
+			snapc, CEPH_NOSNAP, &mtime);
+}
+
+/*
+ * Allocate and set up an osd request for an object request.
+ *
+ * Reads carry exactly one osd op.  Writes carry one op (e.g. a
+ * watch, which counts as a write at the osd level) or two (an
+ * allocation hint followed by the data write).  For an image-data
+ * write the image request's snapshot context is passed to the
+ * allocator.
+ *
+ * Returns the new osd request, or NULL if allocation failed.
+ */
+static struct ceph_osd_request *rbd_osd_req_create(
+					struct rbd_device *rbd_dev,
+					bool write_request,
+					unsigned int num_ops,
+					struct rbd_obj_request *obj_request)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct ceph_snap_context *snapc = NULL;
+	struct ceph_osd_request *req;
+
+	if (obj_request_img_data_test(obj_request)) {
+		struct rbd_img_request *img_request = obj_request->img_request;
+
+		/* The direction must agree with the image request's */
+		rbd_assert(write_request ==
+				img_request_write_test(img_request));
+		if (write_request)
+			snapc = img_request->snapc;
+	}
+
+	rbd_assert(num_ops == 1 || (write_request && num_ops == 2));
+
+	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
+				      GFP_ATOMIC);
+	if (req == NULL)
+		return NULL;	/* ENOMEM */
+
+	req->r_flags = write_request ?
+			(CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK) :
+			CEPH_OSD_FLAG_READ;
+	req->r_callback = rbd_osd_req_callback;
+	req->r_priv = obj_request;
+
+	req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+	ceph_oid_set_name(&req->r_base_oid, obj_request->object_name);
+
+	return req;
+}
+
+/*
+ * Create a copyup osd request based on the information in the
+ * object request supplied.  A copyup request has three osd ops,
+ * a copyup method call, a hint op, and a write op.
+ *
+ * The object request must belong to a write image request.  Only
+ * the request shell is set up here (flags, callback, private
+ * pointer, pool and object name); the ops themselves are filled in
+ * by the caller.  Returns NULL if allocation fails.
+ */
+static struct ceph_osd_request *
+rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
+{
+	struct rbd_img_request *img_request;
+	struct ceph_snap_context *snapc;
+	struct rbd_device *rbd_dev;
+	struct ceph_osd_client *osdc;
+	struct ceph_osd_request *osd_req;
+
+	rbd_assert(obj_request_img_data_test(obj_request));
+	img_request = obj_request->img_request;
+	rbd_assert(img_request);
+	rbd_assert(img_request_write_test(img_request));
+
+	/* Allocate and initialize the request, for the three ops */
+
+	snapc = img_request->snapc;
+	rbd_dev = img_request->rbd_dev;
+	osdc = &rbd_dev->rbd_client->client->osdc;
+	osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC);
+	if (!osd_req)
+		return NULL;	/* ENOMEM */
+
+	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+	osd_req->r_callback = rbd_osd_req_callback;
+	osd_req->r_priv = obj_request;
+
+	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
+
+	return osd_req;
+}
+
+
+/* Release an osd request by dropping a reference on it. */
+static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
+{
+	ceph_osdc_put_request(osd_req);
+}
+
+/*
+ * Allocate and initialize an object request for the given object
+ * name, byte range and data type.  The request gets its own copy
+ * of the name, starts with a single reference, has no flags set and
+ * does not yet belong to an image request (which == BAD_WHICH).
+ *
+ * This is called while building requests on the block I/O path
+ * (see rbd_img_request_fill()), so the allocations use GFP_NOIO:
+ * they may sleep, but must not start new I/O to reclaim memory,
+ * since that I/O could end up waiting on this very device.
+ *
+ * object_name is assumed to be a non-null pointer and NUL-terminated
+ *
+ * Returns the new object request, or NULL if memory is exhausted.
+ */
+
+static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
+						u64 offset, u64 length,
+						enum obj_request_type type)
+{
+	struct rbd_obj_request *obj_request;
+	size_t size;
+	char *name;
+
+	rbd_assert(obj_request_type_valid(type));
+
+	size = strlen(object_name) + 1;
+	name = kmalloc(size, GFP_NOIO);
+	if (!name)
+		return NULL;
+
+	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
+	if (!obj_request) {
+		kfree(name);
+		return NULL;
+	}
+
+	/* size includes the terminating NUL, so the copy is terminated */
+	obj_request->object_name = memcpy(name, object_name, size);
+	obj_request->offset = offset;
+	obj_request->length = length;
+	obj_request->flags = 0;
+	obj_request->which = BAD_WHICH;
+	obj_request->type = type;
+	INIT_LIST_HEAD(&obj_request->links);
+	init_completion(&obj_request->completion);
+	kref_init(&obj_request->kref);
+
+	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
+		offset, length, (int)type, obj_request);
+
+	return obj_request;
+}
+
+/*
+ * kref release function for object requests.  The request must
+ * already be detached from any image request.  Releases the osd
+ * request (if any), whatever data the request still points at for
+ * its type (bio chain or page vector), the copied object name, and
+ * finally the request itself.
+ */
+static void rbd_obj_request_destroy(struct kref *kref)
+{
+	struct rbd_obj_request *obj_request;
+
+	obj_request = container_of(kref, struct rbd_obj_request, kref);
+
+	dout("%s: obj %p\n", __func__, obj_request);
+
+	rbd_assert(obj_request->img_request == NULL);
+	rbd_assert(obj_request->which == BAD_WHICH);
+
+	if (obj_request->osd_req)
+		rbd_osd_req_destroy(obj_request->osd_req);
+
+	rbd_assert(obj_request_type_valid(obj_request->type));
+	switch (obj_request->type) {
+	case OBJ_REQUEST_NODATA:
+		break;		/* Nothing to do */
+	case OBJ_REQUEST_BIO:
+		if (obj_request->bio_list)
+			bio_chain_put(obj_request->bio_list);
+		break;
+	case OBJ_REQUEST_PAGES:
+		/* A NULL pages pointer means the vector is owned elsewhere */
+		if (obj_request->pages)
+			ceph_release_page_vector(obj_request->pages,
+						obj_request->page_count);
+		break;
+	}
+
+	kfree(obj_request->object_name);
+	obj_request->object_name = NULL;
+	kmem_cache_free(rbd_obj_request_cache, obj_request);
+}
+
+/*
+ * Tear down a device's parent state: remove the parent device,
+ * drop the parent spec reference, and reset the parent spec
+ * pointer and parent overlap.
+ *
+ * It's OK to call this for a device with no parent
+ */
+
+static void rbd_spec_put(struct rbd_spec *spec);
+static void rbd_dev_unparent(struct rbd_device *rbd_dev)
+{
+	rbd_dev_remove_parent(rbd_dev);
+	rbd_spec_put(rbd_dev->parent_spec);
+	rbd_dev->parent_spec = NULL;
+	rbd_dev->parent_overlap = 0;
+}
+
+/*
+ * Drop one parent reference.
+ *
+ * The parent reference count tracks in-flight use of an image's
+ * parent so its parent fields are only torn down once nothing
+ * refers to them any more.  Dropping the final reference triggers
+ * that teardown.  A device without a parent spec is left alone,
+ * and a count that goes negative is reported as an underflow.
+ */
+static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
+{
+	int remaining;
+
+	if (rbd_dev->parent_spec == NULL)
+		return;
+
+	remaining = atomic_dec_return_safe(&rbd_dev->parent_ref);
+	if (remaining == 0)
+		rbd_dev_unparent(rbd_dev);	/* that was the last one */
+	else if (remaining < 0)
+		rbd_warn(rbd_dev, "parent reference underflow\n");
+}
+
+/*
+ * If an image has a non-zero parent overlap, get a reference to its
+ * parent.
+ *
+ * We must get the reference before checking for the overlap to
+ * coordinate properly with zeroing the parent overlap in
+ * rbd_dev_v2_parent_info() when an image gets flattened.  We
+ * drop it again if there is no overlap, so that a reference is
+ * only ever held on behalf of callers that were told "true".
+ *
+ * A positive value from atomic_inc_return_safe() is taken to mean
+ * the reference was obtained, and a negative one that the counter
+ * would have overflowed.
+ *
+ * Returns true if the rbd device has a parent with a non-zero
+ * overlap and a reference for it was successfully taken, or
+ * false otherwise.
+ */
+static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
+{
+	int counter;
+
+	if (!rbd_dev->parent_spec)
+		return false;
+
+	counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
+	if (counter < 0) {
+		rbd_warn(rbd_dev, "parent reference overflow\n");
+		return false;
+	}
+	if (!counter)
+		return false;	/* no reference was taken */
+
+	if (rbd_dev->parent_overlap)
+		return true;
+
+	/*
+	 * Image was flattened, but parent is not yet torn down.
+	 * Give back the reference we just took; the caller will
+	 * not drop it since we are reporting "no parent".
+	 */
+	rbd_dev_parent_put(rbd_dev);
+
+	return false;
+}
+
+/*
+ * Allocate and initialize an image request covering "length" bytes
+ * at "offset" in the image.
+ *
+ * A write request holds a reference on the device's current
+ * snapshot context; a read request records the snapshot id of the
+ * mapped image instead.  If the device has a parent with a non-zero
+ * overlap, a parent reference is taken and the request is flagged
+ * as layered.  The new request starts with a single reference.
+ *
+ * Caller is responsible for filling in the list of object requests
+ * that comprises the image request, and the Linux request pointer
+ * (if there is one).
+ *
+ * Returns the new image request, or NULL if allocation failed.
+ */
+static struct rbd_img_request *rbd_img_request_create(
+					struct rbd_device *rbd_dev,
+					u64 offset, u64 length,
+					bool write_request)
+{
+	struct rbd_img_request *img_request;
+	struct ceph_snap_context *snapc = NULL;
+
+	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
+	if (!img_request)
+		return NULL;
+
+	if (write_request) {
+		/*
+		 * Remember exactly which snapshot context we took a
+		 * reference on.  Re-reading header.snapc after the
+		 * lock is dropped could yield a different context
+		 * (presumably the header is updated with the lock
+		 * held for write), leaving us holding a pointer we
+		 * have no reference for.
+		 */
+		down_read(&rbd_dev->header_rwsem);
+		snapc = rbd_dev->header.snapc;
+		ceph_get_snap_context(snapc);
+		up_read(&rbd_dev->header_rwsem);
+	}
+
+	img_request->rq = NULL;
+	img_request->rbd_dev = rbd_dev;
+	img_request->offset = offset;
+	img_request->length = length;
+	img_request->flags = 0;
+	if (write_request) {
+		img_request_write_set(img_request);
+		img_request->snapc = snapc;
+	} else {
+		img_request->snap_id = rbd_dev->spec->snap_id;
+	}
+	if (rbd_dev_parent_get(rbd_dev))
+		img_request_layered_set(img_request);
+	spin_lock_init(&img_request->completion_lock);
+	img_request->next_completion = 0;
+	img_request->callback = NULL;
+	img_request->result = 0;
+	img_request->obj_request_count = 0;
+	INIT_LIST_HEAD(&img_request->obj_requests);
+	kref_init(&img_request->kref);
+
+	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
+		write_request ? "write" : "read", offset, length,
+		img_request);
+
+	return img_request;
+}
+
+/*
+ * kref release function for (non-child) image requests; also the
+ * final step of rbd_parent_request_destroy().  Detaches every
+ * object request still on the list, gives back the parent reference
+ * held by a layered request, drops the snapshot context reference
+ * held by a write request, and frees the image request.
+ */
+static void rbd_img_request_destroy(struct kref *kref)
+{
+	struct rbd_img_request *img_request;
+	struct rbd_obj_request *obj_request;
+	struct rbd_obj_request *next_obj_request;
+
+	img_request = container_of(kref, struct rbd_img_request, kref);
+
+	dout("%s: img %p\n", __func__, img_request);
+
+	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
+		rbd_img_obj_request_del(img_request, obj_request);
+	rbd_assert(img_request->obj_request_count == 0);
+
+	/* Pairs with rbd_dev_parent_get() in rbd_img_request_create() */
+	if (img_request_layered_test(img_request)) {
+		img_request_layered_clear(img_request);
+		rbd_dev_parent_put(img_request->rbd_dev);
+	}
+
+	if (img_request_write_test(img_request))
+		ceph_put_snap_context(img_request->snapc);
+
+	kmem_cache_free(rbd_img_request_cache, img_request);
+}
+
+/*
+ * Create a read image request against the parent of the device
+ * that "obj_request" belongs to.  The new request is flagged as a
+ * child request and holds a reference on the originating object
+ * request, which it records in its obj_request field.
+ *
+ * Returns the new parent request, or NULL if it could not be
+ * allocated (in which case no reference is taken).
+ */
+static struct rbd_img_request *rbd_parent_request_create(
+					struct rbd_obj_request *obj_request,
+					u64 img_offset, u64 length)
+{
+	struct rbd_img_request *parent_request;
+	struct rbd_device *rbd_dev;
+
+	rbd_assert(obj_request->img_request);
+	rbd_dev = obj_request->img_request->rbd_dev;
+
+	parent_request = rbd_img_request_create(rbd_dev->parent,
+						img_offset, length, false);
+	if (!parent_request)
+		return NULL;
+
+	img_request_child_set(parent_request);
+	rbd_obj_request_get(obj_request);
+	parent_request->obj_request = obj_request;
+
+	return parent_request;
+}
+
+/*
+ * kref release function for child (parent-read) image requests.
+ * Drops the reference held on the originating object request,
+ * clears the child flag, and then finishes with the ordinary image
+ * request teardown.
+ */
+static void rbd_parent_request_destroy(struct kref *kref)
+{
+	struct rbd_img_request *parent_request;
+	struct rbd_obj_request *orig_request;
+
+	parent_request = container_of(kref, struct rbd_img_request, kref);
+	orig_request = parent_request->obj_request;
+
+	parent_request->obj_request = NULL;
+	/* rbd_obj_request_put() asserts its argument is not NULL */
+	rbd_obj_request_put(orig_request);
+	img_request_child_clear(parent_request);
+
+	rbd_img_request_destroy(kref);
+}
+
+/*
+ * Account for one finished object request within its image
+ * request.  An error is logged and, if it is the first one seen,
+ * recorded as the image request's result.
+ *
+ * For a child image request nothing is reported to the block
+ * layer; for any other image request the transferred bytes are
+ * reported through blk_end_request().
+ *
+ * Returns true if more of the image request remains to be ended,
+ * false once this was the last piece.
+ */
+static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
+{
+	struct rbd_img_request *img_request;
+	unsigned int xferred;
+	int result;
+	bool more;
+
+	rbd_assert(obj_request_img_data_test(obj_request));
+	img_request = obj_request->img_request;
+
+	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
+	xferred = (unsigned int)obj_request->xferred;
+	result = obj_request->result;
+	if (result) {
+		struct rbd_device *rbd_dev = img_request->rbd_dev;
+
+		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
+			img_request_write_test(img_request) ? "write" : "read",
+			obj_request->length, obj_request->img_offset,
+			obj_request->offset);
+		rbd_warn(rbd_dev, "  result %d xferred %x\n",
+			result, xferred);
+		/* Keep only the first error for the image request */
+		if (!img_request->result)
+			img_request->result = result;
+	}
+
+	/* Image object requests don't own their page array */
+
+	if (obj_request->type == OBJ_REQUEST_PAGES) {
+		obj_request->pages = NULL;
+		obj_request->page_count = 0;
+	}
+
+	if (img_request_child_test(img_request)) {
+		rbd_assert(img_request->obj_request != NULL);
+		/* More remain unless this was the highest-indexed request */
+		more = obj_request->which < img_request->obj_request_count - 1;
+	} else {
+		rbd_assert(img_request->rq != NULL);
+		more = blk_end_request(img_request->rq, result, xferred);
+	}
+
+	return more;
+}
+
+/*
+ * Completion callback for object requests that are part of an
+ * image request.
+ *
+ * Object requests are ended strictly in index order.  If this
+ * request is not the one the image request expects to complete
+ * next, nothing is ended now.  Otherwise, starting from this
+ * request, every consecutive request that is already marked done
+ * is ended, and next_completion is advanced past them.  All of
+ * that happens under the image request's completion lock.
+ *
+ * One image request reference is dropped for every invocation, and
+ * the image request is completed once its last piece has been
+ * ended.
+ */
+static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
+{
+	struct rbd_img_request *img_request;
+	u32 which = obj_request->which;
+	bool more = true;
+
+	rbd_assert(obj_request_img_data_test(obj_request));
+	img_request = obj_request->img_request;
+
+	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
+	rbd_assert(img_request != NULL);
+	rbd_assert(img_request->obj_request_count > 0);
+	rbd_assert(which != BAD_WHICH);
+	rbd_assert(which < img_request->obj_request_count);
+
+	spin_lock_irq(&img_request->completion_lock);
+	/* Out of order: leave it for the callback of an earlier request */
+	if (which != img_request->next_completion)
+		goto out;
+
+	for_each_obj_request_from(img_request, obj_request) {
+		rbd_assert(more);
+		rbd_assert(which < img_request->obj_request_count);
+
+		if (!obj_request_done_test(obj_request))
+			break;
+		more = rbd_img_obj_end_request(obj_request);
+		which++;
+	}
+
+	/* Exactly one holds: more remains, or every request was ended */
+	rbd_assert(more ^ (which == img_request->obj_request_count));
+	img_request->next_completion = which;
+out:
+	spin_unlock_irq(&img_request->completion_lock);
+	/* Pairs with rbd_img_request_get() in rbd_img_request_fill() */
+	rbd_img_request_put(img_request);
+
+	if (!more)
+		rbd_img_request_complete(img_request);
+}
+
+/*
+ * Split up an image request into one or more object requests, each
+ * to a different object.  The "type" parameter indicates whether
+ * "data_desc" is the pointer to the head of a list of bio
+ * structures, or the base of a page array.  In either case this
+ * function assumes data_desc describes memory sufficient to hold
+ * all data described by the image request.
+ *
+ * For bio data each object request gets its own clone of the
+ * relevant part of the bio chain.  For page data each object
+ * request merely points into the caller's page array; the array
+ * remains owned by the caller.
+ *
+ * Every fully set-up object request holds a reference on the image
+ * request, dropped again in rbd_img_obj_callback().
+ *
+ * Returns 0 on success.  On failure all object requests added so
+ * far are removed again and -ENOMEM is returned.
+ */
+static int rbd_img_request_fill(struct rbd_img_request *img_request,
+					enum obj_request_type type,
+					void *data_desc)
+{
+	struct rbd_device *rbd_dev = img_request->rbd_dev;
+	struct rbd_obj_request *obj_request = NULL;
+	struct rbd_obj_request *next_obj_request;
+	bool write_request = img_request_write_test(img_request);
+	struct bio *bio_list = NULL;
+	unsigned int bio_offset = 0;
+	struct page **pages = NULL;
+	u64 img_offset;
+	u64 resid;
+	u16 opcode;
+
+	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
+		(int)type, data_desc);
+
+	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
+	img_offset = img_request->offset;
+	resid = img_request->length;
+	rbd_assert(resid > 0);
+
+	if (type == OBJ_REQUEST_BIO) {
+		bio_list = data_desc;
+		rbd_assert(img_offset ==
+			   bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
+	} else {
+		rbd_assert(type == OBJ_REQUEST_PAGES);
+		pages = data_desc;
+	}
+
+	while (resid) {
+		struct ceph_osd_request *osd_req;
+		const char *object_name;
+		u64 offset;
+		u64 length;
+		unsigned int which = 0;
+
+		object_name = rbd_segment_name(rbd_dev, img_offset);
+		if (!object_name)
+			goto out_unwind;
+		offset = rbd_segment_offset(rbd_dev, img_offset);
+		length = rbd_segment_length(rbd_dev, img_offset, resid);
+		obj_request = rbd_obj_request_create(object_name,
+						offset, length, type);
+		/* object request has its own copy of the object name */
+		rbd_segment_name_free(object_name);
+		if (!obj_request)
+			goto out_unwind;
+
+		/*
+		 * set obj_request->img_request before creating the
+		 * osd_request so that it gets the right snapc
+		 */
+		rbd_img_obj_request_add(img_request, obj_request);
+
+		if (type == OBJ_REQUEST_BIO) {
+			unsigned int clone_size;
+
+			rbd_assert(length <= (u64)UINT_MAX);
+			clone_size = (unsigned int)length;
+			obj_request->bio_list =
+					bio_chain_clone_range(&bio_list,
+								&bio_offset,
+								clone_size,
+								GFP_ATOMIC);
+			if (!obj_request->bio_list)
+				goto out_unwind;
+		} else {
+			unsigned int page_count;
+
+			obj_request->pages = pages;
+			page_count = (u32)calc_pages_for(offset, length);
+			obj_request->page_count = page_count;
+			/*
+			 * If this object's data ends mid-page, the next
+			 * object's data starts in that same page, so do
+			 * not step past it.
+			 */
+			if ((offset + length) & ~PAGE_MASK)
+				page_count--;	/* more on last page */
+			pages += page_count;
+		}
+
+		osd_req = rbd_osd_req_create(rbd_dev, write_request,
+					     (write_request ? 2 : 1),
+					     obj_request);
+		if (!osd_req)
+			goto out_unwind;
+		obj_request->osd_req = osd_req;
+		obj_request->callback = rbd_img_obj_callback;
+		rbd_img_request_get(img_request);
+
+		/* Data writes are preceded by an allocation hint op */
+		if (write_request) {
+			osd_req_op_alloc_hint_init(osd_req, which,
+					     rbd_obj_bytes(&rbd_dev->header),
+					     rbd_obj_bytes(&rbd_dev->header));
+			which++;
+		}
+
+		osd_req_op_extent_init(osd_req, which, opcode, offset, length,
+				       0, 0);
+		if (type == OBJ_REQUEST_BIO)
+			osd_req_op_extent_osd_data_bio(osd_req, which,
+					obj_request->bio_list, length);
+		else
+			osd_req_op_extent_osd_data_pages(osd_req, which,
+					obj_request->pages, length,
+					offset & ~PAGE_MASK, false, false);
+
+		if (write_request)
+			rbd_osd_req_format_write(obj_request);
+		else
+			rbd_osd_req_format_read(obj_request);
+
+		obj_request->img_offset = img_offset;
+
+		img_offset += length;
+		resid -= length;
+	}
+
+	return 0;
+
+out_unwind:
+	/*
+	 * NOTE(review): each fully set-up object request also took an
+	 * image request reference above; nothing visible here gives
+	 * those back on unwind -- confirm whether that is intended.
+	 */
+	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
+		/*
+		 * Image object requests don't own their page array:
+		 * they only point into the caller's vector.  Detach
+		 * it so that destroying the object request does not
+		 * release pages the caller will release itself.
+		 */
+		if (obj_request->type == OBJ_REQUEST_PAGES) {
+			obj_request->pages = NULL;
+			obj_request->page_count = 0;
+		}
+		rbd_img_obj_request_del(img_request, obj_request);
+	}
+
+	return -ENOMEM;
+}
+
+/*
+ * Completion callback for a copyup request.  Releases the page
+ * vector that carried the parent data, fixes up the transfer count
+ * of a successful request, and then continues with the normal
+ * image object completion path.
+ */
+static void
+rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
+{
+	struct rbd_img_request *img_request;
+	struct rbd_device *rbd_dev;
+	struct page **pages;
+	u32 page_count;
+
+	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
+	rbd_assert(obj_request_img_data_test(obj_request));
+	img_request = obj_request->img_request;
+	rbd_assert(img_request);
+
+	rbd_dev = img_request->rbd_dev;
+	rbd_assert(rbd_dev);
+
+	/* Take the copyup pages away from the request, then free them */
+	pages = obj_request->copyup_pages;
+	rbd_assert(pages != NULL);
+	obj_request->copyup_pages = NULL;
+	page_count = obj_request->copyup_page_count;
+	rbd_assert(page_count);
+	obj_request->copyup_page_count = 0;
+	ceph_release_page_vector(pages, page_count);
+
+	/*
+	 * We want the transfer count to reflect the size of the
+	 * original write request.  There is no such thing as a
+	 * successful short write, so if the request was successful
+	 * we can just set it to the originally-requested length.
+	 */
+	if (!obj_request->result)
+		obj_request->xferred = obj_request->length;
+
+	/* Finish up with the normal image object callback */
+
+	rbd_img_obj_callback(obj_request);
+}
+
+/*
+ * Callback for the parent image request issued by
+ * rbd_img_obj_parent_read_full().  Called when the read of the
+ * parent data has completed, successfully or not.
+ *
+ * On success the original object request gets a new three-op
+ * copyup osd request (copyup method call carrying the parent data,
+ * allocation hint, and the original write) which is then
+ * submitted.  If the parent overlap has meanwhile dropped to zero
+ * the original request is simply re-submitted unchanged.  Whenever
+ * nothing could be submitted, the error is recorded in the
+ * original request and that request is completed.
+ */
+static void
+rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
+{
+	struct rbd_obj_request *orig_request;
+	struct ceph_osd_request *osd_req;
+	struct ceph_osd_client *osdc;
+	struct rbd_device *rbd_dev;
+	struct page **pages;
+	u32 page_count;
+	int img_result;
+	u64 parent_length;
+	u64 offset;
+	u64 length;
+
+	rbd_assert(img_request_child_test(img_request));
+
+	/* First get what we need from the image request */
+
+	pages = img_request->copyup_pages;
+	rbd_assert(pages != NULL);
+	img_request->copyup_pages = NULL;
+	page_count = img_request->copyup_page_count;
+	rbd_assert(page_count);
+	img_request->copyup_page_count = 0;
+
+	orig_request = img_request->obj_request;
+	rbd_assert(orig_request != NULL);
+	rbd_assert(obj_request_type_valid(orig_request->type));
+	img_result = img_request->result;
+	parent_length = img_request->length;
+	/*
+	 * The aggregate transfer count is only computed for image
+	 * requests that finished without error (see
+	 * rbd_img_request_complete()), so it can only be checked
+	 * in that case.
+	 */
+	rbd_assert(img_result || parent_length == img_request->xferred);
+	rbd_img_request_put(img_request);
+
+	rbd_assert(orig_request->img_request);
+	rbd_dev = orig_request->img_request->rbd_dev;
+	rbd_assert(rbd_dev);
+	osdc = &rbd_dev->rbd_client->client->osdc;
+
+	/*
+	 * If the overlap has become 0 (most likely because the
+	 * image has been flattened) we need to free the pages
+	 * and re-submit the original write request.
+	 */
+	if (!rbd_dev->parent_overlap) {
+		ceph_release_page_vector(pages, page_count);
+		img_result = rbd_obj_request_submit(osdc, orig_request);
+		if (!img_result)
+			return;
+	}
+
+	if (img_result)
+		goto out_err;
+
+	/*
+	 * The original osd request is of no use to us any more.
+	 * We need a new one that can hold the three ops in a copyup
+	 * request.  Allocate the new copyup osd request for the
+	 * original request, and release the old one.
+	 */
+	img_result = -ENOMEM;
+	osd_req = rbd_osd_req_create_copyup(orig_request);
+	if (!osd_req)
+		goto out_err;
+	rbd_osd_req_destroy(orig_request->osd_req);
+	orig_request->osd_req = osd_req;
+	orig_request->copyup_pages = pages;
+	orig_request->copyup_page_count = page_count;
+
+	/* Initialize the copyup op */
+
+	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
+	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
+						false, false);
+
+	/* Then the hint op */
+
+	osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header),
+				   rbd_obj_bytes(&rbd_dev->header));
+
+	/* And the original write request op */
+
+	offset = orig_request->offset;
+	length = orig_request->length;
+	osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE,
+					offset, length, 0, 0);
+	if (orig_request->type == OBJ_REQUEST_BIO)
+		osd_req_op_extent_osd_data_bio(osd_req, 2,
+					orig_request->bio_list, length);
+	else
+		osd_req_op_extent_osd_data_pages(osd_req, 2,
+					orig_request->pages, length,
+					offset & ~PAGE_MASK, false, false);
+
+	rbd_osd_req_format_write(orig_request);
+
+	/* All set, send it off. */
+
+	orig_request->callback = rbd_img_obj_copyup_callback;
+	img_result = rbd_obj_request_submit(osdc, orig_request);
+	if (!img_result)
+		return;
+out_err:
+	/* Record the error code and complete the request */
+
+	orig_request->result = img_result;
+	orig_request->xferred = 0;
+	obj_request_done_set(orig_request);
+	rbd_obj_request_complete(orig_request);
+}
+
+/*
+ * Read from the parent image the range of data that covers the
+ * entire target of the given object request.  This is used for
+ * satisfying a layered image write request when the target of an
+ * object request from the image request does not exist.
+ *
+ * A page array big enough to hold the returned data is allocated
+ * and supplied to rbd_img_request_fill() as the "data descriptor."
+ * When the read completes, this page array will be transferred to
+ * the original object request for the copyup operation.
+ *
+ * If an error occurs, record it as the result of the original
+ * object request and mark it done so it gets completed.
+ *
+ * Returns 0 if the parent read was submitted, or a negative errno.
+ */
+static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
+{
+	struct rbd_img_request *img_request = NULL;
+	struct rbd_img_request *parent_request = NULL;
+	struct rbd_device *rbd_dev;
+	u64 img_offset;
+	u64 length;
+	struct page **pages = NULL;
+	u32 page_count;
+	int result;
+
+	rbd_assert(obj_request_img_data_test(obj_request));
+	rbd_assert(obj_request_type_valid(obj_request->type));
+
+	img_request = obj_request->img_request;
+	rbd_assert(img_request != NULL);
+	rbd_dev = img_request->rbd_dev;
+	rbd_assert(rbd_dev->parent != NULL);
+
+	/*
+	 * Determine the byte range covered by the object in the
+	 * child image to which the original request was to be sent.
+	 */
+	img_offset = obj_request->img_offset - obj_request->offset;
+	length = (u64)1 << rbd_dev->header.obj_order;
+
+	/*
+	 * There is no defined parent data beyond the parent
+	 * overlap, so limit what we read at that boundary if
+	 * necessary.
+	 */
+	if (img_offset + length > rbd_dev->parent_overlap) {
+		rbd_assert(img_offset < rbd_dev->parent_overlap);
+		length = rbd_dev->parent_overlap - img_offset;
+	}
+
+	/*
+	 * Allocate a page array big enough to receive the data read
+	 * from the parent.
+	 */
+	page_count = (u32)calc_pages_for(0, length);
+	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
+	if (IS_ERR(pages)) {
+		result = PTR_ERR(pages);
+		/* Keep the error path from releasing an error pointer */
+		pages = NULL;
+		goto out_err;
+	}
+
+	result = -ENOMEM;
+	parent_request = rbd_parent_request_create(obj_request,
+						img_offset, length);
+	if (!parent_request)
+		goto out_err;
+
+	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
+	if (result)
+		goto out_err;
+	parent_request->copyup_pages = pages;
+	parent_request->copyup_page_count = page_count;
+
+	parent_request->callback = rbd_img_obj_parent_read_full_callback;
+	result = rbd_img_request_submit(parent_request);
+	if (!result)
+		return 0;
+
+	/*
+	 * NOTE(review): the back-pointer is cleared and its reference
+	 * dropped here, yet rbd_parent_request_destroy() puts
+	 * parent_request->obj_request unconditionally -- confirm the
+	 * rbd_img_request_put() below can never be the final put.
+	 */
+	parent_request->copyup_pages = NULL;
+	parent_request->copyup_page_count = 0;
+	parent_request->obj_request = NULL;
+	rbd_obj_request_put(obj_request);
+out_err:
+	if (pages)
+		ceph_release_page_vector(pages, page_count);
+	if (parent_request)
+		rbd_img_request_put(parent_request);
+	obj_request->result = result;
+	obj_request->xferred = 0;
+	obj_request_done_set(obj_request);
+
+	return result;
+}
+
+/*
+ * Completion callback for the STAT request issued by
+ * rbd_img_obj_exists_submit().  Records in the original object
+ * request whether its target object exists (a result of -ENOENT
+ * means it does not) and then resubmits the original request.  Any
+ * other error is copied to the original request, which is then
+ * completed.
+ */
+static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
+{
+	struct rbd_obj_request *orig_request;
+	struct rbd_device *rbd_dev;
+	int result;
+
+	rbd_assert(!obj_request_img_data_test(obj_request));
+
+	/*
+	 * All we need from the object request is the original
+	 * request and the result of the STAT op.  Grab those, then
+	 * we're done with the request.
+	 */
+	orig_request = obj_request->obj_request;
+	/* Validate the pointer before it is handed to the put below */
+	rbd_assert(orig_request);
+	obj_request->obj_request = NULL;
+	rbd_obj_request_put(orig_request);
+	rbd_assert(orig_request->img_request);
+
+	result = obj_request->result;
+	obj_request->result = 0;
+
+	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
+		obj_request, orig_request, result,
+		obj_request->xferred, obj_request->length);
+	rbd_obj_request_put(obj_request);
+
+	/*
+	 * If the overlap has become 0 (most likely because the
+	 * image has been flattened) we need to re-submit the
+	 * original write request.
+	 */
+	rbd_dev = orig_request->img_request->rbd_dev;
+	if (!rbd_dev->parent_overlap) {
+		struct ceph_osd_client *osdc;
+
+		osdc = &rbd_dev->rbd_client->client->osdc;
+		result = rbd_obj_request_submit(osdc, orig_request);
+		if (!result)
+			return;
+	}
+
+	/*
+	 * Our only purpose here is to determine whether the object
+	 * exists, and we don't want to treat the non-existence as
+	 * an error.  If something else comes back, transfer the
+	 * error to the original request and complete it now.
+	 */
+	if (!result) {
+		obj_request_existence_set(orig_request, true);
+	} else if (result == -ENOENT) {
+		obj_request_existence_set(orig_request, false);
+	} else {
+		orig_request->result = result;
+		goto out;
+	}
+
+	/*
+	 * Resubmit the original request now that we have recorded
+	 * whether the target object exists.
+	 */
+	orig_request->result = rbd_img_obj_request_submit(orig_request);
+out:
+	if (orig_request->result)
+		rbd_obj_request_complete(orig_request);
+}
+
+/*
+ * Issue a STAT request for the target object of "obj_request" so
+ * we can learn whether that object exists.  The outcome is handled
+ * by rbd_img_obj_exists_callback().
+ *
+ * A reference to the original object request is held for as long
+ * as the STAT request points at it.  Returns 0 if the STAT request
+ * was submitted, or a negative errno.  On failure everything set up
+ * here is undone: the extra reference on "obj_request" is dropped
+ * and the STAT request (along with its page vector) is released.
+ */
+static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
+{
+	struct rbd_obj_request *stat_request;
+	struct rbd_device *rbd_dev;
+	struct ceph_osd_client *osdc;
+	struct page **pages = NULL;
+	u32 page_count;
+	size_t size;
+	int ret;
+
+	/*
+	 * The response data for a STAT call consists of:
+	 *     le64 length;
+	 *     struct {
+	 *         le32 tv_sec;
+	 *         le32 tv_nsec;
+	 *     } mtime;
+	 */
+	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
+	page_count = (u32)calc_pages_for(0, size);
+	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
+							OBJ_REQUEST_PAGES);
+	if (!stat_request) {
+		/*
+		 * No reference on obj_request has been taken yet, so
+		 * only the page vector needs to be given back.
+		 */
+		ceph_release_page_vector(pages, page_count);
+		return -ENOMEM;
+	}
+
+	/*
+	 * From here on the page vector is attached to stat_request;
+	 * as in rbd_obj_read_sync(), dropping the request is what
+	 * releases it.
+	 */
+	rbd_obj_request_get(obj_request);
+	stat_request->obj_request = obj_request;
+	stat_request->pages = pages;
+	stat_request->page_count = page_count;
+
+	rbd_assert(obj_request->img_request);
+	rbd_dev = obj_request->img_request->rbd_dev;
+	ret = -ENOMEM;
+	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
+						   stat_request);
+	if (!stat_request->osd_req)
+		goto out_err;
+	stat_request->callback = rbd_img_obj_exists_callback;
+
+	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
+	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
+					false, false);
+	rbd_osd_req_format_read(stat_request);
+
+	osdc = &rbd_dev->rbd_client->client->osdc;
+	ret = rbd_obj_request_submit(osdc, stat_request);
+	if (!ret)
+		return 0;
+out_err:
+	/* Undo the reference taken above, then drop the STAT request */
+	stat_request->obj_request = NULL;
+	rbd_obj_request_put(obj_request);
+	rbd_obj_request_put(stat_request);
+
+	return ret;
+}
+
+/*
+ * Submit one object request belonging to an image request, routing
+ * layered writes through the existence check / parent read paths
+ * when needed.  Returns the result of whichever submit path is used.
+ */
+static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
+{
+	struct rbd_img_request *img_request;
+	struct rbd_device *rbd_dev;
+	bool known = false;
+	bool simple;
+
+	rbd_assert(obj_request_img_data_test(obj_request));
+
+	img_request = obj_request->img_request;
+	rbd_assert(img_request);
+	rbd_dev = img_request->rbd_dev;
+
+	/*
+	 * Reads, writes to non-layered images, and layered writes
+	 * that lie entirely beyond the parent overlap involve no
+	 * parent data and are plain object requests.
+	 */
+	simple = !img_request_write_test(img_request) ||
+		 !img_request_layered_test(img_request) ||
+		 !obj_request_overlaps_parent(obj_request);
+
+	/*
+	 * A layered write whose target object is known to exist is
+	 * plain as well: the parent data was already copied up.
+	 */
+	if (!simple) {
+		known = obj_request_known_test(obj_request);
+		simple = known && obj_request_exists_test(obj_request);
+	}
+
+	if (simple) {
+		struct ceph_osd_client *osdc;
+
+		osdc = &rbd_dev->rbd_client->client->osdc;
+
+		return rbd_obj_request_submit(osdc, obj_request);
+	}
+
+	/* Existence of the target is unknown.  Go find out. */
+
+	if (!known)
+		return rbd_img_obj_exists_submit(obj_request);
+
+	/*
+	 * The target is known not to exist.  Read the data for the
+	 * full target object from the parent so it can be used for
+	 * a copyup to the target.
+	 */
+	return rbd_img_obj_parent_read_full(obj_request);
+}
+
+/*
+ * Submit every object request of an image request in turn.  Stops
+ * at, and returns, the first non-zero result; returns 0 otherwise.
+ */
+static int rbd_img_request_submit(struct rbd_img_request *img_request)
+{
+	struct rbd_obj_request *cur;
+	struct rbd_obj_request *next;
+	int ret = 0;
+
+	dout("%s: img %p\n", __func__, img_request);
+	for_each_obj_request_safe(img_request, cur, next) {
+		ret = rbd_img_obj_request_submit(cur);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * Callback for the parent image read issued by rbd_img_parent_read().
+ * Transfers the outcome of the parent image request to the original
+ * object request and completes that request.
+ */
+static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
+{
+	struct rbd_obj_request *obj_request;
+	struct rbd_device *rbd_dev;
+	u64 obj_end;
+	u64 img_xferred;
+	int img_result;
+
+	rbd_assert(img_request_child_test(img_request));
+
+	/* First get what we need from the image request and release it */
+
+	obj_request = img_request->obj_request;
+	img_xferred = img_request->xferred;
+	img_result = img_request->result;
+	rbd_img_request_put(img_request);
+
+	/*
+	 * If the overlap has become 0 (most likely because the
+	 * image has been flattened) we need to re-submit the
+	 * original request.
+	 */
+	rbd_assert(obj_request);
+	rbd_assert(obj_request->img_request);
+	rbd_dev = obj_request->img_request->rbd_dev;
+	if (!rbd_dev->parent_overlap) {
+		struct ceph_osd_client *osdc;
+
+		osdc = &rbd_dev->rbd_client->client->osdc;
+		img_result = rbd_obj_request_submit(osdc, obj_request);
+		/* Resubmitted; nothing more to do for this request here */
+		if (!img_result)
+			return;
+	}
+
+	/* Any error (from the parent read or the resubmit) ends it here */
+	obj_request->result = img_result;
+	if (obj_request->result)
+		goto out;
+
+	/*
+	 * We need to zero anything beyond the parent overlap
+	 * boundary.  Since rbd_img_obj_request_read_callback()
+	 * will zero anything beyond the end of a short read, an
+	 * easy way to do this is to pretend the data from the
+	 * parent came up short--ending at the overlap boundary.
+	 */
+	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
+	obj_end = obj_request->img_offset + obj_request->length;
+	if (obj_end > rbd_dev->parent_overlap) {
+		/* Bytes of this request that lie inside the overlap */
+		u64 xferred = 0;
+
+		if (obj_request->img_offset < rbd_dev->parent_overlap)
+			xferred = rbd_dev->parent_overlap -
+					obj_request->img_offset;
+
+		obj_request->xferred = min(img_xferred, xferred);
+	} else {
+		obj_request->xferred = img_xferred;
+	}
+out:
+	rbd_img_obj_request_read_callback(obj_request);
+	rbd_obj_request_complete(obj_request);
+}
+
+/*
+ * Satisfy an object read that came back -ENOENT by reading the same
+ * image range from the parent image instead.  The parent request is
+ * filled with the original request's own bio list or page array.
+ * On failure the error is recorded in the original request, which
+ * is marked done.
+ */
+static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
+{
+	struct rbd_img_request *parent_request;
+	int result;
+
+	rbd_assert(obj_request_img_data_test(obj_request));
+	rbd_assert(obj_request->img_request != NULL);
+	rbd_assert(obj_request->result == (s32) -ENOENT);
+	rbd_assert(obj_request_type_valid(obj_request->type));
+
+	parent_request = rbd_parent_request_create(obj_request,
+						   obj_request->img_offset,
+						   obj_request->length);
+	if (!parent_request) {
+		result = -ENOMEM;
+		goto out_fail;
+	}
+
+	if (obj_request->type == OBJ_REQUEST_BIO)
+		result = rbd_img_request_fill(parent_request, OBJ_REQUEST_BIO,
+						obj_request->bio_list);
+	else
+		result = rbd_img_request_fill(parent_request,
+						OBJ_REQUEST_PAGES,
+						obj_request->pages);
+	if (result)
+		goto out_put;
+
+	parent_request->callback = rbd_img_parent_read_callback;
+	result = rbd_img_request_submit(parent_request);
+	if (!result)
+		return;
+out_put:
+	rbd_img_request_put(parent_request);
+out_fail:
+	obj_request->result = result;
+	obj_request->xferred = 0;
+	obj_request_done_set(obj_request);
+}
+
+/*
+ * Send a NOTIFY_ACK for "notify_id" on the header object and wait
+ * for the request to finish.  Returns 0 or a negative error code
+ * from allocation, submission or the wait.
+ */
+static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct rbd_obj_request *ack_request;
+	int ret;
+
+	ack_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
+							OBJ_REQUEST_NODATA);
+	if (!ack_request)
+		return -ENOMEM;
+
+	ack_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
+						  ack_request);
+	if (!ack_request->osd_req) {
+		ret = -ENOMEM;
+		goto out_put;
+	}
+
+	osd_req_op_watch_init(ack_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
+					notify_id, 0, 0);
+	rbd_osd_req_format_read(ack_request);
+
+	ret = rbd_obj_request_submit(osdc, ack_request);
+	if (!ret)
+		ret = rbd_obj_request_wait(ack_request);
+out_put:
+	rbd_obj_request_put(ack_request);
+
+	return ret;
+}
+
+/*
+ * Watch event callback registered by rbd_dev_header_watch_sync().
+ * "data" is the rbd device the event was created for.  Refreshes
+ * the device's header information and then acknowledges the
+ * notification.  The acknowledgement is sent even if the refresh
+ * failed; failures of either step are only logged.
+ */
+static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
+{
+	struct rbd_device *rbd_dev = (struct rbd_device *)data;
+	int ret;
+
+	if (!rbd_dev)
+		return;
+
+	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
+		rbd_dev->header_name, (unsigned long long)notify_id,
+		(unsigned int)opcode);
+	ret = rbd_dev_refresh(rbd_dev);
+	if (ret)
+		rbd_warn(rbd_dev, "header refresh error (%d)\n", ret);
+
+	/* Don't let a failed acknowledgement go unnoticed */
+	ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
+	if (ret)
+		rbd_warn(rbd_dev, "notify ack error (%d)\n", ret);
+}
+
+/*
+ * Initiate a watch request, synchronously.
+ *
+ * Creates the watch event (with rbd_watch_cb() as its callback) and
+ * submits a lingering WATCH request on the header object, waiting
+ * for it to complete.  On success rbd_dev->watch_event and
+ * rbd_dev->watch_request are both set and 0 is returned.  On
+ * failure a negative error code is returned and both are left NULL.
+ */
+static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct rbd_obj_request *obj_request;
+	int ret;
+
+	rbd_assert(!rbd_dev->watch_event);
+	rbd_assert(!rbd_dev->watch_request);
+
+	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
+				     &rbd_dev->watch_event);
+	if (ret < 0)
+		return ret;
+
+	rbd_assert(rbd_dev->watch_event);
+
+	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
+					     OBJ_REQUEST_NODATA);
+	if (!obj_request) {
+		ret = -ENOMEM;
+		goto out_cancel;
+	}
+
+	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
+						  obj_request);
+	if (!obj_request->osd_req) {
+		ret = -ENOMEM;
+		goto out_put;
+	}
+
+	ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
+
+	/*
+	 * NOTE(review): the final argument (1 here, 0 in the unwatch
+	 * path) presumably selects starting vs. stopping the watch --
+	 * confirm against osd_req_op_watch_init().
+	 */
+	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
+			      rbd_dev->watch_event->cookie, 0, 1);
+	rbd_osd_req_format_write(obj_request);
+
+	ret = rbd_obj_request_submit(osdc, obj_request);
+	if (ret)
+		goto out_linger;
+
+	ret = rbd_obj_request_wait(obj_request);
+	if (ret)
+		goto out_linger;
+
+	ret = obj_request->result;
+	if (ret)
+		goto out_linger;
+
+	/*
+	 * A watch request is set to linger, so the underlying osd
+	 * request won't go away until we unregister it.  We retain
+	 * a pointer to the object request during that time (in
+	 * rbd_dev->watch_request), so we'll keep a reference to
+	 * it.  We'll drop that reference (below) after we've
+	 * unregistered it.
+	 */
+	rbd_dev->watch_request = obj_request;
+
+	return 0;
+
+	/* Error unwinding, in reverse order of setup */
+out_linger:
+	ceph_osdc_unregister_linger_request(osdc, obj_request->osd_req);
+out_put:
+	rbd_obj_request_put(obj_request);
+out_cancel:
+	ceph_osdc_cancel_event(rbd_dev->watch_event);
+	rbd_dev->watch_event = NULL;
+
+	return ret;
+}
+
+/*
+ * Tear down a watch request, synchronously.
+ *
+ * Sends a WATCH request that undoes the one set up by
+ * rbd_dev_header_watch_sync() and waits for it.  On success the
+ * lingering watch request is unregistered and dropped.  Returns 0
+ * or a negative error code.
+ *
+ * NOTE(review): the watch event is cancelled and cleared on every
+ * path, including failures, whereas rbd_dev->watch_request is only
+ * cleared on success.
+ */
+static int __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct rbd_obj_request *obj_request;
+	int ret;
+
+	rbd_assert(rbd_dev->watch_event);
+	rbd_assert(rbd_dev->watch_request);
+
+	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
+					     OBJ_REQUEST_NODATA);
+	if (!obj_request) {
+		ret = -ENOMEM;
+		goto out_cancel;
+	}
+
+	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
+						  obj_request);
+	if (!obj_request->osd_req) {
+		ret = -ENOMEM;
+		goto out_put;
+	}
+
+	/* Same cookie as the original watch, final argument 0 this time */
+	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
+			      rbd_dev->watch_event->cookie, 0, 0);
+	rbd_osd_req_format_write(obj_request);
+
+	ret = rbd_obj_request_submit(osdc, obj_request);
+	if (ret)
+		goto out_put;
+
+	ret = rbd_obj_request_wait(obj_request);
+	if (ret)
+		goto out_put;
+
+	ret = obj_request->result;
+	if (ret)
+		goto out_put;
+
+	/* We have successfully torn down the watch request */
+
+	ceph_osdc_unregister_linger_request(osdc,
+					    rbd_dev->watch_request->osd_req);
+	rbd_obj_request_put(rbd_dev->watch_request);
+	rbd_dev->watch_request = NULL;
+
+	/* The success path falls through the cleanup labels as well */
+out_put:
+	rbd_obj_request_put(obj_request);
+out_cancel:
+	ceph_osdc_cancel_event(rbd_dev->watch_event);
+	rbd_dev->watch_event = NULL;
+
+	return ret;
+}
+
+/*
+ * Tear down the header watch, logging a warning (rather than
+ * returning an error) if that fails.
+ */
+static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
+{
+	int ret = __rbd_dev_header_unwatch_sync(rbd_dev);
+
+	if (!ret)
+		return;
+
+	rbd_warn(rbd_dev, "unable to tear down watch request: %d\n", ret);
+}
+
+/*
+ * Synchronous osd object method call.  Returns the number of bytes
+ * returned in the inbound buffer, or a negative error code.
+ *
+ * "outbound"/"outbound_size" describe the parameter data sent with
+ * the call (none if outbound_size is 0); the reply is copied into
+ * "inbound", for which "inbound_size" bytes of page space are
+ * allocated.
+ */
+static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
+			     const char *object_name,
+			     const char *class_name,
+			     const char *method_name,
+			     const void *outbound,
+			     size_t outbound_size,
+			     void *inbound,
+			     size_t inbound_size)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct rbd_obj_request *obj_request;
+	struct page **pages;
+	u32 page_count;
+	int ret;
+
+	/*
+	 * Method calls are ultimately read operations.  The result
+	 * should be placed into the inbound buffer provided.  They
+	 * also supply outbound data--parameters for the object
+	 * method.  Currently if this is present it will be a
+	 * snapshot id.
+	 */
+	page_count = (u32)calc_pages_for(0, inbound_size);
+	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	ret = -ENOMEM;
+	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
+							OBJ_REQUEST_PAGES);
+	if (!obj_request)
+		goto out;
+
+	/* The pages now travel with the object request (see "out") */
+	obj_request->pages = pages;
+	obj_request->page_count = page_count;
+
+	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
+						  obj_request);
+	if (!obj_request->osd_req)
+		goto out;
+
+	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
+					class_name, method_name);
+	if (outbound_size) {
+		struct ceph_pagelist *pagelist;
+
+		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
+		if (!pagelist)
+			goto out;
+
+		/*
+		 * NOTE(review): the return value of
+		 * ceph_pagelist_append() is not checked here.
+		 */
+		ceph_pagelist_init(pagelist);
+		ceph_pagelist_append(pagelist, outbound, outbound_size);
+		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
+						pagelist);
+	}
+	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
+					obj_request->pages, inbound_size,
+					0, false, false);
+	rbd_osd_req_format_read(obj_request);
+
+	ret = rbd_obj_request_submit(osdc, obj_request);
+	if (ret)
+		goto out;
+	ret = rbd_obj_request_wait(obj_request);
+	if (ret)
+		goto out;
+
+	ret = obj_request->result;
+	if (ret < 0)
+		goto out;
+
+	/* The byte count is returned as an int, so it must fit in one */
+	rbd_assert(obj_request->xferred < (u64)INT_MAX);
+	ret = (int)obj_request->xferred;
+	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
+out:
+	/*
+	 * If no object request was created the page vector is still
+	 * ours to release; otherwise only the request is dropped.
+	 */
+	if (obj_request)
+		rbd_obj_request_put(obj_request);
+	else
+		ceph_release_page_vector(pages, page_count);
+
+	return ret;
+}
+
+/*
+ * Request function for the rbd request queue (installed by
+ * rbd_init_disk()).  Entered with q->queue_lock held, as the
+ * annotations below state.  For each fetched request the lock is
+ * dropped while an image request is built and submitted, then
+ * retaken.  Requests that fail here are ended immediately with the
+ * error; successfully submitted ones are not ended in this function.
+ */
+static void rbd_request_fn(struct request_queue *q)
+		__releases(q->queue_lock) __acquires(q->queue_lock)
+{
+	struct rbd_device *rbd_dev = q->queuedata;
+	struct request *rq;
+	int result;
+
+	while ((rq = blk_fetch_request(q))) {
+		bool write_request = rq_data_dir(rq) == WRITE;
+		struct rbd_img_request *img_request;
+		u64 offset;
+		u64 length;
+
+		/* Ignore any non-FS requests that filter through. */
+
+		if (rq->cmd_type != REQ_TYPE_FS) {
+			dout("%s: non-fs request type %d\n", __func__,
+				(int) rq->cmd_type);
+			__blk_end_request_all(rq, 0);
+			continue;
+		}
+
+		/* Ignore/skip any zero-length requests */
+
+		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
+		length = (u64) blk_rq_bytes(rq);
+
+		if (!length) {
+			dout("%s: zero-length request\n", __func__);
+			__blk_end_request_all(rq, 0);
+			continue;
+		}
+
+		/* Everything up to end_request runs without the queue lock */
+		spin_unlock_irq(q->queue_lock);
+
+		/* Disallow writes to a read-only device */
+
+		if (write_request) {
+			result = -EROFS;
+			if (rbd_dev->mapping.read_only)
+				goto end_request;
+			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
+		}
+
+		/*
+		 * Quit early if the mapped snapshot no longer
+		 * exists.  It's still possible the snapshot will
+		 * have disappeared by the time our request arrives
+		 * at the osd, but there's no sense in sending it if
+		 * we already know.
+		 */
+		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
+			dout("request for non-existent snapshot");
+			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
+			result = -ENXIO;
+			goto end_request;
+		}
+
+		/* Reject a range whose end would wrap past U64_MAX */
+		result = -EINVAL;
+		if (offset && length > U64_MAX - offset + 1) {
+			rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
+				offset, length);
+			goto end_request;	/* Shouldn't happen */
+		}
+
+		/* Reject a range that extends past the mapped size */
+		result = -EIO;
+		if (offset + length > rbd_dev->mapping.size) {
+			rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
+				offset, length, rbd_dev->mapping.size);
+			goto end_request;
+		}
+
+		result = -ENOMEM;
+		img_request = rbd_img_request_create(rbd_dev, offset, length,
+							write_request);
+		if (!img_request)
+			goto end_request;
+
+		img_request->rq = rq;
+
+		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
+						rq->bio);
+		if (!result)
+			result = rbd_img_request_submit(img_request);
+		/* Fill or submit failed: drop the image request again */
+		if (result)
+			rbd_img_request_put(img_request);
+end_request:
+		spin_lock_irq(q->queue_lock);
+		if (result < 0) {
+			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
+				write_request ? "write" : "read",
+				length, offset, result);
+
+			__blk_end_request_all(rq, result);
+		}
+	}
+}
+
+/*
+ * Queue merge_bvec callback.  Limits how much of "bvec" may be
+ * added to a bio so that the bio does not span multiple osd
+ * objects.  The one exception is a bio that is still empty, which
+ * must always accept a single page; such single-page bios are
+ * handled later in bio_chain_clone_range().
+ */
+static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
+			  struct bio_vec *bvec)
+{
+	struct rbd_device *rbd_dev = q->queuedata;
+	sector_t dev_sector;
+	sector_t obj_sectors;
+	sector_t sector_in_obj;
+	int room;
+
+	/*
+	 * Translate the partition-relative start sector of the bio
+	 * into a device-relative one, then find how far into its
+	 * rbd object that sector lies.
+	 */
+	dev_sector = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
+	obj_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
+	sector_in_obj = dev_sector & (obj_sectors - 1);
+
+	/*
+	 * Bytes left between that point and the end of the object,
+	 * less whatever the bio already holds.
+	 */
+	room = (int) (obj_sectors - sector_in_obj) << SECTOR_SHIFT;
+	room = room > bmd->bi_size ? room - bmd->bi_size : 0;
+
+	/*
+	 * Never report more than was asked for.  An empty bio gets
+	 * the whole bvec regardless, because:  "Note that a block
+	 * device *must* allow a single page to be added to an empty
+	 * bio."
+	 */
+	rbd_assert(bvec->bv_len <= PAGE_SIZE);
+	if (!bmd->bi_size || room > (int) bvec->bv_len)
+		room = (int) bvec->bv_len;
+
+	return room;
+}
+
+/*
+ * Detach and release the gendisk of an rbd device, if it has one.
+ * A disk that is up is deleted first, and its queue (if any) is
+ * cleaned up.
+ */
+static void rbd_free_disk(struct rbd_device *rbd_dev)
+{
+	struct gendisk *disk = rbd_dev->disk;
+
+	if (disk) {
+		rbd_dev->disk = NULL;
+		if (disk->flags & GENHD_FL_UP) {
+			del_gendisk(disk);
+			if (disk->queue)
+				blk_cleanup_queue(disk->queue);
+		}
+		put_disk(disk);
+	}
+}
+
+/*
+ * Synchronously read "length" bytes starting at "offset" from the
+ * named object and copy what was transferred into "buf".  Returns
+ * the number of bytes transferred, or a negative error code.
+ */
+static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
+				const char *object_name,
+				u64 offset, u64 length, void *buf)
+
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct rbd_obj_request *obj_request;
+	struct page **pages = NULL;
+	u32 page_count;
+	size_t size;
+	int ret;
+
+	page_count = (u32) calc_pages_for(offset, length);
+	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
+	/*
+	 * Bail out right away on allocation failure; "pages" holds
+	 * an ERR_PTR here and must not be used or released.
+	 */
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	ret = -ENOMEM;
+	obj_request = rbd_obj_request_create(object_name, offset, length,
+							OBJ_REQUEST_PAGES);
+	if (!obj_request)
+		goto out;
+
+	/* The pages now travel with the object request (see "out") */
+	obj_request->pages = pages;
+	obj_request->page_count = page_count;
+
+	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
+						  obj_request);
+	if (!obj_request->osd_req)
+		goto out;
+
+	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
+					offset, length, 0, 0);
+	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
+					obj_request->pages,
+					obj_request->length,
+					obj_request->offset & ~PAGE_MASK,
+					false, false);
+	rbd_osd_req_format_read(obj_request);
+
+	ret = rbd_obj_request_submit(osdc, obj_request);
+	if (ret)
+		goto out;
+	ret = rbd_obj_request_wait(obj_request);
+	if (ret)
+		goto out;
+
+	ret = obj_request->result;
+	if (ret < 0)
+		goto out;
+
+	/* The byte count is returned as an int, so it must fit in one */
+	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
+	size = (size_t) obj_request->xferred;
+	ceph_copy_from_page_vector(pages, buf, 0, size);
+	rbd_assert(size <= (size_t)INT_MAX);
+	ret = (int)size;
+out:
+	/*
+	 * If no object request was created the page vector is still
+	 * ours to release; otherwise only the request is dropped.
+	 */
+	if (obj_request)
+		rbd_obj_request_put(obj_request);
+	else
+		ceph_release_page_vector(pages, page_count);
+
+	return ret;
+}
+
+/*
+ * Read the complete header for the given rbd device.  On successful
+ * return, the rbd_dev->header field will contain up-to-date
+ * information about the image.
+ *
+ * Returns the result of rbd_header_from_disk() once a consistent
+ * header has been read, -ENOMEM if the read buffer cannot be
+ * allocated, -ENXIO for a short or invalid header, or the error
+ * from the read itself.
+ */
+static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
+{
+	struct rbd_image_header_ondisk *ondisk = NULL;
+	u32 snap_count = 0;
+	u64 names_size = 0;
+	u32 want_count;
+	int ret;
+
+	/*
+	 * The complete header will include an array of its 64-bit
+	 * snapshot ids, followed by the names of those snapshots as
+	 * a contiguous block of NUL-terminated strings.  Note that
+	 * the number of snapshots could change by the time we read
+	 * it in, in which case we re-read it.
+	 */
+	do {
+		size_t size;
+
+		/* Discard the buffer from the previous pass, if any */
+		kfree(ondisk);
+
+		size = sizeof (*ondisk);
+		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
+		size += names_size;
+		ondisk = kmalloc(size, GFP_KERNEL);
+		if (!ondisk)
+			return -ENOMEM;
+
+		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
+				       0, size, ondisk);
+		if (ret < 0)
+			goto out;
+		if ((size_t)ret < size) {
+			/* Report the real byte count before replacing it */
+			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
+				size, ret);
+			ret = -ENXIO;
+			goto out;
+		}
+		if (!rbd_dev_ondisk_valid(ondisk)) {
+			ret = -ENXIO;
+			rbd_warn(rbd_dev, "invalid header");
+			goto out;
+		}
+
+		/*
+		 * Size the next read from what the header just told
+		 * us; loop until the snapshot count we sized for is
+		 * the one we got back.
+		 */
+		names_size = le64_to_cpu(ondisk->snap_names_len);
+		want_count = snap_count;
+		snap_count = le32_to_cpu(ondisk->snap_count);
+	} while (snap_count != want_count);
+
+	ret = rbd_header_from_disk(rbd_dev, ondisk);
+out:
+	kfree(ondisk);
+
+	return ret;
+}
+
+/*
+ * If the device is mapped to a snapshot that can no longer be found
+ * in the (just updated) snapshot context, clear the device's EXISTS
+ * flag.  Nothing is done if the flag is already clear or the base
+ * image (CEPH_NOSNAP) is mapped.
+ */
+static void rbd_exists_validate(struct rbd_device *rbd_dev)
+{
+	u64 mapped_snap;
+
+	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
+		return;
+
+	mapped_snap = rbd_dev->spec->snap_id;
+	if (mapped_snap != CEPH_NOSNAP &&
+	    rbd_dev_snap_index(rbd_dev, mapped_snap) == BAD_SNAP_INDEX)
+		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
+}
+
+/*
+ * Propagate the current mapping size to the gendisk, unless the
+ * device is in the process of being removed.
+ */
+static void rbd_dev_update_size(struct rbd_device *rbd_dev)
+{
+	bool removing;
+	sector_t sectors;
+
+	/*
+	 * Only sample the REMOVING flag under the lock.  Holding it
+	 * across the disk operations would make lock ordering
+	 * conflict with the bdev mutex via:
+	 * rbd_add() -> blkdev_get() -> rbd_open()
+	 */
+	spin_lock_irq(&rbd_dev->lock);
+	removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
+	spin_unlock_irq(&rbd_dev->lock);
+
+	/*
+	 * A device being removed has had rbd_dev->disk destroyed,
+	 * so there is no size to update.
+	 */
+	if (removing)
+		return;
+
+	sectors = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
+	dout("setting size to %llu sectors", (unsigned long long)sectors);
+	set_capacity(rbd_dev->disk, sectors);
+	revalidate_disk(rbd_dev->disk);
+}
+
+/*
+ * Re-read the image header (format 1 or 2) under header_rwsem,
+ * revalidate the EXISTS flag, and update the disk size if the
+ * mapping size changed.  Returns the result of the header read.
+ */
+static int rbd_dev_refresh(struct rbd_device *rbd_dev)
+{
+	u64 old_size;
+	int ret;
+
+	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+
+	down_write(&rbd_dev->header_rwsem);
+	old_size = rbd_dev->mapping.size;
+	ret = rbd_dev->image_format == 1 ? rbd_dev_v1_header_info(rbd_dev)
+					 : rbd_dev_v2_header_info(rbd_dev);
+
+	/* A mapped snapshot may have gone away; recheck EXISTS */
+	rbd_exists_validate(rbd_dev);
+	up_write(&rbd_dev->header_rwsem);
+
+	if (rbd_dev->mapping.size != old_size)
+		rbd_dev_update_size(rbd_dev);
+
+	return ret;
+}
+
+/*
+ * Allocate and set up the gendisk and request queue for an rbd
+ * device.  On success the disk is stored in rbd_dev->disk and 0 is
+ * returned; -ENOMEM is returned if either allocation fails.
+ */
+static int rbd_init_disk(struct rbd_device *rbd_dev)
+{
+	struct gendisk *disk;
+	struct request_queue *q;
+	u64 obj_bytes;
+	int minors;
+
+	/* create gendisk info */
+	minors = single_major ? (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
+				RBD_MINORS_PER_MAJOR;
+	disk = alloc_disk(minors);
+	if (!disk)
+		return -ENOMEM;
+
+	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
+		 rbd_dev->dev_id);
+	disk->major = rbd_dev->major;
+	disk->first_minor = rbd_dev->minor;
+	if (single_major)
+		disk->flags |= GENHD_FL_EXT_DEVT;
+	disk->fops = &rbd_bd_ops;
+	disk->private_data = rbd_dev;
+
+	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
+	if (!q) {
+		put_disk(disk);
+
+		return -ENOMEM;
+	}
+
+	/* This is the default size, but state it explicitly. */
+	blk_queue_physical_block_size(q, SECTOR_SIZE);
+
+	/* size I/O limits and hints to one rbd object */
+	obj_bytes = rbd_obj_bytes(&rbd_dev->header);
+	blk_queue_max_hw_sectors(q, obj_bytes / SECTOR_SIZE);
+	blk_queue_max_segment_size(q, obj_bytes);
+	blk_queue_io_min(q, obj_bytes);
+	blk_queue_io_opt(q, obj_bytes);
+
+	blk_queue_merge_bvec(q, rbd_merge_bvec);
+	disk->queue = q;
+
+	q->queuedata = rbd_dev;
+
+	rbd_dev->disk = disk;
+
+	return 0;
+}
+
+/*
+  sysfs
+*/
+
+/* Map an embedded struct device back to its containing rbd_device. */
+static struct rbd_device *dev_to_rbd_dev(struct device *dev)
+{
+	struct rbd_device *rbd_dev;
+
+	rbd_dev = container_of(dev, struct rbd_device, dev);
+
+	return rbd_dev;
+}
+
+/* sysfs "size": the mapping size, printed as a decimal number. */
+static ssize_t rbd_size_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	unsigned long long bytes;
+
+	bytes = (unsigned long long)dev_to_rbd_dev(dev)->mapping.size;
+
+	return sprintf(buf, "%llu\n", bytes);
+}
+
+/*
+ * sysfs "features": the feature bits of whatever is mapped (which
+ * need not be the base image), as a 16-digit hex value.
+ */
+static ssize_t rbd_features_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	unsigned long long features;
+
+	features = (unsigned long long)dev_to_rbd_dev(dev)->mapping.features;
+
+	return sprintf(buf, "0x%016llx\n", features);
+}
+
+/* sysfs "major": the device's major number, or "(none)" if it is 0. */
+static ssize_t rbd_major_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	if (!rbd_dev->major)
+		return sprintf(buf, "(none)\n");
+
+	return sprintf(buf, "%d\n", rbd_dev->major);
+}
+
+/* sysfs "minor": the device's minor number. */
+static ssize_t rbd_minor_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", dev_to_rbd_dev(dev)->minor);
+}
+
+/* sysfs "client_id": "client" followed by the ceph client's id. */
+static ssize_t rbd_client_id_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	struct rbd_client *rbdc = dev_to_rbd_dev(dev)->rbd_client;
+
+	return sprintf(buf, "client%lld\n", ceph_client_id(rbdc->client));
+}
+
+/* sysfs "pool": the pool name from the device's spec. */
+static ssize_t rbd_pool_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct rbd_spec *spec = dev_to_rbd_dev(dev)->spec;
+
+	return sprintf(buf, "%s\n", spec->pool_name);
+}
+
+/* sysfs "pool_id": the pool id from the device's spec. */
+static ssize_t rbd_pool_id_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct rbd_spec *spec = dev_to_rbd_dev(dev)->spec;
+
+	return sprintf(buf, "%llu\n", (unsigned long long) spec->pool_id);
+}
+
+/* sysfs "name": the image name, or "(unknown)" if none is recorded. */
+static ssize_t rbd_name_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	if (!rbd_dev->spec->image_name)
+		return sprintf(buf, "(unknown)\n");
+
+	return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
+}
+
+/* sysfs "image_id": the image id from the device's spec. */
+static ssize_t rbd_image_id_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct rbd_spec *spec = dev_to_rbd_dev(dev)->spec;
+
+	return sprintf(buf, "%s\n", spec->image_id);
+}
+
+/*
+ * sysfs "current_snap": the name of the snapshot that is currently
+ * mapped (RBD_SNAP_HEAD_NAME when the base image is mapped).
+ */
+static ssize_t rbd_snap_show(struct device *dev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	struct rbd_spec *spec = dev_to_rbd_dev(dev)->spec;
+
+	return sprintf(buf, "%s\n", spec->snap_name);
+}
+
+/*
+ * sysfs "parent": for an image with a parent, one "key value" line
+ * each for the parent's pool id and name, image id and name,
+ * snapshot id and name, followed by the overlap.  Without a parent
+ * spec, just "(no parent image)".
+ */
+static ssize_t rbd_parent_show(struct device *dev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+	struct rbd_spec *parent = rbd_dev->parent_spec;
+	ssize_t len = 0;
+	int n;
+
+	if (!parent)
+		return sprintf(buf, "(no parent image)\n");
+
+	n = sprintf(buf + len, "pool_id %llu\npool_name %s\n",
+			(unsigned long long) parent->pool_id,
+			parent->pool_name);
+	if (n < 0)
+		return n;
+	len += n;
+
+	n = sprintf(buf + len, "image_id %s\nimage_name %s\n",
+			parent->image_id,
+			parent->image_name ? parent->image_name : "(unknown)");
+	if (n < 0)
+		return n;
+	len += n;
+
+	n = sprintf(buf + len, "snap_id %llu\nsnap_name %s\n",
+			(unsigned long long) parent->snap_id,
+			parent->snap_name);
+	if (n < 0)
+		return n;
+	len += n;
+
+	n = sprintf(buf + len, "overlap %llu\n", rbd_dev->parent_overlap);
+	if (n < 0)
+		return n;
+	len += n;
+
+	return len;
+}
+
+/*
+ * sysfs "refresh" store handler: any write triggers a header
+ * refresh.  Returns the number of bytes written, or the negative
+ * error from the refresh.
+ */
+static ssize_t rbd_image_refresh(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf,
+				 size_t size)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+	int ret = rbd_dev_refresh(rbd_dev);
+
+	if (ret)
+		rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret);
+	if (ret < 0)
+		return ret;
+
+	return size;
+}
+
+static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);	/* show only */
+static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
+static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
+static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
+static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
+static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
+static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
+static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
+static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
+static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); /* store only */
+static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
+static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
+
+static struct attribute *rbd_attrs[] = {	/* all rbd device attributes */
+	&dev_attr_size.attr,
+	&dev_attr_features.attr,
+	&dev_attr_major.attr,
+	&dev_attr_minor.attr,
+	&dev_attr_client_id.attr,
+	&dev_attr_pool.attr,
+	&dev_attr_pool_id.attr,
+	&dev_attr_name.attr,
+	&dev_attr_image_id.attr,
+	&dev_attr_current_snap.attr,
+	&dev_attr_parent.attr,
+	&dev_attr_refresh.attr,
+	NULL	/* end-of-array marker */
+};
+
+static struct attribute_group rbd_attr_group = {
+	.attrs = rbd_attrs,	/* no .name is set for this group */
+};
+/* NULL-terminated group list; referenced by rbd_device_type.groups */
+static const struct attribute_group *rbd_attr_groups[] = {
+	&rbd_attr_group,
+	NULL
+};
+
+static void rbd_sysfs_dev_release(struct device *dev)
+{	/* empty body: this release callback performs no cleanup */
+}
+
+static struct device_type rbd_device_type = {	/* used by rbd_bus_add_dev() */
+	.name		= "rbd",
+	.groups		= rbd_attr_groups,
+	.release	= rbd_sysfs_dev_release,	/* empty callback */
+};
+
+/* Take an additional reference on spec and hand the same pointer back. */
+static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
+{
+	kref_get(&spec->kref);
+	return spec;
+}
+
+static void rbd_spec_free(struct kref *kref);	/* kref release callback */
+static void rbd_spec_put(struct rbd_spec *spec)
+{
+	if (spec)	/* a NULL spec is accepted and ignored */
+		kref_put(&spec->kref, rbd_spec_free);
+}
+
+/*
+ * Allocate a zeroed rbd_spec and initialize its kref; NULL if no memory.
+ */
+static struct rbd_spec *rbd_spec_alloc(void)
+{
+	struct rbd_spec *spec = kzalloc(sizeof (*spec), GFP_KERNEL);
+
+	if (spec)
+		kref_init(&spec->kref);
+	return spec;
+}
+
+/* kref release callback: free the spec's strings, then the spec itself. */
+static void rbd_spec_free(struct kref *kref)
+{
+	struct rbd_spec *dead = container_of(kref, struct rbd_spec, kref);
+	kfree(dead->snap_name);
+	kfree(dead->image_name);
+	kfree(dead->image_id);
+	kfree(dead->pool_name);
+	kfree(dead);
+}
+
+/* Allocate and initialize an rbd_dev for (rbdc, spec); NULL if no memory. */
+static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
+				struct rbd_spec *spec)
+{
+	struct rbd_device *rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
+
+	if (!rbd_dev)
+		return NULL;
+
+	/* The pointers are stored as given; no reference is taken here */
+	rbd_dev->rbd_client = rbdc;
+	rbd_dev->spec = spec;
+
+	rbd_dev->flags = 0;
+	spin_lock_init(&rbd_dev->lock);
+	init_rwsem(&rbd_dev->header_rwsem);
+	INIT_LIST_HEAD(&rbd_dev->node);
+	atomic_set(&rbd_dev->parent_ref, 0);
+
+	/* One layout, set up once here, is used for all rbd requests */
+	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
+	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
+	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+
+	return rbd_dev;
+}
+
+static void rbd_dev_destroy(struct rbd_device *rbd_dev)
+{
+	rbd_put_client(rbd_dev->rbd_client);	/* give up the client */
+	rbd_spec_put(rbd_dev->spec);		/* drop the spec reference */
+	kfree(rbd_dev);				/* then free the device */
+}
+
+/*
+ * Get the size and object order for an image snapshot, or if
+ * snap_id is CEPH_NOSNAP, gets this information for the base
+ * image.  order may be NULL.  Returns 0, or a negative errno.
+ */
+static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
+				u8 *order, u64 *snap_size)
+{
+	__le64 snapid = cpu_to_le64(snap_id);
+	int ret;
+	struct {
+		u8 order;
+		__le64 size;
+	} __attribute__ ((packed)) size_buf = { 0 };
+
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+				"rbd", "get_size",
+				&snapid, sizeof (snapid),
+				&size_buf, sizeof (size_buf));
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+	if (ret < 0)
+		return ret;
+	if (ret < sizeof (size_buf))
+		return -ERANGE;	/* result smaller than size_buf */
+
+	if (order) {
+		*order = size_buf.order;
+		dout("  order %u", (unsigned int)*order);
+	}
+	*snap_size = le64_to_cpu(size_buf.size);
+
+	dout("  snap_id 0x%016llx snap_size = %llu\n",
+		(unsigned long long)snap_id,
+		(unsigned long long)*snap_size);
+
+	return 0;
+}
+
+/* Fetch the base image's order and size into rbd_dev->header. */
+static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
+{
+	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
+		&rbd_dev->header.obj_order, &rbd_dev->header.image_size);
+}
+
+static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
+{
+	void *reply_buf;
+	int ret;
+	void *p;
+	/* Fetch the object prefix string into header.object_prefix */
+	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
+	if (!reply_buf)
+		return -ENOMEM;
+
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+				"rbd", "get_object_prefix", NULL, 0,
+				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+	if (ret < 0)
+		goto out;
+	/* A non-negative ret is used as the length of the reply */
+	p = reply_buf;
+	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
+						p + ret, NULL, GFP_NOIO);
+	ret = 0;
+	/* Never leave an ERR_PTR value stored in the header */
+	if (IS_ERR(rbd_dev->header.object_prefix)) {
+		ret = PTR_ERR(rbd_dev->header.object_prefix);
+		rbd_dev->header.object_prefix = NULL;
+	} else {
+		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
+	}
+out:
+	kfree(reply_buf);
+
+	return ret;
+}
+
+static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
+		u64 *snap_features)
+{
+	__le64 snapid = cpu_to_le64(snap_id);
+	struct {
+		__le64 features;
+		__le64 incompat;
+	} __attribute__ ((packed)) features_buf = { 0 };
+	u64 incompat;
+	int ret;
+	/* Call the "get_features" class method for snap_id */
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+				"rbd", "get_features",
+				&snapid, sizeof (snapid),
+				&features_buf, sizeof (features_buf));
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+	if (ret < 0)
+		return ret;
+	if (ret < sizeof (features_buf))
+		return -ERANGE;
+	/* Any incompat bit outside RBD_FEATURES_SUPPORTED is fatal */
+	incompat = le64_to_cpu(features_buf.incompat);
+	if (incompat & ~RBD_FEATURES_SUPPORTED)
+		return -ENXIO;
+	/* *snap_features is written only on the success path */
+	*snap_features = le64_to_cpu(features_buf.features);
+
+	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
+		(unsigned long long)snap_id,
+		(unsigned long long)*snap_features,
+		(unsigned long long)le64_to_cpu(features_buf.incompat));
+
+	return 0;
+}
+
+static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
+{
+	u64 *features = &rbd_dev->header.features;	/* output location */
+	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, features);
+}
+
+/*
+ * Fetch parent (clone source) info for a format 2 image with the
+ * "get_parent" class method; update parent_spec and parent_overlap.
+ * Returns 0 on success (including "no parent") or a negative errno.
+ * The decoded image id belongs to this function until it is stored
+ * in the parent spec; a copy that is not handed over is freed on exit.
+ */
+static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
+{
+	struct rbd_spec *parent_spec;
+	size_t size;
+	void *reply_buf = NULL;
+	__le64 snapid;
+	void *p;
+	void *end;
+	u64 pool_id;
+	char *image_id = NULL;
+	u64 snap_id;
+	u64 overlap;
+	int ret;
+
+	parent_spec = rbd_spec_alloc();
+	if (!parent_spec)
+		return -ENOMEM;
+
+	size = sizeof (__le64) +				/* pool_id */
+		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
+		sizeof (__le64) +				/* snap_id */
+		sizeof (__le64);				/* overlap */
+	reply_buf = kmalloc(size, GFP_KERNEL);
+	if (!reply_buf) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	snapid = cpu_to_le64(CEPH_NOSNAP);
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+				"rbd", "get_parent",
+				&snapid, sizeof (snapid),
+				reply_buf, size);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+	if (ret < 0)
+		goto out_err;
+
+	p = reply_buf;
+	end = reply_buf + ret;
+	ret = -ERANGE;
+	ceph_decode_64_safe(&p, end, pool_id, out_err);
+	if (pool_id == CEPH_NOPOOL) {
+		/*
+		 * Either the parent never existed, or it was recorded but
+		 * the image got flattened.  When the parent of a layered
+		 * image disappears we immediately zero the overlap, so all
+		 * new requests are treated as if the image had no parent.
+		 */
+		if (rbd_dev->parent_overlap) {
+			rbd_dev->parent_overlap = 0;
+			smp_mb();
+			rbd_dev_parent_put(rbd_dev);
+			pr_info("%s: clone image has been flattened\n",
+				rbd_dev->disk->disk_name);
+		}
+		goto out;	/* No parent?  No problem. */
+	}
+
+	/* The ceph file layout needs to fit pool id in 32 bits */
+	ret = -EIO;
+	if (pool_id > (u64)U32_MAX) {
+		rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
+			(unsigned long long)pool_id, U32_MAX);
+		goto out_err;
+	}
+
+	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
+	if (IS_ERR(image_id)) {
+		ret = PTR_ERR(image_id);
+		image_id = NULL;	/* not a pointer we may kfree() */
+		goto out_err;
+	}
+	ceph_decode_64_safe(&p, end, snap_id, out_err);
+	ceph_decode_64_safe(&p, end, overlap, out_err);
+
+	/*
+	 * The parent won't change (except when the clone is
+	 * flattened, already handled that).  So we only need to
+	 * record the parent spec if we have not already done so.
+	 */
+	if (!rbd_dev->parent_spec) {
+		parent_spec->pool_id = pool_id;
+		parent_spec->image_id = image_id;
+		image_id = NULL;	/* the spec now owns the string */
+		parent_spec->snap_id = snap_id;
+		rbd_dev->parent_spec = parent_spec;
+		parent_spec = NULL;	/* rbd_dev now owns this */
+	}
+
+	/* Always update the overlap; zero is treated specially below */
+	rbd_dev->parent_overlap = overlap;
+	smp_mb();
+	if (!overlap) {
+		/* A null parent_spec indicates it's the initial probe */
+		if (parent_spec) {
+			/*
+			 * The overlap has become zero, so the clone
+			 * must have been resized down to 0 at some
+			 * point.  Treat this the same as a flatten.
+			 */
+			rbd_dev_parent_put(rbd_dev);
+			pr_info("%s: clone image now standalone\n",
+				rbd_dev->disk->disk_name);
+		} else {
+			/*
+			 * For the initial probe, if we find the
+			 * overlap is zero we just pretend there was
+			 * no parent image.
+			 */
+			rbd_warn(rbd_dev, "ignoring parent of "
+						"clone with overlap 0\n");
+		}
+	}
+out:
+	ret = 0;
+out_err:
+	kfree(reply_buf);
+	kfree(image_id);	/* NULL unless we still own the string */
+	rbd_spec_put(parent_spec);
+
+	return ret;
+}
+
+static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
+{
+	struct {
+		__le64 stripe_unit;
+		__le64 stripe_count;
+	} __attribute__ ((packed)) striping_info_buf = { 0 };
+	size_t size = sizeof (striping_info_buf);
+	void *p;
+	u64 obj_size;
+	u64 stripe_unit;
+	u64 stripe_count;
+	int ret;
+	/* Returns 0, a method-call error, -ERANGE or -EINVAL */
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+				"rbd", "get_stripe_unit_count", NULL, 0,
+				(char *)&striping_info_buf, size);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+	if (ret < 0)
+		return ret;
+	if (ret < size)
+		return -ERANGE;
+
+	/*
+	 * We don't actually support the "fancy striping" feature
+	 * (STRIPINGV2) yet, but if the striping sizes are the
+	 * defaults the behavior is the same as before.  So find
+	 * out, and only fail if the image has non-default values.
+	 */
+	ret = -EINVAL;	/* never read: the failures below return directly */
+	obj_size = (u64)1 << rbd_dev->header.obj_order;
+	p = &striping_info_buf;
+	stripe_unit = ceph_decode_64(&p);
+	if (stripe_unit != obj_size) {
+		rbd_warn(rbd_dev, "unsupported stripe unit "
+				"(got %llu want %llu)",
+				stripe_unit, obj_size);
+		return -EINVAL;
+	}
+	stripe_count = ceph_decode_64(&p);
+	if (stripe_count != 1) {
+		rbd_warn(rbd_dev, "unsupported stripe count "
+				"(got %llu want 1)", stripe_count);
+		return -EINVAL;
+	}
+	rbd_dev->header.stripe_unit = stripe_unit;
+	rbd_dev->header.stripe_count = stripe_count;
+
+	return 0;
+}
+
+static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
+{
+	size_t image_id_size;
+	char *image_id;
+	void *p;
+	void *end;
+	size_t size;
+	void *reply_buf = NULL;
+	size_t len = 0;
+	char *image_name = NULL;
+	int ret;
+	/* Look up the name for spec->image_id; NULL on any failure */
+	rbd_assert(!rbd_dev->spec->image_name);
+
+	len = strlen(rbd_dev->spec->image_id);
+	image_id_size = sizeof (__le32) + len;
+	image_id = kmalloc(image_id_size, GFP_KERNEL);
+	if (!image_id)
+		return NULL;
+	/* Request payload: the image id as a length-prefixed string */
+	p = image_id;
+	end = image_id + image_id_size;
+	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
+
+	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
+	reply_buf = kmalloc(size, GFP_KERNEL);
+	if (!reply_buf)
+		goto out;
+
+	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
+				"rbd", "dir_get_name",
+				image_id, image_id_size,
+				reply_buf, size);
+	if (ret < 0)
+		goto out;
+	p = reply_buf;
+	end = reply_buf + ret;
+	/* Errors are folded into NULL; len is a size_t, hence %zu */
+	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
+	if (IS_ERR(image_name))
+		image_name = NULL;
+	else
+		dout("%s: name is %s len is %zu\n", __func__, image_name, len);
+out:
+	kfree(reply_buf);
+	kfree(image_id);
+
+	return image_name;
+}
+
+/*
+ * Format 1: header.snap_names holds the names back to back, each
+ * NUL-terminated, in the same order as snapc->snaps[].  Walk both in
+ * step and return the id paired with name, or CEPH_NOSNAP if absent.
+ */
+static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
+{
+	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
+	const char *cur = rbd_dev->header.snap_names;
+	u32 i;
+
+	for (i = 0; i < snapc->num_snaps; i++, cur += strlen(cur) + 1)
+		if (strcmp(name, cur) == 0)
+			return snapc->snaps[i];
+
+	return CEPH_NOSNAP;
+}
+
+static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
+{
+	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
+	u32 which;
+	bool found = false;
+	u64 snap_id;
+	/* Fetch each snapshot's name in turn until one equals name */
+	for (which = 0; !found && which < snapc->num_snaps; which++) {
+		const char *snap_name;
+
+		snap_id = snapc->snaps[which];
+		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
+		if (IS_ERR(snap_name)) {
+			/* ignore no-longer existing snapshots */
+			if (PTR_ERR(snap_name) == -ENOENT)
+				continue;
+			else
+				break;	/* any other error ends the search */
+		}
+		found = !strcmp(name, snap_name);
+		kfree(snap_name);	/* each fetched name is freed here */
+	}
+	return found ? snap_id : CEPH_NOSNAP;
+}
+
+/*
+ * Map a snapshot name to its id using the lookup that matches the image
+ * format.  name must not be RBD_SNAP_HEAD_NAME.  CEPH_NOSNAP is returned
+ * both when no such snapshot exists and when the lookup hits an error.
+ */
+static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
+{
+	return rbd_dev->image_format == 1 ?
+			rbd_v1_snap_id_by_name(rbd_dev, name) :
+			rbd_v2_snap_id_by_name(rbd_dev, name);
+}
+
+/*
+ * When an rbd image has a parent image, it is identified by the
+ * pool, image, and snapshot ids (not names).  This function fills
+ * in the names for those ids.  (It's OK if we can't figure out the
+ * name for an image id, but the pool and snapshot ids should always
+ * exist and have names.)  All names in an rbd spec are dynamically
+ * allocated.
+ *
+ * When an image being mapped (not a parent) is probed, we have the
+ * pool name and pool id, image name and image id, and the snapshot
+ * name.  The only thing we're missing is the snapshot id.
+ */
+static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct rbd_spec *spec = rbd_dev->spec;
+	const char *pool_name;
+	const char *image_name;
+	const char *snap_name;
+	int ret;
+	/* Returns 0 on success, otherwise a negative errno */
+	/*
+	 * An image being mapped will have the pool name (etc.), but
+	 * we need to look up the snapshot id.
+	 */
+	if (spec->pool_name) {
+		if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
+			u64 snap_id;
+
+			snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
+			if (snap_id == CEPH_NOSNAP)
+				return -ENOENT;
+			spec->snap_id = snap_id;
+		} else {
+			spec->snap_id = CEPH_NOSNAP;
+		}
+
+		return 0;
+	}
+
+	/* Get the pool name; we have to make our own copy of this */
+
+	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
+	if (!pool_name) {
+		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
+		return -EIO;
+	}
+	pool_name = kstrdup(pool_name, GFP_KERNEL);
+	if (!pool_name)
+		return -ENOMEM;
+
+	/* Fetch the image name; tolerate failure here */
+
+	image_name = rbd_dev_image_name(rbd_dev);
+	if (!image_name)
+		rbd_warn(rbd_dev, "unable to get image name");
+
+	/* Look up the snapshot name, and make a copy */
+
+	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
+	if (IS_ERR(snap_name)) {
+		ret = PTR_ERR(snap_name);
+		goto out_err;
+	}
+	/* The spec is updated only after every lookup has finished */
+	spec->pool_name = pool_name;
+	spec->image_name = image_name;
+	spec->snap_name = snap_name;
+
+	return 0;
+out_err:
+	kfree(image_name);
+	kfree(pool_name);
+
+	return ret;
+}
+
+static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
+{
+	size_t size;
+	int ret;
+	void *reply_buf;
+	void *p;
+	void *end;
+	u64 seq;
+	u32 snap_count;
+	struct ceph_snap_context *snapc;
+	u32 i;
+	/* Fetch a new snapshot context and install it in header.snapc */
+	/*
+	 * We'll need room for the seq value (maximum snapshot id),
+	 * snapshot count, and array of that many snapshot ids.
+	 * For now we have a fixed upper limit on the number we're
+	 * prepared to receive.
+	 */
+	size = sizeof (__le64) + sizeof (__le32) +
+			RBD_MAX_SNAP_COUNT * sizeof (__le64);
+	reply_buf = kzalloc(size, GFP_KERNEL);
+	if (!reply_buf)
+		return -ENOMEM;
+
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+				"rbd", "get_snapcontext", NULL, 0,
+				reply_buf, size);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+	if (ret < 0)
+		goto out;
+
+	p = reply_buf;
+	end = reply_buf + ret;
+	ret = -ERANGE;	/* returned if any bounds check below fails */
+	ceph_decode_64_safe(&p, end, seq, out);
+	ceph_decode_32_safe(&p, end, snap_count, out);
+
+	/*
+	 * Make sure the reported number of snapshot ids wouldn't go
+	 * beyond the end of our buffer.  But before checking that,
+	 * make sure the computed size of the snapshot context we
+	 * allocate is representable in a size_t.
+	 */
+	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
+				 / sizeof (u64)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
+		goto out;
+	ret = 0;
+
+	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
+	if (!snapc) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	snapc->seq = seq;
+	for (i = 0; i < snap_count; i++)
+		snapc->snaps[i] = ceph_decode_64(&p);
+	/* Drop our reference to the old context before replacing it */
+	ceph_put_snap_context(rbd_dev->header.snapc);
+	rbd_dev->header.snapc = snapc;
+
+	dout("  snap context seq = %llu, snap_count = %u\n",
+		(unsigned long long)seq, (unsigned int)snap_count);
+out:
+	kfree(reply_buf);
+
+	return ret;
+}
+
+static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
+					u64 snap_id)
+{
+	size_t size;
+	void *reply_buf;
+	__le64 snapid;
+	int ret;
+	void *p;
+	void *end;
+	char *snap_name;
+	/* Returns the name string for snap_id, or an ERR_PTR() value */
+	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
+	reply_buf = kmalloc(size, GFP_KERNEL);
+	if (!reply_buf)
+		return ERR_PTR(-ENOMEM);
+
+	snapid = cpu_to_le64(snap_id);
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+				"rbd", "get_snapshot_name",
+				&snapid, sizeof (snapid),
+				reply_buf, size);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+	if (ret < 0) {
+		snap_name = ERR_PTR(ret);
+		goto out;
+	}
+
+	p = reply_buf;
+	end = reply_buf + ret;
+	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
+	if (IS_ERR(snap_name))
+		goto out;	/* the ERR_PTR value itself is returned */
+
+	dout("  snap_id 0x%016llx snap_name = %s\n",
+		(unsigned long long)snap_id, snap_name);
+out:
+	kfree(reply_buf);
+
+	return snap_name;
+}
+
+static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
+{
+	bool first_time = rbd_dev->header.object_prefix == NULL;
+	int ret;
+	/* Stops at, and returns, the first nonzero result below */
+	ret = rbd_dev_v2_image_size(rbd_dev);
+	if (ret)
+		return ret;
+
+	if (first_time) {
+		ret = rbd_dev_v2_header_onetime(rbd_dev);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * If the image supports layering, get the parent info.  We
+	 * need to probe the first time regardless.  Thereafter we
+	 * only need to if there's a parent, to see if it has
+	 * disappeared due to the mapped image getting flattened.
+	 */
+	if (rbd_dev->header.features & RBD_FEATURE_LAYERING &&
+			(first_time || rbd_dev->parent_spec)) {
+		bool warn;
+
+		ret = rbd_dev_v2_parent_info(rbd_dev);
+		if (ret)
+			return ret;
+
+		/*
+		 * Print a warning if this is the initial probe and
+		 * the image has a parent.  Don't print it if the
+		 * image now being probed is itself a parent.  We
+		 * can tell at this point because we won't know its
+		 * pool name yet (just its pool id).
+		 */
+		warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name;
+		if (first_time && warn)
+			rbd_warn(rbd_dev, "WARNING: kernel layering "
+					"is EXPERIMENTAL!");
+	}
+	/* When the base image is mapped, its mapping size tracks the header */
+	if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
+		if (rbd_dev->mapping.size != rbd_dev->header.image_size)
+			rbd_dev->mapping.size = rbd_dev->header.image_size;
+
+	ret = rbd_dev_v2_snap_context(rbd_dev);
+	dout("rbd_dev_v2_snap_context returned %d\n", ret);
+
+	return ret;
+}
+
+/*
+ * Set up the embedded struct device (bus, type, parent, release hook,
+ * name from dev_id) and register it; returns device_register()'s result.
+ */
+static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
+{
+	struct device *dev = &rbd_dev->dev;
+
+	dev->release = rbd_dev_device_release;
+	dev->parent = &rbd_root_dev;
+	dev->type = &rbd_device_type;
+	dev->bus = &rbd_bus_type;
+	dev_set_name(dev, "%d", rbd_dev->dev_id);
+	return device_register(dev);
+}
+
+static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
+{
+	device_unregister(&rbd_dev->dev);	/* registered in rbd_bus_add_dev() */
+}
+
+/*
+ * Get a unique rbd identifier for the given new rbd_dev, and add
+ * the rbd_dev to the global list.  Returns 0, or a negative value.
+ */
+static int rbd_dev_id_get(struct rbd_device *rbd_dev)
+{
+	int new_dev_id;
+	/* A negative result from the ida is passed straight back */
+	new_dev_id = ida_simple_get(&rbd_dev_id_ida,
+				    0, minor_to_rbd_dev_id(1 << MINORBITS),
+				    GFP_KERNEL);
+	if (new_dev_id < 0)
+		return new_dev_id;
+
+	rbd_dev->dev_id = new_dev_id;
+	/* rbd_dev_list is modified only under rbd_dev_list_lock here */
+	spin_lock(&rbd_dev_list_lock);
+	list_add_tail(&rbd_dev->node, &rbd_dev_list);
+	spin_unlock(&rbd_dev_list_lock);
+
+	dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);
+
+	return 0;
+}
+
+/*
+ * Remove an rbd_dev from the global list, and record that its
+ * identifier is no longer in use.  (Reverses rbd_dev_id_get().)
+ */
+static void rbd_dev_id_put(struct rbd_device *rbd_dev)
+{
+	spin_lock(&rbd_dev_list_lock);
+	list_del_init(&rbd_dev->node);
+	spin_unlock(&rbd_dev_list_lock);
+	/* rbd_dev->dev_id itself is left unchanged; only the ida is updated */
+	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
+
+	dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id);
+}
+
+/*
+ * Advance *buf past any leading white space so that it points at the
+ * start of the next token (or at the terminating '\0' if there is
+ * none).  Returns the length of that token, i.e. the number of
+ * consecutive non-white-space characters found there.  The string at
+ * *buf must be terminated with '\0'.
+ */
+static inline size_t next_token(const char **buf)
+{
+	/* What isspace() accepts in the "C" and "POSIX" locales */
+	static const char spaces[] = " \f\n\r\t\v";
+	const char *start = *buf;
+
+	start += strspn(start, spaces);		/* skip leading blanks */
+	*buf = start;
+
+	return strcspn(start, spaces);		/* length of the token */
+}
+
+/*
+ * Locate the next token in *buf and, when it fits, copy it into the
+ * caller's token buffer of token_size bytes.  A copied token is
+ * always terminated with '\0'.  *buf must be '\0'-terminated on
+ * entry.
+ *
+ * The return value is the token length, excluding the '\0': 0 means
+ * no token was found, and a value >= token_size means the token did
+ * not fit and nothing was copied.
+ *
+ * On return *buf points just past the token that was found.  This
+ * happens even when the token buffer was too small to receive the
+ * token.
+ */
+static inline size_t copy_token(const char **buf,
+				char *token,
+				size_t token_size)
+{
+	size_t len = next_token(buf);
+	const char *start = *buf;
+
+	*buf = start + len;		/* consumed even if it won't fit */
+	if (len >= token_size)
+		return len;
+
+	memcpy(token, start, len);
+	token[len] = '\0';
+	return len;
+}
+
+/*
+ * Locate the next token in *buf and return a dynamically allocated,
+ * '\0'-terminated copy of it.  A copy is produced even when the token
+ * has zero length (the result is then an empty string).
+ *
+ * Returns the new buffer, or NULL if memory for it could not be
+ * allocated.  When lenp is non-NULL, *lenp receives the token length
+ * (excluding the '\0'); it is left untouched on failure.
+ *
+ * On success *buf is advanced to just past the token.  On failure
+ * *buf has only been moved past any leading white space.
+ *
+ * The allocation uses GFP_KERNEL.  The caller owns the returned
+ * buffer (callers in this file release it with kfree(), directly or
+ * through rbd_spec_put()).
+ */
+static inline char *dup_token(const char **buf, size_t *lenp)
+{
+	size_t len = next_token(buf);
+	char *copy;
+
+	/* Copy one byte beyond the token, then overwrite it with '\0' */
+	copy = kmemdup(*buf, len + 1, GFP_KERNEL);
+	if (!copy)
+		return NULL;
+	copy[len] = '\0';
+
+	*buf += len;
+	if (lenp)
+		*lenp = len;
+
+	return copy;
+}
+
+/*
+ * Parse the options provided for an "rbd add" (i.e., rbd image
+ * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
+ * and the data written is passed here via a NUL-terminated buffer.
+ * Returns 0 if successful or an error code otherwise.
+ *
+ * The information extracted from these options is recorded in
+ * the other parameters which return dynamically-allocated
+ * structures (they are written only when 0 is returned):
+ *  ceph_opts
+ *      The address of a pointer that will refer to a ceph options
+ *      structure.  Caller must release the returned pointer using
+ *      ceph_destroy_options() when it is no longer needed.
+ *  opts
+ *	Address of an rbd options pointer.  Fully initialized by
+ *	this function; caller must release with kfree().
+ *  rbd_spec
+ *	Address of an rbd image specification pointer.  Filled in
+ *	by this function with the pool, image and snapshot names.
+ *	Caller must release with rbd_spec_put().
+ *
+ * The options passed take this form:
+ *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
+ * where:
+ *  <mon_addrs>
+ *      A comma-separated list of one or more monitor addresses.
+ *      A monitor address is an ip address, optionally followed
+ *      by a port number (separated by a colon).
+ *        I.e.:  ip1[:port1][,ip2[:port2]...]
+ *  <options>
+ *      A comma-separated list of ceph and/or rbd options.
+ *  <pool_name>
+ *      The name of the rados pool containing the rbd image.
+ *  <image_name>
+ *      The name of the image in that pool to map.
+ *  <snap_name>
+ *      An optional snapshot name.  If provided, the mapping will
+ *      present data from the image at the time that snapshot was
+ *      created.  The image head is used if no snapshot name is
+ *      provided.  Snapshot mappings are always read-only.
+ */
+static int rbd_add_parse_args(const char *buf,
+				struct ceph_options **ceph_opts,
+				struct rbd_options **opts,
+				struct rbd_spec **rbd_spec)
+{
+	size_t len;
+	char *options;
+	const char *mon_addrs;
+	char *snap_name;
+	size_t mon_addrs_size;
+	struct rbd_spec *spec = NULL;
+	struct rbd_options *rbd_opts = NULL;
+	struct ceph_options *copts;
+	int ret;
+
+	/* The first four tokens are required */
+
+	len = next_token(&buf);
+	if (!len) {
+		rbd_warn(NULL, "no monitor address(es) provided");
+		return -EINVAL;
+	}
+	mon_addrs = buf;	/* not copied: points into the caller's buf */
+	mon_addrs_size = len + 1;
+	buf += len;
+
+	ret = -EINVAL;	/* result for the "goto out_err" exits below */
+	options = dup_token(&buf, NULL);
+	if (!options)
+		return -ENOMEM;
+	if (!*options) {
+		rbd_warn(NULL, "no options provided");
+		goto out_err;
+	}
+
+	spec = rbd_spec_alloc();
+	if (!spec)
+		goto out_mem;
+
+	spec->pool_name = dup_token(&buf, NULL);
+	if (!spec->pool_name)
+		goto out_mem;
+	if (!*spec->pool_name) {
+		rbd_warn(NULL, "no pool name provided");
+		goto out_err;
+	}
+
+	spec->image_name = dup_token(&buf, NULL);
+	if (!spec->image_name)
+		goto out_mem;
+	if (!*spec->image_name) {
+		rbd_warn(NULL, "no image name provided");
+		goto out_err;
+	}
+
+	/*
+	 * Snapshot name is optional; default is to use "-"
+	 * (indicating the head/no snapshot).
+	 */
+	len = next_token(&buf);
+	if (!len) {
+		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
+		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
+	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
+		ret = -ENAMETOOLONG;
+		goto out_err;
+	}
+	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
+	if (!snap_name)
+		goto out_mem;
+	*(snap_name + len) = '\0';
+	spec->snap_name = snap_name;
+
+	/* Initialize all rbd options to the defaults */
+
+	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
+	if (!rbd_opts)
+		goto out_mem;
+
+	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
+	/* The monitor list is passed as a [start, end) character range */
+	copts = ceph_parse_options(options, mon_addrs,
+					mon_addrs + mon_addrs_size - 1,
+					parse_rbd_opts_token, rbd_opts);
+	if (IS_ERR(copts)) {
+		ret = PTR_ERR(copts);
+		goto out_err;
+	}
+	kfree(options);
+
+	*ceph_opts = copts;
+	*opts = rbd_opts;
+	*rbd_spec = spec;
+
+	return 0;
+out_mem:
+	ret = -ENOMEM;
+out_err:
+	kfree(rbd_opts);
+	rbd_spec_put(spec);
+	kfree(options);
+
+	return ret;
+}
+
+/*
+ * Return pool id (>= 0) or a negative error code.
+ */
+static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
+{
+	u64 newest_epoch;
+	unsigned long timeout = rbdc->client->options->mount_timeout * HZ;
+	int tries = 0;
+	int ret;
+	/* At most one retry, made only if a newer osdmap epoch exists */
+again:
+	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
+	if (ret == -ENOENT && tries++ < 1) {
+		ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
+					       &newest_epoch);
+		if (ret < 0)
+			return ret;
+		/* The result of the wait is ignored; the lookup is retried */
+		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
+			ceph_monc_request_next_osdmap(&rbdc->client->monc);
+			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
+						     newest_epoch, timeout);
+			goto again;
+		} else {
+			/* the osdmap we have is new enough */
+			return -ENOENT;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * An rbd format 2 image has a unique identifier, distinct from the
+ * name given to it by the user.  Internally, that identifier is
+ * what's used to specify the names of objects related to the image.
+ *
+ * A special "rbd id" object is used to map an rbd image name to its
+ * id.  If that object doesn't exist (-ENOENT), the image is taken to
+ * be format 1 and an empty image id is recorded.
+ *
+ * This function will record the given rbd_dev's image_id field if
+ * it can be determined, and in that case will return 0.  If any
+ * errors occur a negative errno will be returned and the rbd_dev's
+ * image_id field will be unchanged (and should be NULL).
+ */
+static int rbd_dev_image_id(struct rbd_device *rbd_dev)
+{
+	int ret;
+	size_t size;
+	char *object_name;
+	void *response;
+	char *image_id;
+
+	/*
+	 * When probing a parent image, the image id is already
+	 * known (and the image name likely is not).  There's no
+	 * need to fetch the image id again in this case.  We
+	 * do still need to set the image format though.
+	 */
+	if (rbd_dev->spec->image_id) {
+		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
+
+		return 0;
+	}
+
+	/*
+	 * First, see if the format 2 image id file exists, and if
+	 * so, get the image's persistent id from it.
+	 */
+	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
+	object_name = kmalloc(size, GFP_NOIO);
+	if (!object_name)
+		return -ENOMEM;
+	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
+	dout("rbd id object name is %s\n", object_name);
+
+	/* Response will be an encoded string, which includes a length */
+
+	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
+	response = kzalloc(size, GFP_NOIO);
+	if (!response) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* If it doesn't exist we'll assume it's a format 1 image */
+	/* NOTE(review): RBD_IMAGE_ID_LEN_MAX is passed, not size; confirm */
+	ret = rbd_obj_method_sync(rbd_dev, object_name,
+				"rbd", "get_id", NULL, 0,
+				response, RBD_IMAGE_ID_LEN_MAX);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+	if (ret == -ENOENT) {
+		image_id = kstrdup("", GFP_KERNEL);
+		ret = image_id ? 0 : -ENOMEM;
+		if (!ret)
+			rbd_dev->image_format = 1;
+	} else if (ret > (int) sizeof (__le32)) {	/* signed compare */
+		void *p = response;
+
+		image_id = ceph_extract_encoded_string(&p, p + ret,
+						NULL, GFP_NOIO);
+		ret = PTR_ERR_OR_ZERO(image_id);
+		if (!ret)
+			rbd_dev->image_format = 2;
+	} else if (ret >= 0) {
+		ret = -EINVAL;	/* too short to hold a non-empty id */
+	}
+	/* Any other negative ret is passed back to the caller as is */
+	if (!ret) {
+		rbd_dev->spec->image_id = image_id;
+		dout("image_id is %s\n", image_id);
+	}
+out:
+	kfree(response);
+	kfree(object_name);
+
+	return ret;
+}
+
+/*
+ * Undo whatever state changes are made by v1 or v2 header info
+ * call.  The header structure is left completely zero-filled.
+ */
+static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
+{
+	struct rbd_image_header	*header;
+
+	/* Drop parent reference unless it's already been done (or none) */
+
+	if (rbd_dev->parent_overlap)
+		rbd_dev_parent_put(rbd_dev);
+
+	/* Free dynamic fields from the header, then zero it out */
+
+	header = &rbd_dev->header;
+	ceph_put_snap_context(header->snapc);
+	kfree(header->snap_sizes);
+	kfree(header->snap_names);
+	kfree(header->object_prefix);
+	memset(header, 0, sizeof (*header));	/* no stale pointers remain */
+}
+
+static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
+{
+	int ret;
+	/* Fetch the object prefix, features and (if used) striping info */
+	ret = rbd_dev_v2_object_prefix(rbd_dev);
+	if (ret)
+		goto out_err;
+
+	/*
+	 * Get and check the features for the image.  Currently the
+	 * features are assumed to never change.
+	 */
+	ret = rbd_dev_v2_features(rbd_dev);
+	if (ret)
+		goto out_err;
+
+	/* If the image supports fancy striping, get its parameters */
+
+	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
+		ret = rbd_dev_v2_striping_info(rbd_dev);
+		if (ret < 0)
+			goto out_err;
+	}
+	/* No support for crypto and compression type format 2 images */
+
+	return 0;
+out_err:
+	rbd_dev->header.features = 0;	/* reset what was set above */
+	kfree(rbd_dev->header.object_prefix);
+	rbd_dev->header.object_prefix = NULL;
+
+	return ret;
+}
+/* Create and probe rbd_dev's parent, if it has one; 0 or a negative errno. */
+static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
+{
+	struct rbd_device *parent = NULL;
+	struct rbd_spec *parent_spec;
+	struct rbd_client *rbdc;
+	int ret;
+
+	if (!rbd_dev->parent_spec)
+		return 0;
+	/*
+	 * We need to pass a reference to the client and the parent
+	 * spec when creating the parent rbd_dev.  Images related by
+	 * parent/child relationships always share both.
+	 */
+	parent_spec = rbd_spec_get(rbd_dev->parent_spec);
+	rbdc = __rbd_get_client(rbd_dev->rbd_client);
+
+	ret = -ENOMEM;
+	parent = rbd_dev_create(rbdc, parent_spec);
+	if (!parent)
+		goto out_err;
+
+	ret = rbd_dev_image_probe(parent, false);
+	if (ret < 0)
+		goto out_err;
+	rbd_dev->parent = parent;
+	atomic_set(&rbd_dev->parent_ref, 1);
+
+	return 0;
+out_err:
+	if (parent) {
+		rbd_dev_unparent(rbd_dev);
+		/* rbd_dev->header_name stays: our caller's unwind frees it */
+		rbd_dev_destroy(parent);
+	} else {
+		rbd_put_client(rbdc);
+		rbd_spec_put(parent_spec);
+	}
+
+	return ret;
+}
+/* Give rbd_dev an id, name, major/minor and disk, then announce the disk. */
+static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
+{
+	int ret;
+
+	/* Get an id and fill in device name. */
+
+	ret = rbd_dev_id_get(rbd_dev);
+	if (ret)
+		return ret;
+
+	BUILD_BUG_ON(DEV_NAME_LEN
+			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
+	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
+
+	/* Record our major and minor device numbers. */
+
+	if (!single_major) {
+		ret = register_blkdev(0, rbd_dev->name);
+		if (ret < 0)
+			goto err_out_id;
+
+		rbd_dev->major = ret;	/* non-negative return is the major */
+		rbd_dev->minor = 0;
+	} else {
+		rbd_dev->major = rbd_major;
+		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
+	}
+
+	/* Set up the blkdev mapping. */
+
+	ret = rbd_init_disk(rbd_dev);
+	if (ret)
+		goto err_out_blkdev;
+
+	ret = rbd_dev_mapping_set(rbd_dev);
+	if (ret)
+		goto err_out_disk;
+	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
+	set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
+
+	ret = rbd_bus_add_dev(rbd_dev);
+	if (ret)
+		goto err_out_mapping;
+
+	/* Everything's ready.  Announce the disk to the world. */
+
+	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
+	add_disk(rbd_dev->disk);
+
+	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
+		(unsigned long long) rbd_dev->mapping.size);
+
+	return ret;	/* ret is 0 here: rbd_bus_add_dev() succeeded */
+
+err_out_mapping:
+	rbd_dev_mapping_clear(rbd_dev);
+err_out_disk:
+	rbd_free_disk(rbd_dev);
+err_out_blkdev:
+	if (!single_major)
+		unregister_blkdev(rbd_dev->major, rbd_dev->name);
+err_out_id:
+	rbd_dev_id_put(rbd_dev);
+	rbd_dev_mapping_clear(rbd_dev);	/* NOTE(review): repeats err_out_mapping */
+
+	return ret;
+}
+/* Build rbd_dev->header_name for the image's format; 0 or -ENOMEM. */
+static int rbd_dev_header_name(struct rbd_device *rbd_dev)
+{
+	struct rbd_spec *spec = rbd_dev->spec;
+	const char *head;
+	const char *tail;
+	char *name;
+
+	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+
+	/* Format 1: image_name + RBD_SUFFIX; else RBD_HEADER_PREFIX + id */
+	if (rbd_dev->image_format == 1) {
+		head = spec->image_name;
+		tail = RBD_SUFFIX;
+	} else {
+		head = RBD_HEADER_PREFIX;
+		tail = spec->image_id;
+	}
+
+	/* One extra byte for the terminating NUL */
+	name = kmalloc(strlen(head) + strlen(tail) + 1, GFP_KERNEL);
+	rbd_dev->header_name = name;
+	if (!name)
+		return -ENOMEM;
+	sprintf(name, "%s%s", head, tail);
+	return 0;
+}
+/* Drop the probed header state, name and image id; then destroy rbd_dev. */
+static void rbd_dev_image_release(struct rbd_device *rbd_dev)
+{
+	rbd_dev_unprobe(rbd_dev);
+	kfree(rbd_dev->header_name);
+	rbd_dev->header_name = NULL;
+	rbd_dev->image_format = 0;
+	kfree(rbd_dev->spec->image_id);
+	rbd_dev->spec->image_id = NULL;
+
+	rbd_dev_destroy(rbd_dev);	/* last use of rbd_dev in here */
+}
+
+/*
+ * Probe for the existence of the header object for the given rbd
+ * device.  If this image is the one being mapped (i.e., not a
+ * parent), initiate a watch on its header object before using that
+ * object to get detailed information about the rbd image.
+ */
+static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
+{
+	int ret;
+
+	/*
+	 * Get the id from the image id object.  Unless there's an
+	 * error, rbd_dev->spec->image_id will be filled in with
+	 * a dynamically-allocated string, and rbd_dev->image_format
+	 * will be set to either 1 or 2.
+	 */
+	ret = rbd_dev_image_id(rbd_dev);
+	if (ret)
+		return ret;
+	rbd_assert(rbd_dev->spec->image_id);
+	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+
+	ret = rbd_dev_header_name(rbd_dev);
+	if (ret)
+		goto err_out_format;
+
+	if (mapping) {
+		ret = rbd_dev_header_watch_sync(rbd_dev);
+		if (ret)
+			goto out_header_name;
+	}
+
+	if (rbd_dev->image_format == 1)
+		ret = rbd_dev_v1_header_info(rbd_dev);
+	else
+		ret = rbd_dev_v2_header_info(rbd_dev);
+	if (ret)
+		goto err_out_watch;
+
+	ret = rbd_dev_spec_update(rbd_dev);
+	if (ret)
+		goto err_out_probe;
+
+	ret = rbd_dev_probe_parent(rbd_dev);	/* recurses for parent images */
+	if (ret)
+		goto err_out_probe;
+
+	dout("discovered format %u image, header name is %s\n",
+		rbd_dev->image_format, rbd_dev->header_name);
+
+	return 0;
+err_out_probe:
+	rbd_dev_unprobe(rbd_dev);
+err_out_watch:
+	if (mapping)	/* the watch only exists for the mapped image */
+		rbd_dev_header_unwatch_sync(rbd_dev);
+out_header_name:
+	kfree(rbd_dev->header_name);
+	rbd_dev->header_name = NULL;
+err_out_format:
+	rbd_dev->image_format = 0;
+	kfree(rbd_dev->spec->image_id);
+	rbd_dev->spec->image_id = NULL;
+
+	dout("probe failed, returning %d\n", ret);
+
+	return ret;
+}
+
+/*
+ * Map the image described by buf; returns count or a negative errno.
+ */
+static ssize_t do_rbd_add(struct bus_type *bus, const char *buf, size_t count)
+{
+	struct rbd_device *rbd_dev = NULL;
+	struct ceph_options *ceph_opts = NULL;
+	struct rbd_options *rbd_opts = NULL;
+	struct rbd_spec *spec = NULL;
+	struct rbd_client *rbdc;
+	bool read_only;
+	int rc;
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	/* parse add command */
+	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
+	if (rc < 0)
+		goto err_out_module;
+	read_only = rbd_opts->read_only;
+	kfree(rbd_opts);
+	rbd_opts = NULL;	/* done with this */
+
+	rbdc = rbd_get_client(ceph_opts);
+	if (IS_ERR(rbdc)) {
+		rc = PTR_ERR(rbdc);
+		goto err_out_args;
+	}
+
+	/* pick the pool */
+	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
+	if (rc < 0)
+		goto err_out_client;
+	spec->pool_id = (u64)rc;
+
+	/* The ceph file layout needs to fit pool id in 32 bits */
+	if (spec->pool_id > (u64)U32_MAX) {
+		rbd_warn(NULL, "pool id too large (%llu > %u)\n",
+				(unsigned long long)spec->pool_id, U32_MAX);
+		rc = -EIO;
+		goto err_out_client;
+	}
+
+	rc = -ENOMEM;		/* rc held the pool id until now */
+	rbd_dev = rbd_dev_create(rbdc, spec);
+	if (!rbd_dev)
+		goto err_out_client;
+	rbdc = NULL;		/* rbd_dev now owns this */
+	spec = NULL;		/* rbd_dev now owns this */
+
+	rc = rbd_dev_image_probe(rbd_dev, true);
+	if (rc < 0)
+		goto err_out_rbd_dev;
+
+	/* If we are mapping a snapshot it must be marked read-only */
+	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
+		read_only = true;
+	rbd_dev->mapping.read_only = read_only;
+
+	rc = rbd_dev_device_setup(rbd_dev);
+	if (rc) {
+		/*
+		 * rbd_dev_header_unwatch_sync() can't be moved into
+		 * rbd_dev_image_release() without refactoring, see
+		 * commit 1f3ef78861ac.
+		 */
+		rbd_dev_header_unwatch_sync(rbd_dev);
+		rbd_dev_image_release(rbd_dev);
+		goto err_out_module;
+	}
+
+	return count;
+
+err_out_rbd_dev:
+	rbd_dev_destroy(rbd_dev);
+err_out_client:
+	rbd_put_client(rbdc);
+err_out_args:
+	rbd_spec_put(spec);
+err_out_module:
+	module_put(THIS_MODULE);
+
+	dout("Error adding device %s\n", buf);
+
+	return (ssize_t)rc;
+}
+
+/* Add a device unless single_major is set, which yields -EINVAL. */
+static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count)
+{
+	ssize_t status = -EINVAL;
+
+	if (!single_major)
+		status = do_rbd_add(bus, buf, count);
+	return status;
+}
+
+/* Like rbd_add(), but without the single_major check. */
+static ssize_t rbd_add_single_major(struct bus_type *bus,
+				    const char *buf, size_t count)
+{
+	return do_rbd_add(bus, buf, count);
+}
+/* Undo rbd_dev_device_setup() for the rbd_dev that dev belongs to. */
+static void rbd_dev_device_release(struct device *dev)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	rbd_free_disk(rbd_dev);
+	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
+	rbd_dev_mapping_clear(rbd_dev);
+	if (!single_major)
+		unregister_blkdev(rbd_dev->major, rbd_dev->name);
+	rbd_dev_id_put(rbd_dev);
+	rbd_dev_mapping_clear(rbd_dev);	/* NOTE(review): second clear */
+}
+/* Release every ancestor of rbd_dev, topmost first. */
+static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
+{
+	while (rbd_dev->parent) {
+		struct rbd_device *child = rbd_dev;
+		struct rbd_device *ancestor = child->parent;
+
+		/*
+		 * Walk up to the topmost ancestor, i.e. the parent
+		 * that has no parent of its own, and release it.
+		 */
+		while (ancestor && ancestor->parent) {
+			child = ancestor;
+			ancestor = ancestor->parent;
+		}
+		rbd_assert(ancestor);
+		rbd_dev_image_release(ancestor);
+		/* Detach the released image from its child */
+		child->parent = NULL;
+		child->parent_overlap = 0;
+
+		rbd_assert(child->parent_spec);
+		rbd_spec_put(child->parent_spec);
+		child->parent_spec = NULL;
+	}
+}
+/* Unmap the device whose id is given in buf; returns count on success. */
+static ssize_t do_rbd_remove(struct bus_type *bus,
+			     const char *buf,
+			     size_t count)
+{
+	struct rbd_device *rbd_dev = NULL;
+	struct list_head *tmp;
+	int dev_id;
+	unsigned long ul;
+	bool already = false;
+	int ret;
+
+	ret = kstrtoul(buf, 10, &ul);
+	if (ret)
+		return ret;
+
+	/* convert to int; abort if we lost anything in the conversion */
+	dev_id = (int)ul;
+	if (dev_id != ul)
+		return -EINVAL;
+
+	ret = -ENOENT;
+	spin_lock(&rbd_dev_list_lock);
+	list_for_each(tmp, &rbd_dev_list) {
+		rbd_dev = list_entry(tmp, struct rbd_device, node);
+		if (rbd_dev->dev_id == dev_id) {
+			ret = 0;
+			break;
+		}
+	}
+	if (!ret) {
+		spin_lock_irq(&rbd_dev->lock);
+		if (rbd_dev->open_count)
+			ret = -EBUSY;
+		else
+			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
+							&rbd_dev->flags);
+		spin_unlock_irq(&rbd_dev->lock);
+	}
+	spin_unlock(&rbd_dev_list_lock);
+	if (ret < 0 || already)
+		return ret;	/* NOTE: 0 when a removal was already under way */
+
+	rbd_dev_header_unwatch_sync(rbd_dev);
+	/*
+	 * flush remaining watch callbacks - these must be complete
+	 * before the osd_client is shutdown
+	 */
+	dout("%s: flushing notifies", __func__);
+	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
+
+	/*
+	 * Don't free anything from rbd_dev->disk until after all
+	 * notifies are completely processed. Otherwise
+	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
+	 * in a potential use after free of rbd_dev->disk or rbd_dev.
+	 */
+	rbd_bus_del_dev(rbd_dev);
+	rbd_dev_image_release(rbd_dev);
+	module_put(THIS_MODULE);	/* pairs with try_module_get() in do_rbd_add() */
+
+	return count;
+}
+
+/* Remove a device unless single_major is set, which yields -EINVAL. */
+static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count)
+{
+	ssize_t status = -EINVAL;
+
+	if (!single_major)
+		status = do_rbd_remove(bus, buf, count);
+	return status;
+}
+
+/* Like rbd_remove(), but without the single_major check. */
+static ssize_t rbd_remove_single_major(struct bus_type *bus,
+				       const char *buf, size_t count)
+{
+	return do_rbd_remove(bus, buf, count);
+}
+
+/*
+ * Register the rbd root device and the rbd bus type, creating the
+ * control files in sysfs under /sys/bus/rbd/...
+ */
+static int rbd_sysfs_init(void)
+{
+	int rc = device_register(&rbd_root_dev);
+
+	if (rc < 0)
+		return rc;
+	rc = bus_register(&rbd_bus_type);
+	if (rc >= 0)
+		return rc;
+
+	/* No bus, so the root device must not stay registered either */
+	device_unregister(&rbd_root_dev);
+	return rc;
+}
+/* Unregister the bus type and root device, in reverse order of init. */
+static void rbd_sysfs_cleanup(void)
+{
+	bus_unregister(&rbd_bus_type);
+	device_unregister(&rbd_root_dev);
+}
+/* Create the three rbd slab caches; 0 on success, -ENOMEM otherwise. */
+static int rbd_slab_init(void)
+{
+	rbd_assert(!rbd_img_request_cache);
+	rbd_img_request_cache = kmem_cache_create("rbd_img_request",
+					sizeof (struct rbd_img_request),
+					__alignof__(struct rbd_img_request),
+					0, NULL);
+	if (!rbd_img_request_cache)
+		return -ENOMEM;
+
+	rbd_assert(!rbd_obj_request_cache);
+	rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
+					sizeof (struct rbd_obj_request),
+					__alignof__(struct rbd_obj_request),
+					0, NULL);
+	if (!rbd_obj_request_cache)
+		goto err_img;
+
+	rbd_assert(!rbd_segment_name_cache);
+	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
+					CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
+	if (!rbd_segment_name_cache)
+		goto err_obj;
+
+	return 0;
+err_obj:
+	kmem_cache_destroy(rbd_obj_request_cache);
+	rbd_obj_request_cache = NULL;
+err_img:
+	kmem_cache_destroy(rbd_img_request_cache);
+	rbd_img_request_cache = NULL;
+
+	return -ENOMEM;
+}
+/* Destroy the three slab caches created by rbd_slab_init(). */
+static void rbd_slab_exit(void)
+{
+	rbd_assert(rbd_segment_name_cache);
+	kmem_cache_destroy(rbd_segment_name_cache);
+	rbd_segment_name_cache = NULL;
+
+	rbd_assert(rbd_obj_request_cache);
+	kmem_cache_destroy(rbd_obj_request_cache);
+	rbd_obj_request_cache = NULL;
+
+	rbd_assert(rbd_img_request_cache);
+	kmem_cache_destroy(rbd_img_request_cache);
+	rbd_img_request_cache = NULL;
+}
+/* Module init: set up slab caches, the optional single major and sysfs. */
+static int __init rbd_init(void)
+{
+	int rc;
+
+	if (!libceph_compatible(NULL)) {
+		rbd_warn(NULL, "libceph incompatibility (quitting)");
+		return -EINVAL;
+	}
+
+	rc = rbd_slab_init();
+	if (rc)
+		return rc;
+
+	if (single_major) {
+		rbd_major = register_blkdev(0, RBD_DRV_NAME);
+		if (rbd_major < 0) {
+			rc = rbd_major;
+			goto err_out_slab;
+		}
+	}
+
+	rc = rbd_sysfs_init();
+	if (rc)
+		goto err_out_blkdev;
+
+	if (single_major)
+		pr_info("loaded (major %d)\n", rbd_major);
+	else
+		pr_info("loaded\n");
+
+	return 0;
+
+err_out_blkdev:
+	if (single_major)
+		unregister_blkdev(rbd_major, RBD_DRV_NAME);
+err_out_slab:
+	rbd_slab_exit();
+	return rc;	/* error code of the step that failed */
+}
+/* Module exit: release what rbd_init() registered, plus the device id IDA. */
+static void __exit rbd_exit(void)
+{
+	ida_destroy(&rbd_dev_id_ida);
+	rbd_sysfs_cleanup();
+	if (single_major)
+		unregister_blkdev(rbd_major, RBD_DRV_NAME);
+	rbd_slab_exit();
+}
+
+module_init(rbd_init);
+module_exit(rbd_exit);
+
+MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+/* following authorship retained from original osdblk.c */
+MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
+
+MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
new file mode 100644
index 00000000000..49d77cbcf8b
--- /dev/null
+++ b/drivers/block/rbd_types.h
@@ -0,0 +1,81 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RBD_TYPES_H
+#define CEPH_RBD_TYPES_H
+
+#include <linux/types.h>
+
+/* For format version 2, rbd image 'foo' consists of objects
+ *   rbd_id.foo		- id of image
+ *   rbd_header.<id>	- image metadata
+ *   rbd_data.<id>.0000000000000000
+ *   rbd_data.<id>.0000000000000001
+ *   ...		- data
+ * Clients do not access header data directly in rbd format 2.
+ */
+
+#define RBD_HEADER_PREFIX      "rbd_header."
+#define RBD_DATA_PREFIX        "rbd_data."
+#define RBD_ID_PREFIX          "rbd_id."
+
+/*
+ * For format version 1, rbd image 'foo' consists of objects
+ *   foo.rbd		- image metadata
+ *   rb.<idhi>.<idlo>.00000000
+ *   rb.<idhi>.<idlo>.00000001
+ *   ...		- data
+ * There is no notion of a persistent image id in rbd format 1.
+ */
+
+#define RBD_SUFFIX		".rbd"
+
+#define RBD_DIRECTORY           "rbd_directory"
+#define RBD_INFO                "rbd_info"
+
+#define RBD_DEFAULT_OBJ_ORDER	22   /* 4MB */
+#define RBD_MIN_OBJ_ORDER       16
+#define RBD_MAX_OBJ_ORDER       30
+
+#define RBD_COMP_NONE		0
+#define RBD_CRYPT_NONE		0
+
+#define RBD_HEADER_TEXT		"<<< Rados Block Device Image >>>\n"
+#define RBD_HEADER_SIGNATURE	"RBD"
+#define RBD_HEADER_VERSION	"001.005"
+
+struct rbd_image_snap_ondisk {
+	__le64 id;		/* little-endian on disk */
+	__le64 image_size;	/* little-endian on disk */
+} __attribute__((packed));
+/* Packed on-disk image header; the snaps[] entries follow it directly. */
+struct rbd_image_header_ondisk {
+	char text[40];		/* presumably RBD_HEADER_TEXT - confirm */
+	char object_prefix[24];
+	char signature[4];	/* presumably RBD_HEADER_SIGNATURE - confirm */
+	char version[8];	/* presumably RBD_HEADER_VERSION - confirm */
+	struct {
+		__u8 order;
+		__u8 crypt_type;
+		__u8 comp_type;
+		__u8 unused;
+	} __attribute__((packed)) options;
+	__le64 image_size;
+	__le64 snap_seq;
+	__le32 snap_count;
+	__le32 reserved;
+	__le64 snap_names_len;
+	struct rbd_image_snap_ondisk snaps[];	/* C99 flexible array member */
+} __attribute__((packed));
+
+
+#endif
diff --git a/drivers/block/rsxx/Makefile b/drivers/block/rsxx/Makefile
new file mode 100644
index 00000000000..b1c53c0aa45
--- /dev/null
+++ b/drivers/block/rsxx/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_BLK_DEV_RSXX) += rsxx.o
+rsxx-objs := config.o core.o cregs.o dev.o dma.o
diff --git a/drivers/block/rsxx/config.c b/drivers/block/rsxx/config.c
new file mode 100644
index 00000000000..10cd530d3e1
--- /dev/null
+++ b/drivers/block/rsxx/config.c
@@ -0,0 +1,211 @@
+/*
+* Filename: config.c
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+*	Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <linux/types.h>
+#include <linux/crc32.h>
+#include <linux/swab.h>
+
+#include "rsxx_priv.h"
+#include "rsxx_cfg.h"
+
+static void initialize_config(struct rsxx_card_cfg *cfg)
+{
+	/* Interrupt coalescing starts out disabled with zeroed tunables */
+	cfg->data.intr_coal.latency = 0;
+	cfg->data.intr_coal.count = 0;
+	cfg->data.intr_coal.mode = RSXX_INTR_COAL_DISABLED;
+	cfg->data.cache_order = (-1);
+	cfg->data.vendor_id = RSXX_VENDOR_ID_IBM;
+	cfg->data.stripe_size = RSXX_HW_BLK_SIZE;
+	cfg->data.block_size = RSXX_HW_BLK_SIZE;
+	cfg->hdr.version = RSXX_CFG_VERSION;
+}
+
+static u32 config_data_crc32(struct rsxx_card_cfg *cfg)
+{
+	/*
+	 * Return the complement of the CRC to ensure compatibility
+	 * (i.e. this is how early rsxx drivers did it.)
+	 */
+
+	return ~crc32(~0, &cfg->data, sizeof(cfg->data));
+}
+
+
+/*----------------- Config Byte Swap Functions -------------------*/
+static void config_hdr_be_to_cpu(struct card_cfg_hdr *hdr)
+{
+	hdr->crc = be32_to_cpu((__force __be32) hdr->crc);
+	hdr->version = be32_to_cpu((__force __be32) hdr->version);
+}
+/* Convert the header's version and crc from CPU to big endian order. */
+static void config_hdr_cpu_to_be(struct card_cfg_hdr *hdr)
+{
+	hdr->crc = (__force u32) cpu_to_be32(hdr->crc);
+	hdr->version = (__force u32) cpu_to_be32(hdr->version);
+}
+/* Byte-swap every 32-bit word of cfg->data in place. */
+static void config_data_swab(struct rsxx_card_cfg *cfg)
+{
+	u32 *word = (u32 *) &cfg->data;
+	u32 *end = word + (sizeof(cfg->data) / 4);
+
+	for (; word < end; word++)
+		*word = swab32(*word);
+}
+/* Convert every 32-bit word of cfg->data from little endian to CPU order. */
+static void config_data_le_to_cpu(struct rsxx_card_cfg *cfg)
+{
+	u32 *word = (u32 *) &cfg->data;
+	u32 *end = word + (sizeof(cfg->data) / 4);
+
+	for (; word < end; word++)
+		*word = le32_to_cpu((__force __le32) *word);
+}
+/* Convert every 32-bit word of cfg->data from CPU to little endian order. */
+static void config_data_cpu_to_le(struct rsxx_card_cfg *cfg)
+{
+	u32 *word = (u32 *) &cfg->data;
+	u32 *end = word + (sizeof(cfg->data) / 4);
+
+	for (; word < end; word++)
+		*word = (__force u32) cpu_to_le32(*word);
+}
+
+
+/*----------------- Config Operations ------------------*/
+static int rsxx_save_config(struct rsxx_cardinfo *card)
+{
+	struct rsxx_card_cfg out;
+
+	/* Build the stored image in a scratch copy of card->config */
+	memcpy(&out, &card->config, sizeof(out));
+
+	if (unlikely(out.hdr.version != RSXX_CFG_VERSION)) {
+		dev_err(CARD_TO_DEV(card),
+			"Cannot save config with invalid version %d\n",
+			out.hdr.version);
+		return -EINVAL;
+	}
+
+	/*
+	 * The CRC is computed over the little endian form of the
+	 * data and recorded in the header.
+	 */
+	config_data_cpu_to_le(&out);
+	out.hdr.crc = config_data_crc32(&out);
+
+	/*
+	 * Swap the data from little endian to big endian so it can be
+	 * stored.
+	 */
+	config_data_swab(&out);
+	config_hdr_cpu_to_be(&out.hdr);
+
+	/* Hand the finished image to the card; pass its status back */
+	return rsxx_creg_write(card, CREG_ADD_CONFIG, sizeof(out),
+			       &out, 1);
+}
+/* Load, validate and if necessary initialize the card's stored config. */
+int rsxx_load_config(struct rsxx_cardinfo *card)
+{
+	int st;
+	u32 crc;
+
+	st = rsxx_creg_read(card, CREG_ADD_CONFIG, sizeof(card->config),
+				&card->config, 1);
+	if (st) {
+		dev_err(CARD_TO_DEV(card),
+			"Failed reading card config.\n");
+		return st;
+	}
+
+	config_hdr_be_to_cpu(&card->config.hdr);
+
+	if (card->config.hdr.version == RSXX_CFG_VERSION) {
+		/*
+		 * We calculate the CRC with the data in little endian, because
+		 * early drivers did not take big endian CPUs into account.
+		 * The data is always stored in big endian, so we need to byte
+		 * swap it before calculating the CRC.
+		 */
+
+		config_data_swab(&card->config);
+
+		/* Check the CRC */
+		crc = config_data_crc32(&card->config);
+		if (crc != card->config.hdr.crc) {
+			dev_err(CARD_TO_DEV(card),
+				"Config corruption detected!\n");
+			dev_info(CARD_TO_DEV(card),
+				"CRC (sb x%08x is x%08x)\n",
+				card->config.hdr.crc, crc);
+			return -EIO;
+		}
+
+		/* Convert the data to CPU byteorder */
+		config_data_le_to_cpu(&card->config);
+
+	} else if (card->config.hdr.version != 0) {
+		dev_err(CARD_TO_DEV(card),
+			"Invalid config version %d.\n",
+			card->config.hdr.version);
+		/*
+		 * Config version changes require special handling from the
+		 * user
+		 */
+		return -EINVAL;
+	} else {	/* version == 0: write out a default config */
+		dev_info(CARD_TO_DEV(card),
+			"Initializing card configuration.\n");
+		initialize_config(&card->config);
+		st = rsxx_save_config(card);
+		if (st)
+			return st;
+	}
+
+	card->config_valid = 1;
+
+	dev_dbg(CARD_TO_DEV(card), "version:     x%08x\n",
+		card->config.hdr.version);
+	dev_dbg(CARD_TO_DEV(card), "crc:         x%08x\n",
+		card->config.hdr.crc);
+	dev_dbg(CARD_TO_DEV(card), "block_size:  x%08x\n",
+		card->config.data.block_size);
+	dev_dbg(CARD_TO_DEV(card), "stripe_size: x%08x\n",
+		card->config.data.stripe_size);
+	dev_dbg(CARD_TO_DEV(card), "vendor_id:   x%08x\n",
+		card->config.data.vendor_id);
+	dev_dbg(CARD_TO_DEV(card), "cache_order: x%08x\n",
+		card->config.data.cache_order);
+	dev_dbg(CARD_TO_DEV(card), "mode:        x%08x\n",
+		card->config.data.intr_coal.mode);
+	dev_dbg(CARD_TO_DEV(card), "count:       x%08x\n",
+		card->config.data.intr_coal.count);
+	dev_dbg(CARD_TO_DEV(card), "latency:     x%08x\n",
+		 card->config.data.intr_coal.latency);
+
+	return 0;
+}
+
diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c
new file mode 100644
index 00000000000..a8de2eec6ff
--- /dev/null
+++ b/drivers/block/rsxx/core.c
@@ -0,0 +1,1193 @@
+/*
+* Filename: core.c
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+*	Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/delay.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <linux/genhd.h>
+#include <linux/idr.h>
+
+#include "rsxx_priv.h"
+#include "rsxx_cfg.h"
+
+#define NO_LEGACY 0
+#define SYNC_START_TIMEOUT (10 * 60) /* 10 minutes */
+
+MODULE_DESCRIPTION("IBM Flash Adapter 900GB Full Height Device Driver");
+MODULE_AUTHOR("Joshua Morris/Philip Kelleher, IBM");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRIVER_VERSION);
+
+static unsigned int force_legacy = NO_LEGACY;
+module_param(force_legacy, uint, 0444);
+MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts");
+
+static unsigned int sync_start = 1;
+module_param(sync_start, uint, 0444);
+MODULE_PARM_DESC(sync_start, "On by Default: Driver load will not complete "
+			     "until the card startup has completed.");
+
+static DEFINE_IDA(rsxx_disk_ida);
+static DEFINE_SPINLOCK(rsxx_ida_lock);
+
+/* --------------------Debugfs Setup ------------------- */
+
+struct rsxx_cram {
+	u32 f_pos;	/* CRAM position used by the last read/write */
+	u32 offset;	/* bytes transferred so far on this open file */
+	void *i_private;	/* inode->i_private, used as the rsxx_cardinfo */
+};
+/* seq_file show routine for "pci_regs": one ioread32() per output line. */
+static int rsxx_attr_pci_regs_show(struct seq_file *m, void *p)
+{
+	struct rsxx_cardinfo *card = m->private;
+
+	seq_printf(m, "HWID		0x%08x\n",
+					ioread32(card->regmap + HWID));
+	seq_printf(m, "SCRATCH		0x%08x\n",
+					ioread32(card->regmap + SCRATCH));
+	seq_printf(m, "IER		0x%08x\n",
+					ioread32(card->regmap + IER));
+	seq_printf(m, "IPR		0x%08x\n",
+					ioread32(card->regmap + IPR));
+	seq_printf(m, "CREG_CMD		0x%08x\n",
+					ioread32(card->regmap + CREG_CMD));
+	seq_printf(m, "CREG_ADD		0x%08x\n",
+					ioread32(card->regmap + CREG_ADD));
+	seq_printf(m, "CREG_CNT		0x%08x\n",
+					ioread32(card->regmap + CREG_CNT));
+	seq_printf(m, "CREG_STAT	0x%08x\n",
+					ioread32(card->regmap + CREG_STAT));
+	seq_printf(m, "CREG_DATA0	0x%08x\n",
+					ioread32(card->regmap + CREG_DATA0));
+	seq_printf(m, "CREG_DATA1	0x%08x\n",
+					ioread32(card->regmap + CREG_DATA1));
+	seq_printf(m, "CREG_DATA2	0x%08x\n",
+					ioread32(card->regmap + CREG_DATA2));
+	seq_printf(m, "CREG_DATA3	0x%08x\n",
+					ioread32(card->regmap + CREG_DATA3));
+	seq_printf(m, "CREG_DATA4	0x%08x\n",
+					ioread32(card->regmap + CREG_DATA4));
+	seq_printf(m, "CREG_DATA5	0x%08x\n",
+					ioread32(card->regmap + CREG_DATA5));
+	seq_printf(m, "CREG_DATA6	0x%08x\n",
+					ioread32(card->regmap + CREG_DATA6));
+	seq_printf(m, "CREG_DATA7	0x%08x\n",
+					ioread32(card->regmap + CREG_DATA7));
+	seq_printf(m, "INTR_COAL	0x%08x\n",
+					ioread32(card->regmap + INTR_COAL));
+	seq_printf(m, "HW_ERROR		0x%08x\n",
+					ioread32(card->regmap + HW_ERROR));
+	seq_printf(m, "DEBUG0		0x%08x\n",
+					ioread32(card->regmap + PCI_DEBUG0));
+	seq_printf(m, "DEBUG1		0x%08x\n",
+					ioread32(card->regmap + PCI_DEBUG1));
+	seq_printf(m, "DEBUG2		0x%08x\n",
+					ioread32(card->regmap + PCI_DEBUG2));
+	seq_printf(m, "DEBUG3		0x%08x\n",
+					ioread32(card->regmap + PCI_DEBUG3));
+	seq_printf(m, "DEBUG4		0x%08x\n",
+					ioread32(card->regmap + PCI_DEBUG4));
+	seq_printf(m, "DEBUG5		0x%08x\n",
+					ioread32(card->regmap + PCI_DEBUG5));
+	seq_printf(m, "DEBUG6		0x%08x\n",
+					ioread32(card->regmap + PCI_DEBUG6));
+	seq_printf(m, "DEBUG7		0x%08x\n",
+					ioread32(card->regmap + PCI_DEBUG7));
+	seq_printf(m, "RECONFIG		0x%08x\n",
+					ioread32(card->regmap + PCI_RECONFIG));
+
+	return 0;
+}
+/* seq_file show routine for "stats": per-controller counters, one per line. */
+static int rsxx_attr_stats_show(struct seq_file *m, void *p)
+{
+	struct rsxx_cardinfo *card = m->private;
+	int i;
+
+	for (i = 0; i < card->n_targets; i++) {
+		seq_printf(m, "Ctrl %d CRC Errors	= %d\n",
+				i, card->ctrl[i].stats.crc_errors);
+		seq_printf(m, "Ctrl %d Hard Errors	= %d\n",
+				i, card->ctrl[i].stats.hard_errors);
+		seq_printf(m, "Ctrl %d Soft Errors	= %d\n",
+				i, card->ctrl[i].stats.soft_errors);
+		seq_printf(m, "Ctrl %d Writes Issued	= %d\n",
+				i, card->ctrl[i].stats.writes_issued);
+		seq_printf(m, "Ctrl %d Writes Failed	= %d\n",
+				i, card->ctrl[i].stats.writes_failed);
+		seq_printf(m, "Ctrl %d Reads Issued	= %d\n",
+				i, card->ctrl[i].stats.reads_issued);
+		seq_printf(m, "Ctrl %d Reads Failed	= %d\n",
+				i, card->ctrl[i].stats.reads_failed);
+		seq_printf(m, "Ctrl %d Reads Retried	= %d\n",
+				i, card->ctrl[i].stats.reads_retried);
+		seq_printf(m, "Ctrl %d Discards Issued	= %d\n",
+				i, card->ctrl[i].stats.discards_issued);
+		seq_printf(m, "Ctrl %d Discards Failed	= %d\n",
+				i, card->ctrl[i].stats.discards_failed);
+		seq_printf(m, "Ctrl %d DMA SW Errors	= %d\n",
+				i, card->ctrl[i].stats.dma_sw_err);
+		seq_printf(m, "Ctrl %d DMA HW Faults	= %d\n",
+				i, card->ctrl[i].stats.dma_hw_fault);
+		seq_printf(m, "Ctrl %d DMAs Cancelled	= %d\n",
+				i, card->ctrl[i].stats.dma_cancelled);
+		seq_printf(m, "Ctrl %d SW Queue Depth	= %d\n",
+				i, card->ctrl[i].stats.sw_q_depth);
+		seq_printf(m, "Ctrl %d HW Queue Depth	= %d\n",
+			i, atomic_read(&card->ctrl[i].stats.hw_q_depth));
+	}
+
+	return 0;
+}
+/* Open "stats" through single_open(), handing it inode->i_private. */
+static int rsxx_attr_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, rsxx_attr_stats_show, inode->i_private);
+}
+/* Open "pci_regs" through single_open(), handing it inode->i_private. */
+static int rsxx_attr_pci_regs_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, rsxx_attr_pci_regs_show, inode->i_private);
+}
+
+/* debugfs "cram" read: cnt bytes from the card to ubuf; cnt or -errno. */
+static ssize_t rsxx_cram_read(struct file *fp, char __user *ubuf,
+			      size_t cnt, loff_t *ppos)
+{
+	struct rsxx_cram *info = fp->private_data;
+	struct rsxx_cardinfo *card = info->i_private;
+	char *buf;
+	int st;
+
+	buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	info->f_pos = (u32)*ppos + info->offset;
+
+	st = rsxx_creg_read(card, CREG_ADD_CRAM + info->f_pos, cnt, buf, 1);
+	/* copy_to_user() returns the bytes left over, not an errno */
+	if (!st && copy_to_user(ubuf, buf, cnt))
+		st = -EFAULT;
+
+	/* buf is only a bounce buffer; release it on every path */
+	kfree(buf);
+	if (st)
+		return st;
+
+	info->offset += cnt;
+	return cnt;
+}
+
+/* debugfs "cram" write: cnt bytes from ubuf to the card; cnt or -errno. */
+static ssize_t rsxx_cram_write(struct file *fp, const char __user *ubuf,
+			       size_t cnt, loff_t *ppos)
+{
+	struct rsxx_cram *info = fp->private_data;
+	struct rsxx_cardinfo *card = info->i_private;
+	char *buf;
+	int st;
+
+	buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	/* copy_from_user() returns the bytes left over, not an errno */
+	st = copy_from_user(buf, ubuf, cnt) ? -EFAULT : 0;
+	if (!st) {
+		info->f_pos = (u32)*ppos + info->offset;
+		st = rsxx_creg_write(card, CREG_ADD_CRAM + info->f_pos,
+				     cnt, buf, 1);
+	}
+	/* buf is only a bounce buffer; release it on every path */
+	kfree(buf);
+	if (st)
+		return st;
+
+	info->offset += cnt;
+	return cnt;
+}
+
+/* Allocate the per-open CRAM state and hang it off file->private_data. */
+static int rsxx_cram_open(struct inode *inode, struct file *file)
+{
+	struct rsxx_cram *cram = kzalloc(sizeof(struct rsxx_cram), GFP_KERNEL);
+
+	if (cram == NULL)
+		return -ENOMEM;
+	cram->f_pos = file->f_pos;
+	cram->i_private = inode->i_private;
+	file->private_data = cram;
+	return 0;
+}
+
+/* Free the state attached by rsxx_cram_open(), if there is any. */
+static int rsxx_cram_release(struct inode *inode, struct file *file)
+{
+	struct rsxx_cram *cram = file->private_data;
+
+	if (cram) {
+		file->private_data = NULL;
+		kfree(cram);
+	}
+
+	return 0;
+}
+/* File operations for the debugfs "cram" file. */
+static const struct file_operations debugfs_cram_fops = {
+	.owner		= THIS_MODULE,
+	.open		= rsxx_cram_open,
+	.read		= rsxx_cram_read,
+	.write		= rsxx_cram_write,
+	.release	= rsxx_cram_release,
+};
+/* File operations for the read-only, seq_file based debugfs "stats" file. */
+static const struct file_operations debugfs_stats_fops = {
+	.owner		= THIS_MODULE,
+	.open		= rsxx_attr_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+/* File operations for the read-only, seq_file based debugfs "pci_regs" file. */
+static const struct file_operations debugfs_pci_regs_fops = {
+	.owner		= THIS_MODULE,
+	.open		= rsxx_attr_pci_regs_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+/* Create the card's debugfs directory with "stats", "pci_regs" and "cram". */
+static void rsxx_debugfs_dev_new(struct rsxx_cardinfo *card)
+{
+	struct dentry *debugfs_stats;
+	struct dentry *debugfs_pci_regs;
+	struct dentry *debugfs_cram;
+
+	card->debugfs_dir = debugfs_create_dir(card->gendisk->disk_name, NULL);
+	if (IS_ERR_OR_NULL(card->debugfs_dir))
+		goto failed_debugfs_dir;
+
+	debugfs_stats = debugfs_create_file("stats", S_IRUGO,
+					    card->debugfs_dir, card,
+					    &debugfs_stats_fops);
+	if (IS_ERR_OR_NULL(debugfs_stats))
+		goto failed_debugfs_stats;
+
+	debugfs_pci_regs = debugfs_create_file("pci_regs", S_IRUGO,
+					       card->debugfs_dir, card,
+					       &debugfs_pci_regs_fops);
+	if (IS_ERR_OR_NULL(debugfs_pci_regs))
+		goto failed_debugfs_pci_regs;
+
+	debugfs_cram = debugfs_create_file("cram", S_IRUGO | S_IWUSR,
+					   card->debugfs_dir, card,
+					   &debugfs_cram_fops);
+	if (IS_ERR_OR_NULL(debugfs_cram))
+		goto failed_debugfs_cram;
+
+	return;
+failed_debugfs_cram:
+	debugfs_remove(debugfs_pci_regs);
+failed_debugfs_pci_regs:
+	debugfs_remove(debugfs_stats);
+failed_debugfs_stats:
+	debugfs_remove(card->debugfs_dir);
+failed_debugfs_dir:
+	card->debugfs_dir = NULL;	/* every failure path ends up here */
+}
+
+/*----------------- Interrupt Control & Handling -------------------*/
+
+/*
+ * Clear the driver's cached ISR and IER masks. Only the in-memory copies
+ * are changed; no hardware register is written here.
+ */
+static void rsxx_mask_interrupts(struct rsxx_cardinfo *card)
+{
+	card->isr_mask = 0;
+	card->ier_mask = 0;
+}
+
+/* Set the bits given in @intr in the mask that @mask points to. */
+static void __enable_intr(unsigned int *mask, unsigned int intr)
+{
+	*mask = *mask | intr;
+}
+
+/* Clear the bits given in @intr from the mask that @mask points to. */
+static void __disable_intr(unsigned int *mask, unsigned int intr)
+{
+	*mask = *mask & ~intr;
+}
+
+/*
+ * NOTE: Disabling the IER will disable the hardware interrupt.
+ * Disabling the ISR will disable the software handling of the ISR bit.
+ *
+ * Enable/Disable interrupt functions assume the card->irq_lock
+ * is held by the caller.
+ */
+/*
+ * Enable the hardware interrupt source(s) in @intr: update the cached IER
+ * mask and write it to the IER register. Does nothing while the card is
+ * halted or in EEH state.
+ */
+void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr)
+{
+	if (unlikely(card->halt))
+		return;
+
+	if (unlikely(card->eeh_state))
+		return;
+
+	__enable_intr(&card->ier_mask, intr);
+	iowrite32(card->ier_mask, card->regmap + IER);
+}
+
+/*
+ * Disable the hardware interrupt source(s) in @intr: update the cached IER
+ * mask and write it to the IER register. Does nothing while the card is in
+ * EEH state. Unlike the enable path, a halted card is not skipped.
+ */
+void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr)
+{
+	if (unlikely(card->eeh_state))
+		return;
+
+	__disable_intr(&card->ier_mask, intr);
+	iowrite32(card->ier_mask, card->regmap + IER);
+}
+
+/*
+ * Enable both the software handling (ISR mask) and the hardware interrupt
+ * (IER mask and register) for the source(s) in @intr. Does nothing while
+ * the card is halted or in EEH state.
+ */
+void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card,
+				 unsigned int intr)
+{
+	if (unlikely(card->halt))
+		return;
+
+	if (unlikely(card->eeh_state))
+		return;
+
+	__enable_intr(&card->isr_mask, intr);
+	__enable_intr(&card->ier_mask, intr);
+	iowrite32(card->ier_mask, card->regmap + IER);
+}
+/*
+ * Disable both the software handling (ISR mask) and the hardware interrupt
+ * (IER mask and register) for the source(s) in @intr. Does nothing while
+ * the card is in EEH state.
+ */
+void rsxx_disable_ier_and_isr(struct rsxx_cardinfo *card,
+				  unsigned int intr)
+{
+	if (unlikely(card->eeh_state))
+		return;
+
+	__disable_intr(&card->isr_mask, intr);
+	__disable_intr(&card->ier_mask, intr);
+	iowrite32(card->ier_mask, card->regmap + IER);
+}
+
+/*
+ * Interrupt handler for the card.
+ *
+ * With card->irq_lock held, reads the ISR register, filters it through
+ * card->isr_mask and queues the matching work item for every pending
+ * source (per-target DMA completion, creg completion, card event). The
+ * register is read again whenever a DMA interrupt had to be disabled
+ * during the pass.
+ *
+ * Returns IRQ_HANDLED if at least one source was dispatched, IRQ_NONE
+ * otherwise.
+ */
+static irqreturn_t rsxx_isr(int irq, void *pdata)
+{
+	struct rsxx_cardinfo *card = pdata;
+	unsigned int isr;
+	int handled = 0;
+	int reread_isr;
+	int i;
+
+	spin_lock(&card->irq_lock);
+
+	do {
+		reread_isr = 0;
+
+		/* Do not touch the hardware while in EEH state. */
+		if (unlikely(card->eeh_state))
+			break;
+
+		isr = ioread32(card->regmap + ISR);
+		if (isr == 0xffffffff) {
+			/*
+			 * A few systems seem to have an intermittent issue
+			 * where PCI reads return all Fs, but retrying the read
+			 * a little later will return as expected.
+			 */
+			dev_info(CARD_TO_DEV(card),
+				"ISR = 0xFFFFFFFF, retrying later\n");
+			break;
+		}
+
+		/* Only consider sources whose software handling is enabled. */
+		isr &= card->isr_mask;
+		if (!isr)
+			break;
+
+		for (i = 0; i < card->n_targets; i++) {
+			if (isr & CR_INTR_DMA(i)) {
+				/*
+				 * Turn the hardware interrupt for this target
+				 * off (if it is on) and request another pass
+				 * over the ISR register.
+				 */
+				if (card->ier_mask & CR_INTR_DMA(i)) {
+					rsxx_disable_ier(card, CR_INTR_DMA(i));
+					reread_isr = 1;
+				}
+				queue_work(card->ctrl[i].done_wq,
+					   &card->ctrl[i].dma_done_work);
+				handled++;
+			}
+		}
+
+		if (isr & CR_INTR_CREG) {
+			queue_work(card->creg_ctrl.creg_wq,
+				   &card->creg_ctrl.done_work);
+			handled++;
+		}
+
+		if (isr & CR_INTR_EVENT) {
+			/*
+			 * The event source is disabled here;
+			 * card_event_handler() enables it again.
+			 */
+			queue_work(card->event_wq, &card->event_work);
+			rsxx_disable_ier_and_isr(card, CR_INTR_EVENT);
+			handled++;
+		}
+	} while (reread_isr);
+
+	spin_unlock(&card->irq_lock);
+
+	return handled ? IRQ_HANDLED : IRQ_NONE;
+}
+
+/*----------------- Card Event Handler -------------------*/
+/*
+ * Map a card state value to a printable name.
+ *
+ * ffs() yields the 1-based position of the lowest set bit (0 when no bit
+ * is set) and is used directly as the table index, so a state of 0 maps to
+ * "Unknown". A state whose lowest set bit lies beyond the table is also
+ * reported as "Unknown" instead of indexing past the end of the array.
+ *
+ * Returns a pointer to a constant string; the caller must not free it.
+ */
+static const char *rsxx_card_state_to_str(unsigned int state)
+{
+	static const char * const state_strings[] = {
+		"Unknown", "Shutdown", "Starting", "Formatting",
+		"Uninitialized", "Good", "Shutting Down",
+		"Fault", "Read Only Fault", "dStroying"
+	};
+	unsigned int idx = ffs(state);
+
+	/* Guard against state bits this table does not know about. */
+	if (idx >= sizeof(state_strings) / sizeof(state_strings[0]))
+		idx = 0;
+
+	return state_strings[idx];
+}
+
+/*
+ * React to a change of the card state.
+ *
+ * Logs the transition and records @new_state in card->state. If the card
+ * configuration is valid, the gendisk capacity is then adjusted: it is set
+ * from card->size8 for the GOOD and RD_ONLY_FAULT states and to 0 for every
+ * other state handled below.
+ */
+static void card_state_change(struct rsxx_cardinfo *card,
+			      unsigned int new_state)
+{
+	int st;
+
+	dev_info(CARD_TO_DEV(card),
+		"card state change detected.(%s -> %s)\n",
+		rsxx_card_state_to_str(card->state),
+		rsxx_card_state_to_str(new_state));
+
+	card->state = new_state;
+
+	/* Don't attach DMA interfaces if the card has an invalid config */
+	if (!card->config_valid)
+		return;
+
+	switch (new_state) {
+	case CARD_STATE_RD_ONLY_FAULT:
+		dev_crit(CARD_TO_DEV(card),
+			"Hardware has entered read-only mode!\n");
+		/*
+		 * Fall through so the DMA devices can be attached and
+		 * the user can attempt to pull off their data.
+		 */
+	case CARD_STATE_GOOD:
+		/*
+		 * A failed size query is only logged; the capacity is still
+		 * set from whatever card->size8 holds at this point.
+		 */
+		st = rsxx_get_card_size8(card, &card->size8);
+		if (st)
+			dev_err(CARD_TO_DEV(card),
+				"Failed attaching DMA devices\n");
+
+		/* size8 is shifted right by 9 to form the capacity value. */
+		if (card->config_valid)
+			set_capacity(card->gendisk, card->size8 >> 9);
+		break;
+
+	case CARD_STATE_FAULT:
+		dev_crit(CARD_TO_DEV(card),
+			"Hardware Fault reported!\n");
+		/* Fall through. */
+
+	/* Everything else, detach DMA interface if it's attached. */
+	case CARD_STATE_SHUTDOWN:
+	case CARD_STATE_STARTING:
+	case CARD_STATE_FORMATTING:
+	case CARD_STATE_UNINITIALIZED:
+	case CARD_STATE_SHUTTING_DOWN:
+	/*
+	 * dStroy is a term coined by marketing to represent the low level
+	 * secure erase.
+	 */
+	case CARD_STATE_DSTROYING:
+		set_capacity(card->gendisk, 0);
+		break;
+	}
+}
+
+/*
+ * Work function for card->event_work, queued by rsxx_isr() when the card
+ * raises an event interrupt.
+ *
+ * Re-enables the event interrupt, reads the current card state, hands any
+ * state change to card_state_change() and reads the hardware log if the
+ * last creg status has CREG_STAT_LOG_PENDING set. Returns early without
+ * doing anything when the card is halted.
+ */
+static void card_event_handler(struct work_struct *work)
+{
+	struct rsxx_cardinfo *card;
+	unsigned int state;
+	unsigned long flags;
+	int st;
+
+	card = container_of(work, struct rsxx_cardinfo, event_work);
+
+	if (unlikely(card->halt))
+		return;
+
+	/*
+	 * Enable the interrupt now to avoid any weird race conditions where a
+	 * state change might occur while rsxx_get_card_state() is
+	 * processing a returned creg cmd.
+	 */
+	spin_lock_irqsave(&card->irq_lock, flags);
+	rsxx_enable_ier_and_isr(card, CR_INTR_EVENT);
+	spin_unlock_irqrestore(&card->irq_lock, flags);
+
+	st = rsxx_get_card_state(card, &state);
+	if (st) {
+		dev_info(CARD_TO_DEV(card),
+			"Failed reading state after event.\n");
+		return;
+	}
+
+	if (card->state != state)
+		card_state_change(card, state);
+
+	if (card->creg_ctrl.creg_stats.stat & CREG_STAT_LOG_PENDING)
+		rsxx_read_hw_log(card);
+}
+
+/*----------------- Card Operations -------------------*/
+/*
+ * Bring the card into the SHUTDOWN state.
+ *
+ * First polls the card state until it is no longer STARTING, then issues
+ * CARD_CMD_SHUTDOWN (unless the card is already shutting down or shut
+ * down) and polls again until the state reads SHUTDOWN. Each polling phase
+ * is bounded by a 120 second timeout.
+ *
+ * Returns 0 on success, -ETIMEDOUT if either phase times out, or the error
+ * returned by rsxx_get_card_state()/rsxx_issue_card_cmd().
+ */
+static int card_shutdown(struct rsxx_cardinfo *card)
+{
+	unsigned int state;
+	signed long start;
+	const int timeout = msecs_to_jiffies(120000);
+	int st;
+
+	/* We can't issue a shutdown if the card is in a transition state */
+	start = jiffies;
+	do {
+		st = rsxx_get_card_state(card, &state);
+		if (st)
+			return st;
+	} while (state == CARD_STATE_STARTING &&
+		 (jiffies - start < timeout));
+
+	if (state == CARD_STATE_STARTING)
+		return -ETIMEDOUT;
+
+	/* Only issue a shutdown if we need to */
+	if ((state != CARD_STATE_SHUTTING_DOWN) &&
+	    (state != CARD_STATE_SHUTDOWN)) {
+		st = rsxx_issue_card_cmd(card, CARD_CMD_SHUTDOWN);
+		if (st)
+			return st;
+	}
+
+	/* Wait for the card to report that the shutdown has completed. */
+	start = jiffies;
+	do {
+		st = rsxx_get_card_state(card, &state);
+		if (st)
+			return st;
+	} while (state != CARD_STATE_SHUTDOWN &&
+		 (jiffies - start < timeout));
+
+	if (state != CARD_STATE_SHUTDOWN)
+		return -ETIMEDOUT;
+
+	return 0;
+}
+
+/*
+ * Prepare the card for a PCI slot reset after an EEH error.
+ *
+ * Marks the card as being in EEH state, clears the cached interrupt masks,
+ * disables the PCI device, saves the issued DMAs and creg command, and
+ * releases the per-target status and command buffers.
+ *
+ * The buffer pointers are reset to NULL once freed: the failure path of
+ * rsxx_slot_reset() frees every buffer whose pointer is still non-NULL, so
+ * a stale pointer left behind here would be freed a second time.
+ *
+ * Returns 0 on success or the error from rsxx_eeh_save_issued_dmas(), in
+ * which case the buffers are left untouched.
+ */
+static int rsxx_eeh_frozen(struct pci_dev *dev)
+{
+	struct rsxx_cardinfo *card = pci_get_drvdata(dev);
+	int i;
+	int st;
+
+	dev_warn(&dev->dev, "IBM Flash Adapter PCI: preparing for slot reset.\n");
+
+	card->eeh_state = 1;
+	rsxx_mask_interrupts(card);
+
+	/*
+	 * We need to guarantee that the write for eeh_state and masking
+	 * interrupts does not become reordered. This will prevent a possible
+	 * race condition with the EEH code.
+	 */
+	wmb();
+
+	pci_disable_device(dev);
+
+	st = rsxx_eeh_save_issued_dmas(card);
+	if (st)
+		return st;
+
+	rsxx_eeh_save_issued_creg(card);
+
+	for (i = 0; i < card->n_targets; i++) {
+		if (card->ctrl[i].status.buf) {
+			pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8,
+					    card->ctrl[i].status.buf,
+					    card->ctrl[i].status.dma_addr);
+			card->ctrl[i].status.buf = NULL;
+		}
+		if (card->ctrl[i].cmd.buf) {
+			pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8,
+					    card->ctrl[i].cmd.buf,
+					    card->ctrl[i].cmd.dma_addr);
+			card->ctrl[i].cmd.buf = NULL;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Give up on a card after an unrecoverable EEH error.
+ *
+ * Marks the card as halted and in EEH state, then, for every DMA target,
+ * cleans up the queued DMAs (under the target's queue_lock) and cancels
+ * the remaining ones via rsxx_dma_cancel(). The number of DMAs reported by
+ * those two calls is logged per channel when it is non-zero.
+ */
+static void rsxx_eeh_failure(struct pci_dev *dev)
+{
+	struct rsxx_cardinfo *card = pci_get_drvdata(dev);
+	int i;
+	int cnt = 0;
+
+	dev_err(&dev->dev, "IBM Flash Adapter PCI: disabling failed card.\n");
+
+	card->eeh_state = 1;
+	card->halt = 1;
+
+	for (i = 0; i < card->n_targets; i++) {
+		spin_lock_bh(&card->ctrl[i].queue_lock);
+		cnt = rsxx_cleanup_dma_queue(&card->ctrl[i],
+					     &card->ctrl[i].queue,
+					     COMPLETE_DMA);
+		spin_unlock_bh(&card->ctrl[i].queue_lock);
+
+		cnt += rsxx_dma_cancel(&card->ctrl[i]);
+
+		if (cnt)
+			dev_info(CARD_TO_DEV(card),
+				"Freed %d queued DMAs on channel %d\n",
+				cnt, card->ctrl[i].id);
+	}
+}
+
+/*
+ * Wait for the hardware to finish resetting itself after a slot reset.
+ *
+ * Samples the PCI_RECONFIG register up to 10 times, sleeping one second
+ * after every sample that still shows RSXX_FLUSH_BUSY. A set
+ * RSXX_FLUSH_TIMEOUT bit is only warned about.
+ *
+ * Returns 0 once the busy bit is clear, -1 if it never clears.
+ */
+static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card)
+{
+	unsigned int reconfig;
+	int attempt;
+
+	for (attempt = 0; attempt < 10; attempt++) {
+		reconfig = ioread32(card->regmap + PCI_RECONFIG);
+
+		if (!(reconfig & RSXX_FLUSH_BUSY)) {
+			if (reconfig & RSXX_FLUSH_TIMEOUT)
+				dev_warn(CARD_TO_DEV(card), "HW: flash controller timeout\n");
+			return 0;
+		}
+
+		/* Still flushing: give the hardware another second. */
+		ssleep(1);
+	}
+
+	/* Hardware failed resetting itself. */
+	return -1;
+}
+
+/*
+ * PCI error handler callback (.error_detected).
+ *
+ * Returns PCI_ERS_RESULT_NONE for device revisions below RSXX_EEH_SUPPORT.
+ * On a permanent channel failure, or if preparing for the slot reset
+ * fails, the card is disabled via rsxx_eeh_failure() and
+ * PCI_ERS_RESULT_DISCONNECT is returned. Otherwise the card is frozen and
+ * PCI_ERS_RESULT_NEED_RESET is returned.
+ */
+static pci_ers_result_t rsxx_error_detected(struct pci_dev *dev,
+					    enum pci_channel_state error)
+{
+	int st;
+
+	if (dev->revision < RSXX_EEH_SUPPORT)
+		return PCI_ERS_RESULT_NONE;
+
+	if (error == pci_channel_io_perm_failure) {
+		rsxx_eeh_failure(dev);
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	st = rsxx_eeh_frozen(dev);
+	if (st) {
+		dev_err(&dev->dev, "Slot reset setup failed\n");
+		rsxx_eeh_failure(dev);
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/*
+ * PCI error handler callback (.slot_reset): bring the card back after the
+ * slot has been reset.
+ *
+ * Re-enables the PCI device, waits for the hardware flush to finish,
+ * resets the DMA queues, re-creates the per-target hardware buffers,
+ * reconfigures DMA (when the config is valid), clears eeh_state,
+ * re-enables interrupts and restarts creg and DMA processing.
+ *
+ * Returns PCI_ERS_RESULT_RECOVERED on success. On any failure the card is
+ * disabled via rsxx_eeh_failure() and PCI_ERS_RESULT_DISCONNECT is
+ * returned.
+ */
+static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
+{
+	struct rsxx_cardinfo *card = pci_get_drvdata(dev);
+	unsigned long flags;
+	int i;
+	int st;
+
+	dev_warn(&dev->dev,
+		"IBM Flash Adapter PCI: recovering from slot reset.\n");
+
+	st = pci_enable_device(dev);
+	if (st)
+		goto failed_hw_setup;
+
+	pci_set_master(dev);
+
+	st = rsxx_eeh_fifo_flush_poll(card);
+	if (st)
+		goto failed_hw_setup;
+
+	rsxx_dma_queue_reset(card);
+
+	for (i = 0; i < card->n_targets; i++) {
+		st = rsxx_hw_buffers_init(dev, &card->ctrl[i]);
+		if (st)
+			goto failed_hw_buffers_init;
+	}
+
+	if (card->config_valid)
+		rsxx_dma_configure(card);
+
+	/* Clears the ISR register from spurious interrupts */
+	st = ioread32(card->regmap + ISR);
+
+	/* From here on the interrupt enable helpers act on hardware again. */
+	card->eeh_state = 0;
+
+	spin_lock_irqsave(&card->irq_lock, flags);
+	/* NOTE(review): bitwise test against RSXX_MAX_TARGETS — confirm intent. */
+	if (card->n_targets & RSXX_MAX_TARGETS)
+		rsxx_enable_ier_and_isr(card, CR_INTR_ALL_G);
+	else
+		rsxx_enable_ier_and_isr(card, CR_INTR_ALL_C);
+	spin_unlock_irqrestore(&card->irq_lock, flags);
+
+	rsxx_kick_creg_queue(card);
+
+	/* Restart DMA issue work on every target that has queued DMAs. */
+	for (i = 0; i < card->n_targets; i++) {
+		spin_lock(&card->ctrl[i].queue_lock);
+		if (list_empty(&card->ctrl[i].queue)) {
+			spin_unlock(&card->ctrl[i].queue_lock);
+			continue;
+		}
+		spin_unlock(&card->ctrl[i].queue_lock);
+
+		queue_work(card->ctrl[i].issue_wq,
+				&card->ctrl[i].issue_dma_work);
+	}
+
+	dev_info(&dev->dev, "IBM Flash Adapter PCI: recovery complete.\n");
+
+	return PCI_ERS_RESULT_RECOVERED;
+
+	/* Free every target buffer whose pointer is non-NULL. */
+failed_hw_buffers_init:
+	for (i = 0; i < card->n_targets; i++) {
+		if (card->ctrl[i].status.buf)
+			pci_free_consistent(card->dev,
+					STATUS_BUFFER_SIZE8,
+					card->ctrl[i].status.buf,
+					card->ctrl[i].status.dma_addr);
+		if (card->ctrl[i].cmd.buf)
+			pci_free_consistent(card->dev,
+					COMMAND_BUFFER_SIZE8,
+					card->ctrl[i].cmd.buf,
+					card->ctrl[i].cmd.dma_addr);
+	}
+failed_hw_setup:
+	rsxx_eeh_failure(dev);
+	return PCI_ERS_RESULT_DISCONNECT;
+
+}
+
+/*----------------- Driver Initialization & Setup -------------------*/
+/*
+ * Compare the card's PCI revision ID with the newest revision this driver
+ * supports.
+ *
+ * Returns:   0 if the driver is compatible with the device
+ *	     -1 if the driver is NOT compatible with the device
+ */
+static int rsxx_compatibility_check(struct rsxx_cardinfo *card)
+{
+	unsigned char revision;
+
+	pci_read_config_byte(card->dev, PCI_REVISION_ID, &revision);
+
+	return (revision > RS70_PCI_REV_SUPPORTED) ? -1 : 0;
+}
+
+/*
+ * PCI probe callback: bring up one PCI-Flash SSD.
+ *
+ * Allocates the card structure and a disk id, enables the PCI device, maps
+ * BAR0, installs the interrupt handler (MSI unless force_legacy is set),
+ * sets up the creg interface, loads the card config, sets up the DMA
+ * engine and the event workqueue, creates the block device, optionally
+ * starts the card, attaches the device and creates the debugfs entries.
+ *
+ * Returns 0 on success or a negative errno. On failure everything acquired
+ * up to that point is released through the labels at the end of the
+ * function, in reverse order of acquisition.
+ */
+static int rsxx_pci_probe(struct pci_dev *dev,
+					const struct pci_device_id *id)
+{
+	struct rsxx_cardinfo *card;
+	int st;
+	unsigned int sync_timeout;
+
+	dev_info(&dev->dev, "PCI-Flash SSD discovered\n");
+
+	card = kzalloc(sizeof(*card), GFP_KERNEL);
+	if (!card)
+		return -ENOMEM;
+
+	card->dev = dev;
+	pci_set_drvdata(dev, card);
+
+	/* Retry the id allocation for as long as the IDA asks us to. */
+	do {
+		if (!ida_pre_get(&rsxx_disk_ida, GFP_KERNEL)) {
+			st = -ENOMEM;
+			goto failed_ida_get;
+		}
+
+		spin_lock(&rsxx_ida_lock);
+		st = ida_get_new(&rsxx_disk_ida, &card->disk_id);
+		spin_unlock(&rsxx_ida_lock);
+	} while (st == -EAGAIN);
+
+	if (st)
+		goto failed_ida_get;
+
+	st = pci_enable_device(dev);
+	if (st)
+		goto failed_enable;
+
+	pci_set_master(dev);
+	pci_set_dma_max_seg_size(dev, RSXX_HW_BLK_SIZE);
+
+	st = pci_set_dma_mask(dev, DMA_BIT_MASK(64));
+	if (st) {
+		dev_err(CARD_TO_DEV(card),
+			"No usable DMA configuration,aborting\n");
+		goto failed_dma_mask;
+	}
+
+	st = pci_request_regions(dev, DRIVER_NAME);
+	if (st) {
+		dev_err(CARD_TO_DEV(card),
+			"Failed to request memory region\n");
+		goto failed_request_regions;
+	}
+
+	if (pci_resource_len(dev, 0) == 0) {
+		dev_err(CARD_TO_DEV(card), "BAR0 has length 0!\n");
+		st = -ENOMEM;
+		goto failed_iomap;
+	}
+
+	card->regmap = pci_iomap(dev, 0, 0);
+	if (!card->regmap) {
+		dev_err(CARD_TO_DEV(card), "Failed to map BAR0\n");
+		st = -ENOMEM;
+		goto failed_iomap;
+	}
+
+	spin_lock_init(&card->irq_lock);
+	card->halt = 0;
+	card->eeh_state = 0;
+
+	/* Start with every interrupt source disabled. */
+	spin_lock_irq(&card->irq_lock);
+	rsxx_disable_ier_and_isr(card, CR_INTR_ALL);
+	spin_unlock_irq(&card->irq_lock);
+
+	/* An MSI failure is not fatal; it is only warned about. */
+	if (!force_legacy) {
+		st = pci_enable_msi(dev);
+		if (st)
+			dev_warn(CARD_TO_DEV(card),
+				"Failed to enable MSI\n");
+	}
+
+	st = request_irq(dev->irq, rsxx_isr, IRQF_DISABLED | IRQF_SHARED,
+			 DRIVER_NAME, card);
+	if (st) {
+		dev_err(CARD_TO_DEV(card),
+			"Failed requesting IRQ%d\n", dev->irq);
+		goto failed_irq;
+	}
+
+	/************* Setup Processor Command Interface *************/
+	st = rsxx_creg_setup(card);
+	if (st) {
+		dev_err(CARD_TO_DEV(card), "Failed to setup creg interface.\n");
+		goto failed_creg_setup;
+	}
+
+	spin_lock_irq(&card->irq_lock);
+	rsxx_enable_ier_and_isr(card, CR_INTR_CREG);
+	spin_unlock_irq(&card->irq_lock);
+
+	st = rsxx_compatibility_check(card);
+	if (st) {
+		dev_warn(CARD_TO_DEV(card),
+			"Incompatible driver detected. Please update the driver.\n");
+		st = -EINVAL;
+		goto failed_compatiblity_check;
+	}
+
+	/************* Load Card Config *************/
+	st = rsxx_load_config(card);
+	if (st)
+		dev_err(CARD_TO_DEV(card),
+			"Failed loading card config\n");
+
+	/************* Setup DMA Engine *************/
+	st = rsxx_get_num_targets(card, &card->n_targets);
+	if (st)
+		dev_info(CARD_TO_DEV(card),
+			"Failed reading the number of DMA targets\n");
+
+	/* kcalloc() checks the count * size multiplication for overflow. */
+	card->ctrl = kcalloc(card->n_targets, sizeof(*card->ctrl), GFP_KERNEL);
+	if (!card->ctrl) {
+		st = -ENOMEM;
+		goto failed_dma_setup;
+	}
+
+	st = rsxx_dma_setup(card);
+	if (st) {
+		dev_info(CARD_TO_DEV(card),
+			"Failed to setup DMA engine\n");
+		goto failed_dma_setup;
+	}
+
+	/************* Setup Card Event Handler *************/
+	card->event_wq = create_singlethread_workqueue(DRIVER_NAME"_event");
+	if (!card->event_wq) {
+		dev_err(CARD_TO_DEV(card), "Failed card event setup.\n");
+		/*
+		 * st still holds 0 from rsxx_dma_setup(); without an explicit
+		 * error code probe would report success after freeing card.
+		 */
+		st = -ENOMEM;
+		goto failed_event_handler;
+	}
+
+	INIT_WORK(&card->event_work, card_event_handler);
+
+	st = rsxx_setup_dev(card);
+	if (st)
+		goto failed_create_dev;
+
+	rsxx_get_card_state(card, &card->state);
+
+	dev_info(CARD_TO_DEV(card),
+		"card state: %s\n",
+		rsxx_card_state_to_str(card->state));
+
+	/*
+	 * Now that the DMA Engine and devices have been setup,
+	 * we can enable the event interrupt(it kicks off actions in
+	 * those layers so we couldn't enable it right away.)
+	 */
+	spin_lock_irq(&card->irq_lock);
+	rsxx_enable_ier_and_isr(card, CR_INTR_EVENT);
+	spin_unlock_irq(&card->irq_lock);
+
+	if (card->state == CARD_STATE_SHUTDOWN) {
+		st = rsxx_issue_card_cmd(card, CARD_CMD_STARTUP);
+		if (st)
+			dev_crit(CARD_TO_DEV(card),
+				"Failed issuing card startup\n");
+		if (sync_start) {
+			sync_timeout = SYNC_START_TIMEOUT;
+
+			dev_info(CARD_TO_DEV(card),
+				 "Waiting for card to startup\n");
+
+			/* Poll once per second until started or timed out. */
+			do {
+				ssleep(1);
+				sync_timeout--;
+
+				rsxx_get_card_state(card, &card->state);
+			} while (sync_timeout &&
+				(card->state == CARD_STATE_STARTING));
+
+			if (card->state == CARD_STATE_STARTING) {
+				dev_warn(CARD_TO_DEV(card),
+					 "Card startup timed out\n");
+				card->size8 = 0;
+			} else {
+				dev_info(CARD_TO_DEV(card),
+					"card state: %s\n",
+					rsxx_card_state_to_str(card->state));
+				st = rsxx_get_card_size8(card, &card->size8);
+				if (st)
+					card->size8 = 0;
+			}
+		}
+	} else if (card->state == CARD_STATE_GOOD ||
+		   card->state == CARD_STATE_RD_ONLY_FAULT) {
+		st = rsxx_get_card_size8(card, &card->size8);
+		if (st)
+			card->size8 = 0;
+	}
+
+	rsxx_attach_dev(card);
+
+	/************* Setup Debugfs *************/
+	rsxx_debugfs_dev_new(card);
+
+	return 0;
+
+failed_create_dev:
+	destroy_workqueue(card->event_wq);
+	card->event_wq = NULL;
+failed_event_handler:
+	rsxx_dma_destroy(card);
+failed_dma_setup:
+failed_compatiblity_check:
+	destroy_workqueue(card->creg_ctrl.creg_wq);
+	card->creg_ctrl.creg_wq = NULL;
+failed_creg_setup:
+	spin_lock_irq(&card->irq_lock);
+	rsxx_disable_ier_and_isr(card, CR_INTR_ALL);
+	spin_unlock_irq(&card->irq_lock);
+	free_irq(dev->irq, card);
+failed_irq:
+	/*
+	 * MSI is enabled before request_irq(), so it has to be torn down on
+	 * the request_irq() failure path as well.
+	 */
+	if (!force_legacy)
+		pci_disable_msi(dev);
+	pci_iounmap(dev, card->regmap);
+failed_iomap:
+	pci_release_regions(dev);
+failed_request_regions:
+failed_dma_mask:
+	pci_disable_device(dev);
+failed_enable:
+	spin_lock(&rsxx_ida_lock);
+	ida_remove(&rsxx_disk_ida, card->disk_id);
+	spin_unlock(&rsxx_ida_lock);
+failed_ida_get:
+	kfree(card);
+
+	return st;
+}
+
+/*
+ * PCI remove callback: tear one card down.
+ *
+ * Detaches the block device, disables the DMA interrupts, shuts the card
+ * down, stops event handling, destroys the block device and the DMA
+ * engine, disables all interrupts, removes the debugfs entries, releases
+ * the IRQ (and MSI), destroys the creg interface and finally releases the
+ * PCI resources and the card structure. Does nothing if no card is
+ * associated with @dev.
+ */
+static void rsxx_pci_remove(struct pci_dev *dev)
+{
+	struct rsxx_cardinfo *card = pci_get_drvdata(dev);
+	unsigned long flags;
+	int st;
+	int i;
+
+	if (!card)
+		return;
+
+	dev_info(CARD_TO_DEV(card),
+		"Removing PCI-Flash SSD.\n");
+
+	rsxx_detach_dev(card);
+
+	for (i = 0; i < card->n_targets; i++) {
+		spin_lock_irqsave(&card->irq_lock, flags);
+		rsxx_disable_ier_and_isr(card, CR_INTR_DMA(i));
+		spin_unlock_irqrestore(&card->irq_lock, flags);
+	}
+
+	/* A failed shutdown is logged; removal carries on regardless. */
+	st = card_shutdown(card);
+	if (st)
+		dev_crit(CARD_TO_DEV(card), "Shutdown failed!\n");
+
+	/* Sync outstanding event handlers. */
+	spin_lock_irqsave(&card->irq_lock, flags);
+	rsxx_disable_ier_and_isr(card, CR_INTR_EVENT);
+	spin_unlock_irqrestore(&card->irq_lock, flags);
+
+	cancel_work_sync(&card->event_work);
+
+	rsxx_destroy_dev(card);
+	rsxx_dma_destroy(card);
+
+	spin_lock_irqsave(&card->irq_lock, flags);
+	rsxx_disable_ier_and_isr(card, CR_INTR_ALL);
+	spin_unlock_irqrestore(&card->irq_lock, flags);
+
+	/* Prevent work_structs from re-queuing themselves. */
+	card->halt = 1;
+
+	debugfs_remove_recursive(card->debugfs_dir);
+
+	free_irq(dev->irq, card);
+
+	if (!force_legacy)
+		pci_disable_msi(dev);
+
+	rsxx_creg_destroy(card);
+
+	pci_iounmap(dev, card->regmap);
+
+	pci_disable_device(dev);
+	pci_release_regions(dev);
+
+	kfree(card);
+}
+
+/* PCI suspend callback: always refuses the request with -ENOSYS. */
+static int rsxx_pci_suspend(struct pci_dev *dev, pm_message_t state)
+{
+	/* We don't support suspend at this time. */
+	return -ENOSYS;
+}
+
+/*
+ * PCI shutdown callback: detach the block device, disable the per-target
+ * DMA interrupts and ask the card to shut down. The result of
+ * card_shutdown() is not checked. Does nothing if no card is associated
+ * with @dev.
+ */
+static void rsxx_pci_shutdown(struct pci_dev *dev)
+{
+	struct rsxx_cardinfo *card = pci_get_drvdata(dev);
+	unsigned long flags;
+	int i;
+
+	if (!card)
+		return;
+
+	dev_info(CARD_TO_DEV(card), "Shutting down PCI-Flash SSD.\n");
+
+	rsxx_detach_dev(card);
+
+	for (i = 0; i < card->n_targets; i++) {
+		spin_lock_irqsave(&card->irq_lock, flags);
+		rsxx_disable_ier_and_isr(card, CR_INTR_DMA(i));
+		spin_unlock_irqrestore(&card->irq_lock, flags);
+	}
+
+	card_shutdown(card);
+}
+
+/* PCI error recovery callbacks, hooked up through rsxx_pci_driver. */
+static const struct pci_error_handlers rsxx_err_handler = {
+	.error_detected = rsxx_error_detected,
+	.slot_reset     = rsxx_slot_reset,
+};
+
+/* PCI vendor/device ids this driver binds to; the zero entry ends the list. */
+static DEFINE_PCI_DEVICE_TABLE(rsxx_pci_ids) = {
+	{PCI_DEVICE(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_FS70_FLASH)},
+	{PCI_DEVICE(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_FS80_FLASH)},
+	{0,},
+};
+
+MODULE_DEVICE_TABLE(pci, rsxx_pci_ids);
+
+/* PCI driver description registered by rsxx_core_init(). */
+static struct pci_driver rsxx_pci_driver = {
+	.name		= DRIVER_NAME,
+	.id_table	= rsxx_pci_ids,
+	.probe		= rsxx_pci_probe,
+	.remove		= rsxx_pci_remove,
+	.suspend	= rsxx_pci_suspend,
+	.shutdown	= rsxx_pci_shutdown,
+	.err_handler    = &rsxx_err_handler,
+};
+
+/*
+ * Module entry point.
+ *
+ * Initialises the dev, dma and creg subsystems in that order and then
+ * registers the PCI driver. If any step fails, including the driver
+ * registration, the subsystems initialised so far are cleaned up again in
+ * reverse order.
+ *
+ * Returns 0 on success or the error code of the step that failed.
+ */
+static int __init rsxx_core_init(void)
+{
+	int st;
+
+	st = rsxx_dev_init();
+	if (st)
+		return st;
+
+	st = rsxx_dma_init();
+	if (st)
+		goto dma_init_failed;
+
+	st = rsxx_creg_init();
+	if (st)
+		goto creg_init_failed;
+
+	st = pci_register_driver(&rsxx_pci_driver);
+	if (st)
+		goto pci_register_failed;
+
+	return 0;
+
+pci_register_failed:
+	rsxx_creg_cleanup();
+creg_init_failed:
+	rsxx_dma_cleanup();
+dma_init_failed:
+	rsxx_dev_cleanup();
+
+	return st;
+}
+
+/*
+ * Module exit point: unregister the PCI driver, then clean up the creg,
+ * dma and dev subsystems (the reverse of the order used in
+ * rsxx_core_init()).
+ */
+static void __exit rsxx_core_cleanup(void)
+{
+	pci_unregister_driver(&rsxx_pci_driver);
+	rsxx_creg_cleanup();
+	rsxx_dma_cleanup();
+	rsxx_dev_cleanup();
+}
+
+module_init(rsxx_core_init);
+module_exit(rsxx_core_cleanup);
diff --git a/drivers/block/rsxx/cregs.c b/drivers/block/rsxx/cregs.c
new file mode 100644
index 00000000000..926dce9c452
--- /dev/null
+++ b/drivers/block/rsxx/cregs.c
@@ -0,0 +1,804 @@
+/*
+* Filename: cregs.c
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+*	Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <linux/completion.h>
+#include <linux/slab.h>
+
+#include "rsxx_priv.h"
+
+#define CREG_TIMEOUT_MSEC	10000
+
+/*
+ * Completion callback for a creg command. It is invoked with the card, the
+ * command itself and a status code (0, or a negative errno such as
+ * -ETIMEDOUT, -ECANCELED or -EIO) just before the command is freed.
+ */
+typedef void (*creg_cmd_cb)(struct rsxx_cardinfo *card,
+			    struct creg_cmd *cmd,
+			    int st);
+
+/* One queued or active creg (card register) command. */
+struct creg_cmd {
+	struct list_head list;	/* link in card->creg_ctrl.queue */
+	creg_cmd_cb cb;		/* optional completion callback */
+	void *cb_private;	/* caller data, read by the callback */
+	unsigned int op;	/* value written to CREG_CMD (read/write op) */
+	unsigned int addr;	/* value written to CREG_ADD */
+	int cnt8;		/* transfer size, written to CREG_CNT */
+	void *buf;		/* data to write, or destination for a read */
+	unsigned int stream;	/* non-zero: big-endian access on LE hosts */
+	unsigned int status;	/* CREG_STAT value captured at completion */
+};
+
+/* Slab cache that struct creg_cmd objects are allocated from and freed to. */
+static struct kmem_cache *creg_cmd_pool;
+
+
+/*------------ Private Functions --------------*/
+
+/*
+ * LITTLE_ENDIAN is 1 on little-endian builds and 0 on big-endian builds, so
+ * it can be tested in ordinary C conditions by the copy helpers below.
+ */
+#if defined(__LITTLE_ENDIAN)
+#define LITTLE_ENDIAN 1
+#elif defined(__BIG_ENDIAN)
+#define LITTLE_ENDIAN 0
+#else
+#error Unknown endianess!!! Aborting...
+#endif
+
+/*
+ * Copy @cnt8 bytes from @buf into the card's CREG_DATA registers, one
+ * 32-bit word at a time. When @stream is set, little-endian hosts use the
+ * big-endian accessor for each word.
+ *
+ * Returns 0 on success or -EIO if the card is in EEH state.
+ */
+static int copy_to_creg_data(struct rsxx_cardinfo *card,
+			      int cnt8,
+			      void *buf,
+			      unsigned int stream)
+{
+	u32 *words = buf;
+	int idx = 0;
+
+	if (unlikely(card->eeh_state))
+		return -EIO;
+
+	while (cnt8 > 0) {
+		/*
+		 * Firmware implementation makes it necessary to byte swap on
+		 * little endian processors.
+		 */
+		if (LITTLE_ENDIAN && stream)
+			iowrite32be(words[idx], card->regmap + CREG_DATA(idx));
+		else
+			iowrite32(words[idx], card->regmap + CREG_DATA(idx));
+
+		idx++;
+		cnt8 -= 4;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Copy @cnt8 bytes from the card's CREG_DATA registers into @buf, one
+ * 32-bit word at a time. When @stream is set, little-endian hosts use the
+ * big-endian accessor for each word.
+ *
+ * Returns 0 on success or -EIO if the card is in EEH state.
+ */
+static int copy_from_creg_data(struct rsxx_cardinfo *card,
+				int cnt8,
+				void *buf,
+				unsigned int stream)
+{
+	u32 *words = buf;
+	int idx = 0;
+
+	if (unlikely(card->eeh_state))
+		return -EIO;
+
+	while (cnt8 > 0) {
+		/*
+		 * Firmware implementation makes it necessary to byte swap on
+		 * little endian processors.
+		 */
+		if (LITTLE_ENDIAN && stream)
+			words[idx] = ioread32be(card->regmap + CREG_DATA(idx));
+		else
+			words[idx] = ioread32(card->regmap + CREG_DATA(idx));
+
+		idx++;
+		cnt8 -= 4;
+	}
+
+	return 0;
+}
+
+/*
+ * Hand one creg command to the hardware: program the address and count
+ * registers, load the data registers for a write that carries a buffer,
+ * and finally write the opcode to CREG_CMD. Nothing is issued while the
+ * card is in EEH state or if loading the data registers fails.
+ */
+static void creg_issue_cmd(struct rsxx_cardinfo *card, struct creg_cmd *cmd)
+{
+	if (unlikely(card->eeh_state))
+		return;
+
+	iowrite32(cmd->addr, card->regmap + CREG_ADD);
+	iowrite32(cmd->cnt8, card->regmap + CREG_CNT);
+
+	if (cmd->op == CREG_OP_WRITE && cmd->buf) {
+		if (copy_to_creg_data(card, cmd->cnt8, cmd->buf, cmd->stream))
+			return;
+	}
+
+	if (unlikely(card->eeh_state))
+		return;
+
+	/* Setting the valid bit will kick off the command. */
+	iowrite32(cmd->op, card->regmap + CREG_CMD);
+}
+
+/*
+ * Start the next queued creg command if none is active.
+ *
+ * Pops the head of card->creg_ctrl.queue, records it as the active command,
+ * arms the command timer and issues the command to hardware. Callers in
+ * this file invoke it with card->creg_ctrl.lock held.
+ */
+static void creg_kick_queue(struct rsxx_cardinfo *card)
+{
+	if (card->creg_ctrl.active || list_empty(&card->creg_ctrl.queue))
+		return;
+
+	card->creg_ctrl.active = 1;
+	card->creg_ctrl.active_cmd = list_first_entry(&card->creg_ctrl.queue,
+						      struct creg_cmd, list);
+	list_del(&card->creg_ctrl.active_cmd->list);
+	card->creg_ctrl.q_depth--;
+
+	/*
+	 * We have to set the timer before we push the new command. Otherwise,
+	 * we could create a race condition that would occur if the timer
+	 * was not canceled, and expired after the new command was pushed,
+	 * but before the command was issued to hardware.
+	 */
+	mod_timer(&card->creg_ctrl.cmd_timer,
+				jiffies + msecs_to_jiffies(CREG_TIMEOUT_MSEC));
+
+	creg_issue_cmd(card, card->creg_ctrl.active_cmd);
+}
+
+/*
+ * Allocate a creg command, append it to the card's command queue and kick
+ * the queue.
+ *
+ * @callback (may be NULL) is invoked with @cb_private reachable through
+ * the command when the command completes, times out or is cancelled.
+ *
+ * Returns 0 if the command was queued, -EINVAL if the card is halted or
+ * @cnt8 exceeds MAX_CREG_DATA8, -EAGAIN while a creg reset is in progress,
+ * or -ENOMEM if the command cannot be allocated.
+ */
+static int creg_queue_cmd(struct rsxx_cardinfo *card,
+			  unsigned int op,
+			  unsigned int addr,
+			  unsigned int cnt8,
+			  void *buf,
+			  int stream,
+			  creg_cmd_cb callback,
+			  void *cb_private)
+{
+	struct creg_cmd *cmd;
+
+	/* Don't queue stuff up if we're halted. */
+	if (unlikely(card->halt))
+		return -EINVAL;
+
+	if (card->creg_ctrl.reset)
+		return -EAGAIN;
+
+	if (cnt8 > MAX_CREG_DATA8)
+		return -EINVAL;
+
+	cmd = kmem_cache_alloc(creg_cmd_pool, GFP_KERNEL);
+	if (!cmd)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&cmd->list);
+
+	cmd->op		= op;
+	cmd->addr	= addr;
+	cmd->cnt8	= cnt8;
+	cmd->buf	= buf;
+	cmd->stream	= stream;
+	cmd->cb		= callback;
+	cmd->cb_private = cb_private;
+	cmd->status	= 0;
+
+	spin_lock_bh(&card->creg_ctrl.lock);
+	list_add_tail(&cmd->list, &card->creg_ctrl.queue);
+	card->creg_ctrl.q_depth++;
+	creg_kick_queue(card);
+	spin_unlock_bh(&card->creg_ctrl.lock);
+
+	return 0;
+}
+
+/*
+ * Timer callback for a creg command that did not complete in time.
+ *
+ * @data carries the card pointer. The active command is claimed under
+ * creg_ctrl.lock, completed with -ETIMEDOUT through its callback and
+ * freed; the queue is then kicked so the next command can run. If no
+ * command is active, the creg_timeout statistic is incremented and a
+ * warning is logged instead.
+ */
+static void creg_cmd_timed_out(unsigned long data)
+{
+	struct rsxx_cardinfo *card = (struct rsxx_cardinfo *) data;
+	struct creg_cmd *cmd;
+
+	spin_lock(&card->creg_ctrl.lock);
+	cmd = card->creg_ctrl.active_cmd;
+	card->creg_ctrl.active_cmd = NULL;
+	spin_unlock(&card->creg_ctrl.lock);
+
+	if (cmd == NULL) {
+		card->creg_ctrl.creg_stats.creg_timeout++;
+		dev_warn(CARD_TO_DEV(card),
+			"No active command associated with timeout!\n");
+		return;
+	}
+
+	if (cmd->cb)
+		cmd->cb(card, cmd, -ETIMEDOUT);
+
+	kmem_cache_free(creg_cmd_pool, cmd);
+
+
+	spin_lock(&card->creg_ctrl.lock);
+	card->creg_ctrl.active = 0;
+	creg_kick_queue(card);
+	spin_unlock(&card->creg_ctrl.lock);
+}
+
+
+/*
+ * Work function for creg_ctrl.done_work, queued by the interrupt handler
+ * when the card signals a creg completion.
+ *
+ * Stops the command timer, claims the active command, reads the creg
+ * status register and, for reads, copies the returned data into the
+ * command's buffer. The command's callback is then invoked with the
+ * resulting status, the command is freed and the queue is kicked.
+ */
+static void creg_cmd_done(struct work_struct *work)
+{
+	struct rsxx_cardinfo *card;
+	struct creg_cmd *cmd;
+	int st = 0;
+
+	card = container_of(work, struct rsxx_cardinfo,
+			    creg_ctrl.done_work);
+
+	/*
+	 * The timer could not be cancelled for some reason,
+	 * race to pop the active command.
+	 */
+	if (del_timer_sync(&card->creg_ctrl.cmd_timer) == 0)
+		card->creg_ctrl.creg_stats.failed_cancel_timer++;
+
+	spin_lock_bh(&card->creg_ctrl.lock);
+	cmd = card->creg_ctrl.active_cmd;
+	card->creg_ctrl.active_cmd = NULL;
+	spin_unlock_bh(&card->creg_ctrl.lock);
+
+	if (cmd == NULL) {
+		dev_err(CARD_TO_DEV(card),
+			"Spurious creg interrupt!\n");
+		return;
+	}
+
+	card->creg_ctrl.creg_stats.stat = ioread32(card->regmap + CREG_STAT);
+	cmd->status = card->creg_ctrl.creg_stats.stat;
+	if ((cmd->status & CREG_STAT_STATUS_MASK) == 0) {
+		dev_err(CARD_TO_DEV(card),
+			"Invalid status on creg command\n");
+		/*
+		 * At this point we're probably reading garbage from HW. Don't
+		 * do anything else that could mess up the system and let
+		 * the sync function return an error.
+		 */
+		st = -EIO;
+		goto creg_done;
+	} else if (cmd->status & CREG_STAT_ERROR) {
+		st = -EIO;
+	}
+
+	if ((cmd->op == CREG_OP_READ)) {
+		/* The count register is compared with the requested size. */
+		unsigned int cnt8 = ioread32(card->regmap + CREG_CNT);
+
+		/* Paranoid Sanity Checks */
+		if (!cmd->buf) {
+			dev_err(CARD_TO_DEV(card),
+				"Buffer not given for read.\n");
+			st = -EIO;
+			goto creg_done;
+		}
+		if (cnt8 != cmd->cnt8) {
+			dev_err(CARD_TO_DEV(card),
+				"count mismatch\n");
+			st = -EIO;
+			goto creg_done;
+		}
+
+		st = copy_from_creg_data(card, cnt8, cmd->buf, cmd->stream);
+	}
+
+creg_done:
+	if (cmd->cb)
+		cmd->cb(card, cmd, st);
+
+	kmem_cache_free(creg_cmd_pool, cmd);
+
+	/* Mark the interface idle and start the next queued command. */
+	spin_lock_bh(&card->creg_ctrl.lock);
+	card->creg_ctrl.active = 0;
+	creg_kick_queue(card);
+	spin_unlock_bh(&card->creg_ctrl.lock);
+}
+
+/*
+ * Reset the creg interface for recovery.
+ *
+ * With the creg and event interrupts disabled and creg_ctrl.reset set (so
+ * that creg_queue_cmd() refuses new commands), every queued command and
+ * the active command are completed with -ECANCELED and freed. The
+ * interrupts are re-enabled afterwards. If a reset is already running,
+ * this call returns immediately.
+ */
+static void creg_reset(struct rsxx_cardinfo *card)
+{
+	struct creg_cmd *cmd = NULL;
+	struct creg_cmd *tmp;
+	unsigned long flags;
+
+	/*
+	 * mutex_trylock is used here because if reset_lock is taken then a
+	 * reset is already happening. So, we can just go ahead and return.
+	 */
+	if (!mutex_trylock(&card->creg_ctrl.reset_lock))
+		return;
+
+	card->creg_ctrl.reset = 1;
+	spin_lock_irqsave(&card->irq_lock, flags);
+	rsxx_disable_ier_and_isr(card, CR_INTR_CREG | CR_INTR_EVENT);
+	spin_unlock_irqrestore(&card->irq_lock, flags);
+
+	dev_warn(CARD_TO_DEV(card),
+		"Resetting creg interface for recovery\n");
+
+	/* Cancel outstanding commands */
+	spin_lock_bh(&card->creg_ctrl.lock);
+	list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) {
+		list_del(&cmd->list);
+		card->creg_ctrl.q_depth--;
+		if (cmd->cb)
+			cmd->cb(card, cmd, -ECANCELED);
+		kmem_cache_free(creg_cmd_pool, cmd);
+	}
+
+	/* Cancel the command that is currently active, if any. */
+	cmd = card->creg_ctrl.active_cmd;
+	card->creg_ctrl.active_cmd = NULL;
+	if (cmd) {
+		if (timer_pending(&card->creg_ctrl.cmd_timer))
+			del_timer_sync(&card->creg_ctrl.cmd_timer);
+
+		if (cmd->cb)
+			cmd->cb(card, cmd, -ECANCELED);
+		kmem_cache_free(creg_cmd_pool, cmd);
+
+		card->creg_ctrl.active = 0;
+	}
+	spin_unlock_bh(&card->creg_ctrl.lock);
+
+	card->creg_ctrl.reset = 0;
+	spin_lock_irqsave(&card->irq_lock, flags);
+	rsxx_enable_ier_and_isr(card, CR_INTR_CREG | CR_INTR_EVENT);
+	spin_unlock_irqrestore(&card->irq_lock, flags);
+
+	mutex_unlock(&card->creg_ctrl.reset_lock);
+}
+
+/*
+ * Used for synchronous accesses: carries the result of a creg command from
+ * creg_cmd_done_cb() back to the waiter in __issue_creg_rw().
+ */
+struct creg_completion {
+	struct completion	*cmd_done;	/* signalled on completion */
+	int			st;		/* status passed to the callback */
+	u32			creg_status;	/* copy of the command's status */
+};
+
+/*
+ * Completion callback used for synchronous creg accesses: stores the
+ * status code and the command's hardware status in the creg_completion
+ * attached to the command, then wakes the waiter.
+ */
+static void creg_cmd_done_cb(struct rsxx_cardinfo *card,
+			     struct creg_cmd *cmd,
+			     int st)
+{
+	struct creg_completion *waiter = cmd->cb_private;
+
+	BUG_ON(!waiter);
+
+	waiter->st = st;
+	waiter->creg_status = cmd->status;
+	complete(waiter->cmd_done);
+}
+
+/*
+ * Issue a single creg command and wait for it to finish.
+ *
+ * Queues the command with creg_cmd_done_cb() as its callback and sleeps
+ * until the callback fires or the wait times out. On completion the
+ * hardware status is stored in *@hw_stat.
+ *
+ * Returns 0 on success, the error from creg_queue_cmd() if the command
+ * could not be queued, -EIO if the wait itself timed out (the creg
+ * interface is reset in that case), or the status reported through the
+ * callback.
+ */
+static int __issue_creg_rw(struct rsxx_cardinfo *card,
+			   unsigned int op,
+			   unsigned int addr,
+			   unsigned int cnt8,
+			   void *buf,
+			   int stream,
+			   unsigned int *hw_stat)
+{
+	DECLARE_COMPLETION_ONSTACK(cmd_done);
+	struct creg_completion completion;
+	unsigned long timeout;
+	int st;
+
+	completion.cmd_done = &cmd_done;
+	completion.st = 0;
+	completion.creg_status = 0;
+
+	st = creg_queue_cmd(card, op, addr, cnt8, buf, stream, creg_cmd_done_cb,
+			    &completion);
+	if (st)
+		return st;
+
+	/*
+	 * This timeout is necessary for unresponsive hardware. The additional
+	 * 20 seconds is used to guarantee that each creg request has time to
+	 * complete.
+	 */
+	timeout = msecs_to_jiffies(CREG_TIMEOUT_MSEC *
+				   card->creg_ctrl.q_depth + 20000);
+
+	/*
+	 * The creg interface is guaranteed to complete. It has a timeout
+	 * mechanism that will kick in if hardware does not respond.
+	 */
+	st = wait_for_completion_timeout(completion.cmd_done, timeout);
+	if (st == 0) {
+		/*
+		 * This is really bad, because the kernel timer did not
+		 * expire and notify us of a timeout!
+		 */
+		dev_crit(CARD_TO_DEV(card),
+			"cregs timer failed\n");
+		creg_reset(card);
+		return -EIO;
+	}
+
+	*hw_stat = completion.creg_status;
+
+	if (completion.st) {
+		/*
+		* This read is needed to verify that there has not been any
+		* extreme errors that might have occurred, i.e. EEH. The
+		* function iowrite32 will not detect EEH errors, so it is
+		* necessary that we recover if such an error is the reason
+		* for the timeout. This is a dummy read.
+		*/
+		ioread32(card->regmap + SCRATCH);
+
+		dev_warn(CARD_TO_DEV(card),
+			"creg command failed(%d x%08x)\n",
+			completion.st, addr);
+		return completion.st;
+	}
+
+	return 0;
+}
+
+static int issue_creg_rw(struct rsxx_cardinfo *card,
+			 u32 addr,
+			 unsigned int size8,
+			 void *data,
+			 int stream,
+			 int read)
+{
+	unsigned int op = read ? CREG_OP_READ : CREG_OP_WRITE;
+	char *cursor = data;
+	unsigned int status;
+	unsigned int chunk;
+	int rc;
+
+	/* At least one command is issued, even when size8 is zero. */
+	for (;;) {
+		/* Each command carries at most MAX_CREG_DATA8 bytes. */
+		chunk = min_t(unsigned int, size8, MAX_CREG_DATA8);
+		rc = __issue_creg_rw(card, op, addr, chunk,
+				     cursor, stream, &status);
+		if (rc)
+			return rc;
+
+		cursor += chunk;
+		addr   += chunk;
+		size8  -= chunk;
+		if (!size8)
+			return 0;
+	}
+}
+
+/* ---------------------------- Public API ---------------------------------- */
+int rsxx_creg_write(struct rsxx_cardinfo *card,
+			u32 addr,
+			unsigned int size8,
+			void *data,
+			int byte_stream)
+{
+	return issue_creg_rw(card, addr, size8, data, byte_stream, 0);
+}
+
+int rsxx_creg_read(struct rsxx_cardinfo *card,
+		       u32 addr,
+		       unsigned int size8,
+		       void *data,
+		       int byte_stream)
+{
+	return issue_creg_rw(card, addr, size8, data, byte_stream, 1);
+}
+
+int rsxx_get_card_state(struct rsxx_cardinfo *card, unsigned int *state)
+{
+	return rsxx_creg_read(card, CREG_ADD_CARD_STATE,
+				  sizeof(*state), state, 0);
+}
+
+int rsxx_get_card_size8(struct rsxx_cardinfo *card, u64 *size8)
+{
+	unsigned int reg_val;
+	int rc = rsxx_creg_read(card, CREG_ADD_CARD_SIZE,
+				sizeof(reg_val), &reg_val, 0);
+
+	/* *size8 is only written when the register read succeeded. */
+	if (!rc) {
+		/* Scale the register value by the hardware block size. */
+		*size8 = (u64)reg_val * RSXX_HW_BLK_SIZE;
+	}
+	return rc;
+}
+
+int rsxx_get_num_targets(struct rsxx_cardinfo *card,
+			     unsigned int *n_targets)
+{
+	return rsxx_creg_read(card, CREG_ADD_NUM_TARGETS,
+				  sizeof(*n_targets), n_targets, 0);
+}
+
+int rsxx_get_card_capabilities(struct rsxx_cardinfo *card,
+				   u32 *capabilities)
+{
+	return rsxx_creg_read(card, CREG_ADD_CAPABILITIES,
+				  sizeof(*capabilities), capabilities, 0);
+}
+
+int rsxx_issue_card_cmd(struct rsxx_cardinfo *card, u32 cmd)
+{
+	return rsxx_creg_write(card, CREG_ADD_CARD_CMD,
+				   sizeof(cmd), &cmd, 0);
+}
+
+
+/*----------------- HW Log Functions -------------------*/
+static void hw_log_msg(struct rsxx_cardinfo *card, const char *str, int len)
+{
+	/* Level of the most recent prefixed message; one copy for all cards. */
+	static char level;
+
+	/*
+	 * A message that opens with "<#>" names its own log level (#).
+	 * Text without that prefix is the tail of a message that did not
+	 * fit in the log buffer, so it reuses the level seen last.
+	 */
+	if (len > 3 && str[0] == '<' && str[2] == '>') {
+		level = str[1];
+		str += 3;	/* step over "<#>" */
+		len -= 3;
+	}
+
+	switch (level) {
+	case '0':
+		dev_emerg(CARD_TO_DEV(card), "HW: %.*s", len, str);
+		break;
+	case '1':
+		dev_alert(CARD_TO_DEV(card), "HW: %.*s", len, str);
+		break;
+	case '2':
+		dev_crit(CARD_TO_DEV(card), "HW: %.*s", len, str);
+		break;
+	case '3':
+		dev_err(CARD_TO_DEV(card), "HW: %.*s", len, str);
+		break;
+	case '4':
+		dev_warn(CARD_TO_DEV(card), "HW: %.*s", len, str);
+		break;
+	case '5':
+		dev_notice(CARD_TO_DEV(card), "HW: %.*s", len, str);
+		break;
+	case '7':
+		dev_dbg(CARD_TO_DEV(card), "HW: %.*s", len, str);
+		break;
+	case '6':
+	default:
+		dev_info(CARD_TO_DEV(card), "HW: %.*s", len, str);
+		break;
+	}
+}
+
+/*
+ * Copy src into dest one byte at a time, stopping once the terminating
+ * '\0' has been copied or once count bytes have been written, whichever
+ * happens first. Returns the number of bytes stored in dest ('\0' included).
+ */
+static int substrncpy(char *dest, const char *src, int count)
+{
+	int copied = 0;
+
+	while (copied != count) {
+		char c = src[copied];
+
+		dest[copied++] = c;
+		if (c == '\0')
+			break;
+	}
+
+	return copied;
+}
+
+/* Completion callback for the log read queued by rsxx_read_hw_log(). */
+static void read_hw_log_done(struct rsxx_cardinfo *card,
+			     struct creg_cmd *cmd,
+			     int st)
+{
+	char *buf;	/* the command's data buffer */
+	char *log_str;	/* where this pass appends inside card->log.buf */
+	int cnt;	/* most bytes this pass may copy */
+	int len;	/* bytes this pass actually copied */
+	int off;	/* bytes of buf consumed so far */
+
+	buf = cmd->buf;
+	off = 0;
+
+	/* Failed getting the log message */
+	if (st)
+		return;
+
+	while (off < cmd->cnt8) {
+		log_str = &card->log.buf[card->log.buf_len];
+		cnt = min(cmd->cnt8 - off, LOG_BUF_SIZE8 - card->log.buf_len);
+		len = substrncpy(log_str, &buf[off], cnt);
+
+		off += len;
+		card->log.buf_len += len;
+
+		/*
+		 * Flush the log if we've hit the end of a message or if we've
+		 * run out of buffer space.
+		 */
+		if ((log_str[len - 1] == '\0')  ||
+		    (card->log.buf_len == LOG_BUF_SIZE8)) {
+			if (card->log.buf_len != 1) /* Don't log blank lines. */
+				hw_log_msg(card, card->log.buf,
+					   card->log.buf_len);
+			card->log.buf_len = 0;
+		}
+	}
+
+	/* Status says a log is still pending: queue another read. */
+	if (cmd->status & CREG_STAT_LOG_PENDING)
+		rsxx_read_hw_log(card);
+}
+
+int rsxx_read_hw_log(struct rsxx_cardinfo *card)
+{
+	int st;
+
+	st = creg_queue_cmd(card, CREG_OP_READ, CREG_ADD_LOG,
+			    sizeof(card->log.tmp), card->log.tmp,
+			    1, read_hw_log_done, NULL);
+	if (st)
+		dev_err(CARD_TO_DEV(card),
+			"Failed getting log text\n");
+
+	return st;
+}
+
+/*-------------- IOCTL REG Access ------------------*/
+static int issue_reg_cmd(struct rsxx_cardinfo *card,
+			 struct rsxx_reg_access *cmd,
+			 int read)
+{
+	unsigned int op = read ? CREG_OP_READ : CREG_OP_WRITE;
+
+	return __issue_creg_rw(card, op, cmd->addr, cmd->cnt, cmd->data,
+			       cmd->stream, &cmd->stat);
+}
+
+int rsxx_reg_access(struct rsxx_cardinfo *card,
+			struct rsxx_reg_access __user *ucmd,
+			int read)
+{
+	struct rsxx_reg_access cmd;
+	int st;
+
+	st = copy_from_user(&cmd, ucmd, sizeof(cmd));
+	if (st)
+		return -EFAULT;
+
+	if (cmd.cnt > RSXX_MAX_REG_CNT)
+		return -EFAULT;
+
+	st = issue_reg_cmd(card, &cmd, read);
+	if (st)
+		return st;
+
+	st = put_user(cmd.stat, &ucmd->stat);
+	if (st)
+		return -EFAULT;
+
+	if (read) {
+		st = copy_to_user(ucmd->data, cmd.data, cmd.cnt);
+		if (st)
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+void rsxx_eeh_save_issued_creg(struct rsxx_cardinfo *card)
+{
+	struct creg_cmd *cmd = NULL;
+
+	cmd = card->creg_ctrl.active_cmd;
+	card->creg_ctrl.active_cmd = NULL;
+
+	if (cmd) {
+		del_timer_sync(&card->creg_ctrl.cmd_timer);
+
+		spin_lock_bh(&card->creg_ctrl.lock);
+		list_add(&cmd->list, &card->creg_ctrl.queue);
+		card->creg_ctrl.q_depth++;
+		card->creg_ctrl.active = 0;
+		spin_unlock_bh(&card->creg_ctrl.lock);
+	}
+}
+
+void rsxx_kick_creg_queue(struct rsxx_cardinfo *card)
+{
+	spin_lock_bh(&card->creg_ctrl.lock);
+	if (!list_empty(&card->creg_ctrl.queue))
+		creg_kick_queue(card);
+	spin_unlock_bh(&card->creg_ctrl.lock);
+}
+
+/*------------ Initialization & Setup --------------*/
+int rsxx_creg_setup(struct rsxx_cardinfo *card)
+{
+	card->creg_ctrl.active_cmd = NULL;
+
+	card->creg_ctrl.creg_wq =
+			create_singlethread_workqueue(DRIVER_NAME"_creg");
+	if (!card->creg_ctrl.creg_wq)
+		return -ENOMEM;
+
+	INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done);
+	mutex_init(&card->creg_ctrl.reset_lock);
+	INIT_LIST_HEAD(&card->creg_ctrl.queue);
+	spin_lock_init(&card->creg_ctrl.lock);
+	setup_timer(&card->creg_ctrl.cmd_timer, creg_cmd_timed_out,
+		    (unsigned long) card);
+
+	return 0;
+}
+
+void rsxx_creg_destroy(struct rsxx_cardinfo *card)
+{
+	struct creg_cmd *cmd;
+	struct creg_cmd *tmp;
+	int cnt = 0;	/* number of queued commands cancelled */
+
+	/* Cancel outstanding commands: callback gets -ECANCELED, then free. */
+	spin_lock_bh(&card->creg_ctrl.lock);
+	list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) {
+		list_del(&cmd->list);
+		if (cmd->cb)
+			cmd->cb(card, cmd, -ECANCELED);
+		kmem_cache_free(creg_cmd_pool, cmd);
+		cnt++;
+	}
+
+	if (cnt)
+		dev_info(CARD_TO_DEV(card),
+			"Canceled %d queue creg commands\n", cnt);
+	/* Detach the active command, if any, and cancel it the same way. */
+	cmd = card->creg_ctrl.active_cmd;
+	card->creg_ctrl.active_cmd = NULL;
+	if (cmd) {
+		/* NOTE(review): verify timer handler never needs this lock */
+		if (timer_pending(&card->creg_ctrl.cmd_timer))
+			del_timer_sync(&card->creg_ctrl.cmd_timer);
+		if (cmd->cb)
+			cmd->cb(card, cmd, -ECANCELED);
+		dev_info(CARD_TO_DEV(card),
+			"Canceled active creg command\n");
+		kmem_cache_free(creg_cmd_pool, cmd);
+	}
+	spin_unlock_bh(&card->creg_ctrl.lock);
+
+	cancel_work_sync(&card->creg_ctrl.done_work);
+}
+
+
+int rsxx_creg_init(void)
+{
+	creg_cmd_pool = KMEM_CACHE(creg_cmd, SLAB_HWCACHE_ALIGN);
+	if (!creg_cmd_pool)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void rsxx_creg_cleanup(void)
+{
+	kmem_cache_destroy(creg_cmd_pool);
+}
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
new file mode 100644
index 00000000000..2839d37e5af
--- /dev/null
+++ b/drivers/block/rsxx/dev.c
@@ -0,0 +1,360 @@
+/*
+* Filename: dev.c
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+*	Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+
+#include <linux/hdreg.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+
+#include <linux/fs.h>
+
+#include "rsxx_priv.h"
+
+static unsigned int blkdev_minors = 64;
+module_param(blkdev_minors, uint, 0444);
+MODULE_PARM_DESC(blkdev_minors, "Number of minors(partitions)");
+
+/*
+ * For now I'm making this tweakable in case any applications hit this limit.
+ * If you see a "bio too big" error in the log you will need to raise this
+ * value.
+ */
+static unsigned int blkdev_max_hw_sectors = 1024;
+module_param(blkdev_max_hw_sectors, uint, 0444);
+MODULE_PARM_DESC(blkdev_max_hw_sectors, "Max hw sectors for a single BIO");
+
+static unsigned int enable_blkdev = 1;
+module_param(enable_blkdev , uint, 0444);
+MODULE_PARM_DESC(enable_blkdev, "Enable block device interfaces");
+
+
+struct rsxx_bio_meta {
+	struct bio	*bio;
+	atomic_t	pending_dmas;
+	atomic_t	error;
+	unsigned long	start_time;
+};
+
+static struct kmem_cache *bio_meta_pool;
+
+/*----------------- Block Device Operations -----------------*/
+static int rsxx_blkdev_ioctl(struct block_device *bdev,
+				 fmode_t mode,
+				 unsigned int cmd,
+				 unsigned long arg)
+{
+	struct rsxx_cardinfo *card = bdev->bd_disk->private_data;
+
+	switch (cmd) {
+	case RSXX_GETREG:
+		return rsxx_reg_access(card, (void __user *)arg, 1);
+	case RSXX_SETREG:
+		return rsxx_reg_access(card, (void __user *)arg, 0);
+	}
+
+	return -ENOTTY;
+}
+
+static int rsxx_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct rsxx_cardinfo *card = bdev->bd_disk->private_data;
+	u64 blocks = card->size8 >> 9;
+
+	/*
+	 * get geometry: Fake it. I haven't found any drivers that set
+	 * geo->start, so we won't either.
+	 */
+	if (card->size8) {
+		geo->heads = 64;
+		geo->sectors = 16;
+		do_div(blocks, (geo->heads * geo->sectors));
+		geo->cylinders = blocks;
+	} else {
+		geo->heads = 0;
+		geo->sectors = 0;
+		geo->cylinders = 0;
+	}
+	return 0;
+}
+
+static const struct block_device_operations rsxx_fops = {
+	.owner		= THIS_MODULE,
+	.getgeo		= rsxx_getgeo,
+	.ioctl		= rsxx_blkdev_ioctl,
+};
+
+static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio)
+{
+	struct hd_struct *part0 = &card->gendisk->part0;
+	int rw = bio_data_dir(bio);
+	int cpu;
+
+	cpu = part_stat_lock();
+
+	part_round_stats(cpu, part0);
+	part_inc_in_flight(part0, rw);
+
+	part_stat_unlock();
+}
+
+static void disk_stats_complete(struct rsxx_cardinfo *card,
+				struct bio *bio,
+				unsigned long start_time)
+{
+	struct hd_struct *part0 = &card->gendisk->part0;
+	unsigned long duration = jiffies - start_time;
+	int rw = bio_data_dir(bio);
+	int cpu;
+
+	cpu = part_stat_lock();
+
+	part_stat_add(cpu, part0, sectors[rw], bio_sectors(bio));
+	part_stat_inc(cpu, part0, ios[rw]);
+	part_stat_add(cpu, part0, ticks[rw], duration);
+
+	part_round_stats(cpu, part0);
+	part_dec_in_flight(part0, rw);
+
+	part_stat_unlock();
+}
+
+static void bio_dma_done_cb(struct rsxx_cardinfo *card,
+			    void *cb_data,
+			    unsigned int error)
+{
+	struct rsxx_bio_meta *meta = cb_data;
+
+	if (error)
+		atomic_set(&meta->error, 1);
+
+	if (atomic_dec_and_test(&meta->pending_dmas)) {
+		if (!card->eeh_state && card->gendisk)
+			disk_stats_complete(card, meta->bio, meta->start_time);
+
+		bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0);
+		kmem_cache_free(bio_meta_pool, meta);
+	}
+}
+
+/* Entry point for bios: queue @bio as DMAs or complete it with an error. */
+static void rsxx_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct rsxx_cardinfo *card = q->queuedata;
+	struct rsxx_bio_meta *bio_meta;
+	int st = -EINVAL;	/* error for the checks that set no other code */
+
+	might_sleep();
+	if (!card)
+		goto req_err;
+
+	if (bio_end_sector(bio) > get_capacity(card->gendisk))
+		goto req_err;
+
+	if (unlikely(card->halt)) {
+		st = -EFAULT;
+		goto req_err;
+	}
+
+	if (unlikely(card->dma_fault)) {
+		st = (-EFAULT);
+		goto req_err;
+	}
+
+	if (bio->bi_iter.bi_size == 0) {
+		dev_err(CARD_TO_DEV(card), "size zero BIO!\n");
+		goto req_err;
+	}
+
+	bio_meta = kmem_cache_alloc(bio_meta_pool, GFP_KERNEL);
+	if (!bio_meta) {
+		st = -ENOMEM;
+		goto req_err;
+	}
+
+	bio_meta->bio = bio;
+	atomic_set(&bio_meta->error, 0);
+	atomic_set(&bio_meta->pending_dmas, 0);
+	bio_meta->start_time = jiffies;	/* read back by bio_dma_done_cb() */
+
+	if (!unlikely(card->halt))
+		disk_stats_start(card, bio);
+
+	dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n",
+		 bio_data_dir(bio) ? 'W' : 'R', bio_meta,
+		 (u64)bio->bi_iter.bi_sector << 9, bio->bi_iter.bi_size);
+
+	st = rsxx_dma_queue_bio(card, bio, &bio_meta->pending_dmas,
+				    bio_dma_done_cb, bio_meta);
+	if (st)
+		goto queue_err;
+
+	return;
+	/* NOTE(review): nothing here undoes disk_stats_start() - verify. */
+queue_err:
+	kmem_cache_free(bio_meta_pool, bio_meta);
+req_err:
+	bio_endio(bio, st);
+}
+
+/*----------------- Device Setup -------------------*/
+static bool rsxx_discard_supported(struct rsxx_cardinfo *card)
+{
+	unsigned char pci_rev;
+
+	pci_read_config_byte(card->dev, PCI_REVISION_ID, &pci_rev);
+
+	return (pci_rev >= RSXX_DISCARD_SUPPORT);
+}
+
+int rsxx_attach_dev(struct rsxx_cardinfo *card)
+{
+	mutex_lock(&card->dev_lock);
+
+	/* The block device requires the stripe size from the config. */
+	if (enable_blkdev) {
+		if (card->config_valid)
+			set_capacity(card->gendisk, card->size8 >> 9);
+		else
+			set_capacity(card->gendisk, 0);
+		add_disk(card->gendisk);
+
+		card->bdev_attached = 1;
+	}
+
+	mutex_unlock(&card->dev_lock);
+
+	return 0;
+}
+
+void rsxx_detach_dev(struct rsxx_cardinfo *card)
+{
+	mutex_lock(&card->dev_lock);
+
+	if (card->bdev_attached) {
+		del_gendisk(card->gendisk);
+		card->bdev_attached = 0;
+	}
+
+	mutex_unlock(&card->dev_lock);
+}
+
+/* Allocate and set up @card's queue and gendisk; returns 0 or -errno. */
+int rsxx_setup_dev(struct rsxx_cardinfo *card)
+{
+	unsigned short blk_size;
+
+	mutex_init(&card->dev_lock);
+	if (!enable_blkdev)
+		return 0;
+
+	card->major = register_blkdev(0, DRIVER_NAME);
+	if (card->major < 0) {
+		dev_err(CARD_TO_DEV(card), "Failed to get major number\n");
+		return card->major;	/* real cause, not always -ENOMEM */
+	}
+
+	card->queue = blk_alloc_queue(GFP_KERNEL);
+	if (!card->queue) {
+		dev_err(CARD_TO_DEV(card), "Failed queue alloc\n");
+		unregister_blkdev(card->major, DRIVER_NAME);
+		return -ENOMEM;
+	}
+
+	card->gendisk = alloc_disk(blkdev_minors);
+	if (!card->gendisk) {
+		dev_err(CARD_TO_DEV(card), "Failed disk alloc\n");
+		blk_cleanup_queue(card->queue);
+		unregister_blkdev(card->major, DRIVER_NAME);
+		return -ENOMEM;
+	}
+
+	if (card->config_valid) {
+		blk_size = card->config.data.block_size;
+		blk_queue_dma_alignment(card->queue, blk_size - 1);
+		blk_queue_logical_block_size(card->queue, blk_size);
+	}
+
+	blk_queue_make_request(card->queue, rsxx_make_request);
+	blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY);
+	blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors);
+	blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE);
+
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, card->queue);
+	if (rsxx_discard_supported(card)) {
+		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, card->queue);
+		blk_queue_max_discard_sectors(card->queue,
+						RSXX_HW_BLK_SIZE >> 9);
+		card->queue->limits.discard_granularity = RSXX_HW_BLK_SIZE;
+		card->queue->limits.discard_alignment   = RSXX_HW_BLK_SIZE;
+		card->queue->limits.discard_zeroes_data = 1;
+	}
+
+	card->queue->queuedata = card;
+
+	snprintf(card->gendisk->disk_name, sizeof(card->gendisk->disk_name),
+		 "rsxx%d", card->disk_id);
+	card->gendisk->driverfs_dev = &card->dev->dev;
+	card->gendisk->major = card->major;
+	card->gendisk->first_minor = 0;
+	card->gendisk->fops = &rsxx_fops;
+	card->gendisk->private_data = card;
+	card->gendisk->queue = card->queue;
+
+	return 0;
+}
+
+/* Undo rsxx_setup_dev(): drop the gendisk, the queue and the major number. */
+void rsxx_destroy_dev(struct rsxx_cardinfo *card)
+{
+	if (!enable_blkdev)
+		return;
+
+	put_disk(card->gendisk);
+	card->gendisk = NULL;
+	/* Clear queuedata first: the cleanup may drop the last queue ref. */
+	card->queue->queuedata = NULL;
+	blk_cleanup_queue(card->queue);
+	unregister_blkdev(card->major, DRIVER_NAME);
+}
+
+int rsxx_dev_init(void)
+{
+	bio_meta_pool = KMEM_CACHE(rsxx_bio_meta, SLAB_HWCACHE_ALIGN);
+	if (!bio_meta_pool)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void rsxx_dev_cleanup(void)
+{
+	kmem_cache_destroy(bio_meta_pool);
+}
+
+
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
new file mode 100644
index 00000000000..cf8cd293abb
--- /dev/null
+++ b/drivers/block/rsxx/dma.c
@@ -0,0 +1,1104 @@
+/*
+* Filename: dma.c
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+*	Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <linux/slab.h>
+#include "rsxx_priv.h"
+
+struct rsxx_dma {
+	struct list_head	 list;
+	u8			 cmd;
+	unsigned int		 laddr;     /* Logical address */
+	struct {
+		u32		 off;
+		u32		 cnt;
+	} sub_page;
+	dma_addr_t		 dma_addr;
+	struct page		 *page;
+	unsigned int		 pg_off;    /* Page Offset */
+	rsxx_dma_cb		 cb;
+	void			 *cb_data;
+};
+
+/* This timeout is used to detect a stalled DMA channel */
+#define DMA_ACTIVITY_TIMEOUT	msecs_to_jiffies(10000)
+
+struct hw_status {
+	u8	status;
+	u8	tag;
+	__le16	count;
+	__le32	_rsvd2;
+	__le64	_rsvd3;
+} __packed;
+
+enum rsxx_dma_status {
+	DMA_SW_ERR    = 0x1,
+	DMA_HW_FAULT  = 0x2,
+	DMA_CANCELLED = 0x4,
+};
+
+struct hw_cmd {
+	u8	command;
+	u8	tag;
+	u8	_rsvd;
+	u8	sub_page; /* Bit[0:2]: 512byte offset */
+			  /* Bit[4:6]: 512byte count */
+	__le32	device_addr;
+	__le64	host_addr;
+} __packed;
+
+enum rsxx_hw_cmd {
+	HW_CMD_BLK_DISCARD	= 0x70,
+	HW_CMD_BLK_WRITE	= 0x80,
+	HW_CMD_BLK_READ		= 0xC0,
+	HW_CMD_BLK_RECON_READ	= 0xE0,
+};
+
+enum rsxx_hw_status {
+	HW_STATUS_CRC		= 0x01,
+	HW_STATUS_HARD_ERR	= 0x02,
+	HW_STATUS_SOFT_ERR	= 0x04,
+	HW_STATUS_FAULT		= 0x08,
+};
+
+static struct kmem_cache *rsxx_dma_pool;
+
+struct dma_tracker {
+	int			next_tag;
+	struct rsxx_dma	*dma;
+};
+
+#define DMA_TRACKER_LIST_SIZE8 (sizeof(struct dma_tracker_list) + \
+		(sizeof(struct dma_tracker) * RSXX_MAX_OUTSTANDING_CMDS))
+
+struct dma_tracker_list {
+	spinlock_t		lock;
+	int			head;
+	struct dma_tracker	list[0];
+};
+
+
+/*----------------- Misc Utility Functions -------------------*/
+static unsigned int rsxx_addr8_to_laddr(u64 addr8, struct rsxx_cardinfo *card)
+{
+	unsigned long long tgt_addr8;
+
+	tgt_addr8 = ((addr8 >> card->_stripe.upper_shift) &
+		      card->_stripe.upper_mask) |
+		    ((addr8) & card->_stripe.lower_mask);
+	do_div(tgt_addr8, RSXX_HW_BLK_SIZE);
+	return tgt_addr8;
+}
+
+static unsigned int rsxx_get_dma_tgt(struct rsxx_cardinfo *card, u64 addr8)
+{
+	unsigned int tgt;
+
+	tgt = (addr8 >> card->_stripe.target_shift) & card->_stripe.target_mask;
+
+	return tgt;
+}
+
+void rsxx_dma_queue_reset(struct rsxx_cardinfo *card)
+{
+	/* Reset all DMA Command/Status Queues */
+	iowrite32(DMA_QUEUE_RESET, card->regmap + RESET);
+}
+
+static unsigned int get_dma_size(struct rsxx_dma *dma)
+{
+	unsigned int cnt = dma->sub_page.cnt;
+
+	/* No sub-page count: the transfer is one whole hardware block. */
+	return cnt ? cnt << 9 : RSXX_HW_BLK_SIZE;
+}
+
+
+/*----------------- DMA Tracker -------------------*/
+static void set_tracker_dma(struct dma_tracker_list *trackers,
+			    int tag,
+			    struct rsxx_dma *dma)
+{
+	trackers->list[tag].dma = dma;
+}
+
+static struct rsxx_dma *get_tracker_dma(struct dma_tracker_list *trackers,
+					    int tag)
+{
+	return trackers->list[tag].dma;
+}
+
+/* Take the first free tag off the tracker free list; -1 if none is left. */
+static int pop_tracker(struct dma_tracker_list *trackers)
+{
+	int tag;
+
+	spin_lock(&trackers->lock);
+	tag = trackers->head;
+	if (tag != -1) {
+		trackers->head = trackers->list[tag].next_tag;
+		trackers->list[tag].next_tag = -1;	/* tag is off the list now */
+	}
+	spin_unlock(&trackers->lock);
+	return tag;
+}
+
+static void push_tracker(struct dma_tracker_list *trackers, int tag)
+{
+	spin_lock(&trackers->lock);
+	trackers->list[tag].next_tag = trackers->head;	/* link in front */
+	trackers->head = tag;	/* tag is the next one pop_tracker() returns */
+	trackers->list[tag].dma = NULL;	/* forget the DMA that used the tag */
+	spin_unlock(&trackers->lock);
+}
+
+
+/*----------------- Interrupt Coalescing -------------*/
+/*
+ * Interrupt Coalescing Register Format:
+ * Interrupt Timer (64ns units) [15:0]
+ * Interrupt Count [24:16]
+ * Reserved [31:25]
+ */
+#define INTR_COAL_LATENCY_MASK       (0x0000ffff)
+
+#define INTR_COAL_COUNT_SHIFT        16
+#define INTR_COAL_COUNT_BITS         9
+#define INTR_COAL_COUNT_MASK         (((1 << INTR_COAL_COUNT_BITS) - 1) << \
+					INTR_COAL_COUNT_SHIFT)
+#define INTR_COAL_LATENCY_UNITS_NS   64
+
+
+static u32 dma_intr_coal_val(u32 mode, u32 count, u32 latency)
+{
+	u32 latency_units = latency / INTR_COAL_LATENCY_UNITS_NS;
+
+	if (mode == RSXX_INTR_COAL_DISABLED)
+		return 0;
+
+	return ((count << INTR_COAL_COUNT_SHIFT) & INTR_COAL_COUNT_MASK) |
+			(latency_units & INTR_COAL_LATENCY_MASK);
+
+}
+
+static void dma_intr_coal_auto_tune(struct rsxx_cardinfo *card)
+{
+	int i;
+	u32 q_depth = 0;
+	u32 intr_coal;
+
+	if (card->config.data.intr_coal.mode != RSXX_INTR_COAL_AUTO_TUNE ||
+	    unlikely(card->eeh_state))
+		return;
+
+	for (i = 0; i < card->n_targets; i++)
+		q_depth += atomic_read(&card->ctrl[i].stats.hw_q_depth);
+
+	intr_coal = dma_intr_coal_val(card->config.data.intr_coal.mode,
+				      q_depth / 2,
+				      card->config.data.intr_coal.latency);
+	iowrite32(intr_coal, card->regmap + INTR_COAL);
+}
+
+/*----------------- RSXX DMA Handling -------------------*/
+static void rsxx_free_dma(struct rsxx_dma_ctrl *ctrl, struct rsxx_dma *dma)
+{
+	if (dma->cmd != HW_CMD_BLK_DISCARD) {
+		if (!pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) {
+			pci_unmap_page(ctrl->card->dev, dma->dma_addr,
+				       get_dma_size(dma),
+				       dma->cmd == HW_CMD_BLK_WRITE ?
+						   PCI_DMA_TODEVICE :
+						   PCI_DMA_FROMDEVICE);
+		}
+	}
+
+	kmem_cache_free(rsxx_dma_pool, dma);
+}
+
+static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl,
+				  struct rsxx_dma *dma,
+				  unsigned int status)
+{
+	if (status & DMA_SW_ERR)
+		ctrl->stats.dma_sw_err++;
+	if (status & DMA_HW_FAULT)
+		ctrl->stats.dma_hw_fault++;
+	if (status & DMA_CANCELLED)
+		ctrl->stats.dma_cancelled++;
+
+	if (dma->cb)
+		dma->cb(ctrl->card, dma->cb_data, status ? 1 : 0);
+
+	rsxx_free_dma(ctrl, dma);
+}
+
+int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl,
+			   struct list_head *q, unsigned int done)
+{
+	struct rsxx_dma *dma;
+	struct rsxx_dma *tmp;
+	int cnt = 0;
+
+	list_for_each_entry_safe(dma, tmp, q, list) {
+		list_del(&dma->list);
+		if (done & COMPLETE_DMA)
+			rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+		else
+			rsxx_free_dma(ctrl, dma);
+		cnt++;
+	}
+
+	return cnt;
+}
+
+static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl,
+				 struct rsxx_dma *dma)
+{
+	/*
+	 * Requeued DMAs go to the front of the queue so they are issued
+	 * first.
+	 */
+	spin_lock_bh(&ctrl->queue_lock);
+	ctrl->stats.sw_q_depth++;
+	list_add(&dma->list, &ctrl->queue);
+	spin_unlock_bh(&ctrl->queue_lock);
+}
+
+static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl,
+				      struct rsxx_dma *dma,
+				      u8 hw_st)
+{
+	unsigned int status = 0;
+	int requeue_cmd = 0;
+
+	dev_dbg(CARD_TO_DEV(ctrl->card),
+		"Handling DMA error(cmd x%02x, laddr x%08x st:x%02x)\n",
+		dma->cmd, dma->laddr, hw_st);
+
+	if (hw_st & HW_STATUS_CRC)
+		ctrl->stats.crc_errors++;
+	if (hw_st & HW_STATUS_HARD_ERR)
+		ctrl->stats.hard_errors++;
+	if (hw_st & HW_STATUS_SOFT_ERR)
+		ctrl->stats.soft_errors++;
+
+	switch (dma->cmd) {
+	case HW_CMD_BLK_READ:
+		if (hw_st & (HW_STATUS_CRC | HW_STATUS_HARD_ERR)) {
+			if (ctrl->card->scrub_hard) {
+				dma->cmd = HW_CMD_BLK_RECON_READ;
+				requeue_cmd = 1;
+				ctrl->stats.reads_retried++;
+			} else {
+				status |= DMA_HW_FAULT;
+				ctrl->stats.reads_failed++;
+			}
+		} else if (hw_st & HW_STATUS_FAULT) {
+			status |= DMA_HW_FAULT;
+			ctrl->stats.reads_failed++;
+		}
+
+		break;
+	case HW_CMD_BLK_RECON_READ:
+		if (hw_st & (HW_STATUS_CRC | HW_STATUS_HARD_ERR)) {
+			/* Data could not be reconstructed. */
+			status |= DMA_HW_FAULT;
+			ctrl->stats.reads_failed++;
+		}
+
+		break;
+	case HW_CMD_BLK_WRITE:
+		status |= DMA_HW_FAULT;
+		ctrl->stats.writes_failed++;
+
+		break;
+	case HW_CMD_BLK_DISCARD:
+		status |= DMA_HW_FAULT;
+		ctrl->stats.discards_failed++;
+
+		break;
+	default:
+		dev_err(CARD_TO_DEV(ctrl->card),
+			"Unknown command in DMA!(cmd: x%02x "
+			   "laddr x%08x st: x%02x\n",
+			   dma->cmd, dma->laddr, hw_st);
+		status |= DMA_SW_ERR;
+
+		break;
+	}
+
+	if (requeue_cmd)
+		rsxx_requeue_dma(ctrl, dma);
+	else
+		rsxx_complete_dma(ctrl, dma, status);
+}
+
+/* Stall handler, presumably run by ctrl->activity_timer - TODO confirm. */
+static void dma_engine_stalled(unsigned long data)
+{
+	struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data;
+	int cnt;	/* DMAs cleaned up, for the log message below */
+
+	if (atomic_read(&ctrl->stats.hw_q_depth) == 0 ||
+	    unlikely(ctrl->card->eeh_state))
+		return;	/* nothing issued to hardware, or eeh_state is set */
+
+	if (ctrl->cmd.idx != ioread32(ctrl->regmap + SW_CMD_IDX)) {
+		/*
+		 * The dma engine was stalled because the SW_CMD_IDX write
+		 * was lost. Issue it again to recover.
+		 */
+		dev_warn(CARD_TO_DEV(ctrl->card),
+			"SW_CMD_IDX write was lost, re-writing...\n");
+		iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX);
+		mod_timer(&ctrl->activity_timer,
+			  jiffies + DMA_ACTIVITY_TIMEOUT);
+	} else {
+		dev_warn(CARD_TO_DEV(ctrl->card),
+			"DMA channel %d has stalled, faulting interface.\n",
+			ctrl->id);
+		ctrl->card->dma_fault = 1;
+
+		/* Clean up the DMA queue */
+		spin_lock(&ctrl->queue_lock);
+		cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA);
+		spin_unlock(&ctrl->queue_lock);
+
+		cnt += rsxx_dma_cancel(ctrl);
+		if (cnt)
+			dev_info(CARD_TO_DEV(ctrl->card),
+				"Freed %d queued DMAs on channel %d\n",
+				cnt, ctrl->id);
+	}
+}
+
+/* Issue queued DMAs to the hardware while free tracker tags remain. */
+static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl)
+{
+	struct rsxx_dma *dma;
+	int tag;
+	int cmds_pending = 0;
+	struct hw_cmd *hw_cmd_buf;
+	int dir;
+
+	hw_cmd_buf = ctrl->cmd.buf;
+
+	if (unlikely(ctrl->card->halt) ||
+	    unlikely(ctrl->card->eeh_state))
+		return;
+
+	while (1) {
+		spin_lock_bh(&ctrl->queue_lock);
+		if (list_empty(&ctrl->queue)) {
+			spin_unlock_bh(&ctrl->queue_lock);
+			break;
+		}
+		spin_unlock_bh(&ctrl->queue_lock);
+
+		tag = pop_tracker(ctrl->trackers);
+		if (tag == -1)
+			break;
+
+		spin_lock_bh(&ctrl->queue_lock);
+		dma = list_entry(ctrl->queue.next, struct rsxx_dma, list);
+		list_del(&dma->list);
+		ctrl->stats.sw_q_depth--;
+		spin_unlock_bh(&ctrl->queue_lock);
+
+		/*
+		 * This will catch any DMAs that slipped in right before the
+		 * fault, but was queued after all the other DMAs were
+		 * cancelled.
+		 */
+		if (unlikely(ctrl->card->dma_fault)) {
+			push_tracker(ctrl->trackers, tag);
+			rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+			continue;
+		}
+
+		if (dma->cmd != HW_CMD_BLK_DISCARD) {
+			if (dma->cmd == HW_CMD_BLK_WRITE)
+				dir = PCI_DMA_TODEVICE;
+			else
+				dir = PCI_DMA_FROMDEVICE;
+
+			/*
+			 * pci_map_page is called here because, by design, at
+			 * most 255 commands can be issued to the hardware at a
+			 * time per DMA channel. The mapped memory is therefore
+			 * at most 255 * 4 channels * 4096 Bytes, less than the
+			 * 2GB limit of a x8 Non-HWWD PCIe slot, so the mapping
+			 * should never fail for lack of mappable memory. The
+			 * size comes from get_dma_size() so that it matches
+			 * the unmap done in rsxx_free_dma().
+			 */
+			dma->dma_addr = pci_map_page(ctrl->card->dev, dma->page,
+					dma->pg_off, get_dma_size(dma), dir);
+			if (pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) {
+				push_tracker(ctrl->trackers, tag);
+				rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+				continue;
+			}
+		}
+
+		set_tracker_dma(ctrl->trackers, tag, dma);
+		hw_cmd_buf[ctrl->cmd.idx].command  = dma->cmd;
+		hw_cmd_buf[ctrl->cmd.idx].tag      = tag;
+		hw_cmd_buf[ctrl->cmd.idx]._rsvd    = 0;
+		hw_cmd_buf[ctrl->cmd.idx].sub_page =
+					((dma->sub_page.cnt & 0x7) << 4) |
+					 (dma->sub_page.off & 0x7);
+
+		hw_cmd_buf[ctrl->cmd.idx].device_addr =
+					cpu_to_le32(dma->laddr);
+
+		hw_cmd_buf[ctrl->cmd.idx].host_addr =
+					cpu_to_le64(dma->dma_addr);
+
+		dev_dbg(CARD_TO_DEV(ctrl->card),
+			"Issue DMA%d(laddr %d tag %d) to idx %d\n",
+			ctrl->id, dma->laddr, tag, ctrl->cmd.idx);
+
+		ctrl->cmd.idx = (ctrl->cmd.idx + 1) & RSXX_CS_IDX_MASK;
+		cmds_pending++;
+		if (dma->cmd == HW_CMD_BLK_WRITE)
+			ctrl->stats.writes_issued++;
+		else if (dma->cmd == HW_CMD_BLK_DISCARD)
+			ctrl->stats.discards_issued++;
+		else
+			ctrl->stats.reads_issued++;
+	}
+
+	/* Let HW know we've queued commands. */
+	if (cmds_pending) {
+		atomic_add(cmds_pending, &ctrl->stats.hw_q_depth);
+		mod_timer(&ctrl->activity_timer,
+			  jiffies + DMA_ACTIVITY_TIMEOUT);
+
+		if (unlikely(ctrl->card->eeh_state)) {
+			del_timer_sync(&ctrl->activity_timer);
+			return;
+		}
+
+		iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX);
+	}
+}
+
+static void rsxx_dma_done(struct rsxx_dma_ctrl *ctrl)
+{
+	struct rsxx_dma *dma;
+	unsigned long flags;
+	u16 count;
+	u8 status;
+	u8 tag;
+	struct hw_status *hw_st_buf;
+	/* Reap this channel's completed status entries, then re-enable its IRQ. */
+	hw_st_buf = ctrl->status.buf;
+	/* Do nothing while the card is halted, faulted or in an EEH state. */
+	if (unlikely(ctrl->card->halt) ||
+	    unlikely(ctrl->card->dma_fault) ||
+	    unlikely(ctrl->card->eeh_state))
+		return;
+
+	count = le16_to_cpu(hw_st_buf[ctrl->status.idx].count);
+	/* Consume entries for as long as the count matches the expected one. */
+	while (count == ctrl->e_cnt) {
+		/*
+		 * The read memory-barrier is necessary to keep aggressive
+		 * processors/optimizers (such as the PPC Apple G5) from
+		 * reordering the following status-buffer tag & status read
+		 * *before* the count read on subsequent iterations of the
+		 * loop!
+		 */
+		rmb();
+
+		status = hw_st_buf[ctrl->status.idx].status;
+		tag    = hw_st_buf[ctrl->status.idx].tag;
+
+		dma = get_tracker_dma(ctrl->trackers, tag);
+		if (dma == NULL) {
+			spin_lock_irqsave(&ctrl->card->irq_lock, flags);
+			rsxx_disable_ier(ctrl->card, CR_INTR_DMA_ALL);
+			spin_unlock_irqrestore(&ctrl->card->irq_lock, flags);
+			/* DMA interrupts are masked now; report and stop. */
+			dev_err(CARD_TO_DEV(ctrl->card),
+				"No tracker for tag %d "
+				"(idx %d id %d)\n",
+				tag, ctrl->status.idx, ctrl->id);
+			return;
+		}
+
+		dev_dbg(CARD_TO_DEV(ctrl->card),
+			"Completing DMA%d"
+			"(laddr x%x tag %d st: x%x cnt: x%04x) from idx %d.\n",
+			ctrl->id, dma->laddr, tag, status, count,
+			ctrl->status.idx);
+
+		atomic_dec(&ctrl->stats.hw_q_depth);
+		/* Restart the activity timeout from now. */
+		mod_timer(&ctrl->activity_timer,
+			  jiffies + DMA_ACTIVITY_TIMEOUT);
+
+		if (status)
+			rsxx_handle_dma_error(ctrl, dma, status);
+		else
+			rsxx_complete_dma(ctrl, dma, 0);
+		/* Give the tag back to the tracker list (push_tracker). */
+		push_tracker(ctrl->trackers, tag);
+
+		ctrl->status.idx = (ctrl->status.idx + 1) &
+				   RSXX_CS_IDX_MASK;
+		ctrl->e_cnt++;
+
+		count = le16_to_cpu(hw_st_buf[ctrl->status.idx].count);
+	}
+
+	dma_intr_coal_auto_tune(ctrl->card);
+	/* Nothing left in hardware, so the activity timer can be stopped. */
+	if (atomic_read(&ctrl->stats.hw_q_depth) == 0)
+		del_timer_sync(&ctrl->activity_timer);
+	/* Re-enable this channel's DMA interrupt. */
+	spin_lock_irqsave(&ctrl->card->irq_lock, flags);
+	rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id));
+	spin_unlock_irqrestore(&ctrl->card->irq_lock, flags);
+	/* Kick the issue worker if software-queued DMAs are waiting. */
+	spin_lock_bh(&ctrl->queue_lock);
+	if (ctrl->stats.sw_q_depth)
+		queue_work(ctrl->issue_wq, &ctrl->issue_dma_work);
+	spin_unlock_bh(&ctrl->queue_lock);
+}
+
+static void rsxx_schedule_issue(struct work_struct *work)
+{
+	/* Work handler: issue queued DMAs for the channel owning @work. */
+	struct rsxx_dma_ctrl *chan =
+		container_of(work, struct rsxx_dma_ctrl, issue_dma_work);
+
+	mutex_lock(&chan->work_lock);
+	rsxx_issue_dmas(chan);
+	mutex_unlock(&chan->work_lock);
+}
+
+static void rsxx_schedule_done(struct work_struct *work)
+{
+	/* Work handler: reap completions for the channel owning @work. */
+	struct rsxx_dma_ctrl *chan =
+		container_of(work, struct rsxx_dma_ctrl, dma_done_work);
+
+	mutex_lock(&chan->work_lock);
+	rsxx_dma_done(chan);
+	mutex_unlock(&chan->work_lock);
+}
+
+static int rsxx_queue_discard(struct rsxx_cardinfo *card,
+				  struct list_head *q,
+				  unsigned int laddr,
+				  rsxx_dma_cb cb,
+				  void *cb_data)
+{
+	struct rsxx_dma *req = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL);
+
+	if (req == NULL)
+		return -ENOMEM;
+
+	/* A discard carries no data: no page, bus address or sub-page. */
+	req->cmd          = HW_CMD_BLK_DISCARD;
+	req->laddr        = laddr;
+	req->cb           = cb;
+	req->cb_data      = cb_data;
+	req->page         = NULL;
+	req->pg_off       = 0;
+	req->dma_addr     = 0;
+	req->sub_page.off = 0;
+	req->sub_page.cnt = 0;
+
+	dev_dbg(CARD_TO_DEV(card), "Queuing[D] laddr %x\n", req->laddr);
+
+	/* Append to the caller-supplied list; returns 0 on success. */
+	list_add_tail(&req->list, q);
+	return 0;
+}
+
+static int rsxx_queue_dma(struct rsxx_cardinfo *card,
+			      struct list_head *q,
+			      int dir,
+			      unsigned int dma_off,
+			      unsigned int dma_len,
+			      unsigned int laddr,
+			      struct page *page,
+			      unsigned int pg_off,
+			      rsxx_dma_cb cb,
+			      void *cb_data)
+{
+	struct rsxx_dma *req = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL);
+
+	if (req == NULL)
+		return -ENOMEM;
+
+	/* Non-zero dir selects a write, zero a read. */
+	if (dir)
+		req->cmd = HW_CMD_BLK_WRITE;
+	else
+		req->cmd = HW_CMD_BLK_READ;
+	req->laddr        = laddr;
+	req->page         = page;
+	req->pg_off       = pg_off;
+	req->cb           = cb;
+	req->cb_data      = cb_data;
+	req->sub_page.off = dma_off >> 9;	/* byte offset / 512 */
+	req->sub_page.cnt = dma_len >> 9;	/* byte length / 512 */
+
+	dev_dbg(CARD_TO_DEV(card),
+		"Queuing[%c] laddr %x off %d cnt %d page %p pg_off %d\n",
+		dir ? 'W' : 'R', req->laddr, req->sub_page.off,
+		req->sub_page.cnt, req->page, req->pg_off);
+	list_add_tail(&req->list, q);
+	return 0;
+}
+
+int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
+			   struct bio *bio,
+			   atomic_t *n_dmas,
+			   rsxx_dma_cb cb,
+			   void *cb_data)
+{
+	struct list_head dma_list[RSXX_MAX_TARGETS];
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+	unsigned long long addr8;
+	unsigned int laddr;
+	unsigned int bv_len;
+	unsigned int bv_off;
+	unsigned int dma_off;
+	unsigned int dma_len;
+	int dma_cnt[RSXX_MAX_TARGETS];
+	int tgt, st, i;
+
+	/* Split @bio into per-target DMAs; returns 0 or a negative errno. */
+	/* Widen before shifting so a 32-bit sector_t cannot overflow. */
+	addr8 = (u64)bio->bi_iter.bi_sector << 9; /* sectors are 512 bytes */
+	atomic_set(n_dmas, 0);	/* counts the DMAs created for this bio */
+	/* Build private per-target lists so a failure can be unwound. */
+	for (i = 0; i < card->n_targets; i++) {
+		INIT_LIST_HEAD(&dma_list[i]);
+		dma_cnt[i] = 0;
+	}
+
+	if (bio->bi_rw & REQ_DISCARD) {
+		bv_len = bio->bi_iter.bi_size;
+		/* NOTE(review): wraps if bi_size is not a 4K multiple. */
+		while (bv_len > 0) {
+			tgt   = rsxx_get_dma_tgt(card, addr8);
+			laddr = rsxx_addr8_to_laddr(addr8, card);
+
+			st = rsxx_queue_discard(card, &dma_list[tgt], laddr,
+						    cb, cb_data);
+			if (st)
+				goto bvec_err;
+
+			dma_cnt[tgt]++;
+			atomic_inc(n_dmas);
+			addr8  += RSXX_HW_BLK_SIZE;
+			bv_len -= RSXX_HW_BLK_SIZE;
+		}
+	} else {
+		bio_for_each_segment(bvec, bio, iter) {
+			bv_len = bvec.bv_len;
+			bv_off = bvec.bv_offset;
+			/* Cut the segment at hardware-block boundaries. */
+			while (bv_len > 0) {
+				tgt   = rsxx_get_dma_tgt(card, addr8);
+				laddr = rsxx_addr8_to_laddr(addr8, card);
+				dma_off = addr8 & RSXX_HW_BLK_MASK;
+				dma_len = min(bv_len,
+					      RSXX_HW_BLK_SIZE - dma_off);
+
+				st = rsxx_queue_dma(card, &dma_list[tgt],
+							bio_data_dir(bio),
+							dma_off, dma_len,
+							laddr, bvec.bv_page,
+							bv_off, cb, cb_data);
+				if (st)
+					goto bvec_err;
+
+				dma_cnt[tgt]++;
+				atomic_inc(n_dmas);
+				addr8  += dma_len;
+				bv_off += dma_len;
+				bv_len -= dma_len;
+			}
+		}
+	}
+	/* Hand each non-empty list to its channel and kick the worker. */
+	for (i = 0; i < card->n_targets; i++) {
+		if (!list_empty(&dma_list[i])) {
+			spin_lock_bh(&card->ctrl[i].queue_lock);
+			card->ctrl[i].stats.sw_q_depth += dma_cnt[i];
+			list_splice_tail(&dma_list[i], &card->ctrl[i].queue);
+			spin_unlock_bh(&card->ctrl[i].queue_lock);
+
+			queue_work(card->ctrl[i].issue_wq,
+				   &card->ctrl[i].issue_dma_work);
+		}
+	}
+
+	return 0;
+
+bvec_err:
+	for (i = 0; i < card->n_targets; i++)
+		rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i],
+					FREE_DMA);
+
+	return st;
+}
+
+
+/*----------------- DMA Engine Initialization & Setup -------------------*/
+int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl)
+{
+	ctrl->status.buf = pci_alloc_consistent(dev, STATUS_BUFFER_SIZE8,
+				&ctrl->status.dma_addr);
+	ctrl->cmd.buf = pci_alloc_consistent(dev, COMMAND_BUFFER_SIZE8,
+				&ctrl->cmd.dma_addr);
+	if (ctrl->status.buf == NULL || ctrl->cmd.buf == NULL)
+		return -ENOMEM;	/* nothing is freed here on failure */
+	/* Pattern-fill the status buffer and program its bus address. */
+	memset(ctrl->status.buf, 0xac, STATUS_BUFFER_SIZE8);
+	iowrite32(lower_32_bits(ctrl->status.dma_addr),
+		ctrl->regmap + SB_ADD_LO);
+	iowrite32(upper_32_bits(ctrl->status.dma_addr),
+		ctrl->regmap + SB_ADD_HI);
+	/* Same for the command buffer. */
+	memset(ctrl->cmd.buf, 0x83, COMMAND_BUFFER_SIZE8);
+	iowrite32(lower_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_LO);
+	iowrite32(upper_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_HI);
+	/* Resume from the status count the hardware currently reports. */
+	ctrl->status.idx = ioread32(ctrl->regmap + HW_STATUS_CNT);
+	if (ctrl->status.idx > RSXX_MAX_OUTSTANDING_CMDS) {
+		dev_crit(&dev->dev, "Failed reading status cnt x%x\n",
+			ctrl->status.idx);
+		return -EINVAL;
+	}
+	iowrite32(ctrl->status.idx, ctrl->regmap + HW_STATUS_CNT);
+	iowrite32(ctrl->status.idx, ctrl->regmap + SW_STATUS_CNT);
+	/* Likewise for the command index; log the value actually read. */
+	ctrl->cmd.idx = ioread32(ctrl->regmap + HW_CMD_IDX);
+	if (ctrl->cmd.idx > RSXX_MAX_OUTSTANDING_CMDS) {
+		dev_crit(&dev->dev, "Failed reading cmd cnt x%x\n",
+			ctrl->cmd.idx);
+		return -EINVAL;
+	}
+	iowrite32(ctrl->cmd.idx, ctrl->regmap + HW_CMD_IDX);
+	iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX);
+
+	return 0;
+}
+
+static int rsxx_dma_ctrl_init(struct pci_dev *dev,
+				  struct rsxx_dma_ctrl *ctrl)
+{
+	struct dma_tracker_list *tl;
+	int tag;
+
+	/* Start this channel from zeroed statistics. */
+	memset(&ctrl->stats, 0, sizeof(ctrl->stats));
+
+	tl = vmalloc(DMA_TRACKER_LIST_SIZE8);
+	ctrl->trackers = tl;
+	if (tl == NULL)
+		return -ENOMEM;
+
+	/* Chain every tag to its successor; the last one ends with -1. */
+	tl->head = 0;
+	for (tag = 0; tag < RSXX_MAX_OUTSTANDING_CMDS; tag++) {
+		tl->list[tag].next_tag = tag + 1;
+		tl->list[tag].dma = NULL;
+	}
+	tl->list[RSXX_MAX_OUTSTANDING_CMDS - 1].next_tag = -1;
+	spin_lock_init(&tl->lock);
+
+	INIT_LIST_HEAD(&ctrl->queue);
+	spin_lock_init(&ctrl->queue_lock);
+	mutex_init(&ctrl->work_lock);
+	setup_timer(&ctrl->activity_timer, dma_engine_stalled,
+		    (unsigned long)ctrl);
+
+	/* One ordered workqueue each for the issue and completion paths. */
+	ctrl->issue_wq = alloc_ordered_workqueue(DRIVER_NAME"_issue", 0);
+	if (ctrl->issue_wq == NULL)
+		return -ENOMEM;
+
+	ctrl->done_wq = alloc_ordered_workqueue(DRIVER_NAME"_done", 0);
+	if (ctrl->done_wq == NULL)
+		return -ENOMEM;
+
+	INIT_WORK(&ctrl->issue_dma_work, rsxx_schedule_issue);
+	INIT_WORK(&ctrl->dma_done_work, rsxx_schedule_done);
+
+	/* Finally allocate and program the command/status buffers. */
+	return rsxx_hw_buffers_init(dev, ctrl);
+}
+
+static int rsxx_dma_stripe_setup(struct rsxx_cardinfo *card,
+			      unsigned int stripe_size8)
+{
+	u64 low;
+
+	if (!is_power_of_2(stripe_size8)) {
+		dev_err(CARD_TO_DEV(card),
+			"stripe_size is NOT a power of 2!\n");
+		return -EINVAL;
+	}
+	/* stripe_size8 is a power of two, so size - 1 is its low-bit mask. */
+	low = stripe_size8 - 1;
+	card->_stripe.lower_mask = low;
+	card->_stripe.upper_mask = ~low;
+	card->_stripe.target_shift = ffs(stripe_size8) - 1;
+	card->_stripe.upper_shift = ffs(card->n_targets) - 1;
+	card->_stripe.target_mask = card->n_targets - 1;
+
+	dev_dbg(CARD_TO_DEV(card), "_stripe.lower_mask   = x%016llx\n",
+		card->_stripe.lower_mask);
+	dev_dbg(CARD_TO_DEV(card), "_stripe.upper_shift  = x%016llx\n",
+		card->_stripe.upper_shift);
+	dev_dbg(CARD_TO_DEV(card), "_stripe.upper_mask   = x%016llx\n",
+		card->_stripe.upper_mask);
+	dev_dbg(CARD_TO_DEV(card), "_stripe.target_mask  = x%016llx\n",
+		card->_stripe.target_mask);
+	dev_dbg(CARD_TO_DEV(card), "_stripe.target_shift = x%016llx\n",
+		card->_stripe.target_shift);
+	return 0;
+}
+
+int rsxx_dma_configure(struct rsxx_cardinfo *card)
+{
+	const struct card_cfg_data *cfg = &card->config.data;
+	u32 coal_reg;
+
+	/* Program interrupt coalescing, then derive the stripe masks. */
+	coal_reg = dma_intr_coal_val(cfg->intr_coal.mode, cfg->intr_coal.count,
+				     cfg->intr_coal.latency);
+	iowrite32(coal_reg, card->regmap + INTR_COAL);
+	return rsxx_dma_stripe_setup(card, cfg->stripe_size);
+}
+
+int rsxx_dma_setup(struct rsxx_cardinfo *card)
+{
+	unsigned long flags;
+	int st;
+	int i;
+	/* Bring up every DMA channel of @card and enable its interrupt. */
+	dev_info(CARD_TO_DEV(card),
+		"Initializing %d DMA targets\n",
+		card->n_targets);
+
+	/* Regmap is divided up into 4K chunks. One for each DMA channel */
+	for (i = 0; i < card->n_targets; i++)
+		card->ctrl[i].regmap = card->regmap + (i * 4096);
+
+	card->dma_fault = 0;
+
+	/* Reset the DMA queues */
+	rsxx_dma_queue_reset(card);
+
+	/************* Setup DMA Control *************/
+	for (i = 0; i < card->n_targets; i++) {
+		st = rsxx_dma_ctrl_init(card->dev, &card->ctrl[i]);
+		if (st)
+			goto failed_dma_setup;
+
+		card->ctrl[i].card = card;
+		card->ctrl[i].id = i;
+	}
+
+	card->scrub_hard = 1;
+	/* NOTE(review): rsxx_dma_configure()'s return value is ignored. */
+	if (card->config_valid)
+		rsxx_dma_configure(card);
+
+	/* Enable the interrupts after all setup has completed. */
+	for (i = 0; i < card->n_targets; i++) {
+		spin_lock_irqsave(&card->irq_lock, flags);
+		rsxx_enable_ier_and_isr(card, CR_INTR_DMA(i));
+		spin_unlock_irqrestore(&card->irq_lock, flags);
+	}
+
+	return 0;
+
+failed_dma_setup:
+	for (i = 0; i < card->n_targets; i++) {
+		struct rsxx_dma_ctrl *ctrl = &card->ctrl[i];
+		/* All channels are visited; untouched ones must hold NULLs. */
+		if (ctrl->issue_wq) {
+			destroy_workqueue(ctrl->issue_wq);
+			ctrl->issue_wq = NULL;
+		}
+
+		if (ctrl->done_wq) {
+			destroy_workqueue(ctrl->done_wq);
+			ctrl->done_wq = NULL;
+		}
+
+		if (ctrl->trackers)
+			vfree(ctrl->trackers);
+
+		if (ctrl->status.buf)
+			pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8,
+					    ctrl->status.buf,
+					    ctrl->status.dma_addr);
+		if (ctrl->cmd.buf)
+			pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8,
+					    ctrl->cmd.buf, ctrl->cmd.dma_addr);
+	}
+
+	return st;
+}
+
+int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl)
+{
+	int cancelled = 0;
+	int tag;
+
+	/* Walk every tag and cancel any DMA still attached to it. */
+	for (tag = 0; tag < RSXX_MAX_OUTSTANDING_CMDS; tag++) {
+		struct rsxx_dma *pending;
+
+		pending = get_tracker_dma(ctrl->trackers, tag);
+		if (pending == NULL)
+			continue;
+		atomic_dec(&ctrl->stats.hw_q_depth);
+		rsxx_complete_dma(ctrl, pending, DMA_CANCELLED);
+		push_tracker(ctrl->trackers, tag);
+		cancelled++;
+	}
+	return cancelled;
+}
+
+void rsxx_dma_destroy(struct rsxx_cardinfo *card)
+{
+	struct rsxx_dma_ctrl *ctrl;
+	int i;
+	/* Tear down every channel: workers, timer, queues and buffers. */
+	for (i = 0; i < card->n_targets; i++) {
+		ctrl = &card->ctrl[i];
+
+		if (ctrl->issue_wq) {
+			destroy_workqueue(ctrl->issue_wq);
+			ctrl->issue_wq = NULL;
+		}
+
+		if (ctrl->done_wq) {
+			destroy_workqueue(ctrl->done_wq);
+			ctrl->done_wq = NULL;
+		}
+
+		if (timer_pending(&ctrl->activity_timer))
+			del_timer_sync(&ctrl->activity_timer);
+
+		/* Clean up the DMA queue */
+		spin_lock_bh(&ctrl->queue_lock);
+		rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA);
+		spin_unlock_bh(&ctrl->queue_lock);
+		/* Cancel whatever is still attached to a hardware tag. */
+		rsxx_dma_cancel(ctrl);
+
+		vfree(ctrl->trackers);
+		/* Release the status and command buffers. */
+		pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8,
+				    ctrl->status.buf, ctrl->status.dma_addr);
+		pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8,
+				    ctrl->cmd.buf, ctrl->cmd.dma_addr);
+	}
+}
+
+int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
+{
+	int i;
+	int j;
+	int cnt;
+	struct rsxx_dma *dma;
+	struct list_head *issued_dmas;
+	/* Move every issued DMA back onto its channel's software queue. */
+	issued_dmas = kcalloc(card->n_targets, sizeof(*issued_dmas),
+			      GFP_KERNEL);
+	if (!issued_dmas)
+		return -ENOMEM;
+
+	for (i = 0; i < card->n_targets; i++) {
+		INIT_LIST_HEAD(&issued_dmas[i]);
+		cnt = 0;
+		for (j = 0; j < RSXX_MAX_OUTSTANDING_CMDS; j++) {
+			dma = get_tracker_dma(card->ctrl[i].trackers, j);
+			if (dma == NULL)
+				continue;
+			/* Undo the "issued" accounting for this DMA. */
+			if (dma->cmd == HW_CMD_BLK_WRITE)
+				card->ctrl[i].stats.writes_issued--;
+			else if (dma->cmd == HW_CMD_BLK_DISCARD)
+				card->ctrl[i].stats.discards_issued--;
+			else
+				card->ctrl[i].stats.reads_issued--;
+			/* Discards are skipped; reads and writes are unmapped. */
+			if (dma->cmd != HW_CMD_BLK_DISCARD) {
+				pci_unmap_page(card->dev, dma->dma_addr,
+					       get_dma_size(dma),
+					       dma->cmd == HW_CMD_BLK_WRITE ?
+					       PCI_DMA_TODEVICE :
+					       PCI_DMA_FROMDEVICE);
+			}
+
+			list_add_tail(&dma->list, &issued_dmas[i]);
+			push_tracker(card->ctrl[i].trackers, j);
+			cnt++;
+		}
+		/* Splice at the head, ahead of anything already queued. */
+		spin_lock_bh(&card->ctrl[i].queue_lock);
+		list_splice(&issued_dmas[i], &card->ctrl[i].queue);
+
+		atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth);
+		card->ctrl[i].stats.sw_q_depth += cnt;
+		card->ctrl[i].e_cnt = 0;
+		spin_unlock_bh(&card->ctrl[i].queue_lock);
+	}
+
+	kfree(issued_dmas);
+
+	return 0;
+}
+
+int rsxx_dma_init(void)
+{
+	/* Create the slab cache used for struct rsxx_dma descriptors. */
+	rsxx_dma_pool = KMEM_CACHE(rsxx_dma, SLAB_HWCACHE_ALIGN);
+
+	/* A NULL cache pointer is reported to the caller as -ENOMEM. */
+	return rsxx_dma_pool ? 0 : -ENOMEM;
+}
+
+
+void rsxx_dma_cleanup(void)
+{
+	kmem_cache_destroy(rsxx_dma_pool);	/* cache from rsxx_dma_init() */
+}
+
diff --git a/drivers/block/rsxx/rsxx.h b/drivers/block/rsxx/rsxx.h
new file mode 100644
index 00000000000..24ba3642bd8
--- /dev/null
+++ b/drivers/block/rsxx/rsxx.h
@@ -0,0 +1,47 @@
+/*
+* Filename: rsxx.h
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+*	Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#ifndef __RSXX_H__
+#define __RSXX_H__
+
+/*----------------- IOCTL Definitions -------------------*/
+
+#define RSXX_MAX_DATA 8
+
+struct rsxx_reg_access {
+	__u32 addr;
+	__u32 cnt;
+	__u32 stat;
+	__u32 stream;
+	__u32 data[RSXX_MAX_DATA];
+};
+
+#define RSXX_MAX_REG_CNT	(RSXX_MAX_DATA * (sizeof(__u32)))
+
+#define RSXX_IOC_MAGIC 'r'
+
+#define RSXX_GETREG _IOWR(RSXX_IOC_MAGIC, 0x20, struct rsxx_reg_access)
+#define RSXX_SETREG _IOWR(RSXX_IOC_MAGIC, 0x21, struct rsxx_reg_access)
+
+#endif /* __RSXX_H__ */
diff --git a/drivers/block/rsxx/rsxx_cfg.h b/drivers/block/rsxx/rsxx_cfg.h
new file mode 100644
index 00000000000..f384c943846
--- /dev/null
+++ b/drivers/block/rsxx/rsxx_cfg.h
@@ -0,0 +1,72 @@
+/*
+* Filename: rsxx_cfg.h
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+*	Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#ifndef __RSXX_CFG_H__
+#define __RSXX_CFG_H__
+
+/* NOTE: Config values will be saved in network byte order (i.e. Big endian) */
+#include <linux/types.h>
+
+/*
+ * The card config version must match the driver's expected version. If it does
+ * not, the DMA interfaces will not be attached and the user will need to
+ * initialize/upgrade the card configuration using the card config utility.
+ */
+#define RSXX_CFG_VERSION	4
+
+struct card_cfg_hdr {
+	__u32	version;
+	__u32	crc;
+};
+
+struct card_cfg_data {
+	__u32	block_size;
+	__u32	stripe_size;
+	__u32	vendor_id;
+	__u32	cache_order;
+	struct {
+		__u32	mode;	/* Disabled, manual, auto-tune... */
+		__u32	count;	/* Number of intr to coalesce     */
+		__u32	latency;/* Max wait time (in ns)          */
+	} intr_coal;
+};
+
+struct rsxx_card_cfg {
+	struct card_cfg_hdr	hdr;
+	struct card_cfg_data	data;
+};
+
+/* Vendor ID Values */
+#define RSXX_VENDOR_ID_IBM		0
+#define RSXX_VENDOR_ID_DSI		1
+#define RSXX_VENDOR_COUNT		2
+
+/* Interrupt Coalescing Values */
+#define RSXX_INTR_COAL_DISABLED           0
+#define RSXX_INTR_COAL_EXPLICIT           1
+#define RSXX_INTR_COAL_AUTO_TUNE          2
+
+
+#endif /* __RSXX_CFG_H__ */
+
diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h
new file mode 100644
index 00000000000..6bbc64d0f69
--- /dev/null
+++ b/drivers/block/rsxx/rsxx_priv.h
@@ -0,0 +1,434 @@
+/*
+* Filename: rsxx_priv.h
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+*	Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#ifndef __RSXX_PRIV_H__
+#define __RSXX_PRIV_H__
+
+#include <linux/version.h>
+#include <linux/semaphore.h>
+
+#include <linux/fs.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include <linux/sysfs.h>
+#include <linux/workqueue.h>
+#include <linux/bio.h>
+#include <linux/vmalloc.h>
+#include <linux/timer.h>
+#include <linux/ioctl.h>
+#include <linux/delay.h>
+
+#include "rsxx.h"
+#include "rsxx_cfg.h"
+
+struct proc_cmd;
+
+#define PCI_DEVICE_ID_FS70_FLASH	0x04A9
+#define PCI_DEVICE_ID_FS80_FLASH	0x04AA
+
+#define RS70_PCI_REV_SUPPORTED	4
+
+#define DRIVER_NAME "rsxx"
+#define DRIVER_VERSION "4.0.3.2516"
+
+/* Block size is 4096 */
+#define RSXX_HW_BLK_SHIFT		12
+#define RSXX_HW_BLK_SIZE		(1 << RSXX_HW_BLK_SHIFT)
+#define RSXX_HW_BLK_MASK		(RSXX_HW_BLK_SIZE - 1)
+
+#define MAX_CREG_DATA8	32
+#define LOG_BUF_SIZE8	128
+
+#define RSXX_MAX_OUTSTANDING_CMDS	255
+#define RSXX_CS_IDX_MASK		0xff
+
+#define STATUS_BUFFER_SIZE8     4096
+#define COMMAND_BUFFER_SIZE8    4096
+
+#define RSXX_MAX_TARGETS	8
+
+struct dma_tracker_list;
+
+/* DMA Command/Status Buffer structure */
+struct rsxx_cs_buffer {
+	dma_addr_t	dma_addr;	/* bus address written to the HW */
+	void		*buf;		/* CPU address of the same buffer */
+	u32		idx;		/* current entry index in buf */
+};
+
+struct rsxx_dma_stats {
+	u32 crc_errors;
+	u32 hard_errors;
+	u32 soft_errors;
+	u32 writes_issued;
+	u32 writes_failed;
+	u32 reads_issued;
+	u32 reads_failed;
+	u32 reads_retried;
+	u32 discards_issued;
+	u32 discards_failed;
+	u32 done_rescheduled;
+	u32 issue_rescheduled;
+	u32 dma_sw_err;
+	u32 dma_hw_fault;
+	u32 dma_cancelled;
+	u32 sw_q_depth;		/* Number of DMAs on the SW queue. */
+	atomic_t hw_q_depth;	/* Number of DMAs queued to HW. */
+};
+
+struct rsxx_dma_ctrl {
+	struct rsxx_cardinfo		*card;	/* owning card */
+	int				id;	/* channel index on the card */
+	void				__iomem *regmap; /* channel registers */
+	struct rsxx_cs_buffer		status;	/* HW status buffer */
+	struct rsxx_cs_buffer		cmd;	/* HW command buffer */
+	u16				e_cnt;	/* expected status count */
+	spinlock_t			queue_lock; /* guards queue */
+	struct list_head		queue;	/* DMAs not yet issued */
+	struct workqueue_struct		*issue_wq;
+	struct work_struct		issue_dma_work; /* rsxx_schedule_issue */
+	struct workqueue_struct		*done_wq;
+	struct work_struct		dma_done_work; /* rsxx_schedule_done */
+	struct timer_list		activity_timer; /* dma_engine_stalled */
+	struct dma_tracker_list		*trackers; /* tag -> issued DMA */
+	struct rsxx_dma_stats		stats;
+	struct mutex			work_lock; /* held by both work fns */
+};
+
+struct rsxx_cardinfo {
+	struct pci_dev		*dev;
+	unsigned int		halt;
+	unsigned int		eeh_state;
+
+	void			__iomem *regmap;
+	spinlock_t		irq_lock;
+	unsigned int		isr_mask;
+	unsigned int		ier_mask;
+
+	struct rsxx_card_cfg	config;
+	int			config_valid;
+
+	/* Embedded CPU Communication */
+	struct {
+		spinlock_t		lock;
+		bool			active;
+		struct creg_cmd		*active_cmd;
+		struct workqueue_struct	*creg_wq;
+		struct work_struct	done_work;
+		struct list_head	queue;
+		unsigned int		q_depth;
+		/* Cache the creg status to prevent ioreads */
+		struct {
+			u32		stat;
+			u32		failed_cancel_timer;
+			u32		creg_timeout;
+		} creg_stats;
+		struct timer_list	cmd_timer;
+		struct mutex		reset_lock;
+		int			reset;
+	} creg_ctrl;
+
+	struct {
+		char tmp[MAX_CREG_DATA8];
+		char buf[LOG_BUF_SIZE8]; /* terminated */
+		int buf_len;
+	} log;
+
+	struct workqueue_struct	*event_wq;
+	struct work_struct	event_work;
+	unsigned int		state;
+	u64			size8;
+
+	/* Lock the device attach/detach function */
+	struct mutex		dev_lock;
+
+	/* Block Device Variables */
+	bool			bdev_attached;
+	int			disk_id;
+	int			major;
+	struct request_queue	*queue;
+	struct gendisk		*gendisk;
+	struct {
+		/* Used to convert a byte address to a device address. */
+		u64 lower_mask;
+		u64 upper_shift;
+		u64 upper_mask;
+		u64 target_mask;
+		u64 target_shift;
+	} _stripe;
+	unsigned int		dma_fault;
+
+	int			scrub_hard;
+
+	int			n_targets;
+	struct rsxx_dma_ctrl	*ctrl;
+
+	struct dentry		*debugfs_dir;
+};
+
+enum rsxx_pci_regmap {
+	HWID		= 0x00,	/* Hardware Identification Register */
+	SCRATCH		= 0x04, /* Scratch/Debug Register */
+	RESET		= 0x08, /* Reset Register */
+	ISR		= 0x10, /* Interrupt Status Register */
+	IER		= 0x14, /* Interrupt Enable Register */
+	IPR		= 0x18, /* Interrupt Poll Register */
+	CB_ADD_LO	= 0x20, /* Command Host Buffer Address [31:0] */
+	CB_ADD_HI	= 0x24, /* Command Host Buffer Address [63:32]*/
+	HW_CMD_IDX	= 0x28, /* Hardware Processed Command Index */
+	SW_CMD_IDX	= 0x2C, /* Software Processed Command Index */
+	SB_ADD_LO	= 0x30, /* Status Host Buffer Address [31:0] */
+	SB_ADD_HI	= 0x34, /* Status Host Buffer Address [63:32] */
+	HW_STATUS_CNT	= 0x38, /* Hardware Status Counter */
+	SW_STATUS_CNT	= 0x3C, /* Deprecated */
+	CREG_CMD	= 0x40, /* CPU Command Register */
+	CREG_ADD	= 0x44, /* CPU Address Register */
+	CREG_CNT	= 0x48, /* CPU Count Register */
+	CREG_STAT	= 0x4C, /* CPU Status Register */
+	CREG_DATA0	= 0x50, /* CPU Data Registers */
+	CREG_DATA1	= 0x54,
+	CREG_DATA2	= 0x58,
+	CREG_DATA3	= 0x5C,
+	CREG_DATA4	= 0x60,
+	CREG_DATA5	= 0x64,
+	CREG_DATA6	= 0x68,
+	CREG_DATA7	= 0x6c,
+	INTR_COAL	= 0x70, /* Interrupt Coalescing Register */
+	HW_ERROR	= 0x74, /* Card Error Register */
+	PCI_DEBUG0	= 0x78, /* PCI Debug Registers */
+	PCI_DEBUG1	= 0x7C,
+	PCI_DEBUG2	= 0x80,
+	PCI_DEBUG3	= 0x84,
+	PCI_DEBUG4	= 0x88,
+	PCI_DEBUG5	= 0x8C,
+	PCI_DEBUG6	= 0x90,
+	PCI_DEBUG7	= 0x94,
+	PCI_POWER_THROTTLE = 0x98,
+	PERF_CTRL	= 0x9c,
+	PERF_TIMER_LO	= 0xa0,
+	PERF_TIMER_HI	= 0xa4,
+	PERF_RD512_LO	= 0xa8,
+	PERF_RD512_HI	= 0xac,
+	PERF_WR512_LO	= 0xb0,
+	PERF_WR512_HI	= 0xb4,
+	PCI_RECONFIG	= 0xb8,
+};
+
+enum rsxx_intr {
+	CR_INTR_DMA0	= 0x00000001,
+	CR_INTR_CREG	= 0x00000002,
+	CR_INTR_DMA1	= 0x00000004,
+	CR_INTR_EVENT	= 0x00000008,
+	CR_INTR_DMA2	= 0x00000010,
+	CR_INTR_DMA3	= 0x00000020,
+	CR_INTR_DMA4	= 0x00000040,
+	CR_INTR_DMA5	= 0x00000080,
+	CR_INTR_DMA6	= 0x00000100,
+	CR_INTR_DMA7	= 0x00000200,
+	CR_INTR_ALL_C	= 0x0000003f,
+	CR_INTR_ALL_G	= 0x000003ff,
+	CR_INTR_DMA_ALL = 0x000003f5,
+	CR_INTR_ALL	= 0xffffffff,
+};
+
+static inline int CR_INTR_DMA(int N)
+{
+	static const unsigned int _CR_INTR_DMA[] = {
+		CR_INTR_DMA0, CR_INTR_DMA1, CR_INTR_DMA2, CR_INTR_DMA3,
+		CR_INTR_DMA4, CR_INTR_DMA5, CR_INTR_DMA6, CR_INTR_DMA7
+	};
+	return _CR_INTR_DMA[N];	/* N must be 0..7; it is not range-checked */
+}
+enum rsxx_pci_reset {
+	DMA_QUEUE_RESET		= 0x00000001,
+};
+
+enum rsxx_hw_fifo_flush {
+	RSXX_FLUSH_BUSY		= 0x00000002,
+	RSXX_FLUSH_TIMEOUT	= 0x00000004,
+};
+
+enum rsxx_pci_revision {
+	RSXX_DISCARD_SUPPORT = 2,
+	RSXX_EEH_SUPPORT     = 3,
+};
+
+enum rsxx_creg_cmd {
+	CREG_CMD_TAG_MASK	= 0x0000FF00,
+	CREG_OP_WRITE		= 0x000000C0,
+	CREG_OP_READ		= 0x000000E0,
+};
+
+enum rsxx_creg_addr {
+	CREG_ADD_CARD_CMD		= 0x80001000,
+	CREG_ADD_CARD_STATE		= 0x80001004,
+	CREG_ADD_CARD_SIZE		= 0x8000100c,
+	CREG_ADD_CAPABILITIES		= 0x80001050,
+	CREG_ADD_LOG			= 0x80002000,
+	CREG_ADD_NUM_TARGETS		= 0x80003000,
+	CREG_ADD_CRAM			= 0xA0000000,
+	CREG_ADD_CONFIG			= 0xB0000000,
+};
+
+enum rsxx_creg_card_cmd {
+	CARD_CMD_STARTUP		= 1,
+	CARD_CMD_SHUTDOWN		= 2,
+	CARD_CMD_LOW_LEVEL_FORMAT	= 3,
+	CARD_CMD_FPGA_RECONFIG_BR	= 4,
+	CARD_CMD_FPGA_RECONFIG_MAIN	= 5,
+	CARD_CMD_BACKUP			= 6,
+	CARD_CMD_RESET			= 7,
+	CARD_CMD_deprecated		= 8,
+	CARD_CMD_UNINITIALIZE		= 9,
+	CARD_CMD_DSTROY_EMERGENCY	= 10,
+	CARD_CMD_DSTROY_NORMAL		= 11,
+	CARD_CMD_DSTROY_EXTENDED	= 12,
+	CARD_CMD_DSTROY_ABORT		= 13,
+};
+
+enum rsxx_card_state {
+	CARD_STATE_SHUTDOWN		= 0x00000001,
+	CARD_STATE_STARTING		= 0x00000002,
+	CARD_STATE_FORMATTING		= 0x00000004,
+	CARD_STATE_UNINITIALIZED	= 0x00000008,
+	CARD_STATE_GOOD			= 0x00000010,
+	CARD_STATE_SHUTTING_DOWN	= 0x00000020,
+	CARD_STATE_FAULT		= 0x00000040,
+	CARD_STATE_RD_ONLY_FAULT	= 0x00000080,
+	CARD_STATE_DSTROYING		= 0x00000100,
+};
+
+enum rsxx_led {
+	LED_DEFAULT	= 0x0,
+	LED_IDENTIFY	= 0x1,
+	LED_SOAK	= 0x2,
+};
+
+enum rsxx_creg_flash_lock {
+	CREG_FLASH_LOCK		= 1,
+	CREG_FLASH_UNLOCK	= 2,
+};
+
+enum rsxx_card_capabilities {
+	CARD_CAP_SUBPAGE_WRITES = 0x00000080,
+};
+
+enum rsxx_creg_stat {
+	CREG_STAT_STATUS_MASK	= 0x00000003,
+	CREG_STAT_SUCCESS	= 0x1,
+	CREG_STAT_ERROR		= 0x2,
+	CREG_STAT_CHAR_PENDING	= 0x00000004, /* Character I/O pending bit */
+	CREG_STAT_LOG_PENDING	= 0x00000008, /* HW log message pending bit */
+	CREG_STAT_TAG_MASK	= 0x0000ff00,
+};
+
+enum rsxx_dma_finish {
+	FREE_DMA	= 0x0,	/* passed on the bio-queueing error path */
+	COMPLETE_DMA	= 0x1,	/* passed when a channel queue is torn down */
+};
+
+static inline unsigned int CREG_DATA(int N)
+{
+	return CREG_DATA0 + (N << 2);	/* data registers are 4 bytes apart */
+}
+
+/*----------------- Convenient Log Wrappers -------------------*/
+#define CARD_TO_DEV(__CARD)	(&(__CARD)->dev->dev)
+
+/***** config.c *****/
+int rsxx_load_config(struct rsxx_cardinfo *card);
+
+/***** core.c *****/
+void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr);
+void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr);
+void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card,
+				 unsigned int intr);
+void rsxx_disable_ier_and_isr(struct rsxx_cardinfo *card,
+				  unsigned int intr);
+
+/***** dev.c *****/
+int rsxx_attach_dev(struct rsxx_cardinfo *card);
+void rsxx_detach_dev(struct rsxx_cardinfo *card);
+int rsxx_setup_dev(struct rsxx_cardinfo *card);
+void rsxx_destroy_dev(struct rsxx_cardinfo *card);
+int rsxx_dev_init(void);
+void rsxx_dev_cleanup(void);
+
+/***** dma.c ****/
+typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card,
+				void *cb_data,
+				unsigned int status);
+int rsxx_dma_setup(struct rsxx_cardinfo *card);
+void rsxx_dma_destroy(struct rsxx_cardinfo *card);
+int rsxx_dma_init(void);
+int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl,
+				struct list_head *q,
+				unsigned int done);
+int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl);
+void rsxx_dma_cleanup(void);
+void rsxx_dma_queue_reset(struct rsxx_cardinfo *card);
+int rsxx_dma_configure(struct rsxx_cardinfo *card);
+int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
+			   struct bio *bio,
+			   atomic_t *n_dmas,
+			   rsxx_dma_cb cb,
+			   void *cb_data);
+int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl);
+int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card);
+int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card);
+
+/***** cregs.c *****/
+int rsxx_creg_write(struct rsxx_cardinfo *card, u32 addr,
+			unsigned int size8,
+			void *data,
+			int byte_stream);
+int rsxx_creg_read(struct rsxx_cardinfo *card,
+		       u32 addr,
+		       unsigned int size8,
+		       void *data,
+		       int byte_stream);
+int rsxx_read_hw_log(struct rsxx_cardinfo *card);
+int rsxx_get_card_state(struct rsxx_cardinfo *card,
+			    unsigned int *state);
+int rsxx_get_card_size8(struct rsxx_cardinfo *card, u64 *size8);
+int rsxx_get_num_targets(struct rsxx_cardinfo *card,
+			     unsigned int *n_targets);
+int rsxx_get_card_capabilities(struct rsxx_cardinfo *card,
+				   u32 *capabilities);
+int rsxx_issue_card_cmd(struct rsxx_cardinfo *card, u32 cmd);
+int rsxx_creg_setup(struct rsxx_cardinfo *card);
+void rsxx_creg_destroy(struct rsxx_cardinfo *card);
+int rsxx_creg_init(void);
+void rsxx_creg_cleanup(void);
+int rsxx_reg_access(struct rsxx_cardinfo *card,
+			struct rsxx_reg_access __user *ucmd,
+			int read);
+void rsxx_eeh_save_issued_creg(struct rsxx_cardinfo *card);
+void rsxx_kick_creg_queue(struct rsxx_cardinfo *card);
+
+
+
+#endif /* __RSXX_PRIV_H__ */
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
new file mode 100644
index 00000000000..608532d3f8c
--- /dev/null
+++ b/drivers/block/skd_main.c
@@ -0,0 +1,5417 @@
+/* Copyright 2012 STEC, Inc.
+ *
+ * This file is licensed under the terms of the 3-clause
+ * BSD License (http://opensource.org/licenses/BSD-3-Clause)
+ * or the GNU GPL-2.0 (http://www.gnu.org/licenses/gpl-2.0.html),
+ * at your option. Both licenses are also available in the LICENSE file
+ * distributed with this project. This file may not be copied, modified,
+ * or distributed except in accordance with those terms.
+ * Gordoni Waidhofer <gwaidhofer@stec-inc.com>
+ * Initial Driver Design!
+ * Thomas Swann <tswann@stec-inc.com>
+ * Interrupt handling.
+ * Ramprasad Chinthekindi <rchinthekindi@stec-inc.com>
+ * biomode implementation.
+ * Akhil Bhansali <abhansali@stec-inc.com>
+ * Added support for DISCARD / FLUSH and FUA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/compiler.h>
+#include <linux/workqueue.h>
+#include <linux/bitops.h>
+#include <linux/delay.h>
+#include <linux/time.h>
+#include <linux/hdreg.h>
+#include <linux/dma-mapping.h>
+#include <linux/completion.h>
+#include <linux/scatterlist.h>
+#include <linux/version.h>
+#include <linux/err.h>
+#include <linux/scatterlist.h>
+#include <linux/aer.h>
+#include <linux/ctype.h>
+#include <linux/wait.h>
+#include <linux/uio.h>
+#include <scsi/scsi.h>
+#include <scsi/sg.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
+#include <asm/unaligned.h>
+
+#include "skd_s1120.h"
+
+static int skd_dbg_level;
+static int skd_isr_comp_limit = 4;
+
+enum {
+	STEC_LINK_2_5GTS = 0,
+	STEC_LINK_5GTS = 1,
+	STEC_LINK_8GTS = 2,
+	STEC_LINK_UNKNOWN = 0xFF
+};
+
+enum {
+	SKD_FLUSH_INITIALIZER,
+	SKD_FLUSH_ZERO_SIZE_FIRST,
+	SKD_FLUSH_DATA_SECOND,
+};
+
+#define SKD_ASSERT(expr) \
+	do { \
+		if (unlikely(!(expr))) { \
+			pr_err("Assertion failed! %s,%s,%s,line=%d\n",	\
+			       # expr, __FILE__, __func__, __LINE__); \
+		} \
+	} while (0)
+
+#define DRV_NAME "skd"
+#define DRV_VERSION "2.2.1"
+#define DRV_BUILD_ID "0260"
+#define PFX DRV_NAME ": "
+#define DRV_BIN_VERSION 0x100
+#define DRV_VER_COMPL   "2.2.1." DRV_BUILD_ID
+
+MODULE_AUTHOR("bug-reports: support@stec-inc.com");
+MODULE_LICENSE("Dual BSD/GPL");
+
+MODULE_DESCRIPTION("STEC s1120 PCIe SSD block driver (b" DRV_BUILD_ID ")");
+MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID);
+
+#define PCI_VENDOR_ID_STEC      0x1B39
+#define PCI_DEVICE_ID_S1120     0x0001
+
+#define SKD_FUA_NV		(1 << 1)
+#define SKD_MINORS_PER_DEVICE   16
+
+#define SKD_MAX_QUEUE_DEPTH     200u
+
+#define SKD_PAUSE_TIMEOUT       (5 * 1000)
+
+#define SKD_N_FITMSG_BYTES      (512u)
+
+#define SKD_N_SPECIAL_CONTEXT   32u
+#define SKD_N_SPECIAL_FITMSG_BYTES      (128u)
+
+/* SG elements are 32 bytes, so we can make this 4096 and still be under the
+ * 128KB limit.  That allows 4096*4K = 16M xfer size
+ */
+#define SKD_N_SG_PER_REQ_DEFAULT 256u
+#define SKD_N_SG_PER_SPECIAL    256u
+
+#define SKD_N_COMPLETION_ENTRY  256u
+#define SKD_N_READ_CAP_BYTES    (8u)
+
+#define SKD_N_INTERNAL_BYTES    (512u)
+
+/* Uniquifier: the id bits above the slot/table fields (0xFC00 in a u16 id) */
+#define SKD_ID_INCR             (0x400)
+#define SKD_ID_TABLE_MASK       (3u << 8u)
+#define  SKD_ID_RW_REQUEST      (0u << 8u)
+#define  SKD_ID_INTERNAL        (1u << 8u)
+#define  SKD_ID_SPECIAL_REQUEST (2u << 8u)
+#define  SKD_ID_FIT_MSG         (3u << 8u)
+#define SKD_ID_SLOT_MASK        0x00FFu
+#define SKD_ID_SLOT_AND_TABLE_MASK 0x03FFu
+
+#define SKD_N_TIMEOUT_SLOT      4u
+#define SKD_TIMEOUT_SLOT_MASK   3u
+
+#define SKD_N_MAX_SECTORS 2048u
+
+#define SKD_MAX_RETRIES 2u
+
+#define SKD_TIMER_SECONDS(seconds) (seconds)
+#define SKD_TIMER_MINUTES(minutes) ((minutes) * (60))
+
+#define INQ_STD_NBYTES 36
+#define SKD_DISCARD_CDB_LENGTH	24
+
+enum skd_drvr_state {
+	SKD_DRVR_STATE_LOAD,
+	SKD_DRVR_STATE_IDLE,
+	SKD_DRVR_STATE_BUSY,
+	SKD_DRVR_STATE_STARTING,
+	SKD_DRVR_STATE_ONLINE,
+	SKD_DRVR_STATE_PAUSING,
+	SKD_DRVR_STATE_PAUSED,
+	SKD_DRVR_STATE_DRAINING_TIMEOUT,
+	SKD_DRVR_STATE_RESTARTING,
+	SKD_DRVR_STATE_RESUMING,
+	SKD_DRVR_STATE_STOPPING,
+	SKD_DRVR_STATE_FAULT,
+	SKD_DRVR_STATE_DISAPPEARED,
+	SKD_DRVR_STATE_PROTOCOL_MISMATCH,
+	SKD_DRVR_STATE_BUSY_ERASE,
+	SKD_DRVR_STATE_BUSY_SANITIZE,
+	SKD_DRVR_STATE_BUSY_IMMINENT,
+	SKD_DRVR_STATE_WAIT_BOOT,
+	SKD_DRVR_STATE_SYNCING,
+};
+
+#define SKD_WAIT_BOOT_TIMO      SKD_TIMER_SECONDS(90u)
+#define SKD_STARTING_TIMO       SKD_TIMER_SECONDS(8u)
+#define SKD_RESTARTING_TIMO     SKD_TIMER_MINUTES(4u)
+#define SKD_DRAINING_TIMO       SKD_TIMER_SECONDS(6u)
+#define SKD_BUSY_TIMO           SKD_TIMER_MINUTES(20u)
+#define SKD_STARTED_BUSY_TIMO   SKD_TIMER_SECONDS(60u)
+#define SKD_START_WAIT_SECONDS  90u
+
+enum skd_req_state {
+	SKD_REQ_STATE_IDLE,
+	SKD_REQ_STATE_SETUP,
+	SKD_REQ_STATE_BUSY,
+	SKD_REQ_STATE_COMPLETED,
+	SKD_REQ_STATE_TIMEOUT,
+	SKD_REQ_STATE_ABORTED,
+};
+
+enum skd_fit_msg_state {
+	SKD_MSG_STATE_IDLE,
+	SKD_MSG_STATE_BUSY,
+};
+
+enum skd_check_status_action {
+	SKD_CHECK_STATUS_REPORT_GOOD,
+	SKD_CHECK_STATUS_REPORT_SMART_ALERT,
+	SKD_CHECK_STATUS_REQUEUE_REQUEST,
+	SKD_CHECK_STATUS_REPORT_ERROR,
+	SKD_CHECK_STATUS_BUSY_IMMINENT,
+};
+
+struct skd_fitmsg_context {
+	enum skd_fit_msg_state state;
+
+	struct skd_fitmsg_context *next;
+
+	u32 id;
+	u16 outstanding;
+
+	u32 length;
+	u32 offset;
+
+	u8 *msg_buf;
+	dma_addr_t mb_dma_address;
+};
+
+struct skd_request_context {
+	enum skd_req_state state;
+
+	struct skd_request_context *next;
+
+	u16 id;
+	u32 fitmsg_id;
+
+	struct request *req;
+	u8 flush_cmd;
+	u8 discard_page;
+
+	u32 timeout_stamp;
+	u8 sg_data_dir;
+	struct scatterlist *sg;
+	u32 n_sg;
+	u32 sg_byte_count;
+
+	struct fit_sg_descriptor *sksg_list;
+	dma_addr_t sksg_dma_address;
+
+	struct fit_completion_entry_v1 completion;
+
+	struct fit_comp_error_info err_info;
+
+};
+#define SKD_DATA_DIR_HOST_TO_CARD       1
+#define SKD_DATA_DIR_CARD_TO_HOST       2
+#define SKD_DATA_DIR_NONE		3	/* especially for DISCARD requests. */
+
+struct skd_special_context {
+	struct skd_request_context req;
+
+	u8 orphaned;
+
+	void *data_buf;
+	dma_addr_t db_dma_address;
+
+	u8 *msg_buf;
+	dma_addr_t mb_dma_address;
+};
+
+struct skd_sg_io {
+	fmode_t mode;
+	void __user *argp;
+
+	struct sg_io_hdr sg;
+
+	u8 cdb[16];
+
+	u32 dxfer_len;
+	u32 iovcnt;
+	struct sg_iovec *iov;
+	struct sg_iovec no_iov_iov;
+
+	struct skd_special_context *skspcl;
+};
+
+typedef enum skd_irq_type {
+	SKD_IRQ_LEGACY,
+	SKD_IRQ_MSI,
+	SKD_IRQ_MSIX
+} skd_irq_type_t;
+
+#define SKD_MAX_BARS                    2
+
+struct skd_device {
+	volatile void __iomem *mem_map[SKD_MAX_BARS];
+	resource_size_t mem_phys[SKD_MAX_BARS];
+	u32 mem_size[SKD_MAX_BARS];
+
+	skd_irq_type_t irq_type;
+	u32 msix_count;
+	struct skd_msix_entry *msix_entries;
+
+	struct pci_dev *pdev;
+	int pcie_error_reporting_is_enabled;
+
+	spinlock_t lock;
+	struct gendisk *disk;
+	struct request_queue *queue;
+	struct device *class_dev;
+	int gendisk_on;
+	int sync_done;
+
+	atomic_t device_count;
+	u32 devno;
+	u32 major;
+	char name[32];
+	char isr_name[30];
+
+	enum skd_drvr_state state;
+	u32 drive_state;
+
+	u32 in_flight;
+	u32 cur_max_queue_depth;
+	u32 queue_low_water_mark;
+	u32 dev_max_queue_depth;
+
+	u32 num_fitmsg_context;
+	u32 num_req_context;
+
+	u32 timeout_slot[SKD_N_TIMEOUT_SLOT];
+	u32 timeout_stamp;
+	struct skd_fitmsg_context *skmsg_free_list;
+	struct skd_fitmsg_context *skmsg_table;
+
+	struct skd_request_context *skreq_free_list;
+	struct skd_request_context *skreq_table;
+
+	struct skd_special_context *skspcl_free_list;
+	struct skd_special_context *skspcl_table;
+
+	struct skd_special_context internal_skspcl;
+	u32 read_cap_blocksize;
+	u32 read_cap_last_lba;
+	int read_cap_is_valid;
+	int inquiry_is_valid;
+	u8 inq_serial_num[13];  /*12 chars plus null term */
+	u8 id_str[80];          /* holds a composite name (pci + sernum) */
+
+	u8 skcomp_cycle;
+	u32 skcomp_ix;
+	struct fit_completion_entry_v1 *skcomp_table;
+	struct fit_comp_error_info *skerr_table;
+	dma_addr_t cq_dma_address;
+
+	wait_queue_head_t waitq;
+
+	struct timer_list timer;
+	u32 timer_countdown;
+	u32 timer_substate;
+
+	int n_special;
+	int sgs_per_request;
+	u32 last_mtd;
+
+	u32 proto_ver;
+
+	int dbg_level;
+	u32 connect_time_stamp;
+	int connect_retries;
+#define SKD_MAX_CONNECT_RETRIES 16
+	u32 drive_jiffies;
+
+	u32 timo_slot;
+
+
+	struct work_struct completion_worker;
+};
+
+#define SKD_WRITEL(DEV, VAL, OFF) skd_reg_write32(DEV, VAL, OFF)
+#define SKD_READL(DEV, OFF)      skd_reg_read32(DEV, OFF)
+#define SKD_WRITEQ(DEV, VAL, OFF) skd_reg_write64(DEV, VAL, OFF)
+
+/* 32-bit register read from mem_map[1]; logged when dbg_level >= 2. */
+static inline u32 skd_reg_read32(struct skd_device *skdev, u32 offset)
+{
+	volatile void __iomem *reg = skdev->mem_map[1] + offset;
+	u32 regval;
+
+	if (likely(skdev->dbg_level < 2))
+		return readl(reg);
+
+	barrier();
+	regval = readl(reg);
+	barrier();
+	pr_debug("%s:%s:%d offset %x = %x\n",
+		 skdev->name, __func__, __LINE__, offset, regval);
+	return regval;
+}
+
+static inline void skd_reg_write32(struct skd_device *skdev, u32 val,
+				   u32 offset)
+{
+	/* Write a 32-bit register in mem_map[1]; traced at dbg_level >= 2. */
+	const int traced = unlikely(skdev->dbg_level >= 2);
+
+	if (traced)
+		barrier();
+	writel(val, skdev->mem_map[1] + offset);
+	barrier();
+	if (traced)
+		pr_debug("%s:%s:%d offset %x = %x\n",
+			 skdev->name, __func__, __LINE__, offset, val);
+}
+
+static inline void skd_reg_write64(struct skd_device *skdev, u64 val,
+				   u32 offset)
+{
+	/* Write a 64-bit register in mem_map[1]; traced at dbg_level >= 2. */
+	const int traced = unlikely(skdev->dbg_level >= 2);
+
+	if (traced)
+		barrier();
+	writeq(val, skdev->mem_map[1] + offset);
+	barrier();
+	if (traced)
+		pr_debug("%s:%s:%d offset %x = %016llx\n",
+			 skdev->name, __func__, __LINE__, offset, val);
+}
+
+
+#define SKD_IRQ_DEFAULT SKD_IRQ_MSI
+static int skd_isr_type = SKD_IRQ_DEFAULT;
+
+module_param(skd_isr_type, int, 0444);
+MODULE_PARM_DESC(skd_isr_type, "Interrupt type capability."
+		 " (0==legacy, 1==MSI, 2==MSI-X, default==1)");
+
+#define SKD_MAX_REQ_PER_MSG_DEFAULT 1
+static int skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT;
+
+module_param(skd_max_req_per_msg, int, 0444);
+MODULE_PARM_DESC(skd_max_req_per_msg,
+		 "Maximum SCSI requests packed in a single message."
+		 " (1-14, default==1)");
+
+#define SKD_MAX_QUEUE_DEPTH_DEFAULT 64
+#define SKD_MAX_QUEUE_DEPTH_DEFAULT_STR "64"
+static int skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT;
+
+module_param(skd_max_queue_depth, int, 0444);
+MODULE_PARM_DESC(skd_max_queue_depth,
+		 "Maximum SCSI requests issued to s1120."
+		 " (1-200, default==" SKD_MAX_QUEUE_DEPTH_DEFAULT_STR ")");
+
+static int skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT;
+module_param(skd_sgs_per_request, int, 0444);
+MODULE_PARM_DESC(skd_sgs_per_request,
+		 "Maximum SG elements per block request."
+		 " (1-4096, default==256)");
+
+static int skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT;
+module_param(skd_max_pass_thru, int, 0444);
+MODULE_PARM_DESC(skd_max_pass_thru,
+		 "Maximum SCSI pass-thru at a time." " (1-50, default==32)");
+
+module_param(skd_dbg_level, int, 0444);
+MODULE_PARM_DESC(skd_dbg_level, "s1120 debug level (0,1,2)");
+
+module_param(skd_isr_comp_limit, int, 0444);
+MODULE_PARM_DESC(skd_isr_comp_limit, "s1120 isr comp limit (0=none) default=4");
+
+/* Major device number dynamically assigned. */
+static u32 skd_major;
+
+static void skd_destruct(struct skd_device *skdev);
+static const struct block_device_operations skd_blockdev_ops;
+static void skd_send_fitmsg(struct skd_device *skdev,
+			    struct skd_fitmsg_context *skmsg);
+static void skd_send_special_fitmsg(struct skd_device *skdev,
+				    struct skd_special_context *skspcl);
+static void skd_request_fn(struct request_queue *rq);
+static void skd_end_request(struct skd_device *skdev,
+			    struct skd_request_context *skreq, int error);
+static int skd_preop_sg_list(struct skd_device *skdev,
+			     struct skd_request_context *skreq);
+static void skd_postop_sg_list(struct skd_device *skdev,
+			       struct skd_request_context *skreq);
+
+static void skd_restart_device(struct skd_device *skdev);
+static int skd_quiesce_dev(struct skd_device *skdev);
+static int skd_unquiesce_dev(struct skd_device *skdev);
+static void skd_release_special(struct skd_device *skdev,
+				struct skd_special_context *skspcl);
+static void skd_disable_interrupts(struct skd_device *skdev);
+static void skd_isr_fwstate(struct skd_device *skdev);
+static void skd_recover_requests(struct skd_device *skdev, int requeue);
+static void skd_soft_reset(struct skd_device *skdev);
+
+static const char *skd_name(struct skd_device *skdev);
+const char *skd_drive_state_to_str(int state);
+const char *skd_skdev_state_to_str(enum skd_drvr_state state);
+static void skd_log_skdev(struct skd_device *skdev, const char *event);
+static void skd_log_skmsg(struct skd_device *skdev,
+			  struct skd_fitmsg_context *skmsg, const char *event);
+static void skd_log_skreq(struct skd_device *skdev,
+			  struct skd_request_context *skreq, const char *event);
+
+/*
+ *****************************************************************************
+ * READ/WRITE REQUESTS
+ *****************************************************************************
+ */
+/*
+ * Drain the block queue: dequeue each pending request and end it with -EIO.
+ */
+static void skd_fail_all_pending(struct skd_device *skdev)
+{
+	struct request_queue *q = skdev->queue;
+	struct request *rq;
+
+	while ((rq = blk_peek_request(q)) != NULL) {
+		blk_start_request(rq);
+		__blk_end_request_all(rq, -EIO);
+	}
+}
+
+/* Fill in a 10-byte read/write CDB: 32-bit LBA and 16-bit block count. */
+static void
+skd_prep_rw_cdb(struct skd_scsi_request *scsi_req,
+		int data_dir, unsigned lba,
+		unsigned count)
+{
+	/* Opcode 0x28 for READ, 0x2a for everything else. */
+	scsi_req->cdb[0] = (data_dir == READ) ? 0x28 : 0x2a;
+	scsi_req->cdb[1] = 0;
+	/* Logical block address, most significant byte first. */
+	scsi_req->cdb[2] = (lba >> 24) & 0xff;
+	scsi_req->cdb[3] = (lba >> 16) & 0xff;
+	scsi_req->cdb[4] = (lba >> 8) & 0xff;
+	scsi_req->cdb[5] = lba & 0xff;
+	scsi_req->cdb[6] = 0;
+	/* Transfer length; only the low 16 bits of count are encoded. */
+	scsi_req->cdb[7] = (count >> 8) & 0xff;
+	scsi_req->cdb[8] = count & 0xff;
+	scsi_req->cdb[9] = 0;
+}
+
+/*
+ * Encode a flush that carries no data: opcode 0x35 with every other CDB
+ * byte (1..9) zeroed, and mark the request context as a flush command.
+ */
+static void
+skd_prep_zerosize_flush_cdb(struct skd_scsi_request *scsi_req,
+			    struct skd_request_context *skreq)
+{
+	int i;
+
+	/* Opcode in byte 0; bytes 1..9 of the CDB stay zero. */
+	scsi_req->cdb[0] = 0x35;
+	for (i = 1; i <= 9; i++)
+		scsi_req->cdb[i] = 0;
+
+	skreq->flush_cmd = 1;
+}
+
+/*
+ * Build an UNMAP command for @count blocks at @lba. The parameter list is
+ * written into @page, which is then attached to the request as payload.
+ */
+static void
+skd_prep_discard_cdb(struct skd_scsi_request *scsi_req,
+		     struct skd_request_context *skreq,
+		     struct page *page,
+		     u32 lba, u32 count)
+{
+	const unsigned long param_len = SKD_DISCARD_CDB_LENGTH;
+	char *param = page_address(page);
+
+	scsi_req->cdb[0] = UNMAP;
+	scsi_req->cdb[8] = param_len;
+	/* Header: two big-endian length fields (22, then 16). */
+	put_unaligned_be16(6 + 16, &param[0]);
+	put_unaligned_be16(16, &param[2]);
+	/* One descriptor: 64-bit LBA at byte 8, 32-bit count at byte 16. */
+	put_unaligned_be64(lba, &param[8]);
+	put_unaligned_be32(count, &param[16]);
+
+	blk_add_request_payload(skreq->req, page, param_len);
+}
+
+static void skd_request_fn_not_online(struct request_queue *q);
+
+/*
+ * Block-layer request handler: pulls requests off the queue, transcodes each
+ * into a SoFIT command inside a FIT message, and sends the messages out.
+ */
+static void skd_request_fn(struct request_queue *q)
+{
+	struct skd_device *skdev = q->queuedata;
+	struct skd_fitmsg_context *skmsg = NULL;
+	struct fit_msg_hdr *fmh = NULL;
+	struct skd_request_context *skreq;
+	struct request *req = NULL;
+	struct skd_scsi_request *scsi_req;
+	struct page *page;
+	unsigned long io_flags;
+	int error;
+	u32 lba;
+	u32 count;
+	int data_dir;
+	u64 be_dmaa;
+	u64 cmdctxt;
+	u32 timo_slot;
+	void *cmd_ptr;
+	int flush, fua;
+
+	if (skdev->state != SKD_DRVR_STATE_ONLINE) {
+		skd_request_fn_not_online(q);
+		return;
+	}
+
+	if (blk_queue_stopped(skdev->queue)) {
+		if (skdev->skmsg_free_list == NULL ||
+		    skdev->skreq_free_list == NULL ||
+		    skdev->in_flight >= skdev->queue_low_water_mark)
+			/* There is still some kind of shortage */
+			return;
+
+		queue_flag_clear(QUEUE_FLAG_STOPPED, skdev->queue);
+	}
+
+	/*
+	 * Stop conditions:
+	 *  - There are no more native requests
+	 *  - There are already the maximum number of requests in progress
+	 *  - There are no more skd_request_context entries
+	 *  - There are no more FIT msg buffers
+	 */
+	for (;; ) {
+
+		flush = fua = 0;
+
+		req = blk_peek_request(q);
+
+		/* Are there any native requests to start? */
+		if (req == NULL)
+			break;
+
+		lba = (u32)blk_rq_pos(req);
+		count = blk_rq_sectors(req);
+		data_dir = rq_data_dir(req);
+		io_flags = req->cmd_flags;
+
+		if (io_flags & REQ_FLUSH)
+			flush++;
+
+		if (io_flags & REQ_FUA)
+			fua++;
+
+		pr_debug("%s:%s:%d new req=%p lba=%u(0x%x) "
+			 "count=%u(0x%x) dir=%d\n",
+			 skdev->name, __func__, __LINE__,
+			 req, lba, lba, count, count, data_dir);
+
+		/* At this point we know there is a request */
+
+		/* Are too many requests already in progress? */
+		if (skdev->in_flight >= skdev->cur_max_queue_depth) {
+			pr_debug("%s:%s:%d qdepth %d, limit %d\n",
+				 skdev->name, __func__, __LINE__,
+				 skdev->in_flight, skdev->cur_max_queue_depth);
+			break;
+		}
+
+		/* Is a skd_request_context available? */
+		skreq = skdev->skreq_free_list;
+		if (skreq == NULL) {
+			pr_debug("%s:%s:%d Out of req=%p\n",
+				 skdev->name, __func__, __LINE__, q);
+			break;
+		}
+		SKD_ASSERT(skreq->state == SKD_REQ_STATE_IDLE);
+		SKD_ASSERT((skreq->id & SKD_ID_INCR) == 0);
+
+		/* Now we check to see if we can get a fit msg */
+		if (skmsg == NULL) {
+			if (skdev->skmsg_free_list == NULL) {
+				pr_debug("%s:%s:%d Out of msg\n",
+					 skdev->name, __func__, __LINE__);
+				break;
+			}
+		}
+
+		skreq->flush_cmd = 0;
+		skreq->n_sg = 0;
+		skreq->sg_byte_count = 0;
+		skreq->discard_page = 0;
+
+		/*
+		 * OK to now dequeue request from q.
+		 *
+		 * At this point we are committed to either start or reject
+		 * the native request. Note that skd_request_context is
+		 * available but is still at the head of the free list.
+		 */
+		blk_start_request(req);
+		skreq->req = req;
+		skreq->fitmsg_id = 0;
+
+		/* Either a FIT msg is in progress or we have to start one. */
+		if (skmsg == NULL) {
+			/* Are there any FIT msg buffers available? */
+			skmsg = skdev->skmsg_free_list;
+			if (skmsg == NULL) {
+				pr_debug("%s:%s:%d Out of msg skdev=%p\n",
+					 skdev->name, __func__, __LINE__,
+					 skdev);
+				break;
+			}
+			SKD_ASSERT(skmsg->state == SKD_MSG_STATE_IDLE);
+			SKD_ASSERT((skmsg->id & SKD_ID_INCR) == 0);
+
+			skdev->skmsg_free_list = skmsg->next;
+
+			skmsg->state = SKD_MSG_STATE_BUSY;
+			skmsg->id += SKD_ID_INCR;
+
+			/* Initialize the FIT msg header */
+			fmh = (struct fit_msg_hdr *)skmsg->msg_buf;
+			memset(fmh, 0, sizeof(*fmh));
+			fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT;
+			skmsg->length = sizeof(*fmh);
+		}
+
+		skreq->fitmsg_id = skmsg->id;
+
+		/*
+		 * Note that a FIT msg may have just been started
+		 * but contains no SoFIT requests yet.
+		 */
+
+		/*
+		 * Transcode the request, checking as we go. The outcome of
+		 * the transcoding is represented by the error variable.
+		 */
+		cmd_ptr = &skmsg->msg_buf[skmsg->length];
+		memset(cmd_ptr, 0, 32);
+
+		be_dmaa = cpu_to_be64((u64)skreq->sksg_dma_address);
+		cmdctxt = skreq->id + SKD_ID_INCR;
+
+		scsi_req = cmd_ptr;
+		scsi_req->hdr.tag = cmdctxt;
+		scsi_req->hdr.sg_list_dma_address = be_dmaa;
+
+		if (data_dir == READ)
+			skreq->sg_data_dir = SKD_DATA_DIR_CARD_TO_HOST;
+		else
+			skreq->sg_data_dir = SKD_DATA_DIR_HOST_TO_CARD;
+
+		if (io_flags & REQ_DISCARD) {
+			page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+			if (!page) {
+				pr_err("request_fn:Page allocation failed.\n");
+				skd_end_request(skdev, skreq, -ENOMEM);
+				break;
+			}
+			skreq->discard_page = 1;
+			req->completion_data = page;
+			skd_prep_discard_cdb(scsi_req, skreq, page, lba, count);
+
+		} else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) {
+			skd_prep_zerosize_flush_cdb(scsi_req, skreq);
+			SKD_ASSERT(skreq->flush_cmd == 1);
+
+		} else {
+			skd_prep_rw_cdb(scsi_req, data_dir, lba, count);
+		}
+
+		if (fua)
+			scsi_req->cdb[1] |= SKD_FUA_NV;
+
+		if (!req->bio)
+			goto skip_sg;
+
+		error = skd_preop_sg_list(skdev, skreq);
+
+		if (error != 0) {
+			/*
+			 * Complete the native request with error.
+			 * Note that the request context is still at the
+			 * head of the free list, and that the SoFIT request
+			 * was encoded into the FIT msg buffer but the FIT
+			 * msg length has not been updated. In short, the
+			 * only resource that has been allocated but might
+			 * not be used is that the FIT msg could be empty.
+			 */
+			pr_debug("%s:%s:%d error Out\n",
+				 skdev->name, __func__, __LINE__);
+			skd_end_request(skdev, skreq, error);
+			continue;
+		}
+
+skip_sg:
+		scsi_req->hdr.sg_list_len_bytes =
+			cpu_to_be32(skreq->sg_byte_count);
+
+		/* Complete resource allocations. */
+		skdev->skreq_free_list = skreq->next;
+		skreq->state = SKD_REQ_STATE_BUSY;
+		skreq->id += SKD_ID_INCR;
+
+		skmsg->length += sizeof(struct skd_scsi_request);
+		fmh->num_protocol_cmds_coalesced++;
+
+		/*
+		 * Update the active request counts.
+		 * Capture the timeout timestamp.
+		 */
+		skreq->timeout_stamp = skdev->timeout_stamp;
+		timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK;
+		skdev->timeout_slot[timo_slot]++;
+		skdev->in_flight++;
+		pr_debug("%s:%s:%d req=0x%x busy=%d\n",
+			 skdev->name, __func__, __LINE__,
+			 skreq->id, skdev->in_flight);
+
+		/*
+		 * If the FIT msg buffer is full send it.
+		 */
+		if (skmsg->length >= SKD_N_FITMSG_BYTES ||
+		    fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) {
+			skd_send_fitmsg(skdev, skmsg);
+			skmsg = NULL;
+			fmh = NULL;
+		}
+	}
+
+	/*
+	 * Is a FIT msg in progress? If it is empty put the buffer back
+	 * on the free list. If it is non-empty send what we got.
+	 * This minimizes latency when there are fewer requests than
+	 * what fits in a FIT msg.
+	 */
+	if (skmsg != NULL) {
+		/* Bigger than just a FIT msg header? */
+		if (skmsg->length > sizeof(struct fit_msg_hdr)) {
+			pr_debug("%s:%s:%d sending msg=%p, len %d\n",
+				 skdev->name, __func__, __LINE__,
+				 skmsg, skmsg->length);
+			skd_send_fitmsg(skdev, skmsg);
+		} else {
+			/*
+			 * The FIT msg is empty. It means we got started
+			 * on the msg, but the requests were rejected.
+			 */
+			skmsg->state = SKD_MSG_STATE_IDLE;
+			skmsg->id += SKD_ID_INCR;
+			skmsg->next = skdev->skmsg_free_list;
+			skdev->skmsg_free_list = skmsg;
+		}
+		skmsg = NULL;
+		fmh = NULL;
+	}
+
+	/*
+	 * If req is non-NULL it means there is something to do but
+	 * we are out of a resource.
+	 */
+	if (req)
+		blk_stop_queue(skdev->queue);
+}
+
+static void skd_end_request(struct skd_device *skdev,
+			    struct skd_request_context *skreq, int error)
+{
+	struct request *req = skreq->req;
+	unsigned int io_flags = req->cmd_flags;
+
+	if ((io_flags & REQ_DISCARD) &&
+		(skreq->discard_page == 1)) {
+		pr_debug("%s:%s:%d, free the page!",
+			 skdev->name, __func__, __LINE__);
+		__free_page(req->completion_data);
+	}
+
+	if (unlikely(error)) {
+		struct request *req = skreq->req;
+		char *cmd = (rq_data_dir(req) == READ) ? "read" : "write";
+		u32 lba = (u32)blk_rq_pos(req);
+		u32 count = blk_rq_sectors(req);
+
+		pr_err("(%s): Error cmd=%s sect=%u count=%u id=0x%x\n",
+		       skd_name(skdev), cmd, lba, count, skreq->id);
+	} else
+		pr_debug("%s:%s:%d id=0x%x error=%d\n",
+			 skdev->name, __func__, __LINE__, skreq->id, error);
+
+	__blk_end_request_all(skreq->req, error);
+}
+
+/*
+ * Map skreq->req into skreq->sg, DMA-map it, and mirror every mapped segment
+ * into the FIT descriptor list at skreq->sksg_list. Sets skreq->n_sg and
+ * skreq->sg_byte_count. Returns 0, or -EINVAL if either mapping step fails.
+ */
+static int skd_preop_sg_list(struct skd_device *skdev,
+			     struct skd_request_context *skreq)
+{
+	struct request *req = skreq->req;
+	int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD;
+	int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE;
+	struct scatterlist *sg = &skreq->sg[0];
+	int n_sg;
+	int i;
+
+	skreq->sg_byte_count = 0;
+
+	n_sg = blk_rq_map_sg(skdev->queue, req, sg);
+	if (n_sg <= 0)
+		return -EINVAL;
+
+	/* Map to PCI bus addresses; this may change the number of entries. */
+	n_sg = pci_map_sg(skdev->pdev, sg, n_sg, pci_dir);
+	if (n_sg <= 0)
+		return -EINVAL;
+
+	SKD_ASSERT(n_sg <= skdev->sgs_per_request);
+
+	skreq->n_sg = n_sg;
+
+	for (i = 0; i < n_sg; i++) {
+		struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
+		u32 cnt = sg_dma_len(&sg[i]);
+		uint64_t dma_addr = sg_dma_address(&sg[i]);
+
+		sgd->control = FIT_SGD_CONTROL_NOT_LAST;
+		sgd->byte_count = cnt;
+		skreq->sg_byte_count += cnt;
+		sgd->host_side_addr = dma_addr;
+		sgd->dev_side_addr = 0;
+	}
+
+	skreq->sksg_list[n_sg - 1].next_desc_ptr = 0LL;
+	skreq->sksg_list[n_sg - 1].control = FIT_SGD_CONTROL_LAST;
+
+	if (unlikely(skdev->dbg_level > 1)) {
+		pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n",
+			 skdev->name, __func__, __LINE__,
+			 skreq->id, skreq->sksg_list,
+			 (unsigned long long)skreq->sksg_dma_address);
+		for (i = 0; i < n_sg; i++) {
+			struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
+			pr_debug("%s:%s:%d   sg[%d] count=%u ctrl=0x%x "
+				 "addr=0x%llx next=0x%llx\n",
+				 skdev->name, __func__, __LINE__,
+				 i, sgd->byte_count, sgd->control,
+				 sgd->host_side_addr, sgd->next_desc_ptr);
+		}
+	}
+
+	return 0;
+}
+
+/* Undo skd_preop_sg_list(): relink the last descriptor, unmap the DMA. */
+static void skd_postop_sg_list(struct skd_device *skdev,
+			       struct skd_request_context *skreq)
+{
+	int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD;
+	int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE;
+
+	if (skreq->n_sg == 0)	/* nothing mapped; n_sg - 1 would wrap */
+		return;
+	/* Restore the chain pointer that preop cleared on the last entry. */
+	skreq->sksg_list[skreq->n_sg - 1].next_desc_ptr =
+		skreq->sksg_dma_address +
+		((skreq->n_sg) * sizeof(struct fit_sg_descriptor));
+	pci_unmap_sg(skdev->pdev, &skreq->sg[0], skreq->n_sg, pci_dir);
+}
+
+/* Queue handler for a device that is not ONLINE: hold or fail requests. */
+static void skd_request_fn_not_online(struct request_queue *q)
+{
+	struct skd_device *skdev = q->queuedata;
+
+	SKD_ASSERT(skdev->state != SKD_DRVR_STATE_ONLINE);
+
+	skd_log_skdev(skdev, "req_not_online");
+	switch (skdev->state) {
+	case SKD_DRVR_STATE_PAUSING:
+	case SKD_DRVR_STATE_PAUSED:
+	case SKD_DRVR_STATE_STARTING:
+	case SKD_DRVR_STATE_RESTARTING:
+	case SKD_DRVR_STATE_WAIT_BOOT:
+	/* In case of starting, we haven't started the queue,
+	 * so we can't get here... but requests are
+	 * possibly hanging out waiting for us because we
+	 * reported the dev/skd0 already.  They'll wait
+	 * forever if connect doesn't complete.
+	 * What to do??? delay dev/skd0 ??
+	 */
+	case SKD_DRVR_STATE_BUSY:
+	case SKD_DRVR_STATE_BUSY_IMMINENT:
+	case SKD_DRVR_STATE_BUSY_ERASE:
+	case SKD_DRVR_STATE_DRAINING_TIMEOUT:
+		return;
+
+	case SKD_DRVR_STATE_BUSY_SANITIZE:
+	case SKD_DRVR_STATE_STOPPING:
+	case SKD_DRVR_STATE_SYNCING:
+	case SKD_DRVR_STATE_FAULT:
+	case SKD_DRVR_STATE_DISAPPEARED:
+	default:
+		/* Every remaining state falls through to the failure path. */
+		break;
+	}
+
+	/* If we get here, terminate all pending block requests
+	 * with EIO and any scsi pass thru with appropriate sense
+	 */
+
+	skd_fail_all_pending(skdev);
+}
+
+/*
+ *****************************************************************************
+ * TIMER
+ *****************************************************************************
+ */
+
+static void skd_timer_tick_not_online(struct skd_device *skdev);
+
+/*
+ * Once-a-second timer: track firmware state and detect overdue requests.
+ */
+static void skd_timer_tick(ulong arg)
+{
+	struct skd_device *skdev = (struct skd_device *)arg;
+
+	u32 timo_slot;
+	unsigned long reqflags;
+	u32 state;
+
+	if (skdev->state == SKD_DRVR_STATE_FAULT)
+		/* The driver has declared fault, and we want it to
+		 * stay that way until driver is reloaded.
+		 */
+		return;
+
+	spin_lock_irqsave(&skdev->lock, reqflags);
+
+	state = SKD_READL(skdev, FIT_STATUS);
+	state &= FIT_SR_DRIVE_STATE_MASK;
+	if (state != skdev->drive_state)
+		skd_isr_fwstate(skdev);
+
+	if (skdev->state != SKD_DRVR_STATE_ONLINE) {
+		skd_timer_tick_not_online(skdev);
+		goto timer_func_out;
+	}
+	skdev->timeout_stamp++;
+	timo_slot = skdev->timeout_stamp & SKD_TIMEOUT_SLOT_MASK;
+
+	/*
+	 * All requests that happened during the previous use of
+	 * this slot should be done by now. The previous use
+	 * started SKD_N_TIMEOUT_SLOT ticks ago.
+	 */
+	if (skdev->timeout_slot[timo_slot] == 0)
+		goto timer_func_out;
+
+	/* Something is overdue */
+	pr_debug("%s:%s:%d found %d timeouts, draining busy=%d\n",
+		 skdev->name, __func__, __LINE__,
+		 skdev->timeout_slot[timo_slot], skdev->in_flight);
+	pr_err("(%s): Overdue IOs (%d), busy %d\n",
+	       skd_name(skdev), skdev->timeout_slot[timo_slot],
+	       skdev->in_flight);
+
+	skdev->timer_countdown = SKD_DRAINING_TIMO;
+	skdev->state = SKD_DRVR_STATE_DRAINING_TIMEOUT;
+	skdev->timo_slot = timo_slot;
+	blk_stop_queue(skdev->queue);
+
+timer_func_out:
+	mod_timer(&skdev->timer, (jiffies + HZ));
+
+	spin_unlock_irqrestore(&skdev->lock, reqflags);
+}
+
+static void skd_timer_tick_not_online(struct skd_device *skdev)
+{
+	switch (skdev->state) {
+	case SKD_DRVR_STATE_IDLE:
+	case SKD_DRVR_STATE_LOAD:
+		break;
+	case SKD_DRVR_STATE_BUSY_SANITIZE:
+		pr_debug("%s:%s:%d drive busy sanitize[%x], driver[%x]\n",
+			 skdev->name, __func__, __LINE__,
+			 skdev->drive_state, skdev->state);
+		/* If we've been in sanitize for 3 seconds, we figure we're not
+		 * going to get anymore completions, so recover requests now
+		 */
+		if (skdev->timer_countdown > 0) {
+			skdev->timer_countdown--;
+			return;
+		}
+		skd_recover_requests(skdev, 0);
+		break;
+
+	case SKD_DRVR_STATE_BUSY:
+	case SKD_DRVR_STATE_BUSY_IMMINENT:
+	case SKD_DRVR_STATE_BUSY_ERASE:
+		pr_debug("%s:%s:%d busy[%x], countdown=%d\n",
+			 skdev->name, __func__, __LINE__,
+			 skdev->state, skdev->timer_countdown);
+		if (skdev->timer_countdown > 0) {
+			skdev->timer_countdown--;
+			return;
+		}
+		pr_debug("%s:%s:%d busy[%x], timedout=%d, restarting device.",
+			 skdev->name, __func__, __LINE__,
+			 skdev->state, skdev->timer_countdown);
+		skd_restart_device(skdev);
+		break;
+
+	case SKD_DRVR_STATE_WAIT_BOOT:
+	case SKD_DRVR_STATE_STARTING:
+		if (skdev->timer_countdown > 0) {
+			skdev->timer_countdown--;
+			return;
+		}
+		/* For now, we fault the drive.  Could attempt resets to
+		 * revcover at some point. */
+		skdev->state = SKD_DRVR_STATE_FAULT;
+
+		pr_err("(%s): DriveFault Connect Timeout (%x)\n",
+		       skd_name(skdev), skdev->drive_state);
+
+		/*start the queue so we can respond with error to requests */
+		/* wakeup anyone waiting for startup complete */
+		blk_start_queue(skdev->queue);
+		skdev->gendisk_on = -1;
+		wake_up_interruptible(&skdev->waitq);
+		break;
+
+	case SKD_DRVR_STATE_ONLINE:
+		/* shouldn't get here. */
+		break;
+
+	case SKD_DRVR_STATE_PAUSING:
+	case SKD_DRVR_STATE_PAUSED:
+		break;
+
+	case SKD_DRVR_STATE_DRAINING_TIMEOUT:
+		pr_debug("%s:%s:%d "
+			 "draining busy [%d] tick[%d] qdb[%d] tmls[%d]\n",
+			 skdev->name, __func__, __LINE__,
+			 skdev->timo_slot,
+			 skdev->timer_countdown,
+			 skdev->in_flight,
+			 skdev->timeout_slot[skdev->timo_slot]);
+		/* if the slot has cleared we can let the I/O continue */
+		if (skdev->timeout_slot[skdev->timo_slot] == 0) {
+			pr_debug("%s:%s:%d Slot drained, starting queue.\n",
+				 skdev->name, __func__, __LINE__);
+			skdev->state = SKD_DRVR_STATE_ONLINE;
+			blk_start_queue(skdev->queue);
+			return;
+		}
+		if (skdev->timer_countdown > 0) {
+			skdev->timer_countdown--;
+			return;
+		}
+		skd_restart_device(skdev);
+		break;
+
+	case SKD_DRVR_STATE_RESTARTING:
+		if (skdev->timer_countdown > 0) {
+			skdev->timer_countdown--;
+			return;
+		}
+		/* For now, we fault the drive. Could attempt resets to
+		 * revcover at some point. */
+		skdev->state = SKD_DRVR_STATE_FAULT;
+		pr_err("(%s): DriveFault Reconnect Timeout (%x)\n",
+		       skd_name(skdev), skdev->drive_state);
+
+		/*
+		 * Recovering does two things:
+		 * 1. completes IO with error
+		 * 2. reclaims dma resources
+		 * When is it safe to recover requests?
+		 * - if the drive state is faulted
+		 * - if the state is still soft reset after our timeout
+		 * - if the drive registers are dead (state = FF)
+		 * If it is "unsafe", we still need to recover, so we will
+		 * disable pci bus mastering and disable our interrupts.
+		 */
+
+		if ((skdev->drive_state == FIT_SR_DRIVE_SOFT_RESET) ||
+		    (skdev->drive_state == FIT_SR_DRIVE_FAULT) ||
+		    (skdev->drive_state == FIT_SR_DRIVE_STATE_MASK))
+			/* It never came out of soft reset. Try to
+			 * recover the requests and then let them
+			 * fail. This is to mitigate hung processes. */
+			skd_recover_requests(skdev, 0);
+		else {
+			pr_err("(%s): Disable BusMaster (%x)\n",
+			       skd_name(skdev), skdev->drive_state);
+			pci_disable_device(skdev->pdev);
+			skd_disable_interrupts(skdev);
+			skd_recover_requests(skdev, 0);
+		}
+
+		/*start the queue so we can respond with error to requests */
+		/* wakeup anyone waiting for startup complete */
+		blk_start_queue(skdev->queue);
+		skdev->gendisk_on = -1;
+		wake_up_interruptible(&skdev->waitq);
+		break;
+
+	case SKD_DRVR_STATE_RESUMING:
+	case SKD_DRVR_STATE_STOPPING:
+	case SKD_DRVR_STATE_SYNCING:
+	case SKD_DRVR_STATE_FAULT:
+	case SKD_DRVR_STATE_DISAPPEARED:
+	default:
+		break;
+	}
+}
+
+/* Arm skdev->timer to run skd_timer_tick() one second (HZ jiffies) from now. */
+static int skd_start_timer(struct skd_device *skdev)
+{
+	int rc;
+
+	/* setup_timer() also initializes the timer; no init_timer() is needed */
+	setup_timer(&skdev->timer, skd_timer_tick, (ulong)skdev);
+	/* mod_timer() returns nonzero only when the timer was already pending */
+	rc = mod_timer(&skdev->timer, (jiffies + HZ));
+	if (rc)
+		pr_err("%s: failed to start timer %d\n", __func__, rc);
+	return rc;
+}
+
+static void skd_kill_timer(struct skd_device *skdev)
+{
+	del_timer_sync(&skdev->timer);
+}
+
+/*
+ *****************************************************************************
+ * IOCTL
+ *****************************************************************************
+ */
+static int skd_ioctl_sg_io(struct skd_device *skdev,
+			   fmode_t mode, void __user *argp);
+static int skd_sg_io_get_and_check_args(struct skd_device *skdev,
+					struct skd_sg_io *sksgio);
+static int skd_sg_io_obtain_skspcl(struct skd_device *skdev,
+				   struct skd_sg_io *sksgio);
+static int skd_sg_io_prep_buffering(struct skd_device *skdev,
+				    struct skd_sg_io *sksgio);
+static int skd_sg_io_copy_buffer(struct skd_device *skdev,
+				 struct skd_sg_io *sksgio, int dxfer_dir);
+static int skd_sg_io_send_fitmsg(struct skd_device *skdev,
+				 struct skd_sg_io *sksgio);
+static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio);
+static int skd_sg_io_release_skspcl(struct skd_device *skdev,
+				    struct skd_sg_io *sksgio);
+static int skd_sg_io_put_status(struct skd_device *skdev,
+				struct skd_sg_io *sksgio);
+
+static void skd_complete_special(struct skd_device *skdev,
+				 volatile struct fit_completion_entry_v1
+				 *skcomp,
+				 volatile struct fit_comp_error_info *skerr,
+				 struct skd_special_context *skspcl);
+
+/* ioctl handler: requires CAP_SYS_ADMIN and dispatches the SG_* commands. */
+static int skd_bdev_ioctl(struct block_device *bdev, fmode_t mode,
+			  uint cmd_in, ulong arg)
+{
+	int rc = 0;
+	struct gendisk *disk = bdev->bd_disk;
+	struct skd_device *skdev = disk->private_data;
+	void __user *p = (void __user *)arg;	/* keep __user for sparse */
+
+	pr_debug("%s:%s:%d %s: CMD[%s] ioctl  mode 0x%x, cmd 0x%x arg %0lx\n",
+		 skdev->name, __func__, __LINE__,
+		 disk->disk_name, current->comm, mode, cmd_in, arg);
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	switch (cmd_in) {
+	case SG_SET_TIMEOUT:
+	case SG_GET_TIMEOUT:
+	case SG_GET_VERSION_NUM:
+		rc = scsi_cmd_ioctl(disk->queue, disk, mode, cmd_in, p);
+		break;
+	case SG_IO:
+		rc = skd_ioctl_sg_io(skdev, mode, p);
+		break;
+	default:	/* every other command is rejected */
+		rc = -ENOTTY;
+		break;
+	}
+
+	pr_debug("%s:%s:%d %s:  completion rc %d\n",
+		 skdev->name, __func__, __LINE__, disk->disk_name, rc);
+	return rc;
+}
+
+static int skd_ioctl_sg_io(struct skd_device *skdev, fmode_t mode,
+			   void __user *argp)
+{
+	int rc;
+	struct skd_sg_io sksgio;
+
+	memset(&sksgio, 0, sizeof(sksgio));
+	sksgio.mode = mode;
+	sksgio.argp = argp;
+	sksgio.iov = &sksgio.no_iov_iov;
+
+	switch (skdev->state) {
+	case SKD_DRVR_STATE_ONLINE:
+	case SKD_DRVR_STATE_BUSY_IMMINENT:
+		break;
+
+	default:
+		pr_debug("%s:%s:%d drive not online\n",
+			 skdev->name, __func__, __LINE__);
+		rc = -ENXIO;
+		goto out;
+	}
+
+	rc = skd_sg_io_get_and_check_args(skdev, &sksgio);
+	if (rc)
+		goto out;
+
+	rc = skd_sg_io_obtain_skspcl(skdev, &sksgio);
+	if (rc)
+		goto out;
+
+	rc = skd_sg_io_prep_buffering(skdev, &sksgio);
+	if (rc)
+		goto out;
+
+	rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_TO_DEV);
+	if (rc)
+		goto out;
+
+	rc = skd_sg_io_send_fitmsg(skdev, &sksgio);
+	if (rc)
+		goto out;
+
+	rc = skd_sg_io_await(skdev, &sksgio);
+	if (rc)
+		goto out;
+
+	rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_FROM_DEV);
+	if (rc)
+		goto out;
+
+	rc = skd_sg_io_put_status(skdev, &sksgio);
+	if (rc)
+		goto out;
+
+	rc = 0;
+
+out:
+	skd_sg_io_release_skspcl(skdev, &sksgio);
+
+	if (sksgio.iov != NULL && sksgio.iov != &sksgio.no_iov_iov)
+		kfree(sksgio.iov);
+	return rc;
+}
+
+static int skd_sg_io_get_and_check_args(struct skd_device *skdev,
+					struct skd_sg_io *sksgio)
+{
+	struct sg_io_hdr *sgp = &sksgio->sg;
+	int i, acc;
+
+	if (!access_ok(VERIFY_WRITE, sksgio->argp, sizeof(sg_io_hdr_t))) {
+		pr_debug("%s:%s:%d access sg failed %p\n",
+			 skdev->name, __func__, __LINE__, sksgio->argp);
+		return -EFAULT;
+	}
+
+	if (__copy_from_user(sgp, sksgio->argp, sizeof(sg_io_hdr_t))) {
+		pr_debug("%s:%s:%d copy_from_user sg failed %p\n",
+			 skdev->name, __func__, __LINE__, sksgio->argp);
+		return -EFAULT;
+	}
+
+	if (sgp->interface_id != SG_INTERFACE_ID_ORIG) {
+		pr_debug("%s:%s:%d interface_id invalid 0x%x\n",
+			 skdev->name, __func__, __LINE__, sgp->interface_id);
+		return -EINVAL;
+	}
+
+	if (sgp->cmd_len > sizeof(sksgio->cdb)) {
+		pr_debug("%s:%s:%d cmd_len invalid %d\n",
+			 skdev->name, __func__, __LINE__, sgp->cmd_len);
+		return -EINVAL;
+	}
+
+	if (sgp->iovec_count > 256) {
+		pr_debug("%s:%s:%d iovec_count invalid %d\n",
+			 skdev->name, __func__, __LINE__, sgp->iovec_count);
+		return -EINVAL;
+	}
+
+	if (sgp->dxfer_len > (PAGE_SIZE * SKD_N_SG_PER_SPECIAL)) {
+		pr_debug("%s:%s:%d dxfer_len invalid %d\n",
+			 skdev->name, __func__, __LINE__, sgp->dxfer_len);
+		return -EINVAL;
+	}
+
+	switch (sgp->dxfer_direction) {
+	case SG_DXFER_NONE:
+		acc = -1;
+		break;
+
+	case SG_DXFER_TO_DEV:
+		acc = VERIFY_READ;
+		break;
+
+	case SG_DXFER_FROM_DEV:
+	case SG_DXFER_TO_FROM_DEV:
+		acc = VERIFY_WRITE;
+		break;
+
+	default:
+		pr_debug("%s:%s:%d dxfer_dir invalid %d\n",
+			 skdev->name, __func__, __LINE__, sgp->dxfer_direction);
+		return -EINVAL;
+	}
+
+	if (copy_from_user(sksgio->cdb, sgp->cmdp, sgp->cmd_len)) {
+		pr_debug("%s:%s:%d copy_from_user cmdp failed %p\n",
+			 skdev->name, __func__, __LINE__, sgp->cmdp);
+		return -EFAULT;
+	}
+
+	if (sgp->mx_sb_len != 0) {
+		if (!access_ok(VERIFY_WRITE, sgp->sbp, sgp->mx_sb_len)) {
+			pr_debug("%s:%s:%d access sbp failed %p\n",
+				 skdev->name, __func__, __LINE__, sgp->sbp);
+			return -EFAULT;
+		}
+	}
+
+	if (sgp->iovec_count == 0) {
+		sksgio->iov[0].iov_base = sgp->dxferp;
+		sksgio->iov[0].iov_len = sgp->dxfer_len;
+		sksgio->iovcnt = 1;
+		sksgio->dxfer_len = sgp->dxfer_len;
+	} else {
+		struct sg_iovec *iov;
+		uint nbytes = sizeof(*iov) * sgp->iovec_count;
+		size_t iov_data_len;
+
+		iov = kmalloc(nbytes, GFP_KERNEL);
+		if (iov == NULL) {
+			pr_debug("%s:%s:%d alloc iovec failed %d\n",
+				 skdev->name, __func__, __LINE__,
+				 sgp->iovec_count);
+			return -ENOMEM;
+		}
+		sksgio->iov = iov;
+		sksgio->iovcnt = sgp->iovec_count;
+
+		if (copy_from_user(iov, sgp->dxferp, nbytes)) {
+			pr_debug("%s:%s:%d copy_from_user iovec failed %p\n",
+				 skdev->name, __func__, __LINE__, sgp->dxferp);
+			return -EFAULT;
+		}
+
+		/*
+		 * Sum up the vecs, making sure they don't overflow
+		 */
+		iov_data_len = 0;
+		for (i = 0; i < sgp->iovec_count; i++) {
+			if (iov_data_len + iov[i].iov_len < iov_data_len)
+				return -EINVAL;
+			iov_data_len += iov[i].iov_len;
+		}
+
+		/* SG_IO howto says that the shorter of the two wins */
+		if (sgp->dxfer_len < iov_data_len) {
+			sksgio->iovcnt = iov_shorten((struct iovec *)iov,
+						     sgp->iovec_count,
+						     sgp->dxfer_len);
+			sksgio->dxfer_len = sgp->dxfer_len;
+		} else
+			sksgio->dxfer_len = iov_data_len;
+	}
+
+	if (sgp->dxfer_direction != SG_DXFER_NONE) {
+		struct sg_iovec *iov = sksgio->iov;
+		for (i = 0; i < sksgio->iovcnt; i++, iov++) {
+			if (!access_ok(acc, iov->iov_base, iov->iov_len)) {
+				pr_debug("%s:%s:%d access data failed %p/%d\n",
+					 skdev->name, __func__, __LINE__,
+					 iov->iov_base, (int)iov->iov_len);
+				return -EFAULT;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int skd_sg_io_obtain_skspcl(struct skd_device *skdev,
+				   struct skd_sg_io *sksgio)
+{
+	struct skd_special_context *skspcl = NULL;
+	int rc;
+
+	for (;;) {
+		ulong flags;
+
+		spin_lock_irqsave(&skdev->lock, flags);
+		skspcl = skdev->skspcl_free_list;
+		if (skspcl != NULL) {
+			skdev->skspcl_free_list =
+				(struct skd_special_context *)skspcl->req.next;
+			skspcl->req.id += SKD_ID_INCR;
+			skspcl->req.state = SKD_REQ_STATE_SETUP;
+			skspcl->orphaned = 0;
+			skspcl->req.n_sg = 0;
+		}
+		spin_unlock_irqrestore(&skdev->lock, flags);
+
+		if (skspcl != NULL) {
+			rc = 0;
+			break;
+		}
+
+		pr_debug("%s:%s:%d blocking\n",
+			 skdev->name, __func__, __LINE__);
+
+		rc = wait_event_interruptible_timeout(
+				skdev->waitq,
+				(skdev->skspcl_free_list != NULL),
+				msecs_to_jiffies(sksgio->sg.timeout));
+
+		pr_debug("%s:%s:%d unblocking, rc=%d\n",
+			 skdev->name, __func__, __LINE__, rc);
+
+		if (rc <= 0) {
+			if (rc == 0)
+				rc = -ETIMEDOUT;
+			else
+				rc = -EINTR;
+			break;
+		}
+		/*
+		 * If we get here rc > 0 meaning the timeout to
+		 * wait_event_interruptible_timeout() had time left, hence the
+		 * sought event -- non-empty free list -- happened.
+		 * Retry the allocation.
+		 */
+	}
+	sksgio->skspcl = skspcl;
+
+	return rc;
+}
+
+static int skd_skreq_prep_buffering(struct skd_device *skdev,
+				    struct skd_request_context *skreq,
+				    u32 dxfer_len)
+{
+	u32 resid = dxfer_len;
+
+	/*
+	 * The DMA engine must have aligned addresses and byte counts.
+	 */
+	resid += (-resid) & 3;
+	skreq->sg_byte_count = resid;
+
+	skreq->n_sg = 0;
+
+	while (resid > 0) {
+		u32 nbytes = PAGE_SIZE;
+		u32 ix = skreq->n_sg;
+		struct scatterlist *sg = &skreq->sg[ix];
+		struct fit_sg_descriptor *sksg = &skreq->sksg_list[ix];
+		struct page *page;
+
+		if (nbytes > resid)
+			nbytes = resid;
+
+		page = alloc_page(GFP_KERNEL);
+		if (page == NULL)
+			return -ENOMEM;
+
+		sg_set_page(sg, page, nbytes, 0);
+
+		/* TODO: This should be going through a pci_???()
+		 * routine to do proper mapping. */
+		sksg->control = FIT_SGD_CONTROL_NOT_LAST;
+		sksg->byte_count = nbytes;
+
+		sksg->host_side_addr = sg_phys(sg);
+
+		sksg->dev_side_addr = 0;
+		sksg->next_desc_ptr = skreq->sksg_dma_address +
+				      (ix + 1) * sizeof(*sksg);
+
+		skreq->n_sg++;
+		resid -= nbytes;
+	}
+
+	if (skreq->n_sg > 0) {
+		u32 ix = skreq->n_sg - 1;
+		struct fit_sg_descriptor *sksg = &skreq->sksg_list[ix];
+
+		sksg->control = FIT_SGD_CONTROL_LAST;
+		sksg->next_desc_ptr = 0;
+	}
+
+	if (unlikely(skdev->dbg_level > 1)) {
+		u32 i;
+
+		pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n",
+			 skdev->name, __func__, __LINE__,
+			 skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
+		for (i = 0; i < skreq->n_sg; i++) {
+			struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
+
+			pr_debug("%s:%s:%d   sg[%d] count=%u ctrl=0x%x "
+				 "addr=0x%llx next=0x%llx\n",
+				 skdev->name, __func__, __LINE__,
+				 i, sgd->byte_count, sgd->control,
+				 sgd->host_side_addr, sgd->next_desc_ptr);
+		}
+	}
+
+	return 0;
+}
+
+static int skd_sg_io_prep_buffering(struct skd_device *skdev,
+				    struct skd_sg_io *sksgio)
+{
+	struct skd_special_context *skspcl = sksgio->skspcl;
+	struct skd_request_context *skreq = &skspcl->req;
+	u32 dxfer_len = sksgio->dxfer_len;
+	int rc;
+
+	rc = skd_skreq_prep_buffering(skdev, skreq, dxfer_len);
+	/*
+	 * Eventually, errors or not, skd_release_special() is called
+	 * to recover allocations including partial allocations.
+	 */
+	return rc;
+}
+
+/*
+ * Copy the SG_IO payload between the user iovecs and the pages in req.sg[].
+ * dxfer_dir is the pass being run; returns 0 (done or skipped) or -EFAULT.
+ */
+static int skd_sg_io_copy_buffer(struct skd_device *skdev,
+				 struct skd_sg_io *sksgio, int dxfer_dir)
+{
+	struct skd_special_context *skspcl = sksgio->skspcl;
+	struct sg_iovec curiov = { .iov_base = NULL, .iov_len = 0 };
+	u32 iov_ix = 0;
+	u32 sksg_ix = 0;
+	u8 *bufp = NULL;
+	u32 buf_len = 0;
+	u32 resid = sksgio->dxfer_len;
+	int rc;
+
+	if (dxfer_dir != sksgio->sg.dxfer_direction) {
+		if (dxfer_dir != SG_DXFER_TO_DEV ||
+		    sksgio->sg.dxfer_direction != SG_DXFER_TO_FROM_DEV)
+			return 0;	/* this pass does not apply */
+	}
+
+	while (resid > 0) {
+		u32 nbytes = min_t(u32, PAGE_SIZE, resid);
+
+		if (curiov.iov_len == 0) {
+			curiov = sksgio->iov[iov_ix++];
+			continue;
+		}
+
+		if (buf_len == 0) {
+			struct page *page;
+			page = sg_page(&skspcl->req.sg[sksg_ix++]);
+			bufp = page_address(page);
+			buf_len = PAGE_SIZE;
+		}
+
+		nbytes = min_t(u32, nbytes, curiov.iov_len);
+		nbytes = min_t(u32, nbytes, buf_len);
+
+		if (dxfer_dir == SG_DXFER_TO_DEV)
+			rc = __copy_from_user(bufp, curiov.iov_base, nbytes);
+		else
+			rc = __copy_to_user(curiov.iov_base, bufp, nbytes);
+		if (rc)
+			return -EFAULT;
+		/* advance every cursor, including the offset inside the page */
+		resid -= nbytes;
+		curiov.iov_len -= nbytes;
+		curiov.iov_base += nbytes;
+		bufp += nbytes;
+		buf_len -= nbytes;
+	}
+
+	return 0;
+}
+
+static int skd_sg_io_send_fitmsg(struct skd_device *skdev,
+				 struct skd_sg_io *sksgio)
+{
+	struct skd_special_context *skspcl = sksgio->skspcl;
+	struct fit_msg_hdr *fmh = (struct fit_msg_hdr *)skspcl->msg_buf;
+	struct skd_scsi_request *scsi_req = (struct skd_scsi_request *)&fmh[1];
+
+	memset(skspcl->msg_buf, 0, SKD_N_SPECIAL_FITMSG_BYTES);
+
+	/* Initialize the FIT msg header */
+	fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT;
+	fmh->num_protocol_cmds_coalesced = 1;
+
+	/* Initialize the SCSI request */
+	if (sksgio->sg.dxfer_direction != SG_DXFER_NONE)
+		scsi_req->hdr.sg_list_dma_address =
+			cpu_to_be64(skspcl->req.sksg_dma_address);
+	scsi_req->hdr.tag = skspcl->req.id;
+	scsi_req->hdr.sg_list_len_bytes =
+		cpu_to_be32(skspcl->req.sg_byte_count);
+	memcpy(scsi_req->cdb, sksgio->cdb, sizeof(scsi_req->cdb));
+
+	skspcl->req.state = SKD_REQ_STATE_BUSY;
+	skd_send_special_fitmsg(skdev, skspcl);
+
+	return 0;
+}
+
+static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio)
+{
+	unsigned long flags;
+	int rc;
+
+	rc = wait_event_interruptible_timeout(skdev->waitq,
+					      (sksgio->skspcl->req.state !=
+					       SKD_REQ_STATE_BUSY),
+					      msecs_to_jiffies(sksgio->sg.
+							       timeout));
+
+	spin_lock_irqsave(&skdev->lock, flags);
+
+	if (sksgio->skspcl->req.state == SKD_REQ_STATE_ABORTED) {
+		pr_debug("%s:%s:%d skspcl %p aborted\n",
+			 skdev->name, __func__, __LINE__, sksgio->skspcl);
+
+		/* Build check cond, sense and let command finish. */
+		/* For a timeout, we must fabricate completion and sense
+		 * data to complete the command */
+		sksgio->skspcl->req.completion.status =
+			SAM_STAT_CHECK_CONDITION;
+
+		memset(&sksgio->skspcl->req.err_info, 0,
+		       sizeof(sksgio->skspcl->req.err_info));
+		sksgio->skspcl->req.err_info.type = 0x70;
+		sksgio->skspcl->req.err_info.key = ABORTED_COMMAND;
+		sksgio->skspcl->req.err_info.code = 0x44;
+		sksgio->skspcl->req.err_info.qual = 0;
+		rc = 0;
+	} else if (sksgio->skspcl->req.state != SKD_REQ_STATE_BUSY)
+		/* No longer on the adapter. We finish. */
+		rc = 0;
+	else {
+		/* Something's gone wrong. Still busy. Timeout or
+		 * user interrupted (control-C). Mark as an orphan
+		 * so it will be disposed when completed. */
+		sksgio->skspcl->orphaned = 1;
+		sksgio->skspcl = NULL;
+		if (rc == 0) {
+			pr_debug("%s:%s:%d timed out %p (%u ms)\n",
+				 skdev->name, __func__, __LINE__,
+				 sksgio, sksgio->sg.timeout);
+			rc = -ETIMEDOUT;
+		} else {
+			pr_debug("%s:%s:%d cntlc %p\n",
+				 skdev->name, __func__, __LINE__, sksgio);
+			rc = -EINTR;
+		}
+	}
+
+	spin_unlock_irqrestore(&skdev->lock, flags);
+
+	return rc;
+}
+
+static int skd_sg_io_put_status(struct skd_device *skdev,
+				struct skd_sg_io *sksgio)
+{
+	struct sg_io_hdr *sgp = &sksgio->sg;
+	struct skd_special_context *skspcl = sksgio->skspcl;
+	int resid = 0;
+
+	u32 nb = be32_to_cpu(skspcl->req.completion.num_returned_bytes);
+
+	sgp->status = skspcl->req.completion.status;
+	resid = sksgio->dxfer_len - nb;
+
+	sgp->masked_status = sgp->status & STATUS_MASK;
+	sgp->msg_status = 0;
+	sgp->host_status = 0;
+	sgp->driver_status = 0;
+	sgp->resid = resid;
+	if (sgp->masked_status || sgp->host_status || sgp->driver_status)
+		sgp->info |= SG_INFO_CHECK;
+
+	pr_debug("%s:%s:%d status %x masked %x resid 0x%x\n",
+		 skdev->name, __func__, __LINE__,
+		 sgp->status, sgp->masked_status, sgp->resid);
+
+	if (sgp->masked_status == SAM_STAT_CHECK_CONDITION) {
+		if (sgp->mx_sb_len > 0) {
+			struct fit_comp_error_info *ei = &skspcl->req.err_info;
+			u32 nbytes = sizeof(*ei);
+
+			nbytes = min_t(u32, nbytes, sgp->mx_sb_len);
+
+			sgp->sb_len_wr = nbytes;
+
+			if (__copy_to_user(sgp->sbp, ei, nbytes)) {
+				pr_debug("%s:%s:%d copy_to_user sense failed %p\n",
+					 skdev->name, __func__, __LINE__,
+					 sgp->sbp);
+				return -EFAULT;
+			}
+		}
+	}
+
+	if (__copy_to_user(sksgio->argp, sgp, sizeof(sg_io_hdr_t))) {
+		pr_debug("%s:%s:%d copy_to_user sg failed %p\n",
+			 skdev->name, __func__, __LINE__, sksgio->argp);
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int skd_sg_io_release_skspcl(struct skd_device *skdev,
+				    struct skd_sg_io *sksgio)
+{
+	struct skd_special_context *skspcl = sksgio->skspcl;
+
+	if (skspcl != NULL) {
+		ulong flags;
+
+		sksgio->skspcl = NULL;
+
+		spin_lock_irqsave(&skdev->lock, flags);
+		skd_release_special(skdev, skspcl);
+		spin_unlock_irqrestore(&skdev->lock, flags);
+	}
+
+	return 0;
+}
+
+/*
+ *****************************************************************************
+ * INTERNAL REQUESTS -- generated by driver itself
+ *****************************************************************************
+ */
+
+static int skd_format_internal_skspcl(struct skd_device *skdev)
+{
+	struct skd_special_context *skspcl = &skdev->internal_skspcl;
+	struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0];
+	struct fit_msg_hdr *fmh;
+	uint64_t dma_address;
+	struct skd_scsi_request *scsi;
+
+	fmh = (struct fit_msg_hdr *)&skspcl->msg_buf[0];
+	fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT;
+	fmh->num_protocol_cmds_coalesced = 1;
+
+	scsi = (struct skd_scsi_request *)&skspcl->msg_buf[64];
+	memset(scsi, 0, sizeof(*scsi));
+	dma_address = skspcl->req.sksg_dma_address;
+	scsi->hdr.sg_list_dma_address = cpu_to_be64(dma_address);
+	sgd->control = FIT_SGD_CONTROL_LAST;
+	sgd->byte_count = 0;
+	sgd->host_side_addr = skspcl->db_dma_address;
+	sgd->dev_side_addr = 0;
+	sgd->next_desc_ptr = 0LL;
+
+	return 1;
+}
+
+#define WR_BUF_SIZE SKD_N_INTERNAL_BYTES
+
+/* Fill in the CDB for opcode in skspcl's FIT message and send it (if idle). */
+static void skd_send_internal_skspcl(struct skd_device *skdev,
+				     struct skd_special_context *skspcl,
+				     u8 opcode)
+{
+	struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0];
+	struct skd_scsi_request *scsi;
+	unsigned char *buf = skspcl->data_buf;
+	int i;
+
+	if (skspcl->req.state != SKD_REQ_STATE_IDLE)
+		/*
+		 * A refresh is already in progress.
+		 * Just wait for it to finish.
+		 */
+		return;
+
+	SKD_ASSERT((skspcl->req.id & SKD_ID_INCR) == 0);
+	skspcl->req.state = SKD_REQ_STATE_BUSY;
+	skspcl->req.id += SKD_ID_INCR;
+
+	scsi = (struct skd_scsi_request *)&skspcl->msg_buf[64];
+	scsi->hdr.tag = skspcl->req.id;
+	memset(scsi->cdb, 0, sizeof(scsi->cdb));
+
+	switch (opcode) {
+	case TEST_UNIT_READY:
+		scsi->cdb[0] = TEST_UNIT_READY;
+		sgd->byte_count = 0;
+		scsi->hdr.sg_list_len_bytes = 0;
+		break;
+
+	case READ_CAPACITY:
+		scsi->cdb[0] = READ_CAPACITY;
+		sgd->byte_count = SKD_N_READ_CAP_BYTES;
+		scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
+		break;
+
+	case INQUIRY:
+		scsi->cdb[0] = INQUIRY;
+		scsi->cdb[1] = 0x01;    /* evpd */
+		scsi->cdb[2] = 0x80;    /* serial number page */
+		scsi->cdb[4] = 0x10;
+		sgd->byte_count = 16;
+		scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
+		break;
+
+	case SYNCHRONIZE_CACHE:
+		scsi->cdb[0] = SYNCHRONIZE_CACHE;
+		sgd->byte_count = 0;
+		scsi->hdr.sg_list_len_bytes = 0;
+		break;
+
+	case WRITE_BUFFER:
+		scsi->cdb[0] = WRITE_BUFFER;
+		scsi->cdb[1] = 0x02;
+		scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8;
+		scsi->cdb[8] = WR_BUF_SIZE & 0xFF;
+		sgd->byte_count = WR_BUF_SIZE;
+		scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
+		/* fill incrementing byte pattern */
+		for (i = 0; i < sgd->byte_count; i++)
+			buf[i] = i & 0xFF;
+		break;
+
+	case READ_BUFFER:
+		scsi->cdb[0] = READ_BUFFER;
+		scsi->cdb[1] = 0x02;
+		scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8;
+		scsi->cdb[8] = WR_BUF_SIZE & 0xFF;
+		sgd->byte_count = WR_BUF_SIZE;
+		scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
+		memset(skspcl->data_buf, 0, sgd->byte_count);
+		break;
+
+	default:
+		/* a bare string literal is always true; negate it to fire */
+		SKD_ASSERT(!"Don't know what to send");
+		return;
+	}
+	skd_send_special_fitmsg(skdev, skspcl);
+}
+
+static void skd_refresh_device_data(struct skd_device *skdev)
+{
+	struct skd_special_context *skspcl = &skdev->internal_skspcl;
+
+	skd_send_internal_skspcl(skdev, skspcl, TEST_UNIT_READY);
+}
+
+static int skd_chk_read_buf(struct skd_device *skdev,
+			    struct skd_special_context *skspcl)
+{
+	unsigned char *buf = skspcl->data_buf;
+	int i;
+
+	/* check for incrementing byte pattern */
+	for (i = 0; i < WR_BUF_SIZE; i++)
+		if (buf[i] != (i & 0xFF))
+			return 1;
+
+	return 0;
+}
+
+static void skd_log_check_status(struct skd_device *skdev, u8 status, u8 key,
+				 u8 code, u8 qual, u8 fruc)
+{
+	/* If the check condition is of special interest, log a message */
+	if ((status == SAM_STAT_CHECK_CONDITION) && (key == 0x02)
+	    && (code == 0x04) && (qual == 0x06)) {
+		pr_err("(%s): *** LOST_WRITE_DATA ERROR *** key/asc/"
+		       "ascq/fruc %02x/%02x/%02x/%02x\n",
+		       skd_name(skdev), key, code, qual, fruc);
+	}
+}
+
+/*
+ * Completion of an internal command.  Success sends the next step: TUR ->
+ * WRITE_BUFFER -> READ_BUFFER -> READ_CAPACITY -> INQUIRY -> unquiesce.
+ */
+static void skd_complete_internal(struct skd_device *skdev,
+				  volatile struct fit_completion_entry_v1
+				  *skcomp,
+				  volatile struct fit_comp_error_info *skerr,
+				  struct skd_special_context *skspcl)
+{
+	u8 *buf = skspcl->data_buf;
+	u8 status;
+	int i;
+	struct skd_scsi_request *scsi =
+		(struct skd_scsi_request *)&skspcl->msg_buf[64];
+
+	SKD_ASSERT(skspcl == &skdev->internal_skspcl);
+
+	pr_debug("%s:%s:%d complete internal %x\n",
+		 skdev->name, __func__, __LINE__, scsi->cdb[0]);
+
+	skspcl->req.completion = *skcomp;
+	skspcl->req.state = SKD_REQ_STATE_IDLE;
+	skspcl->req.id += SKD_ID_INCR;
+
+	status = skspcl->req.completion.status;
+	skd_log_check_status(skdev, status, skerr->key, skerr->code,
+			     skerr->qual, skerr->fruc);
+
+	switch (scsi->cdb[0]) {
+	case TEST_UNIT_READY:
+		if (status == SAM_STAT_GOOD)
+			skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER);
+		else if ((status == SAM_STAT_CHECK_CONDITION) &&
+			 (skerr->key == MEDIUM_ERROR))
+			skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER);
+		else {
+			if (skdev->state == SKD_DRVR_STATE_STOPPING) {
+				pr_debug("%s:%s:%d TUR failed, don't send anymore state 0x%x\n",
+					 skdev->name, __func__, __LINE__,
+					 skdev->state);
+				return;
+			}
+			pr_debug("%s:%s:%d **** TUR failed, retry skerr\n",
+				 skdev->name, __func__, __LINE__);
+			skd_send_internal_skspcl(skdev, skspcl, 0x00);
+		}
+		break;
+
+	case WRITE_BUFFER:
+		if (status == SAM_STAT_GOOD)
+			skd_send_internal_skspcl(skdev, skspcl, READ_BUFFER);
+		else {
+			if (skdev->state == SKD_DRVR_STATE_STOPPING) {
+				pr_debug("%s:%s:%d write buffer failed, don't send anymore state 0x%x\n",
+					 skdev->name, __func__, __LINE__,
+					 skdev->state);
+				return;
+			}
+			pr_debug("%s:%s:%d **** write buffer failed, retry skerr\n",
+				 skdev->name, __func__, __LINE__);
+			skd_send_internal_skspcl(skdev, skspcl, 0x00);
+		}
+		break;
+
+	case READ_BUFFER:
+		if (status == SAM_STAT_GOOD) {
+			if (skd_chk_read_buf(skdev, skspcl) == 0)
+				skd_send_internal_skspcl(skdev, skspcl,
+							 READ_CAPACITY);
+			else {
+				pr_err(
+				       "(%s):*** W/R Buffer mismatch %d ***\n",
+				       skd_name(skdev), skdev->connect_retries);
+				if (skdev->connect_retries <
+				    SKD_MAX_CONNECT_RETRIES) {
+					skdev->connect_retries++;
+					skd_soft_reset(skdev);
+				} else {
+					pr_err(
+					       "(%s): W/R Buffer Connect Error\n",
+					       skd_name(skdev));
+					return;
+				}
+			}
+		} else {
+			if (skdev->state == SKD_DRVR_STATE_STOPPING) {
+				pr_debug("%s:%s:%d "
+					 "read buffer failed, don't send anymore state 0x%x\n",
+					 skdev->name, __func__, __LINE__,
+					 skdev->state);
+				return;
+			}
+			pr_debug("%s:%s:%d "
+				 "**** read buffer failed, retry skerr\n",
+				 skdev->name, __func__, __LINE__);
+			skd_send_internal_skspcl(skdev, skspcl, 0x00);
+		}
+		break;
+
+	case READ_CAPACITY:
+		skdev->read_cap_is_valid = 0;
+		if (status == SAM_STAT_GOOD) {
+			/* widen before << 24 so bit 7 never hits int's sign */
+			skdev->read_cap_last_lba =
+				((u32)buf[0] << 24) | (buf[1] << 16) |
+				(buf[2] << 8) | buf[3];
+			skdev->read_cap_blocksize =
+				((u32)buf[4] << 24) | (buf[5] << 16) |
+				(buf[6] << 8) | buf[7];
+			pr_debug("%s:%s:%d last lba %d, bs %d\n",
+				 skdev->name, __func__, __LINE__,
+				 skdev->read_cap_last_lba,
+				 skdev->read_cap_blocksize);
+
+			set_capacity(skdev->disk, skdev->read_cap_last_lba + 1);
+			skdev->read_cap_is_valid = 1;
+			skd_send_internal_skspcl(skdev, skspcl, INQUIRY);
+		} else if ((status == SAM_STAT_CHECK_CONDITION) &&
+			   (skerr->key == MEDIUM_ERROR)) {
+			skdev->read_cap_last_lba = ~0;
+			set_capacity(skdev->disk, skdev->read_cap_last_lba + 1);
+			pr_debug("%s:%s:%d "
+				 "**** MEDIUM ERROR caused READCAP to fail, ignore failure and continue to inquiry\n",
+				 skdev->name, __func__, __LINE__);
+			skd_send_internal_skspcl(skdev, skspcl, INQUIRY);
+		} else {
+			pr_debug("%s:%s:%d **** READCAP failed, retry TUR\n",
+				 skdev->name, __func__, __LINE__);
+			skd_send_internal_skspcl(skdev, skspcl,
+						 TEST_UNIT_READY);
+		}
+		break;
+
+	case INQUIRY:
+		skdev->inquiry_is_valid = 0;
+		if (status == SAM_STAT_GOOD) {
+			skdev->inquiry_is_valid = 1;
+
+			for (i = 0; i < 12; i++)
+				skdev->inq_serial_num[i] = buf[i + 4];
+			skdev->inq_serial_num[12] = 0;
+		}
+
+		if (skd_unquiesce_dev(skdev) < 0)
+			pr_debug("%s:%s:%d **** failed, to ONLINE device\n",
+				 skdev->name, __func__, __LINE__);
+		 /* connection is complete */
+		skdev->connect_retries = 0;
+		break;
+
+	case SYNCHRONIZE_CACHE:
+		if (status == SAM_STAT_GOOD)
+			skdev->sync_done = 1;
+		else
+			skdev->sync_done = -1;
+		wake_up_interruptible(&skdev->waitq);
+		break;
+
+	default:
+		SKD_ASSERT(!"we didn't send this");	/* '!' lets it fire */
+	}
+}
+
+/*
+ *****************************************************************************
+ * FIT MESSAGES
+ *****************************************************************************
+ */
+
+static void skd_send_fitmsg(struct skd_device *skdev,
+			    struct skd_fitmsg_context *skmsg)
+{
+	u64 qcmd;
+	struct fit_msg_hdr *fmh;
+
+	pr_debug("%s:%s:%d dma address 0x%llx, busy=%d\n",
+		 skdev->name, __func__, __LINE__,
+		 skmsg->mb_dma_address, skdev->in_flight);
+	pr_debug("%s:%s:%d msg_buf 0x%p, offset %x\n",
+		 skdev->name, __func__, __LINE__,
+		 skmsg->msg_buf, skmsg->offset);
+
+	qcmd = skmsg->mb_dma_address;
+	qcmd |= FIT_QCMD_QID_NORMAL;
+
+	fmh = (struct fit_msg_hdr *)skmsg->msg_buf;
+	skmsg->outstanding = fmh->num_protocol_cmds_coalesced;
+
+	if (unlikely(skdev->dbg_level > 1)) {
+		u8 *bp = (u8 *)skmsg->msg_buf;
+		int i;
+		for (i = 0; i < skmsg->length; i += 8) {
+			pr_debug("%s:%s:%d msg[%2d] %02x %02x %02x %02x "
+				 "%02x %02x %02x %02x\n",
+				 skdev->name, __func__, __LINE__,
+				 i, bp[i + 0], bp[i + 1], bp[i + 2],
+				 bp[i + 3], bp[i + 4], bp[i + 5],
+				 bp[i + 6], bp[i + 7]);
+			if (i == 0)
+				i = 64 - 8;
+		}
+	}
+
+	if (skmsg->length > 256)
+		qcmd |= FIT_QCMD_MSGSIZE_512;
+	else if (skmsg->length > 128)
+		qcmd |= FIT_QCMD_MSGSIZE_256;
+	else if (skmsg->length > 64)
+		qcmd |= FIT_QCMD_MSGSIZE_128;
+	else
+		/*
+		 * This makes no sense because the FIT msg header is
+		 * 64 bytes. If the msg is only 64 bytes long it has
+		 * no payload.
+		 */
+		qcmd |= FIT_QCMD_MSGSIZE_64;
+
+	SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND);
+
+}
+
+static void skd_send_special_fitmsg(struct skd_device *skdev,
+				    struct skd_special_context *skspcl)
+{
+	u64 qcmd;
+
+	if (unlikely(skdev->dbg_level > 1)) {
+		u8 *bp = (u8 *)skspcl->msg_buf;
+		int i;
+
+		for (i = 0; i < SKD_N_SPECIAL_FITMSG_BYTES; i += 8) {
+			pr_debug("%s:%s:%d  spcl[%2d] %02x %02x %02x %02x  "
+				 "%02x %02x %02x %02x\n",
+				 skdev->name, __func__, __LINE__, i,
+				 bp[i + 0], bp[i + 1], bp[i + 2], bp[i + 3],
+				 bp[i + 4], bp[i + 5], bp[i + 6], bp[i + 7]);
+			if (i == 0)
+				i = 64 - 8;
+		}
+
+		pr_debug("%s:%s:%d skspcl=%p id=%04x sksg_list=%p sksg_dma=%llx\n",
+			 skdev->name, __func__, __LINE__,
+			 skspcl, skspcl->req.id, skspcl->req.sksg_list,
+			 skspcl->req.sksg_dma_address);
+		for (i = 0; i < skspcl->req.n_sg; i++) {
+			struct fit_sg_descriptor *sgd =
+				&skspcl->req.sksg_list[i];
+
+			pr_debug("%s:%s:%d   sg[%d] count=%u ctrl=0x%x "
+				 "addr=0x%llx next=0x%llx\n",
+				 skdev->name, __func__, __LINE__,
+				 i, sgd->byte_count, sgd->control,
+				 sgd->host_side_addr, sgd->next_desc_ptr);
+		}
+	}
+
+	/*
+	 * Special FIT msgs are always 128 bytes: a 64-byte FIT hdr
+	 * and one 64-byte SSDI command.
+	 */
+	qcmd = skspcl->mb_dma_address;
+	qcmd |= FIT_QCMD_QID_NORMAL + FIT_QCMD_MSGSIZE_128;
+
+	SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND);
+}
+
+/*
+ *****************************************************************************
+ * COMPLETION QUEUE
+ *****************************************************************************
+ */
+
+static void skd_complete_other(struct skd_device *skdev,
+			       volatile struct fit_completion_entry_v1 *skcomp,
+			       volatile struct fit_comp_error_info *skerr);
+
+struct sns_info {
+	u8 type;
+	u8 stat;
+	u8 key;
+	u8 asc;
+	u8 ascq;
+	u8 mask;
+	enum skd_check_status_action action;
+};
+
+static struct sns_info skd_chkstat_table[] = {
+	/* Good */
+	{ 0x70, 0x02, RECOVERED_ERROR, 0,    0,	   0x1c,
+	  SKD_CHECK_STATUS_REPORT_GOOD },
+
+	/* Smart alerts */
+	{ 0x70, 0x02, NO_SENSE,	       0x0B, 0x00, 0x1E,	/* warnings */
+	  SKD_CHECK_STATUS_REPORT_SMART_ALERT },
+	{ 0x70, 0x02, NO_SENSE,	       0x5D, 0x00, 0x1E,	/* thresholds */
+	  SKD_CHECK_STATUS_REPORT_SMART_ALERT },
+	{ 0x70, 0x02, RECOVERED_ERROR, 0x0B, 0x01, 0x1F,        /* temperature over trigger */
+	  SKD_CHECK_STATUS_REPORT_SMART_ALERT },
+
+	/* Retry (with limits) */
+	{ 0x70, 0x02, 0x0B,	       0,    0,	   0x1C,        /* This one is for DMA ERROR */
+	  SKD_CHECK_STATUS_REQUEUE_REQUEST },
+	{ 0x70, 0x02, 0x06,	       0x0B, 0x00, 0x1E,        /* warnings */
+	  SKD_CHECK_STATUS_REQUEUE_REQUEST },
+	{ 0x70, 0x02, 0x06,	       0x5D, 0x00, 0x1E,        /* thresholds */
+	  SKD_CHECK_STATUS_REQUEUE_REQUEST },
+	{ 0x70, 0x02, 0x06,	       0x80, 0x30, 0x1F,        /* backup power */
+	  SKD_CHECK_STATUS_REQUEUE_REQUEST },
+
+	/* Busy (or about to be) */
+	{ 0x70, 0x02, 0x06,	       0x3f, 0x01, 0x1F, /* fw changed */
+	  SKD_CHECK_STATUS_BUSY_IMMINENT },
+};
+
+/*
+ * Look up status and sense data to decide how to handle the error
+ * from the device.
+ * mask says which fields must match e.g., mask=0x18 means check
+ * type and stat, ignore key, asc, ascq.
+ */
+
+static enum skd_check_status_action
+skd_check_status(struct skd_device *skdev,
+		 u8 cmp_status, volatile struct fit_comp_error_info *skerr)
+{
+	int i, n;
+
+	pr_err("(%s): key/asc/ascq/fruc %02x/%02x/%02x/%02x\n",
+	       skd_name(skdev), skerr->key, skerr->code, skerr->qual,
+	       skerr->fruc);
+
+	pr_debug("%s:%s:%d stat: t=%02x stat=%02x k=%02x c=%02x q=%02x fruc=%02x\n",
+		 skdev->name, __func__, __LINE__, skerr->type, cmp_status,
+		 skerr->key, skerr->code, skerr->qual, skerr->fruc);
+
+	/* Return the action of the first table entry whose masked fields match */
+	n = sizeof(skd_chkstat_table) / sizeof(skd_chkstat_table[0]);
+	for (i = 0; i < n; i++) {
+		struct sns_info *sns = &skd_chkstat_table[i];
+
+		if (sns->mask & 0x10)	/* sense type must match */
+			if (skerr->type != sns->type)
+				continue;
+
+		if (sns->mask & 0x08)	/* completion status must match */
+			if (cmp_status != sns->stat)
+				continue;
+
+		if (sns->mask & 0x04)	/* sense key must match */
+			if (skerr->key != sns->key)
+				continue;
+
+		if (sns->mask & 0x02)	/* asc must match */
+			if (skerr->code != sns->asc)
+				continue;
+
+		if (sns->mask & 0x01)	/* ascq must match */
+			if (skerr->qual != sns->ascq)
+				continue;
+
+		if (sns->action == SKD_CHECK_STATUS_REPORT_SMART_ALERT) {
+			pr_err("(%s): SMART Alert: sense key/asc/ascq "
+			       "%02x/%02x/%02x\n",
+			       skd_name(skdev), skerr->key,
+			       skerr->code, skerr->qual);
+		}
+		return sns->action;
+	}
+
+	/* No other match, so nonzero status means error,
+	 * zero status means good
+	 */
+	if (cmp_status) {
+		pr_debug("%s:%s:%d status check: error\n",
+			 skdev->name, __func__, __LINE__);
+		return SKD_CHECK_STATUS_REPORT_ERROR;
+	}
+
+	pr_debug("%s:%s:%d status check good default\n",
+		 skdev->name, __func__, __LINE__);
+	return SKD_CHECK_STATUS_REPORT_GOOD;
+}
+
+static void skd_resolve_req_exception(struct skd_device *skdev,
+				      struct skd_request_context *skreq)
+{
+	u8 cmp_status = skreq->completion.status;
+
+	switch (skd_check_status(skdev, cmp_status, &skreq->err_info)) {
+	case SKD_CHECK_STATUS_REPORT_GOOD:
+	case SKD_CHECK_STATUS_REPORT_SMART_ALERT:
+		skd_end_request(skdev, skreq, 0);
+		break;
+
+	case SKD_CHECK_STATUS_BUSY_IMMINENT:
+		skd_log_skreq(skdev, skreq, "retry(busy)");
+		blk_requeue_request(skdev->queue, skreq->req);
+		pr_info("(%s) drive BUSY imminent\n", skd_name(skdev));
+		skdev->state = SKD_DRVR_STATE_BUSY_IMMINENT;
+		skdev->timer_countdown = SKD_TIMER_MINUTES(20);
+		skd_quiesce_dev(skdev);
+		break;
+
+	case SKD_CHECK_STATUS_REQUEUE_REQUEST:
+		if ((unsigned long) ++skreq->req->special < SKD_MAX_RETRIES) {
+			skd_log_skreq(skdev, skreq, "retry");
+			blk_requeue_request(skdev->queue, skreq->req);
+			break;
+		}
+	/* fall through to report error */
+
+	case SKD_CHECK_STATUS_REPORT_ERROR:
+	default:
+		skd_end_request(skdev, skreq, -EIO);
+		break;
+	}
+}
+
+/* assume spinlock is already held */
+static void skd_release_skreq(struct skd_device *skdev,
+			      struct skd_request_context *skreq)
+{
+	u32 msg_slot;
+	struct skd_fitmsg_context *skmsg;
+
+	u32 timo_slot;
+
+	/*
+	 * Drop this request's hold on the FIT msg
+	 * buffer that carried it. The FIT msg might
+	 * have contained several requests, and
+	 * skmsg->outstanding counts those not yet
+	 * released. The buffer is kept busy while
+	 * any of them is still pending, so it goes
+	 * back on the free list only when that
+	 * count reaches zero, i.e. when the last
+	 * request it carried is released.
+	 * At that point the FIT msg buffer id is
+	 * advanced by SKD_ID_INCR, so a fitmsg_id
+	 * remembered from the previous use of the
+	 * slot no longer matches skmsg->id and is
+	 * skipped cheaply by the id comparison
+	 * below.
+	 */
+	msg_slot = skreq->fitmsg_id & SKD_ID_SLOT_MASK;
+	SKD_ASSERT(msg_slot < skdev->num_fitmsg_context);
+
+	skmsg = &skdev->skmsg_table[msg_slot];
+	if (skmsg->id == skreq->fitmsg_id) {
+		SKD_ASSERT(skmsg->state == SKD_MSG_STATE_BUSY);
+		SKD_ASSERT(skmsg->outstanding > 0);
+		skmsg->outstanding--;
+		if (skmsg->outstanding == 0) {
+			skmsg->state = SKD_MSG_STATE_IDLE;
+			skmsg->id += SKD_ID_INCR;
+			skmsg->next = skdev->skmsg_free_list;
+			skdev->skmsg_free_list = skmsg;
+		}
+	}
+
+	/*
+	 * Decrease the number of active requests, and the count
+	 * in the timeout slot this request was stamped with.
+	 */
+	SKD_ASSERT(skdev->in_flight > 0);
+	skdev->in_flight -= 1;
+
+	timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK;
+	SKD_ASSERT(skdev->timeout_slot[timo_slot] > 0);
+	skdev->timeout_slot[timo_slot] -= 1;
+
+	/*
+	 * Reset backpointer
+	 */
+	skreq->req = NULL;
+
+	/*
+	 * Reclaim the skd_request_context: mark idle, bump id, push on free list
+	 */
+	skreq->state = SKD_REQ_STATE_IDLE;
+	skreq->id += SKD_ID_INCR;
+	skreq->next = skdev->skreq_free_list;
+	skdev->skreq_free_list = skreq;
+}
+
+#define DRIVER_INQ_EVPD_PAGE_CODE   0xDA
+
+static void skd_do_inq_page_00(struct skd_device *skdev,
+			       volatile struct fit_completion_entry_v1 *skcomp,
+			       volatile struct fit_comp_error_info *skerr,
+			       uint8_t *cdb, uint8_t *buf)
+{
+	uint16_t insert_pt, max_bytes, drive_pages, drive_bytes, new_size;
+
+	/* Caller requested "supported pages".  The driver needs to insert
+	 * its page code into the list the device returned in buf.
+	 */
+	pr_debug("%s:%s:%d skd_do_driver_inquiry: modify supported pages.\n",
+		 skdev->name, __func__, __LINE__);
+
+	/* If the device rejected the request because the CDB was
+	 * improperly formed, then just leave.
+	 */
+	if (skcomp->status == SAM_STAT_CHECK_CONDITION &&
+	    skerr->key == ILLEGAL_REQUEST && skerr->code == 0x24)
+		return;
+
+	/* Get the amount of space the caller allocated */
+	max_bytes = (cdb[3] << 8) | cdb[4];
+
+	/* Get the number of pages actually returned by the device */
+	drive_pages = (buf[2] << 8) | buf[3];
+	drive_bytes = drive_pages + 4;
+	new_size = drive_pages + 1;
+
+	/* Supported pages must be in numerical order, so find where
+	 * the driver page needs to be inserted into the list of
+	 * pages returned by the device.
+	 */
+	for (insert_pt = 4; insert_pt < drive_bytes; insert_pt++) {
+		if (buf[insert_pt] == DRIVER_INQ_EVPD_PAGE_CODE)
+			return; /* Device using this page code. abort */
+		else if (buf[insert_pt] > DRIVER_INQ_EVPD_PAGE_CODE)
+			break;
+	}
+
+	if (insert_pt < max_bytes) {
+		uint16_t u;
+
+		/* Shift everything up one byte to make room. */
+		for (u = new_size + 3; u > insert_pt; u--)
+			buf[u] = buf[u - 1];
+		buf[insert_pt] = DRIVER_INQ_EVPD_PAGE_CODE;
+
+		/* num_returned_bytes is big-endian (SCSI byte order):
+		 * convert to CPU order, add one, convert back.
+		 */
+		skcomp->num_returned_bytes = cpu_to_be32(
+			be32_to_cpu(skcomp->num_returned_bytes) + 1);
+	}
+
+	/* update page length field to reflect the driver's page too */
+	buf[2] = (uint8_t)((new_size >> 8) & 0xFF);
+	buf[3] = (uint8_t)((new_size >> 0) & 0xFF);
+}
+
+static void skd_get_link_info(struct pci_dev *pdev, u8 *speed, u8 *width)
+{
+	int pcie_reg;
+	u16 pci_bus_speed;
+	u8 pci_lanes;
+
+	pcie_reg = pci_find_capability(pdev, PCI_CAP_ID_EXP);
+	if (pcie_reg) {
+		u16 linksta;
+		pci_read_config_word(pdev, pcie_reg + PCI_EXP_LNKSTA, &linksta);
+
+		pci_bus_speed = linksta & 0xF;
+		pci_lanes = (linksta & 0x3F0) >> 4;
+	} else {
+		*speed = STEC_LINK_UNKNOWN;
+		*width = 0xFF;
+		return;
+	}
+
+	switch (pci_bus_speed) {
+	case 1:
+		*speed = STEC_LINK_2_5GTS;
+		break;
+	case 2:
+		*speed = STEC_LINK_5GTS;
+		break;
+	case 3:
+		*speed = STEC_LINK_8GTS;
+		break;
+	default:
+		*speed = STEC_LINK_UNKNOWN;
+		break;
+	}
+
+	if (pci_lanes <= 0x20)
+		*width = pci_lanes;
+	else
+		*width = 0xFF;
+}
+
+static void skd_do_inq_page_da(struct skd_device *skdev,
+			       volatile struct fit_completion_entry_v1 *skcomp,
+			       volatile struct fit_comp_error_info *skerr,
+			       uint8_t *cdb, uint8_t *buf)
+{
+	struct pci_dev *pdev = skdev->pdev;
+	unsigned max_bytes;
+	struct driver_inquiry_data inq;
+	u16 val;
+
+	pr_debug("%s:%s:%d skd_do_driver_inquiry: return driver page\n",
+		 skdev->name, __func__, __LINE__);
+
+	memset(&inq, 0, sizeof(inq));
+
+	inq.page_code = DRIVER_INQ_EVPD_PAGE_CODE;
+
+	skd_get_link_info(pdev, &inq.pcie_link_speed, &inq.pcie_link_lanes);
+	inq.pcie_bus_number = cpu_to_be16(pdev->bus->number);
+	inq.pcie_device_number = PCI_SLOT(pdev->devfn);
+	inq.pcie_function_number = PCI_FUNC(pdev->devfn);
+
+	pci_read_config_word(pdev, PCI_VENDOR_ID, &val);
+	inq.pcie_vendor_id = cpu_to_be16(val);
+
+	pci_read_config_word(pdev, PCI_DEVICE_ID, &val);
+	inq.pcie_device_id = cpu_to_be16(val);
+
+	pci_read_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID, &val);
+	inq.pcie_subsystem_vendor_id = cpu_to_be16(val);
+
+	pci_read_config_word(pdev, PCI_SUBSYSTEM_ID, &val);
+	inq.pcie_subsystem_device_id = cpu_to_be16(val);
+
+	/* Driver version, fixed length, padded with spaces on the right */
+	inq.driver_version_length = sizeof(inq.driver_version);
+	memset(&inq.driver_version, ' ', sizeof(inq.driver_version));
+	memcpy(inq.driver_version, DRV_VER_COMPL,
+	       min(sizeof(inq.driver_version), strlen(DRV_VER_COMPL)));
+
+	inq.page_length = cpu_to_be16((sizeof(inq) - 4));
+
+	/* Clear the error set by the device */
+	skcomp->status = SAM_STAT_GOOD;
+	memset((void *)skerr, 0, sizeof(*skerr));
+
+	/* copy response, truncated to the CDB allocation length */
+	max_bytes = (cdb[3] << 8) | cdb[4];
+	memcpy(buf, &inq, min_t(unsigned, max_bytes, sizeof(inq)));
+
+	skcomp->num_returned_bytes =
+		cpu_to_be32(min_t(uint16_t, max_bytes, sizeof(inq)));
+}
+
+static void skd_do_driver_inq(struct skd_device *skdev,
+			      volatile struct fit_completion_entry_v1 *skcomp,
+			      volatile struct fit_comp_error_info *skerr,
+			      uint8_t *cdb, uint8_t *buf)
+{
+	/* Only an INQUIRY with EVPD set and a data buffer is of interest */
+	if (!buf || cdb[0] != INQUIRY || !(cdb[1] & 1))
+		return;
+
+	switch (cdb[2]) {
+	case 0:	/* supported pages list: add the driver's page to it */
+		skd_do_inq_page_00(skdev, skcomp, skerr, cdb, buf);
+		break;
+	case DRIVER_INQ_EVPD_PAGE_CODE:	/* caller wants the driver's page */
+		skd_do_inq_page_da(skdev, skcomp, skerr, cdb, buf);
+		break;
+	}
+}
+
+static unsigned char *skd_sg_1st_page_ptr(struct scatterlist *sg)
+{
+	/* No scatterlist entry, or an entry with no page behind it */
+	if (!sg || !sg_page(sg))
+		return NULL;
+
+	return sg_virt(sg);	/* address of the first entry's data */
+}
+
+static void skd_process_scsi_inq(struct skd_device *skdev,
+				 volatile struct fit_completion_entry_v1
+				 *skcomp,
+				 volatile struct fit_comp_error_info *skerr,
+				 struct skd_special_context *skspcl)
+{
+	uint8_t *buf;
+	struct fit_msg_hdr *fmh = (struct fit_msg_hdr *)skspcl->msg_buf;
+	struct skd_scsi_request *scsi_req = (struct skd_scsi_request *)&fmh[1];
+
+	dma_sync_sg_for_cpu(skdev->class_dev, skspcl->req.sg, skspcl->req.n_sg,
+			    skspcl->req.sg_data_dir);
+	buf = skd_sg_1st_page_ptr(skspcl->req.sg);
+
+	if (buf)
+		skd_do_driver_inq(skdev, skcomp, skerr, scsi_req->cdb, buf);
+}
+
+
+static int skd_isr_completion_posted(struct skd_device *skdev,
+					int limit, int *enqueued)
+{
+	volatile struct fit_completion_entry_v1 *skcmp = NULL;
+	volatile struct fit_comp_error_info *skerr;
+	u16 req_id;
+	u32 req_slot;
+	struct skd_request_context *skreq;
+	u16 cmp_cntxt = 0;
+	u8 cmp_status = 0;
+	u8 cmp_cycle = 0;
+	u32 cmp_bytes = 0;
+	int rc = 0;
+	int processed = 0;
+
+	for (;; ) {
+		SKD_ASSERT(skdev->skcomp_ix < SKD_N_COMPLETION_ENTRY);
+
+		skcmp = &skdev->skcomp_table[skdev->skcomp_ix];
+		cmp_cycle = skcmp->cycle;
+		cmp_cntxt = skcmp->tag;
+		cmp_status = skcmp->status;
+		cmp_bytes = be32_to_cpu(skcmp->num_returned_bytes);
+
+		skerr = &skdev->skerr_table[skdev->skcomp_ix];
+
+		pr_debug("%s:%s:%d "
+			 "cycle=%d ix=%d got cycle=%d cmdctxt=0x%x stat=%d "
+			 "busy=%d rbytes=0x%x proto=%d\n",
+			 skdev->name, __func__, __LINE__, skdev->skcomp_cycle,
+			 skdev->skcomp_ix, cmp_cycle, cmp_cntxt, cmp_status,
+			 skdev->in_flight, cmp_bytes, skdev->proto_ver);
+
+		if (cmp_cycle != skdev->skcomp_cycle) {
+			pr_debug("%s:%s:%d end of completions\n",
+				 skdev->name, __func__, __LINE__);
+			break;
+		}
+		/*
+		 * Update the completion queue head index and possibly
+		 * the completion cycle count. 8-bit wrap-around.
+		 */
+		skdev->skcomp_ix++;
+		if (skdev->skcomp_ix >= SKD_N_COMPLETION_ENTRY) {
+			skdev->skcomp_ix = 0;
+			skdev->skcomp_cycle++;
+		}
+
+		/*
+		 * The command context is a unique ID, handled here as a
+		 * u16. The low order bits help locate the request, which
+		 * is usually a r/w request or a special request.
+		 */
+		req_id = cmp_cntxt;
+		req_slot = req_id & SKD_ID_SLOT_AND_TABLE_MASK;
+
+		/* Is this other than a r/w request? */
+		if (req_slot >= skdev->num_req_context) {
+			/*
+			 * This is not a completion for a r/w request.
+			 */
+			skd_complete_other(skdev, skcmp, skerr);
+			continue;
+		}
+
+		skreq = &skdev->skreq_table[req_slot];
+
+		/*
+		 * Make sure the request ID for the slot matches.
+		 */
+		if (skreq->id != req_id) {
+			pr_debug("%s:%s:%d mismatch comp_id=0x%x req_id=0x%x\n",
+				 skdev->name, __func__, __LINE__,
+				 req_id, skreq->id);
+			{
+				u16 new_id = cmp_cntxt;
+				pr_err("(%s): Completion mismatch "
+				       "comp_id=0x%04x skreq=0x%04x new=0x%04x\n",
+				       skd_name(skdev), req_id,
+				       skreq->id, new_id);
+
+				continue;
+			}
+		}
+
+		if (skreq->state == SKD_REQ_STATE_ABORTED) {
+			pr_debug("%s:%s:%d reclaim req %p id=%04x\n",
+				 skdev->name, __func__, __LINE__,
+				 skreq, skreq->id);
+			/* a previously timed out command can
+			 * now be cleaned up */
+			skd_release_skreq(skdev, skreq);
+			continue;
+		}
+
+		SKD_ASSERT(skreq->state == SKD_REQ_STATE_BUSY);
+
+		skreq->completion = *skcmp;
+		if (unlikely(cmp_status == SAM_STAT_CHECK_CONDITION)) {
+			skreq->err_info = *skerr;
+			skd_log_check_status(skdev, cmp_status, skerr->key,
+					     skerr->code, skerr->qual,
+					     skerr->fruc);
+		}
+		/* Release DMA resources for the request. */
+		if (skreq->n_sg > 0)
+			skd_postop_sg_list(skdev, skreq);
+
+		if (!skreq->req) {
+			pr_debug("%s:%s:%d NULL backptr skdreq %p, "
+				 "req=0x%x req_id=0x%x\n",
+				 skdev->name, __func__, __LINE__,
+				 skreq, skreq->id, req_id);
+		} else {
+			/*
+			 * Capture the outcome and post it back to the
+			 * native request.
+			 */
+			if (likely(cmp_status == SAM_STAT_GOOD))
+				skd_end_request(skdev, skreq, 0);
+			else
+				skd_resolve_req_exception(skdev, skreq);
+		}
+
+		/*
+		 * Release the skreq, its FIT msg (if one), timeout slot,
+		 * and queue depth.
+		 */
+		skd_release_skreq(skdev, skreq);
+
+		/* a limit of zero means no limit on r/w completions */
+		if (limit) {
+			if (++processed >= limit) {
+				rc = 1;
+				break;
+			}
+		}
+	}
+
+	if ((skdev->state == SKD_DRVR_STATE_PAUSING)
+		&& (skdev->in_flight) == 0) {
+		skdev->state = SKD_DRVR_STATE_PAUSED;
+		wake_up_interruptible(&skdev->waitq);
+	}
+
+	return rc;
+}
+
+static void skd_complete_other(struct skd_device *skdev,
+			       volatile struct fit_completion_entry_v1 *skcomp,
+			       volatile struct fit_comp_error_info *skerr)
+{
+	u32 req_id = 0;
+	u32 req_table;
+	u32 req_slot;
+	struct skd_special_context *skspcl;
+
+	req_id = skcomp->tag;
+	req_table = req_id & SKD_ID_TABLE_MASK;
+	req_slot = req_id & SKD_ID_SLOT_MASK;
+
+	pr_debug("%s:%s:%d table=0x%x id=0x%x slot=%d\n",
+		 skdev->name, __func__, __LINE__,
+		 req_table, req_id, req_slot);
+
+	/*
+	 * Based on the request id, determine how to dispatch this completion.
+	 * This switch/case is finding the good cases and forwarding the
+	 * completion entry. Anything else falls out and is dropped silently.
+	 */
+	switch (req_table) {
+	case SKD_ID_RW_REQUEST:
+		/*
+		 * The caller, skd_isr_completion_posted() above,
+		 * handles r/w requests. The only way we get here
+		 * is if the req_slot is out of bounds.
+		 */
+		break;
+
+	case SKD_ID_SPECIAL_REQUEST:
+		/*
+		 * Make sure the req_slot is in bounds and that the id
+		 * matches a special request that is still busy.
+		 */
+		if (req_slot < skdev->n_special) {
+			skspcl = &skdev->skspcl_table[req_slot];
+			if (skspcl->req.id == req_id &&
+			    skspcl->req.state == SKD_REQ_STATE_BUSY) {
+				skd_complete_special(skdev,
+						     skcomp, skerr, skspcl);
+				return;
+			}
+		}
+		break;
+
+	case SKD_ID_INTERNAL:
+		if (req_slot == 0) {
+			skspcl = &skdev->internal_skspcl;
+			if (skspcl->req.id == req_id &&
+			    skspcl->req.state == SKD_REQ_STATE_BUSY) {
+				skd_complete_internal(skdev,
+						      skcomp, skerr, skspcl);
+				return;
+			}
+		}
+		break;
+
+	case SKD_ID_FIT_MSG:
+		/*
+		 * These id's should never appear in a completion record.
+		 */
+		break;
+
+	default:
+		/*
+		 * These id's should never appear anywhere.
+		 */
+		break;
+	}
+
+	/*
+	 * If we get here it is a bad or stale id; the completion is ignored.
+	 */
+}
+
+static void skd_complete_special(struct skd_device *skdev,
+				 volatile struct fit_completion_entry_v1
+				 *skcomp,
+				 volatile struct fit_comp_error_info *skerr,
+				 struct skd_special_context *skspcl)
+{
+	pr_debug("%s:%s:%d  completing special request %p\n",
+		 skdev->name, __func__, __LINE__, skspcl);
+	if (skspcl->orphaned) {
+		/* Discard orphaned request */
+		/* ?: Can this release directly or does it need
+		 * to use a worker? */
+		pr_debug("%s:%s:%d release orphaned %p\n",
+			 skdev->name, __func__, __LINE__, skspcl);
+		skd_release_special(skdev, skspcl);
+		return;
+	}
+
+	skd_process_scsi_inq(skdev, skcomp, skerr, skspcl);
+
+	skspcl->req.state = SKD_REQ_STATE_COMPLETED;
+	skspcl->req.completion = *skcomp;
+	skspcl->req.err_info = *skerr;
+
+	skd_log_check_status(skdev, skspcl->req.completion.status, skerr->key,
+			     skerr->code, skerr->qual, skerr->fruc);
+
+	wake_up_interruptible(&skdev->waitq);
+}
+
+/* assume spinlock is already held */
+static void skd_release_special(struct skd_device *skdev,
+				struct skd_special_context *skspcl)
+{
+	int i, was_depleted;
+
+	for (i = 0; i < skspcl->req.n_sg; i++) {
+		struct page *page = sg_page(&skspcl->req.sg[i]);
+		__free_page(page);
+	}
+
+	was_depleted = (skdev->skspcl_free_list == NULL);
+
+	skspcl->req.state = SKD_REQ_STATE_IDLE;
+	skspcl->req.id += SKD_ID_INCR;
+	skspcl->req.next =
+		(struct skd_request_context *)skdev->skspcl_free_list;
+	skdev->skspcl_free_list = (struct skd_special_context *)skspcl;
+
+	if (was_depleted) {
+		pr_debug("%s:%s:%d skspcl was depleted\n",
+			 skdev->name, __func__, __LINE__);
+		/* Free list was depleted. There might be waiters. */
+		wake_up_interruptible(&skdev->waitq);
+	}
+}
+
+static void skd_reset_skcomp(struct skd_device *skdev)
+{
+	/* One completion entry plus one error-info record per slot */
+	u32 per_slot = sizeof(struct fit_completion_entry_v1) +
+		       sizeof(struct fit_comp_error_info);
+
+	/* NOTE(review): a single memset covers both, so this presumably
+	 * relies on the error-info table following skcomp_table - confirm. */
+	memset(skdev->skcomp_table, 0, per_slot * SKD_N_COMPLETION_ENTRY);
+
+	skdev->skcomp_ix = 0;
+	skdev->skcomp_cycle = 1;
+}
+
+/*
+ *****************************************************************************
+ * INTERRUPTS
+ *****************************************************************************
+ */
+static void skd_completion_worker(struct work_struct *work)
+{
+	struct skd_device *skdev =
+		container_of(work, struct skd_device, completion_worker);
+	unsigned long flags;
+	int flush_enqueued = 0;
+
+	spin_lock_irqsave(&skdev->lock, flags);
+
+	/*
+	 * pass in limit=0, which means no limit..
+	 * process everything in compq
+	 */
+	skd_isr_completion_posted(skdev, 0, &flush_enqueued);
+	skd_request_fn(skdev->queue);
+
+	spin_unlock_irqrestore(&skdev->lock, flags);
+}
+
+static void skd_isr_msg_from_dev(struct skd_device *skdev);
+
+static irqreturn_t
+skd_isr(int irq, void *ptr)
+{
+	struct skd_device *skdev;
+	u32 intstat;
+	u32 ack;
+	irqreturn_t rc = IRQ_NONE;
+	int deferred = 0;
+	int flush_enqueued = 0;
+
+	skdev = (struct skd_device *)ptr;
+	spin_lock(&skdev->lock);
+
+	for (;; ) {
+		intstat = SKD_READL(skdev, FIT_INT_STATUS_HOST);
+
+		ack = FIT_INT_DEF_MASK;
+		ack &= intstat;
+
+		pr_debug("%s:%s:%d intstat=0x%x ack=0x%x\n",
+			 skdev->name, __func__, __LINE__, intstat, ack);
+
+		/* As long as there is an int pending on device, keep
+		 * running loop.  When none, get out, but if we've never
+		 * done any processing, call completion handler?
+		 */
+		if (ack == 0) {
+			/* No interrupts on device, but run the completion
+			 * processor anyway?
+			 */
+			if (rc == IRQ_NONE)
+				if (likely (skdev->state
+					== SKD_DRVR_STATE_ONLINE))
+					deferred = 1;
+			break;
+		}
+
+		rc = IRQ_HANDLED;
+
+		SKD_WRITEL(skdev, ack, FIT_INT_STATUS_HOST);
+
+		if (likely((skdev->state != SKD_DRVR_STATE_LOAD) &&
+			   (skdev->state != SKD_DRVR_STATE_STOPPING))) {
+			if (intstat & FIT_ISH_COMPLETION_POSTED) {
+				/*
+				 * If we have already deferred completion
+				 * processing, don't bother running it again
+				 */
+				if (deferred == 0)
+					deferred =
+						skd_isr_completion_posted(skdev,
+						skd_isr_comp_limit, &flush_enqueued);
+			}
+
+			if (intstat & FIT_ISH_FW_STATE_CHANGE) {
+				skd_isr_fwstate(skdev);
+				if (skdev->state == SKD_DRVR_STATE_FAULT ||
+				    skdev->state ==
+				    SKD_DRVR_STATE_DISAPPEARED) {
+					spin_unlock(&skdev->lock);
+					return rc;
+				}
+			}
+
+			if (intstat & FIT_ISH_MSG_FROM_DEV)
+				skd_isr_msg_from_dev(skdev);
+		}
+	}
+
+	if (unlikely(flush_enqueued))
+		skd_request_fn(skdev->queue);
+
+	if (deferred)
+		schedule_work(&skdev->completion_worker);
+	else if (!flush_enqueued)
+		skd_request_fn(skdev->queue);
+
+	spin_unlock(&skdev->lock);
+
+	return rc;
+}
+
+static void skd_drive_fault(struct skd_device *skdev)
+{
+	skdev->state = SKD_DRVR_STATE_FAULT;
+	pr_err("(%s): Drive FAULT\n", skd_name(skdev));
+}
+
+static void skd_drive_disappeared(struct skd_device *skdev)
+{
+	skdev->state = SKD_DRVR_STATE_DISAPPEARED;
+	pr_err("(%s): Drive DISAPPEARED\n", skd_name(skdev));
+}
+
+static void skd_isr_fwstate(struct skd_device *skdev)
+{
+	u32 sense;
+	u32 state;
+	u32 mtd;
+	int prev_driver_state = skdev->state;
+
+	sense = SKD_READL(skdev, FIT_STATUS);
+	state = sense & FIT_SR_DRIVE_STATE_MASK;
+
+	pr_err("(%s): s1120 state %s(%d)=>%s(%d)\n",
+	       skd_name(skdev),
+	       skd_drive_state_to_str(skdev->drive_state), skdev->drive_state,
+	       skd_drive_state_to_str(state), state);
+
+	skdev->drive_state = state;
+
+	switch (skdev->drive_state) {
+	case FIT_SR_DRIVE_INIT:
+		if (skdev->state == SKD_DRVR_STATE_PROTOCOL_MISMATCH) {
+			skd_disable_interrupts(skdev);
+			break;
+		}
+		if (skdev->state == SKD_DRVR_STATE_RESTARTING)
+			skd_recover_requests(skdev, 0);
+		if (skdev->state == SKD_DRVR_STATE_WAIT_BOOT) {
+			skdev->timer_countdown = SKD_STARTING_TIMO;
+			skdev->state = SKD_DRVR_STATE_STARTING;
+			skd_soft_reset(skdev);
+			break;
+		}
+		mtd = FIT_MXD_CONS(FIT_MTD_FITFW_INIT, 0, 0);
+		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+		skdev->last_mtd = mtd;
+		break;
+
+	case FIT_SR_DRIVE_ONLINE:
+		skdev->cur_max_queue_depth = skd_max_queue_depth;
+		if (skdev->cur_max_queue_depth > skdev->dev_max_queue_depth)
+			skdev->cur_max_queue_depth = skdev->dev_max_queue_depth;
+
+		skdev->queue_low_water_mark =
+			skdev->cur_max_queue_depth * 2 / 3 + 1;
+		if (skdev->queue_low_water_mark < 1)
+			skdev->queue_low_water_mark = 1;
+		pr_info(
+		       "(%s): Queue depth limit=%d dev=%d lowat=%d\n",
+		       skd_name(skdev),
+		       skdev->cur_max_queue_depth,
+		       skdev->dev_max_queue_depth, skdev->queue_low_water_mark);
+
+		skd_refresh_device_data(skdev);
+		break;
+
+	case FIT_SR_DRIVE_BUSY:
+		skdev->state = SKD_DRVR_STATE_BUSY;
+		skdev->timer_countdown = SKD_BUSY_TIMO;
+		skd_quiesce_dev(skdev);
+		break;
+	case FIT_SR_DRIVE_BUSY_SANITIZE:
+		/* set timer for 3 seconds, we'll abort any unfinished
+		 * commands after that expires
+		 */
+		skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE;
+		skdev->timer_countdown = SKD_TIMER_SECONDS(3);
+		blk_start_queue(skdev->queue);
+		break;
+	case FIT_SR_DRIVE_BUSY_ERASE:
+		skdev->state = SKD_DRVR_STATE_BUSY_ERASE;
+		skdev->timer_countdown = SKD_BUSY_TIMO;
+		break;
+	case FIT_SR_DRIVE_OFFLINE:
+		skdev->state = SKD_DRVR_STATE_IDLE;
+		break;
+	case FIT_SR_DRIVE_SOFT_RESET:
+		switch (skdev->state) {
+		case SKD_DRVR_STATE_STARTING:
+		case SKD_DRVR_STATE_RESTARTING:
+			/* Expected by a caller of skd_soft_reset() */
+			break;
+		default:
+			skdev->state = SKD_DRVR_STATE_RESTARTING;
+			break;
+		}
+		break;
+	case FIT_SR_DRIVE_FW_BOOTING:
+		pr_debug("%s:%s:%d ISR FIT_SR_DRIVE_FW_BOOTING %s\n",
+			 skdev->name, __func__, __LINE__, skdev->name);
+		skdev->state = SKD_DRVR_STATE_WAIT_BOOT;
+		skdev->timer_countdown = SKD_WAIT_BOOT_TIMO;
+		break;
+
+	case FIT_SR_DRIVE_DEGRADED:
+	case FIT_SR_PCIE_LINK_DOWN:
+	case FIT_SR_DRIVE_NEED_FW_DOWNLOAD:
+		break;
+
+	case FIT_SR_DRIVE_FAULT:
+		skd_drive_fault(skdev);
+		skd_recover_requests(skdev, 0);
+		blk_start_queue(skdev->queue);
+		break;
+
+	/* PCIe bus returned all Fs? */
+	case 0xFF:
+		pr_info("(%s): state=0x%x sense=0x%x\n",
+		       skd_name(skdev), state, sense);
+		skd_drive_disappeared(skdev);
+		skd_recover_requests(skdev, 0);
+		blk_start_queue(skdev->queue);
+		break;
+	default:
+		/*
+		 * Uknown FW State. Wait for a state we recognize.
+		 */
+		break;
+	}
+	pr_err("(%s): Driver state %s(%d)=>%s(%d)\n",
+	       skd_name(skdev),
+	       skd_skdev_state_to_str(prev_driver_state), prev_driver_state,
+	       skd_skdev_state_to_str(skdev->state), skdev->state);
+}
+
+static void skd_recover_requests(struct skd_device *skdev, int requeue)
+{
+	int i;
+
+	for (i = 0; i < skdev->num_req_context; i++) {
+		struct skd_request_context *skreq = &skdev->skreq_table[i];
+
+		if (skreq->state == SKD_REQ_STATE_BUSY) {
+			skd_log_skreq(skdev, skreq, "recover");
+
+			SKD_ASSERT((skreq->id & SKD_ID_INCR) != 0);
+			SKD_ASSERT(skreq->req != NULL);
+
+			/* Release DMA resources for the request. */
+			if (skreq->n_sg > 0)
+				skd_postop_sg_list(skdev, skreq);
+
+			if (requeue &&
+			    (unsigned long) ++skreq->req->special <
+			    SKD_MAX_RETRIES)
+				blk_requeue_request(skdev->queue, skreq->req);
+			else
+				skd_end_request(skdev, skreq, -EIO);
+
+			skreq->req = NULL;
+
+			skreq->state = SKD_REQ_STATE_IDLE;
+			skreq->id += SKD_ID_INCR;
+		}
+		if (i > 0)
+			skreq[-1].next = skreq;
+		skreq->next = NULL;
+	}
+	skdev->skreq_free_list = skdev->skreq_table;
+
+	for (i = 0; i < skdev->num_fitmsg_context; i++) {
+		struct skd_fitmsg_context *skmsg = &skdev->skmsg_table[i];
+
+		if (skmsg->state == SKD_MSG_STATE_BUSY) {
+			skd_log_skmsg(skdev, skmsg, "salvaged");
+			SKD_ASSERT((skmsg->id & SKD_ID_INCR) != 0);
+			skmsg->state = SKD_MSG_STATE_IDLE;
+			skmsg->id += SKD_ID_INCR;
+		}
+		if (i > 0)
+			skmsg[-1].next = skmsg;
+		skmsg->next = NULL;
+	}
+	skdev->skmsg_free_list = skdev->skmsg_table;
+
+	for (i = 0; i < skdev->n_special; i++) {
+		struct skd_special_context *skspcl = &skdev->skspcl_table[i];
+
+		/* If orphaned, reclaim it because it has already been reported
+		 * to the process as an error (it was just waiting for
+		 * a completion that didn't come, and now it will never come)
+		 * If busy, change to a state that will cause it to error
+		 * out in the wait routine and let it do the normal
+		 * reporting and reclaiming
+		 */
+		if (skspcl->req.state == SKD_REQ_STATE_BUSY) {
+			if (skspcl->orphaned) {
+				pr_debug("%s:%s:%d orphaned %p\n",
+					 skdev->name, __func__, __LINE__,
+					 skspcl);
+				skd_release_special(skdev, skspcl);
+			} else {
+				pr_debug("%s:%s:%d not orphaned %p\n",
+					 skdev->name, __func__, __LINE__,
+					 skspcl);
+				skspcl->req.state = SKD_REQ_STATE_ABORTED;
+			}
+		}
+	}
+	skdev->skspcl_free_list = skdev->skspcl_table;
+
+	for (i = 0; i < SKD_N_TIMEOUT_SLOT; i++)
+		skdev->timeout_slot[i] = 0;
+
+	skdev->in_flight = 0;
+}
+
+static void skd_isr_msg_from_dev(struct skd_device *skdev)
+{
+	u32 mfd;
+	u32 mtd;
+	u32 data;
+
+	mfd = SKD_READL(skdev, FIT_MSG_FROM_DEVICE);
+
+	pr_debug("%s:%s:%d mfd=0x%x last_mtd=0x%x\n",
+		 skdev->name, __func__, __LINE__, mfd, skdev->last_mtd);
+
+	/* ignore any mtd that is an ack for something we didn't send */
+	if (FIT_MXD_TYPE(mfd) != FIT_MXD_TYPE(skdev->last_mtd))
+		return;
+
+	switch (FIT_MXD_TYPE(mfd)) {
+	case FIT_MTD_FITFW_INIT:
+		skdev->proto_ver = FIT_PROTOCOL_MAJOR_VER(mfd);
+
+		if (skdev->proto_ver != FIT_PROTOCOL_VERSION_1) {
+			pr_err("(%s): protocol mismatch\n",
+			       skdev->name);
+			pr_err("(%s):   got=%d support=%d\n",
+			       skdev->name, skdev->proto_ver,
+			       FIT_PROTOCOL_VERSION_1);
+			pr_err("(%s):   please upgrade driver\n",
+			       skdev->name);
+			skdev->state = SKD_DRVR_STATE_PROTOCOL_MISMATCH;
+			skd_soft_reset(skdev);
+			break;
+		}
+		mtd = FIT_MXD_CONS(FIT_MTD_GET_CMDQ_DEPTH, 0, 0);
+		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+		skdev->last_mtd = mtd;
+		break;
+
+	case FIT_MTD_GET_CMDQ_DEPTH:
+		skdev->dev_max_queue_depth = FIT_MXD_DATA(mfd);
+		mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_DEPTH, 0,
+				   SKD_N_COMPLETION_ENTRY);
+		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+		skdev->last_mtd = mtd;
+		break;
+
+	case FIT_MTD_SET_COMPQ_DEPTH:
+		SKD_WRITEQ(skdev, skdev->cq_dma_address, FIT_MSG_TO_DEVICE_ARG);
+		mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_ADDR, 0, 0);
+		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+		skdev->last_mtd = mtd;
+		break;
+
+	case FIT_MTD_SET_COMPQ_ADDR:
+		skd_reset_skcomp(skdev);
+		mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_HOST_ID, 0, skdev->devno);
+		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+		skdev->last_mtd = mtd;
+		break;
+
+	case FIT_MTD_CMD_LOG_HOST_ID:
+		skdev->connect_time_stamp = get_seconds();
+		data = skdev->connect_time_stamp & 0xFFFF;
+		mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data);
+		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+		skdev->last_mtd = mtd;
+		break;
+
+	case FIT_MTD_CMD_LOG_TIME_STAMP_LO:
+		skdev->drive_jiffies = FIT_MXD_DATA(mfd);
+		data = (skdev->connect_time_stamp >> 16) & 0xFFFF;
+		mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_HI, 0, data);
+		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+		skdev->last_mtd = mtd;
+		break;
+
+	case FIT_MTD_CMD_LOG_TIME_STAMP_HI:
+		skdev->drive_jiffies |= (FIT_MXD_DATA(mfd) << 16);
+		mtd = FIT_MXD_CONS(FIT_MTD_ARM_QUEUE, 0, 0);
+		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+		skdev->last_mtd = mtd;
+
+		pr_err("(%s): Time sync driver=0x%x device=0x%x\n",
+		       skd_name(skdev),
+		       skdev->connect_time_stamp, skdev->drive_jiffies);
+		break;
+
+	case FIT_MTD_ARM_QUEUE:
+		skdev->last_mtd = 0;
+		/*
+		 * State should be, or soon will be, FIT_SR_DRIVE_ONLINE.
+		 */
+		break;
+
+	default:
+		break;
+	}
+}
+
+static void skd_disable_interrupts(struct skd_device *skdev)
+{
+	u32 sense;
+
+	sense = SKD_READL(skdev, FIT_CONTROL);
+	sense &= ~FIT_CR_ENABLE_INTERRUPTS;
+	SKD_WRITEL(skdev, sense, FIT_CONTROL);
+	pr_debug("%s:%s:%d sense 0x%x\n",
+		 skdev->name, __func__, __LINE__, sense);
+
+	/* Mask every interrupt source: all 1s are written, since
+	 * a 1-bit means disable, a 0 means enable.
+	 */
+	SKD_WRITEL(skdev, ~0, FIT_INT_MASK_HOST);
+}
+
+static void skd_enable_interrupts(struct skd_device *skdev)
+{
+	u32 val;
+
+	/* unmask interrupts first */
+	val = FIT_ISH_FW_STATE_CHANGE +
+	      FIT_ISH_COMPLETION_POSTED + FIT_ISH_MSG_FROM_DEV;
+
+	/* Note that the complement of the mask is written. A 1-bit means
+	 * disable, a 0 means enable. */
+	SKD_WRITEL(skdev, ~val, FIT_INT_MASK_HOST);
+	pr_debug("%s:%s:%d interrupt mask=0x%x\n",
+		 skdev->name, __func__, __LINE__, ~val);
+
+	val = SKD_READL(skdev, FIT_CONTROL);
+	val |= FIT_CR_ENABLE_INTERRUPTS;
+	pr_debug("%s:%s:%d control=0x%x\n",
+		 skdev->name, __func__, __LINE__, val);
+	SKD_WRITEL(skdev, val, FIT_CONTROL);
+}
+
+/*
+ *****************************************************************************
+ * START, STOP, RESTART, QUIESCE, UNQUIESCE
+ *****************************************************************************
+ */
+
+static void skd_soft_reset(struct skd_device *skdev)
+{
+	u32 val;
+
+	val = SKD_READL(skdev, FIT_CONTROL);
+	val |= (FIT_CR_SOFT_RESET);
+	pr_debug("%s:%s:%d control=0x%x\n",
+		 skdev->name, __func__, __LINE__, val);
+	SKD_WRITEL(skdev, val, FIT_CONTROL);
+}
+
+/*
+ * Begin bringing the device up.  Runs entirely under skdev->lock:
+ * acknowledges stale interrupts, samples the drive state from
+ * FIT_STATUS, puts the driver in SKD_DRVR_STATE_STARTING with the
+ * SKD_STARTING_TIMO countdown, enables interrupts, and then adjusts the
+ * driver state/countdown (or issues a soft reset) according to the
+ * sampled drive state.  It finishes by reading several registers purely
+ * for debug logging.
+ */
+static void skd_start_device(struct skd_device *skdev)
+{
+	unsigned long flags;
+	u32 sense;
+	u32 state;
+
+	spin_lock_irqsave(&skdev->lock, flags);
+
+	/* ack all ghost interrupts */
+	SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST);
+
+	sense = SKD_READL(skdev, FIT_STATUS);
+
+	pr_debug("%s:%s:%d initial status=0x%x\n",
+		 skdev->name, __func__, __LINE__, sense);
+
+	/* Keep only the drive-state field of the status register. */
+	state = sense & FIT_SR_DRIVE_STATE_MASK;
+	skdev->drive_state = state;
+	skdev->last_mtd = 0;
+
+	/* Default outcome; individual cases below may override it. */
+	skdev->state = SKD_DRVR_STATE_STARTING;
+	skdev->timer_countdown = SKD_STARTING_TIMO;
+
+	skd_enable_interrupts(skdev);
+
+	switch (skdev->drive_state) {
+	case FIT_SR_DRIVE_OFFLINE:
+		pr_err("(%s): Drive offline...\n", skd_name(skdev));
+		break;
+
+	case FIT_SR_DRIVE_FW_BOOTING:
+		pr_debug("%s:%s:%d FIT_SR_DRIVE_FW_BOOTING %s\n",
+			 skdev->name, __func__, __LINE__, skdev->name);
+		skdev->state = SKD_DRVR_STATE_WAIT_BOOT;
+		skdev->timer_countdown = SKD_WAIT_BOOT_TIMO;
+		break;
+
+	case FIT_SR_DRIVE_BUSY_SANITIZE:
+		pr_info("(%s): Start: BUSY_SANITIZE\n",
+		       skd_name(skdev));
+		skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE;
+		skdev->timer_countdown = SKD_STARTED_BUSY_TIMO;
+		break;
+
+	case FIT_SR_DRIVE_BUSY_ERASE:
+		pr_info("(%s): Start: BUSY_ERASE\n", skd_name(skdev));
+		skdev->state = SKD_DRVR_STATE_BUSY_ERASE;
+		skdev->timer_countdown = SKD_STARTED_BUSY_TIMO;
+		break;
+
+	case FIT_SR_DRIVE_INIT:
+	case FIT_SR_DRIVE_ONLINE:
+		/* Driver stays in STARTING; only a soft reset is issued. */
+		skd_soft_reset(skdev);
+		break;
+
+	case FIT_SR_DRIVE_BUSY:
+		pr_err("(%s): Drive Busy...\n", skd_name(skdev));
+		skdev->state = SKD_DRVR_STATE_BUSY;
+		skdev->timer_countdown = SKD_STARTED_BUSY_TIMO;
+		break;
+
+	case FIT_SR_DRIVE_SOFT_RESET:
+		pr_err("(%s) drive soft reset in prog\n",
+		       skd_name(skdev));
+		break;
+
+	case FIT_SR_DRIVE_FAULT:
+		/* Fault state is bad...soft reset won't do it...
+		 * Hard reset, maybe, but does it work on device?
+		 * For now, just fault so the system doesn't hang.
+		 */
+		skd_drive_fault(skdev);
+		/*start the queue so we can respond with error to requests */
+		pr_debug("%s:%s:%d starting %s queue\n",
+			 skdev->name, __func__, __LINE__, skdev->name);
+		blk_start_queue(skdev->queue);
+		/* -1 marks the disk as failed for whoever sleeps on waitq. */
+		skdev->gendisk_on = -1;
+		wake_up_interruptible(&skdev->waitq);
+		break;
+
+	case 0xFF:
+		/* Most likely the device isn't there or isn't responding
+		 * to the BAR1 addresses. */
+		skd_drive_disappeared(skdev);
+		/*start the queue so we can respond with error to requests */
+		pr_debug("%s:%s:%d starting %s queue to error-out reqs\n",
+			 skdev->name, __func__, __LINE__, skdev->name);
+		blk_start_queue(skdev->queue);
+		skdev->gendisk_on = -1;
+		wake_up_interruptible(&skdev->waitq);
+		break;
+
+	default:
+		pr_err("(%s) Start: unknown state %x\n",
+		       skd_name(skdev), skdev->drive_state);
+		break;
+	}
+
+	/* The reads below feed only the pr_debug() calls; 'state' is
+	 * reused as scratch and no longer holds the drive state.
+	 */
+	state = SKD_READL(skdev, FIT_CONTROL);
+	pr_debug("%s:%s:%d FIT Control Status=0x%x\n",
+		 skdev->name, __func__, __LINE__, state);
+
+	state = SKD_READL(skdev, FIT_INT_STATUS_HOST);
+	pr_debug("%s:%s:%d Intr Status=0x%x\n",
+		 skdev->name, __func__, __LINE__, state);
+
+	state = SKD_READL(skdev, FIT_INT_MASK_HOST);
+	pr_debug("%s:%s:%d Intr Mask=0x%x\n",
+		 skdev->name, __func__, __LINE__, state);
+
+	state = SKD_READL(skdev, FIT_MSG_FROM_DEVICE);
+	pr_debug("%s:%s:%d Msg from Dev=0x%x\n",
+		 skdev->name, __func__, __LINE__, state);
+
+	state = SKD_READL(skdev, FIT_HW_VERSION);
+	pr_debug("%s:%s:%d HW version=0x%x\n",
+		 skdev->name, __func__, __LINE__, state);
+
+	spin_unlock_irqrestore(&skdev->lock, flags);
+}
+
+/*
+ * Shut the device down.  If the driver is ONLINE and the internal
+ * special request is idle, a SYNCHRONIZE_CACHE is sent first and up to
+ * 10 seconds are allowed for skdev->sync_done to become non-zero.
+ * In every case the driver then moves to SKD_DRVR_STATE_STOPPING, the
+ * timer is killed, interrupts are disabled and acknowledged, a soft
+ * reset is written, and FIT_STATUS is polled for FIT_SR_DRIVE_INIT.
+ */
+static void skd_stop_device(struct skd_device *skdev)
+{
+	unsigned long flags;
+	struct skd_special_context *skspcl = &skdev->internal_skspcl;
+	u32 dev_state;
+	int i;
+
+	spin_lock_irqsave(&skdev->lock, flags);
+
+	/* Without an online device there is nothing to sync; skip ahead. */
+	if (skdev->state != SKD_DRVR_STATE_ONLINE) {
+		pr_err("(%s): skd_stop_device not online no sync\n",
+		       skd_name(skdev));
+		goto stop_out;
+	}
+
+	/* The internal special request is in use; cannot issue the sync. */
+	if (skspcl->req.state != SKD_REQ_STATE_IDLE) {
+		pr_err("(%s): skd_stop_device no special\n",
+		       skd_name(skdev));
+		goto stop_out;
+	}
+
+	skdev->state = SKD_DRVR_STATE_SYNCING;
+	skdev->sync_done = 0;
+
+	skd_send_internal_skspcl(skdev, skspcl, SYNCHRONIZE_CACHE);
+
+	/* Drop the lock while sleeping for the sync to complete. */
+	spin_unlock_irqrestore(&skdev->lock, flags);
+
+	/* The wait's return value is not examined; the outcome is read
+	 * from sync_done below.
+	 */
+	wait_event_interruptible_timeout(skdev->waitq,
+					 (skdev->sync_done), (10 * HZ));
+
+	spin_lock_irqsave(&skdev->lock, flags);
+
+	switch (skdev->sync_done) {
+	case 0:
+		pr_err("(%s): skd_stop_device no sync\n",
+		       skd_name(skdev));
+		break;
+	case 1:
+		pr_err("(%s): skd_stop_device sync done\n",
+		       skd_name(skdev));
+		break;
+	default:
+		pr_err("(%s): skd_stop_device sync error\n",
+		       skd_name(skdev));
+	}
+
+stop_out:
+	/* Reached with skdev->lock held on every path. */
+	skdev->state = SKD_DRVR_STATE_STOPPING;
+	spin_unlock_irqrestore(&skdev->lock, flags);
+
+	/* Called with the lock released. */
+	skd_kill_timer(skdev);
+
+	spin_lock_irqsave(&skdev->lock, flags);
+	skd_disable_interrupts(skdev);
+
+	/* ensure all ints on device are cleared */
+	/* soft reset the device to unload with a clean slate */
+	SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST);
+	SKD_WRITEL(skdev, FIT_CR_SOFT_RESET, FIT_CONTROL);
+
+	spin_unlock_irqrestore(&skdev->lock, flags);
+
+	/* poll every 100ms, 1 second timeout */
+	for (i = 0; i < 10; i++) {
+		dev_state =
+			SKD_READL(skdev, FIT_STATUS) & FIT_SR_DRIVE_STATE_MASK;
+		if (dev_state == FIT_SR_DRIVE_INIT)
+			break;
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(msecs_to_jiffies(100));
+	}
+
+	/* Only reported; there is no further recovery on this path. */
+	if (dev_state != FIT_SR_DRIVE_INIT)
+		pr_err("(%s): skd_stop_device state error 0x%02x\n",
+		       skd_name(skdev), dev_state);
+}
+
+/*
+ * Restart the device with a soft reset.  The caller is expected to hold
+ * the device spinlock.  Stale interrupts are acknowledged, the current
+ * drive state is recorded, and the driver enters
+ * SKD_DRVR_STATE_RESTARTING with the SKD_RESTARTING_TIMO countdown
+ * before the reset is issued.
+ */
+static void skd_restart_device(struct skd_device *skdev)
+{
+	u32 status;
+
+	/* ack all ghost interrupts */
+	SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST);
+
+	status = SKD_READL(skdev, FIT_STATUS);
+	pr_debug("%s:%s:%d drive status=0x%x\n",
+		 skdev->name, __func__, __LINE__, status);
+
+	skdev->drive_state = status & FIT_SR_DRIVE_STATE_MASK;
+	skdev->last_mtd = 0;
+
+	skdev->state = SKD_DRVR_STATE_RESTARTING;
+	skdev->timer_countdown = SKD_RESTARTING_TIMO;
+
+	skd_soft_reset(skdev);
+}
+
+/*
+ * Stop the block queue while the driver is in one of the BUSY states.
+ * The caller is expected to hold the device spinlock.
+ *
+ * Returns 0 when the queue was stopped, or -EINVAL for every other
+ * driver state (quiescing is not implemented for those).
+ */
+static int skd_quiesce_dev(struct skd_device *skdev)
+{
+	if (skdev->state == SKD_DRVR_STATE_BUSY ||
+	    skdev->state == SKD_DRVR_STATE_BUSY_IMMINENT) {
+		pr_debug("%s:%s:%d stopping %s queue\n",
+			 skdev->name, __func__, __LINE__, skdev->name);
+		blk_stop_queue(skdev->queue);
+		return 0;
+	}
+
+	/* ONLINE, STOPPING, SYNCING, PAUSING, PAUSED, STARTING,
+	 * RESTARTING, RESUMING and anything else end up here.
+	 */
+	pr_debug("%s:%s:%d state [%d] not implemented\n",
+		 skdev->name, __func__, __LINE__, skdev->state);
+	return -EINVAL;
+}
+
+/*
+ * Bring the driver back to SKD_DRVR_STATE_ONLINE and restart the block
+ * queue once the drive itself reports FIT_SR_DRIVE_ONLINE.
+ * Assumes the device spinlock is held by the caller.
+ *
+ * Returns 0 if the driver is (or already was) online, or was parked in
+ * SKD_DRVR_STATE_BUSY because the drive is not online yet; returns
+ * -EBUSY for driver states this transition is not implemented for.
+ */
+static int skd_unquiesce_dev(struct skd_device *skdev)
+{
+	/* Remembered only for the state-transition log message below. */
+	int prev_driver_state = skdev->state;
+
+	skd_log_skdev(skdev, "unquiesce");
+	if (skdev->state == SKD_DRVR_STATE_ONLINE) {
+		pr_debug("%s:%s:%d **** device already ONLINE\n",
+			 skdev->name, __func__, __LINE__);
+		return 0;
+	}
+	if (skdev->drive_state != FIT_SR_DRIVE_ONLINE) {
+		/*
+		 * If there has been a state change to other than
+		 * ONLINE, we will rely on controller state change
+		 * to come back online and restart the queue.
+		 * The BUSY state means that driver is ready to
+		 * continue normal processing but waiting for controller
+		 * to become available.
+		 */
+		skdev->state = SKD_DRVR_STATE_BUSY;
+		pr_debug("%s:%s:%d drive BUSY state\n",
+			 skdev->name, __func__, __LINE__);
+		return 0;
+	}
+
+	/*
+	 * Drive has just come online, driver is either in startup,
+	 * paused performing a task, or busy waiting for hardware.
+	 */
+	switch (skdev->state) {
+	case SKD_DRVR_STATE_PAUSED:
+	case SKD_DRVR_STATE_BUSY:
+	case SKD_DRVR_STATE_BUSY_IMMINENT:
+	case SKD_DRVR_STATE_BUSY_ERASE:
+	case SKD_DRVR_STATE_STARTING:
+	case SKD_DRVR_STATE_RESTARTING:
+	case SKD_DRVR_STATE_FAULT:
+	case SKD_DRVR_STATE_IDLE:
+	case SKD_DRVR_STATE_LOAD:
+		skdev->state = SKD_DRVR_STATE_ONLINE;
+		pr_err("(%s): Driver state %s(%d)=>%s(%d)\n",
+		       skd_name(skdev),
+		       skd_skdev_state_to_str(prev_driver_state),
+		       prev_driver_state, skd_skdev_state_to_str(skdev->state),
+		       skdev->state);
+		pr_debug("%s:%s:%d **** device ONLINE...starting block queue\n",
+			 skdev->name, __func__, __LINE__);
+		pr_debug("%s:%s:%d starting %s queue\n",
+			 skdev->name, __func__, __LINE__, skdev->name);
+		pr_info("(%s): STEC s1120 ONLINE\n", skd_name(skdev));
+		blk_start_queue(skdev->queue);
+		/* Mark the disk usable and wake anyone sleeping on waitq. */
+		skdev->gendisk_on = 1;
+		wake_up_interruptible(&skdev->waitq);
+		break;
+
+	case SKD_DRVR_STATE_DISAPPEARED:
+	default:
+		pr_debug("%s:%s:%d **** driver state %d, not implemented \n",
+			 skdev->name, __func__, __LINE__,
+			 skdev->state);
+		return -EBUSY;
+	}
+	return 0;
+}
+
+/*
+ *****************************************************************************
+ * PCIe MSI/MSI-X INTERRUPT HANDLERS
+ *****************************************************************************
+ */
+
+/*
+ * Handler for MSI-X vectors the driver does not expect to fire: log the
+ * interrupt status, acknowledge the reserved sources and report the
+ * interrupt as handled.
+ */
+static irqreturn_t skd_reserved_isr(int irq, void *skd_host_data)
+{
+	unsigned long irqflags;
+	struct skd_device *skdev = skd_host_data;
+
+	spin_lock_irqsave(&skdev->lock, irqflags);
+
+	pr_debug("%s:%s:%d MSIX = 0x%x\n",
+		 skdev->name, __func__, __LINE__,
+		 SKD_READL(skdev, FIT_INT_STATUS_HOST));
+	pr_err("(%s): MSIX reserved irq %d = 0x%x\n", skd_name(skdev),
+	       irq, SKD_READL(skdev, FIT_INT_STATUS_HOST));
+
+	/* Acknowledge by writing the reserved bits back to the status. */
+	SKD_WRITEL(skdev, FIT_INT_RESERVED_MASK, FIT_INT_STATUS_HOST);
+
+	spin_unlock_irqrestore(&skdev->lock, irqflags);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * MSI-X handler for firmware state changes: acknowledge the source and
+ * let skd_isr_fwstate() process the new state, all under skdev->lock.
+ */
+static irqreturn_t skd_statec_isr(int irq, void *skd_host_data)
+{
+	unsigned long irqflags;
+	struct skd_device *skdev = skd_host_data;
+
+	spin_lock_irqsave(&skdev->lock, irqflags);
+
+	pr_debug("%s:%s:%d MSIX = 0x%x\n",
+		 skdev->name, __func__, __LINE__,
+		 SKD_READL(skdev, FIT_INT_STATUS_HOST));
+
+	SKD_WRITEL(skdev, FIT_ISH_FW_STATE_CHANGE, FIT_INT_STATUS_HOST);
+	skd_isr_fwstate(skdev);
+
+	spin_unlock_irqrestore(&skdev->lock, irqflags);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * MSI-X handler for posted completions.  Acknowledges the source and
+ * processes completions up to skd_isr_comp_limit.  The request function
+ * is run when a flush was enqueued or when no work was deferred; any
+ * deferred work is handed to the completion worker.
+ */
+static irqreturn_t skd_comp_q(int irq, void *skd_host_data)
+{
+	unsigned long irqflags;
+	struct skd_device *skdev = skd_host_data;
+	int flush_enqueued = 0;
+	int more_pending;
+
+	spin_lock_irqsave(&skdev->lock, irqflags);
+
+	pr_debug("%s:%s:%d MSIX = 0x%x\n",
+		 skdev->name, __func__, __LINE__,
+		 SKD_READL(skdev, FIT_INT_STATUS_HOST));
+	SKD_WRITEL(skdev, FIT_ISH_COMPLETION_POSTED, FIT_INT_STATUS_HOST);
+
+	more_pending = skd_isr_completion_posted(skdev, skd_isr_comp_limit,
+						 &flush_enqueued);
+
+	/* Exactly one call to the request function in either case. */
+	if (flush_enqueued || !more_pending)
+		skd_request_fn(skdev->queue);
+
+	if (more_pending)
+		schedule_work(&skdev->completion_worker);
+
+	spin_unlock_irqrestore(&skdev->lock, irqflags);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * MSI-X handler for messages from the device: acknowledge the source
+ * and let skd_isr_msg_from_dev() consume the message, under skdev->lock.
+ */
+static irqreturn_t skd_msg_isr(int irq, void *skd_host_data)
+{
+	unsigned long irqflags;
+	struct skd_device *skdev = skd_host_data;
+
+	spin_lock_irqsave(&skdev->lock, irqflags);
+
+	pr_debug("%s:%s:%d MSIX = 0x%x\n",
+		 skdev->name, __func__, __LINE__,
+		 SKD_READL(skdev, FIT_INT_STATUS_HOST));
+
+	SKD_WRITEL(skdev, FIT_ISH_MSG_FROM_DEV, FIT_INT_STATUS_HOST);
+	skd_isr_msg_from_dev(skdev);
+
+	spin_unlock_irqrestore(&skdev->lock, irqflags);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * MSI-X handler for the queue-full vectors: the interrupt is only
+ * logged (debug) and acknowledged; no further action is taken.
+ */
+static irqreturn_t skd_qfull_isr(int irq, void *skd_host_data)
+{
+	unsigned long irqflags;
+	struct skd_device *skdev = skd_host_data;
+
+	spin_lock_irqsave(&skdev->lock, irqflags);
+
+	pr_debug("%s:%s:%d MSIX = 0x%x\n",
+		 skdev->name, __func__, __LINE__,
+		 SKD_READL(skdev, FIT_INT_STATUS_HOST));
+	SKD_WRITEL(skdev, FIT_INT_QUEUE_FULL, FIT_INT_STATUS_HOST);
+
+	spin_unlock_irqrestore(&skdev->lock, irqflags);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ *****************************************************************************
+ * PCIe MSI/MSI-X SETUP
+ *****************************************************************************
+ */
+
+/* Per-vector bookkeeping for one MSI-X interrupt owned by a device. */
+struct skd_msix_entry {
+	int have_irq;		/* set to 1 once the handler is registered */
+	u32 vector;		/* copied from the matching msix_entry */
+	u32 entry;		/* copied from the matching msix_entry */
+	struct skd_device *rsp;	/* owning device; NULL until registered */
+	char isr_name[30];	/* name passed to devm_request_irq() */
+};
+
+/* Static description of one MSI-X vector: label suffix plus handler. */
+struct skd_init_msix_entry {
+	const char *name;	/* appended to the per-vector ISR name */
+	irq_handler_t handler;	/* handler registered for this vector */
+};
+
+/* Number of MSI-X vectors requested; also the size of msix_entries[]. */
+#define SKD_MAX_MSIX_COUNT              13
+/* NOTE(review): the two constants below are not referenced in the code
+ * visible here -- confirm whether they are still needed.
+ */
+#define SKD_MIN_MSIX_COUNT              7
+#define SKD_BASE_MSIX_IRQ               4
+
+/*
+ * Handler table indexed by MSI-X entry number.  Entry i supplies the
+ * name suffix and the handler registered for vector i.
+ */
+static struct skd_init_msix_entry msix_entries[SKD_MAX_MSIX_COUNT] = {
+	{ "(DMA 0)",	    skd_reserved_isr },
+	{ "(DMA 1)",	    skd_reserved_isr },
+	{ "(DMA 2)",	    skd_reserved_isr },
+	{ "(DMA 3)",	    skd_reserved_isr },
+	{ "(State Change)", skd_statec_isr   },
+	{ "(COMPL_Q)",	    skd_comp_q	     },
+	{ "(MSG)",	    skd_msg_isr	     },
+	{ "(Reserved)",	    skd_reserved_isr },
+	{ "(Reserved)",	    skd_reserved_isr },
+	{ "(Queue Full 0)", skd_qfull_isr    },
+	{ "(Queue Full 1)", skd_qfull_isr    },
+	{ "(Queue Full 2)", skd_qfull_isr    },
+	{ "(Queue Full 3)", skd_qfull_isr    },
+};
+
+/*
+ * Release every MSI-X interrupt that was successfully requested, free
+ * the per-vector table and disable MSI-X on the PCI device.
+ *
+ * Safe to call on a partially set-up device: entries whose IRQ was
+ * never requested (have_irq == 0) are skipped.
+ */
+static void skd_release_msix(struct skd_device *skdev)
+{
+	struct skd_msix_entry *qentry;
+	int i;
+
+	if (skdev->msix_entries) {
+		for (i = 0; i < skdev->msix_count; i++) {
+			qentry = &skdev->msix_entries[i];
+
+			/*
+			 * Do not reload skdev from qentry->rsp here: rsp is
+			 * NULL for entries that were never registered, and
+			 * overwriting skdev with it made the following
+			 * skdev-> accesses dereference a NULL pointer.
+			 */
+			if (qentry->have_irq)
+				devm_free_irq(&skdev->pdev->dev,
+					      qentry->vector, qentry->rsp);
+		}
+
+		kfree(skdev->msix_entries);
+	}
+
+	if (skdev->msix_count)
+		pci_disable_msix(skdev->pdev);
+
+	skdev->msix_count = 0;
+	skdev->msix_entries = NULL;
+}
+
+/*
+ * Enable SKD_MAX_MSIX_COUNT MSI-X vectors and register the handler from
+ * msix_entries[] for each of them.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything
+ * acquired so far is released through skd_release_msix().
+ */
+static int skd_acquire_msix(struct skd_device *skdev)
+{
+	int i, rc;
+	struct pci_dev *pdev = skdev->pdev;
+	struct msix_entry *entries;
+	struct skd_msix_entry *qentry;
+
+	/* Scratch array handed to the PCI core; not kept afterwards. */
+	entries = kzalloc(sizeof(struct msix_entry) * SKD_MAX_MSIX_COUNT,
+			  GFP_KERNEL);
+	if (!entries)
+		return -ENOMEM;
+
+	for (i = 0; i < SKD_MAX_MSIX_COUNT; i++)
+		entries[i].entry = i;
+
+	rc = pci_enable_msix_exact(pdev, entries, SKD_MAX_MSIX_COUNT);
+	if (rc) {
+		pr_err("(%s): failed to enable MSI-X %d\n",
+		       skd_name(skdev), rc);
+		goto msix_out;
+	}
+
+	skdev->msix_count = SKD_MAX_MSIX_COUNT;
+	skdev->msix_entries = kzalloc(sizeof(struct skd_msix_entry) *
+				      skdev->msix_count, GFP_KERNEL);
+	if (!skdev->msix_entries) {
+		rc = -ENOMEM;
+		pr_err("(%s): msix table allocation error\n",
+		       skd_name(skdev));
+		goto msix_out;
+	}
+
+	/* Copy the assigned vectors into the driver's own table. */
+	for (i = 0; i < skdev->msix_count; i++) {
+		qentry = &skdev->msix_entries[i];
+		qentry->vector = entries[i].vector;
+		qentry->entry = entries[i].entry;
+		qentry->rsp = NULL;
+		qentry->have_irq = 0;
+		pr_debug("%s:%s:%d %s: <%s> msix (%d) vec %d, entry %x\n",
+			 skdev->name, __func__, __LINE__,
+			 pci_name(pdev), skdev->name,
+			 i, qentry->vector, qentry->entry);
+	}
+
+	/* Enable MSI-X vectors for the base queue */
+	for (i = 0; i < skdev->msix_count; i++) {
+		qentry = &skdev->msix_entries[i];
+		snprintf(qentry->isr_name, sizeof(qentry->isr_name),
+			 "%s%d-msix %s", DRV_NAME, skdev->devno,
+			 msix_entries[i].name);
+		rc = devm_request_irq(&skdev->pdev->dev, qentry->vector,
+				      msix_entries[i].handler, 0,
+				      qentry->isr_name, skdev);
+		if (rc) {
+			pr_err("(%s): Unable to register(%d) MSI-X "
+			       "handler %d: %s\n",
+			       skd_name(skdev), rc, i, qentry->isr_name);
+			goto msix_out;
+		} else {
+			qentry->have_irq = 1;
+			qentry->rsp = skdev;
+		}
+	}
+	pr_debug("%s:%s:%d %s: <%s> msix %d irq(s) enabled\n",
+		 skdev->name, __func__, __LINE__,
+		 pci_name(pdev), skdev->name, skdev->msix_count);
+
+	/* The vectors now live in skdev->msix_entries; the scratch array
+	 * was previously leaked on this success path.
+	 */
+	kfree(entries);
+	return 0;
+
+msix_out:
+	kfree(entries);
+	skd_release_msix(skdev);
+	return rc;
+}
+
+/*
+ * Acquire the device interrupt using skdev->irq_type, falling back from
+ * MSI-X to MSI to legacy INTx when a mode cannot be set up.  An
+ * unrecognised irq_type is replaced by SKD_IRQ_LEGACY.
+ *
+ * skdev->irq_type is updated to the mode that was finally attempted.
+ * Returns 0 on success or the error from the legacy request.
+ */
+static int skd_acquire_irq(struct skd_device *skdev)
+{
+	int rc;
+	struct pci_dev *pdev;
+
+	pdev = skdev->pdev;
+	skdev->msix_count = 0;
+
+RETRY_IRQ_TYPE:
+	switch (skdev->irq_type) {
+	case SKD_IRQ_MSIX:
+		rc = skd_acquire_msix(skdev);
+		if (!rc)
+			pr_info("(%s): MSI-X %d irqs enabled\n",
+			       skd_name(skdev), skdev->msix_count);
+		else {
+			pr_err(
+			       "(%s): failed to enable MSI-X, re-trying with MSI %d\n",
+			       skd_name(skdev), rc);
+			skdev->irq_type = SKD_IRQ_MSI;
+			goto RETRY_IRQ_TYPE;
+		}
+		break;
+	case SKD_IRQ_MSI:
+		snprintf(skdev->isr_name, sizeof(skdev->isr_name), "%s%d-msi",
+			 DRV_NAME, skdev->devno);
+		rc = pci_enable_msi_range(pdev, 1, 1);
+		if (rc > 0) {
+			rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr, 0,
+					      skdev->isr_name, skdev);
+			if (rc) {
+				pci_disable_msi(pdev);
+				pr_err(
+				       "(%s): failed to allocate the MSI interrupt %d\n",
+				       skd_name(skdev), rc);
+				/* Shares the legacy fallback below. */
+				goto RETRY_IRQ_LEGACY;
+			}
+			pr_info("(%s): MSI irq %d enabled\n",
+			       skd_name(skdev), pdev->irq);
+		} else {
+RETRY_IRQ_LEGACY:
+			pr_err(
+			       "(%s): failed to enable MSI, re-trying with LEGACY %d\n",
+			       skd_name(skdev), rc);
+			skdev->irq_type = SKD_IRQ_LEGACY;
+			goto RETRY_IRQ_TYPE;
+		}
+		break;
+	case SKD_IRQ_LEGACY:
+		snprintf(skdev->isr_name, sizeof(skdev->isr_name),
+			 "%s%d-legacy", DRV_NAME, skdev->devno);
+		rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr,
+				      IRQF_SHARED, skdev->isr_name, skdev);
+		if (!rc)
+			pr_info("(%s): LEGACY irq %d enabled\n",
+			       skd_name(skdev), pdev->irq);
+		else
+			pr_err("(%s): request LEGACY irq error %d\n",
+			       skd_name(skdev), rc);
+		break;
+	default:
+		/* Log the value that is actually applied below; the message
+		 * used to print SKD_IRQ_DEFAULT while LEGACY was set.
+		 */
+		pr_info("(%s): irq_type %d invalid, re-set to %d\n",
+		       skd_name(skdev), skdev->irq_type, SKD_IRQ_LEGACY);
+		skdev->irq_type = SKD_IRQ_LEGACY;
+		goto RETRY_IRQ_TYPE;
+	}
+	return rc;
+}
+
+/*
+ * Release the interrupt resources that match skdev->irq_type: the whole
+ * MSI-X set, the single MSI vector (also disabling MSI), or the legacy
+ * line.  An unknown type is only reported.
+ */
+static void skd_release_irq(struct skd_device *skdev)
+{
+	switch (skdev->irq_type) {
+	case SKD_IRQ_MSIX:
+		skd_release_msix(skdev);
+		break;
+	case SKD_IRQ_MSI:
+		devm_free_irq(&skdev->pdev->dev, skdev->pdev->irq, skdev);
+		pci_disable_msi(skdev->pdev);
+		break;
+	case SKD_IRQ_LEGACY:
+		devm_free_irq(&skdev->pdev->dev, skdev->pdev->irq, skdev);
+		break;
+	default:
+		/* Newline added so the message ends its own log line. */
+		pr_err("(%s): wrong irq type %d!\n",
+		       skd_name(skdev), skdev->irq_type);
+		break;
+	}
+}
+
+/*
+ *****************************************************************************
+ * CONSTRUCT
+ *****************************************************************************
+ */
+
+/*
+ * Allocate one zeroed DMA-coherent region holding the completion table
+ * followed directly by the completion error-info table, and record both
+ * table pointers in skdev.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int skd_cons_skcomp(struct skd_device *skdev)
+{
+	struct fit_completion_entry_v1 *table;
+	u32 total;
+
+	total = sizeof(*table) * SKD_N_COMPLETION_ENTRY;
+	total += sizeof(struct fit_comp_error_info) * SKD_N_COMPLETION_ENTRY;
+
+	pr_debug("%s:%s:%d comp pci_alloc, total bytes %d entries %d\n",
+		 skdev->name, __func__, __LINE__,
+		 total, SKD_N_COMPLETION_ENTRY);
+
+	table = pci_alloc_consistent(skdev->pdev, total,
+				     &skdev->cq_dma_address);
+	if (!table)
+		return -ENOMEM;
+
+	memset(table, 0, total);
+
+	skdev->skcomp_table = table;
+	/* The error-info entries start right after the last completion. */
+	skdev->skerr_table = (struct fit_comp_error_info *)((char *)table +
+							   sizeof(*table) *
+							   SKD_N_COMPLETION_ENTRY);
+
+	return 0;
+}
+
+/*
+ * Allocate the FIT message context table (num_fitmsg_context entries)
+ * and a DMA-coherent message buffer for each entry, then chain the
+ * entries into skdev->skmsg_free_list.
+ *
+ * Returns 0 on success or -ENOMEM.  Nothing is freed here on failure;
+ * whatever was allocated is left in skdev for the caller to release.
+ */
+static int skd_cons_skmsg(struct skd_device *skdev)
+{
+	int rc = 0;
+	u32 i;
+
+	pr_debug("%s:%s:%d skmsg_table kzalloc, struct %lu, count %u total %lu\n",
+		 skdev->name, __func__, __LINE__,
+		 sizeof(struct skd_fitmsg_context),
+		 skdev->num_fitmsg_context,
+		 sizeof(struct skd_fitmsg_context) * skdev->num_fitmsg_context);
+
+	skdev->skmsg_table = kzalloc(sizeof(struct skd_fitmsg_context)
+				     *skdev->num_fitmsg_context, GFP_KERNEL);
+	if (skdev->skmsg_table == NULL) {
+		rc = -ENOMEM;
+		goto err_out;
+	}
+
+	for (i = 0; i < skdev->num_fitmsg_context; i++) {
+		struct skd_fitmsg_context *skmsg;
+
+		skmsg = &skdev->skmsg_table[i];
+
+		skmsg->id = i + SKD_ID_FIT_MSG;
+
+		skmsg->state = SKD_MSG_STATE_IDLE;
+		/* 64 extra bytes are requested on top of the message size,
+		 * presumably as slack for the round-up below -- confirm
+		 * against FIT_QCMD_BASE_ADDRESS_MASK.
+		 */
+		skmsg->msg_buf = pci_alloc_consistent(skdev->pdev,
+						      SKD_N_FITMSG_BYTES + 64,
+						      &skmsg->mb_dma_address);
+
+		if (skmsg->msg_buf == NULL) {
+			rc = -ENOMEM;
+			goto err_out;
+		}
+
+		/* Remember the bits of the returned address that lie
+		 * outside the mask, then round both the CPU pointer and
+		 * the DMA address up to the next masked boundary.
+		 */
+		skmsg->offset = (u32)((u64)skmsg->msg_buf &
+				      (~FIT_QCMD_BASE_ADDRESS_MASK));
+		skmsg->msg_buf += ~FIT_QCMD_BASE_ADDRESS_MASK;
+		skmsg->msg_buf = (u8 *)((u64)skmsg->msg_buf &
+				       FIT_QCMD_BASE_ADDRESS_MASK);
+		skmsg->mb_dma_address += ~FIT_QCMD_BASE_ADDRESS_MASK;
+		skmsg->mb_dma_address &= FIT_QCMD_BASE_ADDRESS_MASK;
+		memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES);
+
+		/* Provisional link to the next array slot; the last
+		 * entry is terminated after the loop.
+		 */
+		skmsg->next = &skmsg[1];
+	}
+
+	/* Free list is in order starting with the 0th entry. */
+	skdev->skmsg_table[i - 1].next = NULL;
+	skdev->skmsg_free_list = skdev->skmsg_table;
+
+err_out:
+	return rc;
+}
+
+/*
+ * Allocate a zeroed DMA-coherent array of n_sg scatter-gather
+ * descriptors and pre-link them: each descriptor's next_desc_ptr holds
+ * the DMA address of the following one, and the last holds 0.
+ *
+ * The DMA address of the array is stored through ret_dma_addr.
+ * Returns the CPU pointer to the array, or NULL if allocation failed.
+ */
+static struct fit_sg_descriptor *skd_cons_sg_list(struct skd_device *skdev,
+						  u32 n_sg,
+						  dma_addr_t *ret_dma_addr)
+{
+	struct fit_sg_descriptor *descs;
+	uint64_t base;
+	u32 total = sizeof(*descs) * n_sg;
+	u32 idx;
+
+	descs = pci_alloc_consistent(skdev->pdev, total, ret_dma_addr);
+	if (descs == NULL)
+		return descs;
+
+	base = *ret_dma_addr;
+	memset(descs, 0, total);
+
+	for (idx = 0; idx < n_sg - 1; idx++)
+		descs[idx].next_desc_ptr =
+			base + (idx + 1) * sizeof(struct fit_sg_descriptor);
+
+	/* Terminate the chain at the final descriptor. */
+	descs[idx].next_desc_ptr = 0LL;
+
+	return descs;
+}
+
+/*
+ * Allocate the request context table (num_req_context entries).  Each
+ * entry gets a kernel scatterlist and a device SG descriptor list, both
+ * sized by sgs_per_request, and the entries are chained into
+ * skdev->skreq_free_list.
+ *
+ * Returns 0 on success or -ENOMEM; partial allocations stay in skdev
+ * for the caller to release.
+ */
+static int skd_cons_skreq(struct skd_device *skdev)
+{
+	struct skd_request_context *skreq;
+	u32 idx;
+
+	pr_debug("%s:%s:%d skreq_table kzalloc, struct %lu, count %u total %lu\n",
+		 skdev->name, __func__, __LINE__,
+		 sizeof(struct skd_request_context),
+		 skdev->num_req_context,
+		 sizeof(struct skd_request_context) * skdev->num_req_context);
+
+	skdev->skreq_table = kzalloc(sizeof(struct skd_request_context)
+				     * skdev->num_req_context, GFP_KERNEL);
+	if (!skdev->skreq_table)
+		return -ENOMEM;
+
+	pr_debug("%s:%s:%d alloc sg_table sg_per_req %u scatlist %lu total %lu\n",
+		 skdev->name, __func__, __LINE__,
+		 skdev->sgs_per_request, sizeof(struct scatterlist),
+		 skdev->sgs_per_request * sizeof(struct scatterlist));
+
+	for (idx = 0; idx < skdev->num_req_context; idx++) {
+		skreq = skdev->skreq_table + idx;
+
+		skreq->id = idx + SKD_ID_RW_REQUEST;
+		skreq->state = SKD_REQ_STATE_IDLE;
+
+		skreq->sg = kzalloc(sizeof(struct scatterlist) *
+				    skdev->sgs_per_request, GFP_KERNEL);
+		if (!skreq->sg)
+			return -ENOMEM;
+		sg_init_table(skreq->sg, skdev->sgs_per_request);
+
+		skreq->sksg_list = skd_cons_sg_list(skdev,
+						    skdev->sgs_per_request,
+						    &skreq->sksg_dma_address);
+		if (!skreq->sksg_list)
+			return -ENOMEM;
+
+		/* Provisional link; the tail is fixed up after the loop. */
+		skreq->next = skreq + 1;
+	}
+
+	/* Free list is in order starting with the 0th entry. */
+	skdev->skreq_table[idx - 1].next = NULL;
+	skdev->skreq_free_list = skdev->skreq_table;
+
+	return 0;
+}
+
+/*
+ * Allocate the special (pass-through) context table with n_special
+ * entries.  Each entry gets a DMA message buffer, a kernel scatterlist
+ * and a device SG descriptor list; the entries are chained into
+ * skdev->skspcl_free_list.
+ *
+ * Returns 0 on success or -ENOMEM; partial allocations stay in skdev
+ * for the caller to release.
+ */
+static int skd_cons_skspcl(struct skd_device *skdev)
+{
+	struct skd_special_context *skspcl;
+	u32 idx;
+	u32 msg_bytes;
+
+	pr_debug("%s:%s:%d skspcl_table kzalloc, struct %lu, count %u total %lu\n",
+		 skdev->name, __func__, __LINE__,
+		 sizeof(struct skd_special_context),
+		 skdev->n_special,
+		 sizeof(struct skd_special_context) * skdev->n_special);
+
+	skdev->skspcl_table = kzalloc(sizeof(struct skd_special_context)
+				      * skdev->n_special, GFP_KERNEL);
+	if (!skdev->skspcl_table)
+		return -ENOMEM;
+
+	for (idx = 0; idx < skdev->n_special; idx++) {
+		skspcl = skdev->skspcl_table + idx;
+
+		skspcl->req.id = idx + SKD_ID_SPECIAL_REQUEST;
+		skspcl->req.state = SKD_REQ_STATE_IDLE;
+
+		/* Provisional link; the tail is fixed up after the loop. */
+		skspcl->req.next = &skspcl[1].req;
+
+		msg_bytes = SKD_N_SPECIAL_FITMSG_BYTES;
+		skspcl->msg_buf = pci_alloc_consistent(skdev->pdev, msg_bytes,
+						       &skspcl->mb_dma_address);
+		if (!skspcl->msg_buf)
+			return -ENOMEM;
+
+		memset(skspcl->msg_buf, 0, msg_bytes);
+
+		skspcl->req.sg = kzalloc(sizeof(struct scatterlist) *
+					 SKD_N_SG_PER_SPECIAL, GFP_KERNEL);
+		if (!skspcl->req.sg)
+			return -ENOMEM;
+
+		skspcl->req.sksg_list =
+			skd_cons_sg_list(skdev, SKD_N_SG_PER_SPECIAL,
+					 &skspcl->req.sksg_dma_address);
+		if (!skspcl->req.sksg_list)
+			return -ENOMEM;
+	}
+
+	/* Free list is in order starting with the 0th entry. */
+	skdev->skspcl_table[idx - 1].req.next = NULL;
+	skdev->skspcl_free_list = skdev->skspcl_table;
+
+	return 0;
+}
+
+/*
+ * Set up the driver's internal special request: a DMA data buffer, a
+ * DMA message buffer and a one-entry SG descriptor list, followed by
+ * skd_format_internal_skspcl().
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails, or -EINVAL if
+ * formatting reports failure.  Nothing is freed here on error.
+ */
+static int skd_cons_sksb(struct skd_device *skdev)
+{
+	struct skd_special_context *internal = &skdev->internal_skspcl;
+	u32 len;
+
+	internal->req.id = 0 + SKD_ID_INTERNAL;
+	internal->req.state = SKD_REQ_STATE_IDLE;
+
+	len = SKD_N_INTERNAL_BYTES;
+	internal->data_buf = pci_alloc_consistent(skdev->pdev, len,
+						  &internal->db_dma_address);
+	if (!internal->data_buf)
+		return -ENOMEM;
+	memset(internal->data_buf, 0, len);
+
+	len = SKD_N_SPECIAL_FITMSG_BYTES;
+	internal->msg_buf = pci_alloc_consistent(skdev->pdev, len,
+						 &internal->mb_dma_address);
+	if (!internal->msg_buf)
+		return -ENOMEM;
+	memset(internal->msg_buf, 0, len);
+
+	internal->req.sksg_list =
+		skd_cons_sg_list(skdev, 1, &internal->req.sksg_dma_address);
+	if (!internal->req.sksg_list)
+		return -ENOMEM;
+
+	if (!skd_format_internal_skspcl(skdev))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Allocate the gendisk and its request queue, apply the queue limits
+ * and flags, and leave the queue stopped.
+ *
+ * Returns 0 on success or -ENOMEM.  On failure whatever was allocated
+ * stays attached to skdev (skdev->disk) for the caller to release.
+ */
+static int skd_cons_disk(struct skd_device *skdev)
+{
+	int rc = 0;
+	struct gendisk *disk;
+	struct request_queue *q;
+	unsigned long flags;
+
+	disk = alloc_disk(SKD_MINORS_PER_DEVICE);
+	if (!disk) {
+		rc = -ENOMEM;
+		goto err_out;
+	}
+
+	skdev->disk = disk;
+	sprintf(disk->disk_name, DRV_NAME "%u", skdev->devno);
+
+	disk->major = skdev->major;
+	disk->first_minor = skdev->devno * SKD_MINORS_PER_DEVICE;
+	disk->fops = &skd_blockdev_ops;
+	disk->private_data = skdev;
+
+	/* The queue shares skdev->lock as its queue lock. */
+	q = blk_init_queue(skd_request_fn, &skdev->lock);
+	if (!q) {
+		rc = -ENOMEM;
+		goto err_out;
+	}
+
+	skdev->queue = q;
+	disk->queue = q;
+	q->queuedata = skdev;
+
+	blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
+	blk_queue_max_segments(q, skdev->sgs_per_request);
+	blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS);
+
+	/* set sysfs optimal_io_size to 8K */
+	blk_queue_io_opt(q, 8192);
+
+	/* DISCARD Flag initialization. */
+	q->limits.discard_granularity = 8192;
+	q->limits.discard_alignment = 0;
+	q->limits.max_discard_sectors = UINT_MAX >> 9;
+	q->limits.discard_zeroes_data = 1;
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+
+	/* Keep the queue stopped until the device comes online. */
+	spin_lock_irqsave(&skdev->lock, flags);
+	pr_debug("%s:%s:%d stopping %s queue\n",
+		 skdev->name, __func__, __LINE__, skdev->name);
+	blk_stop_queue(skdev->queue);
+	spin_unlock_irqrestore(&skdev->lock, flags);
+
+err_out:
+	return rc;
+}
+
+/* NOTE(review): SKD_N_DEV_TABLE is not referenced in the code visible
+ * here -- confirm whether it is still used.
+ */
+#define SKD_N_DEV_TABLE         16u
+/* Device number handed to the next constructed skd_device. */
+static u32 skd_next_devno;
+
+/*
+ * Allocate and initialise a skd_device for the given PCI device:
+ * fill in defaults from the module-level settings, then build the
+ * completion table, FIT message contexts, request contexts, special
+ * contexts, the internal special request and the gendisk, in that order.
+ *
+ * Returns the new device, or NULL on any failure (after undoing the
+ * partial construction with skd_destruct()).
+ */
+static struct skd_device *skd_construct(struct pci_dev *pdev)
+{
+	struct skd_device *skdev;
+	int blk_major = skd_major;
+	int rc;
+
+	skdev = kzalloc(sizeof(*skdev), GFP_KERNEL);
+
+	if (!skdev) {
+		pr_err(PFX "(%s): memory alloc failure\n",
+		       pci_name(pdev));
+		return NULL;
+	}
+
+	skdev->state = SKD_DRVR_STATE_LOAD;
+	skdev->pdev = pdev;
+	/* The device number is consumed even if construction fails later. */
+	skdev->devno = skd_next_devno++;
+	skdev->major = blk_major;
+	skdev->irq_type = skd_isr_type;
+	sprintf(skdev->name, DRV_NAME "%d", skdev->devno);
+	skdev->dev_max_queue_depth = 0;
+
+	skdev->num_req_context = skd_max_queue_depth;
+	skdev->num_fitmsg_context = skd_max_queue_depth;
+	skdev->n_special = skd_max_pass_thru;
+	skdev->cur_max_queue_depth = 1;
+	skdev->queue_low_water_mark = 1;
+	skdev->proto_ver = 99;
+	skdev->sgs_per_request = skd_sgs_per_request;
+	skdev->dbg_level = skd_dbg_level;
+
+	atomic_set(&skdev->device_count, 0);
+
+	spin_lock_init(&skdev->lock);
+
+	INIT_WORK(&skdev->completion_worker, skd_completion_worker);
+
+	pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__);
+	rc = skd_cons_skcomp(skdev);
+	if (rc < 0)
+		goto err_out;
+
+	pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__);
+	rc = skd_cons_skmsg(skdev);
+	if (rc < 0)
+		goto err_out;
+
+	pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__);
+	rc = skd_cons_skreq(skdev);
+	if (rc < 0)
+		goto err_out;
+
+	pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__);
+	rc = skd_cons_skspcl(skdev);
+	if (rc < 0)
+		goto err_out;
+
+	pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__);
+	rc = skd_cons_sksb(skdev);
+	if (rc < 0)
+		goto err_out;
+
+	pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__);
+	rc = skd_cons_disk(skdev);
+	if (rc < 0)
+		goto err_out;
+
+	pr_debug("%s:%s:%d VICTORY\n", skdev->name, __func__, __LINE__);
+	return skdev;
+
+err_out:
+	/* skd_destruct() frees whatever the failed step left behind and
+	 * then skdev itself; the error code in rc is not propagated.
+	 */
+	pr_debug("%s:%s:%d construct failed\n",
+		 skdev->name, __func__, __LINE__);
+	skd_destruct(skdev);
+	return NULL;
+}
+
+/*
+ *****************************************************************************
+ * DESTRUCT (FREE)
+ *****************************************************************************
+ */
+
+/*
+ * Free the DMA region that holds the completion table and the
+ * completion error-info table, and clear the pointers into it.
+ * Safe to call when the region was never allocated.
+ */
+static void skd_free_skcomp(struct skd_device *skdev)
+{
+	if (skdev->skcomp_table != NULL) {
+		u32 nbytes;
+
+		/*
+		 * The size must match the allocation in skd_cons_skcomp(),
+		 * which places the error-info table in the same region
+		 * right behind the completion entries.
+		 */
+		nbytes = sizeof(skdev->skcomp_table[0]) *
+			 SKD_N_COMPLETION_ENTRY;
+		nbytes += sizeof(struct fit_comp_error_info) *
+			  SKD_N_COMPLETION_ENTRY;
+		pci_free_consistent(skdev->pdev, nbytes,
+				    skdev->skcomp_table, skdev->cq_dma_address);
+	}
+
+	skdev->skcomp_table = NULL;
+	/* skerr_table pointed into the region released above. */
+	skdev->skerr_table = NULL;
+	skdev->cq_dma_address = 0;
+}
+
+/*
+ * Release every FIT message buffer and then the skmsg_table itself.
+ * Safe to call when the table was never allocated.
+ */
+static void skd_free_skmsg(struct skd_device *skdev)
+{
+	u32 i;
+
+	if (skdev->skmsg_table == NULL)
+		return;
+
+	for (i = 0; i < skdev->num_fitmsg_context; i++) {
+		struct skd_fitmsg_context *skmsg;
+
+		skmsg = &skdev->skmsg_table[i];
+
+		if (skmsg->msg_buf != NULL) {
+			/*
+			 * NOTE(review): adding the saved offset restores the
+			 * address returned by the allocator only when that
+			 * offset is 0, i.e. the allocation was already
+			 * aligned -- confirm that this always holds.
+			 */
+			skmsg->msg_buf += skmsg->offset;
+			skmsg->mb_dma_address += skmsg->offset;
+			/*
+			 * The size must match the allocation in
+			 * skd_cons_skmsg(), which requests 64 bytes more
+			 * than SKD_N_FITMSG_BYTES.
+			 */
+			pci_free_consistent(skdev->pdev,
+					    SKD_N_FITMSG_BYTES + 64,
+					    skmsg->msg_buf,
+					    skmsg->mb_dma_address);
+		}
+		skmsg->msg_buf = NULL;
+		skmsg->mb_dma_address = 0;
+	}
+
+	kfree(skdev->skmsg_table);
+	skdev->skmsg_table = NULL;
+}
+
+/*
+ * Free a DMA-coherent array of n_sg SG descriptors.  A NULL sg_list is
+ * ignored.
+ */
+static void skd_free_sg_list(struct skd_device *skdev,
+			     struct fit_sg_descriptor *sg_list,
+			     u32 n_sg, dma_addr_t dma_addr)
+{
+	u32 total;
+
+	if (sg_list == NULL)
+		return;
+
+	total = sizeof(*sg_list) * n_sg;
+	pci_free_consistent(skdev->pdev, total, sg_list, dma_addr);
+}
+
+/*
+ * Release the per-request SG descriptor lists and scatterlists, then
+ * the request context table.  Does nothing if the table is absent.
+ */
+static void skd_free_skreq(struct skd_device *skdev)
+{
+	struct skd_request_context *skreq;
+	u32 idx;
+
+	if (!skdev->skreq_table)
+		return;
+
+	for (idx = 0; idx < skdev->num_req_context; idx++) {
+		skreq = skdev->skreq_table + idx;
+
+		skd_free_sg_list(skdev, skreq->sksg_list,
+				 skdev->sgs_per_request,
+				 skreq->sksg_dma_address);
+		skreq->sksg_list = NULL;
+		skreq->sksg_dma_address = 0;
+
+		kfree(skreq->sg);
+	}
+
+	kfree(skdev->skreq_table);
+	skdev->skreq_table = NULL;
+}
+
+/*
+ * Release each special context's message buffer, SG descriptor list and
+ * scatterlist, then the special context table.  Does nothing if the
+ * table is absent.
+ */
+static void skd_free_skspcl(struct skd_device *skdev)
+{
+	struct skd_special_context *skspcl;
+	u32 idx;
+
+	if (!skdev->skspcl_table)
+		return;
+
+	for (idx = 0; idx < skdev->n_special; idx++) {
+		skspcl = skdev->skspcl_table + idx;
+
+		if (skspcl->msg_buf)
+			pci_free_consistent(skdev->pdev,
+					    SKD_N_SPECIAL_FITMSG_BYTES,
+					    skspcl->msg_buf,
+					    skspcl->mb_dma_address);
+		skspcl->msg_buf = NULL;
+		skspcl->mb_dma_address = 0;
+
+		skd_free_sg_list(skdev, skspcl->req.sksg_list,
+				 SKD_N_SG_PER_SPECIAL,
+				 skspcl->req.sksg_dma_address);
+		skspcl->req.sksg_list = NULL;
+		skspcl->req.sksg_dma_address = 0;
+
+		kfree(skspcl->req.sg);
+	}
+
+	kfree(skdev->skspcl_table);
+	skdev->skspcl_table = NULL;
+}
+
+/*
+ * Release the buffers of the internal special request: the data buffer,
+ * the message buffer and the one-entry SG descriptor list.  Each piece
+ * is skipped if it was never allocated.
+ */
+static void skd_free_sksb(struct skd_device *skdev)
+{
+	struct skd_special_context *internal = &skdev->internal_skspcl;
+
+	if (internal->data_buf)
+		pci_free_consistent(skdev->pdev, SKD_N_INTERNAL_BYTES,
+				    internal->data_buf,
+				    internal->db_dma_address);
+	internal->data_buf = NULL;
+	internal->db_dma_address = 0;
+
+	if (internal->msg_buf)
+		pci_free_consistent(skdev->pdev, SKD_N_SPECIAL_FITMSG_BYTES,
+				    internal->msg_buf,
+				    internal->mb_dma_address);
+	internal->msg_buf = NULL;
+	internal->mb_dma_address = 0;
+
+	skd_free_sg_list(skdev, internal->req.sksg_list, 1,
+			 internal->req.sksg_dma_address);
+	internal->req.sksg_list = NULL;
+	internal->req.sksg_dma_address = 0;
+}
+
+/* Unregister the gendisk if it was added, clean up its queue, drop it. */
+static void skd_free_disk(struct skd_device *skdev)
+{
+	struct gendisk *gd = skdev->disk;
+	struct request_queue *queue = gd ? gd->queue : NULL;
+
+	if (gd && (gd->flags & GENHD_FL_UP))
+		del_gendisk(gd);
+	if (queue)
+		blk_cleanup_queue(queue);
+	if (gd)
+		put_disk(gd);
+
+	skdev->disk = NULL;
+}
+/* Free the disk, buffers and tables of @skdev, then @skdev; NULL is a no-op. */
+static void skd_destruct(struct skd_device *skdev)
+{
+	if (skdev == NULL)
+		return;
+
+	/* Release the sub-objects one at a time, tracing each step. */
+	pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__);
+	skd_free_disk(skdev);
+
+	pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__);
+	skd_free_sksb(skdev);
+
+	pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__);
+	skd_free_skspcl(skdev);
+
+	pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__);
+	skd_free_skreq(skdev);
+
+	pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__);
+	skd_free_skmsg(skdev);
+
+	pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__);
+	skd_free_skcomp(skdev);
+
+	pr_debug("%s:%s:%d skdev\n", skdev->name, __func__, __LINE__);
+	kfree(skdev);
+}
+
+/*
+ *****************************************************************************
+ * BLOCK DEVICE (BDEV) GLUE
+ *****************************************************************************
+ */
+
+static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct skd_device *skdev = bdev->bd_disk->private_data;
+	u64 nr_sectors;
+
+	/* Reports a fixed 64-head, 255-sector geometry; fails with -EIO
+	 * while read_cap_is_valid is clear. */
+	pr_debug("%s:%s:%d %s: CMD[%s] getgeo device\n",
+		 skdev->name, __func__, __LINE__,
+		 bdev->bd_disk->disk_name, current->comm);
+
+	if (!skdev->read_cap_is_valid)
+		return -EIO;
+
+	nr_sectors = get_capacity(skdev->disk);
+	geo->heads = 64;
+	geo->sectors = 255;
+	geo->cylinders = nr_sectors / (255 * 64);
+
+	return 0;
+}
+/* Publish skdev->disk with add_disk(); always returns 0. */
+static int skd_bdev_attach(struct skd_device *skdev)
+{
+	pr_debug("%s:%s:%d add_disk\n", skdev->name, __func__, __LINE__);
+	add_disk(skdev->disk);
+	return 0;
+}
+/* Block device operations: module owner, ioctl and geometry query only. */
+static const struct block_device_operations skd_blockdev_ops = {
+	.owner		= THIS_MODULE,
+	.ioctl		= skd_bdev_ioctl,
+	.getgeo		= skd_bdev_getgeo,
+};
+
+
+/*
+ *****************************************************************************
+ * PCIe DRIVER GLUE
+ *****************************************************************************
+ */
+/* Match the STEC S1120 regardless of subsystem vendor/device ID. */
+static DEFINE_PCI_DEVICE_TABLE(skd_pci_tbl) = {
+	{ PCI_VENDOR_ID_STEC, PCI_DEVICE_ID_S1120,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, },
+	{ 0 }                     /* terminate list */
+};
+
+MODULE_DEVICE_TABLE(pci, skd_pci_tbl);
+
+/*
+ * Format a description of the PCIe link ("PCIe (<speed> <width>X)") into
+ * @str and return @str.  Speed and width come from the 16-bit word read at
+ * offset 0x12 of the PCI Express capability; without that capability only
+ * the "PCIe (" prefix is written.  The caller supplies the buffer.
+ */
+static char *skd_pci_info(struct skd_device *skdev, char *str)
+{
+	uint16_t link_status, speed, width;
+	char width_str[6];
+	int cap;
+
+	strcpy(str, "PCIe (");
+	cap = pci_find_capability(skdev->pdev, PCI_CAP_ID_EXP);
+	if (!cap)
+		return str;
+
+	pci_read_config_word(skdev->pdev, cap + 0x12, &link_status);
+	speed = link_status & 0xF;
+	width = (link_status & 0x3F0) >> 4;
+
+	strcat(str, speed == 1 ? "2.5GT/s " :
+		    speed == 2 ? "5.0GT/s " : "<unknown> ");
+	snprintf(width_str, sizeof(width_str), "%dX)", width);
+	strcat(str, width_str);
+
+	return str;
+}
+/* Probe callback: enable the PCI device, choose a DMA mask, build the skd
+ * device, map the BARs, hook up the IRQ and timer, start the device and add
+ * the disk once it reports on-line.  Returns 0 or a negative errno. */
+static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	int i, rc = 0;
+	char pci_str[32];
+	struct skd_device *skdev;
+
+	pr_info("STEC s1120 Driver(%s) version %s-b%s\n",
+	       DRV_NAME, DRV_VERSION, DRV_BUILD_ID);
+	pr_info("(skd?:??:[%s]): vendor=%04X device=%04x\n",
+	       pci_name(pdev), pdev->vendor, pdev->device);
+
+	rc = pci_enable_device(pdev);
+	if (rc)
+		return rc;
+	rc = pci_request_regions(pdev, DRV_NAME);
+	if (rc)
+		goto err_out;
+	rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (!rc) {
+		/* Not fatal; rc is reassigned before it is next examined. */
+		rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+		if (rc)
+			pr_err("(%s): consistent DMA mask error %d\n",
+			       pci_name(pdev), rc);
+	} else {
+		rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (rc) {
+			/* Neither a 64-bit nor a 32-bit mask was accepted. */
+			pr_err("(%s): DMA mask error %d\n",
+			       pci_name(pdev), rc);
+			goto err_out_regions;
+		}
+	}
+
+	if (!skd_major) {
+		rc = register_blkdev(0, DRV_NAME);
+		if (rc < 0)
+			goto err_out_regions;
+		BUG_ON(!rc);
+		skd_major = rc;
+	}
+
+	skdev = skd_construct(pdev);
+	if (skdev == NULL) {
+		rc = -ENOMEM;
+		goto err_out_regions;
+	}
+
+	skd_pci_info(skdev, pci_str);
+	pr_info("(%s): %s 64bit\n", skd_name(skdev), pci_str);
+
+	pci_set_master(pdev);
+	rc = pci_enable_pcie_error_reporting(pdev);
+	if (rc) {
+		pr_err("(%s): bad enable of PCIe error reporting rc=%d\n",
+		       skd_name(skdev), rc);
+		skdev->pcie_error_reporting_is_enabled = 0;
+	} else
+		skdev->pcie_error_reporting_is_enabled = 1;
+
+
+	pci_set_drvdata(pdev, skdev);
+
+	skdev->disk->driverfs_dev = &pdev->dev;
+
+	for (i = 0; i < SKD_MAX_BARS; i++) {
+		skdev->mem_phys[i] = pci_resource_start(pdev, i);
+		skdev->mem_size[i] = (u32)pci_resource_len(pdev, i);
+		skdev->mem_map[i] = ioremap(skdev->mem_phys[i],
+					    skdev->mem_size[i]);
+		if (!skdev->mem_map[i]) {
+			pr_err("(%s): Unable to map adapter memory!\n",
+			       skd_name(skdev));
+			rc = -ENODEV;
+			goto err_out_iounmap;
+		}
+		pr_debug("%s:%s:%d mem_map=%p, phyd=%016llx, size=%d\n",
+			 skdev->name, __func__, __LINE__,
+			 skdev->mem_map[i],
+			 (uint64_t)skdev->mem_phys[i], skdev->mem_size[i]);
+	}
+
+	rc = skd_acquire_irq(skdev);
+	if (rc) {
+		pr_err("(%s): interrupt resource error %d\n",
+		       skd_name(skdev), rc);
+		goto err_out_iounmap;
+	}
+
+	rc = skd_start_timer(skdev);
+	if (rc)
+		goto err_out_timer;
+
+	init_waitqueue_head(&skdev->waitq);
+
+	skd_start_device(skdev);
+
+	/* Wait (bounded) for gendisk_on to become non-zero. */
+	rc = wait_event_interruptible_timeout(skdev->waitq,
+					      (skdev->gendisk_on),
+					      (SKD_START_WAIT_SECONDS * HZ));
+	if (skdev->gendisk_on > 0) {
+		/* device came on-line after reset */
+		skd_bdev_attach(skdev);
+		rc = 0;
+	} else {
+		/* The device did not come on-line (timeout, signal, or
+		 * gendisk_on not positive): do not add the disk. */
+		pr_err(
+		       "(%s): error: waiting for s1120 timed out %d!\n",
+		       skd_name(skdev), rc);
+		/* rc >= 0 carries no errno (0 = timeout); report -ENXIO */
+		if (rc >= 0)
+			rc = -ENXIO;
+		goto err_out_timer;
+	}
+
+#ifdef SKD_VMK_POLL_HANDLER
+	if (skdev->irq_type == SKD_IRQ_MSIX) {
+		/* MSIX completion handler is being used for coredump */
+		vmklnx_scsi_register_poll_handler(skdev->scsi_host,
+						  skdev->msix_entries[5].vector,
+						  skd_comp_q, skdev);
+	} else {
+		vmklnx_scsi_register_poll_handler(skdev->scsi_host,
+						  skdev->pdev->irq, skd_isr,
+						  skdev);
+	}
+#endif  /* SKD_VMK_POLL_HANDLER */
+
+	return rc;
+
+err_out_timer:
+	skd_stop_device(skdev);
+	skd_release_irq(skdev);
+
+err_out_iounmap:
+	for (i = 0; i < SKD_MAX_BARS; i++)
+		if (skdev->mem_map[i])
+			iounmap(skdev->mem_map[i]);
+
+	if (skdev->pcie_error_reporting_is_enabled)
+		pci_disable_pcie_error_reporting(pdev);
+
+	skd_destruct(skdev);
+
+err_out_regions:
+	pci_release_regions(pdev);
+
+err_out:
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+	return rc;
+}
+
+/*
+ * PCI remove callback: stop the device, release its interrupt, unmap
+ * the BARs, free the driver state and give the PCI resources back.
+ */
+static void skd_pci_remove(struct pci_dev *pdev)
+{
+	struct skd_device *skdev = pci_get_drvdata(pdev);
+	int bar;
+
+	if (skdev == NULL) {
+		pr_err("%s: no device data for PCI\n", pci_name(pdev));
+		return;
+	}
+
+	skd_stop_device(skdev);
+	skd_release_irq(skdev);
+
+	for (bar = 0; bar < SKD_MAX_BARS; bar++) {
+		if (skdev->mem_map[bar] != NULL)
+			iounmap((u32 *)skdev->mem_map[bar]);
+	}
+	if (skdev->pcie_error_reporting_is_enabled)
+		pci_disable_pcie_error_reporting(pdev);
+	skd_destruct(skdev);
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+}
+
+/*
+ * PCI suspend callback: stop the device, release the interrupt, the
+ * BAR mappings and the regions, then save state and power down.
+ */
+static int skd_pci_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+	struct skd_device *skdev = pci_get_drvdata(pdev);
+	int bar;
+
+	if (skdev == NULL) {
+		pr_err("%s: no device data for PCI\n", pci_name(pdev));
+		return -EIO;
+	}
+
+	skd_stop_device(skdev);
+	skd_release_irq(skdev);
+	for (bar = 0; bar < SKD_MAX_BARS; bar++) {
+		if (skdev->mem_map[bar] != NULL)
+			iounmap((u32 *)skdev->mem_map[bar]);
+	}
+	if (skdev->pcie_error_reporting_is_enabled)
+		pci_disable_pcie_error_reporting(pdev);
+	pci_release_regions(pdev);
+	pci_save_state(pdev);
+	pci_disable_device(pdev);
+	pci_set_power_state(pdev, pci_choose_state(pdev, state));
+	return 0;
+}
+/* PCI resume callback: power the device up, redo the PCI/DMA/BAR/IRQ setup
+ * and restart the device.  Returns 0 or a negative errno. */
+static int skd_pci_resume(struct pci_dev *pdev)
+{
+	int i, rc = 0;
+	struct skd_device *skdev;
+
+	skdev = pci_get_drvdata(pdev);
+	if (!skdev) {
+		pr_err("%s: no device data for PCI\n", pci_name(pdev));
+		return -EIO;
+	}
+
+	pci_set_power_state(pdev, PCI_D0);
+	pci_enable_wake(pdev, PCI_D0, 0);
+	pci_restore_state(pdev);
+
+	rc = pci_enable_device(pdev);
+	if (rc)
+		return rc;
+	rc = pci_request_regions(pdev, DRV_NAME);
+	if (rc)
+		goto err_out;
+	rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (!rc) {
+		/* Not fatal; rc is reassigned before it is next examined. */
+		rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+		if (rc)
+			pr_err("(%s): consistent DMA mask error %d\n",
+			       pci_name(pdev), rc);
+	} else {
+		rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (rc) {
+			/* Neither a 64-bit nor a 32-bit mask was accepted. */
+			pr_err("(%s): DMA mask error %d\n",
+			       pci_name(pdev), rc);
+			goto err_out_regions;
+		}
+	}
+
+	pci_set_master(pdev);
+	rc = pci_enable_pcie_error_reporting(pdev);
+	if (rc) {
+		pr_err("(%s): bad enable of PCIe error reporting rc=%d\n",
+		       skdev->name, rc);
+		skdev->pcie_error_reporting_is_enabled = 0;
+	} else
+		skdev->pcie_error_reporting_is_enabled = 1;
+
+	for (i = 0; i < SKD_MAX_BARS; i++) {
+
+		skdev->mem_phys[i] = pci_resource_start(pdev, i);
+		skdev->mem_size[i] = (u32)pci_resource_len(pdev, i);
+		skdev->mem_map[i] = ioremap(skdev->mem_phys[i],
+					    skdev->mem_size[i]);
+		if (!skdev->mem_map[i]) {
+			pr_err("(%s): Unable to map adapter memory!\n",
+			       skd_name(skdev));
+			rc = -ENODEV;
+			goto err_out_iounmap;
+		}
+		pr_debug("%s:%s:%d mem_map=%p, phyd=%016llx, size=%d\n",
+			 skdev->name, __func__, __LINE__,
+			 skdev->mem_map[i],
+			 (uint64_t)skdev->mem_phys[i], skdev->mem_size[i]);
+	}
+	rc = skd_acquire_irq(skdev);
+	if (rc) {
+
+		pr_err("(%s): interrupt resource error %d\n",
+		       pci_name(pdev), rc);
+		goto err_out_iounmap;
+	}
+
+	rc = skd_start_timer(skdev);
+	if (rc)
+		goto err_out_timer;
+
+	init_waitqueue_head(&skdev->waitq);
+
+	skd_start_device(skdev);
+
+	return rc;
+
+err_out_timer:
+	skd_stop_device(skdev);
+	skd_release_irq(skdev);
+
+err_out_iounmap:
+	for (i = 0; i < SKD_MAX_BARS; i++)
+		if (skdev->mem_map[i])
+			iounmap(skdev->mem_map[i]);
+
+	if (skdev->pcie_error_reporting_is_enabled)
+		pci_disable_pcie_error_reporting(pdev);
+
+err_out_regions:
+	pci_release_regions(pdev);
+
+err_out:
+	pci_disable_device(pdev);
+	return rc;
+}
+
+/* PCI shutdown callback: log the call and stop the device if one is bound. */
+static void skd_pci_shutdown(struct pci_dev *pdev)
+{
+	struct skd_device *skdev;
+
+	pr_err("skd_pci_shutdown called\n");
+
+	skdev = pci_get_drvdata(pdev);
+	if (skdev != NULL) {
+		pr_err("%s: calling stop\n", skd_name(skdev));
+		skd_stop_device(skdev);
+		return;
+	}
+	pr_err("%s: no device data for PCI\n", pci_name(pdev));
+}
+/* PCI driver descriptor: ID table plus probe/remove/PM/shutdown callbacks. */
+static struct pci_driver skd_driver = {
+	.name		= DRV_NAME,
+	.id_table	= skd_pci_tbl,
+	.probe		= skd_pci_probe,
+	.remove		= skd_pci_remove,
+	.suspend	= skd_pci_suspend,
+	.resume		= skd_pci_resume,
+	.shutdown	= skd_pci_shutdown,
+};
+
+/*
+ *****************************************************************************
+ * LOGGING SUPPORT
+ *****************************************************************************
+ */
+
+/* Format "<name>:<serial>:[<pci>]" into skdev->id_str and return it;
+ * "??" replaces the serial number while inquiry_is_valid is clear. */
+static const char *skd_name(struct skd_device *skdev)
+{
+	memset(skdev->id_str, 0, sizeof(skdev->id_str));
+	if (!skdev->inquiry_is_valid) {
+		snprintf(skdev->id_str, sizeof(skdev->id_str), "%s:??:[%s]",
+			 skdev->name, pci_name(skdev->pdev));
+		return skdev->id_str;
+	}
+	snprintf(skdev->id_str, sizeof(skdev->id_str), "%s:%s:[%s]",
+		 skdev->name, skdev->inq_serial_num, pci_name(skdev->pdev));
+	return skdev->id_str;
+}
+/* Printable name of a FIT_SR_* drive state for logs ("???" if unnamed). */
+const char *skd_drive_state_to_str(int state)
+{
+	switch (state) {
+	case FIT_SR_DRIVE_OFFLINE:
+		return "OFFLINE";
+	case FIT_SR_DRIVE_INIT:
+		return "INIT";
+	case FIT_SR_DRIVE_ONLINE:
+		return "ONLINE";
+	case FIT_SR_DRIVE_BUSY:
+		return "BUSY";
+	case FIT_SR_DRIVE_FAULT:
+		return "FAULT";
+	case FIT_SR_DRIVE_DEGRADED:
+		return "DEGRADED";
+	case FIT_SR_PCIE_LINK_DOWN:
+		return "LINK_DOWN";
+	case FIT_SR_DRIVE_SOFT_RESET:
+		return "SOFT_RESET";
+	case FIT_SR_DRIVE_NEED_FW_DOWNLOAD:
+		return "NEED_FW";
+	case FIT_SR_DRIVE_INIT_FAULT:
+		return "INIT_FAULT";
+	case FIT_SR_DRIVE_BUSY_SANITIZE:
+		return "BUSY_SANITIZE";
+	case FIT_SR_DRIVE_BUSY_ERASE:
+		return "BUSY_ERASE";
+	case FIT_SR_DRIVE_FW_BOOTING:
+		return "FW_BOOTING";
+	default:
+		return "???";
+	}
+}
+/* Printable name of a driver state for log output ("???" if unnamed). */
+const char *skd_skdev_state_to_str(enum skd_drvr_state state)
+{
+	switch (state) {
+	case SKD_DRVR_STATE_LOAD:
+		return "LOAD";
+	case SKD_DRVR_STATE_IDLE:
+		return "IDLE";
+	case SKD_DRVR_STATE_BUSY:
+		return "BUSY";
+	case SKD_DRVR_STATE_STARTING:
+		return "STARTING";
+	case SKD_DRVR_STATE_ONLINE:
+		return "ONLINE";
+	case SKD_DRVR_STATE_PAUSING:
+		return "PAUSING";
+	case SKD_DRVR_STATE_PAUSED:
+		return "PAUSED";
+	case SKD_DRVR_STATE_DRAINING_TIMEOUT:
+		return "DRAINING_TIMEOUT";
+	case SKD_DRVR_STATE_RESTARTING:
+		return "RESTARTING";
+	case SKD_DRVR_STATE_RESUMING:
+		return "RESUMING";
+	case SKD_DRVR_STATE_STOPPING:
+		return "STOPPING";
+	case SKD_DRVR_STATE_SYNCING:
+		return "SYNCING";
+	case SKD_DRVR_STATE_FAULT:
+		return "FAULT";
+	case SKD_DRVR_STATE_DISAPPEARED:
+		return "DISAPPEARED";
+	case SKD_DRVR_STATE_BUSY_ERASE:
+		return "BUSY_ERASE";
+	case SKD_DRVR_STATE_BUSY_SANITIZE:
+		return "BUSY_SANITIZE";
+	case SKD_DRVR_STATE_BUSY_IMMINENT:
+		return "BUSY_IMMINENT";
+	case SKD_DRVR_STATE_WAIT_BOOT:
+		return "WAIT_BOOT";
+	/* Any value not listed above falls through to the placeholder. */
+	default:
+		return "???";
+	}
+}
+
+/* Printable name of a FIT-message context state for log output;
+ * anything other than IDLE/BUSY is rendered as "???". */
+static const char *skd_skmsg_state_to_str(enum skd_fit_msg_state state)
+{
+	if (state == SKD_MSG_STATE_IDLE)
+		return "IDLE";
+	if (state == SKD_MSG_STATE_BUSY)
+		return "BUSY";
+
+	return "???";
+}
+
+/* Printable name of a request-context state for log output; states
+ * without a name here are rendered as "???". */
+static const char *skd_skreq_state_to_str(enum skd_req_state state)
+{
+	if (state == SKD_REQ_STATE_IDLE)
+		return "IDLE";
+	if (state == SKD_REQ_STATE_SETUP)
+		return "SETUP";
+	if (state == SKD_REQ_STATE_BUSY)
+		return "BUSY";
+	if (state == SKD_REQ_STATE_COMPLETED)
+		return "COMPLETED";
+	if (state == SKD_REQ_STATE_TIMEOUT)
+		return "TIMEOUT";
+	if (state == SKD_REQ_STATE_ABORTED)
+		return "ABORTED";
+
+	return "???";
+}
+/* Debug-log drive/driver state, queue counters and completion cursor. */
+static void skd_log_skdev(struct skd_device *skdev, const char *event)
+{
+	pr_debug("%s:%s:%d (%s) skdev=%p event='%s'\n",
+		 skdev->name, __func__, __LINE__, skdev->name, skdev, event);
+	pr_debug("%s:%s:%d   drive_state=%s(%d) driver_state=%s(%d)\n",
+		 skdev->name, __func__, __LINE__,
+		 skd_drive_state_to_str(skdev->drive_state), skdev->drive_state,
+		 skd_skdev_state_to_str(skdev->state), skdev->state);
+	pr_debug("%s:%s:%d   busy=%d limit=%d dev=%d lowat=%d\n",
+		 skdev->name, __func__, __LINE__,
+		 skdev->in_flight, skdev->cur_max_queue_depth,
+		 skdev->dev_max_queue_depth, skdev->queue_low_water_mark);
+	pr_debug("%s:%s:%d   timestamp=0x%x cycle=%d cycle_ix=%d\n",
+		 skdev->name, __func__, __LINE__,
+		 skdev->timeout_stamp, skdev->skcomp_cycle, skdev->skcomp_ix);
+}
+/* Debug-log one FIT message context: state, id and length, tagged @event. */
+static void skd_log_skmsg(struct skd_device *skdev,
+			  struct skd_fitmsg_context *skmsg, const char *event)
+{
+	pr_debug("%s:%s:%d (%s) skmsg=%p event='%s'\n",
+		 skdev->name, __func__, __LINE__, skdev->name, skmsg, event);
+	pr_debug("%s:%s:%d   state=%s(%d) id=0x%04x length=%d\n",
+		 skdev->name, __func__, __LINE__,
+		 skd_skmsg_state_to_str(skmsg->state), skmsg->state,
+		 skmsg->id, skmsg->length);
+}
+/* Debug-log one request context and, if attached, its block request. */
+static void skd_log_skreq(struct skd_device *skdev,
+			  struct skd_request_context *skreq, const char *event)
+{
+	pr_debug("%s:%s:%d (%s) skreq=%p event='%s'\n",
+		 skdev->name, __func__, __LINE__, skdev->name, skreq, event);
+	pr_debug("%s:%s:%d   state=%s(%d) id=0x%04x fitmsg=0x%04x\n",
+		 skdev->name, __func__, __LINE__,
+		 skd_skreq_state_to_str(skreq->state), skreq->state,
+		 skreq->id, skreq->fitmsg_id);
+	pr_debug("%s:%s:%d   timo=0x%x sg_dir=%d n_sg=%d\n",
+		 skdev->name, __func__, __LINE__,
+		 skreq->timeout_stamp, skreq->sg_data_dir, skreq->n_sg);
+
+	if (skreq->req != NULL) {
+		struct request *req = skreq->req;
+		u32 lba = (u32)blk_rq_pos(req);
+		u32 count = blk_rq_sectors(req);
+		/* NOTE: the start sector is cut to 32 bits for display. */
+		pr_debug("%s:%s:%d "
+			 "req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n",
+			 skdev->name, __func__, __LINE__,
+			 req, lba, lba, count, count,
+			 (int)rq_data_dir(req));
+	} else
+		pr_debug("%s:%s:%d req=NULL\n",
+			 skdev->name, __func__, __LINE__);
+}
+
+/*
+ *****************************************************************************
+ * MODULE GLUE
+ *****************************************************************************
+ */
+/* Module init: clamp out-of-range tunables, then register the PCI driver. */
+static int __init skd_init(void)
+{
+	pr_info(PFX " v%s-b%s loaded\n", DRV_VERSION, DRV_BUILD_ID);
+
+	switch (skd_isr_type) {
+	case SKD_IRQ_LEGACY:
+	case SKD_IRQ_MSI:
+	case SKD_IRQ_MSIX:
+		break;
+	default:
+		pr_err(PFX "skd_isr_type %d invalid, re-set to %d\n",
+		       skd_isr_type, SKD_IRQ_DEFAULT);
+		skd_isr_type = SKD_IRQ_DEFAULT;
+	}
+
+	if (skd_max_queue_depth < 1 ||
+	    skd_max_queue_depth > SKD_MAX_QUEUE_DEPTH) {
+		pr_err(PFX "skd_max_queue_depth %d invalid, re-set to %d\n",
+		       skd_max_queue_depth, SKD_MAX_QUEUE_DEPTH_DEFAULT);
+		skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT;
+	}
+
+	if (skd_max_req_per_msg < 1 || skd_max_req_per_msg > 14) {
+		pr_err(PFX "skd_max_req_per_msg %d invalid, re-set to %d\n",
+		       skd_max_req_per_msg, SKD_MAX_REQ_PER_MSG_DEFAULT);
+		skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT;
+	}
+
+	if (skd_sgs_per_request < 1 || skd_sgs_per_request > 4096) {
+		pr_err(PFX "skd_sgs_per_request %d invalid, re-set to %d\n",
+		       skd_sgs_per_request, SKD_N_SG_PER_REQ_DEFAULT);
+		skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT;
+	}
+
+	if (skd_dbg_level < 0 || skd_dbg_level > 2) {
+		pr_err(PFX "skd_dbg_level %d invalid, re-set to %d\n",
+		       skd_dbg_level, 0);
+		skd_dbg_level = 0;
+	}
+
+	if (skd_isr_comp_limit < 0) {
+		pr_err(PFX "skd_isr_comp_limit %d invalid, set to %d\n",
+		       skd_isr_comp_limit, 0);
+		skd_isr_comp_limit = 0;
+	}
+
+	if (skd_max_pass_thru < 1 || skd_max_pass_thru > 50) {
+		pr_err(PFX "skd_max_pass_thru %d invalid, re-set to %d\n",
+		       skd_max_pass_thru, SKD_N_SPECIAL_CONTEXT);
+		skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT;
+	}
+
+	return pci_register_driver(&skd_driver);
+}
+/* Module exit: drop the PCI driver, then the block major if one is set. */
+static void __exit skd_exit(void)
+{
+	pr_info(PFX " v%s-b%s unloading\n", DRV_VERSION, DRV_BUILD_ID);
+
+	pci_unregister_driver(&skd_driver);
+
+	if (skd_major)
+		unregister_blkdev(skd_major, DRV_NAME);
+}
+
+module_init(skd_init);
+module_exit(skd_exit);
diff --git a/drivers/block/skd_s1120.h b/drivers/block/skd_s1120.h
new file mode 100644
index 00000000000..61c757ff016
--- /dev/null
+++ b/drivers/block/skd_s1120.h
@@ -0,0 +1,330 @@
+/* Copyright 2012 STEC, Inc.
+ *
+ * This file is licensed under the terms of the 3-clause
+ * BSD License (http://opensource.org/licenses/BSD-3-Clause)
+ * or the GNU GPL-2.0 (http://www.gnu.org/licenses/gpl-2.0.html),
+ * at your option. Both licenses are also available in the LICENSE file
+ * distributed with this project. This file may not be copied, modified,
+ * or distributed except in accordance with those terms.
+ */
+
+
+#ifndef SKD_S1120_H
+#define SKD_S1120_H
+
+#pragma pack(push, s1120_h, 1)
+
+/*
+ * Q-channel, 64-bit r/w
+ */
+#define FIT_Q_COMMAND			0x400u
+#define FIT_QCMD_QID_MASK		(0x3 << 1)
+#define  FIT_QCMD_QID0			(0x0 << 1)
+#define  FIT_QCMD_QID_NORMAL		FIT_QCMD_QID0
+#define  FIT_QCMD_QID1			(0x1 << 1)
+#define  FIT_QCMD_QID2			(0x2 << 1)
+#define  FIT_QCMD_QID3			(0x3 << 1)
+#define  FIT_QCMD_FLUSH_QUEUE		(0ull)	/* add QID */
+#define  FIT_QCMD_MSGSIZE_MASK		(0x3 << 4)
+#define  FIT_QCMD_MSGSIZE_64		(0x0 << 4)
+#define  FIT_QCMD_MSGSIZE_128		(0x1 << 4)
+#define  FIT_QCMD_MSGSIZE_256		(0x2 << 4)
+#define  FIT_QCMD_MSGSIZE_512		(0x3 << 4)
+#define  FIT_QCMD_BASE_ADDRESS_MASK	(0xFFFFFFFFFFFFFFC0ull)
+
+/*
+ * Control, 32-bit r/w
+ */
+#define FIT_CONTROL			0x500u
+#define  FIT_CR_HARD_RESET		(1u << 0u)
+#define  FIT_CR_SOFT_RESET		(1u << 1u)
+#define  FIT_CR_DIS_TIMESTAMPS		(1u << 6u)
+#define  FIT_CR_ENABLE_INTERRUPTS	(1u << 7u)
+
+/*
+ * Status, 32-bit, r/o
+ */
+#define FIT_STATUS			0x510u
+#define FIT_SR_DRIVE_STATE_MASK		0x000000FFu
+/* Drive state values, selected by FIT_SR_DRIVE_STATE_MASK.  The signature
+ * and PIO/DMA fields are defined once, with the register layout below. */
+#define FIT_SR_DRIVE_OFFLINE		0x00
+#define FIT_SR_DRIVE_INIT		0x01
+/* #define FIT_SR_DRIVE_READY		0x02 */
+#define FIT_SR_DRIVE_ONLINE		0x03
+#define FIT_SR_DRIVE_BUSY		0x04
+#define FIT_SR_DRIVE_FAULT		0x05
+#define FIT_SR_DRIVE_DEGRADED		0x06
+#define FIT_SR_PCIE_LINK_DOWN		0x07
+#define FIT_SR_DRIVE_SOFT_RESET		0x08
+#define FIT_SR_DRIVE_INIT_FAULT		0x09
+#define FIT_SR_DRIVE_BUSY_SANITIZE	0x0A
+#define FIT_SR_DRIVE_BUSY_ERASE		0x0B
+#define FIT_SR_DRIVE_FW_BOOTING		0x0C
+#define FIT_SR_DRIVE_NEED_FW_DOWNLOAD	0xFE
+#define FIT_SR_DEVICE_MISSING		0xFF
+#define FIT_SR__RESERVED		0xFFFFFF00u
+
+/*
+ * FIT_STATUS - Status register data definition
+ */
+#define FIT_SR_STATE_MASK		(0xFF << 0)
+#define FIT_SR_SIGNATURE		(0xFF << 8)
+#define FIT_SR_PIO_DMA			(1 << 16)
+
+/*
+ * Interrupt status, 32-bit r/w1c (w1c ==> write 1 to clear)
+ */
+#define FIT_INT_STATUS_HOST		0x520u
+#define  FIT_ISH_FW_STATE_CHANGE	(1u << 0u)
+#define  FIT_ISH_COMPLETION_POSTED	(1u << 1u)
+#define  FIT_ISH_MSG_FROM_DEV		(1u << 2u)
+#define  FIT_ISH_UNDEFINED_3		(1u << 3u)
+#define  FIT_ISH_UNDEFINED_4		(1u << 4u)
+#define  FIT_ISH_Q0_FULL		(1u << 5u)
+#define  FIT_ISH_Q1_FULL		(1u << 6u)
+#define  FIT_ISH_Q2_FULL		(1u << 7u)
+#define  FIT_ISH_Q3_FULL		(1u << 8u)
+#define  FIT_ISH_QCMD_FIFO_OVERRUN	(1u << 9u)
+#define  FIT_ISH_BAD_EXP_ROM_READ	(1u << 10u)
+
+#define FIT_INT_DEF_MASK \
+	(FIT_ISH_FW_STATE_CHANGE | \
+	 FIT_ISH_COMPLETION_POSTED | \
+	 FIT_ISH_MSG_FROM_DEV | \
+	 FIT_ISH_Q0_FULL | \
+	 FIT_ISH_Q1_FULL | \
+	 FIT_ISH_Q2_FULL | \
+	 FIT_ISH_Q3_FULL | \
+	 FIT_ISH_QCMD_FIFO_OVERRUN | \
+	 FIT_ISH_BAD_EXP_ROM_READ)
+
+#define FIT_INT_QUEUE_FULL \
+	(FIT_ISH_Q0_FULL | \
+	 FIT_ISH_Q1_FULL | \
+	 FIT_ISH_Q2_FULL | \
+	 FIT_ISH_Q3_FULL)
+
+#define MSI_MSG_NWL_ERROR_0		0x00000000
+#define MSI_MSG_NWL_ERROR_1		0x00000001
+#define MSI_MSG_NWL_ERROR_2		0x00000002
+#define MSI_MSG_NWL_ERROR_3		0x00000003
+#define MSI_MSG_STATE_CHANGE		0x00000004
+#define MSI_MSG_COMPLETION_POSTED	0x00000005
+#define MSI_MSG_MSG_FROM_DEV		0x00000006
+#define MSI_MSG_RESERVED_0		0x00000007
+#define MSI_MSG_RESERVED_1		0x00000008
+#define MSI_MSG_QUEUE_0_FULL		0x00000009
+#define MSI_MSG_QUEUE_1_FULL		0x0000000A
+#define MSI_MSG_QUEUE_2_FULL		0x0000000B
+#define MSI_MSG_QUEUE_3_FULL		0x0000000C
+
+#define FIT_INT_RESERVED_MASK \
+	(FIT_ISH_UNDEFINED_3 | \
+	 FIT_ISH_UNDEFINED_4)
+
+/*
+ * Interrupt mask, 32-bit r/w
+ * Bit definitions are the same as FIT_INT_STATUS_HOST
+ */
+#define FIT_INT_MASK_HOST		0x528u
+
+/*
+ * Message to device, 32-bit r/w
+ */
+#define FIT_MSG_TO_DEVICE		0x540u
+
+/*
+ * Message from device, 32-bit, r/o
+ */
+#define FIT_MSG_FROM_DEVICE		0x548u
+
+/*
+ * 32-bit messages to/from device, composition/extraction macros
+ */
+#define FIT_MXD_CONS(TYPE, PARAM, DATA) \
+	((((TYPE)  & 0xFFu) << 24u) | \
+	(((PARAM) & 0xFFu) << 16u) | \
+	(((DATA)  & 0xFFFFu) << 0u))
+#define FIT_MXD_TYPE(MXD)		(((MXD) >> 24u) & 0xFFu)
+#define FIT_MXD_PARAM(MXD)		(((MXD) >> 16u) & 0xFFu)
+#define FIT_MXD_DATA(MXD)		(((MXD) >> 0u) & 0xFFFFu)
+
+/*
+ * Types of messages to/from device
+ */
+#define FIT_MTD_FITFW_INIT		0x01u
+#define FIT_MTD_GET_CMDQ_DEPTH		0x02u
+#define FIT_MTD_SET_COMPQ_DEPTH		0x03u
+#define FIT_MTD_SET_COMPQ_ADDR		0x04u
+#define FIT_MTD_ARM_QUEUE		0x05u
+#define FIT_MTD_CMD_LOG_HOST_ID		0x07u
+#define FIT_MTD_CMD_LOG_TIME_STAMP_LO	0x08u
+#define FIT_MTD_CMD_LOG_TIME_STAMP_HI	0x09u
+#define FIT_MFD_SMART_EXCEEDED		0x10u
+#define FIT_MFD_POWER_DOWN		0x11u
+#define FIT_MFD_OFFLINE			0x12u
+#define FIT_MFD_ONLINE			0x13u
+#define FIT_MFD_FW_RESTARTING		0x14u
+#define FIT_MFD_PM_ACTIVE		0x15u
+#define FIT_MFD_PM_STANDBY		0x16u
+#define FIT_MFD_PM_SLEEP		0x17u
+#define FIT_MFD_CMD_PROGRESS		0x18u
+
+#define FIT_MTD_DEBUG			0xFEu
+#define FIT_MFD_DEBUG			0xFFu
+/* Field extraction for a 32-bit message word: type byte and low data byte. */
+#define FIT_MFD_MASK			(0xFFu)
+#define FIT_MFD_DATA_MASK		(0xFFu)
+#define FIT_MFD_MSG(x)			(((x) >> 24) & FIT_MFD_MASK)
+#define FIT_MFD_DATA(x)			((x) & FIT_MFD_DATA_MASK)
+
+/*
+ * Extra arg to FIT_MSG_TO_DEVICE, 64-bit r/w
+ * Used to set completion queue address (FIT_MTD_SET_COMPQ_ADDR)
+ * (was Response buffer in docs)
+ */
+#define FIT_MSG_TO_DEVICE_ARG		0x580u
+
+/*
+ * Hardware (ASIC) version, 32-bit r/o
+ */
+#define FIT_HW_VERSION			0x588u
+
+/*
+ * Scatter/gather list descriptor.
+ * 32 bytes long; must be aligned on a 32-byte boundary.
+ * All fields are in little endian order.
+ */
+struct fit_sg_descriptor {
+	uint32_t control;		/* presumably FIT_SGD_CONTROL_*; confirm */
+	uint32_t byte_count;
+	uint64_t host_side_addr;
+	uint64_t dev_side_addr;
+	uint64_t next_desc_ptr;		/* 4 + 4 + 3 * 8 = 32 bytes in total */
+};
+
+#define FIT_SGD_CONTROL_NOT_LAST	0x000u
+#define FIT_SGD_CONTROL_LAST		0x40Eu
+
+/*
+ * Header at the beginning of a FIT message. The header
+ * is followed by SSDI requests each 64 bytes.
+ * A FIT message can be up to 512 bytes long and must start
+ * on a 64-byte boundary.
+ */
+struct fit_msg_hdr {
+	uint8_t protocol_id;		/* presumably FIT_PROTOCOL_ID_*; confirm */
+	uint8_t num_protocol_cmds_coalesced;
+	uint8_t _reserved[62];		/* 1 + 1 + 62 = 64 bytes in total */
+};
+
+#define FIT_PROTOCOL_ID_FIT	1
+#define FIT_PROTOCOL_ID_SSDI	2
+#define FIT_PROTOCOL_ID_SOFIT	3
+
+/* Protocol version fields of a FIT_MTD_FITFW_INIT response value. */
+#define FIT_PROTOCOL_MINOR_VER(mtd_val) (((mtd_val) >> 16) & 0xF)
+#define FIT_PROTOCOL_MAJOR_VER(mtd_val) (((mtd_val) >> 20) & 0xF)
+
+/*
+ * Format of a completion entry. The completion queue is circular
+ * and must have at least as many entries as the maximum number
+ * of commands that may be issued to the device.
+ *
+ * There are no head/tail pointers. The cycle value is used to
+ * infer the presence of new completion records.
+ * Initially the cycle in all entries is 0, the index is 0, and
+ * the cycle value to expect is 1. When completions are added
+ * their cycle values are set to 1. When the index wraps the
+ * cycle value to expect is incremented.
+ *
+ * Command_context is opaque and taken verbatim from the SSDI command.
+ * All other fields are big endian.
+ */
+#define FIT_PROTOCOL_VERSION_0		0
+
+/*
+ *  Protocol major version 1 completion entry.
+ *  The major protocol version is found in bits
+ *  20-23 of the FIT_MTD_FITFW_INIT response.
+ */
+struct fit_completion_entry_v1 {
+	uint32_t	num_returned_bytes;
+	uint16_t	tag;
+	uint8_t		status;  /* SCSI status */
+	uint8_t		cycle;   /* checked against the expected cycle value */
+};
+#define FIT_PROTOCOL_VERSION_1		1
+#define FIT_PROTOCOL_VERSION_CURRENT	FIT_PROTOCOL_VERSION_1
+
+struct fit_comp_error_info {
+	uint8_t		type:7; /* 00: Bits 0-6 indicate the type of sense data. */
+	uint8_t		valid:1; /* 00: Bit 7 := 1 ==> info field is valid. */
+	uint8_t		reserved0; /* 01: Obsolete field */
+	uint8_t		key:4; /* 02: Bits 0-3 indicate the sense key. */
+	uint8_t		reserved2:1; /* 02: Reserved bit. */
+	uint8_t		bad_length:1; /* 02: Incorrect Length Indicator */
+	uint8_t		end_medium:1; /* 02: End of Medium */
+	uint8_t		file_mark:1; /* 02: Filemark */
+	uint8_t		info[4]; /* 03: */
+	uint8_t		reserved1; /* 07: Additional Sense Length */
+	uint8_t		cmd_spec[4]; /* 08: Command Specific Information */
+	uint8_t		code; /* 0C: Additional Sense Code */
+	uint8_t		qual; /* 0D: Additional Sense Code Qualifier */
+	uint8_t		fruc; /* 0E: Field Replaceable Unit Code */
+	uint8_t		sks_high:7; /* 0F: Sense Key Specific (MSB) */
+	uint8_t		sks_valid:1; /* 0F: Sense Key Specific Valid */
+	uint16_t	sks_low; /* 10: Sense Key Specific (LSW) */
+	uint16_t	reserved3; /* 12: Part of additional sense bytes (unused) */
+	uint16_t	uec; /* 14: Additional Sense Bytes */
+	uint64_t	per; /* 16: Additional Sense Bytes */
+	uint8_t		reserved4[2]; /* 1E: Additional Sense Bytes (unused) */
+};
+
+
+/* Task management constants */
+#define SOFT_TASK_SIMPLE		0x00
+#define SOFT_TASK_HEAD_OF_QUEUE		0x01
+#define SOFT_TASK_ORDERED		0x02
+
+/* In protocol version 0 the last 32 bits of this header are reserved;
+ * in version 1 they carry sg_list_len_bytes.
+ */
+struct skd_command_header {
+	uint64_t	sg_list_dma_address;
+	uint16_t	tag;
+	uint8_t		attribute;       /* presumably SOFT_TASK_*; confirm */
+	uint8_t		add_cdb_len;     /* In 32 bit words */
+	uint32_t	sg_list_len_bytes; /* 8 + 2 + 1 + 1 + 4 = 16 bytes total */
+};
+
+struct skd_scsi_request {
+	struct		skd_command_header hdr;	/* 16 bytes (see its fields) */
+	unsigned char	cdb[16];		/* request totals 32 bytes */
+/*	unsigned char _reserved[16]; */
+};
+
+struct driver_inquiry_data {
+	uint8_t		peripheral_device_type:5; /* shares byte 0 with qualifier */
+	uint8_t		qualifier:3;
+	uint8_t		page_code;
+	uint16_t	page_length;
+	uint16_t	pcie_bus_number;
+	uint8_t		pcie_device_number;
+	uint8_t		pcie_function_number;
+	uint8_t		pcie_link_speed;
+	uint8_t		pcie_link_lanes;
+	uint16_t	pcie_vendor_id;
+	uint16_t	pcie_device_id;
+	uint16_t	pcie_subsystem_vendor_id;
+	uint16_t	pcie_subsystem_device_id;
+	uint8_t		reserved1[2];
+	uint8_t		reserved2[3];
+	uint8_t		driver_version_length;
+	uint8_t		driver_version[0x14];	/* 0x14 = 20 bytes */
+};
+
+#pragma pack(pop, s1120_h)
+
+#endif /* SKD_S1120_H */
diff --git a/drivers/block/smart1,2.h b/drivers/block/smart1,2.h
index a0b403a6b4e..e5565fbaeb3 100644
--- a/drivers/block/smart1,2.h
+++ b/drivers/block/smart1,2.h
@@ -95,7 +95,7 @@ static unsigned long smart4_completed(ctlr_info_t *h)
  /*
  *  This hardware returns interrupt pending at a different place and 
  *  it does not tell us if the fifo is empty, we will have check  
- *  that by getting a 0 back from the comamnd_completed call. 
+ *  that by getting a 0 back from the command_completed call. 
  */
 static unsigned long smart4_intr_pending(ctlr_info_t *h)
 {
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index cbfd9c0aef0..5814deb6963 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -25,7 +25,7 @@
 #define DRV_MODULE_VERSION	"1.0"
 #define DRV_MODULE_RELDATE	"June 25, 2007"
 
-static char version[] __devinitdata =
+static char version[] =
 	DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
 MODULE_AUTHOR("David S. Miller (davem@davemloft.net)");
 MODULE_DESCRIPTION("Sun LDOM virtual disk client driver");
@@ -103,7 +103,7 @@ static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static struct block_device_operations vdc_fops = {
+static const struct block_device_operations vdc_fops = {
 	.owner		= THIS_MODULE,
 	.getgeo		= vdc_getgeo,
 };
@@ -461,7 +461,7 @@ static int generic_request(struct vdc_port *port, u8 op, void *buf, int len)
 	int op_len, err;
 	void *req_buf;
 
-	if (!(((u64)1 << ((u64)op - 1)) & port->operations))
+	if (!(((u64)1 << (u64)op) & port->operations))
 		return -EOPNOTSUPP;
 
 	switch (op) {
@@ -592,7 +592,7 @@ static int generic_request(struct vdc_port *port, u8 op, void *buf, int len)
 	return err;
 }
 
-static int __devinit vdc_alloc_tx_ring(struct vdc_port *port)
+static int vdc_alloc_tx_ring(struct vdc_port *port)
 {
 	struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
 	unsigned long len, entry_size;
@@ -691,9 +691,8 @@ static int probe_disk(struct vdc_port *port)
 
 	port->disk = g;
 
-	blk_queue_max_hw_segments(q, port->ring_cookies);
-	blk_queue_max_phys_segments(q, port->ring_cookies);
-	blk_queue_max_sectors(q, port->max_xfer_size);
+	blk_queue_max_segments(q, port->ring_cookies);
+	blk_queue_max_hw_sectors(q, port->max_xfer_size);
 	g->major = vdc_major;
 	g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT;
 	strcpy(g->disk_name, port->disk_name);
@@ -726,7 +725,7 @@ static struct vio_driver_ops vdc_vio_ops = {
 	.handshake_complete	= vdc_handshake_complete,
 };
 
-static void __devinit print_version(void)
+static void print_version(void)
 {
 	static int version_printed;
 
@@ -734,8 +733,7 @@ static void __devinit print_version(void)
 		printk(KERN_INFO "%s", version);
 }
 
-static int __devinit vdc_port_probe(struct vio_dev *vdev,
-				    const struct vio_device_id *id)
+static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 {
 	struct mdesc_handle *hp;
 	struct vdc_port *port;
@@ -840,10 +838,7 @@ static struct vio_driver vdc_port_driver = {
 	.id_table	= vdc_port_match,
 	.probe		= vdc_port_probe,
 	.remove		= vdc_port_remove,
-	.driver		= {
-		.name	= "vdc_port",
-		.owner	= THIS_MODULE,
-	}
+	.name		= "vdc_port",
 };
 
 static int __init vdc_init(void)
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index cf7877fb8a7..6b44bbe528b 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -18,13 +18,14 @@
 
 #include <linux/module.h>
 #include <linux/fd.h>
+#include <linux/slab.h>
 #include <linux/blkdev.h>
+#include <linux/mutex.h>
 #include <linux/hdreg.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
 #include <linux/platform_device.h>
 
-#include <asm/macintosh.h>
 #include <asm/mac_via.h>
 
 #define CARDNAME "swim"
@@ -220,6 +221,7 @@ extern int swim_read_sector_header(struct swim __iomem *base,
 extern int swim_read_sector_data(struct swim __iomem *base,
 				 unsigned char *data);
 
+static DEFINE_MUTEX(swim_mutex);
 static inline void set_swim_mode(struct swim __iomem *base, int enable)
 {
 	struct iwm __iomem *iwm_base;
@@ -547,7 +549,7 @@ static void redo_fd_request(struct request_queue *q)
 		case READ:
 			err = floppy_read_sectors(fs, blk_rq_pos(req),
 						  blk_rq_cur_sectors(req),
-						  req->buffer);
+						  bio_data(req->bio));
 			break;
 		}
 	done:
@@ -660,11 +662,23 @@ out:
 	return err;
 }
 
-static int floppy_release(struct gendisk *disk, fmode_t mode)
+static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
+{
+	int ret;
+
+	mutex_lock(&swim_mutex);
+	ret = floppy_open(bdev, mode);
+	mutex_unlock(&swim_mutex);
+
+	return ret;
+}
+
+static void floppy_release(struct gendisk *disk, fmode_t mode)
 {
 	struct floppy_state *fs = disk->private_data;
 	struct swim __iomem *base = fs->swd->base;
 
+	mutex_lock(&swim_mutex);
 	if (fs->ref_count < 0)
 		fs->ref_count = 0;
 	else if (fs->ref_count > 0)
@@ -672,8 +686,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
 
 	if (fs->ref_count == 0)
 		swim_motor(base, OFF);
-
-	return 0;
+	mutex_unlock(&swim_mutex);
 }
 
 static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
@@ -689,7 +702,9 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
 	case FDEJECT:
 		if (fs->ref_count != 1)
 			return -EBUSY;
+		mutex_lock(&swim_mutex);
 		err = floppy_eject(fs);
+		mutex_unlock(&swim_mutex);
 		return err;
 
 	case FDGETPRM:
@@ -723,11 +738,12 @@ static int floppy_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static int floppy_check_change(struct gendisk *disk)
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing)
 {
 	struct floppy_state *fs = disk->private_data;
 
-	return fs->ejected;
+	return fs->ejected ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static int floppy_revalidate(struct gendisk *disk)
@@ -748,13 +764,13 @@ static int floppy_revalidate(struct gendisk *disk)
 	return !fs->disk_in;
 }
 
-static struct block_device_operations floppy_fops = {
+static const struct block_device_operations floppy_fops = {
 	.owner		 = THIS_MODULE,
-	.open		 = floppy_open,
+	.open		 = floppy_unlocked_open,
 	.release	 = floppy_release,
-	.locked_ioctl	 = floppy_ioctl,
+	.ioctl		 = floppy_ioctl,
 	.getgeo		 = floppy_getgeo,
-	.media_changed	 = floppy_check_change,
+	.check_events	 = floppy_check_events,
 	.revalidate_disk = floppy_revalidate,
 };
 
@@ -770,8 +786,7 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data)
 	return get_disk(swd->unit[drive].disk);
 }
 
-static int __devinit swim_add_floppy(struct swim_priv *swd,
-				     enum drive_location location)
+static int swim_add_floppy(struct swim_priv *swd, enum drive_location location)
 {
 	struct floppy_state *fs = &swd->unit[swd->floppy_count];
 	struct swim __iomem *base = swd->base;
@@ -794,7 +809,7 @@ static int __devinit swim_add_floppy(struct swim_priv *swd,
 	return 0;
 }
 
-static int __devinit swim_floppy_init(struct swim_priv *swd)
+static int swim_floppy_init(struct swim_priv *swd)
 {
 	int err;
 	int drive;
@@ -827,6 +842,7 @@ static int __devinit swim_floppy_init(struct swim_priv *swd)
 		swd->unit[drive].swd = swd;
 	}
 
+	spin_lock_init(&swd->lock);
 	swd->queue = blk_init_queue(do_fd_request, &swd->lock);
 	if (!swd->queue) {
 		err = -ENOMEM;
@@ -857,14 +873,14 @@ exit_put_disks:
 	return err;
 }
 
-static int __devinit swim_probe(struct platform_device *dev)
+static int swim_probe(struct platform_device *dev)
 {
 	struct resource *res;
 	struct swim __iomem *swim_base;
 	struct swim_priv *swd;
 	int ret;
 
-	res = platform_get_resource_byname(dev, IORESOURCE_MEM, "swim-regs");
+	res = platform_get_resource(dev, IORESOURCE_MEM, 0);
 	if (!res) {
 		ret = -ENODEV;
 		goto out;
@@ -877,7 +893,7 @@ static int __devinit swim_probe(struct platform_device *dev)
 
 	swim_base = ioremap(res->start, resource_size(res));
 	if (!swim_base) {
-		return -ENOMEM;
+		ret = -ENOMEM;
 		goto out_release_io;
 	}
 
@@ -908,7 +924,6 @@ static int __devinit swim_probe(struct platform_device *dev)
 	return 0;
 
 out_kfree:
-	platform_set_drvdata(dev, NULL);
 	kfree(swd);
 out_iounmap:
 	iounmap(swim_base);
@@ -918,7 +933,7 @@ out:
 	return ret;
 }
 
-static int __devexit swim_remove(struct platform_device *dev)
+static int swim_remove(struct platform_device *dev)
 {
 	struct swim_priv *swd = platform_get_drvdata(dev);
 	int drive;
@@ -942,11 +957,10 @@ static int __devexit swim_remove(struct platform_device *dev)
 
 	iounmap(swd->base);
 
-	res = platform_get_resource_byname(dev, IORESOURCE_MEM, "swim-regs");
+	res = platform_get_resource(dev, IORESOURCE_MEM, 0);
 	if (res)
 		release_mem_region(res->start, resource_size(res));
 
-	platform_set_drvdata(dev, NULL);
 	kfree(swd);
 
 	return 0;
@@ -954,7 +968,7 @@ static int __devexit swim_remove(struct platform_device *dev)
 
 static struct platform_driver swim_driver = {
 	.probe  = swim_probe,
-	.remove = __devexit_p(swim_remove),
+	.remove = swim_remove,
 	.driver   = {
 		.name	= CARDNAME,
 		.owner	= THIS_MODULE,
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 80df93e3cdd..523ee8fd4c1 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -16,6 +16,8 @@
  * handle GCR disks
  */
 
+#undef DEBUG
+
 #include <linux/stddef.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
@@ -25,8 +27,10 @@
 #include <linux/ioctl.h>
 #include <linux/blkdev.h>
 #include <linux/interrupt.h>
+#include <linux/mutex.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/wait.h>
 #include <asm/io.h>
 #include <asm/dbdma.h>
 #include <asm/prom.h>
@@ -35,12 +39,11 @@
 #include <asm/machdep.h>
 #include <asm/pmac_feature.h>
 
-static struct request_queue *swim3_queue;
-static struct gendisk *disks[2];
-static struct request *fd_req;
-
 #define MAX_FLOPPIES	2
 
+static DEFINE_MUTEX(swim3_mutex);
+static struct gendisk *disks[MAX_FLOPPIES];
+
 enum swim_state {
 	idle,
 	locating,
@@ -175,7 +178,6 @@ struct swim3 {
 
 struct floppy_state {
 	enum swim_state	state;
-	spinlock_t lock;
 	struct swim3 __iomem *swim3;	/* hardware registers */
 	struct dbdma_regs __iomem *dma;	/* DMA controller registers */
 	int	swim3_intr;	/* interrupt number for SWIM3 */
@@ -200,10 +202,22 @@ struct floppy_state {
 	int	ejected;
 	wait_queue_head_t wait;
 	int	wanted;
-	struct device_node*	media_bay; /* NULL when not in bay */
+	struct macio_dev *mdev;
 	char	dbdma_cmd_space[5 * sizeof(struct dbdma_cmd)];
+	int	index;
+	struct request *cur_req;
 };
 
+#define swim3_err(fmt, arg...)	dev_err(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#define swim3_warn(fmt, arg...)	dev_warn(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#define swim3_info(fmt, arg...)	dev_info(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+
+#ifdef DEBUG
+#define swim3_dbg(fmt, arg...)	dev_dbg(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#else
+#define swim3_dbg(fmt, arg...)	do { } while(0)
+#endif
+
 static struct floppy_state floppy_states[MAX_FLOPPIES];
 static int floppy_count = 0;
 static DEFINE_SPINLOCK(swim3_lock);
@@ -222,17 +236,8 @@ static unsigned short write_postamble[] = {
 	0, 0, 0, 0, 0, 0
 };
 
-static void swim3_select(struct floppy_state *fs, int sel);
-static void swim3_action(struct floppy_state *fs, int action);
-static int swim3_readbit(struct floppy_state *fs, int bit);
-static void do_fd_request(struct request_queue * q);
-static void start_request(struct floppy_state *fs);
-static void set_timeout(struct floppy_state *fs, int nticks,
-			void (*proc)(unsigned long));
-static void scan_track(struct floppy_state *fs);
 static void seek_track(struct floppy_state *fs, int n);
 static void init_dma(struct dbdma_cmd *cp, int cmd, void *buf, int count);
-static void setup_transfer(struct floppy_state *fs);
 static void act(struct floppy_state *fs);
 static void scan_timeout(unsigned long data);
 static void seek_timeout(unsigned long data);
@@ -247,22 +252,26 @@ static int fd_eject(struct floppy_state *fs);
 static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
 			unsigned int cmd, unsigned long param);
 static int floppy_open(struct block_device *bdev, fmode_t mode);
-static int floppy_release(struct gendisk *disk, fmode_t mode);
-static int floppy_check_change(struct gendisk *disk);
+static void floppy_release(struct gendisk *disk, fmode_t mode);
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing);
 static int floppy_revalidate(struct gendisk *disk);
 
-static bool swim3_end_request(int err, unsigned int nr_bytes)
+static bool swim3_end_request(struct floppy_state *fs, int err, unsigned int nr_bytes)
 {
-	if (__blk_end_request(fd_req, err, nr_bytes))
-		return true;
+	struct request *req = fs->cur_req;
+	int rc;
 
-	fd_req = NULL;
-	return false;
-}
+	swim3_dbg("  end request, err=%d nr_bytes=%d, cur_req=%p\n",
+		  err, nr_bytes, req);
 
-static bool swim3_end_request_cur(int err)
-{
-	return swim3_end_request(err, blk_rq_cur_bytes(fd_req));
+	if (err)
+		nr_bytes = blk_rq_cur_bytes(req);
+	rc = __blk_end_request(req, err, nr_bytes);
+	if (rc)
+		return true;
+	fs->cur_req = NULL;
+	return false;
 }
 
 static void swim3_select(struct floppy_state *fs, int sel)
@@ -300,51 +309,53 @@ static int swim3_readbit(struct floppy_state *fs, int bit)
 	return (stat & DATA) == 0;
 }
 
-static void do_fd_request(struct request_queue * q)
-{
-	int i;
-	for(i=0;i<floppy_count;i++)
-	{
-#ifdef CONFIG_PMAC_MEDIABAY
-		if (floppy_states[i].media_bay &&
-			check_media_bay(floppy_states[i].media_bay, MB_FD))
-			continue;
-#endif /* CONFIG_PMAC_MEDIABAY */
-		start_request(&floppy_states[i]);
-	}
-}
-
 static void start_request(struct floppy_state *fs)
 {
 	struct request *req;
 	unsigned long x;
 
+	swim3_dbg("start request, initial state=%d\n", fs->state);
+
 	if (fs->state == idle && fs->wanted) {
 		fs->state = available;
 		wake_up(&fs->wait);
 		return;
 	}
 	while (fs->state == idle) {
-		if (!fd_req) {
-			fd_req = blk_fetch_request(swim3_queue);
-			if (!fd_req)
+		swim3_dbg("start request, idle loop, cur_req=%p\n", fs->cur_req);
+		if (!fs->cur_req) {
+			fs->cur_req = blk_fetch_request(disks[fs->index]->queue);
+			swim3_dbg("  fetched request %p\n", fs->cur_req);
+			if (!fs->cur_req)
 				break;
 		}
-		req = fd_req;
-#if 0
-		printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
-		       req->rq_disk->disk_name, req->cmd,
-		       (long)blk_rq_pos(req), blk_rq_sectors(req), req->buffer);
-		printk("           errors=%d current_nr_sectors=%u\n",
-		       req->errors, blk_rq_cur_sectors(req));
+		req = fs->cur_req;
+
+		if (fs->mdev->media_bay &&
+		    check_media_bay(fs->mdev->media_bay) != MB_FD) {
+			swim3_dbg("%s", "  media bay absent, dropping req\n");
+			swim3_end_request(fs, -ENODEV, 0);
+			continue;
+		}
+
+#if 0 /* This is really too verbose */
+		swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
+			  req->rq_disk->disk_name, req->cmd,
+			  (long)blk_rq_pos(req), blk_rq_sectors(req),
+			  bio_data(req->bio));
+		swim3_dbg("           errors=%d current_nr_sectors=%u\n",
+			  req->errors, blk_rq_cur_sectors(req));
 #endif
 
 		if (blk_rq_pos(req) >= fs->total_secs) {
-			swim3_end_request_cur(-EIO);
+			swim3_dbg("  pos out of bounds (%ld, max is %ld)\n",
+				  (long)blk_rq_pos(req), (long)fs->total_secs);
+			swim3_end_request(fs, -EIO, 0);
 			continue;
 		}
 		if (fs->ejected) {
-			swim3_end_request_cur(-EIO);
+			swim3_dbg("%s", "  disk ejected\n");
+			swim3_end_request(fs, -EIO, 0);
 			continue;
 		}
 
@@ -352,7 +363,8 @@ static void start_request(struct floppy_state *fs)
 			if (fs->write_prot < 0)
 				fs->write_prot = swim3_readbit(fs, WRITE_PROT);
 			if (fs->write_prot) {
-				swim3_end_request_cur(-EIO);
+				swim3_dbg("%s", "  try to write, disk write protected\n");
+				swim3_end_request(fs, -EIO, 0);
 				continue;
 			}
 		}
@@ -367,7 +379,6 @@ static void start_request(struct floppy_state *fs)
 		x = ((long)blk_rq_pos(req)) % fs->secpercyl;
 		fs->head = x / fs->secpertrack;
 		fs->req_sector = x % fs->secpertrack + 1;
-		fd_req = req;
 		fs->state = do_transfer;
 		fs->retries = 0;
 
@@ -375,12 +386,14 @@ static void start_request(struct floppy_state *fs)
 	}
 }
 
+static void do_fd_request(struct request_queue * q)
+{
+	start_request(q->queuedata);
+}
+
 static void set_timeout(struct floppy_state *fs, int nticks,
 			void (*proc)(unsigned long))
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&fs->lock, flags);
 	if (fs->timeout_pending)
 		del_timer(&fs->timeout);
 	fs->timeout.expires = jiffies + nticks;
@@ -388,7 +401,6 @@ static void set_timeout(struct floppy_state *fs, int nticks,
 	fs->timeout.data = (unsigned long) fs;
 	add_timer(&fs->timeout);
 	fs->timeout_pending = 1;
-	spin_unlock_irqrestore(&fs->lock, flags);
 }
 
 static inline void scan_track(struct floppy_state *fs)
@@ -440,40 +452,45 @@ static inline void setup_transfer(struct floppy_state *fs)
 	struct swim3 __iomem *sw = fs->swim3;
 	struct dbdma_cmd *cp = fs->dma_cmd;
 	struct dbdma_regs __iomem *dr = fs->dma;
+	struct request *req = fs->cur_req;
 
-	if (blk_rq_cur_sectors(fd_req) <= 0) {
-		printk(KERN_ERR "swim3: transfer 0 sectors?\n");
+	if (blk_rq_cur_sectors(req) <= 0) {
+		swim3_warn("%s", "Transfer 0 sectors ?\n");
 		return;
 	}
-	if (rq_data_dir(fd_req) == WRITE)
+	if (rq_data_dir(req) == WRITE)
 		n = 1;
 	else {
 		n = fs->secpertrack - fs->req_sector + 1;
-		if (n > blk_rq_cur_sectors(fd_req))
-			n = blk_rq_cur_sectors(fd_req);
+		if (n > blk_rq_cur_sectors(req))
+			n = blk_rq_cur_sectors(req);
 	}
+
+	swim3_dbg("  setup xfer at sect %d (of %d) head %d for %d\n",
+		  fs->req_sector, fs->secpertrack, fs->head, n);
+
 	fs->scount = n;
 	swim3_select(fs, fs->head? READ_DATA_1: READ_DATA_0);
 	out_8(&sw->sector, fs->req_sector);
 	out_8(&sw->nsect, n);
 	out_8(&sw->gap3, 0);
 	out_le32(&dr->cmdptr, virt_to_bus(cp));
-	if (rq_data_dir(fd_req) == WRITE) {
+	if (rq_data_dir(req) == WRITE) {
 		/* Set up 3 dma commands: write preamble, data, postamble */
 		init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble));
 		++cp;
-		init_dma(cp, OUTPUT_MORE, fd_req->buffer, 512);
+		init_dma(cp, OUTPUT_MORE, bio_data(req->bio), 512);
 		++cp;
 		init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble));
 	} else {
-		init_dma(cp, INPUT_LAST, fd_req->buffer, n * 512);
+		init_dma(cp, INPUT_LAST, bio_data(req->bio), n * 512);
 	}
 	++cp;
 	out_le16(&cp->command, DBDMA_STOP);
 	out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
 	in_8(&sw->error);
 	out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
-	if (rq_data_dir(fd_req) == WRITE)
+	if (rq_data_dir(req) == WRITE)
 		out_8(&sw->control_bis, WRITE_SECTORS);
 	in_8(&sw->intr);
 	out_le32(&dr->control, (RUN << 16) | RUN);
@@ -486,12 +503,16 @@ static inline void setup_transfer(struct floppy_state *fs)
 static void act(struct floppy_state *fs)
 {
 	for (;;) {
+		swim3_dbg("  act loop, state=%d, req_cyl=%d, cur_cyl=%d\n",
+			  fs->state, fs->req_cyl, fs->cur_cyl);
+
 		switch (fs->state) {
 		case idle:
 			return;		/* XXX shouldn't get here */
 
 		case locating:
 			if (swim3_readbit(fs, TRACK_ZERO)) {
+				swim3_dbg("%s", "    locate track 0\n");
 				fs->cur_cyl = 0;
 				if (fs->req_cyl == 0)
 					fs->state = do_transfer;
@@ -509,7 +530,7 @@ static void act(struct floppy_state *fs)
 				break;
 			}
 			if (fs->req_cyl == fs->cur_cyl) {
-				printk("whoops, seeking 0\n");
+				swim3_warn("%s", "Whoops, seeking 0\n");
 				fs->state = do_transfer;
 				break;
 			}
@@ -525,7 +546,9 @@ static void act(struct floppy_state *fs)
 		case do_transfer:
 			if (fs->cur_cyl != fs->req_cyl) {
 				if (fs->retries > 5) {
-					swim3_end_request_cur(-EIO);
+					swim3_err("Wrong cylinder in transfer, want: %d got %d\n",
+						  fs->req_cyl, fs->cur_cyl);
+					swim3_end_request(fs, -EIO, 0);
 					fs->state = idle;
 					return;
 				}
@@ -540,7 +563,7 @@ static void act(struct floppy_state *fs)
 			return;
 
 		default:
-			printk(KERN_ERR"swim3: unknown state %d\n", fs->state);
+			swim3_err("Unknown state %d\n", fs->state);
 			return;
 		}
 	}
@@ -550,59 +573,75 @@ static void scan_timeout(unsigned long data)
 {
 	struct floppy_state *fs = (struct floppy_state *) data;
 	struct swim3 __iomem *sw = fs->swim3;
+	unsigned long flags;
+
+	swim3_dbg("* scan timeout, state=%d\n", fs->state);
 
+	spin_lock_irqsave(&swim3_lock, flags);
 	fs->timeout_pending = 0;
 	out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
 	out_8(&sw->select, RELAX);
 	out_8(&sw->intr_enable, 0);
 	fs->cur_cyl = -1;
 	if (fs->retries > 5) {
-		swim3_end_request_cur(-EIO);
+		swim3_end_request(fs, -EIO, 0);
 		fs->state = idle;
 		start_request(fs);
 	} else {
 		fs->state = jogging;
 		act(fs);
 	}
+	spin_unlock_irqrestore(&swim3_lock, flags);
 }
 
 static void seek_timeout(unsigned long data)
 {
 	struct floppy_state *fs = (struct floppy_state *) data;
 	struct swim3 __iomem *sw = fs->swim3;
+	unsigned long flags;
+
+	swim3_dbg("* seek timeout, state=%d\n", fs->state);
 
+	spin_lock_irqsave(&swim3_lock, flags);
 	fs->timeout_pending = 0;
 	out_8(&sw->control_bic, DO_SEEK);
 	out_8(&sw->select, RELAX);
 	out_8(&sw->intr_enable, 0);
-	printk(KERN_ERR "swim3: seek timeout\n");
-	swim3_end_request_cur(-EIO);
+	swim3_err("%s", "Seek timeout\n");
+	swim3_end_request(fs, -EIO, 0);
 	fs->state = idle;
 	start_request(fs);
+	spin_unlock_irqrestore(&swim3_lock, flags);
 }
 
 static void settle_timeout(unsigned long data)
 {
 	struct floppy_state *fs = (struct floppy_state *) data;
 	struct swim3 __iomem *sw = fs->swim3;
+	unsigned long flags;
 
+	swim3_dbg("* settle timeout, state=%d\n", fs->state);
+
+	spin_lock_irqsave(&swim3_lock, flags);
 	fs->timeout_pending = 0;
 	if (swim3_readbit(fs, SEEK_COMPLETE)) {
 		out_8(&sw->select, RELAX);
 		fs->state = locating;
 		act(fs);
-		return;
+		goto unlock;
 	}
 	out_8(&sw->select, RELAX);
 	if (fs->settle_time < 2*HZ) {
 		++fs->settle_time;
 		set_timeout(fs, 1, settle_timeout);
-		return;
+		goto unlock;
 	}
-	printk(KERN_ERR "swim3: seek settle timeout\n");
-	swim3_end_request_cur(-EIO);
+	swim3_err("%s", "Seek settle timeout\n");
+	swim3_end_request(fs, -EIO, 0);
 	fs->state = idle;
 	start_request(fs);
+ unlock:
+	spin_unlock_irqrestore(&swim3_lock, flags);
 }
 
 static void xfer_timeout(unsigned long data)
@@ -610,8 +649,12 @@ static void xfer_timeout(unsigned long data)
 	struct floppy_state *fs = (struct floppy_state *) data;
 	struct swim3 __iomem *sw = fs->swim3;
 	struct dbdma_regs __iomem *dr = fs->dma;
+	unsigned long flags;
 	int n;
 
+	swim3_dbg("* xfer timeout, state=%d\n", fs->state);
+
+	spin_lock_irqsave(&swim3_lock, flags);
 	fs->timeout_pending = 0;
 	out_le32(&dr->control, RUN << 16);
 	/* We must wait a bit for dbdma to stop */
@@ -620,12 +663,13 @@ static void xfer_timeout(unsigned long data)
 	out_8(&sw->intr_enable, 0);
 	out_8(&sw->control_bic, WRITE_SECTORS | DO_ACTION);
 	out_8(&sw->select, RELAX);
-	printk(KERN_ERR "swim3: timeout %sing sector %ld\n",
-	       (rq_data_dir(fd_req)==WRITE? "writ": "read"),
-	       (long)blk_rq_pos(fd_req));
-	swim3_end_request_cur(-EIO);
+	swim3_err("Timeout %sing sector %ld\n",
+	       (rq_data_dir(fs->cur_req)==WRITE? "writ": "read"),
+	       (long)blk_rq_pos(fs->cur_req));
+	swim3_end_request(fs, -EIO, 0);
 	fs->state = idle;
 	start_request(fs);
+	spin_unlock_irqrestore(&swim3_lock, flags);
 }
 
 static irqreturn_t swim3_interrupt(int irq, void *dev_id)
@@ -636,12 +680,17 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 	int stat, resid;
 	struct dbdma_regs __iomem *dr;
 	struct dbdma_cmd *cp;
+	unsigned long flags;
+	struct request *req = fs->cur_req;
+
+	swim3_dbg("* interrupt, state=%d\n", fs->state);
 
+	spin_lock_irqsave(&swim3_lock, flags);
 	intr = in_8(&sw->intr);
 	err = (intr & ERROR_INTR)? in_8(&sw->error): 0;
 	if ((intr & ERROR_INTR) && fs->state != do_transfer)
-		printk(KERN_ERR "swim3_interrupt, state=%d, dir=%x, intr=%x, err=%x\n",
-		       fs->state, rq_data_dir(fd_req), intr, err);
+		swim3_err("Non-transfer error interrupt: state=%d, dir=%x, intr=%x, err=%x\n",
+			  fs->state, rq_data_dir(req), intr, err);
 	switch (fs->state) {
 	case locating:
 		if (intr & SEEN_SECTOR) {
@@ -651,10 +700,10 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 			del_timer(&fs->timeout);
 			fs->timeout_pending = 0;
 			if (sw->ctrack == 0xff) {
-				printk(KERN_ERR "swim3: seen sector but cyl=ff?\n");
+				swim3_err("%s", "Seen sector but cyl=ff?\n");
 				fs->cur_cyl = -1;
 				if (fs->retries > 5) {
-					swim3_end_request_cur(-EIO);
+					swim3_end_request(fs, -EIO, 0);
 					fs->state = idle;
 					start_request(fs);
 				} else {
@@ -666,8 +715,8 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 			fs->cur_cyl = sw->ctrack;
 			fs->cur_sector = sw->csect;
 			if (fs->expect_cyl != -1 && fs->expect_cyl != fs->cur_cyl)
-				printk(KERN_ERR "swim3: expected cyl %d, got %d\n",
-				       fs->expect_cyl, fs->cur_cyl);
+				swim3_err("Expected cyl %d, got %d\n",
+					  fs->expect_cyl, fs->cur_cyl);
 			fs->state = do_transfer;
 			act(fs);
 		}
@@ -702,7 +751,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 		fs->timeout_pending = 0;
 		dr = fs->dma;
 		cp = fs->dma_cmd;
-		if (rq_data_dir(fd_req) == WRITE)
+		if (rq_data_dir(req) == WRITE)
 			++cp;
 		/*
 		 * Check that the main data transfer has finished.
@@ -727,31 +776,32 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 		if (intr & ERROR_INTR) {
 			n = fs->scount - 1 - resid / 512;
 			if (n > 0) {
-				blk_update_request(fd_req, 0, n << 9);
+				blk_update_request(req, 0, n << 9);
 				fs->req_sector += n;
 			}
 			if (fs->retries < 5) {
 				++fs->retries;
 				act(fs);
 			} else {
-				printk("swim3: error %sing block %ld (err=%x)\n",
-				       rq_data_dir(fd_req) == WRITE? "writ": "read",
-				       (long)blk_rq_pos(fd_req), err);
-				swim3_end_request_cur(-EIO);
+				swim3_err("Error %sing block %ld (err=%x)\n",
+				       rq_data_dir(req) == WRITE? "writ": "read",
+				       (long)blk_rq_pos(req), err);
+				swim3_end_request(fs, -EIO, 0);
 				fs->state = idle;
 			}
 		} else {
 			if ((stat & ACTIVE) == 0 || resid != 0) {
 				/* musta been an error */
-				printk(KERN_ERR "swim3: fd dma: stat=%x resid=%d\n", stat, resid);
-				printk(KERN_ERR "  state=%d, dir=%x, intr=%x, err=%x\n",
-				       fs->state, rq_data_dir(fd_req), intr, err);
-				swim3_end_request_cur(-EIO);
+				swim3_err("fd dma error: stat=%x resid=%d\n", stat, resid);
+				swim3_err("  state=%d, dir=%x, intr=%x, err=%x\n",
+					  fs->state, rq_data_dir(req), intr, err);
+				swim3_end_request(fs, -EIO, 0);
 				fs->state = idle;
 				start_request(fs);
 				break;
 			}
-			if (swim3_end_request(0, fs->scount << 9)) {
+			fs->retries = 0;
+			if (swim3_end_request(fs, 0, fs->scount << 9)) {
 				fs->req_sector += fs->scount;
 				if (fs->req_sector > fs->secpertrack) {
 					fs->req_sector -= fs->secpertrack;
@@ -768,8 +818,9 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 			start_request(fs);
 		break;
 	default:
-		printk(KERN_ERR "swim3: don't know what to do in state %d\n", fs->state);
+		swim3_err("Don't know what to do in state %d\n", fs->state);
 	}
+	spin_unlock_irqrestore(&swim3_lock, flags);
 	return IRQ_HANDLED;
 }
 
@@ -779,26 +830,34 @@ static void fd_dma_interrupt(int irq, void *dev_id)
 }
 */
 
+/* Called under swim3_mutex to grab exclusive access to a drive */
 static int grab_drive(struct floppy_state *fs, enum swim_state state,
 		      int interruptible)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&fs->lock, flags);
-	if (fs->state != idle) {
+	swim3_dbg("%s", "-> grab drive\n");
+
+	spin_lock_irqsave(&swim3_lock, flags);
+	if (fs->state != idle && fs->state != available) {
 		++fs->wanted;
-		while (fs->state != available) {
-			if (interruptible && signal_pending(current)) {
-				--fs->wanted;
-				spin_unlock_irqrestore(&fs->lock, flags);
-				return -EINTR;
-			}
-			interruptible_sleep_on(&fs->wait);
+		/* this will enable irqs in order to sleep */
+		if (!interruptible)
+			wait_event_lock_irq(fs->wait,
+                                        fs->state == available,
+                                        swim3_lock);
+		else if (wait_event_interruptible_lock_irq(fs->wait,
+					fs->state == available,
+					swim3_lock)) {
+			--fs->wanted;
+			spin_unlock_irqrestore(&swim3_lock, flags);
+			return -EINTR;
 		}
 		--fs->wanted;
 	}
 	fs->state = state;
-	spin_unlock_irqrestore(&fs->lock, flags);
+	spin_unlock_irqrestore(&swim3_lock, flags);
+
 	return 0;
 }
 
@@ -806,10 +865,12 @@ static void release_drive(struct floppy_state *fs)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&fs->lock, flags);
+	swim3_dbg("%s", "-> release drive\n");
+
+	spin_lock_irqsave(&swim3_lock, flags);
 	fs->state = idle;
 	start_request(fs);
-	spin_unlock_irqrestore(&fs->lock, flags);
+	spin_unlock_irqrestore(&swim3_lock, flags);
 }
 
 static int fd_eject(struct floppy_state *fs)
@@ -840,7 +901,7 @@ static int fd_eject(struct floppy_state *fs)
 static struct floppy_struct floppy_type =
 	{ 2880,18,2,80,0,0x1B,0x00,0xCF,0x6C,NULL };	/*  7 1.44MB 3.5"   */
 
-static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
+static int floppy_locked_ioctl(struct block_device *bdev, fmode_t mode,
 			unsigned int cmd, unsigned long param)
 {
 	struct floppy_state *fs = bdev->bd_disk->private_data;
@@ -849,10 +910,9 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
 	if ((cmd & 0x80) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-#ifdef CONFIG_PMAC_MEDIABAY
-	if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD))
+	if (fs->mdev->media_bay &&
+	    check_media_bay(fs->mdev->media_bay) != MB_FD)
 		return -ENXIO;
-#endif
 
 	switch (cmd) {
 	case FDEJECT:
@@ -869,6 +929,18 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
 	return -ENOTTY;
 }
 
+static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
+				 unsigned int cmd, unsigned long param)
+{
+	int ret;
+
+	mutex_lock(&swim3_mutex);
+	ret = floppy_locked_ioctl(bdev, mode, cmd, param);
+	mutex_unlock(&swim3_mutex);
+
+	return ret;
+}
+
 static int floppy_open(struct block_device *bdev, fmode_t mode)
 {
 	struct floppy_state *fs = bdev->bd_disk->private_data;
@@ -876,10 +948,9 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 	int n, err = 0;
 
 	if (fs->ref_count == 0) {
-#ifdef CONFIG_PMAC_MEDIABAY
-		if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD))
+		if (fs->mdev->media_bay &&
+		    check_media_bay(fs->mdev->media_bay) != MB_FD)
 			return -ENXIO;
-#endif
 		out_8(&sw->setup, S_IBM_DRIVE | S_FCLK_DIV2);
 		out_8(&sw->control_bic, 0xff);
 		out_8(&sw->mode, 0x95);
@@ -939,22 +1010,36 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 	return 0;
 }
 
-static int floppy_release(struct gendisk *disk, fmode_t mode)
+static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
+{
+	int ret;
+
+	mutex_lock(&swim3_mutex);
+	ret = floppy_open(bdev, mode);
+	mutex_unlock(&swim3_mutex);
+
+	return ret;
+}
+
+static void floppy_release(struct gendisk *disk, fmode_t mode)
 {
 	struct floppy_state *fs = disk->private_data;
 	struct swim3 __iomem *sw = fs->swim3;
+
+	mutex_lock(&swim3_mutex);
 	if (fs->ref_count > 0 && --fs->ref_count == 0) {
 		swim3_action(fs, MOTOR_OFF);
 		out_8(&sw->control_bic, 0xff);
 		swim3_select(fs, RELAX);
 	}
-	return 0;
+	mutex_unlock(&swim3_mutex);
 }
 
-static int floppy_check_change(struct gendisk *disk)
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing)
 {
 	struct floppy_state *fs = disk->private_data;
-	return fs->ejected;
+	return fs->ejected ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static int floppy_revalidate(struct gendisk *disk)
@@ -963,10 +1048,9 @@ static int floppy_revalidate(struct gendisk *disk)
 	struct swim3 __iomem *sw;
 	int ret, n;
 
-#ifdef CONFIG_PMAC_MEDIABAY
-	if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD))
+	if (fs->mdev->media_bay &&
+	    check_media_bay(fs->mdev->media_bay) != MB_FD)
 		return -ENXIO;
-#endif
 
 	sw = fs->swim3;
 	grab_drive(fs, revalidating, 0);
@@ -998,102 +1082,110 @@ static int floppy_revalidate(struct gendisk *disk)
 	return ret;
 }
 
-static struct block_device_operations floppy_fops = {
-	.open		= floppy_open,
+static const struct block_device_operations floppy_fops = {
+	.open		= floppy_unlocked_open,
 	.release	= floppy_release,
-	.locked_ioctl	= floppy_ioctl,
-	.media_changed	= floppy_check_change,
+	.ioctl		= floppy_ioctl,
+	.check_events	= floppy_check_events,
 	.revalidate_disk= floppy_revalidate,
 };
 
+static void swim3_mb_event(struct macio_dev* mdev, int mb_state)
+{
+	struct floppy_state *fs = macio_get_drvdata(mdev);
+	struct swim3 __iomem *sw;
+
+	if (!fs)
+		return;
+
+	sw = fs->swim3;
+
+	if (mb_state != MB_FD)
+		return;
+
+	/* Clear state */
+	out_8(&sw->intr_enable, 0);
+	in_8(&sw->intr);
+	in_8(&sw->error);
+}
+
 static int swim3_add_device(struct macio_dev *mdev, int index)
 {
-	struct device_node *swim = mdev->ofdev.node;
-	struct device_node *mediabay;
+	struct device_node *swim = mdev->ofdev.dev.of_node;
 	struct floppy_state *fs = &floppy_states[index];
 	int rc = -EBUSY;
 
+	/* Do this first: message macros use fs->mdev, fs->index */
+	memset(fs, 0, sizeof(*fs));
+	fs->mdev = mdev;
+	fs->index = index;
+
 	/* Check & Request resources */
 	if (macio_resource_count(mdev) < 2) {
-		printk(KERN_WARNING "ifd%d: no address for %s\n",
-		       index, swim->full_name);
+		swim3_err("%s", "No address in device-tree\n");
 		return -ENXIO;
 	}
-	if (macio_irq_count(mdev) < 2) {
-		printk(KERN_WARNING "fd%d: no intrs for device %s\n",
-			index, swim->full_name);
+	if (macio_irq_count(mdev) < 1) {
+		swim3_err("%s", "No interrupt in device-tree\n");
+		return -ENXIO;
 	}
 	if (macio_request_resource(mdev, 0, "swim3 (mmio)")) {
-		printk(KERN_ERR "fd%d: can't request mmio resource for %s\n",
-		       index, swim->full_name);
+		swim3_err("%s", "Can't request mmio resource\n");
 		return -EBUSY;
 	}
 	if (macio_request_resource(mdev, 1, "swim3 (dma)")) {
-		printk(KERN_ERR "fd%d: can't request dma resource for %s\n",
-		       index, swim->full_name);
+		swim3_err("%s", "Can't request dma resource\n");
 		macio_release_resource(mdev, 0);
 		return -EBUSY;
 	}
 	dev_set_drvdata(&mdev->ofdev.dev, fs);
 
-	mediabay = (strcasecmp(swim->parent->type, "media-bay") == 0) ?
-		swim->parent : NULL;
-	if (mediabay == NULL)
+	if (mdev->media_bay == NULL)
 		pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 1);
 	
-	memset(fs, 0, sizeof(*fs));
-	spin_lock_init(&fs->lock);
 	fs->state = idle;
 	fs->swim3 = (struct swim3 __iomem *)
 		ioremap(macio_resource_start(mdev, 0), 0x200);
 	if (fs->swim3 == NULL) {
-		printk("fd%d: couldn't map registers for %s\n",
-		       index, swim->full_name);
+		swim3_err("%s", "Couldn't map mmio registers\n");
 		rc = -ENOMEM;
 		goto out_release;
 	}
 	fs->dma = (struct dbdma_regs __iomem *)
 		ioremap(macio_resource_start(mdev, 1), 0x200);
 	if (fs->dma == NULL) {
-		printk("fd%d: couldn't map DMA for %s\n",
-		       index, swim->full_name);
+		swim3_err("%s", "Couldn't map dma registers\n");
 		iounmap(fs->swim3);
 		rc = -ENOMEM;
 		goto out_release;
 	}
 	fs->swim3_intr = macio_irq(mdev, 0);
-	fs->dma_intr = macio_irq(mdev, 1);;
+	fs->dma_intr = macio_irq(mdev, 1);
 	fs->cur_cyl = -1;
 	fs->cur_sector = -1;
 	fs->secpercyl = 36;
 	fs->secpertrack = 18;
 	fs->total_secs = 2880;
-	fs->media_bay = mediabay;
 	init_waitqueue_head(&fs->wait);
 
 	fs->dma_cmd = (struct dbdma_cmd *) DBDMA_ALIGN(fs->dbdma_cmd_space);
 	memset(fs->dma_cmd, 0, 2 * sizeof(struct dbdma_cmd));
 	st_le16(&fs->dma_cmd[1].command, DBDMA_STOP);
 
+	if (mdev->media_bay == NULL || check_media_bay(mdev->media_bay) == MB_FD)
+		swim3_mb_event(mdev, MB_FD);
+
 	if (request_irq(fs->swim3_intr, swim3_interrupt, 0, "SWIM3", fs)) {
-		printk(KERN_ERR "fd%d: couldn't request irq %d for %s\n",
-		       index, fs->swim3_intr, swim->full_name);
+		swim3_err("%s", "Couldn't request interrupt\n");
 		pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 0);
 		goto out_unmap;
 		return -EBUSY;
 	}
-/*
-	if (request_irq(fs->dma_intr, fd_dma_interrupt, 0, "SWIM3-dma", fs)) {
-		printk(KERN_ERR "Couldn't get irq %d for SWIM3 DMA",
-		       fs->dma_intr);
-		return -EBUSY;
-	}
-*/
 
 	init_timer(&fs->timeout);
 
-	printk(KERN_INFO "fd%d: SWIM3 floppy controller %s\n", floppy_count,
-		mediabay ? "in media bay" : "");
+	swim3_info("SWIM3 floppy controller %s\n",
+		mdev->media_bay ? "in media bay" : "");
 
 	return 0;
 
@@ -1108,43 +1200,45 @@ static int swim3_add_device(struct macio_dev *mdev, int index)
 	return rc;
 }
 
-static int __devinit swim3_attach(struct macio_dev *mdev, const struct of_device_id *match)
+static int swim3_attach(struct macio_dev *mdev,
+			const struct of_device_id *match)
 {
-	int i, rc;
 	struct gendisk *disk;
+	int index, rc;
+
+	index = floppy_count++;
+	if (index >= MAX_FLOPPIES)
+		return -ENXIO;
 
 	/* Add the drive */
-	rc = swim3_add_device(mdev, floppy_count);
+	rc = swim3_add_device(mdev, index);
 	if (rc)
 		return rc;
+	/* Now register that disk. Same comment about failure handling */
+	disk = disks[index] = alloc_disk(1);
+	if (disk == NULL)
+		return -ENOMEM;
+	disk->queue = blk_init_queue(do_fd_request, &swim3_lock);
+	if (disk->queue == NULL) {
+		put_disk(disk);
+		return -ENOMEM;
+	}
+	disk->queue->queuedata = &floppy_states[index];
 
-	/* Now create the queue if not there yet */
-	if (swim3_queue == NULL) {
+	if (index == 0) {
 		/* If we failed, there isn't much we can do as the driver is still
 		 * too dumb to remove the device, just bail out
 		 */
 		if (register_blkdev(FLOPPY_MAJOR, "fd"))
 			return 0;
-		swim3_queue = blk_init_queue(do_fd_request, &swim3_lock);
-		if (swim3_queue == NULL) {
-			unregister_blkdev(FLOPPY_MAJOR, "fd");
-			return 0;
-		}
 	}
 
-	/* Now register that disk. Same comment about failure handling */
-	i = floppy_count++;
-	disk = disks[i] = alloc_disk(1);
-	if (disk == NULL)
-		return 0;
-
 	disk->major = FLOPPY_MAJOR;
-	disk->first_minor = i;
+	disk->first_minor = index;
 	disk->fops = &floppy_fops;
-	disk->private_data = &floppy_states[i];
-	disk->queue = swim3_queue;
+	disk->private_data = &floppy_states[index];
 	disk->flags |= GENHD_FL_REMOVABLE;
-	sprintf(disk->disk_name, "fd%d", i);
+	sprintf(disk->disk_name, "fd%d", index);
 	set_capacity(disk, 2880);
 	add_disk(disk);
 
@@ -1162,13 +1256,19 @@ static struct of_device_id swim3_match[] =
 	{
 	.compatible	= "swim3"
 	},
+	{ /* end of list */ }
 };
 
 static struct macio_driver swim3_driver =
 {
-	.name 		= "swim3",
-	.match_table	= swim3_match,
+	.driver = {
+		.name 		= "swim3",
+		.of_match_table	= swim3_match,
+	},
 	.probe		= swim3_attach,
+#ifdef CONFIG_PMAC_MEDIABAY
+	.mediabay_event	= swim3_mb_event,
+#endif
 #if 0
 	.suspend	= swim3_suspend,
 	.resume		= swim3_resume,
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index da403b6a7f4..d5e2d12b9d9 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -409,7 +409,7 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 static void carm_remove_one (struct pci_dev *pdev);
 static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
-static struct pci_device_id carm_pci_tbl[] = {
+static const struct pci_device_id carm_pci_tbl[] = {
 	{ PCI_VENDOR_ID_PROMISE, 0x8000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, },
 	{ PCI_VENDOR_ID_PROMISE, 0x8002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, },
 	{ }	/* terminate list */
@@ -423,7 +423,7 @@ static struct pci_driver carm_driver = {
 	.remove		= carm_remove_one,
 };
 
-static struct block_device_operations carm_bd_ops = {
+static const struct block_device_operations carm_bd_ops = {
 	.owner		= THIS_MODULE,
 	.getgeo		= carm_bdev_getgeo,
 };
@@ -619,8 +619,10 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
 	       host->state == HST_DEV_SCAN);
 	spin_unlock_irq(&host->lock);
 
-	DPRINTK("blk_insert_request, tag == %u\n", idx);
-	blk_insert_request(host->oob_q, crq->rq, 1, crq);
+	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
+	crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+	crq->rq->special = crq;
+	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
 
 	return 0;
 
@@ -658,8 +660,10 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
 	BUG_ON(rc < 0);
 	crq->msg_bucket = (u32) rc;
 
-	DPRINTK("blk_insert_request, tag == %u\n", idx);
-	blk_insert_request(host->oob_q, crq->rq, 1, crq);
+	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
+	crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+	crq->rq->special = crq;
+	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
 
 	return 0;
 }
@@ -1116,7 +1120,7 @@ static inline void carm_handle_resp(struct carm_host *host,
 			break;
 		case MISC_GET_FW_VER: {
 			struct carm_fw_ver *ver = (struct carm_fw_ver *)
-				mem + sizeof(struct carm_msg_get_fw_ver);
+				(mem + sizeof(struct carm_msg_get_fw_ver));
 			if (!error) {
 				host->fw_ver = le32_to_cpu(ver->version);
 				host->flags |= (ver->features & FL_FW_VER_MASK);
@@ -1518,8 +1522,7 @@ static int carm_init_disks(struct carm_host *host)
 			break;
 		}
 		disk->queue = q;
-		blk_queue_max_hw_segments(q, CARM_MAX_REQ_SG);
-		blk_queue_max_phys_segments(q, CARM_MAX_REQ_SG);
+		blk_queue_max_segments(q, CARM_MAX_REQ_SG);
 		blk_queue_segment_boundary(q, CARM_SG_BOUNDARY);
 
 		q->queuedata = port;
@@ -1564,15 +1567,13 @@ static int carm_init_shm(struct carm_host *host)
 
 static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 {
-	static unsigned int printed_version;
 	struct carm_host *host;
 	unsigned int pci_dac;
 	int rc;
 	struct request_queue *q;
 	unsigned int i;
 
-	if (!printed_version++)
-		printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
+	printk_once(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
 
 	rc = pci_enable_device(pdev);
 	if (rc)
@@ -1743,20 +1744,6 @@ static void carm_remove_one (struct pci_dev *pdev)
 	kfree(host);
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
-	pci_set_drvdata(pdev, NULL);
 }
 
-static int __init carm_init(void)
-{
-	return pci_register_driver(&carm_driver);
-}
-
-static void __exit carm_exit(void)
-{
-	pci_unregister_driver(&carm_driver);
-}
-
-module_init(carm_init);
-module_exit(carm_exit);
-
-
+module_pci_driver(carm_driver);
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
deleted file mode 100644
index cc54473b8e7..00000000000
--- a/drivers/block/ub.c
+++ /dev/null
@@ -1,2487 +0,0 @@
-/*
- * The low performance USB storage driver (ub).
- *
- * Copyright (c) 1999, 2000 Matthew Dharm (mdharm-usb@one-eyed-alien.net)
- * Copyright (C) 2004 Pete Zaitcev (zaitcev@yahoo.com)
- *
- * This work is a part of Linux kernel, is derived from it,
- * and is not licensed separately. See file COPYING for details.
- *
- * TODO (sorted by decreasing priority)
- *  -- Return sense now that rq allows it (we always auto-sense anyway).
- *  -- set readonly flag for CDs, set removable flag for CF readers
- *  -- do inquiry and verify we got a disk and not a tape (for LUN mismatch)
- *  -- verify the 13 conditions and do bulk resets
- *  -- highmem
- *  -- move top_sense and work_bcs into separate allocations (if they survive)
- *     for cache purists and esoteric architectures.
- *  -- Allocate structure for LUN 0 before the first ub_sync_tur, avoid NULL. ?
- *  -- prune comments, they are too volumnous
- *  -- Resove XXX's
- *  -- CLEAR, CLR2STS, CLRRS seem to be ripe for refactoring.
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/usb.h>
-#include <linux/usb_usual.h>
-#include <linux/blkdev.h>
-#include <linux/timer.h>
-#include <linux/scatterlist.h>
-#include <scsi/scsi.h>
-
-#define DRV_NAME "ub"
-
-#define UB_MAJOR 180
-
-/*
- * The command state machine is the key model for understanding of this driver.
- *
- * The general rule is that all transitions are done towards the bottom
- * of the diagram, thus preventing any loops.
- *
- * An exception to that is how the STAT state is handled. A counter allows it
- * to be re-entered along the path marked with [C].
- *
- *       +--------+
- *       ! INIT   !
- *       +--------+
- *           !
- *        ub_scsi_cmd_start fails ->--------------------------------------\
- *           !                                                            !
- *           V                                                            !
- *       +--------+                                                       !
- *       ! CMD    !                                                       !
- *       +--------+                                                       !
- *           !                                            +--------+      !
- *         was -EPIPE -->-------------------------------->! CLEAR  !      !
- *           !                                            +--------+      !
- *           !                                                !           !
- *         was error -->------------------------------------- ! --------->\
- *           !                                                !           !
- *  /--<-- cmd->dir == NONE ?                                 !           !
- *  !        !                                                !           !
- *  !        V                                                !           !
- *  !    +--------+                                           !           !
- *  !    ! DATA   !                                           !           !
- *  !    +--------+                                           !           !
- *  !        !                           +---------+          !           !
- *  !      was -EPIPE -->--------------->! CLR2STS !          !           !
- *  !        !                           +---------+          !           !
- *  !        !                                !               !           !
- *  !        !                              was error -->---- ! --------->\
- *  !      was error -->--------------------- ! ------------- ! --------->\
- *  !        !                                !               !           !
- *  !        V                                !               !           !
- *  \--->+--------+                           !               !           !
- *       ! STAT   !<--------------------------/               !           !
- *  /--->+--------+                                           !           !
- *  !        !                                                !           !
- * [C]     was -EPIPE -->-----------\                         !           !
- *  !        !                      !                         !           !
- *  +<---- len == 0                 !                         !           !
- *  !        !                      !                         !           !
- *  !      was error -->--------------------------------------!---------->\
- *  !        !                      !                         !           !
- *  +<---- bad CSW                  !                         !           !
- *  +<---- bad tag                  !                         !           !
- *  !        !                      V                         !           !
- *  !        !                 +--------+                     !           !
- *  !        !                 ! CLRRS  !                     !           !
- *  !        !                 +--------+                     !           !
- *  !        !                      !                         !           !
- *  \------- ! --------------------[C]--------\               !           !
- *           !                                !               !           !
- *         cmd->error---\                +--------+           !           !
- *           !          +--------------->! SENSE  !<----------/           !
- *         STAT_FAIL----/                +--------+                       !
- *           !                                !                           V
- *           !                                V                      +--------+
- *           \--------------------------------\--------------------->! DONE   !
- *                                                                   +--------+
- */
-
-/*
- * This many LUNs per USB device.
- * Every one of them takes a host, see UB_MAX_HOSTS.
- */
-#define UB_MAX_LUNS   9
-
-/*
- */
-
-#define UB_PARTS_PER_LUN      8
-
-#define UB_MAX_CDB_SIZE      16		/* Corresponds to Bulk */
-
-#define UB_SENSE_SIZE  18
-
-/*
- */
-
-/* command block wrapper */
-struct bulk_cb_wrap {
-	__le32	Signature;		/* contains 'USBC' */
-	u32	Tag;			/* unique per command id */
-	__le32	DataTransferLength;	/* size of data */
-	u8	Flags;			/* direction in bit 0 */
-	u8	Lun;			/* LUN */
-	u8	Length;			/* of of the CDB */
-	u8	CDB[UB_MAX_CDB_SIZE];	/* max command */
-};
-
-#define US_BULK_CB_WRAP_LEN	31
-#define US_BULK_CB_SIGN		0x43425355	/*spells out USBC */
-#define US_BULK_FLAG_IN		1
-#define US_BULK_FLAG_OUT	0
-
-/* command status wrapper */
-struct bulk_cs_wrap {
-	__le32	Signature;		/* should = 'USBS' */
-	u32	Tag;			/* same as original command */
-	__le32	Residue;		/* amount not transferred */
-	u8	Status;			/* see below */
-};
-
-#define US_BULK_CS_WRAP_LEN	13
-#define US_BULK_CS_SIGN		0x53425355	/* spells out 'USBS' */
-#define US_BULK_STAT_OK		0
-#define US_BULK_STAT_FAIL	1
-#define US_BULK_STAT_PHASE	2
-
-/* bulk-only class specific requests */
-#define US_BULK_RESET_REQUEST	0xff
-#define US_BULK_GET_MAX_LUN	0xfe
-
-/*
- */
-struct ub_dev;
-
-#define UB_MAX_REQ_SG	9	/* cdrecord requires 32KB and maybe a header */
-#define UB_MAX_SECTORS 64
-
-/*
- * A second is more than enough for a 32K transfer (UB_MAX_SECTORS)
- * even if a webcam hogs the bus, but some devices need time to spin up.
- */
-#define UB_URB_TIMEOUT	(HZ*2)
-#define UB_DATA_TIMEOUT	(HZ*5)	/* ZIP does spin-ups in the data phase */
-#define UB_STAT_TIMEOUT	(HZ*5)	/* Same spinups and eject for a dataless cmd. */
-#define UB_CTRL_TIMEOUT	(HZ/2)	/* 500ms ought to be enough to clear a stall */
-
-/*
- * An instance of a SCSI command in transit.
- */
-#define UB_DIR_NONE	0
-#define UB_DIR_READ	1
-#define UB_DIR_ILLEGAL2	2
-#define UB_DIR_WRITE	3
-
-#define UB_DIR_CHAR(c)  (((c)==UB_DIR_WRITE)? 'w': \
-			 (((c)==UB_DIR_READ)? 'r': 'n'))
-
-enum ub_scsi_cmd_state {
-	UB_CMDST_INIT,			/* Initial state */
-	UB_CMDST_CMD,			/* Command submitted */
-	UB_CMDST_DATA,			/* Data phase */
-	UB_CMDST_CLR2STS,		/* Clearing before requesting status */
-	UB_CMDST_STAT,			/* Status phase */
-	UB_CMDST_CLEAR,			/* Clearing a stall (halt, actually) */
-	UB_CMDST_CLRRS,			/* Clearing before retrying status */
-	UB_CMDST_SENSE,			/* Sending Request Sense */
-	UB_CMDST_DONE			/* Final state */
-};
-
-struct ub_scsi_cmd {
-	unsigned char cdb[UB_MAX_CDB_SIZE];
-	unsigned char cdb_len;
-
-	unsigned char dir;		/* 0 - none, 1 - read, 3 - write. */
-	enum ub_scsi_cmd_state state;
-	unsigned int tag;
-	struct ub_scsi_cmd *next;
-
-	int error;			/* Return code - valid upon done */
-	unsigned int act_len;		/* Return size */
-	unsigned char key, asc, ascq;	/* May be valid if error==-EIO */
-
-	int stat_count;			/* Retries getting status. */
-	unsigned int timeo;		/* jiffies until rq->timeout changes */
-
-	unsigned int len;		/* Requested length */
-	unsigned int current_sg;
-	unsigned int nsg;		/* sgv[nsg] */
-	struct scatterlist sgv[UB_MAX_REQ_SG];
-
-	struct ub_lun *lun;
-	void (*done)(struct ub_dev *, struct ub_scsi_cmd *);
-	void *back;
-};
-
-struct ub_request {
-	struct request *rq;
-	unsigned int current_try;
-	unsigned int nsg;		/* sgv[nsg] */
-	struct scatterlist sgv[UB_MAX_REQ_SG];
-};
-
-/*
- */
-struct ub_capacity {
-	unsigned long nsec;		/* Linux size - 512 byte sectors */
-	unsigned int bsize;		/* Linux hardsect_size */
-	unsigned int bshift;		/* Shift between 512 and hard sects */
-};
-
-/*
- * This is a direct take-off from linux/include/completion.h
- * The difference is that I do not wait on this thing, just poll.
- * When I want to wait (ub_probe), I just use the stock completion.
- *
- * Note that INIT_COMPLETION takes no lock. It is correct. But why
- * in the bloody hell that thing takes struct instead of pointer to struct
- * is quite beyond me. I just copied it from the stock completion.
- */
-struct ub_completion {
-	unsigned int done;
-	spinlock_t lock;
-};
-
-static inline void ub_init_completion(struct ub_completion *x)
-{
-	x->done = 0;
-	spin_lock_init(&x->lock);
-}
-
-#define UB_INIT_COMPLETION(x)	((x).done = 0)
-
-static void ub_complete(struct ub_completion *x)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&x->lock, flags);
-	x->done++;
-	spin_unlock_irqrestore(&x->lock, flags);
-}
-
-static int ub_is_completed(struct ub_completion *x)
-{
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&x->lock, flags);
-	ret = x->done;
-	spin_unlock_irqrestore(&x->lock, flags);
-	return ret;
-}
-
-/*
- */
-struct ub_scsi_cmd_queue {
-	int qlen, qmax;
-	struct ub_scsi_cmd *head, *tail;
-};
-
-/*
- * The block device instance (one per LUN).
- */
-struct ub_lun {
-	struct ub_dev *udev;
-	struct list_head link;
-	struct gendisk *disk;
-	int id;				/* Host index */
-	int num;			/* LUN number */
-	char name[16];
-
-	int changed;			/* Media was changed */
-	int removable;
-	int readonly;
-
-	struct ub_request urq;
-
-	/* Use Ingo's mempool if or when we have more than one command. */
-	/*
-	 * Currently we never need more than one command for the whole device.
-	 * However, giving every LUN a command is a cheap and automatic way
-	 * to enforce fairness between them.
-	 */
-	int cmda[1];
-	struct ub_scsi_cmd cmdv[1];
-
-	struct ub_capacity capacity; 
-};
-
-/*
- * The USB device instance.
- */
-struct ub_dev {
-	spinlock_t *lock;
-	atomic_t poison;		/* The USB device is disconnected */
-	int openc;			/* protected by ub_lock! */
-					/* kref is too implicit for our taste */
-	int reset;			/* Reset is running */
-	int bad_resid;
-	unsigned int tagcnt;
-	char name[12];
-	struct usb_device *dev;
-	struct usb_interface *intf;
-
-	struct list_head luns;
-
-	unsigned int send_bulk_pipe;	/* cached pipe values */
-	unsigned int recv_bulk_pipe;
-	unsigned int send_ctrl_pipe;
-	unsigned int recv_ctrl_pipe;
-
-	struct tasklet_struct tasklet;
-
-	struct ub_scsi_cmd_queue cmd_queue;
-	struct ub_scsi_cmd top_rqs_cmd;	/* REQUEST SENSE */
-	unsigned char top_sense[UB_SENSE_SIZE];
-
-	struct ub_completion work_done;
-	struct urb work_urb;
-	struct timer_list work_timer;
-	int last_pipe;			/* What might need clearing */
-	__le32 signature;		/* Learned signature */
-	struct bulk_cb_wrap work_bcb;
-	struct bulk_cs_wrap work_bcs;
-	struct usb_ctrlrequest work_cr;
-
-	struct work_struct reset_work;
-	wait_queue_head_t reset_wait;
-};
-
-/*
- */
-static void ub_cleanup(struct ub_dev *sc);
-static int ub_request_fn_1(struct ub_lun *lun, struct request *rq);
-static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun,
-    struct ub_scsi_cmd *cmd, struct ub_request *urq);
-static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun,
-    struct ub_scsi_cmd *cmd, struct ub_request *urq);
-static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
-static void ub_end_rq(struct request *rq, unsigned int status);
-static int ub_rw_cmd_retry(struct ub_dev *sc, struct ub_lun *lun,
-    struct ub_request *urq, struct ub_scsi_cmd *cmd);
-static int ub_submit_scsi(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
-static void ub_urb_complete(struct urb *urb);
-static void ub_scsi_action(unsigned long _dev);
-static void ub_scsi_dispatch(struct ub_dev *sc);
-static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
-static void ub_data_start(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
-static void ub_state_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd, int rc);
-static int __ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
-static void ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
-static void ub_state_stat_counted(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
-static void ub_state_sense(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
-static int ub_submit_clear_stall(struct ub_dev *sc, struct ub_scsi_cmd *cmd,
-    int stalled_pipe);
-static void ub_top_sense_done(struct ub_dev *sc, struct ub_scsi_cmd *scmd);
-static void ub_reset_enter(struct ub_dev *sc, int try);
-static void ub_reset_task(struct work_struct *work);
-static int ub_sync_tur(struct ub_dev *sc, struct ub_lun *lun);
-static int ub_sync_read_cap(struct ub_dev *sc, struct ub_lun *lun,
-    struct ub_capacity *ret);
-static int ub_sync_reset(struct ub_dev *sc);
-static int ub_probe_clear_stall(struct ub_dev *sc, int stalled_pipe);
-static int ub_probe_lun(struct ub_dev *sc, int lnum);
-
-/*
- */
-#ifdef CONFIG_USB_LIBUSUAL
-
-#define ub_usb_ids  usb_storage_usb_ids
-#else
-
-static struct usb_device_id ub_usb_ids[] = {
-	{ USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_SCSI, US_PR_BULK) },
-	{ }
-};
-
-MODULE_DEVICE_TABLE(usb, ub_usb_ids);
-#endif /* CONFIG_USB_LIBUSUAL */
-
-/*
- * Find me a way to identify "next free minor" for add_disk(),
- * and the array disappears the next day. However, the number of
- * hosts has something to do with the naming and /proc/partitions.
- * This has to be thought out in detail before changing.
- * If UB_MAX_HOST was 1000, we'd use a bitmap. Or a better data structure.
- */
-#define UB_MAX_HOSTS  26
-static char ub_hostv[UB_MAX_HOSTS];
-
-#define UB_QLOCK_NUM 5
-static spinlock_t ub_qlockv[UB_QLOCK_NUM];
-static int ub_qlock_next = 0;
-
-static DEFINE_SPINLOCK(ub_lock);	/* Locks globals and ->openc */
-
-/*
- * The id allocator.
- *
- * This also stores the host for indexing by minor, which is somewhat dirty.
- */
-static int ub_id_get(void)
-{
-	unsigned long flags;
-	int i;
-
-	spin_lock_irqsave(&ub_lock, flags);
-	for (i = 0; i < UB_MAX_HOSTS; i++) {
-		if (ub_hostv[i] == 0) {
-			ub_hostv[i] = 1;
-			spin_unlock_irqrestore(&ub_lock, flags);
-			return i;
-		}
-	}
-	spin_unlock_irqrestore(&ub_lock, flags);
-	return -1;
-}
-
-static void ub_id_put(int id)
-{
-	unsigned long flags;
-
-	if (id < 0 || id >= UB_MAX_HOSTS) {
-		printk(KERN_ERR DRV_NAME ": bad host ID %d\n", id);
-		return;
-	}
-
-	spin_lock_irqsave(&ub_lock, flags);
-	if (ub_hostv[id] == 0) {
-		spin_unlock_irqrestore(&ub_lock, flags);
-		printk(KERN_ERR DRV_NAME ": freeing free host ID %d\n", id);
-		return;
-	}
-	ub_hostv[id] = 0;
-	spin_unlock_irqrestore(&ub_lock, flags);
-}
-
-/*
- * This is necessitated by the fact that blk_cleanup_queue does not
- * necesserily destroy the queue. Instead, it may merely decrease q->refcnt.
- * Since our blk_init_queue() passes a spinlock common with ub_dev,
- * we have life time issues when ub_cleanup frees ub_dev.
- */
-static spinlock_t *ub_next_lock(void)
-{
-	unsigned long flags;
-	spinlock_t *ret;
-
-	spin_lock_irqsave(&ub_lock, flags);
-	ret = &ub_qlockv[ub_qlock_next];
-	ub_qlock_next = (ub_qlock_next + 1) % UB_QLOCK_NUM;
-	spin_unlock_irqrestore(&ub_lock, flags);
-	return ret;
-}
-
-/*
- * Downcount for deallocation. This rides on two assumptions:
- *  - once something is poisoned, its refcount cannot grow
- *  - opens cannot happen at this time (del_gendisk was done)
- * If the above is true, we can drop the lock, which we need for
- * blk_cleanup_queue(): the silly thing may attempt to sleep.
- * [Actually, it never needs to sleep for us, but it calls might_sleep()]
- */
-static void ub_put(struct ub_dev *sc)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&ub_lock, flags);
-	--sc->openc;
-	if (sc->openc == 0 && atomic_read(&sc->poison)) {
-		spin_unlock_irqrestore(&ub_lock, flags);
-		ub_cleanup(sc);
-	} else {
-		spin_unlock_irqrestore(&ub_lock, flags);
-	}
-}
-
-/*
- * Final cleanup and deallocation.
- */
-static void ub_cleanup(struct ub_dev *sc)
-{
-	struct list_head *p;
-	struct ub_lun *lun;
-	struct request_queue *q;
-
-	while (!list_empty(&sc->luns)) {
-		p = sc->luns.next;
-		lun = list_entry(p, struct ub_lun, link);
-		list_del(p);
-
-		/* I don't think queue can be NULL. But... Stolen from sx8.c */
-		if ((q = lun->disk->queue) != NULL)
-			blk_cleanup_queue(q);
-		/*
-		 * If we zero disk->private_data BEFORE put_disk, we have
-		 * to check for NULL all over the place in open, release,
-		 * check_media and revalidate, because the block level
-		 * semaphore is well inside the put_disk.
-		 * But we cannot zero after the call, because *disk is gone.
-		 * The sd.c is blatantly racy in this area.
-		 */
-		/* disk->private_data = NULL; */
-		put_disk(lun->disk);
-		lun->disk = NULL;
-
-		ub_id_put(lun->id);
-		kfree(lun);
-	}
-
-	usb_set_intfdata(sc->intf, NULL);
-	usb_put_intf(sc->intf);
-	usb_put_dev(sc->dev);
-	kfree(sc);
-}
-
-/*
- * The "command allocator".
- */
-static struct ub_scsi_cmd *ub_get_cmd(struct ub_lun *lun)
-{
-	struct ub_scsi_cmd *ret;
-
-	if (lun->cmda[0])
-		return NULL;
-	ret = &lun->cmdv[0];
-	lun->cmda[0] = 1;
-	return ret;
-}
-
-static void ub_put_cmd(struct ub_lun *lun, struct ub_scsi_cmd *cmd)
-{
-	if (cmd != &lun->cmdv[0]) {
-		printk(KERN_WARNING "%s: releasing a foreign cmd %p\n",
-		    lun->name, cmd);
-		return;
-	}
-	if (!lun->cmda[0]) {
-		printk(KERN_WARNING "%s: releasing a free cmd\n", lun->name);
-		return;
-	}
-	lun->cmda[0] = 0;
-}
-
-/*
- * The command queue.
- */
-static void ub_cmdq_add(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
-{
-	struct ub_scsi_cmd_queue *t = &sc->cmd_queue;
-
-	if (t->qlen++ == 0) {
-		t->head = cmd;
-		t->tail = cmd;
-	} else {
-		t->tail->next = cmd;
-		t->tail = cmd;
-	}
-
-	if (t->qlen > t->qmax)
-		t->qmax = t->qlen;
-}
-
-static void ub_cmdq_insert(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
-{
-	struct ub_scsi_cmd_queue *t = &sc->cmd_queue;
-
-	if (t->qlen++ == 0) {
-		t->head = cmd;
-		t->tail = cmd;
-	} else {
-		cmd->next = t->head;
-		t->head = cmd;
-	}
-
-	if (t->qlen > t->qmax)
-		t->qmax = t->qlen;
-}
-
-static struct ub_scsi_cmd *ub_cmdq_pop(struct ub_dev *sc)
-{
-	struct ub_scsi_cmd_queue *t = &sc->cmd_queue;
-	struct ub_scsi_cmd *cmd;
-
-	if (t->qlen == 0)
-		return NULL;
-	if (--t->qlen == 0)
-		t->tail = NULL;
-	cmd = t->head;
-	t->head = cmd->next;
-	cmd->next = NULL;
-	return cmd;
-}
-
-#define ub_cmdq_peek(sc)  ((sc)->cmd_queue.head)
-
-/*
- * The request function is our main entry point
- */
-
-static void ub_request_fn(struct request_queue *q)
-{
-	struct ub_lun *lun = q->queuedata;
-	struct request *rq;
-
-	while ((rq = blk_peek_request(q)) != NULL) {
-		if (ub_request_fn_1(lun, rq) != 0) {
-			blk_stop_queue(q);
-			break;
-		}
-	}
-}
-
-static int ub_request_fn_1(struct ub_lun *lun, struct request *rq)
-{
-	struct ub_dev *sc = lun->udev;
-	struct ub_scsi_cmd *cmd;
-	struct ub_request *urq;
-	int n_elem;
-
-	if (atomic_read(&sc->poison)) {
-		blk_start_request(rq);
-		ub_end_rq(rq, DID_NO_CONNECT << 16);
-		return 0;
-	}
-
-	if (lun->changed && !blk_pc_request(rq)) {
-		blk_start_request(rq);
-		ub_end_rq(rq, SAM_STAT_CHECK_CONDITION);
-		return 0;
-	}
-
-	if (lun->urq.rq != NULL)
-		return -1;
-	if ((cmd = ub_get_cmd(lun)) == NULL)
-		return -1;
-	memset(cmd, 0, sizeof(struct ub_scsi_cmd));
-
-	blk_start_request(rq);
-
-	urq = &lun->urq;
-	memset(urq, 0, sizeof(struct ub_request));
-	urq->rq = rq;
-
-	/*
-	 * get scatterlist from block layer
-	 */
-	sg_init_table(&urq->sgv[0], UB_MAX_REQ_SG);
-	n_elem = blk_rq_map_sg(lun->disk->queue, rq, &urq->sgv[0]);
-	if (n_elem < 0) {
-		/* Impossible, because blk_rq_map_sg should not hit ENOMEM. */
-		printk(KERN_INFO "%s: failed request map (%d)\n",
-		    lun->name, n_elem);
-		goto drop;
-	}
-	if (n_elem > UB_MAX_REQ_SG) {	/* Paranoia */
-		printk(KERN_WARNING "%s: request with %d segments\n",
-		    lun->name, n_elem);
-		goto drop;
-	}
-	urq->nsg = n_elem;
-
-	if (blk_pc_request(rq)) {
-		ub_cmd_build_packet(sc, lun, cmd, urq);
-	} else {
-		ub_cmd_build_block(sc, lun, cmd, urq);
-	}
-	cmd->state = UB_CMDST_INIT;
-	cmd->lun = lun;
-	cmd->done = ub_rw_cmd_done;
-	cmd->back = urq;
-
-	cmd->tag = sc->tagcnt++;
-	if (ub_submit_scsi(sc, cmd) != 0)
-		goto drop;
-
-	return 0;
-
-drop:
-	ub_put_cmd(lun, cmd);
-	ub_end_rq(rq, DID_ERROR << 16);
-	return 0;
-}
-
-static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun,
-    struct ub_scsi_cmd *cmd, struct ub_request *urq)
-{
-	struct request *rq = urq->rq;
-	unsigned int block, nblks;
-
-	if (rq_data_dir(rq) == WRITE)
-		cmd->dir = UB_DIR_WRITE;
-	else
-		cmd->dir = UB_DIR_READ;
-
-	cmd->nsg = urq->nsg;
-	memcpy(cmd->sgv, urq->sgv, sizeof(struct scatterlist) * cmd->nsg);
-
-	/*
-	 * build the command
-	 *
-	 * The call to blk_queue_logical_block_size() guarantees that request
-	 * is aligned, but it is given in terms of 512 byte units, always.
-	 */
-	block = blk_rq_pos(rq) >> lun->capacity.bshift;
-	nblks = blk_rq_sectors(rq) >> lun->capacity.bshift;
-
-	cmd->cdb[0] = (cmd->dir == UB_DIR_READ)? READ_10: WRITE_10;
-	/* 10-byte uses 4 bytes of LBA: 2147483648KB, 2097152MB, 2048GB */
-	cmd->cdb[2] = block >> 24;
-	cmd->cdb[3] = block >> 16;
-	cmd->cdb[4] = block >> 8;
-	cmd->cdb[5] = block;
-	cmd->cdb[7] = nblks >> 8;
-	cmd->cdb[8] = nblks;
-	cmd->cdb_len = 10;
-
-	cmd->len = blk_rq_bytes(rq);
-}
-
-static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun,
-    struct ub_scsi_cmd *cmd, struct ub_request *urq)
-{
-	struct request *rq = urq->rq;
-
-	if (blk_rq_bytes(rq) == 0) {
-		cmd->dir = UB_DIR_NONE;
-	} else {
-		if (rq_data_dir(rq) == WRITE)
-			cmd->dir = UB_DIR_WRITE;
-		else
-			cmd->dir = UB_DIR_READ;
-	}
-
-	cmd->nsg = urq->nsg;
-	memcpy(cmd->sgv, urq->sgv, sizeof(struct scatterlist) * cmd->nsg);
-
-	memcpy(&cmd->cdb, rq->cmd, rq->cmd_len);
-	cmd->cdb_len = rq->cmd_len;
-
-	cmd->len = blk_rq_bytes(rq);
-
-	/*
-	 * To reapply this to every URB is not as incorrect as it looks.
-	 * In return, we avoid any complicated tracking calculations.
-	 */
-	cmd->timeo = rq->timeout;
-}
-
-static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
-{
-	struct ub_lun *lun = cmd->lun;
-	struct ub_request *urq = cmd->back;
-	struct request *rq;
-	unsigned int scsi_status;
-
-	rq = urq->rq;
-
-	if (cmd->error == 0) {
-		if (blk_pc_request(rq)) {
-			if (cmd->act_len >= rq->resid_len)
-				rq->resid_len = 0;
-			else
-				rq->resid_len -= cmd->act_len;
-			scsi_status = 0;
-		} else {
-			if (cmd->act_len != cmd->len) {
-				scsi_status = SAM_STAT_CHECK_CONDITION;
-			} else {
-				scsi_status = 0;
-			}
-		}
-	} else {
-		if (blk_pc_request(rq)) {
-			/* UB_SENSE_SIZE is smaller than SCSI_SENSE_BUFFERSIZE */
-			memcpy(rq->sense, sc->top_sense, UB_SENSE_SIZE);
-			rq->sense_len = UB_SENSE_SIZE;
-			if (sc->top_sense[0] != 0)
-				scsi_status = SAM_STAT_CHECK_CONDITION;
-			else
-				scsi_status = DID_ERROR << 16;
-		} else {
-			if (cmd->error == -EIO &&
-			    (cmd->key == 0 ||
-			     cmd->key == MEDIUM_ERROR ||
-			     cmd->key == UNIT_ATTENTION)) {
-				if (ub_rw_cmd_retry(sc, lun, urq, cmd) == 0)
-					return;
-			}
-			scsi_status = SAM_STAT_CHECK_CONDITION;
-		}
-	}
-
-	urq->rq = NULL;
-
-	ub_put_cmd(lun, cmd);
-	ub_end_rq(rq, scsi_status);
-	blk_start_queue(lun->disk->queue);
-}
-
-static void ub_end_rq(struct request *rq, unsigned int scsi_status)
-{
-	int error;
-
-	if (scsi_status == 0) {
-		error = 0;
-	} else {
-		error = -EIO;
-		rq->errors = scsi_status;
-	}
-	__blk_end_request_all(rq, error);
-}
-
-static int ub_rw_cmd_retry(struct ub_dev *sc, struct ub_lun *lun,
-    struct ub_request *urq, struct ub_scsi_cmd *cmd)
-{
-
-	if (atomic_read(&sc->poison))
-		return -ENXIO;
-
-	ub_reset_enter(sc, urq->current_try);
-
-	if (urq->current_try >= 3)
-		return -EIO;
-	urq->current_try++;
-
-	/* Remove this if anyone complains of flooding. */
-	printk(KERN_DEBUG "%s: dir %c len/act %d/%d "
-	    "[sense %x %02x %02x] retry %d\n",
-	    sc->name, UB_DIR_CHAR(cmd->dir), cmd->len, cmd->act_len,
-	    cmd->key, cmd->asc, cmd->ascq, urq->current_try);
-
-	memset(cmd, 0, sizeof(struct ub_scsi_cmd));
-	ub_cmd_build_block(sc, lun, cmd, urq);
-
-	cmd->state = UB_CMDST_INIT;
-	cmd->lun = lun;
-	cmd->done = ub_rw_cmd_done;
-	cmd->back = urq;
-
-	cmd->tag = sc->tagcnt++;
-
-#if 0 /* Wasteful */
-	return ub_submit_scsi(sc, cmd);
-#else
-	ub_cmdq_add(sc, cmd);
-	return 0;
-#endif
-}
-
-/*
- * Submit a regular SCSI operation (not an auto-sense).
- *
- * The Iron Law of Good Submit Routine is:
- * Zero return - callback is done, Nonzero return - callback is not done.
- * No exceptions.
- *
- * Host is assumed locked.
- */
-static int ub_submit_scsi(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
-{
-
-	if (cmd->state != UB_CMDST_INIT ||
-	    (cmd->dir != UB_DIR_NONE && cmd->len == 0)) {
-		return -EINVAL;
-	}
-
-	ub_cmdq_add(sc, cmd);
-	/*
-	 * We can call ub_scsi_dispatch(sc) right away here, but it's a little
-	 * safer to jump to a tasklet, in case upper layers do something silly.
-	 */
-	tasklet_schedule(&sc->tasklet);
-	return 0;
-}
-
-/*
- * Submit the first URB for the queued command.
- * This function does not deal with queueing in any way.
- */
-static int ub_scsi_cmd_start(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
-{
-	struct bulk_cb_wrap *bcb;
-	int rc;
-
-	bcb = &sc->work_bcb;
-
-	/*
-	 * ``If the allocation length is eighteen or greater, and a device
-	 * server returns less than eithteen bytes of data, the application
-	 * client should assume that the bytes not transferred would have been
-	 * zeroes had the device server returned those bytes.''
-	 *
-	 * We zero sense for all commands so that when a packet request
-	 * fails it does not return a stale sense.
-	 */
-	memset(&sc->top_sense, 0, UB_SENSE_SIZE);
-
-	/* set up the command wrapper */
-	bcb->Signature = cpu_to_le32(US_BULK_CB_SIGN);
-	bcb->Tag = cmd->tag;		/* Endianness is not important */
-	bcb->DataTransferLength = cpu_to_le32(cmd->len);
-	bcb->Flags = (cmd->dir == UB_DIR_READ) ? 0x80 : 0;
-	bcb->Lun = (cmd->lun != NULL) ? cmd->lun->num : 0;
-	bcb->Length = cmd->cdb_len;
-
-	/* copy the command payload */
-	memcpy(bcb->CDB, cmd->cdb, UB_MAX_CDB_SIZE);
-
-	UB_INIT_COMPLETION(sc->work_done);
-
-	sc->last_pipe = sc->send_bulk_pipe;
-	usb_fill_bulk_urb(&sc->work_urb, sc->dev, sc->send_bulk_pipe,
-	    bcb, US_BULK_CB_WRAP_LEN, ub_urb_complete, sc);
-
-	if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) {
-		/* XXX Clear stalls */
-		ub_complete(&sc->work_done);
-		return rc;
-	}
-
-	sc->work_timer.expires = jiffies + UB_URB_TIMEOUT;
-	add_timer(&sc->work_timer);
-
-	cmd->state = UB_CMDST_CMD;
-	return 0;
-}
-
-/*
- * Timeout handler.
- */
-static void ub_urb_timeout(unsigned long arg)
-{
-	struct ub_dev *sc = (struct ub_dev *) arg;
-	unsigned long flags;
-
-	spin_lock_irqsave(sc->lock, flags);
-	if (!ub_is_completed(&sc->work_done))
-		usb_unlink_urb(&sc->work_urb);
-	spin_unlock_irqrestore(sc->lock, flags);
-}
-
-/*
- * Completion routine for the work URB.
- *
- * This can be called directly from usb_submit_urb (while we have
- * the sc->lock taken) and from an interrupt (while we do NOT have
- * the sc->lock taken). Therefore, bounce this off to a tasklet.
- */
-static void ub_urb_complete(struct urb *urb)
-{
-	struct ub_dev *sc = urb->context;
-
-	ub_complete(&sc->work_done);
-	tasklet_schedule(&sc->tasklet);
-}
-
-static void ub_scsi_action(unsigned long _dev)
-{
-	struct ub_dev *sc = (struct ub_dev *) _dev;
-	unsigned long flags;
-
-	spin_lock_irqsave(sc->lock, flags);
-	ub_scsi_dispatch(sc);
-	spin_unlock_irqrestore(sc->lock, flags);
-}
-
-static void ub_scsi_dispatch(struct ub_dev *sc)
-{
-	struct ub_scsi_cmd *cmd;
-	int rc;
-
-	while (!sc->reset && (cmd = ub_cmdq_peek(sc)) != NULL) {
-		if (cmd->state == UB_CMDST_DONE) {
-			ub_cmdq_pop(sc);
-			(*cmd->done)(sc, cmd);
-		} else if (cmd->state == UB_CMDST_INIT) {
-			if ((rc = ub_scsi_cmd_start(sc, cmd)) == 0)
-				break;
-			cmd->error = rc;
-			cmd->state = UB_CMDST_DONE;
-		} else {
-			if (!ub_is_completed(&sc->work_done))
-				break;
-			del_timer(&sc->work_timer);
-			ub_scsi_urb_compl(sc, cmd);
-		}
-	}
-}
-
-static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
-{
-	struct urb *urb = &sc->work_urb;
-	struct bulk_cs_wrap *bcs;
-	int endp;
-	int len;
-	int rc;
-
-	if (atomic_read(&sc->poison)) {
-		ub_state_done(sc, cmd, -ENODEV);
-		return;
-	}
-
-	endp = usb_pipeendpoint(sc->last_pipe);
-	if (usb_pipein(sc->last_pipe))
-		endp |= USB_DIR_IN;
-
-	if (cmd->state == UB_CMDST_CLEAR) {
-		if (urb->status == -EPIPE) {
-			/*
-			 * STALL while clearning STALL.
-			 * The control pipe clears itself - nothing to do.
-			 */
-			printk(KERN_NOTICE "%s: stall on control pipe\n",
-			    sc->name);
-			goto Bad_End;
-		}
-
-		/*
-		 * We ignore the result for the halt clear.
-		 */
-
-		usb_reset_endpoint(sc->dev, endp);
-
-		ub_state_sense(sc, cmd);
-
-	} else if (cmd->state == UB_CMDST_CLR2STS) {
-		if (urb->status == -EPIPE) {
-			printk(KERN_NOTICE "%s: stall on control pipe\n",
-			    sc->name);
-			goto Bad_End;
-		}
-
-		/*
-		 * We ignore the result for the halt clear.
-		 */
-
-		usb_reset_endpoint(sc->dev, endp);
-
-		ub_state_stat(sc, cmd);
-
-	} else if (cmd->state == UB_CMDST_CLRRS) {
-		if (urb->status == -EPIPE) {
-			printk(KERN_NOTICE "%s: stall on control pipe\n",
-			    sc->name);
-			goto Bad_End;
-		}
-
-		/*
-		 * We ignore the result for the halt clear.
-		 */
-
-		usb_reset_endpoint(sc->dev, endp);
-
-		ub_state_stat_counted(sc, cmd);
-
-	} else if (cmd->state == UB_CMDST_CMD) {
-		switch (urb->status) {
-		case 0:
-			break;
-		case -EOVERFLOW:
-			goto Bad_End;
-		case -EPIPE:
-			rc = ub_submit_clear_stall(sc, cmd, sc->last_pipe);
-			if (rc != 0) {
-				printk(KERN_NOTICE "%s: "
-				    "unable to submit clear (%d)\n",
-				    sc->name, rc);
-				/*
-				 * This is typically ENOMEM or some other such shit.
-				 * Retrying is pointless. Just do Bad End on it...
-				 */
-				ub_state_done(sc, cmd, rc);
-				return;
-			}
-			cmd->state = UB_CMDST_CLEAR;
-			return;
-		case -ESHUTDOWN:	/* unplug */
-		case -EILSEQ:		/* unplug timeout on uhci */
-			ub_state_done(sc, cmd, -ENODEV);
-			return;
-		default:
-			goto Bad_End;
-		}
-		if (urb->actual_length != US_BULK_CB_WRAP_LEN) {
-			goto Bad_End;
-		}
-
-		if (cmd->dir == UB_DIR_NONE || cmd->nsg < 1) {
-			ub_state_stat(sc, cmd);
-			return;
-		}
-
-		// udelay(125);		// usb-storage has this
-		ub_data_start(sc, cmd);
-
-	} else if (cmd->state == UB_CMDST_DATA) {
-		if (urb->status == -EPIPE) {
-			rc = ub_submit_clear_stall(sc, cmd, sc->last_pipe);
-			if (rc != 0) {
-				printk(KERN_NOTICE "%s: "
-				    "unable to submit clear (%d)\n",
-				    sc->name, rc);
-				ub_state_done(sc, cmd, rc);
-				return;
-			}
-			cmd->state = UB_CMDST_CLR2STS;
-			return;
-		}
-		if (urb->status == -EOVERFLOW) {
-			/*
-			 * A babble? Failure, but we must transfer CSW now.
-			 */
-			cmd->error = -EOVERFLOW;	/* A cheap trick... */
-			ub_state_stat(sc, cmd);
-			return;
-		}
-
-		if (cmd->dir == UB_DIR_WRITE) {
-			/*
-			 * Do not continue writes in case of a failure.
-			 * Doing so would cause sectors to be mixed up,
-			 * which is worse than sectors lost.
-			 *
-			 * We must try to read the CSW, or many devices
-			 * get confused.
-			 */
-			len = urb->actual_length;
-			if (urb->status != 0 ||
-			    len != cmd->sgv[cmd->current_sg].length) {
-				cmd->act_len += len;
-
-				cmd->error = -EIO;
-				ub_state_stat(sc, cmd);
-				return;
-			}
-
-		} else {
-			/*
-			 * If an error occurs on read, we record it, and
-			 * continue to fetch data in order to avoid bubble.
-			 *
-			 * As a small shortcut, we stop if we detect that
-			 * a CSW mixed into data.
-			 */
-			if (urb->status != 0)
-				cmd->error = -EIO;
-
-			len = urb->actual_length;
-			if (urb->status != 0 ||
-			    len != cmd->sgv[cmd->current_sg].length) {
-				if ((len & 0x1FF) == US_BULK_CS_WRAP_LEN)
-					goto Bad_End;
-			}
-		}
-
-		cmd->act_len += urb->actual_length;
-
-		if (++cmd->current_sg < cmd->nsg) {
-			ub_data_start(sc, cmd);
-			return;
-		}
-		ub_state_stat(sc, cmd);
-
-	} else if (cmd->state == UB_CMDST_STAT) {
-		if (urb->status == -EPIPE) {
-			rc = ub_submit_clear_stall(sc, cmd, sc->last_pipe);
-			if (rc != 0) {
-				printk(KERN_NOTICE "%s: "
-				    "unable to submit clear (%d)\n",
-				    sc->name, rc);
-				ub_state_done(sc, cmd, rc);
-				return;
-			}
-
-			/*
-			 * Having a stall when getting CSW is an error, so
-			 * make sure uppper levels are not oblivious to it.
-			 */
-			cmd->error = -EIO;		/* A cheap trick... */
-
-			cmd->state = UB_CMDST_CLRRS;
-			return;
-		}
-
-		/* Catch everything, including -EOVERFLOW and other nasties. */
-		if (urb->status != 0)
-			goto Bad_End;
-
-		if (urb->actual_length == 0) {
-			ub_state_stat_counted(sc, cmd);
-			return;
-		}
-
-		/*
-		 * Check the returned Bulk protocol status.
-		 * The status block has to be validated first.
-		 */
-
-		bcs = &sc->work_bcs;
-
-		if (sc->signature == cpu_to_le32(0)) {
-			/*
-			 * This is the first reply, so do not perform the check.
-			 * Instead, remember the signature the device uses
-			 * for future checks. But do not allow a nul.
-			 */
-			sc->signature = bcs->Signature;
-			if (sc->signature == cpu_to_le32(0)) {
-				ub_state_stat_counted(sc, cmd);
-				return;
-			}
-		} else {
-			if (bcs->Signature != sc->signature) {
-				ub_state_stat_counted(sc, cmd);
-				return;
-			}
-		}
-
-		if (bcs->Tag != cmd->tag) {
-			/*
-			 * This usually happens when we disagree with the
-			 * device's microcode about something. For instance,
-			 * a few of them throw this after timeouts. They buffer
-			 * commands and reply at commands we timed out before.
-			 * Without flushing these replies we loop forever.
-			 */
-			ub_state_stat_counted(sc, cmd);
-			return;
-		}
-
-		if (!sc->bad_resid) {
-			len = le32_to_cpu(bcs->Residue);
-			if (len != cmd->len - cmd->act_len) {
-				/*
-				 * Only start ignoring if this cmd ended well.
-				 */
-				if (cmd->len == cmd->act_len) {
-					printk(KERN_NOTICE "%s: "
-					    "bad residual %d of %d, ignoring\n",
-					    sc->name, len, cmd->len);
-					sc->bad_resid = 1;
-				}
-			}
-		}
-
-		switch (bcs->Status) {
-		case US_BULK_STAT_OK:
-			break;
-		case US_BULK_STAT_FAIL:
-			ub_state_sense(sc, cmd);
-			return;
-		case US_BULK_STAT_PHASE:
-			goto Bad_End;
-		default:
-			printk(KERN_INFO "%s: unknown CSW status 0x%x\n",
-			    sc->name, bcs->Status);
-			ub_state_done(sc, cmd, -EINVAL);
-			return;
-		}
-
-		/* Not zeroing error to preserve a babble indicator */
-		if (cmd->error != 0) {
-			ub_state_sense(sc, cmd);
-			return;
-		}
-		cmd->state = UB_CMDST_DONE;
-		ub_cmdq_pop(sc);
-		(*cmd->done)(sc, cmd);
-
-	} else if (cmd->state == UB_CMDST_SENSE) {
-		ub_state_done(sc, cmd, -EIO);
-
-	} else {
-		printk(KERN_WARNING "%s: wrong command state %d\n",
-		    sc->name, cmd->state);
-		ub_state_done(sc, cmd, -EINVAL);
-		return;
-	}
-	return;
-
-Bad_End: /* Little Excel is dead */
-	ub_state_done(sc, cmd, -EIO);
-}
-
-/*
- * Factorization helper for the command state machine:
- * Initiate a data segment transfer.
- */
-static void ub_data_start(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
-{
-	struct scatterlist *sg = &cmd->sgv[cmd->current_sg];
-	int pipe;
-	int rc;
-
-	UB_INIT_COMPLETION(sc->work_done);
-
-	if (cmd->dir == UB_DIR_READ)
-		pipe = sc->recv_bulk_pipe;
-	else
-		pipe = sc->send_bulk_pipe;
-	sc->last_pipe = pipe;
-	usb_fill_bulk_urb(&sc->work_urb, sc->dev, pipe, sg_virt(sg),
-	    sg->length, ub_urb_complete, sc);
-
-	if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) {
-		/* XXX Clear stalls */
-		ub_complete(&sc->work_done);
-		ub_state_done(sc, cmd, rc);
-		return;
-	}
-
-	if (cmd->timeo)
-		sc->work_timer.expires = jiffies + cmd->timeo;
-	else
-		sc->work_timer.expires = jiffies + UB_DATA_TIMEOUT;
-	add_timer(&sc->work_timer);
-
-	cmd->state = UB_CMDST_DATA;
-}
-
-/*
- * Factorization helper for the command state machine:
- * Finish the command.
- */
-static void ub_state_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd, int rc)
-{
-
-	cmd->error = rc;
-	cmd->state = UB_CMDST_DONE;
-	ub_cmdq_pop(sc);
-	(*cmd->done)(sc, cmd);
-}
-
-/*
- * Factorization helper for the command state machine:
- * Submit a CSW read.
- */
-static int __ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
-{
-	int rc;
-
-	UB_INIT_COMPLETION(sc->work_done);
-
-	sc->last_pipe = sc->recv_bulk_pipe;
-	usb_fill_bulk_urb(&sc->work_urb, sc->dev, sc->recv_bulk_pipe,
-	    &sc->work_bcs, US_BULK_CS_WRAP_LEN, ub_urb_complete, sc);
-
-	if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) {
-		/* XXX Clear stalls */
-		ub_complete(&sc->work_done);
-		ub_state_done(sc, cmd, rc);
-		return -1;
-	}
-
-	if (cmd->timeo)
-		sc->work_timer.expires = jiffies + cmd->timeo;
-	else
-		sc->work_timer.expires = jiffies + UB_STAT_TIMEOUT;
-	add_timer(&sc->work_timer);
-	return 0;
-}
-
-/*
- * Factorization helper for the command state machine:
- * Submit a CSW read and go to STAT state.
- */
-static void ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
-{
-
-	if (__ub_state_stat(sc, cmd) != 0)
-		return;
-
-	cmd->stat_count = 0;
-	cmd->state = UB_CMDST_STAT;
-}
-
-/*
- * Factorization helper for the command state machine:
- * Submit a CSW read and go to STAT state with counter (along [C] path).
- */
-static void ub_state_stat_counted(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
-{
-
-	if (++cmd->stat_count >= 4) {
-		ub_state_sense(sc, cmd);
-		return;
-	}
-
-	if (__ub_state_stat(sc, cmd) != 0)
-		return;
-
-	cmd->state = UB_CMDST_STAT;
-}
-
-/*
- * Factorization helper for the command state machine:
- * Submit a REQUEST SENSE and go to SENSE state.
- */
-static void ub_state_sense(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
-{
-	struct ub_scsi_cmd *scmd;
-	struct scatterlist *sg;
-	int rc;
-
-	if (cmd->cdb[0] == REQUEST_SENSE) {
-		rc = -EPIPE;
-		goto error;
-	}
-
-	scmd = &sc->top_rqs_cmd;
-	memset(scmd, 0, sizeof(struct ub_scsi_cmd));
-	scmd->cdb[0] = REQUEST_SENSE;
-	scmd->cdb[4] = UB_SENSE_SIZE;
-	scmd->cdb_len = 6;
-	scmd->dir = UB_DIR_READ;
-	scmd->state = UB_CMDST_INIT;
-	scmd->nsg = 1;
-	sg = &scmd->sgv[0];
-	sg_init_table(sg, UB_MAX_REQ_SG);
-	sg_set_page(sg, virt_to_page(sc->top_sense), UB_SENSE_SIZE,
-			(unsigned long)sc->top_sense & (PAGE_SIZE-1));
-	scmd->len = UB_SENSE_SIZE;
-	scmd->lun = cmd->lun;
-	scmd->done = ub_top_sense_done;
-	scmd->back = cmd;
-
-	scmd->tag = sc->tagcnt++;
-
-	cmd->state = UB_CMDST_SENSE;
-
-	ub_cmdq_insert(sc, scmd);
-	return;
-
-error:
-	ub_state_done(sc, cmd, rc);
-}
-
-/*
- * A helper for the command's state machine:
- * Submit a stall clear.
- */
-static int ub_submit_clear_stall(struct ub_dev *sc, struct ub_scsi_cmd *cmd,
-    int stalled_pipe)
-{
-	int endp;
-	struct usb_ctrlrequest *cr;
-	int rc;
-
-	endp = usb_pipeendpoint(stalled_pipe);
-	if (usb_pipein (stalled_pipe))
-		endp |= USB_DIR_IN;
-
-	cr = &sc->work_cr;
-	cr->bRequestType = USB_RECIP_ENDPOINT;
-	cr->bRequest = USB_REQ_CLEAR_FEATURE;
-	cr->wValue = cpu_to_le16(USB_ENDPOINT_HALT);
-	cr->wIndex = cpu_to_le16(endp);
-	cr->wLength = cpu_to_le16(0);
-
-	UB_INIT_COMPLETION(sc->work_done);
-
-	usb_fill_control_urb(&sc->work_urb, sc->dev, sc->send_ctrl_pipe,
-	    (unsigned char*) cr, NULL, 0, ub_urb_complete, sc);
-
-	if ((rc = usb_submit_urb(&sc->work_urb, GFP_ATOMIC)) != 0) {
-		ub_complete(&sc->work_done);
-		return rc;
-	}
-
-	sc->work_timer.expires = jiffies + UB_CTRL_TIMEOUT;
-	add_timer(&sc->work_timer);
-	return 0;
-}
-
-/*
- */
-static void ub_top_sense_done(struct ub_dev *sc, struct ub_scsi_cmd *scmd)
-{
-	unsigned char *sense = sc->top_sense;
-	struct ub_scsi_cmd *cmd;
-
-	/*
-	 * Find the command which triggered the unit attention or a check,
-	 * save the sense into it, and advance its state machine.
-	 */
-	if ((cmd = ub_cmdq_peek(sc)) == NULL) {
-		printk(KERN_WARNING "%s: sense done while idle\n", sc->name);
-		return;
-	}
-	if (cmd != scmd->back) {
-		printk(KERN_WARNING "%s: "
-		    "sense done for wrong command 0x%x\n",
-		    sc->name, cmd->tag);
-		return;
-	}
-	if (cmd->state != UB_CMDST_SENSE) {
-		printk(KERN_WARNING "%s: sense done with bad cmd state %d\n",
-		    sc->name, cmd->state);
-		return;
-	}
-
-	/*
-	 * Ignoring scmd->act_len, because the buffer was pre-zeroed.
-	 */
-	cmd->key = sense[2] & 0x0F;
-	cmd->asc = sense[12];
-	cmd->ascq = sense[13];
-
-	ub_scsi_urb_compl(sc, cmd);
-}
-
-/*
- * Reset management
- */
-
-static void ub_reset_enter(struct ub_dev *sc, int try)
-{
-
-	if (sc->reset) {
-		/* This happens often on multi-LUN devices. */
-		return;
-	}
-	sc->reset = try + 1;
-
-#if 0 /* Not needed because the disconnect waits for us. */
-	unsigned long flags;
-	spin_lock_irqsave(&ub_lock, flags);
-	sc->openc++;
-	spin_unlock_irqrestore(&ub_lock, flags);
-#endif
-
-#if 0 /* We let them stop themselves. */
-	struct ub_lun *lun;
-	list_for_each_entry(lun, &sc->luns, link) {
-		blk_stop_queue(lun->disk->queue);
-	}
-#endif
-
-	schedule_work(&sc->reset_work);
-}
-
-static void ub_reset_task(struct work_struct *work)
-{
-	struct ub_dev *sc = container_of(work, struct ub_dev, reset_work);
-	unsigned long flags;
-	struct ub_lun *lun;
-	int rc;
-
-	if (!sc->reset) {
-		printk(KERN_WARNING "%s: Running reset unrequested\n",
-		    sc->name);
-		return;
-	}
-
-	if (atomic_read(&sc->poison)) {
-		;
-	} else if ((sc->reset & 1) == 0) {
-		ub_sync_reset(sc);
-		msleep(700);	/* usb-storage sleeps 6s (!) */
-		ub_probe_clear_stall(sc, sc->recv_bulk_pipe);
-		ub_probe_clear_stall(sc, sc->send_bulk_pipe);
-	} else if (sc->dev->actconfig->desc.bNumInterfaces != 1) {
-		;
-	} else {
-		rc = usb_lock_device_for_reset(sc->dev, sc->intf);
-		if (rc < 0) {
-			printk(KERN_NOTICE
-			    "%s: usb_lock_device_for_reset failed (%d)\n",
-			    sc->name, rc);
-		} else {
-			rc = usb_reset_device(sc->dev);
-			if (rc < 0) {
-				printk(KERN_NOTICE "%s: "
-				    "usb_lock_device_for_reset failed (%d)\n",
-				    sc->name, rc);
-			}
-			usb_unlock_device(sc->dev);
-		}
-	}
-
-	/*
-	 * In theory, no commands can be running while reset is active,
-	 * so nobody can ask for another reset, and so we do not need any
-	 * queues of resets or anything. We do need a spinlock though,
-	 * to interact with block layer.
-	 */
-	spin_lock_irqsave(sc->lock, flags);
-	sc->reset = 0;
-	tasklet_schedule(&sc->tasklet);
-	list_for_each_entry(lun, &sc->luns, link) {
-		blk_start_queue(lun->disk->queue);
-	}
-	wake_up(&sc->reset_wait);
-	spin_unlock_irqrestore(sc->lock, flags);
-}
-
-/*
- * XXX Reset brackets are too much hassle to implement, so just stub them
- * in order to prevent forced unbinding (which deadlocks solid when our
- * ->disconnect method waits for the reset to complete and this kills keventd).
- *
- * XXX Tell Alan to move usb_unlock_device inside of usb_reset_device,
- * or else the post_reset is invoked, and restats I/O on a locked device.
- */
-static int ub_pre_reset(struct usb_interface *iface) {
-	return 0;
-}
-
-static int ub_post_reset(struct usb_interface *iface) {
-	return 0;
-}
-
-/*
- * This is called from a process context.
- */
-static void ub_revalidate(struct ub_dev *sc, struct ub_lun *lun)
-{
-
-	lun->readonly = 0;	/* XXX Query this from the device */
-
-	lun->capacity.nsec = 0;
-	lun->capacity.bsize = 512;
-	lun->capacity.bshift = 0;
-
-	if (ub_sync_tur(sc, lun) != 0)
-		return;			/* Not ready */
-	lun->changed = 0;
-
-	if (ub_sync_read_cap(sc, lun, &lun->capacity) != 0) {
-		/*
-		 * The retry here means something is wrong, either with the
-		 * device, with the transport, or with our code.
-		 * We keep this because sd.c has retries for capacity.
-		 */
-		if (ub_sync_read_cap(sc, lun, &lun->capacity) != 0) {
-			lun->capacity.nsec = 0;
-			lun->capacity.bsize = 512;
-			lun->capacity.bshift = 0;
-		}
-	}
-}
-
-/*
- * The open funcion.
- * This is mostly needed to keep refcounting, but also to support
- * media checks on removable media drives.
- */
-static int ub_bd_open(struct block_device *bdev, fmode_t mode)
-{
-	struct ub_lun *lun = bdev->bd_disk->private_data;
-	struct ub_dev *sc = lun->udev;
-	unsigned long flags;
-	int rc;
-
-	spin_lock_irqsave(&ub_lock, flags);
-	if (atomic_read(&sc->poison)) {
-		spin_unlock_irqrestore(&ub_lock, flags);
-		return -ENXIO;
-	}
-	sc->openc++;
-	spin_unlock_irqrestore(&ub_lock, flags);
-
-	if (lun->removable || lun->readonly)
-		check_disk_change(bdev);
-
-	/*
-	 * The sd.c considers ->media_present and ->changed not equivalent,
-	 * under some pretty murky conditions (a failure of READ CAPACITY).
-	 * We may need it one day.
-	 */
-	if (lun->removable && lun->changed && !(mode & FMODE_NDELAY)) {
-		rc = -ENOMEDIUM;
-		goto err_open;
-	}
-
-	if (lun->readonly && (mode & FMODE_WRITE)) {
-		rc = -EROFS;
-		goto err_open;
-	}
-
-	return 0;
-
-err_open:
-	ub_put(sc);
-	return rc;
-}
-
-/*
- */
-static int ub_bd_release(struct gendisk *disk, fmode_t mode)
-{
-	struct ub_lun *lun = disk->private_data;
-	struct ub_dev *sc = lun->udev;
-
-	ub_put(sc);
-	return 0;
-}
-
-/*
- * The ioctl interface.
- */
-static int ub_bd_ioctl(struct block_device *bdev, fmode_t mode,
-    unsigned int cmd, unsigned long arg)
-{
-	struct gendisk *disk = bdev->bd_disk;
-	void __user *usermem = (void __user *) arg;
-
-	return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, usermem);
-}
-
-/*
- * This is called by check_disk_change if we reported a media change.
- * The main onjective here is to discover the features of the media such as
- * the capacity, read-only status, etc. USB storage generally does not
- * need to be spun up, but if we needed it, this would be the place.
- *
- * This call can sleep.
- *
- * The return code is not used.
- */
-static int ub_bd_revalidate(struct gendisk *disk)
-{
-	struct ub_lun *lun = disk->private_data;
-
-	ub_revalidate(lun->udev, lun);
-
-	/* XXX Support sector size switching like in sr.c */
-	blk_queue_logical_block_size(disk->queue, lun->capacity.bsize);
-	set_capacity(disk, lun->capacity.nsec);
-	// set_disk_ro(sdkp->disk, lun->readonly);
-
-	return 0;
-}
-
-/*
- * The check is called by the block layer to verify if the media
- * is still available. It is supposed to be harmless, lightweight and
- * non-intrusive in case the media was not changed.
- *
- * This call can sleep.
- *
- * The return code is bool!
- */
-static int ub_bd_media_changed(struct gendisk *disk)
-{
-	struct ub_lun *lun = disk->private_data;
-
-	if (!lun->removable)
-		return 0;
-
-	/*
-	 * We clean checks always after every command, so this is not
-	 * as dangerous as it looks. If the TEST_UNIT_READY fails here,
-	 * the device is actually not ready with operator or software
-	 * intervention required. One dangerous item might be a drive which
-	 * spins itself down, and come the time to write dirty pages, this
-	 * will fail, then block layer discards the data. Since we never
-	 * spin drives up, such devices simply cannot be used with ub anyway.
-	 */
-	if (ub_sync_tur(lun->udev, lun) != 0) {
-		lun->changed = 1;
-		return 1;
-	}
-
-	return lun->changed;
-}
-
-static struct block_device_operations ub_bd_fops = {
-	.owner		= THIS_MODULE,
-	.open		= ub_bd_open,
-	.release	= ub_bd_release,
-	.locked_ioctl	= ub_bd_ioctl,
-	.media_changed	= ub_bd_media_changed,
-	.revalidate_disk = ub_bd_revalidate,
-};
-
-/*
- * Common ->done routine for commands executed synchronously.
- */
-static void ub_probe_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
-{
-	struct completion *cop = cmd->back;
-	complete(cop);
-}
-
-/*
- * Test if the device has a check condition on it, synchronously.
- */
-static int ub_sync_tur(struct ub_dev *sc, struct ub_lun *lun)
-{
-	struct ub_scsi_cmd *cmd;
-	enum { ALLOC_SIZE = sizeof(struct ub_scsi_cmd) };
-	unsigned long flags;
-	struct completion compl;
-	int rc;
-
-	init_completion(&compl);
-
-	rc = -ENOMEM;
-	if ((cmd = kzalloc(ALLOC_SIZE, GFP_KERNEL)) == NULL)
-		goto err_alloc;
-
-	cmd->cdb[0] = TEST_UNIT_READY;
-	cmd->cdb_len = 6;
-	cmd->dir = UB_DIR_NONE;
-	cmd->state = UB_CMDST_INIT;
-	cmd->lun = lun;			/* This may be NULL, but that's ok */
-	cmd->done = ub_probe_done;
-	cmd->back = &compl;
-
-	spin_lock_irqsave(sc->lock, flags);
-	cmd->tag = sc->tagcnt++;
-
-	rc = ub_submit_scsi(sc, cmd);
-	spin_unlock_irqrestore(sc->lock, flags);
-
-	if (rc != 0)
-		goto err_submit;
-
-	wait_for_completion(&compl);
-
-	rc = cmd->error;
-
-	if (rc == -EIO && cmd->key != 0)	/* Retries for benh's key */
-		rc = cmd->key;
-
-err_submit:
-	kfree(cmd);
-err_alloc:
-	return rc;
-}
-
-/*
- * Read the SCSI capacity synchronously (for probing).
- */
-static int ub_sync_read_cap(struct ub_dev *sc, struct ub_lun *lun,
-    struct ub_capacity *ret)
-{
-	struct ub_scsi_cmd *cmd;
-	struct scatterlist *sg;
-	char *p;
-	enum { ALLOC_SIZE = sizeof(struct ub_scsi_cmd) + 8 };
-	unsigned long flags;
-	unsigned int bsize, shift;
-	unsigned long nsec;
-	struct completion compl;
-	int rc;
-
-	init_completion(&compl);
-
-	rc = -ENOMEM;
-	if ((cmd = kzalloc(ALLOC_SIZE, GFP_KERNEL)) == NULL)
-		goto err_alloc;
-	p = (char *)cmd + sizeof(struct ub_scsi_cmd);
-
-	cmd->cdb[0] = 0x25;
-	cmd->cdb_len = 10;
-	cmd->dir = UB_DIR_READ;
-	cmd->state = UB_CMDST_INIT;
-	cmd->nsg = 1;
-	sg = &cmd->sgv[0];
-	sg_init_table(sg, UB_MAX_REQ_SG);
-	sg_set_page(sg, virt_to_page(p), 8, (unsigned long)p & (PAGE_SIZE-1));
-	cmd->len = 8;
-	cmd->lun = lun;
-	cmd->done = ub_probe_done;
-	cmd->back = &compl;
-
-	spin_lock_irqsave(sc->lock, flags);
-	cmd->tag = sc->tagcnt++;
-
-	rc = ub_submit_scsi(sc, cmd);
-	spin_unlock_irqrestore(sc->lock, flags);
-
-	if (rc != 0)
-		goto err_submit;
-
-	wait_for_completion(&compl);
-
-	if (cmd->error != 0) {
-		rc = -EIO;
-		goto err_read;
-	}
-	if (cmd->act_len != 8) {
-		rc = -EIO;
-		goto err_read;
-	}
-
-	/* sd.c special-cases sector size of 0 to mean 512. Needed? Safe? */
-	nsec = be32_to_cpu(*(__be32 *)p) + 1;
-	bsize = be32_to_cpu(*(__be32 *)(p + 4));
-	switch (bsize) {
-	case 512:	shift = 0;	break;
-	case 1024:	shift = 1;	break;
-	case 2048:	shift = 2;	break;
-	case 4096:	shift = 3;	break;
-	default:
-		rc = -EDOM;
-		goto err_inv_bsize;
-	}
-
-	ret->bsize = bsize;
-	ret->bshift = shift;
-	ret->nsec = nsec << shift;
-	rc = 0;
-
-err_inv_bsize:
-err_read:
-err_submit:
-	kfree(cmd);
-err_alloc:
-	return rc;
-}
-
-/*
- */
-static void ub_probe_urb_complete(struct urb *urb)
-{
-	struct completion *cop = urb->context;
-	complete(cop);
-}
-
-static void ub_probe_timeout(unsigned long arg)
-{
-	struct completion *cop = (struct completion *) arg;
-	complete(cop);
-}
-
-/*
- * Reset with a Bulk reset.
- */
-static int ub_sync_reset(struct ub_dev *sc)
-{
-	int ifnum = sc->intf->cur_altsetting->desc.bInterfaceNumber;
-	struct usb_ctrlrequest *cr;
-	struct completion compl;
-	struct timer_list timer;
-	int rc;
-
-	init_completion(&compl);
-
-	cr = &sc->work_cr;
-	cr->bRequestType = USB_TYPE_CLASS | USB_RECIP_INTERFACE;
-	cr->bRequest = US_BULK_RESET_REQUEST;
-	cr->wValue = cpu_to_le16(0);
-	cr->wIndex = cpu_to_le16(ifnum);
-	cr->wLength = cpu_to_le16(0);
-
-	usb_fill_control_urb(&sc->work_urb, sc->dev, sc->send_ctrl_pipe,
-	    (unsigned char*) cr, NULL, 0, ub_probe_urb_complete, &compl);
-
-	if ((rc = usb_submit_urb(&sc->work_urb, GFP_KERNEL)) != 0) {
-		printk(KERN_WARNING
-		     "%s: Unable to submit a bulk reset (%d)\n", sc->name, rc);
-		return rc;
-	}
-
-	init_timer(&timer);
-	timer.function = ub_probe_timeout;
-	timer.data = (unsigned long) &compl;
-	timer.expires = jiffies + UB_CTRL_TIMEOUT;
-	add_timer(&timer);
-
-	wait_for_completion(&compl);
-
-	del_timer_sync(&timer);
-	usb_kill_urb(&sc->work_urb);
-
-	return sc->work_urb.status;
-}
-
-/*
- * Get number of LUNs by the way of Bulk GetMaxLUN command.
- */
-static int ub_sync_getmaxlun(struct ub_dev *sc)
-{
-	int ifnum = sc->intf->cur_altsetting->desc.bInterfaceNumber;
-	unsigned char *p;
-	enum { ALLOC_SIZE = 1 };
-	struct usb_ctrlrequest *cr;
-	struct completion compl;
-	struct timer_list timer;
-	int nluns;
-	int rc;
-
-	init_completion(&compl);
-
-	rc = -ENOMEM;
-	if ((p = kmalloc(ALLOC_SIZE, GFP_KERNEL)) == NULL)
-		goto err_alloc;
-	*p = 55;
-
-	cr = &sc->work_cr;
-	cr->bRequestType = USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE;
-	cr->bRequest = US_BULK_GET_MAX_LUN;
-	cr->wValue = cpu_to_le16(0);
-	cr->wIndex = cpu_to_le16(ifnum);
-	cr->wLength = cpu_to_le16(1);
-
-	usb_fill_control_urb(&sc->work_urb, sc->dev, sc->recv_ctrl_pipe,
-	    (unsigned char*) cr, p, 1, ub_probe_urb_complete, &compl);
-
-	if ((rc = usb_submit_urb(&sc->work_urb, GFP_KERNEL)) != 0)
-		goto err_submit;
-
-	init_timer(&timer);
-	timer.function = ub_probe_timeout;
-	timer.data = (unsigned long) &compl;
-	timer.expires = jiffies + UB_CTRL_TIMEOUT;
-	add_timer(&timer);
-
-	wait_for_completion(&compl);
-
-	del_timer_sync(&timer);
-	usb_kill_urb(&sc->work_urb);
-
-	if ((rc = sc->work_urb.status) < 0)
-		goto err_io;
-
-	if (sc->work_urb.actual_length != 1) {
-		nluns = 0;
-	} else {
-		if ((nluns = *p) == 55) {
-			nluns = 0;
-		} else {
-  			/* GetMaxLUN returns the maximum LUN number */
-			nluns += 1;
-			if (nluns > UB_MAX_LUNS)
-				nluns = UB_MAX_LUNS;
-		}
-	}
-
-	kfree(p);
-	return nluns;
-
-err_io:
-err_submit:
-	kfree(p);
-err_alloc:
-	return rc;
-}
-
-/*
- * Clear initial stalls.
- */
-static int ub_probe_clear_stall(struct ub_dev *sc, int stalled_pipe)
-{
-	int endp;
-	struct usb_ctrlrequest *cr;
-	struct completion compl;
-	struct timer_list timer;
-	int rc;
-
-	init_completion(&compl);
-
-	endp = usb_pipeendpoint(stalled_pipe);
-	if (usb_pipein (stalled_pipe))
-		endp |= USB_DIR_IN;
-
-	cr = &sc->work_cr;
-	cr->bRequestType = USB_RECIP_ENDPOINT;
-	cr->bRequest = USB_REQ_CLEAR_FEATURE;
-	cr->wValue = cpu_to_le16(USB_ENDPOINT_HALT);
-	cr->wIndex = cpu_to_le16(endp);
-	cr->wLength = cpu_to_le16(0);
-
-	usb_fill_control_urb(&sc->work_urb, sc->dev, sc->send_ctrl_pipe,
-	    (unsigned char*) cr, NULL, 0, ub_probe_urb_complete, &compl);
-
-	if ((rc = usb_submit_urb(&sc->work_urb, GFP_KERNEL)) != 0) {
-		printk(KERN_WARNING
-		     "%s: Unable to submit a probe clear (%d)\n", sc->name, rc);
-		return rc;
-	}
-
-	init_timer(&timer);
-	timer.function = ub_probe_timeout;
-	timer.data = (unsigned long) &compl;
-	timer.expires = jiffies + UB_CTRL_TIMEOUT;
-	add_timer(&timer);
-
-	wait_for_completion(&compl);
-
-	del_timer_sync(&timer);
-	usb_kill_urb(&sc->work_urb);
-
-	usb_reset_endpoint(sc->dev, endp);
-
-	return 0;
-}
-
-/*
- * Get the pipe settings.
- */
-static int ub_get_pipes(struct ub_dev *sc, struct usb_device *dev,
-    struct usb_interface *intf)
-{
-	struct usb_host_interface *altsetting = intf->cur_altsetting;
-	struct usb_endpoint_descriptor *ep_in = NULL;
-	struct usb_endpoint_descriptor *ep_out = NULL;
-	struct usb_endpoint_descriptor *ep;
-	int i;
-
-	/*
-	 * Find the endpoints we need.
-	 * We are expecting a minimum of 2 endpoints - in and out (bulk).
-	 * We will ignore any others.
-	 */
-	for (i = 0; i < altsetting->desc.bNumEndpoints; i++) {
-		ep = &altsetting->endpoint[i].desc;
-
-		/* Is it a BULK endpoint? */
-		if (usb_endpoint_xfer_bulk(ep)) {
-			/* BULK in or out? */
-			if (usb_endpoint_dir_in(ep)) {
-				if (ep_in == NULL)
-					ep_in = ep;
-			} else {
-				if (ep_out == NULL)
-					ep_out = ep;
-			}
-		}
-	}
-
-	if (ep_in == NULL || ep_out == NULL) {
-		printk(KERN_NOTICE "%s: failed endpoint check\n", sc->name);
-		return -ENODEV;
-	}
-
-	/* Calculate and store the pipe values */
-	sc->send_ctrl_pipe = usb_sndctrlpipe(dev, 0);
-	sc->recv_ctrl_pipe = usb_rcvctrlpipe(dev, 0);
-	sc->send_bulk_pipe = usb_sndbulkpipe(dev,
-		usb_endpoint_num(ep_out));
-	sc->recv_bulk_pipe = usb_rcvbulkpipe(dev, 
-		usb_endpoint_num(ep_in));
-
-	return 0;
-}
-
-/*
- * Probing is done in the process context, which allows us to cheat
- * and not to build a state machine for the discovery.
- */
-static int ub_probe(struct usb_interface *intf,
-    const struct usb_device_id *dev_id)
-{
-	struct ub_dev *sc;
-	int nluns;
-	int rc;
-	int i;
-
-	if (usb_usual_check_type(dev_id, USB_US_TYPE_UB))
-		return -ENXIO;
-
-	rc = -ENOMEM;
-	if ((sc = kzalloc(sizeof(struct ub_dev), GFP_KERNEL)) == NULL)
-		goto err_core;
-	sc->lock = ub_next_lock();
-	INIT_LIST_HEAD(&sc->luns);
-	usb_init_urb(&sc->work_urb);
-	tasklet_init(&sc->tasklet, ub_scsi_action, (unsigned long)sc);
-	atomic_set(&sc->poison, 0);
-	INIT_WORK(&sc->reset_work, ub_reset_task);
-	init_waitqueue_head(&sc->reset_wait);
-
-	init_timer(&sc->work_timer);
-	sc->work_timer.data = (unsigned long) sc;
-	sc->work_timer.function = ub_urb_timeout;
-
-	ub_init_completion(&sc->work_done);
-	sc->work_done.done = 1;		/* A little yuk, but oh well... */
-
-	sc->dev = interface_to_usbdev(intf);
-	sc->intf = intf;
-	// sc->ifnum = intf->cur_altsetting->desc.bInterfaceNumber;
-	usb_set_intfdata(intf, sc);
-	usb_get_dev(sc->dev);
-	/*
-	 * Since we give the interface struct to the block level through
-	 * disk->driverfs_dev, we have to pin it. Otherwise, block_uevent
-	 * oopses on close after a disconnect (kernels 2.6.16 and up).
-	 */
-	usb_get_intf(sc->intf);
-
-	snprintf(sc->name, 12, DRV_NAME "(%d.%d)",
-	    sc->dev->bus->busnum, sc->dev->devnum);
-
-	/* XXX Verify that we can handle the device (from descriptors) */
-
-	if (ub_get_pipes(sc, sc->dev, intf) != 0)
-		goto err_dev_desc;
-
-	/*
-	 * At this point, all USB initialization is done, do upper layer.
-	 * We really hate halfway initialized structures, so from the
-	 * invariants perspective, this ub_dev is fully constructed at
-	 * this point.
-	 */
-
-	/*
-	 * This is needed to clear toggles. It is a problem only if we do
-	 * `rmmod ub && modprobe ub` without disconnects, but we like that.
-	 */
-#if 0 /* iPod Mini fails if we do this (big white iPod works) */
-	ub_probe_clear_stall(sc, sc->recv_bulk_pipe);
-	ub_probe_clear_stall(sc, sc->send_bulk_pipe);
-#endif
-
-	/*
-	 * The way this is used by the startup code is a little specific.
-	 * A SCSI check causes a USB stall. Our common case code sees it
-	 * and clears the check, after which the device is ready for use.
-	 * But if a check was not present, any command other than
-	 * TEST_UNIT_READY ends with a lockup (including REQUEST_SENSE).
-	 *
-	 * If we neglect to clear the SCSI check, the first real command fails
-	 * (which is the capacity readout). We clear that and retry, but why
-	 * causing spurious retries for no reason.
-	 *
-	 * Revalidation may start with its own TEST_UNIT_READY, but that one
-	 * has to succeed, so we clear checks with an additional one here.
-	 * In any case it's not our business how revaliadation is implemented.
-	 */
-	for (i = 0; i < 3; i++) {  /* Retries for the schwag key from KS'04 */
-		if ((rc = ub_sync_tur(sc, NULL)) <= 0) break;
-		if (rc != 0x6) break;
-		msleep(10);
-	}
-
-	nluns = 1;
-	for (i = 0; i < 3; i++) {
-		if ((rc = ub_sync_getmaxlun(sc)) < 0)
-			break;
-		if (rc != 0) {
-			nluns = rc;
-			break;
-		}
-		msleep(100);
-	}
-
-	for (i = 0; i < nluns; i++) {
-		ub_probe_lun(sc, i);
-	}
-	return 0;
-
-err_dev_desc:
-	usb_set_intfdata(intf, NULL);
-	usb_put_intf(sc->intf);
-	usb_put_dev(sc->dev);
-	kfree(sc);
-err_core:
-	return rc;
-}
-
-static int ub_probe_lun(struct ub_dev *sc, int lnum)
-{
-	struct ub_lun *lun;
-	struct request_queue *q;
-	struct gendisk *disk;
-	int rc;
-
-	rc = -ENOMEM;
-	if ((lun = kzalloc(sizeof(struct ub_lun), GFP_KERNEL)) == NULL)
-		goto err_alloc;
-	lun->num = lnum;
-
-	rc = -ENOSR;
-	if ((lun->id = ub_id_get()) == -1)
-		goto err_id;
-
-	lun->udev = sc;
-
-	snprintf(lun->name, 16, DRV_NAME "%c(%d.%d.%d)",
-	    lun->id + 'a', sc->dev->bus->busnum, sc->dev->devnum, lun->num);
-
-	lun->removable = 1;		/* XXX Query this from the device */
-	lun->changed = 1;		/* ub_revalidate clears only */
-	ub_revalidate(sc, lun);
-
-	rc = -ENOMEM;
-	if ((disk = alloc_disk(UB_PARTS_PER_LUN)) == NULL)
-		goto err_diskalloc;
-
-	sprintf(disk->disk_name, DRV_NAME "%c", lun->id + 'a');
-	disk->major = UB_MAJOR;
-	disk->first_minor = lun->id * UB_PARTS_PER_LUN;
-	disk->fops = &ub_bd_fops;
-	disk->private_data = lun;
-	disk->driverfs_dev = &sc->intf->dev;
-
-	rc = -ENOMEM;
-	if ((q = blk_init_queue(ub_request_fn, sc->lock)) == NULL)
-		goto err_blkqinit;
-
-	disk->queue = q;
-
-	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
-	blk_queue_max_hw_segments(q, UB_MAX_REQ_SG);
-	blk_queue_max_phys_segments(q, UB_MAX_REQ_SG);
-	blk_queue_segment_boundary(q, 0xffffffff);	/* Dubious. */
-	blk_queue_max_sectors(q, UB_MAX_SECTORS);
-	blk_queue_logical_block_size(q, lun->capacity.bsize);
-
-	lun->disk = disk;
-	q->queuedata = lun;
-	list_add(&lun->link, &sc->luns);
-
-	set_capacity(disk, lun->capacity.nsec);
-	if (lun->removable)
-		disk->flags |= GENHD_FL_REMOVABLE;
-
-	add_disk(disk);
-
-	return 0;
-
-err_blkqinit:
-	put_disk(disk);
-err_diskalloc:
-	ub_id_put(lun->id);
-err_id:
-	kfree(lun);
-err_alloc:
-	return rc;
-}
-
-static void ub_disconnect(struct usb_interface *intf)
-{
-	struct ub_dev *sc = usb_get_intfdata(intf);
-	struct ub_lun *lun;
-	unsigned long flags;
-
-	/*
-	 * Prevent ub_bd_release from pulling the rug from under us.
-	 * XXX This is starting to look like a kref.
-	 * XXX Why not to take this ref at probe time?
-	 */
-	spin_lock_irqsave(&ub_lock, flags);
-	sc->openc++;
-	spin_unlock_irqrestore(&ub_lock, flags);
-
-	/*
-	 * Fence stall clearings, operations triggered by unlinkings and so on.
-	 * We do not attempt to unlink any URBs, because we do not trust the
-	 * unlink paths in HC drivers. Also, we get -84 upon disconnect anyway.
-	 */
-	atomic_set(&sc->poison, 1);
-
-	/*
-	 * Wait for reset to end, if any.
-	 */
-	wait_event(sc->reset_wait, !sc->reset);
-
-	/*
-	 * Blow away queued commands.
-	 *
-	 * Actually, this never works, because before we get here
-	 * the HCD terminates outstanding URB(s). It causes our
-	 * SCSI command queue to advance, commands fail to submit,
-	 * and the whole queue drains. So, we just use this code to
-	 * print warnings.
-	 */
-	spin_lock_irqsave(sc->lock, flags);
-	{
-		struct ub_scsi_cmd *cmd;
-		int cnt = 0;
-		while ((cmd = ub_cmdq_peek(sc)) != NULL) {
-			cmd->error = -ENOTCONN;
-			cmd->state = UB_CMDST_DONE;
-			ub_cmdq_pop(sc);
-			(*cmd->done)(sc, cmd);
-			cnt++;
-		}
-		if (cnt != 0) {
-			printk(KERN_WARNING "%s: "
-			    "%d was queued after shutdown\n", sc->name, cnt);
-		}
-	}
-	spin_unlock_irqrestore(sc->lock, flags);
-
-	/*
-	 * Unregister the upper layer.
-	 */
-	list_for_each_entry(lun, &sc->luns, link) {
-		del_gendisk(lun->disk);
-		/*
-		 * I wish I could do:
-		 *    queue_flag_set(QUEUE_FLAG_DEAD, q);
-		 * As it is, we rely on our internal poisoning and let
-		 * the upper levels to spin furiously failing all the I/O.
-		 */
-	}
-
-	/*
-	 * Testing for -EINPROGRESS is always a bug, so we are bending
-	 * the rules a little.
-	 */
-	spin_lock_irqsave(sc->lock, flags);
-	if (sc->work_urb.status == -EINPROGRESS) {	/* janitors: ignore */
-		printk(KERN_WARNING "%s: "
-		    "URB is active after disconnect\n", sc->name);
-	}
-	spin_unlock_irqrestore(sc->lock, flags);
-
-	/*
-	 * There is virtually no chance that other CPU runs a timeout so long
-	 * after ub_urb_complete should have called del_timer, but only if HCD
-	 * didn't forget to deliver a callback on unlink.
-	 */
-	del_timer_sync(&sc->work_timer);
-
-	/*
-	 * At this point there must be no commands coming from anyone
-	 * and no URBs left in transit.
-	 */
-
-	ub_put(sc);
-}
-
-static struct usb_driver ub_driver = {
-	.name =		"ub",
-	.probe =	ub_probe,
-	.disconnect =	ub_disconnect,
-	.id_table =	ub_usb_ids,
-	.pre_reset =	ub_pre_reset,
-	.post_reset =	ub_post_reset,
-};
-
-static int __init ub_init(void)
-{
-	int rc;
-	int i;
-
-	for (i = 0; i < UB_QLOCK_NUM; i++)
-		spin_lock_init(&ub_qlockv[i]);
-
-	if ((rc = register_blkdev(UB_MAJOR, DRV_NAME)) != 0)
-		goto err_regblkdev;
-
-	if ((rc = usb_register(&ub_driver)) != 0)
-		goto err_register;
-
-	usb_usual_set_present(USB_US_TYPE_UB);
-	return 0;
-
-err_register:
-	unregister_blkdev(UB_MAJOR, DRV_NAME);
-err_regblkdev:
-	return rc;
-}
-
-static void __exit ub_exit(void)
-{
-	usb_deregister(&ub_driver);
-
-	unregister_blkdev(UB_MAJOR, DRV_NAME);
-	usb_usual_clear_present(USB_US_TYPE_UB);
-}
-
-module_init(ub_init);
-module_exit(ub_exit);
-
-MODULE_LICENSE("GPL");
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 858c34dd032..4cf81b5bf0f 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -40,13 +40,13 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
+#include <linux/gfp.h>
 #include <linux/ioctl.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/timer.h>
 #include <linux/pci.h>
-#include <linux/slab.h>
 #include <linux/dma-mapping.h>
 
 #include <linux/fcntl.h>        /* O_ACCMODE */
@@ -108,8 +108,7 @@ struct cardinfo {
 				    * have been written
 				    */
 	struct bio	*bio, *currentbio, **biotail;
-	int		current_idx;
-	sector_t	current_sector;
+	struct bvec_iter current_iter;
 
 	struct request_queue *queue;
 
@@ -118,7 +117,7 @@ struct cardinfo {
 		struct mm_dma_desc	*desc;
 		int	 		cnt, headcnt;
 		struct bio		*bio, **biotail;
-		int			idx;
+		struct bvec_iter	iter;
 	} mm_pages[2];
 #define DESC_PER_PAGE ((PAGE_SIZE*2)/sizeof(struct mm_dma_desc))
 
@@ -140,7 +139,6 @@ struct cardinfo {
 };
 
 static struct cardinfo cards[MM_MAXCARDS];
-static struct block_device_operations mm_fops;
 static struct timer_list battery_timer;
 
 static int num_cards;
@@ -242,8 +240,7 @@ static void dump_dmastat(struct cardinfo *card, unsigned int dmastat)
  *
  * Whenever IO on the active page completes, the Ready page is activated
  * and the ex-Active page is clean out and made Ready.
- * Otherwise the Ready page is only activated when it becomes full, or
- * when mm_unplug_device is called via the unplug_io_fn.
+ * Otherwise the Ready page is only activated when it becomes full.
  *
  * If a request arrives while both pages a full, it is queued, and b_rdev is
  * overloaded to record whether it was a read or a write.
@@ -334,17 +331,6 @@ static inline void reset_page(struct mm_page *page)
 	page->biotail = &page->bio;
 }
 
-static void mm_unplug_device(struct request_queue *q)
-{
-	struct cardinfo *card = q->queuedata;
-	unsigned long flags;
-
-	spin_lock_irqsave(&card->lock, flags);
-	if (blk_remove_plug(q))
-		activate(card);
-	spin_unlock_irqrestore(&card->lock, flags);
-}
-
 /*
  * If there is room on Ready page, take
  * one bh off list and add it.
@@ -357,16 +343,13 @@ static int add_bio(struct cardinfo *card)
 	dma_addr_t dma_handle;
 	int offset;
 	struct bio *bio;
-	struct bio_vec *vec;
-	int idx;
+	struct bio_vec vec;
 	int rw;
-	int len;
 
 	bio = card->currentbio;
 	if (!bio && card->bio) {
 		card->currentbio = card->bio;
-		card->current_idx = card->bio->bi_idx;
-		card->current_sector = card->bio->bi_sector;
+		card->current_iter = card->bio->bi_iter;
 		card->bio = card->bio->bi_next;
 		if (card->bio == NULL)
 			card->biotail = &card->bio;
@@ -375,18 +358,17 @@ static int add_bio(struct cardinfo *card)
 	}
 	if (!bio)
 		return 0;
-	idx = card->current_idx;
 
 	rw = bio_rw(bio);
 	if (card->mm_pages[card->Ready].cnt >= DESC_PER_PAGE)
 		return 0;
 
-	vec = bio_iovec_idx(bio, idx);
-	len = vec->bv_len;
+	vec = bio_iter_iovec(bio, card->current_iter);
+
 	dma_handle = pci_map_page(card->dev,
-				  vec->bv_page,
-				  vec->bv_offset,
-				  len,
+				  vec.bv_page,
+				  vec.bv_offset,
+				  vec.bv_len,
 				  (rw == READ) ?
 				  PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE);
 
@@ -394,7 +376,7 @@ static int add_bio(struct cardinfo *card)
 	desc = &p->desc[p->cnt];
 	p->cnt++;
 	if (p->bio == NULL)
-		p->idx = idx;
+		p->iter = card->current_iter;
 	if ((p->biotail) != &bio->bi_next) {
 		*(p->biotail) = bio;
 		p->biotail = &(bio->bi_next);
@@ -404,8 +386,8 @@ static int add_bio(struct cardinfo *card)
 	desc->data_dma_handle = dma_handle;
 
 	desc->pci_addr = cpu_to_le64((u64)desc->data_dma_handle);
-	desc->local_addr = cpu_to_le64(card->current_sector << 9);
-	desc->transfer_size = cpu_to_le32(len);
+	desc->local_addr = cpu_to_le64(card->current_iter.bi_sector << 9);
+	desc->transfer_size = cpu_to_le32(vec.bv_len);
 	offset = (((char *)&desc->sem_control_bits) - ((char *)p->desc));
 	desc->sem_addr = cpu_to_le64((u64)(p->page_dma+offset));
 	desc->zero1 = desc->zero2 = 0;
@@ -420,10 +402,9 @@ static int add_bio(struct cardinfo *card)
 		desc->control_bits |= cpu_to_le32(DMASCR_TRANSFER_READ);
 	desc->sem_control_bits = desc->control_bits;
 
-	card->current_sector += (len >> 9);
-	idx++;
-	card->current_idx = idx;
-	if (idx >= bio->bi_vcnt)
+
+	bio_advance_iter(bio, &card->current_iter, vec.bv_len);
+	if (!card->current_iter.bi_size)
 		card->currentbio = NULL;
 
 	return 1;
@@ -452,23 +433,25 @@ static void process_page(unsigned long data)
 		struct mm_dma_desc *desc = &page->desc[page->headcnt];
 		int control = le32_to_cpu(desc->sem_control_bits);
 		int last = 0;
-		int idx;
+		struct bio_vec vec;
 
 		if (!(control & DMASCR_DMA_COMPLETE)) {
 			control = dma_status;
 			last = 1;
 		}
+
 		page->headcnt++;
-		idx = page->idx;
-		page->idx++;
-		if (page->idx >= bio->bi_vcnt) {
+		vec = bio_iter_iovec(bio, page->iter);
+		bio_advance_iter(bio, &page->iter, vec.bv_len);
+
+		if (!page->iter.bi_size) {
 			page->bio = bio->bi_next;
 			if (page->bio)
-				page->idx = page->bio->bi_idx;
+				page->iter = page->bio->bi_iter;
 		}
 
 		pci_unmap_page(card->dev, desc->data_dma_handle,
-			       bio_iovec_idx(bio, idx)->bv_len,
+			       vec.bv_len,
 				 (control & DMASCR_TRANSFER_READ) ?
 				PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE);
 		if (control & DMASCR_HARD_ERROR) {
@@ -479,7 +462,7 @@ static void process_page(unsigned long data)
 				le32_to_cpu(desc->local_addr)>>9,
 				le32_to_cpu(desc->transfer_size));
 			dump_dmastat(card, control);
-		} else if (test_bit(BIO_RW, &bio->bi_rw) &&
+		} else if ((bio->bi_rw & REQ_WRITE) &&
 			   le32_to_cpu(desc->local_addr) >> 9 ==
 				card->init_size) {
 			card->init_size += le32_to_cpu(desc->transfer_size) >> 9;
@@ -526,20 +509,37 @@ static void process_page(unsigned long data)
 	}
 }
 
-static int mm_make_request(struct request_queue *q, struct bio *bio)
+static void mm_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+	struct cardinfo *card = cb->data;
+
+	spin_lock_irq(&card->lock);
+	activate(card);
+	spin_unlock_irq(&card->lock);
+	kfree(cb);
+}
+
+static int mm_check_plugged(struct cardinfo *card)
+{
+	return !!blk_check_plugged(mm_unplug, card, sizeof(struct blk_plug_cb));
+}
+
+static void mm_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct cardinfo *card = q->queuedata;
 	pr_debug("mm_make_request %llu %u\n",
-		 (unsigned long long)bio->bi_sector, bio->bi_size);
+		 (unsigned long long)bio->bi_iter.bi_sector,
+		 bio->bi_iter.bi_size);
 
 	spin_lock_irq(&card->lock);
 	*card->biotail = bio;
 	bio->bi_next = NULL;
 	card->biotail = &bio->bi_next;
-	blk_plug_device(q);
+	if (bio->bi_rw & REQ_SYNC || !mm_check_plugged(card))
+		activate(card);
 	spin_unlock_irq(&card->lock);
 
-	return 0;
+	return;
 }
 
 static irqreturn_t mm_interrupt(int irq, void *__card)
@@ -780,24 +780,13 @@ static int mm_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-/*
- * Future support for removable devices
- */
-static int mm_check_change(struct gendisk *disk)
-{
-/*  struct cardinfo *dev = disk->private_data; */
-	return 0;
-}
-
-static struct block_device_operations mm_fops = {
+static const struct block_device_operations mm_fops = {
 	.owner		= THIS_MODULE,
 	.getgeo		= mm_getgeo,
 	.revalidate_disk = mm_revalidate,
-	.media_changed	= mm_check_change,
 };
 
-static int __devinit mm_pci_probe(struct pci_dev *dev,
-				const struct pci_device_id *id)
+static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
 {
 	int ret = -ENODEV;
 	struct cardinfo *card = &cards[num_cards];
@@ -908,7 +897,6 @@ static int __devinit mm_pci_probe(struct pci_dev *dev,
 	blk_queue_make_request(card->queue, mm_make_request);
 	card->queue->queue_lock = &card->lock;
 	card->queue->queuedata = card;
-	card->queue->unplug_fn = mm_unplug_device;
 
 	tasklet_init(&card->tasklet, process_page, (unsigned long)card);
 
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
deleted file mode 100644
index 390d69bb7c4..00000000000
--- a/drivers/block/viodasd.c
+++ /dev/null
@@ -1,812 +0,0 @@
-/* -*- linux-c -*-
- * viodasd.c
- *  Authors: Dave Boutcher <boutcher@us.ibm.com>
- *           Ryan Arnold <ryanarn@us.ibm.com>
- *           Colin Devilbiss <devilbis@us.ibm.com>
- *           Stephen Rothwell
- *
- * (C) Copyright 2000-2004 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * This routine provides access to disk space (termed "DASD" in historical
- * IBM terms) owned and managed by an OS/400 partition running on the
- * same box as this Linux partition.
- *
- * All disk operations are performed by sending messages back and forth to
- * the OS/400 partition.
- */
-#include <linux/major.h>
-#include <linux/fs.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/blkdev.h>
-#include <linux/genhd.h>
-#include <linux/hdreg.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/dma-mapping.h>
-#include <linux/completion.h>
-#include <linux/device.h>
-#include <linux/scatterlist.h>
-
-#include <asm/uaccess.h>
-#include <asm/vio.h>
-#include <asm/iseries/hv_types.h>
-#include <asm/iseries/hv_lp_event.h>
-#include <asm/iseries/hv_lp_config.h>
-#include <asm/iseries/vio.h>
-#include <asm/firmware.h>
-
-MODULE_DESCRIPTION("iSeries Virtual DASD");
-MODULE_AUTHOR("Dave Boutcher");
-MODULE_LICENSE("GPL");
-
-/*
- * We only support 7 partitions per physical disk....so with minor
- * numbers 0-255 we get a maximum of 32 disks.
- */
-#define VIOD_GENHD_NAME		"iseries/vd"
-
-#define VIOD_VERS		"1.64"
-
-#define VIOD_KERN_WARNING	KERN_WARNING "viod: "
-#define VIOD_KERN_INFO		KERN_INFO "viod: "
-
-enum {
-	PARTITION_SHIFT = 3,
-	MAX_DISKNO = HVMAXARCHITECTEDVIRTUALDISKS,
-	MAX_DISK_NAME = FIELD_SIZEOF(struct gendisk, disk_name)
-};
-
-static DEFINE_SPINLOCK(viodasd_spinlock);
-
-#define VIOMAXREQ		16
-
-#define DEVICE_NO(cell)	((struct viodasd_device *)(cell) - &viodasd_devices[0])
-
-struct viodasd_waitevent {
-	struct completion	com;
-	int			rc;
-	u16			sub_result;
-	int			max_disk;	/* open */
-};
-
-static const struct vio_error_entry viodasd_err_table[] = {
-	{ 0x0201, EINVAL, "Invalid Range" },
-	{ 0x0202, EINVAL, "Invalid Token" },
-	{ 0x0203, EIO, "DMA Error" },
-	{ 0x0204, EIO, "Use Error" },
-	{ 0x0205, EIO, "Release Error" },
-	{ 0x0206, EINVAL, "Invalid Disk" },
-	{ 0x0207, EBUSY, "Cant Lock" },
-	{ 0x0208, EIO, "Already Locked" },
-	{ 0x0209, EIO, "Already Unlocked" },
-	{ 0x020A, EIO, "Invalid Arg" },
-	{ 0x020B, EIO, "Bad IFS File" },
-	{ 0x020C, EROFS, "Read Only Device" },
-	{ 0x02FF, EIO, "Internal Error" },
-	{ 0x0000, 0, NULL },
-};
-
-/*
- * Figure out the biggest I/O request (in sectors) we can accept
- */
-#define VIODASD_MAXSECTORS (4096 / 512 * VIOMAXBLOCKDMA)
-
-/*
- * Number of disk I/O requests we've sent to OS/400
- */
-static int num_req_outstanding;
-
-/*
- * This is our internal structure for keeping track of disk devices
- */
-struct viodasd_device {
-	u16		cylinders;
-	u16		tracks;
-	u16		sectors;
-	u16		bytes_per_sector;
-	u64		size;
-	int		read_only;
-	spinlock_t	q_lock;
-	struct gendisk	*disk;
-	struct device	*dev;
-} viodasd_devices[MAX_DISKNO];
-
-/*
- * External open entry point.
- */
-static int viodasd_open(struct block_device *bdev, fmode_t mode)
-{
-	struct viodasd_device *d = bdev->bd_disk->private_data;
-	HvLpEvent_Rc hvrc;
-	struct viodasd_waitevent we;
-	u16 flags = 0;
-
-	if (d->read_only) {
-		if (mode & FMODE_WRITE)
-			return -EROFS;
-		flags = vioblockflags_ro;
-	}
-
-	init_completion(&we.com);
-
-	/* Send the open event to OS/400 */
-	hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
-			HvLpEvent_Type_VirtualIo,
-			viomajorsubtype_blockio | vioblockopen,
-			HvLpEvent_AckInd_DoAck, HvLpEvent_AckType_ImmediateAck,
-			viopath_sourceinst(viopath_hostLp),
-			viopath_targetinst(viopath_hostLp),
-			(u64)(unsigned long)&we, VIOVERSION << 16,
-			((u64)DEVICE_NO(d) << 48) | ((u64)flags << 32),
-			0, 0, 0);
-	if (hvrc != 0) {
-		printk(VIOD_KERN_WARNING "HV open failed %d\n", (int)hvrc);
-		return -EIO;
-	}
-
-	wait_for_completion(&we.com);
-
-	/* Check the return code */
-	if (we.rc != 0) {
-		const struct vio_error_entry *err =
-			vio_lookup_rc(viodasd_err_table, we.sub_result);
-
-		printk(VIOD_KERN_WARNING
-				"bad rc opening disk: %d:0x%04x (%s)\n",
-				(int)we.rc, we.sub_result, err->msg);
-		return -EIO;
-	}
-
-	return 0;
-}
-
-/*
- * External release entry point.
- */
-static int viodasd_release(struct gendisk *disk, fmode_t mode)
-{
-	struct viodasd_device *d = disk->private_data;
-	HvLpEvent_Rc hvrc;
-
-	/* Send the event to OS/400.  We DON'T expect a response */
-	hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
-			HvLpEvent_Type_VirtualIo,
-			viomajorsubtype_blockio | vioblockclose,
-			HvLpEvent_AckInd_NoAck, HvLpEvent_AckType_ImmediateAck,
-			viopath_sourceinst(viopath_hostLp),
-			viopath_targetinst(viopath_hostLp),
-			0, VIOVERSION << 16,
-			((u64)DEVICE_NO(d) << 48) /* | ((u64)flags << 32) */,
-			0, 0, 0);
-	if (hvrc != 0)
-		printk(VIOD_KERN_WARNING "HV close call failed %d\n",
-				(int)hvrc);
-	return 0;
-}
-
-
-/* External ioctl entry point.
- */
-static int viodasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
-	struct gendisk *disk = bdev->bd_disk;
-	struct viodasd_device *d = disk->private_data;
-
-	geo->sectors = d->sectors ? d->sectors : 32;
-	geo->heads = d->tracks ? d->tracks  : 64;
-	geo->cylinders = d->cylinders ? d->cylinders :
-		get_capacity(disk) / (geo->sectors * geo->heads);
-
-	return 0;
-}
-
-/*
- * Our file operations table
- */
-static struct block_device_operations viodasd_fops = {
-	.owner = THIS_MODULE,
-	.open = viodasd_open,
-	.release = viodasd_release,
-	.getgeo = viodasd_getgeo,
-};
-
-/*
- * End a request
- */
-static void viodasd_end_request(struct request *req, int error,
-		int num_sectors)
-{
-	__blk_end_request(req, error, num_sectors << 9);
-}
-
-/*
- * Send an actual I/O request to OS/400
- */
-static int send_request(struct request *req)
-{
-	u64 start;
-	int direction;
-	int nsg;
-	u16 viocmd;
-	HvLpEvent_Rc hvrc;
-	struct vioblocklpevent *bevent;
-	struct HvLpEvent *hev;
-	struct scatterlist sg[VIOMAXBLOCKDMA];
-	int sgindex;
-	struct viodasd_device *d;
-	unsigned long flags;
-
-	start = (u64)blk_rq_pos(req) << 9;
-
-	if (rq_data_dir(req) == READ) {
-		direction = DMA_FROM_DEVICE;
-		viocmd = viomajorsubtype_blockio | vioblockread;
-	} else {
-		direction = DMA_TO_DEVICE;
-		viocmd = viomajorsubtype_blockio | vioblockwrite;
-	}
-
-        d = req->rq_disk->private_data;
-
-	/* Now build the scatter-gather list */
-	sg_init_table(sg, VIOMAXBLOCKDMA);
-	nsg = blk_rq_map_sg(req->q, req, sg);
-	nsg = dma_map_sg(d->dev, sg, nsg, direction);
-
-	spin_lock_irqsave(&viodasd_spinlock, flags);
-	num_req_outstanding++;
-
-	/* This optimization handles a single DMA block */
-	if (nsg == 1)
-		hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
-				HvLpEvent_Type_VirtualIo, viocmd,
-				HvLpEvent_AckInd_DoAck,
-				HvLpEvent_AckType_ImmediateAck,
-				viopath_sourceinst(viopath_hostLp),
-				viopath_targetinst(viopath_hostLp),
-				(u64)(unsigned long)req, VIOVERSION << 16,
-				((u64)DEVICE_NO(d) << 48), start,
-				((u64)sg_dma_address(&sg[0])) << 32,
-				sg_dma_len(&sg[0]));
-	else {
-		bevent = (struct vioblocklpevent *)
-			vio_get_event_buffer(viomajorsubtype_blockio);
-		if (bevent == NULL) {
-			printk(VIOD_KERN_WARNING
-			       "error allocating disk event buffer\n");
-			goto error_ret;
-		}
-
-		/*
-		 * Now build up the actual request.  Note that we store
-		 * the pointer to the request in the correlation
-		 * token so we can match the response up later
-		 */
-		memset(bevent, 0, sizeof(struct vioblocklpevent));
-		hev = &bevent->event;
-		hev->flags = HV_LP_EVENT_VALID | HV_LP_EVENT_DO_ACK |
-			HV_LP_EVENT_INT;
-		hev->xType = HvLpEvent_Type_VirtualIo;
-		hev->xSubtype = viocmd;
-		hev->xSourceLp = HvLpConfig_getLpIndex();
-		hev->xTargetLp = viopath_hostLp;
-		hev->xSizeMinus1 =
-			offsetof(struct vioblocklpevent, u.rw_data.dma_info) +
-			(sizeof(bevent->u.rw_data.dma_info[0]) * nsg) - 1;
-		hev->xSourceInstanceId = viopath_sourceinst(viopath_hostLp);
-		hev->xTargetInstanceId = viopath_targetinst(viopath_hostLp);
-		hev->xCorrelationToken = (u64)req;
-		bevent->version = VIOVERSION;
-		bevent->disk = DEVICE_NO(d);
-		bevent->u.rw_data.offset = start;
-
-		/*
-		 * Copy just the dma information from the sg list
-		 * into the request
-		 */
-		for (sgindex = 0; sgindex < nsg; sgindex++) {
-			bevent->u.rw_data.dma_info[sgindex].token =
-				sg_dma_address(&sg[sgindex]);
-			bevent->u.rw_data.dma_info[sgindex].len =
-				sg_dma_len(&sg[sgindex]);
-		}
-
-		/* Send the request */
-		hvrc = HvCallEvent_signalLpEvent(&bevent->event);
-		vio_free_event_buffer(viomajorsubtype_blockio, bevent);
-	}
-
-	if (hvrc != HvLpEvent_Rc_Good) {
-		printk(VIOD_KERN_WARNING
-		       "error sending disk event to OS/400 (rc %d)\n",
-		       (int)hvrc);
-		goto error_ret;
-	}
-	spin_unlock_irqrestore(&viodasd_spinlock, flags);
-	return 0;
-
-error_ret:
-	num_req_outstanding--;
-	spin_unlock_irqrestore(&viodasd_spinlock, flags);
-	dma_unmap_sg(d->dev, sg, nsg, direction);
-	return -1;
-}
-
-/*
- * This is the external request processing routine
- */
-static void do_viodasd_request(struct request_queue *q)
-{
-	struct request *req;
-
-	/*
-	 * If we already have the maximum number of requests
-	 * outstanding to OS/400 just bail out. We'll come
-	 * back later.
-	 */
-	while (num_req_outstanding < VIOMAXREQ) {
-		req = blk_fetch_request(q);
-		if (req == NULL)
-			return;
-		/* check that request contains a valid command */
-		if (!blk_fs_request(req)) {
-			viodasd_end_request(req, -EIO, blk_rq_sectors(req));
-			continue;
-		}
-		/* Try sending the request */
-		if (send_request(req) != 0)
-			viodasd_end_request(req, -EIO, blk_rq_sectors(req));
-	}
-}
-
-/*
- * Probe a single disk and fill in the viodasd_device structure
- * for it.
- */
-static int probe_disk(struct viodasd_device *d)
-{
-	HvLpEvent_Rc hvrc;
-	struct viodasd_waitevent we;
-	int dev_no = DEVICE_NO(d);
-	struct gendisk *g;
-	struct request_queue *q;
-	u16 flags = 0;
-
-retry:
-	init_completion(&we.com);
-
-	/* Send the open event to OS/400 */
-	hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
-			HvLpEvent_Type_VirtualIo,
-			viomajorsubtype_blockio | vioblockopen,
-			HvLpEvent_AckInd_DoAck, HvLpEvent_AckType_ImmediateAck,
-			viopath_sourceinst(viopath_hostLp),
-			viopath_targetinst(viopath_hostLp),
-			(u64)(unsigned long)&we, VIOVERSION << 16,
-			((u64)dev_no << 48) | ((u64)flags<< 32),
-			0, 0, 0);
-	if (hvrc != 0) {
-		printk(VIOD_KERN_WARNING "bad rc on HV open %d\n", (int)hvrc);
-		return 0;
-	}
-
-	wait_for_completion(&we.com);
-
-	if (we.rc != 0) {
-		if (flags != 0)
-			return 0;
-		/* try again with read only flag set */
-		flags = vioblockflags_ro;
-		goto retry;
-	}
-	if (we.max_disk > (MAX_DISKNO - 1)) {
-		static int warned;
-
-		if (warned == 0) {
-			warned++;
-			printk(VIOD_KERN_INFO
-				"Only examining the first %d "
-				"of %d disks connected\n",
-				MAX_DISKNO, we.max_disk + 1);
-		}
-	}
-
-	/* Send the close event to OS/400.  We DON'T expect a response */
-	hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
-			HvLpEvent_Type_VirtualIo,
-			viomajorsubtype_blockio | vioblockclose,
-			HvLpEvent_AckInd_NoAck, HvLpEvent_AckType_ImmediateAck,
-			viopath_sourceinst(viopath_hostLp),
-			viopath_targetinst(viopath_hostLp),
-			0, VIOVERSION << 16,
-			((u64)dev_no << 48) | ((u64)flags << 32),
-			0, 0, 0);
-	if (hvrc != 0) {
-		printk(VIOD_KERN_WARNING
-		       "bad rc sending event to OS/400 %d\n", (int)hvrc);
-		return 0;
-	}
-
-	if (d->dev == NULL) {
-		/* this is when we reprobe for new disks */
-		if (vio_create_viodasd(dev_no) == NULL) {
-			printk(VIOD_KERN_WARNING
-				"cannot allocate virtual device for disk %d\n",
-				dev_no);
-			return 0;
-		}
-		/*
-		 * The vio_create_viodasd will have recursed into this
-		 * routine with d->dev set to the new vio device and
-		 * will finish the setup of the disk below.
-		 */
-		return 1;
-	}
-
-	/* create the request queue for the disk */
-	spin_lock_init(&d->q_lock);
-	q = blk_init_queue(do_viodasd_request, &d->q_lock);
-	if (q == NULL) {
-		printk(VIOD_KERN_WARNING "cannot allocate queue for disk %d\n",
-				dev_no);
-		return 0;
-	}
-	g = alloc_disk(1 << PARTITION_SHIFT);
-	if (g == NULL) {
-		printk(VIOD_KERN_WARNING
-				"cannot allocate disk structure for disk %d\n",
-				dev_no);
-		blk_cleanup_queue(q);
-		return 0;
-	}
-
-	d->disk = g;
-	blk_queue_max_hw_segments(q, VIOMAXBLOCKDMA);
-	blk_queue_max_phys_segments(q, VIOMAXBLOCKDMA);
-	blk_queue_max_sectors(q, VIODASD_MAXSECTORS);
-	g->major = VIODASD_MAJOR;
-	g->first_minor = dev_no << PARTITION_SHIFT;
-	if (dev_no >= 26)
-		snprintf(g->disk_name, sizeof(g->disk_name),
-				VIOD_GENHD_NAME "%c%c",
-				'a' + (dev_no / 26) - 1, 'a' + (dev_no % 26));
-	else
-		snprintf(g->disk_name, sizeof(g->disk_name),
-				VIOD_GENHD_NAME "%c", 'a' + (dev_no % 26));
-	g->fops = &viodasd_fops;
-	g->queue = q;
-	g->private_data = d;
-	g->driverfs_dev = d->dev;
-	set_capacity(g, d->size >> 9);
-
-	printk(VIOD_KERN_INFO "disk %d: %lu sectors (%lu MB) "
-			"CHS=%d/%d/%d sector size %d%s\n",
-			dev_no, (unsigned long)(d->size >> 9),
-			(unsigned long)(d->size >> 20),
-			(int)d->cylinders, (int)d->tracks,
-			(int)d->sectors, (int)d->bytes_per_sector,
-			d->read_only ? " (RO)" : "");
-
-	/* register us in the global list */
-	add_disk(g);
-	return 1;
-}
-
-/* returns the total number of scatterlist elements converted */
-static int block_event_to_scatterlist(const struct vioblocklpevent *bevent,
-		struct scatterlist *sg, int *total_len)
-{
-	int i, numsg;
-	const struct rw_data *rw_data = &bevent->u.rw_data;
-	static const int offset =
-		offsetof(struct vioblocklpevent, u.rw_data.dma_info);
-	static const int element_size = sizeof(rw_data->dma_info[0]);
-
-	numsg = ((bevent->event.xSizeMinus1 + 1) - offset) / element_size;
-	if (numsg > VIOMAXBLOCKDMA)
-		numsg = VIOMAXBLOCKDMA;
-
-	*total_len = 0;
-	sg_init_table(sg, VIOMAXBLOCKDMA);
-	for (i = 0; (i < numsg) && (rw_data->dma_info[i].len > 0); ++i) {
-		sg_dma_address(&sg[i]) = rw_data->dma_info[i].token;
-		sg_dma_len(&sg[i]) = rw_data->dma_info[i].len;
-		*total_len += rw_data->dma_info[i].len;
-	}
-	return i;
-}
-
-/*
- * Restart all queues, starting with the one _after_ the disk given,
- * thus reducing the chance of starvation of higher numbered disks.
- */
-static void viodasd_restart_all_queues_starting_from(int first_index)
-{
-	int i;
-
-	for (i = first_index + 1; i < MAX_DISKNO; ++i)
-		if (viodasd_devices[i].disk)
-			blk_run_queue(viodasd_devices[i].disk->queue);
-	for (i = 0; i <= first_index; ++i)
-		if (viodasd_devices[i].disk)
-			blk_run_queue(viodasd_devices[i].disk->queue);
-}
-
-/*
- * For read and write requests, decrement the number of outstanding requests,
- * Free the DMA buffers we allocated.
- */
-static int viodasd_handle_read_write(struct vioblocklpevent *bevent)
-{
-	int num_sg, num_sect, pci_direction, total_len;
-	struct request *req;
-	struct scatterlist sg[VIOMAXBLOCKDMA];
-	struct HvLpEvent *event = &bevent->event;
-	unsigned long irq_flags;
-	struct viodasd_device *d;
-	int error;
-	spinlock_t *qlock;
-
-	num_sg = block_event_to_scatterlist(bevent, sg, &total_len);
-	num_sect = total_len >> 9;
-	if (event->xSubtype == (viomajorsubtype_blockio | vioblockread))
-		pci_direction = DMA_FROM_DEVICE;
-	else
-		pci_direction = DMA_TO_DEVICE;
-	req = (struct request *)bevent->event.xCorrelationToken;
-	d = req->rq_disk->private_data;
-
-	dma_unmap_sg(d->dev, sg, num_sg, pci_direction);
-
-	/*
-	 * Since this is running in interrupt mode, we need to make sure
-	 * we're not stepping on any global I/O operations
-	 */
-	spin_lock_irqsave(&viodasd_spinlock, irq_flags);
-	num_req_outstanding--;
-	spin_unlock_irqrestore(&viodasd_spinlock, irq_flags);
-
-	error = (event->xRc == HvLpEvent_Rc_Good) ? 0 : -EIO;
-	if (error) {
-		const struct vio_error_entry *err;
-		err = vio_lookup_rc(viodasd_err_table, bevent->sub_result);
-		printk(VIOD_KERN_WARNING "read/write error %d:0x%04x (%s)\n",
-				event->xRc, bevent->sub_result, err->msg);
-		num_sect = blk_rq_sectors(req);
-	}
-	qlock = req->q->queue_lock;
-	spin_lock_irqsave(qlock, irq_flags);
-	viodasd_end_request(req, error, num_sect);
-	spin_unlock_irqrestore(qlock, irq_flags);
-
-	/* Finally, try to get more requests off of this device's queue */
-	viodasd_restart_all_queues_starting_from(DEVICE_NO(d));
-
-	return 0;
-}
-
-/* This routine handles incoming block LP events */
-static void handle_block_event(struct HvLpEvent *event)
-{
-	struct vioblocklpevent *bevent = (struct vioblocklpevent *)event;
-	struct viodasd_waitevent *pwe;
-
-	if (event == NULL)
-		/* Notification that a partition went away! */
-		return;
-	/* First, we should NEVER get an int here...only acks */
-	if (hvlpevent_is_int(event)) {
-		printk(VIOD_KERN_WARNING
-		       "Yikes! got an int in viodasd event handler!\n");
-		if (hvlpevent_need_ack(event)) {
-			event->xRc = HvLpEvent_Rc_InvalidSubtype;
-			HvCallEvent_ackLpEvent(event);
-		}
-	}
-
-	switch (event->xSubtype & VIOMINOR_SUBTYPE_MASK) {
-	case vioblockopen:
-		/*
-		 * Handle a response to an open request.  We get all the
-		 * disk information in the response, so update it.  The
-		 * correlation token contains a pointer to a waitevent
-		 * structure that has a completion in it.  update the
-		 * return code in the waitevent structure and post the
-		 * completion to wake up the guy who sent the request
-		 */
-		pwe = (struct viodasd_waitevent *)event->xCorrelationToken;
-		pwe->rc = event->xRc;
-		pwe->sub_result = bevent->sub_result;
-		if (event->xRc == HvLpEvent_Rc_Good) {
-			const struct open_data *data = &bevent->u.open_data;
-			struct viodasd_device *device =
-				&viodasd_devices[bevent->disk];
-			device->read_only =
-				bevent->flags & vioblockflags_ro;
-			device->size = data->disk_size;
-			device->cylinders = data->cylinders;
-			device->tracks = data->tracks;
-			device->sectors = data->sectors;
-			device->bytes_per_sector = data->bytes_per_sector;
-			pwe->max_disk = data->max_disk;
-		}
-		complete(&pwe->com);
-		break;
-	case vioblockclose:
-		break;
-	case vioblockread:
-	case vioblockwrite:
-		viodasd_handle_read_write(bevent);
-		break;
-
-	default:
-		printk(VIOD_KERN_WARNING "invalid subtype!");
-		if (hvlpevent_need_ack(event)) {
-			event->xRc = HvLpEvent_Rc_InvalidSubtype;
-			HvCallEvent_ackLpEvent(event);
-		}
-	}
-}
-
-/*
- * Get the driver to reprobe for more disks.
- */
-static ssize_t probe_disks(struct device_driver *drv, const char *buf,
-		size_t count)
-{
-	struct viodasd_device *d;
-
-	for (d = viodasd_devices; d < &viodasd_devices[MAX_DISKNO]; d++) {
-		if (d->disk == NULL)
-			probe_disk(d);
-	}
-	return count;
-}
-static DRIVER_ATTR(probe, S_IWUSR, NULL, probe_disks);
-
-static int viodasd_probe(struct vio_dev *vdev, const struct vio_device_id *id)
-{
-	struct viodasd_device *d = &viodasd_devices[vdev->unit_address];
-
-	d->dev = &vdev->dev;
-	if (!probe_disk(d))
-		return -ENODEV;
-	return 0;
-}
-
-static int viodasd_remove(struct vio_dev *vdev)
-{
-	struct viodasd_device *d;
-
-	d = &viodasd_devices[vdev->unit_address];
-	if (d->disk) {
-		del_gendisk(d->disk);
-		blk_cleanup_queue(d->disk->queue);
-		put_disk(d->disk);
-		d->disk = NULL;
-	}
-	d->dev = NULL;
-	return 0;
-}
-
-/**
- * viodasd_device_table: Used by vio.c to match devices that we
- * support.
- */
-static struct vio_device_id viodasd_device_table[] __devinitdata = {
-	{ "block", "IBM,iSeries-viodasd" },
-	{ "", "" }
-};
-MODULE_DEVICE_TABLE(vio, viodasd_device_table);
-
-static struct vio_driver viodasd_driver = {
-	.id_table = viodasd_device_table,
-	.probe = viodasd_probe,
-	.remove = viodasd_remove,
-	.driver = {
-		.name = "viodasd",
-		.owner = THIS_MODULE,
-	}
-};
-
-static int need_delete_probe;
-
-/*
- * Initialize the whole device driver.  Handle module and non-module
- * versions
- */
-static int __init viodasd_init(void)
-{
-	int rc;
-
-	if (!firmware_has_feature(FW_FEATURE_ISERIES)) {
-		rc = -ENODEV;
-		goto early_fail;
-	}
-
-	/* Try to open to our host lp */
-	if (viopath_hostLp == HvLpIndexInvalid)
-		vio_set_hostlp();
-
-	if (viopath_hostLp == HvLpIndexInvalid) {
-		printk(VIOD_KERN_WARNING "invalid hosting partition\n");
-		rc = -EIO;
-		goto early_fail;
-	}
-
-	printk(VIOD_KERN_INFO "vers " VIOD_VERS ", hosting partition %d\n",
-			viopath_hostLp);
-
-        /* register the block device */
-	rc =  register_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME);
-	if (rc) {
-		printk(VIOD_KERN_WARNING
-				"Unable to get major number %d for %s\n",
-				VIODASD_MAJOR, VIOD_GENHD_NAME);
-		goto early_fail;
-	}
-	/* Actually open the path to the hosting partition */
-	rc = viopath_open(viopath_hostLp, viomajorsubtype_blockio,
-				VIOMAXREQ + 2);
-	if (rc) {
-		printk(VIOD_KERN_WARNING
-		       "error opening path to host partition %d\n",
-		       viopath_hostLp);
-		goto unregister_blk;
-	}
-
-	/* Initialize our request handler */
-	vio_setHandler(viomajorsubtype_blockio, handle_block_event);
-
-	rc = vio_register_driver(&viodasd_driver);
-	if (rc) {
-		printk(VIOD_KERN_WARNING "vio_register_driver failed\n");
-		goto unset_handler;
-	}
-
-	/*
-	 * If this call fails, it just means that we cannot dynamically
-	 * add virtual disks, but the driver will still work fine for
-	 * all existing disk, so ignore the failure.
-	 */
-	if (!driver_create_file(&viodasd_driver.driver, &driver_attr_probe))
-		need_delete_probe = 1;
-
-	return 0;
-
-unset_handler:
-	vio_clearHandler(viomajorsubtype_blockio);
-	viopath_close(viopath_hostLp, viomajorsubtype_blockio, VIOMAXREQ + 2);
-unregister_blk:
-	unregister_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME);
-early_fail:
-	return rc;
-}
-module_init(viodasd_init);
-
-void __exit viodasd_exit(void)
-{
-	if (need_delete_probe)
-		driver_remove_file(&viodasd_driver.driver, &driver_attr_probe);
-	vio_unregister_driver(&viodasd_driver);
-	vio_clearHandler(viomajorsubtype_blockio);
-	viopath_close(viopath_hostLp, viomajorsubtype_blockio, VIOMAXREQ + 2);
-	unregister_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME);
-}
-module_exit(viodasd_exit);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index c0facaa55cf..f63d358f3d9 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -1,239 +1,285 @@
 //#define DEBUG
 #include <linux/spinlock.h>
+#include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/hdreg.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
 #include <linux/virtio.h>
 #include <linux/virtio_blk.h>
 #include <linux/scatterlist.h>
+#include <linux/string_helpers.h>
+#include <scsi/scsi_cmnd.h>
+#include <linux/idr.h>
+#include <linux/blk-mq.h>
+#include <linux/numa.h>
 
 #define PART_BITS 4
 
-static int major, index;
+static int major;
+static DEFINE_IDA(vd_index_ida);
+
+static struct workqueue_struct *virtblk_wq;
 
 struct virtio_blk
 {
-	spinlock_t lock;
-
 	struct virtio_device *vdev;
 	struct virtqueue *vq;
+	spinlock_t vq_lock;
 
 	/* The disk structure for the kernel. */
 	struct gendisk *disk;
 
-	/* Request tracking. */
-	struct list_head reqs;
+	/* Block layer tags. */
+	struct blk_mq_tag_set tag_set;
+
+	/* Process context for config space updates */
+	struct work_struct config_work;
+
+	/* Lock for config space updates */
+	struct mutex config_lock;
 
-	mempool_t *pool;
+	/* enable config space updates */
+	bool config_enable;
 
 	/* What host tells us, plus 2 for header & tailer. */
 	unsigned int sg_elems;
 
-	/* Scatterlist: can be too big for stack. */
-	struct scatterlist sg[/*sg_elems*/];
+	/* Ida index - used to track minor number allocations. */
+	int index;
 };
 
 struct virtblk_req
 {
-	struct list_head list;
 	struct request *req;
 	struct virtio_blk_outhdr out_hdr;
 	struct virtio_scsi_inhdr in_hdr;
 	u8 status;
+	struct scatterlist sg[];
 };
 
-static void blk_done(struct virtqueue *vq)
+static inline int virtblk_result(struct virtblk_req *vbr)
 {
-	struct virtio_blk *vblk = vq->vdev->priv;
-	struct virtblk_req *vbr;
-	unsigned int len;
-	unsigned long flags;
-
-	spin_lock_irqsave(&vblk->lock, flags);
-	while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) {
-		int error;
-
-		switch (vbr->status) {
-		case VIRTIO_BLK_S_OK:
-			error = 0;
-			break;
-		case VIRTIO_BLK_S_UNSUPP:
-			error = -ENOTTY;
-			break;
-		default:
-			error = -EIO;
-			break;
-		}
-
-		if (blk_pc_request(vbr->req)) {
-			vbr->req->resid_len = vbr->in_hdr.residual;
-			vbr->req->sense_len = vbr->in_hdr.sense_len;
-			vbr->req->errors = vbr->in_hdr.errors;
-		}
-
-		__blk_end_request_all(vbr->req, error);
-		list_del(&vbr->list);
-		mempool_free(vbr, vblk->pool);
+	switch (vbr->status) {
+	case VIRTIO_BLK_S_OK:
+		return 0;
+	case VIRTIO_BLK_S_UNSUPP:
+		return -ENOTTY;
+	default:
+		return -EIO;
 	}
-	/* In case queue is stopped waiting for more buffers. */
-	blk_start_queue(vblk->disk->queue);
-	spin_unlock_irqrestore(&vblk->lock, flags);
 }
 
-static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
-		   struct request *req)
+static int __virtblk_add_req(struct virtqueue *vq,
+			     struct virtblk_req *vbr,
+			     struct scatterlist *data_sg,
+			     bool have_data)
 {
-	unsigned long num, out = 0, in = 0;
-	struct virtblk_req *vbr;
-
-	vbr = mempool_alloc(vblk->pool, GFP_ATOMIC);
-	if (!vbr)
-		/* When another request finishes we'll try again. */
-		return false;
-
-	vbr->req = req;
-	if (blk_fs_request(vbr->req)) {
-		vbr->out_hdr.type = 0;
-		vbr->out_hdr.sector = blk_rq_pos(vbr->req);
-		vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
-	} else if (blk_pc_request(vbr->req)) {
-		vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
-		vbr->out_hdr.sector = 0;
-		vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
-	} else {
-		/* We don't put anything else in the queue. */
-		BUG();
-	}
+	struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6];
+	unsigned int num_out = 0, num_in = 0;
+	int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT;
 
-	if (blk_barrier_rq(vbr->req))
-		vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
-
-	sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
+	sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
+	sgs[num_out++] = &hdr;
 
 	/*
 	 * If this is a packet command we need a couple of additional headers.
 	 * Behind the normal outhdr we put a segment with the scsi command
 	 * block, and before the normal inhdr we put the sense data and the
-	 * inhdr with additional status information before the normal inhdr.
+	 * inhdr with additional status information.
 	 */
-	if (blk_pc_request(vbr->req))
-		sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len);
+	if (type == VIRTIO_BLK_T_SCSI_CMD) {
+		sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len);
+		sgs[num_out++] = &cmd;
+	}
 
-	num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
+	if (have_data) {
+		if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT)
+			sgs[num_out++] = data_sg;
+		else
+			sgs[num_out + num_in++] = data_sg;
+	}
 
-	if (blk_pc_request(vbr->req)) {
-		sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, 96);
-		sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
-			   sizeof(vbr->in_hdr));
+	if (type == VIRTIO_BLK_T_SCSI_CMD) {
+		sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
+		sgs[num_out + num_in++] = &sense;
+		sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr));
+		sgs[num_out + num_in++] = &inhdr;
 	}
 
-	sg_set_buf(&vblk->sg[num + out + in++], &vbr->status,
-		   sizeof(vbr->status));
+	sg_init_one(&status, &vbr->status, sizeof(vbr->status));
+	sgs[num_out + num_in++] = &status;
 
-	if (num) {
-		if (rq_data_dir(vbr->req) == WRITE) {
-			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
-			out += num;
-		} else {
-			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
-			in += num;
-		}
-	}
+	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
+}
 
-	if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr)) {
-		mempool_free(vbr, vblk->pool);
-		return false;
+static inline void virtblk_request_done(struct request *req)
+{
+	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
+	int error = virtblk_result(vbr);
+
+	if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
+		req->resid_len = vbr->in_hdr.residual;
+		req->sense_len = vbr->in_hdr.sense_len;
+		req->errors = vbr->in_hdr.errors;
+	} else if (req->cmd_type == REQ_TYPE_SPECIAL) {
+		req->errors = (error != 0);
 	}
 
-	list_add_tail(&vbr->list, &vblk->reqs);
-	return true;
+	blk_mq_end_io(req, error);
 }
 
-static void do_virtblk_request(struct request_queue *q)
+static void virtblk_done(struct virtqueue *vq)
 {
-	struct virtio_blk *vblk = q->queuedata;
-	struct request *req;
-	unsigned int issued = 0;
+	struct virtio_blk *vblk = vq->vdev->priv;
+	bool req_done = false;
+	struct virtblk_req *vbr;
+	unsigned long flags;
+	unsigned int len;
+
+	spin_lock_irqsave(&vblk->vq_lock, flags);
+	do {
+		virtqueue_disable_cb(vq);
+		while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
+			blk_mq_complete_request(vbr->req);
+			req_done = true;
+		}
+		if (unlikely(virtqueue_is_broken(vq)))
+			break;
+	} while (!virtqueue_enable_cb(vq));
+
+	/* In case queue is stopped waiting for more buffers. */
+	if (req_done)
+		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
+	spin_unlock_irqrestore(&vblk->vq_lock, flags);
+}
 
-	while ((req = blk_peek_request(q)) != NULL) {
-		BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
+static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
+{
+	struct virtio_blk *vblk = hctx->queue->queuedata;
+	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
+	unsigned long flags;
+	unsigned int num;
+	const bool last = (req->cmd_flags & REQ_END) != 0;
+	int err;
+	bool notify = false;
+
+	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
 
-		/* If this request fails, stop queue and wait for something to
-		   finish to restart it. */
-		if (!do_req(q, vblk, req)) {
-			blk_stop_queue(q);
+	vbr->req = req;
+	if (req->cmd_flags & REQ_FLUSH) {
+		vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
+		vbr->out_hdr.sector = 0;
+		vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
+	} else {
+		switch (req->cmd_type) {
+		case REQ_TYPE_FS:
+			vbr->out_hdr.type = 0;
+			vbr->out_hdr.sector = blk_rq_pos(vbr->req);
+			vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
+			break;
+		case REQ_TYPE_BLOCK_PC:
+			vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
+			vbr->out_hdr.sector = 0;
+			vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
 			break;
+		case REQ_TYPE_SPECIAL:
+			vbr->out_hdr.type = VIRTIO_BLK_T_GET_ID;
+			vbr->out_hdr.sector = 0;
+			vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
+			break;
+		default:
+			/* We don't put anything else in the queue. */
+			BUG();
 		}
-		blk_start_request(req);
-		issued++;
 	}
 
-	if (issued)
-		vblk->vq->vq_ops->kick(vblk->vq);
+	num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg);
+	if (num) {
+		if (rq_data_dir(vbr->req) == WRITE)
+			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
+		else
+			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
+	}
+
+	spin_lock_irqsave(&vblk->vq_lock, flags);
+	err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num);
+	if (err) {
+		virtqueue_kick(vblk->vq);
+		blk_mq_stop_hw_queue(hctx);
+		spin_unlock_irqrestore(&vblk->vq_lock, flags);
+		/* Out of mem doesn't actually happen, since we fall back
+		 * to direct descriptors */
+		if (err == -ENOMEM || err == -ENOSPC)
+			return BLK_MQ_RQ_QUEUE_BUSY;
+		return BLK_MQ_RQ_QUEUE_ERROR;
+	}
+
+	if (last && virtqueue_kick_prepare(vblk->vq))
+		notify = true;
+	spin_unlock_irqrestore(&vblk->vq_lock, flags);
+
+	if (notify)
+		virtqueue_notify(vblk->vq);
+	return BLK_MQ_RQ_QUEUE_OK;
 }
 
-/* return ATA identify data
+/* return id (s/n) string for *disk to *id_str
  */
-static int virtblk_identify(struct gendisk *disk, void *argp)
+static int virtblk_get_id(struct gendisk *disk, char *id_str)
 {
 	struct virtio_blk *vblk = disk->private_data;
-	void *opaque;
-	int err = -ENOMEM;
-
-	opaque = kmalloc(VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
-	if (!opaque)
-		goto out;
+	struct request *req;
+	struct bio *bio;
+	int err;
 
-	err = virtio_config_buf(vblk->vdev, VIRTIO_BLK_F_IDENTIFY,
-		offsetof(struct virtio_blk_config, identify), opaque,
-		VIRTIO_BLK_ID_BYTES);
+	bio = bio_map_kern(vblk->disk->queue, id_str, VIRTIO_BLK_ID_BYTES,
+			   GFP_KERNEL);
+	if (IS_ERR(bio))
+		return PTR_ERR(bio);
 
-	if (err)
-		goto out_kfree;
+	req = blk_make_request(vblk->disk->queue, bio, GFP_KERNEL);
+	if (IS_ERR(req)) {
+		bio_put(bio);
+		return PTR_ERR(req);
+	}
 
-	if (copy_to_user(argp, opaque, VIRTIO_BLK_ID_BYTES))
-		err = -EFAULT;
+	req->cmd_type = REQ_TYPE_SPECIAL;
+	err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
+	blk_put_request(req);
 
-out_kfree:
-	kfree(opaque);
-out:
 	return err;
 }
 
 static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
-			 unsigned cmd, unsigned long data)
+			     unsigned int cmd, unsigned long data)
 {
 	struct gendisk *disk = bdev->bd_disk;
 	struct virtio_blk *vblk = disk->private_data;
-	void __user *argp = (void __user *)data;
-
-	if (cmd == HDIO_GET_IDENTITY)
-		return virtblk_identify(disk, argp);
 
 	/*
 	 * Only allow the generic SCSI ioctls if the host can support it.
 	 */
 	if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI))
-		return -ENOIOCTLCMD;
+		return -ENOTTY;
 
-	return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, argp);
+	return scsi_cmd_blk_ioctl(bdev, mode, cmd,
+				  (void __user *)data);
 }
 
 /* We provide getgeo only to please some old bootloader/partitioning tools */
 static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
 {
 	struct virtio_blk *vblk = bd->bd_disk->private_data;
-	struct virtio_blk_geometry vgeo;
-	int err;
 
 	/* see if the host passed in geometry config */
-	err = virtio_config_val(vblk->vdev, VIRTIO_BLK_F_GEOMETRY,
-				offsetof(struct virtio_blk_config, geometry),
-				&vgeo);
-
-	if (!err) {
-		geo->heads = vgeo.heads;
-		geo->sectors = vgeo.sectors;
-		geo->cylinders = vgeo.cylinders;
+	if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_GEOMETRY)) {
+		virtio_cread(vblk->vdev, struct virtio_blk_config,
+			     geometry.cylinders, &geo->cylinders);
+		virtio_cread(vblk->vdev, struct virtio_blk_config,
+			     geometry.heads, &geo->heads);
+		virtio_cread(vblk->vdev, struct virtio_blk_config,
+			     geometry.sectors, &geo->sectors);
 	} else {
 		/* some standard values, similar to sd */
 		geo->heads = 1 << 6;
@@ -243,8 +289,8 @@ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
 	return 0;
 }
 
-static struct block_device_operations virtblk_fops = {
-	.locked_ioctl = virtblk_ioctl,
+static const struct block_device_operations virtblk_fops = {
+	.ioctl  = virtblk_ioctl,
 	.owner  = THIS_MODULE,
 	.getgeo = virtblk_getgeo,
 };
@@ -254,99 +300,315 @@ static int index_to_minor(int index)
 	return index << PART_BITS;
 }
 
+static int minor_to_index(int minor)
+{
+	return minor >> PART_BITS;
+}
+
+static ssize_t virtblk_serial_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	int err;
+
+	/* sysfs gives us a PAGE_SIZE buffer */
+	BUILD_BUG_ON(PAGE_SIZE < VIRTIO_BLK_ID_BYTES);
+
+	buf[VIRTIO_BLK_ID_BYTES] = '\0';
+	err = virtblk_get_id(disk, buf);
+	if (!err)
+		return strlen(buf);
+
+	if (err == -EIO) /* Unsupported? Make it empty. */
+		return 0;
+
+	return err;
+}
+DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL);
+
+static void virtblk_config_changed_work(struct work_struct *work)
+{
+	struct virtio_blk *vblk =
+		container_of(work, struct virtio_blk, config_work);
+	struct virtio_device *vdev = vblk->vdev;
+	struct request_queue *q = vblk->disk->queue;
+	char cap_str_2[10], cap_str_10[10];
+	char *envp[] = { "RESIZE=1", NULL };
+	u64 capacity, size;
+
+	mutex_lock(&vblk->config_lock);
+	if (!vblk->config_enable)
+		goto done;
+
+	/* Host must always specify the capacity. */
+	virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);
+
+	/* If capacity is too big, truncate with warning. */
+	if ((sector_t)capacity != capacity) {
+		dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
+			 (unsigned long long)capacity);
+		capacity = (sector_t)-1;
+	}
+
+	size = capacity * queue_logical_block_size(q);
+	string_get_size(size, STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
+	string_get_size(size, STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));
+
+	dev_notice(&vdev->dev,
+		  "new size: %llu %d-byte logical blocks (%s/%s)\n",
+		  (unsigned long long)capacity,
+		  queue_logical_block_size(q),
+		  cap_str_10, cap_str_2);
+
+	set_capacity(vblk->disk, capacity);
+	revalidate_disk(vblk->disk);
+	kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp);
+done:
+	mutex_unlock(&vblk->config_lock);
+}
+
+static void virtblk_config_changed(struct virtio_device *vdev)
+{
+	struct virtio_blk *vblk = vdev->priv;
+
+	queue_work(virtblk_wq, &vblk->config_work);
+}
+
+static int init_vq(struct virtio_blk *vblk)
+{
+	int err = 0;
+
+	/* We expect one virtqueue, for output. */
+	vblk->vq = virtio_find_single_vq(vblk->vdev, virtblk_done, "requests");
+	if (IS_ERR(vblk->vq))
+		err = PTR_ERR(vblk->vq);
+
+	return err;
+}
+
+/*
+ * Legacy naming scheme used for virtio devices.  We are stuck with it for
+ * virtio blk but don't ever use it for any new driver.
+ */
+static int virtblk_name_format(char *prefix, int index, char *buf, int buflen)
+{
+	const int base = 'z' - 'a' + 1;
+	char *begin = buf + strlen(prefix);
+	char *end = buf + buflen;
+	char *p;
+	int unit;
+
+	p = end - 1;
+	*p = '\0';
+	unit = base;
+	do {
+		if (p == begin)
+			return -EINVAL;
+		*--p = 'a' + (index % unit);
+		index = (index / unit) - 1;
+	} while (index >= 0);
+
+	memmove(begin, p, end - p);
+	memcpy(buf, prefix, strlen(prefix));
+
+	return 0;
+}
+
+static int virtblk_get_cache_mode(struct virtio_device *vdev)
+{
+	u8 writeback;
+	int err;
+
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE,
+				   struct virtio_blk_config, wce,
+				   &writeback);
+	if (err)
+		writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE);
+
+	return writeback;
+}
+
+static void virtblk_update_cache_mode(struct virtio_device *vdev)
+{
+	u8 writeback = virtblk_get_cache_mode(vdev);
+	struct virtio_blk *vblk = vdev->priv;
+
+	if (writeback)
+		blk_queue_flush(vblk->disk->queue, REQ_FLUSH);
+	else
+		blk_queue_flush(vblk->disk->queue, 0);
+
+	revalidate_disk(vblk->disk);
+}
+
+static const char *const virtblk_cache_types[] = {
+	"write through", "write back"
+};
+
+static ssize_t
+virtblk_cache_type_store(struct device *dev, struct device_attribute *attr,
+			 const char *buf, size_t count)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct virtio_blk *vblk = disk->private_data;
+	struct virtio_device *vdev = vblk->vdev;
+	int i;
+
+	BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE));
+	for (i = ARRAY_SIZE(virtblk_cache_types); --i >= 0; )
+		if (sysfs_streq(buf, virtblk_cache_types[i]))
+			break;
+
+	if (i < 0)
+		return -EINVAL;
+
+	virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i);
+	virtblk_update_cache_mode(vdev);
+	return count;
+}
+
+static ssize_t
+virtblk_cache_type_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct virtio_blk *vblk = disk->private_data;
+	u8 writeback = virtblk_get_cache_mode(vblk->vdev);
+
+	BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types));
+	return snprintf(buf, 40, "%s\n", virtblk_cache_types[writeback]);
+}
+
+static const struct device_attribute dev_attr_cache_type_ro =
+	__ATTR(cache_type, S_IRUGO,
+	       virtblk_cache_type_show, NULL);
+static const struct device_attribute dev_attr_cache_type_rw =
+	__ATTR(cache_type, S_IRUGO|S_IWUSR,
+	       virtblk_cache_type_show, virtblk_cache_type_store);
+
+static int virtblk_init_request(void *data, struct request *rq,
+		unsigned int hctx_idx, unsigned int request_idx,
+		unsigned int numa_node)
+{
+	struct virtio_blk *vblk = data;
+	struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
+
+	sg_init_table(vbr->sg, vblk->sg_elems);
+	return 0;
+}
+
+static struct blk_mq_ops virtio_mq_ops = {
+	.queue_rq	= virtio_queue_rq,
+	.map_queue	= blk_mq_map_queue,
+	.complete	= virtblk_request_done,
+	.init_request	= virtblk_init_request,
+};
+
+static unsigned int virtblk_queue_depth;
+module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
+
 static int virtblk_probe(struct virtio_device *vdev)
 {
 	struct virtio_blk *vblk;
-	int err;
+	struct request_queue *q;
+	int err, index;
+
 	u64 cap;
-	u32 v;
-	u32 blk_size, sg_elems;
+	u32 v, blk_size, sg_elems, opt_io_size;
+	u16 min_io_size;
+	u8 physical_block_exp, alignment_offset;
 
-	if (index_to_minor(index) >= 1 << MINORBITS)
-		return -ENOSPC;
+	err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
+			     GFP_KERNEL);
+	if (err < 0)
+		goto out;
+	index = err;
 
 	/* We need to know how many segments before we allocate. */
-	err = virtio_config_val(vdev, VIRTIO_BLK_F_SEG_MAX,
-				offsetof(struct virtio_blk_config, seg_max),
-				&sg_elems);
-	if (err)
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX,
+				   struct virtio_blk_config, seg_max,
+				   &sg_elems);
+
+	/* We need at least one SG element, whatever they say. */
+	if (err || !sg_elems)
 		sg_elems = 1;
 
 	/* We need an extra sg elements at head and tail. */
 	sg_elems += 2;
-	vdev->priv = vblk = kmalloc(sizeof(*vblk) +
-				    sizeof(vblk->sg[0]) * sg_elems, GFP_KERNEL);
+	vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
 	if (!vblk) {
 		err = -ENOMEM;
-		goto out;
+		goto out_free_index;
 	}
 
-	INIT_LIST_HEAD(&vblk->reqs);
-	spin_lock_init(&vblk->lock);
 	vblk->vdev = vdev;
 	vblk->sg_elems = sg_elems;
-	sg_init_table(vblk->sg, vblk->sg_elems);
+	mutex_init(&vblk->config_lock);
 
-	/* We expect one virtqueue, for output. */
-	vblk->vq = vdev->config->find_vq(vdev, 0, blk_done);
-	if (IS_ERR(vblk->vq)) {
-		err = PTR_ERR(vblk->vq);
-		goto out_free_vblk;
-	}
+	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
+	vblk->config_enable = true;
 
-	vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req));
-	if (!vblk->pool) {
-		err = -ENOMEM;
-		goto out_free_vq;
-	}
+	err = init_vq(vblk);
+	if (err)
+		goto out_free_vblk;
+	spin_lock_init(&vblk->vq_lock);
 
 	/* FIXME: How many partitions?  How long is a piece of string? */
 	vblk->disk = alloc_disk(1 << PART_BITS);
 	if (!vblk->disk) {
 		err = -ENOMEM;
-		goto out_mempool;
+		goto out_free_vq;
 	}
 
-	vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock);
-	if (!vblk->disk->queue) {
-		err = -ENOMEM;
-		goto out_put_disk;
+	/* Default queue sizing is to fill the ring. */
+	if (!virtblk_queue_depth) {
+		virtblk_queue_depth = vblk->vq->num_free;
+		/* ... but without indirect descs, we use 2 descs per req */
+		if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
+			virtblk_queue_depth /= 2;
 	}
 
-	vblk->disk->queue->queuedata = vblk;
-	queue_flag_set_unlocked(QUEUE_FLAG_VIRT, vblk->disk->queue);
+	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
+	vblk->tag_set.ops = &virtio_mq_ops;
+	vblk->tag_set.nr_hw_queues = 1;
+	vblk->tag_set.queue_depth = virtblk_queue_depth;
+	vblk->tag_set.numa_node = NUMA_NO_NODE;
+	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	vblk->tag_set.cmd_size =
+		sizeof(struct virtblk_req) +
+		sizeof(struct scatterlist) * sg_elems;
+	vblk->tag_set.driver_data = vblk;
+
+	err = blk_mq_alloc_tag_set(&vblk->tag_set);
+	if (err)
+		goto out_put_disk;
 
-	if (index < 26) {
-		sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26);
-	} else if (index < (26 + 1) * 26) {
-		sprintf(vblk->disk->disk_name, "vd%c%c",
-			'a' + index / 26 - 1, 'a' + index % 26);
-	} else {
-		const unsigned int m1 = (index / 26 - 1) / 26 - 1;
-		const unsigned int m2 = (index / 26 - 1) % 26;
-		const unsigned int m3 =  index % 26;
-		sprintf(vblk->disk->disk_name, "vd%c%c%c",
-			'a' + m1, 'a' + m2, 'a' + m3);
+	q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set);
+	if (!q) {
+		err = -ENOMEM;
+		goto out_free_tags;
 	}
 
+	q->queuedata = vblk;
+
+	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
+
 	vblk->disk->major = major;
 	vblk->disk->first_minor = index_to_minor(index);
 	vblk->disk->private_data = vblk;
 	vblk->disk->fops = &virtblk_fops;
 	vblk->disk->driverfs_dev = &vdev->dev;
-	index++;
+	vblk->index = index;
 
-	/* If barriers are supported, tell block layer that queue is ordered */
-	if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER))
-		blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL);
+	/* configure queue flush support */
+	virtblk_update_cache_mode(vdev);
 
 	/* If disk is read-only in the host, the guest should obey */
 	if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
 		set_disk_ro(vblk->disk, 1);
 
 	/* Host must always specify the capacity. */
-	vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity),
-			  &cap, sizeof(cap));
+	virtio_cread(vdev, struct virtio_blk_config, capacity, &cap);
 
 	/* If capacity is too big, truncate with warning. */
 	if ((sector_t)cap != cap) {
@@ -357,40 +619,86 @@ static int virtblk_probe(struct virtio_device *vdev)
 	set_capacity(vblk->disk, cap);
 
 	/* We can handle whatever the host told us to handle. */
-	blk_queue_max_phys_segments(vblk->disk->queue, vblk->sg_elems-2);
-	blk_queue_max_hw_segments(vblk->disk->queue, vblk->sg_elems-2);
+	blk_queue_max_segments(q, vblk->sg_elems-2);
+
+	/* No need to bounce any requests */
+	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
 
 	/* No real sector limit. */
-	blk_queue_max_sectors(vblk->disk->queue, -1U);
+	blk_queue_max_hw_sectors(q, -1U);
 
 	/* Host can optionally specify maximum segment size and number of
 	 * segments. */
-	err = virtio_config_val(vdev, VIRTIO_BLK_F_SIZE_MAX,
-				offsetof(struct virtio_blk_config, size_max),
-				&v);
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
+				   struct virtio_blk_config, size_max, &v);
 	if (!err)
-		blk_queue_max_segment_size(vblk->disk->queue, v);
+		blk_queue_max_segment_size(q, v);
 	else
-		blk_queue_max_segment_size(vblk->disk->queue, -1U);
+		blk_queue_max_segment_size(q, -1U);
 
 	/* Host can optionally specify the block size of the device */
-	err = virtio_config_val(vdev, VIRTIO_BLK_F_BLK_SIZE,
-				offsetof(struct virtio_blk_config, blk_size),
-				&blk_size);
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
+				   struct virtio_blk_config, blk_size,
+				   &blk_size);
 	if (!err)
-		blk_queue_logical_block_size(vblk->disk->queue, blk_size);
+		blk_queue_logical_block_size(q, blk_size);
+	else
+		blk_size = queue_logical_block_size(q);
+
+	/* Use topology information if available */
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
+				   struct virtio_blk_config, physical_block_exp,
+				   &physical_block_exp);
+	if (!err && physical_block_exp)
+		blk_queue_physical_block_size(q,
+				blk_size * (1 << physical_block_exp));
+
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
+				   struct virtio_blk_config, alignment_offset,
+				   &alignment_offset);
+	if (!err && alignment_offset)
+		blk_queue_alignment_offset(q, blk_size * alignment_offset);
+
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
+				   struct virtio_blk_config, min_io_size,
+				   &min_io_size);
+	if (!err && min_io_size)
+		blk_queue_io_min(q, blk_size * min_io_size);
+
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
+				   struct virtio_blk_config, opt_io_size,
+				   &opt_io_size);
+	if (!err && opt_io_size)
+		blk_queue_io_opt(q, blk_size * opt_io_size);
 
 	add_disk(vblk->disk);
+	err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
+	if (err)
+		goto out_del_disk;
+
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE))
+		err = device_create_file(disk_to_dev(vblk->disk),
+					 &dev_attr_cache_type_rw);
+	else
+		err = device_create_file(disk_to_dev(vblk->disk),
+					 &dev_attr_cache_type_ro);
+	if (err)
+		goto out_del_disk;
 	return 0;
 
+out_del_disk:
+	del_gendisk(vblk->disk);
+	blk_cleanup_queue(vblk->disk->queue);
+out_free_tags:
+	blk_mq_free_tag_set(&vblk->tag_set);
 out_put_disk:
 	put_disk(vblk->disk);
-out_mempool:
-	mempool_destroy(vblk->pool);
 out_free_vq:
-	vdev->config->del_vq(vblk->vq);
+	vdev->config->del_vqs(vdev);
 out_free_vblk:
 	kfree(vblk);
+out_free_index:
+	ida_simple_remove(&vd_index_ida, index);
 out:
 	return err;
 }
@@ -398,54 +706,126 @@ out:
 static void virtblk_remove(struct virtio_device *vdev)
 {
 	struct virtio_blk *vblk = vdev->priv;
+	int index = vblk->index;
+	int refc;
 
-	/* Nothing should be pending. */
-	BUG_ON(!list_empty(&vblk->reqs));
+	/* Prevent config work handler from accessing the device. */
+	mutex_lock(&vblk->config_lock);
+	vblk->config_enable = false;
+	mutex_unlock(&vblk->config_lock);
+
+	del_gendisk(vblk->disk);
+	blk_cleanup_queue(vblk->disk->queue);
+
+	blk_mq_free_tag_set(&vblk->tag_set);
 
 	/* Stop all the virtqueues. */
 	vdev->config->reset(vdev);
 
-	del_gendisk(vblk->disk);
-	blk_cleanup_queue(vblk->disk->queue);
+	flush_work(&vblk->config_work);
+
+	refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
 	put_disk(vblk->disk);
-	mempool_destroy(vblk->pool);
-	vdev->config->del_vq(vblk->vq);
+	vdev->config->del_vqs(vdev);
 	kfree(vblk);
+
+	/* Only free device id if we don't have any users */
+	if (refc == 1)
+		ida_simple_remove(&vd_index_ida, index);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int virtblk_freeze(struct virtio_device *vdev)
+{
+	struct virtio_blk *vblk = vdev->priv;
+
+	/* Ensure we don't receive any more interrupts */
+	vdev->config->reset(vdev);
+
+	/* Prevent config work handler from accessing the device. */
+	mutex_lock(&vblk->config_lock);
+	vblk->config_enable = false;
+	mutex_unlock(&vblk->config_lock);
+
+	flush_work(&vblk->config_work);
+
+	blk_mq_stop_hw_queues(vblk->disk->queue);
+
+	vdev->config->del_vqs(vdev);
+	return 0;
 }
 
-static struct virtio_device_id id_table[] = {
+static int virtblk_restore(struct virtio_device *vdev)
+{
+	struct virtio_blk *vblk = vdev->priv;
+	int ret;
+
+	vblk->config_enable = true;
+	ret = init_vq(vdev->priv);
+	if (!ret)
+		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
+
+	return ret;
+}
+#endif
+
+static const struct virtio_device_id id_table[] = {
 	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
 	{ 0 },
 };
 
 static unsigned int features[] = {
-	VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX,
-	VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
-	VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_IDENTIFY
+	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
+	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
+	VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE
 };
 
 static struct virtio_driver virtio_blk = {
-	.feature_table = features,
-	.feature_table_size = ARRAY_SIZE(features),
-	.driver.name =	KBUILD_MODNAME,
-	.driver.owner =	THIS_MODULE,
-	.id_table =	id_table,
-	.probe =	virtblk_probe,
-	.remove =	__devexit_p(virtblk_remove),
+	.feature_table		= features,
+	.feature_table_size	= ARRAY_SIZE(features),
+	.driver.name		= KBUILD_MODNAME,
+	.driver.owner		= THIS_MODULE,
+	.id_table		= id_table,
+	.probe			= virtblk_probe,
+	.remove			= virtblk_remove,
+	.config_changed		= virtblk_config_changed,
+#ifdef CONFIG_PM_SLEEP
+	.freeze			= virtblk_freeze,
+	.restore		= virtblk_restore,
+#endif
 };
 
 static int __init init(void)
 {
+	int error;
+
+	virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
+	if (!virtblk_wq)
+		return -ENOMEM;
+
 	major = register_blkdev(0, "virtblk");
-	if (major < 0)
-		return major;
-	return register_virtio_driver(&virtio_blk);
+	if (major < 0) {
+		error = major;
+		goto out_destroy_workqueue;
+	}
+
+	error = register_virtio_driver(&virtio_blk);
+	if (error)
+		goto out_unregister_blkdev;
+	return 0;
+
+out_unregister_blkdev:
+	unregister_blkdev(major, "virtblk");
+out_destroy_workqueue:
+	destroy_workqueue(virtblk_wq);
+	return error;
 }
 
 static void __exit fini(void)
 {
-	unregister_blkdev(major, "virtblk");
 	unregister_virtio_driver(&virtio_blk);
+	unregister_blkdev(major, "virtblk");	/* after driver removal */
+	destroy_workqueue(virtblk_wq);
 }
 module_init(init);
 module_exit(fini);
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
deleted file mode 100644
index ce242921992..00000000000
--- a/drivers/block/xd.c
+++ /dev/null
@@ -1,1101 +0,0 @@
-/*
- * This file contains the driver for an XT hard disk controller
- * (at least the DTC 5150X) for Linux.
- *
- * Author: Pat Mackinlay, pat@it.com.au
- * Date: 29/09/92
- * 
- * Revised: 01/01/93, ...
- *
- * Ref: DTC 5150X Controller Specification (thanks to Kevin Fowler,
- *   kevinf@agora.rain.com)
- * Also thanks to: Salvador Abreu, Dave Thaler, Risto Kankkunen and
- *   Wim Van Dorst.
- *
- * Revised: 04/04/94 by Risto Kankkunen
- *   Moved the detection code from xd_init() to xd_geninit() as it needed
- *   interrupts enabled and Linus didn't want to enable them in that first
- *   phase. xd_geninit() is the place to do these kinds of things anyway,
- *   he says.
- *
- * Modularized: 04/10/96 by Todd Fries, tfries@umr.edu
- *
- * Revised: 13/12/97 by Andrzej Krzysztofowicz, ankry@mif.pg.gda.pl
- *   Fixed some problems with disk initialization and module initiation.
- *   Added support for manual geometry setting (except Seagate controllers)
- *   in form:
- *      xd_geo=<cyl_xda>,<head_xda>,<sec_xda>[,<cyl_xdb>,<head_xdb>,<sec_xdb>]
- *   Recovered DMA access. Abridged messages. Added support for DTC5051CX,
- *   WD1002-27X & XEBEC controllers. Driver uses now some jumper settings.
- *   Extended ioctl() support.
- *
- * Bugfix: 15/02/01, Paul G. - inform queue layer of tiny xd_maxsect.
- *
- */
-
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/interrupt.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <linux/genhd.h>
-#include <linux/hdreg.h>
-#include <linux/ioport.h>
-#include <linux/init.h>
-#include <linux/wait.h>
-#include <linux/blkdev.h>
-#include <linux/blkpg.h>
-#include <linux/delay.h>
-#include <linux/io.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <asm/dma.h>
-
-#include "xd.h"
-
-static void __init do_xd_setup (int *integers);
-#ifdef MODULE
-static int xd[5] = { -1,-1,-1,-1, };
-#endif
-
-#define XD_DONT_USE_DMA		0  /* Initial value. may be overriden using
-				      "nodma" module option */
-#define XD_INIT_DISK_DELAY	(30)  /* 30 ms delay during disk initialization */
-
-/* Above may need to be increased if a problem with the 2nd drive detection
-   (ST11M controller) or resetting a controller (WD) appears */
-
-static XD_INFO xd_info[XD_MAXDRIVES];
-
-/* If you try this driver and find that your card is not detected by the driver at bootup, you need to add your BIOS
-   signature and details to the following list of signatures. A BIOS signature is a string embedded into the first
-   few bytes of your controller's on-board ROM BIOS. To find out what yours is, use something like MS-DOS's DEBUG
-   command. Run DEBUG, and then you can examine your BIOS signature with:
-
-	d xxxx:0000
-
-   where xxxx is the segment of your controller (like C800 or D000 or something). On the ASCII dump at the right, you should
-   be able to see a string mentioning the manufacturer's copyright etc. Add this string into the table below. The parameters
-   in the table are, in order:
-
-	offset			; this is the offset (in bytes) from the start of your ROM where the signature starts
-	signature		; this is the actual text of the signature
-	xd_?_init_controller	; this is the controller init routine used by your controller
-	xd_?_init_drive		; this is the drive init routine used by your controller
-
-   The controllers directly supported at the moment are: DTC 5150x, WD 1004A27X, ST11M/R and override. If your controller is
-   made by the same manufacturer as one of these, try using the same init routines as they do. If that doesn't work, your
-   best bet is to use the "override" routines. These routines use a "portable" method of getting the disk's geometry, and
-   may work with your card. If none of these seem to work, try sending me some email and I'll see what I can do <grin>.
-
-   NOTE: You can now specify your XT controller's parameters from the command line in the form xd=TYPE,IRQ,IO,DMA. The driver
-   should be able to detect your drive's geometry from this info. (eg: xd=0,5,0x320,3 is the "standard"). */
-
-#include <asm/page.h>
-#define xd_dma_mem_alloc(size) __get_dma_pages(GFP_KERNEL,get_order(size))
-#define xd_dma_mem_free(addr, size) free_pages(addr, get_order(size))
-static char *xd_dma_buffer;
-
-static XD_SIGNATURE xd_sigs[] __initdata = {
-	{ 0x0000,"Override geometry handler",NULL,xd_override_init_drive,"n unknown" }, /* Pat Mackinlay, pat@it.com.au */
-	{ 0x0008,"[BXD06 (C) DTC 17-MAY-1985]",xd_dtc_init_controller,xd_dtc5150cx_init_drive," DTC 5150CX" }, /* Andrzej Krzysztofowicz, ankry@mif.pg.gda.pl */
-	{ 0x000B,"CRD18A   Not an IBM rom. (C) Copyright Data Technology Corp. 05/31/88",xd_dtc_init_controller,xd_dtc_init_drive," DTC 5150X" }, /* Todd Fries, tfries@umr.edu */
-	{ 0x000B,"CXD23A Not an IBM ROM (C)Copyright Data Technology Corp 12/03/88",xd_dtc_init_controller,xd_dtc_init_drive," DTC 5150X" }, /* Pat Mackinlay, pat@it.com.au */
-	{ 0x0008,"07/15/86(C) Copyright 1986 Western Digital Corp.",xd_wd_init_controller,xd_wd_init_drive," Western Dig. 1002-27X" }, /* Andrzej Krzysztofowicz, ankry@mif.pg.gda.pl */
-	{ 0x0008,"06/24/88(C) Copyright 1988 Western Digital Corp.",xd_wd_init_controller,xd_wd_init_drive," Western Dig. WDXT-GEN2" }, /* Dan Newcombe, newcombe@aa.csc.peachnet.edu */
-	{ 0x0015,"SEAGATE ST11 BIOS REVISION",xd_seagate_init_controller,xd_seagate_init_drive," Seagate ST11M/R" }, /* Salvador Abreu, spa@fct.unl.pt */
-	{ 0x0010,"ST11R BIOS",xd_seagate_init_controller,xd_seagate_init_drive," Seagate ST11M/R" }, /* Risto Kankkunen, risto.kankkunen@cs.helsinki.fi */
-	{ 0x0010,"ST11 BIOS v1.7",xd_seagate_init_controller,xd_seagate_init_drive," Seagate ST11R" }, /* Alan Hourihane, alanh@fairlite.demon.co.uk */
-	{ 0x1000,"(c)Copyright 1987 SMS",xd_omti_init_controller,xd_omti_init_drive,"n OMTI 5520" }, /* Dirk Melchers, dirk@merlin.nbg.sub.org */
-	{ 0x0006,"COPYRIGHT XEBEC (C) 1984",xd_xebec_init_controller,xd_xebec_init_drive," XEBEC" }, /* Andrzej Krzysztofowicz, ankry@mif.pg.gda.pl */
-	{ 0x0008,"(C) Copyright 1984 Western Digital Corp", xd_wd_init_controller, xd_wd_init_drive," Western Dig. 1002s-wx2" },
-	{ 0x0008,"(C) Copyright 1986 Western Digital Corporation", xd_wd_init_controller, xd_wd_init_drive," 1986 Western Digital" }, /* jfree@sovereign.org */
-};
-
-static unsigned int xd_bases[] __initdata =
-{
-	0xC8000, 0xCA000, 0xCC000,
-	0xCE000, 0xD0000, 0xD2000,
-	0xD4000, 0xD6000, 0xD8000,
-	0xDA000, 0xDC000, 0xDE000,
-	0xE0000
-};
-
-static DEFINE_SPINLOCK(xd_lock);
-
-static struct gendisk *xd_gendisk[2];
-
-static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
-
-static struct block_device_operations xd_fops = {
-	.owner	= THIS_MODULE,
-	.locked_ioctl	= xd_ioctl,
-	.getgeo = xd_getgeo,
-};
-static DECLARE_WAIT_QUEUE_HEAD(xd_wait_int);
-static u_char xd_drives, xd_irq = 5, xd_dma = 3, xd_maxsectors;
-static u_char xd_override __initdata = 0, xd_type __initdata = 0;
-static u_short xd_iobase = 0x320;
-static int xd_geo[XD_MAXDRIVES*3] __initdata = { 0, };
-
-static volatile int xdc_busy;
-static struct timer_list xd_watchdog_int;
-
-static volatile u_char xd_error;
-static int nodma = XD_DONT_USE_DMA;
-
-static struct request_queue *xd_queue;
-
-/* xd_init: register the block device number and set up pointer tables */
-static int __init xd_init(void)
-{
-	u_char i,controller;
-	unsigned int address;
-	int err;
-
-#ifdef MODULE
-	{
-		u_char count = 0;
-		for (i = 4; i > 0; i--)
-			if (((xd[i] = xd[i-1]) >= 0) && !count)
-				count = i;
-		if ((xd[0] = count))
-			do_xd_setup(xd);
-	}
-#endif
-
-	init_timer (&xd_watchdog_int); xd_watchdog_int.function = xd_watchdog;
-
-	if (!xd_dma_buffer)
-		xd_dma_buffer = (char *)xd_dma_mem_alloc(xd_maxsectors * 0x200);
-	if (!xd_dma_buffer) {
-		printk(KERN_ERR "xd: Out of memory.\n");
-		return -ENOMEM;
-	}
-
-	err = -EBUSY;
-	if (register_blkdev(XT_DISK_MAJOR, "xd"))
-		goto out1;
-
-	err = -ENOMEM;
-	xd_queue = blk_init_queue(do_xd_request, &xd_lock);
-	if (!xd_queue)
-		goto out1a;
-
-	if (xd_detect(&controller,&address)) {
-
-		printk("Detected a%s controller (type %d) at address %06x\n",
-			xd_sigs[controller].name,controller,address);
-		if (!request_region(xd_iobase,4,"xd")) {
-			printk("xd: Ports at 0x%x are not available\n",
-				xd_iobase);
-			goto out2;
-		}
-		if (controller)
-			xd_sigs[controller].init_controller(address);
-		xd_drives = xd_initdrives(xd_sigs[controller].init_drive);
-		
-		printk("Detected %d hard drive%s (using IRQ%d & DMA%d)\n",
-			xd_drives,xd_drives == 1 ? "" : "s",xd_irq,xd_dma);
-	}
-
-	err = -ENODEV;
-	if (!xd_drives)
-		goto out3;
-
-	for (i = 0; i < xd_drives; i++) {
-		XD_INFO *p = &xd_info[i];
-		struct gendisk *disk = alloc_disk(64);
-		if (!disk)
-			goto Enomem;
-		p->unit = i;
-		disk->major = XT_DISK_MAJOR;
-		disk->first_minor = i<<6;
-		sprintf(disk->disk_name, "xd%c", i+'a');
-		disk->fops = &xd_fops;
-		disk->private_data = p;
-		disk->queue = xd_queue;
-		set_capacity(disk, p->heads * p->cylinders * p->sectors);
-		printk(" %s: CHS=%d/%d/%d\n", disk->disk_name,
-			p->cylinders, p->heads, p->sectors);
-		xd_gendisk[i] = disk;
-	}
-
-	err = -EBUSY;
-	if (request_irq(xd_irq,xd_interrupt_handler, 0, "XT hard disk", NULL)) {
-		printk("xd: unable to get IRQ%d\n",xd_irq);
-		goto out4;
-	}
-
-	if (request_dma(xd_dma,"xd")) {
-		printk("xd: unable to get DMA%d\n",xd_dma);
-		goto out5;
-	}
-
-	/* xd_maxsectors depends on controller - so set after detection */
-	blk_queue_max_sectors(xd_queue, xd_maxsectors);
-
-	for (i = 0; i < xd_drives; i++)
-		add_disk(xd_gendisk[i]);
-
-	return 0;
-
-out5:
-	free_irq(xd_irq, NULL);
-out4:
-	for (i = 0; i < xd_drives; i++)
-		put_disk(xd_gendisk[i]);
-out3:
-	release_region(xd_iobase,4);
-out2:
-	blk_cleanup_queue(xd_queue);
-out1a:
-	unregister_blkdev(XT_DISK_MAJOR, "xd");
-out1:
-	if (xd_dma_buffer)
-		xd_dma_mem_free((unsigned long)xd_dma_buffer,
-				xd_maxsectors * 0x200);
-	return err;
-Enomem:
-	err = -ENOMEM;
-	while (i--)
-		put_disk(xd_gendisk[i]);
-	goto out3;
-}
-
-/* xd_detect: scan the possible BIOS ROM locations for the signature strings */
-static u_char __init xd_detect (u_char *controller, unsigned int *address)
-{
-	int i, j;
-
-	if (xd_override)
-	{
-		*controller = xd_type;
-		*address = 0;
-		return(1);
-	}
-
-	for (i = 0; i < ARRAY_SIZE(xd_bases); i++) {
-		void __iomem *p = ioremap(xd_bases[i], 0x2000);
-		if (!p)
-			continue;
-		for (j = 1; j < ARRAY_SIZE(xd_sigs); j++) {
-			const char *s = xd_sigs[j].string;
-			if (check_signature(p + xd_sigs[j].offset, s, strlen(s))) {
-				*controller = j;
-				xd_type = j;
-				*address = xd_bases[i];
-				iounmap(p);
-				return 1;
-			}
-		}
-		iounmap(p);
-	}
-	return 0;
-}
-
-/* do_xd_request: handle an incoming request */
-static void do_xd_request (struct request_queue * q)
-{
-	struct request *req;
-
-	if (xdc_busy)
-		return;
-
-	req = blk_fetch_request(q);
-	while (req) {
-		unsigned block = blk_rq_pos(req);
-		unsigned count = blk_rq_cur_sectors(req);
-		XD_INFO *disk = req->rq_disk->private_data;
-		int res = -EIO;
-		int retry;
-
-		if (!blk_fs_request(req))
-			goto done;
-		if (block + count > get_capacity(req->rq_disk))
-			goto done;
-		for (retry = 0; (retry < XD_RETRIES) && !res; retry++)
-			res = xd_readwrite(rq_data_dir(req), disk, req->buffer,
-					   block, count);
-	done:
-		/* wrap up, 0 = success, -errno = fail */
-		if (!__blk_end_request_cur(req, res))
-			req = blk_fetch_request(q);
-	}
-}
-
-static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
-	XD_INFO *p = bdev->bd_disk->private_data;
-
-	geo->heads = p->heads;
-	geo->sectors = p->sectors;
-	geo->cylinders = p->cylinders;
-	return 0;
-}
-
-/* xd_ioctl: handle device ioctl's */
-static int xd_ioctl(struct block_device *bdev, fmode_t mode, u_int cmd, u_long arg)
-{
-	switch (cmd) {
-		case HDIO_SET_DMA:
-			if (!capable(CAP_SYS_ADMIN)) return -EACCES;
-			if (xdc_busy) return -EBUSY;
-			nodma = !arg;
-			if (nodma && xd_dma_buffer) {
-				xd_dma_mem_free((unsigned long)xd_dma_buffer,
-						xd_maxsectors * 0x200);
-				xd_dma_buffer = NULL;
-			} else if (!nodma && !xd_dma_buffer) {
-				xd_dma_buffer = (char *)xd_dma_mem_alloc(xd_maxsectors * 0x200);
-				if (!xd_dma_buffer) {
-					nodma = XD_DONT_USE_DMA;
-					return -ENOMEM;
-				}
-			}
-			return 0;
-		case HDIO_GET_DMA:
-			return put_user(!nodma, (long __user *) arg);
-		case HDIO_GET_MULTCOUNT:
-			return put_user(xd_maxsectors, (long __user *) arg);
-		default:
-			return -EINVAL;
-	}
-}
-
-/* xd_readwrite: handle a read/write request */
-static int xd_readwrite (u_char operation,XD_INFO *p,char *buffer,u_int block,u_int count)
-{
-	int drive = p->unit;
-	u_char cmdblk[6],sense[4];
-	u_short track,cylinder;
-	u_char head,sector,control,mode = PIO_MODE,temp;
-	char **real_buffer;
-	register int i;
-	
-#ifdef DEBUG_READWRITE
-	printk("xd_readwrite: operation = %s, drive = %d, buffer = 0x%X, block = %d, count = %d\n",operation == READ ? "read" : "write",drive,buffer,block,count);
-#endif /* DEBUG_READWRITE */
-
-	spin_unlock_irq(&xd_lock);
-
-	control = p->control;
-	if (!xd_dma_buffer)
-		xd_dma_buffer = (char *)xd_dma_mem_alloc(xd_maxsectors * 0x200);
-	while (count) {
-		temp = count < xd_maxsectors ? count : xd_maxsectors;
-
-		track = block / p->sectors;
-		head = track % p->heads;
-		cylinder = track / p->heads;
-		sector = block % p->sectors;
-
-#ifdef DEBUG_READWRITE
-		printk("xd_readwrite: drive = %d, head = %d, cylinder = %d, sector = %d, count = %d\n",drive,head,cylinder,sector,temp);
-#endif /* DEBUG_READWRITE */
-
-		if (xd_dma_buffer) {
-			mode = xd_setup_dma(operation == READ ? DMA_MODE_READ : DMA_MODE_WRITE,(u_char *)(xd_dma_buffer),temp * 0x200);
-			real_buffer = &xd_dma_buffer;
-			for (i=0; i < (temp * 0x200); i++)
-				xd_dma_buffer[i] = buffer[i];
-		}
-		else
-			real_buffer = &buffer;
-
-		xd_build(cmdblk,operation == READ ? CMD_READ : CMD_WRITE,drive,head,cylinder,sector,temp & 0xFF,control);
-
-		switch (xd_command(cmdblk,mode,(u_char *)(*real_buffer),(u_char *)(*real_buffer),sense,XD_TIMEOUT)) {
-			case 1:
-				printk("xd%c: %s timeout, recalibrating drive\n",'a'+drive,(operation == READ ? "read" : "write"));
-				xd_recalibrate(drive);
-				spin_lock_irq(&xd_lock);
-				return -EIO;
-			case 2:
-				if (sense[0] & 0x30) {
-					printk("xd%c: %s - ",'a'+drive,(operation == READ ? "reading" : "writing"));
-					switch ((sense[0] & 0x30) >> 4) {
-					case 0: printk("drive error, code = 0x%X",sense[0] & 0x0F);
-						break;
-					case 1: printk("controller error, code = 0x%X",sense[0] & 0x0F);
-						break;
-					case 2: printk("command error, code = 0x%X",sense[0] & 0x0F);
-						break;
-					case 3: printk("miscellaneous error, code = 0x%X",sense[0] & 0x0F);
-						break;
-					}
-				}
-				if (sense[0] & 0x80)
-					printk(" - CHS = %d/%d/%d\n",((sense[2] & 0xC0) << 2) | sense[3],sense[1] & 0x1F,sense[2] & 0x3F);
-				/*	reported drive number = (sense[1] & 0xE0) >> 5 */
-				else
-					printk(" - no valid disk address\n");
-				spin_lock_irq(&xd_lock);
-				return -EIO;
-		}
-		if (xd_dma_buffer)
-			for (i=0; i < (temp * 0x200); i++)
-				buffer[i] = xd_dma_buffer[i];
-
-		count -= temp, buffer += temp * 0x200, block += temp;
-	}
-	spin_lock_irq(&xd_lock);
-	return 0;
-}
-
-/* xd_recalibrate: recalibrate a given drive and reset controller if necessary */
-static void xd_recalibrate (u_char drive)
-{
-	u_char cmdblk[6];
-	
-	xd_build(cmdblk,CMD_RECALIBRATE,drive,0,0,0,0,0);
-	if (xd_command(cmdblk,PIO_MODE,NULL,NULL,NULL,XD_TIMEOUT * 8))
-		printk("xd%c: warning! error recalibrating, controller may be unstable\n", 'a'+drive);
-}
-
-/* xd_interrupt_handler: interrupt service routine */
-static irqreturn_t xd_interrupt_handler(int irq, void *dev_id)
-{
-	if (inb(XD_STATUS) & STAT_INTERRUPT) {							/* check if it was our device */
-#ifdef DEBUG_OTHER
-		printk("xd_interrupt_handler: interrupt detected\n");
-#endif /* DEBUG_OTHER */
-		outb(0,XD_CONTROL);								/* acknowledge interrupt */
-		wake_up(&xd_wait_int);	/* and wake up sleeping processes */
-		return IRQ_HANDLED;
-	}
-	else
-		printk("xd: unexpected interrupt\n");
-	return IRQ_NONE;
-}
-
-/* xd_setup_dma: set up the DMA controller for a data transfer */
-static u_char xd_setup_dma (u_char mode,u_char *buffer,u_int count)
-{
-	unsigned long f;
-	
-	if (nodma)
-		return (PIO_MODE);
-	if (((unsigned long) buffer & 0xFFFF0000) != (((unsigned long) buffer + count) & 0xFFFF0000)) {
-#ifdef DEBUG_OTHER
-		printk("xd_setup_dma: using PIO, transfer overlaps 64k boundary\n");
-#endif /* DEBUG_OTHER */
-		return (PIO_MODE);
-	}
-	
-	f=claim_dma_lock();
-	disable_dma(xd_dma);
-	clear_dma_ff(xd_dma);
-	set_dma_mode(xd_dma,mode);
-	set_dma_addr(xd_dma, (unsigned long) buffer);
-	set_dma_count(xd_dma,count);
-	
-	release_dma_lock(f);
-
-	return (DMA_MODE);			/* use DMA and INT */
-}
-
-/* xd_build: put stuff into an array in a format suitable for the controller */
-static u_char *xd_build (u_char *cmdblk,u_char command,u_char drive,u_char head,u_short cylinder,u_char sector,u_char count,u_char control)
-{
-	cmdblk[0] = command;
-	cmdblk[1] = ((drive & 0x07) << 5) | (head & 0x1F);
-	cmdblk[2] = ((cylinder & 0x300) >> 2) | (sector & 0x3F);
-	cmdblk[3] = cylinder & 0xFF;
-	cmdblk[4] = count;
-	cmdblk[5] = control;
-	
-	return (cmdblk);
-}
-
-static void xd_watchdog (unsigned long unused)
-{
-	xd_error = 1;
-	wake_up(&xd_wait_int);
-}
-
-/* xd_waitport: waits until port & mask == flags or a timeout occurs. return 1 for a timeout */
-static inline u_char xd_waitport (u_short port,u_char flags,u_char mask,u_long timeout)
-{
-	u_long expiry = jiffies + timeout;
-	int success;
-
-	xdc_busy = 1;
-	while ((success = ((inb(port) & mask) != flags)) && time_before(jiffies, expiry))
-		schedule_timeout_uninterruptible(1);
-	xdc_busy = 0;
-	return (success);
-}
-
-static inline u_int xd_wait_for_IRQ (void)
-{
-	unsigned long flags;
-	xd_watchdog_int.expires = jiffies + 8 * HZ;
-	add_timer(&xd_watchdog_int);
-	
-	flags=claim_dma_lock();
-	enable_dma(xd_dma);
-	release_dma_lock(flags);
-	
-	sleep_on(&xd_wait_int);
-	del_timer(&xd_watchdog_int);
-	xdc_busy = 0;
-	
-	flags=claim_dma_lock();
-	disable_dma(xd_dma);
-	release_dma_lock(flags);
-	
-	if (xd_error) {
-		printk("xd: missed IRQ - command aborted\n");
-		xd_error = 0;
-		return (1);
-	}
-	return (0);
-}
-
-/* xd_command: handle all data transfers necessary for a single command */
-static u_int xd_command (u_char *command,u_char mode,u_char *indata,u_char *outdata,u_char *sense,u_long timeout)
-{
-	u_char cmdblk[6],csb,complete = 0;
-
-#ifdef DEBUG_COMMAND
-	printk("xd_command: command = 0x%X, mode = 0x%X, indata = 0x%X, outdata = 0x%X, sense = 0x%X\n",command,mode,indata,outdata,sense);
-#endif /* DEBUG_COMMAND */
-
-	outb(0,XD_SELECT);
-	outb(mode,XD_CONTROL);
-
-	if (xd_waitport(XD_STATUS,STAT_SELECT,STAT_SELECT,timeout))
-		return (1);
-
-	while (!complete) {
-		if (xd_waitport(XD_STATUS,STAT_READY,STAT_READY,timeout))
-			return (1);
-
-		switch (inb(XD_STATUS) & (STAT_COMMAND | STAT_INPUT)) {
-			case 0:
-				if (mode == DMA_MODE) {
-					if (xd_wait_for_IRQ())
-						return (1);
-				} else
-					outb(outdata ? *outdata++ : 0,XD_DATA);
-				break;
-			case STAT_INPUT:
-				if (mode == DMA_MODE) {
-					if (xd_wait_for_IRQ())
-						return (1);
-				} else
-					if (indata)
-						*indata++ = inb(XD_DATA);
-					else
-						inb(XD_DATA);
-				break;
-			case STAT_COMMAND:
-				outb(command ? *command++ : 0,XD_DATA);
-				break;
-			case STAT_COMMAND | STAT_INPUT:
-				complete = 1;
-				break;
-		}
-	}
-	csb = inb(XD_DATA);
-
-	if (xd_waitport(XD_STATUS,0,STAT_SELECT,timeout))					/* wait until deselected */
-		return (1);
-
-	if (csb & CSB_ERROR) {									/* read sense data if error */
-		xd_build(cmdblk,CMD_SENSE,(csb & CSB_LUN) >> 5,0,0,0,0,0);
-		if (xd_command(cmdblk,0,sense,NULL,NULL,XD_TIMEOUT))
-			printk("xd: warning! sense command failed!\n");
-	}
-
-#ifdef DEBUG_COMMAND
-	printk("xd_command: completed with csb = 0x%X\n",csb);
-#endif /* DEBUG_COMMAND */
-
-	return (csb & CSB_ERROR);
-}
-
-static u_char __init xd_initdrives (void (*init_drive)(u_char drive))
-{
-	u_char cmdblk[6],i,count = 0;
-
-	for (i = 0; i < XD_MAXDRIVES; i++) {
-		xd_build(cmdblk,CMD_TESTREADY,i,0,0,0,0,0);
-		if (!xd_command(cmdblk,PIO_MODE,NULL,NULL,NULL,XD_TIMEOUT*8)) {
-			msleep_interruptible(XD_INIT_DISK_DELAY);
-
-			init_drive(count);
-			count++;
-
-			msleep_interruptible(XD_INIT_DISK_DELAY);
-		}
-	}
-	return (count);
-}
-
-static void __init xd_manual_geo_set (u_char drive)
-{
-	xd_info[drive].heads = (u_char)(xd_geo[3 * drive + 1]);
-	xd_info[drive].cylinders = (u_short)(xd_geo[3 * drive]);
-	xd_info[drive].sectors = (u_char)(xd_geo[3 * drive + 2]);
-}
-
-static void __init xd_dtc_init_controller (unsigned int address)
-{
-	switch (address) {
-		case 0x00000:
-		case 0xC8000:	break;			/*initial: 0x320 */
-		case 0xCA000:	xd_iobase = 0x324; 
-		case 0xD0000:				/*5150CX*/
-		case 0xD8000:	break;			/*5150CX & 5150XL*/
-		default:        printk("xd_dtc_init_controller: unsupported BIOS address %06x\n",address);
-				break;
-	}
-	xd_maxsectors = 0x01;		/* my card seems to have trouble doing multi-block transfers? */
-
-	outb(0,XD_RESET);		/* reset the controller */
-}
-
-
-static void __init xd_dtc5150cx_init_drive (u_char drive)
-{
-	/* values from controller's BIOS - BIOS chip may be removed */
-	static u_short geometry_table[][4] = {
-		{0x200,8,0x200,0x100},
-		{0x267,2,0x267,0x267},
-		{0x264,4,0x264,0x80},
-		{0x132,4,0x132,0x0},
-		{0x132,2,0x80, 0x132},
-		{0x177,8,0x177,0x0},
-		{0x132,8,0x84, 0x0},
-		{},  /* not used */
-		{0x132,6,0x80, 0x100},
-		{0x200,6,0x100,0x100},
-		{0x264,2,0x264,0x80},
-		{0x280,4,0x280,0x100},
-		{0x2B9,3,0x2B9,0x2B9},
-		{0x2B9,5,0x2B9,0x2B9},
-		{0x280,6,0x280,0x100},
-		{0x132,4,0x132,0x0}};
-	u_char n;
-
-	n = inb(XD_JUMPER);
-	n = (drive ? n : (n >> 2)) & 0x33;
-	n = (n | (n >> 2)) & 0x0F;
-	if (xd_geo[3*drive])
-		xd_manual_geo_set(drive);
-	else
-		if (n != 7) {	
-			xd_info[drive].heads = (u_char)(geometry_table[n][1]);			/* heads */
-			xd_info[drive].cylinders = geometry_table[n][0];	/* cylinders */
-			xd_info[drive].sectors = 17;				/* sectors */
-#if 0
-			xd_info[drive].rwrite = geometry_table[n][2];	/* reduced write */
-			xd_info[drive].precomp = geometry_table[n][3]		/* write precomp */
-			xd_info[drive].ecc = 0x0B;				/* ecc length */
-#endif /* 0 */
-		}
-		else {
-			printk("xd%c: undetermined drive geometry\n",'a'+drive);
-			return;
-		}
-	xd_info[drive].control = 5;				/* control byte */
-	xd_setparam(CMD_DTCSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders,geometry_table[n][2],geometry_table[n][3],0x0B);
-	xd_recalibrate(drive);
-}
-
-static void __init xd_dtc_init_drive (u_char drive)
-{
-	u_char cmdblk[6],buf[64];
-
-	xd_build(cmdblk,CMD_DTCGETGEOM,drive,0,0,0,0,0);
-	if (!xd_command(cmdblk,PIO_MODE,buf,NULL,NULL,XD_TIMEOUT * 2)) {
-		xd_info[drive].heads = buf[0x0A];			/* heads */
-		xd_info[drive].cylinders = ((u_short *) (buf))[0x04];	/* cylinders */
-		xd_info[drive].sectors = 17;				/* sectors */
-		if (xd_geo[3*drive])
-			xd_manual_geo_set(drive);
-#if 0
-		xd_info[drive].rwrite = ((u_short *) (buf + 1))[0x05];	/* reduced write */
-		xd_info[drive].precomp = ((u_short *) (buf + 1))[0x06];	/* write precomp */
-		xd_info[drive].ecc = buf[0x0F];				/* ecc length */
-#endif /* 0 */
-		xd_info[drive].control = 0;				/* control byte */
-
-		xd_setparam(CMD_DTCSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders,((u_short *) (buf + 1))[0x05],((u_short *) (buf + 1))[0x06],buf[0x0F]);
-		xd_build(cmdblk,CMD_DTCSETSTEP,drive,0,0,0,0,7);
-		if (xd_command(cmdblk,PIO_MODE,NULL,NULL,NULL,XD_TIMEOUT * 2))
-			printk("xd_dtc_init_drive: error setting step rate for xd%c\n", 'a'+drive);
-	}
-	else
-		printk("xd_dtc_init_drive: error reading geometry for xd%c\n", 'a'+drive);
-}
-
-static void __init xd_wd_init_controller (unsigned int address)
-{
-	switch (address) {
-		case 0x00000:
-		case 0xC8000:	break;			/*initial: 0x320 */
-		case 0xCA000:	xd_iobase = 0x324; break;
-		case 0xCC000:   xd_iobase = 0x328; break;
-		case 0xCE000:   xd_iobase = 0x32C; break;
-		case 0xD0000:	xd_iobase = 0x328; break; /* ? */
-		case 0xD8000:	xd_iobase = 0x32C; break; /* ? */
-		default:        printk("xd_wd_init_controller: unsupported BIOS address %06x\n",address);
-				break;
-	}
-	xd_maxsectors = 0x01;		/* this one doesn't wrap properly either... */
-
-	outb(0,XD_RESET);		/* reset the controller */
-
-	msleep(XD_INIT_DISK_DELAY);
-}
-
-static void __init xd_wd_init_drive (u_char drive)
-{
-	/* values from controller's BIOS - BIOS may be disabled */
-	static u_short geometry_table[][4] = {
-		{0x264,4,0x1C2,0x1C2},   /* common part */
-		{0x132,4,0x099,0x0},
-		{0x267,2,0x1C2,0x1C2},
-		{0x267,4,0x1C2,0x1C2},
-
-		{0x334,6,0x335,0x335},   /* 1004 series RLL */
-		{0x30E,4,0x30F,0x3DC},
-		{0x30E,2,0x30F,0x30F},
-		{0x267,4,0x268,0x268},
-
-		{0x3D5,5,0x3D6,0x3D6},   /* 1002 series RLL */
-		{0x3DB,7,0x3DC,0x3DC},
-		{0x264,4,0x265,0x265},
-		{0x267,4,0x268,0x268}};
-
-	u_char cmdblk[6],buf[0x200];
-	u_char n = 0,rll,jumper_state,use_jumper_geo;
-	u_char wd_1002 = (xd_sigs[xd_type].string[7] == '6');
-	
-	jumper_state = ~(inb(0x322));
-	if (jumper_state & 0x40)
-		xd_irq = 9;
-	rll = (jumper_state & 0x30) ? (0x04 << wd_1002) : 0;
-	xd_build(cmdblk,CMD_READ,drive,0,0,0,1,0);
-	if (!xd_command(cmdblk,PIO_MODE,buf,NULL,NULL,XD_TIMEOUT * 2)) {
-		xd_info[drive].heads = buf[0x1AF];				/* heads */
-		xd_info[drive].cylinders = ((u_short *) (buf + 1))[0xD6];	/* cylinders */
-		xd_info[drive].sectors = 17;					/* sectors */
-		if (xd_geo[3*drive])
-			xd_manual_geo_set(drive);
-#if 0
-		xd_info[drive].rwrite = ((u_short *) (buf))[0xD8];		/* reduced write */
-		xd_info[drive].wprecomp = ((u_short *) (buf))[0xDA];		/* write precomp */
-		xd_info[drive].ecc = buf[0x1B4];				/* ecc length */
-#endif /* 0 */
-		xd_info[drive].control = buf[0x1B5];				/* control byte */
-		use_jumper_geo = !(xd_info[drive].heads) || !(xd_info[drive].cylinders);
-		if (xd_geo[3*drive]) {
-			xd_manual_geo_set(drive);
-			xd_info[drive].control = rll ? 7 : 5;
-		}
-		else if (use_jumper_geo) {
-			n = (((jumper_state & 0x0F) >> (drive << 1)) & 0x03) | rll;
-			xd_info[drive].cylinders = geometry_table[n][0];
-			xd_info[drive].heads = (u_char)(geometry_table[n][1]);
-			xd_info[drive].control = rll ? 7 : 5;
-#if 0
-			xd_info[drive].rwrite = geometry_table[n][2];
-			xd_info[drive].wprecomp = geometry_table[n][3];
-			xd_info[drive].ecc = 0x0B;
-#endif /* 0 */
-		}
-		if (!wd_1002) {
-			if (use_jumper_geo)
-				xd_setparam(CMD_WDSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders,
-					geometry_table[n][2],geometry_table[n][3],0x0B);
-			else
-				xd_setparam(CMD_WDSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders,
-					((u_short *) (buf))[0xD8],((u_short *) (buf))[0xDA],buf[0x1B4]);
-		}
-	/* 1002 based RLL controller requests converted addressing, but reports physical 
-	   (physical 26 sec., logical 17 sec.) 
-	   1004 based ???? */
-		if (rll & wd_1002) {
-			if ((xd_info[drive].cylinders *= 26,
-			     xd_info[drive].cylinders /= 17) > 1023)
-				xd_info[drive].cylinders = 1023;  /* 1024 ? */
-#if 0
-			xd_info[drive].rwrite *= 26; 
-			xd_info[drive].rwrite /= 17;
-			xd_info[drive].wprecomp *= 26
-			xd_info[drive].wprecomp /= 17;
-#endif /* 0 */
-		}
-	}
-	else
-		printk("xd_wd_init_drive: error reading geometry for xd%c\n",'a'+drive);	
-
-}
-
-static void __init xd_seagate_init_controller (unsigned int address)
-{
-	switch (address) {
-		case 0x00000:
-		case 0xC8000:	break;			/*initial: 0x320 */
-		case 0xD0000:	xd_iobase = 0x324; break;
-		case 0xD8000:	xd_iobase = 0x328; break;
-		case 0xE0000:	xd_iobase = 0x32C; break;
-		default:	printk("xd_seagate_init_controller: unsupported BIOS address %06x\n",address);
-				break;
-	}
-	xd_maxsectors = 0x40;
-
-	outb(0,XD_RESET);		/* reset the controller */
-}
-
-static void __init xd_seagate_init_drive (u_char drive)
-{
-	u_char cmdblk[6],buf[0x200];
-
-	xd_build(cmdblk,CMD_ST11GETGEOM,drive,0,0,0,1,0);
-	if (!xd_command(cmdblk,PIO_MODE,buf,NULL,NULL,XD_TIMEOUT * 2)) {
-		xd_info[drive].heads = buf[0x04];				/* heads */
-		xd_info[drive].cylinders = (buf[0x02] << 8) | buf[0x03];	/* cylinders */
-		xd_info[drive].sectors = buf[0x05];				/* sectors */
-		xd_info[drive].control = 0;					/* control byte */
-	}
-	else
-		printk("xd_seagate_init_drive: error reading geometry from xd%c\n", 'a'+drive);
-}
-
-/* Omti support courtesy Dirk Melchers */
-static void __init xd_omti_init_controller (unsigned int address)
-{
-	switch (address) {
-		case 0x00000:
-		case 0xC8000:	break;			/*initial: 0x320 */
-		case 0xD0000:	xd_iobase = 0x324; break;
-		case 0xD8000:	xd_iobase = 0x328; break;
-		case 0xE0000:	xd_iobase = 0x32C; break;
-		default:	printk("xd_omti_init_controller: unsupported BIOS address %06x\n",address);
-				break;
-	}
-	
-	xd_maxsectors = 0x40;
-
-	outb(0,XD_RESET);		/* reset the controller */
-}
-
-static void __init xd_omti_init_drive (u_char drive)
-{
-	/* gets infos from drive */
-	xd_override_init_drive(drive);
-
-	/* set other parameters, Hardcoded, not that nice :-) */
-	xd_info[drive].control = 2;
-}
-
-/* Xebec support (AK) */
-static void __init xd_xebec_init_controller (unsigned int address)
-{
-/* iobase may be set manually in range 0x300 - 0x33C
-      irq may be set manually to 2(9),3,4,5,6,7
-      dma may be set manually to 1,2,3
-	(How to detect them ???)
-BIOS address may be set manually in range 0x0 - 0xF8000
-If you need non-standard settings use the xd=... command */
-
-	switch (address) {
-		case 0x00000:
-		case 0xC8000:	/* initially: xd_iobase==0x320 */
-		case 0xD0000:
-		case 0xD2000:
-		case 0xD4000:
-		case 0xD6000:
-		case 0xD8000:
-		case 0xDA000:
-		case 0xDC000:
-		case 0xDE000:
-		case 0xE0000:	break;
-		default:	printk("xd_xebec_init_controller: unsupported BIOS address %06x\n",address);
-				break;
-		}
-
-	xd_maxsectors = 0x01;
-	outb(0,XD_RESET);		/* reset the controller */
-
-	msleep(XD_INIT_DISK_DELAY);
-}
-
-static void __init xd_xebec_init_drive (u_char drive)
-{
-	/* values from controller's BIOS - BIOS chip may be removed */
-	static u_short geometry_table[][5] = {
-		{0x132,4,0x080,0x080,0x7},
-		{0x132,4,0x080,0x080,0x17},
-		{0x264,2,0x100,0x100,0x7},
-		{0x264,2,0x100,0x100,0x17},
-		{0x132,8,0x080,0x080,0x7},
-		{0x132,8,0x080,0x080,0x17},
-		{0x264,4,0x100,0x100,0x6},
-		{0x264,4,0x100,0x100,0x17},
-		{0x2BC,5,0x2BC,0x12C,0x6},
-		{0x3A5,4,0x3A5,0x3A5,0x7},
-		{0x26C,6,0x26C,0x26C,0x7},
-		{0x200,8,0x200,0x100,0x17},
-		{0x400,5,0x400,0x400,0x7},
-		{0x400,6,0x400,0x400,0x7},
-		{0x264,8,0x264,0x200,0x17},
-		{0x33E,7,0x33E,0x200,0x7}};
-	u_char n;
-
-	n = inb(XD_JUMPER) & 0x0F; /* BIOS's drive number: same geometry 
-					is assumed for BOTH drives */
-	if (xd_geo[3*drive])
-		xd_manual_geo_set(drive);
-	else {
-		xd_info[drive].heads = (u_char)(geometry_table[n][1]);			/* heads */
-		xd_info[drive].cylinders = geometry_table[n][0];	/* cylinders */
-		xd_info[drive].sectors = 17;				/* sectors */
-#if 0
-		xd_info[drive].rwrite = geometry_table[n][2];	/* reduced write */
-		xd_info[drive].precomp = geometry_table[n][3]		/* write precomp */
-		xd_info[drive].ecc = 0x0B;				/* ecc length */
-#endif /* 0 */
-	}
-	xd_info[drive].control = geometry_table[n][4];			/* control byte */
-	xd_setparam(CMD_XBSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders,geometry_table[n][2],geometry_table[n][3],0x0B);
-	xd_recalibrate(drive);
-}
-
-/* xd_override_init_drive: this finds disk geometry in a "binary search" style, narrowing in on the "correct" number of heads
-   etc. by trying values until it gets the highest successful value. Idea courtesy Salvador Abreu (spa@fct.unl.pt). */
-static void __init xd_override_init_drive (u_char drive)
-{
-	u_short min[] = { 0,0,0 },max[] = { 16,1024,64 },test[] = { 0,0,0 };
-	u_char cmdblk[6],i;
-
-	if (xd_geo[3*drive])
-		xd_manual_geo_set(drive);
-	else {
-		for (i = 0; i < 3; i++) {
-			while (min[i] != max[i] - 1) {
-				test[i] = (min[i] + max[i]) / 2;
-				xd_build(cmdblk,CMD_SEEK,drive,(u_char) test[0],(u_short) test[1],(u_char) test[2],0,0);
-				if (!xd_command(cmdblk,PIO_MODE,NULL,NULL,NULL,XD_TIMEOUT * 2))
-					min[i] = test[i];
-				else
-					max[i] = test[i];
-			}
-			test[i] = min[i];
-		}
-		xd_info[drive].heads = (u_char) min[0] + 1;
-		xd_info[drive].cylinders = (u_short) min[1] + 1;
-		xd_info[drive].sectors = (u_char) min[2] + 1;
-	}
-	xd_info[drive].control = 0;
-}
-
-/* xd_setup: initialise controller from command line parameters */
-static void __init do_xd_setup (int *integers)
-{
-	switch (integers[0]) {
-		case 4: if (integers[4] < 0)
-				nodma = 1;
-			else if (integers[4] < 8)
-				xd_dma = integers[4];
-		case 3: if ((integers[3] > 0) && (integers[3] <= 0x3FC))
-				xd_iobase = integers[3];
-		case 2: if ((integers[2] > 0) && (integers[2] < 16))
-				xd_irq = integers[2];
-		case 1: xd_override = 1;
-			if ((integers[1] >= 0) && (integers[1] < ARRAY_SIZE(xd_sigs)))
-				xd_type = integers[1];
-		case 0: break;
-		default:printk("xd: too many parameters for xd\n");
-	}
-	xd_maxsectors = 0x01;
-}
-
-/* xd_setparam: set the drive characteristics */
-static void __init xd_setparam (u_char command,u_char drive,u_char heads,u_short cylinders,u_short rwrite,u_short wprecomp,u_char ecc)
-{
-	u_char cmdblk[14];
-
-	xd_build(cmdblk,command,drive,0,0,0,0,0);
-	cmdblk[6] = (u_char) (cylinders >> 8) & 0x03;
-	cmdblk[7] = (u_char) (cylinders & 0xFF);
-	cmdblk[8] = heads & 0x1F;
-	cmdblk[9] = (u_char) (rwrite >> 8) & 0x03;
-	cmdblk[10] = (u_char) (rwrite & 0xFF);
-	cmdblk[11] = (u_char) (wprecomp >> 8) & 0x03;
-	cmdblk[12] = (u_char) (wprecomp & 0xFF);
-	cmdblk[13] = ecc;
-
-	/* Some controllers require geometry info as data, not command */
-
-	if (xd_command(cmdblk,PIO_MODE,NULL,&cmdblk[6],NULL,XD_TIMEOUT * 2))
-		printk("xd: error setting characteristics for xd%c\n", 'a'+drive);
-}
-
-
-#ifdef MODULE
-
-module_param_array(xd, int, NULL, 0);
-module_param_array(xd_geo, int, NULL, 0);
-module_param(nodma, bool, 0);
-
-MODULE_LICENSE("GPL");
-
-void cleanup_module(void)
-{
-	int i;
-	unregister_blkdev(XT_DISK_MAJOR, "xd");
-	for (i = 0; i < xd_drives; i++) {
-		del_gendisk(xd_gendisk[i]);
-		put_disk(xd_gendisk[i]);
-	}
-	blk_cleanup_queue(xd_queue);
-	release_region(xd_iobase,4);
-	if (xd_drives) {
-		free_irq(xd_irq, NULL);
-		free_dma(xd_dma);
-		if (xd_dma_buffer)
-			xd_dma_mem_free((unsigned long)xd_dma_buffer, xd_maxsectors * 0x200);
-	}
-}
-#else
-
-static int __init xd_setup (char *str)
-{
-	int ints[5];
-	get_options (str, ARRAY_SIZE (ints), ints);
-	do_xd_setup (ints);
-	return 1;
-}
-
-/* xd_manual_geo_init: initialise drive geometry from command line parameters
-   (used only for WD drives) */
-static int __init xd_manual_geo_init (char *str)
-{
-	int i, integers[1 + 3*XD_MAXDRIVES];
-
-	get_options (str, ARRAY_SIZE (integers), integers);
-	if (integers[0]%3 != 0) {
-		printk("xd: incorrect number of parameters for xd_geo\n");
-		return 1;
-	}
-	for (i = 0; (i < integers[0]) && (i < 3*XD_MAXDRIVES); i++)
-		xd_geo[i] = integers[i+1];
-	return 1;
-}
-
-__setup ("xd=", xd_setup);
-__setup ("xd_geo=", xd_manual_geo_init);
-
-#endif /* MODULE */
-
-module_init(xd_init);
-MODULE_ALIAS_BLOCKDEV_MAJOR(XT_DISK_MAJOR);
diff --git a/drivers/block/xd.h b/drivers/block/xd.h
deleted file mode 100644
index 37cacef16e9..00000000000
--- a/drivers/block/xd.h
+++ /dev/null
@@ -1,134 +0,0 @@
-#ifndef _LINUX_XD_H
-#define _LINUX_XD_H
-
-/*
- * This file contains the definitions for the IO ports and errors etc. for XT hard disk controllers (at least the DTC 5150X).
- *
- * Author: Pat Mackinlay, pat@it.com.au
- * Date: 29/09/92
- *
- * Revised: 01/01/93, ...
- *
- * Ref: DTC 5150X Controller Specification (thanks to Kevin Fowler, kevinf@agora.rain.com)
- * Also thanks to: Salvador Abreu, Dave Thaler, Risto Kankkunen and Wim Van Dorst.
- */
-
-#include <linux/interrupt.h>
-
-/* XT hard disk controller registers */
-#define XD_DATA		(xd_iobase + 0x00)	/* data RW register */
-#define XD_RESET	(xd_iobase + 0x01)	/* reset WO register */
-#define XD_STATUS	(xd_iobase + 0x01)	/* status RO register */
-#define XD_SELECT	(xd_iobase + 0x02)	/* select WO register */
-#define XD_JUMPER	(xd_iobase + 0x02)	/* jumper RO register */
-#define XD_CONTROL	(xd_iobase + 0x03)	/* DMAE/INTE WO register */
-#define XD_RESERVED	(xd_iobase + 0x03)	/* reserved */
-
-/* XT hard disk controller commands (incomplete list) */
-#define CMD_TESTREADY	0x00	/* test drive ready */
-#define CMD_RECALIBRATE	0x01	/* recalibrate drive */
-#define CMD_SENSE	0x03	/* request sense */
-#define CMD_FORMATDRV	0x04	/* format drive */
-#define CMD_VERIFY	0x05	/* read verify */
-#define CMD_FORMATTRK	0x06	/* format track */
-#define CMD_FORMATBAD	0x07	/* format bad track */
-#define CMD_READ	0x08	/* read */
-#define CMD_WRITE	0x0A	/* write */
-#define CMD_SEEK	0x0B	/* seek */
-
-/* Controller specific commands */
-#define CMD_DTCSETPARAM	0x0C	/* set drive parameters (DTC 5150X & CX only?) */
-#define CMD_DTCGETECC	0x0D	/* get ecc error length (DTC 5150X only?) */
-#define CMD_DTCREADBUF	0x0E	/* read sector buffer (DTC 5150X only?) */
-#define CMD_DTCWRITEBUF 0x0F	/* write sector buffer (DTC 5150X only?) */
-#define CMD_DTCREMAPTRK	0x11	/* assign alternate track (DTC 5150X only?) */
-#define CMD_DTCGETPARAM	0xFB	/* get drive parameters (DTC 5150X only?) */
-#define CMD_DTCSETSTEP	0xFC	/* set step rate (DTC 5150X only?) */
-#define CMD_DTCSETGEOM	0xFE	/* set geometry data (DTC 5150X only?) */
-#define CMD_DTCGETGEOM	0xFF	/* get geometry data (DTC 5150X only?) */
-#define CMD_ST11GETGEOM 0xF8	/* get geometry data (Seagate ST11R/M only?) */
-#define CMD_WDSETPARAM	0x0C	/* set drive parameters (WD 1004A27X only?) */
-#define CMD_XBSETPARAM	0x0C	/* set drive parameters (XEBEC only?) */
-
-/* Bits for command status byte */
-#define CSB_ERROR	0x02	/* error */
-#define CSB_LUN		0x20	/* logical Unit Number */
-
-/* XT hard disk controller status bits */
-#define STAT_READY	0x01	/* controller is ready */
-#define STAT_INPUT	0x02	/* data flowing from controller to host */
-#define STAT_COMMAND	0x04	/* controller in command phase */
-#define STAT_SELECT	0x08	/* controller is selected */
-#define STAT_REQUEST	0x10	/* controller requesting data */
-#define STAT_INTERRUPT	0x20	/* controller requesting interrupt */
-
-/* XT hard disk controller control bits */
-#define PIO_MODE	0x00	/* control bits to set for PIO */
-#define DMA_MODE	0x03	/* control bits to set for DMA & interrupt */
-
-#define XD_MAXDRIVES	2	/* maximum 2 drives */
-#define XD_TIMEOUT	HZ	/* 1 second timeout */
-#define XD_RETRIES	4	/* maximum 4 retries */
-
-#undef DEBUG			/* define for debugging output */
-
-#ifdef DEBUG
-	#define DEBUG_STARTUP	/* debug driver initialisation */
-	#define DEBUG_OVERRIDE	/* debug override geometry detection */
-	#define DEBUG_READWRITE	/* debug each read/write command */
-	#define DEBUG_OTHER	/* debug misc. interrupt/DMA stuff */
-	#define DEBUG_COMMAND	/* debug each controller command */
-#endif /* DEBUG */
-
-/* this structure defines the XT drives and their types */
-typedef struct {
-	u_char heads;
-	u_short cylinders;
-	u_char sectors;
-	u_char control;
-	int unit;
-} XD_INFO;
-
-/* this structure defines a ROM BIOS signature */
-typedef struct {
-	unsigned int offset;
-	const char *string;
-	void (*init_controller)(unsigned int address);
-	void (*init_drive)(u_char drive);
-	const char *name;
-} XD_SIGNATURE;
-
-#ifndef MODULE
-static int xd_manual_geo_init (char *command);
-#endif /* MODULE */
-static u_char xd_detect (u_char *controller, unsigned int *address);
-static u_char xd_initdrives (void (*init_drive)(u_char drive));
-
-static void do_xd_request (struct request_queue * q);
-static int xd_ioctl (struct block_device *bdev,fmode_t mode,unsigned int cmd,unsigned long arg);
-static int xd_readwrite (u_char operation,XD_INFO *disk,char *buffer,u_int block,u_int count);
-static void xd_recalibrate (u_char drive);
-
-static irqreturn_t xd_interrupt_handler(int irq, void *dev_id);
-static u_char xd_setup_dma (u_char opcode,u_char *buffer,u_int count);
-static u_char *xd_build (u_char *cmdblk,u_char command,u_char drive,u_char head,u_short cylinder,u_char sector,u_char count,u_char control);
-static void xd_watchdog (unsigned long unused);
-static inline u_char xd_waitport (u_short port,u_char flags,u_char mask,u_long timeout);
-static u_int xd_command (u_char *command,u_char mode,u_char *indata,u_char *outdata,u_char *sense,u_long timeout);
-
-/* card specific setup and geometry gathering code */
-static void xd_dtc_init_controller (unsigned int address);
-static void xd_dtc5150cx_init_drive (u_char drive);
-static void xd_dtc_init_drive (u_char drive);
-static void xd_wd_init_controller (unsigned int address);
-static void xd_wd_init_drive (u_char drive);
-static void xd_seagate_init_controller (unsigned int address);
-static void xd_seagate_init_drive (u_char drive);
-static void xd_omti_init_controller (unsigned int address);
-static void xd_omti_init_drive (u_char drive);
-static void xd_xebec_init_controller (unsigned int address);
-static void xd_xebec_init_drive (u_char drive);
-static void xd_setparam (u_char command,u_char drive,u_char heads,u_short cylinders,u_short rwrite,u_short wprecomp,u_char ecc);
-static void xd_override_init_drive (u_char drive);
-
-#endif /* _LINUX_XD_H */
diff --git a/drivers/block/xen-blkback/Makefile b/drivers/block/xen-blkback/Makefile
new file mode 100644
index 00000000000..e491c1b7687
--- /dev/null
+++ b/drivers/block/xen-blkback/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o
+
+xen-blkback-y	:= blkback.o xenbus.o
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
new file mode 100644
index 00000000000..64c60edcdfb
--- /dev/null
+++ b/drivers/block/xen-blkback/blkback.c
@@ -0,0 +1,1399 @@
+/******************************************************************************
+ *
+ * Back-end of the driver for virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A
+ * reference front-end implementation can be found in:
+ *  drivers/block/xen-blkfront.c
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Copyright (c) 2005, Christopher Clark
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/bitmap.h>
+
+#include <xen/events.h>
+#include <xen/page.h>
+#include <xen/xen.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include <xen/balloon.h>
+#include "common.h"
+
+/*
+ * Maximum number of unused free pages to keep in the internal buffer.
+ * Setting this to a value too low will reduce memory used in each backend,
+ * but can have a performance penalty.
+ *
+ * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can
+ * be set to a lower value that might degrade performance on some intensive
+ * IO workloads.
+ */
+
+static int xen_blkif_max_buffer_pages = 1024;
+module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644);
+MODULE_PARM_DESC(max_buffer_pages,
+"Maximum number of free pages to keep in each block backend buffer");
+
+/*
+ * Maximum number of grants to map persistently in blkback. For maximum
+ * performance this should be the total number of grants that can be used
+ * to fill the ring, but since this might become too high, especially with
+ * the use of indirect descriptors, we set it to a value that provides good
+ * performance without using too much memory.
+ *
+ * When the list of persistent grants is full we clean it up using an LRU
+ * algorithm.
+ */
+
+static int xen_blkif_max_pgrants = 1056;
+module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
+MODULE_PARM_DESC(max_persistent_grants,
+                 "Maximum number of grants to map persistently");
+
+/*
+ * The LRU mechanism to clean the lists of persistent grants needs to
+ * be executed periodically. The time interval between consecutive executions
+ * of the purge mechanism is set in ms.
+ */
+#define LRU_INTERVAL 100
+
+/*
+ * When the persistent grants list is full we will remove unused grants
+ * from the list. This is the percentage of grants to be removed at each
+ * LRU execution.
+ */
+#define LRU_PERCENT_CLEAN 5
+
+/* Enables print_stats(); run-time switchable: /sys/module/xen_blkback/parameters/ */
+static unsigned int log_stats;
+module_param(log_stats, uint, 0644);
+
+#define BLKBACK_INVALID_HANDLE (~0)	/* handle value the grant-freeing paths BUG_ON() */
+
+/* Number of free pages to remove on each call to free_xenballooned_pages */
+#define NUM_BATCH_FREE_PAGES 10
+
+static inline int get_free_page(struct xen_blkif *blkif, struct page **page)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&blkif->free_pages_lock, irqflags);
+	if (!list_empty(&blkif->free_pages)) {	/* reuse a page cached in the pool */
+		BUG_ON(blkif->free_pages_num == 0);
+		*page = list_first_entry(&blkif->free_pages, struct page, lru);
+		list_del(&(*page)->lru);
+		blkif->free_pages_num--;
+		spin_unlock_irqrestore(&blkif->free_pages_lock, irqflags);
+		return 0;
+	}
+	/* Pool is empty: drop the lock and get one page from alloc_xenballooned_pages(). */
+	BUG_ON(blkif->free_pages_num != 0);
+	spin_unlock_irqrestore(&blkif->free_pages_lock, irqflags);
+	return alloc_xenballooned_pages(1, page, false);
+}
+
+static inline void put_free_pages(struct xen_blkif *blkif, struct page **page,
+				  int num)
+{
+	unsigned long irqflags;
+	int idx = 0;
+
+	spin_lock_irqsave(&blkif->free_pages_lock, irqflags);
+	while (idx < num)	/* each page goes onto the head of the pool list */
+		list_add(&page[idx++]->lru, &blkif->free_pages);
+	blkif->free_pages_num += num;	/* keep the counter in step with the list */
+	spin_unlock_irqrestore(&blkif->free_pages_lock, irqflags);
+}
+
+static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num)
+{
+	/* Pages beyond @num are released NUM_BATCH_FREE_PAGES at a time */
+	struct page *batch[NUM_BATCH_FREE_PAGES];
+	unsigned int batched = 0;
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&blkif->free_pages_lock, irqflags);
+	while (blkif->free_pages_num > num) {
+		BUG_ON(list_empty(&blkif->free_pages));
+		batch[batched] = list_first_entry(&blkif->free_pages,
+						  struct page, lru);
+		list_del(&batch[batched]->lru);
+		blkif->free_pages_num--;
+		if (++batched < NUM_BATCH_FREE_PAGES)
+			continue;	/* batch not full yet, keep collecting */
+		spin_unlock_irqrestore(&blkif->free_pages_lock, irqflags);
+		free_xenballooned_pages(batched, batch);
+		spin_lock_irqsave(&blkif->free_pages_lock, irqflags);
+		batched = 0;
+	}
+	spin_unlock_irqrestore(&blkif->free_pages_lock, irqflags);
+	if (batched > 0)
+		free_xenballooned_pages(batched, batch);
+}
+
+#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))	/* page -> pfn -> kaddr, as an integer */
+
+static int do_block_io_op(struct xen_blkif *blkif);
+static int dispatch_rw_block_io(struct xen_blkif *blkif,
+				struct blkif_request *req,
+				struct pending_req *pending_req);
+static void make_response(struct xen_blkif *blkif, u64 id,
+			  unsigned short op, int st);
+/* Walk all nodes of @rbtree; @n is fetched before the body runs, so the body may rb_erase() @pos. */
+#define foreach_grant_safe(pos, n, rbtree, node) \
+	for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \
+	     (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \
+	     &(pos)->node != NULL; \
+	     (pos) = container_of(n, typeof(*(pos)), node), \
+	     (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
+
+
+/*
+ * We don't need locking around the persistent grant helpers
+ * because blkback uses a single thread for each backend, so we
+ * can be sure that these functions will never be called recursively.
+ *
+ * The only exception to that is put_persistent_gnt, which can be called
+ * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
+ * bit operations to modify the flags of a persistent grant and to count
+ * the number of used grants.
+ */
+static int add_persistent_gnt(struct xen_blkif *blkif,
+			       struct persistent_gnt *persistent_gnt)
+{
+	struct rb_node **link = &blkif->persistent_gnts.rb_node;
+	struct rb_node *parent = NULL;
+
+	if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) {
+		if (!blkif->vbd.overflow_max_grants)
+			blkif->vbd.overflow_max_grants = 1;
+		return -EBUSY;	/* tree already holds the configured maximum */
+	}
+	/* Descend by grant reference to the empty slot for the new node. */
+	while (*link) {
+		struct persistent_gnt *cur;
+
+		parent = *link;
+		cur = container_of(parent, struct persistent_gnt, node);
+		if (persistent_gnt->gnt == cur->gnt) {
+			pr_alert_ratelimited(DRV_PFX " trying to add a gref that's already in the tree\n");
+			return -EINVAL;
+		}
+		if (persistent_gnt->gnt < cur->gnt)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	/* A newly added grant starts with only the ACTIVE flag set. */
+	bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE);
+	set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
+	rb_link_node(&persistent_gnt->node, parent, link);
+	rb_insert_color(&persistent_gnt->node, &blkif->persistent_gnts);
+	blkif->persistent_gnt_c++;
+	atomic_inc(&blkif->persistent_gnt_in_use);
+	return 0;
+}
+
+static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
+						 grant_ref_t gref)
+{
+	struct rb_node *walk = blkif->persistent_gnts.rb_node;
+	struct persistent_gnt *hit = NULL;
+
+	/* Search the tree, which is keyed on the grant reference. */
+	while (walk) {
+		hit = container_of(walk, struct persistent_gnt, node);
+		if (gref == hit->gnt)
+			break;
+		walk = (gref < hit->gnt) ? walk->rb_left : walk->rb_right;
+	}
+
+	if (!walk)
+		return NULL;	/* gref is not in the tree */
+
+	/* Found: hand it out only if it is not already marked in use. */
+	if (test_bit(PERSISTENT_GNT_ACTIVE, hit->flags)) {
+		pr_alert_ratelimited(DRV_PFX " requesting a grant already in use\n");
+		return NULL;
+	}
+	set_bit(PERSISTENT_GNT_ACTIVE, hit->flags);
+	atomic_inc(&blkif->persistent_gnt_in_use);
+	return hit;
+}
+
+static void put_persistent_gnt(struct xen_blkif *blkif,
+			       struct persistent_gnt *persistent_gnt)
+{
+	if (!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))	/* only reported; release continues */
+		pr_alert_ratelimited(DRV_PFX " freeing a grant already unused\n");
+	set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);	/* LRU hint read by purge_persistent_gnt() */
+	clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);	/* grant may be handed out again */
+	atomic_dec(&blkif->persistent_gnt_in_use);
+}
+
+static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
+				 unsigned int num)
+{
+	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct persistent_gnt *persistent_gnt;
+	struct rb_node *n;
+	int ret = 0;
+	int segs_to_unmap = 0;
+	/* Unmap and free every grant in @root; @num must equal the number of nodes. */
+	foreach_grant_safe(persistent_gnt, n, root, node) {
+		BUG_ON(persistent_gnt->handle ==
+			BLKBACK_INVALID_HANDLE);
+		gnttab_set_unmap_op(&unmap[segs_to_unmap],
+			vaddr(persistent_gnt->page),
+			GNTMAP_host_map,
+			persistent_gnt->handle);
+
+		pages[segs_to_unmap] = persistent_gnt->page;
+
+		/* Flush when the batch is full or this is the last node. */
+		if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
+			!rb_next(&persistent_gnt->node)) {
+			ret = gnttab_unmap_refs(unmap, NULL, pages,
+				segs_to_unmap);
+			BUG_ON(ret);
+			put_free_pages(blkif, pages, segs_to_unmap);
+			segs_to_unmap = 0;
+		}
+
+		rb_erase(&persistent_gnt->node, root);	/* safe: successor already held in n */
+		kfree(persistent_gnt);
+		num--;
+	}
+	BUG_ON(num != 0);	/* caller's count did not match the tree */
+}
+
+void xen_blkbk_unmap_purged_grants(struct work_struct *work)
+{
+	struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work);
+	struct gnttab_unmap_grant_ref unmap_ops[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct page *batch[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct persistent_gnt *gnt;
+	int err, queued = 0;
+
+	/* Drain the purge list, unmapping in batches of one request's segments. */
+	while (!list_empty(&blkif->persistent_purge_list)) {
+		gnt = list_first_entry(&blkif->persistent_purge_list,
+				       struct persistent_gnt, remove_node);
+		list_del(&gnt->remove_node);
+
+		gnttab_set_unmap_op(&unmap_ops[queued], vaddr(gnt->page),
+				    GNTMAP_host_map, gnt->handle);
+		batch[queued] = gnt->page;
+
+		/* Batch full: unmap it and return its pages to the free pool. */
+		if (++queued == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+			err = gnttab_unmap_refs(unmap_ops, NULL, batch,
+						queued);
+			BUG_ON(err);
+			put_free_pages(blkif, batch, queued);
+			queued = 0;
+		}
+
+		kfree(gnt);
+	}
+
+	/* Handle whatever is left in a final, partial batch. */
+	if (queued > 0) {
+		err = gnttab_unmap_refs(unmap_ops, NULL, batch, queued);
+		BUG_ON(err);
+		put_free_pages(blkif, batch, queued);
+	}
+}
+
+static void purge_persistent_gnt(struct xen_blkif *blkif)
+{
+	struct persistent_gnt *persistent_gnt;
+	struct rb_node *n;
+	unsigned int num_clean, total;
+	bool scan_used = false, clean_used = false;
+	struct rb_root *root;
+	/* LRU purge: move idle grants from the tree onto persistent_purge_list. */
+	if (blkif->persistent_gnt_c < xen_blkif_max_pgrants ||
+	    (blkif->persistent_gnt_c == xen_blkif_max_pgrants &&
+	    !blkif->vbd.overflow_max_grants)) {
+		return;	/* under the limit, or exactly at it with no refused add */
+	}
+
+	if (work_pending(&blkif->persistent_purge_work)) {
+		pr_alert_ratelimited(DRV_PFX "Scheduled work from previous purge is still pending, cannot purge list\n");
+		return;
+	}
+	/* Target: excess over the limit plus LRU_PERCENT_CLEAN percent of it, capped at the tree size. */
+	num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
+	num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
+	num_clean = min(blkif->persistent_gnt_c, num_clean);
+	if ((num_clean == 0) ||
+	    (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use))))
+		return;	/* nothing to purge, or fewer idle grants than the target */
+
+	/*
+	 * At this point, we can assure that there will be no calls
+	 * to get_persistent_gnt (because we are executing this code from
+	 * xen_blkif_schedule), there can only be calls to put_persistent_gnt,
+	 * which means that the number of currently used grants will go down,
+	 * but never up, so we will always be able to remove the requested
+	 * number of grants.
+	 */
+
+	total = num_clean;
+
+	pr_debug(DRV_PFX "Going to purge %u persistent grants\n", num_clean);
+
+	BUG_ON(!list_empty(&blkif->persistent_purge_list));
+	root = &blkif->persistent_gnts;
+purge_list:
+	foreach_grant_safe(persistent_gnt, n, root, node) {
+		BUG_ON(persistent_gnt->handle ==
+			BLKBACK_INVALID_HANDLE);
+
+		if (clean_used) {	/* final pass: only reset the LRU hint */
+			clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
+			continue;
+		}
+
+		if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
+			continue;	/* grant is currently in use */
+		if (!scan_used &&
+		    (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags)))
+			continue;	/* first pass skips grants used since the last purge */
+
+		rb_erase(&persistent_gnt->node, root);
+		list_add(&persistent_gnt->remove_node,
+		         &blkif->persistent_purge_list);
+		if (--num_clean == 0)
+			goto finished;
+	}
+	/*
+	 * If we get here it means we also need to start cleaning
+	 * grants that were used since last purge in order to cope
+	 * with the requested num
+	 */
+	if (!scan_used && !clean_used) {
+		pr_debug(DRV_PFX "Still missing %u purged frames\n", num_clean);
+		scan_used = true;
+		goto purge_list;
+	}
+finished:
+	if (!clean_used) {
+		pr_debug(DRV_PFX "Finished scanning for grants to clean, removing used flag\n");
+		clean_used = true;
+		goto purge_list;
+	}
+
+	blkif->persistent_gnt_c -= (total - num_clean);	/* grants moved to the purge list */
+	blkif->vbd.overflow_max_grants = 0;
+
+	/* We can defer this work */
+	schedule_work(&blkif->persistent_purge_work);
+	pr_debug(DRV_PFX "Purged %u/%u\n", (total - num_clean), total);
+	return;
+}
+
+/*
+ * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
+ */
+static struct pending_req *alloc_req(struct xen_blkif *blkif)
+{
+	struct list_head *pool = &blkif->pending_free;
+	struct pending_req *picked = NULL;	/* stays NULL if the pool is empty */
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&blkif->pending_free_lock, irqflags);
+	if (!list_empty(pool)) {
+		picked = list_first_entry(pool, struct pending_req, free_list);
+		list_del(&picked->free_list);
+	}
+	spin_unlock_irqrestore(&blkif->pending_free_lock, irqflags);
+	return picked;
+}
+
+/*
+ * Put a 'pending_req' back on the free pool. If the pool was empty before,
+ * wake up whoever sleeps on pending_free_wq waiting for a free request.
+ */
+static void free_req(struct xen_blkif *blkif, struct pending_req *req)
+{
+	unsigned long irqflags;
+	bool pool_was_empty;
+
+	spin_lock_irqsave(&blkif->pending_free_lock, irqflags);
+	pool_was_empty = list_empty(&blkif->pending_free);
+	list_add(&req->free_list, &blkif->pending_free);
+	spin_unlock_irqrestore(&blkif->pending_free_lock, irqflags);
+	if (pool_was_empty)
+		wake_up(&blkif->pending_free_wq);
+}
+
+/*
+ * Routines for managing virtual block devices (vbds).
+ */
+static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
+			     int operation)
+{
+	struct xen_vbd *vbd = &blkif->vbd;
+
+	/* Anything other than a read is refused on a read-only device. */
+	if (vbd->readonly && (operation != READ))
+		return -EACCES;
+
+	/* A zero-length request has no extent to validate. */
+	if (likely(req->nr_sects)) {
+		blkif_sector_t last = req->sector_number + req->nr_sects;
+
+		/* Reject a wrapped sum and any range running past the device. */
+		if (unlikely(last < req->sector_number))
+			return -EACCES;
+		if (unlikely(last > vbd_sz(vbd)))
+			return -EACCES;
+	}
+
+	/* Checks passed: point the request at the backing device. */
+	req->dev  = vbd->pdevice;
+	req->bdev = vbd->bdev;
+	return 0;
+}
+
+static void xen_vbd_resize(struct xen_blkif *blkif)
+{
+	struct xen_vbd *vbd = &blkif->vbd;
+	struct xenbus_transaction xbt;
+	int err;
+	struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
+	unsigned long long new_size = vbd_sz(vbd);
+	/* Record the new device size and write "sectors" and "state" in one xenbus transaction. */
+	pr_info(DRV_PFX "VBD Resize: Domid: %d, Device: (%d, %d)\n",
+		blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
+	pr_info(DRV_PFX "VBD Resize: new size %llu\n", new_size);
+	vbd->size = new_size;	/* xen_blkif_schedule() compares this against vbd_sz() */
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		pr_warn(DRV_PFX "Error starting transaction\n");
+		return;
+	}
+	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
+			    (unsigned long long)vbd_sz(vbd));
+	if (err) {
+		pr_warn(DRV_PFX "Error writing new size\n");
+		goto abort;
+	}
+	/*
+	 * Write the current state; we will use this to synchronize
+	 * the front-end. If the current state is "connected" the
+	 * front-end will get the new size information online.
+	 */
+	err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
+	if (err) {
+		pr_warn(DRV_PFX "Error writing the state\n");
+		goto abort;
+	}
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == -EAGAIN)
+		goto again;	/* redo the whole transaction from the start */
+	if (err)
+		pr_warn(DRV_PFX "Error ending transaction\n");
+	return;
+abort:
+	xenbus_transaction_end(xbt, 1);	/* second argument 1: abort instead of commit */
+}
+
+/*
+ * Notification from the guest OS.
+ */
+static void blkif_notify_work(struct xen_blkif *blkif)
+{
+	blkif->waiting_reqs = 1;	/* condition xen_blkif_schedule() waits for on blkif->wq */
+	wake_up(&blkif->wq);
+}
+
+irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
+{
+	blkif_notify_work(dev_id);	/* dev_id is passed on as the struct xen_blkif; irq is unused */
+	return IRQ_HANDLED;
+}
+
+/*
+ * SCHEDULER FUNCTIONS
+ */
+
+static void print_stats(struct xen_blkif *blkif)
+{
+	pr_info("xen-blkback (%s): oo %3llu  |  rd %4llu  |  wr %4llu  |  f %4llu"
+		 "  |  ds %4llu | pg: %4u/%4d\n",
+		 current->comm, blkif->st_oo_req, blkif->st_rd_req,
+		 blkif->st_wr_req, blkif->st_f_req, blkif->st_ds_req,
+		 blkif->persistent_gnt_c, xen_blkif_max_pgrants);
+	/* Zero the per-interval counters. NOTE(review): st_f_req is not reset - confirm intended. */
+	blkif->st_oo_req = 0;
+	blkif->st_rd_req = 0;
+	blkif->st_wr_req = 0;
+	blkif->st_ds_req = 0;
+	/* The next report is due ten seconds from now. */
+	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+}
+
+int xen_blkif_schedule(void *arg)
+{
+	struct xen_blkif *blkif = arg;
+	struct xen_vbd *vbd = &blkif->vbd;
+	unsigned long timeout;
+	int ret;
+
+	xen_blkif_get(blkif); /* dropped by the xen_blkif_put() before returning */
+
+	while (!kthread_should_stop()) {
+		if (try_to_freeze())
+			continue;
+		if (unlikely(vbd->size != vbd_sz(vbd)))
+			xen_vbd_resize(blkif);
+
+		timeout = msecs_to_jiffies(LRU_INTERVAL);
+
+		timeout = wait_event_interruptible_timeout(
+			blkif->wq,
+			blkif->waiting_reqs || kthread_should_stop(),
+			timeout);
+		if (timeout == 0) /* timed out: only run the housekeeping below */
+			goto purge_gnt_list;
+		timeout = wait_event_interruptible_timeout(
+			blkif->pending_free_wq,
+			!list_empty(&blkif->pending_free) ||
+			kthread_should_stop(),
+			timeout);
+		if (timeout == 0)
+			goto purge_gnt_list;
+
+		blkif->waiting_reqs = 0;
+		smp_mb(); /* clear flag *before* checking for work */
+
+		ret = do_block_io_op(blkif);
+		if (ret > 0) /* work is left over: keep the flag set for the next pass */
+			blkif->waiting_reqs = 1;
+		if (ret == -EACCES) /* ring processing was halted: idle until stopped */
+			wait_event_interruptible(blkif->shutdown_wq,
+						 kthread_should_stop());
+
+purge_gnt_list:
+		if (blkif->vbd.feature_gnt_persistent &&
+		    time_after(jiffies, blkif->next_lru)) {
+			purge_persistent_gnt(blkif);
+			blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
+		}
+
+		/* Shrink the pool if it holds more than xen_blkif_max_buffer_pages */
+		shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
+
+		if (log_stats && time_after(jiffies, blkif->st_print))
+			print_stats(blkif);
+	}
+
+	/* Drain pending purge work before the final statistics and the put */
+	flush_work(&blkif->persistent_purge_work);
+
+	if (log_stats)
+		print_stats(blkif);
+
+	blkif->xenblkd = NULL;
+	xen_blkif_put(blkif);
+
+	return 0;
+}
+
+/*
+ * Free every persistent grant and empty the pool of free pages.
+ */
+void xen_blkbk_free_caches(struct xen_blkif *blkif)
+{
+	/* Free all persistent grant pages; the tree must be empty afterwards */
+	if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
+		free_persistent_gnts(blkif, &blkif->persistent_gnts,
+			blkif->persistent_gnt_c);
+
+	BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
+	blkif->persistent_gnt_c = 0;
+
+	/* Since we are shutting down, remove all pages from the buffer */
+	shrink_free_pagepool(blkif, 0 /* All */);
+}
+
+/*
+ * Unmap the grant references of 'pages' (and with them the M2P over-rides);
+ * persistent grants are only put back, the rest is unmapped in batches.
+ */
+static void xen_blkbk_unmap(struct xen_blkif *blkif,
+                            struct grant_page *pages[],
+                            int num)
+{
+	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	unsigned int i, invcount = 0;
+	int ret;
+
+	for (i = 0; i < num; i++) {
+		if (pages[i]->persistent_gnt != NULL) {
+			put_persistent_gnt(blkif, pages[i]->persistent_gnt);
+			continue;
+		}
+		if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
+			continue; /* nothing is mapped for this entry */
+		unmap_pages[invcount] = pages[i]->page;
+		gnttab_set_unmap_op(&unmap[invcount], vaddr(pages[i]->page),
+				    GNTMAP_host_map, pages[i]->handle);
+		pages[i]->handle = BLKBACK_INVALID_HANDLE;
+		if (++invcount == BLKIF_MAX_SEGMENTS_PER_REQUEST) { /* batch is full */
+			ret = gnttab_unmap_refs(unmap, NULL, unmap_pages,
+			                        invcount);
+			BUG_ON(ret);
+			put_free_pages(blkif, unmap_pages, invcount);
+			invcount = 0;
+		}
+	}
+	if (invcount) { /* flush the last, partial batch */
+		ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
+		BUG_ON(ret);
+		put_free_pages(blkif, unmap_pages, invcount);
+	}
+}
+
+static int xen_blkbk_map(struct xen_blkif *blkif,
+			 struct grant_page *pages[],
+			 int num, bool ro)
+{
+	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct persistent_gnt *persistent_gnt = NULL;
+	phys_addr_t addr = 0;
+	int i, seg_idx, new_map_idx;
+	int segs_to_map = 0;
+	int ret = 0;
+	int last_map = 0, map_until = 0;
+	int use_persistent_gnts;
+
+	use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
+
+	/*
+	 * For each page either reuse an already mapped persistent grant or
+	 * take a free page and set up map[..] with its address and the
+	 * corresponding grant reference.
+	 */
+again:
+	for (i = map_until; i < num; i++) {
+		uint32_t flags;
+
+		if (use_persistent_gnts)
+			persistent_gnt = get_persistent_gnt(
+				blkif,
+				pages[i]->gref);
+
+		if (persistent_gnt) {
+			/*
+			 * We are using persistent grants and
+			 * the grant is already mapped
+			 */
+			pages[i]->page = persistent_gnt->page;
+			pages[i]->persistent_gnt = persistent_gnt;
+		} else {
+			if (get_free_page(blkif, &pages[i]->page))
+				goto out_of_memory;
+			addr = vaddr(pages[i]->page);
+			pages_to_gnt[segs_to_map] = pages[i]->page;
+			pages[i]->persistent_gnt = NULL;
+			flags = GNTMAP_host_map;
+			if (!use_persistent_gnts && ro)
+				flags |= GNTMAP_readonly;
+			gnttab_set_map_op(&map[segs_to_map++], addr,
+					  flags, pages[i]->gref,
+					  blkif->domid);
+		}
+		map_until = i + 1;
+		if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
+			break;
+	}
+
+	if (segs_to_map) {
+		ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
+		BUG_ON(ret);
+	}
+
+	/*
+	 * Walk the segments handled in this pass: record the handle of every
+	 * newly mapped grant (or mark it invalid on failure) and, if there is
+	 * room, remember the grant as a persistent one.
+	 */
+	for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
+		if (!pages[seg_idx]->persistent_gnt) {
+			/* This is a newly mapped grant */
+			BUG_ON(new_map_idx >= segs_to_map);
+			if (unlikely(map[new_map_idx].status != 0)) {
+				pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
+				pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
+				ret |= 1;
+				goto next;
+			}
+			pages[seg_idx]->handle = map[new_map_idx].handle;
+		} else {
+			continue;
+		}
+		if (use_persistent_gnts &&
+		    blkif->persistent_gnt_c < xen_blkif_max_pgrants) {
+			/*
+			 * We are using persistent grants, the grant is
+			 * not mapped but we might have room for it.
+			 */
+			persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
+				                 GFP_KERNEL);
+			if (!persistent_gnt) {
+				/*
+				 * If we don't have enough memory to
+				 * allocate the persistent_gnt struct
+				 * map this grant non-persistently
+				 */
+				goto next;
+			}
+			persistent_gnt->gnt = map[new_map_idx].ref;
+			persistent_gnt->handle = map[new_map_idx].handle;
+			persistent_gnt->page = pages[seg_idx]->page;
+			if (add_persistent_gnt(blkif,
+			                       persistent_gnt)) {
+				kfree(persistent_gnt);
+				persistent_gnt = NULL;
+				goto next;
+			}
+			pages[seg_idx]->persistent_gnt = persistent_gnt;
+			pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
+				 persistent_gnt->gnt, blkif->persistent_gnt_c,
+				 xen_blkif_max_pgrants);
+			goto next;
+		}
+		if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
+			blkif->vbd.overflow_max_grants = 1;
+			pr_debug(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
+			         blkif->domid, blkif->vbd.handle);
+		}
+		/*
+		 * We could not map this grant persistently, so use it as
+		 * a non-persistent grant.
+		 */
+next:
+		new_map_idx++;
+	}
+	segs_to_map = 0;
+	last_map = map_until;
+	if (map_until != num)
+		goto again;
+
+	return ret;
+
+out_of_memory:
+	pr_alert(DRV_PFX "%s: out of memory\n", __func__);
+	put_free_pages(blkif, pages_to_gnt, segs_to_map);
+	return -ENOMEM;
+}
+
+static int xen_blkbk_map_seg(struct pending_req *pending_req)
+{
+	/* Request read-only mappings for everything except BLKIF_OP_READ. */
+	bool ro = pending_req->operation != BLKIF_OP_READ;
+
+	/* Map the grants of all nr_pages data segments of the request. */
+	return xen_blkbk_map(pending_req->blkif,
+			     pending_req->segments,
+			     pending_req->nr_pages, ro);
+}
+
+static int xen_blkbk_parse_indirect(struct blkif_request *req,
+				    struct pending_req *pending_req,
+				    struct seg_buf seg[],
+				    struct phys_req *preq)
+{
+	struct grant_page **pages = pending_req->indirect_pages;
+	struct xen_blkif *blkif = pending_req->blkif;
+	int indirect_grefs, rc, n, nseg, i, first_sect, last_sect;
+	struct blkif_request_segment *segments = NULL;
+
+	nseg = pending_req->nr_pages;
+	indirect_grefs = INDIRECT_PAGES(nseg);
+	BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
+	for (i = 0; i < indirect_grefs; i++)
+		pages[i]->gref = req->u.indirect.indirect_grefs[i];
+
+	rc = xen_blkbk_map(blkif, pages, indirect_grefs, true);
+	if (rc)
+		goto unmap;
+
+	for (n = 0, i = 0; n < nseg; n++) {
+		if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
+			/* Map indirect segments */
+			if (segments)
+				kunmap_atomic(segments);
+			segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
+		}
+		i = n % SEGS_PER_INDIRECT_FRAME;
+		pending_req->segments[n]->gref = segments[i].gref;
+		/* Granted page: fetch once so the checked values are the used ones. */
+		first_sect = ACCESS_ONCE(segments[i].first_sect);
+		last_sect = ACCESS_ONCE(segments[i].last_sect);
+		if (last_sect >= (PAGE_SIZE >> 9) || last_sect < first_sect) {
+			rc = -EINVAL;
+			goto unmap;
+		}
+		seg[n].nsec = last_sect - first_sect + 1;
+		seg[n].offset = first_sect << 9;
+		preq->nr_sects += seg[n].nsec;
+	}
+
+unmap:
+	if (segments)
+		kunmap_atomic(segments);
+	xen_blkbk_unmap(blkif, pages, indirect_grefs);
+	return rc;
+}
+
+static int dispatch_discard_io(struct xen_blkif *blkif,
+				struct blkif_request *req)
+{
+	int err = 0;
+	int status = BLKIF_RSP_OKAY;
+	struct block_device *bdev = blkif->vbd.bdev;
+	unsigned long secure;
+	struct phys_req preq;
+
+	xen_blkif_get(blkif); /* dropped again once the response has been made */
+
+	preq.sector_number = req->u.discard.sector_number;
+	preq.nr_sects      = req->u.discard.nr_sectors;
+
+	err = xen_vbd_translate(&preq, blkif, WRITE); /* checked like a write */
+	if (err) {
+		pr_warn(DRV_PFX "access denied: DISCARD [%llu->%llu] on dev=%04x\n",
+			preq.sector_number,
+			preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
+		goto fail_response;
+	}
+	blkif->st_ds_req++;
+
+	secure = (blkif->vbd.discard_secure &&
+		 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
+		 BLKDEV_DISCARD_SECURE : 0;
+
+	err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
+				   req->u.discard.nr_sectors,
+				   GFP_KERNEL, secure);
+fail_response: /* also reached on success, with err == 0 */
+	if (err == -EOPNOTSUPP) {
+		pr_debug(DRV_PFX "discard op failed, not supported\n");
+		status = BLKIF_RSP_EOPNOTSUPP;
+	} else if (err)
+		status = BLKIF_RSP_ERROR;
+
+	make_response(blkif, req->u.discard.id, req->operation, status);
+	xen_blkif_put(blkif);
+	return err;
+}
+
+static int dispatch_other_io(struct xen_blkif *blkif,
+			     struct blkif_request *req,
+			     struct pending_req *pending_req)
+{
+	free_req(blkif, pending_req); /* the request slot is not used for this op */
+	make_response(blkif, req->u.other.id, req->operation,
+		      BLKIF_RSP_EOPNOTSUPP);
+	return -EIO; /* non-zero: __do_block_io_op() leaves its loop */
+}
+
+static void xen_blk_drain_io(struct xen_blkif *blkif)
+{
+	atomic_set(&blkif->drain, 1); /* makes __end_block_io_op() signal drain_complete */
+	do {
+		if (atomic_read(&blkif->inflight) == 0)
+			break;
+		wait_for_completion_interruptible_timeout(
+				&blkif->drain_complete, HZ); /* re-check after at most HZ jiffies */
+
+		if (!atomic_read(&blkif->drain)) /* NOTE(review): cleared elsewhere? confirm */
+			break;
+	} while (!kthread_should_stop());
+	atomic_set(&blkif->drain, 0);
+}
+
+/*
+ * Completion handling for one bio of 'pending_req' (see end_block_io_op()).
+ */
+
+static void __end_block_io_op(struct pending_req *pending_req, int error)
+{
+	/* An error fails the entire request. */
+	if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
+	    (error == -EOPNOTSUPP)) {
+		pr_debug(DRV_PFX "flush diskcache op failed, not supported\n");
+		xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0);
+		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
+	} else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
+		    (error == -EOPNOTSUPP)) {
+		pr_debug(DRV_PFX "write barrier op failed, not supported\n");
+		xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0);
+		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
+	} else if (error) {
+		pr_debug(DRV_PFX "Buffer not up-to-date at end of operation,"
+			 " error=%d\n", error);
+		pending_req->status = BLKIF_RSP_ERROR;
+	}
+
+	/*
+	 * If all of the bio's have completed it is time to unmap
+	 * the grant references associated with 'request' and provide
+	 * the proper response on the ring.
+	 */
+	if (atomic_dec_and_test(&pending_req->pendcnt)) {
+		struct xen_blkif *blkif = pending_req->blkif;
+
+		xen_blkbk_unmap(blkif,
+		                pending_req->segments,
+		                pending_req->nr_pages);
+		make_response(blkif, pending_req->id,
+			      pending_req->operation, pending_req->status);
+		free_req(blkif, pending_req);
+		/*
+		 * Make sure the request is freed before releasing blkif,
+		 * or there could be a race between free_req and the
+		 * cleanup done in xen_blkif_free during shutdown.
+		 *
+		 * NB: The fact that we might try to wake up pending_free_wq
+		 * before drain_complete (in case there's a drain going on)
+		 * is not a problem with our current implementation
+		 * because we can assure there's no thread waiting on
+		 * pending_free_wq if there's a drain going on, but it has
+		 * to be taken into account if the current model is changed.
+		 */
+		if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) {
+			complete(&blkif->drain_complete);
+		}
+		xen_blkif_put(blkif);
+	}
+}
+
+/*
+ * bio callback (bi_end_io): handle the completion, then drop the bio.
+ */
+static void end_block_io_op(struct bio *bio, int error)
+{
+	__end_block_io_op(bio->bi_private, error);
+	bio_put(bio);
+}
+
+
+
+/*
+ * Function to copy the 'struct blkif_request' from the ring buffer
+ * (which has the sectors we want, number of them, grant references, etc),
+ * and transmute it to the block API to hand it over to the proper block disk.
+ */
+static int
+__do_block_io_op(struct xen_blkif *blkif)
+{
+	union blkif_back_rings *blk_rings = &blkif->blk_rings;
+	struct blkif_request req;
+	struct pending_req *pending_req;
+	RING_IDX rc, rp;
+	int more_to_do = 0;
+
+	rc = blk_rings->common.req_cons;
+	rp = blk_rings->common.sring->req_prod;
+	rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+	if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
+		rc = blk_rings->common.rsp_prod_pvt;
+		pr_warn(DRV_PFX "Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
+			rp, rc, rp - rc, blkif->vbd.pdevice);
+		return -EACCES;
+	}
+	while (rc != rp) {
+
+		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
+			break;
+
+		if (kthread_should_stop()) {
+			more_to_do = 1;
+			break;
+		}
+
+		pending_req = alloc_req(blkif);
+		if (NULL == pending_req) {
+			blkif->st_oo_req++;
+			more_to_do = 1;
+			break;
+		}
+
+		switch (blkif->blk_protocol) {
+		case BLKIF_PROTOCOL_NATIVE:
+			memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
+			break;
+		case BLKIF_PROTOCOL_X86_32:
+			blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
+			break;
+		case BLKIF_PROTOCOL_X86_64:
+			blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
+			break;
+		default:
+			BUG();
+		}
+		blk_rings->common.req_cons = ++rc; /* before make_response() */
+
+		/* Apply all sanity checks to /private copy/ of request. */
+		barrier();
+
+		switch (req.operation) {
+		case BLKIF_OP_READ:
+		case BLKIF_OP_WRITE:
+		case BLKIF_OP_WRITE_BARRIER:
+		case BLKIF_OP_FLUSH_DISKCACHE:
+		case BLKIF_OP_INDIRECT:
+			if (dispatch_rw_block_io(blkif, &req, pending_req))
+				goto done;
+			break;
+		case BLKIF_OP_DISCARD:
+			free_req(blkif, pending_req);
+			if (dispatch_discard_io(blkif, &req))
+				goto done;
+			break;
+		default:
+			if (dispatch_other_io(blkif, &req, pending_req))
+				goto done;
+			break;
+		}
+
+		/* Yield point for this unbounded loop. */
+		cond_resched();
+	}
+done:
+	return more_to_do;
+}
+
+static int
+do_block_io_op(struct xen_blkif *blkif)
+{
+	union blkif_back_rings *blk_rings = &blkif->blk_rings;
+	int more_to_do;
+
+	do {
+		more_to_do = __do_block_io_op(blkif);
+		if (more_to_do) /* work left over or -EACCES: hand it to the caller */
+			break;
+
+		RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
+	} while (more_to_do);
+
+	return more_to_do;
+}
+/*
+ * Turn a 'struct blkif_request' into one or more 'struct bio' and pass them
+ * to the underlying storage with submit_bio(). Returns 0, or -EIO on failure.
+ */
+static int dispatch_rw_block_io(struct xen_blkif *blkif,
+				struct blkif_request *req,
+				struct pending_req *pending_req)
+{
+	struct phys_req preq;
+	struct seg_buf *seg = pending_req->seg;
+	unsigned int nseg;
+	struct bio *bio = NULL;
+	struct bio **biolist = pending_req->biolist;
+	int i, nbio = 0;
+	int operation;
+	struct blk_plug plug;
+	bool drain = false;
+	struct grant_page **pages = pending_req->segments;
+	unsigned short req_operation;
+
+	req_operation = req->operation == BLKIF_OP_INDIRECT ?
+			req->u.indirect.indirect_op : req->operation;
+	if ((req->operation == BLKIF_OP_INDIRECT) &&
+	    (req_operation != BLKIF_OP_READ) &&
+	    (req_operation != BLKIF_OP_WRITE)) {
+		pr_debug(DRV_PFX "Invalid indirect operation (%u)\n",
+			 req_operation);
+		goto fail_response;
+	}
+
+	switch (req_operation) {
+	case BLKIF_OP_READ:
+		blkif->st_rd_req++;
+		operation = READ;
+		break;
+	case BLKIF_OP_WRITE:
+		blkif->st_wr_req++;
+		operation = WRITE_ODIRECT;
+		break;
+	case BLKIF_OP_WRITE_BARRIER:
+		drain = true; /* fall through */
+	case BLKIF_OP_FLUSH_DISKCACHE:
+		blkif->st_f_req++;
+		operation = WRITE_FLUSH;
+		break;
+	default:
+		operation = 0; /* make gcc happy */
+		goto fail_response;
+		break;
+	}
+
+	/* Check that the number of segments is sane. */
+	nseg = req->operation == BLKIF_OP_INDIRECT ?
+	       req->u.indirect.nr_segments : req->u.rw.nr_segments;
+
+	if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
+	    unlikely((req->operation != BLKIF_OP_INDIRECT) &&
+		     (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
+	    unlikely((req->operation == BLKIF_OP_INDIRECT) &&
+		     (nseg > MAX_INDIRECT_SEGMENTS))) {
+		pr_debug(DRV_PFX "Bad number of segments in request (%u)\n",
+			 nseg);
+		/* Haven't submitted any bio's yet. */
+		goto fail_response;
+	}
+
+	preq.nr_sects      = 0;
+
+	pending_req->blkif     = blkif;
+	pending_req->id        = req->u.rw.id;
+	pending_req->operation = req_operation;
+	pending_req->status    = BLKIF_RSP_OKAY;
+	pending_req->nr_pages  = nseg;
+
+	if (req->operation != BLKIF_OP_INDIRECT) {
+		preq.dev               = req->u.rw.handle;
+		preq.sector_number     = req->u.rw.sector_number;
+		for (i = 0; i < nseg; i++) {
+			pages[i]->gref = req->u.rw.seg[i].gref;
+			seg[i].nsec = req->u.rw.seg[i].last_sect -
+				req->u.rw.seg[i].first_sect + 1;
+			seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
+			if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
+			    (req->u.rw.seg[i].last_sect <
+			     req->u.rw.seg[i].first_sect))
+				goto fail_response;
+			preq.nr_sects += seg[i].nsec;
+		}
+	} else {
+		preq.dev               = req->u.indirect.handle;
+		preq.sector_number     = req->u.indirect.sector_number;
+		if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
+			goto fail_response;
+	}
+
+	if (xen_vbd_translate(&preq, blkif, operation) != 0) {
+		pr_debug(DRV_PFX "access denied: %s of [%llu,%llu] on dev=%04x\n",
+			 operation == READ ? "read" : "write",
+			 preq.sector_number,
+			 preq.sector_number + preq.nr_sects,
+			 blkif->vbd.pdevice);
+		goto fail_response;
+	}
+
+	/*
+	 * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
+	 * is set there.
+	 */
+	for (i = 0; i < nseg; i++) {
+		if (((int)preq.sector_number|(int)seg[i].nsec) &
+		    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
+			pr_debug(DRV_PFX "Misaligned I/O request from domain %d\n",
+				 blkif->domid);
+			goto fail_response;
+		}
+	}
+
+	/* Wait on all outstanding I/O's and once that has been completed
+	 * issue the WRITE_FLUSH.
+	 */
+	if (drain)
+		xen_blk_drain_io(pending_req->blkif);
+
+	/*
+	 * If we have failed at this point, we need to undo the M2P override,
+	 * set gnttab_set_unmap_op on all of the grant references and perform
+	 * the hypercall to unmap the grants - that is all done in
+	 * xen_blkbk_unmap.
+	 */
+	if (xen_blkbk_map_seg(pending_req))
+		goto fail_flush;
+
+	/*
+	 * The corresponding xen_blkif_put (and the inflight decrement) is
+	 * done in __end_block_io_op once the last bio has completed.
+	 */
+	xen_blkif_get(blkif);
+	atomic_inc(&blkif->inflight);
+
+	for (i = 0; i < nseg; i++) {
+		while ((bio == NULL) ||
+		       (bio_add_page(bio,
+				     pages[i]->page,
+				     seg[i].nsec << 9,
+				     seg[i].offset) == 0)) {
+
+			int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES);
+			bio = bio_alloc(GFP_KERNEL, nr_iovecs);
+			if (unlikely(bio == NULL))
+				goto fail_put_bio;
+
+			biolist[nbio++] = bio;
+			bio->bi_bdev    = preq.bdev;
+			bio->bi_private = pending_req;
+			bio->bi_end_io  = end_block_io_op;
+			bio->bi_iter.bi_sector  = preq.sector_number;
+		}
+
+		preq.sector_number += seg[i].nsec;
+	}
+
+	/* This will be hit if the operation was a flush without segments. */
+	if (!bio) {
+		BUG_ON(operation != WRITE_FLUSH);
+
+		bio = bio_alloc(GFP_KERNEL, 0);
+		if (unlikely(bio == NULL))
+			goto fail_put_bio;
+
+		biolist[nbio++] = bio;
+		bio->bi_bdev    = preq.bdev;
+		bio->bi_private = pending_req;
+		bio->bi_end_io  = end_block_io_op;
+	}
+
+	atomic_set(&pending_req->pendcnt, nbio);
+	blk_start_plug(&plug);
+
+	for (i = 0; i < nbio; i++)
+		submit_bio(operation, biolist[i]);
+
+	/* Let the I/Os go.. */
+	blk_finish_plug(&plug);
+
+	if (operation == READ)
+		blkif->st_rd_sect += preq.nr_sects;
+	else if (operation & WRITE)
+		blkif->st_wr_sect += preq.nr_sects;
+
+	return 0;
+
+ fail_flush:
+	xen_blkbk_unmap(blkif, pending_req->segments,
+	                pending_req->nr_pages);
+ fail_response:
+	/* Haven't submitted any bio's yet. */
+	make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
+	free_req(blkif, pending_req);
+	msleep(1); /* back off a bit */
+	return -EIO;
+
+ fail_put_bio:
+	for (i = 0; i < nbio; i++)
+		bio_put(biolist[i]);
+	atomic_set(&pending_req->pendcnt, 1);
+	__end_block_io_op(pending_req, -EINVAL);
+	msleep(1); /* back off a bit */
+	return -EIO;
+}
+
+
+
+/*
+ * Queue a response on the ring telling the frontend how the operation fared.
+ */
+static void make_response(struct xen_blkif *blkif, u64 id,
+			  unsigned short op, int st)
+{
+	struct blkif_response  resp;
+	unsigned long     flags;
+	union blkif_back_rings *blk_rings = &blkif->blk_rings;
+	int notify;
+
+	resp.id        = id;
+	resp.operation = op;
+	resp.status    = st;
+
+	spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+	/* Place on the response ring for the relevant domain. */
+	switch (blkif->blk_protocol) {
+	case BLKIF_PROTOCOL_NATIVE:
+		memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
+		       &resp, sizeof(resp));
+		break;
+	case BLKIF_PROTOCOL_X86_32:
+		memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
+		       &resp, sizeof(resp));
+		break;
+	case BLKIF_PROTOCOL_X86_64:
+		memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
+		       &resp, sizeof(resp));
+		break;
+	default:
+		BUG();
+	}
+	blk_rings->common.rsp_prod_pvt++;
+	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
+	spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+	if (notify) /* the event is sent after the ring lock has been released */
+		notify_remote_via_irq(blkif->irq);
+}
+
+static int __init xen_blkif_init(void)
+{
+	int rc = 0;
+
+	if (!xen_domain()) /* the backend is only registered when running on Xen */
+		return -ENODEV;
+
+	rc = xen_blkif_interface_init();
+	if (rc)
+		goto failed_init;
+
+	rc = xen_blkif_xenbus_init();
+	if (rc)
+		goto failed_init;
+
+ failed_init: /* shared exit: rc is 0 on success, else the first error */
+	return rc;
+}
+
+module_init(xen_blkif_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("xen-backend:vbd");
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
new file mode 100644
index 00000000000..f65b807e323
--- /dev/null
+++ b/drivers/block/xen-blkback/common.h
@@ -0,0 +1,486 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_BLKIF__BACKEND__COMMON_H__
+#define __XEN_BLKIF__BACKEND__COMMON_H__
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <linux/io.h>
+#include <linux/rbtree.h>
+#include <asm/setup.h>
+#include <asm/pgalloc.h>
+#include <asm/hypervisor.h>
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/ring.h>
+#include <xen/interface/io/blkif.h>
+#include <xen/interface/io/protocols.h>
+
+/* Prefix placed in front of this driver's log messages. */
+#define DRV_PFX "xen-blkback:"
+/*
+ * Debug print through pr_debug().  The macro prepends the driver prefix,
+ * the calling function and the line number, and appends ".\n" to the
+ * format itself, so callers pass a format without a trailing full stop
+ * or newline.
+ */
+#define DPRINTK(fmt, args...)				\
+	pr_debug(DRV_PFX "(%s:%d) " fmt ".\n",		\
+		 __func__, __LINE__, ##args)
+
+
+/*
+ * This is the maximum number of segments that would be allowed in indirect
+ * requests. This value will also be passed to the frontend.
+ */
+#define MAX_INDIRECT_SEGMENTS 256
+
+/* Number of segment descriptors that fit in one page. */
+#define SEGS_PER_INDIRECT_FRAME \
+	(PAGE_SIZE/sizeof(struct blkif_request_segment))
+/* Pages needed to hold MAX_INDIRECT_SEGMENTS descriptors (rounded up). */
+#define MAX_INDIRECT_PAGES \
+	((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
+/*
+ * Pages needed to hold _segs descriptors (rounded up).  The argument is
+ * parenthesised so that an expression such as "a ? b : c" or "x << 1"
+ * passed as _segs is not re-associated with the "+" that follows it.
+ */
+#define INDIRECT_PAGES(_segs) \
+	(((_segs) + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
+
+/*
+ * Not a real protocol.  Used to generate ring structs which contain
+ * the elements common to all protocols only.  This way we get a
+ * compiler-checkable way to use common struct elements, so we can
+ * avoid using switch(protocol) in a number of places.
+ *
+ * The single dummy member only exists so that the structs are not empty;
+ * no code in this header reads it.
+ */
+struct blkif_common_request {
+	char dummy;
+};
+struct blkif_common_response {
+	char dummy;
+};
+
+/*
+ * i386 protocol version: read/write style request body.  Packed, so the
+ * members follow each other without compiler-inserted padding.
+ */
+struct blkif_x86_32_request_rw {
+	uint8_t        nr_segments;  /* number of segments                   */
+	blkif_vdev_t   handle;       /* only for read/write requests         */
+	uint64_t       id;           /* private guest value, echoed in resp  */
+	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+	struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+} __attribute__((__packed__));
+
+/* i386 protocol version: discard request body (packed). */
+struct blkif_x86_32_request_discard {
+	uint8_t        flag;         /* BLKIF_DISCARD_SECURE or zero         */
+	blkif_vdev_t   _pad1;        /* was "handle" for read/write requests */
+	uint64_t       id;           /* private guest value, echoed in resp  */
+	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+	uint64_t       nr_sectors;   /* number of sectors to discard         */
+} __attribute__((__packed__));
+
+/*
+ * i386 protocol version: minimal view of a request (packed).  It exposes
+ * only the id, which blkif_get_x86_32_req() copies for operations it does
+ * not know how to translate.
+ */
+struct blkif_x86_32_request_other {
+	uint8_t        _pad1;
+	blkif_vdev_t   _pad2;
+	uint64_t       id;           /* private guest value, echoed in resp  */
+} __attribute__((__packed__));
+
+/*
+ * i386 protocol version: indirect request body (packed).  The segment
+ * descriptors are not carried in the request itself; indirect_grefs
+ * holds grant references instead.
+ */
+struct blkif_x86_32_request_indirect {
+	uint8_t        indirect_op;  /* operation carried by this request    */
+	uint16_t       nr_segments;  /* number of indirect segments          */
+	uint64_t       id;           /* private guest value, echoed in resp  */
+	blkif_sector_t sector_number;/* start sector idx on disk             */
+	blkif_vdev_t   handle;
+	uint16_t       _pad1;
+	grant_ref_t    indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
+	/*
+	 * The maximum number of indirect segments (and pages) that will
+	 * be used is determined by MAX_INDIRECT_SEGMENTS, this value
+	 * is also exported to the guest (via xenstore
+	 * feature-max-indirect-segments entry), so the frontend knows how
+	 * many indirect segments the backend supports.
+	 */
+	uint64_t       _pad2;        /* make it 64 byte aligned */
+} __attribute__((__packed__));
+
+/*
+ * i386 protocol version: complete request.  The operation code selects
+ * which member of the union describes the rest of the request.
+ */
+struct blkif_x86_32_request {
+	uint8_t        operation;    /* BLKIF_OP_???                         */
+	union {
+		struct blkif_x86_32_request_rw rw;
+		struct blkif_x86_32_request_discard discard;
+		struct blkif_x86_32_request_other other;
+		struct blkif_x86_32_request_indirect indirect;
+	} u;
+} __attribute__((__packed__));
+
+/*
+ * i386 protocol version: response.  pack(4) caps member alignment at
+ * 4 bytes for this struct only; the previous packing is restored by the
+ * pop below.
+ */
+#pragma pack(push, 4)
+struct blkif_x86_32_response {
+	uint64_t        id;              /* copied from request */
+	uint8_t         operation;       /* copied from request */
+	int16_t         status;          /* BLKIF_RSP_???       */
+};
+#pragma pack(pop)
+/* x86_64 protocol version */
+
+/*
+ * Read/write style request body (packed).  Compared with the i386
+ * variant it carries an explicit 32-bit pad in front of the id.
+ */
+struct blkif_x86_64_request_rw {
+	uint8_t        nr_segments;  /* number of segments                   */
+	blkif_vdev_t   handle;       /* only for read/write requests         */
+	uint32_t       _pad1;        /* offsetof(blkif_request..,u.rw.id)==8 */
+	uint64_t       id;           /* private guest value, echoed in resp  */
+	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+	struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+} __attribute__((__packed__));
+
+/* x86_64 protocol version: discard request body (packed). */
+struct blkif_x86_64_request_discard {
+	uint8_t        flag;         /* BLKIF_DISCARD_SECURE or zero         */
+	blkif_vdev_t   _pad1;        /* was "handle" for read/write requests */
+        uint32_t       _pad2;        /* offsetof(blkif_..,u.discard.id)==8   */
+	uint64_t       id;           /* private guest value, echoed in resp  */
+	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+	uint64_t       nr_sectors;   /* number of sectors to discard         */
+} __attribute__((__packed__));
+
+/*
+ * x86_64 protocol version: minimal view of a request (packed).  It
+ * exposes only the id, which blkif_get_x86_64_req() copies for operations
+ * it does not know how to translate.
+ */
+struct blkif_x86_64_request_other {
+	uint8_t        _pad1;
+	blkif_vdev_t   _pad2;
+	uint32_t       _pad3;        /* offsetof(blkif_..,u.other.id)==8     */
+	uint64_t       id;           /* private guest value, echoed in resp  */
+} __attribute__((__packed__));
+
+/*
+ * x86_64 protocol version: indirect request body (packed).  The segment
+ * descriptors are not carried in the request itself; indirect_grefs
+ * holds grant references instead.
+ */
+struct blkif_x86_64_request_indirect {
+	uint8_t        indirect_op;  /* operation carried by this request     */
+	uint16_t       nr_segments;  /* number of indirect segments           */
+	uint32_t       _pad1;        /* offsetof(blkif_..,u.indirect.id)==8   */
+	uint64_t       id;           /* private guest value, echoed in resp   */
+	blkif_sector_t sector_number;/* start sector idx on disk              */
+	blkif_vdev_t   handle;
+	uint16_t       _pad2;
+	grant_ref_t    indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
+	/*
+	 * The maximum number of indirect segments (and pages) that will
+	 * be used is determined by MAX_INDIRECT_SEGMENTS, this value
+	 * is also exported to the guest (via xenstore
+	 * feature-max-indirect-segments entry), so the frontend knows how
+	 * many indirect segments the backend supports.
+	 */
+	uint32_t       _pad3;        /* make it 64 byte aligned */
+} __attribute__((__packed__));
+
+/*
+ * x86_64 protocol version: complete request.  The operation code selects
+ * which member of the union describes the rest of the request.
+ */
+struct blkif_x86_64_request {
+	uint8_t        operation;    /* BLKIF_OP_???                         */
+	union {
+		struct blkif_x86_64_request_rw rw;
+		struct blkif_x86_64_request_discard discard;
+		struct blkif_x86_64_request_other other;
+		struct blkif_x86_64_request_indirect indirect;
+	} u;
+} __attribute__((__packed__));
+
+/*
+ * x86_64 protocol version: response.  The id is explicitly aligned to
+ * 8 bytes regardless of the natural alignment of uint64_t on the
+ * architecture this is compiled for.
+ */
+struct blkif_x86_64_response {
+	uint64_t       __attribute__((__aligned__(8))) id;
+	uint8_t         operation;       /* copied from request */
+	int16_t         status;          /* BLKIF_RSP_???       */
+};
+
+/*
+ * Generate the ring types for each request/response layout declared
+ * above.  The generated blkif_common_back_ring, blkif_x86_32_back_ring
+ * and blkif_x86_64_back_ring types are used by union blkif_back_rings.
+ */
+DEFINE_RING_TYPES(blkif_common, struct blkif_common_request,
+		  struct blkif_common_response);
+DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request,
+		  struct blkif_x86_32_response);
+DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request,
+		  struct blkif_x86_64_response);
+
+/*
+ * Backend view of the ring, one member per protocol.  All members
+ * overlay the same storage; 'common' gives access to the bookkeeping
+ * fields without switching on the protocol.
+ */
+union blkif_back_rings {
+	struct blkif_back_ring        native;
+	struct blkif_common_back_ring common;
+	struct blkif_x86_32_back_ring x86_32;
+	struct blkif_x86_64_back_ring x86_64;
+};
+
+/*
+ * Ring layout in use on an interface; selects the member of
+ * union blkif_back_rings to use.  Values start at 1, so a zeroed
+ * field does not name any protocol.
+ */
+enum blkif_protocol {
+	BLKIF_PROTOCOL_NATIVE = 1,
+	BLKIF_PROTOCOL_X86_32 = 2,
+	BLKIF_PROTOCOL_X86_64 = 3,
+};
+
+/* Description of the virtual block device exported through an interface. */
+struct xen_vbd {
+	/* What the domain refers to this vbd as. */
+	blkif_vdev_t		handle;
+	/* Non-zero -> read-only */
+	unsigned char		readonly;
+	/* VDISK_xxx */
+	unsigned char		type;
+	/* phys device that this vbd maps to. */
+	u32			pdevice;
+	/* Opened backing block device, or NULL when none is held. */
+	struct block_device	*bdev;
+	/* Cached size parameter. */
+	sector_t		size;
+	/* Feature flags; single-bit fields. */
+	unsigned int		flush_support:1;
+	unsigned int		discard_secure:1;
+	unsigned int		feature_gnt_persistent:1;
+	unsigned int		overflow_max_grants:1;
+};
+
+struct backend_info;
+
+/* Number of bits in the flags bitmap of struct persistent_gnt. */
+#define PERSISTENT_GNT_FLAGS_SIZE	2
+/* Bit index: this persistent grant is currently in use. */
+#define PERSISTENT_GNT_ACTIVE		0
+/*
+ * Bit index: this persistent grant has been used.  The flag is set when
+ * PERSISTENT_GNT_ACTIVE is cleared, to record that the grant was used
+ * recently.
+ */
+#define PERSISTENT_GNT_WAS_ACTIVE	1
+
+/* Number of requests that we can fit in a ring */
+#define XEN_BLKIF_REQS			32
+
+/* One persistently mapped grant, tracked per interface. */
+struct persistent_gnt {
+	struct page *page;
+	grant_ref_t gnt;
+	grant_handle_t handle;
+	/* PERSISTENT_GNT_ACTIVE / PERSISTENT_GNT_WAS_ACTIVE bits. */
+	DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE);
+	/* Presumably linked into xen_blkif.persistent_gnts -- TODO confirm. */
+	struct rb_node node;
+	/* Presumably linked into xen_blkif.persistent_purge_list -- TODO confirm. */
+	struct list_head remove_node;
+};
+
+/*
+ * Per-interface state of the block backend: ring and event channel,
+ * the exported vbd, persistent-grant bookkeeping, the pool of request
+ * descriptors and I/O statistics.
+ */
+struct xen_blkif {
+	/* Unique identifier for this interface. */
+	domid_t			domid;
+	unsigned int		handle;
+	/* Physical parameters of the comms window. */
+	unsigned int		irq;
+	/* Comms information. */
+	enum blkif_protocol	blk_protocol;
+	union blkif_back_rings	blk_rings;
+	/* Address at which the shared ring page is mapped. */
+	void			*blk_ring;
+	/* The VBD attached to this interface. */
+	struct xen_vbd		vbd;
+	/* Back pointer to the backend_info. */
+	struct backend_info	*be;
+	/* Private fields. */
+	spinlock_t		blk_ring_lock;
+	/* Reference count; see xen_blkif_get() / xen_blkif_put(). */
+	atomic_t		refcnt;
+
+	wait_queue_head_t	wq;
+	/* for barrier (drain) requests */
+	struct completion	drain_complete;
+	atomic_t		drain;
+	atomic_t		inflight;
+	/* One thread per one blkif. */
+	struct task_struct	*xenblkd;
+	unsigned int		waiting_reqs;
+
+	/* tree to store persistent grants */
+	struct rb_root		persistent_gnts;
+	unsigned int		persistent_gnt_c;
+	atomic_t		persistent_gnt_in_use;
+	unsigned long           next_lru;
+
+	/* used by the kworker that offload work from the persistent purge */
+	struct list_head	persistent_purge_list;
+	struct work_struct	persistent_purge_work;
+
+	/* buffer of free pages to map grant refs */
+	spinlock_t		free_pages_lock;
+	int			free_pages_num;
+	struct list_head	free_pages;
+
+	/* List of all 'pending_req' available */
+	struct list_head	pending_free;
+	/* And its spinlock. */
+	spinlock_t		pending_free_lock;
+	wait_queue_head_t	pending_free_wq;
+
+	/* statistics */
+	unsigned long		st_print;
+	unsigned long long			st_rd_req;
+	unsigned long long			st_wr_req;
+	unsigned long long			st_oo_req;
+	unsigned long long			st_f_req;
+	unsigned long long			st_ds_req;
+	unsigned long long			st_rd_sect;
+	unsigned long long			st_wr_sect;
+
+	/* Work item scheduled by xen_blkif_put() on the last reference. */
+	struct work_struct	free_work;
+	/* Thread shutdown wait queue. */
+	wait_queue_head_t	shutdown_wq;
+};
+
+/*
+ * Per-segment buffer description kept in struct pending_req.
+ * NOTE(review): units of the two fields are not visible here; the names
+ * suggest a byte offset and a sector count -- confirm against the users.
+ */
+struct seg_buf {
+	unsigned long offset;
+	unsigned int nsec;
+};
+
+/*
+ * One granted page as referenced by a pending request: the page, the
+ * grant reference and handle, and an optional link to a persistent grant.
+ */
+struct grant_page {
+	struct page 		*page;
+	struct persistent_gnt	*persistent_gnt;
+	grant_handle_t		handle;
+	grant_ref_t		gref;
+};
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ *
+ * The per-segment arrays are sized for the largest indirect request
+ * (MAX_INDIRECT_SEGMENTS entries, MAX_INDIRECT_PAGES indirect pages).
+ */
+struct pending_req {
+	struct xen_blkif	*blkif;
+	u64			id;
+	int			nr_pages;
+	atomic_t		pendcnt;
+	unsigned short		operation;
+	int			status;
+	/* Link in xen_blkif.pending_free. */
+	struct list_head	free_list;
+	struct grant_page	*segments[MAX_INDIRECT_SEGMENTS];
+	/* Indirect descriptors */
+	struct grant_page	*indirect_pages[MAX_INDIRECT_PAGES];
+	struct seg_buf		seg[MAX_INDIRECT_SEGMENTS];
+	struct bio		*biolist[MAX_INDIRECT_SEGMENTS];
+};
+
+
+/*
+ * Size of the device behind a struct xen_vbd pointer: the partition's
+ * nr_sects when the block device has a bd_part, otherwise the capacity
+ * of the whole disk.  _v is evaluated more than once.
+ */
+#define vbd_sz(_v)	((_v)->bdev->bd_part ? \
+			 (_v)->bdev->bd_part->nr_sects : \
+			  get_capacity((_v)->bdev->bd_disk))
+
+/* Take a reference on a struct xen_blkif. */
+#define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+/*
+ * Drop a reference.  When the count reaches zero the interface is not
+ * freed inline; its free_work item is scheduled instead.  _b is
+ * evaluated more than once.
+ */
+#define xen_blkif_put(_b)				\
+	do {						\
+		if (atomic_dec_and_test(&(_b)->refcnt))	\
+			schedule_work(&(_b)->free_work);\
+	} while (0)
+
+/*
+ * Description of a request in terms of a device, a block device pointer
+ * and a sector range (start sector plus number of sectors).
+ * NOTE(review): the users of this struct are outside this header.
+ */
+struct phys_req {
+	unsigned short		dev;
+	blkif_sector_t		nr_sects;
+	struct block_device	*bdev;
+	blkif_sector_t		sector_number;
+};
+int xen_blkif_interface_init(void);
+
+int xen_blkif_xenbus_init(void);
+
+irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
+int xen_blkif_schedule(void *arg);
+int xen_blkif_purge_persistent(void *arg);
+void xen_blkbk_free_caches(struct xen_blkif *blkif);
+
+int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
+			      struct backend_info *be, int state);
+
+int xen_blkbk_barrier(struct xenbus_transaction xbt,
+		      struct backend_info *be, int state);
+struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
+void xen_blkbk_unmap_purged_grants(struct work_struct *work);
+
+/*
+ * Translate a request in the i386 layout into the native layout.
+ *
+ * @dst: native request to fill in
+ * @src: request in i386 layout to read from
+ *
+ * Only the fields that belong to the request's operation are copied.
+ * For unknown operations only the id is copied, so the caller can still
+ * report a failure for it.
+ *
+ * The barrier() calls already present in this function indicate that
+ * *src may change while it is being read.  For that reason every value
+ * that steers the copy (the operation and the segment counts) is stored
+ * in *dst once and afterwards only ever read back from *dst, so that the
+ * operation recorded in *dst always matches the set of fields that was
+ * copied.
+ */
+static inline void blkif_get_x86_32_req(struct blkif_request *dst,
+					struct blkif_x86_32_request *src)
+{
+	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
+
+	dst->operation = src->operation;
+	/* Do not let the compiler fetch src->operation again for the switch. */
+	barrier();
+	switch (dst->operation) {
+	case BLKIF_OP_READ:
+	case BLKIF_OP_WRITE:
+	case BLKIF_OP_WRITE_BARRIER:
+	case BLKIF_OP_FLUSH_DISKCACHE:
+		dst->u.rw.nr_segments = src->u.rw.nr_segments;
+		dst->u.rw.handle = src->u.rw.handle;
+		dst->u.rw.id = src->u.rw.id;
+		dst->u.rw.sector_number = src->u.rw.sector_number;
+		barrier();
+		/* Never copy more segments than the request can hold. */
+		if (n > dst->u.rw.nr_segments)
+			n = dst->u.rw.nr_segments;
+		for (i = 0; i < n; i++)
+			dst->u.rw.seg[i] = src->u.rw.seg[i];
+		break;
+	case BLKIF_OP_DISCARD:
+		dst->u.discard.flag = src->u.discard.flag;
+		dst->u.discard.id = src->u.discard.id;
+		dst->u.discard.sector_number = src->u.discard.sector_number;
+		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
+		break;
+	case BLKIF_OP_INDIRECT:
+		dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
+		dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
+		dst->u.indirect.handle = src->u.indirect.handle;
+		dst->u.indirect.id = src->u.indirect.id;
+		dst->u.indirect.sector_number = src->u.indirect.sector_number;
+		barrier();
+		/* Copy only the grant references the segment count calls for. */
+		j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
+		for (i = 0; i < j; i++)
+			dst->u.indirect.indirect_grefs[i] =
+				src->u.indirect.indirect_grefs[i];
+		break;
+	default:
+		/*
+		 * Don't know how to translate this op. Only get the
+		 * ID so failure can be reported to the frontend.
+		 */
+		dst->u.other.id = src->u.other.id;
+		break;
+	}
+}
+
+/*
+ * Translate a request in the x86_64 layout into the native layout.
+ *
+ * @dst: native request to fill in
+ * @src: request in x86_64 layout to read from
+ *
+ * Only the fields that belong to the request's operation are copied.
+ * For unknown operations only the id is copied, so the caller can still
+ * report a failure for it.
+ *
+ * The barrier() calls already present in this function indicate that
+ * *src may change while it is being read.  For that reason every value
+ * that steers the copy (the operation and the segment counts) is stored
+ * in *dst once and afterwards only ever read back from *dst, so that the
+ * operation recorded in *dst always matches the set of fields that was
+ * copied.
+ */
+static inline void blkif_get_x86_64_req(struct blkif_request *dst,
+					struct blkif_x86_64_request *src)
+{
+	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
+
+	dst->operation = src->operation;
+	/* Do not let the compiler fetch src->operation again for the switch. */
+	barrier();
+	switch (dst->operation) {
+	case BLKIF_OP_READ:
+	case BLKIF_OP_WRITE:
+	case BLKIF_OP_WRITE_BARRIER:
+	case BLKIF_OP_FLUSH_DISKCACHE:
+		dst->u.rw.nr_segments = src->u.rw.nr_segments;
+		dst->u.rw.handle = src->u.rw.handle;
+		dst->u.rw.id = src->u.rw.id;
+		dst->u.rw.sector_number = src->u.rw.sector_number;
+		barrier();
+		/* Never copy more segments than the request can hold. */
+		if (n > dst->u.rw.nr_segments)
+			n = dst->u.rw.nr_segments;
+		for (i = 0; i < n; i++)
+			dst->u.rw.seg[i] = src->u.rw.seg[i];
+		break;
+	case BLKIF_OP_DISCARD:
+		dst->u.discard.flag = src->u.discard.flag;
+		dst->u.discard.id = src->u.discard.id;
+		dst->u.discard.sector_number = src->u.discard.sector_number;
+		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
+		break;
+	case BLKIF_OP_INDIRECT:
+		dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
+		dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
+		dst->u.indirect.handle = src->u.indirect.handle;
+		dst->u.indirect.id = src->u.indirect.id;
+		dst->u.indirect.sector_number = src->u.indirect.sector_number;
+		barrier();
+		/* Copy only the grant references the segment count calls for. */
+		j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
+		for (i = 0; i < j; i++)
+			dst->u.indirect.indirect_grefs[i] =
+				src->u.indirect.indirect_grefs[i];
+		break;
+	default:
+		/*
+		 * Don't know how to translate this op. Only get the
+		 * ID so failure can be reported to the frontend.
+		 */
+		dst->u.other.id = src->u.other.id;
+		break;
+	}
+}
+
+#endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
new file mode 100644
index 00000000000..3a8b810b498
--- /dev/null
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -0,0 +1,930 @@
+/*  Xenbus code for blkif backend
+    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+    Copyright (C) 2005 XenSource Ltd
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+*/
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <xen/events.h>
+#include <xen/grant_table.h>
+#include "common.h"
+
+/* Per-device backend state, stored as the xenbus device's driver data. */
+struct backend_info {
+	struct xenbus_device	*dev;
+	struct xen_blkif	*blkif;
+	/* Watch on the "physical-device" node of this backend. */
+	struct xenbus_watch	backend_watch;
+	/* Major/minor of the physical device; both zero until it is known. */
+	unsigned		major;
+	unsigned		minor;
+	/* Contents of the "mode" xenstore node; freed with kfree(). */
+	char			*mode;
+};
+
+static struct kmem_cache *xen_blkif_cachep;
+static void connect(struct backend_info *);
+static int connect_ring(struct backend_info *);
+static void backend_changed(struct xenbus_watch *, const char **,
+			    unsigned int);
+static void xen_blkif_free(struct xen_blkif *blkif);
+static void xen_vbd_free(struct xen_vbd *vbd);
+
+/* Accessor: the xenbus device that @be belongs to. */
+struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be)
+{
+	struct xenbus_device *xbdev = be->dev;
+
+	return xbdev;
+}
+
+/*
+ * Work handler behind xen_blkif_put().  The final reference may be
+ * dropped from softirq context while xen_blkif_free() can sleep, so the
+ * actual release runs from a work item.
+ */
+static void xen_blkif_deferred_free(struct work_struct *work)
+{
+	xen_blkif_free(container_of(work, struct xen_blkif, free_work));
+}
+
+/*
+ * Build the name "blkback.<domid>.<device>" in @buf, which must have
+ * room for TASK_COMM_LEN bytes.  <device> is the "dev" node of the
+ * backend with everything up to and including "/dev/" stripped when that
+ * prefix is present.
+ *
+ * Returns 0 on success or the error from reading the "dev" node.
+ */
+static int blkback_name(struct xen_blkif *blkif, char *buf)
+{
+	static const char prefix[] = "/dev/";
+	char *path, *name;
+
+	path = xenbus_read(XBT_NIL, blkif->be->dev->nodename, "dev", NULL);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	name = strstr(path, prefix);
+	name = name ? name + strlen(prefix) : path;
+
+	snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, name);
+	kfree(path);
+
+	return 0;
+}
+
+/*
+ * Try to bring the interface to the Connected state once both the ring
+ * (irq) and the backing device are in place.  After a successful
+ * connect the backing device's page cache is written back and
+ * invalidated, and the per-interface kernel thread is started.
+ * Failures are reported through xenbus_dev_error().
+ */
+static void xen_update_blkif_status(struct xen_blkif *blkif)
+{
+	struct xenbus_device *xbdev;
+	char name[TASK_COMM_LEN];
+	int err;
+
+	/* Not ready to connect? */
+	if (!blkif->irq || !blkif->vbd.bdev)
+		return;
+
+	xbdev = blkif->be->dev;
+
+	/* Nothing to do when the connection is already up. */
+	if (xbdev->state == XenbusStateConnected)
+		return;
+
+	/* connect() reports its outcome through the device state. */
+	connect(blkif->be);
+	if (xbdev->state != XenbusStateConnected)
+		return;
+
+	err = blkback_name(blkif, name);
+	if (err) {
+		xenbus_dev_error(xbdev, err, "get blkback dev name");
+		return;
+	}
+
+	err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping);
+	if (err) {
+		xenbus_dev_error(xbdev, err, "block flush");
+		return;
+	}
+	invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
+
+	blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, "%s", name);
+	if (!IS_ERR(blkif->xenblkd))
+		return;
+
+	err = PTR_ERR(blkif->xenblkd);
+	blkif->xenblkd = NULL;
+	xenbus_dev_error(xbdev, err, "start xenblkd");
+}
+
+/*
+ * Allocate and initialise the interface state for domain @domid,
+ * including the pool of XEN_BLKIF_REQS request descriptors with their
+ * per-segment and per-indirect-page bookkeeping.
+ *
+ * Returns the new interface with a reference count of one, or
+ * ERR_PTR(-ENOMEM) when any allocation fails (everything allocated so
+ * far is released again).
+ */
+static struct xen_blkif *xen_blkif_alloc(domid_t domid)
+{
+	struct xen_blkif *blkif;
+	struct pending_req *req, *next;
+	int r, k;
+
+	BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
+
+	blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL);
+	if (!blkif)
+		return ERR_PTR(-ENOMEM);
+
+	blkif->domid = domid;
+	spin_lock_init(&blkif->blk_ring_lock);
+	atomic_set(&blkif->refcnt, 1);
+	init_waitqueue_head(&blkif->wq);
+	init_completion(&blkif->drain_complete);
+	atomic_set(&blkif->drain, 0);
+	blkif->st_print = jiffies;
+	blkif->persistent_gnts.rb_node = NULL;
+	spin_lock_init(&blkif->free_pages_lock);
+	INIT_LIST_HEAD(&blkif->free_pages);
+	INIT_LIST_HEAD(&blkif->persistent_purge_list);
+	blkif->free_pages_num = 0;
+	atomic_set(&blkif->persistent_gnt_in_use, 0);
+	atomic_set(&blkif->inflight, 0);
+	INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);
+
+	INIT_LIST_HEAD(&blkif->pending_free);
+	INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
+
+	for (r = 0; r < XEN_BLKIF_REQS; r++) {
+		req = kzalloc(sizeof(*req), GFP_KERNEL);
+		if (!req)
+			goto fail;
+		/*
+		 * Queue the request before filling it in, so the unwind
+		 * path also reclaims a partially populated request.
+		 */
+		list_add_tail(&req->free_list, &blkif->pending_free);
+
+		for (k = 0; k < MAX_INDIRECT_SEGMENTS; k++) {
+			req->segments[k] = kzalloc(sizeof(*req->segments[0]),
+						   GFP_KERNEL);
+			if (!req->segments[k])
+				goto fail;
+		}
+		for (k = 0; k < MAX_INDIRECT_PAGES; k++) {
+			req->indirect_pages[k] =
+				kzalloc(sizeof(*req->indirect_pages[0]),
+					GFP_KERNEL);
+			if (!req->indirect_pages[k])
+				goto fail;
+		}
+	}
+	spin_lock_init(&blkif->pending_free_lock);
+	init_waitqueue_head(&blkif->pending_free_wq);
+	init_waitqueue_head(&blkif->shutdown_wq);
+
+	return blkif;
+
+fail:
+	list_for_each_entry_safe(req, next, &blkif->pending_free, free_list) {
+		list_del(&req->free_list);
+		/*
+		 * Requests are zero-allocated, so slots that were never
+		 * filled are NULL and kfree() ignores them.
+		 */
+		for (k = 0; k < MAX_INDIRECT_SEGMENTS; k++)
+			kfree(req->segments[k]);
+		for (k = 0; k < MAX_INDIRECT_PAGES; k++)
+			kfree(req->indirect_pages[k]);
+		kfree(req);
+	}
+
+	kmem_cache_free(xen_blkif_cachep, blkif);
+
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Map the shared ring and bind the event channel for an interface.
+ *
+ * @blkif:       interface to connect
+ * @shared_page: reference passed to xenbus_map_ring_valloc() for the ring
+ * @evtchn:      event channel to bind to xen_blkif_be_int()
+ *
+ * Returns 0 on success (or when an irq is already bound), otherwise the
+ * negative error from mapping the ring or binding the event channel.
+ */
+static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
+			 unsigned int evtchn)
+{
+	int err;
+
+	/* Already connected through? */
+	if (blkif->irq)
+		return 0;
+
+	err = xenbus_map_ring_valloc(blkif->be->dev, shared_page, &blkif->blk_ring);
+	if (err < 0)
+		return err;
+
+	/* Initialise the back ring view that matches the negotiated protocol. */
+	switch (blkif->blk_protocol) {
+	case BLKIF_PROTOCOL_NATIVE:
+	{
+		struct blkif_sring *sring;
+		sring = (struct blkif_sring *)blkif->blk_ring;
+		BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
+		break;
+	}
+	case BLKIF_PROTOCOL_X86_32:
+	{
+		struct blkif_x86_32_sring *sring_x86_32;
+		sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
+		BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
+		break;
+	}
+	case BLKIF_PROTOCOL_X86_64:
+	{
+		struct blkif_x86_64_sring *sring_x86_64;
+		sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
+		BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
+		break;
+	}
+	default:
+		BUG();
+	}
+
+	err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn,
+						    xen_blkif_be_int, 0,
+						    "blkif-backend", blkif);
+	if (err < 0) {
+		/* Undo the mapping so a later attempt starts from scratch. */
+		xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring);
+		blkif->blk_rings.common.sring = NULL;
+		return err;
+	}
+	/* On success the return value is the irq number. */
+	blkif->irq = err;
+
+	return 0;
+}
+
+/*
+ * Tear down the connection of an interface: stop its kernel thread,
+ * then release the irq and unmap the ring.
+ *
+ * Returns 0 on success, or -EBUSY when I/O is still in flight after the
+ * thread has been stopped; in that case the irq and ring are left in
+ * place.
+ */
+static int xen_blkif_disconnect(struct xen_blkif *blkif)
+{
+	if (blkif->xenblkd) {
+		kthread_stop(blkif->xenblkd);
+		wake_up(&blkif->shutdown_wq);
+		blkif->xenblkd = NULL;
+	}
+
+	/* The above kthread_stop() guarantees that at this point we
+	 * don't have any discard_io or other_io requests. So, checking
+	 * for inflight IO is enough.
+	 */
+	if (atomic_read(&blkif->inflight) > 0)
+		return -EBUSY;
+
+	if (blkif->irq) {
+		unbind_from_irqhandler(blkif->irq, blkif);
+		/* Zero means "not connected"; see the check in xen_blkif_map(). */
+		blkif->irq = 0;
+	}
+
+	if (blkif->blk_rings.common.sring) {
+		xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring);
+		blkif->blk_rings.common.sring = NULL;
+	}
+
+	return 0;
+}
+
+/*
+ * Release an interface: disconnect it, drop the backing device, empty
+ * the grant and page caches and free every request descriptor before
+ * returning the interface itself to its slab cache.
+ *
+ * Anything still cached after xen_blkbk_free_caches() is treated as a
+ * fatal inconsistency; a request descriptor missing from the free list
+ * only triggers a warning.
+ */
+static void xen_blkif_free(struct xen_blkif *blkif)
+{
+	struct pending_req *req, *next;
+	int freed = 0;
+	int k;
+
+	xen_blkif_disconnect(blkif);
+	xen_vbd_free(&blkif->vbd);
+
+	/* Remove all persistent grants and the cache of ballooned pages. */
+	xen_blkbk_free_caches(blkif);
+
+	/* Make sure everything is drained before shutting down */
+	BUG_ON(blkif->persistent_gnt_c != 0);
+	BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0);
+	BUG_ON(blkif->free_pages_num != 0);
+	BUG_ON(!list_empty(&blkif->persistent_purge_list));
+	BUG_ON(!list_empty(&blkif->free_pages));
+	BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
+
+	/* Free the descriptors, counting them to spot any still in use. */
+	list_for_each_entry_safe(req, next, &blkif->pending_free, free_list) {
+		list_del(&req->free_list);
+
+		for (k = 0; k < MAX_INDIRECT_SEGMENTS; k++)
+			kfree(req->segments[k]);
+
+		for (k = 0; k < MAX_INDIRECT_PAGES; k++)
+			kfree(req->indirect_pages[k]);
+
+		kfree(req);
+		freed++;
+	}
+
+	WARN_ON(freed != XEN_BLKIF_REQS);
+
+	kmem_cache_free(xen_blkif_cachep, blkif);
+}
+
+/*
+ * Create the slab cache from which struct xen_blkif objects are
+ * allocated.  Returns 0 on success or -ENOMEM.
+ */
+int __init xen_blkif_interface_init(void)
+{
+	xen_blkif_cachep = kmem_cache_create("blkif_cache",
+					     sizeof(struct xen_blkif),
+					     0, 0, NULL);
+
+	return xen_blkif_cachep ? 0 : -ENOMEM;
+}
+
+/*
+ *  sysfs interface for VBD I/O requests
+ */
+
+/*
+ * Define a read-only device attribute called 'name'.  The generated
+ * show_<name>() handler looks up the backend_info of the device and
+ * prints 'args' with 'format'; 'args' may refer to the local variable
+ * 'be'.  The attribute itself is declared as dev_attr_<name>.
+ */
+#define VBD_SHOW(name, format, args...)					\
+	static ssize_t show_##name(struct device *_dev,			\
+				   struct device_attribute *attr,	\
+				   char *buf)				\
+	{								\
+		struct xenbus_device *dev = to_xenbus_device(_dev);	\
+		struct backend_info *be = dev_get_drvdata(&dev->dev);	\
+									\
+		return sprintf(buf, format, ##args);			\
+	}								\
+	static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+/* One attribute per st_* counter of struct xen_blkif. */
+VBD_SHOW(oo_req,  "%llu\n", be->blkif->st_oo_req);
+VBD_SHOW(rd_req,  "%llu\n", be->blkif->st_rd_req);
+VBD_SHOW(wr_req,  "%llu\n", be->blkif->st_wr_req);
+VBD_SHOW(f_req,  "%llu\n", be->blkif->st_f_req);
+VBD_SHOW(ds_req,  "%llu\n", be->blkif->st_ds_req);
+VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect);
+VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect);
+
+/* NULL-terminated list of the statistics attributes defined above. */
+static struct attribute *xen_vbdstat_attrs[] = {
+	&dev_attr_oo_req.attr,
+	&dev_attr_rd_req.attr,
+	&dev_attr_wr_req.attr,
+	&dev_attr_f_req.attr,
+	&dev_attr_ds_req.attr,
+	&dev_attr_rd_sect.attr,
+	&dev_attr_wr_sect.attr,
+	NULL
+};
+
+/* Attribute group that exposes the counters under the name "statistics". */
+static struct attribute_group xen_vbdstat_group = {
+	.name = "statistics",
+	.attrs = xen_vbdstat_attrs,
+};
+
+/* Attributes showing the physical device as hex "major:minor" and the mode string. */
+VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
+VBD_SHOW(mode, "%s\n", be->mode);
+
+/*
+ * Create the sysfs entries of a backend device: the physical_device and
+ * mode attribute files and the "statistics" attribute group.
+ *
+ * Returns 0 on success.  On failure the error of the failing step is
+ * returned and only the entries that were created before it are removed
+ * again; the entry whose creation failed is never passed to a remove
+ * call.
+ */
+static int xenvbd_sysfs_addif(struct xenbus_device *dev)
+{
+	int error;
+
+	error = device_create_file(&dev->dev, &dev_attr_physical_device);
+	if (error)
+		return error;
+
+	error = device_create_file(&dev->dev, &dev_attr_mode);
+	if (error)
+		goto fail_physical_device;
+
+	error = sysfs_create_group(&dev->dev.kobj, &xen_vbdstat_group);
+	if (error)
+		goto fail_mode;
+
+	return 0;
+
+	/* Unwind in reverse order of creation. */
+fail_mode:
+	device_remove_file(&dev->dev, &dev_attr_mode);
+fail_physical_device:
+	device_remove_file(&dev->dev, &dev_attr_physical_device);
+	return error;
+}
+
+/*
+ * Remove the sysfs entries created by xenvbd_sysfs_addif(), in the
+ * reverse order of their creation.
+ */
+static void xenvbd_sysfs_delif(struct xenbus_device *dev)
+{
+	sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group);
+	device_remove_file(&dev->dev, &dev_attr_mode);
+	device_remove_file(&dev->dev, &dev_attr_physical_device);
+}
+
+
+/*
+ * Release the backing block device of @vbd, if one is held, using the
+ * mode that matches the vbd's readonly flag, and clear the pointer.
+ * Calling it on a vbd without a device is a no-op.
+ */
+static void xen_vbd_free(struct xen_vbd *vbd)
+{
+	if (!vbd->bdev)
+		return;
+
+	if (vbd->readonly)
+		blkdev_put(vbd->bdev, FMODE_READ);
+	else
+		blkdev_put(vbd->bdev, FMODE_WRITE);
+
+	vbd->bdev = NULL;
+}
+
+/*
+ * Open the physical device major:minor and record it as the vbd of
+ * an interface.
+ *
+ * @blkif:    interface that owns the vbd
+ * @handle:   handle under which the vbd is known
+ * @major:    major number of the physical device
+ * @minor:    minor number of the physical device
+ * @readonly: non-zero to open the device for reading only
+ * @cdrom:    non-zero to mark the vbd as a CD-ROM regardless of the disk flags
+ *
+ * Besides opening the device this caches its size and derives the
+ * VDISK_* type bits and the flush/secure-discard capabilities from the
+ * disk and its request queue.
+ *
+ * Returns 0 on success, or -ENOENT when the device cannot be opened or
+ * has no disk attached.
+ */
+static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
+			  unsigned major, unsigned minor, int readonly,
+			  int cdrom)
+{
+	struct xen_vbd *vbd;
+	struct block_device *bdev;
+	struct request_queue *q;
+
+	vbd = &blkif->vbd;
+	vbd->handle   = handle;
+	vbd->readonly = readonly;
+	vbd->type     = 0;
+
+	vbd->pdevice  = MKDEV(major, minor);
+
+	bdev = blkdev_get_by_dev(vbd->pdevice, vbd->readonly ?
+				 FMODE_READ : FMODE_WRITE, NULL);
+
+	/*
+	 * DPRINTK() already prints the function name and appends ".\n",
+	 * so the messages below carry neither.
+	 */
+	if (IS_ERR(bdev)) {
+		DPRINTK("device %08x could not be opened",
+			vbd->pdevice);
+		return -ENOENT;
+	}
+
+	vbd->bdev = bdev;
+	if (vbd->bdev->bd_disk == NULL) {
+		DPRINTK("device %08x doesn't exist",
+			vbd->pdevice);
+		/* Drops the reference taken by blkdev_get_by_dev(). */
+		xen_vbd_free(vbd);
+		return -ENOENT;
+	}
+	vbd->size = vbd_sz(vbd);
+
+	if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom)
+		vbd->type |= VDISK_CDROM;
+	if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
+		vbd->type |= VDISK_REMOVABLE;
+
+	q = bdev_get_queue(bdev);
+	if (q && q->flush_flags)
+		vbd->flush_support = true;
+
+	if (q && blk_queue_secdiscard(q))
+		vbd->discard_secure = true;
+
+	DPRINTK("Successful creation of handle=%04x (dom=%u)",
+		handle, blkif->domid);
+	return 0;
+}
+/*
+ * Tear down a backend device: remove its sysfs entries, unregister the
+ * xenstore watch, disconnect the interface and drop the reference held
+ * on it, then free the backend_info.  Also used by xen_blkbk_probe() to
+ * unwind a partially set up device.  Always returns 0.
+ */
+static int xen_blkbk_remove(struct xenbus_device *dev)
+{
+	struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+	DPRINTK("");
+
+	/* Non-zero major/minor is used as the sign that sysfs entries exist. */
+	if (be->major || be->minor)
+		xenvbd_sysfs_delif(dev);
+
+	if (be->backend_watch.node) {
+		unregister_xenbus_watch(&be->backend_watch);
+		kfree(be->backend_watch.node);
+		be->backend_watch.node = NULL;
+	}
+
+	dev_set_drvdata(&dev->dev, NULL);
+
+	if (be->blkif) {
+		xen_blkif_disconnect(be->blkif);
+		/* May schedule the deferred free of the interface. */
+		xen_blkif_put(be->blkif);
+	}
+
+	kfree(be->mode);
+	kfree(be);
+	return 0;
+}
+
+/*
+ * Write @state to the "feature-flush-cache" node of the backend within
+ * transaction @xbt.  A failure is logged as a warning and returned to
+ * the caller; 0 is returned on success.
+ */
+int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
+			      struct backend_info *be, int state)
+{
+	struct xenbus_device *xbdev = be->dev;
+	int rc = xenbus_printf(xbt, xbdev->nodename, "feature-flush-cache",
+			       "%d", state);
+
+	if (rc)
+		dev_warn(&xbdev->dev, "writing feature-flush-cache (%d)", rc);
+
+	return rc;
+}
+
+/*
+ * Publish the discard capabilities of the backing device in xenstore.
+ *
+ * Nothing is written when the "discard-enable" node exists and is zero.
+ * Otherwise, if the queue supports discard, the granularity, alignment
+ * and secure-discard nodes are written, followed by "feature-discard"
+ * (1 when discard is supported, 0 when it is not).  A failed write is
+ * logged and ends the function; the remaining nodes are then not
+ * written.
+ */
+static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be)
+{
+	struct xenbus_device *dev = be->dev;
+	struct xen_blkif *blkif = be->blkif;
+	struct request_queue *q = bdev_get_queue(blkif->vbd.bdev);
+	const char *node = dev->nodename;
+	int discard_enable;
+	int supported = 0;
+	int err;
+
+	err = xenbus_scanf(XBT_NIL, node, "discard-enable", "%d",
+			   &discard_enable);
+	if (err == 1 && !discard_enable)
+		return;
+
+	if (blk_queue_discard(q)) {
+		err = xenbus_printf(xbt, node, "discard-granularity", "%u",
+				    q->limits.discard_granularity);
+		if (err) {
+			dev_warn(&dev->dev, "writing discard-granularity (%d)", err);
+			return;
+		}
+
+		err = xenbus_printf(xbt, node, "discard-alignment", "%u",
+				    q->limits.discard_alignment);
+		if (err) {
+			dev_warn(&dev->dev, "writing discard-alignment (%d)", err);
+			return;
+		}
+
+		supported = 1;
+
+		/* Optional. */
+		err = xenbus_printf(xbt, node, "discard-secure", "%d",
+				    blkif->vbd.discard_secure);
+		if (err) {
+			dev_warn(&dev->dev, "writing discard-secure (%d)", err);
+			return;
+		}
+	}
+
+	err = xenbus_printf(xbt, node, "feature-discard", "%d", supported);
+	if (err)
+		dev_warn(&dev->dev, "writing feature-discard (%d)", err);
+}
+/*
+ * Write @state to the "feature-barrier" node of the backend within
+ * transaction @xbt.  A failure is logged as a warning and returned to
+ * the caller; 0 is returned on success.
+ */
+int xen_blkbk_barrier(struct xenbus_transaction xbt,
+		      struct backend_info *be, int state)
+{
+	struct xenbus_device *xbdev = be->dev;
+	int rc = xenbus_printf(xbt, xbdev->nodename, "feature-barrier",
+			       "%d", state);
+
+	if (rc)
+		dev_warn(&xbdev->dev, "writing feature-barrier (%d)", rc);
+
+	return rc;
+}
+
+/*
+ * Probe callback, run when a new backend device appears.  Allocates the
+ * backend_info and the interface, installs a watch on the device's
+ * "physical-device" node (handled by backend_changed()) and moves the
+ * device to InitWait.
+ *
+ * Returns 0 on success.  On failure after the backend_info has been
+ * attached to the device, xen_blkbk_remove() undoes whatever was set up
+ * and the error is returned.
+ */
+static int xen_blkbk_probe(struct xenbus_device *dev,
+			   const struct xenbus_device_id *id)
+{
+	struct backend_info *be;
+	int err;
+
+	be = kzalloc(sizeof(struct backend_info), GFP_KERNEL);
+	if (!be) {
+		xenbus_dev_fatal(dev, -ENOMEM,
+				 "allocating backend structure");
+		return -ENOMEM;
+	}
+	be->dev = dev;
+	dev_set_drvdata(&dev->dev, be);
+
+	be->blkif = xen_blkif_alloc(dev->otherend_id);
+	if (IS_ERR(be->blkif)) {
+		err = PTR_ERR(be->blkif);
+		/* Keep the remove path from touching an error pointer. */
+		be->blkif = NULL;
+		xenbus_dev_fatal(dev, err, "creating block interface");
+		goto fail;
+	}
+
+	/* setup back pointer */
+	be->blkif->be = be;
+
+	err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed,
+				   "%s/%s", dev->nodename, "physical-device");
+	if (!err)
+		err = xenbus_switch_state(dev, XenbusStateInitWait);
+	if (!err)
+		return 0;
+
+fail:
+	DPRINTK("failed");
+	xen_blkbk_remove(dev);
+	return err;
+}
+
+
+/*
+ * Callback received when the hotplug scripts have placed the physical-device
+ * node.  Read it and the mode node, and create a vbd.  If the frontend is
+ * ready, connect.  A failure after "mode" was read frees be->mode again.
+ */
+static void backend_changed(struct xenbus_watch *watch,
+			    const char **vec, unsigned int len)
+{
+	int err;
+	unsigned major, minor;
+	struct backend_info *be
+		= container_of(watch, struct backend_info, backend_watch);
+	struct xenbus_device *dev = be->dev;
+	int cdrom = 0;
+	unsigned long handle;
+	char *device_type;
+
+	DPRINTK("");
+
+	err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
+			   &major, &minor);
+	if (XENBUS_EXIST_ERR(err)) {
+		/*
+		 * Since this watch will fire once immediately after it is
+		 * registered, we expect this.  Ignore it, and wait for the
+		 * hotplug scripts.
+		 */
+		return;
+	}
+	if (err != 2) {
+		xenbus_dev_fatal(dev, err, "reading physical-device");
+		return;
+	}
+
+	if (be->major | be->minor) {
+		if (be->major != major || be->minor != minor)
+			pr_warn(DRV_PFX "changing physical device (from %x:%x to %x:%x) not supported.\n",
+				be->major, be->minor, major, minor);
+		return;
+	}
+
+	be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
+	if (IS_ERR(be->mode)) {
+		err = PTR_ERR(be->mode);
+		be->mode = NULL;
+		xenbus_dev_fatal(dev, err, "reading mode");
+		return;
+	}
+
+	device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL);
+	if (!IS_ERR(device_type)) {
+		cdrom = strcmp(device_type, "cdrom") == 0;
+		kfree(device_type);
+	}
+
+	/* Front end dir is a number, which is used as the handle. */
+	err = kstrtoul(strrchr(dev->otherend, '/') + 1, 0, &handle);
+	if (err)
+		goto out;	/* be->mode must be freed, not leaked */
+
+	be->major = major;
+	be->minor = minor;
+
+	err = xen_vbd_create(be->blkif, handle, major, minor,
+			     !strchr(be->mode, 'w'), cdrom);
+	if (err)
+		xenbus_dev_fatal(dev, err, "creating vbd structure");
+	else {
+		err = xenvbd_sysfs_addif(dev);
+		if (err) {
+			xen_vbd_free(&be->blkif->vbd);
+			xenbus_dev_fatal(dev, err, "creating sysfs entries");
+		}
+	}
+
+out:
+	/* On failure, release be->mode and forget the device numbers. */
+	if (err) {
+		kfree(be->mode);
+		be->mode = NULL;
+		be->major = 0;
+		be->minor = 0;
+	} else {
+		/* We're potentially connected now */
+		xen_update_blkif_status(be->blkif);
+	}
+}
+
+
+/*
+ * otherend_changed callback: drive the backend from the frontend's new state.
+ */
+static void frontend_changed(struct xenbus_device *dev,
+			     enum xenbus_state frontend_state)
+{
+	struct backend_info *be = dev_get_drvdata(&dev->dev);
+	int err;
+
+	DPRINTK("%s", xenbus_strstate(frontend_state));
+
+	switch (frontend_state) {
+	case XenbusStateInitialising:
+		if (dev->state == XenbusStateClosed) {
+			pr_info(DRV_PFX "%s: prepare for reconnect\n",
+				dev->nodename);
+			xenbus_switch_state(dev, XenbusStateInitWait);
+		}
+		break;
+
+	case XenbusStateInitialised:
+	case XenbusStateConnected:
+		/*
+		 * Ensure we connect even when two watches fire in
+		 * close succession and we miss the intermediate value
+		 * of frontend_state.
+		 */
+		if (dev->state == XenbusStateConnected)
+			break;	/* backend already connected: nothing to do */
+
+		/*
+		 * Enforce precondition before potential leak point.
+		 * xen_blkif_disconnect() is idempotent.
+		 */
+		err = xen_blkif_disconnect(be->blkif);
+		if (err) {
+			xenbus_dev_fatal(dev, err, "pending I/O");
+			break;
+		}
+
+		err = connect_ring(be);
+		if (err)
+			break;	/* connect_ring() has already reported the error */
+		xen_update_blkif_status(be->blkif);
+		break;
+
+	case XenbusStateClosing:	/* follow the frontend into Closing */
+		xenbus_switch_state(dev, XenbusStateClosing);
+		break;
+
+	case XenbusStateClosed:
+		xen_blkif_disconnect(be->blkif);
+		xenbus_switch_state(dev, XenbusStateClosed);
+		if (xenbus_dev_is_online(dev))
+			break;
+		/* fall through if not online */
+	case XenbusStateUnknown:
+		/* implies xen_blkif_disconnect() via xen_blkbk_remove() */
+		device_unregister(&dev->dev);
+		break;
+
+	default:
+		xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+				 frontend_state);
+		break;
+	}
+}
+
+
+/* ** Connection ** */
+
+
+/*
+ * Write the physical details regarding the block device to the store in one
+ * transaction (restarted on -EAGAIN), then switch to Connected state.
+ */
+static void connect(struct backend_info *be)
+{
+	struct xenbus_transaction xbt;
+	int err;
+	struct xenbus_device *dev = be->dev;
+
+	DPRINTK("%s", dev->otherend);
+
+	/* Supply the information about the device the frontend needs */
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "starting transaction");
+		return;
+	}
+
+	/* Feature advertisement is best-effort: return values are ignored. */
+	xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support);
+
+	xen_blkbk_discard(xbt, be);
+
+	xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support);
+
+	err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u", 1);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/feature-persistent",
+				 dev->nodename);
+		goto abort;
+	}
+	err = xenbus_printf(xbt, dev->nodename, "feature-max-indirect-segments", "%u",
+			    MAX_INDIRECT_SEGMENTS);
+	if (err)	/* not fatal: only warn and carry on */
+		dev_warn(&dev->dev, "writing %s/feature-max-indirect-segments (%d)",
+			 dev->nodename, err);
+
+	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
+			    (unsigned long long)vbd_sz(&be->blkif->vbd));
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/sectors",
+				 dev->nodename);
+		goto abort;
+	}
+
+	/* FIXME: use a typename instead */
+	err = xenbus_printf(xbt, dev->nodename, "info", "%u",
+			    be->blkif->vbd.type |
+			    (be->blkif->vbd.readonly ? VDISK_READONLY : 0));
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/info",
+				 dev->nodename);
+		goto abort;
+	}
+	err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
+			    (unsigned long)
+			    bdev_logical_block_size(be->blkif->vbd.bdev));
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/sector-size",
+				 dev->nodename);
+		goto abort;
+	}
+	err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u",
+			    bdev_physical_block_size(be->blkif->vbd.bdev));
+	if (err)	/* reported, but the transaction still goes ahead */
+		xenbus_dev_error(dev, err, "writing %s/physical-sector-size",
+				 dev->nodename);
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == -EAGAIN)
+		goto again;	/* restart the whole transaction */
+	if (err)
+		xenbus_dev_fatal(dev, err, "ending transaction");
+
+	err = xenbus_switch_state(dev, XenbusStateConnected);
+	if (err)
+		xenbus_dev_fatal(dev, err, "%s: switching to Connected state",
+				 dev->nodename);
+
+	return;
+ abort:
+	xenbus_transaction_end(xbt, 1);
+}
+
+
+/*
+ * Read the frontend's ring-ref, event-channel, protocol and feature-persistent
+ * nodes and map the shared ring.  Returns 0, or an error code on failure.
+ */
+static int connect_ring(struct backend_info *be)
+{
+	struct xenbus_device *dev = be->dev;
+	unsigned long ring_ref;
+	unsigned int evtchn, pers_grants;
+	char protocol[64] = "";
+	int err;
+
+	DPRINTK("%s", dev->otherend);
+
+	err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
+			    &ring_ref, "event-channel", "%u", &evtchn, NULL);
+	if (err) {
+		xenbus_dev_fatal(dev, err,
+				 "reading %s/ring-ref and event-channel",
+				 dev->otherend);
+		return err;
+	}
+
+	be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+	err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
+			    "%63s", protocol, NULL);
+	if (err)
+		strcpy(protocol, "unspecified, assuming native");
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
+		be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
+		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
+		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+	else {
+		/* err is 0 on this path, so report an explicit error code. */
+		xenbus_dev_fatal(dev, -ENOSYS, "unknown fe protocol %s", protocol);
+		return -ENOSYS;
+	}
+	err = xenbus_gather(XBT_NIL, dev->otherend, "feature-persistent",
+			    "%u", &pers_grants, NULL);
+	if (err)
+		pers_grants = 0;
+
+	be->blkif->vbd.feature_gnt_persistent = pers_grants;
+	be->blkif->vbd.overflow_max_grants = 0;
+
+	pr_info(DRV_PFX "ring-ref %lu, event-channel %u, protocol %d (%s) %s\n",
+		ring_ref, evtchn, be->blkif->blk_protocol, protocol,
+		pers_grants ? "persistent grants" : "");
+
+	/* Map the shared frame, irq etc. */
+	err = xen_blkif_map(be->blkif, ring_ref, evtchn);
+	if (err)
+		xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
+				 ring_ref, evtchn);
+	return err;
+}
+
+
+/* ** Driver Registration ** */
+
+/* xenbus device types this backend driver is registered for. */
+static const struct xenbus_device_id xen_blkbk_ids[] = {
+	{ "vbd" },
+	{ "" }
+};
+
+/* Driver descriptor: probe/remove plus the frontend state-change callback. */
+static DEFINE_XENBUS_DRIVER(xen_blkbk, ,
+	.probe = xen_blkbk_probe,
+	.remove = xen_blkbk_remove,
+	.otherend_changed = frontend_changed
+);
+
+/* Register the backend driver; returns xenbus_register_backend()'s result. */
+int xen_blkif_xenbus_init(void)
+{
+	return xenbus_register_backend(&xen_blkbk_driver);
+}
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index c1996829d5e..5deb235bd18 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -40,12 +40,18 @@
 #include <linux/hdreg.h>
 #include <linux/cdrom.h>
 #include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
 #include <linux/scatterlist.h>
+#include <linux/bitmap.h>
+#include <linux/list.h>
 
+#include <xen/xen.h>
 #include <xen/xenbus.h>
 #include <xen/grant_table.h>
 #include <xen/events.h>
 #include <xen/page.h>
+#include <xen/platform_pci.h>
 
 #include <xen/interface/grant_table.h>
 #include <xen/interface/io/blkif.h>
@@ -59,15 +65,40 @@ enum blkif_state {
 	BLKIF_STATE_SUSPENDED,
 };
 
+struct grant {
+	grant_ref_t gref;	/* GRANT_INVALID_REF until a reference is claimed */
+	unsigned long pfn;	/* frame number of the granted page */
+	struct list_head node;	/* link on blkfront_info.grants */
+};
+
 struct blk_shadow {
 	struct blkif_request req;
-	unsigned long request;
-	unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct request *request;
+	struct grant **grants_used;
+	struct grant **indirect_grants;
+	struct scatterlist *sg;
+};
+
+struct split_bio {
+	struct bio *bio;
+	atomic_t pending;
+	int err;
 };
 
-static struct block_device_operations xlvbd_block_fops;
+static DEFINE_MUTEX(blkfront_mutex);
+static const struct block_device_operations xlvbd_block_fops;
 
-#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
+/*
+ * Maximum number of segments in indirect requests, the actual value used by
+ * the frontend driver is the minimum of this value and the value provided
+ * by the backend driver.  Exposed read-only as the unsigned "max" parameter.
+ */
+
+static unsigned int xen_blkif_max_segments = 32;
+module_param_named(max, xen_blkif_max_segments, uint, S_IRUGO);
+MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
+
+#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
 
 /*
  * We have one of these per vbd, whether ide, scsi or 'other'.  They
@@ -76,6 +107,8 @@ static struct block_device_operations xlvbd_block_fops;
  */
 struct blkfront_info
 {
+	spinlock_t io_lock;
+	struct mutex mutex;
 	struct xenbus_device *xbdev;
 	struct gendisk *gd;
 	int vdevice;
@@ -83,24 +116,29 @@ struct blkfront_info
 	enum blkif_state connected;
 	int ring_ref;
 	struct blkif_front_ring ring;
-	struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	unsigned int evtchn, irq;
 	struct request_queue *rq;
 	struct work_struct work;
 	struct gnttab_free_callback callback;
 	struct blk_shadow shadow[BLK_RING_SIZE];
+	struct list_head grants;
+	struct list_head indirect_pages;
+	unsigned int persistent_gnts_c;
 	unsigned long shadow_free;
-	int feature_barrier;
+	unsigned int feature_flush;
+	unsigned int flush_op;
+	unsigned int feature_discard:1;
+	unsigned int feature_secdiscard:1;
+	unsigned int discard_granularity;
+	unsigned int discard_alignment;
+	unsigned int feature_persistent:1;
+	unsigned int max_indirect_segments;
 	int is_ready;
-
-	/**
-	 * The number of people holding this device open.  We won't allow a
-	 * hot-unplug unless this is 0.
-	 */
-	int users;
 };
 
-static DEFINE_SPINLOCK(blkif_io_lock);
+static unsigned int nr_minors;
+static unsigned long *minors;
+static DEFINE_SPINLOCK(minor_lock);
 
 #define MAXIMUM_OUTSTANDING_BLOCK_REQS \
 	(BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
@@ -116,24 +154,175 @@ static DEFINE_SPINLOCK(blkif_io_lock);
 #define EXTENDED (1<<EXT_SHIFT)
 #define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
 #define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
+#define EMULATED_HD_DISK_MINOR_OFFSET (0)
+#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
+#define EMULATED_SD_DISK_MINOR_OFFSET (0)
+#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_SD_DISK_MINOR_OFFSET / 256)
 
 #define DEV_NAME	"xvd"	/* name in /dev */
 
+#define SEGS_PER_INDIRECT_FRAME \
+	(PAGE_SIZE/sizeof(struct blkif_request_segment))
+#define INDIRECT_GREFS(_segs) \
+	((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
+
+static int blkfront_setup_indirect(struct blkfront_info *info);
+
 static int get_id_from_freelist(struct blkfront_info *info)
 {
 	unsigned long free = info->shadow_free;
 	BUG_ON(free >= BLK_RING_SIZE);
-	info->shadow_free = info->shadow[free].req.id;
-	info->shadow[free].req.id = 0x0fffffee; /* debug */
+	info->shadow_free = info->shadow[free].req.u.rw.id;
+	info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
 	return free;
 }
 
-static void add_id_to_freelist(struct blkfront_info *info,
+static int add_id_to_freelist(struct blkfront_info *info,
 			       unsigned long id)
 {
-	info->shadow[id].req.id  = info->shadow_free;
-	info->shadow[id].request = 0;
+	if (info->shadow[id].req.u.rw.id != id)
+		return -EINVAL;
+	if (info->shadow[id].request == NULL)
+		return -EINVAL;
+	info->shadow[id].req.u.rw.id  = info->shadow_free;
+	info->shadow[id].request = NULL;
 	info->shadow_free = id;
+	return 0;
+}
+
+/* Add @num entries to info->grants, each with its own page if persistent. */
+static int fill_grant_buffer(struct blkfront_info *info, int num)
+{
+	struct grant *gnt, *tmp;
+	struct page *page;
+	int added;
+
+	for (added = 0; added < num; added++) {
+		gnt = kzalloc(sizeof(struct grant), GFP_NOIO);
+		if (gnt == NULL)
+			goto out_of_memory;
+
+		if (info->feature_persistent) {
+			page = alloc_page(GFP_NOIO);
+			if (page == NULL) {
+				kfree(gnt);
+				goto out_of_memory;
+			}
+			gnt->pfn = page_to_pfn(page);
+		}
+
+		gnt->gref = GRANT_INVALID_REF;
+		list_add(&gnt->node, &info->grants);
+	}
+
+	return 0;
+
+out_of_memory:
+	/* Free everything on info->grants; BUG unless it is what we added. */
+	list_for_each_entry_safe(gnt, tmp, &info->grants, node) {
+		list_del(&gnt->node);
+		if (info->feature_persistent)
+			__free_page(pfn_to_page(gnt->pfn));
+		kfree(gnt);
+		added--;
+	}
+	BUG_ON(added != 0);
+	return -ENOMEM;
+}
+
+/* Pop an entry from info->grants; grant its page if it has no gref yet. */
+static struct grant *get_grant(grant_ref_t *gref_head,
+                               unsigned long pfn,
+                               struct blkfront_info *info)
+{
+	struct grant *gnt;
+
+	BUG_ON(list_empty(&info->grants));
+	gnt = list_first_entry(&info->grants, struct grant, node);
+	list_del(&gnt->node);
+
+	/* An entry that already carries a gref is handed back as it is. */
+	if (gnt->gref != GRANT_INVALID_REF) {
+		info->persistent_gnts_c--;
+		return gnt;
+	}
+
+	/* Otherwise claim a fresh gref and grant the other end access. */
+	gnt->gref = gnttab_claim_grant_reference(gref_head);
+	BUG_ON(gnt->gref == -ENOSPC);
+	if (!info->feature_persistent) {
+		/* Without persistent grants the caller supplies the page. */
+		BUG_ON(!pfn);
+		gnt->pfn = pfn;
+	}
+	gnttab_grant_foreign_access_ref(gnt->gref,
+	                                info->xbdev->otherend_id,
+	                                pfn_to_mfn(gnt->pfn), 0);
+	return gnt;
+}
+
+/* Return a printable name for the BLKIF_OP_* operation code @op. */
+static const char *op_name(int op)
+{
+	static const char *const names[] = {
+		[BLKIF_OP_READ] = "read",
+		[BLKIF_OP_WRITE] = "write",
+		[BLKIF_OP_WRITE_BARRIER] = "barrier",
+		[BLKIF_OP_FLUSH_DISKCACHE] = "flush",
+		[BLKIF_OP_DISCARD] = "discard" };
+	const char *name;
+
+	/* Codes outside the table are "unknown"; gaps inside it "reserved". */
+	if (op < 0 || op >= ARRAY_SIZE(names))
+		return "unknown";
+	name = names[op];
+	return name ? name : "reserved";
+}
+/* Reserve minors [minor, minor + nr); returns 0, -EBUSY or -ENOMEM. */
+static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
+{
+	unsigned int end = minor + nr;
+	int rc = -EBUSY;
+
+	if (end > nr_minors) {
+		unsigned long *grown, *stale;
+
+		grown = kcalloc(BITS_TO_LONGS(end), sizeof(*grown),
+				GFP_KERNEL);
+		if (!grown)
+			return -ENOMEM;
+
+		/* nr_minors is re-checked under the lock before swapping. */
+		spin_lock(&minor_lock);
+		stale = grown;
+		if (end > nr_minors) {
+			stale = minors;
+			memcpy(grown, minors,
+			       BITS_TO_LONGS(nr_minors) * sizeof(*grown));
+			minors = grown;
+			nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
+		}
+		spin_unlock(&minor_lock);
+		kfree(stale);
+	}
+
+	spin_lock(&minor_lock);
+	if (find_next_bit(minors, end, minor) >= end) {
+		bitmap_set(minors, minor, nr);
+		rc = 0;
+	}
+	spin_unlock(&minor_lock);
+	return rc;
+}
+
+static void xlbd_release_minors(unsigned int minor, unsigned int nr)
+{
+	unsigned int end = minor + nr;
+
+	BUG_ON(end > nr_minors);	/* range must lie inside the bitmap */
+	spin_lock(&minor_lock);
+	bitmap_clear(minors,  minor, nr);	/* clear bits [minor, minor + nr) */
+	spin_unlock(&minor_lock);
 }
 
 static void blkif_restart_queue_callback(void *arg)
@@ -192,76 +381,183 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
 }
 
 /*
- * blkif_queue_request
- *
- * request block io
+ * Generate a Xen blkfront IO request from a blk layer request.  Reads
+ * and writes are handled as expected.
  *
- * id: for guest use only.
- * operation: BLKIF_OP_{READ,WRITE,PROBE}
- * buffer: buffer to read/write into. this should be a
- *   virtual address in the guest os.
+ * @req: a request struct
  */
 static int blkif_queue_request(struct request *req)
 {
 	struct blkfront_info *info = req->rq_disk->private_data;
-	unsigned long buffer_mfn;
 	struct blkif_request *ring_req;
 	unsigned long id;
 	unsigned int fsect, lsect;
-	int i, ref;
+	int i, ref, n;
+	struct blkif_request_segment *segments = NULL;
+
+	/*
+	 * Used to store if we are able to queue the request by just using
+	 * existing persistent grants, or if we have to get new grants,
+	 * as there are not sufficiently many free.
+	 */
+	bool new_persistent_gnts;
 	grant_ref_t gref_head;
+	struct grant *gnt_list_entry = NULL;
 	struct scatterlist *sg;
+	int nseg, max_grefs;
 
 	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
 		return 1;
 
-	if (gnttab_alloc_grant_references(
-		BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
-		gnttab_request_free_callback(
-			&info->callback,
-			blkif_restart_queue_callback,
-			info,
-			BLKIF_MAX_SEGMENTS_PER_REQUEST);
-		return 1;
-	}
+	max_grefs = req->nr_phys_segments;
+	if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
+		/*
+		 * If we are using indirect segments we need to account
+		 * for the indirect grefs used in the request.
+		 */
+		max_grefs += INDIRECT_GREFS(req->nr_phys_segments);
+
+	/* Check if we have enough grants to allocate a request */
+	if (info->persistent_gnts_c < max_grefs) {
+		new_persistent_gnts = 1;
+		if (gnttab_alloc_grant_references(
+		    max_grefs - info->persistent_gnts_c,
+		    &gref_head) < 0) {
+			gnttab_request_free_callback(
+				&info->callback,
+				blkif_restart_queue_callback,
+				info,
+				max_grefs);
+			return 1;
+		}
+	} else
+		new_persistent_gnts = 0;
 
 	/* Fill out a communications ring structure. */
 	ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
 	id = get_id_from_freelist(info);
-	info->shadow[id].request = (unsigned long)req;
-
-	ring_req->id = id;
-	ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req);
-	ring_req->handle = info->handle;
-
-	ring_req->operation = rq_data_dir(req) ?
-		BLKIF_OP_WRITE : BLKIF_OP_READ;
-	if (blk_barrier_rq(req))
-		ring_req->operation = BLKIF_OP_WRITE_BARRIER;
-
-	ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
-	BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
-
-	for_each_sg(info->sg, sg, ring_req->nr_segments, i) {
-		buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg)));
-		fsect = sg->offset >> 9;
-		lsect = fsect + (sg->length >> 9) - 1;
-		/* install a grant reference. */
-		ref = gnttab_claim_grant_reference(&gref_head);
-		BUG_ON(ref == -ENOSPC);
-
-		gnttab_grant_foreign_access_ref(
-				ref,
-				info->xbdev->otherend_id,
-				buffer_mfn,
-				rq_data_dir(req) );
-
-		info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
-		ring_req->seg[i] =
-				(struct blkif_request_segment) {
-					.gref       = ref,
-					.first_sect = fsect,
-					.last_sect  = lsect };
+	info->shadow[id].request = req;
+
+	if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
+		ring_req->operation = BLKIF_OP_DISCARD;
+		ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
+		ring_req->u.discard.id = id;
+		ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
+		if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
+			ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
+		else
+			ring_req->u.discard.flag = 0;
+	} else {
+		BUG_ON(info->max_indirect_segments == 0 &&
+		       req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
+		BUG_ON(info->max_indirect_segments &&
+		       req->nr_phys_segments > info->max_indirect_segments);
+		nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
+		ring_req->u.rw.id = id;
+		if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+			/*
+			 * The indirect operation can only be a BLKIF_OP_READ or
+			 * BLKIF_OP_WRITE
+			 */
+			BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA));
+			ring_req->operation = BLKIF_OP_INDIRECT;
+			ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
+				BLKIF_OP_WRITE : BLKIF_OP_READ;
+			ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
+			ring_req->u.indirect.handle = info->handle;
+			ring_req->u.indirect.nr_segments = nseg;
+		} else {
+			ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
+			ring_req->u.rw.handle = info->handle;
+			ring_req->operation = rq_data_dir(req) ?
+				BLKIF_OP_WRITE : BLKIF_OP_READ;
+			if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
+				/*
+				 * Ideally we can do an unordered flush-to-disk. In case the
+				 * backend only supports barriers, use that. A barrier request
+				 * is a superset of FUA, so we can implement it the same
+				 * way.  (It's also a FLUSH+FUA, since it is
+				 * guaranteed ordered WRT previous writes.)
+				 */
+				ring_req->operation = info->flush_op;
+			}
+			ring_req->u.rw.nr_segments = nseg;
+		}
+		for_each_sg(info->shadow[id].sg, sg, nseg, i) {
+			fsect = sg->offset >> 9;
+			lsect = fsect + (sg->length >> 9) - 1;
+
+			if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
+			    (i % SEGS_PER_INDIRECT_FRAME == 0)) {
+				unsigned long uninitialized_var(pfn);
+
+				if (segments)
+					kunmap_atomic(segments);
+
+				n = i / SEGS_PER_INDIRECT_FRAME;
+				if (!info->feature_persistent) {
+					struct page *indirect_page;
+
+					/* Fetch a pre-allocated page to use for indirect grefs */
+					BUG_ON(list_empty(&info->indirect_pages));
+					indirect_page = list_first_entry(&info->indirect_pages,
+					                                 struct page, lru);
+					list_del(&indirect_page->lru);
+					pfn = page_to_pfn(indirect_page);
+				}
+				gnt_list_entry = get_grant(&gref_head, pfn, info);
+				info->shadow[id].indirect_grants[n] = gnt_list_entry;
+				segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
+				ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
+			}
+
+			gnt_list_entry = get_grant(&gref_head, page_to_pfn(sg_page(sg)), info);
+			ref = gnt_list_entry->gref;
+
+			info->shadow[id].grants_used[i] = gnt_list_entry;
+
+			if (rq_data_dir(req) && info->feature_persistent) {
+				char *bvec_data;
+				void *shared_data;
+
+				BUG_ON(sg->offset + sg->length > PAGE_SIZE);
+
+				shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
+				bvec_data = kmap_atomic(sg_page(sg));
+
+				/*
+				 * this does not wipe data stored outside the
+				 * range sg->offset..sg->offset+sg->length.
+				 * Therefore, blkback *could* see data from
+				 * previous requests. This is OK as long as
+				 * persistent grants are shared with just one
+				 * domain. It may need refactoring if this
+				 * changes
+				 */
+				memcpy(shared_data + sg->offset,
+				       bvec_data   + sg->offset,
+				       sg->length);
+
+				kunmap_atomic(bvec_data);
+				kunmap_atomic(shared_data);
+			}
+			if (ring_req->operation != BLKIF_OP_INDIRECT) {
+				ring_req->u.rw.seg[i] =
+						(struct blkif_request_segment) {
+							.gref       = ref,
+							.first_sect = fsect,
+							.last_sect  = lsect };
+			} else {
+				n = i % SEGS_PER_INDIRECT_FRAME;
+				segments[n] =
+					(struct blkif_request_segment) {
+							.gref       = ref,
+							.first_sect = fsect,
+							.last_sect  = lsect };
+			}
+		}
+		if (segments)
+			kunmap_atomic(segments);
 	}
 
 	info->ring.req_prod_pvt++;
@@ -269,7 +565,8 @@ static int blkif_queue_request(struct request *req)
 	/* Keep a private copy so we can reissue requests when recovering. */
 	info->shadow[id].req = *ring_req;
 
-	gnttab_free_grant_references(gref_head);
+	if (new_persistent_gnts)
+		gnttab_free_grant_references(gref_head);
 
 	return 0;
 }
@@ -307,16 +604,18 @@ static void do_blkif_request(struct request_queue *rq)
 
 		blk_start_request(req);
 
-		if (!blk_fs_request(req)) {
+		if ((req->cmd_type != REQ_TYPE_FS) ||
+		    ((req->cmd_flags & (REQ_FLUSH | REQ_FUA)) &&
+		    !info->flush_op)) {
 			__blk_end_request_all(req, -EIO);
 			continue;
 		}
 
 		pr_debug("do_blk_req %p: cmd %p, sec %lx, "
-			 "(%u/%u) buffer:%p [%s]\n",
+			 "(%u/%u) [%s]\n",
 			 req, req->cmd, (unsigned long)blk_rq_pos(req),
 			 blk_rq_cur_sectors(req), blk_rq_sectors(req),
-			 req->buffer, rq_data_dir(req) ? "write" : "read");
+			 rq_data_dir(req) ? "write" : "read");
 
 		if (blkif_queue_request(req)) {
 			blk_requeue_request(rq, req);
@@ -333,27 +632,39 @@ wait:
 		flush_requests(info);
 }
 
-static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
+static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
+				unsigned int physical_sector_size,
+				unsigned int segments)
 {
 	struct request_queue *rq;
+	struct blkfront_info *info = gd->private_data;
 
-	rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
+	rq = blk_init_queue(do_blkif_request, &info->io_lock);
 	if (rq == NULL)
 		return -1;
 
 	queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
 
+	if (info->feature_discard) {
+		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, rq);
+		blk_queue_max_discard_sectors(rq, get_capacity(gd));
+		rq->limits.discard_granularity = info->discard_granularity;
+		rq->limits.discard_alignment = info->discard_alignment;
+		if (info->feature_secdiscard)
+			queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq);
+	}
+
 	/* Hard sector size and max sectors impersonate the equiv. hardware. */
 	blk_queue_logical_block_size(rq, sector_size);
-	blk_queue_max_sectors(rq, 512);
+	blk_queue_physical_block_size(rq, physical_sector_size);
+	blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512);
 
 	/* Each segment in a request is up to an aligned page in size. */
 	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
 	blk_queue_max_segment_size(rq, PAGE_SIZE);
 
 	/* Ensure a merged request will fit in a single I/O ring slot. */
-	blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-	blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+	blk_queue_max_segments(rq, segments);
 
 	/* Make sure buffer addresses are sector-aligned. */
 	blk_queue_dma_alignment(rq, 511);
@@ -367,34 +678,101 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 }
 
 
-static int xlvbd_barrier(struct blkfront_info *info)
+static void xlvbd_flush(struct blkfront_info *info)
 {
-	int err;
-
-	err = blk_queue_ordered(info->rq,
-				info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE,
-				NULL);
-
-	if (err)
-		return err;
-
-	printk(KERN_INFO "blkfront: %s: barriers %s\n",
+	blk_queue_flush(info->rq, info->feature_flush);
+	printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n",
 	       info->gd->disk_name,
-	       info->feature_barrier ? "enabled" : "disabled");
+	       info->flush_op == BLKIF_OP_WRITE_BARRIER ?
+		"barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
+		"flush diskcache" : "barrier or flush"),
+	       info->feature_flush ? "enabled;" : "disabled;",
+	       "persistent grants:",
+	       info->feature_persistent ? "enabled;" : "disabled;",
+	       "indirect descriptors:",
+	       info->max_indirect_segments ? "enabled;" : "disabled;");
+}
+
+static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
+{
+	int major;	/* major number encoded in vdevice */
+	major = BLKIF_MAJOR(vdevice);
+	*minor = BLKIF_MINOR(vdevice);
+	switch (major) {
+		case XEN_IDE0_MAJOR:	/* 64 minors per disk on the IDE majors */
+			*offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
+			*minor = ((*minor / 64) * PARTS_PER_DISK) +
+				EMULATED_HD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_IDE1_MAJOR:	/* disk index continues at 2 */
+			*offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
+			*minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
+				EMULATED_HD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_SCSI_DISK0_MAJOR:
+			*offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
+			*minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_SCSI_DISK1_MAJOR:	/* each later SCSI major adds 16 disks */
+		case XEN_SCSI_DISK2_MAJOR:
+		case XEN_SCSI_DISK3_MAJOR:
+		case XEN_SCSI_DISK4_MAJOR:
+		case XEN_SCSI_DISK5_MAJOR:
+		case XEN_SCSI_DISK6_MAJOR:
+		case XEN_SCSI_DISK7_MAJOR:
+			*offset = (*minor / PARTS_PER_DISK) + 
+				((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
+				EMULATED_SD_DISK_NAME_OFFSET;
+			*minor = *minor +
+				((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
+				EMULATED_SD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_SCSI_DISK8_MAJOR:	/* same scheme, group index from 8 */
+		case XEN_SCSI_DISK9_MAJOR:
+		case XEN_SCSI_DISK10_MAJOR:
+		case XEN_SCSI_DISK11_MAJOR:
+		case XEN_SCSI_DISK12_MAJOR:
+		case XEN_SCSI_DISK13_MAJOR:
+		case XEN_SCSI_DISK14_MAJOR:
+		case XEN_SCSI_DISK15_MAJOR:
+			*offset = (*minor / PARTS_PER_DISK) + 
+				((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
+				EMULATED_SD_DISK_NAME_OFFSET;
+			*minor = *minor +
+				((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
+				EMULATED_SD_DISK_MINOR_OFFSET;
+			break;
+		case XENVBD_MAJOR:	/* minor is used unchanged */
+			*offset = *minor / PARTS_PER_DISK;
+			break;
+		default:
+			printk(KERN_WARNING "blkfront: your disk configuration is "
+					"incorrect, please use an xvd device instead\n");
+			return -ENODEV;
+	}
 	return 0;
 }
 
+/* Write @n as letters (a..z, aa, ab, ...) at @ptr; return the end pointer. */
+static char *encode_disk_name(char *ptr, unsigned int n)
+{
+	char *end = n < 26 ? ptr : encode_disk_name(ptr, n / 26 - 1);
+	*end++ = 'a' + n % 26;
+	return end;
+}
 
 static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 			       struct blkfront_info *info,
-			       u16 vdisk_info, u16 sector_size)
+			       u16 vdisk_info, u16 sector_size,
+			       unsigned int physical_sector_size)
 {
 	struct gendisk *gd;
 	int nr_minors = 1;
-	int err = -ENODEV;
+	int err;
 	unsigned int offset;
 	int minor;
 	int nr_parts;
+	char *ptr;
 
 	BUG_ON(info->gd != NULL);
 	BUG_ON(info->rq != NULL);
@@ -406,39 +784,45 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 	}
 
 	if (!VDEV_IS_EXTENDED(info->vdevice)) {
-		minor = BLKIF_MINOR(info->vdevice);
-		nr_parts = PARTS_PER_DISK;
+		err = xen_translate_vdev(info->vdevice, &minor, &offset);
+		if (err)
+			return err;		
+ 		nr_parts = PARTS_PER_DISK;
 	} else {
 		minor = BLKIF_MINOR_EXT(info->vdevice);
 		nr_parts = PARTS_PER_EXT_DISK;
+		offset = minor / nr_parts;
+		if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + 4)
+			printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
+					"emulated IDE disks,\n\t choose an xvd device name"
+					"from xvde on\n", info->vdevice);
+	}
+	if (minor >> MINORBITS) {
+		pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n",
+			info->vdevice, minor);
+		return -ENODEV;
 	}
 
 	if ((minor % nr_parts) == 0)
 		nr_minors = nr_parts;
 
-	gd = alloc_disk(nr_minors);
-	if (gd == NULL)
+	err = xlbd_reserve_minors(minor, nr_minors);
+	if (err)
 		goto out;
+	err = -ENODEV;
 
-	offset = minor / nr_parts;
+	gd = alloc_disk(nr_minors);
+	if (gd == NULL)
+		goto release;
 
-	if (nr_minors > 1) {
-		if (offset < 26)
-			sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
-		else
-			sprintf(gd->disk_name, "%s%c%c", DEV_NAME,
-				'a' + ((offset / 26)-1), 'a' + (offset % 26));
-	} else {
-		if (offset < 26)
-			sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
-				'a' + offset,
-				minor & (nr_parts - 1));
-		else
-			sprintf(gd->disk_name, "%s%c%c%d", DEV_NAME,
-				'a' + ((offset / 26) - 1),
-				'a' + (offset % 26),
-				minor & (nr_parts - 1));
-	}
+	strcpy(gd->disk_name, DEV_NAME);
+	ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
+	BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN);
+	if (nr_minors > 1)
+		*ptr = 0;
+	else
+		snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr,
+			 "%d", minor & (nr_parts - 1));
 
 	gd->major = XENVBD_MAJOR;
 	gd->first_minor = minor;
@@ -447,16 +831,17 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 	gd->driverfs_dev = &(info->xbdev->dev);
 	set_capacity(gd, capacity);
 
-	if (xlvbd_init_blk_queue(gd, sector_size)) {
+	if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size,
+				 info->max_indirect_segments ? :
+				 BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
 		del_gendisk(gd);
-		goto out;
+		goto release;
 	}
 
 	info->rq = gd->queue;
 	info->gd = gd;
 
-	if (info->feature_barrier)
-		xlvbd_barrier(info);
+	xlvbd_flush(info);
 
 	if (vdisk_info & VDISK_READONLY)
 		set_disk_ro(gd, 1);
@@ -469,10 +854,45 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 
 	return 0;
 
+ release:
+	xlbd_release_minors(minor, nr_minors);
  out:
 	return err;
 }
 
+static void xlvbd_release_gendisk(struct blkfront_info *info)
+{
+	unsigned int minor, nr_minors;
+	unsigned long flags;
+
+	if (info->rq == NULL)
+		return;
+
+	spin_lock_irqsave(&info->io_lock, flags);
+
+	/* No more blkif_request(). */
+	blk_stop_queue(info->rq);
+
+	/* No more gnttab callback work. */
+	gnttab_cancel_free_callback(&info->callback);
+	spin_unlock_irqrestore(&info->io_lock, flags);
+
+	/* Flush gnttab callback work. Must be done with no locks held. */
+	flush_work(&info->work);
+
+	del_gendisk(info->gd);
+
+	minor = info->gd->first_minor;
+	nr_minors = info->gd->minors;
+	xlbd_release_minors(minor, nr_minors);
+
+	blk_cleanup_queue(info->rq);
+	info->rq = NULL;
+
+	put_disk(info->gd);
+	info->gd = NULL;
+}
+
 static void kick_pending_request_queues(struct blkfront_info *info)
 {
 	if (!RING_FULL(&info->ring)) {
@@ -487,27 +907,105 @@ static void blkif_restart_queue(struct work_struct *work)
 {
 	struct blkfront_info *info = container_of(work, struct blkfront_info, work);
 
-	spin_lock_irq(&blkif_io_lock);
+	spin_lock_irq(&info->io_lock);
 	if (info->connected == BLKIF_STATE_CONNECTED)
 		kick_pending_request_queues(info);
-	spin_unlock_irq(&blkif_io_lock);
+	spin_unlock_irq(&info->io_lock);
 }
 
 static void blkif_free(struct blkfront_info *info, int suspend)
 {
+	struct grant *persistent_gnt;
+	struct grant *n;
+	int i, j, segs;
+
 	/* Prevent new requests being issued until we fix things up. */
-	spin_lock_irq(&blkif_io_lock);
+	spin_lock_irq(&info->io_lock);
 	info->connected = suspend ?
 		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
 	/* No more blkif_request(). */
 	if (info->rq)
 		blk_stop_queue(info->rq);
+
+	/* Remove all persistent grants */
+	if (!list_empty(&info->grants)) {
+		list_for_each_entry_safe(persistent_gnt, n,
+		                         &info->grants, node) {
+			list_del(&persistent_gnt->node);
+			if (persistent_gnt->gref != GRANT_INVALID_REF) {
+				gnttab_end_foreign_access(persistent_gnt->gref,
+				                          0, 0UL);
+				info->persistent_gnts_c--;
+			}
+			if (info->feature_persistent)
+				__free_page(pfn_to_page(persistent_gnt->pfn));
+			kfree(persistent_gnt);
+		}
+	}
+	BUG_ON(info->persistent_gnts_c != 0);
+
+	/*
+	 * Remove indirect pages, this only happens when using indirect
+	 * descriptors but not persistent grants
+	 */
+	if (!list_empty(&info->indirect_pages)) {
+		struct page *indirect_page, *n;
+
+		BUG_ON(info->feature_persistent);
+		list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
+			list_del(&indirect_page->lru);
+			__free_page(indirect_page);
+		}
+	}
+
+	for (i = 0; i < BLK_RING_SIZE; i++) {
+		/*
+		 * Clear persistent grants present in requests already
+		 * on the shared ring
+		 */
+		if (!info->shadow[i].request)
+			goto free_shadow;
+
+		segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
+		       info->shadow[i].req.u.indirect.nr_segments :
+		       info->shadow[i].req.u.rw.nr_segments;
+		for (j = 0; j < segs; j++) {
+			persistent_gnt = info->shadow[i].grants_used[j];
+			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
+			if (info->feature_persistent)
+				__free_page(pfn_to_page(persistent_gnt->pfn));
+			kfree(persistent_gnt);
+		}
+
+		if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
+			/*
+			 * If this is not an indirect operation don't try to
+			 * free indirect segments
+			 */
+			goto free_shadow;
+
+		for (j = 0; j < INDIRECT_GREFS(segs); j++) {
+			persistent_gnt = info->shadow[i].indirect_grants[j];
+			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
+			__free_page(pfn_to_page(persistent_gnt->pfn));
+			kfree(persistent_gnt);
+		}
+
+free_shadow:
+		kfree(info->shadow[i].grants_used);
+		info->shadow[i].grants_used = NULL;
+		kfree(info->shadow[i].indirect_grants);
+		info->shadow[i].indirect_grants = NULL;
+		kfree(info->shadow[i].sg);
+		info->shadow[i].sg = NULL;
+	}
+
 	/* No more gnttab callback work. */
 	gnttab_cancel_free_callback(&info->callback);
-	spin_unlock_irq(&blkif_io_lock);
+	spin_unlock_irq(&info->io_lock);
 
 	/* Flush gnttab callback work. Must be done with no locks held. */
-	flush_scheduled_work();
+	flush_work(&info->work);
 
 	/* Free resources associated with old device channel. */
 	if (info->ring_ref != GRANT_INVALID_REF) {
@@ -522,11 +1020,86 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 
 }
 
-static void blkif_completion(struct blk_shadow *s)
+static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
+			     struct blkif_response *bret)
 {
-	int i;
-	for (i = 0; i < s->req.nr_segments; i++)
-		gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
+	int i = 0;
+	struct scatterlist *sg;
+	char *bvec_data;
+	void *shared_data;
+	int nseg;
+
+	nseg = s->req.operation == BLKIF_OP_INDIRECT ?
+		s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
+
+	if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
+		/*
+		 * Copy the data received from the backend into the bvec.
+		 * Since bv_offset can be different than 0, and bv_len different
+		 * than PAGE_SIZE, we have to keep track of the current offset,
+		 * to be sure we are copying the data from the right shared page.
+		 */
+		for_each_sg(s->sg, sg, nseg, i) {
+			BUG_ON(sg->offset + sg->length > PAGE_SIZE);
+			shared_data = kmap_atomic(
+				pfn_to_page(s->grants_used[i]->pfn));
+			bvec_data = kmap_atomic(sg_page(sg));
+			memcpy(bvec_data   + sg->offset,
+			       shared_data + sg->offset,
+			       sg->length);
+			kunmap_atomic(bvec_data);
+			kunmap_atomic(shared_data);
+		}
+	}
+	/* Add the persistent grant into the list of free grants */
+	for (i = 0; i < nseg; i++) {
+		if (gnttab_query_foreign_access(s->grants_used[i]->gref)) {
+			/*
+			 * If the grant is still mapped by the backend (the
+			 * backend has chosen to make this grant persistent)
+			 * we add it at the head of the list, so it will be
+			 * reused first.
+			 */
+			if (!info->feature_persistent)
+				pr_alert_ratelimited("backed has not unmapped grant: %u\n",
+						     s->grants_used[i]->gref);
+			list_add(&s->grants_used[i]->node, &info->grants);
+			info->persistent_gnts_c++;
+		} else {
+			/*
+			 * If the grant is not mapped by the backend we end the
+			 * foreign access and add it to the tail of the list,
+			 * so it will not be picked again unless we run out of
+			 * persistent grants.
+			 */
+			gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL);
+			s->grants_used[i]->gref = GRANT_INVALID_REF;
+			list_add_tail(&s->grants_used[i]->node, &info->grants);
+		}
+	}
+	if (s->req.operation == BLKIF_OP_INDIRECT) {
+		for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
+			if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) {
+				if (!info->feature_persistent)
+					pr_alert_ratelimited("backed has not unmapped grant: %u\n",
+							     s->indirect_grants[i]->gref);
+				list_add(&s->indirect_grants[i]->node, &info->grants);
+				info->persistent_gnts_c++;
+			} else {
+				struct page *indirect_page;
+
+				gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL);
+				/*
+				 * Add the used indirect page back to the list of
+				 * available pages for indirect grefs.
+				 */
+				indirect_page = pfn_to_page(s->indirect_grants[i]->pfn);
+				list_add(&indirect_page->lru, &info->indirect_pages);
+				s->indirect_grants[i]->gref = GRANT_INVALID_REF;
+				list_add_tail(&s->indirect_grants[i]->node, &info->grants);
+			}
+		}
+	}
 }
 
 static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -538,10 +1111,10 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 	struct blkfront_info *info = (struct blkfront_info *)dev_id;
 	int error;
 
-	spin_lock_irqsave(&blkif_io_lock, flags);
+	spin_lock_irqsave(&info->io_lock, flags);
 
 	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
-		spin_unlock_irqrestore(&blkif_io_lock, flags);
+		spin_unlock_irqrestore(&info->io_lock, flags);
 		return IRQ_HANDLED;
 	}
 
@@ -554,21 +1127,63 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 
 		bret = RING_GET_RESPONSE(&info->ring, i);
 		id   = bret->id;
-		req  = (struct request *)info->shadow[id].request;
+		/*
+		 * The backend has messed up and given us an id that we would
+		 * never have given to it (we stamp it up to BLK_RING_SIZE -
+		 * look in get_id_from_freelist).
+		 */
+		if (id >= BLK_RING_SIZE) {
+			WARN(1, "%s: response to %s has incorrect id (%ld)\n",
+			     info->gd->disk_name, op_name(bret->operation), id);
+			/* We can't safely get the 'struct request' as
+			 * the id is busted. */
+			continue;
+		}
+		req  = info->shadow[id].request;
 
-		blkif_completion(&info->shadow[id]);
+		if (bret->operation != BLKIF_OP_DISCARD)
+			blkif_completion(&info->shadow[id], info, bret);
 
-		add_id_to_freelist(info, id);
+		if (add_id_to_freelist(info, id)) {
+			WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
+			     info->gd->disk_name, op_name(bret->operation), id);
+			continue;
+		}
 
 		error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
 		switch (bret->operation) {
+		case BLKIF_OP_DISCARD:
+			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
+				struct request_queue *rq = info->rq;
+				printk(KERN_WARNING "blkfront: %s: %s op failed\n",
+					   info->gd->disk_name, op_name(bret->operation));
+				error = -EOPNOTSUPP;
+				info->feature_discard = 0;
+				info->feature_secdiscard = 0;
+				queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
+				queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
+			}
+			__blk_end_request_all(req, error);
+			break;
+		case BLKIF_OP_FLUSH_DISKCACHE:
 		case BLKIF_OP_WRITE_BARRIER:
 			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
-				printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
-				       info->gd->disk_name);
+				printk(KERN_WARNING "blkfront: %s: %s op failed\n",
+				       info->gd->disk_name, op_name(bret->operation));
+				error = -EOPNOTSUPP;
+			}
+			if (unlikely(bret->status == BLKIF_RSP_ERROR &&
+				     info->shadow[id].req.u.rw.nr_segments == 0)) {
+				printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
+				       info->gd->disk_name, op_name(bret->operation));
 				error = -EOPNOTSUPP;
-				info->feature_barrier = 0;
-				xlvbd_barrier(info);
+			}
+			if (unlikely(error)) {
+				if (error == -EOPNOTSUPP)
+					error = 0;
+				info->feature_flush = 0;
+				info->flush_op = 0;
+				xlvbd_flush(info);
 			}
 			/* fall through */
 		case BLKIF_OP_READ:
@@ -596,7 +1211,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 
 	kick_pending_request_queues(info);
 
-	spin_unlock_irqrestore(&blkif_io_lock, flags);
+	spin_unlock_irqrestore(&info->io_lock, flags);
 
 	return IRQ_HANDLED;
 }
@@ -618,8 +1233,6 @@ static int setup_blkring(struct xenbus_device *dev,
 	SHARED_RING_INIT(sring);
 	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
 
-	sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-
 	err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
 	if (err < 0) {
 		free_page((unsigned long)sring);
@@ -632,9 +1245,8 @@ static int setup_blkring(struct xenbus_device *dev,
 	if (err)
 		goto fail;
 
-	err = bind_evtchn_to_irqhandler(info->evtchn,
-					blkif_interrupt,
-					IRQF_SAMPLE_RANDOM, "blkif", info);
+	err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0,
+					"blkif", info);
 	if (err <= 0) {
 		xenbus_dev_fatal(dev, err,
 				 "bind_evtchn_to_irqhandler failed");
@@ -650,7 +1262,7 @@ fail:
 
 
 /* Common code used when first setting up, and when resuming. */
-static int talk_to_backend(struct xenbus_device *dev,
+static int talk_to_blkback(struct xenbus_device *dev,
 			   struct blkfront_info *info)
 {
 	const char *message = NULL;
@@ -687,6 +1299,11 @@ again:
 		message = "writing protocol";
 		goto abort_transaction;
 	}
+	err = xenbus_printf(xbt, dev->nodename,
+			    "feature-persistent", "%u", 1);
+	if (err)
+		dev_warn(&dev->dev,
+			 "writing persistent grants feature to xenbus");
 
 	err = xenbus_transaction_end(xbt, 0);
 	if (err) {
@@ -710,7 +1327,6 @@ again:
 	return err;
 }
 
-
 /**
  * Entry point to this code when a new device is created.  Allocate the basic
  * structures and the ring buffer for communication with the backend, and
@@ -736,101 +1352,218 @@ static int blkfront_probe(struct xenbus_device *dev,
 		}
 	}
 
+	if (xen_hvm_domain()) {
+		char *type;
+		int len;
+		/* no unplug has been done: do not hook devices != xen vbds */
+		if (xen_has_pv_and_legacy_disk_devices()) {
+			int major;
+
+			if (!VDEV_IS_EXTENDED(vdevice))
+				major = BLKIF_MAJOR(vdevice);
+			else
+				major = XENVBD_MAJOR;
+
+			if (major != XENVBD_MAJOR) {
+				printk(KERN_INFO
+						"%s: HVM does not support vbd %d as xen block device\n",
+						__FUNCTION__, vdevice);
+				return -ENODEV;
+			}
+		}
+		/* do not create a PV cdrom device if we are an HVM guest */
+		type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len);
+		if (IS_ERR(type))
+			return -ENODEV;
+		if (strncmp(type, "cdrom", 5) == 0) {
+			kfree(type);
+			return -ENODEV;
+		}
+		kfree(type);
+	}
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info) {
 		xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
 		return -ENOMEM;
 	}
 
+	mutex_init(&info->mutex);
+	spin_lock_init(&info->io_lock);
 	info->xbdev = dev;
 	info->vdevice = vdevice;
+	INIT_LIST_HEAD(&info->grants);
+	INIT_LIST_HEAD(&info->indirect_pages);
+	info->persistent_gnts_c = 0;
 	info->connected = BLKIF_STATE_DISCONNECTED;
 	INIT_WORK(&info->work, blkif_restart_queue);
 
 	for (i = 0; i < BLK_RING_SIZE; i++)
-		info->shadow[i].req.id = i+1;
-	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+		info->shadow[i].req.u.rw.id = i+1;
+	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
 
 	/* Front end dir is a number, which is used as the id. */
 	info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
-	dev->dev.driver_data = info;
+	dev_set_drvdata(&dev->dev, info);
 
-	err = talk_to_backend(dev, info);
+	err = talk_to_blkback(dev, info);
 	if (err) {
 		kfree(info);
-		dev->dev.driver_data = NULL;
+		dev_set_drvdata(&dev->dev, NULL);
 		return err;
 	}
 
 	return 0;
 }
 
+static void split_bio_end(struct bio *bio, int error)
+{
+	struct split_bio *split_bio = bio->bi_private;
+
+	if (error)
+		split_bio->err = error;
+
+	if (atomic_dec_and_test(&split_bio->pending)) {
+		split_bio->bio->bi_phys_segments = 0;
+		bio_endio(split_bio->bio, split_bio->err);
+		kfree(split_bio);
+	}
+	bio_put(bio);
+}
 
 static int blkif_recover(struct blkfront_info *info)
 {
 	int i;
-	struct blkif_request *req;
+	struct request *req, *n;
 	struct blk_shadow *copy;
-	int j;
+	int rc;
+	struct bio *bio, *cloned_bio;
+	struct bio_list bio_list, merge_bio;
+	unsigned int segs, offset;
+	int pending, size;
+	struct split_bio *split_bio;
+	struct list_head requests;
 
 	/* Stage 1: Make a safe copy of the shadow state. */
-	copy = kmalloc(sizeof(info->shadow),
+	copy = kmemdup(info->shadow, sizeof(info->shadow),
 		       GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
 	if (!copy)
 		return -ENOMEM;
-	memcpy(copy, info->shadow, sizeof(info->shadow));
 
 	/* Stage 2: Set up free list. */
 	memset(&info->shadow, 0, sizeof(info->shadow));
 	for (i = 0; i < BLK_RING_SIZE; i++)
-		info->shadow[i].req.id = i+1;
+		info->shadow[i].req.u.rw.id = i+1;
 	info->shadow_free = info->ring.req_prod_pvt;
-	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
+
+	rc = blkfront_setup_indirect(info);
+	if (rc) {
+		kfree(copy);
+		return rc;
+	}
 
-	/* Stage 3: Find pending requests and requeue them. */
+	segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	blk_queue_max_segments(info->rq, segs);
+	bio_list_init(&bio_list);
+	INIT_LIST_HEAD(&requests);
 	for (i = 0; i < BLK_RING_SIZE; i++) {
 		/* Not in use? */
-		if (copy[i].request == 0)
+		if (!copy[i].request)
 			continue;
 
-		/* Grab a request slot and copy shadow state into it. */
-		req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
-		*req = copy[i].req;
-
-		/* We get a new request id, and must reset the shadow state. */
-		req->id = get_id_from_freelist(info);
-		memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
-
-		/* Rewrite any grant references invalidated by susp/resume. */
-		for (j = 0; j < req->nr_segments; j++)
-			gnttab_grant_foreign_access_ref(
-				req->seg[j].gref,
-				info->xbdev->otherend_id,
-				pfn_to_mfn(info->shadow[req->id].frame[j]),
-				rq_data_dir(
-					(struct request *)
-					info->shadow[req->id].request));
-		info->shadow[req->id].req = *req;
-
-		info->ring.req_prod_pvt++;
+		/*
+		 * Get the bios in the request so we can re-queue them.
+		 */
+		if (copy[i].request->cmd_flags &
+		    (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+			/*
+			 * Flush operations don't contain bios, so
+			 * we need to requeue the whole request
+			 */
+			list_add(&copy[i].request->queuelist, &requests);
+			continue;
+		}
+		merge_bio.head = copy[i].request->bio;
+		merge_bio.tail = copy[i].request->biotail;
+		bio_list_merge(&bio_list, &merge_bio);
+		copy[i].request->bio = NULL;
+		blk_put_request(copy[i].request);
 	}
 
 	kfree(copy);
 
+	/*
+	 * Empty the queue, this is important because we might have
+	 * requests in the queue with more segments than what we
+	 * can handle now.
+	 */
+	spin_lock_irq(&info->io_lock);
+	while ((req = blk_fetch_request(info->rq)) != NULL) {
+		if (req->cmd_flags &
+		    (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+			list_add(&req->queuelist, &requests);
+			continue;
+		}
+		merge_bio.head = req->bio;
+		merge_bio.tail = req->biotail;
+		bio_list_merge(&bio_list, &merge_bio);
+		req->bio = NULL;
+		if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
+			pr_alert("diskcache flush request found!\n");
+		__blk_put_request(info->rq, req);
+	}
+	spin_unlock_irq(&info->io_lock);
+
 	xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
-	spin_lock_irq(&blkif_io_lock);
+	spin_lock_irq(&info->io_lock);
 
 	/* Now safe for us to use the shared ring */
 	info->connected = BLKIF_STATE_CONNECTED;
 
-	/* Send off requeued requests */
-	flush_requests(info);
-
 	/* Kick any other new requests queued since we resumed */
 	kick_pending_request_queues(info);
 
-	spin_unlock_irq(&blkif_io_lock);
+	list_for_each_entry_safe(req, n, &requests, queuelist) {
+		/* Requeue pending requests (flush or discard) */
+		list_del_init(&req->queuelist);
+		BUG_ON(req->nr_phys_segments > segs);
+		blk_requeue_request(info->rq, req);
+	}
+	spin_unlock_irq(&info->io_lock);
+
+	while ((bio = bio_list_pop(&bio_list)) != NULL) {
+		/* Traverse the list of pending bios and re-queue them */
+		if (bio_segments(bio) > segs) {
+			/*
+			 * This bio has more segments than what we can
+			 * handle, we have to split it.
+			 */
+			pending = (bio_segments(bio) + segs - 1) / segs;
+			split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO);
+			BUG_ON(split_bio == NULL);
+			atomic_set(&split_bio->pending, pending);
+			split_bio->bio = bio;
+			for (i = 0; i < pending; i++) {
+				offset = (i * segs * PAGE_SIZE) >> 9;
+				size = min((unsigned int)(segs * PAGE_SIZE) >> 9,
+					   (unsigned int)bio_sectors(bio) - offset);
+				cloned_bio = bio_clone(bio, GFP_NOIO);
+				BUG_ON(cloned_bio == NULL);
+				bio_trim(cloned_bio, offset, size);
+				cloned_bio->bi_private = split_bio;
+				cloned_bio->bi_end_io = split_bio_end;
+				submit_bio(cloned_bio->bi_rw, cloned_bio);
+			}
+			/*
+			 * Now we have to wait for all those smaller bios to
+			 * end, so we can also end the "parent" bio.
+			 */
+			continue;
+		}
+		/* We don't need to split this bio */
+		submit_bio(bio->bi_rw, bio);
+	}
 
 	return 0;
 }
@@ -843,20 +1576,162 @@ static int blkif_recover(struct blkfront_info *info)
  */
 static int blkfront_resume(struct xenbus_device *dev)
 {
-	struct blkfront_info *info = dev->dev.driver_data;
+	struct blkfront_info *info = dev_get_drvdata(&dev->dev);
 	int err;
 
 	dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
 
 	blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
 
-	err = talk_to_backend(dev, info);
-	if (info->connected == BLKIF_STATE_SUSPENDED && !err)
-		err = blkif_recover(info);
+	err = talk_to_blkback(dev, info);
+
+	/*
+	 * We have to wait for the backend to switch to
+	 * connected state, since we want to read which
+	 * features it supports.
+	 */
 
 	return err;
 }
 
+static void
+blkfront_closing(struct blkfront_info *info)
+{
+	struct xenbus_device *xbdev = info->xbdev;
+	struct block_device *bdev = NULL;
+
+	mutex_lock(&info->mutex);
+
+	if (xbdev->state == XenbusStateClosing) {
+		mutex_unlock(&info->mutex);
+		return;
+	}
+
+	if (info->gd)
+		bdev = bdget_disk(info->gd, 0);
+
+	mutex_unlock(&info->mutex);
+
+	if (!bdev) {
+		xenbus_frontend_closed(xbdev);
+		return;
+	}
+
+	mutex_lock(&bdev->bd_mutex);
+
+	if (bdev->bd_openers) {
+		xenbus_dev_error(xbdev, -EBUSY,
+				 "Device in use; refusing to close");
+		xenbus_switch_state(xbdev, XenbusStateClosing);
+	} else {
+		xlvbd_release_gendisk(info);
+		xenbus_frontend_closed(xbdev);
+	}
+
+	mutex_unlock(&bdev->bd_mutex);
+	bdput(bdev);
+}
+
+static void blkfront_setup_discard(struct blkfront_info *info)
+{
+	int err;
+	unsigned int discard_granularity;
+	unsigned int discard_alignment;
+	unsigned int discard_secure;
+
+	info->feature_discard = 1;
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+		"discard-granularity", "%u", &discard_granularity,
+		"discard-alignment", "%u", &discard_alignment,
+		NULL);
+	if (!err) {
+		info->discard_granularity = discard_granularity;
+		info->discard_alignment = discard_alignment;
+	}
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+		    "discard-secure", "%d", &discard_secure,
+		    NULL);
+	if (!err)
+		info->feature_secdiscard = !!discard_secure;
+}
+
+static int blkfront_setup_indirect(struct blkfront_info *info)
+{
+	unsigned int indirect_segments, segs;
+	int err, i;
+
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "feature-max-indirect-segments", "%u", &indirect_segments,
+			    NULL);
+	if (err) {
+		info->max_indirect_segments = 0;
+		segs = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	} else {
+		info->max_indirect_segments = min(indirect_segments,
+						  xen_blkif_max_segments);
+		segs = info->max_indirect_segments;
+	}
+
+	err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE);
+	if (err)
+		goto out_of_memory;
+
+	if (!info->feature_persistent && info->max_indirect_segments) {
+		/*
+		 * We are using indirect descriptors but not persistent
+		 * grants, we need to allocate a set of pages that can be
+		 * used for mapping indirect grefs
+		 */
+		int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE;
+
+		BUG_ON(!list_empty(&info->indirect_pages));
+		for (i = 0; i < num; i++) {
+			struct page *indirect_page = alloc_page(GFP_NOIO);
+			if (!indirect_page)
+				goto out_of_memory;
+			list_add(&indirect_page->lru, &info->indirect_pages);
+		}
+	}
+
+	for (i = 0; i < BLK_RING_SIZE; i++) {
+		info->shadow[i].grants_used = kzalloc(
+			sizeof(info->shadow[i].grants_used[0]) * segs,
+			GFP_NOIO);
+		info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO);
+		if (info->max_indirect_segments)
+			info->shadow[i].indirect_grants = kzalloc(
+				sizeof(info->shadow[i].indirect_grants[0]) *
+				INDIRECT_GREFS(segs),
+				GFP_NOIO);
+		if ((info->shadow[i].grants_used == NULL) ||
+			(info->shadow[i].sg == NULL) ||
+		     (info->max_indirect_segments &&
+		     (info->shadow[i].indirect_grants == NULL)))
+			goto out_of_memory;
+		sg_init_table(info->shadow[i].sg, segs);
+	}
+
+
+	return 0;
+
+out_of_memory:
+	for (i = 0; i < BLK_RING_SIZE; i++) {
+		kfree(info->shadow[i].grants_used);
+		info->shadow[i].grants_used = NULL;
+		kfree(info->shadow[i].sg);
+		info->shadow[i].sg = NULL;
+		kfree(info->shadow[i].indirect_grants);
+		info->shadow[i].indirect_grants = NULL;
+	}
+	if (!list_empty(&info->indirect_pages)) {
+		struct page *indirect_page, *n;
+		list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
+			list_del(&indirect_page->lru);
+			__free_page(indirect_page);
+		}
+	}
+	return -ENOMEM;
+}
 
 /*
  * Invoked when the backend is finally 'ready' (and has told produced
@@ -866,13 +1741,41 @@ static void blkfront_connect(struct blkfront_info *info)
 {
 	unsigned long long sectors;
 	unsigned long sector_size;
+	unsigned int physical_sector_size;
 	unsigned int binfo;
 	int err;
+	int barrier, flush, discard, persistent;
+
+	switch (info->connected) {
+	case BLKIF_STATE_CONNECTED:
+		/*
+		 * Potentially, the back-end may be signalling
+		 * a capacity change; update the capacity.
+		 */
+		err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+				   "sectors", "%Lu", &sectors);
+		if (XENBUS_EXIST_ERR(err))
+			return;
+		printk(KERN_INFO "Setting capacity to %Lu\n",
+		       sectors);
+		set_capacity(info->gd, sectors);
+		revalidate_disk(info->gd);
 
-	if ((info->connected == BLKIF_STATE_CONNECTED) ||
-	    (info->connected == BLKIF_STATE_SUSPENDED) )
+		return;
+	case BLKIF_STATE_SUSPENDED:
+		/*
+		 * If we are recovering from suspension, we need to wait
+		 * for the backend to announce its features before
+		 * reconnecting, at least we need to know if the backend
+		 * supports indirect descriptors, and how many.
+		 */
+		blkif_recover(info);
 		return;
 
+	default:
+		break;
+	}
+
 	dev_dbg(&info->xbdev->dev, "%s:%s.\n",
 		__func__, info->xbdev->otherend);
 
@@ -888,13 +1791,71 @@ static void blkfront_connect(struct blkfront_info *info)
 		return;
 	}
 
+	/*
+	 * physical-sector-size is a newer field, so old backends may not
+	 * provide this. Assume physical sector size to be the same as
+	 * sector_size in that case.
+	 */
+	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+			   "physical-sector-size", "%u", &physical_sector_size);
+	if (err != 1)
+		physical_sector_size = sector_size;
+
+	info->feature_flush = 0;
+	info->flush_op = 0;
+
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "feature-barrier", "%d", &barrier,
+			    NULL);
+
+	/*
+	 * If there's no "feature-barrier" defined, then it means
+	 * we're dealing with a very old backend which writes
+	 * synchronously; nothing to do.
+	 *
+	 * If there are barriers, then we use flush.
+	 */
+	if (!err && barrier) {
+		info->feature_flush = REQ_FLUSH | REQ_FUA;
+		info->flush_op = BLKIF_OP_WRITE_BARRIER;
+	}
+	/*
+	 * And if there is "feature-flush-cache" use that above
+	 * barriers.
+	 */
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "feature-flush-cache", "%d", &flush,
+			    NULL);
+
+	if (!err && flush) {
+		info->feature_flush = REQ_FLUSH;
+		info->flush_op = BLKIF_OP_FLUSH_DISKCACHE;
+	}
+
 	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
-			    "feature-barrier", "%lu", &info->feature_barrier,
+			    "feature-discard", "%d", &discard,
+			    NULL);
+
+	if (!err && discard)
+		blkfront_setup_discard(info);
+
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "feature-persistent", "%u", &persistent,
 			    NULL);
 	if (err)
-		info->feature_barrier = 0;
+		info->feature_persistent = 0;
+	else
+		info->feature_persistent = persistent;
+
+	err = blkfront_setup_indirect(info);
+	if (err) {
+		xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
+				 info->xbdev->otherend);
+		return;
+	}
 
-	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
+	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
+				  physical_sector_size);
 	if (err) {
 		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
 				 info->xbdev->otherend);
@@ -904,10 +1865,10 @@ static void blkfront_connect(struct blkfront_info *info)
 	xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
 	/* Kick pending requests. */
-	spin_lock_irq(&blkif_io_lock);
+	spin_lock_irq(&info->io_lock);
 	info->connected = BLKIF_STATE_CONNECTED;
 	kick_pending_request_queues(info);
-	spin_unlock_irq(&blkif_io_lock);
+	spin_unlock_irq(&info->io_lock);
 
 	add_disk(info->gd);
 
@@ -915,175 +1876,225 @@ static void blkfront_connect(struct blkfront_info *info)
 }
 
 /**
- * Handle the change of state of the backend to Closing.  We must delete our
- * device-layer structures now, to ensure that writes are flushed through to
- * the backend.  Once is this done, we can switch to Closed in
- * acknowledgement.
- */
-static void blkfront_closing(struct xenbus_device *dev)
-{
-	struct blkfront_info *info = dev->dev.driver_data;
-	unsigned long flags;
-
-	dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename);
-
-	if (info->rq == NULL)
-		goto out;
-
-	spin_lock_irqsave(&blkif_io_lock, flags);
-
-	/* No more blkif_request(). */
-	blk_stop_queue(info->rq);
-
-	/* No more gnttab callback work. */
-	gnttab_cancel_free_callback(&info->callback);
-	spin_unlock_irqrestore(&blkif_io_lock, flags);
-
-	/* Flush gnttab callback work. Must be done with no locks held. */
-	flush_scheduled_work();
-
-	blk_cleanup_queue(info->rq);
-	info->rq = NULL;
-
-	del_gendisk(info->gd);
-
- out:
-	xenbus_frontend_closed(dev);
-}
-
-/**
  * Callback received when the backend's state changes.
  */
-static void backend_changed(struct xenbus_device *dev,
+static void blkback_changed(struct xenbus_device *dev,
 			    enum xenbus_state backend_state)
 {
-	struct blkfront_info *info = dev->dev.driver_data;
-	struct block_device *bd;
+	struct blkfront_info *info = dev_get_drvdata(&dev->dev);
 
-	dev_dbg(&dev->dev, "blkfront:backend_changed.\n");
+	dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
 
 	switch (backend_state) {
 	case XenbusStateInitialising:
 	case XenbusStateInitWait:
 	case XenbusStateInitialised:
+	case XenbusStateReconfiguring:
+	case XenbusStateReconfigured:
 	case XenbusStateUnknown:
-	case XenbusStateClosed:
 		break;
 
 	case XenbusStateConnected:
 		blkfront_connect(info);
 		break;
 
-	case XenbusStateClosing:
-		if (info->gd == NULL) {
-			xenbus_frontend_closed(dev);
+	case XenbusStateClosed:
+		if (dev->state == XenbusStateClosed)
 			break;
-		}
-		bd = bdget_disk(info->gd, 0);
-		if (bd == NULL)
-			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
-
-		mutex_lock(&bd->bd_mutex);
-		if (info->users > 0)
-			xenbus_dev_error(dev, -EBUSY,
-					 "Device in use; refusing to close");
-		else
-			blkfront_closing(dev);
-		mutex_unlock(&bd->bd_mutex);
-		bdput(bd);
+		/* Missed the backend's Closing state -- fallthrough */
+	case XenbusStateClosing:
+		blkfront_closing(info);
 		break;
 	}
 }
 
-static int blkfront_remove(struct xenbus_device *dev)
+static int blkfront_remove(struct xenbus_device *xbdev)
 {
-	struct blkfront_info *info = dev->dev.driver_data;
+	struct blkfront_info *info = dev_get_drvdata(&xbdev->dev);
+	struct block_device *bdev = NULL;
+	struct gendisk *disk;
 
-	dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename);
+	dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename);
 
 	blkif_free(info, 0);
 
-	kfree(info);
+	mutex_lock(&info->mutex);
+
+	disk = info->gd;
+	if (disk)
+		bdev = bdget_disk(disk, 0);
+
+	info->xbdev = NULL;
+	mutex_unlock(&info->mutex);
+
+	if (!bdev) {
+		kfree(info);
+		return 0;
+	}
+
+	/*
+	 * The xbdev was removed before we reached the Closed
+	 * state. See if it's safe to remove the disk. If the bdev
+	 * isn't closed yet, we let release take care of it.
+	 */
+
+	mutex_lock(&bdev->bd_mutex);
+	info = disk->private_data;
+
+	dev_warn(disk_to_dev(disk),
+		 "%s was hot-unplugged, %d stale handles\n",
+		 xbdev->nodename, bdev->bd_openers);
+
+	if (info && !bdev->bd_openers) {
+		xlvbd_release_gendisk(info);
+		disk->private_data = NULL;
+		kfree(info);
+	}
+
+	mutex_unlock(&bdev->bd_mutex);
+	bdput(bdev);
 
 	return 0;
 }
 
 static int blkfront_is_ready(struct xenbus_device *dev)
 {
-	struct blkfront_info *info = dev->dev.driver_data;
+	struct blkfront_info *info = dev_get_drvdata(&dev->dev);
 
-	return info->is_ready;
+	return info->is_ready && info->xbdev;
 }
 
 static int blkif_open(struct block_device *bdev, fmode_t mode)
 {
-	struct blkfront_info *info = bdev->bd_disk->private_data;
-	info->users++;
-	return 0;
+	struct gendisk *disk = bdev->bd_disk;
+	struct blkfront_info *info;
+	int err = 0;
+
+	mutex_lock(&blkfront_mutex);
+
+	info = disk->private_data;
+	if (!info) {
+		/* xbdev gone */
+		err = -ERESTARTSYS;
+		goto out;
+	}
+
+	mutex_lock(&info->mutex);
+
+	if (!info->gd)
+		/* xbdev is closed */
+		err = -ERESTARTSYS;
+
+	mutex_unlock(&info->mutex);
+
+out:
+	mutex_unlock(&blkfront_mutex);
+	return err;
 }
 
-static int blkif_release(struct gendisk *disk, fmode_t mode)
+static void blkif_release(struct gendisk *disk, fmode_t mode)
 {
 	struct blkfront_info *info = disk->private_data;
-	info->users--;
-	if (info->users == 0) {
-		/* Check whether we have been instructed to close.  We will
-		   have ignored this request initially, as the device was
-		   still mounted. */
-		struct xenbus_device *dev = info->xbdev;
-		enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
+	struct block_device *bdev;
+	struct xenbus_device *xbdev;
+
+	mutex_lock(&blkfront_mutex);
+
+	bdev = bdget_disk(disk, 0);
 
-		if (state == XenbusStateClosing && info->is_ready)
-			blkfront_closing(dev);
+	if (!bdev) {
+		WARN(1, "Block device %s yanked out from us!\n", disk->disk_name);
+		goto out_mutex;
 	}
-	return 0;
+	if (bdev->bd_openers)
+		goto out;
+
+	/*
+	 * Check if we have been instructed to close. We will have
+	 * deferred this request, because the bdev was still open.
+	 */
+
+	mutex_lock(&info->mutex);
+	xbdev = info->xbdev;
+
+	if (xbdev && xbdev->state == XenbusStateClosing) {
+		/* pending switch to state closed */
+		dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
+		xlvbd_release_gendisk(info);
+		xenbus_frontend_closed(info->xbdev);
+ 	}
+
+	mutex_unlock(&info->mutex);
+
+	if (!xbdev) {
+		/* sudden device removal */
+		dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
+		xlvbd_release_gendisk(info);
+		disk->private_data = NULL;
+		kfree(info);
+	}
+
+out:
+	bdput(bdev);
+out_mutex:
+	mutex_unlock(&blkfront_mutex);
 }
 
-static struct block_device_operations xlvbd_block_fops =
+static const struct block_device_operations xlvbd_block_fops =
 {
 	.owner = THIS_MODULE,
 	.open = blkif_open,
 	.release = blkif_release,
 	.getgeo = blkif_getgeo,
-	.locked_ioctl = blkif_ioctl,
+	.ioctl = blkif_ioctl,
 };
 
 
-static struct xenbus_device_id blkfront_ids[] = {
+static const struct xenbus_device_id blkfront_ids[] = {
 	{ "vbd" },
 	{ "" }
 };
 
-static struct xenbus_driver blkfront = {
-	.name = "vbd",
-	.owner = THIS_MODULE,
-	.ids = blkfront_ids,
+static DEFINE_XENBUS_DRIVER(blkfront, ,
 	.probe = blkfront_probe,
 	.remove = blkfront_remove,
 	.resume = blkfront_resume,
-	.otherend_changed = backend_changed,
+	.otherend_changed = blkback_changed,
 	.is_ready = blkfront_is_ready,
-};
+);
 
 static int __init xlblk_init(void)
 {
+	int ret;
+
 	if (!xen_domain())
 		return -ENODEV;
 
+	if (!xen_has_pv_disk_devices())
+		return -ENODEV;
+
 	if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
 		printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
 		       XENVBD_MAJOR, DEV_NAME);
 		return -ENODEV;
 	}
 
-	return xenbus_register_frontend(&blkfront);
+	ret = xenbus_register_frontend(&blkfront_driver);
+	if (ret) {
+		unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
+		return ret;
+	}
+
+	return 0;
 }
 module_init(xlblk_init);
 
 
 static void __exit xlblk_exit(void)
 {
-	return xenbus_unregister_driver(&blkfront);
+	xenbus_unregister_driver(&blkfront_driver);
+	unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
+	kfree(minors);
 }
 module_exit(xlblk_exit);
 
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index f08491a3a81..ab3ea62e5df 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -89,10 +89,12 @@
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
+#include <linux/mutex.h>
 #include <linux/ata.h>
 #include <linux/hdreg.h>
 #include <linux/platform_device.h>
 #if defined(CONFIG_OF)
+#include <linux/of_address.h>
 #include <linux/of_device.h>
 #include <linux/of_platform.h>
 #endif
@@ -212,6 +214,7 @@ struct ace_device {
 	u16 cf_id[ATA_ID_WORDS];
 };
 
+static DEFINE_MUTEX(xsysace_mutex);
 static int ace_major;
 
 /* ---------------------------------------------------------------------
@@ -390,9 +393,10 @@ static inline void ace_dump_mem(void *base, int len)
 
 static void ace_dump_regs(struct ace_device *ace)
 {
-	dev_info(ace->dev, "    ctrl:  %.8x  seccnt/cmd: %.4x      ver:%.4x\n"
-		 KERN_INFO "    status:%.8x  mpu_lba:%.8x  busmode:%4x\n"
-		 KERN_INFO "    error: %.8x  cfg_lba:%.8x  fatstat:%.4x\n",
+	dev_info(ace->dev,
+		 "    ctrl:  %.8x  seccnt/cmd: %.4x      ver:%.4x\n"
+		 "    status:%.8x  mpu_lba:%.8x  busmode:%4x\n"
+		 "    error: %.8x  cfg_lba:%.8x  fatstat:%.4x\n",
 		 ace_in32(ace, ACE_CTRL),
 		 ace_in(ace, ACE_SECCNTCMD),
 		 ace_in(ace, ACE_VERSION),
@@ -403,7 +407,7 @@ static void ace_dump_regs(struct ace_device *ace)
 		 ace_in32(ace, ACE_CFGLBA), ace_in(ace, ACE_FATSTAT));
 }
 
-void ace_fix_driveid(u16 *id)
+static void ace_fix_driveid(u16 *id)
 {
 #if defined(__BIG_ENDIAN)
 	int i;
@@ -452,19 +456,19 @@ static inline void ace_fsm_yieldirq(struct ace_device *ace)
 {
 	dev_dbg(ace->dev, "ace_fsm_yieldirq()\n");
 
-	if (ace->irq == NO_IRQ)
+	if (!ace->irq)
 		/* No IRQ assigned, so need to poll */
 		tasklet_schedule(&ace->fsm_tasklet);
 	ace->fsm_continue_flag = 0;
 }
 
 /* Get the next read/write request; ending requests that we don't handle */
-struct request *ace_get_next_request(struct request_queue * q)
+static struct request *ace_get_next_request(struct request_queue *q)
 {
 	struct request *req;
 
 	while ((req = blk_peek_request(q)) != NULL) {
-		if (blk_fs_request(req))
+		if (req->cmd_type == REQ_TYPE_FS)
 			break;
 		blk_start_request(req);
 		__blk_end_request_all(req, -EIO);
@@ -617,7 +621,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 		ace_dump_mem(ace->cf_id, 512);	/* Debug: Dump out disk ID */
 
 		if (ace->data_result) {
-			/* Error occured, disable the disk */
+			/* Error occurred, disable the disk */
 			ace->media_change = 1;
 			set_capacity(ace->gd, 0);
 			dev_err(ace->dev, "error fetching CF id (%i)\n",
@@ -657,7 +661,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 			rq_data_dir(req));
 
 		ace->req = req;
-		ace->data_ptr = req->buffer;
+		ace->data_ptr = bio_data(req->bio);
 		ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR;
 		ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF);
 
@@ -729,7 +733,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 			 *      blk_rq_sectors(ace->req),
 			 *      blk_rq_cur_sectors(ace->req));
 			 */
-			ace->data_ptr = ace->req->buffer;
+			ace->data_ptr = bio_data(ace->req->bio);
 			ace->data_count = blk_rq_cur_sectors(ace->req) * 16;
 			ace_fsm_yieldirq(ace);
 			break;
@@ -797,7 +801,7 @@ static int ace_interrupt_checkstate(struct ace_device *ace)
 	u32 sreg = ace_in32(ace, ACE_STATUS);
 	u16 creg = ace_in(ace, ACE_CTRL);
 
-	/* Check for error occurance */
+	/* Check for error occurrence */
 	if ((sreg & (ACE_STATUS_CFGERROR | ACE_STATUS_CFCERROR)) &&
 	    (creg & ACE_CTRL_ERRORIRQ)) {
 		dev_err(ace->dev, "transfer failure\n");
@@ -863,12 +867,12 @@ static void ace_request(struct request_queue * q)
 	}
 }
 
-static int ace_media_changed(struct gendisk *gd)
+static unsigned int ace_check_events(struct gendisk *gd, unsigned int clearing)
 {
 	struct ace_device *ace = gd->private_data;
-	dev_dbg(ace->dev, "ace_media_changed(): %i\n", ace->media_change);
+	dev_dbg(ace->dev, "ace_check_events(): %i\n", ace->media_change);
 
-	return ace->media_change;
+	return ace->media_change ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static int ace_revalidate_disk(struct gendisk *gd)
@@ -900,15 +904,18 @@ static int ace_open(struct block_device *bdev, fmode_t mode)
 
 	dev_dbg(ace->dev, "ace_open() users=%i\n", ace->users + 1);
 
+	mutex_lock(&xsysace_mutex);
 	spin_lock_irqsave(&ace->lock, flags);
 	ace->users++;
 	spin_unlock_irqrestore(&ace->lock, flags);
 
 	check_disk_change(bdev);
+	mutex_unlock(&xsysace_mutex);
+
 	return 0;
 }
 
-static int ace_release(struct gendisk *disk, fmode_t mode)
+static void ace_release(struct gendisk *disk, fmode_t mode)
 {
 	struct ace_device *ace = disk->private_data;
 	unsigned long flags;
@@ -916,6 +923,7 @@ static int ace_release(struct gendisk *disk, fmode_t mode)
 
 	dev_dbg(ace->dev, "ace_release() users=%i\n", ace->users - 1);
 
+	mutex_lock(&xsysace_mutex);
 	spin_lock_irqsave(&ace->lock, flags);
 	ace->users--;
 	if (ace->users == 0) {
@@ -923,7 +931,7 @@ static int ace_release(struct gendisk *disk, fmode_t mode)
 		ace_out(ace, ACE_CTRL, val & ~ACE_CTRL_LOCKREQ);
 	}
 	spin_unlock_irqrestore(&ace->lock, flags);
-	return 0;
+	mutex_unlock(&xsysace_mutex);
 }
 
 static int ace_getgeo(struct block_device *bdev, struct hd_geometry *geo)
@@ -940,11 +948,11 @@ static int ace_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static struct block_device_operations ace_fops = {
+static const struct block_device_operations ace_fops = {
 	.owner = THIS_MODULE,
 	.open = ace_open,
 	.release = ace_release,
-	.media_changed = ace_media_changed,
+	.check_events = ace_check_events,
 	.revalidate_disk = ace_revalidate_disk,
 	.getgeo = ace_getgeo,
 };
@@ -952,7 +960,7 @@ static struct block_device_operations ace_fops = {
 /* --------------------------------------------------------------------
  * SystemACE device setup/teardown code
  */
-static int __devinit ace_setup(struct ace_device *ace)
+static int ace_setup(struct ace_device *ace)
 {
 	u16 version;
 	u16 val;
@@ -1025,12 +1033,12 @@ static int __devinit ace_setup(struct ace_device *ace)
 		ACE_CTRL_DATABUFRDYIRQ | ACE_CTRL_ERRORIRQ);
 
 	/* Now we can hook up the irq handler */
-	if (ace->irq != NO_IRQ) {
+	if (ace->irq) {
 		rc = request_irq(ace->irq, ace_interrupt, 0, "systemace", ace);
 		if (rc) {
 			/* Failure - fall back to polled mode */
 			dev_err(ace->dev, "request_irq failed\n");
-			ace->irq = NO_IRQ;
+			ace->irq = 0;
 		}
 	}
 
@@ -1065,7 +1073,7 @@ err_ioremap:
 	return -ENOMEM;
 }
 
-static void __devexit ace_teardown(struct ace_device *ace)
+static void ace_teardown(struct ace_device *ace)
 {
 	if (ace->gd) {
 		del_gendisk(ace->gd);
@@ -1077,15 +1085,14 @@ static void __devexit ace_teardown(struct ace_device *ace)
 
 	tasklet_kill(&ace->fsm_tasklet);
 
-	if (ace->irq != NO_IRQ)
+	if (ace->irq)
 		free_irq(ace->irq, ace);
 
 	iounmap(ace->baseaddr);
 }
 
-static int __devinit
-ace_alloc(struct device *dev, int id, resource_size_t physaddr,
-	  int irq, int bus_width)
+static int ace_alloc(struct device *dev, int id, resource_size_t physaddr,
+		     int irq, int bus_width)
 {
 	struct ace_device *ace;
 	int rc;
@@ -1126,7 +1133,7 @@ err_noreg:
 	return rc;
 }
 
-static void __devexit ace_free(struct device *dev)
+static void ace_free(struct device *dev)
 {
 	struct ace_device *ace = dev_get_drvdata(dev);
 	dev_dbg(dev, "ace_free(%p)\n", dev);
@@ -1142,16 +1149,22 @@ static void __devexit ace_free(struct device *dev)
  * Platform Bus Support
  */
 
-static int __devinit ace_probe(struct platform_device *dev)
+static int ace_probe(struct platform_device *dev)
 {
 	resource_size_t physaddr = 0;
 	int bus_width = ACE_BUS_WIDTH_16; /* FIXME: should not be hard coded */
-	int id = dev->id;
-	int irq = NO_IRQ;
+	u32 id = dev->id;
+	int irq = 0;
 	int i;
 
 	dev_dbg(&dev->dev, "ace_probe(%p)\n", dev);
 
+	/* device id and bus width */
+	if (of_property_read_u32(dev->dev.of_node, "port-number", &id))
+		id = 0;
+	if (of_find_property(dev->dev.of_node, "8-bit", NULL))
+		bus_width = ACE_BUS_WIDTH_8;
+
 	for (i = 0; i < dev->num_resources; i++) {
 		if (dev->resource[i].flags & IORESOURCE_MEM)
 			physaddr = dev->resource[i].start;
@@ -1159,74 +1172,22 @@ static int __devinit ace_probe(struct platform_device *dev)
 			irq = dev->resource[i].start;
 	}
 
-	/* Call the bus-independant setup code */
+	/* Call the bus-independent setup code */
 	return ace_alloc(&dev->dev, id, physaddr, irq, bus_width);
 }
 
 /*
  * Platform bus remove() method
  */
-static int __devexit ace_remove(struct platform_device *dev)
+static int ace_remove(struct platform_device *dev)
 {
 	ace_free(&dev->dev);
 	return 0;
 }
 
-static struct platform_driver ace_platform_driver = {
-	.probe = ace_probe,
-	.remove = __devexit_p(ace_remove),
-	.driver = {
-		.owner = THIS_MODULE,
-		.name = "xsysace",
-	},
-};
-
-/* ---------------------------------------------------------------------
- * OF_Platform Bus Support
- */
-
 #if defined(CONFIG_OF)
-static int __devinit
-ace_of_probe(struct of_device *op, const struct of_device_id *match)
-{
-	struct resource res;
-	resource_size_t physaddr;
-	const u32 *id;
-	int irq, bus_width, rc;
-
-	dev_dbg(&op->dev, "ace_of_probe(%p, %p)\n", op, match);
-
-	/* device id */
-	id = of_get_property(op->node, "port-number", NULL);
-
-	/* physaddr */
-	rc = of_address_to_resource(op->node, 0, &res);
-	if (rc) {
-		dev_err(&op->dev, "invalid address\n");
-		return rc;
-	}
-	physaddr = res.start;
-
-	/* irq */
-	irq = irq_of_parse_and_map(op->node, 0);
-
-	/* bus width */
-	bus_width = ACE_BUS_WIDTH_16;
-	if (of_find_property(op->node, "8-bit", NULL))
-		bus_width = ACE_BUS_WIDTH_8;
-
-	/* Call the bus-independant setup code */
-	return ace_alloc(&op->dev, id ? *id : 0, physaddr, irq, bus_width);
-}
-
-static int __devexit ace_of_remove(struct of_device *op)
-{
-	ace_free(&op->dev);
-	return 0;
-}
-
 /* Match table for of_platform binding */
-static struct of_device_id ace_of_match[] __devinitdata = {
+static const struct of_device_id ace_of_match[] = {
 	{ .compatible = "xlnx,opb-sysace-1.00.b", },
 	{ .compatible = "xlnx,opb-sysace-1.00.c", },
 	{ .compatible = "xlnx,xps-sysace-1.00.a", },
@@ -1234,35 +1195,20 @@ static struct of_device_id ace_of_match[] __devinitdata = {
 	{},
 };
 MODULE_DEVICE_TABLE(of, ace_of_match);
+#else /* CONFIG_OF */
+#define ace_of_match NULL
+#endif /* CONFIG_OF */
 
-static struct of_platform_driver ace_of_driver = {
-	.owner = THIS_MODULE,
-	.name = "xsysace",
-	.match_table = ace_of_match,
-	.probe = ace_of_probe,
-	.remove = __devexit_p(ace_of_remove),
+static struct platform_driver ace_platform_driver = {
+	.probe = ace_probe,
+	.remove = ace_remove,
 	.driver = {
+		.owner = THIS_MODULE,
 		.name = "xsysace",
+		.of_match_table = ace_of_match,
 	},
 };
 
-/* Registration helpers to keep the number of #ifdefs to a minimum */
-static inline int __init ace_of_register(void)
-{
-	pr_debug("xsysace: registering OF binding\n");
-	return of_register_platform_driver(&ace_of_driver);
-}
-
-static inline void __exit ace_of_unregister(void)
-{
-	of_unregister_platform_driver(&ace_of_driver);
-}
-#else /* CONFIG_OF */
-/* CONFIG_OF not enabled; do nothing helpers */
-static inline int __init ace_of_register(void) { return 0; }
-static inline void __exit ace_of_unregister(void) { }
-#endif /* CONFIG_OF */
-
 /* ---------------------------------------------------------------------
  * Module init/exit routines
  */
@@ -1276,11 +1222,6 @@ static int __init ace_init(void)
 		goto err_blk;
 	}
 
-	rc = ace_of_register();
-	if (rc)
-		goto err_of;
-
-	pr_debug("xsysace: registering platform binding\n");
 	rc = platform_driver_register(&ace_platform_driver);
 	if (rc)
 		goto err_plat;
@@ -1289,21 +1230,17 @@ static int __init ace_init(void)
 	return 0;
 
 err_plat:
-	ace_of_unregister();
-err_of:
 	unregister_blkdev(ace_major, "xsysace");
 err_blk:
 	printk(KERN_ERR "xsysace: registration failed; err=%i\n", rc);
 	return rc;
 }
+module_init(ace_init);
 
 static void __exit ace_exit(void)
 {
 	pr_debug("Unregistering Xilinx SystemACE driver\n");
 	platform_driver_unregister(&ace_platform_driver);
-	ace_of_unregister();
 	unregister_blkdev(ace_major, "xsysace");
 }
-
-module_init(ace_init);
 module_exit(ace_exit);
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index 4575171e5be..968f9e52eff 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -33,6 +33,8 @@
 #include <linux/module.h>
 #include <linux/blkdev.h>
 #include <linux/bitops.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
 
 #include <asm/setup.h>
 #include <asm/amigahw.h>
@@ -41,9 +43,6 @@
 #include <linux/zorro.h>
 
 
-extern int m68k_realnum_memory;
-extern struct mem_info m68k_memory[NUM_MEMINFO];
-
 #define Z2MINOR_COMBINED      (0)
 #define Z2MINOR_Z2ONLY        (1)
 #define Z2MINOR_CHIPONLY      (2)
@@ -55,6 +54,7 @@ extern struct mem_info m68k_memory[NUM_MEMINFO];
 
 #define Z2RAM_CHUNK1024       ( Z2RAM_CHUNKSIZE >> 10 )
 
+static DEFINE_MUTEX(z2ram_mutex);
 static u_long *z2ram_map    = NULL;
 static u_long z2ram_size    = 0;
 static int z2_count         = 0;
@@ -64,7 +64,6 @@ static int current_device   = -1;
 
 static DEFINE_SPINLOCK(z2ram_lock);
 
-static struct block_device_operations z2_fops;
 static struct gendisk *z2ram_gendisk;
 
 static void do_z2_request(struct request_queue *q)
@@ -78,21 +77,25 @@ static void do_z2_request(struct request_queue *q)
 		int err = 0;
 
 		if (start + len > z2ram_size) {
-			printk( KERN_ERR DEVICE_NAME ": bad access: block=%lu, count=%u\n",
-				blk_rq_pos(req), blk_rq_cur_sectors(req));
+			pr_err(DEVICE_NAME ": bad access: block=%llu, "
+			       "count=%u\n",
+			       (unsigned long long)blk_rq_pos(req),
+			       blk_rq_cur_sectors(req));
 			err = -EIO;
 			goto done;
 		}
 		while (len) {
 			unsigned long addr = start & Z2RAM_CHUNKMASK;
 			unsigned long size = Z2RAM_CHUNKSIZE - addr;
+			void *buffer = bio_data(req->bio);
+
 			if (len < size)
 				size = len;
 			addr += z2ram_map[ start >> Z2RAM_CHUNKSHIFT ];
 			if (rq_data_dir(req) == READ)
-				memcpy(req->buffer, (char *)addr, size);
+				memcpy(buffer, (char *)addr, size);
 			else
-				memcpy((char *)addr, req->buffer, size);
+				memcpy((char *)addr, buffer, size);
 			start += size;
 			len -= size;
 		}
@@ -112,8 +115,8 @@ get_z2ram( void )
 	if ( test_bit( i, zorro_unused_z2ram ) )
 	{
 	    z2_count++;
-	    z2ram_map[ z2ram_size++ ] = 
-		ZTWO_VADDR( Z2RAM_START ) + ( i << Z2RAM_CHUNKSHIFT );
+	    z2ram_map[z2ram_size++] = (unsigned long)ZTWO_VADDR(Z2RAM_START) +
+				      (i << Z2RAM_CHUNKSHIFT);
 	    clear_bit( i, zorro_unused_z2ram );
 	}
     }
@@ -153,6 +156,7 @@ static int z2_open(struct block_device *bdev, fmode_t mode)
 
     device = MINOR(bdev->bd_dev);
 
+    mutex_lock(&z2ram_mutex);
     if ( current_device != -1 && current_device != device )
     {
 	rc = -EBUSY;
@@ -294,28 +298,31 @@ static int z2_open(struct block_device *bdev, fmode_t mode)
 	set_capacity(z2ram_gendisk, z2ram_size >> 9);
     }
 
+    mutex_unlock(&z2ram_mutex);
     return 0;
 
 err_out_kfree:
     kfree(z2ram_map);
 err_out:
+    mutex_unlock(&z2ram_mutex);
     return rc;
 }
 
-static int
+static void
 z2_release(struct gendisk *disk, fmode_t mode)
 {
-    if ( current_device == -1 )
-	return 0;     
-
+    mutex_lock(&z2ram_mutex);
+    if ( current_device == -1 ) {
+    	mutex_unlock(&z2ram_mutex);
+    	return;
+    }
+    mutex_unlock(&z2ram_mutex);
     /*
      * FIXME: unmap memory
      */
-
-    return 0;
 }
 
-static struct block_device_operations z2_fops =
+static const struct block_device_operations z2_fops =
 {
 	.owner		= THIS_MODULE,
 	.open		= z2_open,
@@ -374,7 +381,7 @@ err:
 static void __exit z2_exit(void)
 {
     int i, j;
-    blk_unregister_region(MKDEV(Z2RAM_MAJOR, 0), 256);
+    blk_unregister_region(MKDEV(Z2RAM_MAJOR, 0), Z2MINOR_COUNT);
     unregister_blkdev(Z2RAM_MAJOR, DEVICE_NAME);
     del_gendisk(z2ram_gendisk);
     put_disk(z2ram_gendisk);
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
new file mode 100644
index 00000000000..6489c0fd0ea
--- /dev/null
+++ b/drivers/block/zram/Kconfig
@@ -0,0 +1,34 @@
+config ZRAM
+	tristate "Compressed RAM block device support"
+	depends on BLOCK && SYSFS && ZSMALLOC
+	select LZO_COMPRESS
+	select LZO_DECOMPRESS
+	default n
+	help
+	  Creates virtual block devices called /dev/zramX (X = 0, 1, ...).
+	  Pages written to these disks are compressed and stored in memory
+	  itself. These disks allow very fast I/O and compression provides
+	  good amounts of memory savings.
+
+	  It has several use cases, for example: /tmp storage, use as swap
+	  disks and maybe many more.
+
+	  See zram.txt for more information.
+
+config ZRAM_LZ4_COMPRESS
+	bool "Enable LZ4 algorithm support"
+	depends on ZRAM
+	select LZ4_COMPRESS
+	select LZ4_DECOMPRESS
+	default n
+	help
+	  This option enables LZ4 compression algorithm support. Compression
+	  algorithm can be changed using `comp_algorithm' device attribute.
+
+config ZRAM_DEBUG
+	bool "Compressed RAM block device debug support"
+	depends on ZRAM
+	default n
+	help
+	  This option adds additional debugging code to the compressed
+	  RAM block device driver.
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
new file mode 100644
index 00000000000..be0763ff57a
--- /dev/null
+++ b/drivers/block/zram/Makefile
@@ -0,0 +1,5 @@
+zram-y	:=	zcomp_lzo.o zcomp.o zram_drv.o
+
+zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o
+
+obj-$(CONFIG_ZRAM)	+=	zram.o
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
new file mode 100644
index 00000000000..f1ff39a3d1c
--- /dev/null
+++ b/drivers/block/zram/zcomp.c
@@ -0,0 +1,353 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+
+#include "zcomp.h"
+#include "zcomp_lzo.h"
+#ifdef CONFIG_ZRAM_LZ4_COMPRESS
+#include "zcomp_lz4.h"
+#endif
+
+/*
+ * single zcomp_strm backend
+ */
+struct zcomp_strm_single {
+	struct mutex strm_lock;
+	struct zcomp_strm *zstrm;
+};
+
+/*
+ * multi zcomp_strm backend
+ */
+struct zcomp_strm_multi {
+	/* protect strm list */
+	spinlock_t strm_lock;
+	/* max possible number of zstrm streams */
+	int max_strm;
+	/* number of available zstrm streams */
+	int avail_strm;
+	/* list of available strms */
+	struct list_head idle_strm;
+	wait_queue_head_t strm_wait;
+};
+
+static struct zcomp_backend *backends[] = {
+	&zcomp_lzo,
+#ifdef CONFIG_ZRAM_LZ4_COMPRESS
+	&zcomp_lz4,
+#endif
+	NULL
+};
+
+/* look up a backend by name, returns NULL when @compress matches none */
+static struct zcomp_backend *find_backend(const char *compress)
+{
+	struct zcomp_backend **pos;
+
+	for (pos = backends; *pos; pos++)
+		if (sysfs_streq(compress, (*pos)->name))
+			break;
+	return *pos;
+}
+
+static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm)
+{
+	if (zstrm->private)
+		comp->backend->destroy(zstrm->private);
+	free_pages((unsigned long)zstrm->buffer, 1);
+	kfree(zstrm);
+}
+
+/*
+ * allocate a new zcomp_strm: ->private is created by the backend and
+ * ->buffer holds two zeroed pages. returns NULL on error
+ */
+static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp)
+{
+	struct zcomp_strm *strm = kmalloc(sizeof(*strm), GFP_KERNEL);
+	if (!strm)
+		return NULL;
+
+	strm->private = comp->backend->create();
+	/*
+	 * order 1 is 2 pages: 1 for compressed data, plus 1 extra for the
+	 * case when compressed size is larger than the original one
+	 */
+	strm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
+	if (strm->private && strm->buffer)
+		return strm;
+
+	zcomp_strm_free(comp, strm);
+	return NULL;
+}
+
+/*
+ * get idle zcomp_strm, allocate a new one while avail_strm is below
+ * max_strm, or wait until zcomp_strm_multi_release() wakes us up
+ */
+static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp)
+{
+	struct zcomp_strm_multi *zs = comp->stream;
+	struct zcomp_strm *zstrm;
+
+	while (1) {
+		spin_lock(&zs->strm_lock);
+		if (!list_empty(&zs->idle_strm)) {
+			zstrm = list_entry(zs->idle_strm.next,
+					struct zcomp_strm, list);
+			list_del(&zstrm->list);
+			spin_unlock(&zs->strm_lock);
+			return zstrm;
+		}
+		/* zstrm streams limit reached, wait for idle stream */
+		if (zs->avail_strm >= zs->max_strm) {
+			spin_unlock(&zs->strm_lock);
+			wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
+			continue;
+		}
+		/* allocate new zstrm stream */
+		zs->avail_strm++;
+		spin_unlock(&zs->strm_lock);
+
+		zstrm = zcomp_strm_alloc(comp);
+		if (!zstrm) {
+			spin_lock(&zs->strm_lock);
+			zs->avail_strm--;
+			spin_unlock(&zs->strm_lock);
+			wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
+			continue;
+		}
+		break;
+	}
+	return zstrm;
+}
+
+/* free the stream when over the limit, else re-idle it and wake up waiter */
+static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm)
+{
+	struct zcomp_strm_multi *zs = comp->stream;
+
+	spin_lock(&zs->strm_lock);
+	if (zs->avail_strm > zs->max_strm) {
+		zs->avail_strm--;
+		spin_unlock(&zs->strm_lock);
+		zcomp_strm_free(comp, zstrm);
+		return;
+	}
+
+	list_add(&zstrm->list, &zs->idle_strm);
+	spin_unlock(&zs->strm_lock);
+	wake_up(&zs->strm_wait);
+}
+
+/* change max_strm limit; the multi stream backend always returns true */
+static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm)
+{
+	struct zcomp_strm_multi *zs = comp->stream;
+	struct zcomp_strm *zstrm;
+
+	spin_lock(&zs->strm_lock);
+	zs->max_strm = num_strm;
+	/*
+	 * if user has lowered the limit and there are idle streams,
+	 * immediately free as many streams (and memory) as we can.
+	 */
+	while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) {
+		zstrm = list_entry(zs->idle_strm.next,
+				struct zcomp_strm, list);
+		list_del(&zstrm->list);
+		zcomp_strm_free(comp, zstrm);
+		zs->avail_strm--;
+	}
+	spin_unlock(&zs->strm_lock);
+	return true;
+}
+
+/* free every stream on the idle list, then the backend data itself */
+static void zcomp_strm_multi_destroy(struct zcomp *comp)
+{
+	struct zcomp_strm_multi *zs = comp->stream;
+	struct zcomp_strm *zstrm, *next;
+
+	/* _safe variant: entries are unlinked and freed while walking */
+	list_for_each_entry_safe(zstrm, next, &zs->idle_strm, list) {
+		list_del(&zstrm->list);
+		zcomp_strm_free(comp, zstrm);
+	}
+	kfree(zs);
+}
+
+/* set up the multi stream backend; ->stream is assigned only on success */
+static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm)
+{
+	struct zcomp_strm *zstrm;
+	struct zcomp_strm_multi *zs;
+
+	comp->destroy = zcomp_strm_multi_destroy;
+	comp->strm_find = zcomp_strm_multi_find;
+	comp->strm_release = zcomp_strm_multi_release;
+	comp->set_max_streams = zcomp_strm_multi_set_max_streams;
+	zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL);
+	if (!zs)
+		return -ENOMEM;
+
+	spin_lock_init(&zs->strm_lock);
+	INIT_LIST_HEAD(&zs->idle_strm);
+	init_waitqueue_head(&zs->strm_wait);
+	zs->max_strm = max_strm;
+	zs->avail_strm = 1;
+	zstrm = zcomp_strm_alloc(comp);
+	if (!zstrm) {
+		kfree(zs);
+		return -ENOMEM;
+	}
+	list_add(&zstrm->list, &zs->idle_strm);
+	comp->stream = zs;
+	return 0;
+}
+
+static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp)
+{
+	struct zcomp_strm_single *zs = comp->stream;
+	mutex_lock(&zs->strm_lock);
+	return zs->zstrm;
+}
+
+static void zcomp_strm_single_release(struct zcomp *comp,
+		struct zcomp_strm *zstrm)
+{
+	struct zcomp_strm_single *zs = comp->stream;
+	mutex_unlock(&zs->strm_lock);
+}
+
+static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm)
+{
+	/* zcomp_strm_single support only max_comp_streams == 1 */
+	return false;
+}
+
+static void zcomp_strm_single_destroy(struct zcomp *comp)
+{
+	struct zcomp_strm_single *zs = comp->stream;
+	zcomp_strm_free(comp, zs->zstrm);
+	kfree(zs);
+}
+
+/* set up the single stream backend; ->stream is assigned only on success */
+static int zcomp_strm_single_create(struct zcomp *comp)
+{
+	struct zcomp_strm_single *zs;
+
+	comp->destroy = zcomp_strm_single_destroy;
+	comp->strm_find = zcomp_strm_single_find;
+	comp->strm_release = zcomp_strm_single_release;
+	comp->set_max_streams = zcomp_strm_single_set_max_streams;
+	zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL);
+	if (!zs)
+		return -ENOMEM;
+	mutex_init(&zs->strm_lock);
+	zs->zstrm = zcomp_strm_alloc(comp);
+	if (!zs->zstrm) {
+		kfree(zs);
+		return -ENOMEM;
+	}
+	comp->stream = zs;
+	return 0;
+}
+
+/* show available compressors, the one matching @comp in [brackets] */
+ssize_t zcomp_available_show(const char *comp, char *buf)
+{
+	ssize_t len = 0;
+	int idx;
+
+	for (idx = 0; backends[idx]; idx++) {
+		/* the -2 presumably keeps room for the final newline */
+		if (!sysfs_streq(comp, backends[idx]->name))
+			len += scnprintf(buf + len, PAGE_SIZE - len - 2,
+					"%s ", backends[idx]->name);
+		else
+			len += scnprintf(buf + len, PAGE_SIZE - len - 2,
+					"[%s] ", backends[idx]->name);
+	}
+	len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
+	return len;
+}
+
+bool zcomp_set_max_streams(struct zcomp *comp, int num_strm)
+{
+	return comp->set_max_streams(comp, num_strm);
+}
+
+struct zcomp_strm *zcomp_strm_find(struct zcomp *comp)
+{
+	return comp->strm_find(comp);
+}
+
+void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm)
+{
+	comp->strm_release(comp, zstrm);
+}
+
+int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm,
+		const unsigned char *src, size_t *dst_len)
+{
+	return comp->backend->compress(src, zstrm->buffer, dst_len,
+			zstrm->private);
+}
+
+int zcomp_decompress(struct zcomp *comp, const unsigned char *src,
+		size_t src_len, unsigned char *dst)
+{
+	return comp->backend->decompress(src, src_len, dst);
+}
+
+void zcomp_destroy(struct zcomp *comp)
+{
+	comp->destroy(comp);
+	kfree(comp);
+}
+
+/*
+ * find the backend for the requested algorithm, then allocate and set up
+ * a new zcomp. returns the zcomp, ERR_PTR(-EINVAL) if the algorithm is
+ * not supported or ERR_PTR(-ENOMEM) in case of allocation error.
+ */
+struct zcomp *zcomp_create(const char *compress, int max_strm)
+{
+	struct zcomp *comp;
+	struct zcomp_backend *backend;
+	int error;
+
+	backend = find_backend(compress);
+	if (!backend)
+		return ERR_PTR(-EINVAL);
+
+	comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL);
+	if (!comp)
+		return ERR_PTR(-ENOMEM);
+
+	comp->backend = backend;
+	if (max_strm > 1)
+		error = zcomp_strm_multi_create(comp, max_strm);
+	else
+		error = zcomp_strm_single_create(comp);
+	/* rely on the return code, ->stream may be stale after a failure */
+	if (error) {
+		kfree(comp);
+		return ERR_PTR(error);
+	}
+	return comp;
+}
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
new file mode 100644
index 00000000000..c59d1fca72c
--- /dev/null
+++ b/drivers/block/zram/zcomp.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _ZCOMP_H_
+#define _ZCOMP_H_
+
+#include <linux/mutex.h>
+
+struct zcomp_strm {
+	/* compression/decompression buffer */
+	void *buffer;
+	/*
+	 * The private data of the compression stream, only compression
+	 * stream backend can touch this (e.g. compression algorithm
+	 * working memory)
+	 */
+	void *private;
+	/* used in multi stream backend, protected by backend strm_lock */
+	struct list_head list;
+};
+
+/* static compression backend */
+struct zcomp_backend {
+	int (*compress)(const unsigned char *src, unsigned char *dst,
+			size_t *dst_len, void *private);
+
+	int (*decompress)(const unsigned char *src, size_t src_len,
+			unsigned char *dst);
+
+	void *(*create)(void);
+	void (*destroy)(void *private);
+
+	const char *name;
+};
+
+/* dynamic per-device compression frontend */
+struct zcomp {
+	void *stream;
+	struct zcomp_backend *backend;
+
+	struct zcomp_strm *(*strm_find)(struct zcomp *comp);
+	void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm);
+	bool (*set_max_streams)(struct zcomp *comp, int num_strm);
+	void (*destroy)(struct zcomp *comp);
+};
+
+ssize_t zcomp_available_show(const char *comp, char *buf);
+
+struct zcomp *zcomp_create(const char *comp, int max_strm);
+void zcomp_destroy(struct zcomp *comp);
+
+struct zcomp_strm *zcomp_strm_find(struct zcomp *comp);
+void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm);
+
+int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm,
+		const unsigned char *src, size_t *dst_len);
+
+int zcomp_decompress(struct zcomp *comp, const unsigned char *src,
+		size_t src_len, unsigned char *dst);
+
+bool zcomp_set_max_streams(struct zcomp *comp, int num_strm);
+#endif /* _ZCOMP_H_ */
diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c
new file mode 100644
index 00000000000..f2afb7e988c
--- /dev/null
+++ b/drivers/block/zram/zcomp_lz4.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/lz4.h>
+
+#include "zcomp_lz4.h"
+
+static void *zcomp_lz4_create(void)
+{
+	return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL);
+}
+
+static void zcomp_lz4_destroy(void *private)
+{
+	kfree(private);
+}
+
+static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst,
+		size_t *dst_len, void *private)
+{
+	/* lz4_compress() result is passed through; 0 means success */
+	return lz4_compress(src, PAGE_SIZE, dst, dst_len, private);
+}
+
+static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len,
+		unsigned char *dst)
+{
+	size_t dst_len = PAGE_SIZE;
+	/* decompressor result is passed through; 0 means success */
+	return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len);
+}
+
+struct zcomp_backend zcomp_lz4 = {
+	.compress = zcomp_lz4_compress,
+	.decompress = zcomp_lz4_decompress,
+	.create = zcomp_lz4_create,
+	.destroy = zcomp_lz4_destroy,
+	.name = "lz4",
+};
diff --git a/drivers/block/zram/zcomp_lz4.h b/drivers/block/zram/zcomp_lz4.h
new file mode 100644
index 00000000000..60613fb29dd
--- /dev/null
+++ b/drivers/block/zram/zcomp_lz4.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _ZCOMP_LZ4_H_
+#define _ZCOMP_LZ4_H_
+
+#include "zcomp.h"
+
+extern struct zcomp_backend zcomp_lz4;
+
+#endif /* _ZCOMP_LZ4_H_ */
diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c
new file mode 100644
index 00000000000..da1bc47d588
--- /dev/null
+++ b/drivers/block/zram/zcomp_lzo.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/lzo.h>
+
+#include "zcomp_lzo.h"
+
+static void *lzo_create(void)
+{
+	return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+}
+
+static void lzo_destroy(void *private)
+{
+	kfree(private);
+}
+
+static int lzo_compress(const unsigned char *src, unsigned char *dst,
+		size_t *dst_len, void *private)
+{
+	int ret = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private);
+	return ret == LZO_E_OK ? 0 : ret;
+}
+
+static int lzo_decompress(const unsigned char *src, size_t src_len,
+		unsigned char *dst)
+{
+	size_t dst_len = PAGE_SIZE;
+	int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len);
+	return ret == LZO_E_OK ? 0 : ret;
+}
+
+struct zcomp_backend zcomp_lzo = {
+	.compress = lzo_compress,
+	.decompress = lzo_decompress,
+	.create = lzo_create,
+	.destroy = lzo_destroy,
+	.name = "lzo",
+};
diff --git a/drivers/block/zram/zcomp_lzo.h b/drivers/block/zram/zcomp_lzo.h
new file mode 100644
index 00000000000..128c5807fa1
--- /dev/null
+++ b/drivers/block/zram/zcomp_lzo.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _ZCOMP_LZO_H_
+#define _ZCOMP_LZO_H_
+
+#include "zcomp.h"
+
+extern struct zcomp_backend zcomp_lzo;
+
+#endif /* _ZCOMP_LZO_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
new file mode 100644
index 00000000000..36e54be402d
--- /dev/null
+++ b/drivers/block/zram/zram_drv.c
@@ -0,0 +1,1046 @@
+/*
+ * Compressed RAM block device
+ *
+ * Copyright (C) 2008, 2009, 2010  Nitin Gupta
+ *               2012, 2013 Minchan Kim
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ *
+ */
+
+#define KMSG_COMPONENT "zram"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#ifdef CONFIG_ZRAM_DEBUG
+#define DEBUG
+#endif
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/device.h>
+#include <linux/genhd.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+#include <linux/err.h>
+
+#include "zram_drv.h"
+
+/* Globals */
+static int zram_major;
+static struct zram *zram_devices;
+static const char *default_compressor = "lzo";
+
+/* Module params (documentation at end) */
+static unsigned int num_devices = 1;
+
+#define ZRAM_ATTR_RO(name)						\
+static ssize_t zram_attr_##name##_show(struct device *d,		\
+				struct device_attribute *attr, char *b)	\
+{									\
+	struct zram *zram = dev_to_zram(d);				\
+	return scnprintf(b, PAGE_SIZE, "%llu\n",			\
+		(u64)atomic64_read(&zram->stats.name));			\
+}									\
+static struct device_attribute dev_attr_##name =			\
+	__ATTR(name, S_IRUGO, zram_attr_##name##_show, NULL);
+
+static inline int init_done(struct zram *zram)
+{
+	return zram->meta != NULL;
+}
+
+static inline struct zram *dev_to_zram(struct device *dev)
+{
+	return (struct zram *)dev_to_disk(dev)->private_data;
+}
+
+static ssize_t disksize_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
+}
+
+static ssize_t initstate_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	u32 val;
+	struct zram *zram = dev_to_zram(dev);
+
+	down_read(&zram->init_lock);
+	val = init_done(zram);
+	up_read(&zram->init_lock);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", val);
+}
+
+static ssize_t orig_data_size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%llu\n",
+		(u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT);
+}
+
+static ssize_t mem_used_total_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	u64 val = 0;
+	struct zram *zram = dev_to_zram(dev);
+
+	down_read(&zram->init_lock);
+	/* zram->meta may be freed or replaced; read it only under init_lock */
+	if (init_done(zram))
+		val = zs_get_total_size_bytes(zram->meta->mem_pool);
+	up_read(&zram->init_lock);
+
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
+}
+
+static ssize_t max_comp_streams_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	int val;
+	struct zram *zram = dev_to_zram(dev);
+
+	down_read(&zram->init_lock);
+	val = zram->max_comp_streams;
+	up_read(&zram->init_lock);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+
+static ssize_t max_comp_streams_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	int num;
+	struct zram *zram = dev_to_zram(dev);
+	int ret;
+
+	ret = kstrtoint(buf, 0, &num);
+	if (ret < 0)
+		return ret;
+	if (num < 1)
+		return -EINVAL;
+
+	down_write(&zram->init_lock);
+	if (init_done(zram)) {
+		if (!zcomp_set_max_streams(zram->comp, num)) {
+			pr_info("Cannot change max compression streams\n");
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	zram->max_comp_streams = num;
+	ret = len;
+out:
+	up_write(&zram->init_lock);
+	return ret;
+}
+
+static ssize_t comp_algorithm_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	size_t sz;
+	struct zram *zram = dev_to_zram(dev);
+
+	down_read(&zram->init_lock);
+	sz = zcomp_available_show(zram->compressor, buf);
+	up_read(&zram->init_lock);
+
+	return sz;
+}
+
+static ssize_t comp_algorithm_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct zram *zram = dev_to_zram(dev);
+	down_write(&zram->init_lock);
+	if (init_done(zram)) {
+		up_write(&zram->init_lock);
+		pr_info("Can't change algorithm for initialized device\n");
+		return -EBUSY;
+	}
+	strlcpy(zram->compressor, buf, sizeof(zram->compressor));
+	up_write(&zram->init_lock);
+	return len;
+}
+
+/* flag operations needs meta->tb_lock */
+static int zram_test_flag(struct zram_meta *meta, u32 index,
+			enum zram_pageflags flag)
+{
+	return meta->table[index].flags & BIT(flag);
+}
+
+static void zram_set_flag(struct zram_meta *meta, u32 index,
+			enum zram_pageflags flag)
+{
+	meta->table[index].flags |= BIT(flag);
+}
+
+static void zram_clear_flag(struct zram_meta *meta, u32 index,
+			enum zram_pageflags flag)
+{
+	meta->table[index].flags &= ~BIT(flag);
+}
+
+static inline int is_partial_io(struct bio_vec *bvec)
+{
+	return bvec->bv_len != PAGE_SIZE;
+}
+
+/*
+ * Check if request is within bounds and aligned on zram logical blocks.
+ */
+static inline int valid_io_request(struct zram *zram, struct bio *bio)
+{
+	u64 start, end, bound;
+
+	/* start sector or size not aligned to the zram logical block */
+	if (unlikely(bio->bi_iter.bi_sector &
+		     (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
+		return 0;
+	if (unlikely(bio->bi_iter.bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
+		return 0;
+
+	start = bio->bi_iter.bi_sector;
+	end = start + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
+	bound = zram->disksize >> SECTOR_SHIFT;
+	/* out of range, or start + length wrapped around (start > end) */
+	if (unlikely(start >= bound || end > bound || start > end))
+		return 0;
+
+	/* I/O request is valid */
+	return 1;
+}
+
+static void zram_meta_free(struct zram_meta *meta)
+{
+	zs_destroy_pool(meta->mem_pool);
+	vfree(meta->table);
+	kfree(meta);
+}
+
+static struct zram_meta *zram_meta_alloc(u64 disksize)
+{
+	size_t num_pages;
+	struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
+	if (!meta)
+		goto out;
+
+	num_pages = disksize >> PAGE_SHIFT;
+	meta->table = vzalloc(num_pages * sizeof(*meta->table));
+	if (!meta->table) {
+		pr_err("Error allocating zram address table\n");
+		goto free_meta;
+	}
+
+	meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM);
+	if (!meta->mem_pool) {
+		pr_err("Error creating memory pool\n");
+		goto free_table;
+	}
+
+	rwlock_init(&meta->tb_lock);
+	return meta;
+
+free_table:
+	vfree(meta->table);
+free_meta:
+	kfree(meta);
+	meta = NULL;
+out:
+	return meta;
+}
+
+static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
+{
+	if (*offset + bvec->bv_len >= PAGE_SIZE)
+		(*index)++;
+	*offset = (*offset + bvec->bv_len) % PAGE_SIZE;
+}
+
+static int page_zero_filled(void *ptr)
+{
+	unsigned int pos;
+	unsigned long *page;
+
+	page = (unsigned long *)ptr;
+
+	for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
+		if (page[pos])
+			return 0;
+	}
+
+	return 1;
+}
+
+static void handle_zero_page(struct bio_vec *bvec)
+{
+	struct page *page = bvec->bv_page;
+	void *user_mem;
+
+	user_mem = kmap_atomic(page);
+	if (is_partial_io(bvec))
+		memset(user_mem + bvec->bv_offset, 0, bvec->bv_len);
+	else
+		clear_page(user_mem);
+	kunmap_atomic(user_mem);
+
+	flush_dcache_page(page);
+}
+
+/* NOTE: caller should hold meta->tb_lock with write-side */
+static void zram_free_page(struct zram *zram, size_t index)
+{
+	struct zram_meta *meta = zram->meta;
+	unsigned long handle = meta->table[index].handle;
+
+	if (unlikely(!handle)) {
+		/*
+		 * No memory is allocated for zero filled pages.
+		 * Simply clear zero page flag.
+		 */
+		if (zram_test_flag(meta, index, ZRAM_ZERO)) {
+			zram_clear_flag(meta, index, ZRAM_ZERO);
+			atomic64_dec(&zram->stats.zero_pages);
+		}
+		return;
+	}
+
+	zs_free(meta->mem_pool, handle);
+
+	atomic64_sub(meta->table[index].size, &zram->stats.compr_data_size);
+	atomic64_dec(&zram->stats.pages_stored);
+
+	meta->table[index].handle = 0;
+	meta->table[index].size = 0;
+}
+
+static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
+{
+	int ret = 0;
+	unsigned char *cmem;
+	struct zram_meta *meta = zram->meta;
+	unsigned long handle;
+	u16 size;
+
+	read_lock(&meta->tb_lock);
+	handle = meta->table[index].handle;
+	size = meta->table[index].size;
+
+	if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
+		read_unlock(&meta->tb_lock);
+		clear_page(mem);
+		return 0;
+	}
+
+	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
+	if (size == PAGE_SIZE)
+		copy_page(mem, cmem);
+	else
+		ret = zcomp_decompress(zram->comp, cmem, size, mem);
+	zs_unmap_object(meta->mem_pool, handle);
+	read_unlock(&meta->tb_lock);
+
+	/* Should NEVER happen. Return bio error if it does. */
+	if (unlikely(ret)) {
+		pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
+		atomic64_inc(&zram->stats.failed_reads);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
+			  u32 index, int offset, struct bio *bio)
+{
+	int ret;
+	struct page *page;
+	unsigned char *user_mem, *uncmem = NULL;
+	struct zram_meta *meta = zram->meta;
+	page = bvec->bv_page;
+
+	read_lock(&meta->tb_lock);
+	if (unlikely(!meta->table[index].handle) ||
+			zram_test_flag(meta, index, ZRAM_ZERO)) {
+		read_unlock(&meta->tb_lock);
+		handle_zero_page(bvec);
+		return 0;
+	}
+	read_unlock(&meta->tb_lock);
+
+	if (is_partial_io(bvec))
+		/* Use  a temporary buffer to decompress the page */
+		uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
+
+	user_mem = kmap_atomic(page);
+	if (!is_partial_io(bvec))
+		uncmem = user_mem;
+
+	if (!uncmem) {
+		pr_info("Unable to allocate temp memory\n");
+		ret = -ENOMEM;
+		goto out_cleanup;
+	}
+
+	ret = zram_decompress_page(zram, uncmem, index);
+	/* Should NEVER happen. Return bio error if it does. */
+	if (unlikely(ret))
+		goto out_cleanup;
+
+	if (is_partial_io(bvec))
+		memcpy(user_mem + bvec->bv_offset, uncmem + offset,
+				bvec->bv_len);
+
+	flush_dcache_page(page);
+	ret = 0;
+out_cleanup:
+	kunmap_atomic(user_mem);
+	if (is_partial_io(bvec))
+		kfree(uncmem);
+	return ret;
+}
+
+static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
+			   int offset)
+{
+	int ret = 0;
+	size_t clen;
+	unsigned long handle;
+	struct page *page;
+	unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
+	struct zram_meta *meta = zram->meta;
+	struct zcomp_strm *zstrm;
+	bool locked = false;
+
+	page = bvec->bv_page;
+	if (is_partial_io(bvec)) {
+		/*
+		 * This is a partial IO. We need to read the full page
+		 * before writing the changes.
+		 */
+		uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
+		if (!uncmem) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = zram_decompress_page(zram, uncmem, index);
+		if (ret)
+			goto out;
+	}
+
+	zstrm = zcomp_strm_find(zram->comp);
+	locked = true;
+	user_mem = kmap_atomic(page);
+
+	if (is_partial_io(bvec)) {
+		memcpy(uncmem + offset, user_mem + bvec->bv_offset,
+		       bvec->bv_len);
+		kunmap_atomic(user_mem);
+		user_mem = NULL;
+	} else {
+		uncmem = user_mem;
+	}
+
+	if (page_zero_filled(uncmem)) {
+		if (user_mem)	/* already unmapped (NULL) for partial IO */
+			kunmap_atomic(user_mem);
+		/* Free memory associated with this sector now. */
+		write_lock(&zram->meta->tb_lock);
+		zram_free_page(zram, index);
+		zram_set_flag(meta, index, ZRAM_ZERO);
+		write_unlock(&zram->meta->tb_lock);
+
+		atomic64_inc(&zram->stats.zero_pages);
+		goto out;
+	}
+
+	ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen);
+	if (!is_partial_io(bvec)) {
+		kunmap_atomic(user_mem);
+		user_mem = NULL;
+		uncmem = NULL;
+	}
+
+	if (unlikely(ret)) {
+		pr_err("Compression failed! err=%d\n", ret);
+		goto out;
+	}
+	src = zstrm->buffer;
+	if (unlikely(clen > max_zpage_size)) {
+		clen = PAGE_SIZE;
+		if (is_partial_io(bvec))
+			src = uncmem;
+	}
+
+	handle = zs_malloc(meta->mem_pool, clen);
+	if (!handle) {
+		pr_info("Error allocating memory for compressed page: %u, size=%zu\n",
+			index, clen);
+		ret = -ENOMEM;
+		goto out;
+	}
+	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
+
+	if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
+		src = kmap_atomic(page);
+		copy_page(cmem, src);
+		kunmap_atomic(src);
+	} else {
+		memcpy(cmem, src, clen);
+	}
+
+	zcomp_strm_release(zram->comp, zstrm);
+	locked = false;
+	zs_unmap_object(meta->mem_pool, handle);
+
+	/*
+	 * Free memory associated with this sector
+	 * before overwriting unused sectors.
+	 */
+	write_lock(&zram->meta->tb_lock);
+	zram_free_page(zram, index);
+
+	meta->table[index].handle = handle;
+	meta->table[index].size = clen;
+	write_unlock(&zram->meta->tb_lock);
+
+	/* Update stats */
+	atomic64_add(clen, &zram->stats.compr_data_size);
+	atomic64_inc(&zram->stats.pages_stored);
+out:
+	if (locked)
+		zcomp_strm_release(zram->comp, zstrm);
+	if (is_partial_io(bvec))
+		kfree(uncmem);
+	if (ret)
+		atomic64_inc(&zram->stats.failed_writes);
+	return ret;
+}
+
+static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
+			int offset, struct bio *bio)
+{
+	int ret;
+	int rw = bio_data_dir(bio);
+
+	if (rw == READ) {
+		atomic64_inc(&zram->stats.num_reads);
+		ret = zram_bvec_read(zram, bvec, index, offset, bio);
+	} else {
+		atomic64_inc(&zram->stats.num_writes);
+		ret = zram_bvec_write(zram, bvec, index, offset);
+	}
+
+	return ret;
+}
+
+/*
+ * zram_bio_discard - handler on discard request
+ * @index: physical block index in PAGE_SIZE units
+ * @offset: byte offset within physical block
+ */
+static void zram_bio_discard(struct zram *zram, u32 index,
+			     int offset, struct bio *bio)
+{
+	size_t n = bio->bi_iter.bi_size;
+
+	/*
+	 * zram manages data in physical block size units. Because logical block
+	 * size isn't identical with physical block size on some arch, we
+	 * could get a discard request pointing to a specific offset within a
+	 * certain physical block.  Although we can handle this request by
+	 * reading that physical block and decompressing and partially zeroing
+	 * and re-compressing and then re-storing it, this isn't reasonable
+	 * because our intent with a discard request is to save memory.  So
+	 * skipping this logical block is appropriate here.
+	 */
+	if (offset) {
+		if (n <= (PAGE_SIZE - offset))
+			return;
+
+		n -= (PAGE_SIZE - offset);
+		index++;
+	}
+
+	while (n >= PAGE_SIZE) {
+		/*
+		 * Discard request can be large so the lock hold times could be
+		 * lengthy.  So take the lock once per page.
+		 */
+		write_lock(&zram->meta->tb_lock);
+		zram_free_page(zram, index);
+		write_unlock(&zram->meta->tb_lock);
+		index++;
+		n -= PAGE_SIZE;
+	}
+}
+
+static void zram_reset_device(struct zram *zram, bool reset_capacity)
+{
+	size_t index;
+	struct zram_meta *meta;
+
+	down_write(&zram->init_lock);
+	if (!init_done(zram)) {
+		up_write(&zram->init_lock);
+		return;
+	}
+
+	meta = zram->meta;
+	/* Free all pages that are still in this zram device */
+	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
+		unsigned long handle = meta->table[index].handle;
+		if (!handle)
+			continue;
+
+		zs_free(meta->mem_pool, handle);
+	}
+
+	zcomp_destroy(zram->comp);
+	zram->max_comp_streams = 1;
+
+	zram_meta_free(zram->meta);
+	zram->meta = NULL;
+	/* Reset stats */
+	memset(&zram->stats, 0, sizeof(zram->stats));
+
+	zram->disksize = 0;
+	if (reset_capacity)
+		set_capacity(zram->disk, 0);
+
+	up_write(&zram->init_lock);
+
+	/*
+	 * Revalidate disk out of the init_lock to avoid lockdep splat.
+	 * It's okay because disk's capacity is protected by init_lock
+	 * so that revalidate_disk always sees up-to-date capacity.
+	 */
+	if (reset_capacity)
+		revalidate_disk(zram->disk);
+}
+
+static ssize_t disksize_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	u64 disksize;
+	struct zcomp *comp;
+	struct zram_meta *meta;
+	struct zram *zram = dev_to_zram(dev);
+	int err;
+
+	disksize = memparse(buf, NULL);
+	if (!disksize)
+		return -EINVAL;
+
+	disksize = PAGE_ALIGN(disksize);
+	meta = zram_meta_alloc(disksize);
+	if (!meta)
+		return -ENOMEM;
+
+	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
+	if (IS_ERR(comp)) {
+		pr_info("Cannot initialise %s compressing backend\n",
+				zram->compressor);
+		err = PTR_ERR(comp);
+		goto out_free_meta;
+	}
+
+	down_write(&zram->init_lock);
+	if (init_done(zram)) {
+		pr_info("Cannot change disksize for initialized device\n");
+		err = -EBUSY;
+		goto out_destroy_comp;
+	}
+
+	zram->meta = meta;
+	zram->comp = comp;
+	zram->disksize = disksize;
+	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
+	up_write(&zram->init_lock);
+
+	/*
+	 * Revalidate disk out of the init_lock to avoid lockdep splat.
+	 * It's okay because disk's capacity is protected by init_lock
+	 * so that revalidate_disk always sees up-to-date capacity.
+	 */
+	revalidate_disk(zram->disk);
+
+	return len;
+
+out_destroy_comp:
+	up_write(&zram->init_lock);
+	zcomp_destroy(comp);
+out_free_meta:
+	zram_meta_free(meta);
+	return err;
+}
+
+static ssize_t reset_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	int ret;
+	unsigned short do_reset;
+	struct zram *zram;
+	struct block_device *bdev;
+
+	zram = dev_to_zram(dev);
+	bdev = bdget_disk(zram->disk, 0);
+
+	if (!bdev)
+		return -ENOMEM;
+
+	/* Do not reset an active device! */
+	if (bdev->bd_holders) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ret = kstrtou16(buf, 10, &do_reset);
+	if (ret)
+		goto out;
+
+	if (!do_reset) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Make sure all pending I/O is finished */
+	fsync_bdev(bdev);
+	bdput(bdev);
+
+	zram_reset_device(zram, true);
+	return len;
+
+out:
+	bdput(bdev);
+	return ret;
+}
+
+static void __zram_make_request(struct zram *zram, struct bio *bio)
+{
+	int offset;
+	u32 index;
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+
+	index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
+	offset = (bio->bi_iter.bi_sector &
+		  (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
+
+	if (unlikely(bio->bi_rw & REQ_DISCARD)) {
+		zram_bio_discard(zram, index, offset, bio);
+		bio_endio(bio, 0);
+		return;
+	}
+
+	bio_for_each_segment(bvec, bio, iter) {
+		int max_transfer_size = PAGE_SIZE - offset;
+
+		if (bvec.bv_len > max_transfer_size) {
+			/*
+			 * zram_bvec_rw() can only make operation on a single
+			 * zram page. Split the bio vector.
+			 */
+			struct bio_vec bv;
+
+			bv.bv_page = bvec.bv_page;
+			bv.bv_len = max_transfer_size;
+			bv.bv_offset = bvec.bv_offset;
+
+			if (zram_bvec_rw(zram, &bv, index, offset, bio) < 0)
+				goto out;
+
+			bv.bv_len = bvec.bv_len - max_transfer_size;
+			bv.bv_offset += max_transfer_size;
+			if (zram_bvec_rw(zram, &bv, index + 1, 0, bio) < 0)
+				goto out;
+		} else
+			if (zram_bvec_rw(zram, &bvec, index, offset, bio) < 0)
+				goto out;
+
+		update_position(&index, &offset, &bvec);
+	}
+
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return;
+
+out:
+	bio_io_error(bio);
+}
+
+/*
+ * Handler function for all zram I/O requests.
+ */
+static void zram_make_request(struct request_queue *queue, struct bio *bio)
+{
+	struct zram *zram = queue->queuedata;
+
+	down_read(&zram->init_lock);
+	if (unlikely(!init_done(zram)))
+		goto error;
+
+	if (!valid_io_request(zram, bio)) {
+		atomic64_inc(&zram->stats.invalid_io);
+		goto error;
+	}
+
+	__zram_make_request(zram, bio);
+	up_read(&zram->init_lock);
+
+	return;
+
+error:
+	up_read(&zram->init_lock);
+	bio_io_error(bio);
+}
+
+static void zram_slot_free_notify(struct block_device *bdev,
+				unsigned long index)
+{
+	struct zram *zram;
+	struct zram_meta *meta;
+
+	zram = bdev->bd_disk->private_data;
+	meta = zram->meta;
+
+	write_lock(&meta->tb_lock);
+	zram_free_page(zram, index);
+	write_unlock(&meta->tb_lock);
+	atomic64_inc(&zram->stats.notify_free);
+}
+
+static const struct block_device_operations zram_devops = {
+	.swap_slot_free_notify = zram_slot_free_notify,
+	.owner = THIS_MODULE
+};
+
+static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR,
+		disksize_show, disksize_store);
+static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL);
+static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store);
+static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL);
+static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL);
+static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR,
+		max_comp_streams_show, max_comp_streams_store);
+static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR,
+		comp_algorithm_show, comp_algorithm_store);
+
+ZRAM_ATTR_RO(num_reads);
+ZRAM_ATTR_RO(num_writes);
+ZRAM_ATTR_RO(failed_reads);
+ZRAM_ATTR_RO(failed_writes);
+ZRAM_ATTR_RO(invalid_io);
+ZRAM_ATTR_RO(notify_free);
+ZRAM_ATTR_RO(zero_pages);
+ZRAM_ATTR_RO(compr_data_size);
+
+static struct attribute *zram_disk_attrs[] = {
+	&dev_attr_disksize.attr,
+	&dev_attr_initstate.attr,
+	&dev_attr_reset.attr,
+	&dev_attr_num_reads.attr,
+	&dev_attr_num_writes.attr,
+	&dev_attr_failed_reads.attr,
+	&dev_attr_failed_writes.attr,
+	&dev_attr_invalid_io.attr,
+	&dev_attr_notify_free.attr,
+	&dev_attr_zero_pages.attr,
+	&dev_attr_orig_data_size.attr,
+	&dev_attr_compr_data_size.attr,
+	&dev_attr_mem_used_total.attr,
+	&dev_attr_max_comp_streams.attr,
+	&dev_attr_comp_algorithm.attr,
+	NULL,
+};
+
+static struct attribute_group zram_disk_attr_group = {
+	.attrs = zram_disk_attrs,
+};
+
+static int create_device(struct zram *zram, int device_id)
+{
+	int ret = -ENOMEM;
+
+	init_rwsem(&zram->init_lock);
+
+	zram->queue = blk_alloc_queue(GFP_KERNEL);
+	if (!zram->queue) {
+		pr_err("Error allocating disk queue for device %d\n",
+			device_id);
+		goto out;
+	}
+
+	blk_queue_make_request(zram->queue, zram_make_request);
+	zram->queue->queuedata = zram;
+
+	/* gendisk structure */
+	zram->disk = alloc_disk(1);
+	if (!zram->disk) {
+		pr_warn("Error allocating disk structure for device %d\n",
+			device_id);
+		goto out_free_queue;
+	}
+
+	zram->disk->major = zram_major;
+	zram->disk->first_minor = device_id;
+	zram->disk->fops = &zram_devops;
+	zram->disk->queue = zram->queue;
+	zram->disk->private_data = zram;
+	snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
+
+	/* Actual capacity is set via sysfs (/sys/block/zram<id>/disksize) */
+	set_capacity(zram->disk, 0);
+	/* zram devices sort of resembles non-rotational disks */
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
+	/*
+	 * To ensure that we always get PAGE_SIZE aligned
+	 * and n*PAGE_SIZED sized I/O requests.
+	 */
+	blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
+	blk_queue_logical_block_size(zram->disk->queue,
+					ZRAM_LOGICAL_BLOCK_SIZE);
+	blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
+	blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
+	zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
+	zram->disk->queue->limits.max_discard_sectors = UINT_MAX;
+	/*
+	 * zram_bio_discard() will clear all logical blocks if logical block
+	 * size is identical with physical block size(PAGE_SIZE). But if it is
+	 * different, we will skip discarding some parts of logical blocks in
+	 * the part of the request range which isn't aligned to physical block
+	 * size.  So we can't ensure that all discarded logical blocks are
+	 * zeroed.
+	 */
+	if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
+		zram->disk->queue->limits.discard_zeroes_data = 1;
+	else
+		zram->disk->queue->limits.discard_zeroes_data = 0;
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);
+
+	/* Set defaults before the disk and its sysfs attributes go live */
+	strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
+	zram->meta = NULL;
+	zram->max_comp_streams = 1;
+	add_disk(zram->disk);
+	ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
+				&zram_disk_attr_group);
+	if (ret < 0) {
+		pr_warn("Error creating sysfs group\n");
+		goto out_free_disk;
+	}
+	return 0;
+
+out_free_disk:
+	del_gendisk(zram->disk);
+	put_disk(zram->disk);
+out_free_queue:
+	blk_cleanup_queue(zram->queue);
+out:
+	return ret;
+}
+
+static void destroy_device(struct zram *zram)
+{
+	sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
+			&zram_disk_attr_group);
+
+	del_gendisk(zram->disk);
+	put_disk(zram->disk);
+
+	blk_cleanup_queue(zram->queue);
+}
+
+static int __init zram_init(void)
+{
+	int ret, dev_id;
+
+	if (num_devices > max_num_devices) {
+		pr_warn("Invalid value for num_devices: %u\n",
+				num_devices);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	zram_major = register_blkdev(0, "zram");
+	if (zram_major <= 0) {
+		pr_warn("Unable to get major number\n");
+		ret = -EBUSY;
+		goto out;
+	}
+
+	/* Allocate the device array and initialize each one */
+	zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL);
+	if (!zram_devices) {
+		ret = -ENOMEM;
+		goto unregister;
+	}
+
+	for (dev_id = 0; dev_id < num_devices; dev_id++) {
+		ret = create_device(&zram_devices[dev_id], dev_id);
+		if (ret)
+			goto free_devices;
+	}
+
+	pr_info("Created %u device(s) ...\n", num_devices);
+
+	return 0;
+
+free_devices:
+	while (dev_id)
+		destroy_device(&zram_devices[--dev_id]);
+	kfree(zram_devices);
+unregister:
+	unregister_blkdev(zram_major, "zram");
+out:
+	return ret;
+}
+
+static void __exit zram_exit(void)
+{
+	int i;
+	struct zram *zram;
+
+	for (i = 0; i < num_devices; i++) {
+		zram = &zram_devices[i];
+
+		destroy_device(zram);
+		/*
+		 * Shouldn't access zram->disk after destroy_device
+		 * because destroy_device already released zram->disk.
+		 */
+		zram_reset_device(zram, false);
+	}
+
+	unregister_blkdev(zram_major, "zram");
+
+	kfree(zram_devices);
+	pr_debug("Cleanup done!\n");
+}
+
+module_init(zram_init);
+module_exit(zram_exit);
+
+module_param(num_devices, uint, 0);
+MODULE_PARM_DESC(num_devices, "Number of zram devices");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
+MODULE_DESCRIPTION("Compressed RAM Block Device");
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
new file mode 100644
index 00000000000..7f21c145e31
--- /dev/null
+++ b/drivers/block/zram/zram_drv.h
@@ -0,0 +1,106 @@
+/*
+ * Compressed RAM block device
+ *
+ * Copyright (C) 2008, 2009, 2010  Nitin Gupta
+ *               2012, 2013 Minchan Kim
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ *
+ */
+
+#ifndef _ZRAM_DRV_H_
+#define _ZRAM_DRV_H_
+
+#include <linux/spinlock.h>
+#include <linux/zsmalloc.h>
+
+#include "zcomp.h"
+
+/*
+ * Arbitrary upper bound, used only to catch an invalid
+ * value of the num_devices module parameter.
+ */
+static const unsigned max_num_devices = 32;
+
+/*-- Configurable parameters */
+
+/*
+ * Pages that compress to a size greater than this (3/4 of
+ * PAGE_SIZE) are stored uncompressed in memory.
+ */
+static const size_t max_zpage_size = PAGE_SIZE / 4 * 3;
+
+/*
+ * NOTE: max_zpage_size must be less than or equal to
+ * ZS_MAX_ALLOC_SIZE. Otherwise, zs_malloc() would
+ * always return failure.
+ */
+
+/*-- End of configurable params */
+
+#define SECTOR_SHIFT		9	/* log2 of the sector size */
+#define SECTOR_SIZE		(1 << SECTOR_SHIFT)	/* 512 bytes */
+#define SECTORS_PER_PAGE_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
+#define SECTORS_PER_PAGE	(1 << SECTORS_PER_PAGE_SHIFT)
+#define ZRAM_LOGICAL_BLOCK_SHIFT 12	/* log2 of the logical block size */
+#define ZRAM_LOGICAL_BLOCK_SIZE	(1 << ZRAM_LOGICAL_BLOCK_SHIFT)
+#define ZRAM_SECTOR_PER_LOGICAL_BLOCK	\
+	(1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT))	/* 8 sectors */
+
+/* Flags for zram pages, kept in table[page_no].flags */
+enum zram_pageflags {
+	/* Page consists entirely of zeros */
+	ZRAM_ZERO,
+
+	__NR_ZRAM_PAGEFLAGS,	/* number of flags defined above */
+};
+
+/*-- Data structures */
+
+/* One entry is allocated for each disk page (see zram_meta.table) */
+struct table {
+	unsigned long handle;	/* presumably a zs_pool handle - confirm */
+	u16 size;	/* object size (excluding header) */
+	u8 flags;	/* page flags, see enum zram_pageflags */
+} __aligned(4);
+
+struct zram_stats {
+	atomic64_t compr_data_size;	/* compressed size of pages stored */
+	atomic64_t num_reads;	/* failed + successful */
+	atomic64_t num_writes;	/* failed + successful */
+	atomic64_t failed_reads;	/* should NEVER! happen */
+	atomic64_t failed_writes;	/* can happen when memory is too low */
+	atomic64_t invalid_io;	/* non-page-aligned I/O requests */
+	atomic64_t notify_free;	/* number of swap slot free notifications */
+	atomic64_t zero_pages;		/* number of zero-filled pages */
+	atomic64_t pages_stored;	/* number of pages currently stored */
+};
+
+struct zram_meta {
+	rwlock_t tb_lock;	/* protects table */
+	struct table *table;	/* per-page entries, guarded by tb_lock */
+	struct zs_pool *mem_pool;	/* zsmalloc memory pool */
+};
+
+struct zram {
+	struct zram_meta *meta;	/* page table and pool, see struct zram_meta */
+	struct request_queue *queue;	/* cleaned up in destroy_device() */
+	struct gendisk *disk;	/* released in destroy_device() */
+	struct zcomp *comp;	/* compressor state, declared in zcomp.h */
+
+	/* Prevent concurrent execution of device init, reset and R/W request */
+	struct rw_semaphore init_lock;
+	/*
+	 * Limit on the amount of *uncompressed* data
+	 * that can be stored on this disk.
+	 */
+	u64 disksize;	/* bytes */
+	int max_comp_streams;	/* presumably a cap on comp streams - confirm */
+	struct zram_stats stats;
+	char compressor[10];	/* presumably the compressor name - confirm */
+};
+#endif