From e3cc067b0a79d3a3672bfe7cfba12f2e8ae56039 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 18 Sep 2009 16:31:22 -0700
Subject: xen/evtchn: track enabled state for each port

enable/disable_irq() complain if the enables/disables are unbalanced,
so keep track of the state and avoid redundant enables.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/evtchn.c | 71 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 56 insertions(+), 15 deletions(-)

diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index af031950f9b..4356a9a030d 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -69,10 +69,36 @@ struct per_user_data {
 	const char *name;
 };
 
-/* Who's bound to each port? */
-static struct per_user_data *port_user[NR_EVENT_CHANNELS];
+/*
+ * Who's bound to each port?  This is logically an array of struct
+ * per_user_data *, but we encode the current enabled-state in bit 0.
+ */
+static unsigned long port_user[NR_EVENT_CHANNELS];
 static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */
 
+static inline struct per_user_data *get_port_user(unsigned port)
+{
+	return (struct per_user_data *)(port_user[port] & ~1);
+}
+
+static inline void set_port_user(unsigned port, struct per_user_data *u)
+{
+	port_user[port] = (unsigned long)u;
+}
+
+static inline bool get_port_enabled(unsigned port)
+{
+	return port_user[port] & 1;
+}
+
+static inline void set_port_enabled(unsigned port, bool enabled)
+{
+	if (enabled)
+		port_user[port] |= 1;
+	else
+		port_user[port] &= ~1;
+}
+
 irqreturn_t evtchn_interrupt(int irq, void *data)
 {
 	unsigned int port = (unsigned long)data;
@@ -80,9 +106,15 @@ irqreturn_t evtchn_interrupt(int irq, void *data)
 
 	spin_lock(&port_user_lock);
 
-	u = port_user[port];
+	u = get_port_user(port);
+
+	if (WARN(!get_port_enabled(port),
+		 "Interrupt for port %d, but apparently not enabled; per-user %p\n",
+		 port, u))
+		goto out;
 
 	disable_irq_nosync(irq);
+	set_port_enabled(port, false);
 
 	if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
 		u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
@@ -92,10 +124,10 @@ irqreturn_t evtchn_interrupt(int irq, void *data)
 			kill_fasync(&u->evtchn_async_queue,
 				    SIGIO, POLL_IN);
 		}
-	} else {
+	} else
 		u->ring_overflow = 1;
-	}
 
+out:
 	spin_unlock(&port_user_lock);
 
 	return IRQ_HANDLED;
@@ -198,9 +230,18 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf,
 		goto out;
 
 	spin_lock_irq(&port_user_lock);
-	for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
-		if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
-			enable_irq(irq_from_evtchn(kbuf[i]));
+
+	for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) {
+		unsigned port = kbuf[i];
+
+		if (port < NR_EVENT_CHANNELS &&
+		    get_port_user(port) == u &&
+		    !get_port_enabled(port)) {
+			set_port_enabled(port, true);
+			enable_irq(irq_from_evtchn(port));
+		}
+	}
+
 	spin_unlock_irq(&port_user_lock);
 
 	rc = count;
@@ -222,8 +263,8 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port)
 	 * interrupt handler yet, and our caller has already
 	 * serialized bind operations.)
 	 */
-	BUG_ON(port_user[port] != NULL);
-	port_user[port] = u;
+	BUG_ON(get_port_user(port) != NULL);
+	set_port_user(port, u);
 
 	rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
 				       u->name, (void *)(unsigned long)port);
@@ -242,7 +283,7 @@ static void evtchn_unbind_from_user(struct per_user_data *u, int port)
 	/* make sure we unbind the irq handler before clearing the port */
 	barrier();
 
-	port_user[port] = NULL;
+	set_port_user(port, NULL);
 }
 
 static long evtchn_ioctl(struct file *file,
@@ -333,7 +374,7 @@ static long evtchn_ioctl(struct file *file,
 		spin_lock_irq(&port_user_lock);
 
 		rc = -ENOTCONN;
-		if (port_user[unbind.port] != u) {
+		if (get_port_user(unbind.port) != u) {
 			spin_unlock_irq(&port_user_lock);
 			break;
 		}
@@ -355,7 +396,7 @@ static long evtchn_ioctl(struct file *file,
 
 		if (notify.port >= NR_EVENT_CHANNELS) {
 			rc = -EINVAL;
-		} else if (port_user[notify.port] != u) {
+		} else if (get_port_user(notify.port) != u) {
 			rc = -ENOTCONN;
 		} else {
 			notify_remote_via_evtchn(notify.port);
@@ -444,10 +485,10 @@ static int evtchn_release(struct inode *inode, struct file *filp)
 	free_page((unsigned long)u->ring);
 
 	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
-		if (port_user[i] != u)
+		if (get_port_user(i) != u)
 			continue;
 
-		evtchn_unbind_from_user(port_user[i], i);
+		evtchn_unbind_from_user(get_port_user(i), i);
 	}
 
 	spin_unlock_irq(&port_user_lock);
-- 
cgit v1.2.3-18-g5258


From 93afe0b75ef3675ca05320919a57de8b9bbb159c Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 18 Sep 2009 16:36:58 -0700
Subject: xen/evtchn: dynamically allocate port_user array

We only need the array when running as a Xen domain, so dynamically
allocate it as needed to save on bss space.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/evtchn.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index 4356a9a030d..709c32d949b 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -73,7 +73,7 @@ struct per_user_data {
  * Who's bound to each port?  This is logically an array of struct
  * per_user_data *, but we encode the current enabled-state in bit 0.
  */
-static unsigned long port_user[NR_EVENT_CHANNELS];
+static unsigned long *port_user;
 static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */
 
 static inline struct per_user_data *get_port_user(unsigned port)
@@ -522,8 +522,11 @@ static int __init evtchn_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
+	port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL);
+	if (port_user == NULL)
+		return -ENOMEM;
+
 	spin_lock_init(&port_user_lock);
-	memset(port_user, 0, sizeof(port_user));
 
 	/* Create '/dev/misc/evtchn'. */
 	err = misc_register(&evtchn_miscdev);
@@ -539,6 +542,9 @@ static int __init evtchn_init(void)
 
 static void __exit evtchn_cleanup(void)
 {
+	kfree(port_user);
+	port_user = NULL;
+
 	misc_deregister(&evtchn_miscdev);
 }
 
-- 
cgit v1.2.3-18-g5258


From 0edce91dcd83160019867a00746c679344dc0bbd Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 18 Sep 2009 16:55:29 -0700
Subject: xen/evtchn: ports start enabled

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/evtchn.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index 709c32d949b..72dc7f20c5e 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -108,10 +108,9 @@ irqreturn_t evtchn_interrupt(int irq, void *data)
 
 	u = get_port_user(port);
 
-	if (WARN(!get_port_enabled(port),
-		 "Interrupt for port %d, but apparently not enabled; per-user %p\n",
-		 port, u))
-		goto out;
+	WARN(!get_port_enabled(port),
+	     "Interrupt for port %d, but apparently not enabled; per-user %p\n",
+	     port, u);
 
 	disable_irq_nosync(irq);
 	set_port_enabled(port, false);
@@ -127,7 +126,6 @@ irqreturn_t evtchn_interrupt(int irq, void *data)
 	} else
 		u->ring_overflow = 1;
 
-out:
 	spin_unlock(&port_user_lock);
 
 	return IRQ_HANDLED;
@@ -265,6 +263,7 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port)
 	 */
 	BUG_ON(get_port_user(port) != NULL);
 	set_port_user(port, u);
+	set_port_enabled(port, true); /* start enabled */
 
 	rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
 				       u->name, (void *)(unsigned long)port);
-- 
cgit v1.2.3-18-g5258


From 1a1a17cddbfb1f81222b3f18ee8530c41fbc3b82 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 18 Sep 2009 17:13:41 -0700
Subject: xen/evtchn: remove spurious barrier

evtchn_unbind_from_user() is called under a lock, so there's no need to
worry about the ordering of unbind_from_irqhandler vs clearing the port
per-user data.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/evtchn.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index 72dc7f20c5e..f79ac5ca379 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -279,9 +279,6 @@ static void evtchn_unbind_from_user(struct per_user_data *u, int port)
 
 	unbind_from_irqhandler(irq, (void *)(unsigned long)port);
 
-	/* make sure we unbind the irq handler before clearing the port */
-	barrier();
-
 	set_port_user(port, NULL);
 }
 
-- 
cgit v1.2.3-18-g5258


From 3f5e554f669098c84c82ce75e7577f7e0f3fccde Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 28 May 2010 15:28:27 -0700
Subject: xen/evtchn: don't do unbind_from_irqhandler under spinlock

unbind_from_irqhandler can end up doing /proc operations, which can't
happen under a spinlock.  So before removing the IRQ handler,
disable the irq under the port_user lock (masking the underlying event
channel and making sure the irq handler isn't running concurrently and
won't start running), then remove the handler without the lock.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/evtchn.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index f79ac5ca379..6a3a12945d0 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -375,10 +375,12 @@ static long evtchn_ioctl(struct file *file,
 			break;
 		}
 
-		evtchn_unbind_from_user(u, unbind.port);
+		disable_irq(irq_from_evtchn(unbind.port));
 
 		spin_unlock_irq(&port_user_lock);
 
+		evtchn_unbind_from_user(u, unbind.port);
+
 		rc = 0;
 		break;
 	}
@@ -484,11 +486,18 @@ static int evtchn_release(struct inode *inode, struct file *filp)
 		if (get_port_user(i) != u)
 			continue;
 
-		evtchn_unbind_from_user(get_port_user(i), i);
+		disable_irq(irq_from_evtchn(i));
 	}
 
 	spin_unlock_irq(&port_user_lock);
 
+	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+		if (get_port_user(i) != u)
+			continue;
+
+		evtchn_unbind_from_user(get_port_user(i), i);
+	}
+
 	kfree(u->name);
 	kfree(u);
 
-- 
cgit v1.2.3-18-g5258


From 376d908f52427591cef4acd172db9c3ef28676ec Mon Sep 17 00:00:00 2001
From: Bastian Blank <waldi@debian.org>
Date: Fri, 28 May 2010 15:43:49 -0700
Subject: xen/evtchn: Fix name of Xen event-channel device

The Xen event-channel device is named evtchn in the kernel but always
used as /dev/xen/evtchn in userspace. This patch fixes the name.

Signed-off-by: Bastian Blank <waldi@debian.org>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/evtchn.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index 6a3a12945d0..68119f6324d 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -517,7 +517,7 @@ static const struct file_operations evtchn_fops = {
 
 static struct miscdevice evtchn_miscdev = {
 	.minor        = MISC_DYNAMIC_MINOR,
-	.name         = "evtchn",
+	.name         = "xen/evtchn",
 	.fops         = &evtchn_fops,
 };
 static int __init evtchn_init(void)
-- 
cgit v1.2.3-18-g5258


From 70697d540c0598ad023a391d4c895044db9a6624 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 5 Oct 2010 11:13:44 -0700
Subject: xen/evtchn: add missing static

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/evtchn.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index 68119f6324d..f3594ec2ee3 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -99,7 +99,7 @@ static inline void set_port_enabled(unsigned port, bool enabled)
 		port_user[port] &= ~1;
 }
 
-irqreturn_t evtchn_interrupt(int irq, void *data)
+static irqreturn_t evtchn_interrupt(int irq, void *data)
 {
 	unsigned int port = (unsigned long)data;
 	struct per_user_data *u;
-- 
cgit v1.2.3-18-g5258


From 2fa0f93915eacf758da800e2c67b3b9adef1c5c5 Mon Sep 17 00:00:00 2001
From: Simon Guinot <sguinot@lacie.com>
Date: Thu, 21 Oct 2010 11:42:28 +0200
Subject: [ARM] Kirkwood: enhance TCLK detection

According to the Marvell LSP, the Sample at Reset regiter bit 21 can be
used to detect TCLK on 6281 and 6282 devices.

This patch has only been tested on LaCie boards.

Signed-off-by: Simon Guinot <sguinot@lacie.com>
Acked-by: Lennert Buytenhek <buytenh@wantstofly.org>
Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 arch/arm/mach-kirkwood/common.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/arm/mach-kirkwood/common.c b/arch/arm/mach-kirkwood/common.c
index 1c82d4290da..ee99a5acc89 100644
--- a/arch/arm/mach-kirkwood/common.c
+++ b/arch/arm/mach-kirkwood/common.c
@@ -854,10 +854,9 @@ int __init kirkwood_find_tclk(void)
 
 	kirkwood_pcie_id(&dev, &rev);
 
-	if ((dev == MV88F6281_DEV_ID && (rev == MV88F6281_REV_A0 ||
-					rev == MV88F6281_REV_A1)) ||
-	    (dev == MV88F6282_DEV_ID))
-		return 200000000;
+	if (dev == MV88F6281_DEV_ID || dev == MV88F6282_DEV_ID)
+		if (((readl(SAMPLE_AT_RESET) >> 21) & 1) == 0)
+			return 200000000;
 
 	return 166666667;
 }
-- 
cgit v1.2.3-18-g5258


From d3491820e8a65c4a51c8e2a165c6a13f864101ba Mon Sep 17 00:00:00 2001
From: Simon Guinot <sguinot@lacie.com>
Date: Thu, 21 Oct 2010 11:42:29 +0200
Subject: [ARM] Kirkwood: fix timer initialization for LaCie boards

Signed-off-by: Simon Guinot <sguinot@lacie.com>
Acked-by: Lennert Buytenhek <buytenh@wantstofly.org>
Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 arch/arm/mach-kirkwood/d2net_v2-setup.c    |  2 +-
 arch/arm/mach-kirkwood/lacie_v2-common.c   | 14 --------------
 arch/arm/mach-kirkwood/lacie_v2-common.h   |  2 --
 arch/arm/mach-kirkwood/netspace_v2-setup.c |  6 +++---
 arch/arm/mach-kirkwood/netxbig_v2-setup.c  |  4 ++--
 5 files changed, 6 insertions(+), 22 deletions(-)

diff --git a/arch/arm/mach-kirkwood/d2net_v2-setup.c b/arch/arm/mach-kirkwood/d2net_v2-setup.c
index cd62d0f82a7..b8078aeebef 100644
--- a/arch/arm/mach-kirkwood/d2net_v2-setup.c
+++ b/arch/arm/mach-kirkwood/d2net_v2-setup.c
@@ -227,5 +227,5 @@ MACHINE_START(D2NET_V2, "LaCie d2 Network v2")
 	.init_machine	= d2net_v2_init,
 	.map_io		= kirkwood_map_io,
 	.init_irq	= kirkwood_init_irq,
-	.timer		= &lacie_v2_timer,
+	.timer		= &kirkwood_timer,
 MACHINE_END
diff --git a/arch/arm/mach-kirkwood/lacie_v2-common.c b/arch/arm/mach-kirkwood/lacie_v2-common.c
index d3ea1b6c8a0..285edab776e 100644
--- a/arch/arm/mach-kirkwood/lacie_v2-common.c
+++ b/arch/arm/mach-kirkwood/lacie_v2-common.c
@@ -111,17 +111,3 @@ void __init lacie_v2_hdd_power_init(int hdd_num)
 			pr_err("Failed to power up HDD%d\n", i + 1);
 	}
 }
-
-/*****************************************************************************
- * Timer
- ****************************************************************************/
-
-static void lacie_v2_timer_init(void)
-{
-	kirkwood_tclk = 166666667;
-	orion_time_init(IRQ_KIRKWOOD_BRIDGE, kirkwood_tclk);
-}
-
-struct sys_timer lacie_v2_timer = {
-	.init = lacie_v2_timer_init,
-};
diff --git a/arch/arm/mach-kirkwood/lacie_v2-common.h b/arch/arm/mach-kirkwood/lacie_v2-common.h
index af521315b87..fc64f578536 100644
--- a/arch/arm/mach-kirkwood/lacie_v2-common.h
+++ b/arch/arm/mach-kirkwood/lacie_v2-common.h
@@ -13,6 +13,4 @@ void lacie_v2_register_flash(void);
 void lacie_v2_register_i2c_devices(void);
 void lacie_v2_hdd_power_init(int hdd_num);
 
-extern struct sys_timer lacie_v2_timer;
-
 #endif
diff --git a/arch/arm/mach-kirkwood/netspace_v2-setup.c b/arch/arm/mach-kirkwood/netspace_v2-setup.c
index fed264d28f4..fc934e5a9ed 100644
--- a/arch/arm/mach-kirkwood/netspace_v2-setup.c
+++ b/arch/arm/mach-kirkwood/netspace_v2-setup.c
@@ -221,7 +221,7 @@ MACHINE_START(NETSPACE_V2, "LaCie Network Space v2")
 	.init_machine	= netspace_v2_init,
 	.map_io		= kirkwood_map_io,
 	.init_irq	= kirkwood_init_irq,
-	.timer		= &lacie_v2_timer,
+	.timer		= &kirkwood_timer,
 MACHINE_END
 #endif
 
@@ -233,7 +233,7 @@ MACHINE_START(INETSPACE_V2, "LaCie Internet Space v2")
 	.init_machine	= netspace_v2_init,
 	.map_io		= kirkwood_map_io,
 	.init_irq	= kirkwood_init_irq,
-	.timer		= &lacie_v2_timer,
+	.timer		= &kirkwood_timer,
 MACHINE_END
 #endif
 
@@ -245,6 +245,6 @@ MACHINE_START(NETSPACE_MAX_V2, "LaCie Network Space Max v2")
 	.init_machine	= netspace_v2_init,
 	.map_io		= kirkwood_map_io,
 	.init_irq	= kirkwood_init_irq,
-	.timer		= &lacie_v2_timer,
+	.timer		= &kirkwood_timer,
 MACHINE_END
 #endif
diff --git a/arch/arm/mach-kirkwood/netxbig_v2-setup.c b/arch/arm/mach-kirkwood/netxbig_v2-setup.c
index d970e1eee37..a855c9f0829 100644
--- a/arch/arm/mach-kirkwood/netxbig_v2-setup.c
+++ b/arch/arm/mach-kirkwood/netxbig_v2-setup.c
@@ -405,7 +405,7 @@ MACHINE_START(NET2BIG_V2, "LaCie 2Big Network v2")
 	.init_machine	= netxbig_v2_init,
 	.map_io		= kirkwood_map_io,
 	.init_irq	= kirkwood_init_irq,
-	.timer		= &lacie_v2_timer,
+	.timer		= &kirkwood_timer,
 MACHINE_END
 #endif
 
@@ -417,6 +417,6 @@ MACHINE_START(NET5BIG_V2, "LaCie 5Big Network v2")
 	.init_machine	= netxbig_v2_init,
 	.map_io		= kirkwood_map_io,
 	.init_irq	= kirkwood_init_irq,
-	.timer		= &lacie_v2_timer,
+	.timer		= &kirkwood_timer,
 MACHINE_END
 #endif
-- 
cgit v1.2.3-18-g5258


From 3924996bab2845bdf9a9d16ff7c20445de1ab55d Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@fluxnic.net>
Date: Thu, 21 Oct 2010 15:48:33 -0400
Subject: [ARM] Kirkwood: restrict the scope of the PCIe reset workaround

Commit 21f0ba90a447 "orion/kirkwood: reset PCIe unit on boot" made the
reset of the PCIe unit unconditional.  While this may fix problems on some
targets, this also causes problems on other targets.

Saeed Bishara <saeed@marvell.com> said about the original problem: "We
couln't pinpoint the root cause of this issue, actually we failed to
reproduce that issue."

So let's restrict the reset of the PCIe unit only to the target where
the original problem was observed.

Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 arch/arm/mach-kirkwood/ts41x-setup.c    | 14 +++++++++++++-
 arch/arm/plat-orion/include/plat/pcie.h |  3 +++
 arch/arm/plat-orion/pcie.c              |  5 -----
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/arch/arm/mach-kirkwood/ts41x-setup.c b/arch/arm/mach-kirkwood/ts41x-setup.c
index 2e14afef07a..6995199a912 100644
--- a/arch/arm/mach-kirkwood/ts41x-setup.c
+++ b/arch/arm/mach-kirkwood/ts41x-setup.c
@@ -27,6 +27,10 @@
 #include "mpp.h"
 #include "tsx1x-common.h"
 
+/* for the PCIe reset workaround */
+#include <plat/pcie.h>
+
+
 #define QNAP_TS41X_JUMPER_JP1	45
 
 static struct i2c_board_info __initdata qnap_ts41x_i2c_rtc = {
@@ -140,8 +144,16 @@ static void __init qnap_ts41x_init(void)
 
 static int __init ts41x_pci_init(void)
 {
-	if (machine_is_ts41x())
+	if (machine_is_ts41x()) {
+		/*
+		 * Without this explicit reset, the PCIe SATA controller
+		 * (Marvell 88sx7042/sata_mv) is known to stop working
+		 * after a few minutes.
+		 */
+		orion_pcie_reset((void __iomem *)PCIE_VIRT_BASE);
+
 		kirkwood_pcie_init(KW_PCIE0);
+	}
 
    return 0;
 }
diff --git a/arch/arm/plat-orion/include/plat/pcie.h b/arch/arm/plat-orion/include/plat/pcie.h
index 3ebfef72b4e..cc99163e73f 100644
--- a/arch/arm/plat-orion/include/plat/pcie.h
+++ b/arch/arm/plat-orion/include/plat/pcie.h
@@ -11,12 +11,15 @@
 #ifndef __PLAT_PCIE_H
 #define __PLAT_PCIE_H
 
+struct pci_bus;
+
 u32 orion_pcie_dev_id(void __iomem *base);
 u32 orion_pcie_rev(void __iomem *base);
 int orion_pcie_link_up(void __iomem *base);
 int orion_pcie_x4_mode(void __iomem *base);
 int orion_pcie_get_local_bus_nr(void __iomem *base);
 void orion_pcie_set_local_bus_nr(void __iomem *base, int nr);
+void orion_pcie_reset(void __iomem *base);
 void orion_pcie_setup(void __iomem *base,
 		      struct mbus_dram_target_info *dram);
 int orion_pcie_rd_conf(void __iomem *base, struct pci_bus *bus,
diff --git a/arch/arm/plat-orion/pcie.c b/arch/arm/plat-orion/pcie.c
index 779553a1595..af2d733c50b 100644
--- a/arch/arm/plat-orion/pcie.c
+++ b/arch/arm/plat-orion/pcie.c
@@ -181,11 +181,6 @@ void __init orion_pcie_setup(void __iomem *base,
 	u16 cmd;
 	u32 mask;
 
-	/*
-	 * soft reset PCIe unit
-	 */
-	orion_pcie_reset(base);
-
 	/*
 	 * Point PCIe unit MBUS decode windows to DRAM space.
 	 */
-- 
cgit v1.2.3-18-g5258


From aaa8e2b34c35d67abf1892cd62ea4e7565ca262c Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 15 Oct 2010 13:16:53 +0200
Subject: drbd: consolidate explicit drbd_md_sync into drbd_create_new_uuid

Every code path changing the current UUID needs to get it on stable
storage anyways. Flush it to disk right there, remove the now obsolte
explicit drbd_md_sync statements in the other code paths.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 3 ++-
 drivers/block/drbd/drbd_nl.c   | 1 -
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 8bfedc7164f..8d029b14e7c 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1267,7 +1267,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
 				drbd_uuid_new_current(mdev);
 				clear_bit(NEW_CUR_UUID, &mdev->flags);
-				drbd_md_sync(mdev);
 			}
 			spin_lock_irq(&mdev->req_lock);
 			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
@@ -3659,6 +3658,8 @@ void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
 
 	get_random_bytes(&val, sizeof(u64));
 	_drbd_uuid_set(mdev, UI_CURRENT, val);
+	/* get it to stable storage _now_ */
+	drbd_md_sync(mdev);
 }
 
 void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 87925e97e61..c498c4827de 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1953,7 +1953,6 @@ static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
 		drbd_uuid_new_current(mdev);
 		clear_bit(NEW_CUR_UUID, &mdev->flags);
-		drbd_md_sync(mdev);
 	}
 	drbd_suspend_io(mdev);
 	reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
-- 
cgit v1.2.3-18-g5258


From 3beec1d446fba335f07787636920892dd3b2c658 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 14 Oct 2010 13:31:48 +0200
Subject: drbd: tag a few error messages with "assert failed"

If those messages ever get logged, clearly state that they are
actually failed ASSERTS, so our regression tests can pick them up
from the logs more easily.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index c07c370c4c8..e0e0bf6f16a 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -114,11 +114,11 @@ struct drbd_conf;
 #define D_ASSERT(exp)	if (!(exp)) \
 	 dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
 
-#define ERR_IF(exp) if (({				\
-	int _b = (exp) != 0;				\
-	if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n",	\
-		__func__, #exp, __FILE__, __LINE__);	\
-	 _b;						\
+#define ERR_IF(exp) if (({						\
+	int _b = (exp) != 0;						\
+	if (_b) dev_err(DEV, "ASSERT FAILED: %s: (%s) in %s:%d\n",	\
+			__func__, #exp, __FILE__, __LINE__);		\
+	_b;								\
 	}))
 
 /* Defines to control fault insertion */
-- 
cgit v1.2.3-18-g5258


From 82f59cc6353889b426cf13b6596d5a3d100fa09e Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Sat, 16 Oct 2010 12:13:47 +0200
Subject: drbd: fix potential deadlock on detach

If we have contention in drbd_al_begin_iod (heavy randon IO),
an administrative request to detach the disk may deadlock
for similar reasons as the recently fixed deadlock if detaching
because of IO-error.

The approach taken here is to either go through the intermediate
cleanup state D_FAILED, or first lock out application io,
don't just go directly to D_DISKLESS.

We need an additional state bit (WAS_IO_ERROR) to distinguish
the -> D_FAILED because of IO-error from other failures.

Sanitize D_ATTACHING -> D_FAILED to D_ATTACHING -> D_DISKLESS.
If only attaching, ldev may be missing still, but would be referenced
from within the after_state_ch for -> D_FAILED, potentially
dereferencing a NULL pointer.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      |  20 ++++--
 drivers/block/drbd/drbd_main.c     | 138 ++++++++++++++++++++++---------------
 drivers/block/drbd/drbd_nl.c       |  16 ++++-
 drivers/block/drbd/drbd_receiver.c |   2 +-
 4 files changed, 113 insertions(+), 63 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e0e0bf6f16a..03c15e317c3 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -852,7 +852,8 @@ enum {
 	BITMAP_IO,		/* suspend application io;
 				   once no more io in flight, start bitmap io */
 	BITMAP_IO_QUEUED,       /* Started bitmap IO */
-	GO_DISKLESS,		/* Disk failed, local_cnt reached zero, we are going diskless */
+	GO_DISKLESS,		/* Disk is being detached, on io-error or admin request. */
+	WAS_IO_ERROR,		/* Local disk failed returned IO error */
 	RESYNC_AFTER_NEG,       /* Resync after online grow after the attach&negotiate finished. */
 	NET_CONGESTED,		/* The data socket is congested */
 
@@ -1281,6 +1282,7 @@ extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
 extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
 extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
 extern void drbd_go_diskless(struct drbd_conf *mdev);
+extern void drbd_ldev_destroy(struct drbd_conf *mdev);
 
 
 /* Meta data layout
@@ -1798,17 +1800,17 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach,
 	case EP_PASS_ON:
 		if (!forcedetach) {
 			if (__ratelimit(&drbd_ratelimit_state))
-				dev_err(DEV, "Local IO failed in %s."
-					     "Passing error on...\n", where);
+				dev_err(DEV, "Local IO failed in %s.\n", where);
 			break;
 		}
 		/* NOTE fall through to detach case if forcedetach set */
 	case EP_DETACH:
 	case EP_CALL_HELPER:
+		set_bit(WAS_IO_ERROR, &mdev->flags);
 		if (mdev->state.disk > D_FAILED) {
 			_drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
-			dev_err(DEV, "Local IO failed in %s."
-				     "Detaching...\n", where);
+			dev_err(DEV,
+				"Local IO failed in %s. Detaching...\n", where);
 		}
 		break;
 	}
@@ -2127,7 +2129,11 @@ static inline void put_ldev(struct drbd_conf *mdev)
 	__release(local);
 	D_ASSERT(i >= 0);
 	if (i == 0) {
+		if (mdev->state.disk == D_DISKLESS)
+			/* even internal references gone, safe to destroy */
+			drbd_ldev_destroy(mdev);
 		if (mdev->state.disk == D_FAILED)
+			/* all application IO references gone. */
 			drbd_go_diskless(mdev);
 		wake_up(&mdev->misc_wait);
 	}
@@ -2138,6 +2144,10 @@ static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_stat
 {
 	int io_allowed;
 
+	/* never get a reference while D_DISKLESS */
+	if (mdev->state.disk == D_DISKLESS)
+		return 0;
+
 	atomic_inc(&mdev->local_cnt);
 	io_allowed = (mdev->state.disk >= mins);
 	if (!io_allowed)
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 8d029b14e7c..992c3aecdf7 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -834,6 +834,15 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
 		ns.conn = os.conn;
 
+	/* we cannot fail (again) if we already detached */
+	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
+		ns.disk = D_DISKLESS;
+
+	/* if we are only D_ATTACHING yet,
+	 * we can (and should) go directly to D_DISKLESS. */
+	if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
+		ns.disk = D_DISKLESS;
+
 	/* After C_DISCONNECTING only C_STANDALONE may follow */
 	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
 		ns.conn = os.conn;
@@ -1055,7 +1064,15 @@ int __drbd_set_state(struct drbd_conf *mdev,
 	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
 		set_bit(DEVICE_DYING, &mdev->flags);
 
-	mdev->state.i = ns.i;
+	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
+	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
+	 * drbd_ldev_destroy() won't happen before our corresponding
+	 * after_state_ch works run, where we put_ldev again. */
+	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
+	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
+		atomic_inc(&mdev->local_cnt);
+
+	mdev->state = ns;
 	wake_up(&mdev->misc_wait);
 	wake_up(&mdev->state_wait);
 
@@ -1363,63 +1380,64 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	    os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
 		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
 
-	/* first half of local IO error */
-	if (os.disk > D_FAILED && ns.disk == D_FAILED) {
-		enum drbd_io_error_p eh = EP_PASS_ON;
+	/* first half of local IO error, failure to attach,
+	 * or administrative detach */
+	if (os.disk != D_FAILED && ns.disk == D_FAILED) {
+		enum drbd_io_error_p eh;
+		int was_io_error;
+		/* corresponding get_ldev was in __drbd_set_state, to serialize
+		 * our cleanup here with the transition to D_DISKLESS,
+		 * so it is safe to dreference ldev here. */
+		eh = mdev->ldev->dc.on_io_error;
+		was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
+
+		/* current state still has to be D_FAILED,
+		 * there is only one way out: to D_DISKLESS,
+		 * and that may only happen after our put_ldev below. */
+		if (mdev->state.disk != D_FAILED)
+			dev_err(DEV,
+				"ASSERT FAILED: disk is %s during detach\n",
+				drbd_disk_str(mdev->state.disk));
 
 		if (drbd_send_state(mdev))
-			dev_warn(DEV, "Notified peer that my disk is broken.\n");
+			dev_warn(DEV, "Notified peer that I am detaching my disk\n");
 		else
-			dev_err(DEV, "Sending state for drbd_io_error() failed\n");
+			dev_err(DEV, "Sending state for detaching disk failed\n");
 
 		drbd_rs_cancel_all(mdev);
 
-		if (get_ldev_if_state(mdev, D_FAILED)) {
-			eh = mdev->ldev->dc.on_io_error;
-			put_ldev(mdev);
-		}
-		if (eh == EP_CALL_HELPER)
+		/* In case we want to get something to stable storage still,
+		 * this may be the last chance.
+		 * Following put_ldev may transition to D_DISKLESS. */
+		drbd_md_sync(mdev);
+		put_ldev(mdev);
+
+		if (was_io_error && eh == EP_CALL_HELPER)
 			drbd_khelper(mdev, "local-io-error");
 	}
 
+        /* second half of local IO error, failure to attach,
+         * or administrative detach,
+         * after local_cnt references have reached zero again */
+        if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
+                /* We must still be diskless,
+                 * re-attach has to be serialized with this! */
+                if (mdev->state.disk != D_DISKLESS)
+                        dev_err(DEV,
+                                "ASSERT FAILED: disk is %s while going diskless\n",
+                                drbd_disk_str(mdev->state.disk));
 
-	/* second half of local IO error handling,
-	 * after local_cnt references have reached zero: */
-	if (os.disk == D_FAILED && ns.disk == D_DISKLESS) {
-		mdev->rs_total = 0;
-		mdev->rs_failed = 0;
-		atomic_set(&mdev->rs_pending_cnt, 0);
-	}
-
-	if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
-		/* We must still be diskless,
-		 * re-attach has to be serialized with this! */
-		if (mdev->state.disk != D_DISKLESS)
-			dev_err(DEV,
-				"ASSERT FAILED: disk is %s while going diskless\n",
-				drbd_disk_str(mdev->state.disk));
+                mdev->rs_total = 0;
+                mdev->rs_failed = 0;
+                atomic_set(&mdev->rs_pending_cnt, 0);
 
-		/* we cannot assert local_cnt == 0 here, as get_ldev_if_state
-		 * will inc/dec it frequently. Since we became D_DISKLESS, no
-		 * one has touched the protected members anymore, though, so we
-		 * are safe to free them here. */
 		if (drbd_send_state(mdev))
-			dev_warn(DEV, "Notified peer that I detached my disk.\n");
+			dev_warn(DEV, "Notified peer that I'm now diskless.\n");
 		else
-			dev_err(DEV, "Sending state for detach failed\n");
-
-		lc_destroy(mdev->resync);
-		mdev->resync = NULL;
-		lc_destroy(mdev->act_log);
-		mdev->act_log = NULL;
-		__no_warn(local,
-			drbd_free_bc(mdev->ldev);
-			mdev->ldev = NULL;);
-
-		if (mdev->md_io_tmpp) {
-			__free_page(mdev->md_io_tmpp);
-			mdev->md_io_tmpp = NULL;
-		}
+			dev_err(DEV, "Sending state for being diskless failed\n");
+		/* corresponding get_ldev in __drbd_set_state
+		 * this may finaly trigger drbd_ldev_destroy. */
+		put_ldev(mdev);
 	}
 
 	/* Disks got bigger while they were detached */
@@ -2897,7 +2915,6 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
 	D_ASSERT(list_empty(&mdev->resync_work.list));
 	D_ASSERT(list_empty(&mdev->unplug_work.list));
 	D_ASSERT(list_empty(&mdev->go_diskless.list));
-
 }
 
 
@@ -3756,19 +3773,31 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 	return 1;
 }
 
+void drbd_ldev_destroy(struct drbd_conf *mdev)
+{
+	lc_destroy(mdev->resync);
+	mdev->resync = NULL;
+	lc_destroy(mdev->act_log);
+	mdev->act_log = NULL;
+	__no_warn(local,
+		drbd_free_bc(mdev->ldev);
+		mdev->ldev = NULL;);
+
+	if (mdev->md_io_tmpp) {
+		__free_page(mdev->md_io_tmpp);
+		mdev->md_io_tmpp = NULL;
+	}
+	clear_bit(GO_DISKLESS, &mdev->flags);
+}
+
 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 {
 	D_ASSERT(mdev->state.disk == D_FAILED);
 	/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
 	 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
-	 * the protected members anymore, though, so in the after_state_ch work
-	 * it will be safe to free them. */
+	 * the protected members anymore, though, so once put_ldev reaches zero
+	 * again, it will be safe to free them. */
 	drbd_force_state(mdev, NS(disk, D_DISKLESS));
-	/* We need to wait for return of references checked out while we still
-	 * have been D_FAILED, though (drbd_md_sync, bitmap io). */
-	wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
-
-	clear_bit(GO_DISKLESS, &mdev->flags);
 	return 1;
 }
 
@@ -3777,9 +3806,6 @@ void drbd_go_diskless(struct drbd_conf *mdev)
 	D_ASSERT(mdev->state.disk == D_FAILED);
 	if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
 		drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
-		/* don't drbd_queue_work_front,
-		 * we need to serialize with the after_state_ch work
-		 * of the -> D_FAILED transition. */
 }
 
 /**
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index c498c4827de..0cba7d3d2b5 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -870,6 +870,11 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		retcode = ERR_DISK_CONFIGURED;
 		goto fail;
 	}
+	/* It may just now have detached because of IO error.  Make sure
+	 * drbd_ldev_destroy is done already, we may end up here very fast,
+	 * e.g. if someone calls attach from the on-io-error handler,
+	 * to realize a "hot spare" feature (not that I'd recommend that) */
+	wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
 
 	/* allocation not in the IO path, cqueue thread context */
 	nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
@@ -1262,7 +1267,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
  force_diskless_dec:
 	put_ldev(mdev);
  force_diskless:
-	drbd_force_state(mdev, NS(disk, D_DISKLESS));
+	drbd_force_state(mdev, NS(disk, D_FAILED));
 	drbd_md_sync(mdev);
  release_bdev2_fail:
 	if (nbc)
@@ -1285,10 +1290,19 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	return 0;
 }
 
+/* Detaching the disk is a process in multiple stages.  First we need to lock
+ * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
+ * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
+ * internal references as well.
+ * Only then we have finally detached. */
 static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 			  struct drbd_nl_cfg_reply *reply)
 {
+	drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
 	reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS));
+	if (mdev->state.disk == D_DISKLESS)
+		wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
+	drbd_resume_io(mdev);
 	return 0;
 }
 
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 6ec922c623a..04a823b01da 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3363,7 +3363,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 		if (ns.conn == C_MASK) {
 			ns.conn = C_CONNECTED;
 			if (mdev->state.disk == D_NEGOTIATING) {
-				drbd_force_state(mdev, NS(disk, D_DISKLESS));
+				drbd_force_state(mdev, NS(disk, D_FAILED));
 			} else if (peer_state.disk == D_NEGOTIATING) {
 				dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
 				peer_state.disk = D_DISKLESS;
-- 
cgit v1.2.3-18-g5258


From 6719fb036cea56a5ee9d0ac912ed8c7cabb27f49 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 18 Oct 2010 23:04:07 +0200
Subject: drbd: fix potential data divergence after multiple failures

If we get an IO-error during an activity log transaction,
if we failed to write the bitmap of the evicted extent,
we must not write the transaction itself.
If we failed to write the transaction,
we must not even submit the corresponding bio,
as its extent is not yet marked in the activity log.

Otherwise, if this was a disconneted Primary (degraded cluster), which
now lost its disk as well, and we later re-attach the same backend
storage, we possibly "forget" to resync some parts of the disk that
potentially have been changed.

On the receiving side, when receiving from a peer with unhealthy disk,
checking for pdsk == D_DISKLESS is not enough, we need to set out of
sync and do AL transactions for everything pdsk < D_INCONSISTENT on the
receiving side.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_actlog.c   | 26 +++++++++++++++++++++-----
 drivers/block/drbd/drbd_receiver.c |  3 ++-
 drivers/block/drbd/drbd_req.c      | 19 ++++++++++++++-----
 3 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index ac04ef97eac..bd925180a2b 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -284,18 +284,32 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 	u32 xor_sum = 0;
 
 	if (!get_ldev(mdev)) {
-		dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n");
+		dev_err(DEV,
+			"disk is %s, cannot start al transaction (-%d +%d)\n",
+			drbd_disk_str(mdev->state.disk), evicted, new_enr);
 		complete(&((struct update_al_work *)w)->event);
 		return 1;
 	}
 	/* do we have to do a bitmap write, first?
 	 * TODO reduce maximum latency:
 	 * submit both bios, then wait for both,
-	 * instead of doing two synchronous sector writes. */
+	 * instead of doing two synchronous sector writes.
+	 * For now, we must not write the transaction,
+	 * if we cannot write out the bitmap of the evicted extent. */
 	if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
 		drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
 
-	mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */
+	/* The bitmap write may have failed, causing a state change. */
+	if (mdev->state.disk < D_INCONSISTENT) {
+		dev_err(DEV,
+			"disk is %s, cannot write al transaction (-%d +%d)\n",
+			drbd_disk_str(mdev->state.disk), evicted, new_enr);
+		complete(&((struct update_al_work *)w)->event);
+		put_ldev(mdev);
+		return 1;
+	}
+
+	mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */
 	buffer = (struct al_transaction *)page_address(mdev->md_io_page);
 
 	buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
@@ -739,7 +753,7 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev)
 	unsigned int enr;
 	unsigned long add = 0;
 	char ppb[10];
-	int i;
+	int i, tmp;
 
 	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
 
@@ -747,7 +761,9 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev)
 		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
 		if (enr == LC_FREE)
 			continue;
-		add += drbd_bm_ALe_set_all(mdev, enr);
+		tmp = drbd_bm_ALe_set_all(mdev, enr);
+		dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);
+		add += tmp;
 	}
 
 	lc_unlock(mdev->act_log);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 04a823b01da..1146faa7ae3 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1995,10 +1995,11 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 		break;
 	}
 
-	if (mdev->state.pdsk == D_DISKLESS) {
+	if (mdev->state.pdsk < D_INCONSISTENT) {
 		/* In case we have the only disk of the cluster, */
 		drbd_set_out_of_sync(mdev, e->sector, e->size);
 		e->flags |= EE_CALL_AL_COMPLETE_IO;
+		e->flags &= ~EE_MAY_SET_IN_SYNC;
 		drbd_al_begin_io(mdev, e->sector);
 	}
 
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 9e91a2545fc..d26b213dbf1 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -942,12 +942,21 @@ allocate_barrier:
 	if (local) {
 		req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
 
-		if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
-				     : rw == READ  ? DRBD_FAULT_DT_RD
-				     :               DRBD_FAULT_DT_RA))
+		/* State may have changed since we grabbed our reference on the
+		 * mdev->ldev member. Double check, and short-circuit to endio.
+		 * In case the last activity log transaction failed to get on
+		 * stable storage, and this is a WRITE, we may not even submit
+		 * this bio. */
+		if (get_ldev(mdev)) {
+			if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
+					     : rw == READ  ? DRBD_FAULT_DT_RD
+					     :               DRBD_FAULT_DT_RA))
+				bio_endio(req->private_bio, -EIO);
+			else
+				generic_make_request(req->private_bio);
+			put_ldev(mdev);
+		} else
 			bio_endio(req->private_bio, -EIO);
-		else
-			generic_make_request(req->private_bio);
 	}
 
 	/* we need to plug ALWAYS since we possibly need to kick lo_dev.
-- 
cgit v1.2.3-18-g5258


From bc571b8cb930ea78207851dd38b5a435fcb8891c Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 21 Oct 2010 18:07:31 +0200
Subject: drbd: fix a misleading printk

This codepath used to be called only for failed kmalloc GFP_ATOMIC,
but is now also triggered by other things.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_worker.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 108d58015cd..1a2d2f0759b 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -925,7 +925,7 @@ out:
 	drbd_md_sync(mdev);
 
 	if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
-		dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
+		dev_info(DEV, "Writing the whole bitmap\n");
 		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
 	}
 
-- 
cgit v1.2.3-18-g5258


From fb2c7a10eec051317ff091b2cb2d73c5ecd98c19 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 19 Oct 2010 12:08:13 +0200
Subject: drbd: rate limit an error message

If we don't rate limit it, and you happen to log err level messages via
serial console, an IO error on a disconnected Primary may cause serious
unresponsiveness.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_req.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index d26b213dbf1..31d04b17c45 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -813,7 +813,8 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
 			     mdev->state.conn >= C_CONNECTED));
 
 	if (!(local || remote) && !is_susp(mdev->state)) {
-		dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
 		goto fail_free_complete;
 	}
 
-- 
cgit v1.2.3-18-g5258


From 8825f7c3e5c7b251b49fc594658a96f59417ee16 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Thu, 21 Oct 2010 17:21:19 +0200
Subject: drbd: Silenced an assert

That assertion's condition needed adjustment for today's semantics

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_req.c | 2 +-
 include/linux/drbd.h          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 31d04b17c45..5c225485355 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -258,7 +258,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
 		if (!hlist_unhashed(&req->colision))
 			hlist_del(&req->colision);
 		else
-			D_ASSERT((s & RQ_NET_MASK) == 0);
+			D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
 
 		/* for writes we need to do some extra housekeeping */
 		if (rw == WRITE)
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 9b2a0158f39..ef44c7a0638 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -53,7 +53,7 @@
 
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.3.9rc2"
+#define REL_VERSION "8.3.9"
 #define API_VERSION 88
 #define PRO_VERSION_MIN 86
 #define PRO_VERSION_MAX 95
-- 
cgit v1.2.3-18-g5258


From 2451fc3b2bd3a7205270da75a21dde0d5d7c96a2 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Tue, 24 Aug 2010 13:43:11 +0200
Subject: drbd: Removed the BIO_RW_BARRIER support form the receiver/epoch code

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      |  12 ---
 drivers/block/drbd/drbd_main.c     |   2 +-
 drivers/block/drbd/drbd_nl.c       |   4 +-
 drivers/block/drbd/drbd_proc.c     |   1 -
 drivers/block/drbd/drbd_receiver.c | 212 ++++++-------------------------------
 drivers/block/drbd/drbd_worker.c   |  21 ----
 6 files changed, 33 insertions(+), 219 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 03c15e317c3..1b915fd9278 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -749,17 +749,12 @@ struct drbd_epoch {
 
 /* drbd_epoch flag bits */
 enum {
-	DE_BARRIER_IN_NEXT_EPOCH_ISSUED,
-	DE_BARRIER_IN_NEXT_EPOCH_DONE,
-	DE_CONTAINS_A_BARRIER,
 	DE_HAVE_BARRIER_NUMBER,
-	DE_IS_FINISHING,
 };
 
 enum epoch_event {
 	EV_PUT,
 	EV_GOT_BARRIER_NR,
-	EV_BARRIER_DONE,
 	EV_BECAME_LAST,
 	EV_CLEANUP = 32, /* used as flag */
 };
@@ -801,11 +796,6 @@ enum {
 	__EE_CALL_AL_COMPLETE_IO,
 	__EE_MAY_SET_IN_SYNC,
 
-	/* This epoch entry closes an epoch using a barrier.
-	 * On sucessful completion, the epoch is released,
-	 * and the P_BARRIER_ACK send. */
-	__EE_IS_BARRIER,
-
 	/* In case a barrier failed,
 	 * we need to resubmit without the barrier flag. */
 	__EE_RESUBMITTED,
@@ -820,7 +810,6 @@ enum {
 };
 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
 #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
-#define EE_IS_BARRIER          (1<<__EE_IS_BARRIER)
 #define	EE_RESUBMITTED         (1<<__EE_RESUBMITTED)
 #define EE_WAS_ERROR           (1<<__EE_WAS_ERROR)
 #define EE_HAS_DIGEST          (1<<__EE_HAS_DIGEST)
@@ -948,7 +937,6 @@ enum write_ordering_e {
 	WO_none,
 	WO_drain_io,
 	WO_bdev_flush,
-	WO_bio_barrier
 };
 
 struct fifo_buffer {
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 992c3aecdf7..8e0d707df23 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2858,7 +2858,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 	drbd_thread_init(mdev, &mdev->asender, drbd_asender);
 
 	mdev->agreed_pro_version = PRO_VERSION_MAX;
-	mdev->write_ordering = WO_bio_barrier;
+	mdev->write_ordering = WO_bdev_flush;
 	mdev->resync_wenr = LC_FREE;
 }
 
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 0cba7d3d2b5..899878fcf97 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1117,8 +1117,8 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	nbc = NULL;
 	resync_lru = NULL;
 
-	mdev->write_ordering = WO_bio_barrier;
-	drbd_bump_write_ordering(mdev, WO_bio_barrier);
+	mdev->write_ordering = WO_bdev_flush;
+	drbd_bump_write_ordering(mdev, WO_bdev_flush);
 
 	if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
 		set_bit(CRASHED_PRIMARY, &mdev->flags);
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index ad325c5d0ce..7e6ac307e2d 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -158,7 +158,6 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 		[WO_none] = 'n',
 		[WO_drain_io] = 'd',
 		[WO_bdev_flush] = 'f',
-		[WO_bio_barrier] = 'b',
 	};
 
 	seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 1146faa7ae3..2952c1277b1 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -49,11 +49,6 @@
 
 #include "drbd_vli.h"
 
-struct flush_work {
-	struct drbd_work w;
-	struct drbd_epoch *epoch;
-};
-
 enum finish_epoch {
 	FE_STILL_LIVE,
 	FE_DESTROYED,
@@ -66,16 +61,6 @@ static int drbd_do_auth(struct drbd_conf *mdev);
 static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
 static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
 
-static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
-{
-	struct drbd_epoch *prev;
-	spin_lock(&mdev->epoch_lock);
-	prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
-	if (prev == epoch || prev == mdev->current_epoch)
-		prev = NULL;
-	spin_unlock(&mdev->epoch_lock);
-	return prev;
-}
 
 #define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
 
@@ -981,7 +966,7 @@ static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsi
 	return TRUE;
 }
 
-static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
+static void drbd_flush(struct drbd_conf *mdev)
 {
 	int rv;
 
@@ -997,24 +982,6 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
 		}
 		put_ldev(mdev);
 	}
-
-	return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
-}
-
-static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
-	struct flush_work *fw = (struct flush_work *)w;
-	struct drbd_epoch *epoch = fw->epoch;
-
-	kfree(w);
-
-	if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
-		drbd_flush_after_epoch(mdev, epoch);
-
-	drbd_may_finish_epoch(mdev, epoch, EV_PUT |
-			      (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
-
-	return 1;
 }
 
 /**
@@ -1027,15 +994,13 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
 					       struct drbd_epoch *epoch,
 					       enum epoch_event ev)
 {
-	int finish, epoch_size;
+	int epoch_size;
 	struct drbd_epoch *next_epoch;
-	int schedule_flush = 0;
 	enum finish_epoch rv = FE_STILL_LIVE;
 
 	spin_lock(&mdev->epoch_lock);
 	do {
 		next_epoch = NULL;
-		finish = 0;
 
 		epoch_size = atomic_read(&epoch->epoch_size);
 
@@ -1045,16 +1010,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
 			break;
 		case EV_GOT_BARRIER_NR:
 			set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
-
-			/* Special case: If we just switched from WO_bio_barrier to
-			   WO_bdev_flush we should not finish the current epoch */
-			if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
-			    mdev->write_ordering != WO_bio_barrier &&
-			    epoch == mdev->current_epoch)
-				clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
-			break;
-		case EV_BARRIER_DONE:
-			set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
 			break;
 		case EV_BECAME_LAST:
 			/* nothing to do*/
@@ -1063,23 +1018,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
 
 		if (epoch_size != 0 &&
 		    atomic_read(&epoch->active) == 0 &&
-		    test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) &&
-		    epoch->list.prev == &mdev->current_epoch->list &&
-		    !test_bit(DE_IS_FINISHING, &epoch->flags)) {
-			/* Nearly all conditions are met to finish that epoch... */
-			if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
-			    mdev->write_ordering == WO_none ||
-			    (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
-			    ev & EV_CLEANUP) {
-				finish = 1;
-				set_bit(DE_IS_FINISHING, &epoch->flags);
-			} else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
-				 mdev->write_ordering == WO_bio_barrier) {
-				atomic_inc(&epoch->active);
-				schedule_flush = 1;
-			}
-		}
-		if (finish) {
+		    test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) {
 			if (!(ev & EV_CLEANUP)) {
 				spin_unlock(&mdev->epoch_lock);
 				drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
@@ -1102,6 +1041,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
 				/* atomic_set(&epoch->active, 0); is already zero */
 				if (rv == FE_STILL_LIVE)
 					rv = FE_RECYCLED;
+				wake_up(&mdev->ee_wait);
 			}
 		}
 
@@ -1113,22 +1053,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
 
 	spin_unlock(&mdev->epoch_lock);
 
-	if (schedule_flush) {
-		struct flush_work *fw;
-		fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
-		if (fw) {
-			fw->w.cb = w_flush;
-			fw->epoch = epoch;
-			drbd_queue_work(&mdev->data.work, &fw->w);
-		} else {
-			dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
-			set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
-			/* That is not a recursion, only one level */
-			drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
-			drbd_may_finish_epoch(mdev, epoch, EV_PUT);
-		}
-	}
-
 	return rv;
 }
 
@@ -1144,19 +1068,16 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
 		[WO_none] = "none",
 		[WO_drain_io] = "drain",
 		[WO_bdev_flush] = "flush",
-		[WO_bio_barrier] = "barrier",
 	};
 
 	pwo = mdev->write_ordering;
 	wo = min(pwo, wo);
-	if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
-		wo = WO_bdev_flush;
 	if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
 		wo = WO_drain_io;
 	if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
 		wo = WO_none;
 	mdev->write_ordering = wo;
-	if (pwo != mdev->write_ordering || wo == WO_bio_barrier)
+	if (pwo != mdev->write_ordering || wo == WO_bdev_flush)
 		dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
 }
 
@@ -1192,7 +1113,7 @@ next_bio:
 	bio->bi_sector = sector;
 	bio->bi_bdev = mdev->ldev->backing_bdev;
 	/* we special case some flags in the multi-bio case, see below
-	 * (REQ_UNPLUG, REQ_HARDBARRIER) */
+	 * (REQ_UNPLUG) */
 	bio->bi_rw = rw;
 	bio->bi_private = e;
 	bio->bi_end_io = drbd_endio_sec;
@@ -1226,11 +1147,6 @@ next_bio:
 			bio->bi_rw &= ~REQ_UNPLUG;
 
 		drbd_generic_make_request(mdev, fault_type, bio);
-
-		/* strip off REQ_HARDBARRIER,
-		 * unless it is the first or last bio */
-		if (bios && bios->bi_next)
-			bios->bi_rw &= ~REQ_HARDBARRIER;
 	} while (bios);
 	maybe_kick_lo(mdev);
 	return 0;
@@ -1244,45 +1160,9 @@ fail:
 	return -ENOMEM;
 }
 
-/**
- * w_e_reissue() - Worker callback; Resubmit a bio, without REQ_HARDBARRIER set
- * @mdev:	DRBD device.
- * @w:		work object.
- * @cancel:	The connection will be closed anyways (unused in this callback)
- */
-int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
-{
-	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
-	/* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
-	   (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
-	   so that we can finish that epoch in drbd_may_finish_epoch().
-	   That is necessary if we already have a long chain of Epochs, before
-	   we realize that REQ_HARDBARRIER is actually not supported */
-
-	/* As long as the -ENOTSUPP on the barrier is reported immediately
-	   that will never trigger. If it is reported late, we will just
-	   print that warning and continue correctly for all future requests
-	   with WO_bdev_flush */
-	if (previous_epoch(mdev, e->epoch))
-		dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
-
-	/* we still have a local reference,
-	 * get_ldev was done in receive_Data. */
-
-	e->w.cb = e_end_block;
-	if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) {
-		/* drbd_submit_ee fails for one reason only:
-		 * if was not able to allocate sufficient bios.
-		 * requeue, try again later. */
-		e->w.cb = w_e_reissue;
-		drbd_queue_work(&mdev->data.work, &e->w);
-	}
-	return 1;
-}
-
 static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
-	int rv, issue_flush;
+	int rv;
 	struct p_barrier *p = &mdev->data.rbuf.barrier;
 	struct drbd_epoch *epoch;
 
@@ -1300,44 +1180,40 @@ static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsign
 	 * Therefore we must send the barrier_ack after the barrier request was
 	 * completed. */
 	switch (mdev->write_ordering) {
-	case WO_bio_barrier:
 	case WO_none:
 		if (rv == FE_RECYCLED)
 			return TRUE;
-		break;
+
+		/* receiver context, in the writeout path of the other node.
+		 * avoid potential distributed deadlock */
+		epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+		if (epoch)
+			break;
+		else
+			dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
+			/* Fall through */
 
 	case WO_bdev_flush:
 	case WO_drain_io:
-		if (rv == FE_STILL_LIVE) {
-			set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
-			drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
-			rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
-		}
-		if (rv == FE_RECYCLED)
-			return TRUE;
-
-		/* The asender will send all the ACKs and barrier ACKs out, since
-		   all EEs moved from the active_ee to the done_ee. We need to
-		   provide a new epoch object for the EEs that come in soon */
-		break;
-	}
-
-	/* receiver context, in the writeout path of the other node.
-	 * avoid potential distributed deadlock */
-	epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
-	if (!epoch) {
-		dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
-		issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
 		drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
-		if (issue_flush) {
-			rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
-			if (rv == FE_RECYCLED)
-				return TRUE;
+		drbd_flush(mdev);
+
+		if (atomic_read(&mdev->current_epoch->epoch_size)) {
+			epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+			if (epoch)
+				break;
 		}
 
-		drbd_wait_ee_list_empty(mdev, &mdev->done_ee);
+		epoch = mdev->current_epoch;
+		wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0);
+
+		D_ASSERT(atomic_read(&epoch->active) == 0);
+		D_ASSERT(epoch->flags == 0);
 
 		return TRUE;
+	default:
+		dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering);
+		return FALSE;
 	}
 
 	epoch->flags = 0;
@@ -1652,15 +1528,8 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 {
 	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
 	sector_t sector = e->sector;
-	struct drbd_epoch *epoch;
 	int ok = 1, pcmd;
 
-	if (e->flags & EE_IS_BARRIER) {
-		epoch = previous_epoch(mdev, e->epoch);
-		if (epoch)
-			drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
-	}
-
 	if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
 		if (likely((e->flags & EE_WAS_ERROR) == 0)) {
 			pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
@@ -1817,27 +1686,6 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 	e->epoch = mdev->current_epoch;
 	atomic_inc(&e->epoch->epoch_size);
 	atomic_inc(&e->epoch->active);
-
-	if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
-		struct drbd_epoch *epoch;
-		/* Issue a barrier if we start a new epoch, and the previous epoch
-		   was not a epoch containing a single request which already was
-		   a Barrier. */
-		epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
-		if (epoch == e->epoch) {
-			set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
-			rw |= REQ_HARDBARRIER;
-			e->flags |= EE_IS_BARRIER;
-		} else {
-			if (atomic_read(&epoch->epoch_size) > 1 ||
-			    !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
-				set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
-				set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
-				rw |= REQ_HARDBARRIER;
-				e->flags |= EE_IS_BARRIER;
-			}
-		}
-	}
 	spin_unlock(&mdev->epoch_lock);
 
 	dp_flags = be32_to_cpu(p->dp_flags);
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 1a2d2f0759b..b0551ba7ad0 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -102,12 +102,6 @@ void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
 	put_ldev(mdev);
 }
 
-static int is_failed_barrier(int ee_flags)
-{
-	return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
-			== (EE_IS_BARRIER|EE_WAS_ERROR);
-}
-
 /* writes on behalf of the partner, or resync writes,
  * "submitted" by the receiver, final stage.  */
 static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
@@ -119,21 +113,6 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo
 	int is_syncer_req;
 	int do_al_complete_io;
 
-	/* if this is a failed barrier request, disable use of barriers,
-	 * and schedule for resubmission */
-	if (is_failed_barrier(e->flags)) {
-		drbd_bump_write_ordering(mdev, WO_bdev_flush);
-		spin_lock_irqsave(&mdev->req_lock, flags);
-		list_del(&e->w.list);
-		e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
-		e->w.cb = w_e_reissue;
-		/* put_ldev actually happens below, once we come here again. */
-		__release(local);
-		spin_unlock_irqrestore(&mdev->req_lock, flags);
-		drbd_queue_work(&mdev->data.work, &e->w);
-		return;
-	}
-
 	D_ASSERT(e->block_id != ID_VACANT);
 
 	/* after we moved e to done_ee,
-- 
cgit v1.2.3-18-g5258


From a8a4e51e6965db84d2af041370ea2ab6232aa4f1 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 25 Aug 2010 10:21:04 +0200
Subject: drbd: REQ_HARDBARRIER -> REQ_FUA transition for meta data accesses

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_actlog.c | 16 ++--------------
 drivers/block/drbd/drbd_int.h    |  7 +++----
 drivers/block/drbd/drbd_nl.c     |  4 ++--
 3 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index bd925180a2b..ba95cba192b 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -78,11 +78,10 @@