From d705ae6b133f9f6a8beee617b1224b6a5c99c5da Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 15 Feb 2012 09:45:49 +0100
Subject: block: replace icq->changed with icq->flags

icq->changed was used for ICQ_*_CHANGED bits.  Rename it to flags and
access it under ioc->lock instead of using atomic bitops.
icq_get_changed() is added so that the changed part can be fetched and
cleared as before.

icq->flags will be used to carry other flags.

Signed-off-by: Tejun Heo <tj@kernel.org>
Tested-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-ioc.c     | 30 ++++++++++++++++++++++++++----
 block/cfq-iosched.c | 12 ++++++------
 2 files changed, 32 insertions(+), 10 deletions(-)

(limited to 'block')

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 8b782a63c29..811879c752e 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -363,13 +363,13 @@ struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
 	return icq;
 }
 
-void ioc_set_changed(struct io_context *ioc, int which)
+void ioc_set_icq_flags(struct io_context *ioc, unsigned int flags)
 {
 	struct io_cq *icq;
 	struct hlist_node *n;
 
 	hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node)
-		set_bit(which, &icq->changed);
+		icq->flags |= flags;
 }
 
 /**
@@ -387,7 +387,7 @@ void ioc_ioprio_changed(struct io_context *ioc, int ioprio)
 
 	spin_lock_irqsave(&ioc->lock, flags);
 	ioc->ioprio = ioprio;
-	ioc_set_changed(ioc, ICQ_IOPRIO_CHANGED);
+	ioc_set_icq_flags(ioc, ICQ_IOPRIO_CHANGED);
 	spin_unlock_irqrestore(&ioc->lock, flags);
 }
 
@@ -404,11 +404,33 @@ void ioc_cgroup_changed(struct io_context *ioc)
 	unsigned long flags;
 
 	spin_lock_irqsave(&ioc->lock, flags);
-	ioc_set_changed(ioc, ICQ_CGROUP_CHANGED);
+	ioc_set_icq_flags(ioc, ICQ_CGROUP_CHANGED);
 	spin_unlock_irqrestore(&ioc->lock, flags);
 }
 EXPORT_SYMBOL(ioc_cgroup_changed);
 
+/**
+ * icq_get_changed - fetch and clear icq changed mask
+ * @icq: icq of interest
+ *
+ * Fetch and clear ICQ_*_CHANGED bits from @icq.  Grabs and releases
+ * @icq->ioc->lock.
+ */
+unsigned icq_get_changed(struct io_cq *icq)
+{
+	unsigned int changed = 0;
+	unsigned long flags;
+
+	if (unlikely(icq->flags & ICQ_CHANGED_MASK)) {
+		spin_lock_irqsave(&icq->ioc->lock, flags);
+		changed = icq->flags & ICQ_CHANGED_MASK;
+		icq->flags &= ~ICQ_CHANGED_MASK;
+		spin_unlock_irqrestore(&icq->ioc->lock, flags);
+	}
+	return changed;
+}
+EXPORT_SYMBOL(icq_get_changed);
+
 static int __init blk_ioc_init(void)
 {
 	iocontext_cachep = kmem_cache_create("blkdev_ioc",
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d0ba5053366..45729525356 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3470,20 +3470,20 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 	const int rw = rq_data_dir(rq);
 	const bool is_sync = rq_is_sync(rq);
 	struct cfq_queue *cfqq;
+	unsigned int changed;
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
 	spin_lock_irq(q->queue_lock);
 
 	/* handle changed notifications */
-	if (unlikely(cic->icq.changed)) {
-		if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &cic->icq.changed))
-			changed_ioprio(cic);
+	changed = icq_get_changed(&cic->icq);
+	if (unlikely(changed & ICQ_IOPRIO_CHANGED))
+		changed_ioprio(cic);
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-		if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &cic->icq.changed))
-			changed_cgroup(cic);
+	if (unlikely(changed & ICQ_CGROUP_CHANGED))
+		changed_cgroup(cic);
 #endif
-	}
 
 new_queue:
 	cfqq = cic_to_cfqq(cic, is_sync);
-- 
cgit v1.2.3-18-g5258


From 2274b029f640cd652ab59c363e5beebf5f50e609 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 15 Feb 2012 09:45:52 +0100
Subject: block: simplify ioc_release_fn()

The reverse double lock dancing in ioc_release_fn() can be simplified by
just using trylock on the queue_lock and backing out from the ioc lock on
trylock failure.  Simplify it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Tested-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-ioc.c | 46 ++++++++++------------------------------------
 1 file changed, 10 insertions(+), 36 deletions(-)

(limited to 'block')

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 811879c752e..f53c80ecaf0 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -79,7 +79,6 @@ static void ioc_release_fn(struct work_struct *work)
 {
 	struct io_context *ioc = container_of(work, struct io_context,
 					      release_work);
-	struct request_queue *last_q = NULL;
 	unsigned long flags;
 
 	/*
@@ -93,44 +92,19 @@ static void ioc_release_fn(struct work_struct *work)
 	while (!hlist_empty(&ioc->icq_list)) {
 		struct io_cq *icq = hlist_entry(ioc->icq_list.first,
 						struct io_cq, ioc_node);
-		struct request_queue *this_q = icq->q;
-
-		if (this_q != last_q) {
-			/*
-			 * Need to switch to @this_q.  Once we release
-			 * @ioc->lock, it can go away along with @cic.
-			 * Hold on to it.
-			 */
-			__blk_get_queue(this_q);
-
-			/*
-			 * blk_put_queue() might sleep thanks to kobject
-			 * idiocy.  Always release both locks, put and
-			 * restart.
-			 */
-			if (last_q) {
-				spin_unlock(last_q->queue_lock);
-				spin_unlock_irqrestore(&ioc->lock, flags);
-				blk_put_queue(last_q);
-			} else {
-				spin_unlock_irqrestore(&ioc->lock, flags);
-			}
-
-			last_q = this_q;
-			spin_lock_irqsave(this_q->queue_lock, flags);
-			spin_lock_nested(&ioc->lock, 1);
-			continue;
+		struct request_queue *q = icq->q;
+
+		if (spin_trylock(q->queue_lock)) {
+			ioc_exit_icq(icq);
+			spin_unlock(q->queue_lock);
+		} else {
+			spin_unlock_irqrestore(&ioc->lock, flags);
+			cpu_relax();
+			spin_lock_irqsave_nested(&ioc->lock, flags, 1);
 		}
-		ioc_exit_icq(icq);
 	}
 
-	if (last_q) {
-		spin_unlock(last_q->queue_lock);
-		spin_unlock_irqrestore(&ioc->lock, flags);
-		blk_put_queue(last_q);
-	} else {
-		spin_unlock_irqrestore(&ioc->lock, flags);
-	}
+	spin_unlock_irqrestore(&ioc->lock, flags);
 
 	kmem_cache_free(iocontext_cachep, ioc);
 }
-- 
cgit v1.2.3-18-g5258


From 621032ad6eaabf2fe771c4fa0d8f58e1fcfcdba6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 15 Feb 2012 09:45:53 +0100
Subject: block: exit_io_context() should call elevator_exit_icq_fn()

While updating locking, b2efa05265 "block, cfq: unlink
cfq_io_context's immediately" moved elevator_exit_icq_fn() invocation
from exit_io_context() to the final ioc put.  While this doesn't cause
catastrophic failure, it effectively removes task exit notification to
the elevator and causes noticeable IO performance degradation with CFQ.

On task exit, CFQ used to immediately expire the slice if it was being
used by the exiting task as no more IO would be issued by the task;
however, after b2efa05265, the notification is lost and disk could sit
idle needlessly, leading to noticeable IO performance degradation for
certain workloads.

This patch renames ioc_exit_icq() to ioc_destroy_icq(), separates
elevator_exit_icq_fn() invocation into ioc_exit_icq() and invokes it
from exit_io_context().  ICQ_EXITED flag is added to avoid invoking
the callback more than once for the same icq.

Walking icq_list from ioc side and invoking elevator callback requires
reverse double locking.  This may be better implemented using RCU;
unfortunately, using RCU isn't trivial.  e.g. RCU protection would
need to cover request_queue, and the queue_lock switch on cleanup makes
grabbing queue_lock from RCU unsafe.  Reverse double locking should
do, at least for now.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-bisected-by: Shaohua Li <shli@kernel.org>
LKML-Reference: <CANejiEVzs=pUhQSTvUppkDcc2TNZyfohBRLygW5zFmXyk5A-xQ@mail.gmail.com>
Tested-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-ioc.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 47 insertions(+), 8 deletions(-)

(limited to 'block')

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index f53c80ecaf0..92bf55540d8 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -36,10 +36,22 @@ static void icq_free_icq_rcu(struct rcu_head *head)
 	kmem_cache_free(icq->__rcu_icq_cache, icq);
 }
 
-/*
- * Exit and free an icq.  Called with both ioc and q locked.
- */
+/* Exit an icq. Called with both ioc and q locked. */
 static void ioc_exit_icq(struct io_cq *icq)
+{
+	struct elevator_type *et = icq->q->elevator->type;
+
+	if (icq->flags & ICQ_EXITED)
+		return;
+
+	if (et->ops.elevator_exit_icq_fn)
+		et->ops.elevator_exit_icq_fn(icq);
+
+	icq->flags |= ICQ_EXITED;
+}
+
+/* Release an icq.  Called with both ioc and q locked. */
+static void ioc_destroy_icq(struct io_cq *icq)
 {
 	struct io_context *ioc = icq->ioc;
 	struct request_queue *q = icq->q;
@@ -60,8 +72,7 @@ static void ioc_exit_icq(struct io_cq *icq)
 	if (rcu_dereference_raw(ioc->icq_hint) == icq)
 		rcu_assign_pointer(ioc->icq_hint, NULL);
 
-	if (et->ops.elevator_exit_icq_fn)
-		et->ops.elevator_exit_icq_fn(icq);
+	ioc_exit_icq(icq);
 
 	/*
 	 * @icq->q might have gone away by the time RCU callback runs
@@ -95,7 +106,7 @@ static void ioc_release_fn(struct work_struct *work)
 		struct request_queue *q = icq->q;
 
 		if (spin_trylock(q->queue_lock)) {
-			ioc_exit_icq(icq);
+			ioc_destroy_icq(icq);
 			spin_unlock(q->queue_lock);
 		} else {
 			spin_unlock_irqrestore(&ioc->lock, flags);
@@ -142,13 +153,41 @@ EXPORT_SYMBOL(put_io_context);
 void exit_io_context(struct task_struct *task)
 {
 	struct io_context *ioc;
+	struct io_cq *icq;
+	struct hlist_node *n;
+	unsigned long flags;
 
 	task_lock(task);
 	ioc = task->io_context;
 	task->io_context = NULL;
 	task_unlock(task);
 
-	atomic_dec(&ioc->nr_tasks);
+	if (!atomic_dec_and_test(&ioc->nr_tasks)) {
+		put_io_context(ioc);
+		return;
+	}
+
+	/*
+	 * Need ioc lock to walk icq_list and q lock to exit icq.  Perform
+	 * reverse double locking.  Read comment in ioc_release_fn() for
+	 * explanation on the nested locking annotation.
+	 */
+retry:
+	spin_lock_irqsave_nested(&ioc->lock, flags, 1);
+	hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node) {
+		if (icq->flags & ICQ_EXITED)
+			continue;
+		if (spin_trylock(icq->q->queue_lock)) {
+			ioc_exit_icq(icq);
+			spin_unlock(icq->q->queue_lock);
+		} else {
+			spin_unlock_irqrestore(&ioc->lock, flags);
+			cpu_relax();
+			goto retry;
+		}
+	}
+	spin_unlock_irqrestore(&ioc->lock, flags);
+
 	put_io_context(ioc);
 }
 
@@ -168,7 +207,7 @@ void ioc_clear_queue(struct request_queue *q)
 		struct io_context *ioc = icq->ioc;
 
 		spin_lock(&ioc->lock);
-		ioc_exit_icq(icq);
+		ioc_destroy_icq(icq);
 		spin_unlock(&ioc->lock);
 	}
 }
-- 
cgit v1.2.3-18-g5258


From fe316bf2d5847bc5dd975668671a7b1067603bc7 Mon Sep 17 00:00:00 2001
From: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Date: Fri, 2 Mar 2012 10:38:33 +0100
Subject: block: Fix NULL pointer dereference in sd_revalidate_disk

Since 2.6.39 (1196f8b), when a driver returns -ENOMEDIUM for open(),
__blkdev_get() calls rescan_partitions() to remove
in-kernel partition structures and raise KOBJ_CHANGE uevent.

However, it ends up calling the driver's revalidate_disk without open
and could cause an oops.

In the case of SCSI:

  process A                  process B
  ----------------------------------------------
  sys_open
    __blkdev_get
      sd_open
        returns -ENOMEDIUM
                             scsi_remove_device
                               <scsi_device torn down>
      rescan_partitions
        sd_revalidate_disk
          <oops>
Oopses are reported here:
http://marc.info/?l=linux-scsi&m=132388619710052

This patch separates the partition invalidation from rescan_partitions()
and uses it for the -ENOMEDIUM case.

Reported-by: Huajun Li <huajun.li.lee@gmail.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/partition-generic.c | 48 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 8 deletions(-)

(limited to 'block')

diff --git a/block/partition-generic.c b/block/partition-generic.c
index d06ec1c829c..6df5d6928a4 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -389,17 +389,11 @@ static bool disk_unlock_native_capacity(struct gendisk *disk)
 	}
 }
 
-int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
+static int drop_partitions(struct gendisk *disk, struct block_device *bdev)
 {
-	struct parsed_partitions *state = NULL;
 	struct disk_part_iter piter;
 	struct hd_struct *part;
-	int p, highest, res;
-rescan:
-	if (state && !IS_ERR(state)) {
-		kfree(state);
-		state = NULL;
-	}
+	int res;
 
 	if (bdev->bd_part_count)
 		return -EBUSY;
@@ -412,6 +406,24 @@ rescan:
 		delete_partition(disk, part->partno);
 	disk_part_iter_exit(&piter);
 
+	return 0;
+}
+
+int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
+{
+	struct parsed_partitions *state = NULL;
+	struct hd_struct *part;
+	int p, highest, res;
+rescan:
+	if (state && !IS_ERR(state)) {
+		kfree(state);
+		state = NULL;
+	}
+
+	res = drop_partitions(disk, bdev);
+	if (res)
+		return res;
+
 	if (disk->fops->revalidate_disk)
 		disk->fops->revalidate_disk(disk);
 	check_disk_size_change(disk, bdev);
@@ -515,6 +527,26 @@ rescan:
 	return 0;
 }
 
+int invalidate_partitions(struct gendisk *disk, struct block_device *bdev)
+{
+	int res;
+
+	if (!bdev->bd_invalidated)
+		return 0;
+
+	res = drop_partitions(disk, bdev);
+	if (res)
+		return res;
+
+	set_capacity(disk, 0);
+	check_disk_size_change(disk, bdev);
+	bdev->bd_invalidated = 0;
+	/* tell userspace that the media / partition table may have changed */
+	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
+
+	return 0;
+}
+
 unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
 {
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
-- 
cgit v1.2.3-18-g5258


From 9f53d2fe815b4011ff930a7b6db98385d45faa68 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Fri, 2 Mar 2012 10:43:28 +0100
Subject: block: fix __blkdev_get and add_disk race condition

The following situation might occur:

__blkdev_get:			add_disk:

				register_disk()
get_gendisk()

disk_block_events()
	disk->ev == NULL

				disk_add_events()

__disk_unblock_events()
	disk->ev != NULL
	--ev->block

Then we unblock events when they are supposed to be blocked. This can
trigger event-related block/genhd.c warnings, but can also crash in
sd_check_events() or other places.

I'm able to reproduce crashes with the following scripts (with
connected usb dongle as sdb disk).

<snip>
DEV=/dev/sdb
ENABLE=/sys/bus/usb/devices/1-2/bConfigurationValue

function stop_me()
{
	for i in `jobs -p` ; do kill $i 2> /dev/null ; done
	exit
}

trap stop_me SIGHUP SIGINT SIGTERM

for ((i = 0; i < 10; i++)) ; do
	while true; do fdisk -l $DEV  2>&1 > /dev/null ; done &
done

while true ; do
echo 1 > $ENABLE
sleep 1
echo 0 > $ENABLE
done
</snip>

I use the script to verify the patch fixing the oops in sd_revalidate_disk:
http://marc.info/?l=linux-scsi&m=132935572512352&w=2
Without Jun'ichi Nomura's patch titled "Fix NULL pointer dereference in
sd_revalidate_disk" or this one, the script easily crashes the kernel
within a few seconds. With both patches applied I do not observe a crash.
Unfortunately, after some time (dozens of minutes), the script will hang in:
[ 1563.906432]  [<c08354f5>] schedule_timeout_uninterruptible+0x15/0x20
[ 1563.906437]  [<c04532d5>] msleep+0x15/0x20
[ 1563.906443]  [<c05d60b2>] blk_drain_queue+0x32/0xd0
[ 1563.906447]  [<c05d6e00>] blk_cleanup_queue+0xd0/0x170
[ 1563.906454]  [<c06d278f>] scsi_free_queue+0x3f/0x60
[ 1563.906459]  [<c06d7e6e>] __scsi_remove_device+0x6e/0xb0
[ 1563.906463]  [<c06d4aff>] scsi_forget_host+0x4f/0x60
[ 1563.906468]  [<c06cd84a>] scsi_remove_host+0x5a/0xf0
[ 1563.906482]  [<f7f030fb>] quiesce_and_remove_host+0x5b/0xa0 [usb_storage]
[ 1563.906490]  [<f7f03203>] usb_stor_disconnect+0x13/0x20 [usb_storage]

Anyway, I think this patch is a step forward.

As a drawback, I do not tear down on sysfs file creation error, because I
do not know how to nullify disk->ev (since it can be in use). However,
add_disk error handling practically does not exist either, and things will
work without this sysfs file, except that events will not be exported to
user space.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 23b4f706332..b26c4085590 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -35,6 +35,7 @@ static DEFINE_IDR(ext_devt_idr);
 
 static struct device_type disk_type;
 
+static void disk_alloc_events(struct gendisk *disk);
 static void disk_add_events(struct gendisk *disk);
 static void disk_del_events(struct gendisk *disk);
 static void disk_release_events(struct gendisk *disk);
@@ -601,6 +602,8 @@ void add_disk(struct gendisk *disk)
 	disk->major = MAJOR(devt);
 	disk->first_minor = MINOR(devt);
 
+	disk_alloc_events(disk);
+
 	/* Register BDI before referencing it from bdev */
 	bdi = &disk->queue->backing_dev_info;
 	bdi_register_dev(bdi, disk_devt(disk));
@@ -1733,9 +1736,9 @@ module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
 		&disk_events_dfl_poll_msecs, 0644);
 
 /*
- * disk_{add|del|release}_events - initialize and destroy disk_events.
+ * disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
  */
-static void disk_add_events(struct gendisk *disk)
+static void disk_alloc_events(struct gendisk *disk)
 {
 	struct disk_events *ev;
 
@@ -1748,16 +1751,6 @@ static void disk_add_events(struct gendisk *disk)
 		return;
 	}
 
-	if (sysfs_create_files(&disk_to_dev(disk)->kobj,
-			       disk_events_attrs) < 0) {
-		pr_warn("%s: failed to create sysfs files for events\n",
-			disk->disk_name);
-		kfree(ev);
-		return;
-	}
-
-	disk->ev = ev;
-
 	INIT_LIST_HEAD(&ev->node);
 	ev->disk = disk;
 	spin_lock_init(&ev->lock);
@@ -1766,8 +1759,21 @@ static void disk_add_events(struct gendisk *disk)
 	ev->poll_msecs = -1;
 	INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
 
+	disk->ev = ev;
+}
+
+static void disk_add_events(struct gendisk *disk)
+{
+	if (!disk->ev)
+		return;
+
+	/* FIXME: error handling */
+	if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
+		pr_warn("%s: failed to create sysfs files for events\n",
+			disk->disk_name);
+
 	mutex_lock(&disk_events_mutex);
-	list_add_tail(&ev->node, &disk_events);
+	list_add_tail(&disk->ev->node, &disk_events);
 	mutex_unlock(&disk_events_mutex);
 
 	/*
-- 
cgit v1.2.3-18-g5258


From 62d3c5439c534b0e6c653fc63e6d8c67be3a57b1 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Fri, 2 Mar 2012 10:51:00 +0100
Subject: Block: use a freezable workqueue for disk-event polling

This patch (as1519) fixes a bug in the block layer's disk-events
polling.  The polling is done by a work routine queued on the
system_nrt_wq workqueue.  Since that workqueue isn't freezable, the
polling continues even in the middle of a system sleep transition.

Obviously, polling a suspended drive for media changes and such isn't
a good thing to do; in the case of USB mass-storage devices it can
lead to real problems requiring device resets and even re-enumeration.

The patch fixes things by creating a new system-wide, non-reentrant,
freezable workqueue and using it for disk-events polling.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
CC: <stable@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index b26c4085590..df9816ede75 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1478,9 +1478,9 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now)
 	intv = disk_events_poll_jiffies(disk);
 	set_timer_slack(&ev->dwork.timer, intv / 4);
 	if (check_now)
-		queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
+		queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0);
 	else if (intv)
-		queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
+		queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, intv);
 out_unlock:
 	spin_unlock_irqrestore(&ev->lock, flags);
 }
@@ -1524,7 +1524,7 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask)
 	ev->clearing |= mask;
 	if (!ev->block) {
 		cancel_delayed_work(&ev->dwork);
-		queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
+		queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0);
 	}
 	spin_unlock_irq(&ev->lock);
 }
@@ -1561,7 +1561,7 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
 
 	/* uncondtionally schedule event check and wait for it to finish */
 	disk_block_events(disk);
-	queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
+	queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0);
 	flush_delayed_work(&ev->dwork);
 	__disk_unblock_events(disk, false);
 
@@ -1598,7 +1598,7 @@ static void disk_events_workfn(struct work_struct *work)
 
 	intv = disk_events_poll_jiffies(disk);
 	if (!ev->block && intv)
-		queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
+		queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, intv);
 
 	spin_unlock_irq(&ev->lock);
 
-- 
cgit v1.2.3-18-g5258


From ff8c1474cc2f5e11414c71ec4d739c18e6e669c0 Mon Sep 17 00:00:00 2001
From: Xiaotian Feng <xtfeng@gmail.com>
Date: Wed, 14 Mar 2012 15:34:48 +0100
Subject: block: fix ioc leak in put_io_context

When put_io_context is called, if ioc->icq_list is empty and the refcount
is 1, the kernel will not free the ioc.

This is caught by the following kmemleak report:

unreferenced object 0xffff880036349fe0 (size 216):
  comm "sh", pid 2137, jiffies 4294931140 (age 290579.412s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    01 00 01 00 ad 4e ad de ff ff ff ff 00 00 00 00  .....N..........
  backtrace:
    [<ffffffff8169f926>] kmemleak_alloc+0x26/0x50
    [<ffffffff81195a9c>] kmem_cache_alloc_node+0x1cc/0x2a0
    [<ffffffff81356b67>] create_io_context_slowpath+0x27/0x130
    [<ffffffff81356d2b>] get_task_io_context+0xbb/0xf0
    [<ffffffff81055f0e>] copy_process+0x188e/0x18b0
    [<ffffffff8105609b>] do_fork+0x11b/0x420
    [<ffffffff810247f8>] sys_clone+0x28/0x30
    [<ffffffff816d3373>] stub_clone+0x13/0x20
    [<ffffffffffffffff>] 0xffffffffffffffff

ioc should be freed if ioc->icq_list is empty.
Signed-off-by: Xiaotian Feng <dannyfeng@tencent.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-ioc.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'block')

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 92bf55540d8..fb95dd2f889 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -130,6 +130,7 @@ static void ioc_release_fn(struct work_struct *work)
 void put_io_context(struct io_context *ioc)
 {
 	unsigned long flags;
+	bool free_ioc = false;
 
 	if (ioc == NULL)
 		return;
@@ -144,8 +145,13 @@ void put_io_context(struct io_context *ioc)
 		spin_lock_irqsave(&ioc->lock, flags);
 		if (!hlist_empty(&ioc->icq_list))
 			schedule_work(&ioc->release_work);
+		else
+			free_ioc = true;
 		spin_unlock_irqrestore(&ioc->lock, flags);
 	}
+
+	if (free_ioc)
+		kmem_cache_free(iocontext_cachep, ioc);
 }
 EXPORT_SYMBOL(put_io_context);
 
-- 
cgit v1.2.3-18-g5258