From d30a2605be9d5132d95944916e8f578fcfe4f976 Mon Sep 17 00:00:00 2001 From: David Woodhouse <David.Woodhouse@intel.com> Date: Mon, 11 Aug 2008 15:58:42 +0100 Subject: Add BLKDISCARD ioctl to allow userspace to discard sectors We may well want mkfs tools to use this to mark the whole device as unwanted before they format it, for example. The ioctl takes a pair of uint64_ts, which are start offset and length in _bytes_. Although at the moment it might make sense for them both to be in 512-byte sectors, I don't want to limit the ABI to that. Signed-off-by: David Woodhouse <David.Woodhouse@intel.com> Signed-off-by: Jens Axboe <jens.axboe@oracle.com> --- block/ioctl.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) (limited to 'block/ioctl.c') diff --git a/block/ioctl.c b/block/ioctl.c index 77185e5c026..342298bb608 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -111,6 +111,69 @@ static int blkdev_reread_part(struct block_device *bdev) return res; } +static void blk_ioc_discard_endio(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + complete(bio->bi_private); +} + +static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, + uint64_t len) +{ + struct request_queue *q = bdev_get_queue(bdev); + int ret = 0; + + if (start & 511) + return -EINVAL; + if (len & 511) + return -EINVAL; + start >>= 9; + len >>= 9; + + if (start + len > (bdev->bd_inode->i_size >> 9)) + return -EINVAL; + + if (!q->prepare_discard_fn) + return -EOPNOTSUPP; + + while (len && !ret) { + DECLARE_COMPLETION_ONSTACK(wait); + struct bio *bio; + + bio = bio_alloc(GFP_KERNEL, 0); + if (!bio) + return -ENOMEM; + + bio->bi_end_io = blk_ioc_discard_endio; + bio->bi_bdev = bdev; + bio->bi_private = &wait; + bio->bi_sector = start; + + if (len > q->max_hw_sectors) { + bio->bi_size = q->max_hw_sectors << 9; + len -= q->max_hw_sectors; + start += q->max_hw_sectors; + } else { + bio->bi_size = len << 9; + len = 0; + } + submit_bio(WRITE_DISCARD, bio); + + wait_for_completion(&wait); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + else if (!bio_flagged(bio, BIO_UPTODATE)) + ret = -EIO; + bio_put(bio); + } + return ret; +} + static int put_ushort(unsigned long arg, unsigned short val) { return put_user(val, (unsigned short __user *)arg); @@ -258,6 +321,19 @@ int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd, set_device_ro(bdev, n); unlock_kernel(); return 0; + + case BLKDISCARD: { + uint64_t range[2]; + + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(range, (void __user *)arg, sizeof(range))) + return -EFAULT; + + return blk_ioctl_discard(bdev, range[0], range[1]); + } + case HDIO_GETGEO: { struct hd_geometry geo; -- cgit v1.2.3-18-g5258 From e17fc0a1ccf88f6d4dcb363729f3141b0958c325 Mon Sep 17 00:00:00 2001 From: David Woodhouse <David.Woodhouse@intel.com> Date: Sat, 9 Aug 2008 16:42:20 +0100 Subject: Allow elevators to sort/merge discard requests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit But blkdev_issue_discard() still emits requests which are interpreted as soft barriers, because naïve callers might otherwise issue subsequent writes to those same sectors, which might cross on the queue (if they're reallocated quickly enough). Callers still _can_ issue non-barrier discard requests, but they have to take care of queue ordering for themselves. Signed-off-by: David Woodhouse <David.Woodhouse@intel.com> Signed-off-by: Jens Axboe <jens.axboe@oracle.com> --- block/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block/ioctl.c') diff --git a/block/ioctl.c b/block/ioctl.c index 342298bb608..375c57922b0 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -161,7 +161,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, bio->bi_size = len << 9; len = 0; } - submit_bio(WRITE_DISCARD, bio); + submit_bio(DISCARD_NOBARRIER, bio); wait_for_completion(&wait); -- cgit v1.2.3-18-g5258 From ec2cdedf798385a9397ac50dd0405dd658f8529c Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 25 Aug 2008 19:30:15 +0900 Subject: block: allow deleting zero length partition delete_partition() was noop for zero length partition. As the addition code allows creating zero lenght partition and deletion is assumed to always succeed, this causes memory leak for zero length partitions. Allow zero length partitions to end their meaningless lives. While at it, allow deleting zero lenght partition via BLKPG_DEL_PARTITION ioctl too. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jens.axboe@oracle.com> --- block/ioctl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'block/ioctl.c') diff --git a/block/ioctl.c b/block/ioctl.c index 375c57922b0..c722de0ef2e 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -68,8 +68,6 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user case BLKPG_DEL_PARTITION: if (!disk->part[part-1]) return -ENXIO; - if (disk->part[part - 1]->nr_sects == 0) - return -ENXIO; bdevp = bdget_disk(disk, part); if (!bdevp) return -ENOMEM; -- cgit v1.2.3-18-g5258 From 88e341261ca4d39eec21b212961c77eff51105f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 25 Aug 2008 19:30:16 +0900 Subject: block: update add_partition() error handling d805dda4 tried to fix error case handling in add_partition() but had a few problems. * disk->part[] entry is set early and left dangling if operation fails. * Once device initialized, the last put_device() is responsible for freeing all the resources. The failure path freed part_stats and p regardless of put_device() causing double free. * holders subdir holds reference to the disk device, so failure path should remove it to release resources properly which was missing. This patch fixes the above problems and while at it move partition slot busy check into add_partition() for completeness and inlines holders subdirectory creation. Using separate function for it just obfuscates the code. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Abdel Benamrouche <draconux@gmail.com> Signed-off-by: Jens Axboe <jens.axboe@oracle.com> --- block/ioctl.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'block/ioctl.c') diff --git a/block/ioctl.c b/block/ioctl.c index c722de0ef2e..eb046aeede8 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -43,12 +43,9 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user || pstart < 0 || plength < 0) return -EINVAL; } - /* partition number in use? */ + mutex_lock(&bdev->bd_mutex); - if (disk->part[part - 1]) { - mutex_unlock(&bdev->bd_mutex); - return -EBUSY; - } + /* overlap? */ for (i = 0; i < disk->minors - 1; i++) { struct hd_struct *s = disk->part[i]; -- cgit v1.2.3-18-g5258 From cf771cb5a7b716f3f9e532fd42a1e3a0a75adec5 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 3 Sep 2008 09:01:09 +0200 Subject: block: make variable and argument names more consistent In hd_struct, @partno is used to denote partition number and a number of other places use @part to denote hd_struct. Functions use @part and @index instead. This causes confusion and makes it difficult to use consistent variable names for hd_struct. Always use @partno if a variable represents partition number. Also, print out functions use @f or @part for seq_file argument. Use @seqf uniformly instead. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jens.axboe@oracle.com> --- block/ioctl.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'block/ioctl.c') diff --git a/block/ioctl.c b/block/ioctl.c index eb046aeede8..d77f5e280a6 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -15,7 +15,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user struct blkpg_ioctl_arg a; struct blkpg_partition p; long long start, length; - int part; + int partno; int i; int err; @@ -28,8 +28,8 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user disk = bdev->bd_disk; if (bdev != bdev->bd_contains) return -EINVAL; - part = p.pno; - if (part <= 0 || part >= disk->minors) + partno = p.pno; + if (partno <= 0 || partno >= disk->minors) return -EINVAL; switch (a.op) { case BLKPG_ADD_PARTITION: @@ -59,13 +59,14 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user } } /* all seems OK */ - err = add_partition(disk, part, start, length, ADDPART_FLAG_NONE); + err = add_partition(disk, partno, start, length, + ADDPART_FLAG_NONE); mutex_unlock(&bdev->bd_mutex); return err; case BLKPG_DEL_PARTITION: - if (!disk->part[part-1]) + if (!disk->part[partno - 1]) return -ENXIO; - bdevp = bdget_disk(disk, part); + bdevp = bdget_disk(disk, partno); if (!bdevp) return -ENOMEM; mutex_lock(&bdevp->bd_mutex); @@ -79,7 +80,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user invalidate_bdev(bdevp); mutex_lock_nested(&bdev->bd_mutex, 1); - delete_partition(disk, part); + delete_partition(disk, partno); mutex_unlock(&bdev->bd_mutex); mutex_unlock(&bdevp->bd_mutex); bdput(bdevp); -- cgit v1.2.3-18-g5258 From f331c0296f2a9fee0d396a70598b954062603015 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 3 Sep 2008 09:01:48 +0200 Subject: block: don't depend on consecutive minor space * Implement disk_devt() and part_devt() and use them to directly access devt instead of computing it from ->major and ->first_minor. Note that all references to ->major and ->first_minor outside of block layer is used to determine devt of the disk (the part0) and as ->major and ->first_minor will continue to represent devt for the disk, converting these users aren't strictly necessary. However, convert them for consistency. * Implement disk_max_parts() to avoid directly deferencing genhd->minors. * Update bdget_disk() such that it doesn't assume consecutive minor space. * Move devt computation from register_disk() to add_disk() and make it the only one (all other usages use the initially determined value). These changes clean up the code and will help disk->part dereference fix and extended block device numbers. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jens.axboe@oracle.com> --- block/ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block/ioctl.c') diff --git a/block/ioctl.c b/block/ioctl.c index d77f5e280a6..403f7d7e0c2 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -29,7 +29,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user if (bdev != bdev->bd_contains) return -EINVAL; partno = p.pno; - if (partno <= 0 || partno >= disk->minors) + if (partno <= 0 || partno > disk_max_parts(disk)) return -EINVAL; switch (a.op) { case BLKPG_ADD_PARTITION: @@ -47,7 +47,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user mutex_lock(&bdev->bd_mutex); /* overlap? */ - for (i = 0; i < disk->minors - 1; i++) { + for (i = 0; i < disk_max_parts(disk); i++) { struct hd_struct *s = disk->part[i]; if (!s) @@ -96,7 +96,7 @@ static int blkdev_reread_part(struct block_device *bdev) struct gendisk *disk = bdev->bd_disk; int res; - if (disk->minors == 1 || bdev != bdev->bd_contains) + if (!disk_max_parts(disk) || bdev != bdev->bd_contains) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; -- cgit v1.2.3-18-g5258 From e71bf0d0ee89e51b92776391c5634938236977d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 3 Sep 2008 09:03:02 +0200 Subject: block: fix disk->part[] dereferencing race disk->part[] is protected by its matching bdev's lock. However, non-critical accesses like collecting stats and printing out sysfs and proc information used to be performed without any locking. As partitions can come and go dynamically, partitions can go away underneath those non-critical accesses. As some of those accesses are writes, this theoretically can lead to silent corruption. This patch fixes the race by using RCU for the partition array and dev reference counter to hold partitions. * Rename disk->part[] to disk->__part[] to make sure no one outside genhd layer proper accesses it directly. * Use RCU for disk->__part[] dereferencing. * Implement disk_{get|put}_part() which can be used to get and put partitions from gendisk respectively. * Iterators are implemented to help iterate through all partitions safely. * Functions which require RCU readlock are marked with _rcu suffix. * Use disk_put_part() in __blkdev_put() instead of directly putting the contained kobject. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jens.axboe@oracle.com> --- block/ioctl.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'block/ioctl.c') diff --git a/block/ioctl.c b/block/ioctl.c index 403f7d7e0c2..a5f672ad55f 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -12,11 +12,12 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user { struct block_device *bdevp; struct gendisk *disk; + struct hd_struct *part; struct blkpg_ioctl_arg a; struct blkpg_partition p; + struct disk_part_iter piter; long long start, length; int partno; - int i; int err; if (!capable(CAP_SYS_ADMIN)) @@ -47,28 +48,33 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user mutex_lock(&bdev->bd_mutex); /* overlap? */ - for (i = 0; i < disk_max_parts(disk); i++) { - struct hd_struct *s = disk->part[i]; - - if (!s) - continue; - if (!(start+length <= s->start_sect || - start >= s->start_sect + s->nr_sects)) { + disk_part_iter_init(&piter, disk, + DISK_PITER_INCL_EMPTY); + while ((part = disk_part_iter_next(&piter))) { + if (!(start + length <= part->start_sect || + start >= part->start_sect + part->nr_sects)) { + disk_part_iter_exit(&piter); mutex_unlock(&bdev->bd_mutex); return -EBUSY; } } + disk_part_iter_exit(&piter); + /* all seems OK */ err = add_partition(disk, partno, start, length, ADDPART_FLAG_NONE); mutex_unlock(&bdev->bd_mutex); return err; case BLKPG_DEL_PARTITION: - if (!disk->part[partno - 1]) + part = disk_get_part(disk, partno); + if (!part) return -ENXIO; - bdevp = bdget_disk(disk, partno); + + bdevp = bdget(part_devt(part)); + disk_put_part(part); if (!bdevp) return -ENOMEM; + mutex_lock(&bdevp->bd_mutex); if (bdevp->bd_openers) { mutex_unlock(&bdevp->bd_mutex); -- cgit v1.2.3-18-g5258 From b5d0b9df0ba5d9a044f3a21e7544f53d90bd1465 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 3 Sep 2008 09:06:42 +0200 Subject: block: introduce partition 0 genhd and partition code handled disk and partitions separately. All information about the whole disk was in struct genhd and partitions in struct hd_struct. However, the whole disk (part0) and other partitions have a lot in common and the data structures end up having good number of common fields and thus separate code paths doing the same thing. Also, the partition array was indexed by partno - 1 which gets pretty confusing at times. This patch introduces partition 0 and makes the partition array indexed by partno. Following patches will unify the handling of disk and parts piece-by-piece. This patch also implements disk_partitionable() which tests whether a disk is partitionable. With coming dynamic partition array change, the most common usage of disk_max_parts() will be testing whether a disk is partitionable and the number of max partitions will become much less important. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jens.axboe@oracle.com> --- block/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block/ioctl.c') diff --git a/block/ioctl.c b/block/ioctl.c index a5f672ad55f..64e7c67a64b 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -30,7 +30,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user if (bdev != bdev->bd_contains) return -EINVAL; partno = p.pno; - if (partno <= 0 || partno > disk_max_parts(disk)) + if (partno <= 0 || partno >= disk_max_parts(disk)) return -EINVAL; switch (a.op) { case BLKPG_ADD_PARTITION: @@ -102,7 +102,7 @@ static int blkdev_reread_part(struct block_device *bdev) struct gendisk *disk = bdev->bd_disk; int res; - if (!disk_max_parts(disk) || bdev != bdev->bd_contains) + if (!disk_partitionable(disk) || bdev != bdev->bd_contains) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; -- cgit v1.2.3-18-g5258 From 540eed5637b766bb1e881ef744c42617760b4815 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 25 Aug 2008 19:56:15 +0900 Subject: block: make partition array dynamic disk->__part used to be statically allocated to the maximum possible number of partitions. This patch makes partition array allocation dynamic. The added overhead is minimal as only real change is one memory dereference changed to RCU one. This saves both a bit of memory and cpu cycles iterating through unoccupied slots and makes increasing partition limit easier. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jens.axboe@oracle.com> --- block/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block/ioctl.c') diff --git a/block/ioctl.c b/block/ioctl.c index 64e7c67a64b..38bee321e1f 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -30,7 +30,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user if (bdev != bdev->bd_contains) return -EINVAL; partno = p.pno; - if (partno <= 0 || partno >= disk_max_parts(disk)) + if (partno <= 0) return -EINVAL; switch (a.op) { case BLKPG_ADD_PARTITION: -- cgit v1.2.3-18-g5258