diff options
Diffstat (limited to 'block/genhd.c')
| -rw-r--r-- | block/genhd.c | 792 |
1 files changed, 697 insertions, 95 deletions
diff --git a/block/genhd.c b/block/genhd.c index a9ec910974c..791f4194313 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -15,19 +15,18 @@ #include <linux/slab.h> #include <linux/kmod.h> #include <linux/kobj_map.h> -#include <linux/buffer_head.h> #include <linux/mutex.h> #include <linux/idr.h> +#include <linux/log2.h> +#include <linux/pm_runtime.h> #include "blk.h" static DEFINE_MUTEX(block_class_lock); -#ifndef CONFIG_SYSFS_DEPRECATED struct kobject *block_depr; -#endif /* for extended dynamic devt allocation, currently only one major is used */ -#define MAX_EXT_DEVT (1 << MINORBITS) +#define NR_EXT_DEVT (1 << MINORBITS) /* For extended devt allocation. ext_devt_mutex prevents look up * results from going away underneath its user. @@ -37,6 +36,13 @@ static DEFINE_IDR(ext_devt_idr); static struct device_type disk_type; +static void disk_check_events(struct disk_events *ev, + unsigned int *clearing_ptr); +static void disk_alloc_events(struct gendisk *disk); +static void disk_add_events(struct gendisk *disk); +static void disk_del_events(struct gendisk *disk); +static void disk_release_events(struct gendisk *disk); + /** * disk_get_part - get partition * @disk: disk to look partition from @@ -98,7 +104,7 @@ void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, if (flags & DISK_PITER_REVERSE) piter->idx = ptbl->len - 1; - else if (flags & DISK_PITER_INCL_PART0) + else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0)) piter->idx = 0; else piter->idx = 1; @@ -134,7 +140,8 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) /* determine iteration parameters */ if (piter->flags & DISK_PITER_REVERSE) { inc = -1; - if (piter->flags & DISK_PITER_INCL_PART0) + if (piter->flags & (DISK_PITER_INCL_PART0 | + DISK_PITER_INCL_EMPTY_PART0)) end = -1; else end = 0; @@ -150,7 +157,10 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) part = rcu_dereference(ptbl->part[piter->idx]); if (!part) continue; - if (!(piter->flags & DISK_PITER_INCL_EMPTY) && !part->nr_sects) + if (!part_nr_sects_read(part) && + !(piter->flags & DISK_PITER_INCL_EMPTY) && + !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && + piter->idx == 0)) continue; get_device(part_to_dev(part)); @@ -184,7 +194,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); static inline int sector_in_part(struct hd_struct *part, sector_t sector) { return part->start_sect <= sector && - sector < part->start_sect + part->nr_sects; + sector < part->start_sect + part_nr_sects_read(part); } /** @@ -237,7 +247,7 @@ static struct blk_major_name { } *major_names[BLKDEV_MAJOR_HASH_SIZE]; /* index in the above - for now: assume no multimajor ranges */ -static inline int major_to_index(int major) +static inline int major_to_index(unsigned major) { return major % BLKDEV_MAJOR_HASH_SIZE; } @@ -401,7 +411,7 @@ static int blk_mangle_minor(int minor) int blk_alloc_devt(struct hd_struct *part, dev_t *devt) { struct gendisk *disk = part_to_disk(part); - int idx, rc; + int idx; /* in consecutive minor range? */ if (part->partno < disk->minors) { @@ -410,19 +420,11 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) } /* allocate ext devt */ - do { - if (!idr_pre_get(&ext_devt_idr, GFP_KERNEL)) - return -ENOMEM; - rc = idr_get_new(&ext_devt_idr, part, &idx); - } while (rc == -EAGAIN); - - if (rc) - return rc; - - if (idx > MAX_EXT_DEVT) { - idr_remove(&ext_devt_idr, idx); - return -EBUSY; - } + mutex_lock(&ext_devt_mutex); + idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_KERNEL); + mutex_unlock(&ext_devt_mutex); + if (idx < 0) + return idx == -ENOSPC ? -EBUSY : idx; *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx)); return 0; @@ -500,6 +502,72 @@ static int exact_lock(dev_t devt, void *data) return 0; } +static void register_disk(struct gendisk *disk) +{ + struct device *ddev = disk_to_dev(disk); + struct block_device *bdev; + struct disk_part_iter piter; + struct hd_struct *part; + int err; + + ddev->parent = disk->driverfs_dev; + + dev_set_name(ddev, "%s", disk->disk_name); + + /* delay uevents, until we scanned partition table */ + dev_set_uevent_suppress(ddev, 1); + + if (device_add(ddev)) + return; + if (!sysfs_deprecated) { + err = sysfs_create_link(block_depr, &ddev->kobj, + kobject_name(&ddev->kobj)); + if (err) { + device_del(ddev); + return; + } + } + + /* + * avoid probable deadlock caused by allocating memory with + * GFP_KERNEL in runtime_resume callback of its all ancestor + * devices + */ + pm_runtime_set_memalloc_noio(ddev, true); + + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + + /* No minors to use for partitions */ + if (!disk_part_scan_enabled(disk)) + goto exit; + + /* No such device (e.g., media were just removed) */ + if (!get_capacity(disk)) + goto exit; + + bdev = bdget_disk(disk, 0); + if (!bdev) + goto exit; + + bdev->bd_invalidated = 1; + err = blkdev_get(bdev, FMODE_READ, NULL); + if (err < 0) + goto exit; + blkdev_put(bdev, FMODE_READ); + +exit: + /* announce disk after possible partitions are created */ + dev_set_uevent_suppress(ddev, 0); + kobject_uevent(&ddev->kobj, KOBJ_ADD); + + /* announce possible partitions */ + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + disk_part_iter_exit(&piter); +} + /** * add_disk - add partitioning information to kernel list * @disk: per-device partitioning information @@ -537,28 +605,69 @@ void add_disk(struct gendisk *disk) disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); + disk_alloc_events(disk); + + /* Register BDI before referencing it from bdev */ + bdi = &disk->queue->backing_dev_info; + bdi_register_dev(bdi, disk_devt(disk)); + blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); register_disk(disk); blk_register_queue(disk); - bdi = &disk->queue->backing_dev_info; - bdi_register_dev(bdi, disk_devt(disk)); + /* + * Take an extra ref on queue which will be put on disk_release() + * so that it sticks around as long as @disk is there. + */ + WARN_ON_ONCE(!blk_get_queue(disk->queue)); + retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); -} + disk_add_events(disk); +} EXPORT_SYMBOL(add_disk); -EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ -void unlink_gendisk(struct gendisk *disk) +void del_gendisk(struct gendisk *disk) { + struct disk_part_iter piter; + struct hd_struct *part; + + disk_del_events(disk); + + /* invalidate stuff */ + disk_part_iter_init(&piter, disk, + DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); + while ((part = disk_part_iter_next(&piter))) { + invalidate_partition(disk, part->partno); + delete_partition(disk, part->partno); + } + disk_part_iter_exit(&piter); + + invalidate_partition(disk, 0); + set_capacity(disk, 0); + disk->flags &= ~GENHD_FL_UP; + sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); bdi_unregister(&disk->queue->backing_dev_info); blk_unregister_queue(disk); blk_unregister_region(disk_devt(disk), disk->minors); + + part_stat_set_all(&disk->part0, 0); + disk->part0.stamp = 0; + + kobject_put(disk->part0.holder_dir); + kobject_put(disk->slave_dir); + disk->driverfs_dev = NULL; + if (!sysfs_deprecated) + sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); + pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); + device_del(disk_to_dev(disk)); + blk_free_devt(disk_to_dev(disk)->devt); } +EXPORT_SYMBOL(del_gendisk); /** * get_gendisk - get partitioning information for a given device @@ -592,6 +701,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) return disk; } +EXPORT_SYMBOL(get_gendisk); /** * bdget_disk - do bdget() by gendisk and partition number @@ -640,7 +750,7 @@ void __init printk_all_partitions(void) /* * Don't show empty devices or things that have been - * surpressed + * suppressed */ if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) @@ -655,10 +765,11 @@ void __init printk_all_partitions(void) while ((part = disk_part_iter_next(&piter))) { bool is_part0 = part == &disk->part0; - printk("%s%s %10llu %s", is_part0 ? "" : " ", + printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), - (unsigned long long)part->nr_sects >> 1, - disk_name(disk, part->partno, name_buf)); + (unsigned long long)part_nr_sects_read(part) >> 1 + , disk_name(disk, part->partno, name_buf), + part->info ? part->info->uuid : ""); if (is_part0) { if (disk->driverfs_dev != NULL && disk->driverfs_dev->driver != NULL) @@ -722,10 +833,10 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v) static void *show_partition_start(struct seq_file *seqf, loff_t *pos) { - static void *p; + void *p; p = disk_seqf_start(seqf, pos); - if (!IS_ERR(p) && p && !*pos) + if (!IS_ERR_OR_NULL(p) && !*pos) seq_puts(seqf, "major minor #blocks name\n\n"); return p; } @@ -738,7 +849,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_partitionable(sgp) && + if (!get_capacity(sgp) || (!disk_max_parts(sgp) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) @@ -749,7 +860,7 @@ static int show_partition(struct seq_file *seqf, void *v) while ((part = disk_part_iter_next(&piter))) seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(part_devt(part)), MINOR(part_devt(part)), - (unsigned long long)part->nr_sects >> 1, + (unsigned long long)part_nr_sects_read(part) >> 1, disk_name(sgp, part->partno, buf)); disk_part_iter_exit(&piter); @@ -798,10 +909,9 @@ static int __init genhd_device_init(void) register_blkdev(BLOCK_EXT_MAJOR, "blkext"); -#ifndef CONFIG_SYSFS_DEPRECATED /* create top-level block dir */ - block_depr = kobject_create_and_add("block", NULL); -#endif + if (!sysfs_deprecated) + block_depr = kobject_create_and_add("block", NULL); return 0; } @@ -848,13 +958,35 @@ static ssize_t disk_capability_show(struct device *dev, return sprintf(buf, "%x\n", disk->flags); } +static ssize_t disk_alignment_offset_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); +} + +static ssize_t disk_discard_alignment_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); +} + static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); +static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show, + NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -871,8 +1003,11 @@ static struct attribute *disk_attrs[] = { &dev_attr_removable.attr, &dev_attr_ro.attr, &dev_attr_size.attr, + &dev_attr_alignment_offset.attr, + &dev_attr_discard_alignment.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, + &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -886,19 +1021,11 @@ static struct attribute_group disk_attr_group = { .attrs = disk_attrs, }; -static struct attribute_group *disk_attr_groups[] = { +static const struct attribute_group *disk_attr_groups[] = { &disk_attr_group, NULL }; -static void disk_free_ptbl_rcu_cb(struct rcu_head *head) -{ - struct disk_part_tbl *ptbl = - container_of(head, struct disk_part_tbl, rcu_head); - - kfree(ptbl); -} - /** * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way * @disk: disk to replace part_tbl for @@ -919,7 +1046,7 @@ static void disk_replace_part_tbl(struct gendisk *disk, if (old_ptbl) { rcu_assign_pointer(old_ptbl->last_lookup, NULL); - call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); + kfree_rcu(old_ptbl, rcu_head); } } @@ -958,7 +1085,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) if (!new_ptbl) return -ENOMEM; - INIT_RCU_HEAD(&new_ptbl->rcu_head); new_ptbl->len = target; for (i = 0; i < len; i++) @@ -972,19 +1098,34 @@ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); + disk_release_events(disk); kfree(disk->random); disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); + free_part_info(&disk->part0); + if (disk->queue) + blk_put_queue(disk->queue); kfree(disk); } struct class block_class = { .name = "block", }; +static char *block_devnode(struct device *dev, umode_t *mode, + kuid_t *uid, kgid_t *gid) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (disk->devnode) + return disk->devnode(disk, mode); + return NULL; +} + static struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, + .devnode = block_devnode, }; #ifdef CONFIG_PROC_FS @@ -1010,31 +1151,31 @@ static int diskstats_show(struct seq_file *seqf, void *v) "wsect wuse running use aveq" "\n\n"); */ - - disk_part_iter_init(&piter, gp, DISK_PITER_INCL_PART0); + + disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { cpu = part_stat_lock(); part_round_stats(cpu, hd); part_stat_unlock(); - seq_printf(seqf, "%4d %7d %s %lu %lu %llu " - "%u %lu %lu %llu %u %u %u %u\n", + seq_printf(seqf, "%4d %7d %s %lu %lu %lu " + "%u %lu %lu %lu %u %u %u %u\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), - part_stat_read(hd, ios[0]), - part_stat_read(hd, merges[0]), - (unsigned long long)part_stat_read(hd, sectors[0]), - jiffies_to_msecs(part_stat_read(hd, ticks[0])), - part_stat_read(hd, ios[1]), - part_stat_read(hd, merges[1]), - (unsigned long long)part_stat_read(hd, sectors[1]), - jiffies_to_msecs(part_stat_read(hd, ticks[1])), - hd->in_flight, + part_stat_read(hd, ios[READ]), + part_stat_read(hd, merges[READ]), + part_stat_read(hd, sectors[READ]), + jiffies_to_msecs(part_stat_read(hd, ticks[READ])), + part_stat_read(hd, ios[WRITE]), + part_stat_read(hd, merges[WRITE]), + part_stat_read(hd, sectors[WRITE]), + jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), + part_in_flight(hd), jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)) ); } disk_part_iter_exit(&piter); - + return 0; } @@ -1066,29 +1207,6 @@ static int __init proc_genhd_init(void) module_init(proc_genhd_init); #endif /* CONFIG_PROC_FS */ -static void media_change_notify_thread(struct work_struct *work) -{ - struct gendisk *gd = container_of(work, struct gendisk, async_notify); - char event[] = "MEDIA_CHANGE=1"; - char *envp[] = { event, NULL }; - - /* - * set enviroment vars to indicate which event this is for - * so that user space will know to go check the media status. - */ - kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); - put_device(gd->driverfs_dev); -} - -#if 0 -void genhd_media_change_notify(struct gendisk *disk) -{ - get_device(disk->driverfs_dev); - schedule_work(&disk->async_notify); -} -EXPORT_SYMBOL_GPL(genhd_media_change_notify); -#endif /* 0 */ - dev_t blk_lookup_devt(const char *name, int partno) { dev_t devt = MKDEV(0, 0); @@ -1126,7 +1244,7 @@ EXPORT_SYMBOL(blk_lookup_devt); struct gendisk *alloc_disk(int minors) { - return alloc_disk_node(minors, -1); + return alloc_disk_node(minors, NUMA_NO_NODE); } EXPORT_SYMBOL(alloc_disk); @@ -1134,8 +1252,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id) { struct gendisk *disk; - disk = kmalloc_node(sizeof(struct gendisk), - GFP_KERNEL | __GFP_ZERO, node_id); + disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (disk) { if (!init_part_stats(&disk->part0)) { kfree(disk); @@ -1149,13 +1266,23 @@ struct gendisk *alloc_disk_node(int minors, int node_id) } disk->part_tbl->part[0] = &disk->part0; + /* + * set_capacity() and get_capacity() currently don't use + * seqcounter to read/update the part0->nr_sects. Still init + * the counter as we can read the sectors in IO submission + * patch using seqence counters. + * + * TODO: Ideally set_capacity() and get_capacity() should be + * converted to make use of bd_mutex and sequence counters. + */ + seqcount_init(&disk->part0.nr_sects_seq); + hd_ref_init(&disk->part0); + disk->minors = minors; rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); - INIT_WORK(&disk->async_notify, - media_change_notify_thread); } return disk; } @@ -1190,6 +1317,16 @@ void put_disk(struct gendisk *disk) EXPORT_SYMBOL(put_disk); +static void set_disk_ro_uevent(struct gendisk *gd, int ro) +{ + char event[] = "DISK_RO=1"; + char *envp[] = { event, NULL }; + + if (!ro) + event[8] = '0'; + kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); +} + void set_device_ro(struct block_device *bdev, int flag) { bdev->bd_part->policy = flag; @@ -1202,8 +1339,12 @@ void set_disk_ro(struct gendisk *disk, int flag) struct disk_part_iter piter; struct hd_struct *part; - disk_part_iter_init(&piter, disk, - DISK_PITER_INCL_EMPTY | DISK_PITER_INCL_PART0); + if (disk->part0.policy != flag) { + set_disk_ro_uevent(disk, flag); + disk->part0.policy = flag; + } + + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) part->policy = flag; disk_part_iter_exit(&piter); @@ -1226,10 +1367,471 @@ int invalidate_partition(struct gendisk *disk, int partno) struct block_device *bdev = bdget_disk(disk, partno); if (bdev) { fsync_bdev(bdev); - res = __invalidate_device(bdev); + res = __invalidate_device(bdev, true); bdput(bdev); } return res; } EXPORT_SYMBOL(invalidate_partition); + +/* + * Disk events - monitor disk events like media change and eject request. + */ +struct disk_events { + struct list_head node; /* all disk_event's */ + struct gendisk *disk; /* the associated disk */ + spinlock_t lock; + + struct mutex block_mutex; /* protects blocking */ + int block; /* event blocking depth */ + unsigned int pending; /* events already sent out */ + unsigned int clearing; /* events being cleared */ + + long poll_msecs; /* interval, -1 for default */ + struct delayed_work dwork; +}; + +static const char *disk_events_strs[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", +}; + +static char *disk_uevents[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", +}; + +/* list of all disk_events */ +static DEFINE_MUTEX(disk_events_mutex); +static LIST_HEAD(disk_events); + +/* disable in-kernel polling by default */ +static unsigned long disk_events_dfl_poll_msecs = 0; + +static unsigned long disk_events_poll_jiffies(struct gendisk *disk) +{ + struct disk_events *ev = disk->ev; + long intv_msecs = 0; + + /* + * If device-specific poll interval is set, always use it. If + * the default is being used, poll iff there are events which + * can't be monitored asynchronously. + */ + if (ev->poll_msecs >= 0) + intv_msecs = ev->poll_msecs; + else if (disk->events & ~disk->async_events) + intv_msecs = disk_events_dfl_poll_msecs; + + return msecs_to_jiffies(intv_msecs); +} + +/** + * disk_block_events - block and flush disk event checking + * @disk: disk to block events for + * + * On return from this function, it is guaranteed that event checking + * isn't in progress and won't happen until unblocked by + * disk_unblock_events(). Events blocking is counted and the actual + * unblocking happens after the matching number of unblocks are done. + * + * Note that this intentionally does not block event checking from + * disk_clear_events(). + * + * CONTEXT: + * Might sleep. + */ +void disk_block_events(struct gendisk *disk) +{ + struct disk_events *ev = disk->ev; + unsigned long flags; + bool cancel; + + if (!ev) + return; + + /* + * Outer mutex ensures that the first blocker completes canceling + * the event work before further blockers are allowed to finish. + */ + mutex_lock(&ev->block_mutex); + + spin_lock_irqsave(&ev->lock, flags); + cancel = !ev->block++; + spin_unlock_irqrestore(&ev->lock, flags); + + if (cancel) + cancel_delayed_work_sync(&disk->ev->dwork); + + mutex_unlock(&ev->block_mutex); +} + +static void __disk_unblock_events(struct gendisk *disk, bool check_now) +{ + struct disk_events *ev = disk->ev; + unsigned long intv; + unsigned long flags; + + spin_lock_irqsave(&ev->lock, flags); + + if (WARN_ON_ONCE(ev->block <= 0)) + goto out_unlock; + + if (--ev->block) + goto out_unlock; + + /* + * Not exactly a latency critical operation, set poll timer + * slack to 25% and kick event check. + */ + intv = disk_events_poll_jiffies(disk); + set_timer_slack(&ev->dwork.timer, intv / 4); + if (check_now) + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, 0); + else if (intv) + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, intv); +out_unlock: + spin_unlock_irqrestore(&ev->lock, flags); +} + +/** + * disk_unblock_events - unblock disk event checking + * @disk: disk to unblock events for + * + * Undo disk_block_events(). When the block count reaches zero, it + * starts events polling if configured. + * + * CONTEXT: + * Don't care. Safe to call from irq context. + */ +void disk_unblock_events(struct gendisk *disk) +{ + if (disk->ev) + __disk_unblock_events(disk, false); +} + +/** + * disk_flush_events - schedule immediate event checking and flushing + * @disk: disk to check and flush events for + * @mask: events to flush + * + * Schedule immediate event checking on @disk if not blocked. Events in + * @mask are scheduled to be cleared from the driver. Note that this + * doesn't clear the events from @disk->ev. + * + * CONTEXT: + * If @mask is non-zero must be called with bdev->bd_mutex held. + */ +void disk_flush_events(struct gendisk *disk, unsigned int mask) +{ + struct disk_events *ev = disk->ev; + + if (!ev) + return; + + spin_lock_irq(&ev->lock); + ev->clearing |= mask; + if (!ev->block) + mod_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, 0); + spin_unlock_irq(&ev->lock); +} + +/** + * disk_clear_events - synchronously check, clear and return pending events + * @disk: disk to fetch and clear events from + * @mask: mask of events to be fetched and clearted + * + * Disk events are synchronously checked and pending events in @mask + * are cleared and returned. This ignores the block count. + * + * CONTEXT: + * Might sleep. + */ +unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) +{ + const struct block_device_operations *bdops = disk->fops; + struct disk_events *ev = disk->ev; + unsigned int pending; + unsigned int clearing = mask; + + if (!ev) { + /* for drivers still using the old ->media_changed method */ + if ((mask & DISK_EVENT_MEDIA_CHANGE) && + bdops->media_changed && bdops->media_changed(disk)) + return DISK_EVENT_MEDIA_CHANGE; + return 0; + } + + disk_block_events(disk); + + /* + * store the union of mask and ev->clearing on the stack so that the + * race with disk_flush_events does not cause ambiguity (ev->clearing + * can still be modified even if events are blocked). + */ + spin_lock_irq(&ev->lock); + clearing |= ev->clearing; + ev->clearing = 0; + spin_unlock_irq(&ev->lock); + + disk_check_events(ev, &clearing); + /* + * if ev->clearing is not 0, the disk_flush_events got called in the + * middle of this function, so we want to run the workfn without delay. + */ + __disk_unblock_events(disk, ev->clearing ? true : false); + + /* then, fetch and clear pending events */ + spin_lock_irq(&ev->lock); + pending = ev->pending & mask; + ev->pending &= ~mask; + spin_unlock_irq(&ev->lock); + WARN_ON_ONCE(clearing & mask); + + return pending; +} + +/* + * Separate this part out so that a different pointer for clearing_ptr can be + * passed in for disk_clear_events. + */ +static void disk_events_workfn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct disk_events *ev = container_of(dwork, struct disk_events, dwork); + + disk_check_events(ev, &ev->clearing); +} + +static void disk_check_events(struct disk_events *ev, + unsigned int *clearing_ptr) +{ + struct gendisk *disk = ev->disk; + char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; + unsigned int clearing = *clearing_ptr; + unsigned int events; + unsigned long intv; + int nr_events = 0, i; + + /* check events */ + events = disk->fops->check_events(disk, clearing); + + /* accumulate pending events and schedule next poll if necessary */ + spin_lock_irq(&ev->lock); + + events &= ~ev->pending; + ev->pending |= events; + *clearing_ptr &= ~clearing; + + intv = disk_events_poll_jiffies(disk); + if (!ev->block && intv) + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, intv); + + spin_unlock_irq(&ev->lock); + + /* + * Tell userland about new events. Only the events listed in + * @disk->events are reported. Unlisted events are processed the + * same internally but never get reported to userland. + */ + for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) + if (events & disk->events & (1 << i)) + envp[nr_events++] = disk_uevents[i]; + + if (nr_events) + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); +} + +/* + * A disk events enabled device has the following sysfs nodes under + * its /sys/block/X/ directory. + * + * events : list of all supported events + * events_async : list of events which can be detected w/o polling + * events_poll_msecs : polling interval, 0: disable, -1: system default + */ +static ssize_t __disk_events_show(unsigned int events, char *buf) +{ + const char *delim = ""; + ssize_t pos = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) + if (events & (1 << i)) { + pos += sprintf(buf + pos, "%s%s", + delim, disk_events_strs[i]); + delim = " "; + } + if (pos) + pos += sprintf(buf + pos, "\n"); + return pos; +} + +static ssize_t disk_events_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return __disk_events_show(disk->events, buf); +} + +static ssize_t disk_events_async_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return __disk_events_show(disk->async_events, buf); +} + +static ssize_t disk_events_poll_msecs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%ld\n", disk->ev->poll_msecs); +} + +static ssize_t disk_events_poll_msecs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + long intv; + + if (!count || !sscanf(buf, "%ld", &intv)) + return -EINVAL; + + if (intv < 0 && intv != -1) + return -EINVAL; + + disk_block_events(disk); + disk->ev->poll_msecs = intv; + __disk_unblock_events(disk, true); + + return count; +} + +static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL); +static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL); +static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR, + disk_events_poll_msecs_show, + disk_events_poll_msecs_store); + +static const struct attribute *disk_events_attrs[] = { + &dev_attr_events.attr, + &dev_attr_events_async.attr, + &dev_attr_events_poll_msecs.attr, + NULL, +}; + +/* + * The default polling interval can be specified by the kernel + * parameter block.events_dfl_poll_msecs which defaults to 0 + * (disable). This can also be modified runtime by writing to + * /sys/module/block/events_dfl_poll_msecs. + */ +static int disk_events_set_dfl_poll_msecs(const char *val, + const struct kernel_param *kp) +{ + struct disk_events *ev; + int ret; + + ret = param_set_ulong(val, kp); + if (ret < 0) + return ret; + + mutex_lock(&disk_events_mutex); + + list_for_each_entry(ev, &disk_events, node) + disk_flush_events(ev->disk, 0); + + mutex_unlock(&disk_events_mutex); + + return 0; +} + +static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { + .set = disk_events_set_dfl_poll_msecs, + .get = param_get_ulong, +}; + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "block." + +module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, + &disk_events_dfl_poll_msecs, 0644); + +/* + * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. + */ +static void disk_alloc_events(struct gendisk *disk) +{ + struct disk_events *ev; + + if (!disk->fops->check_events) + return; + + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) { + pr_warn("%s: failed to initialize events\n", disk->disk_name); + return; + } + + INIT_LIST_HEAD(&ev->node); + ev->disk = disk; + spin_lock_init(&ev->lock); + mutex_init(&ev->block_mutex); + ev->block = 1; + ev->poll_msecs = -1; + INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); + + disk->ev = ev; +} + +static void disk_add_events(struct gendisk *disk) +{ + if (!disk->ev) + return; + + /* FIXME: error handling */ + if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0) + pr_warn("%s: failed to create sysfs files for events\n", + disk->disk_name); + + mutex_lock(&disk_events_mutex); + list_add_tail(&disk->ev->node, &disk_events); + mutex_unlock(&disk_events_mutex); + + /* + * Block count is initialized to 1 and the following initial + * unblock kicks it into action. + */ + __disk_unblock_events(disk, true); +} + +static void disk_del_events(struct gendisk *disk) +{ + if (!disk->ev) + return; + + disk_block_events(disk); + + mutex_lock(&disk_events_mutex); + list_del_init(&disk->ev->node); + mutex_unlock(&disk_events_mutex); + + sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); +} + +static void disk_release_events(struct gendisk *disk) +{ + /* the block count should be 1 from disk_del_events() */ + WARN_ON_ONCE(disk->ev && disk->ev->block != 1); + kfree(disk->ev); +} |
