diff options
50 files changed, 1206 insertions, 1038 deletions
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index d6da611f8f6..4ed7b5ceeed 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt @@ -89,6 +89,33 @@ Throttling/Upper Limit policy Limits for writes can be put using blkio.write_bps_device file. +Hierarchical Cgroups +==================== +- Currently none of the IO control policy supports hierarhical groups. But + cgroup interface does allow creation of hierarhical cgroups and internally + IO policies treat them as flat hierarchy. + + So this patch will allow creation of cgroup hierarhcy but at the backend + everything will be treated as flat. So if somebody created a hierarchy like + as follows. + + root + / \ + test1 test2 + | + test3 + + CFQ and throttling will practically treat all groups at same level. + + pivot + / | \ \ + root test1 test2 test3 + + Down the line we can implement hierarchical accounting/control support + and also introduce a new cgroup file "use_hierarchy" which will control + whether cgroup hierarchy is viewed as flat or hierarchical by the policy.. + This is how memory controller also has implemented the things. + Various user visible config options =================================== CONFIG_BLK_CGROUP diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b1febd0f6d2..455768a3eb9 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1452,10 +1452,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) goto done; } - /* Currently we do not support hierarchy deeper than two level (0,1) */ - if (parent != cgroup->top_cgroup) - return ERR_PTR(-EPERM); - blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); if (!blkcg) return ERR_PTR(-ENOMEM); diff --git a/block/blk-core.c b/block/blk-core.c index 4ce953f1b39..151070541e2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -33,7 +33,7 @@ #include "blk.h" -EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); @@ -1329,9 +1329,9 @@ static inline void blk_partition_remap(struct bio *bio) bio->bi_sector += p->start_sect; bio->bi_bdev = bdev->bd_contains; - trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, - bdev->bd_dev, - bio->bi_sector - p->start_sect); + trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio, + bdev->bd_dev, + bio->bi_sector - p->start_sect); } } @@ -1500,7 +1500,7 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; if (old_sector != -1) - trace_block_remap(q, bio, old_dev, old_sector); + trace_block_bio_remap(q, bio, old_dev, old_sector); old_sector = bio->bi_sector; old_dev = bio->bi_bdev->bd_dev; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4cd59b0d7c1..c19d015ac5a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -87,7 +87,6 @@ struct cfq_rb_root { unsigned count; unsigned total_weight; u64 min_vdisktime; - struct rb_node *active; }; #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ .count = 0, .min_vdisktime = 0, } @@ -180,7 +179,6 @@ struct cfq_group { /* group service_tree key */ u64 vdisktime; unsigned int weight; - bool on_st; /* number of cfqq currently on this group */ int nr_cfqq; @@ -563,11 +561,6 @@ static void update_min_vdisktime(struct cfq_rb_root *st) u64 vdisktime = st->min_vdisktime; struct cfq_group *cfqg; - if (st->active) { - cfqg = rb_entry_cfqg(st->active); - vdisktime = cfqg->vdisktime; - } - if (st->left) { cfqg = rb_entry_cfqg(st->left); vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); @@ -646,11 +639,11 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) static inline bool cfq_slice_used(struct cfq_queue *cfqq) { if (cfq_cfqq_slice_new(cfqq)) - return 0; + return false; if (time_before(jiffies, cfqq->slice_end)) - return 0; + return false; - return 1; + return true; } /* @@ -869,7 +862,7 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) struct rb_node *n; cfqg->nr_cfqq++; - if (cfqg->on_st) + if (!RB_EMPTY_NODE(&cfqg->rb_node)) return; /* @@ -885,7 +878,6 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) cfqg->vdisktime = st->min_vdisktime; __cfq_group_service_tree_add(st, cfqg); - cfqg->on_st = true; st->total_weight += cfqg->weight; } @@ -894,9 +886,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) { struct cfq_rb_root *st = &cfqd->grp_service_tree; - if (st->active == &cfqg->rb_node) - st->active = NULL; - BUG_ON(cfqg->nr_cfqq < 1); cfqg->nr_cfqq--; @@ -905,7 +894,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) return; cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); - cfqg->on_st = false; st->total_weight -= cfqg->weight; if (!RB_EMPTY_NODE(&cfqg->rb_node)) cfq_rb_erase(&cfqg->rb_node, st); @@ -1095,7 +1083,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg) if (!atomic_dec_and_test(&cfqg->ref)) return; for_each_cfqg_st(cfqg, i, j, st) - BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); + BUG_ON(!RB_EMPTY_ROOT(&st->rb)); kfree(cfqg); } @@ -1687,9 +1675,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq == cfqd->active_queue) cfqd->active_queue = NULL; - if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) - cfqd->grp_service_tree.active = NULL; - if (cfqd->active_cic) { put_io_context(cfqd->active_cic->ioc); cfqd->active_cic = NULL; @@ -1901,10 +1886,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * in their service tree. */ if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) - return 1; + return true; cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", service_tree->count); - return 0; + return false; } static void cfq_arm_slice_timer(struct cfq_data *cfqd) @@ -2116,12 +2101,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) unsigned count; struct cfq_rb_root *st; unsigned group_slice; - - if (!cfqg) { - cfqd->serving_prio = IDLE_WORKLOAD; - cfqd->workload_expires = jiffies + 1; - return; - } + enum wl_prio_t original_prio = cfqd->serving_prio; /* Choose next priority. RT > BE > IDLE */ if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) @@ -2134,6 +2114,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) return; } + if (original_prio != cfqd->serving_prio) + goto new_workload; + /* * For RT and BE, we have to choose also the type * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload @@ -2148,6 +2131,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) if (count && !time_after(jiffies, cfqd->workload_expires)) return; +new_workload: /* otherwise select new workload type */ cfqd->serving_type = cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); @@ -2199,7 +2183,6 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) if (RB_EMPTY_ROOT(&st->rb)) return NULL; cfqg = cfq_rb_first_group(st); - st->active = &cfqg->rb_node; update_min_vdisktime(st); return cfqg; } @@ -2293,6 +2276,17 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) goto keep_queue; } + /* + * This is a deep seek queue, but the device is much faster than + * the queue can deliver, don't idle + **/ + if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) && + (cfq_cfqq_slice_new(cfqq) || + (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) { + cfq_clear_cfqq_deep(cfqq); + cfq_clear_cfqq_idle_window(cfqq); + } + if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { cfqq = NULL; goto keep_queue; @@ -2367,12 +2361,12 @@ static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, { /* the queue hasn't finished any request, can't estimate */ if (cfq_cfqq_slice_new(cfqq)) - return 1; + return true; if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, cfqq->slice_end)) - return 1; + return true; - return 0; + return false; } static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) @@ -3265,6 +3259,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) return true; + /* An idle queue should not be idle now for some reason */ + if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq)) + return true; + if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) return false; diff --git a/block/genhd.c b/block/genhd.c index 5fa2b44a72f..74331738163 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -18,6 +18,7 @@ #include <linux/buffer_head.h> #include <linux/mutex.h> #include <linux/idr.h> +#include <linux/log2.h> #include "blk.h" @@ -35,6 +36,10 @@ static DEFINE_IDR(ext_devt_idr); static struct device_type disk_type; +static void disk_add_events(struct gendisk *disk); +static void disk_del_events(struct gendisk *disk); +static void disk_release_events(struct gendisk *disk); + /** * disk_get_part - get partition * @disk: disk to look partition from @@ -239,7 +244,7 @@ static struct blk_major_name { } *major_names[BLKDEV_MAJOR_HASH_SIZE]; /* index in the above - for now: assume no multimajor ranges */ -static inline int major_to_index(int major) +static inline int major_to_index(unsigned major) { return major % BLKDEV_MAJOR_HASH_SIZE; } @@ -502,6 +507,64 @@ static int exact_lock(dev_t devt, void *data) return 0; } +void register_disk(struct gendisk *disk) +{ + struct device *ddev = disk_to_dev(disk); + struct block_device *bdev; + struct disk_part_iter piter; + struct hd_struct *part; + int err; + + ddev->parent = disk->driverfs_dev; + + dev_set_name(ddev, disk->disk_name); + + /* delay uevents, until we scanned partition table */ + dev_set_uevent_suppress(ddev, 1); + + if (device_add(ddev)) + return; + if (!sysfs_deprecated) { + err = sysfs_create_link(block_depr, &ddev->kobj, + kobject_name(&ddev->kobj)); + if (err) { + device_del(ddev); + return; + } + } + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + + /* No minors to use for partitions */ + if (!disk_partitionable(disk)) + goto exit; + + /* No such device (e.g., media were just removed) */ + if (!get_capacity(disk)) + goto exit; + + bdev = bdget_disk(disk, 0); + if (!bdev) + goto exit; + + bdev->bd_invalidated = 1; + err = blkdev_get(bdev, FMODE_READ, NULL); + if (err < 0) + goto exit; + blkdev_put(bdev, FMODE_READ); + +exit: + /* announce disk after possible partitions are created */ + dev_set_uevent_suppress(ddev, 0); + kobject_uevent(&ddev->kobj, KOBJ_ADD); + + /* announce possible partitions */ + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + disk_part_iter_exit(&piter); +} + /** * add_disk - add partitioning information to kernel list * @disk: per-device partitioning information @@ -551,18 +614,48 @@ void add_disk(struct gendisk *disk) retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); -} + disk_add_events(disk); +} EXPORT_SYMBOL(add_disk); -EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ -void unlink_gendisk(struct gendisk *disk) +void del_gendisk(struct gendisk *disk) { + struct disk_part_iter piter; + struct hd_struct *part; + + disk_del_events(disk); + + /* invalidate stuff */ + disk_part_iter_init(&piter, disk, + DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); + while ((part = disk_part_iter_next(&piter))) { + invalidate_partition(disk, part->partno); + delete_partition(disk, part->partno); + } + disk_part_iter_exit(&piter); + + invalidate_partition(disk, 0); + blk_free_devt(disk_to_dev(disk)->devt); + set_capacity(disk, 0); + disk->flags &= ~GENHD_FL_UP; + sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); bdi_unregister(&disk->queue->backing_dev_info); blk_unregister_queue(disk); blk_unregister_region(disk_devt(disk), disk->minors); + + part_stat_set_all(&disk->part0, 0); + disk->part0.stamp = 0; + + kobject_put(disk->part0.holder_dir); + kobject_put(disk->slave_dir); + disk->driverfs_dev = NULL; + if (!sysfs_deprecated) + sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); + device_del(disk_to_dev(disk)); } +EXPORT_SYMBOL(del_gendisk); /** * get_gendisk - get partitioning information for a given device @@ -735,7 +828,7 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos) static void *p; p = disk_seqf_start(seqf, pos); - if (!IS_ERR(p) && p && !*pos) + if (!IS_ERR_OR_NULL(p) && !*pos) seq_puts(seqf, "major minor #blocks name\n\n"); return p; } @@ -1005,6 +1098,7 @@ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); + disk_release_events(disk); kfree(disk->random); disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); @@ -1110,29 +1204,6 @@ static int __init proc_genhd_init(void) module_init(proc_genhd_init); #endif /* CONFIG_PROC_FS */ -static void media_change_notify_thread(struct work_struct *work) -{ - struct gendisk *gd = container_of(work, struct gendisk, async_notify); - char event[] = "MEDIA_CHANGE=1"; - char *envp[] = { event, NULL }; - - /* - * set enviroment vars to indicate which event this is for - * so that user space will know to go check the media status. - */ - kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); - put_device(gd->driverfs_dev); -} - -#if 0 -void genhd_media_change_notify(struct gendisk *disk) -{ - get_device(disk->driverfs_dev); - schedule_work(&disk->async_notify); -} -EXPORT_SYMBOL_GPL(genhd_media_change_notify); -#endif /* 0 */ - dev_t blk_lookup_devt(const char *name, int partno) { dev_t devt = MKDEV(0, 0); @@ -1198,8 +1269,6 @@ struct gendisk *alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); - INIT_WORK(&disk->async_notify, - media_change_notify_thread); } return disk; } @@ -1291,3 +1360,422 @@ int invalidate_partition(struct gendisk *disk, int partno) } EXPORT_SYMBOL(invalidate_partition); + +/* + * Disk events - monitor disk events like media change and eject request. + */ +struct disk_events { + struct list_head node; /* all disk_event's */ + struct gendisk *disk; /* the associated disk */ + spinlock_t lock; + + int block; /* event blocking depth */ + unsigned int pending; /* events already sent out */ + unsigned int clearing; /* events being cleared */ + + long poll_msecs; /* interval, -1 for default */ + struct delayed_work dwork; +}; + +static const char *disk_events_strs[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", +}; + +static char *disk_uevents[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", +}; + +/* list of all disk_events */ +static DEFINE_MUTEX(disk_events_mutex); +static LIST_HEAD(disk_events); + +/* disable in-kernel polling by default */ +static unsigned long disk_events_dfl_poll_msecs = 0; + +static unsigned long disk_events_poll_jiffies(struct gendisk *disk) +{ + struct disk_events *ev = disk->ev; + long intv_msecs = 0; + + /* + * If device-specific poll interval is set, always use it. If + * the default is being used, poll iff there are events which + * can't be monitored asynchronously. + */ + if (ev->poll_msecs >= 0) + intv_msecs = ev->poll_msecs; + else if (disk->events & ~disk->async_events) + intv_msecs = disk_events_dfl_poll_msecs; + + return msecs_to_jiffies(intv_msecs); +} + +static void __disk_block_events(struct gendisk *disk, bool sync) +{ + struct disk_events *ev = disk->ev; + unsigned long flags; + bool cancel; + + spin_lock_irqsave(&ev->lock, flags); + cancel = !ev->block++; + spin_unlock_irqrestore(&ev->lock, flags); + + if (cancel) { + if (sync) + cancel_delayed_work_sync(&disk->ev->dwork); + else + cancel_delayed_work(&disk->ev->dwork); + } +} + +static void __disk_unblock_events(struct gendisk *disk, bool check_now) +{ + struct disk_events *ev = disk->ev; + unsigned long intv; + unsigned long flags; + + spin_lock_irqsave(&ev->lock, flags); + + if (WARN_ON_ONCE(ev->block <= 0)) + goto out_unlock; + + if (--ev->block) + goto out_unlock; + + /* + * Not exactly a latency critical operation, set poll timer + * slack to 25% and kick event check. + */ + intv = disk_events_poll_jiffies(disk); + set_timer_slack(&ev->dwork.timer, intv / 4); + if (check_now) + queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + else if (intv) + queue_delayed_work(system_nrt_wq, &ev->dwork, intv); +out_unlock: + spin_unlock_irqrestore(&ev->lock, flags); +} + +/** + * disk_block_events - block and flush disk event checking + * @disk: disk to block events for + * + * On return from this function, it is guaranteed that event checking + * isn't in progress and won't happen until unblocked by + * disk_unblock_events(). Events blocking is counted and the actual + * unblocking happens after the matching number of unblocks are done. + * + * Note that this intentionally does not block event checking from + * disk_clear_events(). + * + * CONTEXT: + * Might sleep. + */ +void disk_block_events(struct gendisk *disk) +{ + if (disk->ev) + __disk_block_events(disk, true); +} + +/** + * disk_unblock_events - unblock disk event checking + * @disk: disk to unblock events for + * + * Undo disk_block_events(). When the block count reaches zero, it + * starts events polling if configured. + * + * CONTEXT: + * Don't care. Safe to call from irq context. + */ +void disk_unblock_events(struct gendisk *disk) +{ + if (disk->ev) + __disk_unblock_events(disk, true); +} + +/** + * disk_check_events - schedule immediate event checking + * @disk: disk to check events for + * + * Schedule immediate event checking on @disk if not blocked. + * + * CONTEXT: + * Don't care. Safe to call from irq context. + */ +void disk_check_events(struct gendisk *disk) +{ + if (disk->ev) { + __disk_block_events(disk, false); + __disk_unblock_events(disk, true); + } +} +EXPORT_SYMBOL_GPL(disk_check_events); + +/** + * disk_clear_events - synchronously check, clear and return pending events + * @disk: disk to fetch and clear events from + * @mask: mask of events to be fetched and clearted + * + * Disk events are synchronously checked and pending events in @mask + * are cleared and returned. This ignores the block count. + * + * CONTEXT: + * Might sleep. + */ +unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) +{ + const struct block_device_operations *bdops = disk->fops; + struct disk_events *ev = disk->ev; + unsigned int pending; + + if (!ev) { + /* for drivers still using the old ->media_changed method */ + if ((mask & DISK_EVENT_MEDIA_CHANGE) && + bdops->media_changed && bdops->media_changed(disk)) + return DISK_EVENT_MEDIA_CHANGE; + return 0; + } + + /* tell the workfn about the events being cleared */ + spin_lock_irq(&ev->lock); + ev->clearing |= mask; + spin_unlock_irq(&ev->lock); + + /* uncondtionally schedule event check and wait for it to finish */ + __disk_block_events(disk, true); + queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + flush_delayed_work(&ev->dwork); + __disk_unblock_events(disk, false); + + /* then, fetch and clear pending events */ + spin_lock_irq(&ev->lock); + WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */ + pending = ev->pending & mask; + ev->pending &= ~mask; + spin_unlock_irq(&ev->lock); + + return pending; +} + +static void disk_events_workfn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct disk_events *ev = container_of(dwork, struct disk_events, dwork); + struct gendisk *disk = ev->disk; + char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; + unsigned int clearing = ev->clearing; + unsigned int events; + unsigned long intv; + int nr_events = 0, i; + + /* check events */ + events = disk->fops->check_events(disk, clearing); + + /* accumulate pending events and schedule next poll if necessary */ + spin_lock_irq(&ev->lock); + + events &= ~ev->pending; + ev->pending |= events; + ev->clearing &= ~clearing; + + intv = disk_events_poll_jiffies(disk); + if (!ev->block && intv) + queue_delayed_work(system_nrt_wq, &ev->dwork, intv); + + spin_unlock_irq(&ev->lock); + + /* tell userland about new events */ + for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) + if (events & (1 << i)) + envp[nr_events++] = disk_uevents[i]; + + if (nr_events) + kobject_uevent_env(&disk_to_dev(disk)->kobj |