-rw-r--r-- | block/Kconfig.iosched      |    4
-rw-r--r-- | block/blk-cgroup.c         | 1506
-rw-r--r-- | block/blk-cgroup.h         |  324
-rw-r--r-- | block/blk-core.c           |  180
-rw-r--r-- | block/blk-ioc.c            |  126
-rw-r--r-- | block/blk-sysfs.c          |    6
-rw-r--r-- | block/blk-throttle.c       |  450
-rw-r--r-- | block/blk.h                |   32
-rw-r--r-- | block/cfq-iosched.c        |  608
-rw-r--r-- | block/cfq.h                |  113
-rw-r--r-- | block/deadline-iosched.c   |    8
-rw-r--r-- | block/elevator.c           |  123
-rw-r--r-- | block/noop-iosched.c       |    8
-rw-r--r-- | fs/bio.c                   |   61
-rw-r--r-- | fs/ioprio.c                |    2
-rw-r--r-- | include/linux/bio.h        |    8
-rw-r--r-- | include/linux/blk_types.h  |   10
-rw-r--r-- | include/linux/blkdev.h     |   12
-rw-r--r-- | include/linux/elevator.h   |    8
-rw-r--r-- | include/linux/iocontext.h  |   39
-rw-r--r-- | include/linux/ioprio.h     |   22
-rw-r--r-- | init/Kconfig               |    2
-rw-r--r-- | kernel/fork.c              |    5
23 files changed, 1745 insertions, 1912 deletions
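The blk-cgroup.c hunks below replace the per-group blkg->stats_lock with the owning request_queue's queue_lock plus u64_stats_sync, and move policy-specific state into per-policy data reached through blkg->pd[plid]. A minimal sketch of the update pattern that results, pieced together from the blkiocg_update_timeslice_used() hunk; example_update_time() is a hypothetical name used only for illustration, everything else follows the identifiers in the patch:

/* Sketch: per-policy stat update under queue_lock, mirroring the pattern
 * this patch introduces.  blkg->pd[pol->plid] carries the policy-specific
 * stats; u64_stats_sync makes the 64-bit counter update safe on 32-bit.
 */
static void example_update_time(struct blkio_group *blkg,
                                struct blkio_policy_type *pol,
                                unsigned long time)
{
        struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

        /* callers hold the queue_lock now, not the old blkg->stats_lock */
        lockdep_assert_held(blkg->q->queue_lock);

        u64_stats_update_begin(&stats->syncp);
        stats->time += time;
        u64_stats_update_end(&stats->syncp);
}

The same shape recurs in the other blkiocg_update_*() helpers in the hunks that follow; the per-cpu dispatch stats keep their own syncp and are now allocated asynchronously from a worker (blkio_stat_alloc_fn via the alloc_list) rather than at group creation time.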
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 3199b76f795..421bef9c4c4 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -23,8 +23,6 @@ config IOSCHED_DEADLINE config IOSCHED_CFQ tristate "CFQ I/O scheduler" - # If BLK_CGROUP is a module, CFQ has to be built as module. - depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally @@ -34,8 +32,6 @@ config IOSCHED_CFQ This is the default I/O scheduler. - Note: If BLK_CGROUP=m, then CFQ can be built only as module. - config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" depends on IOSCHED_CFQ && BLK_CGROUP diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 126c341955d..4fdeb46b443 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -17,71 +17,38 @@ #include <linux/err.h> #include <linux/blkdev.h> #include <linux/slab.h> -#include "blk-cgroup.h" #include <linux/genhd.h> +#include <linux/delay.h> +#include <linux/atomic.h> +#include "blk-cgroup.h" +#include "blk.h" #define MAX_KEY_LEN 100 static DEFINE_SPINLOCK(blkio_list_lock); static LIST_HEAD(blkio_list); +static DEFINE_MUTEX(all_q_mutex); +static LIST_HEAD(all_q_list); + +/* List of groups pending per cpu stats allocation */ +static DEFINE_SPINLOCK(alloc_list_lock); +static LIST_HEAD(alloc_list); + +static void blkio_stat_alloc_fn(struct work_struct *); +static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn); + struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; EXPORT_SYMBOL_GPL(blkio_root_cgroup); +static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES]; + /* for encoding cft->private value on file */ #define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) /* What policy owns the file, proportional or throttle */ #define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) #define BLKIOFILE_ATTR(val) ((val) & 0xffff) -static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, - struct blkio_policy_node *pn) -{ - list_add(&pn->node, &blkcg->policy_list); -} - -static inline bool cftype_blkg_same_policy(struct cftype *cft, - struct blkio_group *blkg) -{ - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - - if (blkg->plid == plid) - return 1; - - return 0; -} - -/* Determines if policy node matches cgroup file being accessed */ -static inline bool pn_matches_cftype(struct cftype *cft, - struct blkio_policy_node *pn) -{ - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - int fileid = BLKIOFILE_ATTR(cft->private); - - return (plid == pn->plid && fileid == pn->fileid); -} - -/* Must be called with blkcg->lock held */ -static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) -{ - list_del(&pn->node); -} - -/* Must be called with blkcg->lock held */ -static struct blkio_policy_node * -blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, - enum blkio_policy_id plid, int fileid) -{ - struct blkio_policy_node *pn; - - list_for_each_entry(pn, &blkcg->policy_list, node) { - if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) - return pn; - } - - return NULL; -} - struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), @@ -89,77 +56,85 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); -struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) +static struct blkio_cgroup 
*task_blkio_cgroup(struct task_struct *tsk) { return container_of(task_subsys_state(tsk, blkio_subsys_id), struct blkio_cgroup, css); } -EXPORT_SYMBOL_GPL(task_blkio_cgroup); -static inline void -blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) +struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio) +{ + if (bio && bio->bi_css) + return container_of(bio->bi_css, struct blkio_cgroup, css); + return task_blkio_cgroup(current); +} +EXPORT_SYMBOL_GPL(bio_blkio_cgroup); + +static inline void blkio_update_group_weight(struct blkio_group *blkg, + int plid, unsigned int weight) { struct blkio_policy_type *blkiop; list_for_each_entry(blkiop, &blkio_list, list) { /* If this policy does not own the blkg, do not send updates */ - if (blkiop->plid != blkg->plid) + if (blkiop->plid != plid) continue; if (blkiop->ops.blkio_update_group_weight_fn) - blkiop->ops.blkio_update_group_weight_fn(blkg->key, + blkiop->ops.blkio_update_group_weight_fn(blkg->q, blkg, weight); } } -static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, - int fileid) +static inline void blkio_update_group_bps(struct blkio_group *blkg, int plid, + u64 bps, int fileid) { struct blkio_policy_type *blkiop; list_for_each_entry(blkiop, &blkio_list, list) { /* If this policy does not own the blkg, do not send updates */ - if (blkiop->plid != blkg->plid) + if (blkiop->plid != plid) continue; if (fileid == BLKIO_THROTL_read_bps_device && blkiop->ops.blkio_update_group_read_bps_fn) - blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, + blkiop->ops.blkio_update_group_read_bps_fn(blkg->q, blkg, bps); if (fileid == BLKIO_THROTL_write_bps_device && blkiop->ops.blkio_update_group_write_bps_fn) - blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, + blkiop->ops.blkio_update_group_write_bps_fn(blkg->q, blkg, bps); } } static inline void blkio_update_group_iops(struct blkio_group *blkg, - unsigned int iops, int fileid) + int plid, unsigned int iops, + int fileid) { struct blkio_policy_type *blkiop; list_for_each_entry(blkiop, &blkio_list, list) { /* If this policy does not own the blkg, do not send updates */ - if (blkiop->plid != blkg->plid) + if (blkiop->plid != plid) continue; if (fileid == BLKIO_THROTL_read_iops_device && blkiop->ops.blkio_update_group_read_iops_fn) - blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, + blkiop->ops.blkio_update_group_read_iops_fn(blkg->q, blkg, iops); if (fileid == BLKIO_THROTL_write_iops_device && blkiop->ops.blkio_update_group_write_iops_fn) - blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, + blkiop->ops.blkio_update_group_write_iops_fn(blkg->q, blkg,iops); } } /* * Add to the appropriate stat variable depending on the request type. - * This should be called with the blkg->stats_lock held. + * This should be called with queue_lock held. */ static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, bool sync) @@ -177,7 +152,7 @@ static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, /* * Decrements the appropriate stat variable if non-zero depending on the * request type. Panics on value being zero. - * This should be called with the blkg->stats_lock held. + * This should be called with the queue_lock held. */ static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) { @@ -198,19 +173,22 @@ static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) } #ifdef CONFIG_DEBUG_BLK_CGROUP -/* This should be called with the blkg->stats_lock held. 
*/ +/* This should be called with the queue_lock held. */ static void blkio_set_start_group_wait_time(struct blkio_group *blkg, - struct blkio_group *curr_blkg) + struct blkio_policy_type *pol, + struct blkio_group *curr_blkg) { - if (blkio_blkg_waiting(&blkg->stats)) + struct blkg_policy_data *pd = blkg->pd[pol->plid]; + + if (blkio_blkg_waiting(&pd->stats)) return; if (blkg == curr_blkg) return; - blkg->stats.start_group_wait_time = sched_clock(); - blkio_mark_blkg_waiting(&blkg->stats); + pd->stats.start_group_wait_time = sched_clock(); + blkio_mark_blkg_waiting(&pd->stats); } -/* This should be called with the blkg->stats_lock held. */ +/* This should be called with the queue_lock held. */ static void blkio_update_group_wait_time(struct blkio_group_stats *stats) { unsigned long long now; @@ -224,7 +202,7 @@ static void blkio_update_group_wait_time(struct blkio_group_stats *stats) blkio_clear_blkg_waiting(stats); } -/* This should be called with the blkg->stats_lock held. */ +/* This should be called with the queue_lock held. */ static void blkio_end_empty_time(struct blkio_group_stats *stats) { unsigned long long now; @@ -238,132 +216,146 @@ static void blkio_end_empty_time(struct blkio_group_stats *stats) blkio_clear_blkg_empty(stats); } -void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol) { - unsigned long flags; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; + + lockdep_assert_held(blkg->q->queue_lock); + BUG_ON(blkio_blkg_idling(stats)); - spin_lock_irqsave(&blkg->stats_lock, flags); - BUG_ON(blkio_blkg_idling(&blkg->stats)); - blkg->stats.start_idle_time = sched_clock(); - blkio_mark_blkg_idling(&blkg->stats); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + stats->start_idle_time = sched_clock(); + blkio_mark_blkg_idling(stats); } EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); -void blkiocg_update_idle_time_stats(struct blkio_group *blkg) +void blkiocg_update_idle_time_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol) { - unsigned long flags; - unsigned long long now; - struct blkio_group_stats *stats; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; + + lockdep_assert_held(blkg->q->queue_lock); - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; if (blkio_blkg_idling(stats)) { - now = sched_clock(); - if (time_after64(now, stats->start_idle_time)) + unsigned long long now = sched_clock(); + + if (time_after64(now, stats->start_idle_time)) { + u64_stats_update_begin(&stats->syncp); stats->idle_time += now - stats->start_idle_time; + u64_stats_update_end(&stats->syncp); + } blkio_clear_blkg_idling(stats); } - spin_unlock_irqrestore(&blkg->stats_lock, flags); } EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); -void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) +void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol) { - unsigned long flags; - struct blkio_group_stats *stats; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; + + lockdep_assert_held(blkg->q->queue_lock); - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; + u64_stats_update_begin(&stats->syncp); stats->avg_queue_size_sum += stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; stats->avg_queue_size_samples++; blkio_update_group_wait_time(stats); - 
spin_unlock_irqrestore(&blkg->stats_lock, flags); + u64_stats_update_end(&stats->syncp); } EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); -void blkiocg_set_start_empty_time(struct blkio_group *blkg) +void blkiocg_set_start_empty_time(struct blkio_group *blkg, + struct blkio_policy_type *pol) { - unsigned long flags; - struct blkio_group_stats *stats; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; + lockdep_assert_held(blkg->q->queue_lock); if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || - stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { - spin_unlock_irqrestore(&blkg->stats_lock, flags); + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) return; - } /* * group is already marked empty. This can happen if cfqq got new * request in parent group and moved to this group while being added * to service tree. Just ignore the event and move on. */ - if(blkio_blkg_empty(stats)) { - spin_unlock_irqrestore(&blkg->stats_lock, flags); + if (blkio_blkg_empty(stats)) return; - } stats->start_empty_time = sched_clock(); blkio_mark_blkg_empty(stats); - spin_unlock_irqrestore(&blkg->stats_lock, flags); } EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); void blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) + struct blkio_policy_type *pol, + unsigned long dequeue) { - blkg->stats.dequeue += dequeue; + struct blkg_policy_data *pd = blkg->pd[pol->plid]; + + lockdep_assert_held(blkg->q->queue_lock); + + pd->stats.dequeue += dequeue; } EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); #else static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, - struct blkio_group *curr_blkg) {} -static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} + struct blkio_policy_type *pol, + struct blkio_group *curr_blkg) { } +static inline void blkio_end_empty_time(struct blkio_group_stats *stats) { } #endif void blkiocg_update_io_add_stats(struct blkio_group *blkg, - struct blkio_group *curr_blkg, bool direction, - bool sync) + struct blkio_policy_type *pol, + struct blkio_group *curr_blkg, bool direction, + bool sync) { - unsigned long flags; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; + + lockdep_assert_held(blkg->q->queue_lock); - spin_lock_irqsave(&blkg->stats_lock, flags); - blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, - sync); - blkio_end_empty_time(&blkg->stats); - blkio_set_start_group_wait_time(blkg, curr_blkg); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + u64_stats_update_begin(&stats->syncp); + blkio_add_stat(stats->stat_arr[BLKIO_STAT_QUEUED], 1, direction, sync); + blkio_end_empty_time(stats); + u64_stats_update_end(&stats->syncp); + + blkio_set_start_group_wait_time(blkg, pol, curr_blkg); } EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); void blkiocg_update_io_remove_stats(struct blkio_group *blkg, - bool direction, bool sync) + struct blkio_policy_type *pol, + bool direction, bool sync) { - unsigned long flags; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; - spin_lock_irqsave(&blkg->stats_lock, flags); - blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], - direction, sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + lockdep_assert_held(blkg->q->queue_lock); + + u64_stats_update_begin(&stats->syncp); + blkio_check_and_dec_stat(stats->stat_arr[BLKIO_STAT_QUEUED], direction, + sync); + u64_stats_update_end(&stats->syncp); } 
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); -void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, - unsigned long unaccounted_time) +void blkiocg_update_timeslice_used(struct blkio_group *blkg, + struct blkio_policy_type *pol, + unsigned long time, + unsigned long unaccounted_time) { - unsigned long flags; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; - spin_lock_irqsave(&blkg->stats_lock, flags); - blkg->stats.time += time; + lockdep_assert_held(blkg->q->queue_lock); + + u64_stats_update_begin(&stats->syncp); + stats->time += time; #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg->stats.unaccounted_time += unaccounted_time; + stats->unaccounted_time += unaccounted_time; #endif - spin_unlock_irqrestore(&blkg->stats_lock, flags); + u64_stats_update_end(&stats->syncp); } EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); @@ -372,11 +364,17 @@ EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); * is valid. */ void blkiocg_update_dispatch_stats(struct blkio_group *blkg, - uint64_t bytes, bool direction, bool sync) + struct blkio_policy_type *pol, + uint64_t bytes, bool direction, bool sync) { + struct blkg_policy_data *pd = blkg->pd[pol->plid]; struct blkio_group_stats_cpu *stats_cpu; unsigned long flags; + /* If per cpu stats are not allocated yet, don't do any accounting. */ + if (pd->stats_cpu == NULL) + return; + /* * Disabling interrupts to provide mutual exclusion between two * writes on same cpu. It probably is not needed for 64bit. Not @@ -384,7 +382,7 @@ void blkiocg_update_dispatch_stats(struct blkio_group *blkg, */ local_irq_save(flags); - stats_cpu = this_cpu_ptr(blkg->stats_cpu); + stats_cpu = this_cpu_ptr(pd->stats_cpu); u64_stats_update_begin(&stats_cpu->syncp); stats_cpu->sectors += bytes >> 9; @@ -398,213 +396,419 @@ void blkiocg_update_dispatch_stats(struct blkio_group *blkg, EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); void blkiocg_update_completion_stats(struct blkio_group *blkg, - uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) + struct blkio_policy_type *pol, + uint64_t start_time, + uint64_t io_start_time, bool direction, + bool sync) { - struct blkio_group_stats *stats; - unsigned long flags; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; unsigned long long now = sched_clock(); - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; + lockdep_assert_held(blkg->q->queue_lock); + + u64_stats_update_begin(&stats->syncp); if (time_after64(now, io_start_time)) blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], now - io_start_time, direction, sync); if (time_after64(io_start_time, start_time)) blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], io_start_time - start_time, direction, sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + u64_stats_update_end(&stats->syncp); } EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); /* Merged stats are per cpu. */ -void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, - bool sync) +void blkiocg_update_io_merged_stats(struct blkio_group *blkg, + struct blkio_policy_type *pol, + bool direction, bool sync) { - struct blkio_group_stats_cpu *stats_cpu; - unsigned long flags; + struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats; - /* - * Disabling interrupts to provide mutual exclusion between two - * writes on same cpu. It probably is not needed for 64bit. Not - * optimizing that case yet. 
- */ - local_irq_save(flags); + lockdep_assert_held(blkg->q->queue_lock); - stats_cpu = this_cpu_ptr(blkg->stats_cpu); - - u64_stats_update_begin(&stats_cpu->syncp); - blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1, - direction, sync); - u64_stats_update_end(&stats_cpu->syncp); - local_irq_restore(flags); + u64_stats_update_begin(&stats->syncp); + blkio_add_stat(stats->stat_arr[BLKIO_STAT_MERGED], 1, direction, sync); + u64_stats_update_end(&stats->syncp); } EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); /* - * This function allocates the per cpu stats for blkio_group. Should be called - * from sleepable context as alloc_per_cpu() requires that. + * Worker for allocating per cpu stat for blk groups. This is scheduled on + * the system_nrt_wq once there are some groups on the alloc_list waiting + * for allocation. */ -int blkio_alloc_blkg_stats(struct blkio_group *blkg) +static void blkio_stat_alloc_fn(struct work_struct *work) { - /* Allocate memory for per cpu stats */ - blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); - if (!blkg->stats_cpu) - return -ENOMEM; - return 0; -} -EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats); + static void *pcpu_stats[BLKIO_NR_POLICIES]; + struct delayed_work *dwork = to_delayed_work(work); + struct blkio_group *blkg; + int i; + bool empty = false; -void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev, - enum blkio_policy_id plid) -{ - unsigned long flags; +alloc_stats: + for (i = 0; i < BLKIO_NR_POLICIES; i++) { + if (pcpu_stats[i] != NULL) + continue; - spin_lock_irqsave(&blkcg->lock, flags); - spin_lock_init(&blkg->stats_lock); - rcu_assign_pointer(blkg->key, key); - blkg->blkcg_id = css_id(&blkcg->css); - hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); - blkg->plid = plid; - spin_unlock_irqrestore(&blkcg->lock, flags); - /* Need to take css reference ? */ - cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); - blkg->dev = dev; + pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu); + + /* Allocation failed. Try again after some time. */ + if (pcpu_stats[i] == NULL) { + queue_delayed_work(system_nrt_wq, dwork, + msecs_to_jiffies(10)); + return; + } + } + + spin_lock_irq(&blkio_list_lock); + spin_lock(&alloc_list_lock); + + /* cgroup got deleted or queue exited. */ + if (!list_empty(&alloc_list)) { + blkg = list_first_entry(&alloc_list, struct blkio_group, + alloc_node); + for (i = 0; i < BLKIO_NR_POLICIES; i++) { + struct blkg_policy_data *pd = blkg->pd[i]; + + if (blkio_policy[i] && pd && !pd->stats_cpu) + swap(pd->stats_cpu, pcpu_stats[i]); + } + + list_del_init(&blkg->alloc_node); + } + + empty = list_empty(&alloc_list); + + spin_unlock(&alloc_list_lock); + spin_unlock_irq(&blkio_list_lock); + + if (!empty) + goto alloc_stats; } -EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); -static void __blkiocg_del_blkio_group(struct blkio_group *blkg) +/** + * blkg_free - free a blkg + * @blkg: blkg to free + * + * Free @blkg which may be partially allocated. + */ +static void blkg_free(struct blkio_group *blkg) { - hlist_del_init_rcu(&blkg->blkcg_node); - blkg->blkcg_id = 0; + int i; + + if (!blkg) + return; + + for (i = 0; i < BLKIO_NR_POLICIES; i++) { + struct blkg_policy_data *pd = blkg->pd[i]; + + if (pd) { + free_percpu(pd->stats_cpu); + kfree(pd); + } + } + + kfree(blkg); } -/* - * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 - * indicating that blk_group was unhashed by the time we got to it. 
+/** + * blkg_alloc - allocate a blkg + * @blkcg: block cgroup the new blkg is associated with + * @q: request_queue the new blkg is associated with + * + * Allocate a new blkg assocating @blkcg and @q. */ -int blkiocg_del_blkio_group(struct blkio_group *blkg) +static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg, + struct request_queue *q) { - struct blkio_cgroup *blkcg; - unsigned long flags; - struct cgroup_subsys_state *css; - int ret = 1; + struct blkio_group *blkg; + int i; - rcu_read_lock(); - css = css_lookup(&blkio_subsys, blkg->blkcg_id); - if (css) { - blkcg = container_of(css, struct blkio_cgroup, css); - spin_lock_irqsave(&blkcg->lock, flags); - if (!hlist_unhashed(&blkg->blkcg_node)) { - __blkiocg_del_blkio_group(blkg); - ret = 0; + /* alloc and init base part */ + blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node); + if (!blkg) + return NULL; + + blkg->q = q; + INIT_LIST_HEAD(&blkg->q_node); + INIT_LIST_HEAD(&blkg->alloc_node); + blkg->blkcg = blkcg; + blkg->refcnt = 1; + cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); + + for (i = 0; i < BLKIO_NR_POLICIES; i++) { + struct blkio_policy_type *pol = blkio_policy[i]; + struct blkg_policy_data *pd; + + if (!pol) + continue; + + /* alloc per-policy data and attach it to blkg */ + pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC, + q->node); + if (!pd) { + blkg_free(blkg); + return NULL; } - spin_unlock_irqrestore(&blkcg->lock, flags); + + blkg->pd[i] = pd; + pd->blkg = blkg; } - rcu_read_unlock(); - return ret; + /* invoke per-policy init */ + for (i = 0; i < BLKIO_NR_POLICIES; i++) { + struct blkio_policy_type *pol = blkio_policy[i]; + + if (pol) + pol->ops.blkio_init_group_fn(blkg); + } + + return blkg; +} + +struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg, + struct request_queue *q, + enum blkio_policy_id plid, + bool for_root) + __releases(q->queue_lock) __acquires(q->queue_lock) +{ + struct blkio_group *blkg; + + WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert_held(q->queue_lock); + + /* + * This could be the first entry point of blkcg implementation and + * we shouldn't allow anything to go through for a bypassing queue. + * The following can be removed if blkg lookup is guaranteed to + * fail on a bypassing queue. + */ + if (unlikely(blk_queue_bypass(q)) && !for_root) + return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY); + + blkg = blkg_lookup(blkcg, q); + if (blkg) + return blkg; + + /* blkg holds a reference to blkcg */ + if (!css_tryget(&blkcg->css)) + return ERR_PTR(-EINVAL); + + /* + * Allocate and initialize. + */ + blkg = blkg_alloc(blkcg, q); + + /* did alloc fail? */ + if (unlikely(!blkg)) { + blkg = ERR_PTR(-ENOMEM); + goto out; + } + + /* insert */ + spin_lock(&blkcg->lock); + hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + list_add(&blkg->q_node, &q->blkg_list); + spin_unlock(&blkcg->lock); + + spin_lock(&alloc_list_lock); + list_add(&blkg->alloc_node, &alloc_list); + /* Queue per cpu stat allocation from worker thread. */ + queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0); + spin_unlock(&alloc_list_lock); +out: + return blkg; } -EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group); +EXPORT_SYMBOL_GPL(blkg_lookup_create); /* called under rcu_read_lock(). 
*/ -struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) +struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg, + struct request_queue *q) { struct blkio_group *blkg; struct hlist_node *n; - void *__key; - hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { - __key = blkg->key; - if (__key == key) + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) + if (blkg->q == q) return blkg; + return NULL; +} +EXPORT_SYMBOL_GPL(blkg_lookup); + +static void blkg_destroy(struct blkio_group *blkg) +{ + struct request_queue *q = blkg->q; + struct blkio_cgroup *blkcg = blkg->blkcg; + + lockdep_assert_held(q->queue_lock); + lockdep_assert_held(&blkcg->lock); + + /* Something wrong if we are trying to remove same group twice */ + WARN_ON_ONCE(list_empty(&blkg->q_node)); + WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); + list_del_init(&blkg->q_node); + hlist_del_init_rcu(&blkg->blkcg_node); + + spin_lock(&alloc_list_lock); + list_del_init(&blkg->alloc_node); + spin_unlock(&alloc_list_lock); + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + blkg_put(blkg); +} + +/* + * XXX: This updates blkg policy data in-place for root blkg, which is + * necessary across elevator switch and policy registration as root blkgs + * aren't shot down. This broken and racy implementation is temporary. + * Eventually, blkg shoot down will be replaced by proper in-place update. + */ +void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid) +{ + struct blkio_policy_type *pol = blkio_policy[plid]; + struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q); + struct blkg_policy_data *pd; + + if (!blkg) + return; + + kfree(blkg->pd[plid]); + blkg->pd[plid] = NULL; + + if (!pol) + return; + + pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL); + WARN_ON_ONCE(!pd); + + pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); + WARN_ON_ONCE(!pd->stats_cpu); + + blkg->pd[plid] = pd; + pd->blkg = blkg; + pol->ops.blkio_init_group_fn(blkg); +} +EXPORT_SYMBOL_GPL(update_root_blkg_pd); + +/** + * blkg_destroy_all - destroy all blkgs associated with a request_queue + * @q: request_queue of interest + * @destroy_root: whether to destroy root blkg or not + * + * Destroy blkgs associated with @q. If @destroy_root is %true, all are + * destroyed; otherwise, root blkg is left alone. + */ +void blkg_destroy_all(struct request_queue *q, bool destroy_root) +{ + struct blkio_group *blkg, *n; + + spin_lock_irq(q->queue_lock); + + list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { + struct blkio_cgroup *blkcg = blkg->blkcg; + + /* skip root? */ + if (!destroy_root && blkg->blkcg == &blkio_root_cgroup) + continue; + + spin_lock(&blkcg->lock); + blkg_destroy(blkg); + spin_unlock(&blkcg->lock); } - return NULL; + spin_unlock_irq(q->queue_lock); } -EXPORT_SYMBOL_GPL(blkiocg_lookup_group); +EXPORT_SYMBOL_GPL(blkg_destroy_all); -static void blkio_reset_stats_cpu(struct blkio_group *blkg) +static void blkg_rcu_free(struct rcu_head *rcu_head) { - struct blkio_group_stats_cpu *stats_cpu; - int i, j, k; + blkg_free(container_of(rcu_head, struct blkio_group, rcu_head)); +} + +void __blkg_release(struct blkio_group *blkg) +{ + /* release the extra blkcg reference this blkg has been holding */ + css_put(&blkg->blkcg->css); + /* - * Note: On 64 bit arch this should not be an issue. 
This has the - * possibility of returning some inconsistent value on 32bit arch - * as 64bit update on 32bit is non atomic. Taking c |