diff options
35 files changed, 3031 insertions, 3291 deletions
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 03ca210406e..1fafa401a1c 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -534,6 +534,15 @@ Who: Kees Cook <keescook@chromium.org> ---------------------------- +What: cgroup option updates via remount +When: March 2013 +Why: Remount currently allows changing bound subsystems and + release_agent. Rebinding is hardly useful as it only works + when the hierarchy is empty and release_agent itself should be + replaced with conventional fsnotify. + +---------------------------- + What: setitimer accepts user NULL pointer (value) When: 3.6 Why: setitimer is not returning -EFAULT if user pointer is NULL. This diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 3199b76f795..421bef9c4c4 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -23,8 +23,6 @@ config IOSCHED_DEADLINE config IOSCHED_CFQ tristate "CFQ I/O scheduler" - # If BLK_CGROUP is a module, CFQ has to be built as module. - depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally @@ -34,8 +32,6 @@ config IOSCHED_CFQ This is the default I/O scheduler. - Note: If BLK_CGROUP=m, then CFQ can be built only as module. - config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" depends on IOSCHED_CFQ && BLK_CGROUP diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index ea84a23d5e6..02cf6335e9b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -11,1679 +11,906 @@ * Nauman Rafique <nauman@google.com> */ #include <linux/ioprio.h> -#include <linux/seq_file.h> #include <linux/kdev_t.h> #include <linux/module.h> #include <linux/err.h> #include <linux/blkdev.h> #include <linux/slab.h> -#include "blk-cgroup.h" #include <linux/genhd.h> +#include <linux/delay.h> +#include <linux/atomic.h> +#include "blk-cgroup.h" +#include "blk.h" #define MAX_KEY_LEN 100 -static DEFINE_SPINLOCK(blkio_list_lock); -static LIST_HEAD(blkio_list); +static DEFINE_MUTEX(blkcg_pol_mutex); -struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; -EXPORT_SYMBOL_GPL(blkio_root_cgroup); +struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT }; +EXPORT_SYMBOL_GPL(blkcg_root); -static struct cgroup_subsys_state *blkiocg_create(struct cgroup *); -static int blkiocg_can_attach(struct cgroup *, struct cgroup_taskset *); -static void blkiocg_attach(struct cgroup *, struct cgroup_taskset *); -static void blkiocg_destroy(struct cgroup *); -static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); +static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; -/* for encoding cft->private value on file */ -#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) -/* What policy owns the file, proportional or throttle */ -#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) -#define BLKIOFILE_ATTR(val) ((val) & 0xffff) - -struct cgroup_subsys blkio_subsys = { - .name = "blkio", - .create = blkiocg_create, - .can_attach = blkiocg_can_attach, - .attach = blkiocg_attach, - .destroy = blkiocg_destroy, - .populate = blkiocg_populate, -#ifdef CONFIG_BLK_CGROUP - /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */ - .subsys_id = blkio_subsys_id, -#endif - .use_id = 1, - .module = THIS_MODULE, -}; -EXPORT_SYMBOL_GPL(blkio_subsys); - -static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, - struct blkio_policy_node *pn) +struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { - list_add(&pn->node, &blkcg->policy_list); + return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), + struct blkcg, css); } +EXPORT_SYMBOL_GPL(cgroup_to_blkcg); -static inline bool cftype_blkg_same_policy(struct cftype *cft, - struct blkio_group *blkg) +static struct blkcg *task_blkcg(struct task_struct *tsk) { - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - - if (blkg->plid == plid) - return 1; - - return 0; + return container_of(task_subsys_state(tsk, blkio_subsys_id), + struct blkcg, css); } -/* Determines if policy node matches cgroup file being accessed */ -static inline bool pn_matches_cftype(struct cftype *cft, - struct blkio_policy_node *pn) +struct blkcg *bio_blkcg(struct bio *bio) { - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - int fileid = BLKIOFILE_ATTR(cft->private); - - return (plid == pn->plid && fileid == pn->fileid); + if (bio && bio->bi_css) + return container_of(bio->bi_css, struct blkcg, css); + return task_blkcg(current); } +EXPORT_SYMBOL_GPL(bio_blkcg); -/* Must be called with blkcg->lock held */ -static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) +static bool blkcg_policy_enabled(struct request_queue *q, + const struct blkcg_policy *pol) { - list_del(&pn->node); + return pol && test_bit(pol->plid, q->blkcg_pols); } -/* Must be called with blkcg->lock held */ -static struct blkio_policy_node * -blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, - enum blkio_policy_id plid, int fileid) +/** + * blkg_free - free a blkg + * @blkg: blkg to free + * + * Free @blkg which may be partially allocated. + */ +static void blkg_free(struct blkcg_gq *blkg) { - struct blkio_policy_node *pn; - - list_for_each_entry(pn, &blkcg->policy_list, node) { - if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) - return pn; - } + int i; - return NULL; -} + if (!blkg) + return; -struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) -{ - return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), - struct blkio_cgroup, css); -} -EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkg_policy_data *pd = blkg->pd[i]; -struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) -{ - return container_of(task_subsys_state(tsk, blkio_subsys_id), - struct blkio_cgroup, css); -} -EXPORT_SYMBOL_GPL(task_blkio_cgroup); + if (!pd) + continue; -static inline void -blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) -{ - struct blkio_policy_type *blkiop; + if (pol && pol->pd_exit_fn) + pol->pd_exit_fn(blkg); - list_for_each_entry(blkiop, &blkio_list, list) { - /* If this policy does not own the blkg, do not send updates */ - if (blkiop->plid != blkg->plid) - continue; - if (blkiop->ops.blkio_update_group_weight_fn) - blkiop->ops.blkio_update_group_weight_fn(blkg->key, - blkg, weight); + kfree(pd); } + + kfree(blkg); } -static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, - int fileid) +/** + * blkg_alloc - allocate a blkg + * @blkcg: block cgroup the new blkg is associated with + * @q: request_queue the new blkg is associated with + * + * Allocate a new blkg assocating @blkcg and @q. + */ +static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q) { - struct blkio_policy_type *blkiop; - - list_for_each_entry(blkiop, &blkio_list, list) { - - /* If this policy does not own the blkg, do not send updates */ - if (blkiop->plid != blkg->plid) - continue; + struct blkcg_gq *blkg; + int i; - if (fileid == BLKIO_THROTL_read_bps_device - && blkiop->ops.blkio_update_group_read_bps_fn) - blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, - blkg, bps); + /* alloc and init base part */ + blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node); + if (!blkg) + return NULL; - if (fileid == BLKIO_THROTL_write_bps_device - && blkiop->ops.blkio_update_group_write_bps_fn) - blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, - blkg, bps); - } -} - -static inline void blkio_update_group_iops(struct blkio_group *blkg, - unsigned int iops, int fileid) -{ - struct blkio_policy_type *blkiop; + blkg->q = q; + INIT_LIST_HEAD(&blkg->q_node); + blkg->blkcg = blkcg; + blkg->refcnt = 1; - list_for_each_entry(blkiop, &blkio_list, list) { + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkg_policy_data *pd; - /* If this policy does not own the blkg, do not send updates */ - if (blkiop->plid != blkg->plid) + if (!blkcg_policy_enabled(q, pol)) continue; - if (fileid == BLKIO_THROTL_read_iops_device - && blkiop->ops.blkio_update_group_read_iops_fn) - blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, - blkg, iops); + /* alloc per-policy data and attach it to blkg */ + pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node); + if (!pd) { + blkg_free(blkg); + return NULL; + } - if (fileid == BLKIO_THROTL_write_iops_device - && blkiop->ops.blkio_update_group_write_iops_fn) - blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, - blkg,iops); + blkg->pd[i] = pd; + pd->blkg = blkg; } -} -/* - * Add to the appropriate stat variable depending on the request type. - * This should be called with the blkg->stats_lock held. - */ -static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, - bool sync) -{ - if (direction) - stat[BLKIO_STAT_WRITE] += add; - else - stat[BLKIO_STAT_READ] += add; - if (sync) - stat[BLKIO_STAT_SYNC] += add; - else - stat[BLKIO_STAT_ASYNC] += add; -} + /* invoke per-policy init */ + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; -/* - * Decrements the appropriate stat variable if non-zero depending on the - * request type. Panics on value being zero. - * This should be called with the blkg->stats_lock held. - */ -static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) -{ - if (direction) { - BUG_ON(stat[BLKIO_STAT_WRITE] == 0); - stat[BLKIO_STAT_WRITE]--; - } else { - BUG_ON(stat[BLKIO_STAT_READ] == 0); - stat[BLKIO_STAT_READ]--; + if (blkcg_policy_enabled(blkg->q, pol)) + pol->pd_init_fn(blkg); } - if (sync) { - BUG_ON(stat[BLKIO_STAT_SYNC] == 0); - stat[BLKIO_STAT_SYNC]--; - } else { - BUG_ON(stat[BLKIO_STAT_ASYNC] == 0); - stat[BLKIO_STAT_ASYNC]--; - } -} -#ifdef CONFIG_DEBUG_BLK_CGROUP -/* This should be called with the blkg->stats_lock held. */ -static void blkio_set_start_group_wait_time(struct blkio_group *blkg, - struct blkio_group *curr_blkg) -{ - if (blkio_blkg_waiting(&blkg->stats)) - return; - if (blkg == curr_blkg) - return; - blkg->stats.start_group_wait_time = sched_clock(); - blkio_mark_blkg_waiting(&blkg->stats); + return blkg; } -/* This should be called with the blkg->stats_lock held. */ -static void blkio_update_group_wait_time(struct blkio_group_stats *stats) +static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, + struct request_queue *q) { - unsigned long long now; + struct blkcg_gq *blkg; - if (!blkio_blkg_waiting(stats)) - return; + blkg = rcu_dereference(blkcg->blkg_hint); + if (blkg && blkg->q == q) + return blkg; + + /* + * Hint didn't match. Look up from the radix tree. Note that we + * may not be holding queue_lock and thus are not sure whether + * @blkg from blkg_tree has already been removed or not, so we + * can't update hint to the lookup result. Leave it to the caller. + */ + blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); + if (blkg && blkg->q == q) + return blkg; - now = sched_clock(); - if (time_after64(now, stats->start_group_wait_time)) - stats->group_wait_time += now - stats->start_group_wait_time; - blkio_clear_blkg_waiting(stats); + return NULL; } -/* This should be called with the blkg->stats_lock held. */ -static void blkio_end_empty_time(struct blkio_group_stats *stats) +/** + * blkg_lookup - lookup blkg for the specified blkcg - q pair + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. This function should be called + * under RCU read lock and is guaranteed to return %NULL if @q is bypassing + * - see blk_queue_bypass_start() for details. + */ +struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) { - unsigned long long now; - - if (!blkio_blkg_empty(stats)) - return; + WARN_ON_ONCE(!rcu_read_lock_held()); - now = sched_clock(); - if (time_after64(now, stats->start_empty_time)) - stats->empty_time += now - stats->start_empty_time; - blkio_clear_blkg_empty(stats); + if (unlikely(blk_queue_bypass(q))) + return NULL; + return __blkg_lookup(blkcg, q); } +EXPORT_SYMBOL_GPL(blkg_lookup); -void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) + __releases(q->queue_lock) __acquires(q->queue_lock) { - unsigned long flags; + struct blkcg_gq *blkg; + int ret; - spin_lock_irqsave(&blkg->stats_lock, flags); - BUG_ON(blkio_blkg_idling(&blkg->stats)); - blkg->stats.start_idle_time = sched_clock(); - blkio_mark_blkg_idling(&blkg->stats); - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); + WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert_held(q->queue_lock); -void blkiocg_update_idle_time_stats(struct blkio_group *blkg) -{ - unsigned long flags; - unsigned long long now; - struct blkio_group_stats *stats; - - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; - if (blkio_blkg_idling(stats)) { - now = sched_clock(); - if (time_after64(now, stats->start_idle_time)) - stats->idle_time += now - stats->start_idle_time; - blkio_clear_blkg_idling(stats); + /* lookup and update hint on success, see __blkg_lookup() for details */ + blkg = __blkg_lookup(blkcg, q); + if (blkg) { + rcu_assign_pointer(blkcg->blkg_hint, blkg); + return blkg; } - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); -void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) -{ - unsigned long flags; - struct blkio_group_stats *stats; - - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; - stats->avg_queue_size_sum += - stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + - stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; - stats->avg_queue_size_samples++; - blkio_update_group_wait_time(stats); - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); + /* blkg holds a reference to blkcg */ + if (!css_tryget(&blkcg->css)) + return ERR_PTR(-EINVAL); -void blkiocg_set_start_empty_time(struct blkio_group *blkg) -{ - unsigned long flags; - struct blkio_group_stats *stats; + /* allocate */ + ret = -ENOMEM; + blkg = blkg_alloc(blkcg, q); + if (unlikely(!blkg)) + goto err_put; - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; + /* insert */ + ret = radix_tree_preload(GFP_ATOMIC); + if (ret) + goto err_free; - if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || - stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { - spin_unlock_irqrestore(&blkg->stats_lock, flags); - return; + spin_lock(&blkcg->lock); + ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); + if (likely(!ret)) { + hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + list_add(&blkg->q_node, &q->blkg_list); } + spin_unlock(&blkcg->lock); - /* - * group is already marked empty. This can happen if cfqq got new - * request in parent group and moved to this group while being added - * to service tree. Just ignore the event and move on. - */ - if(blkio_blkg_empty(stats)) { - spin_unlock_irqrestore(&blkg->stats_lock, flags); - return; - } + radix_tree_preload_end(); - stats->start_empty_time = sched_clock(); - blkio_mark_blkg_empty(stats); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + if (!ret) + return blkg; +err_free: + blkg_free(blkg); +err_put: + css_put(&blkcg->css); + return ERR_PTR(ret); } -EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); -void blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) -{ - blkg->stats.dequeue += dequeue; -} -EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); -#else -static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, - struct blkio_group *curr_blkg) {} -static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} -#endif - -void blkiocg_update_io_add_stats(struct blkio_group *blkg, - struct blkio_group *curr_blkg, bool direction, - bool sync) +struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) { - unsigned long flags; - - spin_lock_irqsave(&blkg->stats_lock, flags); - blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, - sync); - blkio_end_empty_time(&blkg->stats); - blkio_set_start_group_wait_time(blkg, curr_blkg); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + /* + * This could be the first entry point of blkcg implementation and + * we shouldn't allow anything to go through for a bypassing queue. + */ + if (unlikely(blk_queue_bypass(q))) + return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY); + return __blkg_lookup_create(blkcg, q); } -EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); +EXPORT_SYMBOL_GPL(blkg_lookup_create); -void blkiocg_update_io_remove_stats(struct blkio_group *blkg, - bool direction, bool sync) +static void blkg_destroy(struct blkcg_gq *blkg) { - unsigned long flags; + struct request_queue *q = blkg->q; + struct blkcg *blkcg = blkg->blkcg; - spin_lock_irqsave(&blkg->stats_lock, flags); - blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], - direction, sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); + lockdep_assert_held(q->queue_lock); + lockdep_assert_held(&blkcg->lock); -void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, - unsigned long unaccounted_time) -{ - unsigned long flags; - - spin_lock_irqsave(&blkg->stats_lock, flags); - blkg->stats.time += time; -#ifdef CONFIG_DEBUG_BLK_CGROUP - blkg->stats.unaccounted_time += unaccounted_time; -#endif - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); + /* Something wrong if we are trying to remove same group twice */ + WARN_ON_ONCE(list_empty(&blkg->q_node)); + WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); -/* - * should be called under rcu read lock or queue lock to make sure blkg pointer - * is valid. - */ -void blkiocg_update_dispatch_stats(struct blkio_group *blkg, - uint64_t bytes, bool direction, bool sync) -{ - struct blkio_group_stats_cpu *stats_cpu; - unsigned long flags; + radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); + list_del_init(&blkg->q_node); + hlist_del_init_rcu(&blkg->blkcg_node); /* - * Disabling interrupts to provide mutual exclusion between two - * writes on same cpu. It probably is not needed for 64bit. Not - * optimizing that case yet. + * Both setting lookup hint to and clearing it from @blkg are done + * under queue_lock. If it's not pointing to @blkg now, it never + * will. Hint assignment itself can race safely. */ - local_irq_save(flags); - - stats_cpu = this_cpu_ptr(blkg->stats_cpu); - - u64_stats_update_begin(&stats_cpu->syncp); - stats_cpu->sectors += bytes >> 9; - blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED], - 1, direction, sync); - blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES], - bytes, direction, sync); - u64_stats_update_end(&stats_cpu->syncp); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); - -void blkiocg_update_completion_stats(struct blkio_group *blkg, - uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) -{ - struct blkio_group_stats *stats; - unsigned long flags; - unsigned long long now = sched_clock(); - - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; - if (time_after64(now, io_start_time)) - blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], - now - io_start_time, direction, sync); - if (time_after64(io_start_time, start_time)) - blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], - io_start_time - start_time, direction, sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); - -/* Merged stats are per cpu. */ -void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, - bool sync) -{ - struct blkio_group_stats_cpu *stats_cpu; - unsigned long flags; + if (rcu_dereference_raw(blkcg->blkg_hint) == blkg) + rcu_assign_pointer(blkcg->blkg_hint, NULL); /* - * Disabling interrupts to provide mutual exclusion between two - * writes on same cpu. It probably is not needed for 64bit. Not - * optimizing that case yet. + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. */ - local_irq_save(flags); - - stats_cpu = this_cpu_ptr(blkg->stats_cpu); - - u64_stats_update_begin(&stats_cpu->syncp); - blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1, - direction, sync); - u64_stats_update_end(&stats_cpu->syncp); - local_irq_restore(flags); + blkg_put(blkg); } -EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); -/* - * This function allocates the per cpu stats for blkio_group. Should be called - * from sleepable context as alloc_per_cpu() requires that. +/** + * blkg_destroy_all - destroy all blkgs associated with a request_queue + * @q: request_queue of interest + * + * Destroy all blkgs associated with @q. */ -int blkio_alloc_blkg_stats(struct blkio_group *blkg) +static void blkg_destroy_all(struct request_queue *q) { - /* Allocate memory for per cpu stats */ - blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); - if (!blkg->stats_cpu) - return -ENOMEM; - return 0; -} -EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats); + struct blkcg_gq *blkg, *n; -void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev, - enum blkio_policy_id plid) -{ - unsigned long flags; - - spin_lock_irqsave(&blkcg->lock, flags); - spin_lock_init(&blkg->stats_lock); - rcu_assign_pointer(blkg->key, key); - blkg->blkcg_id = css_id(&blkcg->css); - hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); - blkg->plid = plid; - spin_unlock_irqrestore(&blkcg->lock, flags); - /* Need to take css reference ? */ - cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); - blkg->dev = dev; -} -EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); - -static void __blkiocg_del_blkio_group(struct blkio_group *blkg) -{ - hlist_del_init_rcu(&blkg->blkcg_node); - blkg->blkcg_id = 0; -} + lockdep_assert_held(q->queue_lock); -/* - * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 - * indicating that blk_group was unhashed by the time we got to it. - */ -int blkiocg_del_blkio_group(struct blkio_group *blkg) -{ - struct blkio_cgroup *blkcg; - unsigned long flags; - struct cgroup_subsys_state *css; - int ret = 1; + list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { + struct blkcg *blkcg = blkg->blkcg; - rcu_read_lock(); - css = css_lookup(&blkio_subsys, blkg->blkcg_id); - if (css) { - blkcg = container_of(css, struct blkio_cgroup, css); - spin_lock_irqsave(&blkcg->lock, flags); - if (!hlist_unhashed(&blkg->blkcg_node)) { - __blkiocg_del_blkio_group(blkg); - ret = 0; - } - spin_unlock_irqrestore(&blkcg->lock, flags); + spin_lock(&blkcg->lock); + blkg_destroy(blkg); + spin_unlock(&blkcg->lock); } - - rcu_read_unlock(); - return ret; } -EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group); -/* called under rcu_read_lock(). */ -struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) +static void blkg_rcu_free(struct rcu_head *rcu_head) { - struct blkio_group *blkg; - struct hlist_node *n; - void *__key; - - hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { - __key = blkg->key; - if (__key == key) - return blkg; - } - - return NULL; + blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head)); } -EXPORT_SYMBOL_GPL(blkiocg_lookup_group); -static void blkio_reset_stats_cpu(struct blkio_group *blkg) +void __blkg_release(struct blkcg_gq *blkg) { - struct blkio_group_stats_cpu *stats_cpu; - int i, j, k; + /* release the extra blkcg reference this blkg has been holding */ + css_put(&blkg->blkcg->css); + /* - * Note: On 64 bit arch this should not be an issue. This has the - * possibility of returning some inconsistent value on 32bit arch - * as 64bit update on 32bit is non atomic. Taking care of this - * corner case makes code very complicated, like sending IPIs to - * cpus, taking care of stats of offline cpus etc. + * A group is freed in rcu manner. But having an rcu lock does not + * mean that one can access all the fields of blkg and assume these + * are valid. For example, don't try to follow throtl_data and + * request queue links. * - * reset stats is anyway more of a debug feature and this sounds a - * corner case. So I am not complicating the code yet until and - * unless this becomes a real issue. + * Having a reference to blkg under an rcu allows acess to only + * values local to groups like group stats and group rate limits */ - for_each_possible_cpu(i) { - stats_cpu = per_cpu_ptr(blkg->stats_cpu, i); - stats_cpu->sectors = 0; - for(j = 0; j < BLKIO_STAT_CPU_NR; j++) - for (k = 0; k < BLKIO_STAT_TOTAL; k++) - stats_cpu->stat_arr_cpu[j][k] = 0; - } + call_rcu(&blkg->rcu_head, blkg_rcu_free); } +EXPORT_SYMBOL_GPL(__blkg_release); -static int -blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) +static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, + u64 val) { - struct blkio_cgroup *blkcg; - struct blkio_group *blkg; |