diff options
-rw-r--r-- | Documentation/cgroups/cgroups.txt | 20 | ||||
-rw-r--r-- | Documentation/cgroups/memory.txt | 4 | ||||
-rw-r--r-- | Documentation/cgroups/resource_counter.txt | 4 | ||||
-rw-r--r-- | block/blk-throttle.c | 35 | ||||
-rw-r--r-- | block/cfq-iosched.c | 131 | ||||
-rw-r--r-- | drivers/md/bcache/request.c | 1 | ||||
-rw-r--r-- | include/linux/cgroup.h | 112 | ||||
-rw-r--r-- | include/linux/vmpressure.h | 8 | ||||
-rw-r--r-- | init/Kconfig | 3 | ||||
-rw-r--r-- | kernel/cgroup.c | 1202 | ||||
-rw-r--r-- | kernel/cgroup_freezer.c | 7 | ||||
-rw-r--r-- | kernel/cpuset.c | 71 | ||||
-rw-r--r-- | kernel/sched/core.c | 13 | ||||
-rw-r--r-- | kernel/sched/cpuacct.c | 18 | ||||
-rw-r--r-- | mm/hugetlb_cgroup.c | 22 | ||||
-rw-r--r-- | mm/memcontrol.c | 426 | ||||
-rw-r--r-- | mm/page_cgroup.c | 2 | ||||
-rw-r--r-- | mm/vmpressure.c | 26 | ||||
-rw-r--r-- | net/core/netprio_cgroup.c | 8 | ||||
-rw-r--r-- | security/device_cgroup.c | 7 |
20 files changed, 1022 insertions, 1098 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 638bf17ff86..821de56d158 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -24,7 +24,6 @@ CONTENTS: 2.1 Basic Usage 2.2 Attaching processes 2.3 Mounting hierarchies by name - 2.4 Notification API 3. Kernel API 3.1 Overview 3.2 Synchronization @@ -472,25 +471,6 @@ you give a subsystem a name. The name of the subsystem appears as part of the hierarchy description in /proc/mounts and /proc/<pid>/cgroups. -2.4 Notification API --------------------- - -There is mechanism which allows to get notifications about changing -status of a cgroup. - -To register a new notification handler you need to: - - create a file descriptor for event notification using eventfd(2); - - open a control file to be monitored (e.g. memory.usage_in_bytes); - - write "<event_fd> <control_fd> <args>" to cgroup.event_control. - Interpretation of args is defined by control file implementation; - -eventfd will be woken up by control file implementation or when the -cgroup is removed. - -To unregister a notification handler just close eventfd. - -NOTE: Support of notifications should be implemented for the control -file. See documentation for the subsystem. 3. Kernel API ============= diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index e2bc132608f..2622115276a 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -577,7 +577,7 @@ Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable" per-node page counts including "hierarchical_<counter>" which sums up all hierarchical children's values in addition to the memcg's own value. -The ouput format of memory.numa_stat is: +The output format of memory.numa_stat is: total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ... file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ... @@ -670,7 +670,7 @@ page tables. 8.1 Interface -This feature is disabled by default. It can be enabledi (and disabled again) by +This feature is disabled by default. It can be enabled (and disabled again) by writing to memory.move_charge_at_immigrate of the destination cgroup. If you want to enable it: diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt index c4d99ed0b41..52e1da16a30 100644 --- a/Documentation/cgroups/resource_counter.txt +++ b/Documentation/cgroups/resource_counter.txt @@ -97,8 +97,8 @@ to work with it. (struct res_counter *rc, struct res_counter *top, unsinged long val) - Almost same as res_cunter_uncharge() but propagation of uncharge - stops when rc == top. This is useful when kill a res_coutner in + Almost same as res_counter_uncharge() but propagation of uncharge + stops when rc == top. This is useful when kill a res_counter in child cgroup. 2.1 Other accounting routines diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 06534049afb..a760857e6b6 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1303,13 +1303,10 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, &rwstat); } -static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int tg_print_cpu_rwstat(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, - cft->private, true); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, + &blkcg_policy_throtl, seq_cft(sf)->private, true); return 0; } @@ -1335,19 +1332,17 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, return __blkg_prfill_u64(sf, pd, v); } -static int tg_print_conf_u64(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int tg_print_conf_u64(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64, - &blkcg_policy_throtl, cft->private, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, + &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } -static int tg_print_conf_uint(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int tg_print_conf_uint(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint, - &blkcg_policy_throtl, cft->private, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, + &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } @@ -1428,40 +1423,40 @@ static struct cftype throtl_files[] = { { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), - .read_seq_string = tg_print_conf_u64, + .seq_show = tg_print_conf_u64, .write_string = tg_set_conf_u64, .max_write_len = 256, }, { .name = "throttle.write_bps_device", .private = offsetof(struct throtl_grp, bps[WRITE]), - .read_seq_string = tg_print_conf_u64, + .seq_show = tg_print_conf_u64, .write_string = tg_set_conf_u64, .max_write_len = 256, }, { .name = "throttle.read_iops_device", .private = offsetof(struct throtl_grp, iops[READ]), - .read_seq_string = tg_print_conf_uint, + .seq_show = tg_print_conf_uint, .write_string = tg_set_conf_uint, .max_write_len = 256, }, { .name = "throttle.write_iops_device", .private = offsetof(struct throtl_grp, iops[WRITE]), - .read_seq_string = tg_print_conf_uint, + .seq_show = tg_print_conf_uint, .write_string = tg_set_conf_uint, .max_write_len = 256, }, { .name = "throttle.io_service_bytes", .private = offsetof(struct tg_stats_cpu, service_bytes), - .read_seq_string = tg_print_cpu_rwstat, + .seq_show = tg_print_cpu_rwstat, }, { .name = "throttle.io_serviced", .private = offsetof(struct tg_stats_cpu, serviced), - .read_seq_string = tg_print_cpu_rwstat, + .seq_show = tg_print_cpu_rwstat, }, { } /* terminate */ }; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4d5cec1ad80..744833b630c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1632,11 +1632,11 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); } -static int cfqg_print_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfqg_print_weight_device(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device, - &blkcg_policy_cfq, 0, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_weight_device, &blkcg_policy_cfq, + 0, false); return 0; } @@ -1650,26 +1650,23 @@ static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); } -static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device, - &blkcg_policy_cfq, 0, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, + 0, false); return 0; } -static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft, - struct seq_file *sf) +static int cfq_print_weight(struct seq_file *sf, void *v) { - seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight); + seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight); return 0; } -static int cfq_print_leaf_weight(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfq_print_leaf_weight(struct seq_file *sf, void *v) { - seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight); + seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight); return 0; } @@ -1762,23 +1759,17 @@ static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, return __cfq_set_weight(css, cft, val, true); } -static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_stat(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq, - cft->private, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, + &blkcg_policy_cfq, seq_cft(sf)->private, false); return 0; } -static int cfqg_print_rwstat(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfqg_print_rwstat(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq, - cft->private, true); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, + &blkcg_policy_cfq, seq_cft(sf)->private, true); return 0; } @@ -1798,23 +1789,19 @@ static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, &sum); } -static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfqg_print_stat_recursive(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive, - &blkcg_policy_cfq, cft->private, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_stat_recursive, &blkcg_policy_cfq, + seq_cft(sf)->private, false); return 0; } -static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive, - &blkcg_policy_cfq, cft->private, true); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq, + seq_cft(sf)->private, true); return 0; } @@ -1835,13 +1822,11 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, } /* print avg_queue_size */ -static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size, - &blkcg_policy_cfq, 0, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_avg_queue_size, &blkcg_policy_cfq, + 0, false); return 0; } #endif /* CONFIG_DEBUG_BLK_CGROUP */ @@ -1851,14 +1836,14 @@ static struct cftype cfq_blkcg_files[] = { { .name = "weight_device", .flags = CFTYPE_ONLY_ON_ROOT, - .read_seq_string = cfqg_print_leaf_weight_device, + .seq_show = cfqg_print_leaf_weight_device, .write_string = cfqg_set_leaf_weight_device, .max_write_len = 256, }, { .name = "weight", .flags = CFTYPE_ONLY_ON_ROOT, - .read_seq_string = cfq_print_leaf_weight, + .seq_show = cfq_print_leaf_weight, .write_u64 = cfq_set_leaf_weight, }, @@ -1866,26 +1851,26 @@ static struct cftype cfq_blkcg_files[] = { { .name = "weight_device", .flags = CFTYPE_NOT_ON_ROOT, - .read_seq_string = cfqg_print_weight_device, + .seq_show = cfqg_print_weight_device, .write_string = cfqg_set_weight_device, .max_write_len = 256, }, { .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, - .read_seq_string = cfq_print_weight, + .seq_show = cfq_print_weight, .write_u64 = cfq_set_weight, }, { .name = "leaf_weight_device", - .read_seq_string = cfqg_print_leaf_weight_device, + .seq_show = cfqg_print_leaf_weight_device, .write_string = cfqg_set_leaf_weight_device, .max_write_len = 256, }, { .name = "leaf_weight", - .read_seq_string = cfq_print_leaf_weight, + .seq_show = cfq_print_leaf_weight, .write_u64 = cfq_set_leaf_weight, }, @@ -1893,114 +1878,114 @@ static struct cftype cfq_blkcg_files[] = { { .name = "time", .private = offsetof(struct cfq_group, stats.time), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "sectors", .private = offsetof(struct cfq_group, stats.sectors), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "io_service_bytes", .private = offsetof(struct cfq_group, stats.service_bytes), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, { .name = "io_serviced", .private = offsetof(struct cfq_group, stats.serviced), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, { .name = "io_service_time", .private = offsetof(struct cfq_group, stats.service_time), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, { .name = "io_wait_time", .private = offsetof(struct cfq_group, stats.wait_time), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, { .name = "io_merged", .private = offsetof(struct cfq_group, stats.merged), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, { .name = "io_queued", .private = offsetof(struct cfq_group, stats.queued), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, /* the same statictics which cover the cfqg and its descendants */ { .name = "time_recursive", .private = offsetof(struct cfq_group, stats.time), - .read_seq_string = cfqg_print_stat_recursive, + .seq_show = cfqg_print_stat_recursive, }, { .name = "sectors_recursive", .private = offsetof(struct cfq_group, stats.sectors), - .read_seq_string = cfqg_print_stat_recursive, + .seq_show = cfqg_print_stat_recursive, }, { .name = "io_service_bytes_recursive", .private = offsetof(struct cfq_group, stats.service_bytes), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, { .name = "io_serviced_recursive", .private = offsetof(struct cfq_group, stats.serviced), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, { .name = "io_service_time_recursive", .private = offsetof(struct cfq_group, stats.service_time), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, { .name = "io_wait_time_recursive", .private = offsetof(struct cfq_group, stats.wait_time), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, { .name = "io_merged_recursive", .private = offsetof(struct cfq_group, stats.merged), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, { .name = "io_queued_recursive", .private = offsetof(struct cfq_group, stats.queued), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, #ifdef CONFIG_DEBUG_BLK_CGROUP { .name = "avg_queue_size", - .read_seq_string = cfqg_print_avg_queue_size, + .seq_show = cfqg_print_avg_queue_size, }, { .name = "group_wait_time", .private = offsetof(struct cfq_group, stats.group_wait_time), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "idle_time", .private = offsetof(struct cfq_group, stats.idle_time), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "empty_time", .private = offsetof(struct cfq_group, stats.empty_time), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "dequeue", .private = offsetof(struct cfq_group, stats.dequeue), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "unaccounted_time", .private = offsetof(struct cfq_group, stats.unaccounted_time), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, #endif /* CONFIG_DEBUG_BLK_CGROUP */ { } /* terminate */ diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index fbcc851ed5a..61bcfc21d2a 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -163,7 +163,6 @@ static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) static void bcachecg_destroy(struct cgroup *cgroup) { struct bch_cgroup *cg = cgroup_to_bcache(cgroup); - free_css_id(&bcache_subsys, &cg->css); kfree(cg); } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 39c1d946967..5c097596104 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -21,6 +21,7 @@ #include <linux/xattr.h> #include <linux/fs.h> #include <linux/percpu-refcount.h> +#include <linux/seq_file.h> #ifdef CONFIG_CGROUPS @@ -28,8 +29,6 @@ struct cgroupfs_root; struct cgroup_subsys; struct inode; struct cgroup; -struct css_id; -struct eventfd_ctx; extern int cgroup_init_early(void); extern int cgroup_init(void); @@ -79,8 +78,6 @@ struct cgroup_subsys_state { struct cgroup_subsys_state *parent; unsigned long flags; - /* ID for this css, if possible */ - struct css_id __rcu *id; /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; @@ -239,10 +236,6 @@ struct cgroup { struct rcu_head rcu_head; struct work_struct destroy_work; - /* List of events which userspace want to receive */ - struct list_head event_list; - spinlock_t event_list_lock; - /* directory xattrs */ struct simple_xattrs xattrs; }; @@ -280,6 +273,9 @@ enum { * - "tasks" is removed. Everything should be at process * granularity. Use "cgroup.procs" instead. * + * - "cgroup.procs" is not sorted. pids will be unique unless they + * got recycled inbetween reads. + * * - "release_agent" and "notify_on_release" are removed. * Replacement notification mechanism will be implemented. * @@ -320,9 +316,6 @@ struct cgroupfs_root { /* Unique id for this hierarchy. */ int hierarchy_id; - /* A list running through the attached subsystems */ - struct list_head subsys_list; - /* The root cgroup for this hierarchy */ struct cgroup top_cgroup; @@ -389,16 +382,6 @@ struct css_set { }; /* - * cgroup_map_cb is an abstract callback API for reporting map-valued - * control files - */ - -struct cgroup_map_cb { - int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value); - void *state; -}; - -/* * struct cftype: handler definitions for cgroup control files * * When reading/writing to a file: @@ -445,10 +428,6 @@ struct cftype { */ struct cgroup_subsys *ss; - int (*open)(struct inode *inode, struct file *file); - ssize_t (*read)(struct cgroup_subsys_state *css, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos); /* * read_u64() is a shortcut for the common case of returning a * single integer. Use it in place of read() @@ -458,24 +437,14 @@ struct cftype { * read_s64() is a signed version of read_u64() */ s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); - /* - * read_map() is used for defining a map of key/value - * pairs. It should call cb->fill(cb, key, value) for each - * entry. The key/value pairs (and their ordering) should not - * change between reboots. - */ - int (*read_map)(struct cgroup_subsys_state *css, struct cftype *cft, - struct cgroup_map_cb *cb); - /* - * read_seq_string() is used for outputting a simple sequence - * using seqfile. - */ - int (*read_seq_string)(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *m); - ssize_t (*write)(struct cgroup_subsys_state *css, struct cftype *cft, - struct file *file, - const char __user *buf, size_t nbytes, loff_t *ppos); + /* generic seq_file read interface */ + int (*seq_show)(struct seq_file *sf, void *v); + + /* optional ops, implement all or none */ + void *(*seq_start)(struct seq_file *sf, loff_t *ppos); + void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); + void (*seq_stop)(struct seq_file *sf, void *v); /* * write_u64() is a shortcut for the common case of accepting @@ -504,27 +473,6 @@ struct cftype { * kick type for multiplexing. */ int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); - - int (*release)(struct inode *inode, struct file *file); - - /* - * register_event() callback will be used to add new userspace - * waiter for changes related to the cftype. Implement it if - * you want to provide this functionality. Use eventfd_signal() - * on eventfd to send notification to userspace. - */ - int (*register_event)(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, - const char *args); - /* - * unregister_event() callback will be called when userspace - * closes the eventfd or on cgroup removing. - * This callback must be implemented, if you want provide - * notification functionality. - */ - void (*unregister_event)(struct cgroup_subsys_state *css, - struct cftype *cft, - struct eventfd_ctx *eventfd); }; /* @@ -538,6 +486,26 @@ struct cftype_set { }; /* + * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. Don't + * access directly. + */ +struct cfent { + struct list_head node; + struct dentry *dentry; + struct cftype *type; + struct cgroup_subsys_state *css; + + /* file xattrs */ + struct simple_xattrs xattrs; +}; + +/* seq_file->private points to the following, only ->priv is public */ +struct cgroup_open_file { + struct cfent *cfe; + void *priv; +}; + +/* * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This * function can be called as long as @cgrp is accessible. */ @@ -552,6 +520,18 @@ static inline const char *cgroup_name(const struct cgroup *cgrp) return rcu_dereference(cgrp->name)->name; } +static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) +{ + struct cgroup_open_file *of = seq->private; + return of->cfe->css; +} + +static inline struct cftype *seq_cft(struct seq_file *seq) +{ + struct cgroup_open_file *of = seq->private; + return of->cfe->type; +} + int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); @@ -631,12 +611,8 @@ struct cgroup_subsys { #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; - /* - * Link to parent, and list entry in parent's children. - * Protected by cgroup_lock() - */ + /* link to parent, protected by cgroup_lock() */ struct cgroupfs_root *root; - struct list_head sibling; /* list of cftype_sets */ struct list_head cftsets; diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 3f3788d4936..3e4535876d3 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -7,6 +7,7 @@ #include <linux/gfp.h> #include <linux/types.h> #include <linux/cgroup.h> +#include <linux/eventfd.h> struct vmpressure { unsigned long scanned; @@ -33,13 +34,10 @@ extern void vmpressure_init(struct vmpressure *vmpr); extern void vmpressure_cleanup(struct vmpressure *vmpr); extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); -extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); -extern int vmpressure_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, +extern int vmpressure_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args); -extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, +extern void vmpressure_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd); #else static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, diff --git a/init/Kconfig b/init/Kconfig index 5236dc562a3..8d402e33b7f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -854,7 +854,6 @@ config NUMA_BALANCING menuconfig CGROUPS boolean "Control Group support" - depends on EVENTFD help This option adds support for grouping sets of processes together, for use with process control subsystems such as Cpusets, CFS, memory @@ -921,6 +920,7 @@ config MEMCG bool "Memory Resource Controller for Control Groups" depends on RESOURCE_COUNTERS select MM_OWNER + select EVENTFD help Provides a memory resource controller that manages both anonymous memory and page cache. (See Documentation/cgroups/memory.txt) @@ -1160,7 +1160,6 @@ config UIDGID_STRICT_TYPE_CHECKS config SCHED_AUTOGROUP bool "Automatic process group scheduling" - select EVENTFD select CGROUPS select CGROUP_SCHED select FAIR_GROUP_SCHED diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bc1dcabe921..e2f46ba37f7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -41,7 +41,6 @@ #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/backing-dev.h> -#include <linux/seq_file.h> #include <linux/slab.h> #include <linux/magic.h> #include <linux/spinlock.h> @@ -56,15 +55,20 @@ #include <linux/pid_namespace.h> #include <linux/idr.h> #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ -#include <linux/eventfd.h> -#include <linux/poll.h> #include <linux/flex_array.h> /* used in cgroup_attach_task */ #include <linux/kthread.h> -#include <linux/file.h> #include <linux/atomic.h> /* + * pidlists linger the following amount before being destroyed. The goal + * is avoiding frequent destruction in the middle of consecutive read calls + * Expiring in the middle is a performance problem not a correctness one. + * 1 sec should be enough. + */ +#define CGROUP_PIDLIST_DESTROY_DELAY HZ + +/* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * @@ -89,6 +93,19 @@ static DEFINE_MUTEX(cgroup_mutex); static DEFINE_MUTEX(cgroup_root_mutex); +#define cgroup_assert_mutex_or_rcu_locked() \ + rcu_lockdep_assert(rcu_read_lock_held() || \ + lockdep_is_held(&cgroup_mutex), \ + "cgroup_mutex or RCU read lock required"); + +#ifdef CONFIG_LOCKDEP +#define cgroup_assert_mutex_or_root_locked() \ + WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \ + !lockdep_is_held(&cgroup_root_mutex))) +#else +#define cgroup_assert_mutex_or_root_locked() do { } while (0) +#endif + /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup @@ -98,6 +115,12 @@ static DEFINE_MUTEX(cgroup_root_mutex); static struct workqueue_struct *cgroup_destroy_wq; /* + * pidlist destructions need to be flushed on cgroup destruction. Use a + * separate workqueue as flush domain. + */ +static struct workqueue_struct *cgroup_pidlist_destroy_wq; + +/* * Generate an array of cgroup subsystem pointers. At boot time, this is * populated with the built in subsystems, and modular subsystems are * registered after that. The mutable section of this array is protected by @@ -119,49 +142,6 @@ static struct cgroupfs_root cgroup_dummy_root; /* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; -/* - * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. - */ -struct cfent { - struct list_head node; - struct dentry *dentry; - struct cftype *type; - struct cgroup_subsys_state *css; - - /* file xattrs */ - struct simple_xattrs xattrs; -}; - -/* - * cgroup_event represents events which userspace want to receive. - */ -struct cgroup_event { - /* - * css which the event belongs to. - */ - struct cgroup_subsys_state *css; - /* - * Control file which the event associated. - */ - struct cftype *cft; - /* - * eventfd to signal userspace about the event. - */ - struct eventfd_ctx *eventfd; - /* - * Each of these stored in a list by the cgroup. - */ - struct list_head list; - /* - * All fields below needed to unregister event when - * userspace closes eventfd. - */ - poll_table pt; - wait_queue_head_t *wqh; - wait_queue_t wait; - struct work_struct remove; -}; - /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); @@ -200,6 +180,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); static int cgroup_file_release(struct inode *inode, struct file *file); +static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); /** * cgroup_css - obtain a cgroup's css for the specified subsystem @@ -262,16 +243,32 @@ static int notify_on_release(const struct cgroup *cgrp) } /** + * for_each_css - iterate all css's of a cgroup + * @css: the iteration cursor + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end + * @cgrp: the target cgroup to iterate css's of + * + * Should be called under cgroup_mutex. + */ +#define for_each_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = rcu_dereference_check( \ + (cgrp)->subsys[(ssid)], \ + lockdep_is_held(&cgroup_mutex)))) { } \ + else + +/** * for_each_subsys - iterate all loaded cgroup subsystems * @ss: the iteration cursor - * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end * - * Should be called under cgroup_mutex. + * Iterates through all loaded subsystems. Should be called under + * cgroup_mutex or cgroup_root_mutex. */ -#define for_each_subsys(ss, i) \ - for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - !((ss) = cgroup_subsys[i]); })) { } \ +#define for_each_subsys(ss, ssid) \ + for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \ + (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((ss) = cgroup_subsys[(ssid)])) { } \ else /** @@ -286,10 +283,6 @@ static int notify_on_release(const struct cgroup *cgrp) for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ (((ss) = cgroup_subsys[i]) || true); (i)++) -/* iterate each subsystem attached to a hierarchy */ -#define for_each_root_subsys(root, ss) \ - list_for_each_entry((ss), &(root)->subsys_list, sibling) - /* iterate across the active hierarchies */ #define for_each_active_root(root) \ list_for_each |