aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/cgroups/cgroups.txt20
-rw-r--r--Documentation/cgroups/memory.txt4
-rw-r--r--Documentation/cgroups/resource_counter.txt4
-rw-r--r--block/blk-throttle.c35
-rw-r--r--block/cfq-iosched.c131
-rw-r--r--drivers/md/bcache/request.c1
-rw-r--r--include/linux/cgroup.h112
-rw-r--r--include/linux/vmpressure.h8
-rw-r--r--init/Kconfig3
-rw-r--r--kernel/cgroup.c1202
-rw-r--r--kernel/cgroup_freezer.c7
-rw-r--r--kernel/cpuset.c71
-rw-r--r--kernel/sched/core.c13
-rw-r--r--kernel/sched/cpuacct.c18
-rw-r--r--mm/hugetlb_cgroup.c22
-rw-r--r--mm/memcontrol.c426
-rw-r--r--mm/page_cgroup.c2
-rw-r--r--mm/vmpressure.c26
-rw-r--r--net/core/netprio_cgroup.c8
-rw-r--r--security/device_cgroup.c7
20 files changed, 1022 insertions, 1098 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 638bf17ff86..821de56d158 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -24,7 +24,6 @@ CONTENTS:
2.1 Basic Usage
2.2 Attaching processes
2.3 Mounting hierarchies by name
- 2.4 Notification API
3. Kernel API
3.1 Overview
3.2 Synchronization
@@ -472,25 +471,6 @@ you give a subsystem a name.
The name of the subsystem appears as part of the hierarchy description
in /proc/mounts and /proc/<pid>/cgroups.
-2.4 Notification API
---------------------
-
-There is mechanism which allows to get notifications about changing
-status of a cgroup.
-
-To register a new notification handler you need to:
- - create a file descriptor for event notification using eventfd(2);
- - open a control file to be monitored (e.g. memory.usage_in_bytes);
- - write "<event_fd> <control_fd> <args>" to cgroup.event_control.
- Interpretation of args is defined by control file implementation;
-
-eventfd will be woken up by control file implementation or when the
-cgroup is removed.
-
-To unregister a notification handler just close eventfd.
-
-NOTE: Support of notifications should be implemented for the control
-file. See documentation for the subsystem.
3. Kernel API
=============
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index e2bc132608f..2622115276a 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -577,7 +577,7 @@ Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
per-node page counts including "hierarchical_<counter>" which sums up all
hierarchical children's values in addition to the memcg's own value.
-The ouput format of memory.numa_stat is:
+The output format of memory.numa_stat is:
total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
@@ -670,7 +670,7 @@ page tables.
8.1 Interface
-This feature is disabled by default. It can be enabledi (and disabled again) by
+This feature is disabled by default. It can be enabled (and disabled again) by
writing to memory.move_charge_at_immigrate of the destination cgroup.
If you want to enable it:
diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt
index c4d99ed0b41..52e1da16a30 100644
--- a/Documentation/cgroups/resource_counter.txt
+++ b/Documentation/cgroups/resource_counter.txt
@@ -97,8 +97,8 @@ to work with it.
(struct res_counter *rc, struct res_counter *top,
unsinged long val)
- Almost same as res_cunter_uncharge() but propagation of uncharge
- stops when rc == top. This is useful when kill a res_coutner in
+ Almost same as res_counter_uncharge() but propagation of uncharge
+ stops when rc == top. This is useful when kill a res_counter in
child cgroup.
2.1 Other accounting routines
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 06534049afb..a760857e6b6 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1303,13 +1303,10 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
-static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *sf)
+static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
{
- struct blkcg *blkcg = css_to_blkcg(css);
-
- blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
- cft->private, true);
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
+ &blkcg_policy_throtl, seq_cft(sf)->private, true);
return 0;
}
@@ -1335,19 +1332,17 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
return __blkg_prfill_u64(sf, pd, v);
}
-static int tg_print_conf_u64(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *sf)
+static int tg_print_conf_u64(struct seq_file *sf, void *v)
{
- blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64,
- &blkcg_policy_throtl, cft->private, false);
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
+ &blkcg_policy_throtl, seq_cft(sf)->private, false);
return 0;
}
-static int tg_print_conf_uint(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *sf)
+static int tg_print_conf_uint(struct seq_file *sf, void *v)
{
- blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint,
- &blkcg_policy_throtl, cft->private, false);
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
+ &blkcg_policy_throtl, seq_cft(sf)->private, false);
return 0;
}
@@ -1428,40 +1423,40 @@ static struct cftype throtl_files[] = {
{
.name = "throttle.read_bps_device",
.private = offsetof(struct throtl_grp, bps[READ]),
- .read_seq_string = tg_print_conf_u64,
+ .seq_show = tg_print_conf_u64,
.write_string = tg_set_conf_u64,
.max_write_len = 256,
},
{
.name = "throttle.write_bps_device",
.private = offsetof(struct throtl_grp, bps[WRITE]),
- .read_seq_string = tg_print_conf_u64,
+ .seq_show = tg_print_conf_u64,
.write_string = tg_set_conf_u64,
.max_write_len = 256,
},
{
.name = "throttle.read_iops_device",
.private = offsetof(struct throtl_grp, iops[READ]),
- .read_seq_string = tg_print_conf_uint,
+ .seq_show = tg_print_conf_uint,
.write_string = tg_set_conf_uint,
.max_write_len = 256,
},
{
.name = "throttle.write_iops_device",
.private = offsetof(struct throtl_grp, iops[WRITE]),
- .read_seq_string = tg_print_conf_uint,
+ .seq_show = tg_print_conf_uint,
.write_string = tg_set_conf_uint,
.max_write_len = 256,
},
{
.name = "throttle.io_service_bytes",
.private = offsetof(struct tg_stats_cpu, service_bytes),
- .read_seq_string = tg_print_cpu_rwstat,
+ .seq_show = tg_print_cpu_rwstat,
},
{
.name = "throttle.io_serviced",
.private = offsetof(struct tg_stats_cpu, serviced),
- .read_seq_string = tg_print_cpu_rwstat,
+ .seq_show = tg_print_cpu_rwstat,
},
{ } /* terminate */
};
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 4d5cec1ad80..744833b630c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1632,11 +1632,11 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf,
return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
}
-static int cfqg_print_weight_device(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *sf)
+static int cfqg_print_weight_device(struct seq_file *sf, void *v)
{
- blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device,
- &blkcg_policy_cfq, 0, false);
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ cfqg_prfill_weight_device, &blkcg_policy_cfq,
+ 0, false);
return 0;
}
@@ -1650,26 +1650,23 @@ static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
}
-static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css,
- struct cftype *cft,
- struct seq_file *sf)
+static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
{
- blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device,
- &blkcg_policy_cfq, 0, false);
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
+ 0, false);
return 0;
}
-static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft,
- struct seq_file *sf)
+static int cfq_print_weight(struct seq_file *sf, void *v)
{
- seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight);
+ seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight);
return 0;
}
-static int cfq_print_leaf_weight(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *sf)
+static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
{
- seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight);
+ seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight);
return 0;
}
@@ -1762,23 +1759,17 @@ static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
return __cfq_set_weight(css, cft, val, true);
}
-static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft,
- struct seq_file *sf)
+static int cfqg_print_stat(struct seq_file *sf, void *v)
{
- struct blkcg *blkcg = css_to_blkcg(css);
-
- blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq,
- cft->private, false);
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
+ &blkcg_policy_cfq, seq_cft(sf)->private, false);
return 0;
}
-static int cfqg_print_rwstat(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *sf)
+static int cfqg_print_rwstat(struct seq_file *sf, void *v)
{
- struct blkcg *blkcg = css_to_blkcg(css);
-
- blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq,
- cft->private, true);
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
+ &blkcg_policy_cfq, seq_cft(sf)->private, true);
return 0;
}
@@ -1798,23 +1789,19 @@ static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
return __blkg_prfill_rwstat(sf, pd, &sum);
}
-static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *sf)
+static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)
{
- struct blkcg *blkcg = css_to_blkcg(css);
-
- blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive,
- &blkcg_policy_cfq, cft->private, false);
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
+ seq_cft(sf)->private, false);
return 0;
}
-static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *sf)
+static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
{
- struct blkcg *blkcg = css_to_blkcg(css);
-
- blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive,
- &blkcg_policy_cfq, cft->private, true);
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
+ seq_cft(sf)->private, true);
return 0;
}
@@ -1835,13 +1822,11 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
}
/* print avg_queue_size */
-static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *sf)
+static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
{
- struct blkcg *blkcg = css_to_blkcg(css);
-
- blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
- &blkcg_policy_cfq, 0, false);
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
+ 0, false);
return 0;
}
#endif /* CONFIG_DEBUG_BLK_CGROUP */
@@ -1851,14 +1836,14 @@ static struct cftype cfq_blkcg_files[] = {
{
.name = "weight_device",
.flags = CFTYPE_ONLY_ON_ROOT,
- .read_seq_string = cfqg_print_leaf_weight_device,
+ .seq_show = cfqg_print_leaf_weight_device,
.write_string = cfqg_set_leaf_weight_device,
.max_write_len = 256,
},
{
.name = "weight",
.flags = CFTYPE_ONLY_ON_ROOT,
- .read_seq_string = cfq_print_leaf_weight,
+ .seq_show = cfq_print_leaf_weight,
.write_u64 = cfq_set_leaf_weight,
},
@@ -1866,26 +1851,26 @@ static struct cftype cfq_blkcg_files[] = {
{
.name = "weight_device",
.flags = CFTYPE_NOT_ON_ROOT,
- .read_seq_string = cfqg_print_weight_device,
+ .seq_show = cfqg_print_weight_device,
.write_string = cfqg_set_weight_device,
.max_write_len = 256,
},
{
.name = "weight",
.flags = CFTYPE_NOT_ON_ROOT,
- .read_seq_string = cfq_print_weight,
+ .seq_show = cfq_print_weight,
.write_u64 = cfq_set_weight,
},
{
.name = "leaf_weight_device",
- .read_seq_string = cfqg_print_leaf_weight_device,
+ .seq_show = cfqg_print_leaf_weight_device,
.write_string = cfqg_set_leaf_weight_device,
.max_write_len = 256,
},
{
.name = "leaf_weight",
- .read_seq_string = cfq_print_leaf_weight,
+ .seq_show = cfq_print_leaf_weight,
.write_u64 = cfq_set_leaf_weight,
},
@@ -1893,114 +1878,114 @@ static struct cftype cfq_blkcg_files[] = {
{
.name = "time",
.private = offsetof(struct cfq_group, stats.time),
- .read_seq_string = cfqg_print_stat,
+ .seq_show = cfqg_print_stat,
},
{
.name = "sectors",
.private = offsetof(struct cfq_group, stats.sectors),
- .read_seq_string = cfqg_print_stat,
+ .seq_show = cfqg_print_stat,
},
{
.name = "io_service_bytes",
.private = offsetof(struct cfq_group, stats.service_bytes),
- .read_seq_string = cfqg_print_rwstat,
+ .seq_show = cfqg_print_rwstat,
},
{
.name = "io_serviced",
.private = offsetof(struct cfq_group, stats.serviced),
- .read_seq_string = cfqg_print_rwstat,
+ .seq_show = cfqg_print_rwstat,
},
{
.name = "io_service_time",
.private = offsetof(struct cfq_group, stats.service_time),
- .read_seq_string = cfqg_print_rwstat,
+ .seq_show = cfqg_print_rwstat,
},
{
.name = "io_wait_time",
.private = offsetof(struct cfq_group, stats.wait_time),
- .read_seq_string = cfqg_print_rwstat,
+ .seq_show = cfqg_print_rwstat,
},
{
.name = "io_merged",
.private = offsetof(struct cfq_group, stats.merged),
- .read_seq_string = cfqg_print_rwstat,
+ .seq_show = cfqg_print_rwstat,
},
{
.name = "io_queued",
.private = offsetof(struct cfq_group, stats.queued),
- .read_seq_string = cfqg_print_rwstat,
+ .seq_show = cfqg_print_rwstat,
},
/* the same statictics which cover the cfqg and its descendants */
{
.name = "time_recursive",
.private = offsetof(struct cfq_group, stats.time),
- .read_seq_string = cfqg_print_stat_recursive,
+ .seq_show = cfqg_print_stat_recursive,
},
{
.name = "sectors_recursive",
.private = offsetof(struct cfq_group, stats.sectors),
- .read_seq_string = cfqg_print_stat_recursive,
+ .seq_show = cfqg_print_stat_recursive,
},
{
.name = "io_service_bytes_recursive",
.private = offsetof(struct cfq_group, stats.service_bytes),
- .read_seq_string = cfqg_print_rwstat_recursive,
+ .seq_show = cfqg_print_rwstat_recursive,
},
{
.name = "io_serviced_recursive",
.private = offsetof(struct cfq_group, stats.serviced),
- .read_seq_string = cfqg_print_rwstat_recursive,
+ .seq_show = cfqg_print_rwstat_recursive,
},
{
.name = "io_service_time_recursive",
.private = offsetof(struct cfq_group, stats.service_time),
- .read_seq_string = cfqg_print_rwstat_recursive,
+ .seq_show = cfqg_print_rwstat_recursive,
},
{
.name = "io_wait_time_recursive",
.private = offsetof(struct cfq_group, stats.wait_time),
- .read_seq_string = cfqg_print_rwstat_recursive,
+ .seq_show = cfqg_print_rwstat_recursive,
},
{
.name = "io_merged_recursive",
.private = offsetof(struct cfq_group, stats.merged),
- .read_seq_string = cfqg_print_rwstat_recursive,
+ .seq_show = cfqg_print_rwstat_recursive,
},
{
.name = "io_queued_recursive",
.private = offsetof(struct cfq_group, stats.queued),
- .read_seq_string = cfqg_print_rwstat_recursive,
+ .seq_show = cfqg_print_rwstat_recursive,
},
#ifdef CONFIG_DEBUG_BLK_CGROUP
{
.name = "avg_queue_size",
- .read_seq_string = cfqg_print_avg_queue_size,
+ .seq_show = cfqg_print_avg_queue_size,
},
{
.name = "group_wait_time",
.private = offsetof(struct cfq_group, stats.group_wait_time),
- .read_seq_string = cfqg_print_stat,
+ .seq_show = cfqg_print_stat,
},
{
.name = "idle_time",
.private = offsetof(struct cfq_group, stats.idle_time),
- .read_seq_string = cfqg_print_stat,
+ .seq_show = cfqg_print_stat,
},
{
.name = "empty_time",
.private = offsetof(struct cfq_group, stats.empty_time),
- .read_seq_string = cfqg_print_stat,
+ .seq_show = cfqg_print_stat,
},
{
.name = "dequeue",
.private = offsetof(struct cfq_group, stats.dequeue),
- .read_seq_string = cfqg_print_stat,
+ .seq_show = cfqg_print_stat,
},
{
.name = "unaccounted_time",
.private = offsetof(struct cfq_group, stats.unaccounted_time),
- .read_seq_string = cfqg_print_stat,
+ .seq_show = cfqg_print_stat,
},
#endif /* CONFIG_DEBUG_BLK_CGROUP */
{ } /* terminate */
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index fbcc851ed5a..61bcfc21d2a 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -163,7 +163,6 @@ static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
static void bcachecg_destroy(struct cgroup *cgroup)
{
struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
- free_css_id(&bcache_subsys, &cg->css);
kfree(cg);
}
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 39c1d946967..5c097596104 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -21,6 +21,7 @@
#include <linux/xattr.h>
#include <linux/fs.h>
#include <linux/percpu-refcount.h>
+#include <linux/seq_file.h>
#ifdef CONFIG_CGROUPS
@@ -28,8 +29,6 @@ struct cgroupfs_root;
struct cgroup_subsys;
struct inode;
struct cgroup;
-struct css_id;
-struct eventfd_ctx;
extern int cgroup_init_early(void);
extern int cgroup_init(void);
@@ -79,8 +78,6 @@ struct cgroup_subsys_state {
struct cgroup_subsys_state *parent;
unsigned long flags;
- /* ID for this css, if possible */
- struct css_id __rcu *id;
/* percpu_ref killing and RCU release */
struct rcu_head rcu_head;
@@ -239,10 +236,6 @@ struct cgroup {
struct rcu_head rcu_head;
struct work_struct destroy_work;
- /* List of events which userspace want to receive */
- struct list_head event_list;
- spinlock_t event_list_lock;
-
/* directory xattrs */
struct simple_xattrs xattrs;
};
@@ -280,6 +273,9 @@ enum {
* - "tasks" is removed. Everything should be at process
* granularity. Use "cgroup.procs" instead.
*
+ * - "cgroup.procs" is not sorted. pids will be unique unless they
+ * got recycled in between reads.
+ *
* - "release_agent" and "notify_on_release" are removed.
* Replacement notification mechanism will be implemented.
*
@@ -320,9 +316,6 @@ struct cgroupfs_root {
/* Unique id for this hierarchy. */
int hierarchy_id;
- /* A list running through the attached subsystems */
- struct list_head subsys_list;
-
/* The root cgroup for this hierarchy */
struct cgroup top_cgroup;
@@ -389,16 +382,6 @@ struct css_set {
};
/*
- * cgroup_map_cb is an abstract callback API for reporting map-valued
- * control files
- */
-
-struct cgroup_map_cb {
- int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value);
- void *state;
-};
-
-/*
* struct cftype: handler definitions for cgroup control files
*
* When reading/writing to a file:
@@ -445,10 +428,6 @@ struct cftype {
*/
struct cgroup_subsys *ss;
- int (*open)(struct inode *inode, struct file *file);
- ssize_t (*read)(struct cgroup_subsys_state *css, struct cftype *cft,
- struct file *file,
- char __user *buf, size_t nbytes, loff_t *ppos);
/*
* read_u64() is a shortcut for the common case of returning a
* single integer. Use it in place of read()
@@ -458,24 +437,14 @@ struct cftype {
* read_s64() is a signed version of read_u64()
*/
s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
- /*
- * read_map() is used for defining a map of key/value
- * pairs. It should call cb->fill(cb, key, value) for each
- * entry. The key/value pairs (and their ordering) should not
- * change between reboots.
- */
- int (*read_map)(struct cgroup_subsys_state *css, struct cftype *cft,
- struct cgroup_map_cb *cb);
- /*
- * read_seq_string() is used for outputting a simple sequence
- * using seqfile.
- */
- int (*read_seq_string)(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *m);
- ssize_t (*write)(struct cgroup_subsys_state *css, struct cftype *cft,
- struct file *file,
- const char __user *buf, size_t nbytes, loff_t *ppos);
+ /* generic seq_file read interface */
+ int (*seq_show)(struct seq_file *sf, void *v);
+
+ /* optional ops, implement all or none */
+ void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
+ void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
+ void (*seq_stop)(struct seq_file *sf, void *v);
/*
* write_u64() is a shortcut for the common case of accepting
@@ -504,27 +473,6 @@ struct cftype {
* kick type for multiplexing.
*/
int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);
-
- int (*release)(struct inode *inode, struct file *file);
-
- /*
- * register_event() callback will be used to add new userspace
- * waiter for changes related to the cftype. Implement it if
- * you want to provide this functionality. Use eventfd_signal()
- * on eventfd to send notification to userspace.
- */
- int (*register_event)(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd,
- const char *args);
- /*
- * unregister_event() callback will be called when userspace
- * closes the eventfd or on cgroup removing.
- * This callback must be implemented, if you want provide
- * notification functionality.
- */
- void (*unregister_event)(struct cgroup_subsys_state *css,
- struct cftype *cft,
- struct eventfd_ctx *eventfd);
};
/*
@@ -538,6 +486,26 @@ struct cftype_set {
};
/*
+ * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. Don't
+ * access directly.
+ */
+struct cfent {
+ struct list_head node;
+ struct dentry *dentry;
+ struct cftype *type;
+ struct cgroup_subsys_state *css;
+
+ /* file xattrs */
+ struct simple_xattrs xattrs;
+};
+
+/* seq_file->private points to the following, only ->priv is public */
+struct cgroup_open_file {
+ struct cfent *cfe;
+ void *priv;
+};
+
+/*
* See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This
* function can be called as long as @cgrp is accessible.
*/
@@ -552,6 +520,18 @@ static inline const char *cgroup_name(const struct cgroup *cgrp)
return rcu_dereference(cgrp->name)->name;
}
+static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
+{
+ struct cgroup_open_file *of = seq->private;
+ return of->cfe->css;
+}
+
+static inline struct cftype *seq_cft(struct seq_file *seq)
+{
+ struct cgroup_open_file *of = seq->private;
+ return of->cfe->type;
+}
+
int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_rm_cftypes(struct cftype *cfts);
@@ -631,12 +611,8 @@ struct cgroup_subsys {
#define MAX_CGROUP_TYPE_NAMELEN 32
const char *name;
- /*
- * Link to parent, and list entry in parent's children.
- * Protected by cgroup_lock()
- */
+ /* link to parent, protected by cgroup_lock() */
struct cgroupfs_root *root;
- struct list_head sibling;
/* list of cftype_sets */
struct list_head cftsets;
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 3f3788d4936..3e4535876d3 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -7,6 +7,7 @@
#include <linux/gfp.h>
#include <linux/types.h>
#include <linux/cgroup.h>
+#include <linux/eventfd.h>
struct vmpressure {
unsigned long scanned;
@@ -33,13 +34,10 @@ extern void vmpressure_init(struct vmpressure *vmpr);
extern void vmpressure_cleanup(struct vmpressure *vmpr);
extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
-extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css);
-extern int vmpressure_register_event(struct cgroup_subsys_state *css,
- struct cftype *cft,
+extern int vmpressure_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd,
const char *args);
-extern void vmpressure_unregister_event(struct cgroup_subsys_state *css,
- struct cftype *cft,
+extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd);
#else
static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
diff --git a/init/Kconfig b/init/Kconfig
index 5236dc562a3..8d402e33b7f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -854,7 +854,6 @@ config NUMA_BALANCING
menuconfig CGROUPS
boolean "Control Group support"
- depends on EVENTFD
help
This option adds support for grouping sets of processes together, for
use with process control subsystems such as Cpusets, CFS, memory
@@ -921,6 +920,7 @@ config MEMCG
bool "Memory Resource Controller for Control Groups"
depends on RESOURCE_COUNTERS
select MM_OWNER
+ select EVENTFD
help
Provides a memory resource controller that manages both anonymous
memory and page cache. (See Documentation/cgroups/memory.txt)
@@ -1160,7 +1160,6 @@ config UIDGID_STRICT_TYPE_CHECKS
config SCHED_AUTOGROUP
bool "Automatic process group scheduling"
- select EVENTFD
select CGROUPS
select CGROUP_SCHED
select FAIR_GROUP_SCHED
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bc1dcabe921..e2f46ba37f7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -41,7 +41,6 @@
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/backing-dev.h>
-#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include <linux/spinlock.h>
@@ -56,15 +55,20 @@
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
-#include <linux/eventfd.h>
-#include <linux/poll.h>
#include <linux/flex_array.h> /* used in cgroup_attach_task */
#include <linux/kthread.h>
-#include <linux/file.h>
#include <linux/atomic.h>
/*
+ * pidlists linger the following amount before being destroyed. The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls.
+ * Expiring in the middle is a performance problem, not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY HZ
+
+/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*
@@ -89,6 +93,19 @@ static DEFINE_MUTEX(cgroup_mutex);
static DEFINE_MUTEX(cgroup_root_mutex);
+#define cgroup_assert_mutex_or_rcu_locked() \
+ rcu_lockdep_assert(rcu_read_lock_held() || \
+ lockdep_is_held(&cgroup_mutex), \
+ "cgroup_mutex or RCU read lock required");
+
+#ifdef CONFIG_LOCKDEP
+#define cgroup_assert_mutex_or_root_locked() \
+ WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \
+ !lockdep_is_held(&cgroup_root_mutex)))
+#else
+#define cgroup_assert_mutex_or_root_locked() do { } while (0)
+#endif
+
/*
* cgroup destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup
@@ -98,6 +115,12 @@ static DEFINE_MUTEX(cgroup_root_mutex);
static struct workqueue_struct *cgroup_destroy_wq;
/*
+ * pidlist destructions need to be flushed on cgroup destruction. Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
+
+/*
* Generate an array of cgroup subsystem pointers. At boot time, this is
* populated with the built in subsystems, and modular subsystems are
* registered after that. The mutable section of this array is protected by
@@ -119,49 +142,6 @@ static struct cgroupfs_root cgroup_dummy_root;
/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
-/*
- * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
- */
-struct cfent {
- struct list_head node;
- struct dentry *dentry;
- struct cftype *type;
- struct cgroup_subsys_state *css;
-
- /* file xattrs */
- struct simple_xattrs xattrs;
-};
-
-/*
- * cgroup_event represents events which userspace want to receive.
- */
-struct cgroup_event {
- /*
- * css which the event belongs to.
- */
- struct cgroup_subsys_state *css;
- /*
- * Control file which the event associated.
- */
- struct cftype *cft;
- /*
- * eventfd to signal userspace about the event.
- */
- struct eventfd_ctx *eventfd;
- /*
- * Each of these stored in a list by the cgroup.
- */
- struct list_head list;
- /*
- * All fields below needed to unregister event when
- * userspace closes eventfd.
- */
- poll_table pt;
- wait_queue_head_t *wqh;
- wait_queue_t wait;
- struct work_struct remove;
-};
-
/* The list of hierarchy roots */
static LIST_HEAD(cgroup_roots);
@@ -200,6 +180,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
static int cgroup_file_release(struct inode *inode, struct file *file);
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
/**
* cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -262,16 +243,32 @@ static int notify_on_release(const struct cgroup *cgrp)
}
/**
+ * for_each_css - iterate all css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_mutex.
+ */
+#define for_each_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = rcu_dereference_check( \
+ (cgrp)->subsys[(ssid)], \
+ lockdep_is_held(&cgroup_mutex)))) { } \
+ else
+
+/**
* for_each_subsys - iterate all loaded cgroup subsystems
* @ss: the iteration cursor
- * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
*
- * Should be called under cgroup_mutex.
+ * Iterates through all loaded subsystems. Should be called under
+ * cgroup_mutex or cgroup_root_mutex.
*/
-#define for_each_subsys(ss, i) \
- for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \
- if (({ lockdep_assert_held(&cgroup_mutex); \
- !((ss) = cgroup_subsys[i]); })) { } \
+#define for_each_subsys(ss, ssid) \
+ for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \
+ (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((ss) = cgroup_subsys[(ssid)])) { } \
else
/**
@@ -286,10 +283,6 @@ static int notify_on_release(const struct cgroup *cgrp)
for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
(((ss) = cgroup_subsys[i]) || true); (i)++)
-/* iterate each subsystem attached to a hierarchy */
-#define for_each_root_subsys(root, ss) \
- list_for_each_entry((ss), &(root)->subsys_list, sibling)
-
/* iterate across the active hierarchies */
#define for_each_active_root(root) \
list_for_each