2 files changed, 125 insertions, 163 deletions
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 79285a36035..bde53da9cd8 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -27,11 +27,7 @@
 
 #include <linux/fdtable.h>
 
-#define PRIOIDX_SZ 128
-
-static unsigned long prioidx_map[PRIOIDX_SZ];
-static DEFINE_SPINLOCK(prioidx_map_lock);
-static atomic_t max_prioidx = ATOMIC_INIT(0);
+#define PRIOMAP_MIN_SZ		128
 
 static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp)
 {
@@ -39,136 +35,157 @@ static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgr
 			    struct cgroup_netprio_state, css);
 }
 
-static int get_prioidx(u32 *prio)
-{
-	unsigned long flags;
-	u32 prioidx;
-
-	spin_lock_irqsave(&prioidx_map_lock, flags);
-	prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ);
-	if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) {
-		spin_unlock_irqrestore(&prioidx_map_lock, flags);
-		return -ENOSPC;
-	}
-	set_bit(prioidx, prioidx_map);
-	if (atomic_read(&max_prioidx) < prioidx)
-		atomic_set(&max_prioidx, prioidx);
-	spin_unlock_irqrestore(&prioidx_map_lock, flags);
-	*prio = prioidx;
-	return 0;
-}
-
-static void put_prioidx(u32 idx)
+/*
+ * Extend @dev->priomap so that it's large enough to accomodate
+ * @target_idx.  @dev->priomap.priomap_len > @target_idx after successful
+ * return.  Must be called under rtnl lock.
+ */
+static int extend_netdev_table(struct net_device *dev, u32 target_idx)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&prioidx_map_lock, flags);
-	clear_bit(idx, prioidx_map);
-	spin_unlock_irqrestore(&prioidx_map_lock, flags);
-}
+	struct netprio_map *old, *new;
+	size_t new_sz, new_len;
 
-static int extend_netdev_table(struct net_device *dev, u32 new_len)
-{
-	size_t new_size = sizeof(struct netprio_map) +
-			   ((sizeof(u32) * new_len));
-	struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL);
-	struct netprio_map *old_priomap;
+	/* is the existing priomap large enough? */
+	old = rtnl_dereference(dev->priomap);
+	if (old && old->priomap_len > target_idx)
+		return 0;
 
-	old_priomap  = rtnl_dereference(dev->priomap);
+	/*
+	 * Determine the new size.  Let's keep it power-of-two.  We start
+	 * from PRIOMAP_MIN_SZ and double it until it's large enough to
+	 * accommodate @target_idx.
+	 */
+	new_sz = PRIOMAP_MIN_SZ;
+	while (true) {
+		new_len = (new_sz - offsetof(struct netprio_map, priomap)) /
+			sizeof(new->priomap[0]);
+		if (new_len > target_idx)
+			break;
+		new_sz *= 2;
+		/* overflowed? */
+		if (WARN_ON(new_sz < PRIOMAP_MIN_SZ))
+			return -ENOSPC;
+	}
 
-	if (!new_priomap) {
+	/* allocate & copy */
+	new = kzalloc(new_sz, GFP_KERNEL);
+	if (!new) {
 		pr_warn("Unable to alloc new priomap!\n");
 		return -ENOMEM;
 	}
 
-	if (old_priomap)
-		memcpy(new_priomap->priomap, old_priomap->priomap,
-		       old_priomap->priomap_len *
-		       sizeof(old_priomap->priomap[0]));
+	if (old)
+		memcpy(new->priomap, old->priomap,
+		       old->priomap_len * sizeof(old->priomap[0]));
 
-	new_priomap->priomap_len = new_len;
+	new->priomap_len = new_len;
 
-	rcu_assign_pointer(dev->priomap, new_priomap);
-	if (old_priomap)
-		kfree_rcu(old_priomap, rcu);
+	/* install the new priomap */
+	rcu_assign_pointer(dev->priomap, new);
+	if (old)
+		kfree_rcu(old, rcu);
 	return 0;
 }
 
-static int write_update_netdev_table(struct net_device *dev)
+/**
+ * netprio_prio - return the effective netprio of a cgroup-net_device pair
+ * @cgrp: cgroup part of the target pair
+ * @dev: net_device part of the target pair
+ *
+ * Should be called under RCU read or rtnl lock.
+ */
+static u32 netprio_prio(struct cgroup *cgrp, struct net_device *dev)
+{
+	struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
+
+	if (map && cgrp->id < map->priomap_len)
+		return map->priomap[cgrp->id];
+	return 0;
+}
+
+/**
+ * netprio_set_prio - set netprio on a cgroup-net_device pair
+ * @cgrp: cgroup part of the target pair
+ * @dev: net_device part of the target pair
+ * @prio: prio to set
+ *
+ * Set netprio to @prio on @cgrp-@dev pair.  Should be called under rtnl
+ * lock and may fail under memory pressure for non-zero @prio.
+ */
+static int netprio_set_prio(struct cgroup *cgrp, struct net_device *dev,
+			    u32 prio)
 {
-	int ret = 0;
-	u32 max_len;
 	struct netprio_map *map;
+	int ret;
 
-	max_len = atomic_read(&max_prioidx) + 1;
+	/* avoid extending priomap for zero writes */
 	map = rtnl_dereference(dev->priomap);
-	if (!map || map->priomap_len < max_len)
-		ret = extend_netdev_table(dev, max_len);
+	if (!prio && (!map || map->priomap_len <= cgrp->id))
+		return 0;
 
-	return ret;
+	ret = extend_netdev_table(dev, cgrp->id);
+	if (ret)
+		return ret;
+
+	map = rtnl_dereference(dev->priomap);
+	map->priomap[cgrp->id] = prio;
+	return 0;
 }
 
-static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp)
+static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp)
 {
 	struct cgroup_netprio_state *cs;
-	int ret = -EINVAL;
 
 	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
 	if (!cs)
 		return ERR_PTR(-ENOMEM);
 
-	if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx)
-		goto out;
-
-	ret = get_prioidx(&cs->prioidx);
-	if (ret < 0) {
-		pr_warn("No space in priority index array\n");
-		goto out;
-	}
-
 	return &cs->css;
-out:
-	kfree(cs);
-	return ERR_PTR(ret);
 }
 
-static void cgrp_destroy(struct cgroup *cgrp)
+static int cgrp_css_online(struct cgroup *cgrp)
 {
-	struct cgroup_netprio_state *cs;
+	struct cgroup *parent = cgrp->parent;
 	struct net_device *dev;
-	struct netprio_map *map;
+	int ret = 0;
+
+	if (!parent)
+		return 0;
 
-	cs = cgrp_netprio_state(cgrp);
 	rtnl_lock();
+	/*
+	 * Inherit prios from the parent.  As all prios are set during
+	 * onlining, there is no need to clear them on offline.
+	 */
 	for_each_netdev(&init_net, dev) {
-		map = rtnl_dereference(dev->priomap);
-		if (map && cs->prioidx < map->priomap_len)
-			map->priomap[cs->prioidx] = 0;
+		u32 prio = netprio_prio(parent, dev);
+
+		ret = netprio_set_prio(cgrp, dev, prio);
+		if (ret)
+			break;
 	}
 	rtnl_unlock();
-	put_prioidx(cs->prioidx);
-	kfree(cs);
+	return ret;
+}
+
+static void cgrp_css_free(struct cgroup *cgrp)
+{
+	kfree(cgrp_netprio_state(cgrp));
 }
 
 static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft)
 {
-	return (u64)cgrp_netprio_state(cgrp)->prioidx;
+	return cgrp->id;
 }
 
 static int read_priomap(struct cgroup *cont, struct cftype *cft,
 			struct cgroup_map_cb *cb)
 {
 	struct net_device *dev;
-	u32 prioidx = cgrp_netprio_state(cont)->prioidx;
-	u32 priority;
-	struct netprio_map *map;
 
 	rcu_read_lock();
-	for_each_netdev_rcu(&init_net, dev) {
-		map = rcu_dereference(dev->priomap);
-		priority = (map && prioidx < map->priomap_len) ? map->priomap[prioidx] : 0;
-		cb->fill(cb, dev->name, priority);
-	}
+	for_each_netdev_rcu(&init_net, dev)
+		cb->fill(cb, dev->name, netprio_prio(cont, dev));
 	rcu_read_unlock();
 	return 0;
 }
@@ -176,66 +193,24 @@ static int read_priomap(struct cgroup *cont, struct cftype *cft,
 static int write_priomap(struct cgroup *cgrp, struct cftype *cft,
 			 const char *buffer)
 {
-	char *devname = kstrdup(buffer, GFP_KERNEL);
-	int ret = -EINVAL;
-	u32 prioidx = cgrp_netprio_state(cgrp)->prioidx;
-	unsigned long priority;
-	char *priostr;
+	char devname[IFNAMSIZ + 1];
 	struct net_device *dev;
-	struct netprio_map *map;
-
-	if (!devname)
-		return -ENOMEM;
-
-	/*
-	 * Minimally sized valid priomap string
-	 */
-	if (strlen(devname) < 3)
-		goto out_free_devname;
-
-	priostr = strstr(devname, " ");
-	if (!priostr)
-		goto out_free_devname;
-
-	/*
-	 *Separate the devname from the associated priority
-	 *and advance the priostr pointer to the priority value
-	 */
-	*priostr = '\0';
-	priostr++;
-
-	/*
-	 * If the priostr points to NULL, we're at the end of the passed
-	 * in string, and its not a valid write
-	 */
-	if (*priostr == '\0')
-		goto out_free_devname;
-
-	ret = kstrtoul(priostr, 10, &priority);
-	if (ret < 0)
-		goto out_free_devname;
+	u32 prio;
+	int ret;
 
-	ret = -ENODEV;
+	if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
+		return -EINVAL;
 
 	dev = dev_get_by_name(&init_net, devname);
 	if (!dev)
-		goto out_free_devname;
+		return -ENODEV;
 
 	rtnl_lock();
-	ret = write_update_netdev_table(dev);
-	if (ret < 0)
-		goto out_put_dev;
 
-	map = rtnl_dereference(dev->priomap);
-	if (map)
-		map->priomap[prioidx] = priority;
+	ret = netprio_set_prio(cgrp, dev, prio);
 
-out_put_dev:
 	rtnl_unlock();
 	dev_put(dev);
-
-out_free_devname:
-	kfree(devname);
 	return ret;
 }
 
@@ -276,22 +251,13 @@ static struct cftype ss_files[] = {
 
 struct cgroup_subsys net_prio_subsys = {
 	.name		= "net_prio",
-	.create		= cgrp_create,
-	.destroy	= cgrp_destroy,
+	.css_alloc	= cgrp_css_alloc,
+	.css_online	= cgrp_css_online,
+	.css_free	= cgrp_css_free,
 	.attach		= net_prio_attach,
 	.subsys_id	= net_prio_subsys_id,
 	.base_cftypes	= ss_files,
 	.module		= THIS_MODULE,
-
-	/*
-	 * net_prio has artificial limit on the number of cgroups and
-	 * disallows nesting making it impossible to co-mount it with other
-	 * hierarchical subsystems.  Remove the artificially low PRIOIDX_SZ
-	 * limit and properly nest configuration such that children follow
-	 * their parents' configurations by default and are allowed to
-	 * override and remove the following.
-	 */
-	.broken_hierarchy = true,
 };
 
 static int netprio_device_event(struct notifier_block *unused,
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 2ecde225ae6..31f06b63357 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -34,21 +34,25 @@ static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p)
 			    struct cgroup_cls_state, css);
 }
 
-static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp)
+static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp)
 {
 	struct cgroup_cls_state *cs;
 
 	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
 	if (!cs)
 		return ERR_PTR(-ENOMEM);
+	return &cs->css;
+}
 
+static int cgrp_css_online(struct cgroup *cgrp)
+{
 	if (cgrp->parent)
-		cs->classid = cgrp_cls_state(cgrp->parent)->classid;
-
-	return &cs->css;
+		cgrp_cls_state(cgrp)->classid =
+			cgrp_cls_state(cgrp->parent)->classid;
+	return 0;
 }
 
-static void cgrp_destroy(struct cgroup *cgrp)
+static void cgrp_css_free(struct cgroup *cgrp)
 {
 	kfree(cgrp_cls_state(cgrp));
 }
@@ -75,20 +79,12 @@ static struct cftype ss_files[] = {
 
 struct cgroup_subsys net_cls_subsys = {
 	.name		= "net_cls",
-	.create		= cgrp_create,
-	.destroy	= cgrp_destroy,
+	.css_alloc	= cgrp_css_alloc,
+	.css_online	= cgrp_css_online,
+	.css_free	= cgrp_css_free,
 	.subsys_id	= net_cls_subsys_id,
 	.base_cftypes	= ss_files,
 	.module		= THIS_MODULE,
-
-	/*
-	 * While net_cls cgroup has the rudimentary hierarchy support of
-	 * inheriting the parent's classid on cgroup creation, it doesn't
-	 * properly propagates config changes in ancestors to their
-	 * descendents.  A child should follow the parent's configuration
-	 * but be allowed to override it.  Fix it and remove the following.
-	 */
-	.broken_hierarchy = true,
 };
 
 struct cls_cgroup_head {