55 files changed, 7295 insertions, 2420 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 2590e91b328..a1a8e29e5fc 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -250,6 +250,64 @@ config NET_SCH_QFQ
 
 	  If unsure, say N.
 
+config NET_SCH_CODEL
+	tristate "Controlled Delay AQM (CODEL)"
+	help
+	  Say Y here if you want to use the Controlled Delay (CODEL)
+	  packet scheduling algorithm.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_codel.
+
+	  If unsure, say N.
+
+config NET_SCH_FQ_CODEL
+	tristate "Fair Queue Controlled Delay AQM (FQ_CODEL)"
+	help
+	  Say Y here if you want to use the FQ Controlled Delay (FQ_CODEL)
+	  packet scheduling algorithm.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_fq_codel.
+
+	  If unsure, say N.
+
+config NET_SCH_FQ
+	tristate "Fair Queue"
+	help
+	  Say Y here if you want to use the FQ packet scheduling algorithm.
+
+	  FQ does flow separation, and is able to respect pacing requirements
+	  set by TCP stack into sk->sk_pacing_rate (for locally generated
+	  traffic).
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_fq.
+
+	  If unsure, say N.
+
+config NET_SCH_HHF
+	tristate "Heavy-Hitter Filter (HHF)"
+	help
+	  Say Y here if you want to use the Heavy-Hitter Filter (HHF)
+	  packet scheduling algorithm.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_hhf.
+
+config NET_SCH_PIE
+	tristate "Proportional Integral controller Enhanced (PIE) scheduler"
+	help
+	  Say Y here if you want to use the Proportional Integral controller
+	  Enhanced (PIE) packet scheduling algorithm.
+	  For more information, please see
+	  http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_pie.
+
+	  If unsure, say N.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
@@ -260,6 +318,32 @@ config NET_SCH_INGRESS
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_ingress.
 
+config NET_SCH_PLUG
+	tristate "Plug network traffic until release (PLUG)"
+	---help---
+
+	  This queuing discipline allows userspace to plug/unplug a network
+	  output queue, using the netlink interface.  When it receives an
+	  enqueue command it inserts a plug into the outbound queue that
+	  causes following packets to enqueue until a dequeue command arrives
+	  over netlink, causing the plug to be removed and resuming the normal
+	  packet flow.
+
+	  This module also provides a generic "network output buffering"
+	  functionality (aka output commit), wherein upon arrival of a dequeue
+	  command, only packets up to the first plug are released for delivery.
+	  The Remus HA project uses this module to enable speculative execution
+	  of virtual machines by allowing the generated network output to be rolled
+	  back if needed.
+
+	  For more information, please refer to http://wiki.xensource.com/xenwiki/Remus
+
+	  Say Y here if you are using this kernel for Xen dom0 and
+	  want to protect Xen guests with Remus.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_plug.
+
 comment "Classification"
 
 config NET_CLS
@@ -373,6 +457,7 @@ config NET_CLS_FLOW
 config NET_CLS_CGROUP
 	tristate "Control Group Classifier"
 	select NET_CLS
+	select CGROUP_NET_CLASSID
 	depends on CGROUPS
 	---help---
 	  Say Y here if you want to classify packets based on the control
@@ -381,6 +466,16 @@ config NET_CLS_CGROUP
 	  To compile this code as a module, choose M here: the
 	  module will be called cls_cgroup.
 
+config NET_CLS_BPF
+	tristate "BPF-based classifier"
+	select NET_CLS
+	---help---
+	  If you say Y here, you will be able to classify packets based on
+	  programmable BPF (JIT'ed) filters as an alternative to ematches.
+
+	  To compile this code as a module, choose M here: the module will
+	  be called cls_bpf.
+
 config NET_EMATCH
 	bool "Extended Matches"
 	select NET_CLS
@@ -459,6 +554,26 @@ config NET_EMATCH_TEXT
 	  To compile this code as a module, choose M here: the
 	  module will be called em_text.
 
+config NET_EMATCH_CANID
+	tristate "CAN Identifier"
+	depends on NET_EMATCH && (CAN=y || CAN=m)
+	---help---
+	  Say Y here if you want to be able to classify CAN frames based
+	  on CAN Identifier.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called em_canid.
+
+config NET_EMATCH_IPSET
+	tristate "IPset"
+	depends on NET_EMATCH && IP_SET
+	---help---
+	  Say Y here if you want to be able to classify packets based on
+	  ipset membership.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called em_ipset.
+
 config NET_CLS_ACT
 	bool "Actions"
 	---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index dc5889c0a15..0a869a11f3e 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -33,9 +33,15 @@ obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_multiq.o
 obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
 obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
+obj-$(CONFIG_NET_SCH_PLUG)	+= sch_plug.o
 obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o
 obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
+obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o
+obj-$(CONFIG_NET_SCH_FQ_CODEL)	+= sch_fq_codel.o
+obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o
+obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o
+obj-$(CONFIG_NET_SCH_PIE)	+= sch_pie.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
@@ -46,9 +52,12 @@ obj-$(CONFIG_NET_CLS_RSVP6)	+= cls_rsvp6.o
 obj-$(CONFIG_NET_CLS_BASIC)	+= cls_basic.o
 obj-$(CONFIG_NET_CLS_FLOW)	+= cls_flow.o
 obj-$(CONFIG_NET_CLS_CGROUP)	+= cls_cgroup.o
+obj-$(CONFIG_NET_CLS_BPF)	+= cls_bpf.o
 obj-$(CONFIG_NET_EMATCH)	+= ematch.o
 obj-$(CONFIG_NET_EMATCH_CMP)	+= em_cmp.o
 obj-$(CONFIG_NET_EMATCH_NBYTE)	+= em_nbyte.o
 obj-$(CONFIG_NET_EMATCH_U32)	+= em_u32.o
 obj-$(CONFIG_NET_EMATCH_META)	+= em_meta.o
 obj-$(CONFIG_NET_EMATCH_TEXT)	+= em_text.o
+obj-$(CONFIG_NET_EMATCH_CANID)	+= em_canid.o
+obj-$(CONFIG_NET_EMATCH_IPSET)	+= em_ipset.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 93fdf131bd7..648778aef1a 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -27,42 +27,40 @@
 #include <net/act_api.h>
 #include <net/netlink.h>
 
-void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
+void tcf_hash_destroy(struct tc_action *a)
 {
-	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
-	struct tcf_common **p1p;
-
-	for (p1p = &hinfo->htab[h]; *p1p; p1p = &(*p1p)->tcfc_next) {
-		if (*p1p == p) {
-			write_lock_bh(hinfo->lock);
-			*p1p = p->tcfc_next;
-			write_unlock_bh(hinfo->lock);
-			gen_kill_estimator(&p->tcfc_bstats,
-					   &p->tcfc_rate_est);
-			/*
-			 * gen_estimator est_timer() might access p->tcfc_lock
-			 * or bstats, wait a RCU grace period before freeing p
-			 */
-			kfree_rcu(p, tcfc_rcu);
-			return;
-		}
-	}
-	WARN_ON(1);
+	struct tcf_common *p = a->priv;
+	struct tcf_hashinfo *hinfo = a->ops->hinfo;
+
+	spin_lock_bh(&hinfo->lock);
+	hlist_del(&p->tcfc_head);
+	spin_unlock_bh(&hinfo->lock);
+	gen_kill_estimator(&p->tcfc_bstats,
+			   &p->tcfc_rate_est);
+	/*
+	 * gen_estimator est_timer() might access p->tcfc_lock
+	 * or bstats, wait a RCU grace period before freeing p
+	 */
+	kfree_rcu(p, tcfc_rcu);
 }
 EXPORT_SYMBOL(tcf_hash_destroy);
 
-int tcf_hash_release(struct tcf_common *p, int bind,
-		     struct tcf_hashinfo *hinfo)
+int tcf_hash_release(struct tc_action *a, int bind)
 {
+	struct tcf_common *p = a->priv;
 	int ret = 0;
 
 	if (p) {
 		if (bind)
 			p->tcfc_bindcnt--;
+		else if (p->tcfc_bindcnt > 0)
+			return -EPERM;
 
 		p->tcfc_refcnt--;
 		if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) {
-			tcf_hash_destroy(p, hinfo);
+			if (a->ops->cleanup)
+				a->ops->cleanup(a, bind);
+			tcf_hash_destroy(a);
 			ret = 1;
 		}
 	}
@@ -71,20 +69,22 @@ int tcf_hash_release(struct tcf_common *p, int bind,
 EXPORT_SYMBOL(tcf_hash_release);
 
 static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
-			   struct tc_action *a, struct tcf_hashinfo *hinfo)
+			   struct tc_action *a)
 {
+	struct tcf_hashinfo *hinfo = a->ops->hinfo;
+	struct hlist_head *head;
 	struct tcf_common *p;
 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
 	struct nlattr *nest;
 
-	read_lock_bh(hinfo->lock);
+	spin_lock_bh(&hinfo->lock);
 
 	s_i = cb->args[0];
 
 	for (i = 0; i < (hinfo->hmask + 1); i++) {
-		p = hinfo->htab[tcf_hash(i, hinfo->hmask)];
+		head = &hinfo->htab[tcf_hash(i, hinfo->hmask)];
 
-		for (; p; p = p->tcfc_next) {
+		hlist_for_each_entry_rcu(p, head, tcfc_head) {
 			index++;
 			if (index < s_i)
 				continue;
@@ -107,7 +107,7 @@ static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
 		}
 	}
 done:
-	read_unlock_bh(hinfo->lock);
+	spin_unlock_bh(&hinfo->lock);
 	if (n_i)
 		cb->args[0] += n_i;
 	return n_i;
@@ -117,79 +117,82 @@ nla_put_failure:
 	goto done;
 }
 
-static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a,
-			  struct tcf_hashinfo *hinfo)
+static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a)
 {
-	struct tcf_common *p, *s_p;
+	struct tcf_hashinfo *hinfo = a->ops->hinfo;
+	struct hlist_head *head;
+	struct hlist_node *n;
+	struct tcf_common *p;
 	struct nlattr *nest;
 	int i = 0, n_i = 0;
+	int ret = -EINVAL;
 
 	nest = nla_nest_start(skb, a->order);
 	if (nest == NULL)
 		goto nla_put_failure;
-	NLA_PUT_STRING(skb, TCA_KIND, a->ops->kind);
+	if (nla_put_string(skb, TCA_KIND, a->ops->kind))
+		goto nla_put_failure;
 	for (i = 0; i < (hinfo->hmask + 1); i++) {
-		p = hinfo->htab[tcf_hash(i, hinfo->hmask)];
-
-		while (p != NULL) {
-			s_p = p->tcfc_next;
-			if (ACT_P_DELETED == tcf_hash_release(p, 0, hinfo))
+		head = &hinfo->htab[tcf_hash(i, hinfo->hmask)];
+		hlist_for_each_entry_safe(p, n, head, tcfc_head) {
+			a->priv = p;
+			ret = tcf_hash_release(a, 0);
+			if (ret == ACT_P_DELETED) {
 				module_put(a->ops->owner);
-			n_i++;
-			p = s_p;
+				n_i++;
+			} else if (ret < 0)
+				goto nla_put_failure;
 		}
 	}
-	NLA_PUT_U32(skb, TCA_FCNT, n_i);
+	if (nla_put_u32(skb, TCA_FCNT, n_i))
+		goto nla_put_failure;
 	nla_nest_end(skb, nest);
 
 	return n_i;
 nla_put_failure:
 	nla_nest_cancel(skb, nest);
-	return -EINVAL;
+	return ret;
 }
 
-int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb,
-		       int type, struct tc_action *a)
+static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb,
+			      int type, struct tc_action *a)
 {
-	struct tcf_hashinfo *hinfo = a->ops->hinfo;
-
 	if (type == RTM_DELACTION) {
-		return tcf_del_walker(skb, a, hinfo);
+		return tcf_del_walker(skb, a);
 	} else if (type == RTM_GETACTION) {
-		return tcf_dump_walker(skb, cb, a, hinfo);
+		return tcf_dump_walker(skb, cb, a);
 	} else {
 		WARN(1, "tcf_generic_walker: unknown action %d\n", type);
 		return -EINVAL;
 	}
 }
-EXPORT_SYMBOL(tcf_generic_walker);
 
-struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
+static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
 {
-	struct tcf_common *p;
+	struct tcf_common *p = NULL;
+	struct hlist_head *head;
 
-	read_lock_bh(hinfo->lock);
-	for (p = hinfo->htab[tcf_hash(index, hinfo->hmask)]; p;
-	     p = p->tcfc_next) {
+	spin_lock_bh(&hinfo->lock);
+	head = &hinfo->htab[tcf_hash(index, hinfo->hmask)];
+	hlist_for_each_entry_rcu(p, head, tcfc_head)
 		if (p->tcfc_index == index)
 			break;
-	}
-	read_unlock_bh(hinfo->lock);
+	spin_unlock_bh(&hinfo->lock);
 
 	return p;
 }
-EXPORT_SYMBOL(tcf_hash_lookup);
 
-u32 tcf_hash_new_index(u32 *idx_gen, struct tcf_hashinfo *hinfo)
+u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo)
 {
-	u32 val = *idx_gen;
+	u32 val = hinfo->index;
 
 	do {
 		if (++val == 0)
 			val = 1;
 	} while (tcf_hash_lookup(val, hinfo));
 
-	return (*idx_gen = val);
+	hinfo->index = val;
+	return val;
 }
 EXPORT_SYMBOL(tcf_hash_new_index);
 
@@ -206,34 +209,46 @@ int tcf_hash_search(struct tc_action *a, u32 index)
 }
 EXPORT_SYMBOL(tcf_hash_search);
 
-struct tcf_common *tcf_hash_check(u32 index, struct tc_action *a, int bind,
-				  struct tcf_hashinfo *hinfo)
+int tcf_hash_check(u32 index, struct tc_action *a, int bind)
 {
+	struct tcf_hashinfo *hinfo = a->ops->hinfo;
 	struct tcf_common *p = NULL;
 	if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) {
 		if (bind)
 			p->tcfc_bindcnt++;
 		p->tcfc_refcnt++;
 		a->priv = p;
+		return 1;
 	}
-	return p;
+	return 0;
 }
 EXPORT_SYMBOL(tcf_hash_check);
 
-struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est,
-				   struct tc_action *a, int size, int bind,
-				   u32 *idx_gen, struct tcf_hashinfo *hinfo)
+void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
 {
+	struct tcf_common *pc = a->priv;
+	if (est)
+		gen_kill_estimator(&pc->tcfc_bstats,
+				   &pc->tcfc_rate_est);
+	kfree_rcu(pc, tcfc_rcu);
+}
+EXPORT_SYMBOL(tcf_hash_cleanup);
+
+int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
+		    int size, int bind)
+{
+	struct tcf_hashinfo *hinfo = a->ops->hinfo;
 	struct tcf_common *p = kzalloc(size, GFP_KERNEL);
 
 	if (unlikely(!p))
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	p->tcfc_refcnt = 1;
 	if (bind)
 		p->tcfc_bindcnt = 1;
 
 	spin_lock_init(&p->tcfc_lock);
-	p->tcfc_index = index ? index : tcf_hash_new_index(idx_gen, hinfo);
+	INIT_HLIST_NODE(&p->tcfc_head);
+	p->tcfc_index = index ? index : tcf_hash_new_index(hinfo);
 	p->tcfc_tm.install = jiffies;
 	p->tcfc_tm.lastuse = jiffies;
 	if (est) {
@@ -241,42 +256,64 @@ struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est,
 					    &p->tcfc_lock, est);
 		if (err) {
 			kfree(p);
-			return ERR_PTR(err);
+			return err;
 		}
 	}
 
 	a->priv = (void *) p;
-	return p;
+	return 0;
 }
 EXPORT_SYMBOL(tcf_hash_create);
 
-void tcf_hash_insert(struct tcf_common *p, struct tcf_hashinfo *hinfo)
+void tcf_hash_insert(struct tc_action *a)
 {
+	struct tcf_common *p = a->priv;
+	struct tcf_hashinfo *hinfo = a->ops->hinfo;
 	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
 
-	write_lock_bh(hinfo->lock);
-	p->tcfc_next = hinfo->htab[h];
-	hinfo->htab[h] = p;
-	write_unlock_bh(hinfo->lock);
+	spin_lock_bh(&hinfo->lock);
+	hlist_add_head(&p->tcfc_head, &hinfo->htab[h]);
+	spin_unlock_bh(&hinfo->lock);
 }
 EXPORT_SYMBOL(tcf_hash_insert);
 
-static struct tc_action_ops *act_base = NULL;
+static LIST_HEAD(act_base);
 static DEFINE_RWLOCK(act_mod_lock);
 
-int tcf_register_action(struct tc_action_ops *act)
+int tcf_register_action(struct tc_action_ops *act, unsigned int mask)
 {
-	struct tc_action_ops *a, **ap;
+	struct tc_action_ops *a;
+	int err;
+
+	/* Must supply act, dump and init */
+	if (!act->act || !act->dump || !act->init)
+		return -EINVAL;
+
+	/* Supply defaults */
+	if (!act->lookup)
+		act->lookup = tcf_hash_search;
+	if (!act->walk)
+		act->walk = tcf_generic_walker;
+
+	act->hinfo = kmalloc(sizeof(struct tcf_hashinfo), GFP_KERNEL);
+	if (!act->hinfo)
+		return -ENOMEM;
+	err = tcf_hashinfo_init(act->hinfo, mask);
+	if (err) {
+		kfree(act->hinfo);
+		return err;
+	}
 
 	write_lock(&act_mod_lock);
-	for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) {
+	list_for_each_entry(a, &act_base, head) {
 		if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
 			write_unlock(&act_mod_lock);
+			tcf_hashinfo_destroy(act->hinfo);
+			kfree(act->hinfo);
 			return -EEXIST;
 		}
 	}
-	act->next = NULL;
-	*ap = act;
+	list_add_tail(&act->head, &act_base);
 	write_unlock(&act_mod_lock);
 	return 0;
 }
@@ -284,17 +321,18 @@ EXPORT_SYMBOL(tcf_register_action);
 
 int tcf_unregister_action(struct tc_action_ops *act)
 {
-	struct tc_action_ops *a, **ap;
+	struct tc_action_ops *a;
 	int err = -ENOENT;
 
 	write_lock(&act_mod_lock);
-	for (ap = &act_base; (a = *ap) != NULL; ap = &a->next)
-		if (a == act)
+	list_for_each_entry(a, &act_base, head) {
+		if (a == act) {
+			list_del(&act->head);
+			tcf_hashinfo_destroy(act->hinfo);
+			kfree(act->hinfo);
+			err = 0;
 			break;
-	if (a) {
-		*ap = a->next;
-		a->next = NULL;
-		err = 0;
+		}
 	}
 	write_unlock(&act_mod_lock);
 	return err;
@@ -304,69 +342,42 @@ EXPORT_SYMBOL(tcf_unregister_action);
 /* lookup by name */
 static struct tc_action_ops *tc_lookup_action_n(char *kind)
 {
-	struct tc_action_ops *a = NULL;
+	struct tc_action_ops *a, *res = NULL;
 
 	if (kind) {
 		read_lock(&act_mod_lock);
-		for (a = act_base; a; a = a->next) {
+		list_for_each_entry(a, &act_base, head) {
 			if (strcmp(kind, a->kind) == 0) {
-				if (!try_module_get(a->owner)) {
-					read_unlock(&act_mod_lock);
-					return NULL;
-				}
+				if (try_module_get(a->owner))
+					res = a;
 				break;
 			}
 		}
 		read_unlock(&act_mod_lock);
 	}
-	return a;
+	return res;
 }
 
 /* lookup by nlattr */
 static struct tc_action_ops *tc_lookup_action(struct nlattr *kind)
 {
-	struct tc_action_ops *a = NULL;
+	struct tc_action_ops *a, *res = NULL;
 
 	if (kind) {
 		read_lock(&act_mod_lock);
-		for (a = act_base; a; a = a->next) {
+		list_for_each_entry(a, &act_base, head) {
 			if (nla_strcmp(kind, a->kind) == 0) {
-				if (!try_module_get(a->owner)) {
-					read_unlock(&act_mod_lock);
-					return NULL;
-				}
+				if (try_module_get(a->owner))
+					res = a;
 				break;
 			}
 		}
 		read_unlock(&act_mod_lock);
 	}
-	return a;
+	return res;
 }
 
-#if 0
-/* lookup by id */
-static struct tc_action_ops *tc_lookup_action_id(u32 type)
-{
-	struct tc_action_ops *a = NULL;
-
-	if (type) {
-		read_lock(&act_mod_lock);
-		for (a = act_base; a; a = a->next) {
-			if (a->type == type) {
-				if (!try_module_get(a->owner)) {
-					read_unlock(&act_mod_lock);
-					return NULL;
-				}
-				break;
-			}
-		}
-		read_unlock(&act_mod_lock);
-	}
-	return a;
-}
-#endif
-
-int tcf_action_exec(struct sk_buff *skb, const struct tc_action *act,
+int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions,
 		    struct tcf_result *res)
 {
 	const struct tc_action *a;
@@ -377,53 +388,44 @@ int tcf_action_exec(struct sk_buff *skb, const struct tc_action *act,
 		ret = TC_ACT_OK;
 		goto exec_done;
 	}
-	while ((a = act) != NULL) {
+	list_for_each_entry(a, actions, list) {
 repeat:
-		if (a->ops && a->ops->act) {
-			ret = a->ops->act(skb, a, res);
-			if (TC_MUNGED & skb->tc_verd) {
-				/* copied already, allow trampling */
-				skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
-				skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd);
-			}
-			if (ret == TC_ACT_REPEAT)
-				goto repeat;	/* we need a ttl - JHS */
-			if (ret != TC_ACT_PIPE)
-				goto exec_done;
+		ret = a->ops->act(skb, a, res);
+		if (TC_MUNGED & skb->tc_verd) {
+			/* copied already, allow trampling */
+			skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
+			skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd);
 		}
-		act = a->next;
+		if (ret == TC_ACT_REPEAT)
+			goto repeat;	/* we need a ttl - JHS */
+		if (ret != TC_ACT_PIPE)
+			goto exec_done;
 	}
 exec_done:
 	return ret;
 }
 EXPORT_SYMBOL(tcf_action_exec);
 
-void tcf_action_destroy(struct tc_action *act, int bind)
+int tcf_action_destroy(struct list_head *actions, int bind)
 {
-	struct tc_action *a;
+	struct tc_action *a, *tmp;
+	int ret = 0;
 
-	for (a = act; a; a = act) {
-		if (a->ops && a->ops->cleanup) {
-			if (a->ops->cleanup(a, bind) == ACT_P_DELETED)
-				module_put(a->ops->owner);
-			act = act->next;
-			kfree(a);
-		} else {
-			/*FIXME: Remove later - catch insertion bugs*/
-			WARN(1, "tcf_action_destroy: BUG? destroying NULL ops\n");
-			act = act->next;
-			kfree(a);
-		}
+	list_for_each_entry_safe(a, tmp, actions, list) {
+		ret = tcf_hash_release(a, bind);
+		if (ret == ACT_P_DELETED)
+			module_put(a->ops->owner);
+		else if (ret < 0)
+			return ret;
+		list_del(&a->list);
+		kfree(a);
 	}
+	return ret;
 }
 
 int
 tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 {
-	int err = -EINVAL;
-
-	if (a->ops == NULL || a->ops->dump == NULL)
-		return err;
 	return a->ops->dump(skb, a, bind, ref);
 }
 
@@ -434,10 +436,8 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 	unsigned char *b = skb_tail_pointer(skb);
 	struct nlattr *nest;
 
-	if (a->ops == NULL || a->ops->dump == NULL)
-		return err;
-
-	NLA_PUT_STRING(skb, TCA_KIND, a->ops->kind);
+	if (nla_put_string(skb, TCA_KIND, a->ops->kind))
+		goto nla_put_failure;
 	if (tcf_action_copy_stats(skb, a, 0))
 		goto nla_put_failure;
 	nest = nla_nest_start(skb, TCA_OPTIONS);
@@ -456,14 +456,13 @@ nla_put_failure:
 EXPORT_SYMBOL(tcf_action_dump_1);
 
 int
-tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref)
+tcf_action_dump(struct sk_buff *skb, struct list_head *actions, int bind, int ref)
 {
 	struct tc_action *a;
 	int err = -EINVAL;
 	struct nlattr *nest;
 
-	while ((a = act) != NULL) {
-		act = a->next;
+	list_for_each_entry(a, actions, list) {
 		nest = nla_nest_start(skb, a->order);
 		if (nest == NULL)
 			goto nla_put_failure;
@@ -482,8 +481,9 @@ errout:
 	return err;
 }
 
-struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est,
-				    char *name, int ovr, int bind)
+struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
+				    struct nlattr *est, char *name, int ovr,
+				    int bind)
 {
 	struct tc_action *a;
 	struct tc_action_ops *a_o;
@@ -537,11 +537,13 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est,
 	if (a == NULL)
 		goto err_mod;
 
+	a->ops = a_o;
+	INIT_LIST_HEAD(&a->list);
 	/* backward compatibility for policer */
 	if (name == NULL)
-		err = a_o->init(tb[TCA_ACT_OPTIONS], est, a, ovr, bind);
+		err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, a, ovr, bind);
 	else
-		err = a_o->init(nla, est, a, ovr, bind);
+		err = a_o->init(net, nla, est, a, ovr, bind);
 	if (err < 0)
 		goto err_free;
 
@@ -551,7 +553,6 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est,
 	 */
 	if (err != ACT_P_CREATED)
 		module_put(a_o->owner);
-	a->ops = a_o;
 
 	return a;
 
@@ -563,36 +564,33 @@ err_out:
 	return ERR_PTR(err);
 }
 
-struct tc_action *tcf_action_init(struct nlattr *nla, struct nlattr *est,
-				  char *name, int ovr, int bind)
+int tcf_action_init(struct net *net, struct nlattr *nla,
+				  struct nlattr *est, char *name, int ovr,
+				  int bind, struct list_head *actions)
 {
 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
-	struct tc_action *head = NULL, *act, *act_prev = NULL;
+	struct tc_action *act;
 	int err;
 	int i;
 
 	err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL);
 	if (err < 0)
-		return ERR_PTR(err);
+		return err;
 
 	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
-		act = tcf_action_init_1(tb[i], est, name, ovr, bind);
-		if (IS_ERR(act))
+		act = tcf_action_init_1(net, tb[i], est, name, ovr, bind);
+		if (IS_ERR(act)) {
+			err = PTR_ERR(act);
 			goto err;
+		}
 		act->order = i;
-
-		if (head == NULL)
-			head = act;
-		else
-			act_prev->next = act;
-		act_prev = act;
+		list_add_tail(&act->list, actions);
 	}
-	return head;
+	return 0;
 
 err:
-	if (head != NULL)
-		tcf_action_destroy(head, bind);
-	return act;
+	tcf_action_destroy(actions, bind);
+	return err;
 }
 
 int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
@@ -600,9 +598,9 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
 {
 	int err = 0;
 	struct gnet_dump d;
-	struct tcf_act_hdr *h = a->priv;
+	struct tcf_common *p = a->priv;
 
-	if (h == NULL)
+	if (p == NULL)
 		goto errout;
 
 	/* compat_mode being true specifies a call that is supposed
@@ -611,24 +609,20 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
 	if (compat_mode) {
 		if (a->type == TCA_OLD_COMPAT)
 			err = gnet_stats_start_copy_compat(skb, 0,
-				TCA_STATS, TCA_XSTATS, &h->tcf_lock, &d);
+				TCA_STATS, TCA_XSTATS, &p->tcfc_lock, &d);
 		else
 			return 0;
 	} else
 		err = gnet_stats_start_copy(skb, TCA_ACT_STATS,
-					    &h->tcf_lock, &d);
+					    &p->tcfc_lock, &d);
 
 	if (err < 0)
 		goto errout;
 
-	if (a->ops != NULL && a->ops->get_stats != NULL)
-		if (a->ops->get_stats(skb, a) < 0)
-			goto errout;
-
-	if (gnet_stats_copy_basic(&d, &h->tcf_bstats) < 0 ||
-	    gnet_stats_copy_rate_est(&d, &h->tcf_bstats,
-				     &h->tcf_rate_est) < 0 ||
-	    gnet_stats_copy_queue(&d, &h->tcf_qstats) < 0)
+	if (gnet_stats_copy_basic(&d, &p->tcfc_bstats) < 0 ||
+	    gnet_stats_copy_rate_est(&d, &p->tcfc_bstats,
+				     &p->tcfc_rate_est) < 0 ||
+	    gnet_stats_copy_queue(&d, &p->tcfc_qstats) < 0)
 		goto errout;
 
 	if (gnet_stats_finish_copy(&d) < 0)
@@ -641,7 +635,7 @@ errout:
 }
 
 static int
-tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,
+tca_get_fill(struct sk_buff *skb, struct list_head *actions, u32 portid, u32 seq,
 	     u16 flags, int event, int bind, int ref)
 {
 	struct tcamsg *t;
@@ -649,50 +643,64 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,
 	unsigned char *b = skb_tail_pointer(skb);
 	struct nlattr *nest;
 
-	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
-
-	t = NLMSG_DATA(nlh);
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*t), flags);
+	if (!nlh)
+		goto out_nlmsg_trim;
+	t = nlmsg_data(nlh);
 	t->tca_family = AF_UNSPEC;
 	t->tca__pad1 = 0;
 	t->tca__pad2 = 0;
 
 	nest = nla_nest_start(skb, TCA_ACT_TAB);
 	if (nest == NULL)
-		goto nla_put_failure;
+		goto out_nlmsg_trim;
 
-	if (tcf_action_dump(skb, a, bind, ref) < 0)
-		goto nla_put_failure;
+	if (tcf_action_dump(skb, actions, bind, ref) < 0)
+		goto out_nlmsg_trim;
 
 	nla_nest_end(skb, nest);
 
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 	return skb->len;
 
-nla_put_failure:
-nlmsg_failure:
+out_nlmsg_trim:
 	nlmsg_trim(skb, b);
 	return -1;
 }
 
 static int
-act_get_notify(struct net *net, u32 pid, struct nlmsghdr *n,
-	       struct tc_action *a, int event)
+act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
+	       struct list_head *actions, int event)
 {
 	struct sk_buff *skb;
 
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb)
 		return -ENOBUFS;
-	if (tca_get_fill(skb, a, pid, n->nlmsg_seq, 0, event, 0, 0) <= 0) {
+	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) {
 		kfree_skb(skb);
 		return -EINVAL;
 	}
 
-	return rtnl_unicast(skb, net, pid);
+	return rtnl_unicast(skb, net, portid);
+}
+
+static struct tc_action *create_a(int i)
+{
+	struct tc_action *act;
+
+	act = kzalloc(sizeof(*act), GFP_KERNEL);
+	if (act == NULL) {
+		pr_debug("create_a: failed to alloc!\n");
+		return NULL;
+	}
+	act->order = i;
+	INIT_LIST_HEAD(&act->list);
+	return act;
 }
 
 static struct tc_action *
-tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 pid)
+tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid)
 {
 	struct nlattr *tb[TCA_ACT_MAX + 1];
 	struct tc_action *a;
@@ -710,16 +718,14 @@ tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 pid)
 	index = nla_get_u32(tb[TCA_ACT_INDEX]);
 
 	err = -ENOMEM;
-	a = kzalloc(sizeof(struct tc_action), GFP_KERNEL);
+	a = create_a(0);
 	if (a == NULL)
 		goto err_out;
 
 	err = -EINVAL;
 	a->ops = tc_lookup_action(tb[TCA_ACT_KIND]);
-	if (a->ops == NULL)
+	if (a->ops == NULL) /* could happen in batch of actions */
 		goto err_free;
-	if (a->ops->lookup == NULL)
-		goto err_mod;
 	err = -ENOENT;
 	if (a->ops->lookup(a, index) == 0)
 		goto err_mod;
@@ -735,31 +741,18 @@ err_out:
 	return ERR_PTR(err);
 }
 
-static void cleanup_a(struct tc_action *act)
+static void cleanup_a(struct list_head *actions)
 {
-	struct tc_action *a;
+	struct tc_action *a, *tmp;
 
-	for (a = act; a; a = act) {
-		act = a->next;
+	list_for_each_entry_safe(a, tmp, actions, list) {
+		list_del(&a->list);
 		kfree(a);
 	}
 }
 
-static struct tc_action *create_a(int i)
-{
-	struct tc_action *act;
-
-	act = kzalloc(sizeof(*act), GFP_KERNEL);
-	if (act == NULL) {
-		pr_debug("create_a: failed to alloc!\n");
-		return NULL;
-	}
-	act->order = i;
-	return act;
-}
-
 static int tca_action_flush(struct net *net, struct nlattr *nla,
-			    struct nlmsghdr *n, u32 pid)
+			    struct nlmsghdr *n, u32 portid)
 {
 	struct sk_buff *skb;
 	unsigned char *b;
@@ -769,18 +762,12 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 	struct nlattr *nest;
 	struct nlattr *tb[TCA_ACT_MAX + 1];
 	struct nlattr *kind;
-	struct tc_action *a = create_a(0);
+	struct tc_action a;
 	int err = -ENOMEM;
 
-	if (a == NULL) {
-		pr_debug("tca_action_flush: couldnt create tc_action\n");
-		return err;
-	}
-
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb) {
 		pr_debug("tca_action_flush: failed skb alloc\n");
-		kfree(a);
 		return err;
 	}
 
@@ -792,23 +779,27 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 
 	err = -EINVAL;
 	kind = tb[TCA_ACT_KIND];
-	a->ops = tc_lookup_action(kind);
-	if (a->ops == NULL)
+	memset(&a, 0, sizeof(struct tc_action));
+	INIT_LIST_HEAD(&a.list);
+	a.ops = tc_lookup_action(kind);
+	if (a.ops == NULL) /*some idjot trying to flush unknown action */
 		goto err_out;
 
-	nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t));
-	t = NLMSG_DATA(nlh);
+	nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0);
+	if (!nlh)
+		goto out_module_put;
+	t = nlmsg_data(nlh);
 	t->tca_family = AF_UNSPEC;
 	t->tca__pad1 = 0;
 	t->tca__pad2 = 0;
 
 	nest = nla_nest_start(skb, TCA_ACT_TAB);
 	if (nest == NULL)
-		goto nla_put_failure;
+		goto out_module_put;
 
-	err = a->ops->walk(skb, &dcb, RTM_DELACTION, a);
+	err = a.ops->walk(skb, &dcb, RTM_DELACTION, &a);
 	if (err < 0)
-		goto nla_put_failure;
+		goto out_module_put;
 	if (err == 0)
 		goto noflush_out;
 
@@ -816,32 +807,61 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 	nlh->nlmsg_flags |= NLM_F_ROOT;
-	module_put(a->ops->owner);
-	kfree(a);
-	err = rtnetlink_send(skb, net, pid, RTNLGRP_TC,
+	module_put(a.ops->owner);
+	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 			     n->nlmsg_flags & NLM_F_ECHO);
 	if (err > 0)
 		return 0;
 
 	return err;
 
-nla_put_failure:
-nlmsg_failure:
-	module_put(a->ops->owner);
+out_module_put:
+	module_put(a.ops->owner);
 err_out:
 noflush_out:
 	kfree_skb(skb);
-	kfree(a);
 	return err;
 }
 
 static int
+tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
+	       u32 portid)
+{
+	int ret;
+	struct sk_buff *skb;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+
+	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION,
+			 0, 1) <= 0) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	/* now do the delete */
+	ret = tcf_action_destroy(actions, 0);
+	if (ret < 0) {
+		kfree_skb(skb);
+		return ret;
+	}
+
+	ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+			     n->nlmsg_flags & NLM_F_ECHO);
+	if (ret > 0)
+		return 0;
+	return ret;
+}
+
+static int
 tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
-	      u32 pid, int event)
+	      u32 portid, int event)
 {
 	int i, ret;
 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
-	struct tc_action *head = NULL, *act, *act_prev = NULL;
+	struct tc_action *act;
+	LIST_HEAD(actions);
 
 	ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL);
 	if (ret < 0)
@@ -849,139 +869,88 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
 
 	if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) {
 		if (tb[1] != NULL)
-			return tca_action_flush(net, tb[1], n, pid);
+			return tca_action_flush(net, tb[1], n, portid);
 		else
 			return -EINVAL;
 	}
 
 	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
-		act = tcf_action_get_1(tb[i], n, pid);
+		act = tcf_action_get_1(tb[i], n, portid);
 		if (IS_ERR(act)) {
 			ret = PTR_ERR(act);
 			goto err;
 		}
 		act->order = i;
-
-		if (head == NULL)
-			head = act;
-		else
-			act_prev->next = act;
-		act_prev = act;
+		list_add_tail(&act->list, &actions);
 	}
 
 	if (event == RTM_GETACTION)
-		ret = act_get_notify(net, pid, n, head, event);
+		ret = act_get_notify(net, portid, n, &actions, event);
 	else { /* delete */
-		struct sk_buff *skb;
-
-		skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
-		if (!skb) {
-			ret = -ENOBUFS;
-			goto err;
-		}
-
-		if (tca_get_fill(skb, head, pid, n->nlmsg_seq, 0, event,
-				 0, 1) <= 0) {
-			kfree_skb(skb);
-			ret = -EINVAL;
+		ret = tcf_del_notify(net, n, &actions, portid);
+		if (ret)
 			goto err;
-		}
-
-		/* now do the delete */
-		tcf_action_destroy(head, 0);
-		ret = rtnetlink_send(skb, net, pid, RTNLGRP_TC,
-				     n->nlmsg_flags & NLM_F_ECHO);
-		if (ret > 0)
-			return 0;
 		return ret;
 	}
 err:
-	cleanup_a(head);
+	cleanup_a(&actions);
 	return ret;
 }
 
-static int tcf_add_notify(struct net *net, struct tc_action *a,
-			  u32 pid, u32 seq, int event, u16 flags)
+static int
+tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
+	       u32 portid)
 {
-	struct tcamsg *t;
-	struct nlmsghdr *nlh;
 	struct sk_buff *skb;
-	struct nlattr *nest;
-	unsigned char *b;
 	int err = 0;
 
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb)
 		return -ENOBUFS;
 
-	b = skb_tail_pointer(skb);
-
-	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
-	t = NLMSG_DATA(nlh);
-	t->tca_family = AF_UNSPEC;
-	t->tca__pad1 = 0;
-	t->tca__pad2 = 0;
-
-	nest = nla_nest_start(skb, TCA_ACT_TAB);
-	if (nest == NULL)
-		goto nla_put_failure;
-
-	if (tcf_action_dump(skb, a, 0, 0) < 0)
-		goto nla_put_failure;
-
-	nla_nest_end(skb, nest);
-
-	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
-	NETLINK_CB(skb).dst_group = RTNLGRP_TC;
+	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags,
+			 RTM_NEWACTION, 0, 0) <= 0) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
 
-	err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, flags & NLM_F_ECHO);
+	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+			     n->nlmsg_flags & NLM_F_ECHO);
 	if (err > 0)
 		err = 0;
 	return err;
-
-nla_put_failure:
-nlmsg_failure:
-	kfree_skb(skb);
-	return -1;
 }
 
-
 static int
 tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
-	       u32 pid, int ovr)
+	       u32 portid, int ovr)
 {
 	int ret = 0;
-	struct tc_action *act;
-	struct tc_action *a;
-	u32 seq = n->nlmsg_seq;
+	LIST_HEAD(actions);
 
-	act = tcf_action_init(nla, NULL, NULL, ovr, 0);
-	if (act == NULL)
-		goto done;
-	if (IS_ERR(act)) {
-		ret = PTR_ERR(act);
+	ret = tcf_action_init(net, nla, NULL, NULL, ovr, 0, &actions);
+	if (ret)
 		goto done;
-	}
 
 	/* dump then free all the actions after update; inserted policy
 	 * stays intact
 	 */
-	ret = tcf_add_notify(net, act, pid, seq, RTM_NEWACTION, n->nlmsg_flags);
-	for (a = act; a; a = act) {
-		act = a->next;
-		kfree(a);
-	}
+	ret = tcf_add_notify(net, n, &actions, portid);
+	cleanup_a(&actions);
 done:
 	return ret;
 }
 
-static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n)
 {
 	struct net *net = sock_net(skb->sk);
 	struct nlattr *tca[TCA_ACT_MAX + 1];
-	u32 pid = skb ? NETLINK_CB(skb).pid : 0;
+	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
 	int ret = 0, ovr = 0;
 
+	if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
 	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL);
 	if (ret < 0)
 		return ret;
@@ -1003,17 +972,17 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 		if (n->nlmsg_flags & NLM_F_REPLACE)
 			ovr = 1;
 replay:
-		ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, pid, ovr);
+		ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr);
 		if (ret == -EAGAIN)
 			goto replay;
 		break;
 	case RTM_DELACTION:
 		ret = tca_action_gd(net, tca[TCA_ACT_TAB], n,
-				    pid, RTM_DELACTION);
+				    portid, RTM_DELACTION);
 		break;
 	case RTM_GETACTION:
 		ret = tca_action_gd(net, tca[TCA_ACT_TAB], n,
-				    pid, RTM_GETACTION);
+				    portid, RTM_GETACTION);
 		break;
 	default:
 		BUG();
@@ -1059,7 +1028,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	struct tc_action_ops *a_o;
 	struct tc_action a;
 	int ret = 0;
-	struct tcamsg *t = (struct tcamsg *) NLMSG_DATA(cb->nlh);
+	struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
 	struct nlattr *kind = find_dump_kind(cb->nlh);
 
 	if (kind == NULL) {
@@ -1074,26 +1043,22 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	memset(&a, 0, sizeof(struct tc_action));
 	a.ops = a_o;
 
-	if (a_o->walk == NULL) {
-		WARN(1, "tc_dump_action: %s !capable of dumping table\n",
-		     a_o->kind);
-		goto nla_put_failure;
-	}
-
-	nlh = NLMSG_PUT(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
-			cb->nlh->nlmsg_type, sizeof(*t));
-	t = NLMSG_DATA(nlh);
+	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			cb->nlh->nlmsg_type, sizeof(*t), 0);
+	if (!nlh)
+		goto out_module_put;
+	t = nlmsg_data(nlh);
 	t->tca_family = AF_UNSPEC;
 	t->tca__pad1 = 0;
 	t->tca__pad2 = 0;
 
 	nest = nla_nest_start(skb, TCA_ACT_TAB);
 	if (nest == NULL)
-		goto nla_put_failure;
+		goto out_module_put;
 
 	ret = a_o->walk(skb, cb, RTM_GETACTION, &a);
 	if (ret < 0)
-		goto nla_put_failure;
+		goto out_module_put;
 
 	if (ret > 0) {
 		nla_nest_end(skb, nest);
@@ -1102,13 +1067,12 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 		nla_nest_cancel(skb, nest);
 
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
-	if (NETLINK_CB(cb->skb).pid && ret)
+	if (NETLINK_CB(cb->skb).portid && ret)
 		nlh->nlmsg_flags |= NLM_F_MULTI;
 	module_put(a_o->owner);
 	return skb->len;
 
-nla_put_failure:
-nlmsg_failure:
+out_module_put:
 	module_put(a_o->owner);
 	nlmsg_trim(skb, b);
 	return skb->len;
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 453a73431ac..edbf40dac70 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -37,26 +37,16 @@
 #include <net/tc_act/tc_csum.h>
 
 #define CSUM_TAB_MASK 15
-static struct tcf_common *tcf_csum_ht[CSUM_TAB_MASK + 1];
-static u32 csum_idx_gen;
-static DEFINE_RWLOCK(csum_lock);
-
-static struct tcf_hashinfo csum_hash_info = {
-	.htab	= tcf_csum_ht,
-	.hmask	= CSUM_TAB_MASK,
-	.lock	= &csum_lock,
-};
 
 static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
 	[TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
 };
 
-static int tcf_csum_init(struct nlattr *nla, struct nlattr *est,
+static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
 			 struct tc_action *a, int ovr, int bind)
 {
 	struct nlattr *tb[TCA_CSUM_MAX + 1];
 	struct tc_csum *parm;
-	struct tcf_common *pc;
 	struct tcf_csum *p;
 	int ret = 0, err;
 
@@ -71,39 +61,31 @@ static int tcf_csum_init(struct nlattr *nla, struct nlattr *est,
 		return -EINVAL;
 	parm = nla_data(tb[TCA_CSUM_PARMS]);
 
-	pc = tcf_hash_check(parm->index, a, bind, &csum_hash_info);
-	if (!pc) {
-		pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
-				     &csum_idx_gen, &csum_hash_info);
-		if (IS_ERR(pc))
-			return PTR_ERR(pc);
-		p = to_tcf_csum(pc);
+	if (!tcf_hash_check(parm->index, a, bind)) {
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+		if (ret)
+			return ret;
 		ret = ACT_P_CREATED;
 	} else {
-		p = to_tcf_csum(pc);
-		if (!ovr) {
-			tcf_hash_release(pc, bind, &csum_hash_info);
+		if (bind)/* don't override defaults */
+			return 0;
+		tcf_hash_release(a, bind);
+		if (!ovr)
 			return -EEXIST;
-		}
 	}
 
+	p = to_tcf_csum(a);
 	spin_lock_bh(&p->tcf_lock);
 	p->tcf_action = parm->action;
 	p->update_flags = parm->update_flags;
 	spin_unlock_bh(&p->tcf_lock);
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(pc, &csum_hash_info);
+		tcf_hash_insert(a);
 
 	return ret;
 }
 
-static int tcf_csum_cleanup(struct tc_action *a, int bind)
-{
-	struct tcf_csum *p = a->priv;
-	return tcf_hash_release(&p->common, bind, &csum_hash_info);
-}
-
 /**
  * tcf_csum_skb_nextlayer - Get next layer pointer
  * @skb: sk_buff to use
@@ -166,15 +148,17 @@ static int tcf_csum_ipv4_igmp(struct sk_buff *skb,
 	return 1;
 }
 
-static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h,
+static int tcf_csum_ipv6_icmp(struct sk_buff *skb,
 			      unsigned int ihl, unsigned int ipl)
 {
 	struct icmp6hdr *icmp6h;
+	const struct ipv6hdr *ip6h;
 
 	icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h));
 	if (icmp6h == NULL)
 		return 0;
 
+	ip6h = ipv6_hdr(skb);
 	icmp6h->icmp6_cksum = 0;
 	skb->csum = csum_partial(icmp6h, ipl - ihl, 0);
 	icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
@@ -186,15 +170,17 @@ static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h,
 	return 1;
 }
 
-static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph,
+static int tcf_csum_ipv4_tcp(struct sk_buff *skb,
 			     unsigned int ihl, unsigned int ipl)
 {
 	struct tcphdr *tcph;
+	const struct iphdr *iph;
 
 	tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
 	if (tcph == NULL)
 		return 0;
 
+	iph = ip_hdr(skb);
 	tcph->check = 0;
 	skb->csum = csum_partial(tcph, ipl - ihl, 0);
 	tcph->check = tcp_v4_check(ipl - ihl,
@@ -205,15 +191,17 @@ static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph,
 	return 1;
 }
 
-static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h,
+static int tcf_csum_ipv6_tcp(struct sk_buff *skb,
 			     unsigned int ihl, unsigned int ipl)
 {
 	struct tcphdr *tcph;
+	const struct ipv6hdr *ip6h;
 
 	tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
 	if (tcph == NULL)
 		return 0;
 
+	ip6h = ipv6_hdr(skb);
 	tcph->check = 0;
 	skb->csum = csum_partial(tcph, ipl - ihl, 0);
 	tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
@@ -225,10 +213,11 @@ static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h,
 	return 1;
 }
 
-static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph,
+static int tcf_csum_ipv4_udp(struct sk_buff *skb,
 			     unsigned int ihl, unsigned int ipl, int udplite)
 {
 	struct udphdr *udph;
+	const struct iphdr *iph;
 	u16 ul;
 
 	/*
@@ -242,6 +231,7 @@ static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph,
 	if (udph == NULL)
 		return 0;
 
+	iph = ip_hdr(skb);
 	ul = ntohs(udph->len);
 
 	if (udplite || udph->check) {
@@ -276,10 +266,11 @@ ignore_obscure_skb:
 	return 1;
 }
 
-static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h,
+static int tcf_csum_ipv6_udp(struct sk_buff *skb,
 			     unsigned int ihl, unsigned int ipl, int udplite)
 {
 	struct udphdr *udph;
+	const struct ipv6hdr *ip6h;
 	u16 ul;
 
 	/*
@@ -293,6 +284,7 @@ static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h,
 	if (udph == NULL)
 		return 0;
 
+	ip6h = ipv6_hdr(skb);
 	ul = ntohs(udph->len);
 
 	udph->check = 0;
@@ -328,7 +320,7 @@ ignore_obscure_skb:
 
 static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
 {
-	struct iphdr *iph;
+	const struct iphdr *iph;
 	int ntkoff;
 
 	ntkoff = skb_network_offset(skb);
@@ -353,19 +345,19 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
 		break;
 	case IPPROTO_TCP:
 		if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
-			if (!tcf_csum_ipv4_tcp(skb, iph, iph->ihl * 4,
+			if (!tcf_csum_ipv4_tcp(skb, iph->ihl * 4,
 					       ntohs(iph->tot_len)))
 				goto fail;
 		break;
 	case IPPROTO_UDP:
 		if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
-			if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4,
+			if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4,
 					       ntohs(iph->tot_len), 0))
 				goto fail;
 		break;
 	case IPPROTO_UDPLITE:
 		if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
-			if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4,
+			if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4,
 					       ntohs(iph->tot_len), 1))
 				goto fail;
 		break;
@@ -377,7 +369,7 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
 		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
 			goto fail;
 
-		ip_send_check(iph);
+		ip_send_check(ip_hdr(skb));
 	}
 
 	return 1;
@@ -397,7 +389,7 @@ static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh,
 
 	while (len > 1) {
 		switch (xh[off]) {
-		case IPV6_TLV_PAD0:
+		case IPV6_TLV_PAD1:
 			optlen = 1;
 			break;
 		case IPV6_TLV_JUMBO:
@@ -456,6 +448,7 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)
 			ixhl = ipv6_optlen(ip6xh);
 			if (!pskb_may_pull(skb, hl + ixhl + ntkoff))
 				goto fail;
+			ip6xh = (void *)(skb_network_header(skb) + hl);
 			if ((nexthdr == NEXTHDR_HOP) &&
 			    !(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl)))
 				goto fail;
@@ -464,25 +457,25 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)
 			break;
 		case IPPROTO_ICMPV6:
 			if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
-				if (!tcf_csum_ipv6_icmp(skb, ip6h,
+				if (!tcf_csum_ipv6_icmp(skb,
 							hl, pl + sizeof(*ip6h)))
 					goto fail;
 			goto done;
 		case IPPROTO_TCP:
 			if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
-				if (!tcf_csum_ipv6_tcp(skb, ip6h,
+				if (!tcf_csum_ipv6_tcp(skb,
 						       hl, pl + sizeof(*ip6h)))
 					goto fail;
 			goto done;
 		case IPPROTO_UDP:
 			if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
-				if (!tcf_csum_ipv6_udp(skb, ip6h, hl,
+				if (!tcf_csum_ipv6_udp(skb, hl,
 						       pl + sizeof(*ip6h), 0))
 					goto fail;
 			goto done;
 		case IPPROTO_UDPLITE:
 			if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
-				if (!tcf_csum_ipv6_udp(skb, ip6h, hl,
+				if (!tcf_csum_ipv6_udp(skb, hl,
 						       pl + sizeof(*ip6h), 1))
 					goto fail;
 			goto done;
@@ -550,11 +543,13 @@ static int tcf_csum_dump(struct sk_buff *skb,
 	};
 	struct tcf_t t;
 
-	NLA_PUT(skb, TCA_CSUM_PARMS, sizeof(opt), &opt);
+	if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
 	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
-	NLA_PUT(skb, TCA_CSUM_TM, sizeof(t), &t);
+	if (nla_put(skb, TCA_CSUM_TM, sizeof(t), &t))
+		goto nla_put_failure;
 
 	return skb->len;
 
@@ -565,16 +560,11 @@ nla_put_failure:
 
 static struct tc_action_ops act_csum_ops = {
 	.kind		= "csum",
-	.hinfo		= &csum_hash_info,
 	.type		= TCA_ACT_CSUM,
-	.capab		= TCA_CAP_NONE,
 	.owner		= THIS_MODULE,
 	.act		= tcf_csum,
 	.dump		= tcf_csum_dump,
-	.cleanup	= tcf_csum_cleanup,
-	.lookup		= tcf_hash_search,
 	.init		= tcf_csum_init,
-	.walk		= tcf_generic_walker
 };
 
 MODULE_DESCRIPTION("Checksum updating actions");
@@ -582,7 +572,7 @@ MODULE_LICENSE("GPL");
 
 static int __init csum_init_module(void)
 {
-	return tcf_register_action(&act_csum_ops);
+	return tcf_register_action(&act_csum_ops, CSUM_TAB_MASK);
 }
 
 static void __exit csum_cleanup_module(void)
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index b77f5a06a65..d6bcbd9f779 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -24,20 +24,11 @@
 #include <net/tc_act/tc_gact.h>
 
 #define GACT_TAB_MASK	15
-static struct tcf_common *tcf_gact_ht[GACT_TAB_MASK + 1];
-static u32 gact_idx_gen;
-static DEFINE_RWLOCK(gact_lock);
-
-static struct tcf_hashinfo gact_hash_info = {
-	.htab	=	tcf_gact_ht,
-	.hmask	=	GACT_TAB_MASK,
-	.lock	=	&gact_lock,
-};
 
 #ifdef CONFIG_GACT_PROB
 static int gact_net_rand(struct tcf_gact *gact)
 {
-	if (!gact->tcfg_pval || net_random() % gact->tcfg_pval)
+	if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval)
 		return gact->tcf_action;
 	return gact->tcfg_paction;
 }
@@ -58,15 +49,18 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
 	[TCA_GACT_PROB]		= { .len = sizeof(struct tc_gact_p) },
 };
 
-static int tcf_gact_init(struct nlattr *nla, struct nlattr *est,
-			 struct tc_action *a, int ovr, int bind)
+static int tcf_gact_init(struct net *net, struct nlattr *nla,
+			 struct nlattr *est, struct tc_action *a,
+			 int ovr, int bind)
 {
 	struct nlattr *tb[TCA_GACT_MAX + 1];
 	struct tc_gact *parm;
 	struct tcf_gact *gact;
-	struct tcf_common *pc;
 	int ret = 0;
 	int err;
+#ifdef CONFIG_GACT_PROB
+	struct tc_gact_p *p_parm = NULL;
+#endif
 
 	if (nla == NULL)
 		return -EINVAL;
@@ -82,29 +76,33 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est,
 #ifndef CONFIG_GACT_PROB
 	if (tb[TCA_GACT_PROB] != NULL)
 		return -EOPNOTSUPP;
+#else
+	if (tb[TCA_GACT_PROB]) {
+		p_parm = nla_data(tb[TCA_GACT_PROB]);
+		if (p_parm->ptype >= MAX_RAND)
+			return -EINVAL;
+	}
 #endif
 
-	pc = tcf_hash_check(parm->index, a, bind, &gact_hash_info);
-	if (!pc) {
-		pc = tcf_hash_create(parm->index, est, a, sizeof(*gact),
-				     bind, &gact_idx_gen, &gact_hash_info);
-		if (IS_ERR(pc))
-			return PTR_ERR(pc);
+	if (!tcf_hash_check(parm->index, a, bind)) {
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind);
+		if (ret)
+			return ret;
 		ret = ACT_P_CREATED;
 	} else {
-		if (!ovr) {
-			tcf_hash_release(pc, bind, &gact_hash_info);
+		if (bind)/* don't override defaults */
+			return 0;
+		tcf_hash_release(a, bind);
+		if (!ovr)
 			return -EEXIST;
-		}
 	}
 
-	gact = to_gact(pc);
+	gact = to_gact(a);
 
 	spin_lock_bh(&gact->tcf_lock);
 	gact->tcf_action = parm->action;
 #ifdef CONFIG_GACT_PROB
-	if (tb[TCA_GACT_PROB] != NULL) {
-		struct tc_gact_p *p_parm = nla_data(tb[TCA_GACT_PROB]);
+	if (p_parm) {
 		gact->tcfg_paction = p_parm->paction;
 		gact->tcfg_pval    = p_parm->pval;
 		gact->tcfg_ptype   = p_parm->ptype;
@@ -112,19 +110,10 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est,
 #endif
 	spin_unlock_bh(&gact->tcf_lock);
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(pc, &gact_hash_info);
+		tcf_hash_insert(a);
 	return ret;
 }
 
-static int tcf_gact_cleanup(struct tc_action *a, int bind)
-{
-	struct tcf_gact *gact = a->priv;
-
-	if (gact)
-		return tcf_hash_release(&gact->common, bind, &gact_hash_info);
-	return 0;
-}
-
 static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
 		    struct tcf_result *res)
 {
@@ -133,7 +122,7 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
 
 	spin_lock(&gact->tcf_lock);
 #ifdef CONFIG_GACT_PROB
-	if (gact->tcfg_ptype && gact_rand[gact->tcfg_ptype] != NULL)
+	if (gact->tcfg_ptype)
 		action = gact_rand[gact->tcfg_ptype](gact);
 	else
 		action = gact->tcf_action;
@@ -162,7 +151,8 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
 	};
 	struct tcf_t t;
 
-	NLA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt);
+	if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
 #ifdef CONFIG_GACT_PROB
 	if (gact->tcfg_ptype) {
 		struct tc_gact_p p_opt = {
@@ -171,13 +161,15 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
 			.ptype   = gact->tcfg_ptype,
 		};
 
-		NLA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt);
+		if (nla_put(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt))
+			goto nla_put_failure;
 	}
 #endif
 	t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(gact->tcf_tm.expires);
-	NLA_PUT(skb, TCA_GACT_TM, sizeof(t), &t);
+	if (nla_put(skb, TCA_GACT_TM, sizeof(t), &t))
+		goto nla_put_failure;
 	return skb->len;
 
 nla_put_failure:
@@ -187,16 +179,11 @@ nla_put_failure:
 
 static struct tc_action_ops act_gact_ops = {
 	.kind		=	"gact",
-	.hinfo		=	&gact_hash_info,
 	.type		=	TCA_ACT_GACT,
-	.capab		=	TCA_CAP_NONE,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_gact,
 	.dump		=	tcf_gact_dump,
-	.cleanup	=	tcf_gact_cleanup,
-	.lookup		=	tcf_hash_search,
 	.init		=	tcf_gact_init,
-	.walk		=	tcf_generic_walker
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
@@ -210,7 +197,7 @@ static int __init gact_init_module(void)
 #else
 	pr_info("GACT probability NOT on\n");
 #endif
-	return tcf_register_action(&act_gact_ops);
+	return tcf_register_action(&act_gact_ops, GACT_TAB_MASK);
 }
 
 static void __exit gact_cleanup_module(void)
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 60f8f616e8f..8a64a0734ae 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -1,5 +1,5 @@
 /*
- * net/sched/ipt.c	iptables target interface
+ * net/sched/act_ipt.c     iptables target interface
  *
  *TODO: Add other tables. For now we only support the ipv4 table targets
  *
@@ -8,7 +8,7 @@
  *		as published by the Free Software Foundation; either version
  *		2 of the License, or (at your option) any later version.
  *
- * Copyright:	Jamal Hadi Salim (2002-4)
+ * Copyright:	Jamal Hadi Salim (2002-13)
  */
 
 #include <linux/types.h>
@@ -29,15 +29,6 @@
 
 
 #define IPT_TAB_MASK     15
-static struct tcf_common *tcf_ipt_ht[IPT_TAB_MASK + 1];
-static u32 ipt_idx_gen;
-static DEFINE_RWLOCK(ipt_lock);
-
-static struct tcf_hashinfo ipt_hash_info = {
-	.htab	=	tcf_ipt_ht,
-	.hmask	=	IPT_TAB_MASK,
-	.lock	=	&ipt_lock,
-};
 
 static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)
 {
@@ -77,22 +68,12 @@ static void ipt_destroy_target(struct xt_entry_target *t)
 	module_put(par.target->me);
 }
 
-static int tcf_ipt_release(struct tcf_ipt *ipt, int bind)
+static void tcf_ipt_release(struct tc_action *a, int bind)
 {
-	int ret = 0;
-	if (ipt) {
-		if (bind)
-			ipt->tcf_bindcnt--;
-		ipt->tcf_refcnt--;
-		if (ipt->tcf_bindcnt <= 0 && ipt->tcf_refcnt <= 0) {
-			ipt_destroy_target(ipt->tcfi_t);
-			kfree(ipt->tcfi_tname);
-			kfree(ipt->tcfi_t);
-			tcf_hash_destroy(&ipt->common, &ipt_hash_info);
-			ret = ACT_P_DELETED;
-		}
-	}
-	return ret;
+	struct tcf_ipt *ipt = to_ipt(a);
+	ipt_destroy_target(ipt->tcfi_t);
+	kfree(ipt->tcfi_tname);
+	kfree(ipt->tcfi_t);
 }
 
 static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
@@ -102,12 +83,11 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
 	[TCA_IPT_TARG]	= { .len = sizeof(struct xt_entry_target) },
 };
 
-static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
+static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 			struct tc_action *a, int ovr, int bind)
 {
 	struct nlattr *tb[TCA_IPT_MAX + 1];
 	struct tcf_ipt *ipt;
-	struct tcf_common *pc;
 	struct xt_entry_target *td, *t;
 	char *tname;
 	int ret = 0, err;
@@ -133,20 +113,20 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
 	if (tb[TCA_IPT_INDEX] != NULL)
 		index = nla_get_u32(tb[TCA_IPT_INDEX]);
 
-	pc = tcf_hash_check(index, a, bind, &ipt_hash_info);
-	if (!pc) {
-		pc = tcf_hash_create(index, est, a, sizeof(*ipt), bind,
-				     &ipt_idx_gen, &ipt_hash_info);
-		if (IS_ERR(pc))
-			return PTR_ERR(pc);
+	if (!tcf_hash_check(index, a, bind) ) {
+		ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind);
+		if (ret)
+			return ret;
 		ret = ACT_P_CREATED;
 	} else {
-		if (!ovr) {
-			tcf_ipt_release(to_ipt(pc), bind);
+		if (bind)/* don't override defaults */
+			return 0;
+		tcf_hash_release(a, bind);
+
+		if (!ovr)
 			return -EEXIST;
-		}
 	}
-	ipt = to_ipt(pc);
+	ipt = to_ipt(a);
 
 	hook = nla_get_u32(tb[TCA_IPT_HOOK]);
 
@@ -177,7 +157,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
 	ipt->tcfi_hook  = hook;
 	spin_unlock_bh(&ipt->tcf_lock);
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(pc, &ipt_hash_info);
+		tcf_hash_insert(a);
 	return ret;
 
 err3:
@@ -185,16 +165,11 @@ err3:
 err2:
 	kfree(tname);
 err1:
-	kfree(pc);
+	if (ret == ACT_P_CREATED)
+		tcf_hash_cleanup(a, est);
 	return err;
 }
 
-static int tcf_ipt_cleanup(struct tc_action *a, int bind)
-{
-	struct tcf_ipt *ipt = a->priv;
-	return tcf_ipt_release(ipt, bind);
-}
-
 static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
 		   struct tcf_result *res)
 {
@@ -202,10 +177,8 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
 	struct tcf_ipt *ipt = a->priv;
 	struct xt_action_param par;
 
-	if (skb_cloned(skb)) {
-		if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
-			return TC_ACT_UNSPEC;
-	}
+	if (skb_unclone(skb, GFP_ATOMIC))
+		return TC_ACT_UNSPEC;
 
 	spin_lock(&ipt->tcf_lock);
 
@@ -235,9 +208,8 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
 		result = TC_ACT_PIPE;
 		break;
 	default:
-		if (net_ratelimit())
-			pr_notice("tc filter: Bogus netfilter code"
-				  " %d assume ACCEPT\n", ret);
+		net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n",
+				       ret);
 		result = TC_POLICE_OK;
 		break;
 	}
@@ -267,15 +239,17 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
 	c.refcnt = ipt->tcf_refcnt - ref;
 	strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name);
 
-	NLA_PUT(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t);
-	NLA_PUT_U32(skb, TCA_IPT_INDEX, ipt->tcf_index);
-	NLA_PUT_U32(skb, TCA_IPT_HOOK, ipt->tcfi_hook);
-	NLA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c);
-	NLA_PUT_STRING(skb, TCA_IPT_TABLE, ipt->tcfi_tname);
+	if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) ||
+	    nla_put_u32(skb, TCA_IPT_INDEX, ipt->tcf_index) ||
+	    nla_put_u32(skb, TCA_IPT_HOOK, ipt->tcfi_hook) ||
+	    nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) ||
+	    nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname))
+		goto nla_put_failure;
 	tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install);
 	tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse);
 	tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires);
-	NLA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm);
+	if (nla_put(skb, TCA_IPT_TM, sizeof (tm), &tm))
+		goto nla_put_failure;
 	kfree(t);
 	return skb->len;
 
@@ -287,29 +261,49 @@ nla_put_failure:
 
 static struct tc_action_ops act_ipt_ops = {
 	.kind		=	"ipt",
-	.hinfo		=	&ipt_hash_info,
 	.type		=	TCA_ACT_IPT,
-	.capab		=	TCA_CAP_NONE,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_ipt,
 	.dump		=	tcf_ipt_dump,
-	.cleanup	=	tcf_ipt_cleanup,
-	.lookup		=	tcf_hash_search,
+	.cleanup	=	tcf_ipt_release,
+	.init		=	tcf_ipt_init,
+};
+
+static struct tc_action_ops act_xt_ops = {
+	.kind		=	"xt",
+	.type		=	TCA_ACT_XT,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_ipt,
+	.dump		=	tcf_ipt_dump,
+	.cleanup	=	tcf_ipt_release,
 	.init		=	tcf_ipt_init,
-	.walk		=	tcf_generic_walker
 };
 
-MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
+MODULE_AUTHOR("Jamal Hadi Salim(2002-13)");
 MODULE_DESCRIPTION("Iptables target actions");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("act_xt");
 
 static int __init ipt_init_module(void)
 {
-	return tcf_register_action(&act_ipt_ops);
+	int ret1, ret2;
+
+	ret1 = tcf_register_action(&act_xt_ops, IPT_TAB_MASK);
+	if (ret1 < 0)
+		printk("Failed to load xt action\n");
+	ret2 = tcf_register_action(&act_ipt_ops, IPT_TAB_MASK);
+	if (ret2 < 0)
+		printk("Failed to load ipt action\n");
+
+	if (ret1 < 0 && ret2 < 0) {
+		return ret1;
+	} else
+		return 0;
 }
 
 static void __exit ipt_cleanup_module(void)
 {
+	tcf_unregister_action(&act_xt_ops);
 	tcf_unregister_action(&act_ipt_ops);
 }
 
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index e051398fdf6..4f912c0e225 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -30,45 +30,27 @@
 #include <linux/if_arp.h>
 
 #define MIRRED_TAB_MASK     7
-static struct tcf_common *tcf_mirred_ht[MIRRED_TAB_MASK + 1];
-static u32 mirred_idx_gen;
-static DEFINE_RWLOCK(mirred_lock);
 static LIST_HEAD(mirred_list);
 
-static struct tcf_hashinfo mirred_hash_info = {
-	.htab	=	tcf_mirred_ht,
-	.hmask	=	MIRRED_TAB_MASK,
-	.lock	=	&mirred_lock,
-};
-
-static int tcf_mirred_release(struct tcf_mirred *m, int bind)
+static void tcf_mirred_release(struct tc_action *a, int bind)
 {
-	if (m) {
-		if (bind)
-			m->tcf_bindcnt--;
-		m->tcf_refcnt--;
-		if (!m->tcf_bindcnt && m->tcf_refcnt <= 0) {
-			list_del(&m->tcfm_list);
-			if (m->tcfm_dev)
-				dev_put(m->tcfm_dev);
-			tcf_hash_destroy(&m->common, &mirred_hash_info);
-			return 1;
-		}
-	}
-	return 0;
+	struct tcf_mirred *m = to_mirred(a);
+	list_del(&m->tcfm_list);
+	if (m->tcfm_dev)
+		dev_put(m->tcfm_dev);
 }
 
 static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
 	[TCA_MIRRED_PARMS]	= { .len = sizeof(struct tc_mirred) },
 };
 
-static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est,
-			   struct tc_action *a, int ovr, int bind)
+static int tcf_mirred_init(struct net *net, struct nlattr *nla,
+			   struct nlattr *est, struct tc_action *a, int ovr,
+			   int bind)
 {
 	struct nlattr *tb[TCA_MIRRED_MAX + 1];
 	struct tc_mirred *parm;
 	struct tcf_mirred *m;
-	struct tcf_common *pc;
 	struct net_device *dev;
 	int ret, ok_push = 0;
 
@@ -88,7 +70,7 @@ static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est,
 		return -EINVAL;
 	}
 	if (parm->ifindex) {
-		dev = __dev_get_by_index(&init_net, parm->ifindex);
+		dev = __dev_get_by_index(net, parm->ifindex);
 		if (dev == NULL)
 			return -ENODEV;
 		switch (dev->type) {
@@ -108,22 +90,20 @@ static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est,
 		dev = NULL;
 	}
 
-	pc = tcf_hash_check(parm->index, a, bind, &mirred_hash_info);
-	if (!pc) {
+	if (!tcf_hash_check(parm->index, a, bind)) {
 		if (dev == NULL)
 			return -EINVAL;
-		pc = tcf_hash_create(parm->index, est, a, sizeof(*m), bind,
-				     &mirred_idx_gen, &mirred_hash_info);
-		if (IS_ERR(pc))
-			return PTR_ERR(pc);
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind);
+		if (ret)
+			return ret;
 		ret = ACT_P_CREATED;
 	} else {
 		if (!ovr) {
-			tcf_mirred_release(to_mirred(pc), bind);
+			tcf_hash_release(a, bind);
 			return -EEXIST;
 		}
 	}
-	m = to_mirred(pc);
+	m = to_mirred(a);
 
 	spin_lock_bh(&m->tcf_lock);
 	m->tcf_action = parm->action;
@@ -139,21 +119,12 @@ static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est,
 	spin_unlock_bh(&m->tcf_lock);
 	if (ret == ACT_P_CREATED) {
 		list_add(&m->tcfm_list, &mirred_list);
-		tcf_hash_insert(pc, &mirred_hash_info);
+		tcf_hash_insert(a);
 	}
 
 	return ret;
 }
 
-static int tcf_mirred_cleanup(struct tc_action *a, int bind)
-{
-	struct tcf_mirred *m = a->priv;
-
-	if (m)
-		return tcf_mirred_release(m, bind);
-	return 0;
-}
-
 static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 		      struct tcf_result *res)
 {
@@ -174,9 +145,8 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 	}
 
 	if (!(dev->flags & IFF_UP)) {
-		if (net_ratelimit())
-			pr_notice("tc mirred to Houston: device %s is down\n",
-				  dev->name);
+		net_notice_ratelimited("tc mirred to Houston: device %s is down\n",
+				       dev->name);
 		goto out;
 	}
 
@@ -201,13 +171,12 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 out:
 	if (err) {
 		m->tcf_qstats.overlimits++;
-		/* should we be asking for packet to be dropped?
-		 * may make sense for redirect case only
-		 */
-		retval = TC_ACT_SHOT;
-	} else {
+		if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
+			retval = TC_ACT_SHOT;
+		else
+			retval = m->tcf_action;
+	} else
 		retval = m->tcf_action;
-	}
 	spin_unlock(&m->tcf_lock);
 
 	return retval;
@@ -227,11 +196,13 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i
 	};
 	struct tcf_t t;
 
-	NLA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt);
+	if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
 	t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(m->tcf_tm.expires);
-	NLA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t);
+	if (nla_put(skb, TCA_MIRRED_TM, sizeof(t), &t))
+		goto nla_put_failure;
 	return skb->len;
 
 nla_put_failure:
@@ -242,7 +213,7 @@ nla_put_failure:
 static int mirred_device_event(struct notifier_block *unused,
 			       unsigned long event, void *ptr)
 {
-	struct net_device *dev = ptr;
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct tcf_mirred *m;
 
 	if (event == NETDEV_UNREGISTER)
@@ -260,19 +231,14 @@ static struct notifier_block mirred_device_notifier = {
 	.notifier_call = mirred_device_event,
 };
 
-
 static struct tc_action_ops act_mirred_ops = {
 	.kind		=	"mirred",
-	.hinfo		=	&mirred_hash_info,
 	.type		=	TCA_ACT_MIRRED,
-	.capab		=	TCA_CAP_NONE,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_mirred,
 	.dump		=	tcf_mirred_dump,
-	.cleanup	=	tcf_mirred_cleanup,
-	.lookup		=	tcf_hash_search,
+	.cleanup	=	tcf_mirred_release,
 	.init		=	tcf_mirred_init,
-	.walk		=	tcf_generic_walker
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002)");
@@ -286,13 +252,13 @@ static int __init mirred_init_module(void)
 		return err;
 
 	pr_info("Mirror/redirect action on\n");
-	return tcf_register_action(&act_mirred_ops);
+	return tcf_register_action(&act_mirred_ops, MIRRED_TAB_MASK);
 }
 
 static void __exit mirred_cleanup_module(void)
 {
-	unregister_netdevice_notifier(&mirred_device_notifier);
 	tcf_unregister_action(&act_mirred_ops);
+	unregister_netdevice_notifier(&mirred_device_notifier);
 }
 
 module_init(mirred_init_module);
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 001d1b35486..270a030d5fd 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -30,28 +30,18 @@
 
 
 #define NAT_TAB_MASK	15
-static struct tcf_common *tcf_nat_ht[NAT_TAB_MASK + 1];
-static u32 nat_idx_gen;
-static DEFINE_RWLOCK(nat_lock);
-
-static struct tcf_hashinfo nat_hash_info = {
-	.htab	=	tcf_nat_ht,
-	.hmask	=	NAT_TAB_MASK,
-	.lock	=	&nat_lock,
-};
 
 static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
 	[TCA_NAT_PARMS]	= { .len = sizeof(struct tc_nat) },
 };
 
-static int tcf_nat_init(struct nlattr *nla, struct nlattr *est,
+static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 			struct tc_action *a, int ovr, int bind)
 {
 	struct nlattr *tb[TCA_NAT_MAX + 1];
 	struct tc_nat *parm;
 	int ret = 0, err;
 	struct tcf_nat *p;
-	struct tcf_common *pc;
 
 	if (nla == NULL)
 		return -EINVAL;
@@ -64,21 +54,19 @@ static int tcf_nat_init(struct nlattr *nla, struct nlattr *est,
 		return -EINVAL;
 	parm = nla_data(tb[TCA_NAT_PARMS]);
 
-	pc = tcf_hash_check(parm->index, a, bind, &nat_hash_info);
-	if (!pc) {
-		pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
-				     &nat_idx_gen, &nat_hash_info);
-		if (IS_ERR(pc))
-			return PTR_ERR(pc);
-		p = to_tcf_nat(pc);
+	if (!tcf_hash_check(parm->index, a, bind)) {
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+		if (ret)
+			return ret;
 		ret = ACT_P_CREATED;
 	} else {
-		p = to_tcf_nat(pc);
-		if (!ovr) {
-			tcf_hash_release(pc, bind, &nat_hash_info);
+		if (bind)
+			return 0;
+		tcf_hash_release(a, bind);
+		if (!ovr)
 			return -EEXIST;
-		}
 	}
+	p = to_tcf_nat(a);
 
 	spin_lock_bh(&p->tcf_lock);
 	p->old_addr = parm->old_addr;
@@ -90,18 +78,11 @@ static int tcf_nat_init(struct nlattr *nla, struct nlattr *est,
 	spin_unlock_bh(&p->tcf_lock);
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(pc, &nat_hash_info);
+		tcf_hash_insert(a);
 
 	return ret;
 }
 
-static int tcf_nat_cleanup(struct tc_action *a, int bind)
-{
-	struct tcf_nat *p = a->priv;
-
-	return tcf_hash_release(&p->common, bind, &nat_hash_info);
-}
-
 static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
 		   struct tcf_result *res)
 {
@@ -284,11 +265,13 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
 	};
 	struct tcf_t t;
 
-	NLA_PUT(skb, TCA_NAT_PARMS, sizeof(opt), &opt);
+	if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
 	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
-	NLA_PUT(skb, TCA_NAT_TM, sizeof(t), &t);
+	if (nla_put(skb, TCA_NAT_TM, sizeof(t), &t))
+		goto nla_put_failure;
 
 	return skb->len;
 
@@ -299,16 +282,11 @@ nla_put_failure:
 
 static struct tc_action_ops act_nat_ops = {
 	.kind		=	"nat",
-	.hinfo		=	&nat_hash_info,
 	.type		=	TCA_ACT_NAT,
-	.capab		=	TCA_CAP_NONE,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_nat,
 	.dump		=	tcf_nat_dump,
-	.cleanup	=	tcf_nat_cleanup,
-	.lookup		=	tcf_hash_search,
 	.init		=	tcf_nat_init,
-	.walk		=	tcf_generic_walker
 };
 
 MODULE_DESCRIPTION("Stateless NAT actions");
@@ -316,7 +294,7 @@ MODULE_LICENSE("GPL");
 
 static int __init nat_init_module(void)
 {
-	return tcf_register_action(&act_nat_ops);
+	return tcf_register_action(&act_nat_ops, NAT_TAB_MASK);
 }
 
 static void __exit nat_cleanup_module(void)
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 10d3aed8656..5f9bcb2e080 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -24,28 +24,19 @@
 #include <net/tc_act/tc_pedit.h>
 
 #define PEDIT_TAB_MASK	15
-static struct tcf_common *tcf_pedit_ht[PEDIT_TAB_MASK + 1];
-static u32 pedit_idx_gen;
-static DEFINE_RWLOCK(pedit_lock);
-
-static struct tcf_hashinfo pedit_hash_info = {
-	.htab	=	tcf_pedit_ht,
-	.hmask	=	PEDIT_TAB_MASK,
-	.lock	=	&pedit_lock,
-};
 
 static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
 	[TCA_PEDIT_PARMS]	= { .len = sizeof(struct tc_pedit) },
 };
 
-static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est,
-			  struct tc_action *a, int ovr, int bind)
+static int tcf_pedit_init(struct net *net, struct nlattr *nla,
+			  struct nlattr *est, struct tc_action *a,
+			  int ovr, int bind)
 {
 	struct nlattr *tb[TCA_PEDIT_MAX + 1];
 	struct tc_pedit *parm;
 	int ret = 0, err;
 	struct tcf_pedit *p;
-	struct tcf_common *pc;
 	struct tc_pedit_key *keys = NULL;
 	int ksize;
 
@@ -63,27 +54,27 @@ static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est,
 	if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize)
 		return -EINVAL;
 
-	pc = tcf_hash_check(parm->index, a, bind, &pedit_hash_info);
-	if (!pc) {
+	if (!tcf_hash_check(parm->index, a, bind)) {
 		if (!parm->nkeys)
 			return -EINVAL;
-		pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
-				     &pedit_idx_gen, &pedit_hash_info);
-		if (IS_ERR(pc))
-			return PTR_ERR(pc);
-		p = to_pedit(pc);
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+		if (ret)
+			return ret;
+		p = to_pedit(a);
 		keys = kmalloc(ksize, GFP_KERNEL);
 		if (keys == NULL) {
-			kfree(pc);
+			tcf_hash_cleanup(a, est);
 			return -ENOMEM;
 		}
 		ret = ACT_P_CREATED;
 	} else {
-		p = to_pedit(pc);
-		if (!ovr) {
-			tcf_hash_release(pc, bind, &pedit_hash_info);
+		p = to_pedit(a);
+		if (bind)
+			return 0;
+		tcf_hash_release(a, bind);	/* replace path only */
+		if (!ovr)
 			return -EEXIST;
-		}
+
 		if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
 			keys = kmalloc(ksize, GFP_KERNEL);
 			if (keys == NULL)
@@ -102,22 +93,15 @@ static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est,
 	memcpy(p->tcfp_keys, parm->keys, ksize);
 	spin_unlock_bh(&p->tcf_lock);
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(pc, &pedit_hash_info);
+		tcf_hash_insert(a);
 	return ret;
 }
 
-static int tcf_pedit_cleanup(struct tc_action *a, int bind)
+static void tcf_pedit_cleanup(struct tc_action *a, int bind)
 {
 	struct tcf_pedit *p = a->priv;
-
-	if (p) {
-		struct tc_pedit_key *keys = p->tcfp_keys;
-		if (tcf_hash_release(&p->common, bind, &pedit_hash_info)) {
-			kfree(keys);
-			return 1;
-		}
-	}
-	return 0;
+	struct tc_pedit_key *keys = p->tcfp_keys;
+	kfree(keys);
 }
 
 static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
@@ -127,8 +111,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 	int i, munged = 0;
 	unsigned int off;
 
-	if (skb_cloned(skb) &&
-	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+	if (skb_unclone(skb, GFP_ATOMIC))
 		return p->tcf_action;
 
 	off = skb_network_offset(skb);
@@ -215,11 +198,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
 	opt->refcnt = p->tcf_refcnt - ref;
 	opt->bindcnt = p->tcf_bindcnt - bind;
 
-	NLA_PUT(skb, TCA_PEDIT_PARMS, s, opt);
+	if (nla_put(skb, TCA_PEDIT_PARMS, s, opt))
+		goto nla_put_failure;
 	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
-	NLA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t);
+	if (nla_put(skb, TCA_PEDIT_TM, sizeof(t), &t))
+		goto nla_put_failure;
 	kfree(opt);
 	return skb->len;
 
@@ -231,16 +216,12 @@ nla_put_failure:
 
 static struct tc_action_ops act_pedit_ops = {
 	.kind		=	"pedit",
-	.hinfo		=	&pedit_hash_info,
 	.type		=	TCA_ACT_PEDIT,
-	.capab		=	TCA_CAP_NONE,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_pedit,
 	.dump		=	tcf_pedit_dump,
 	.cleanup	=	tcf_pedit_cleanup,
-	.lookup		=	tcf_hash_search,
 	.init		=	tcf_pedit_init,
-	.walk		=	tcf_generic_walker
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
@@ -249,7 +230,7 @@ MODULE_LICENSE("GPL");
 
 static int __init pedit_init_module(void)
 {
-	return tcf_register_action(&act_pedit_ops);
+	return tcf_register_action(&act_pedit_ops, PEDIT_TAB_MASK);
 }
 
 static void __exit pedit_cleanup_module(void)
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 6fb3f5af0f8..0566e4606a4 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -22,19 +22,25 @@
 #include <net/act_api.h>
 #include <net/netlink.h>
 
-#define L2T(p, L)   qdisc_l2t((p)->tcfp_R_tab, L)
-#define L2T_P(p, L) qdisc_l2t((p)->tcfp_P_tab, L)
+struct tcf_police {	/* policer action state; token values in ns */
+	struct tcf_common	common;
+	int			tcfp_result;	/* set by TCA_POLICE_RESULT */
+	u32			tcfp_ewma_rate;	/* dumped: TCA_POLICE_AVRATE */
+	s64			tcfp_burst;	/* burst, in ns */
+	u32			tcfp_mtu;	/* parm->mtu; defaulted if 0 */
+	s64			tcfp_toks;	/* rate tokens, in ns */
+	s64			tcfp_ptoks;	/* peak tokens, in ns */
+	s64			tcfp_mtu_ptoks;	/* cap on tcfp_ptoks, in ns */
+	s64			tcfp_t_c;	/* ns time of last update */
+	struct psched_ratecfg	rate;	/* precomputed from R_tab->rate */
+	bool			rate_present;	/* "rate" is configured */
+	struct psched_ratecfg	peak;	/* precomputed from P_tab->rate */
+	bool			peak_present;	/* "peak" is configured */
+};
+#define to_police(pc)	\
+	container_of(pc, struct tcf_police, common)
 
 #define POL_TAB_MASK     15
-static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1];
-static u32 police_idx_gen;
-static DEFINE_RWLOCK(police_lock);
-
-static struct tcf_hashinfo police_hash_info = {
-	.htab	=	tcf_police_ht,
-	.hmask	=	POL_TAB_MASK,
-	.lock	=	&police_lock,
-};
 
 /* old policer structure from before tc actions */
 struct tc_police_compat {
@@ -52,18 +58,20 @@ struct tc_police_compat {
 static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb,
 			      int type, struct tc_action *a)
 {
+	struct tcf_hashinfo *hinfo = a->ops->hinfo;
+	struct hlist_head *head;
 	struct tcf_common *p;
 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
 	struct nlattr *nest;
 
-	read_lock_bh(&police_lock);
+	spin_lock_bh(&hinfo->lock);
 
 	s_i = cb->args[0];
 
 	for (i = 0; i < (POL_TAB_MASK + 1); i++) {
-		p = tcf_police_ht[tcf_hash(i, POL_TAB_MASK)];
+		head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)];
 
-		for (; p; p = p->tcfc_next) {
+		hlist_for_each_entry_rcu(p, head, tcfc_head) {
 			index++;
 			if (index < s_i)
 				continue;
@@ -86,7 +94,7 @@ static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *c
 		}
 	}
 done:
-	read_unlock_bh(&police_lock);
+	spin_unlock_bh(&hinfo->lock);
 	if (n_i)
 		cb->args[0] += n_i;
 	return n_i;
@@ -96,33 +104,6 @@ nla_put_failure:
 	goto done;
 }
 
-static void tcf_police_destroy(struct tcf_police *p)
-{
-	unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK);
-	struct tcf_common **p1p;
-
-	for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->tcfc_next) {
-		if (*p1p == &p->common) {
-			write_lock_bh(&police_lock);
-			*p1p = p->tcf_next;
-			write_unlock_bh(&police_lock);
-			gen_kill_estimator(&p->tcf_bstats,
-					   &p->tcf_rate_est);
-			if (p->tcfp_R_tab)
-				qdisc_put_rtab(p->tcfp_R_tab);
-			if (p->tcfp_P_tab)
-				qdisc_put_rtab(p->tcfp_P_tab);
-			/*
-			 * gen_estimator est_timer() might access p->tcf_lock
-			 * or bstats, wait a RCU grace period before freeing p
-			 */
-			kfree_rcu(p, tcf_rcu);
-			return;
-		}
-	}
-	WARN_ON(1);
-}
-
 static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
 	[TCA_POLICE_RATE]	= { .len = TC_RTAB_SIZE },
 	[TCA_POLICE_PEAKRATE]	= { .len = TC_RTAB_SIZE },
@@ -130,8 +111,9 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
 	[TCA_POLICE_RESULT]	= { .type = NLA_U32 },
 };
 
-static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est,
-				 struct tc_action *a, int ovr, int bind)
+static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
+				 struct nlattr *est, struct tc_action *a,
+				 int ovr, int bind)
 {
 	unsigned int h;
 	int ret = 0, err;
@@ -139,6 +121,7 @@ static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est,
 	struct tc_police *parm;
 	struct tcf_police *police;
 	struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
+	struct tcf_hashinfo *hinfo = a->ops->hinfo;
 	int size;
 
 	if (nla == NULL)
@@ -156,19 +139,17 @@ static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est,
 	parm = nla_data(tb[TCA_POLICE_TBF]);
 
 	if (parm->index) {
-		struct tcf_common *pc;
-
-		pc = tcf_hash_lookup(parm->index, &police_hash_info);
-		if (pc != NULL) {
-			a->priv = pc;
-			police = to_police(pc);
+		if (tcf_hash_search(a, parm->index)) {
+			police = to_police(a->priv);
 			if (bind) {
 				police->tcf_bindcnt += 1;
 				police->tcf_refcnt += 1;
+				return 0;
 			}
 			if (ovr)
 				goto override;
-			return ret;
+			/* not replacing */
+			return -EEXIST;
 		}
 	}
 
@@ -211,26 +192,36 @@ override:
 	}
 
 	/* No failure allowed after this point */
-	if (R_tab != NULL) {
-		qdisc_put_rtab(police->tcfp_R_tab);
-		police->tcfp_R_tab = R_tab;
+	police->tcfp_mtu = parm->mtu;
+	if (police->tcfp_mtu == 0) {
+		police->tcfp_mtu = ~0;
+		if (R_tab)
+			police->tcfp_mtu = 255 << R_tab->rate.cell_log;
 	}
-	if (P_tab != NULL) {
-		qdisc_put_rtab(police->tcfp_P_tab);
-		police->tcfp_P_tab = P_tab;
+	if (R_tab) {
+		police->rate_present = true;
+		psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0);
+		qdisc_put_rtab(R_tab);
+	} else {
+		police->rate_present = false;
+	}
+	if (P_tab) {
+		police->peak_present = true;
+		psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0);
+		qdisc_put_rtab(P_tab);
+	} else {
+		police->peak_present = false;
 	}
 
 	if (tb[TCA_POLICE_RESULT])
 		police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]);
-	police->tcfp_toks = police->tcfp_burst = parm->burst;
-	police->tcfp_mtu = parm->mtu;
-	if (police->tcfp_mtu == 0) {
-		police->tcfp_mtu = ~0;
-		if (police->tcfp_R_tab)
-			police->tcfp_mtu = 255<<police->tcfp_R_tab->rate.cell_log;
+	police->tcfp_burst = PSCHED_TICKS2NS(parm->burst);
+	police->tcfp_toks = police->tcfp_burst;
+	if (police->peak_present) {
+		police->tcfp_mtu_ptoks = (s64) psched_l2t_ns(&police->peak,
+							     police->tcfp_mtu);
+		police->tcfp_ptoks = police->tcfp_mtu_ptoks;
 	}
-	if (police->tcfp_P_tab)
-		police->tcfp_ptoks = L2T_P(police, police->tcfp_mtu);
 	police->tcf_action = parm->action;
 
 	if (tb[TCA_POLICE_AVRATE])
@@ -240,14 +231,13 @@ override:
 	if (ret != ACT_P_CREATED)
 		return ret;
 
-	police->tcfp_t_c = psched_get_time();
+	police->tcfp_t_c = ktime_to_ns(ktime_get());
 	police->tcf_index = parm->index ? parm->index :
-		tcf_hash_new_index(&police_idx_gen, &police_hash_info);
+		tcf_hash_new_index(hinfo);
 	h = tcf_hash(police->tcf_index, POL_TAB_MASK);
-	write_lock_bh(&police_lock);
-	police->tcf_next = tcf_police_ht[h];
-	tcf_police_ht[h] = &police->common;
-	write_unlock_bh(&police_lock);
+	spin_lock_bh(&hinfo->lock);
+	hlist_add_head(&police->tcf_head, &hinfo->htab[h]);
+	spin_unlock_bh(&hinfo->lock);
 
 	a->priv = police;
 	return ret;
@@ -255,40 +245,20 @@ override:
 failure_unlock:
 	spin_unlock_bh(&police->tcf_lock);
 failure:
-	if (P_tab)
-		qdisc_put_rtab(P_tab);
-	if (R_tab)
-		qdisc_put_rtab(R_tab);
+	qdisc_put_rtab(P_tab);
+	qdisc_put_rtab(R_tab);
 	if (ret == ACT_P_CREATED)
 		kfree(police);
 	return err;
 }
 
-static int tcf_act_police_cleanup(struct tc_action *a, int bind)
-{
-	struct tcf_police *p = a->priv;
-	int ret = 0;
-
-	if (p != NULL) {
-		if (bind)
-			p->tcf_bindcnt--;
-
-		p->tcf_refcnt--;
-		if (p->tcf_refcnt <= 0 && !p->tcf_bindcnt) {
-			tcf_police_destroy(p);
-			ret = 1;
-		}
-	}
-	return ret;
-}
-
 static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
 			  struct tcf_result *res)
 {
 	struct tcf_police *police = a->priv;
-	psched_time_t now;
-	long toks;
-	long ptoks = 0;
+	s64 now;
+	s64 toks;
+	s64 ptoks = 0;
 
 	spin_lock(&police->tcf_lock);
 
@@ -304,24 +274,25 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
 	}
 
 	if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
-		if (police->tcfp_R_tab == NULL) {
+		if (!police->rate_present) {
 			spin_unlock(&police->tcf_lock);
 			return police->tcfp_result;
 		}
 
-		now = psched_get_time();
-		toks = psched_tdiff_bounded(now, police->tcfp_t_c,
-					    police->tcfp_burst);
-		if (police->tcfp_P_tab) {
+		now = ktime_to_ns(ktime_get());
+		toks = min_t(s64, now - police->tcfp_t_c,
+			     police->tcfp_burst);
+		if (police->peak_present) {
 			ptoks = toks + police->tcfp_ptoks;
-			if (ptoks > (long)L2T_P(police, police->tcfp_mtu))
-				ptoks = (long)L2T_P(police, police->tcfp_mtu);
-			ptoks -= L2T_P(police, qdisc_pkt_len(skb));
+			if (ptoks > police->tcfp_mtu_ptoks)
+				ptoks = police->tcfp_mtu_ptoks;
+			ptoks -= (s64) psched_l2t_ns(&police->peak,
+						     qdisc_pkt_len(skb));
 		}
 		toks += police->tcfp_toks;
-		if (toks > (long)police->tcfp_burst)
+		if (toks > police->tcfp_burst)
 			toks = police->tcfp_burst;
-		toks -= L2T(police, qdisc_pkt_len(skb));
+		toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb));
 		if ((toks|ptoks) >= 0) {
 			police->tcfp_t_c = now;
 			police->tcfp_toks = toks;
@@ -347,20 +318,23 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 		.index = police->tcf_index,
 		.action = police->tcf_action,
 		.mtu = police->tcfp_mtu,
-		.burst = police->tcfp_burst,
+		.burst = PSCHED_NS2TICKS(police->tcfp_burst),
 		.refcnt = police->tcf_refcnt - ref,
 		.bindcnt = police->tcf_bindcnt - bind,
 	};
 
-	if (police->tcfp_R_tab)
-		opt.rate = police->tcfp_R_tab->rate;
-	if (police->tcfp_P_tab)
-		opt.peakrate = police->tcfp_P_tab->rate;
-	NLA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);
-	if (police->tcfp_result)
-		NLA_PUT_U32(skb, TCA_POLICE_RESULT, police->tcfp_result);
-	if (police->tcfp_ewma_rate)
-		NLA_PUT_U32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate);
+	if (police->rate_present)
+		psched_ratecfg_getrate(&opt.rate, &police->rate);
+	if (police->peak_present)
+		psched_ratecfg_getrate(&opt.peakrate, &police->peak);
+	if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
+		goto nla_put_failure;
+	if (police->tcfp_result &&
+	    nla_put_u32(skb, TCA_POLICE_RESULT, police->tcfp_result))
+		goto nla_put_failure;
+	if (police->tcfp_ewma_rate &&
+	    nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate))
+		goto nla_put_failure;
 	return skb->len;
 
 nla_put_failure:
@@ -374,14 +348,10 @@ MODULE_LICENSE("GPL");
 
 static struct tc_action_ops act_police_ops = {
 	.kind		=	"police",
-	.hinfo		=	&police_hash_info,
 	.type		=	TCA_ID_POLICE,
-	.capab		=	TCA_CAP_NONE,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_act_police,
 	.dump		=	tcf_act_police_dump,
-	.cleanup	=	tcf_act_police_cleanup,
-	.lookup		=	tcf_hash_search,
 	.init		=	tcf_act_police_locate,
 	.walk		=	tcf_act_police_walker
 };
@@ -389,7 +359,7 @@ static struct tc_action_ops act_police_ops = {
 static int __init
 police_init_module(void)
 {
-	return tcf_register_action(&act_police_ops);
+	return tcf_register_action(&act_police_ops, POL_TAB_MASK);
 }
 
 static void __exit
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 73e0a3ab4d5..992c2317ce8 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -25,15 +25,6 @@
 #include <net/tc_act/tc_defact.h>
 
 #define SIMP_TAB_MASK     7
-static struct tcf_common *tcf_simp_ht[SIMP_TAB_MASK + 1];
-static u32 simp_idx_gen;
-static DEFINE_RWLOCK(simp_lock);
-
-static struct tcf_hashinfo simp_hash_info = {
-	.htab	=	tcf_simp_ht,
-	.hmask	=	SIMP_TAB_MASK,
-	.lock	=	&simp_lock,
-};
 
 #define SIMP_MAX_DATA	32
 static int tcf_simp(struct sk_buff *skb, const struct tc_action *a,
@@ -55,20 +46,10 @@ static int tcf_simp(struct sk_buff *skb, const struct tc_action *a,
 	return d->tcf_action;
 }
 
-static int tcf_simp_release(struct tcf_defact *d, int bind)
+static void tcf_simp_release(struct tc_action *a, int bind)
 {
-	int ret = 0;
-	if (d) {
-		if (bind)
-			d->tcf_bindcnt--;
-		d->tcf_refcnt--;
-		if (d->tcf_bindcnt <= 0 && d->tcf_refcnt <= 0) {
-			kfree(d->tcfd_defdata);
-			tcf_hash_destroy(&d->common, &simp_hash_info);
-			ret = 1;
-		}
-	}
-	return ret;
+	struct tcf_defact *d = to_defact(a);
+	kfree(d->tcfd_defdata);
 }
 
 static int alloc_defdata(struct tcf_defact *d, char *defdata)
@@ -95,13 +76,13 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
 	[TCA_DEF_DATA]	= { .type = NLA_STRING, .len = SIMP_MAX_DATA },
 };
 
-static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,
-			 struct tc_action *a, int ovr, int bind)
+static int tcf_simp_init(struct net *net, struct nlattr *nla,
+			 struct nlattr *est, struct tc_action *a,
+			 int ovr, int bind)
 {
 	struct nlattr *tb[TCA_DEF_MAX + 1];
 	struct tc_defact *parm;
 	struct tcf_defact *d;
-	struct tcf_common *pc;
 	char *defdata;
 	int ret = 0, err;
 
@@ -121,44 +102,36 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,
 	parm = nla_data(tb[TCA_DEF_PARMS]);
 	defdata = nla_data(tb[TCA_DEF_DATA]);
 
-	pc = tcf_hash_check(parm->index, a, bind, &simp_hash_info);
-	if (!pc) {
-		pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind,
-				     &simp_idx_gen, &simp_hash_info);
-		if (IS_ERR(pc))
-			return PTR_ERR(pc);
+	if (!tcf_hash_check(parm->index, a, bind)) {
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
+		if (ret)
+			return ret;
 
-		d = to_defact(pc);
+		d = to_defact(a);
 		ret = alloc_defdata(d, defdata);
 		if (ret < 0) {
-			kfree(pc);
+			tcf_hash_cleanup(a, est);
 			return ret;
 		}
 		d->tcf_action = parm->action;
 		ret = ACT_P_CREATED;
 	} else {
-		d = to_defact(pc);
-		if (!ovr) {
-			tcf_simp_release(d, bind);
+		d = to_defact(a);
+
+		if (bind)
+			return 0;
+		tcf_hash_release(a, bind);
+		if (!ovr)
 			return -EEXIST;
-		}
+
 		reset_policy(d, defdata, parm);
 	}
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(pc, &simp_hash_info);
+		tcf_hash_insert(a);
 	return ret;
 }
 
-static int tcf_simp_cleanup(struct tc_action *a, int bind)
-{
-	struct tcf_defact *d = a->priv;
-
-	if (d)
-		return tcf_simp_release(d, bind);
-	return 0;
-}
-
 static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
 			 int bind, int ref)
 {
@@ -172,12 +145,14 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
 	};
 	struct tcf_t t;
 
-	NLA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt);
-	NLA_PUT_STRING(skb, TCA_DEF_DATA, d->tcfd_defdata);
+	if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) ||
+	    nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata))
+		goto nla_put_failure;
 	t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
-	NLA_PUT(skb, TCA_DEF_TM, sizeof(t), &t);
+	if (nla_put(skb, TCA_DEF_TM, sizeof(t), &t))
+		goto nla_put_failure;
 	return skb->len;
 
 nla_put_failure:
@@ -187,15 +162,12 @@ nla_put_failure:
 
 static struct tc_action_ops act_simp_ops = {
 	.kind		=	"simple",
-	.hinfo		=	&simp_hash_info,
 	.type		=	TCA_ACT_SIMP,
-	.capab		=	TCA_CAP_NONE,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_simp,
 	.dump		=	tcf_simp_dump,
-	.cleanup	=	tcf_simp_cleanup,
+	.cleanup	=	tcf_simp_release,
 	.init		=	tcf_simp_init,
-	.walk		=	tcf_generic_walker,
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2005)");
@@ -204,7 +176,8 @@ MODULE_LICENSE("GPL");
 
 static int __init simp_init_module(void)
 {
-	int ret = tcf_register_action(&act_simp_ops);
+	int ret;
+	ret = tcf_register_action(&act_simp_ops, SIMP_TAB_MASK);
 	if (!ret)
 		pr_info("Simple TC action Loaded\n");
 	return ret;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 35dbbe91027..fcfeeaf838b 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -11,8 +11,7 @@
  * more details.
  *
  * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; if not, see <http://www.gnu.org/licenses/>.
  *
  * Author: Alexander Duyck <alexander.h.duyck@intel.com>
  */
@@ -29,15 +28,6 @@
 #include <net/tc_act/tc_skbedit.h>
 
 #define SKBEDIT_TAB_MASK     15
-static struct tcf_common *tcf_skbedit_ht[SKBEDIT_TAB_MASK + 1];
-static u32 skbedit_idx_gen;
-static DEFINE_RWLOCK(skbedit_lock);
-
-static struct tcf_hashinfo skbedit_hash_info = {
-	.htab	=	tcf_skbedit_ht,
-	.hmask	=	SKBEDIT_TAB_MASK,
-	.lock	=	&skbedit_lock,
-};
 
 static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
 		       struct tcf_result *res)
@@ -67,13 +57,13 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
 	[TCA_SKBEDIT_MARK]		= { .len = sizeof(u32) },
 };
 
-static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
-			 struct tc_action *a, int ovr, int bind)
+static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
+			    struct nlattr *est, struct tc_action *a,
+			    int ovr, int bind)
 {
 	struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
 	struct tc_skbedit *parm;
 	struct tcf_skbedit *d;
-	struct tcf_common *pc;
 	u32 flags = 0, *priority = NULL, *mark = NULL;
 	u16 *queue_mapping = NULL;
 	int ret = 0, err;
@@ -108,21 +98,20 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
 
 	parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
 
-	pc = tcf_hash_check(parm->index, a, bind, &skbedit_hash_info);
-	if (!pc) {
-		pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind,
-				     &skbedit_idx_gen, &skbedit_hash_info);
-		if (IS_ERR(pc))
-			return PTR_ERR(pc);
+	if (!tcf_hash_check(parm->index, a, bind)) {
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
+		if (ret)
+			return ret;
 
-		d = to_skbedit(pc);
+		d = to_skbedit(a);
 		ret = ACT_P_CREATED;
 	} else {
-		d = to_skbedit(pc);
-		if (!ovr) {
-			tcf_hash_release(pc, bind, &skbedit_hash_info);
+		d = to_skbedit(a);
+		if (bind)
+			return 0;
+		tcf_hash_release(a, bind);
+		if (!ovr)
 			return -EEXIST;
-		}
 	}
 
 	spin_lock_bh(&d->tcf_lock);
@@ -140,19 +129,10 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
 	spin_unlock_bh(&d->tcf_lock);
 
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(pc, &skbedit_hash_info);
+		tcf_hash_insert(a);
 	return ret;
 }
 
-static int tcf_skbedit_cleanup(struct tc_action *a, int bind)
-{
-	struct tcf_skbedit *d = a->priv;
-
-	if (d)
-		return tcf_hash_release(&d->common, bind, &skbedit_hash_info);
-	return 0;
-}
-
 static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 			    int bind, int ref)
 {
@@ -166,20 +146,25 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 	};
 	struct tcf_t t;
 
-	NLA_PUT(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt);
-	if (d->flags & SKBEDIT_F_PRIORITY)
-		NLA_PUT(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority),
-			&d->priority);
-	if (d->flags & SKBEDIT_F_QUEUE_MAPPING)
-		NLA_PUT(skb, TCA_SKBEDIT_QUEUE_MAPPING,
-			sizeof(d->queue_mapping), &d->queue_mapping);
-	if (d->flags & SKBEDIT_F_MARK)
-		NLA_PUT(skb, TCA_SKBEDIT_MARK, sizeof(d->mark),
-			&d->mark);
+	if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+	if ((d->flags & SKBEDIT_F_PRIORITY) &&
+	    nla_put(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority),
+		    &d->priority))
+		goto nla_put_failure;
+	if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) &&
+	    nla_put(skb, TCA_SKBEDIT_QUEUE_MAPPING,
+		    sizeof(d->queue_mapping), &d->queue_mapping))
+		goto nla_put_failure;
+	if ((d->flags & SKBEDIT_F_MARK) &&
+	    nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark),
+		    &d->mark))
+		goto nla_put_failure;
 	t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
-	NLA_PUT(skb, TCA_SKBEDIT_TM, sizeof(t), &t);
+	if (nla_put(skb, TCA_SKBEDIT_TM, sizeof(t), &t))
+		goto nla_put_failure;
 	return skb->len;
 
 nla_put_failure:
@@ -189,15 +174,11 @@ nla_put_failure:
 
 static struct tc_action_ops act_skbedit_ops = {
 	.kind		=	"skbedit",
-	.hinfo		=	&skbedit_hash_info,
 	.type		=	TCA_ACT_SKBEDIT,
-	.capab		=	TCA_CAP_NONE,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_skbedit,
 	.dump		=	tcf_skbedit_dump,
-	.cleanup	=	tcf_skbedit_cleanup,
 	.init		=	tcf_skbedit_init,
-	.walk		=	tcf_generic_walker,
 };
 
 MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>");
@@ -206,7 +187,7 @@ MODULE_LICENSE("GPL");
 
 static int __init skbedit_init_module(void)
 {
-	return tcf_register_action(&act_skbedit_ops);
+	return tcf_register_action(&act_skbedit_ops, SKBEDIT_TAB_MASK);
 }
 
 static void __exit skbedit_cleanup_module(void)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index a69d44f1dac..45527e6b52d 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -22,7 +22,6 @@
 #include <linux/skbuff.h>
 #include <linux/init.h>
 #include <linux/kmod.h>
-#include <linux/netlink.h>
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <net/net_namespace.h>
@@ -32,8 +31,7 @@
 #include <net/pkt_cls.h>
 
 /* The list of all installed classifier types */
-
-static struct tcf_proto_ops *tcf_proto_base __read_mostly;
+static LIST_HEAD(tcf_proto_base);
 
 /* Protects list of registered TC modules. It is pure SMP lock. */
 static DEFINE_RWLOCK(cls_mod_lock);
@@ -42,36 +40,35 @@ static DEFINE_RWLOCK(cls_mod_lock);
 
 static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind)
 {
-	const struct tcf_proto_ops *t = NULL;
+	const struct tcf_proto_ops *t, *res = NULL;
 
 	if (kind) {
 		read_lock(&cls_mod_lock);
-		for (t = tcf_proto_base; t; t = t->next) {
+		list_for_each_entry(t, &tcf_proto_base, head) {
 			if (nla_strcmp(kind, t->kind) == 0) {
-				if (!try_module_get(t->owner))
-					t = NULL;
+				if (try_module_get(t->owner))
+					res = t;
 				break;
 			}
 		}
 		read_unlock(&cls_mod_lock);
 	}
-	return t;
+	return res;
 }
 
 /* Register(unregister) new classifier type */
 
 int register_tcf_proto_ops(struct tcf_proto_ops *ops)
 {
-	struct tcf_proto_ops *t, **tp;
+	struct tcf_proto_ops *t;
 	int rc = -EEXIST;
 
 	write_lock(&cls_mod_lock);
-	for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next)
+	list_for_each_entry(t, &tcf_proto_base, head)
 		if (!strcmp(ops->kind, t->kind))
 			goto out;
 
-	ops->next = NULL;
-	*tp = ops;
+	list_add_tail(&ops->head, &tcf_proto_base);
 	rc = 0;
 out:
 	write_unlock(&cls_mod_lock);
@@ -81,19 +78,17 @@ EXPORT_SYMBOL(register_tcf_proto_ops);
 
 int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
 {
-	struct tcf_proto_ops *t, **tp;
+	struct tcf_proto_ops *t;
 	int rc = -ENOENT;
 
 	write_lock(&cls_mod_lock);
-	for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next)
-		if (t == ops)
+	list_for_each_entry(t, &tcf_proto_base, head) {
+		if (t == ops) {
+			list_del(&t->head);
+			rc = 0;
 			break;
-
-	if (!t)
-		goto out;
-	*tp = t->next;
-	rc = 0;
-out:
+		}
+	}
 	write_unlock(&cls_mod_lock);
 	return rc;
 }
@@ -118,7 +113,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
 
 /* Add/change/delete/get a filter node */
 
-static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
 {
 	struct net *net = sock_net(skb->sk);
 	struct nlattr *tca[TCA_MAX + 1];
@@ -139,8 +134,16 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 	int err;
 	int tp_created = 0;
 
+	if ((n->nlmsg_type != RTM_GETTFILTER) &&
+	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
 replay:
-	t = NLMSG_DATA(n);
+	err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
+	if (err < 0)
+		return err;
+
+	t = nlmsg_data(n);
 	protocol = TC_H_MIN(t->tcm_info);
 	prio = TC_H_MAJ(t->tcm_info);
 	nprio = prio;
@@ -162,10 +165,6 @@ replay:
 	if (dev == NULL)
 		return -ENODEV;
 
-	err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
-	if (err < 0)
-		return err;
-
 	/* Find qdisc */
 	if (!parent) {
 		q = dev->qdisc;
@@ -319,7 +318,8 @@ replay:
 		}
 	}
 
-	err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh);
+	err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
+			      n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE);
 	if (err == 0) {
 		if (tp_created) {
 			spin_lock_bh(root_lock);
@@ -342,32 +342,35 @@ errout:
 	return err;
 }
 
-static int tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp,
-			 unsigned long fh, u32 pid, u32 seq, u16 flags, int event)
+static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp,
+			 unsigned long fh, u32 portid, u32 seq, u16 flags, int event)
 {
 	struct tcmsg *tcm;
 	struct nlmsghdr  *nlh;
 	unsigned char *b = skb_tail_pointer(skb);
 
-	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
-	tcm = NLMSG_DATA(nlh);
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+	if (!nlh)
+		goto out_nlmsg_trim;
+	tcm = nlmsg_data(nlh);
 	tcm->tcm_family = AF_UNSPEC;
 	tcm->tcm__pad1 = 0;
 	tcm->tcm__pad2 = 0;
 	tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
 	tcm->tcm_parent = tp->classid;
 	tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
-	NLA_PUT_STRING(skb, TCA_KIND, tp->ops->kind);
+	if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
+		goto nla_put_failure;
 	tcm->tcm_handle = fh;
 	if (RTM_DELTFILTER != event) {
 		tcm->tcm_handle = 0;
-		if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0)
+		if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
 			goto nla_put_failure;
 	}
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 	return skb->len;
 
-nlmsg_failure:
+out_nlmsg_trim:
 nla_put_failure:
 	nlmsg_trim(skb, b);
 	return -1;
@@ -378,18 +381,18 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
 			  unsigned long fh, int event)
 {
 	struct sk_buff *skb;
-	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
+	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb)
 		return -ENOBUFS;
 
-	if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) {
+	if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, 0, event) <= 0) {
 		kfree_skb(skb);
 		return -EINVAL;
 	}
 
-	return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
+	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 			      n->nlmsg_flags & NLM_F_ECHO);
 }
 
@@ -403,8 +406,9 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
 			 struct tcf_walker *arg)
 {
 	struct tcf_dump_args *a = (void *)arg;
+	struct net *net = sock_net(a->skb->sk);
 
-	return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid,
+	return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid,
 			     a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER);
 }
 
@@ -417,12 +421,12 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 	struct net_device *dev;
 	struct Qdisc *q;
 	struct tcf_proto *tp, **chain;
-	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
+	struct tcmsg *tcm = nlmsg_data(cb->nlh);
 	unsigned long cl = 0;
 	const struct Qdisc_class_ops *cops;
 	struct tcf_dump_args arg;
 
-	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
+	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
 		return skb->len;
 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
 	if (!dev)
@@ -462,7 +466,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 		if (t > s_t)
 			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
 		if (cb->args[1] == 0) {
-			if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid,
+			if (tcf_fill_node(net, skb, tp, 0, NETLINK_CB(cb->skb).portid,
 					  cb->nlh->nlmsg_seq, NLM_F_MULTI,
 					  RTM_NEWTFILTER) <= 0)
 				break;
@@ -495,45 +499,41 @@ out:
 void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	if (exts->action) {
-		tcf_action_destroy(exts->action, TCA_ACT_UNBIND);
-		exts->action = NULL;
-	}
+	tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND);
+	INIT_LIST_HEAD(&exts->actions);
 #endif
 }
 EXPORT_SYMBOL(tcf_exts_destroy);
 
-int tcf_exts_validate(struct tcf_proto *tp, struct nlattr **tb,
-		  struct nlattr *rate_tlv, struct tcf_exts *exts,
-		  const struct tcf_ext_map *map)
+int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
+		  struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr)
 {
-	memset(exts, 0, sizeof(*exts));
-
 #ifdef CONFIG_NET_CLS_ACT
 	{
 		struct tc_action *act;
 
-		if (map->police && tb[map->police]) {
-			act = tcf_action_init_1(tb[map->police], rate_tlv,
-						"police", TCA_ACT_NOREPLACE,
+		INIT_LIST_HEAD(&exts->actions);
+		if (exts->police && tb[exts->police]) {
+			act = tcf_action_init_1(net, tb[exts->police], rate_tlv,
+						"police", ovr,
 						TCA_ACT_BIND);
 			if (IS_ERR(act))
 				return PTR_ERR(act);
 
-			act->type = TCA_OLD_COMPAT;
-			exts->action = act;
-		} else if (map->action && tb[map->action]) {
-			act = tcf_action_init(tb[map->action], rate_tlv, NULL,
-					      TCA_ACT_NOREPLACE, TCA_ACT_BIND);
-			if (IS_ERR(act))
-				return PTR_ERR(act);
-
-			exts->action = act;
+			act->type = exts->type = TCA_OLD_COMPAT;
+			list_add(&act->list, &exts->actions);
+		} else if (exts->action && tb[exts->action]) {
+			int err;
+			err = tcf_action_init(net, tb[exts->action], rate_tlv,
+					      NULL, ovr,
+					      TCA_ACT_BIND, &exts->actions);
+			if (err)
+				return err;
 		}
 	}
 #else
-	if ((map->action && tb[map->action]) ||
-	    (map->police && tb[map->police]))
+	if ((exts->action && tb[exts->action]) ||
+	    (exts->police && tb[exts->police]))
 		return -EOPNOTSUPP;
 #endif
 
@@ -545,43 +545,42 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
 		     struct tcf_exts *src)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	if (src->action) {
-		struct tc_action *act;
-		tcf_tree_lock(tp);
-		act = dst->action;
-		dst->action = src->action;
-		tcf_tree_unlock(tp);
-		if (act)
-			tcf_action_destroy(act, TCA_ACT_UNBIND);
-	}
+	LIST_HEAD(tmp);
+	tcf_tree_lock(tp);
+	list_splice_init(&dst->actions, &tmp);
+	list_splice(&src->actions, &dst->actions);
+	tcf_tree_unlock(tp);
+	tcf_action_destroy(&tmp, TCA_ACT_UNBIND);
 #endif
 }
 EXPORT_SYMBOL(tcf_exts_change);
 
-int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts,
-		  const struct tcf_ext_map *map)
+#define tcf_exts_first_act(ext)	/* (ext)->actions must be non-empty */ \
+		list_first_entry(&(ext)->actions, struct tc_action, list)
+
+int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	if (map->action && exts->action) {
+	if (exts->action && !list_empty(&exts->actions)) {
 		/*
 		 * again for backward compatible mode - we want
 		 * to work with both old and new modes of entering
 		 * tc data even if iproute2  was newer - jhs
 		 */
 		struct nlattr *nest;
-
-		if (exts->action->type != TCA_OLD_COMPAT) {
-			nest = nla_nest_start(skb, map->action);
+		if (exts->type != TCA_OLD_COMPAT) {
+			nest = nla_nest_start(skb, exts->action);
 			if (nest == NULL)
 				goto nla_put_failure;
-			if (tcf_action_dump(skb, exts->action, 0, 0) < 0)
+			if (tcf_action_dump(skb, &exts->actions, 0, 0) < 0)
 				goto nla_put_failure;
 			nla_nest_end(skb, nest);
-		} else if (map->police) {
-			nest = nla_nest_start(skb, map->police);
-			if (nest == NULL)
+		} else if (exts->police) {
+			struct tc_action *act = tcf_exts_first_act(exts);
+			nest = nla_nest_start(skb, exts->police);
+			if (nest == NULL || !act)
 				goto nla_put_failure;
-			if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0)
+			if (tcf_action_dump_old(skb, act, 0, 0) < 0)
 				goto nla_put_failure;
 			nla_nest_end(skb, nest);
 		}
@@ -594,17 +593,14 @@ nla_put_failure: __attribute__ ((unused))
 EXPORT_SYMBOL(tcf_exts_dump);
 
 
-int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts,
-			const struct tcf_ext_map *map)
+int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	if (exts->action)
-		if (tcf_action_copy_stats(skb, exts->action, 1) < 0)
-			goto nla_put_failure;
+	if (!list_empty(&exts->actions) &&	/* no action: no stats */
+	    tcf_action_copy_stats(skb, tcf_exts_first_act(exts), 1) < 0)
+		return -1;
 #endif
 	return 0;
-nla_put_failure: __attribute__ ((unused))
-	return -1;
 }
 EXPORT_SYMBOL(tcf_exts_dump_stats);
 
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index ea1f70b5a5f..0ae1813e3e9 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -34,16 +34,11 @@ struct basic_filter {
 	struct list_head	link;
 };
 
-static const struct tcf_ext_map basic_ext_map = {
-	.action = TCA_BASIC_ACT,
-	.police = TCA_BASIC_POLICE
-};
-
 static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			  struct tcf_result *res)
 {
 	int r;
-	struct basic_head *head = (struct basic_head *) tp->root;
+	struct basic_head *head = tp->root;
 	struct basic_filter *f;
 
 	list_for_each_entry(f, &head->flist, link) {
@@ -61,7 +56,7 @@ static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 static unsigned long basic_get(struct tcf_proto *tp, u32 handle)
 {
 	unsigned long l = 0UL;
-	struct basic_head *head = (struct basic_head *) tp->root;
+	struct basic_head *head = tp->root;
 	struct basic_filter *f;
 
 	if (head == NULL)
@@ -112,7 +107,7 @@ static void basic_destroy(struct tcf_proto *tp)
 
 static int basic_delete(struct tcf_proto *tp, unsigned long arg)
 {
-	struct basic_head *head = (struct basic_head *) tp->root;
+	struct basic_head *head = tp->root;
 	struct basic_filter *t, *f = (struct basic_filter *) arg;
 
 	list_for_each_entry(t, &head->flist, link)
@@ -132,15 +127,17 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = {
 	[TCA_BASIC_EMATCHES]	= { .type = NLA_NESTED },
 };
 
-static int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f,
-			   unsigned long base, struct nlattr **tb,
-			   struct nlattr *est)
+static int basic_set_parms(struct net *net, struct tcf_proto *tp,
+			   struct basic_filter *f, unsigned long base,
+			   struct nlattr **tb,
+			   struct nlattr *est, bool ovr)
 {
-	int err = -EINVAL;
+	int err;
 	struct tcf_exts e;
 	struct tcf_ematch_tree t;
 
-	err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map);
+	tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE);
+	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
 	if (err < 0)
 		return err;
 
@@ -162,11 +159,12 @@ errout:
 	return err;
 }
 
-static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,
-			struct nlattr **tca, unsigned long *arg)
+static int basic_change(struct net *net, struct sk_buff *in_skb,
+			struct tcf_proto *tp, unsigned long base, u32 handle,
+			struct nlattr **tca, unsigned long *arg, bool ovr)
 {
 	int err;
-	struct basic_head *head = (struct basic_head *) tp->root;
+	struct basic_head *head = tp->root;
 	struct nlattr *tb[TCA_BASIC_MAX + 1];
 	struct basic_filter *f = (struct basic_filter *) *arg;
 
@@ -181,7 +179,7 @@ static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,
 	if (f != NULL) {
 		if (handle && f->handle != handle)
 			return -EINVAL;
-		return basic_set_parms(tp, f, base, tb, tca[TCA_RATE]);
+		return basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr);
 	}
 
 	err = -ENOBUFS;
@@ -189,6 +187,7 @@ static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,
 	if (f == NULL)
 		goto errout;
 
+	tcf_exts_init(&f->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
 	err = -EINVAL;
 	if (handle)
 		f->handle = handle;
@@ -207,7 +206,7 @@ static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,
 		f->handle = head->hgenerator;
 	}
 
-	err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE]);
+	err = basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr);
 	if (err < 0)
 		goto errout;
 
@@ -226,7 +225,7 @@ errout:
 
 static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 {
-	struct basic_head *head = (struct basic_head *) tp->root;
+	struct basic_head *head = tp->root;
 	struct basic_filter *f;
 
 	list_for_each_entry(f, &head->flist, link) {
@@ -242,7 +241,7 @@ skip:
 	}
 }
 
-static int basic_dump(struct tcf_proto *tp, unsigned long fh,
+static int basic_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 		      struct sk_buff *skb, struct tcmsg *t)
 {
 	struct basic_filter *f = (struct basic_filter *) fh;
@@ -257,16 +256,17 @@ static int basic_dump(struct tcf_proto *tp, unsigned long fh,
 	if (nest == NULL)
 		goto nla_put_failure;
 
-	if (f->res.classid)
-		NLA_PUT_U32(skb, TCA_BASIC_CLASSID, f->res.classid);
+	if (f->res.classid &&
+	    nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid))
+		goto nla_put_failure;
 
-	if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 ||
+	if (tcf_exts_dump(skb, &f->exts) < 0 ||
 	    tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)
 		goto nla_put_failure;
 
 	nla_nest_end(skb, nest);
 
-	if (tcf_exts_dump_stats(skb, &f->exts, &basic_ext_map) < 0)
+	if (tcf_exts_dump_stats(skb, &f->exts) < 0)
 		goto nla_put_failure;
 
 	return skb->len;
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
new file mode 100644
index 00000000000..13f64df2c71
--- /dev/null
+++ b/net/sched/cls_bpf.c
@@ -0,0 +1,382 @@
+/*
+ * Berkeley Packet Filter based traffic classifier
+ *
+ * Might be used to classify traffic through flexible, user-defined and
+ * possibly JIT-ed BPF filters for traffic control as an alternative to
+ * ematches.
+ *
+ * (C) 2013 Daniel Borkmann <dborkman@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/filter.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
+MODULE_DESCRIPTION("TC BPF based classifier");
+
+struct cls_bpf_head {
+	struct list_head plist;
+	u32 hgen;
+};
+
+struct cls_bpf_prog {
+	struct sk_filter *filter;
+	struct sock_filter *bpf_ops;
+	struct tcf_exts exts;
+	struct tcf_result res;
+	struct list_head link;
+	u32 handle;
+	u16 bpf_len;
+};
+
+static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
+	[TCA_BPF_CLASSID]	= { .type = NLA_U32 },
+	[TCA_BPF_OPS_LEN]	= { .type = NLA_U16 },
+	[TCA_BPF_OPS]		= { .type = NLA_BINARY,
+				    .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
+};
+
+static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+			    struct tcf_result *res)
+{
+	struct cls_bpf_head *head = tp->root;
+	struct cls_bpf_prog *prog;
+	int ret;
+
+	list_for_each_entry(prog, &head->plist, link) {
+		int filter_res = SK_RUN_FILTER(prog->filter, skb);
+
+		if (filter_res == 0)
+			continue;
+
+		*res = prog->res;
+		if (filter_res != -1)
+			res->classid = filter_res;
+
+		ret = tcf_exts_exec(skb, &prog->exts, res);
+		if (ret < 0)
+			continue;
+
+		return ret;
+	}
+
+	return -1;
+}
+
+static int cls_bpf_init(struct tcf_proto *tp)
+{
+	struct cls_bpf_head *head;
+
+	head = kzalloc(sizeof(*head), GFP_KERNEL);
+	if (head == NULL)
+		return -ENOBUFS;
+
+	INIT_LIST_HEAD(&head->plist);
+	tp->root = head;
+
+	return 0;
+}
+
+static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
+{
+	tcf_unbind_filter(tp, &prog->res);
+	tcf_exts_destroy(tp, &prog->exts);
+
+	sk_unattached_filter_destroy(prog->filter);
+
+	kfree(prog->bpf_ops);
+	kfree(prog);
+}
+
+static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
+{
+	struct cls_bpf_head *head = tp->root;
+	struct cls_bpf_prog *prog, *todel = (struct cls_bpf_prog *) arg;
+
+	list_for_each_entry(prog, &head->plist, link) {
+		if (prog == todel) {
+			tcf_tree_lock(tp);
+			list_del(&prog->link);
+			tcf_tree_unlock(tp);
+
+			cls_bpf_delete_prog(tp, prog);
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+static void cls_bpf_destroy(struct tcf_proto *tp)
+{
+	struct cls_bpf_head *head = tp->root;
+	struct cls_bpf_prog *prog, *tmp;
+
+	list_for_each_entry_safe(prog, tmp, &head->plist, link) {
+		list_del(&prog->link);
+		cls_bpf_delete_prog(tp, prog);
+	}
+
+	kfree(head);
+}
+
+static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
+{
+	struct cls_bpf_head *head = tp->root;
+	struct cls_bpf_prog *prog;
+	unsigned long ret = 0UL;
+
+	if (head == NULL)
+		return 0UL;
+
+	list_for_each_entry(prog, &head->plist, link) {
+		if (prog->handle == handle) {
+			ret = (unsigned long) prog;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static void cls_bpf_put(struct tcf_proto *tp, unsigned long f)
+{
+}
+
+/* Parse tb[] into a new BPF program + exts and swap them into @prog. */
+static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
+				   struct cls_bpf_prog *prog,
+				   unsigned long base, struct nlattr **tb,
+				   struct nlattr *est, bool ovr)
+{
+	struct sock_filter *bpf_ops, *bpf_old;
+	struct tcf_exts exts;
+	struct sock_fprog_kern tmp;
+	struct sk_filter *fp, *fp_old;
+	u16 bpf_size, bpf_len;
+	u32 classid;
+	int ret;
+
+	if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID])
+		return -EINVAL;
+
+	tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE);
+	ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr);
+	if (ret < 0)
+		return ret;
+
+	classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
+	bpf_len = nla_get_u16(tb[TCA_BPF_OPS_LEN]);
+	bpf_size = bpf_len * sizeof(*bpf_ops);
+	ret = -EINVAL;	/* length must match the TCA_BPF_OPS payload */
+	if (bpf_len > BPF_MAXINSNS || bpf_len == 0 ||
+	    bpf_size != nla_len(tb[TCA_BPF_OPS]))
+		goto errout;
+
+	bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
+	if (bpf_ops == NULL) {
+		ret = -ENOMEM;
+		goto errout;
+	}
+
+	memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size);
+
+	tmp.len = bpf_len;
+	tmp.filter = bpf_ops;
+
+	ret = sk_unattached_filter_create(&fp, &tmp);
+	if (ret)
+		goto errout_free;
+
+	tcf_tree_lock(tp);
+	fp_old = prog->filter;
+	bpf_old = prog->bpf_ops;
+
+	prog->bpf_len = bpf_len;
+	prog->bpf_ops = bpf_ops;
+	prog->filter = fp;
+	prog->res.classid = classid;
+	tcf_tree_unlock(tp);
+
+	tcf_bind_filter(tp, &prog->res, base);
+	tcf_exts_change(tp, &prog->exts, &exts);
+
+	if (fp_old)
+		sk_unattached_filter_destroy(fp_old);
+	kfree(bpf_old);	/* kfree(NULL) is a no-op */
+
+	return 0;
+
+errout_free:
+	kfree(bpf_ops);
+errout:
+	tcf_exts_destroy(tp, &exts);
+	return ret;
+}
+
+/* Pick an unused handle from head->hgen; returns 0 when none is free. */
+static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp,
+				   struct cls_bpf_head *head)
+{
+	unsigned int i = 0x80000000;	/* probe budget, not a handle */
+
+	do {
+		if (++head->hgen == 0x7FFFFFFF)
+			head->hgen = 1;
+	} while (--i > 0 && cls_bpf_get(tp, head->hgen));
+	if (i == 0)
+		pr_err("Insufficient number of handles\n");
+	return i ? head->hgen : 0;	/* hand back the handle, not the budget */
+}
+
+static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
+			  struct tcf_proto *tp, unsigned long base,
+			  u32 handle, struct nlattr **tca,
+			  unsigned long *arg, bool ovr)
+{
+	struct cls_bpf_head *head = tp->root;
+	struct cls_bpf_prog *prog = (struct cls_bpf_prog *) *arg;
+	struct nlattr *tb[TCA_BPF_MAX + 1];
+	int ret;
+
+	if (tca[TCA_OPTIONS] == NULL)
+		return -EINVAL;
+
+	ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy);
+	if (ret < 0)
+		return ret;
+
+	if (prog != NULL) {
+		if (handle && prog->handle != handle)
+			return -EINVAL;
+		return cls_bpf_modify_existing(net, tp, prog, base, tb,
+					       tca[TCA_RATE], ovr);
+	}
+
+	prog = kzalloc(sizeof(*prog), GFP_KERNEL);
+	if (prog == NULL)
+		return -ENOBUFS;
+
+	tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE);
+	if (handle == 0)
+		prog->handle = cls_bpf_grab_new_handle(tp, head);
+	else
+		prog->handle = handle;
+	if (prog->handle == 0) {
+		ret = -EINVAL;
+		goto errout;
+	}
+
+	ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr);
+	if (ret < 0)
+		goto errout;
+
+	tcf_tree_lock(tp);
+	list_add(&prog->link, &head->plist);
+	tcf_tree_unlock(tp);
+
+	*arg = (unsigned long) prog;
+
+	return 0;
+errout:
+	if (*arg == 0UL && prog)
+		kfree(prog);
+
+	return ret;
+}
+
+static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
+			struct sk_buff *skb, struct tcmsg *tm)
+{
+	struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh;
+	struct nlattr *nest, *nla;
+
+	if (prog == NULL)
+		return skb->len;
+
+	tm->tcm_handle = prog->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid))
+		goto nla_put_failure;
+	if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_len))
+		goto nla_put_failure;
+
+	nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_len *
+			  sizeof(struct sock_filter));
+	if (nla == NULL)
+		goto nla_put_failure;
+
+	memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla));
+
+	if (tcf_exts_dump(skb, &prog->exts) < 0)
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+
+	if (tcf_exts_dump_stats(skb, &prog->exts) < 0)
+		goto nla_put_failure;
+
+	return skb->len;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+	struct cls_bpf_head *head = tp->root;
+	struct cls_bpf_prog *prog;
+
+	list_for_each_entry(prog, &head->plist, link) {
+		if (arg->count < arg->skip)
+			goto skip;
+		if (arg->fn(tp, (unsigned long) prog, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+skip:
+		arg->count++;
+	}
+}
+
+static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
+	.kind		=	"bpf",
+	.owner		=	THIS_MODULE,
+	.classify	=	cls_bpf_classify,
+	.init		=	cls_bpf_init,
+	.destroy	=	cls_bpf_destroy,
+	.get		=	cls_bpf_get,
+	.put		=	cls_bpf_put,
+	.change		=	cls_bpf_change,
+	.delete		=	cls_bpf_delete,
+	.walk		=	cls_bpf_walk,
+	.dump		=	cls_bpf_dump,
+};
+
+static int __init cls_bpf_init_mod(void)
+{
+	return register_tcf_proto_ops(&cls_bpf_ops);
+}
+
+static void __exit cls_bpf_exit_mod(void)
+{
+	unregister_tcf_proto_ops(&cls_bpf_ops);
+}
+
+module_init(cls_bpf_init_mod);
+module_exit(cls_bpf_exit_mod);
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index f84fdc3a7f2..cacf01bd04f 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -11,90 +11,13 @@
 
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/errno.h>
 #include <linux/skbuff.h>
-#include <linux/cgroup.h>
 #include <linux/rcupdate.h>
 #include <net/rtnetlink.h>
 #include <net/pkt_cls.h>
 #include <net/sock.h>
 #include <net/cls_cgroup.h>
 
-static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
-					       struct cgroup *cgrp);
-static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
-static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
-
-struct cgroup_subsys net_cls_subsys = {
-	.name		= "net_cls",
-	.create		= cgrp_create,
-	.destroy	= cgrp_destroy,
-	.populate	= cgrp_populate,
-#ifdef CONFIG_NET_CLS_CGROUP
-	.subsys_id	= net_cls_subsys_id,
-#endif
-	.module		= THIS_MODULE,
-};
-
-
-static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp)
-{
-	return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id),
-			    struct cgroup_cls_state, css);
-}
-
-static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p)
-{
-	return container_of(task_subsys_state(p, net_cls_subsys_id),
-			    struct cgroup_cls_state, css);
-}
-
-static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
-						 struct cgroup *cgrp)
-{
-	struct cgroup_cls_state *cs;
-
-	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
-	if (!cs)
-		return ERR_PTR(-ENOMEM);
-
-	if (cgrp->parent)
-		cs->classid = cgrp_cls_state(cgrp->parent)->classid;
-
-	return &cs->css;
-}
-
-static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
-	kfree(cgrp_cls_state(cgrp));
-}
-
-static u64 read_classid(struct cgroup *cgrp, struct cftype *cft)
-{
-	return cgrp_cls_state(cgrp)->classid;
-}
-
-static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value)
-{
-	cgrp_cls_state(cgrp)->classid = (u32) value;
-	return 0;
-}
-
-static struct cftype ss_files[] = {
-	{
-		.name = "classid",
-		.read_u64 = read_classid,
-		.write_u64 = write_classid,
-	},
-};
-
-static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
-	return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
-}
-
 struct cls_cgroup_head {
 	u32			handle;
 	struct tcf_exts		exts;
@@ -153,18 +76,14 @@ static int cls_cgroup_init(struct tcf_proto *tp)
 	return 0;
 }
 
-static const struct tcf_ext_map cgroup_ext_map = {
-	.action = TCA_CGROUP_ACT,
-	.police = TCA_CGROUP_POLICE,
-};
-
 static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = {
 	[TCA_CGROUP_EMATCHES]	= { .type = NLA_NESTED },
 };
 
-static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base,
+static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
+			     struct tcf_proto *tp, unsigned long base,
 			     u32 handle, struct nlattr **tca,
-			     unsigned long *arg)
+			     unsigned long *arg, bool ovr)
 {
 	struct nlattr *tb[TCA_CGROUP_MAX + 1];
 	struct cls_cgroup_head *head = tp->root;
@@ -183,6 +102,7 @@ static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base,
 		if (head == NULL)
 			return -ENOBUFS;
 
+		tcf_exts_init(&head->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
 		head->handle = handle;
 
 		tcf_tree_lock(tp);
@@ -198,7 +118,8 @@ static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base,
 	if (err < 0)
 		return err;
 
-	err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &cgroup_ext_map);
+	tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
+	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
 	if (err < 0)
 		return err;
 
@@ -243,7 +164,7 @@ skip:
 	arg->count++;
 }
 
-static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh,
+static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 			   struct sk_buff *skb, struct tcmsg *t)
 {
 	struct cls_cgroup_head *head = tp->root;
@@ -256,13 +177,13 @@ static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh,
 	if (nest == NULL)
 		goto nla_put_failure;
 
-	if (tcf_exts_dump(skb, &head->exts, &cgroup_ext_map) < 0 ||
+	if (tcf_exts_dump(skb, &head->exts) < 0 ||
 	    tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0)
 		goto nla_put_failure;
 
 	nla_nest_end(skb, nest);
 
-	if (tcf_exts_dump_stats(skb, &head->exts, &cgroup_ext_map) < 0)
+	if (tcf_exts_dump_stats(skb, &head->exts) < 0)
 		goto nla_put_failure;
 
 	return skb->len;
@@ -288,36 +209,12 @@ static struct tcf_proto_ops cls_cgroup_ops __read_mostly = {
 
 static int __init init_cgroup_cls(void)
 {
-	int ret;
-
-	ret = cgroup_load_subsys(&net_cls_subsys);
-	if (ret)
-		goto out;
-
-#ifndef CONFIG_NET_CLS_CGROUP
-	/* We can't use rcu_assign_pointer because this is an int. */
-	smp_wmb();
-	net_cls_subsys_id = net_cls_subsys.subsys_id;
-#endif
-
-	ret = register_tcf_proto_ops(&cls_cgroup_ops);
-	if (ret)
-		cgroup_unload_subsys(&net_cls_subsys);
-
-out:
-	return ret;
+	return register_tcf_proto_ops(&cls_cgroup_ops);
 }
 
 static void __exit exit_cgroup_cls(void)
 {
 	unregister_tcf_proto_ops(&cls_cgroup_ops);
-
-#ifndef CONFIG_NET_CLS_CGROUP
-	net_cls_subsys_id = -1;
-	synchronize_rcu();
-#endif
-
-	cgroup_unload_subsys(&net_cls_subsys);
 }
 
 module_init(init_cgroup_cls);
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 1d8bd0dbcd1..35be16f7c19 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -56,11 +56,6 @@ struct flow_filter {
 	u32			hashrnd;
 };
 
-static const struct tcf_ext_map flow_ext_map = {
-	.action	= TCA_FLOW_ACT,
-	.police	= TCA_FLOW_POLICE,
-};
-
 static inline u32 addr_fold(void *addr)
 {
 	unsigned long a = (unsigned long)addr;
@@ -193,15 +188,19 @@ static u32 flow_get_rtclassid(const struct sk_buff *skb)
 
 static u32 flow_get_skuid(const struct sk_buff *skb)
 {
-	if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file)
-		return skb->sk->sk_socket->file->f_cred->fsuid;
+	if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) {
+		kuid_t skuid = skb->sk->sk_socket->file->f_cred->fsuid;
+		return from_kuid(&init_user_ns, skuid);
+	}
 	return 0;
 }
 
 static u32 flow_get_skgid(const struct sk_buff *skb)
 {
-	if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file)
-		return skb->sk->sk_socket->file->f_cred->fsgid;
+	if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) {
+		kgid_t skgid = skb->sk->sk_socket->file->f_cred->fsgid;
+		return from_kgid(&init_user_ns, skgid);
+	}
 	return 0;
 }
 
@@ -216,7 +215,7 @@ static u32 flow_get_vlan_tag(const struct sk_buff *skb)
 
 static u32 flow_get_rxhash(struct sk_buff *skb)
 {
-	return skb_get_rxhash(skb);
+	return skb_get_hash(skb);
 }
 
 static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow)
@@ -347,9 +346,10 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {
 	[TCA_FLOW_PERTURB]	= { .type = NLA_U32 },
 };
 
-static int flow_change(struct tcf_proto *tp, unsigned long base,
+static int flow_change(struct net *net, struct sk_buff *in_skb,
+		       struct tcf_proto *tp, unsigned long base,
 		       u32 handle, struct nlattr **tca,
-		       unsigned long *arg)
+		       unsigned long *arg, bool ovr)
 {
 	struct flow_head *head = tp->root;
 	struct flow_filter *f;
@@ -386,9 +386,14 @@ static int flow_change(struct tcf_proto *tp, unsigned long base,
 
 		if (fls(keymask) - 1 > FLOW_KEY_MAX)
 			return -EOPNOTSUPP;
+
+		if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) &&
+		    sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns)
+			return -EOPNOTSUPP;
 	}
 
-	err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &flow_ext_map);
+	tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE);
+	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
 	if (err < 0)
 		return err;
 
@@ -446,6 +451,7 @@ static int flow_change(struct tcf_proto *tp, unsigned long base,
 
 		f->handle = handle;
 		f->mask	  = ~0U;
+		tcf_exts_init(&f->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
 
 		get_random_bytes(&f->hashrnd, 4);
 		f->perturb_timer.function = flow_perturbation;
@@ -557,7 +563,7 @@ static void flow_put(struct tcf_proto *tp, unsigned long f)
 {
 }
 
-static int flow_dump(struct tcf_proto *tp, unsigned long fh,
+static int flow_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 		     struct sk_buff *skb, struct tcmsg *t)
 {
 	struct flow_filter *f = (struct flow_filter *)fh;
@@ -572,27 +578,34 @@ static int flow_dump(struct tcf_proto *tp, unsigned long fh,
 	if (nest == NULL)
 		goto nla_put_failure;
 
-	NLA_PUT_U32(skb, TCA_FLOW_KEYS, f->keymask);
-	NLA_PUT_U32(skb, TCA_FLOW_MODE, f->mode);
+	if (nla_put_u32(skb, TCA_FLOW_KEYS, f->keymask) ||
+	    nla_put_u32(skb, TCA_FLOW_MODE, f->mode))
+		goto nla_put_failure;
 
 	if (f->mask != ~0 || f->xor != 0) {
-		NLA_PUT_U32(skb, TCA_FLOW_MASK, f->mask);
-		NLA_PUT_U32(skb, TCA_FLOW_XOR, f->xor);
+		if (nla_put_u32(skb, TCA_FLOW_MASK, f->mask) ||
+		    nla_put_u32(skb, TCA_FLOW_XOR, f->xor))
+			goto nla_put_failure;
 	}
-	if (f->rshift)
-		NLA_PUT_U32(skb, TCA_FLOW_RSHIFT, f->rshift);
-	if (f->addend)
-		NLA_PUT_U32(skb, TCA_FLOW_ADDEND, f->addend);
+	if (f->rshift &&
+	    nla_put_u32(skb, TCA_FLOW_RSHIFT, f->rshift))
+		goto nla_put_failure;
+	if (f->addend &&
+	    nla_put_u32(skb, TCA_FLOW_ADDEND, f->addend))
+		goto nla_put_failure;
 
-	if (f->divisor)
-		NLA_PUT_U32(skb, TCA_FLOW_DIVISOR, f->divisor);
-	if (f->baseclass)
-		NLA_PUT_U32(skb, TCA_FLOW_BASECLASS, f->baseclass);
+	if (f->divisor &&
+	    nla_put_u32(skb, TCA_FLOW_DIVISOR, f->divisor))
+		goto nla_put_failure;
+	if (f->baseclass &&
+	    nla_put_u32(skb, TCA_FLOW_BASECLASS, f->baseclass))
+		goto nla_put_failure;
 
-	if (f->perturb_period)
-		NLA_PUT_U32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ);
+	if (f->perturb_period &&
+	    nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ))
+		goto nla_put_failure;
 
-	if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0)
+	if (tcf_exts_dump(skb, &f->exts) < 0)
 		goto nla_put_failure;
 #ifdef CONFIG_NET_EMATCH
 	if (f->ematches.hdr.nmatches &&
@@ -601,7 +614,7 @@ static int flow_dump(struct tcf_proto *tp, unsigned long fh,
 #endif
 	nla_nest_end(skb, nest);
 
-	if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0)
+	if (tcf_exts_dump_stats(skb, &f->exts) < 0)
 		goto nla_put_failure;
 
 	return skb->len;
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 389af152ec4..861b03ccfed 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -29,11 +29,11 @@
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
 
-#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *))
+#define HTSIZE 256	/* number of buckets in fw_head.ht; fw_hash() reduces modulo this */
 
 struct fw_head {
-	struct fw_filter *ht[HTSIZE];
-	u32 mask;
+	u32			mask;	/* compared with TCA_FW_MASK on change; dumped unless 0xFFFFFFFF */
+	struct fw_filter	*ht[HTSIZE];	/* bucket heads; presumably indexed by fw_hash() - confirm */
 };
 
 struct fw_filter {
@@ -41,46 +41,22 @@ struct fw_filter {
 	u32			id;
 	struct tcf_result	res;
 #ifdef CONFIG_NET_CLS_IND
-	char			indev[IFNAMSIZ];
+	int			ifindex;
 #endif /* CONFIG_NET_CLS_IND */
 	struct tcf_exts		exts;
 };
 
-static const struct tcf_ext_map fw_ext_map = {
-	.action = TCA_FW_ACT,
-	.police = TCA_FW_POLICE
-};
-
-static inline int fw_hash(u32 handle)
+static u32 fw_hash(u32 handle)	/* map a 32-bit handle to a bucket index in [0, HTSIZE) */
 {
-	if (HTSIZE == 4096)
-		return ((handle >> 24) & 0xFFF) ^
-		       ((handle >> 12) & 0xFFF) ^
-		       (handle & 0xFFF);
-	else if (HTSIZE == 2048)
-		return ((handle >> 22) & 0x7FF) ^
-		       ((handle >> 11) & 0x7FF) ^
-		       (handle & 0x7FF);
-	else if (HTSIZE == 1024)
-		return ((handle >> 20) & 0x3FF) ^
-		       ((handle >> 10) & 0x3FF) ^
-		       (handle & 0x3FF);
-	else if (HTSIZE == 512)
-		return (handle >> 27) ^
-		       ((handle >> 18) & 0x1FF) ^
-		       ((handle >> 9) & 0x1FF) ^
-		       (handle & 0x1FF);
-	else if (HTSIZE == 256) {
-		u8 *t = (u8 *) &handle;
-		return t[0] ^ t[1] ^ t[2] ^ t[3];
-	} else
-		return handle & (HTSIZE - 1);
+	handle ^= (handle >> 16);	/* fold the upper 16 bits into the lower 16 */
+	handle ^= (handle >> 8);	/* low byte is now the XOR of all four input bytes */
+	return handle % HTSIZE;	/* HTSIZE is 256 in this file, so this keeps that low byte */
 }
 
 static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			  struct tcf_result *res)
 {
-	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_head *head = tp->root;
 	struct fw_filter *f;
 	int r;
 	u32 id = skb->mark;
@@ -91,7 +67,7 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			if (f->id == id) {
 				*res = f->res;
 #ifdef CONFIG_NET_CLS_IND
-				if (!tcf_match_indev(skb, f->indev))
+				if (!tcf_match_indev(skb, f->ifindex))
 					continue;
 #endif /* CONFIG_NET_CLS_IND */
 				r = tcf_exts_exec(skb, &f->exts, res);
@@ -116,7 +92,7 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 
 static unsigned long fw_get(struct tcf_proto *tp, u32 handle)
 {
-	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_head *head = tp->root;
 	struct fw_filter *f;
 
 	if (head == NULL)
@@ -165,7 +141,7 @@ static void fw_destroy(struct tcf_proto *tp)
 
 static int fw_delete(struct tcf_proto *tp, unsigned long arg)
 {
-	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_head *head = tp->root;
 	struct fw_filter *f = (struct fw_filter *)arg;
 	struct fw_filter **fp;
 
@@ -192,19 +168,19 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {
 };
 
 static int
-fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f,
-	struct nlattr **tb, struct nlattr **tca, unsigned long base)
+fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f,
+	struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr)
 {
-	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_head *head = tp->root;
 	struct tcf_exts e;
 	u32 mask;
 	int err;
 
-	err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &fw_ext_map);
+	tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE);
+	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
 	if (err < 0)
 		return err;
 
-	err = -EINVAL;
 	if (tb[TCA_FW_CLASSID]) {
 		f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]);
 		tcf_bind_filter(tp, &f->res, base);
@@ -212,12 +188,17 @@ fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f,
 
 #ifdef CONFIG_NET_CLS_IND
 	if (tb[TCA_FW_INDEV]) {
-		err = tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV]);
-		if (err < 0)
+		int ret;
+		ret = tcf_change_indev(net, tb[TCA_FW_INDEV]);
+		if (ret < 0) {
+			err = ret;
 			goto errout;
+		}
+		f->ifindex = ret;
 	}
 #endif /* CONFIG_NET_CLS_IND */
 
+	err = -EINVAL;
 	if (tb[TCA_FW_MASK]) {
 		mask = nla_get_u32(tb[TCA_FW_MASK]);
 		if (mask != head->mask)
@@ -233,12 +214,13 @@ errout:
 	return err;
 }
 
-static int fw_change(struct tcf_proto *tp, unsigned long base,
+static int fw_change(struct net *net, struct sk_buff *in_skb,
+		     struct tcf_proto *tp, unsigned long base,
 		     u32 handle,
 		     struct nlattr **tca,
-		     unsigned long *arg)
+		     unsigned long *arg, bool ovr)
 {
-	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_head *head = tp->root;
 	struct fw_filter *f = (struct fw_filter *) *arg;
 	struct nlattr *opt = tca[TCA_OPTIONS];
 	struct nlattr *tb[TCA_FW_MAX + 1];
@@ -254,7 +236,7 @@ static int fw_change(struct tcf_proto *tp, unsigned long base,
 	if (f != NULL) {
 		if (f->id != handle && handle)
 			return -EINVAL;
-		return fw_change_attrs(tp, f, tb, tca, base);
+		return fw_change_attrs(net, tp, f, tb, tca, base, ovr);
 	}
 
 	if (!handle)
@@ -279,9 +261,10 @@ static int fw_change(struct tcf_proto *tp, unsigned long base,
 	if (f == NULL)
 		return -ENOBUFS;
 
+	tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE);
 	f->id = handle;
 
-	err = fw_change_attrs(tp, f, tb, tca, base);
+	err = fw_change_attrs(net, tp, f, tb, tca, base, ovr);
 	if (err < 0)
 		goto errout;
 
@@ -300,7 +283,7 @@ errout:
 
 static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 {
-	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_head *head = tp->root;
 	int h;
 
 	if (head == NULL)
@@ -326,10 +309,10 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 	}
 }
 
-static int fw_dump(struct tcf_proto *tp, unsigned long fh,
+static int fw_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 		   struct sk_buff *skb, struct tcmsg *t)
 {
-	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_head *head = tp->root;
 	struct fw_filter *f = (struct fw_filter *)fh;
 	unsigned char *b = skb_tail_pointer(skb);
 	struct nlattr *nest;
@@ -346,21 +329,27 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh,
 	if (nest == NULL)
 		goto nla_put_failure;
 
-	if (f->res.classid)
-		NLA_PUT_U32(skb, TCA_FW_CLASSID, f->res.classid);
+	if (f->res.classid &&
+	    nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid))
+		goto nla_put_failure;
 #ifdef CONFIG_NET_CLS_IND
-	if (strlen(f->indev))
-		NLA_PUT_STRING(skb, TCA_FW_INDEV, f->indev);
+	if (f->ifindex) {
+		struct net_device *dev;
+		dev = __dev_get_by_index(net, f->ifindex);
+		if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name))
+			goto nla_put_failure;
+	}
 #endif /* CONFIG_NET_CLS_IND */
-	if (head->mask != 0xFFFFFFFF)
-		NLA_PUT_U32(skb, TCA_FW_MASK, head->mask);
+	if (head->mask != 0xFFFFFFFF &&
+	    nla_put_u32(skb, TCA_FW_MASK, head->mask))
+		goto nla_put_failure;
 
-	if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0)
+	if (tcf_exts_dump(skb, &f->exts) < 0)
 		goto nla_put_failure;
 
 	nla_nest_end(skb, nest);
 
-	if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0)
+	if (tcf_exts_dump_stats(skb, &f->exts) < 0)
 		goto nla_put_failure;
 
 	return skb->len;
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 13ab66e9df5..dd9fc2523c7 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -59,11 +59,6 @@ struct route4_filter {
 
 #define ROUTE4_FAILURE ((struct route4_filter *)(-1L))
 
-static const struct tcf_ext_map route_ext_map = {
-	.police = TCA_ROUTE4_POLICE,
-	.action = TCA_ROUTE4_ACT
-};
-
 static inline int route4_fastmap_hash(u32 id, int iif)
 {
 	return id & 0xF;
@@ -128,7 +123,7 @@ static inline int route4_hash_wild(void)
 static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			   struct tcf_result *res)
 {
-	struct route4_head *head = (struct route4_head *)tp->root;
+	struct route4_head *head = tp->root;
 	struct dst_entry *dst;
 	struct route4_bucket *b;
 	struct route4_filter *f;
@@ -143,7 +138,7 @@ static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	if (head == NULL)
 		goto old_method;
 
-	iif = ((struct rtable *)dst)->rt_iif;
+	iif = inet_iif(skb);
 
 	h = route4_fastmap_hash(id, iif);
 	if (id == head->fastmap[h].id &&
@@ -218,7 +213,7 @@ static inline u32 from_hash(u32 id)
 
 static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
 {
-	struct route4_head *head = (struct route4_head *)tp->root;
+	struct route4_head *head = tp->root;
 	struct route4_bucket *b;
 	struct route4_filter *f;
 	unsigned int h1, h2;
@@ -289,7 +284,7 @@ static void route4_destroy(struct tcf_proto *tp)
 
 static int route4_delete(struct tcf_proto *tp, unsigned long arg)
 {
-	struct route4_head *head = (struct route4_head *)tp->root;
+	struct route4_head *head = tp->root;
 	struct route4_filter **fp, *f = (struct route4_filter *)arg;
 	unsigned int h = 0;
 	struct route4_bucket *b;
@@ -335,9 +330,11 @@ static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = {
 	[TCA_ROUTE4_IIF]	= { .type = NLA_U32 },
 };
 
-static int route4_set_parms(struct tcf_proto *tp, unsigned long base,
-	struct route4_filter *f, u32 handle, struct route4_head *head,
-	struct nlattr **tb, struct nlattr *est, int new)
+static int route4_set_parms(struct net *net, struct tcf_proto *tp,
+			    unsigned long base, struct route4_filter *f,
+			    u32 handle, struct route4_head *head,
+			    struct nlattr **tb, struct nlattr *est, int new,
+			    bool ovr)
 {
 	int err;
 	u32 id = 0, to = 0, nhandle = 0x8000;
@@ -346,7 +343,8 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base,
 	struct route4_bucket *b;
 	struct tcf_exts e;
 
-	err = tcf_exts_validate(tp, tb, est, &e, &route_ext_map);
+	tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
+	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
 	if (err < 0)
 		return err;
 
@@ -427,10 +425,11 @@ errout:
 	return err;
 }
 
-static int route4_change(struct tcf_proto *tp, unsigned long base,
+static int route4_change(struct net *net, struct sk_buff *in_skb,
+		       struct tcf_proto *tp, unsigned long base,
 		       u32 handle,
 		       struct nlattr **tca,
-		       unsigned long *arg)
+		       unsigned long *arg, bool ovr)
 {
 	struct route4_head *head = tp->root;
 	struct route4_filter *f, *f1, **fp;
@@ -456,8 +455,8 @@ static int route4_change(struct tcf_proto *tp, unsigned long base,
 		if (f->bkt)
 			old_handle = f->handle;
 
-		err = route4_set_parms(tp, base, f, handle, head, tb,
-			tca[TCA_RATE], 0);
+		err = route4_set_parms(net, tp, base, f, handle, head, tb,
+			tca[TCA_RATE], 0, ovr);
 		if (err < 0)
 			return err;
 
@@ -479,8 +478,9 @@ static int route4_change(struct tcf_proto *tp, unsigned long base,
 	if (f == NULL)
 		goto errout;
 
-	err = route4_set_parms(tp, base, f, handle, head, tb,
-		tca[TCA_RATE], 1);
+	tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
+	err = route4_set_parms(net, tp, base, f, handle, head, tb,
+		tca[TCA_RATE], 1, ovr);
 	if (err < 0)
 		goto errout;
 
@@ -552,7 +552,7 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 	}
 }
 
-static int route4_dump(struct tcf_proto *tp, unsigned long fh,
+static int route4_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 		       struct sk_buff *skb, struct tcmsg *t)
 {
 	struct route4_filter *f = (struct route4_filter *)fh;
@@ -571,24 +571,28 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh,
 
 	if (!(f->handle & 0x8000)) {
 		id = f->id & 0xFF;
-		NLA_PUT_U32(skb, TCA_ROUTE4_TO, id);
+		if (nla_put_u32(skb, TCA_ROUTE4_TO, id))
+			goto nla_put_failure;
 	}
 	if (f->handle & 0x80000000) {
-		if ((f->handle >> 16) != 0xFFFF)
-			NLA_PUT_U32(skb, TCA_ROUTE4_IIF, f->iif);
+		if ((f->handle >> 16) != 0xFFFF &&
+		    nla_put_u32(skb, TCA_ROUTE4_IIF, f->iif))
+			goto nla_put_failure;
 	} else {
 		id = f->id >> 16;
-		NLA_PUT_U32(skb, TCA_ROUTE4_FROM, id);
+		if (nla_put_u32(skb, TCA_ROUTE4_FROM, id))
+			goto nla_put_failure;
 	}
-	if (f->res.classid)
-		NLA_PUT_U32(skb, TCA_ROUTE4_CLASSID, f->res.classid);
+	if (f->res.classid &&
+	    nla_put_u32(skb, TCA_ROUTE4_CLASSID, f->res.classid))
+		goto nla_put_failure;
 
-	if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0)
+	if (tcf_exts_dump(skb, &f->exts) < 0)
 		goto nla_put_failure;
 
 	nla_nest_end(skb, nest);
 
-	if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0)
+	if (tcf_exts_dump_stats(skb, &f->exts) < 0)
 		goto nla_put_failure;
 
 	return skb->len;
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index b01427924f8..1020e233a5d 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -116,11 +116,6 @@ static inline unsigned int hash_src(__be32 *src)
 	return h & 0xF;
 }
 
-static struct tcf_ext_map rsvp_ext_map = {
-	.police = TCA_RSVP_POLICE,
-	.action = TCA_RSVP_ACT
-};
-
 #define RSVP_APPLY_RESULT()				\
 {							\
 	int r = tcf_exts_exec(skb, &f->exts, res);	\
@@ -416,10 +411,11 @@ static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = {
 	[TCA_RSVP_PINFO]	= { .len = sizeof(struct tc_rsvp_pinfo) },
 };
 
-static int rsvp_change(struct tcf_proto *tp, unsigned long base,
+static int rsvp_change(struct net *net, struct sk_buff *in_skb,
+		       struct tcf_proto *tp, unsigned long base,
 		       u32 handle,
 		       struct nlattr **tca,
-		       unsigned long *arg)
+		       unsigned long *arg, bool ovr)
 {
 	struct rsvp_head *data = tp->root;
 	struct rsvp_filter *f, **fp;
@@ -439,7 +435,8 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,
 	if (err < 0)
 		return err;
 
-	err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &rsvp_ext_map);
+	tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
 	if (err < 0)
 		return err;
 
@@ -470,6 +467,7 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,
 	if (f == NULL)
 		goto errout2;
 
+	tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
 	h2 = 16;
 	if (tb[TCA_RSVP_SRC]) {
 		memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src));
@@ -596,7 +594,7 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 	}
 }
 
-static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
+static int rsvp_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 		     struct sk_buff *skb, struct tcmsg *t)
 {
 	struct rsvp_filter *f = (struct rsvp_filter *)fh;
@@ -615,25 +613,29 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
 	if (nest == NULL)
 		goto nla_put_failure;
 
-	NLA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst);
+	if (nla_put(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst))
+		goto nla_put_failure;
 	pinfo.dpi = s->dpi;
 	pinfo.spi = f->spi;
 	pinfo.protocol = s->protocol;
 	pinfo.tunnelid = s->tunnelid;
 	pinfo.tunnelhdr = f->tunnelhdr;
 	pinfo.pad = 0;
-	NLA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);
-	if (f->res.classid)
-		NLA_PUT_U32(skb, TCA_RSVP_CLASSID, f->res.classid);
-	if (((f->handle >> 8) & 0xFF) != 16)
-		NLA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src);
+	if (nla_put(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo))
+		goto nla_put_failure;
+	if (f->res.classid &&
+	    nla_put_u32(skb, TCA_RSVP_CLASSID, f->res.classid))
+		goto nla_put_failure;
+	if (((f->handle >> 8) & 0xFF) != 16 &&
+	    nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src))
+		goto nla_put_failure;
 
-	if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0)
+	if (tcf_exts_dump(skb, &f->exts) < 0)
 		goto nla_put_failure;
 
 	nla_nest_end(skb, nest);
 
-	if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0)
+	if (tcf_exts_dump_stats(skb, &f->exts) < 0)
 		goto nla_put_failure;
 	return skb->len;
 
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index dbe199234c6..c721cd4a469 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -24,9 +24,6 @@
 #define DEFAULT_HASH_SIZE	64	/* optimized for diffserv */
 
 
-#define	PRIV(tp)	((struct tcindex_data *) (tp)->root)
-
-
 struct tcindex_filter_result {
 	struct tcf_exts		exts;
 	struct tcf_result	res;
@@ -50,11 +47,6 @@ struct tcindex_data {
 	int fall_through;	/* 0: only classify if explicit match */
 };
 
-static const struct tcf_ext_map tcindex_ext_map = {
-	.police = TCA_TCINDEX_POLICE,
-	.action = TCA_TCINDEX_ACT
-};
-
 static inline int
 tcindex_filter_is_set(struct tcindex_filter_result *r)
 {
@@ -82,7 +74,7 @@ tcindex_lookup(struct tcindex_data *p, u16 key)
 static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			    struct tcf_result *res)
 {
-	struct tcindex_data *p = PRIV(tp);
+	struct tcindex_data *p = tp->root;
 	struct tcindex_filter_result *f;
 	int key = (skb->tc_index & p->mask) >> p->shift;
 
@@ -107,7 +99,7 @@ static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 
 static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle)
 {
-	struct tcindex_data *p = PRIV(tp);
+	struct tcindex_data *p = tp->root;
 	struct tcindex_filter_result *r;
 
 	pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle);
@@ -145,7 +137,7 @@ static int tcindex_init(struct tcf_proto *tp)
 static int
 __tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock)
 {
-	struct tcindex_data *p = PRIV(tp);
+	struct tcindex_data *p = tp->root;
 	struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg;
 	struct tcindex_filter *f = NULL;
 
@@ -196,10 +188,17 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
 	[TCA_TCINDEX_CLASSID]		= { .type = NLA_U32 },
 };
 
+static void tcindex_filter_result_init(struct tcindex_filter_result *r)	/* reset *r to an empty result */
+{
+	memset(r, 0, sizeof(*r));	/* zero the whole structure first, res included */
+	tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);	/* same attr types the removed tcindex_ext_map held */
+}
+
 static int
-tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,
-		  struct tcindex_data *p, struct tcindex_filter_result *r,
-		  struct nlattr **tb, struct nlattr *est)
+tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
+		  u32 handle, struct tcindex_data *p,
+		  struct tcindex_filter_result *r, struct nlattr **tb,
+		  struct nlattr *est, bool ovr)
 {
 	int err, balloc = 0;
 	struct tcindex_filter_result new_filter_result, *old_r = r;
@@ -208,17 +207,17 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,
 	struct tcindex_filter *f = NULL; /* make gcc behave */
 	struct tcf_exts e;
 
-	err = tcf_exts_validate(tp, tb, est, &e, &tcindex_ext_map);
+	tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
+	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
 	if (err < 0)
 		return err;
 
 	memcpy(&cp, p, sizeof(cp));
-	memset(&new_filter_result, 0, sizeof(new_filter_result));
+	tcindex_filter_result_init(&new_filter_result);
 
+	tcindex_filter_result_init(&cr);
 	if (old_r)
-		memcpy(&cr, r, sizeof(cr));
-	else
-		memset(&cr, 0, sizeof(cr));
+		cr.res = r->res;
 
 	if (tb[TCA_TCINDEX_HASH])
 		cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
@@ -270,9 +269,14 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,
 	err = -ENOMEM;
 	if (!cp.perfect && !cp.h) {
 		if (valid_perfect_hash(&cp)) {
+			int i;
+
 			cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL);
 			if (!cp.perfect)
 				goto errout;
+			for (i = 0; i < cp.hash; i++)
+				tcf_exts_init(&cp.perfect[i].exts, TCA_TCINDEX_ACT,
+					      TCA_TCINDEX_POLICE);
 			balloc = 1;
 		} else {
 			cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL);
@@ -298,14 +302,17 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,
 		tcf_bind_filter(tp, &cr.res, base);
 	}
 
-	tcf_exts_change(tp, &cr.exts, &e);
+	if (old_r)
+		tcf_exts_change(tp, &r->exts, &e);
+	else
+		tcf_exts_change(tp, &cr.exts, &e);
 
 	tcf_tree_lock(tp);
 	if (old_r && old_r != r)
-		memset(old_r, 0, sizeof(*old_r));
+		tcindex_filter_result_init(old_r);
 
 	memcpy(p, &cp, sizeof(cp));
-	memcpy(r, &cr, sizeof(cr));
+	r->res = cr.res;
 
 	if (r == &new_filter_result) {
 		struct tcindex_filter **fp;
@@ -332,12 +339,13 @@ errout:
 }
 
 static int
-tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle,
-	       struct nlattr **tca, unsigned long *arg)
+tcindex_change(struct net *net, struct sk_buff *in_skb,
+	       struct tcf_proto *tp, unsigned long base, u32 handle,
+	       struct nlattr **tca, unsigned long *arg, bool ovr)
 {
 	struct nlattr *opt = tca[TCA_OPTIONS];
 	struct nlattr *tb[TCA_TCINDEX_MAX + 1];
-	struct tcindex_data *p = PRIV(tp);
+	struct tcindex_data *p = tp->root;
 	struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg;
 	int err;
 
@@ -352,13 +360,14 @@ tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle,
 	if (err < 0)
 		return err;
 
-	return tcindex_set_parms(tp, base, handle, p, r, tb, tca[TCA_RATE]);
+	return tcindex_set_parms(net, tp, base, handle, p, r, tb,
+				 tca[TCA_RATE], ovr);
 }
 
 
 static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
 {
-	struct tcindex_data *p = PRIV(tp);
+	struct tcindex_data *p = tp->root;
 	struct tcindex_filter *f, *next;
 	int i;
 
@@ -405,7 +414,7 @@ static int tcindex_destroy_element(struct tcf_proto *tp,
 
 static void tcindex_destroy(struct tcf_proto *tp)
 {
-	struct tcindex_data *p = PRIV(tp);
+	struct tcindex_data *p = tp->root;
 	struct tcf_walker walker;
 
 	pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p);
@@ -420,10 +429,10 @@ static void tcindex_destroy(struct tcf_proto *tp)
 }
 
 
-static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,
+static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
     struct sk_buff *skb, struct tcmsg *t)
 {
-	struct tcindex_data *p = PRIV(tp);
+	struct tcindex_data *p = tp->root;
 	struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh;
 	unsigned char *b = skb_tail_pointer(skb);
 	struct nlattr *nest;
@@ -438,10 +447,11 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,
 
 	if (!fh) {
 		t->tcm_handle = ~0; /* whatever ... */
-		NLA_PUT_U32(skb, TCA_TCINDEX_HASH, p->hash);
-		NLA_PUT_U16(skb, TCA_TCINDEX_MASK, p->mask);
-		NLA_PUT_U32(skb, TCA_TCINDEX_SHIFT, p->shift);
-		NLA_PUT_U32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through);
+		if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) ||
+		    nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) ||
+		    nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) ||
+		    nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through))
+			goto nla_put_failure;
 		nla_nest_end(skb, nest);
 	} else {
 		if (p->perfect) {
@@ -460,14 +470,15 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,
 			}
 		}
 		pr_debug("handle = %d\n", t->tcm_handle);
-		if (r->res.class)
-			NLA_PUT_U32(skb, TCA_TCINDEX_CLASSID, r->res.classid);
+		if (r->res.class &&
+		    nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid))
+			goto nla_put_failure;
 
-		if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0)
+		if (tcf_exts_dump(skb, &r->exts) < 0)
 			goto nla_put_failure;
 		nla_nest_end(skb, nest);
 
-		if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0)
+		if (tcf_exts_dump_stats(skb, &r->exts) < 0)
 			goto nla_put_failure;
 	}
 
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 939b627b479..70c0be8d012 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -38,6 +38,7 @@
 #include <linux/errno.h>
 #include <linux/rtnetlink.h>
 #include <linux/skbuff.h>
+#include <linux/bitmap.h>
 #include <net/netlink.h>
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
@@ -48,7 +49,7 @@ struct tc_u_knode {
 	struct tc_u_hnode	*ht_up;
 	struct tcf_exts		exts;
 #ifdef CONFIG_NET_CLS_IND
-	char                     indev[IFNAMSIZ];
+	int			ifindex;
 #endif
 	u8			fshift;
 	struct tcf_result	res;
@@ -79,11 +80,6 @@ struct tc_u_common {
 	u32			hgenerator;
 };
 
-static const struct tcf_ext_map u32_ext_map = {
-	.action = TCA_U32_ACT,
-	.police = TCA_U32_POLICE
-};
-
 static inline unsigned int u32_hash_fold(__be32 key,
 					 const struct tc_u32_sel *sel,
 					 u8 fshift)
@@ -100,7 +96,7 @@ static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct
 		unsigned int	  off;
 	} stack[TC_U32_MAXDEPTH];
 
-	struct tc_u_hnode *ht = (struct tc_u_hnode *)tp->root;
+	struct tc_u_hnode *ht = tp->root;
 	unsigned int off = skb_network_offset(skb);
 	struct tc_u_knode *n;
 	int sdepth = 0;
@@ -157,7 +153,7 @@ check_terminal:
 
 				*res = n->res;
 #ifdef CONFIG_NET_CLS_IND
-				if (!tcf_match_indev(skb, n->indev)) {
+				if (!tcf_match_indev(skb, n->ifindex)) {
 					n = n->next;
 					goto next_knode;
 				}
@@ -234,8 +230,7 @@ out:
 	return -1;
 
 deadloop:
-	if (net_ratelimit())
-		pr_warning("cls_u32: dead loop\n");
+	net_warn_ratelimited("cls_u32: dead loop\n");
 	return -1;
 }
 
@@ -353,7 +348,7 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n)
 	return 0;
 }
 
-static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key)
+static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
 {
 	struct tc_u_knode **kp;
 	struct tc_u_hnode *ht = key->ht_up;
@@ -466,17 +461,25 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg)
 	return 0;
 }
 
+#define NR_U32_NODE (1<<12)	/* size in bits of the node-id bitmap built by gen_new_kid() */
 static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
 {
 	struct tc_u_knode *n;
-	unsigned int i = 0x7FF;
+	unsigned long i;	/* candidate node id */
+	unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long),
+					GFP_KERNEL);	/* one bit per possible node id, all clear initially */
+	if (!bitmap)
+		return handle | 0xFFF;	/* allocation failed: fall back to node id 0xFFF */
 
 	for (n = ht->ht[TC_U32_HASH(handle)]; n; n = n->next)
-		if (i < TC_U32_NODE(n->handle))
-			i = TC_U32_NODE(n->handle);
-	i++;
+		set_bit(TC_U32_NODE(n->handle), bitmap);	/* mark ids already used in this bucket's chain */
+
+	i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800);	/* first look for a free id at or above 0x800 */
+	if (i >= NR_U32_NODE)
+		i = find_next_zero_bit(bitmap, NR_U32_NODE, 1);	/* none there: retry from 1, so id 0 is never picked */
 
-	return handle | (i > 0xFFF ? 0xFFF : i);
+	kfree(bitmap);
+	return handle | (i >= NR_U32_NODE ? 0xFFF : i);	/* no free id at all: use 0xFFF */
 }
 
 static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
@@ -489,15 +492,16 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
 	[TCA_U32_MARK]		= { .len = sizeof(struct tc_u32_mark) },
 };
 
-static int u32_set_parms(struct tcf_proto *tp, unsigned long base,
-			 struct tc_u_hnode *ht,
+static int u32_set_parms(struct net *net, struct tcf_proto *tp,
+			 unsigned long base, struct tc_u_hnode *ht,
 			 struct tc_u_knode *n, struct nlattr **tb,
-			 struct nlattr *est)
+			 struct nlattr *est, bool ovr)
 {
 	int err;
 	struct tcf_exts e;
 
-	err = tcf_exts_validate(tp, tb, est, &e, &u32_ext_map);
+	tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE);
+	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
 	if (err < 0)
 		return err;
 
@@ -532,9 +536,11 @@ static int u32_set_parms(struct tcf_proto *tp, unsigned long base,
 
 #ifdef CONFIG_NET_CLS_IND
 	if (tb[TCA_U32_INDEV]) {
-		err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV]);
-		if (err < 0)
+		int ret;
+		ret = tcf_change_indev(net, tb[TCA_U32_INDEV]);
+		if (ret < 0)
 			goto errout;
+		n->ifindex = ret;
 	}
 #endif
 	tcf_exts_change(tp, &n->exts, &e);
@@ -545,9 +551,10 @@ errout:
 	return err;
 }
 
-static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
+static int u32_change(struct net *net, struct sk_buff *in_skb,
+		      struct tcf_proto *tp, unsigned long base, u32 handle,
 		      struct nlattr **tca,
-		      unsigned long *arg)
+		      unsigned long *arg, bool ovr)
 {
 	struct tc_u_common *tp_c = tp->data;
 	struct tc_u_hnode *ht;
@@ -570,7 +577,8 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
 		if (TC_U32_KEY(n->handle) == 0)
 			return -EINVAL;
 
-		return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE]);
+		return u32_set_parms(net, tp, base, n->ht_up, n, tb,
+				     tca[TCA_RATE], ovr);
 	}
 
 	if (tb[TCA_U32_DIVISOR]) {
@@ -645,6 +653,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
 	n->ht_up = ht;
 	n->handle = handle;
 	n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
+	tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
 
 #ifdef CONFIG_CLS_U32_MARK
 	if (tb[TCA_U32_MARK]) {
@@ -656,7 +665,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
 	}
 #endif
 
-	err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE]);
+	err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr);
 	if (err == 0) {
 		struct tc_u_knode **ins;
 		for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next)
@@ -714,7 +723,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 	}
 }
 
-static int u32_dump(struct tcf_proto *tp, unsigned long fh,
+static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 		     struct sk_buff *skb, struct tcmsg *t)
 {
 	struct tc_u_knode *n = (struct tc_u_knode *)fh;
@@ -733,43 +742,54 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh,
 		struct tc_u_hnode *ht = (struct tc_u_hnode *)fh;
 		u32 divisor = ht->divisor + 1;
 
-		NLA_PUT_U32(skb, TCA_U32_DIVISOR, divisor);
+		if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor))
+			goto nla_put_failure;
 	} else {
-		NLA_PUT(skb, TCA_U32_SEL,
-			sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
-			&n->sel);
+		if (nla_put(skb, TCA_U32_SEL,
+			    sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
+			    &n->sel))
+			goto nla_put_failure;
 		if (n->ht_up) {
 			u32 htid = n->handle & 0xFFFFF000;
-			NLA_PUT_U32(skb, TCA_U32_HASH, htid);
+			if (nla_put_u32(skb, TCA_U32_HASH, htid))
+				goto nla_put_failure;
 		}
-		if (n->res.classid)
-			NLA_PUT_U32(skb, TCA_U32_CLASSID, n->res.classid);
-		if (n->ht_down)
-			NLA_PUT_U32(skb, TCA_U32_LINK, n->ht_down->handle);
+		if (n->res.classid &&
+		    nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid))
+			goto nla_put_failure;
+		if (n->ht_down &&
+		    nla_put_u32(skb, TCA_U32_LINK, n->ht_down->handle))
+			goto nla_put_failure;
 
 #ifdef CONFIG_CLS_U32_MARK
-		if (n->mark.val || n->mark.mask)
-			NLA_PUT(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark);
+		if ((n->mark.val || n->mark.mask) &&
+		    nla_put(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark))
+			goto nla_put_failure;
 #endif
 
-		if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0)
+		if (tcf_exts_dump(skb, &n->exts) < 0)
 			goto nla_put_failure;
 
 #ifdef CONFIG_NET_CLS_IND
-		if (strlen(n->indev))
-			NLA_PUT_STRING(skb, TCA_U32_INDEV, n->indev);
+		if (n->ifindex) {
+			struct net_device *dev;
+			dev = __dev_get_by_index(net, n->ifindex);
+			if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name))
+				goto nla_put_failure;
+		}
 #endif
 #ifdef CONFIG_CLS_U32_PERF
-		NLA_PUT(skb, TCA_U32_PCNT,
-		sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64),
-			n->pf);
+		if (nla_put(skb, TCA_U32_PCNT,
+			    sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64),
+			    n->pf))
+			goto nla_put_failure;
 #endif
 	}
 
 	nla_nest_end(skb, nest);
 
 	if (TC_U32_KEY(n->handle))
-		if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0)
+		if (tcf_exts_dump_stats(skb, &n->exts) < 0)
 			goto nla_put_failure;
 	return skb->len;
 
diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c
new file mode 100644
index 00000000000..bfd34e4c1af
--- /dev/null
+++ b/net/sched/em_canid.c
@@ -0,0 +1,240 @@
+/*
+ * em_canid.c  Ematch rule to match CAN frames according to their CAN IDs
+ *
+ *              This program is free software; you can distribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Idea:       Oliver Hartkopp <oliver.hartkopp@volkswagen.de>
+ * Copyright:  (c) 2011 Czech Technical University in Prague
+ *             (c) 2011 Volkswagen Group Research
+ * Authors:    Michal Sojka <sojkam1@fel.cvut.cz>
+ *             Pavel Pisa <pisa@cmp.felk.cvut.cz>
+ *             Rostislav Lisovy <lisovy@gmail.cz>
+ * Funded by:  Volkswagen Group Research
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
+#include <linux/can.h>
+
+#define EM_CAN_RULES_MAX 500
+
+struct canid_match {
+	/* For each SFF CAN ID (11 bit) there is one bit in this bitmap */
+	DECLARE_BITMAP(match_sff, (1 << CAN_SFF_ID_BITS));
+
+	int rules_count;	/* total number of rules in rules_raw[] */
+	int sff_rules_count;	/* SFF rules, stored after the EFF rules */
+	int eff_rules_count;	/* EFF rules, stored first in rules_raw[] */
+
+	/*
+	 * Raw rules copied from netlink message; Used for sending
+	 * information to userspace (when 'tc filter show' is invoked)
+	 * AND when matching EFF frames
+	 */
+	struct can_filter rules_raw[];
+};
+
+/**
+ * em_canid_get_id() - Read the raw can_id field of the CAN frame in skb->data.
+ */
+static canid_t em_canid_get_id(struct sk_buff *skb)
+{
+	/* CAN ID is stored within the data field */
+	struct can_frame *cf = (struct can_frame *)skb->data;
+
+	return cf->can_id;	/* unmasked, flag bits included */
+}
+
+static void em_canid_sff_match_add(struct canid_match *cm, u32 can_id,
+					u32 can_mask)
+{
+	int i;
+
+	/*
+	 * Limit can_mask and can_id to SFF range to
+	 * protect against writes past the end of the bitmap
+	 */
+	can_mask &= CAN_SFF_MASK;
+	can_id &= can_mask;
+
+	/* Full SFF mask: exactly one ID matches */
+	if (can_mask == CAN_SFF_MASK) {
+		set_bit(can_id, cm->match_sff);
+		return;
+	}
+
+	/* Empty mask: every SFF ID matches */
+	if (can_mask == 0) {
+		bitmap_fill(cm->match_sff, (1 << CAN_SFF_ID_BITS));
+		return;
+	}
+
+	/*
+	 * Partial mask.
+	 * Add record (set bit to 1) for each ID that
+	 * conforms to this particular rule
+	 */
+	for (i = 0; i < (1 << CAN_SFF_ID_BITS); i++) {
+		if ((i & can_mask) == can_id)
+			set_bit(i, cm->match_sff);
+	}
+}
+
+static inline struct canid_match *em_canid_priv(struct tcf_ematch *m)
+{
+	return (struct canid_match *)m->data;	/* set by em_canid_change() */
+}
+
+static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m,
+			 struct tcf_pkt_info *info)
+{
+	struct canid_match *cm = em_canid_priv(m);
+	canid_t can_id;
+	int match = 0;	/* 1 on match, 0 otherwise */
+	int i;
+	const struct can_filter *lp;
+
+	can_id = em_canid_get_id(skb);
+
+	if (can_id & CAN_EFF_FLAG) {	/* EFF: scan the EFF rules in order */
+		for (i = 0, lp = cm->rules_raw;
+		     i < cm->eff_rules_count; i++, lp++) {
+			if (!(((lp->can_id ^ can_id) & lp->can_mask))) {
+				match = 1;
+				break;
+			}
+		}
+	} else { /* SFF: single lookup in the match_sff bitmap */
+		can_id &= CAN_SFF_MASK;
+		match = (test_bit(can_id, cm->match_sff) ? 1 : 0);
+	}
+
+	return match;
+}
+
+static int em_canid_change(struct tcf_proto *tp, void *data, int len,
+			  struct tcf_ematch *m)
+{
+	struct can_filter *conf = data; /* Array of len / sizeof(*conf) rules */
+	struct canid_match *cm;
+	struct canid_match *cm_old = (struct canid_match *)m->data;
+	int i;
+
+	if (!len)
+		return -EINVAL;	/* no rules at all */
+
+	if (len % sizeof(struct can_filter))
+		return -EINVAL;	/* trailing partial rule */
+
+	if (len > sizeof(struct can_filter) * EM_CAN_RULES_MAX)
+		return -EINVAL;	/* more than EM_CAN_RULES_MAX rules */
+
+	cm = kzalloc(sizeof(struct canid_match) + len, GFP_KERNEL);
+	if (!cm)
+		return -ENOMEM;
+
+	cm->rules_count = len / sizeof(struct can_filter);
+
+	/*
+	 * We need two for() loops for copying rules into two contiguous
+	 * areas in rules_raw to process all eff rules with a simple loop.
+	 * NB: The configuration interface supports sff and eff rules.
+	 * We do not support filters here that match for the same can_id
+	 * provided in a SFF and EFF frame (e.g. 0x123 / 0x80000123).
+	 * For this (unusual) case two filters have to be specified. The
+	 * SFF/EFF separation is done with the CAN_EFF_FLAG in the can_id.
+	 */
+
+	/* Fill rules_raw with EFF rules first */
+	for (i = 0; i < cm->rules_count; i++) {
+		if (conf[i].can_id & CAN_EFF_FLAG) {
+			memcpy(cm->rules_raw + cm->eff_rules_count,
+				&conf[i],
+				sizeof(struct can_filter));
+
+			cm->eff_rules_count++;
+		}
+	}
+
+	/* Append SFF rules and record each one in the match_sff bitmap */
+	for (i = 0; i < cm->rules_count; i++) {
+		if (!(conf[i].can_id & CAN_EFF_FLAG)) {
+			memcpy(cm->rules_raw
+				+ cm->eff_rules_count
+				+ cm->sff_rules_count,
+				&conf[i], sizeof(struct can_filter));
+
+			cm->sff_rules_count++;
+
+			em_canid_sff_match_add(cm,
+				conf[i].can_id, conf[i].can_mask);
+		}
+	}
+
+	m->datalen = sizeof(struct canid_match) + len;
+	m->data = (unsigned long)cm;
+
+	if (cm_old != NULL) {
+		pr_err("canid: Configuring an existing ematch!\n");
+		kfree(cm_old);
+	}
+
+	return 0;
+}
+
+static void em_canid_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
+{
+	struct canid_match *cm = em_canid_priv(m);
+
+	kfree(cm);	/* allocated in em_canid_change() */
+}
+
+static int em_canid_dump(struct sk_buff *skb, struct tcf_ematch *m)
+{
+	struct canid_match *cm = em_canid_priv(m);
+
+	/*
+	 * 'rules_count' was derived from the allocated length when this
+	 * ematch was configured, so the copy stays within 'rules_raw'
+	 */
+	if (nla_put_nohdr(skb, sizeof(struct can_filter) * cm->rules_count,
+	    &cm->rules_raw) < 0)
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static struct tcf_ematch_ops em_canid_ops = {
+	.kind	  = TCF_EM_CANID,
+	.change	  = em_canid_change,	/* validate and store the rules */
+	.match	  = em_canid_match,	/* test one skb against the rules */
+	.destroy  = em_canid_destroy,	/* free the private state */
+	.dump	  = em_canid_dump,	/* put rules_raw[] into the skb */
+	.owner	  = THIS_MODULE,
+	.link	  = LIST_HEAD_INIT(em_canid_ops.link)
+};
+
+static int __init init_em_canid(void)
+{
+	return tcf_em_register(&em_canid_ops);	/* result is the init status */
+}
+
+static void __exit exit_em_canid(void)
+{
+	tcf_em_unregister(&em_canid_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_canid);
+module_exit(exit_em_canid);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_CANID);
diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c
new file mode 100644
index 00000000000..527aeb7a3ff
--- /dev/null
+++ b/net/sched/em_ipset.c
@@ -0,0 +1,136 @@
+/*
+ * net/sched/em_ipset.c	ipset ematch
+ *
+ * Copyright (c) 2012 Florian Westphal <fw@strlen.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/xt_set.h>
+#include <linux/ipv6.h>
+#include <net/ip.h>
+#include <net/pkt_cls.h>
+
+static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len,
+			   struct tcf_ematch *em)
+{
+	struct xt_set_info *set = data;
+	ip_set_id_t index;
+	struct net *net = dev_net(qdisc_dev(tp->q));
+
+	if (data_len != sizeof(*set))
+		return -EINVAL;	/* payload must be exactly one xt_set_info */
+
+	index = ip_set_nfnl_get_byindex(net, set->index);
+	if (index == IPSET_INVALID_ID)
+		return -ENOENT;	/* lookup by set->index failed */
+
+	em->datalen = sizeof(*set);
+	em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL);
+	if (em->data)
+		return 0;
+
+	ip_set_nfnl_put(net, index);	/* copy failed: put the set again */
+	return -ENOMEM;
+}
+
+static void em_ipset_destroy(struct tcf_proto *p, struct tcf_ematch *em)
+{
+	const struct xt_set_info *set = (const void *) em->data;
+	if (set) {
+		ip_set_nfnl_put(dev_net(qdisc_dev(p->q)), set->index);
+		kfree((void *) em->data);	/* kmemdup()ed in change() */
+	}
+}
+
+static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
+			  struct tcf_pkt_info *info)
+{
+	struct ip_set_adt_opt opt;
+	struct xt_action_param acpar;
+	const struct xt_set_info *set = (const void *) em->data;
+	struct net_device *dev, *indev = NULL;
+	int ret, network_offset;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		acpar.family = NFPROTO_IPV4;
+		if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
+			return 0;	/* header not available: no match */
+		acpar.thoff = ip_hdrlen(skb);
+		break;
+	case htons(ETH_P_IPV6):
+		acpar.family = NFPROTO_IPV6;
+		if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
+			return 0;	/* header not available: no match */
+		/* doesn't call ipv6_find_hdr() because ipset doesn't use thoff, yet */
+		acpar.thoff = sizeof(struct ipv6hdr);
+		break;
+	default:
+		return 0;	/* neither IPv4 nor IPv6: no match */
+	}
+
+	acpar.hooknum = 0;
+
+	opt.family = acpar.family;
+	opt.dim = set->dim;
+	opt.flags = set->flags;
+	opt.cmdflags = 0;
+	opt.ext.timeout = ~0u;
+
+	network_offset = skb_network_offset(skb);
+	skb_pull(skb, network_offset);	/* undone by skb_push() below */
+
+	dev = skb->dev;
+
+	rcu_read_lock();
+
+	if (dev && skb->skb_iif)
+		indev = dev_get_by_index_rcu(dev_net(dev), skb->skb_iif);
+
+	acpar.in      = indev ? indev : dev;	/* fall back to skb->dev */
+	acpar.out     = dev;
+
+	ret = ip_set_test(set->index, skb, &acpar, &opt);
+
+	rcu_read_unlock();
+
+	skb_push(skb, network_offset);	/* pairs with skb_pull() above */
+	return ret;
+}
+
+static struct tcf_ematch_ops em_ipset_ops = {
+	.kind	  = TCF_EM_IPSET,
+	.change	  = em_ipset_change,	/* validate and copy xt_set_info */
+	.destroy  = em_ipset_destroy,	/* put the set, free the copy */
+	.match	  = em_ipset_match,	/* test one skb via ip_set_test() */
+	.owner	  = THIS_MODULE,
+	.link	  = LIST_HEAD_INIT(em_ipset_ops.link)
+};
+
+static int __init init_em_ipset(void)
+{
+	return tcf_em_register(&em_ipset_ops);	/* result is the init status */
+}
+
+static void __exit exit_em_ipset(void)
+{
+	tcf_em_unregister(&em_ipset_ops);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("TC extended match for IP sets");
+
+module_init(init_em_ipset);
+module_exit(exit_em_ipset);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_IPSET);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 1363bf14e61..9b8c0b0e60d 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -222,7 +222,7 @@ META_COLLECTOR(int_maclen)
 
 META_COLLECTOR(int_rxhash)
 {
-	dst->value = skb_get_rxhash(skb);
+	dst->value = skb_get_hash(skb);
 }
 
 /**************************************************************************
@@ -264,47 +264,59 @@ META_COLLECTOR(int_rtiif)
 	if (unlikely(skb_rtable(skb) == NULL))
 		*err = -1;
 	else
-		dst->value = skb_rtable(skb)->rt_iif;
+		dst->value = inet_iif(skb);
 }
 
 /**************************************************************************
  * Socket Attributes
  **************************************************************************/
 
-#define SKIP_NONLOCAL(skb)			\
-	if (unlikely(skb->sk == NULL)) {	\
-		*err = -1;			\
-		return;				\
-	}
+#define skip_nonlocal(skb) \
+	(unlikely(skb->sk == NULL))
 
 META_COLLECTOR(int_sk_family)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_family;
 }
 
 META_COLLECTOR(int_sk_state)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_state;
 }
 
 META_COLLECTOR(int_sk_reuse)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_reuse;
 }
 
 META_COLLECTOR(int_sk_bound_if)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	/* No error if bound_dev_if is 0, legal userspace check */
 	dst->value = skb->sk->sk_bound_dev_if;
 }
 
 META_COLLECTOR(var_sk_bound_if)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 
 	if (skb->sk->sk_bound_dev_if == 0) {
 		dst->value = (unsigned long) "any";
@@ -322,151 +334,226 @@ META_COLLECTOR(var_sk_bound_if)
 
 META_COLLECTOR(int_sk_refcnt)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = atomic_read(&skb->sk->sk_refcnt);
 }
 
 META_COLLECTOR(int_sk_rcvbuf)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_rcvbuf;
 }
 
 META_COLLECTOR(int_sk_shutdown)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_shutdown;
 }
 
 META_COLLECTOR(int_sk_proto)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_protocol;
 }
 
 META_COLLECTOR(int_sk_type)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_type;
 }
 
 META_COLLECTOR(int_sk_rmem_alloc)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = sk_rmem_alloc_get(skb->sk);
 }
 
 META_COLLECTOR(int_sk_wmem_alloc)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = sk_wmem_alloc_get(skb->sk);
 }
 
 META_COLLECTOR(int_sk_omem_alloc)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = atomic_read(&skb->sk->sk_omem_alloc);
 }
 
 META_COLLECTOR(int_sk_rcv_qlen)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_receive_queue.qlen;
 }
 
 META_COLLECTOR(int_sk_snd_qlen)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_write_queue.qlen;
 }
 
 META_COLLECTOR(int_sk_wmem_queued)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_wmem_queued;
 }
 
 META_COLLECTOR(int_sk_fwd_alloc)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_forward_alloc;
 }
 
 META_COLLECTOR(int_sk_sndbuf)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_sndbuf;
 }
 
 META_COLLECTOR(int_sk_alloc)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = (__force int) skb->sk->sk_allocation;
 }
 
 META_COLLECTOR(int_sk_hash)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_hash;
 }
 
 META_COLLECTOR(int_sk_lingertime)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_lingertime / HZ;
 }
 
 META_COLLECTOR(int_sk_err_qlen)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_error_queue.qlen;
 }
 
 META_COLLECTOR(int_sk_ack_bl)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_ack_backlog;
 }
 
 META_COLLECTOR(int_sk_max_ack_bl)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_max_ack_backlog;
 }
 
 META_COLLECTOR(int_sk_prio)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_priority;
 }
 
 META_COLLECTOR(int_sk_rcvlowat)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_rcvlowat;
 }
 
 META_COLLECTOR(int_sk_rcvtimeo)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_rcvtimeo / HZ;
 }
 
 META_COLLECTOR(int_sk_sndtimeo)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_sndtimeo / HZ;
 }
 
 META_COLLECTOR(int_sk_sendmsg_off)
 {
-	SKIP_NONLOCAL(skb);
-	dst->value = skb->sk->sk_sndmsg_off;
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
+	dst->value = skb->sk->sk_frag.offset;
 }
 
 META_COLLECTOR(int_sk_write_pend)
 {
-	SKIP_NONLOCAL(skb);
+	if (skip_nonlocal(skb)) {
+		*err = -1;
+		return;
+	}
 	dst->value = skb->sk->sk_write_pending;
 }
 
@@ -585,8 +672,9 @@ static void meta_var_apply_extras(struct meta_value *v,
 
 static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
 {
-	if (v->val && v->len)
-		NLA_PUT(skb, tlv, v->len, (void *) v->val);
+	if (v->val && v->len &&
+	    nla_put(skb, tlv, v->len, (void *) v->val))
+		goto nla_put_failure;
 	return 0;
 
 nla_put_failure:
@@ -636,10 +724,13 @@ static void meta_int_apply_extras(struct meta_value *v,
 
 static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
 {
-	if (v->len == sizeof(unsigned long))
-		NLA_PUT(skb, tlv, sizeof(unsigned long), &v->val);
-	else if (v->len == sizeof(u32))
-		NLA_PUT_U32(skb, tlv, v->val);
+	if (v->len == sizeof(unsigned long)) {
+		if (nla_put(skb, tlv, sizeof(unsigned long), &v->val))
+			goto nla_put_failure;
+	} else if (v->len == sizeof(u32)) {
+		if (nla_put_u32(skb, tlv, v->val))
+			goto nla_put_failure;
+	}
 
 	return 0;
 
@@ -789,8 +880,10 @@ static int em_meta_change(struct tcf_proto *tp, void *data, int len,
 		goto errout;
 
 	meta = kzalloc(sizeof(*meta), GFP_KERNEL);
-	if (meta == NULL)
+	if (meta == NULL) {
+		err = -ENOMEM;
 		goto errout;
+	}
 
 	memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left));
 	memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right));
@@ -831,7 +924,8 @@ static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em)
 	memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left));
 	memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right));
 
-	NLA_PUT(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr);
+	if (nla_put(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr))
+		goto nla_put_failure;
 
 	ops = meta_type_ops(&meta->lvalue);
 	if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 ||
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
index 88d93eb9250..3a633debb6d 100644
--- a/net/sched/ematch.c
+++ b/net/sched/ematch.c
@@ -441,7 +441,8 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
 	if (top_start == NULL)
 		goto nla_put_failure;
 
-	NLA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr);
+	if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr))
+		goto nla_put_failure;
 
 	list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST);
 	if (list_start == NULL)
@@ -457,7 +458,8 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
 			.flags = em->flags
 		};
 
-		NLA_PUT(skb, i + 1, sizeof(em_hdr), &em_hdr);
+		if (nla_put(skb, i + 1, sizeof(em_hdr), &em_hdr))
+			goto nla_put_failure;
 
 		if (em->ops && em->ops->dump) {
 			if (em->ops->dump(skb, em) < 0)
@@ -535,9 +537,7 @@ pop_stack:
 	return res;
 
 stack_overflow:
-	if (net_ratelimit())
-		pr_warning("tc ematch: local stack overflow,"
-			   " increase NET_EMATCH_STACK\n");
+	net_warn_ratelimited("tc ematch: local stack overflow, increase NET_EMATCH_STACK\n");
 	return -1;
 }
 EXPORT_SYMBOL(__tcf_em_tree_match);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 3d8981fde30..58bed7599db 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -135,7 +135,7 @@ static DEFINE_RWLOCK(qdisc_mod_lock);
 
 static struct Qdisc_ops *qdisc_base;
 
-/* Register/uregister queueing discipline */
+/* Register/unregister queueing discipline */
 
 int register_qdisc(struct Qdisc_ops *qops)
 {
@@ -200,6 +200,58 @@ int unregister_qdisc(struct Qdisc_ops *qops)
 }
 EXPORT_SYMBOL(unregister_qdisc);
 
+/* Copy the id of the default qdisc ops into name (buffer of len bytes) */
+void qdisc_get_default(char *name, size_t len)
+{
+	read_lock(&qdisc_mod_lock);
+	strlcpy(name, default_qdisc_ops->id, len);
+	read_unlock(&qdisc_mod_lock);
+}
+
+static struct Qdisc_ops *qdisc_lookup_default(const char *name)
+{
+	struct Qdisc_ops *q = NULL;
+
+	for (q = qdisc_base; q; q = q->next) {
+		if (!strcmp(name, q->id)) {
+			if (!try_module_get(q->owner))
+				q = NULL;	/* owner not pinned: give up */
+			break;
+		}
+	}
+
+	return q;	/* NULL if nothing usable was found */
+}
+
+/* Set the default qdisc by name: 0, -EPERM or -ENOENT (no such qdisc) */
+int qdisc_set_default(const char *name)
+{
+	const struct Qdisc_ops *ops;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	write_lock(&qdisc_mod_lock);
+	ops = qdisc_lookup_default(name);
+	if (!ops) {
+		/* Not found: drop lock, try to load sch_<name>, look again */
+		write_unlock(&qdisc_mod_lock);
+		request_module("sch_%s", name);
+		write_lock(&qdisc_mod_lock);
+
+		ops = qdisc_lookup_default(name);
+	}
+
+	if (ops) {
+		/* Set new default and module_put() the old one's owner */
+		module_put(default_qdisc_ops->owner);
+		default_qdisc_ops = ops;
+	}
+	write_unlock(&qdisc_mod_lock);
+
+	return ops ? 0 : -ENOENT;
+}
+
 /* We know handle. Find qdisc among all qdisc's attached to device
    (root qdisc, all its children, children of children etc.)
  */
@@ -219,11 +271,16 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 	return NULL;
 }
 
-static void qdisc_list_add(struct Qdisc *q)
+void qdisc_list_add(struct Qdisc *q)
 {
-	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
-		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
+	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
+		struct Qdisc *root = qdisc_dev(q)->qdisc;
+
+		WARN_ON_ONCE(root == &noop_qdisc);
+		list_add_tail(&q->list, &root->list);
+	}
 }
+EXPORT_SYMBOL(qdisc_list_add);
 
 void qdisc_list_del(struct Qdisc *q)
 {
@@ -285,28 +342,70 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
 	return q;
 }
 
+/* The linklayer setting was not transferred from iproute2 in older
+ * versions, and the rate table lookup systems have been dropped in
+ * the kernel. To stay backward compatible with older iproute2 tc
+ * utils, we detect the linklayer setting by detecting if the rate
+ * table was modified.
+ *
+ * For linklayer ATM table entries, the rate table will be aligned to
+ * 48 bytes, thus some table entries will contain the same value.  The
+ * mpu (min packet unit) is also encoded into the old rate table, thus
+ * starting from the mpu, we find low and high table entries for
+ * mapping this cell.  If these entries contain the same value, then
+ * the rate tables have been modified for linklayer ATM.
+ *
+ * This is done by rounding mpu up to a multiple of 48 bytes (one cell),
+ * rounding up again to the next cell, taking the table entry one below
+ * that, and comparing the two entries.
+ */
+static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
+{
+	int low       = roundup(r->mpu, 48);
+	int high      = roundup(low+1, 48);
+	int cell_low  = low >> r->cell_log;
+	int cell_high = (high >> r->cell_log) - 1;
+
+	/* rtab is too inaccurate at rates > 100Mbit/s; also skip if rtab[0] is 0 */
+	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
+		pr_debug("TC linklayer: Giving up ATM detection\n");
+		return TC_LINKLAYER_ETHERNET;
+	}
+
+	if ((cell_high > cell_low) && (cell_high < 256)
+	    && (rtab[cell_low] == rtab[cell_high])) {
+		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
+			 cell_low, cell_high, rtab[cell_high]);
+		return TC_LINKLAYER_ATM;
+	}
+	return TC_LINKLAYER_ETHERNET;
+}
+
 static struct qdisc_rate_table *qdisc_rtab_list;
 
 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
 {
 	struct qdisc_rate_table *rtab;
 
+	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
+	    nla_len(tab) != TC_RTAB_SIZE)
+		return NULL;
+
 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
-		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
+		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
+		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
 			rtab->refcnt++;
 			return rtab;
 		}
 	}
 
-	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
-	    nla_len(tab) != TC_RTAB_SIZE)
-		return NULL;
-
 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
 	if (rtab) {
 		rtab->rate = *r;
 		rtab->refcnt = 1;
 		memcpy(rtab->data, nla_data(tab), 1024);
+		if (r->linklayer == TC_LINKLAYER_UNAWARE)
+			r->linklayer = __detect_linklayer(r, rtab->data);
 		rtab->next = qdisc_rtab_list;
 		qdisc_rtab_list = rtab;
 	}
@@ -426,7 +525,8 @@ static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
 	nest = nla_nest_start(skb, TCA_STAB);
 	if (nest == NULL)
 		goto nla_put_failure;
-	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
+	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
+		goto nla_put_failure;
 	nla_nest_end(skb, nest);
 
 	return skb->len;
@@ -463,7 +563,7 @@ out:
 }
 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
 
-void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
+void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
 {
 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
 		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
@@ -492,20 +592,19 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
 }
 EXPORT_SYMBOL(qdisc_watchdog_init);
 
-void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
+void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
 {
-	ktime_t time;
-
 	if (test_bit(__QDISC_STATE_DEACTIVATED,
 		     &qdisc_root_sleeping(wd->qdisc)->state))
 		return;
 
 	qdisc_throttled(wd->qdisc);
-	time = ktime_set(0, 0);
-	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
-	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
+
+	hrtimer_start(&wd->timer,
+		      ns_to_ktime(expires),
+		      HRTIMER_MODE_ABS);
 }
-EXPORT_SYMBOL(qdisc_watchdog_schedule);
+EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
 
 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 {
@@ -545,7 +644,7 @@ static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
 {
 	struct Qdisc_class_common *cl;
-	struct hlist_node *n, *next;
+	struct hlist_node *next;
 	struct hlist_head *nhash, *ohash;
 	unsigned int nsize, nmask, osize;
 	unsigned int i, h;
@@ -564,7 +663,7 @@ void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
 
 	sch_tree_lock(sch);
 	for (i = 0; i < osize; i++) {
-		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
+		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
 			h = qdisc_class_hash(cl->classid, nmask);
 			hlist_add_head(&cl->hnode, &nhash[h]);
 		}
@@ -643,9 +742,11 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 	const struct Qdisc_class_ops *cops;
 	unsigned long cl;
 	u32 parentid;
+	int drops;
 
 	if (n == 0)
 		return;
+	drops = max_t(int, n, 0);
 	while ((parentid = sch->parent)) {
 		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
 			return;
@@ -662,6 +763,7 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 			cops->put(sch, cl);
 		}
 		sch->q.qlen -= n;
+		sch->qstats.drops += drops;
 	}
 }
 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
@@ -833,6 +935,8 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
 				goto err_out3;
 		}
 		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
+		if (!netif_is_multiqueue(dev))
+			sch->flags |= TCQ_F_ONETXQUEUE;
 	}
 
 	sch->handle = handle;
@@ -969,25 +1073,30 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
  * Delete/get qdisc.
  */
 
-static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
 {
 	struct net *net = sock_net(skb->sk);
-	struct tcmsg *tcm = NLMSG_DATA(n);
+	struct tcmsg *tcm = nlmsg_data(n);
 	struct nlattr *tca[TCA_MAX + 1];
 	struct net_device *dev;
-	u32 clid = tcm->tcm_parent;
+	u32 clid;
 	struct Qdisc *q = NULL;
 	struct Qdisc *p = NULL;
 	int err;
 
-	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
-	if (!dev)
-		return -ENODEV;
+	if ((n->nlmsg_type != RTM_GETQDISC) &&
+	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
 
 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
 	if (err < 0)
 		return err;
 
+	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+	if (!dev)
+		return -ENODEV;
+
+	clid = tcm->tcm_parent;
 	if (clid) {
 		if (clid != TC_H_ROOT) {
 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
@@ -1033,7 +1142,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
  * Create/change qdisc.
  */
 
-static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
 {
 	struct net *net = sock_net(skb->sk);
 	struct tcmsg *tcm;
@@ -1043,9 +1152,16 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 	struct Qdisc *q, *p;
 	int err;
 
+	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
 replay:
 	/* Reinit, just in case something touches this. */
-	tcm = NLMSG_DATA(n);
+	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
+	if (err < 0)
+		return err;
+
+	tcm = nlmsg_data(n);
 	clid = tcm->tcm_parent;
 	q = p = NULL;
 
@@ -1053,9 +1169,6 @@ replay:
 	if (!dev)
 		return -ENODEV;
 
-	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
-	if (err < 0)
-		return err;
 
 	if (clid) {
 		if (clid != TC_H_ROOT) {
@@ -1184,7 +1297,7 @@ graft:
 }
 
 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
-			 u32 pid, u32 seq, u16 flags, int event)
+			 u32 portid, u32 seq, u16 flags, int event)
 {
 	struct tcmsg *tcm;
 	struct nlmsghdr  *nlh;
@@ -1192,8 +1305,11 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	struct gnet_dump d;
 	struct qdisc_size_table *stab;
 
-	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
-	tcm = NLMSG_DATA(nlh);
+	cond_resched();
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+	if (!nlh)
+		goto out_nlmsg_trim;
+	tcm = nlmsg_data(nlh);
 	tcm->tcm_family = AF_UNSPEC;
 	tcm->tcm__pad1 = 0;
 	tcm->tcm__pad2 = 0;
@@ -1201,7 +1317,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	tcm->tcm_parent = clid;
 	tcm->tcm_handle = q->handle;
 	tcm->tcm_info = atomic_read(&q->refcnt);
-	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
+	if (nla_put_string(skb, TCA_KIND, q->ops->id))
+		goto nla_put_failure;
 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
 		goto nla_put_failure;
 	q->qstats.qlen = q->q.qlen;
@@ -1228,7 +1345,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 	return skb->len;
 
-nlmsg_failure:
+out_nlmsg_trim:
 nla_put_failure:
 	nlmsg_trim(skb, b);
 	return -1;
@@ -1244,25 +1361,25 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb,
 			struct Qdisc *old, struct Qdisc *new)
 {
 	struct sk_buff *skb;
-	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
+	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb)
 		return -ENOBUFS;
 
 	if (old && !tc_qdisc_dump_ignore(old)) {
-		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq,
+		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
 				  0, RTM_DELQDISC) < 0)
 			goto err_out;
 	}
 	if (new && !tc_qdisc_dump_ignore(new)) {
-		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
+		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
 				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
 			goto err_out;
 	}
 
 	if (skb->len)
-		return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
+		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 				      n->nlmsg_flags & NLM_F_ECHO);
 
 err_out:
@@ -1285,7 +1402,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
 		q_idx++;
 	} else {
 		if (!tc_qdisc_dump_ignore(q) &&
-		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
+		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
 			goto done;
 		q_idx++;
@@ -1296,7 +1413,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
 			continue;
 		}
 		if (!tc_qdisc_dump_ignore(q) &&
-		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
+		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
 			goto done;
 		q_idx++;
@@ -1320,9 +1437,9 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 	s_idx = cb->args[0];
 	s_q_idx = q_idx = cb->args[1];
 
-	rcu_read_lock();
 	idx = 0;
-	for_each_netdev_rcu(net, dev) {
+	ASSERT_RTNL();
+	for_each_netdev(net, dev) {
 		struct netdev_queue *dev_queue;
 
 		if (idx < s_idx)
@@ -1345,8 +1462,6 @@ cont:
 	}
 
 done:
-	rcu_read_unlock();
-
 	cb->args[0] = idx;
 	cb->args[1] = q_idx;
 
@@ -1361,29 +1476,33 @@ done:
 
 
 
-static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
 {
 	struct net *net = sock_net(skb->sk);
-	struct tcmsg *tcm = NLMSG_DATA(n);
+	struct tcmsg *tcm = nlmsg_data(n);
 	struct nlattr *tca[TCA_MAX + 1];
 	struct net_device *dev;
 	struct Qdisc *q = NULL;
 	const struct Qdisc_class_ops *cops;
 	unsigned long cl = 0;
 	unsigned long new_cl;
-	u32 pid = tcm->tcm_parent;
-	u32 clid = tcm->tcm_handle;
-	u32 qid = TC_H_MAJ(clid);
+	u32 portid;
+	u32 clid;
+	u32 qid;
 	int err;
 
-	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
-	if (!dev)
-		return -ENODEV;
+	if ((n->nlmsg_type != RTM_GETTCLASS) &&
+	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
 
 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
 	if (err < 0)
 		return err;
 
+	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+	if (!dev)
+		return -ENODEV;
+
 	/*
 	   parent == TC_H_UNSPEC - unspecified parent.
 	   parent == TC_H_ROOT   - class is root, which has no parent.
@@ -1399,8 +1518,12 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 
 	/* Step 1. Determine qdisc handle X:0 */
 
-	if (pid != TC_H_ROOT) {
-		u32 qid1 = TC_H_MAJ(pid);
+	portid = tcm->tcm_parent;
+	clid = tcm->tcm_handle;
+	qid = TC_H_MAJ(clid);
+
+	if (portid != TC_H_ROOT) {
+		u32 qid1 = TC_H_MAJ(portid);
 
 		if (qid && qid1) {
 			/* If both majors are known, they must be identical. */
@@ -1414,10 +1537,10 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 		/* Now qid is genuine qdisc handle consistent
 		 * both with parent and child.
 		 *
-		 * TC_H_MAJ(pid) still may be unspecified, complete it now.
+		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
 		 */
-		if (pid)
-			pid = TC_H_MAKE(qid, pid);
+		if (portid)
+			portid = TC_H_MAKE(qid, portid);
 	} else {
 		if (qid == 0)
 			qid = dev->qdisc->handle;
@@ -1435,7 +1558,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 
 	/* Now try to get class */
 	if (clid == 0) {
-		if (pid == TC_H_ROOT)
+		if (portid == TC_H_ROOT)
 			clid = qid;
 	} else
 		clid = TC_H_MAKE(qid, clid);
@@ -1474,7 +1597,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 	new_cl = cl;
 	err = -EOPNOTSUPP;
 	if (cops->change)
-		err = cops->change(q, clid, pid, tca, &new_cl);
+		err = cops->change(q, clid, portid, tca, &new_cl);
 	if (err == 0)
 		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
 
@@ -1488,7 +1611,7 @@ out:
 
 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
 			  unsigned long cl,
-			  u32 pid, u32 seq, u16 flags, int event)
+			  u32 portid, u32 seq, u16 flags, int event)
 {
 	struct tcmsg *tcm;
 	struct nlmsghdr  *nlh;
@@ -1496,8 +1619,11 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
 	struct gnet_dump d;
 	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
 
-	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
-	tcm = NLMSG_DATA(nlh);
+	cond_resched();
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+	if (!nlh)
+		goto out_nlmsg_trim;
+	tcm = nlmsg_data(nlh);
 	tcm->tcm_family = AF_UNSPEC;
 	tcm->tcm__pad1 = 0;
 	tcm->tcm__pad2 = 0;
@@ -1505,7 +1631,8 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
 	tcm->tcm_parent = q->handle;
 	tcm->tcm_handle = q->handle;
 	tcm->tcm_info = 0;
-	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
+	if (nla_put_string(skb, TCA_KIND, q->ops->id))
+		goto nla_put_failure;
 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
 		goto nla_put_failure;
 
@@ -1522,7 +1649,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 	return skb->len;
 
-nlmsg_failure:
+out_nlmsg_trim:
 nla_put_failure:
 	nlmsg_trim(skb, b);
 	return -1;
@@ -1533,18 +1660,18 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb,
 			 unsigned long cl, int event)
 {
 	struct sk_buff *skb;
-	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
+	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb)
 		return -ENOBUFS;
 
-	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
+	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
 		kfree_skb(skb);
 		return -EINVAL;
 	}
 
-	return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
+	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 			      n->nlmsg_flags & NLM_F_ECHO);
 }
 
@@ -1558,7 +1685,7 @@ static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walk
 {
 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
 
-	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
+	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
 }
 
@@ -1613,13 +1740,13 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
 
 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
+	struct tcmsg *tcm = nlmsg_data(cb->nlh);
 	struct net *net = sock_net(skb->sk);
 	struct netdev_queue *dev_queue;
 	struct net_device *dev;
 	int t, s_t;
 
-	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
+	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
 		return 0;
 	dev = dev_get_by_index(net, tcm->tcm_ifindex);
 	if (!dev)
@@ -1688,12 +1815,10 @@ reclassify:
 		tp = otp;
 
 		if (verd++ >= MAX_REC_LOOP) {
-			if (net_ratelimit())
-				pr_notice("%s: packet reclassify loop"
-					  " rule prio %u protocol %02x\n",
-					  tp->q->ops->id,
-					  tp->prio & 0xffff,
-					  ntohs(tp->protocol));
+			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
+					       tp->q->ops->id,
+					       tp->prio & 0xffff,
+					       ntohs(tp->protocol));
 			return TC_ACT_SHOT;
 		}
 		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
@@ -1753,7 +1878,7 @@ static int __net_init psched_net_init(struct net *net)
 {
 	struct proc_dir_entry *e;
 
-	e = proc_net_fops_create(net, "psched", 0, &psched_fops);
+	e = proc_create("psched", 0, net->proc_net, &psched_fops);
 	if (e == NULL)
 		return -ENOMEM;
 
@@ -1762,7 +1887,7 @@ static int __net_init psched_net_init(struct net *net)
 
 static void __net_exit psched_net_exit(struct net *net)
 {
-	proc_net_remove(net, "psched");
+	remove_proc_entry("psched", net->proc_net);
 }
 #else
 static int __net_init psched_net_init(struct net *net)
@@ -1791,6 +1916,7 @@ static int __init pktsched_init(void)
 		return err;
 	}
 
+	register_qdisc(&pfifo_fast_ops);
 	register_qdisc(&pfifo_qdisc_ops);
 	register_qdisc(&bfifo_qdisc_ops);
 	register_qdisc(&pfifo_head_drop_qdisc_ops);
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index e25e49061a0..8449b337f9e 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -16,8 +16,6 @@
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 
-extern struct socket *sockfd_lookup(int fd, int *err);	/* @@@ fix this */
-
 /*
  * The ATM queuing discipline provides a framework for invoking classifiers
  * (aka "filters"), which in turn select classes of this queuing discipline.
@@ -423,8 +421,6 @@ drop: __maybe_unused
 		}
 		return ret;
 	}
-	qdisc_bstats_update(sch, skb);
-	bstats_update(&flow->bstats, skb);
 	/*
 	 * Okay, this may seem weird. We pretend we've dropped the packet if
 	 * it goes via ATM. The reason for this is that the outer qdisc
@@ -472,6 +468,8 @@ static void sch_atm_dequeue(unsigned long data)
 			if (unlikely(!skb))
 				break;
 
+			qdisc_bstats_update(sch, skb);
+			bstats_update(&flow->bstats, skb);
 			pr_debug("atm_tc_dequeue: sending on class %p\n", flow);
 			/* remove any LL header somebody else has attached */
 			skb_pull(skb, skb_network_offset(skb));
@@ -601,26 +599,31 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
 	if (nest == NULL)
 		goto nla_put_failure;
 
-	NLA_PUT(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr);
+	if (nla_put(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr))
+		goto nla_put_failure;
 	if (flow->vcc) {
 		struct sockaddr_atmpvc pvc;
 		int state;
 
+		memset(&pvc, 0, sizeof(pvc));
 		pvc.sap_family = AF_ATMPVC;
 		pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1;
 		pvc.sap_addr.vpi = flow->vcc->vpi;
 		pvc.sap_addr.vci = flow->vcc->vci;
-		NLA_PUT(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc);
+		if (nla_put(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc))
+			goto nla_put_failure;
 		state = ATM_VF2VS(flow->vcc->flags);
-		NLA_PUT_U32(skb, TCA_ATM_STATE, state);
+		if (nla_put_u32(skb, TCA_ATM_STATE, state))
+			goto nla_put_failure;
 	}
-	if (flow->excess)
-		NLA_PUT_U32(skb, TCA_ATM_EXCESS, flow->classid);
-	else
-		NLA_PUT_U32(skb, TCA_ATM_EXCESS, 0);
-
-	nla_nest_end(skb, nest);
-	return skb->len;
+	if (flow->excess) {
+		if (nla_put_u32(skb, TCA_ATM_EXCESS, flow->classid))
+			goto nla_put_failure;
+	} else {
+		if (nla_put_u32(skb, TCA_ATM_EXCESS, 0))
+			goto nla_put_failure;
+	}
+	return nla_nest_end(skb, nest);
 
 nla_put_failure:
 	nla_nest_cancel(skb, nest);
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 24d94c097b3..ead526467cc 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -130,7 +130,7 @@ struct cbq_class {
 	psched_time_t		penalized;
 	struct gnet_stats_basic_packed bstats;
 	struct gnet_stats_queue qstats;
-	struct gnet_stats_rate_est rate_est;
+	struct gnet_stats_rate_est64 rate_est;
 	struct tc_cbq_xstats	xstats;
 
 	struct tcf_proto	*filter_list;
@@ -250,10 +250,11 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
 				cl = defmap[TC_PRIO_BESTEFFORT];
 
-			if (cl == NULL || cl->level >= head->level)
+			if (cl == NULL)
 				goto fallback;
 		}
-
+		if (cl->level >= head->level)
+			goto fallback;
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
 		case TC_ACT_QUEUED:
@@ -508,8 +509,7 @@ static void cbq_ovl_delay(struct cbq_class *cl)
 			cl->cpriority = TC_CBQ_MAXPRIO;
 			q->pmask |= (1<<TC_CBQ_MAXPRIO);
 
-			expires = ktime_set(0, 0);
-			expires = ktime_add_ns(expires, PSCHED_TICKS2NS(sched));
+			expires = ns_to_ktime(PSCHED_TICKS2NS(sched));
 			if (hrtimer_try_to_cancel(&q->delay_timer) &&
 			    ktime_to_ns(ktime_sub(
 					hrtimer_get_expires(&q->delay_timer),
@@ -962,8 +962,11 @@ cbq_dequeue(struct Qdisc *sch)
 		cbq_update(q);
 		if ((incr -= incr2) < 0)
 			incr = 0;
+		q->now += incr;
+	} else {
+		if (now > q->now)
+			q->now = now;
 	}
-	q->now += incr;
 	q->now_rt = now;
 
 	for (;;) {
@@ -1041,14 +1044,13 @@ static void cbq_adjust_levels(struct cbq_class *this)
 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
 {
 	struct cbq_class *cl;
-	struct hlist_node *n;
 	unsigned int h;
 
 	if (q->quanta[prio] == 0)
 		return;
 
 	for (h = 0; h < q->clhash.hashsize; h++) {
-		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
+		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
 			/* BUGGGG... Beware! This expression suffer of
 			 * arithmetic overflows!
 			 */
@@ -1056,9 +1058,10 @@ static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
 				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
 					q->quanta[prio];
 			}
-			if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
-				pr_warning("CBQ: class %08x has bad quantum==%ld, repaired.\n",
-					   cl->common.classid, cl->quantum);
+			if (cl->quantum <= 0 ||
+			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
+				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
+					cl->common.classid, cl->quantum);
 				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
 			}
 		}
@@ -1087,10 +1090,9 @@ static void cbq_sync_defmap(struct cbq_class *cl)
 			continue;
 
 		for (h = 0; h < q->clhash.hashsize; h++) {
-			struct hlist_node *n;
 			struct cbq_class *c;
 
-			hlist_for_each_entry(c, n, &q->clhash.hash[h],
+			hlist_for_each_entry(c, &q->clhash.hash[h],
 					     common.hnode) {
 				if (c->split == split && c->level < level &&
 				    c->defmap & (1<<i)) {
@@ -1210,7 +1212,6 @@ cbq_reset(struct Qdisc *sch)
 {
 	struct cbq_sched_data *q = qdisc_priv(sch);
 	struct cbq_class *cl;
-	struct hlist_node *n;
 	int prio;
 	unsigned int h;
 
@@ -1228,7 +1229,7 @@ cbq_reset(struct Qdisc *sch)
 		q->active[prio] = NULL;
 
 	for (h = 0; h < q->clhash.hashsize; h++) {
-		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
+		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
 			qdisc_reset(cl->q);
 
 			cl->next_alive = NULL;
@@ -1425,7 +1426,8 @@ static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
 {
 	unsigned char *b = skb_tail_pointer(skb);
 
-	NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
+	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
+		goto nla_put_failure;
 	return skb->len;
 
 nla_put_failure:
@@ -1450,7 +1452,8 @@ static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
 	opt.minidle = (u32)(-cl->minidle);
 	opt.offtime = cl->offtime;
 	opt.change = ~0;
-	NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
+	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
+		goto nla_put_failure;
 	return skb->len;
 
 nla_put_failure:
@@ -1463,12 +1466,14 @@ static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tc_cbq_wrropt opt;
 
+	memset(&opt, 0, sizeof(opt));
 	opt.flags = 0;
 	opt.allot = cl->allot;
 	opt.priority = cl->priority + 1;
 	opt.cpriority = cl->cpriority + 1;
 	opt.weight = cl->weight;
-	NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
+	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
+		goto nla_put_failure;
 	return skb->len;
 
 nla_put_failure:
@@ -1485,7 +1490,8 @@ static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
 	opt.priority2 = cl->priority2 + 1;
 	opt.pad = 0;
 	opt.penalty = cl->penalty;
-	NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
+	if (nla_put(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt))
+		goto nla_put_failure;
 	return skb->len;
 
 nla_put_failure:
@@ -1502,7 +1508,8 @@ static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
 		opt.split = cl->split ? cl->split->common.classid : 0;
 		opt.defmap = cl->defmap;
 		opt.defchange = ~0;
-		NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
+		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
+			goto nla_put_failure;
 	}
 	return skb->len;
 
@@ -1521,7 +1528,8 @@ static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
 		opt.police = cl->police;
 		opt.__res1 = 0;
 		opt.__res2 = 0;
-		NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
+		if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt))
+			goto nla_put_failure;
 	}
 	return skb->len;
 
@@ -1555,8 +1563,7 @@ static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
 		goto nla_put_failure;
 	if (cbq_dump_attr(skb, &q->link) < 0)
 		goto nla_put_failure;
-	nla_nest_end(skb, nest);
-	return skb->len;
+	return nla_nest_end(skb, nest);
 
 nla_put_failure:
 	nla_nest_cancel(skb, nest);
@@ -1591,8 +1598,7 @@ cbq_dump_class(struct Qdisc *sch, unsigned long arg,
 		goto nla_put_failure;
 	if (cbq_dump_attr(skb, cl) < 0)
 		goto nla_put_failure;
-	nla_nest_end(skb, nest);
-	return skb->len;
+	return nla_nest_end(skb, nest);
 
 nla_put_failure:
 	nla_nest_cancel(skb, nest);
@@ -1691,7 +1697,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
 static void cbq_destroy(struct Qdisc *sch)
 {
 	struct cbq_sched_data *q = qdisc_priv(sch);
-	struct hlist_node *n, *next;
+	struct hlist_node *next;
 	struct cbq_class *cl;
 	unsigned int h;
 
@@ -1704,11 +1710,11 @@ static void cbq_destroy(struct Qdisc *sch)
 	 * be bound to classes which have been destroyed already. --TGR '04
 	 */
 	for (h = 0; h < q->clhash.hashsize; h++) {
-		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode)
+		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)
 			tcf_destroy_chain(&cl->filter_list);
 	}
 	for (h = 0; h < q->clhash.hashsize; h++) {
-		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h],
+		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
 					  common.hnode)
 			cbq_destroy_class(sch, cl);
 	}
@@ -1775,8 +1781,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 						    qdisc_root_sleeping_lock(sch),
 						    tca[TCA_RATE]);
 			if (err) {
-				if (rtab)
-					qdisc_put_rtab(rtab);
+				qdisc_put_rtab(rtab);
 				return err;
 			}
 		}
@@ -2007,14 +2012,13 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 {
 	struct cbq_sched_data *q = qdisc_priv(sch);
 	struct cbq_class *cl;
-	struct hlist_node *n;
 	unsigned int h;
 
 	if (arg->stop)
 		return;
 
 	for (h = 0; h < q->clhash.hashsize; h++) {
-		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
+		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
 			if (arg->count < arg->skip) {
 				arg->count++;
 				continue;
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 7e267d7b9c7..ed30e436128 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -14,7 +14,6 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
-#include <linux/reciprocal_div.h>
 #include <linux/vmalloc.h>
 #include <net/pkt_sched.h>
 #include <net/inet_ecn.h>
@@ -77,12 +76,6 @@ struct choke_sched_data {
 	struct sk_buff **tab;
 };
 
-/* deliver a random number between 0 and N - 1 */
-static u32 random_N(unsigned int N)
-{
-	return reciprocal_divide(random32(), N);
-}
-
 /* number of elements in queue including holes */
 static unsigned int choke_len(const struct choke_sched_data *q)
 {
@@ -233,7 +226,7 @@ static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
 	int retrys = 3;
 
 	do {
-		*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
+		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
 		skb = q->tab[*pidx];
 		if (skb)
 			return skb;
@@ -332,15 +325,13 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	}
 
 	q->stats.pdrop++;
-	sch->qstats.drops++;
-	kfree_skb(skb);
-	return NET_XMIT_DROP;
+	return qdisc_drop(skb, sch);
 
- congestion_drop:
+congestion_drop:
 	qdisc_drop(skb, sch);
 	return NET_XMIT_CN;
 
- other_drop:
+other_drop:
 	if (ret & __NET_XMIT_BYPASS)
 		sch->qstats.drops++;
 	kfree_skb(skb);
@@ -400,12 +391,7 @@ static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
 
 static void choke_free(void *addr)
 {
-	if (addr) {
-		if (is_vmalloc_addr(addr))
-			vfree(addr);
-		else
-			kfree(addr);
-	}
+	kvfree(addr);
 }
 
 static int choke_change(struct Qdisc *sch, struct nlattr *opt)
@@ -440,7 +426,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
 	if (mask != q->tab_mask) {
 		struct sk_buff **ntab;
 
-		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
+		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
+			       GFP_KERNEL | __GFP_NOWARN);
 		if (!ntab)
 			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
 		if (!ntab)
@@ -515,8 +502,9 @@ static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (opts == NULL)
 		goto nla_put_failure;
 
-	NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt);
-	NLA_PUT_U32(skb, TCA_CHOKE_MAX_P, q->parms.max_P);
+	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
+	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
+		goto nla_put_failure;
 	return nla_nest_end(skb, opts);
 
 nla_put_failure:
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
new file mode 100644
index 00000000000..2f9ab17db85
--- /dev/null
+++ b/net/sched/sch_codel.c
@@ -0,0 +1,276 @@
+/*
+ * Codel - The Controlled-Delay Active Queue Management algorithm
+ *
+ *  Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com>
+ *  Copyright (C) 2011-2012 Van Jacobson <van@pollere.net>
+ *
+ *  Implemented on linux by :
+ *  Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
+ *  Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The names of the authors may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/prefetch.h>
+#include <net/pkt_sched.h>
+#include <net/codel.h>
+
+
+#define DEFAULT_CODEL_LIMIT 1000
+
+struct codel_sched_data {
+	struct codel_params	params;
+	struct codel_vars	vars;
+	struct codel_stats	stats;
+	u32			drop_overlimit;
+};
+
+/* This is the specific function called from codel_dequeue()
+ * to dequeue a packet from the queue. Note: backlog is handled in
+ * codel, so we don't need to reduce it here.
+ */
+static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
+{
+	struct sk_buff *skb = __skb_dequeue(&sch->q);
+
+	prefetch(&skb->end); /* we'll need skb_shinfo() */
+	return skb;
+}
+
+static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+
+	skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue);
+
+	/* We can't call qdisc_tree_decrease_qlen() if our qlen is 0,
+	 * or HTB crashes. Defer it to the next round.
+	 */
+	if (q->stats.drop_count && sch->q.qlen) {
+		qdisc_tree_decrease_qlen(sch, q->stats.drop_count);
+		q->stats.drop_count = 0;
+	}
+	if (skb)
+		qdisc_bstats_update(sch, skb);
+	return skb;
+}
+
+static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct codel_sched_data *q;
+
+	if (likely(qdisc_qlen(sch) < sch->limit)) {
+		codel_set_enqueue_time(skb);
+		return qdisc_enqueue_tail(skb, sch);
+	}
+	q = qdisc_priv(sch);
+	q->drop_overlimit++;
+	return qdisc_drop(skb, sch);
+}
+
+static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
+	[TCA_CODEL_TARGET]	= { .type = NLA_U32 },
+	[TCA_CODEL_LIMIT]	= { .type = NLA_U32 },
+	[TCA_CODEL_INTERVAL]	= { .type = NLA_U32 },
+	[TCA_CODEL_ECN]		= { .type = NLA_U32 },
+};
+
+static int codel_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_CODEL_MAX + 1];
+	unsigned int qlen;
+	int err;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy);
+	if (err < 0)
+		return err;
+
+	sch_tree_lock(sch);
+
+	if (tb[TCA_CODEL_TARGET]) {
+		u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]);
+
+		q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT;
+	}
+
+	if (tb[TCA_CODEL_INTERVAL]) {
+		u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]);
+
+		q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT;
+	}
+
+	if (tb[TCA_CODEL_LIMIT])
+		sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]);
+
+	if (tb[TCA_CODEL_ECN])
+		q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]);
+
+	qlen = sch->q.qlen;
+	while (sch->q.qlen > sch->limit) {
+		struct sk_buff *skb = __skb_dequeue(&sch->q);
+
+		sch->qstats.backlog -= qdisc_pkt_len(skb);
+		qdisc_drop(skb, sch);
+	}
+	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+static int codel_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+
+	sch->limit = DEFAULT_CODEL_LIMIT;
+
+	codel_params_init(&q->params);
+	codel_vars_init(&q->vars);
+	codel_stats_init(&q->stats);
+
+	if (opt) {
+		int err = codel_change(sch, opt);
+
+		if (err)
+			return err;
+	}
+
+	if (sch->limit >= 1)
+		sch->flags |= TCQ_F_CAN_BYPASS;
+	else
+		sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+	return 0;
+}
+
+static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts;
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CODEL_TARGET,
+			codel_time_to_us(q->params.target)) ||
+	    nla_put_u32(skb, TCA_CODEL_LIMIT,
+			sch->limit) ||
+	    nla_put_u32(skb, TCA_CODEL_INTERVAL,
+			codel_time_to_us(q->params.interval)) ||
+	    nla_put_u32(skb, TCA_CODEL_ECN,
+			q->params.ecn))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -1;
+}
+
+static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	const struct codel_sched_data *q = qdisc_priv(sch);
+	struct tc_codel_xstats st = {
+		.maxpacket	= q->stats.maxpacket,
+		.count		= q->vars.count,
+		.lastcount	= q->vars.lastcount,
+		.drop_overlimit = q->drop_overlimit,
+		.ldelay		= codel_time_to_us(q->vars.ldelay),
+		.dropping	= q->vars.dropping,
+		.ecn_mark	= q->stats.ecn_mark,
+	};
+
+	if (q->vars.dropping) {
+		codel_tdiff_t delta = q->vars.drop_next - codel_get_time();
+
+		if (delta >= 0)
+			st.drop_next = codel_time_to_us(delta);
+		else
+			st.drop_next = -codel_time_to_us(-delta);
+	}
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void codel_reset(struct Qdisc *sch)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+
+	qdisc_reset_queue(sch);
+	codel_vars_init(&q->vars);
+}
+
+static struct Qdisc_ops codel_qdisc_ops __read_mostly = {
+	.id		=	"codel",
+	.priv_size	=	sizeof(struct codel_sched_data),
+
+	.enqueue	=	codel_qdisc_enqueue,
+	.dequeue	=	codel_qdisc_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.init		=	codel_init,
+	.reset		=	codel_reset,
+	.change 	=	codel_change,
+	.dump		=	codel_dump,
+	.dump_stats	=	codel_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init codel_module_init(void)
+{
+	return register_qdisc(&codel_qdisc_ops);
+}
+
+static void __exit codel_module_exit(void)
+{
+	unregister_qdisc(&codel_qdisc_ops);
+}
+
+module_init(codel_module_init)
+module_exit(codel_module_exit)
+
+MODULE_DESCRIPTION("Controlled Delay queue discipline");
+MODULE_AUTHOR("Dave Taht");
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 6b7fe4a84f1..7bbbfe11219 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -25,7 +25,7 @@ struct drr_class {
 
 	struct gnet_stats_basic_packed		bstats;
 	struct gnet_stats_queue		qstats;
-	struct gnet_stats_rate_est	rate_est;
+	struct gnet_stats_rate_est64	rate_est;
 	struct list_head		alist;
 	struct Qdisc			*qdisc;
 
@@ -260,7 +260,8 @@ static int drr_dump_class(struct Qdisc *sch, unsigned long arg,
 	nest = nla_nest_start(skb, TCA_OPTIONS);
 	if (nest == NULL)
 		goto nla_put_failure;
-	NLA_PUT_U32(skb, TCA_DRR_QUANTUM, cl->quantum);
+	if (nla_put_u32(skb, TCA_DRR_QUANTUM, cl->quantum))
+		goto nla_put_failure;
 	return nla_nest_end(skb, nest);
 
 nla_put_failure:
@@ -292,14 +293,13 @@ static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 {
 	struct drr_sched *q = qdisc_priv(sch);
 	struct drr_class *cl;
-	struct hlist_node *n;
 	unsigned int i;
 
 	if (arg->stop)
 		return;
 
 	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
+		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
 			if (arg->count < arg->skip) {
 				arg->count++;
 				continue;
@@ -351,7 +351,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct drr_sched *q = qdisc_priv(sch);
 	struct drr_class *cl;
-	int err;
+	int err = 0;
 
 	cl = drr_classify(skb, sch, &err);
 	if (cl == NULL) {
@@ -375,8 +375,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		cl->deficit = cl->quantum;
 	}
 
-	bstats_update(&cl->bstats, skb);
-
 	sch->q.qlen++;
 	return err;
 }
@@ -393,8 +391,10 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)
 	while (1) {
 		cl = list_first_entry(&q->active, struct drr_class, alist);
 		skb = cl->qdisc->ops->peek(cl->qdisc);
-		if (skb == NULL)
+		if (skb == NULL) {
+			qdisc_warn_nonwc(__func__, cl->qdisc);
 			goto out;
+		}
 
 		len = qdisc_pkt_len(skb);
 		if (len <= cl->deficit) {
@@ -402,6 +402,8 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)
 			skb = qdisc_dequeue_peeked(cl->qdisc);
 			if (cl->qdisc->q.qlen == 0)
 				list_del(&cl->alist);
+
+			bstats_update(&cl->bstats, skb);
 			qdisc_bstats_update(sch, skb);
 			sch->q.qlen--;
 			return skb;
@@ -450,11 +452,10 @@ static void drr_reset_qdisc(struct Qdisc *sch)
 {
 	struct drr_sched *q = qdisc_priv(sch);
 	struct drr_class *cl;
-	struct hlist_node *n;
 	unsigned int i;
 
 	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
+		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
 			if (cl->qdisc->q.qlen)
 				list_del(&cl->alist);
 			qdisc_reset(cl->qdisc);
@@ -467,13 +468,13 @@ static void drr_destroy_qdisc(struct Qdisc *sch)
 {
 	struct drr_sched *q = qdisc_priv(sch);
 	struct drr_class *cl;
-	struct hlist_node *n, *next;
+	struct hlist_node *next;
 	unsigned int i;
 
 	tcf_destroy_chain(&q->filter_list);
 
 	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
+		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
 					  common.hnode)
 			drr_destroy_class(sch, cl);
 	}
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 2c790204d04..49d6ef338b5 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -47,7 +47,7 @@ struct dsmark_qdisc_data {
 
 static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
 {
-	return (index <= p->indices && index > 0);
+	return index <= p->indices && index > 0;
 }
 
 /* ------------------------- Class/flow operations ------------------------- */
@@ -57,8 +57,8 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
 {
 	struct dsmark_qdisc_data *p = qdisc_priv(sch);
 
-	pr_debug("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",
-		sch, p, new, old);
+	pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n",
+		 __func__, sch, p, new, old);
 
 	if (new == NULL) {
 		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
@@ -85,8 +85,8 @@ static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
 
 static unsigned long dsmark_get(struct Qdisc *sch, u32 classid)
 {
-	pr_debug("dsmark_get(sch %p,[qdisc %p],classid %x)\n",
-		sch, qdisc_priv(sch), classid);
+	pr_debug("%s(sch %p,[qdisc %p],classid %x)\n",
+		 __func__, sch, qdisc_priv(sch), classid);
 
 	return TC_H_MIN(classid) + 1;
 }
@@ -118,8 +118,8 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
 	int err = -EINVAL;
 	u8 mask = 0;
 
-	pr_debug("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x),"
-		"arg 0x%lx\n", sch, p, classid, parent, *arg);
+	pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n",
+		 __func__, sch, p, classid, parent, *arg);
 
 	if (!dsmark_valid_index(p, *arg)) {
 		err = -ENOENT;
@@ -166,7 +166,8 @@ static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 	struct dsmark_qdisc_data *p = qdisc_priv(sch);
 	int i;
 
-	pr_debug("dsmark_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
+	pr_debug("%s(sch %p,[qdisc %p],walker %p)\n",
+		 __func__, sch, p, walker);
 
 	if (walker->stop)
 		return;
@@ -199,7 +200,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	struct dsmark_qdisc_data *p = qdisc_priv(sch);
 	int err;
 
-	pr_debug("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
+	pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p);
 
 	if (p->set_tc_index) {
 		switch (skb->protocol) {
@@ -265,8 +266,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	return NET_XMIT_SUCCESS;
 
 drop:
-	kfree_skb(skb);
-	sch->qstats.drops++;
+	qdisc_drop(skb, sch);
 	return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 }
 
@@ -276,7 +276,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
 	struct sk_buff *skb;
 	u32 index;
 
-	pr_debug("dsmark_dequeue(sch %p,[qdisc %p])\n", sch, p);
+	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
 
 	skb = p->q->ops->dequeue(p->q);
 	if (skb == NULL)
@@ -304,8 +304,8 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
 		 * and don't need yet another qdisc as a bypass.
 		 */
 		if (p->mask[index] != 0xff || p->value[index])
-			pr_warning("dsmark_dequeue: unsupported protocol %d\n",
-				   ntohs(skb->protocol));
+			pr_warn("%s: unsupported protocol %d\n",
+				__func__, ntohs(skb->protocol));
 		break;
 	}
 
@@ -316,7 +316,7 @@ static struct sk_buff *dsmark_peek(struct Qdisc *sch)
 {
 	struct dsmark_qdisc_data *p = qdisc_priv(sch);
 
-	pr_debug("dsmark_peek(sch %p,[qdisc %p])\n", sch, p);
+	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
 
 	return p->q->ops->peek(p->q);
 }
@@ -326,7 +326,7 @@ static unsigned int dsmark_drop(struct Qdisc *sch)
 	struct dsmark_qdisc_data *p = qdisc_priv(sch);
 	unsigned int len;
 
-	pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, p);
+	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
 
 	if (p->q->ops->drop == NULL)
 		return 0;
@@ -347,7 +347,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
 	u16 indices;
 	u8 *mask;
 
-	pr_debug("dsmark_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
+	pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt);
 
 	if (!opt)
 		goto errout;
@@ -385,7 +385,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
 	if (p->q == NULL)
 		p->q = &noop_qdisc;
 
-	pr_debug("dsmark_init: qdisc %p\n", p->q);
+	pr_debug("%s: qdisc %p\n", __func__, p->q);
 
 	err = 0;
 errout:
@@ -396,7 +396,7 @@ static void dsmark_reset(struct Qdisc *sch)
 {
 	struct dsmark_qdisc_data *p = qdisc_priv(sch);
 
-	pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, p);
+	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
 	qdisc_reset(p->q);
 	sch->q.qlen = 0;
 }
@@ -405,7 +405,7 @@ static void dsmark_destroy(struct Qdisc *sch)
 {
 	struct dsmark_qdisc_data *p = qdisc_priv(sch);
 
-	pr_debug("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p);
+	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
 
 	tcf_destroy_chain(&p->filter_list);
 	qdisc_destroy(p->q);
@@ -418,7 +418,7 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
 	struct dsmark_qdisc_data *p = qdisc_priv(sch);
 	struct nlattr *opts = NULL;
 
-	pr_debug("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n", sch, p, cl);
+	pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl);
 
 	if (!dsmark_valid_index(p, cl))
 		return -EINVAL;
@@ -429,8 +429,9 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
 	opts = nla_nest_start(skb, TCA_OPTIONS);
 	if (opts == NULL)
 		goto nla_put_failure;
-	NLA_PUT_U8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]);
-	NLA_PUT_U8(skb, TCA_DSMARK_VALUE, p->value[cl - 1]);
+	if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]) ||
+	    nla_put_u8(skb, TCA_DSMARK_VALUE, p->value[cl - 1]))
+		goto nla_put_failure;
 
 	return nla_nest_end(skb, opts);
 
@@ -447,13 +448,16 @@ static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
 	opts = nla_nest_start(skb, TCA_OPTIONS);
 	if (opts == NULL)
 		goto nla_put_failure;
-	NLA_PUT_U16(skb, TCA_DSMARK_INDICES, p->indices);
+	if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices))
+		goto nla_put_failure;
 
-	if (p->default_index != NO_DEFAULT_INDEX)
-		NLA_PUT_U16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index);
+	if (p->default_index != NO_DEFAULT_INDEX &&
+	    nla_put_u16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index))
+		goto nla_put_failure;
 
-	if (p->set_tc_index)
-		NLA_PUT_FLAG(skb, TCA_DSMARK_SET_TC_INDEX);
+	if (p->set_tc_index &&
+	    nla_put_flag(skb, TCA_DSMARK_SET_TC_INDEX))
+		goto nla_put_failure;
 
 	return nla_nest_end(skb, opts);
 
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 66effe2da8e..e15a9eb2908 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -85,7 +85,8 @@ static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct tc_fifo_qopt opt = { .limit = sch->limit };
 
-	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+		goto nla_put_failure;
 	return skb->len;
 
 nla_put_failure:
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
new file mode 100644
index 00000000000..ba32c2b005d
--- /dev/null
+++ b/net/sched/sch_fq.c
@@ -0,0 +1,849 @@
+/*
+ * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
+ *
+ *  Copyright (C) 2013 Eric Dumazet <edumazet@google.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *  Meant to be mostly used for locally generated traffic :
+ *  Fast classification depends on skb->sk being set before reaching us.
+ *  If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
+ *  All packets belonging to a socket are considered as a 'flow'.
+ *
+ *  Flows are dynamically allocated and stored in a hash table of RB trees
+ *  They are also part of one of the Round Robin 'queues' (new or old flows)
+ *
+ *  Burst avoidance (aka pacing) capability :
+ *
+ *  Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
+ *  bunch of packets, and this packet scheduler adds delay between
+ *  packets to respect rate limitation.
+ *
+ *  enqueue() :
+ *   - lookup one RB tree (out of 1024 or more) to find the flow.
+ *     If non existent flow, create it, add it to the tree.
+ *     Add skb to the per flow list of skb (fifo).
+ *   - Use a special fifo for high prio packets
+ *
+ *  dequeue() : serves flows in Round Robin
+ *  Note : When a flow becomes empty, we do not immediately remove it from
+ *  rb trees, for performance reasons (it is expected to send additional packets,
+ *  or SLAB cache will reuse socket for another flow)
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/hash.h>
+#include <linux/prefetch.h>
+#include <linux/vmalloc.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+/*
+ * Per flow structure, dynamically allocated
+ */
+struct fq_flow {
+	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
+	union {
+		struct sk_buff *tail;	/* last skb in the list */
+		unsigned long  age;	/* jiffies when flow was emptied, for gc */
+	};
+	struct rb_node	fq_node; 	/* anchor in fq_root[] trees */
+	struct sock	*sk;		/* tree key: socket, or (skb hash | 1) */
+	int		qlen;		/* number of packets in flow queue */
+	int		credit;		/* -= pkt len on dequeue, += quantum */
+	u32		socket_hash;	/* sk_hash */
+	struct fq_flow *next;		/* next in RR lists, or &detached/&throttled */
+
+	struct rb_node  rate_node;	/* anchor in q->delayed tree */
+	u64		time_next_packet; /* ns; compared with ktime_get() */
+};
+
+struct fq_flow_head {
+	struct fq_flow *first;	/* head of list chained by ->next; NULL if empty */
+	struct fq_flow *last;	/* tail; only read while first != NULL */
+};
+
+struct fq_sched_data {
+	struct fq_flow_head new_flows;	/* served first by fq_dequeue() */
+
+	struct fq_flow_head old_flows;	/* served when new_flows is empty */
+
+	struct rb_root	delayed;	/* for rate limited flows */
+	u64		time_next_delayed_flow; /* ~0ULL: none found by fq_check_throttled() */
+
+	struct fq_flow	internal;	/* for non classified or high prio packets */
+	u32		quantum;	/* credit added when a flow runs dry */
+	u32		initial_quantum; /* credit of a newly created flow */
+	u32		flow_refill_delay; /* in jiffies */
+	u32		flow_max_rate;	/* optional max rate per flow */
+	u32		flow_plimit;	/* max packets per flow */
+	struct rb_root	*fq_root;	/* array of (1 << fq_trees_log) trees */
+	u8		rate_enable;	/* 0: fq_dequeue() never sets a delay */
+	u8		fq_trees_log;
+
+	u32		flows;		/* flows present in fq_root[] trees */
+	u32		inactive_flows;	/* flows currently marked detached */
+	u32		throttled_flows; /* flows in the delayed tree */
+
+	u64		stat_gc_flows;	/* flows freed by fq_gc()/fq_rehash() */
+	u64		stat_internal_packets;
+	u64		stat_tcp_retrans;
+	u64		stat_throttled;
+	u64		stat_flows_plimit; /* drops due to flow_plimit */
+	u64		stat_pkts_too_long; /* delays clamped in fq_dequeue() */
+	u64		stat_allocation_errors;
+	struct qdisc_watchdog watchdog;
+};
+
+/* special ->next values: detached (not on old/new list), throttled (in q->delayed) */
+static struct fq_flow detached, throttled;
+
+static void fq_flow_set_detached(struct fq_flow *f)
+{
+	f->age = jiffies;	/* timestamp read by fq_gc_candidate() */
+	f->next = &detached;	/* sentinel: flow is on no RR list */
+}
+
+static bool fq_flow_is_detached(const struct fq_flow *f)
+{
+	return f->next == &detached;	/* set by fq_flow_set_detached() */
+}
+
+static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
+{
+	struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
+
+	while (*p) {	/* find the slot, tree is keyed by time_next_packet */
+		struct fq_flow *aux;
+
+		parent = *p;
+		aux = container_of(parent, struct fq_flow, rate_node);
+		if (f->time_next_packet >= aux->time_next_packet)
+			p = &parent->rb_right;	/* equal keys go right */
+		else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&f->rate_node, parent, p);
+	rb_insert_color(&f->rate_node, &q->delayed);
+	q->throttled_flows++;
+	q->stat_throttled++;
+
+	f->next = &throttled;	/* sentinel: flow now lives in q->delayed */
+	if (q->time_next_delayed_flow > f->time_next_packet)
+		q->time_next_delayed_flow = f->time_next_packet; /* new earliest */
+}
+
+
+static struct kmem_cache *fq_flow_cachep __read_mostly;
+
+static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
+{
+	if (head->first)	/* non empty list: chain after current tail */
+		head->last->next = flow;
+	else
+		head->first = flow;
+	head->last = flow;
+	flow->next = NULL;	/* new tail; also clears a detached/throttled mark */
+}
+
+/* limit number of collected flows per round */
+#define FQ_GC_MAX 8
+#define FQ_GC_AGE (3*HZ)	/* a detached flow must be older than this to be freed */
+
+static bool fq_gc_candidate(const struct fq_flow *f)
+{
+	return fq_flow_is_detached(f) &&	/* on no RR list ... */
+	       time_after(jiffies, f->age + FQ_GC_AGE);	/* ... for more than FQ_GC_AGE */
+}
+
+static void fq_gc(struct fq_sched_data *q,
+		  struct rb_root *root,
+		  struct sock *sk)
+{
+	struct fq_flow *cur, *tofree[FQ_GC_MAX];
+	struct rb_node **link, *parent;
+	int nr = 0;
+
+	link = &root->rb_node;
+	parent = NULL;
+	while (*link) {	/* walk only the lookup path towards sk */
+		parent = *link;
+
+		cur = container_of(parent, struct fq_flow, fq_node);
+		if (cur->sk == sk)	/* the looked up flow is never collected */
+			break;
+
+		if (fq_gc_candidate(cur)) {
+			tofree[nr++] = cur;
+			if (nr == FQ_GC_MAX)
+				break;
+		}
+
+		if (cur->sk > sk)
+			link = &parent->rb_right;
+		else
+			link = &parent->rb_left;
+	}
+
+	q->flows -= nr;
+	q->inactive_flows -= nr;
+	q->stat_gc_flows += nr;
+	while (nr) {	/* erase after the walk, newest candidate first */
+		struct fq_flow *victim = tofree[--nr];
+
+		rb_erase(&victim->fq_node, root);
+		kmem_cache_free(fq_flow_cachep, victim);
+	}
+}
+
+static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
+{
+	struct rb_node **p, *parent;
+	struct sock *sk = skb->sk;
+	struct rb_root *root;
+	struct fq_flow *f;
+
+	/* control packets use the internal flow; warning: no starvation prevention... */
+	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
+		return &q->internal;
+
+	if (unlikely(!sk)) {
+		/* By forcing low order bit to 1, we make sure to not
+		 * collide with a local flow (socket pointers are word aligned)
+		 */
+		sk = (struct sock *)(skb_get_hash(skb) | 1L);
+	}
+
+	root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
+
+	if (q->flows >= (2U << q->fq_trees_log) &&	/* >= 2 flows per tree */
+	    q->inactive_flows > q->flows/2)		/* and most are detached */
+		fq_gc(q, root, sk);
+
+	p = &root->rb_node;
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+
+		f = container_of(parent, struct fq_flow, fq_node);
+		if (f->sk == sk) {
+			/* socket might have been reallocated, so check
+			 * if its sk_hash is the same.
+			 * If not, we need to refill credit with
+			 * initial quantum
+			 */
+			if (unlikely(skb->sk &&
+				     f->socket_hash != sk->sk_hash)) {
+				f->credit = q->initial_quantum;
+				f->socket_hash = sk->sk_hash;
+				f->time_next_packet = 0ULL;
+			}
+			return f;
+		}
+		if (f->sk > sk)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+
+	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
+	if (unlikely(!f)) {
+		q->stat_allocation_errors++;
+		return &q->internal;	/* fall back to the shared internal flow */
+	}
+	fq_flow_set_detached(f);	/* fq_enqueue() puts it on new_flows */
+	f->sk = sk;
+	if (skb->sk)	/* sk is a real socket only in this case */
+		f->socket_hash = sk->sk_hash;
+	f->credit = q->initial_quantum;
+
+	rb_link_node(&f->fq_node, parent, p);
+	rb_insert_color(&f->fq_node, root);
+
+	q->flows++;
+	q->inactive_flows++;	/* decremented again by fq_enqueue() */
+	return f;
+}
+
+
+/* remove one skb from head of flow queue */
+static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
+{
+	struct sk_buff *first = flow->head;
+
+	if (!first)		/* empty flow: nothing to unlink or account */
+		return NULL;
+	flow->head = first->next;
+	first->next = NULL;
+	flow->qlen--;
+	sch->qstats.backlog -= qdisc_pkt_len(first);
+	sch->q.qlen--;
+	return first;
+}
+
+/* Retransmit detection is not implemented yet: this always returns false,
+ * so the retransmit branches in flow_queue_add() and fq_enqueue() are not taken.
+ */
+static bool skb_is_retransmit(struct sk_buff *skb)
+{
+	return false;
+}
+
+/* add skb to flow queue
+ * flow queue is a linked list, kind of FIFO, except for TCP retransmits
+ * We special case tcp retransmits to be transmitted before other packets.
+ * We rely on fact that TCP retransmits are unlikely, so we do not waste
+ * a separate queue or a pointer.
+ * head->  [retrans pkt 1]
+ *         [retrans pkt 2]
+ *         [ normal pkt 1]
+ *         [ normal pkt 2]
+ *         [ normal pkt 3]
+ * tail->  [ normal pkt 4]
+ */
+static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
+{
+	struct sk_buff *prev, *head = flow->head;
+
+	skb->next = NULL;
+	if (!head) {	/* empty queue: skb is both head and tail */
+		flow->head = skb;
+		flow->tail = skb;	/* overwrites ->age (same union) */
+		return;
+	}
+	if (likely(!skb_is_retransmit(skb))) {	/* normal packet: append */
+		flow->tail->next = skb;
+		flow->tail = skb;
+		return;
+	}
+
+	/* This skb is a tcp retransmit,
+	 * find the last retrans packet in the queue
+	 */
+	prev = NULL;
+	while (skb_is_retransmit(head)) {
+		prev = head;
+		head = head->next;
+		if (!head)
+			break;
+	}
+	if (!prev) { /* no rtx packet in queue, become the new head */
+		skb->next = flow->head;
+		flow->head = skb;
+	} else {
+		if (prev == flow->tail)	/* queue holds only retransmits */
+			flow->tail = skb;
+		else
+			skb->next = prev->next;
+		prev->next = skb;	/* insert right after the last retransmit */
+	}
+}
+
+static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct fq_flow *f;
+
+	if (unlikely(sch->q.qlen >= sch->limit))	/* qdisc wide limit */
+		return qdisc_drop(skb, sch);
+
+	f = fq_classify(skb, q);
+	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
+		q->stat_flows_plimit++;	/* per flow limit; internal flow is exempt */
+		return qdisc_drop(skb, sch);
+	}
+
+	f->qlen++;
+	if (skb_is_retransmit(skb))
+		q->stat_tcp_retrans++;
+	sch->qstats.backlog += qdisc_pkt_len(skb);
+	if (fq_flow_is_detached(f)) {	/* idle flow becomes active again */
+		fq_flow_add_tail(&q->new_flows, f);
+		if (time_after(jiffies, f->age + q->flow_refill_delay))
+			f->credit = max_t(u32, f->credit, q->quantum);
+		q->inactive_flows--;
+		qdisc_unthrottled(sch);
+	}
+
+	/* Note: this overwrites f->age */
+	flow_queue_add(f, skb);
+
+	if (unlikely(f == &q->internal)) {
+		q->stat_internal_packets++;
+		qdisc_unthrottled(sch);
+	}
+	sch->q.qlen++;
+
+	return NET_XMIT_SUCCESS;
+}
+
+static void fq_check_throttled(struct fq_sched_data *q, u64 now)
+{
+	struct rb_node *p;
+
+	if (q->time_next_delayed_flow > now)	/* nothing can be due yet */
+		return;
+
+	q->time_next_delayed_flow = ~0ULL;	/* recomputed by the loop below */
+	while ((p = rb_first(&q->delayed)) != NULL) {
+		struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
+
+		if (f->time_next_packet > now) {	/* earliest flow still waiting */
+			q->time_next_delayed_flow = f->time_next_packet;
+			break;
+		}
+		rb_erase(p, &q->delayed);
+		q->throttled_flows--;
+		fq_flow_add_tail(&q->old_flows, f);	/* due flows resume in old_flows */
+	}
+}
+
+static struct sk_buff *fq_dequeue(struct Qdisc *sch)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	u64 now = ktime_to_ns(ktime_get());
+	struct fq_flow_head *head;
+	struct sk_buff *skb;
+	struct fq_flow *f;
+	u32 rate;
+
+	skb = fq_dequeue_head(sch, &q->internal);	/* internal flow goes first */
+	if (skb)
+		goto out;
+	fq_check_throttled(q, now);	/* move due flows back to old_flows */
+begin:
+	head = &q->new_flows;
+	if (!head->first) {
+		head = &q->old_flows;
+		if (!head->first) {	/* both RR lists are empty */
+			if (q->time_next_delayed_flow != ~0ULL)
+				qdisc_watchdog_schedule_ns(&q->watchdog,
+							   q->time_next_delayed_flow);
+			return NULL;
+		}
+	}
+	f = head->first;
+
+	if (f->credit <= 0) {	/* credit used up: refill, requeue on old_flows */
+		f->credit += q->quantum;
+		head->first = f->next;
+		fq_flow_add_tail(&q->old_flows, f);
+		goto begin;
+	}
+
+	if (unlikely(f->head && now < f->time_next_packet)) {
+		head->first = f->next;
+		fq_flow_set_throttled(q, f);	/* has packets but must wait */
+		goto begin;
+	}
+
+	skb = fq_dequeue_head(sch, f);
+	if (!skb) {	/* flow has no packet left */
+		head->first = f->next;
+		/* force a pass through old_flows to prevent starvation */
+		if ((head == &q->new_flows) && q->old_flows.first) {
+			fq_flow_add_tail(&q->old_flows, f);
+		} else {
+			fq_flow_set_detached(f);
+			q->inactive_flows++;
+		}
+		goto begin;
+	}
+	prefetch(&skb->end);
+	f->time_next_packet = now;
+	f->credit -= qdisc_pkt_len(skb);
+
+	if (f->credit > 0 || !q->rate_enable)	/* no delay while credit remains */
+		goto out;
+
+	rate = q->flow_max_rate;
+	if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT)
+		rate = min(skb->sk->sk_pacing_rate, rate);
+
+	if (rate != ~0U) {	/* ~0U means no rate limit */
+		u32 plen = max(qdisc_pkt_len(skb), q->quantum);
+		u64 len = (u64)plen * NSEC_PER_SEC;
+
+		if (likely(rate))
+			do_div(len, rate);	/* len = plen * NSEC_PER_SEC / rate */
+		/* Since socket rate can change later,
+		 * clamp the delay to 125 ms.
+		 * TODO: maybe segment the too big skb, as in commit
+		 * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
+		 */
+		if (unlikely(len > 125 * NSEC_PER_MSEC)) {
+			len = 125 * NSEC_PER_MSEC;
+			q->stat_pkts_too_long++;
+		}
+
+		f->time_next_packet = now + len;
+	}
+out:
+	qdisc_bstats_update(sch, skb);
+	qdisc_unthrottled(sch);
+	return skb;
+}
+
+static void fq_reset(struct Qdisc *sch)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct rb_root *root;
+	struct sk_buff *skb;
+	struct rb_node *p;
+	struct fq_flow *f;
+	unsigned int idx;
+
+	while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL)
+		kfree_skb(skb);
+
+	if (!q->fq_root)	/* hash table not allocated: no flows to free */
+		return;
+
+	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
+		root = &q->fq_root[idx];
+		while ((p = rb_first(root)) != NULL) {
+			f = container_of(p, struct fq_flow, fq_node);
+			rb_erase(p, root);
+
+			while ((skb = fq_dequeue_head(sch, f)) != NULL)
+				kfree_skb(skb);
+
+			kmem_cache_free(fq_flow_cachep, f);
+		}
+	}
+	q->new_flows.first	= NULL;	/* every flow was freed above */
+	q->old_flows.first	= NULL;
+	q->delayed		= RB_ROOT;
+	q->flows		= 0;
+	q->inactive_flows	= 0;
+	q->throttled_flows	= 0;
+}
+
+static void fq_rehash(struct fq_sched_data *q,
+		      struct rb_root *old_array, u32 old_log,
+		      struct rb_root *new_array, u32 new_log)
+{
+	struct rb_node *op, **np, *parent;
+	struct rb_root *oroot, *nroot;
+	struct fq_flow *of, *nf;
+	int fcnt = 0;
+	u32 idx;
+
+	for (idx = 0; idx < (1U << old_log); idx++) {
+		oroot = &old_array[idx];
+		while ((op = rb_first(oroot)) != NULL) {
+			rb_erase(op, oroot);
+			of = container_of(op, struct fq_flow, fq_node);
+			if (fq_gc_candidate(of)) {	/* free instead of moving */
+				fcnt++;
+				kmem_cache_free(fq_flow_cachep, of);
+				continue;
+			}
+			nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];
+
+			np = &nroot->rb_node;
+			parent = NULL;
+			while (*np) {	/* same ordering as fq_classify() */
+				parent = *np;
+
+				nf = container_of(parent, struct fq_flow, fq_node);
+				BUG_ON(nf->sk == of->sk);
+
+				if (nf->sk > of->sk)
+					np = &parent->rb_right;
+				else
+					np = &parent->rb_left;
+			}
+
+			rb_link_node(&of->fq_node, parent, np);
+			rb_insert_color(&of->fq_node, nroot);
+		}
+	}
+	q->flows -= fcnt;	/* account for the flows freed above */
+	q->inactive_flows -= fcnt;
+	q->stat_gc_flows += fcnt;
+}
+
+static void *fq_alloc_node(size_t sz, int node)
+{
+	void *mem = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN,
+				 node);
+
+	if (mem)		/* kmalloc_node() succeeded */
+		return mem;
+	return vmalloc_node(sz, node);	/* otherwise try vmalloc_node() */
+}
+
+static void fq_free(void *addr)
+{
+	kvfree(addr);	/* addr comes from fq_alloc_node(): kmalloc or vmalloc */
+}
+
+static int fq_resize(struct Qdisc *sch, u32 log)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct rb_root *array;
+	void *old_fq_root;
+	u32 idx;
+
+	if (q->fq_root && log == q->fq_trees_log)	/* already that size */
+		return 0;
+
+	/* If XPS was setup, we can allocate memory on right NUMA node */
+	array = fq_alloc_node(sizeof(struct rb_root) << log,
+			      netdev_queue_numa_node_read(sch->dev_queue));
+	if (!array)
+		return -ENOMEM;
+
+	for (idx = 0; idx < (1U << log); idx++)
+		array[idx] = RB_ROOT;
+
+	sch_tree_lock(sch);	/* taken only after the GFP_KERNEL allocation */
+
+	old_fq_root = q->fq_root;
+	if (old_fq_root)	/* NULL the first time the table is set up */
+		fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);
+
+	q->fq_root = array;
+	q->fq_trees_log = log;
+
+	sch_tree_unlock(sch);
+
+	fq_free(old_fq_root);	/* q->fq_root no longer points at it */
+
+	return 0;
+}
+
+static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
+	[TCA_FQ_PLIMIT]			= { .type = NLA_U32 },
+	[TCA_FQ_FLOW_PLIMIT]		= { .type = NLA_U32 },
+	[TCA_FQ_QUANTUM]		= { .type = NLA_U32 },
+	[TCA_FQ_INITIAL_QUANTUM]	= { .type = NLA_U32 },
+	[TCA_FQ_RATE_ENABLE]		= { .type = NLA_U32 }, /* 0 or 1 only */
+	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 }, /* parsed, ignored */
+	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
+	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 }, /* 1..ilog2(256K) */
+	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 }, /* in usecs */
+};
+
+static int fq_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_FQ_MAX + 1];
+	int err, drop_count = 0;
+	u32 fq_log;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy);
+	if (err < 0)
+		return err;
+
+	sch_tree_lock(sch);
+
+	fq_log = q->fq_trees_log;	/* kept unless TCA_FQ_BUCKETS_LOG is valid */
+
+	if (tb[TCA_FQ_BUCKETS_LOG]) {
+		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
+
+		if (nval >= 1 && nval <= ilog2(256*1024))
+			fq_log = nval;
+		else
+			err = -EINVAL;	/* attributes below are still applied */
+	}
+	if (tb[TCA_FQ_PLIMIT])
+		sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
+
+	if (tb[TCA_FQ_FLOW_PLIMIT])
+		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
+
+	if (tb[TCA_FQ_QUANTUM])
+		q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
+
+	if (tb[TCA_FQ_INITIAL_QUANTUM])
+		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
+
+	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
+		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
+				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
+
+	if (tb[TCA_FQ_FLOW_MAX_RATE])
+		q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+
+	if (tb[TCA_FQ_RATE_ENABLE]) {
+		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
+
+		if (enable <= 1)
+			q->rate_enable = enable;
+		else
+			err = -EINVAL;
+	}
+
+	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
+		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ;
+
+		q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
+	}
+
+	if (!err) {
+		sch_tree_unlock(sch);	/* fq_resize() takes the tree lock itself */
+		err = fq_resize(sch, fq_log);
+		sch_tree_lock(sch);
+	}
+	while (sch->q.qlen > sch->limit) {	/* drop packets above the limit */
+		struct sk_buff *skb = fq_dequeue(sch);
+
+		if (!skb)
+			break;
+		kfree_skb(skb);
+		drop_count++;
+	}
+	qdisc_tree_decrease_qlen(sch, drop_count);
+
+	sch_tree_unlock(sch);
+	return err;
+}
+
+static void fq_destroy(struct Qdisc *sch)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+
+	fq_reset(sch);		/* frees all queued skbs and all flows */
+	fq_free(q->fq_root);	/* then the tree array itself */
+	qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static int fq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	int err;
+
+	sch->limit		= 10000;	/* compared with sch->q.qlen */
+	q->flow_plimit		= 100;		/* compared with flow qlen */
+	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
+	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
+	q->flow_refill_delay	= msecs_to_jiffies(40);
+	q->flow_max_rate	= ~0U;		/* ~0U: no max rate */
+	q->rate_enable		= 1;
+	q->new_flows.first	= NULL;
+	q->old_flows.first	= NULL;
+	q->delayed		= RB_ROOT;
+	q->fq_root		= NULL;		/* allocated by fq_resize() */
+	q->fq_trees_log		= ilog2(1024);
+	qdisc_watchdog_init(&q->watchdog, sch);
+
+	if (opt)
+		err = fq_change(sch, opt);	/* calls fq_resize() unless it fails */
+	else
+		err = fq_resize(sch, q->fq_trees_log);
+
+	return err;
+}
+
+static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts;
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);	/* attributes are nested in TCA_OPTIONS */
+	if (opts == NULL)
+		goto nla_put_failure;
+
+	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
+
+	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
+	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
+	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
+	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
+	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
+	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
+	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
+			jiffies_to_usecs(q->flow_refill_delay)) ||	/* stored in jiffies */
+	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	return -1;
+}
+
+static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	u64 now = ktime_to_ns(ktime_get());
+	struct tc_fq_qd_stats st = {	/* snapshot of the counters in *q */
+		.gc_flows		= q->stat_gc_flows,
+		.highprio_packets	= q->stat_internal_packets,
+		.tcp_retrans		= q->stat_tcp_retrans,
+		.throttled		= q->stat_throttled,
+		.flows_plimit		= q->stat_flows_plimit,
+		.pkts_too_long		= q->stat_pkts_too_long,
+		.allocation_errors	= q->stat_allocation_errors,
+		.flows			= q->flows,
+		.inactive_flows		= q->inactive_flows,
+		.throttled_flows	= q->throttled_flows,
+		.time_next_delayed_flow	= q->time_next_delayed_flow - now, /* relative to now */
+	};
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
+	.id		=	"fq",
+	.priv_size	=	sizeof(struct fq_sched_data), /* read via qdisc_priv() */
+
+	.enqueue	=	fq_enqueue,
+	.dequeue	=	fq_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.init		=	fq_init,
+	.reset		=	fq_reset,
+	.destroy	=	fq_destroy,
+	.change		=	fq_change,	/* also called from fq_init() */
+	.dump		=	fq_dump,
+	.dump_stats	=	fq_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init fq_module_init(void)
+{
+	int err;
+
+	/* struct fq_flow objects are allocated from this cache */
+	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
+					   sizeof(struct fq_flow), 0, 0, NULL);
+	if (!fq_flow_cachep)
+		return -ENOMEM;
+
+	err = register_qdisc(&fq_qdisc_ops);
+	if (err)	/* registration failed: undo the cache creation */
+		kmem_cache_destroy(fq_flow_cachep);
+	return err;
+}
+
+static void __exit fq_module_exit(void)
+{
+	unregister_qdisc(&fq_qdisc_ops);	/* reverse order of fq_module_init() */
+	kmem_cache_destroy(fq_flow_cachep);
+}
+
+module_init(fq_module_init)
+module_exit(fq_module_exit)
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
new file mode 100644
index 00000000000..063b726bf1f
--- /dev/null
+++ b/net/sched/sch_fq_codel.c
@@ -0,0 +1,620 @@
+/*
+ * Fair Queue CoDel discipline
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *  Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/flow_keys.h>
+#include <net/codel.h>
+
+/*	Fair Queue CoDel.
+ *
+ * Principles :
+ * Packets are classified (internal classifier or external) on flows.
+ * This is a Stochastic model (as we use a hash, several flows
+ *			       might be hashed on same slot)
+ * Each flow has a CoDel managed queue.
+ * Flows are linked onto two (Round Robin) lists,
+ * so that new flows have priority on old ones.
+ *
+ * For a given flow, packets are not reordered (CoDel uses a FIFO)
+ * head drops only.
+ * ECN capability is on by default.
+ * Low memory footprint (64 bytes per flow)
+ */
+
+struct fq_codel_flow {
+	struct sk_buff	  *head;	/* oldest queued skb, NULL when empty */
+	struct sk_buff	  *tail;	/* newest skb; only used while head != NULL */
+	struct list_head  flowchain;	/* links the flow on new_flows/old_flows */
+	int		  deficit;	/* += quantum on refill, -= pkt len on dequeue */
+	u32		  dropped; /* number of drops (or ECN marks) on this flow */
+	struct codel_vars cvars;	/* per-flow state handed to codel_dequeue() */
+}; /* please try to keep this structure <= 64 bytes */
+
+struct fq_codel_sched_data {
+	struct tcf_proto *filter_list;	/* optional external classifier */
+	struct fq_codel_flow *flows;	/* Flows table [flows_cnt] */
+	u32		*backlogs;	/* per-flow queued bytes [flows_cnt] */
+	u32		flows_cnt;	/* number of flows */
+	u32		perturbation;	/* hash perturbation */
+	u32		quantum;	/* deficit refill; default psched_mtu(qdisc_dev(sch)) */
+	struct codel_params cparams;	/* one parameter set used for every flow */
+	struct codel_stats cstats;	/* one stats block used for every flow */
+	u32		drop_overlimit;	/* enqueues that pushed qlen above sch->limit */
+	u32		new_flow_count;	/* times a flow was put on new_flows */
+
+	struct list_head new_flows;	/* list of new flows */
+	struct list_head old_flows;	/* list of old flows */
+};
+
+static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q,
+				  const struct sk_buff *skb)
+{
+	struct flow_keys fk;
+	unsigned int h;
+
+	skb_flow_dissect(skb, &fk);	/* fills the keys hashed below */
+	h = jhash_3words((__force u32)fk.dst,
+			 (__force u32)fk.src ^ fk.ip_proto,
+			 (__force u32)fk.ports, q->perturbation);
+	return ((u64)h * q->flows_cnt) >> 32;	/* scale to [0, flows_cnt) */
+}
+
+static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
+				      int *qerr)	/* written only on the filter path */
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	struct tcf_result res;
+	int result;
+
+	if (TC_H_MAJ(skb->priority) == sch->handle &&	/* priority names our qdisc */
+	    TC_H_MIN(skb->priority) > 0 &&
+	    TC_H_MIN(skb->priority) <= q->flows_cnt)
+		return TC_H_MIN(skb->priority);	/* minor is the 1-based flow index */
+
+	if (!q->filter_list)	/* no external classifier: use the internal hash */
+		return fq_codel_hash(q, skb) + 1;
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	result = tc_classify(skb, q->filter_list, &res);
+	if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_STOLEN:
+		case TC_ACT_QUEUED:
+			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;	/* fall through */
+		case TC_ACT_SHOT:
+			return 0;
+		}
+#endif
+		if (TC_H_MIN(res.classid) <= q->flows_cnt)
+			return TC_H_MIN(res.classid);	/* may itself be 0 */
+	}
+	return 0;	/* 0 = no flow selected; fq_codel_enqueue() frees the skb */
+}
+
+/* helper functions : might be changed when/if skb use a standard list_head */
+
+/* pop the first skb of the flow; flow->head must not be NULL */
+static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow)
+{
+	struct sk_buff *first = flow->head;
+
+	flow->head = first->next;	/* flow->tail is left untouched */
+	first->next = NULL;
+	return first;
+}
+
+/* append skb at the tail of the flow's queue */
+static inline void flow_queue_add(struct fq_codel_flow *flow,
+				  struct sk_buff *skb)
+{
+	if (flow->head != NULL)
+		flow->tail->next = skb;	/* chain behind the current last skb */
+	else
+		flow->head = skb;	/* queue was empty */
+	flow->tail = skb;
+	skb->next = NULL;
+}
+
+static unsigned int fq_codel_drop(struct Qdisc *sch)
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+	unsigned int maxbacklog = 0, idx = 0, i, len;
+	struct fq_codel_flow *flow;
+
+	/* Drop the head packet of the fattest flow and return its index.
+	 * Scanning 1024 flows touches 4KB and keeps the fast path simple.
+	 */
+	for (i = 0; i < q->flows_cnt; i++) {
+		if (q->backlogs[i] > maxbacklog) {
+			maxbacklog = q->backlogs[i];
+			idx = i;
+		}
+	}
+	flow = &q->flows[idx];
+	if (!flow->head)	/* empty qdisc (->drop caller): nothing to free */
+		return 0;
+	skb = dequeue_head(flow);
+	len = qdisc_pkt_len(skb);
+	q->backlogs[idx] -= len;
+	kfree_skb(skb);
+	sch->q.qlen--;
+	sch->qstats.drops++;
+	sch->qstats.backlog -= len;
+	flow->dropped++;
+	return idx;
+}
+
+static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	unsigned int idx;
+	struct fq_codel_flow *flow;
+	int uninitialized_var(ret);	/* set by classify whenever it returns 0 */
+
+	idx = fq_codel_classify(skb, sch, &ret);	/* 1-based, 0 = no flow */
+	if (idx == 0) {
+		if (ret & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+	idx--;	/* convert to a 0-based index into flows[] and backlogs[] */
+
+	codel_set_enqueue_time(skb);
+	flow = &q->flows[idx];
+	flow_queue_add(flow, skb);
+	q->backlogs[idx] += qdisc_pkt_len(skb);
+	sch->qstats.backlog += qdisc_pkt_len(skb);
+
+	if (list_empty(&flow->flowchain)) {	/* flow is on neither list yet */
+		list_add_tail(&flow->flowchain, &q->new_flows);
+		q->new_flow_count++;
+		flow->deficit = q->quantum;
+		flow->dropped = 0;
+	}
+	if (++sch->q.qlen <= sch->limit)
+		return NET_XMIT_SUCCESS;
+
+	q->drop_overlimit++;	/* over the limit: one packet has to go */
+	/* Return Congestion Notification only if we dropped a packet
+	 * from this flow.
+	 */
+	if (fq_codel_drop(sch) == idx)
+		return NET_XMIT_CN;
+
+	/* As we dropped a packet, better let upper stack know this */
+	qdisc_tree_decrease_qlen(sch, 1);
+	return NET_XMIT_SUCCESS;
+}
+
+/* Dequeue callback passed to codel_dequeue(): pops the head packet of the
+ * flow owning @vars. Only the per-flow backlog and qlen are updated here;
+ * per the original note, codel handles the qdisc-wide backlog itself.
+ */
+static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
+{
+	struct fq_codel_sched_data *priv = qdisc_priv(sch);
+	struct fq_codel_flow *owner;
+	struct sk_buff *pkt;
+
+	owner = container_of(vars, struct fq_codel_flow, cvars);
+	if (!owner->head)
+		return NULL;	/* flow is empty */
+	pkt = dequeue_head(owner);
+	priv->backlogs[owner - priv->flows] -= qdisc_pkt_len(pkt);
+	sch->q.qlen--;
+	return pkt;
+}
+
+static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+	struct fq_codel_flow *flow;
+	struct list_head *head;
+	u32 prev_drop_count, prev_ecn_mark;
+
+begin:
+	head = &q->new_flows;	/* new flows are served before old ones */
+	if (list_empty(head)) {
+		head = &q->old_flows;
+		if (list_empty(head))
+			return NULL;	/* no flow is scheduled at all */
+	}
+	flow = list_first_entry(head, struct fq_codel_flow, flowchain);
+
+	if (flow->deficit <= 0) {	/* credit used up: refill and rotate */
+		flow->deficit += q->quantum;
+		list_move_tail(&flow->flowchain, &q->old_flows);
+		goto begin;
+	}
+
+	prev_drop_count = q->cstats.drop_count;	/* snapshot for the deltas below */
+	prev_ecn_mark = q->cstats.ecn_mark;
+
+	skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats,
+			    dequeue);
+
+	flow->dropped += q->cstats.drop_count - prev_drop_count;
+	flow->dropped += q->cstats.ecn_mark - prev_ecn_mark;
+
+	if (!skb) {	/* this flow produced no packet */
+		/* force a pass through old_flows to prevent starvation */
+		if ((head == &q->new_flows) && !list_empty(&q->old_flows))
+			list_move_tail(&flow->flowchain, &q->old_flows);
+		else
+			list_del_init(&flow->flowchain);	/* unschedule the flow */
+		goto begin;
+	}
+	qdisc_bstats_update(sch, skb);
+	flow->deficit -= qdisc_pkt_len(skb);	/* charge the packet to the flow */
+	/* We can't call qdisc_tree_decrease_qlen() if our qlen is 0,
+	 * or HTB crashes. Defer it for next round.
+	 */
+	if (q->cstats.drop_count && sch->q.qlen) {
+		qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
+		q->cstats.drop_count = 0;
+	}
+	return skb;
+}
+
+static void fq_codel_reset(struct Qdisc *sch)
+{
+	struct sk_buff *skb = fq_codel_dequeue(sch);
+
+	for (; skb != NULL; skb = fq_codel_dequeue(sch))
+		kfree_skb(skb);	/* drain through the normal dequeue path */
+}
+
+static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
+	[TCA_FQ_CODEL_TARGET]	= { .type = NLA_U32 },	/* usec */
+	[TCA_FQ_CODEL_LIMIT]	= { .type = NLA_U32 },	/* stored in sch->limit */
+	[TCA_FQ_CODEL_INTERVAL]	= { .type = NLA_U32 },	/* usec */
+	[TCA_FQ_CODEL_ECN]	= { .type = NLA_U32 },	/* non-zero sets cparams.ecn */
+	[TCA_FQ_CODEL_FLOWS]	= { .type = NLA_U32 },	/* 1..65536, before flows[] exists */
+	[TCA_FQ_CODEL_QUANTUM]	= { .type = NLA_U32 },	/* raised to at least 256 */
+};
+
+static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_FQ_CODEL_MAX + 1];
+	int err;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy);
+	if (err < 0)
+		return err;
+	if (tb[TCA_FQ_CODEL_FLOWS]) {
+		if (q->flows)	/* table already allocated: count is fixed */
+			return -EINVAL;
+		q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]);	/* stored before the check */
+		if (!q->flows_cnt ||
+		    q->flows_cnt > 65536)
+			return -EINVAL;
+	}
+	sch_tree_lock(sch);	/* everything below runs under the tree lock */
+
+	if (tb[TCA_FQ_CODEL_TARGET]) {
+		u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]);
+
+		q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT;	/* usec in */
+	}
+
+	if (tb[TCA_FQ_CODEL_INTERVAL]) {
+		u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]);
+
+		q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT;
+	}
+
+	if (tb[TCA_FQ_CODEL_LIMIT])
+		sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]);
+
+	if (tb[TCA_FQ_CODEL_ECN])
+		q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]);
+
+	if (tb[TCA_FQ_CODEL_QUANTUM])
+		q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));
+
+	while (sch->q.qlen > sch->limit) {	/* trim down to the current limit */
+		struct sk_buff *skb = fq_codel_dequeue(sch);
+
+		kfree_skb(skb);
+		q->cstats.drop_count++;
+	}
+	qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
+	q->cstats.drop_count = 0;
+
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+static void *fq_codel_zalloc(size_t sz)
+{
+	void *mem;
+
+	mem = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
+	/* fall back to vzalloc() when the kzalloc() attempt fails */
+	return mem ? mem : vzalloc(sz);
+}
+
+static void fq_codel_free(void *addr)
+{
+	kvfree(addr);	/* fq_codel_zalloc() may return kzalloc() or vzalloc() memory */
+}
+
+static void fq_codel_destroy(struct Qdisc *sch)
+{
+	struct fq_codel_sched_data *priv = qdisc_priv(sch);
+
+	tcf_destroy_chain(&priv->filter_list);	/* external classifier chain */
+	fq_codel_free(priv->flows);	/* the two tables are independent */
+	fq_codel_free(priv->backlogs);
+}
+
+static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	int i;
+
+	sch->limit = 10*1024;	/* defaults; fq_codel_change() may override them */
+	q->flows_cnt = 1024;
+	q->quantum = psched_mtu(qdisc_dev(sch));
+	q->perturbation = prandom_u32();
+	INIT_LIST_HEAD(&q->new_flows);
+	INIT_LIST_HEAD(&q->old_flows);
+	codel_params_init(&q->cparams);
+	codel_stats_init(&q->cstats);
+	q->cparams.ecn = true;
+
+	if (opt) {
+		int err = fq_codel_change(sch, opt);
+		if (err)
+			return err;
+	}
+
+	if (!q->flows) {	/* tables are sized by the final flows_cnt */
+		q->flows = fq_codel_zalloc(q->flows_cnt *
+					   sizeof(struct fq_codel_flow));
+		if (!q->flows)
+			return -ENOMEM;
+		q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32));
+		if (!q->backlogs) {
+			fq_codel_free(q->flows);	/* NOTE(review): q->flows stays stale */
+			return -ENOMEM;
+		}
+		for (i = 0; i < q->flows_cnt; i++) {
+			struct fq_codel_flow *flow = q->flows + i;
+
+			INIT_LIST_HEAD(&flow->flowchain);	/* empty = not scheduled */
+			codel_vars_init(&flow->cvars);
+		}
+	}
+	if (sch->limit >= 1)
+		sch->flags |= TCQ_F_CAN_BYPASS;
+	else
+		sch->flags &= ~TCQ_F_CAN_BYPASS;
+	return 0;
+}
+
+static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct fq_codel_sched_data *priv = qdisc_priv(sch);
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (!nest)
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET,	/* times go out in usec */
+			codel_time_to_us(priv->cparams.target)) ||
+	    nla_put_u32(skb, TCA_FQ_CODEL_LIMIT,
+			sch->limit) ||
+	    nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL,
+			codel_time_to_us(priv->cparams.interval)) ||
+	    nla_put_u32(skb, TCA_FQ_CODEL_ECN,
+			priv->cparams.ecn) ||
+	    nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM,
+			priv->quantum) ||
+	    nla_put_u32(skb, TCA_FQ_CODEL_FLOWS,
+			priv->flows_cnt))
+		goto nla_put_failure;	/* first failing put aborts the dump */
+
+	return nla_nest_end(skb, nest);
+
+nla_put_failure:
+	return -1;
+}
+
+static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct fq_codel_sched_data *priv = qdisc_priv(sch);
+	struct tc_fq_codel_xstats xs = {
+		.type				= TCA_FQ_CODEL_XSTATS_QDISC,
+	};
+	struct list_head *node;
+
+	/* list lengths are counted by walking, starting from the zeroed xs */
+	list_for_each(node, &priv->old_flows)
+		xs.qdisc_stats.old_flows_len++;
+	list_for_each(node, &priv->new_flows)
+		xs.qdisc_stats.new_flows_len++;
+
+	xs.qdisc_stats.new_flow_count = priv->new_flow_count;
+	xs.qdisc_stats.ecn_mark = priv->cstats.ecn_mark;
+	xs.qdisc_stats.drop_overlimit = priv->drop_overlimit;
+	xs.qdisc_stats.maxpacket = priv->cstats.maxpacket;
+
+	return gnet_stats_copy_app(d, &xs, sizeof(xs));
+}
+
+static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return NULL;	/* stub: always NULL, both arguments are ignored */
+}
+
+static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid)
+{
+	return 0;	/* stub: always 0, classid is not looked up */
+}
+
+static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent,
+			      u32 classid)	/* parent and classid are unused */
+{
+	/* we cannot bypass queue discipline anymore */
+	sch->flags &= ~TCQ_F_CAN_BYPASS;
+	return 0;	/* always 0 */
+}
+
+static void fq_codel_put(struct Qdisc *q, unsigned long cl)
+{	/* intentionally empty; also installed as .unbind_tcf */
+}
+
+static struct tcf_proto **fq_codel_find_tcf(struct Qdisc *sch,
+					    unsigned long cl)
+{
+	struct fq_codel_sched_data *priv = qdisc_priv(sch);
+
+	/* only cl == 0 yields the qdisc's filter chain; any class gets NULL */
+	return cl ? NULL : &priv->filter_list;
+}
+
+static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	tcm->tcm_handle |= TC_H_MIN(cl);	/* OR the class minor into the handle */
+	return 0;	/* nothing else is dumped; skb is untouched */
+}
+
+static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				     struct gnet_dump *d)
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	u32 idx = cl - 1;	/* class ids are 1-based; cl == 0 wraps out of range */
+	struct gnet_stats_queue qs = { 0 };
+	struct tc_fq_codel_xstats xstats;	/* filled only when idx is valid */
+
+	if (idx < q->flows_cnt) {
+		const struct fq_codel_flow *flow = &q->flows[idx];
+		const struct sk_buff *skb = flow->head;
+
+		memset(&xstats, 0, sizeof(xstats));
+		xstats.type = TCA_FQ_CODEL_XSTATS_CLASS;
+		xstats.class_stats.deficit = flow->deficit;
+		xstats.class_stats.ldelay =
+			codel_time_to_us(flow->cvars.ldelay);
+		xstats.class_stats.count = flow->cvars.count;
+		xstats.class_stats.lastcount = flow->cvars.lastcount;
+		xstats.class_stats.dropping = flow->cvars.dropping;
+		if (flow->cvars.dropping) {
+			codel_tdiff_t delta = flow->cvars.drop_next -
+					      codel_get_time();
+
+			xstats.class_stats.drop_next = (delta >= 0) ?	/* signed usec */
+				codel_time_to_us(delta) :
+				-codel_time_to_us(-delta);
+		}
+		while (skb) {	/* per-flow qlen is counted by walking the list */
+			qs.qlen++;
+			skb = skb->next;
+		}
+		qs.backlog = q->backlogs[idx];
+		qs.drops = flow->dropped;
+	}
+	if (gnet_stats_copy_queue(d, &qs) < 0)	/* all-zero qs for an invalid idx */
+		return -1;
+	if (idx < q->flows_cnt)
+		return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+	return 0;
+}
+
+static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	unsigned int i;
+
+	if (arg->stop)
+		return;
+
+	for (i = 0; i < q->flows_cnt; i++) {
+		if (list_empty(&q->flows[i].flowchain) ||	/* flow not scheduled */
+		    arg->count < arg->skip) {	/* or still inside the skip window */
+			arg->count++;	/* skipped entries are counted too */
+			continue;
+		}
+		if (arg->fn(sch, i + 1, arg) < 0) {	/* class id = flow index + 1 */
+			arg->stop = 1;
+			break;
+		}
+		arg->count++;
+	}
+}
+
+static const struct Qdisc_class_ops fq_codel_class_ops = {
+	.leaf		=	fq_codel_leaf,	/* always NULL */
+	.get		=	fq_codel_get,	/* always 0 */
+	.put		=	fq_codel_put,	/* no-op */
+	.tcf_chain	=	fq_codel_find_tcf,
+	.bind_tcf	=	fq_codel_bind,	/* clears TCQ_F_CAN_BYPASS */
+	.unbind_tcf	=	fq_codel_put,	/* same no-op as .put */
+	.dump		=	fq_codel_dump_class,
+	.dump_stats	=	fq_codel_dump_class_stats,
+	.walk		=	fq_codel_walk,
+};
+
+static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
+	.cl_ops		=	&fq_codel_class_ops,
+	.id		=	"fq_codel",
+	.priv_size	=	sizeof(struct fq_codel_sched_data),
+	.enqueue	=	fq_codel_enqueue,
+	.dequeue	=	fq_codel_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.drop		=	fq_codel_drop,	/* NOTE(review): returns a flow index - confirm ->drop callers accept that */
+	.init		=	fq_codel_init,
+	.reset		=	fq_codel_reset,
+	.destroy	=	fq_codel_destroy,
+	.change		=	fq_codel_change,
+	.dump		=	fq_codel_dump,
+	.dump_stats	=	fq_codel_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init fq_codel_module_init(void)
+{
+	return register_qdisc(&fq_codel_qdisc_ops);	/* result is passed straight through */
+}
+
+static void __exit fq_codel_module_exit(void)
+{
+	unregister_qdisc(&fq_codel_qdisc_ops);	/* counterpart of fq_codel_module_init() */
+}
+
+module_init(fq_codel_module_init)	/* registers the "fq_codel" qdisc */
+module_exit(fq_codel_module_exit)	/* unregisters it again */
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 67fc573e013..e1543b03e39 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -25,9 +25,15 @@
 #include <linux/rcupdate.h>
 #include <linux/list.h>
 #include <linux/slab.h>
+#include <linux/if_vlan.h>
+#include <net/sch_generic.h>
 #include <net/pkt_sched.h>
 #include <net/dst.h>
 
+/* Qdisc to use by default */
+const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
+EXPORT_SYMBOL(default_qdisc_ops);
+
 /* Main transmission queue. */
 
 /* Modifications to data participating in scheduling must be protected with
@@ -53,20 +59,19 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
 {
 	struct sk_buff *skb = q->gso_skb;
+	const struct netdev_queue *txq = q->dev_queue;
 
 	if (unlikely(skb)) {
-		struct net_device *dev = qdisc_dev(q);
-		struct netdev_queue *txq;
-
 		/* check the reason of requeuing without tx lock first */
-		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+		txq = netdev_get_tx_queue(txq->dev, skb_get_queue_mapping(skb));
 		if (!netif_xmit_frozen_or_stopped(txq)) {
 			q->gso_skb = NULL;
 			q->q.qlen--;
 		} else
 			skb = NULL;
 	} else {
-		skb = q->dequeue(q);
+		if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq))
+			skb = q->dequeue(q);
 	}
 
 	return skb;
@@ -86,9 +91,8 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb,
 		 * deadloop is detected. Return OK to try the next skb.
 		 */
 		kfree_skb(skb);
-		if (net_ratelimit())
-			pr_warning("Dead loop on netdevice %s, fix it urgently!\n",
-				   dev_queue->dev->name);
+		net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n",
+				     dev_queue->dev->name);
 		ret = qdisc_qlen(q);
 	} else {
 		/*
@@ -136,9 +140,9 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 		ret = handle_dev_cpu_collision(skb, txq, q);
 	} else {
 		/* Driver returned NETDEV_TX_BUSY - requeue skb */
-		if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
-			pr_warning("BUG %s code %d qlen %d\n",
-				   dev->name, ret, q->q.qlen);
+		if (unlikely(ret != NETDEV_TX_BUSY))
+			net_warn_ratelimited("BUG %s code %d qlen %d\n",
+					     dev->name, ret, q->q.qlen);
 
 		ret = dev_requeue_skb(skb, q);
 	}
@@ -208,15 +212,19 @@ void __qdisc_run(struct Qdisc *q)
 
 unsigned long dev_trans_start(struct net_device *dev)
 {
-	unsigned long val, res = dev->trans_start;
+	unsigned long val, res;
 	unsigned int i;
 
+	if (is_vlan_dev(dev))
+		dev = vlan_dev_real_dev(dev);
+	res = dev->trans_start;
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		val = netdev_get_tx_queue(dev, i)->trans_start;
 		if (val && time_after(val, res))
 			res = val;
 	}
 	dev->trans_start = res;
+
 	return res;
 }
 EXPORT_SYMBOL(dev_trans_start);
@@ -302,6 +310,7 @@ void netif_carrier_on(struct net_device *dev)
 	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
 		if (dev->reg_state == NETREG_UNINITIALIZED)
 			return;
+		atomic_inc(&dev->carrier_changes);
 		linkwatch_fire_event(dev);
 		if (netif_running(dev))
 			__netdev_watchdog_up(dev);
@@ -320,41 +329,24 @@ void netif_carrier_off(struct net_device *dev)
 	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
 		if (dev->reg_state == NETREG_UNINITIALIZED)
 			return;
+		atomic_inc(&dev->carrier_changes);
 		linkwatch_fire_event(dev);
 	}
 }
 EXPORT_SYMBOL(netif_carrier_off);
 
-/**
- * 	netif_notify_peers - notify network peers about existence of @dev
- * 	@dev: network device
- *
- * Generate traffic such that interested network peers are aware of
- * @dev, such as by generating a gratuitous ARP. This may be used when
- * a device wants to inform the rest of the network about some sort of
- * reconfiguration such as a failover event or virtual machine
- * migration.
- */
-void netif_notify_peers(struct net_device *dev)
-{
-	rtnl_lock();
-	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
-	rtnl_unlock();
-}
-EXPORT_SYMBOL(netif_notify_peers);
-
 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
    under all circumstances. It is difficult to invent anything faster or
    cheaper.
  */
 
-static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
+static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
 {
 	kfree_skb(skb);
 	return NET_XMIT_CN;
 }
 
-static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
+static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
 {
 	return NULL;
 }
@@ -512,7 +504,8 @@ static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
 	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
 
 	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
-	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+		goto nla_put_failure;
 	return skb->len;
 
 nla_put_failure:
@@ -543,15 +536,17 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 	.dump		=	pfifo_fast_dump,
 	.owner		=	THIS_MODULE,
 };
-EXPORT_SYMBOL(pfifo_fast_ops);
+
+static struct lock_class_key qdisc_tx_busylock;
 
 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
-			  struct Qdisc_ops *ops)
+			  const struct Qdisc_ops *ops)
 {
 	void *p;
 	struct Qdisc *sch;
 	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
 	int err = -ENOBUFS;
+	struct net_device *dev = dev_queue->dev;
 
 	p = kzalloc_node(size, GFP_KERNEL,
 			 netdev_queue_numa_node_read(dev_queue));
@@ -571,12 +566,16 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	}
 	INIT_LIST_HEAD(&sch->list);
 	skb_queue_head_init(&sch->q);
+
 	spin_lock_init(&sch->busylock);
+	lockdep_set_class(&sch->busylock,
+			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
+
 	sch->ops = ops;
 	sch->enqueue = ops->enqueue;
 	sch->dequeue = ops->dequeue;
 	sch->dev_queue = dev_queue;
-	dev_hold(qdisc_dev(sch));
+	dev_hold(dev);
 	atomic_set(&sch->refcnt, 1);
 
 	return sch;
@@ -585,10 +584,14 @@ errout:
 }
 
 struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
-				struct Qdisc_ops *ops, unsigned int parentid)
+				const struct Qdisc_ops *ops,
+				unsigned int parentid)
 {
 	struct Qdisc *sch;
 
+	if (!try_module_get(ops->owner))
+		goto errout;
+
 	sch = qdisc_alloc(dev_queue, ops);
 	if (IS_ERR(sch))
 		goto errout;
@@ -692,11 +695,13 @@ static void attach_one_default_qdisc(struct net_device *dev,
 
 	if (dev->tx_queue_len) {
 		qdisc = qdisc_create_dflt(dev_queue,
-					  &pfifo_fast_ops, TC_H_ROOT);
+					  default_qdisc_ops, TC_H_ROOT);
 		if (!qdisc) {
 			netdev_info(dev, "activation failed\n");
 			return;
 		}
+		if (!netif_is_multiqueue(dev))
+			qdisc->flags |= TCQ_F_ONETXQUEUE;
 	}
 	dev_queue->qdisc_sleeping = qdisc;
 }
@@ -715,8 +720,8 @@ static void attach_default_qdiscs(struct net_device *dev)
 	} else {
 		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
 		if (qdisc) {
-			qdisc->ops->attach(qdisc);
 			dev->qdisc = qdisc;
+			qdisc->ops->attach(qdisc);
 		}
 	}
 }
@@ -743,9 +748,8 @@ void dev_activate(struct net_device *dev)
 	int need_watchdog;
 
 	/* No queueing discipline is attached to device;
-	   create default one i.e. pfifo_fast for devices,
-	   which need queueing and noqueue_qdisc for
-	   virtual interfaces
+	 * create default one for devices, which need queueing
+	 * and noqueue_qdisc for virtual interfaces
 	 */
 
 	if (dev->qdisc == &noop_qdisc)
@@ -827,7 +831,7 @@ void dev_deactivate_many(struct list_head *head)
 	struct net_device *dev;
 	bool sync_needed = false;
 
-	list_for_each_entry(dev, head, unreg_list) {
+	list_for_each_entry(dev, head, close_list) {
 		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
 					 &noop_qdisc);
 		if (dev_ingress_queue(dev))
@@ -846,7 +850,7 @@ void dev_deactivate_many(struct list_head *head)
 		synchronize_net();
 
 	/* Wait for outstanding qdisc_run calls. */
-	list_for_each_entry(dev, head, unreg_list)
+	list_for_each_entry(dev, head, close_list)
 		while (some_qdisc_is_busy(dev))
 			yield();
 }
@@ -855,7 +859,7 @@ void dev_deactivate(struct net_device *dev)
 {
 	LIST_HEAD(single);
 
-	list_add(&dev->unreg_list, &single);
+	list_add(&dev->close_list, &single);
 	dev_deactivate_many(&single);
 	list_del(&single);
 }
@@ -906,3 +910,39 @@ void dev_shutdown(struct net_device *dev)
 
 	WARN_ON(timer_pending(&dev->watchdog_timer));
 }
+
+void psched_ratecfg_precompute(struct psched_ratecfg *r,
+			       const struct tc_ratespec *conf,
+			       u64 rate64)
+{
+	memset(r, 0, sizeof(*r));	/* shift starts at 0 */
+	r->overhead = conf->overhead;
+	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);	/* larger of the two rates */
+	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
+	r->mult = 1;	/* kept, with shift 0, when the rate is 0 */
+	/*
+	 * The deal here is to replace a divide by a reciprocal one
+	 * in fast path (a reciprocal divide is a multiply and a shift)
+	 *
+	 * Normal formula would be :
+	 *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
+	 *
+	 * We compute mult/shift to use instead :
+	 *  time_in_ns = (len * mult) >> shift;
+	 *
+	 * We try to get the highest possible mult value for accuracy,
+	 * but have to make sure no overflows will ever happen.
+	 */
+	if (r->rate_bytes_ps > 0) {
+		u64 factor = NSEC_PER_SEC;
+
+		for (;;) {
+			r->mult = div64_u64(factor, r->rate_bytes_ps);
+			if (r->mult & (1U << 31) || factor & (1ULL << 63))	/* cannot double again */
+				break;
+			factor <<= 1;	/* factor == NSEC_PER_SEC << shift */
+			r->shift++;
+		}
+	}
+}
+EXPORT_SYMBOL(psched_ratecfg_precompute);
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 0b15236be7b..12cbc09157f 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -102,9 +102,8 @@ static inline int gred_wred_mode_check(struct Qdisc *sch)
 		if (q == NULL)
 			continue;
 
-		for (n = 0; n < table->DPs; n++)
-			if (table->tab[n] && table->tab[n] != q &&
-			    table->tab[n]->prio == q->prio)
+		for (n = i + 1; n < table->DPs; n++)
+			if (table->tab[n] && table->tab[n]->prio == q->prio)
 				return 1;
 	}
 
@@ -137,6 +136,7 @@ static inline void gred_store_wred_set(struct gred_sched *table,
 				       struct gred_sched_data *q)
 {
 	table->wred_set.qavg = q->vars.qavg;
+	table->wred_set.qidlestart = q->vars.qidlestart;
 }
 
 static inline int gred_use_ecn(struct gred_sched *t)
@@ -176,7 +176,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp;
 	}
 
-	/* sum up all the qaves of prios <= to ours to get the new qave */
+	/* sum up all the qaves of prios < ours to get the new qave */
 	if (!gred_wred_mode(t) && gred_rio_mode(t)) {
 		int i;
 
@@ -255,23 +255,23 @@ static struct sk_buff *gred_dequeue(struct Qdisc *sch)
 		u16 dp = tc_index_to_dp(skb);
 
 		if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
-			if (net_ratelimit())
-				pr_warning("GRED: Unable to relocate VQ 0x%x "
-					   "after dequeue, screwing up "
-					   "backlog.\n", tc_index_to_dp(skb));
+			net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x after dequeue, screwing up backlog\n",
+					     tc_index_to_dp(skb));
 		} else {
 			q->backlog -= qdisc_pkt_len(skb);
 
-			if (!q->backlog && !gred_wred_mode(t))
-				red_start_of_idle_period(&q->vars);
+			if (gred_wred_mode(t)) {
+				if (!sch->qstats.backlog)
+					red_start_of_idle_period(&t->wred_set);
+			} else {
+				if (!q->backlog)
+					red_start_of_idle_period(&q->vars);
+			}
 		}
 
 		return skb;
 	}
 
-	if (gred_wred_mode(t) && !red_is_idling(&t->wred_set))
-		red_start_of_idle_period(&t->wred_set);
-
 	return NULL;
 }
 
@@ -287,27 +287,26 @@ static unsigned int gred_drop(struct Qdisc *sch)
 		u16 dp = tc_index_to_dp(skb);
 
 		if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
-			if (net_ratelimit())
-				pr_warning("GRED: Unable to relocate VQ 0x%x "
-					   "while dropping, screwing up "
-					   "backlog.\n", tc_index_to_dp(skb));
+			net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x while dropping, screwing up backlog\n",
+					     tc_index_to_dp(skb));
 		} else {
 			q->backlog -= len;
 			q->stats.other++;
 
-			if (!q->backlog && !gred_wred_mode(t))
-				red_start_of_idle_period(&q->vars);
+			if (gred_wred_mode(t)) {
+				if (!sch->qstats.backlog)
+					red_start_of_idle_period(&t->wred_set);
+			} else {
+				if (!q->backlog)
+					red_start_of_idle_period(&q->vars);
+			}
 		}
 
 		qdisc_drop(skb, sch);
 		return len;
 	}
 
-	if (gred_wred_mode(t) && !red_is_idling(&t->wred_set))
-		red_start_of_idle_period(&t->wred_set);
-
 	return 0;
-
 }
 
 static void gred_reset(struct Qdisc *sch)
@@ -371,8 +370,8 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
 
 	for (i = table->DPs; i < MAX_DPs; i++) {
 		if (table->tab[i]) {
-			pr_warning("GRED: Warning: Destroying "
-				   "shadowed VQ 0x%x\n", i);
+			pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n",
+				i);
 			gred_destroy_vq(table->tab[i]);
 			table->tab[i] = NULL;
 		}
@@ -521,14 +520,16 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
 	opts = nla_nest_start(skb, TCA_OPTIONS);
 	if (opts == NULL)
 		goto nla_put_failure;
-	NLA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt);
+	if (nla_put(skb, TCA_GRED_DPS, sizeof(sopt), &sopt))
+		goto nla_put_failure;
 
 	for (i = 0; i < MAX_DPs; i++) {
 		struct gred_sched_data *q = table->tab[i];
 
 		max_p[i] = q ? q->parms.max_P : 0;
 	}
-	NLA_PUT(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p);
+	if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p))
+		goto nla_put_failure;
 
 	parms = nla_nest_start(skb, TCA_GRED_PARMS);
 	if (parms == NULL)
@@ -537,6 +538,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
 	for (i = 0; i < MAX_DPs; i++) {
 		struct gred_sched_data *q = table->tab[i];
 		struct tc_gred_qopt opt;
+		unsigned long qavg;
 
 		memset(&opt, 0, sizeof(opt));
 
@@ -565,13 +567,12 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
 		opt.packets	= q->packetsin;
 		opt.bytesin	= q->bytesin;
 
-		if (gred_wred_mode(table)) {
-			q->vars.qidlestart =
-				table->tab[table->def]->vars.qidlestart;
-			q->vars.qavg = table->tab[table->def]->vars.qavg;
-		}
+		if (gred_wred_mode(table))
+			gred_load_wred_set(table, q);
 
-		opt.qave = red_calc_qavg(&q->parms, &q->vars, q->vars.qavg);
+		qavg = red_calc_qavg(&q->parms, &q->vars,
+				     q->vars.qavg >> q->parms.Wlog);
+		opt.qave = qavg >> q->parms.Wlog;
 
 append_opt:
 		if (nla_append(skb, sizeof(opt), &opt) < 0)
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 9bdca2e011e..ec8aeaac1dd 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -114,7 +114,7 @@ struct hfsc_class {
 
 	struct gnet_stats_basic_packed bstats;
 	struct gnet_stats_queue qstats;
-	struct gnet_stats_rate_est rate_est;
+	struct gnet_stats_rate_est64 rate_est;
 	unsigned int	level;		/* class level in hierarchy */
 	struct tcf_proto *filter_list;	/* filter list */
 	unsigned int	filter_cnt;	/* filter count */
@@ -1305,7 +1305,8 @@ hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc)
 	tsc.m1 = sm2m(sc->sm1);
 	tsc.d  = dx2d(sc->dx);
 	tsc.m2 = sm2m(sc->sm2);
-	NLA_PUT(skb, attr, sizeof(tsc), &tsc);
+	if (nla_put(skb, attr, sizeof(tsc), &tsc))
+		goto nla_put_failure;
 
 	return skb->len;
 
@@ -1352,8 +1353,7 @@ hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb,
 		goto nla_put_failure;
 	if (hfsc_dump_curves(skb, cl) < 0)
 		goto nla_put_failure;
-	nla_nest_end(skb, nest);
-	return skb->len;
+	return nla_nest_end(skb, nest);
 
  nla_put_failure:
 	nla_nest_cancel(skb, nest);
@@ -1388,7 +1388,6 @@ static void
 hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 {
 	struct hfsc_sched *q = qdisc_priv(sch);
-	struct hlist_node *n;
 	struct hfsc_class *cl;
 	unsigned int i;
 
@@ -1396,7 +1395,7 @@ hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 		return;
 
 	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry(cl, n, &q->clhash.hash[i],
+		hlist_for_each_entry(cl, &q->clhash.hash[i],
 				     cl_common.hnode) {
 			if (arg->count < arg->skip) {
 				arg->count++;
@@ -1522,11 +1521,10 @@ hfsc_reset_qdisc(struct Qdisc *sch)
 {
 	struct hfsc_sched *q = qdisc_priv(sch);
 	struct hfsc_class *cl;
-	struct hlist_node *n;
 	unsigned int i;
 
 	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode)
+		hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)
 			hfsc_reset_class(cl);
 	}
 	q->eligible = RB_ROOT;
@@ -1539,16 +1537,16 @@ static void
 hfsc_destroy_qdisc(struct Qdisc *sch)
 {
 	struct hfsc_sched *q = qdisc_priv(sch);
-	struct hlist_node *n, *next;
+	struct hlist_node *next;
 	struct hfsc_class *cl;
 	unsigned int i;
 
 	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode)
+		hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)
 			tcf_destroy_chain(&cl->filter_list);
 	}
 	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
+		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
 					  cl_common.hnode)
 			hfsc_destroy_class(sch, cl);
 	}
@@ -1563,17 +1561,17 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tc_hfsc_qopt qopt;
 	struct hfsc_class *cl;
-	struct hlist_node *n;
 	unsigned int i;
 
 	sch->qstats.backlog = 0;
 	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode)
+		hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)
 			sch->qstats.backlog += cl->qdisc->qstats.backlog;
 	}
 
 	qopt.defcls = q->defcls;
-	NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
+	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
+		goto nla_put_failure;
 	return skb->len;
 
  nla_put_failure:
@@ -1607,7 +1605,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (cl->qdisc->q.qlen == 1)
 		set_active(cl, qdisc_pkt_len(skb));
 
-	bstats_update(&cl->bstats, skb);
 	sch->q.qlen++;
 
 	return NET_XMIT_SUCCESS;
@@ -1655,6 +1652,7 @@ hfsc_dequeue(struct Qdisc *sch)
 		return NULL;
 	}
 
+	bstats_update(&cl->bstats, skb);
 	update_vf(cl, qdisc_pkt_len(skb), cur_time);
 	if (realtime)
 		cl->cl_cumul += qdisc_pkt_len(skb);
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
new file mode 100644
index 00000000000..d85b6812a7d
--- /dev/null
+++ b/net/sched/sch_hhf.c
@@ -0,0 +1,740 @@
+/* net/sched/sch_hhf.c		Heavy-Hitter Filter (HHF)
+ *
+ * Copyright (C) 2013 Terry Lam <vtlam@google.com>
+ * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com>
+ */
+
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <net/flow_keys.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+
+/*	Heavy-Hitter Filter (HHF)
+ *
+ * Principles :
+ * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter
+ * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified
+ * as heavy-hitter, it is immediately switched to the heavy-hitter bucket.
+ * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler,
+ * in which the heavy-hitter bucket is served with less weight.
+ * In other words, non-heavy-hitters (e.g., short bursts of critical traffic)
+ * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have
+ * higher share of bandwidth.
+ *
+ * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the
+ * following paper:
+ * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and
+ * Accounting", in ACM SIGCOMM, 2002.
+ *
+ * Conceptually, a multi-stage filter comprises k independent hash functions
+ * and k counter arrays. Packets are indexed into k counter arrays by k hash
+ * functions, respectively. The counters are then increased by the packet sizes.
+ * Therefore,
+ *    - For a heavy-hitter flow: *all* of its k array counters must be large.
+ *    - For a non-heavy-hitter flow: some of its k array counters can be large
+ *      due to hash collision with other small flows; however, with high
+ *      probability, not *all* k counters are large.
+ *
+ * By the design of the multi-stage filter algorithm, the false negative rate
+ * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is
+ * susceptible to false positives (non-heavy-hitters mistakenly classified as
+ * heavy-hitters).
+ * Therefore, we also implement the following optimizations to reduce false
+ * positives by avoiding unnecessary increment of the counter values:
+ *    - Optimization O1: once a heavy-hitter is identified, its bytes are not
+ *        accounted in the array counters. This technique is called "shielding"
+ *        in Section 3.3.1 of [EV02].
+ *    - Optimization O2: conservative update of counters
+ *                       (Section 3.3.2 of [EV02]),
+ *        New counter value = max {old counter value,
+ *                                 smallest counter value + packet bytes}
+ *
+ * Finally, we refresh the counters periodically since otherwise the counter
+ * values will keep accumulating.
+ *
+ * Once a flow is classified as heavy-hitter, we also save its per-flow state
+ * in an exact-matching flow table so that its subsequent packets can be
+ * dispatched to the heavy-hitter bucket accordingly.
+ *
+ *
+ * At a high level, this qdisc works as follows:
+ * Given a packet p:
+ *   - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching
+ *     heavy-hitter flow table, denoted table T, then send p to the heavy-hitter
+ *     bucket.
+ *   - Otherwise, forward p to the multi-stage filter, denoted filter F
+ *        + If F decides that p belongs to a non-heavy-hitter flow, then send p
+ *          to the non-heavy-hitter bucket.
+ *        + Otherwise, if F decides that p belongs to a new heavy-hitter flow,
+ *          then set up a new flow entry for the flow-id of p in the table T and
+ *          send p to the heavy-hitter bucket.
+ *
+ * In this implementation:
+ *   - T is a fixed-size hash-table with 1024 entries. Hash collision is
+ *     resolved by linked-list chaining.
+ *   - F has four counter arrays, each array containing 1024 32-bit counters.
+ *     That means 4 * 1024 * 32 bits = 16KB of memory.
+ *   - Since each array in F contains 1024 counters, 10 bits are sufficient to
+ *     index into each array.
+ *     Hence, instead of having four hash functions, we chop the 32-bit
+ *     skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is
+ *     computed as XOR sum of those three chunks.
+ *   - We need to clear the counter arrays periodically; however, directly
+ *     memsetting 16KB of memory can lead to cache eviction and unwanted delay.
+ *     So by representing each counter by a valid bit, we only need to reset
+ *     4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory.
+ *   - The Deficit Round Robin engine is taken from fq_codel implementation
+ *     (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to
+ *     fq_codel_flow in fq_codel implementation.
+ *
+ */
+
+/* Non-configurable parameters */
+#define HH_FLOWS_CNT	 1024  /* number of entries in exact-matching table T */
+#define HHF_ARRAYS_CNT	 4     /* number of arrays in multi-stage filter F */
+#define HHF_ARRAYS_LEN	 1024  /* number of counters in each array of F */
+#define HHF_BIT_MASK_LEN 10    /* hash bits consumed per array index of F */
+#define HHF_BIT_MASK	 0x3FF /* 10-bit mask: index into T or an array of F */
+
+#define WDRR_BUCKET_CNT  2     /* two buckets for Weighted DRR */
+enum wdrr_bucket_idx {
+	WDRR_BUCKET_FOR_HH	= 0, /* bucket id for heavy-hitters */
+	WDRR_BUCKET_FOR_NON_HH	= 1  /* bucket id for non-heavy-hitters */
+};
+
+#define hhf_time_before(a, b)	\
+	(typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0))
+
+/* Heavy-hitter per-flow state: one entry of a chain in table T */
+struct hh_flow_state {
+	u32		 hash_id;	/* hash of flow-id (e.g. TCP 5-tuple) */
+	u32		 hit_timestamp;	/* hhf_time_stamp() of last packet */
+	struct list_head flowchain;	/* chaining under hash collision */
+};
+
+/* Weighted Deficit Round Robin (WDRR) scheduler */
+struct wdrr_bucket {
+	struct sk_buff	  *head;	/* oldest queued skb, NULL if empty */
+	struct sk_buff	  *tail;	/* newest skb; only valid if head set */
+	struct list_head  bucketchain;	/* on new_buckets or old_buckets */
+	int		  deficit;	/* credit left, in qdisc_pkt_len units */
+};
+
+struct hhf_sched_data {
+	struct wdrr_bucket buckets[WDRR_BUCKET_CNT];
+	u32		   perturbation;   /* hash perturbation */
+	u32		   quantum;        /* WDRR quantum, default psched_mtu() */
+	u32		   drop_overlimit; /* number of times max qdisc packet
+					    * limit was hit
+					    */
+	struct list_head   *hh_flows;       /* table T (currently active HHs) */
+	u32		   hh_flows_limit;            /* max active HH allocs */
+	u32		   hh_flows_overlimit; /* num of disallowed HH allocs */
+	u32		   hh_flows_total_cnt;          /* total admitted HHs */
+	u32		   hh_flows_current_cnt;        /* total current HHs  */
+	u32		   *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */
+	u32		   hhf_arrays_reset_timestamp;  /* last time hhf_arrays
+							 * was reset
+							 */
+	unsigned long	   *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits
+							     * of hhf_arrays
+							     */
+	/* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */
+	struct list_head   new_buckets; /* list of new buckets */
+	struct list_head   old_buckets; /* list of old buckets */
+
+	/* Configurable HHF parameters */
+	u32		   hhf_reset_timeout; /* interval to reset counter
+					       * arrays in filter F
+					       * (in jiffies, default 40ms)
+					       */
+	u32		   hhf_admit_bytes;   /* counter thresh to classify as
+					       * HH (default 128KB).
+					       * With these default values,
+					       * 128KB / 40ms = 25 Mbps
+					       * i.e., we expect to capture HHs
+					       * sending > 25 Mbps.
+					       */
+	u32		   hhf_evict_timeout; /* aging threshold to evict idle
+					       * HHs out of table T. This should
+					       * be large enough to avoid
+					       * reordering during HH eviction.
+					       * (in jiffies, default 1s)
+					       */
+	u32		   hhf_non_hh_weight; /* WDRR weight for non-HHs
+					       * (default 2,
+					       *  i.e., non-HH : HH = 2 : 1)
+					       */
+};
+
+static u32 hhf_time_stamp(void)
+{
+	return (u32)jiffies;	/* truncated; compare with hhf_time_before() */
+}
+
+static unsigned int skb_hash(const struct hhf_sched_data *q,
+			     const struct sk_buff *skb)
+{
+	const struct sock *sk = skb->sk;
+	struct flow_keys fk;
+
+	/* Use the attached socket's hash whenever it is set (non-zero). */
+	if (sk && sk->sk_hash)
+		return sk->sk_hash;
+
+	skb_flow_dissect(skb, &fk);
+	return jhash_3words((__force u32)fk.dst,
+			    (__force u32)fk.src ^ fk.ip_proto,
+			    (__force u32)fk.ports, q->perturbation);
+}
+
+/* Looks up @hash in one chain of table T.  Expired entries met on the way
+ * are freed, except the chain's last one, which is kept for later reuse.
+ */
+static struct hh_flow_state *seek_list(const u32 hash,
+				       struct list_head *head,
+				       struct hhf_sched_data *q)
+{
+	struct hh_flow_state *cur, *tmp;
+	u32 now = hhf_time_stamp();
+
+	list_for_each_entry_safe(cur, tmp, head, flowchain) {
+		u32 expiry = cur->hit_timestamp + q->hhf_evict_timeout;
+
+		if (!hhf_time_before(expiry, now)) {
+			if (cur->hash_id == hash)
+				return cur;
+			continue;
+		}
+		/* Expired.  Keep the tail entry so that the next heavy-hitter
+		 * hashing to this slot can reuse it without a kzalloc().
+		 */
+		if (list_is_last(&cur->flowchain, head))
+			break;
+		list_del(&cur->flowchain);
+		kfree(cur);
+		q->hh_flows_current_cnt--;
+	}
+	return NULL;
+}
+
+/* Returns a flow state entry for a new heavy-hitter: an expired entry already
+ * on the chain if there is one, else a freshly allocated entry appended to
+ * the chain.  Returns NULL when the hh_flows_limit cap is reached or when the
+ * allocation fails; hash_id and hit_timestamp are left for the caller to set.
+ */
+static struct hh_flow_state *alloc_new_hh(struct list_head *head,
+					  struct hhf_sched_data *q)
+{
+	struct hh_flow_state *entry;
+	u32 now = hhf_time_stamp();
+
+	/* Recycle the first expired entry on this chain, if any. */
+	list_for_each_entry(entry, head, flowchain) {
+		u32 expiry = entry->hit_timestamp + q->hhf_evict_timeout;
+
+		if (hhf_time_before(expiry, now))
+			return entry;
+	}
+
+	if (q->hh_flows_current_cnt >= q->hh_flows_limit) {
+		q->hh_flows_overlimit++;
+		return NULL;
+	}
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry)
+		return NULL;
+
+	q->hh_flows_current_cnt++;
+	INIT_LIST_HEAD(&entry->flowchain);
+	list_add_tail(&entry->flowchain, head);
+
+	return entry;
+}
+
+/* Picks the WDRR bucket for @skb: heavy-hitter if its flow is in table T or
+ * if the multi-stage filter F has just classified it as one, else non-HH.
+ */
+static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct hhf_sched_data *q = qdisc_priv(sch);
+	u32 tmp_hash, hash;
+	u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos;
+	struct hh_flow_state *flow;
+	u32 pkt_len, min_hhf_val;
+	int i;
+	u32 prev;
+	u32 now = hhf_time_stamp();
+
+	/* Invalidate all counters of F once hhf_reset_timeout has elapsed. */
+	prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout;
+	if (hhf_time_before(prev, now)) {
+		for (i = 0; i < HHF_ARRAYS_CNT; i++)
+			bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN);
+		q->hhf_arrays_reset_timestamp = now;
+	}
+
+	/* Get hashed flow-id of the skb. */
+	hash = skb_hash(q, skb);
+
+	/* Check if this packet belongs to an already established HH flow. */
+	flow_pos = hash & HHF_BIT_MASK;
+	flow = seek_list(hash, &q->hh_flows[flow_pos], q);
+	if (flow) { /* found its HH flow */
+		flow->hit_timestamp = now;
+		return WDRR_BUCKET_FOR_HH;
+	}
+
+	/* Now pass the packet through the multi-stage filter. */
+	tmp_hash = hash;
+	xorsum = 0;
+	for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) {
+		/* Split the skb_hash into three 10-bit chunks. */
+		filter_pos[i] = tmp_hash & HHF_BIT_MASK;
+		xorsum ^= filter_pos[i];
+		tmp_hash >>= HHF_BIT_MASK_LEN;
+	}
+	/* The last chunk is computed as XOR sum of other chunks. */
+	filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash;
+
+	pkt_len = qdisc_pkt_len(skb);
+	min_hhf_val = ~0U;
+	for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+		u32 val;
+
+		if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) {
+			q->hhf_arrays[i][filter_pos[i]] = 0;
+			__set_bit(filter_pos[i], q->hhf_valid_bits[i]);
+		}
+
+		val = q->hhf_arrays[i][filter_pos[i]] + pkt_len;
+		if (min_hhf_val > val)
+			min_hhf_val = val;
+	}
+
+	/* Found a new HH iff all counter values > HH admit threshold. */
+	if (min_hhf_val > q->hhf_admit_bytes) {
+		/* Just captured a new heavy-hitter. */
+		flow = alloc_new_hh(&q->hh_flows[flow_pos], q);
+		if (!flow) /* hh_flows_limit reached or alloc failed */
+			return WDRR_BUCKET_FOR_NON_HH;
+		flow->hash_id = hash;
+		flow->hit_timestamp = now;
+		q->hh_flows_total_cnt++;
+
+		/* By returning without updating counters in q->hhf_arrays,
+		 * we implicitly implement "shielding" (see Optimization O1).
+		 */
+		return WDRR_BUCKET_FOR_HH;
+	}
+
+	/* Conservative update of HHF arrays (see Optimization O2). */
+	for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+		if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val)
+			q->hhf_arrays[i][filter_pos[i]] = min_hhf_val;
+	}
+	return WDRR_BUCKET_FOR_NON_HH;
+}
+
+/* Unlinks and returns the first skb; the bucket must not be empty. */
+static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket)
+{
+	struct sk_buff *first = bucket->head;
+	struct sk_buff *rest = first->next;
+
+	bucket->head = rest;
+	first->next = NULL;
+	return first;
+}
+
+/* Appends skb at the tail of the bucket's singly linked skb list. */
+static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb)
+{
+	if (bucket->head)
+		bucket->tail->next = skb;
+	else
+		bucket->head = skb;
+	bucket->tail = skb;
+	skb->next = NULL;
+}
+
+static unsigned int hhf_drop(struct Qdisc *sch)
+{
+	struct hhf_sched_data *q = qdisc_priv(sch);
+	unsigned int idx = WDRR_BUCKET_FOR_HH;
+	struct sk_buff *victim;
+
+	/* Heavy-hitters are dropped first; non-HHs only if none is queued. */
+	if (!q->buckets[idx].head)
+		idx = WDRR_BUCKET_FOR_NON_HH;
+
+	/* Nothing queued at all: just report the non-HH bucket id. */
+	if (!q->buckets[idx].head)
+		return idx;
+
+	victim = dequeue_head(&q->buckets[idx]);
+	sch->q.qlen--;
+	sch->qstats.drops++;
+	sch->qstats.backlog -= qdisc_pkt_len(victim);
+	kfree_skb(victim);
+
+	return idx;
+}
+
+static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct hhf_sched_data *q = qdisc_priv(sch);
+	enum wdrr_bucket_idx idx = hhf_classify(skb, sch);
+	struct wdrr_bucket *bucket;
+
+	bucket = &q->buckets[idx];
+	bucket_add(bucket, skb);
+	sch->qstats.backlog += qdisc_pkt_len(skb);
+
+	/* A bucket that is on neither list is scheduled with a full quantum.
+	 * As with new_flows vs. old_flows in fq_codel, new_buckets has strict
+	 * priority, so only non-HHs may enter it; heavy-hitters always start
+	 * on old_buckets.
+	 */
+	if (list_empty(&bucket->bucketchain)) {
+		if (idx == WDRR_BUCKET_FOR_HH) {
+			bucket->deficit = q->quantum;
+			list_add_tail(&bucket->bucketchain, &q->old_buckets);
+		} else {
+			bucket->deficit = q->hhf_non_hh_weight * q->quantum;
+			list_add_tail(&bucket->bucketchain, &q->new_buckets);
+		}
+	}
+
+	sch->q.qlen++;
+	if (sch->q.qlen <= sch->limit)
+		return NET_XMIT_SUCCESS;
+
+	/* Over the limit: one packet has to go, preferably a heavy-hitter's. */
+	q->drop_overlimit++;
+	if (hhf_drop(sch) != idx) {
+		/* The victim came from the other bucket, so this skb is still
+		 * queued: report success, but let the upper stack know that a
+		 * packet was dropped.
+		 */
+		qdisc_tree_decrease_qlen(sch, 1);
+		return NET_XMIT_SUCCESS;
+	}
+
+	/* Congestion Notification only when this skb's own bucket was hit. */
+	return NET_XMIT_CN;
+}
+
+static struct sk_buff *hhf_dequeue(struct Qdisc *sch)
+{
+	struct hhf_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = NULL;
+	struct wdrr_bucket *bucket;
+	struct list_head *head;
+
+begin:
+	head = &q->new_buckets;	/* new_buckets is tried before old_buckets */
+	if (list_empty(head)) {
+		head = &q->old_buckets;
+		if (list_empty(head))
+			return NULL;	/* no bucket is scheduled */
+	}
+	bucket = list_first_entry(head, struct wdrr_bucket, bucketchain);
+
+	if (bucket->deficit <= 0) {
+		int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ?
+			      1 : q->hhf_non_hh_weight;
+
+		bucket->deficit += weight * q->quantum;
+		list_move_tail(&bucket->bucketchain, &q->old_buckets);
+		goto begin;	/* credit refilled, bucket requeued as old */
+	}
+
+	if (bucket->head) {
+		skb = dequeue_head(bucket);
+		sch->q.qlen--;
+		sch->qstats.backlog -= qdisc_pkt_len(skb);
+	}
+
+	if (!skb) {	/* the selected bucket holds no packet */
+		/* Force a pass through old_buckets to prevent starvation. */
+		if ((head == &q->new_buckets) && !list_empty(&q->old_buckets))
+			list_move_tail(&bucket->bucketchain, &q->old_buckets);
+		else
+			list_del_init(&bucket->bucketchain);
+		goto begin;
+	}
+	qdisc_bstats_update(sch, skb);
+	bucket->deficit -= qdisc_pkt_len(skb);	/* charge the bucket */
+
+	return skb;
+}
+
+static void hhf_reset(struct Qdisc *sch)
+{
+	struct sk_buff *skb = hhf_dequeue(sch);
+
+	for (; skb; skb = hhf_dequeue(sch))	/* until nothing is left */
+		kfree_skb(skb);
+}
+
+/* Zeroed memory from kzalloc(), or from vzalloc() if that fails. */
+static void *hhf_zalloc(size_t sz)
+{
+	void *mem = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
+
+	if (mem)
+		return mem;
+	return vzalloc(sz);
+}
+
+static void hhf_free(void *addr)
+{
+	kvfree(addr);	/* pairs with hhf_zalloc() */
+}
+
+static void hhf_destroy(struct Qdisc *sch)
+{
+	int i;
+	struct hhf_sched_data *q = qdisc_priv(sch);
+	struct hh_flow_state *flow, *next;
+
+	for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+		hhf_free(q->hhf_arrays[i]);
+		hhf_free(q->hhf_valid_bits[i]);
+	}
+
+	if (!q->hh_flows)	/* hhf_init() bailed out before allocating T */
+		return;
+
+	for (i = 0; i < HH_FLOWS_CNT; i++) {
+		list_for_each_entry_safe(flow, next, &q->hh_flows[i],
+					 flowchain) {
+			list_del(&flow->flowchain);
+			kfree(flow);
+		}
+	}
+	hhf_free(q->hh_flows);
+}
+
+static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = {
+	[TCA_HHF_BACKLOG_LIMIT]	 = { .type = NLA_U32 },
+	[TCA_HHF_QUANTUM]	 = { .type = NLA_U32 },
+	[TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 },
+	[TCA_HHF_RESET_TIMEOUT]	 = { .type = NLA_U32 }, /* in usecs */
+	[TCA_HHF_ADMIT_BYTES]	 = { .type = NLA_U32 },
+	[TCA_HHF_EVICT_TIMEOUT]	 = { .type = NLA_U32 }, /* in usecs */
+	[TCA_HHF_NON_HH_WEIGHT]	 = { .type = NLA_U32 },
+};
+
+/* Applies the attributes in @opt and trims the queue to the new limit.  On a
+ * missing or invalid @opt returns a negative errno and changes nothing.
+ */
+static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct hhf_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_HHF_MAX + 1];
+	unsigned int qlen;
+	int err;
+	u64 non_hh_quantum;
+	u32 new_quantum = q->quantum;
+	u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_HHF_QUANTUM])
+		new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]);
+
+	if (tb[TCA_HHF_NON_HH_WEIGHT])
+		new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]);
+
+	non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight;
+	if (non_hh_quantum == 0 || non_hh_quantum > INT_MAX)
+		return -EINVAL;	/* 0 would stall hhf_dequeue()'s refill loop */
+
+	sch_tree_lock(sch);
+
+	if (tb[TCA_HHF_BACKLOG_LIMIT])
+		sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]);
+
+	q->quantum = new_quantum;
+	q->hhf_non_hh_weight = new_hhf_non_hh_weight;
+
+	if (tb[TCA_HHF_HH_FLOWS_LIMIT])
+		q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]);
+
+	if (tb[TCA_HHF_RESET_TIMEOUT]) {
+		u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]);
+
+		q->hhf_reset_timeout = usecs_to_jiffies(us);
+	}
+
+	if (tb[TCA_HHF_ADMIT_BYTES])
+		q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]);
+
+	if (tb[TCA_HHF_EVICT_TIMEOUT]) {
+		u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]);
+
+		q->hhf_evict_timeout = usecs_to_jiffies(us);
+	}
+
+	qlen = sch->q.qlen;
+	while (sch->q.qlen > sch->limit)
+		kfree_skb(hhf_dequeue(sch));
+	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+/* Sets defaults, applies @opt if given, then allocates table T and filter F.
+ * Returns 0, or a negative errno if @opt is rejected or an allocation fails.
+ */
+static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct hhf_sched_data *q = qdisc_priv(sch);
+	int i;
+
+	sch->limit = 1000;
+	q->quantum = psched_mtu(qdisc_dev(sch));
+	q->perturbation = prandom_u32();
+	INIT_LIST_HEAD(&q->new_buckets);
+	INIT_LIST_HEAD(&q->old_buckets);
+
+	/* Configurable HHF parameters */
+	q->hhf_reset_timeout = HZ / 25; /* 40  ms */
+	q->hhf_admit_bytes = 131072;    /* 128 KB */
+	q->hhf_evict_timeout = HZ;      /* 1  sec */
+	q->hhf_non_hh_weight = 2;
+	/* Set before hhf_change() so a limit given in @opt survives. */
+	q->hh_flows_limit = 2 * HH_FLOWS_CNT; /* twice len of hh_flows table */
+
+	if (opt) {
+		int err = hhf_change(sch, opt);
+
+		if (err)
+			return err;
+	}
+
+	if (!q->hh_flows) {
+		/* Initialize heavy-hitter flow table. */
+		q->hh_flows = hhf_zalloc(HH_FLOWS_CNT *
+					 sizeof(struct list_head));
+		if (!q->hh_flows)
+			return -ENOMEM;
+		for (i = 0; i < HH_FLOWS_CNT; i++)
+			INIT_LIST_HEAD(&q->hh_flows[i]);
+
+		q->hh_flows_overlimit = 0;
+		q->hh_flows_total_cnt = 0;
+		q->hh_flows_current_cnt = 0;
+
+		/* Initialize heavy-hitter filter arrays. */
+		for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+			q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN *
+						      sizeof(u32));
+			if (!q->hhf_arrays[i]) {
+				hhf_destroy(sch);
+				return -ENOMEM;
+			}
+		}
+		q->hhf_arrays_reset_timestamp = hhf_time_stamp();
+
+		/* Initialize valid bits of heavy-hitter filter arrays. */
+		for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+			q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN /
+							  BITS_PER_BYTE);
+			if (!q->hhf_valid_bits[i]) {
+				hhf_destroy(sch);
+				return -ENOMEM;
+			}
+		}
+
+		/* Initialize Weighted DRR buckets. */
+		for (i = 0; i < WDRR_BUCKET_CNT; i++)
+			INIT_LIST_HEAD(&q->buckets[i].bucketchain);
+	}
+
+	return 0;
+}
+
+/* Dumps the current configuration as a nested TCA_OPTIONS attribute; the two
+ * timeouts are converted from jiffies back to microseconds.  Returns -1 if
+ * the nest or any of the attributes cannot be added to @skb.
+ */
+static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct hhf_sched_data *q = qdisc_priv(sch);
+	u32 reset_us = jiffies_to_usecs(q->hhf_reset_timeout);
+	u32 evict_us = jiffies_to_usecs(q->hhf_evict_timeout);
+	struct nlattr *nest = nla_nest_start(skb, TCA_OPTIONS);
+
+	if (!nest)
+		return -1;
+
+	if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) ||
+	    nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) ||
+	    nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) ||
+	    nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT, reset_us) ||
+	    nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) ||
+	    nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT, evict_us) ||
+	    nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight))
+		return -1;
+
+	return nla_nest_end(skb, nest);
+}
+
+static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	const struct hhf_sched_data *q = qdisc_priv(sch);
+	struct tc_hhf_xstats xstats = {	/* snapshot of the HHF counters */
+		.drop_overlimit = q->drop_overlimit,
+		.hh_overlimit	= q->hh_flows_overlimit,
+		.hh_tot_count	= q->hh_flows_total_cnt,
+		.hh_cur_count	= q->hh_flows_current_cnt,
+	};
+
+	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+static struct Qdisc_ops hhf_qdisc_ops __read_mostly = {
+	.id		=	"hhf",
+	.priv_size	=	sizeof(struct hhf_sched_data),
+	/* All handlers below except .peek are defined in this file. */
+	.enqueue	=	hhf_enqueue,
+	.dequeue	=	hhf_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.drop		=	hhf_drop,
+	.init		=	hhf_init,
+	.reset		=	hhf_reset,
+	.destroy	=	hhf_destroy,
+	.change		=	hhf_change,
+	.dump		=	hhf_dump,
+	.dump_stats	=	hhf_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init hhf_module_init(void)
+{
+	return register_qdisc(&hhf_qdisc_ops);	/* result is passed through */
+}
+
+static void __exit hhf_module_exit(void)
+{
+	unregister_qdisc(&hhf_qdisc_ops);	/* undoes hhf_module_init() */
+}
+
+module_init(hhf_module_init)
+module_exit(hhf_module_exit)
+MODULE_AUTHOR("Terry Lam");
+MODULE_AUTHOR("Nandita Dukkipati");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 29b942ce9e8..9f949abcace 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -38,6 +38,7 @@
 #include <linux/workqueue.h>
 #include <linux/slab.h>
 #include <net/netlink.h>
+#include <net/sch_generic.h>
 #include <net/pkt_sched.h>
 
 /* HTB algorithm.
@@ -64,6 +65,10 @@ static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis f
 module_param    (htb_hysteresis, int, 0640);
 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
 
+static int htb_rate_est = 0; /* htb classes have a default rate estimator */
+module_param(htb_rate_est, int, 0640);
+MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
+
 /* used internaly to keep status of single class */
 enum htb_cmode {
 	HTB_CANT_SEND,		/* class can't send and can't borrow */
@@ -71,95 +76,105 @@ enum htb_cmode {
 	HTB_CAN_SEND		/* class can send */
 };
 
-/* interior & leaf nodes; props specific to leaves are marked L: */
+struct htb_prio {
+	union {
+		struct rb_root	row;
+		struct rb_root	feed;
+	};
+	struct rb_node	*ptr;
+	/* When class changes from state 1->2 and disconnects from
+	 * parent's feed then we lost ptr value and start from the
+	 * first child again. Here we store classid of the
+	 * last valid ptr (used when ptr is NULL).
+	 */
+	u32		last_ptr_id;
+};
+
+/* interior & leaf nodes; props specific to leaves are marked L:
+ * To reduce false sharing, place mostly read fields at beginning,
+ * and mostly written ones at the end.
+ */
 struct htb_class {
 	struct Qdisc_class_common common;
-	/* general class parameters */
-	struct gnet_stats_basic_packed bstats;
-	struct gnet_stats_queue qstats;
-	struct gnet_stats_rate_est rate_est;
-	struct tc_htb_xstats xstats;	/* our special stats */
-	int refcnt;		/* usage count of this class */
+	struct psched_ratecfg	rate;
+	struct psched_ratecfg	ceil;
+	s64			buffer, cbuffer;/* token bucket depth/rate */
+	s64			mbuffer;	/* max wait time */
+	u32			prio;		/* these two are used only by leaves... */
+	int			quantum;	/* but stored for parent-to-leaf return */
 
-	/* topology */
-	int level;		/* our level (see above) */
-	unsigned int children;
-	struct htb_class *parent;	/* parent class */
+	struct tcf_proto	*filter_list;	/* class attached filters */
+	int			filter_cnt;
+	int			refcnt;		/* usage count of this class */
 
-	int prio;		/* these two are used only by leaves... */
-	int quantum;		/* but stored for parent-to-leaf return */
+	int			level;		/* our level (see above) */
+	unsigned int		children;
+	struct htb_class	*parent;	/* parent class */
+
+	struct gnet_stats_rate_est64 rate_est;
+
+	/*
+	 * Written often fields
+	 */
+	struct gnet_stats_basic_packed bstats;
+	struct gnet_stats_queue	qstats;
+	struct tc_htb_xstats	xstats;	/* our special stats */
+
+	/* token bucket parameters */
+	s64			tokens, ctokens;/* current number of tokens */
+	s64			t_c;		/* checkpoint time */
 
 	union {
 		struct htb_class_leaf {
-			struct Qdisc *q;
-			int deficit[TC_HTB_MAXDEPTH];
 			struct list_head drop_list;
+			int		deficit[TC_HTB_MAXDEPTH];
+			struct Qdisc	*q;
 		} leaf;
 		struct htb_class_inner {
-			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
-			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
-			/* When class changes from state 1->2 and disconnects from
-			 * parent's feed then we lost ptr value and start from the
-			 * first child again. Here we store classid of the
-			 * last valid ptr (used when ptr is NULL).
-			 */
-			u32 last_ptr_id[TC_HTB_NUMPRIO];
+			struct htb_prio clprio[TC_HTB_NUMPRIO];
 		} inner;
 	} un;
-	struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
-	struct rb_node pq_node;	/* node for event queue */
-	psched_time_t pq_key;
+	s64			pq_key;
 
-	int prio_activity;	/* for which prios are we active */
-	enum htb_cmode cmode;	/* current mode of the class */
-
-	/* class attached filters */
-	struct tcf_proto *filter_list;
-	int filter_cnt;
+	int			prio_activity;	/* for which prios are we active */
+	enum htb_cmode		cmode;		/* current mode of the class */
+	struct rb_node		pq_node;	/* node for event queue */
+	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
+};
 
-	/* token bucket parameters */
-	struct qdisc_rate_table *rate;	/* rate table of the class itself */
-	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
-	long buffer, cbuffer;	/* token bucket depth/rate */
-	psched_tdiff_t mbuffer;	/* max wait time */
-	long tokens, ctokens;	/* current number of tokens */
-	psched_time_t t_c;	/* checkpoint time */
+struct htb_level {
+	struct rb_root	wait_pq;
+	struct htb_prio hprio[TC_HTB_NUMPRIO];
 };
 
 struct htb_sched {
 	struct Qdisc_class_hash clhash;
-	struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
-
-	/* self list - roots of self generating tree */
-	struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
-	int row_mask[TC_HTB_MAXDEPTH];
-	struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
-	u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
+	int			defcls;		/* class where unclassified flows go to */
+	int			rate2quantum;	/* quant = rate / rate2quantum */
 
-	/* self wait list - roots of wait PQs per row */
-	struct rb_root wait_pq[TC_HTB_MAXDEPTH];
+	/* filters for qdisc itself */
+	struct tcf_proto	*filter_list;
 
-	/* time of nearest event per level (row) */
-	psched_time_t near_ev_cache[TC_HTB_MAXDEPTH];
+#define HTB_WARN_TOOMANYEVENTS	0x1
+	unsigned int		warned;	/* only one warning */
+	int			direct_qlen;
+	struct work_struct	work;
 
-	int defcls;		/* class where unclassified flows go to */
+	/* non shaped skbs; let them go directly thru */
+	struct sk_buff_head	direct_queue;
+	long			direct_pkts;
 
-	/* filters for qdisc itself */
-	struct tcf_proto *filter_list;
+	struct qdisc_watchdog	watchdog;
 
-	int rate2quantum;	/* quant = rate / rate2quantum */
-	psched_time_t now;	/* cached dequeue time */
-	struct qdisc_watchdog watchdog;
+	s64			now;	/* cached dequeue time */
+	struct list_head	drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
 
-	/* non shaped skbs; let them go directly thru */
-	struct sk_buff_head direct_queue;
-	int direct_qlen;	/* max qlen of above */
+	/* time of nearest event per level (row) */
+	s64			near_ev_cache[TC_HTB_MAXDEPTH];
 
-	long direct_pkts;
+	int			row_mask[TC_HTB_MAXDEPTH];
 
-#define HTB_WARN_TOOMANYEVENTS	0x1
-	unsigned int warned;	/* only one warning */
-	struct work_struct work;
+	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
 };
 
 /* find class in global hash table using given handle */
@@ -204,11 +219,16 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
 	if (skb->priority == sch->handle)
 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
 	cl = htb_find(skb->priority, sch);
-	if (cl && cl->level == 0)
-		return cl;
+	if (cl) {
+		if (cl->level == 0)
+			return cl;
+		/* Start with inner filter chain if a non-leaf class is selected */
+		tcf = cl->filter_list;
+	} else {
+		tcf = q->filter_list;
+	}
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	tcf = q->filter_list;
 	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
@@ -273,9 +293,9 @@ static void htb_add_to_id_tree(struct rb_root *root,
  * already in the queue.
  */
 static void htb_add_to_wait_tree(struct htb_sched *q,
-				 struct htb_class *cl, long delay)
+				 struct htb_class *cl, s64 delay)
 {
-	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
+	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
 
 	cl->pq_key = q->now + delay;
 	if (cl->pq_key == q->now)
@@ -295,7 +315,7 @@ static void htb_add_to_wait_tree(struct htb_sched *q,
 			p = &parent->rb_left;
 	}
 	rb_link_node(&cl->pq_node, parent, p);
-	rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
+	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
 }
 
 /**
@@ -322,7 +342,7 @@ static inline void htb_add_class_to_row(struct htb_sched *q,
 	while (mask) {
 		int prio = ffz(~mask);
 		mask &= ~(1 << prio);
-		htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
+		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
 	}
 }
 
@@ -348,16 +368,18 @@ static inline void htb_remove_class_from_row(struct htb_sched *q,
 						 struct htb_class *cl, int mask)
 {
 	int m = 0;
+	struct htb_level *hlevel = &q->hlevel[cl->level];
 
 	while (mask) {
 		int prio = ffz(~mask);
+		struct htb_prio *hprio = &hlevel->hprio[prio];
 
 		mask &= ~(1 << prio);
-		if (q->ptr[cl->level][prio] == cl->node + prio)
-			htb_next_rb_node(q->ptr[cl->level] + prio);
+		if (hprio->ptr == cl->node + prio)
+			htb_next_rb_node(&hprio->ptr);
 
-		htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
-		if (!q->row[cl->level][prio].rb_node)
+		htb_safe_rb_erase(cl->node + prio, &hprio->row);
+		if (!hprio->row.rb_node)
 			m |= 1 << prio;
 	}
 	q->row_mask[cl->level] &= ~m;
@@ -381,13 +403,13 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
 			int prio = ffz(~m);
 			m &= ~(1 << prio);
 
-			if (p->un.inner.feed[prio].rb_node)
+			if (p->un.inner.clprio[prio].feed.rb_node)
 				/* parent already has its feed in use so that
 				 * reset bit in mask as parent is already ok
 				 */
 				mask &= ~(1 << prio);
 
-			htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
+			htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
 		}
 		p->prio_activity |= mask;
 		cl = p;
@@ -417,18 +439,19 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
 			int prio = ffz(~m);
 			m &= ~(1 << prio);
 
-			if (p->un.inner.ptr[prio] == cl->node + prio) {
+			if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
 				/* we are removing child which is pointed to from
 				 * parent feed - forget the pointer but remember
 				 * classid
 				 */
-				p->un.inner.last_ptr_id[prio] = cl->common.classid;
-				p->un.inner.ptr[prio] = NULL;
+				p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
+				p->un.inner.clprio[prio].ptr = NULL;
 			}
 
-			htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);
+			htb_safe_rb_erase(cl->node + prio,
+					  &p->un.inner.clprio[prio].feed);
 
-			if (!p->un.inner.feed[prio].rb_node)
+			if (!p->un.inner.clprio[prio].feed.rb_node)
 				mask |= 1 << prio;
 		}
 
@@ -441,14 +464,14 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
 		htb_remove_class_from_row(q, cl, mask);
 }
 
-static inline long htb_lowater(const struct htb_class *cl)
+static inline s64 htb_lowater(const struct htb_class *cl)
 {
 	if (htb_hysteresis)
 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
 	else
 		return 0;
 }
-static inline long htb_hiwater(const struct htb_class *cl)
+static inline s64 htb_hiwater(const struct htb_class *cl)
 {
 	if (htb_hysteresis)
 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
@@ -469,9 +492,9 @@ static inline long htb_hiwater(const struct htb_class *cl)
  * mode transitions per time unit. The speed gain is about 1/6.
  */
 static inline enum htb_cmode
-htb_class_mode(struct htb_class *cl, long *diff)
+htb_class_mode(struct htb_class *cl, s64 *diff)
 {
-	long toks;
+	s64 toks;
 
 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
 		*diff = -toks;
@@ -495,7 +518,7 @@ htb_class_mode(struct htb_class *cl, long *diff)
  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
  */
 static void
-htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
+htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
 {
 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
 
@@ -558,9 +581,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 			__skb_queue_tail(&q->direct_queue, skb);
 			q->direct_pkts++;
 		} else {
-			kfree_skb(skb);
-			sch->qstats.drops++;
-			return NET_XMIT_DROP;
+			return qdisc_drop(skb, sch);
 		}
 #ifdef CONFIG_NET_CLS_ACT
 	} else if (!cl) {
@@ -576,7 +597,6 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		}
 		return ret;
 	} else {
-		bstats_update(&cl->bstats, skb);
 		htb_activate(q, cl);
 	}
 
@@ -584,26 +604,26 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	return NET_XMIT_SUCCESS;
 }
 
-static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, long diff)
+static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
 {
-	long toks = diff + cl->tokens;
+	s64 toks = diff + cl->tokens;
 
 	if (toks > cl->buffer)
 		toks = cl->buffer;
-	toks -= (long) qdisc_l2t(cl->rate, bytes);
+	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
 	if (toks <= -cl->mbuffer)
 		toks = 1 - cl->mbuffer;
 
 	cl->tokens = toks;
 }
 
-static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, long diff)
+static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
 {
-	long toks = diff + cl->ctokens;
+	s64 toks = diff + cl->ctokens;
 
 	if (toks > cl->cbuffer)
 		toks = cl->cbuffer;
-	toks -= (long) qdisc_l2t(cl->ceil, bytes);
+	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
 	if (toks <= -cl->mbuffer)
 		toks = 1 - cl->mbuffer;
 
@@ -626,10 +646,10 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
 {
 	int bytes = qdisc_pkt_len(skb);
 	enum htb_cmode old_mode;
-	long diff;
+	s64 diff;
 
 	while (cl) {
-		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
+		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 		if (cl->level >= level) {
 			if (cl->level == level)
 				cl->xstats.lends++;
@@ -646,7 +666,7 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
 		htb_change_class_mode(q, cl, &diff);
 		if (old_mode != cl->cmode) {
 			if (old_mode != HTB_CAN_SEND)
-				htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
+				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
 			if (cl->cmode != HTB_CAN_SEND)
 				htb_add_to_wait_tree(q, cl, diff);
 		}
@@ -666,18 +686,20 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
  * next pending event (0 for no event in pq, q->now for too many events).
  * Note: Applied are events whose have cl->pq_key <= q->now.
  */
-static psched_time_t htb_do_events(struct htb_sched *q, int level,
-				   unsigned long start)
+static s64 htb_do_events(struct htb_sched *q, const int level,
+			 unsigned long start)
 {
 	/* don't run for longer than 2 jiffies; 2 is used instead of
 	 * 1 to simplify things when jiffy is going to be incremented
 	 * too soon
 	 */
 	unsigned long stop_at = start + 2;
+	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
+
 	while (time_before(jiffies, stop_at)) {
 		struct htb_class *cl;
-		long diff;
-		struct rb_node *p = rb_first(&q->wait_pq[level]);
+		s64 diff;
+		struct rb_node *p = rb_first(wait_pq);
 
 		if (!p)
 			return 0;
@@ -686,8 +708,8 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
 		if (cl->pq_key > q->now)
 			return cl->pq_key;
 
-		htb_safe_rb_erase(p, q->wait_pq + level);
-		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
+		htb_safe_rb_erase(p, wait_pq);
+		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
 		htb_change_class_mode(q, cl, &diff);
 		if (cl->cmode != HTB_CAN_SEND)
 			htb_add_to_wait_tree(q, cl, diff);
@@ -695,7 +717,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
 
 	/* too much load - let's continue after a break for scheduling */
 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
-		pr_warning("htb: too many events!\n");
+		pr_warn("htb: too many events!\n");
 		q->warned |= HTB_WARN_TOOMANYEVENTS;
 	}
 
@@ -730,8 +752,7 @@ static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
  *
  * Find leaf where current feed pointers points to.
  */
-static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
-					 struct rb_node **pptr, u32 * pid)
+static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
 {
 	int i;
 	struct {
@@ -740,10 +761,10 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
 		u32 *pid;
 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
 
-	BUG_ON(!tree->rb_node);
-	sp->root = tree->rb_node;
-	sp->pptr = pptr;
-	sp->pid = pid;
+	BUG_ON(!hprio->row.rb_node);
+	sp->root = hprio->row.rb_node;
+	sp->pptr = &hprio->ptr;
+	sp->pid = &hprio->last_ptr_id;
 
 	for (i = 0; i < 65535; i++) {
 		if (!*sp->pptr && *sp->pid) {
@@ -770,12 +791,15 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
 			}
 		} else {
 			struct htb_class *cl;
+			struct htb_prio *clp;
+
 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
 			if (!cl->level)
 				return cl;
-			(++sp)->root = cl->un.inner.feed[prio].rb_node;
-			sp->pptr = cl->un.inner.ptr + prio;
-			sp->pid = cl->un.inner.last_ptr_id + prio;
+			clp = &cl->un.inner.clprio[prio];
+			(++sp)->root = clp->feed.rb_node;
+			sp->pptr = &clp->ptr;
+			sp->pid = &clp->last_ptr_id;
 		}
 	}
 	WARN_ON(1);
@@ -785,15 +809,16 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
 /* dequeues packet at given priority and level; call only if
  * you are sure that there is active class at prio/level
  */
-static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
-					int level)
+static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
+					const int level)
 {
 	struct sk_buff *skb = NULL;
 	struct htb_class *cl, *start;
+	struct htb_level *hlevel = &q->hlevel[level];
+	struct htb_prio *hprio = &hlevel->hprio[prio];
+
 	/* look initial class up in the row */
-	start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
-				     q->ptr[level] + prio,
-				     q->last_ptr_id[level] + prio);
+	start = cl = htb_lookup_leaf(hprio, prio);
 
 	do {
 next:
@@ -813,9 +838,7 @@ next:
 			if ((q->row_mask[level] & (1 << prio)) == 0)
 				return NULL;
 
-			next = htb_lookup_leaf(q->row[level] + prio,
-					       prio, q->ptr[level] + prio,
-					       q->last_ptr_id[level] + prio);
+			next = htb_lookup_leaf(hprio, prio);
 
 			if (cl == start)	/* fix start if we just deleted it */
 				start = next;
@@ -828,20 +851,19 @@ next:
 			break;
 
 		qdisc_warn_nonwc("htb", cl->un.leaf.q);
-		htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
-				  ptr[0]) + prio);
-		cl = htb_lookup_leaf(q->row[level] + prio, prio,
-				     q->ptr[level] + prio,
-				     q->last_ptr_id[level] + prio);
+		htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
+					 &q->hlevel[0].hprio[prio].ptr);
+		cl = htb_lookup_leaf(hprio, prio);
 
 	} while (cl != start);
 
 	if (likely(skb != NULL)) {
+		bstats_update(&cl->bstats, skb);
 		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
 		if (cl->un.leaf.deficit[level] < 0) {
 			cl->un.leaf.deficit[level] += cl->quantum;
-			htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
-					  ptr[0]) + prio);
+			htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
+						 &q->hlevel[0].hprio[prio].ptr);
 		}
 		/* this used to be after charge_class but this constelation
 		 * gives us slightly better performance
@@ -858,7 +880,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
 	struct sk_buff *skb;
 	struct htb_sched *q = qdisc_priv(sch);
 	int level;
-	psched_time_t next_event;
+	s64 next_event;
 	unsigned long start_at;
 
 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
@@ -873,23 +895,22 @@ ok:
 
 	if (!sch->q.qlen)
 		goto fin;
-	q->now = psched_get_time();
+	q->now = ktime_to_ns(ktime_get());
 	start_at = jiffies;
 
-	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
+	next_event = q->now + 5LLU * NSEC_PER_SEC;
 
 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
 		/* common case optimization - skip event handler quickly */
 		int m;
-		psched_time_t event;
+		s64 event = q->near_ev_cache[level];
 
-		if (q->now >= q->near_ev_cache[level]) {
+		if (q->now >= event) {
 			event = htb_do_events(q, level, start_at);
 			if (!event)
-				event = q->now + PSCHED_TICKS_PER_SEC;
+				event = q->now + NSEC_PER_SEC;
 			q->near_ev_cache[level] = event;
-		} else
-			event = q->near_ev_cache[level];
+		}
 
 		if (next_event > event)
 			next_event = event;
@@ -905,10 +926,17 @@ ok:
 		}
 	}
 	sch->qstats.overlimits++;
-	if (likely(next_event > q->now))
-		qdisc_watchdog_schedule(&q->watchdog, next_event);
-	else
+	if (likely(next_event > q->now)) {
+		if (!test_bit(__QDISC_STATE_DEACTIVATED,
+			      &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
+			ktime_t time = ns_to_ktime(next_event);
+			qdisc_throttled(q->watchdog.qdisc);
+			hrtimer_start(&q->watchdog.timer, time,
+				      HRTIMER_MODE_ABS);
+		}
+	} else {
 		schedule_work(&q->work);
+	}
 fin:
 	return skb;
 }
@@ -943,11 +971,10 @@ static void htb_reset(struct Qdisc *sch)
 {
 	struct htb_sched *q = qdisc_priv(sch);
 	struct htb_class *cl;
-	struct hlist_node *n;
 	unsigned int i;
 
 	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
+		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
 			if (cl->level)
 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
 			else {
@@ -963,10 +990,8 @@ static void htb_reset(struct Qdisc *sch)
 	qdisc_watchdog_cancel(&q->watchdog);
 	__skb_queue_purge(&q->direct_queue);
 	sch->q.qlen = 0;
-	memset(q->row, 0, sizeof(q->row));
+	memset(q->hlevel, 0, sizeof(q->hlevel));
 	memset(q->row_mask, 0, sizeof(q->row_mask));
-	memset(q->wait_pq, 0, sizeof(q->wait_pq));
-	memset(q->ptr, 0, sizeof(q->ptr));
 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
 		INIT_LIST_HEAD(q->drops + i);
 }
@@ -976,6 +1001,9 @@ static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
+	[TCA_HTB_RATE64] = { .type = NLA_U64 },
+	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
 };
 
 static void htb_work_func(struct work_struct *work)
@@ -989,7 +1017,7 @@ static void htb_work_func(struct work_struct *work)
 static int htb_init(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct htb_sched *q = qdisc_priv(sch);
-	struct nlattr *tb[TCA_HTB_INIT + 1];
+	struct nlattr *tb[TCA_HTB_MAX + 1];
 	struct tc_htb_glob *gopt;
 	int err;
 	int i;
@@ -997,20 +1025,16 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
 	if (!opt)
 		return -EINVAL;
 
-	err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy);
+	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
 	if (err < 0)
 		return err;
 
-	if (tb[TCA_HTB_INIT] == NULL) {
-		pr_err("HTB: hey probably you have bad tc tool ?\n");
+	if (!tb[TCA_HTB_INIT])
 		return -EINVAL;
-	}
+
 	gopt = nla_data(tb[TCA_HTB_INIT]);
-	if (gopt->version != HTB_VER >> 16) {
-		pr_err("HTB: need tc/htb version %d (minor is %d), you have %d\n",
-		       HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
+	if (gopt->version != HTB_VER >> 16)
 		return -EINVAL;
-	}
 
 	err = qdisc_class_hash_init(&q->clhash);
 	if (err < 0)
@@ -1022,10 +1046,13 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
 	INIT_WORK(&q->work, htb_work_func);
 	skb_queue_head_init(&q->direct_queue);
 
-	q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
-	if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
-		q->direct_qlen = 2;
-
+	if (tb[TCA_HTB_DIRECT_QLEN])
+		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
+	else {
+		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
+		if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
+			q->direct_qlen = 2;
+	}
 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
 		q->rate2quantum = 1;
 	q->defcls = gopt->defcls;
@@ -1035,12 +1062,13 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
 
 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
-	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
 	struct htb_sched *q = qdisc_priv(sch);
 	struct nlattr *nest;
 	struct tc_htb_glob gopt;
 
-	spin_lock_bh(root_lock);
+	/* It's safe to not acquire qdisc lock. As we hold RTNL,
+	 * no change can happen on the qdisc parameters.
+	 */
 
 	gopt.direct_pkts = q->direct_pkts;
 	gopt.version = HTB_VER;
@@ -1051,14 +1079,13 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
 	nest = nla_nest_start(skb, TCA_OPTIONS);
 	if (nest == NULL)
 		goto nla_put_failure;
-	NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
-	nla_nest_end(skb, nest);
+	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
+	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
+		goto nla_put_failure;
 
-	spin_unlock_bh(root_lock);
-	return skb->len;
+	return nla_nest_end(skb, nest);
 
 nla_put_failure:
-	spin_unlock_bh(root_lock);
 	nla_nest_cancel(skb, nest);
 	return -1;
 }
@@ -1067,11 +1094,12 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
 			  struct sk_buff *skb, struct tcmsg *tcm)
 {
 	struct htb_class *cl = (struct htb_class *)arg;
-	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
 	struct nlattr *nest;
 	struct tc_htb_opt opt;
 
-	spin_lock_bh(root_lock);
+	/* It's safe to not acquire qdisc lock. As we hold RTNL,
+	 * no change can happen on the class parameters.
+	 */
 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
 	tcm->tcm_handle = cl->common.classid;
 	if (!cl->level && cl->un.leaf.q)
@@ -1083,21 +1111,25 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
 
 	memset(&opt, 0, sizeof(opt));
 
-	opt.rate = cl->rate->rate;
-	opt.buffer = cl->buffer;
-	opt.ceil = cl->ceil->rate;
-	opt.cbuffer = cl->cbuffer;
+	psched_ratecfg_getrate(&opt.rate, &cl->rate);
+	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
+	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
+	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
 	opt.quantum = cl->quantum;
 	opt.prio = cl->prio;
 	opt.level = cl->level;
-	NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);
+	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
+	    nla_put_u64(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps))
+		goto nla_put_failure;
+	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
+	    nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps))
+		goto nla_put_failure;
 
-	nla_nest_end(skb, nest);
-	spin_unlock_bh(root_lock);
-	return skb->len;
+	return nla_nest_end(skb, nest);
 
 nla_put_failure:
-	spin_unlock_bh(root_lock);
 	nla_nest_cancel(skb, nest);
 	return -1;
 }
@@ -1109,8 +1141,8 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
 
 	if (!cl->level && cl->un.leaf.q)
 		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
-	cl->xstats.tokens = cl->tokens;
-	cl->xstats.ctokens = cl->ctokens;
+	cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens);
+	cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens);
 
 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
 	    gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
@@ -1184,7 +1216,8 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
 	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
 
 	if (parent->cmode != HTB_CAN_SEND)
-		htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level);
+		htb_safe_rb_erase(&parent->pq_node,
+				  &q->hlevel[parent->level].wait_pq);
 
 	parent->level = 0;
 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
@@ -1192,7 +1225,7 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
 	parent->tokens = parent->buffer;
 	parent->ctokens = parent->cbuffer;
-	parent->t_c = psched_get_time();
+	parent->t_c = ktime_to_ns(ktime_get());
 	parent->cmode = HTB_CAN_SEND;
 }
 
@@ -1203,9 +1236,6 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
 		qdisc_destroy(cl->un.leaf.q);
 	}
 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
-	qdisc_put_rtab(cl->rate);
-	qdisc_put_rtab(cl->ceil);
-
 	tcf_destroy_chain(&cl->filter_list);
 	kfree(cl);
 }
@@ -1213,7 +1243,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
 static void htb_destroy(struct Qdisc *sch)
 {
 	struct htb_sched *q = qdisc_priv(sch);
-	struct hlist_node *n, *next;
+	struct hlist_node *next;
 	struct htb_class *cl;
 	unsigned int i;
 
@@ -1227,11 +1257,11 @@ static void htb_destroy(struct Qdisc *sch)
 	tcf_destroy_chain(&q->filter_list);
 
 	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
+		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode)
 			tcf_destroy_chain(&cl->filter_list);
 	}
 	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
+		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
 					  common.hnode)
 			htb_destroy_class(sch, cl);
 	}
@@ -1247,9 +1277,10 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
 	struct Qdisc *new_q = NULL;
 	int last_child = 0;
 
-	// TODO: why don't allow to delete subtree ? references ? does
-	// tc subsys quarantee us that in htb_destroy it holds no class
-	// refs so that we can remove children safely there ?
+	/* TODO: why don't we allow deleting a subtree? References? Does
+	 * the tc subsystem guarantee that it holds no class refs in
+	 * htb_destroy, so that we can remove children safely there?
+	 */
 	if (cl->children || cl->filter_cnt)
 		return -EBUSY;
 
@@ -1276,7 +1307,8 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
 		htb_deactivate(q, cl);
 
 	if (cl->cmode != HTB_CAN_SEND)
-		htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
+		htb_safe_rb_erase(&cl->pq_node,
+				  &q->hlevel[cl->level].wait_pq);
 
 	if (last_child)
 		htb_parent_to_leaf(q, cl, new_q);
@@ -1307,9 +1339,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 	struct htb_sched *q = qdisc_priv(sch);
 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
 	struct nlattr *opt = tca[TCA_OPTIONS];
-	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
-	struct nlattr *tb[__TCA_HTB_MAX];
+	struct nlattr *tb[TCA_HTB_MAX + 1];
 	struct tc_htb_opt *hopt;
+	u64 rate64, ceil64;
 
 	/* extract all subattrs from opt attr */
 	if (!opt)
@@ -1326,12 +1358,16 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
 
 	hopt = nla_data(tb[TCA_HTB_PARMS]);
-
-	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
-	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
-	if (!rtab || !ctab)
+	if (!hopt->rate.rate || !hopt->ceil.rate)
 		goto failure;
 
+	/* Keep backward compatibility with rate_table-based iproute2 tc */
+	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
+		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]));
+
+	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
+		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]));
+
 	if (!cl) {		/* new class */
 		struct Qdisc *new_q;
 		int prio;
@@ -1365,12 +1401,14 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		if (!cl)
 			goto failure;
 
-		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
-					qdisc_root_sleeping_lock(sch),
-					tca[TCA_RATE] ? : &est.nla);
-		if (err) {
-			kfree(cl);
-			goto failure;
+		if (htb_rate_est || tca[TCA_RATE]) {
+			err = gen_new_estimator(&cl->bstats, &cl->rate_est,
+						qdisc_root_sleeping_lock(sch),
+						tca[TCA_RATE] ? : &est.nla);
+			if (err) {
+				kfree(cl);
+				goto failure;
+			}
 		}
 
 		cl->refcnt = 1;
@@ -1400,7 +1438,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 
 			/* remove from evt list because of level change */
 			if (parent->cmode != HTB_CAN_SEND) {
-				htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
+				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
 				parent->cmode = HTB_CAN_SEND;
 			}
 			parent->level = (parent->parent ? parent->parent->level
@@ -1414,10 +1452,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		cl->parent = parent;
 
 		/* set class to be in HTB_CAN_SEND state */
-		cl->tokens = hopt->buffer;
-		cl->ctokens = hopt->cbuffer;
-		cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC;	/* 1min */
-		cl->t_c = psched_get_time();
+		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
+		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
+		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
+		cl->t_c = ktime_to_ns(ktime_get());
 		cl->cmode = HTB_CAN_SEND;
 
 		/* attach to the hash list and parent's family */
@@ -1435,21 +1473,30 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		sch_tree_lock(sch);
 	}
 
+	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
+
+	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
+
+	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
+	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
+
 	/* it used to be a nasty bug here, we have to check that node
 	 * is really leaf before changing cl->un.leaf !
 	 */
 	if (!cl->level) {
-		cl->quantum = rtab->rate.rate / q->rate2quantum;
+		u64 quantum = cl->rate.rate_bytes_ps;
+
+		do_div(quantum, q->rate2quantum);
+		cl->quantum = min_t(u64, quantum, INT_MAX);
+
 		if (!hopt->quantum && cl->quantum < 1000) {
-			pr_warning(
-			       "HTB: quantum of class %X is small. Consider r2q change.\n",
-			       cl->common.classid);
+			pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n",
+				cl->common.classid);
 			cl->quantum = 1000;
 		}
 		if (!hopt->quantum && cl->quantum > 200000) {
-			pr_warning(
-			       "HTB: quantum of class %X is big. Consider r2q change.\n",
-			       cl->common.classid);
+			pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n",
+				cl->common.classid);
 			cl->quantum = 200000;
 		}
 		if (hopt->quantum)
@@ -1458,14 +1505,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 			cl->prio = TC_HTB_NUMPRIO - 1;
 	}
 
-	cl->buffer = hopt->buffer;
-	cl->cbuffer = hopt->cbuffer;
-	if (cl->rate)
-		qdisc_put_rtab(cl->rate);
-	cl->rate = rtab;
-	if (cl->ceil)
-		qdisc_put_rtab(cl->ceil);
-	cl->ceil = ctab;
+	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
+	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
+
 	sch_tree_unlock(sch);
 
 	qdisc_class_hash_grow(sch, &q->clhash);
@@ -1474,10 +1516,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 	return 0;
 
 failure:
-	if (rtab)
-		qdisc_put_rtab(rtab);
-	if (ctab)
-		qdisc_put_rtab(ctab);
 	return err;
 }
 
@@ -1521,14 +1559,13 @@ static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 {
 	struct htb_sched *q = qdisc_priv(sch);
 	struct htb_class *cl;
-	struct hlist_node *n;
 	unsigned int i;
 
 	if (arg->stop)
 		return;
 
 	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
+		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
 			if (arg->count < arg->skip) {
 				arg->count++;
 				continue;
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index bce1665239b..62871c14e1f 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -100,8 +100,7 @@ static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
 	nest = nla_nest_start(skb, TCA_OPTIONS);
 	if (nest == NULL)
 		goto nla_put_failure;
-	nla_nest_end(skb, nest);
-	return skb->len;
+	return nla_nest_end(skb, nest);
 
 nla_put_failure:
 	nla_nest_cancel(skb, nest);
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 0a4b2f9a009..a8b2864a696 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -57,12 +57,13 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
 
 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
 		dev_queue = netdev_get_tx_queue(dev, ntx);
-		qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
+		qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
 					  TC_H_MAKE(TC_H_MAJ(sch->handle),
 						    TC_H_MIN(ntx + 1)));
 		if (qdisc == NULL)
 			goto err;
 		priv->qdiscs[ntx] = qdisc;
+		qdisc->flags |= TCQ_F_ONETXQUEUE;
 	}
 
 	sch->flags |= TCQ_F_MQROOT;
@@ -77,14 +78,19 @@ static void mq_attach(struct Qdisc *sch)
 {
 	struct net_device *dev = qdisc_dev(sch);
 	struct mq_sched *priv = qdisc_priv(sch);
-	struct Qdisc *qdisc;
+	struct Qdisc *qdisc, *old;
 	unsigned int ntx;
 
 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
 		qdisc = priv->qdiscs[ntx];
-		qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc);
-		if (qdisc)
-			qdisc_destroy(qdisc);
+		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
+		if (old)
+			qdisc_destroy(old);
+#ifdef CONFIG_NET_SCHED
+		if (ntx < dev->real_num_tx_queues)
+			qdisc_list_add(qdisc);
+#endif
+
 	}
 	kfree(priv->qdiscs);
 	priv->qdiscs = NULL;
@@ -150,7 +156,8 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
 		dev_deactivate(dev);
 
 	*old = dev_graft_qdisc(dev_queue, new);
-
+	if (new)
+		new->flags |= TCQ_F_ONETXQUEUE;
 	if (dev->flags & IFF_UP)
 		dev_activate(dev);
 	return 0;
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 28de4309233..6749e2f540d 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -124,7 +124,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		dev_queue = netdev_get_tx_queue(dev, i);
-		qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
+		qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
 					  TC_H_MAKE(TC_H_MAJ(sch->handle),
 						    TC_H_MIN(i + 1)));
 		if (qdisc == NULL) {
@@ -132,6 +132,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 			goto err;
 		}
 		priv->qdiscs[i] = qdisc;
+		qdisc->flags |= TCQ_F_ONETXQUEUE;
 	}
 
 	/* If the mqprio options indicate that hardware should own
@@ -166,15 +167,17 @@ static void mqprio_attach(struct Qdisc *sch)
 {
 	struct net_device *dev = qdisc_dev(sch);
 	struct mqprio_sched *priv = qdisc_priv(sch);
-	struct Qdisc *qdisc;
+	struct Qdisc *qdisc, *old;
 	unsigned int ntx;
 
 	/* Attach underlying qdisc */
 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
 		qdisc = priv->qdiscs[ntx];
-		qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc);
-		if (qdisc)
-			qdisc_destroy(qdisc);
+		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
+		if (old)
+			qdisc_destroy(old);
+		if (ntx < dev->real_num_tx_queues)
+			qdisc_list_add(qdisc);
 	}
 	kfree(priv->qdiscs);
 	priv->qdiscs = NULL;
@@ -205,6 +208,9 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
 
 	*old = dev_graft_qdisc(dev_queue, new);
 
+	if (new)
+		new->flags |= TCQ_F_ONETXQUEUE;
+
 	if (dev->flags & IFF_UP)
 		dev_activate(dev);
 
@@ -247,7 +253,8 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 		opt.offset[i] = dev->tc_to_txq[i].offset;
 	}
 
-	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+		goto nla_put_failure;
 
 	return skb->len;
 nla_put_failure:
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 49131d7a744..afb050a735f 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -11,8 +11,7 @@
  * more details.
  *
  * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; if not, see <http://www.gnu.org/licenses/>.
  *
  * Author: Alexander Duyck <alexander.h.duyck@intel.com>
  */
@@ -284,7 +283,8 @@ static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	opt.bands = q->bands;
 	opt.max_bands = q->max_bands;
 
-	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+		goto nla_put_failure;
 
 	return skb->len;
 
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index e83d61ca78c..111d70fddae 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -23,9 +23,11 @@
 #include <linux/vmalloc.h>
 #include <linux/rtnetlink.h>
 #include <linux/reciprocal_div.h>
+#include <linux/rbtree.h>
 
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
 
 #define VERSION "1.3"
 
@@ -67,7 +69,8 @@
 */
 
 struct netem_sched_data {
-	/* internal t(ime)fifo qdisc uses sch->q and sch->limit */
+	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
+	struct rb_root t_root;
 
 	/* optional qdisc for classful handling (NULL at netem init) */
 	struct Qdisc	*qdisc;
@@ -78,16 +81,17 @@ struct netem_sched_data {
 	psched_tdiff_t jitter;
 
 	u32 loss;
+	u32 ecn;
 	u32 limit;
 	u32 counter;
 	u32 gap;
 	u32 duplicate;
 	u32 reorder;
 	u32 corrupt;
-	u32 rate;
+	u64 rate;
 	s32 packet_overhead;
 	u32 cell_size;
-	u32 cell_size_reciprocal;
+	struct reciprocal_value cell_size_reciprocal;
 	s32 cell_overhead;
 
 	struct crndstate {
@@ -106,6 +110,18 @@ struct netem_sched_data {
 		CLG_GILB_ELL,
 	} loss_model;
 
+	enum {
+		TX_IN_GAP_PERIOD = 1,
+		TX_IN_BURST_PERIOD,
+		LOST_IN_GAP_PERIOD,
+		LOST_IN_BURST_PERIOD,
+	} _4_state_model;
+
+	enum {
+		GOOD_STATE = 1,
+		BAD_STATE,
+	} GE_state_model;
+
 	/* Correlated Loss Generation models */
 	struct clgstate {
 		/* state of the Markov chain */
@@ -126,10 +142,35 @@ struct netem_sched_data {
  */
 struct netem_skb_cb {
 	psched_time_t	time_to_send;
+	ktime_t		tstamp_save;
 };
 
+/* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp
+ * to hold a rb_node structure.
+ *
+ * If struct sk_buff layout is changed, the following checks will complain.
+ */
+static struct rb_node *netem_rb_node(struct sk_buff *skb)
+{
+	BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0);
+	BUILD_BUG_ON(offsetof(struct sk_buff, prev) !=
+		     offsetof(struct sk_buff, next) + sizeof(skb->next));
+	BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) !=
+		     offsetof(struct sk_buff, prev) + sizeof(skb->prev));
+	BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) +
+					      sizeof(skb->prev) +
+					      sizeof(skb->tstamp));
+	return (struct rb_node *)&skb->next;
+}
+
+static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
+{
+	return (struct sk_buff *)rb;
+}
+
 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 {
+	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
 }
@@ -140,7 +181,7 @@ static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 static void init_crandom(struct crndstate *state, unsigned long rho)
 {
 	state->rho = rho;
-	state->last = net_random();
+	state->last = prandom_u32();
 }
 
 /* get_crandom - correlated random number generator
@@ -153,9 +194,9 @@ static u32 get_crandom(struct crndstate *state)
 	unsigned long answer;
 
 	if (state->rho == 0)	/* no correlation */
-		return net_random();
+		return prandom_u32();
 
-	value = net_random();
+	value = prandom_u32();
 	rho = (u64)state->rho + 1;
 	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
 	state->last = answer;
@@ -169,51 +210,52 @@ static u32 get_crandom(struct crndstate *state)
 static bool loss_4state(struct netem_sched_data *q)
 {
 	struct clgstate *clg = &q->clg;
-	u32 rnd = net_random();
+	u32 rnd = prandom_u32();
 
 	/*
 	 * Makes a comparison between rnd and the transition
 	 * probabilities outgoing from the current state, then decides the
 	 * next state and if the next packet has to be transmitted or lost.
 	 * The four states correspond to:
-	 *   1 => successfully transmitted packets within a gap period
-	 *   4 => isolated losses within a gap period
-	 *   3 => lost packets within a burst period
-	 *   2 => successfully transmitted packets within a burst period
+	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
+	 *   LOST_IN_BURST_PERIOD => isolated losses within a gap period
+	 *   LOST_IN_GAP_PERIOD => lost packets within a burst period
+	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
 	 */
 	switch (clg->state) {
-	case 1:
+	case TX_IN_GAP_PERIOD:
 		if (rnd < clg->a4) {
-			clg->state = 4;
+			clg->state = LOST_IN_BURST_PERIOD;
 			return true;
-		} else if (clg->a4 < rnd && rnd < clg->a1) {
-			clg->state = 3;
+		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
+			clg->state = LOST_IN_GAP_PERIOD;
 			return true;
-		} else if (clg->a1 < rnd)
-			clg->state = 1;
+		} else if (clg->a1 + clg->a4 < rnd) {
+			clg->state = TX_IN_GAP_PERIOD;
+		}
 
 		break;
-	case 2:
+	case TX_IN_BURST_PERIOD:
 		if (rnd < clg->a5) {
-			clg->state = 3;
+			clg->state = LOST_IN_GAP_PERIOD;
 			return true;
-		} else
-			clg->state = 2;
+		} else {
+			clg->state = TX_IN_BURST_PERIOD;
+		}
 
 		break;
-	case 3:
+	case LOST_IN_GAP_PERIOD:
 		if (rnd < clg->a3)
-			clg->state = 2;
+			clg->state = TX_IN_BURST_PERIOD;
 		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
-			clg->state = 1;
-			return true;
+			clg->state = TX_IN_GAP_PERIOD;
 		} else if (clg->a2 + clg->a3 < rnd) {
-			clg->state = 3;
+			clg->state = LOST_IN_GAP_PERIOD;
 			return true;
 		}
 		break;
-	case 4:
-		clg->state = 1;
+	case LOST_IN_BURST_PERIOD:
+		clg->state = TX_IN_GAP_PERIOD;
 		break;
 	}
 
@@ -235,15 +277,16 @@ static bool loss_gilb_ell(struct netem_sched_data *q)
 	struct clgstate *clg = &q->clg;
 
 	switch (clg->state) {
-	case 1:
-		if (net_random() < clg->a1)
-			clg->state = 2;
-		if (net_random() < clg->a4)
+	case GOOD_STATE:
+		if (prandom_u32() < clg->a1)
+			clg->state = BAD_STATE;
+		if (prandom_u32() < clg->a4)
 			return true;
-	case 2:
-		if (net_random() < clg->a2)
-			clg->state = 1;
-		if (clg->a3 > net_random())
+		break;
+	case BAD_STATE:
+		if (prandom_u32() < clg->a2)
+			clg->state = GOOD_STATE;
+		if (prandom_u32() > clg->a3)
 			return true;
 	}
 
@@ -329,29 +372,40 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche
 	return PSCHED_NS2TICKS(ticks);
 }
 
-static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
+static void tfifo_reset(struct Qdisc *sch)
 {
-	struct sk_buff_head *list = &sch->q;
-	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
-	struct sk_buff *skb;
+	struct netem_sched_data *q = qdisc_priv(sch);
+	struct rb_node *p;
 
-	if (likely(skb_queue_len(list) < sch->limit)) {
-		skb = skb_peek_tail(list);
-		/* Optimize for add at tail */
-		if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send))
-			return qdisc_enqueue_tail(nskb, sch);
+	while ((p = rb_first(&q->t_root))) {
+		struct sk_buff *skb = netem_rb_to_skb(p);
 
-		skb_queue_reverse_walk(list, skb) {
-			if (tnext >= netem_skb_cb(skb)->time_to_send)
-				break;
-		}
-
-		__skb_queue_after(list, skb, nskb);
-		sch->qstats.backlog += qdisc_pkt_len(nskb);
-		return NET_XMIT_SUCCESS;
+		rb_erase(p, &q->t_root);
+		skb->next = NULL;
+		skb->prev = NULL;
+		kfree_skb(skb);
 	}
+}
+
+static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
+	struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
+
+	while (*p) {
+		struct sk_buff *skb;
 
-	return qdisc_reshape_fail(nskb, sch);
+		parent = *p;
+		skb = netem_rb_to_skb(parent);
+		if (tnext >= netem_skb_cb(skb)->time_to_send)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+	rb_link_node(netem_rb_node(nskb), parent, p);
+	rb_insert_color(netem_rb_node(nskb), &q->t_root);
+	sch->q.qlen++;
 }
 
 /*
@@ -366,7 +420,6 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	/* We don't fill cb now as skb_unshare() may invalidate it */
 	struct netem_skb_cb *cb;
 	struct sk_buff *skb2;
-	int ret;
 	int count = 1;
 
 	/* Random duplication */
@@ -374,16 +427,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		++count;
 
 	/* Drop packet? */
-	if (loss_event(q))
-		--count;
-
+	if (loss_event(q)) {
+		if (q->ecn && INET_ECN_set_ce(skb))
+			sch->qstats.drops++; /* mark packet */
+		else
+			--count;
+	}
 	if (count == 0) {
 		sch->qstats.drops++;
 		kfree_skb(skb);
 		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	}
 
-	skb_orphan(skb);
+	/* If a delay is expected, orphan the skb. (orphaning usually takes
+	 * place at TX completion time, so _before_ the link transit delay)
+	 */
+	if (q->latency || q->jitter)
+		skb_orphan_partial(skb);
 
 	/*
 	 * If we need to duplicate packet, then re-insert at top of the
@@ -408,14 +468,18 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
 		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
 		    (skb->ip_summed == CHECKSUM_PARTIAL &&
-		     skb_checksum_help(skb))) {
-			sch->qstats.drops++;
-			return NET_XMIT_DROP;
-		}
+		     skb_checksum_help(skb)))
+			return qdisc_drop(skb, sch);
 
-		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
+		skb->data[prandom_u32() % skb_headlen(skb)] ^=
+			1<<(prandom_u32() % 8);
 	}
 
+	if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
+		return qdisc_reshape_fail(skb, sch);
+
+	sch->qstats.backlog += qdisc_pkt_len(skb);
+
 	cb = netem_skb_cb(skb);
 	if (q->gap == 0 ||		/* not doing reordering */
 	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
@@ -429,25 +493,30 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		now = psched_get_time();
 
 		if (q->rate) {
-			struct sk_buff_head *list = &sch->q;
-
-			delay += packet_len_2_sched_time(skb->len, q);
+			struct sk_buff *last;
 
-			if (!skb_queue_empty(list)) {
+			if (!skb_queue_empty(&sch->q))
+				last = skb_peek_tail(&sch->q);
+			else
+				last = netem_rb_to_skb(rb_last(&q->t_root));
+			if (last) {
 				/*
-				 * Last packet in queue is reference point (now).
-				 * First packet in queue is already in flight,
-				 * calculate this time bonus and substract
+				 * Last packet in queue is reference point (now),
+				 * calculate this time bonus and subtract
 				 * from delay.
 				 */
-				delay -= now - netem_skb_cb(skb_peek(list))->time_to_send;
-				now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
+				delay -= netem_skb_cb(last)->time_to_send - now;
+				delay = max_t(psched_tdiff_t, 0, delay);
+				now = netem_skb_cb(last)->time_to_send;
 			}
+
+			delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q);
 		}
 
 		cb->time_to_send = now + delay;
+		cb->tstamp_save = skb->tstamp;
 		++q->counter;
-		ret = tfifo_enqueue(skb, sch);
+		tfifo_enqueue(skb, sch);
 	} else {
 		/*
 		 * Do re-ordering by putting one out of N packets at the front
@@ -457,16 +526,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		q->counter = 0;
 
 		__skb_queue_head(&sch->q, skb);
-		sch->qstats.backlog += qdisc_pkt_len(skb);
 		sch->qstats.requeues++;
-		ret = NET_XMIT_SUCCESS;
-	}
-
-	if (ret != NET_XMIT_SUCCESS) {
-		if (net_xmit_drop_count(ret)) {
-			sch->qstats.drops++;
-			return ret;
-		}
 	}
 
 	return NET_XMIT_SUCCESS;
@@ -478,6 +538,22 @@ static unsigned int netem_drop(struct Qdisc *sch)
 	unsigned int len;
 
 	len = qdisc_queue_drop(sch);
+
+	if (!len) {
+		struct rb_node *p = rb_first(&q->t_root);
+
+		if (p) {
+			struct sk_buff *skb = netem_rb_to_skb(p);
+
+			rb_erase(p, &q->t_root);
+			sch->q.qlen--;
+			skb->next = NULL;
+			skb->prev = NULL;
+			len = qdisc_pkt_len(skb);
+			sch->qstats.backlog -= len;
+			kfree_skb(skb);
+		}
+	}
 	if (!len && q->qdisc && q->qdisc->ops->drop)
 	    len = q->qdisc->ops->drop(q->qdisc);
 	if (len)
@@ -490,20 +566,35 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
+	struct rb_node *p;
 
 	if (qdisc_is_throttled(sch))
 		return NULL;
 
 tfifo_dequeue:
-	skb = qdisc_peek_head(sch);
+	skb = __skb_dequeue(&sch->q);
 	if (skb) {
-		const struct netem_skb_cb *cb = netem_skb_cb(skb);
+deliver:
+		sch->qstats.backlog -= qdisc_pkt_len(skb);
+		qdisc_unthrottled(sch);
+		qdisc_bstats_update(sch, skb);
+		return skb;
+	}
+	p = rb_first(&q->t_root);
+	if (p) {
+		psched_time_t time_to_send;
+
+		skb = netem_rb_to_skb(p);
 
 		/* if more time remaining? */
-		if (cb->time_to_send <= psched_get_time()) {
-			skb = qdisc_dequeue_tail(sch);
-			if (unlikely(!skb))
-				goto qdisc_dequeue;
+		time_to_send = netem_skb_cb(skb)->time_to_send;
+		if (time_to_send <= psched_get_time()) {
+			rb_erase(p, &q->t_root);
+
+			sch->q.qlen--;
+			skb->next = NULL;
+			skb->prev = NULL;
+			skb->tstamp = netem_skb_cb(skb)->tstamp_save;
 
 #ifdef CONFIG_NET_CLS_ACT
 			/*
@@ -525,10 +616,7 @@ tfifo_dequeue:
 				}
 				goto tfifo_dequeue;
 			}
-deliver:
-			qdisc_unthrottled(sch);
-			qdisc_bstats_update(sch, skb);
-			return skb;
+			goto deliver;
 		}
 
 		if (q->qdisc) {
@@ -536,10 +624,9 @@ deliver:
 			if (skb)
 				goto deliver;
 		}
-		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
+		qdisc_watchdog_schedule(&q->watchdog, time_to_send);
 	}
 
-qdisc_dequeue:
 	if (q->qdisc) {
 		skb = q->qdisc->ops->dequeue(q->qdisc);
 		if (skb)
@@ -553,6 +640,7 @@ static void netem_reset(struct Qdisc *sch)
 	struct netem_sched_data *q = qdisc_priv(sch);
 
 	qdisc_reset_queue(sch);
+	tfifo_reset(sch);
 	if (q->qdisc)
 		qdisc_reset(q->qdisc);
 	qdisc_watchdog_cancel(&q->watchdog);
@@ -560,12 +648,7 @@ static void netem_reset(struct Qdisc *sch)
 
 static void dist_free(struct disttable *d)
 {
-	if (d) {
-		if (is_vmalloc_addr(d))
-			vfree(d);
-		else
-			kfree(d);
-	}
+	kvfree(d);
 }
 
 /*
@@ -606,9 +689,8 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
 	return 0;
 }
 
-static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
+static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
 {
-	struct netem_sched_data *q = qdisc_priv(sch);
 	const struct tc_netem_corr *c = nla_data(attr);
 
 	init_crandom(&q->delay_cor, c->delay_corr);
@@ -616,47 +698,45 @@ static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
 	init_crandom(&q->dup_cor, c->dup_corr);
 }
 
-static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
+static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
 {
-	struct netem_sched_data *q = qdisc_priv(sch);
 	const struct tc_netem_reorder *r = nla_data(attr);
 
 	q->reorder = r->probability;
 	init_crandom(&q->reorder_cor, r->correlation);
 }
 
-static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
+static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
 {
-	struct netem_sched_data *q = qdisc_priv(sch);
 	const struct tc_netem_corrupt *r = nla_data(attr);
 
 	q->corrupt = r->probability;
 	init_crandom(&q->corrupt_cor, r->correlation);
 }
 
-static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
+static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
 {
-	struct netem_sched_data *q = qdisc_priv(sch);
 	const struct tc_netem_rate *r = nla_data(attr);
 
 	q->rate = r->rate;
 	q->packet_overhead = r->packet_overhead;
 	q->cell_size = r->cell_size;
+	q->cell_overhead = r->cell_overhead;
 	if (q->cell_size)
 		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
-	q->cell_overhead = r->cell_overhead;
+	else
+		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
 }
 
-static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
+static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
 {
-	struct netem_sched_data *q = qdisc_priv(sch);
 	const struct nlattr *la;
 	int rem;
 
 	nla_for_each_nested(la, attr, rem) {
 		u16 type = nla_type(la);
 
-		switch(type) {
+		switch (type) {
 		case NETEM_LOSS_GI: {
 			const struct tc_netem_gimodel *gi = nla_data(la);
 
@@ -667,7 +747,7 @@ static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
 
 			q->loss_model = CLG_4_STATES;
 
-			q->clg.state = 1;
+			q->clg.state = TX_IN_GAP_PERIOD;
 			q->clg.a1 = gi->p13;
 			q->clg.a2 = gi->p31;
 			q->clg.a3 = gi->p32;
@@ -685,7 +765,7 @@ static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
 			}
 
 			q->loss_model = CLG_GILB_ELL;
-			q->clg.state = 1;
+			q->clg.state = GOOD_STATE;
 			q->clg.a1 = ge->p;
 			q->clg.a2 = ge->r;
 			q->clg.a3 = ge->h;
@@ -708,6 +788,8 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
 	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
 	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
 	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
+	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
+	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
 };
 
 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
@@ -734,6 +816,8 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 	struct netem_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_NETEM_MAX + 1];
 	struct tc_netem_qopt *qopt;
+	struct clgstate old_clg;
+	int old_loss_model = CLG_RANDOM;
 	int ret;
 
 	if (opt == NULL)
@@ -744,6 +828,33 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 	if (ret < 0)
 		return ret;
 
+	/* backup q->clg and q->loss_model */
+	old_clg = q->clg;
+	old_loss_model = q->loss_model;
+
+	if (tb[TCA_NETEM_LOSS]) {
+		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
+		if (ret) {
+			q->loss_model = old_loss_model;
+			return ret;
+		}
+	} else {
+		q->loss_model = CLG_RANDOM;
+	}
+
+	if (tb[TCA_NETEM_DELAY_DIST]) {
+		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
+		if (ret) {
+			/* restore clg and loss_model, in case
+			 * q->clg and q->loss_model were modified
+			 * in get_loss_clg()
+			 */
+			q->clg = old_clg;
+			q->loss_model = old_loss_model;
+			return ret;
+		}
+	}
+
 	sch->limit = qopt->limit;
 
 	q->latency = qopt->latency;
@@ -761,26 +872,23 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 		q->reorder = ~0;
 
 	if (tb[TCA_NETEM_CORR])
-		get_correlation(sch, tb[TCA_NETEM_CORR]);
-
-	if (tb[TCA_NETEM_DELAY_DIST]) {
-		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
-		if (ret)
-			return ret;
-	}
+		get_correlation(q, tb[TCA_NETEM_CORR]);
 
 	if (tb[TCA_NETEM_REORDER])
-		get_reorder(sch, tb[TCA_NETEM_REORDER]);
+		get_reorder(q, tb[TCA_NETEM_REORDER]);
 
 	if (tb[TCA_NETEM_CORRUPT])
-		get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
+		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
 
 	if (tb[TCA_NETEM_RATE])
-		get_rate(sch, tb[TCA_NETEM_RATE]);
+		get_rate(q, tb[TCA_NETEM_RATE]);
 
-	q->loss_model = CLG_RANDOM;
-	if (tb[TCA_NETEM_LOSS])
-		ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
+	if (tb[TCA_NETEM_RATE64])
+		q->rate = max_t(u64, q->rate,
+				nla_get_u64(tb[TCA_NETEM_RATE64]));
+
+	if (tb[TCA_NETEM_ECN])
+		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
 
 	return ret;
 }
@@ -836,7 +944,8 @@ static int dump_loss_model(const struct netem_sched_data *q,
 			.p23 = q->clg.a5,
 		};
 
-		NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi);
+		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
+			goto nla_put_failure;
 		break;
 	}
 	case CLG_GILB_ELL: {
@@ -847,7 +956,8 @@ static int dump_loss_model(const struct netem_sched_data *q,
 			.k1 = q->clg.a4,
 		};
 
-		NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge);
+		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
+			goto nla_put_failure;
 		break;
 	}
 	}
@@ -876,26 +986,40 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	qopt.loss = q->loss;
 	qopt.gap = q->gap;
 	qopt.duplicate = q->duplicate;
-	NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
+	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
+		goto nla_put_failure;
 
 	cor.delay_corr = q->delay_cor.rho;
 	cor.loss_corr = q->loss_cor.rho;
 	cor.dup_corr = q->dup_cor.rho;
-	NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
+	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
+		goto nla_put_failure;
 
 	reorder.probability = q->reorder;
 	reorder.correlation = q->reorder_cor.rho;
-	NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
+	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
+		goto nla_put_failure;
 
 	corrupt.probability = q->corrupt;
 	corrupt.correlation = q->corrupt_cor.rho;
-	NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
+	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
+		goto nla_put_failure;
 
-	rate.rate = q->rate;
+	if (q->rate >= (1ULL << 32)) {
+		if (nla_put_u64(skb, TCA_NETEM_RATE64, q->rate))
+			goto nla_put_failure;
+		rate.rate = ~0U;
+	} else {
+		rate.rate = q->rate;
+	}
 	rate.packet_overhead = q->packet_overhead;
 	rate.cell_size = q->cell_size;
 	rate.cell_overhead = q->cell_overhead;
-	NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate);
+	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
+		goto nla_put_failure;
+
+	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
+		goto nla_put_failure;
 
 	if (dump_loss_model(q, skb) != 0)
 		goto nla_put_failure;
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
new file mode 100644
index 00000000000..fefeeb73f15
--- /dev/null
+++ b/net/sched/sch_pie.c
@@ -0,0 +1,566 @@
+/* Copyright (C) 2013 Cisco Systems, Inc, 2013.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Author: Vijay Subramanian <vijaynsu@cisco.com>
+ * Author: Mythili Prabhu <mysuryan@cisco.com>
+ *
+ * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no>
+ * University of Oslo, Norway.
+ *
+ * References:
+ * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00
+ * IEEE  Conference on High Performance Switching and Routing 2013 :
+ * "PIE: A Lightweight Control Scheme to Address the Bufferbloat Problem"
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+
+#define QUEUE_THRESHOLD 10000
+#define DQCOUNT_INVALID -1
+#define MAX_PROB  0xffffffff
+#define PIE_SCALE 8
+
+/* parameters used */
+struct pie_params {
+	psched_time_t target;	/* user specified target delay in pschedtime */
+	u32 tupdate;		/* timer frequency (in jiffies) */
+	u32 limit;		/* number of packets that can be enqueued */
+	u32 alpha;		/* alpha and beta are between 0 and 32 */
+	u32 beta;		/* and are used for shift relative to 1 */
+	bool ecn;		/* true if ecn is enabled */
+	bool bytemode;		/* to scale drop early prob based on pkt size */
+};
+
+/* variables used */
+struct pie_vars {
+	u32 prob;		/* probability but scaled by u32 limit. */
+	psched_time_t burst_time;
+	psched_time_t qdelay;
+	psched_time_t qdelay_old;
+	u64 dq_count;		/* measured in bytes */
+	psched_time_t dq_tstamp;	/* drain rate */
+	u32 avg_dq_rate;	/* bytes per pschedtime tick,scaled */
+	u32 qlen_old;		/* in bytes */
+};
+
+/* statistics gathering */
+struct pie_stats {
+	u32 packets_in;		/* total number of packets enqueued */
+	u32 dropped;		/* packets dropped due to pie_action */
+	u32 overlimit;		/* dropped due to lack of space in queue */
+	u32 maxq;		/* maximum queue size */
+	u32 ecn_mark;		/* packets marked with ECN */
+};
+
+/* private data for the Qdisc */
+struct pie_sched_data {
+	struct pie_params params;
+	struct pie_vars vars;
+	struct pie_stats stats;
+	struct timer_list adapt_timer;
+};
+
+static void pie_params_init(struct pie_params *params)
+{
+	params->alpha = 2;
+	params->beta = 20;
+	params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC);	/* 30 ms */
+	params->limit = 1000;	/* default of 1000 packets */
+	params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC);	/* 20 ms */
+	params->ecn = false;
+	params->bytemode = false;
+}
+
+static void pie_vars_init(struct pie_vars *vars)
+{
+	vars->dq_count = DQCOUNT_INVALID;
+	vars->avg_dq_rate = 0;
+	/* default of 100 ms in pschedtime */
+	vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC);
+}
+
+static bool drop_early(struct Qdisc *sch, u32 packet_size)
+{
+	struct pie_sched_data *q = qdisc_priv(sch);
+	u32 rnd;
+	u32 local_prob = q->vars.prob;
+	u32 mtu = psched_mtu(qdisc_dev(sch));
+
+	/* If there is still burst allowance left skip random early drop */
+	if (q->vars.burst_time > 0)
+		return false;
+
+	/* If current delay is less than half of target, and
+	 * if drop prob is low already, disable early_drop
+	 */
+	if ((q->vars.qdelay < q->params.target / 2)
+	    && (q->vars.prob < MAX_PROB / 5))
+		return false;
+
+	/* If we have fewer than 2 mtu-sized packets, disable drop_early,
+	 * similar to min_th in RED
+	 */
+	if (sch->qstats.backlog < 2 * mtu)
+		return false;
+
+	/* If bytemode is turned on, use packet size to compute new
+	 * probability. Smaller packets will have lower drop prob in this case
+	 */
+	if (q->params.bytemode && packet_size <= mtu)
+		local_prob = (local_prob / mtu) * packet_size;
+	else
+		local_prob = q->vars.prob;
+
+	rnd = prandom_u32();
+	if (rnd < local_prob)
+		return true;
+
+	return false;
+}
+
+static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct pie_sched_data *q = qdisc_priv(sch);
+	bool enqueue = false;
+
+	if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
+		q->stats.overlimit++;
+		goto out;
+	}
+
+	if (!drop_early(sch, skb->len)) {
+		enqueue = true;
+	} else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) &&
+		   INET_ECN_set_ce(skb)) {
+		/* If packet is ecn capable, mark it if drop probability
+		 * is lower than 10%, else drop it.
+		 */
+		q->stats.ecn_mark++;
+		enqueue = true;
+	}
+
+	/* we can enqueue the packet */
+	if (enqueue) {
+		q->stats.packets_in++;
+		if (qdisc_qlen(sch) > q->stats.maxq)
+			q->stats.maxq = qdisc_qlen(sch);
+
+		return qdisc_enqueue_tail(skb, sch);
+	}
+
+out:
+	q->stats.dropped++;
+	return qdisc_drop(skb, sch);
+}
+
+static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
+	[TCA_PIE_TARGET] = {.type = NLA_U32},
+	[TCA_PIE_LIMIT] = {.type = NLA_U32},
+	[TCA_PIE_TUPDATE] = {.type = NLA_U32},
+	[TCA_PIE_ALPHA] = {.type = NLA_U32},
+	[TCA_PIE_BETA] = {.type = NLA_U32},
+	[TCA_PIE_ECN] = {.type = NLA_U32},
+	[TCA_PIE_BYTEMODE] = {.type = NLA_U32},
+};
+
+static int pie_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct pie_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_PIE_MAX + 1];
+	unsigned int qlen;
+	int err;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy);
+	if (err < 0)
+		return err;
+
+	sch_tree_lock(sch);
+
+	/* convert from microseconds to pschedtime */
+	if (tb[TCA_PIE_TARGET]) {
+		/* target is in us */
+		u32 target = nla_get_u32(tb[TCA_PIE_TARGET]);
+
+		/* convert to pschedtime */
+		q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
+	}
+
+	/* tupdate is in jiffies */
+	if (tb[TCA_PIE_TUPDATE])
+		q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]));
+
+	if (tb[TCA_PIE_LIMIT]) {
+		u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]);
+
+		q->params.limit = limit;
+		sch->limit = limit;
+	}
+
+	if (tb[TCA_PIE_ALPHA])
+		q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]);
+
+	if (tb[TCA_PIE_BETA])
+		q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]);
+
+	if (tb[TCA_PIE_ECN])
+		q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]);
+
+	if (tb[TCA_PIE_BYTEMODE])
+		q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]);
+
+	/* Drop excess packets if new limit is lower */
+	qlen = sch->q.qlen;
+	while (sch->q.qlen > sch->limit) {
+		struct sk_buff *skb = __skb_dequeue(&sch->q);
+
+		sch->qstats.backlog -= qdisc_pkt_len(skb);
+		qdisc_drop(skb, sch);
+	}
+	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
+{
+
+	struct pie_sched_data *q = qdisc_priv(sch);
+	int qlen = sch->qstats.backlog;	/* current queue size in bytes */
+
+	/* If current queue is about 10 packets or more and dq_count is unset
+	 * we have enough packets to calculate the drain rate. Save
+	 * current time as dq_tstamp and start measurement cycle.
+	 */
+	if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) {
+		q->vars.dq_tstamp = psched_get_time();
+		q->vars.dq_count = 0;
+	}
+
+	/* Calculate the average drain rate from this value.  If queue length
+	 * has receded to a small value viz., < QUEUE_THRESHOLD bytes, reset
+	 * the dq_count to -1 as we don't have enough packets to calculate the
+	 * drain rate anymore.  The following if block is entered only when we
+	 * have a substantial queue built up (QUEUE_THRESHOLD bytes or more)
+	 * and we calculate the drain rate for the threshold here.  dq_count is
+	 * in bytes, time difference in psched_time, hence rate is in
+	 * bytes/psched_time.
+	 */
+	if (q->vars.dq_count != DQCOUNT_INVALID) {
+		q->vars.dq_count += skb->len;
+
+		if (q->vars.dq_count >= QUEUE_THRESHOLD) {
+			psched_time_t now = psched_get_time();
+			u32 dtime = now - q->vars.dq_tstamp;
+			u32 count = q->vars.dq_count << PIE_SCALE;
+
+			if (dtime == 0)
+				return;
+
+			count = count / dtime;
+
+			if (q->vars.avg_dq_rate == 0)
+				q->vars.avg_dq_rate = count;
+			else
+				q->vars.avg_dq_rate =
+				    (q->vars.avg_dq_rate -
+				     (q->vars.avg_dq_rate >> 3)) + (count >> 3);
+
+			/* If the queue has receded below the threshold, we hold
+			 * on to the last drain rate calculated, else we reset
+			 * dq_count to 0 to re-enter the if block when the next
+			 * packet is dequeued
+			 */
+			if (qlen < QUEUE_THRESHOLD)
+				q->vars.dq_count = DQCOUNT_INVALID;
+			else {
+				q->vars.dq_count = 0;
+				q->vars.dq_tstamp = psched_get_time();
+			}
+
+			if (q->vars.burst_time > 0) {
+				if (q->vars.burst_time > dtime)
+					q->vars.burst_time -= dtime;
+				else
+					q->vars.burst_time = 0;
+			}
+		}
+	}
+}
+
+/* Recompute the drop probability from the estimated queueing delay and its
+ * trend.  Called from pie_timer() with the root qdisc lock held.
+ */
+static void calculate_probability(struct Qdisc *sch)
+{
+	struct pie_sched_data *q = qdisc_priv(sch);
+	u32 qlen = sch->qstats.backlog;	/* queue size in bytes */
+	psched_time_t qdelay = 0;	/* in pschedtime */
+	psched_time_t qdelay_old = q->vars.qdelay;	/* in pschedtime */
+	s32 delta = 0;		/* determines the change in probability */
+	u32 oldprob;
+	u32 alpha, beta;
+	bool update_prob = true;
+
+	q->vars.qdelay_old = q->vars.qdelay;
+
+	if (q->vars.avg_dq_rate > 0)
+		qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate;
+	else
+		qdelay = 0;
+
+	/* If qdelay is zero and qlen is not, it means qlen is very small, less
+	 * than dequeue_rate, so we do not update probability in this round
+	 */
+	if (qdelay == 0 && qlen != 0)
+		update_prob = false;
+
+	/* In the algorithm, alpha and beta are between 0 and 2 with typical
+	 * value for alpha as 0.125. In this implementation, we use values 0-32
+	 * passed from user space to represent this. Also, alpha and beta have
+	 * unit of HZ and need to be scaled before they can be used to update
+	 * probability. alpha/beta are updated locally below by 1) scaling them
+	 * appropriately 2) scaling down by 16 to come to 0-2 range.
+	 * Please see paper for details.
+	 *
+	 * We scale alpha and beta differently depending on whether we are in
+	 * light, medium or high dropping mode.
+	 */
+	if (q->vars.prob < MAX_PROB / 100) {
+		alpha =
+		    (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
+		beta =
+		    (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
+	} else if (q->vars.prob < MAX_PROB / 10) {
+		alpha =
+		    (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
+		beta =
+		    (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
+	} else {
+		alpha =
+		    (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+		beta =
+		    (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+	}
+
+	/* alpha and beta should be between 0 and 32, in multiples of 1/16 */
+	delta += alpha * ((qdelay - q->params.target));
+	delta += beta * ((qdelay - qdelay_old));
+
+	oldprob = q->vars.prob;
+
+	/* to ensure we increase probability in steps of no more than 2% */
+	if (delta > (s32) (MAX_PROB / (100 / 2)) &&
+	    q->vars.prob >= MAX_PROB / 10)
+		delta = (MAX_PROB / 100) * 2;
+
+	/* Non-linear drop:
+	 * Tune drop probability to increase quickly for high delays(>= 250ms)
+	 * 250ms is derived through experiments and provides error protection
+	 */
+	if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC)))
+		delta += MAX_PROB / (100 / 2);
+
+	q->vars.prob += delta;
+
+	if (delta > 0) {
+		/* prevent overflow */
+		if (q->vars.prob < oldprob) {
+			q->vars.prob = MAX_PROB;
+			/* Prevent normalization error: the probability has
+			 * just been clamped to its maximum here, so skip the
+			 * non-linear reduction applied in the next section.
+			 */
+			update_prob = false;
+		}
+	} else {
+		/* prevent underflow */
+		if (q->vars.prob > oldprob)
+			q->vars.prob = 0;
+	}
+
+	/* Non-linear drop in probability: Reduce drop probability quickly if
+	 * delay is 0 for 2 consecutive Tupdate periods.
+	 */
+	if ((qdelay == 0) && (qdelay_old == 0) && update_prob)
+		q->vars.prob = (q->vars.prob * 98) / 100;
+
+	q->vars.qdelay = qdelay;
+	q->vars.qlen_old = qlen;
+
+	/* We restart the measurement cycle if the following conditions are met
+	 * 1. If the delay has been low for 2 consecutive Tupdate periods
+	 * 2. Calculated drop probability is zero
+	 * 3. We have at least one estimate for the avg_dq_rate ie.,
+	 *    is a non-zero value
+	 */
+	if ((q->vars.qdelay < q->params.target / 2) &&
+	    (q->vars.qdelay_old < q->params.target / 2) &&
+	    (q->vars.prob == 0) &&
+	    (q->vars.avg_dq_rate > 0))
+		pie_vars_init(&q->vars);
+}
+
+/* Timer callback: refresh the drop probability under the root qdisc lock
+ * and re-arm the timer for another tupdate jiffies while tupdate is non-zero.
+ */
+static void pie_timer(unsigned long arg)
+{
+	struct Qdisc *sch = (struct Qdisc *)arg;
+	struct pie_sched_data *q = qdisc_priv(sch);
+	spinlock_t *lock = qdisc_lock(qdisc_root_sleeping(sch));
+
+	spin_lock(lock);
+	calculate_probability(sch);
+	if (q->params.tupdate != 0)
+		mod_timer(&q->adapt_timer, jiffies + q->params.tupdate);
+	spin_unlock(lock);
+}
+
+/* Set defaults, apply user options, and only then arm the adaptation timer,
+ * so that a failed pie_change() never leaves a pending timer behind.
+ */
+static int pie_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct pie_sched_data *q = qdisc_priv(sch);
+	int err;
+
+	pie_params_init(&q->params);
+	pie_vars_init(&q->vars);
+	sch->limit = q->params.limit;
+	setup_timer(&q->adapt_timer, pie_timer, (unsigned long)sch);
+
+	err = opt ? pie_change(sch, opt) : 0;
+	if (err)
+		return err;
+
+	mod_timer(&q->adapt_timer, jiffies + HZ / 2);
+	return 0;
+}
+
+/* Emit the PIE configuration as a nested TCA_OPTIONS attribute.  If any
+ * attribute cannot be added, the partial nest is cancelled and -1 returned.
+ */
+static int pie_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct pie_sched_data *q = qdisc_priv(sch);
+	struct nlattr *nest = nla_nest_start(skb, TCA_OPTIONS);
+	/* target is kept in psched ticks but reported in microseconds */
+	u32 target_us = ((u32) PSCHED_TICKS2NS(q->params.target)) /
+			NSEC_PER_USEC;
+
+	if (nest == NULL)
+		goto nla_put_failure;
+	if (nla_put_u32(skb, TCA_PIE_TARGET, target_us) ||
+	    nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) ||
+	    nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) ||
+	    nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) ||
+	    nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) ||
+	    nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) ||
+	    nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, nest);
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+/* Report PIE's extended statistics, converted to user-space units. */
+static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct pie_sched_data *q = qdisc_priv(sch);
+	struct tc_pie_xstats xstats = {
+		.prob		= q->vars.prob,
+		.delay		= ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) /
+				   NSEC_PER_USEC,
+		/* undo the PIE_SCALE scaling: dq_rate in bytes per sec */
+		.avg_dq_rate	= (q->vars.avg_dq_rate *
+				   (PSCHED_TICKS_PER_SEC)) >> PIE_SCALE,
+		.packets_in	= q->stats.packets_in,
+		.overlimit	= q->stats.overlimit,
+		.maxq		= q->stats.maxq,
+		.dropped	= q->stats.dropped,
+		.ecn_mark	= q->stats.ecn_mark,
+	};
+	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+/* Pop the head packet and, if there was one, feed it to the drain-rate
+ * bookkeeping in pie_process_dequeue() before returning it.
+ */
+static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
+{
+	struct sk_buff *skb = __qdisc_dequeue_head(sch, &sch->q);
+
+	if (skb)
+		pie_process_dequeue(sch, skb);
+	return skb;
+}
+
+static void pie_reset(struct Qdisc *sch)
+{
+	struct pie_sched_data *q = qdisc_priv(sch);
+	qdisc_reset_queue(sch);		/* empty the packet queue */
+	pie_vars_init(&q->vars);	/* start over with fresh PIE state */
+}
+
+static void pie_destroy(struct Qdisc *sch)
+{
+	struct pie_sched_data *q = qdisc_priv(sch);
+	q->params.tupdate = 0;	/* stop pie_timer() from re-arming itself */
+	del_timer_sync(&q->adapt_timer);
+}
+
+static struct Qdisc_ops pie_qdisc_ops __read_mostly = {
+	.id = "pie",
+	.priv_size	= sizeof(struct pie_sched_data),
+	.enqueue	= pie_qdisc_enqueue,
+	.dequeue	= pie_qdisc_dequeue,
+	.peek		= qdisc_peek_dequeued,
+	.init		= pie_init,
+	.destroy	= pie_destroy,	/* stops the adaptation timer */
+	.reset		= pie_reset,
+	.change		= pie_change,	/* also used by pie_init() for options */
+	.dump		= pie_dump,
+	.dump_stats	= pie_dump_stats,
+	.owner		= THIS_MODULE,
+};
+
+/* Module glue: register the PIE qdisc ops on load, unregister on unload. */
+static int __init pie_module_init(void)
+{
+	return register_qdisc(&pie_qdisc_ops);
+}
+
+static void __exit pie_module_exit(void)
+{
+	unregister_qdisc(&pie_qdisc_ops);
+}
+module_init(pie_module_init);
+module_exit(pie_module_exit);
+
+MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler");
+MODULE_AUTHOR("Vijay Subramanian");
+MODULE_AUTHOR("Mythili Prabhu");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
new file mode 100644
index 00000000000..89f8fcf73f1
--- /dev/null
+++ b/net/sched/sch_plug.c
@@ -0,0 +1,233 @@
+/*
+ * sch_plug.c Queue traffic until an explicit release command
+ *
+ *             This program is free software; you can redistribute it and/or
+ *             modify it under the terms of the GNU General Public License
+ *             as published by the Free Software Foundation; either version
+ *             2 of the License, or (at your option) any later version.
+ *
+ * There are two ways to use this qdisc:
+ * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating
+ *    sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands.
+ *
+ * 2. For network output buffering (a.k.a output commit) functionality.
+ *    Output commit property is commonly used by applications using checkpoint
+ *    based fault-tolerance to ensure that the checkpoint from which a system
+ *    is being restored is consistent w.r.t outside world.
+ *
+ *    Consider for e.g. Remus - a Virtual Machine checkpointing system,
+ *    wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated
+ *    asynchronously to the backup host, while the VM continues executing the
+ *    next epoch speculatively.
+ *
+ *    The following is a typical sequence of output buffer operations:
+ *       1. At epoch i, start_buffer(i)
+ *       2. At end of epoch i (i.e. after 50ms):
+ *          2.1 Stop VM and take checkpoint(i).
+ *          2.2 start_buffer(i+1) and Resume VM
+ *       3. While speculatively executing epoch(i+1), asynchronously replicate
+ *          checkpoint(i) to backup host.
+ *       4. When checkpoint_ack(i) is received from backup, release_buffer(i)
+ *    Thus, this Qdisc would receive the following sequence of commands:
+ *       TCQ_PLUG_BUFFER (epoch i)
+ *       .. TCQ_PLUG_BUFFER (epoch i+1)
+ *       ....TCQ_PLUG_RELEASE_ONE (epoch i)
+ *       ......TCQ_PLUG_BUFFER (epoch i+2)
+ *       ........
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+/*
+ * State of the queue, when used for network output buffering:
+ *
+ *                 plug(i+1)            plug(i)          head
+ * ------------------+--------------------+---------------->
+ *                   |                    |
+ *                   |                    |
+ * pkts_current_epoch| pkts_last_epoch    |pkts_to_release
+ * ----------------->|<--------+--------->|+--------------->
+ *                   v                    v
+ *
+ */
+
+struct plug_sched_data {
+	/* If true, the dequeue function releases all packets
+	 * from head to end of the queue. The queue turns into
+	 * a pass-through queue for newly arriving packets.
+	 */
+	bool unplug_indefinite;
+
+	/* Queue limit in bytes, checked against the backlog on enqueue */
+	u32 limit;
+
+	/* Number of packets (output) from the current speculatively
+	 * executing epoch; counted on enqueue while plugged.
+	 */
+	u32 pkts_current_epoch;
+
+	/* Number of packets corresponding to the recently finished
+	 * epoch. These will be released when we receive a
+	 * TCQ_PLUG_RELEASE_ONE command. This command is typically
+	 * issued after committing a checkpoint at the target.
+	 */
+	u32 pkts_last_epoch;
+
+	/* Number of packets from the head of the queue that can
+	 * be released (committed checkpoint). plug_dequeue() uses
+	 * up one unit per packet while not unplugged indefinitely.
+	 */
+	u32 pkts_to_release;
+};
+
+/* Tail-enqueue skb unless doing so would exceed the byte limit. */
+static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct plug_sched_data *q = qdisc_priv(sch);
+
+	if (unlikely(sch->qstats.backlog + skb->len > q->limit))
+		return qdisc_reshape_fail(skb, sch);
+
+	if (!q->unplug_indefinite)	/* plugged: packet joins this epoch */
+		q->pkts_current_epoch++;
+	return qdisc_enqueue_tail(skb, sch);
+}
+
+/* Hand out the head packet only while the qdisc is unthrottled and either
+ * unplugged indefinitely or still holding packets cleared for release.
+ */
+static struct sk_buff *plug_dequeue(struct Qdisc *sch)
+{
+	struct plug_sched_data *q = qdisc_priv(sch);
+
+	if (qdisc_is_throttled(sch))
+		return NULL;
+
+	if (q->unplug_indefinite)
+		return qdisc_dequeue_head(sch);
+	if (q->pkts_to_release == 0) {
+		/* Nothing left to release: block until the next command. */
+		qdisc_throttled(sch);
+		return NULL;
+	}
+	q->pkts_to_release--;
+	return qdisc_dequeue_head(sch);
+}
+
+/* Start with empty epoch counters and the qdisc plugged (throttled).  The
+ * byte limit comes from the tc_plug_qopt option when one is given, else it
+ * defaults to tx_queue_len (or 100) packets of psched_mtu() bytes each.
+ */
+static int plug_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct plug_sched_data *q = qdisc_priv(sch);
+
+	q->unplug_indefinite = false;
+	q->pkts_to_release = 0;
+	q->pkts_last_epoch = 0;
+	q->pkts_current_epoch = 0;
+
+	if (opt != NULL) {
+		struct tc_plug_qopt *ctl = nla_data(opt);
+
+		if (nla_len(opt) < sizeof(*ctl))
+			return -EINVAL;
+		q->limit = ctl->limit;
+	} else {
+		/* 100 packets is an arbitrary fallback for tx_queue_len == 0 */
+		u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100;
+		q->limit = pkt_limit * psched_mtu(qdisc_dev(sch));
+	}
+
+	qdisc_throttled(sch);
+	return 0;
+}
+
+/* Handle a control message carried in a struct tc_plug_qopt:
+ * TCQ_PLUG_BUFFER: Insert a plug into the queue and
+ *  buffer any incoming packets
+ * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head
+ *   to beginning of the next plug.
+ * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue.
+ *   Stop buffering packets until the next TCQ_PLUG_BUFFER
+ *   command is received (just act as a pass-thru queue).
+ * TCQ_PLUG_LIMIT: Increase/decrease queue size
+ * Returns 0, or -EINVAL for a missing, short or unknown message.
+ */
+static int plug_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct plug_sched_data *q = qdisc_priv(sch);
+	struct tc_plug_qopt *msg;
+	bool kick = false;	/* unthrottle and reschedule on the way out */
+
+	if (opt == NULL || nla_len(opt) < sizeof(*msg))
+		return -EINVAL;
+	msg = nla_data(opt);
+
+	switch (msg->action) {
+	case TCQ_PLUG_BUFFER:
+		/* Close the running epoch and start counting a new one. */
+		q->pkts_last_epoch = q->pkts_current_epoch;
+		q->pkts_current_epoch = 0;
+		if (q->unplug_indefinite)
+			qdisc_throttled(sch);
+		q->unplug_indefinite = false;
+		break;
+	case TCQ_PLUG_RELEASE_ONE:
+		/* The last complete epoch becomes eligible for dequeue. */
+		q->pkts_to_release += q->pkts_last_epoch;
+		q->pkts_last_epoch = 0;
+		kick = true;
+		break;
+	case TCQ_PLUG_RELEASE_INDEFINITE:
+		q->unplug_indefinite = true;
+		q->pkts_to_release = 0;
+		q->pkts_last_epoch = 0;
+		q->pkts_current_epoch = 0;
+		kick = true;
+		break;
+	case TCQ_PLUG_LIMIT:
+		/* Limit is supplied in bytes */
+		q->limit = msg->limit;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (kick) {
+		qdisc_unthrottled(sch);
+		netif_schedule_queue(sch->dev_queue);
+	}
+
+	return 0;
+}
+
+static struct Qdisc_ops plug_qdisc_ops __read_mostly = {
+	.id          =       "plug",
+	.priv_size   =       sizeof(struct plug_sched_data),
+	.enqueue     =       plug_enqueue,
+	.dequeue     =       plug_dequeue,
+	.peek        =       qdisc_peek_head,
+	.init        =       plug_init,	/* leaves the qdisc throttled */
+	.change      =       plug_change,	/* TCQ_PLUG_* control messages */
+	.owner       =       THIS_MODULE,
+};
+
+static int __init plug_module_init(void)	/* module load hook */
+{
+	return register_qdisc(&plug_qdisc_ops);
+}
+
+static void __exit plug_module_exit(void)	/* module unload hook */
+{
+	unregister_qdisc(&plug_qdisc_ops);
+}
+module_init(plug_module_init)
+module_exit(plug_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index b5d56a22b1d..79359b69ad8 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -247,7 +247,8 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
 	opt.bands = q->bands;
 	memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1);
 
-	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+		goto nla_put_failure;
 
 	return skb->len;
 
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index e68cb440756..8056fb4e618 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1,7 +1,8 @@
 /*
- * net/sched/sch_qfq.c         Quick Fair Queueing Scheduler.
+ * net/sched/sch_qfq.c         Quick Fair Queueing Plus Scheduler.
  *
  * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
+ * Copyright (c) 2012 Paolo Valente.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -19,12 +20,18 @@
 #include <net/pkt_cls.h>
 
 
-/*  Quick Fair Queueing
-    ===================
+/*  Quick Fair Queueing Plus
+    ========================
 
     Sources:
 
-    Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
+    [1] Paolo Valente,
+    "Reducing the Execution Time of Fair-Queueing Schedulers."
+    http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf
+
+    Sources for QFQ:
+
+    [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
     Packet Scheduling with Tight Bandwidth Distribution Guarantees."
 
     See also:
@@ -33,6 +40,20 @@
 
 /*
 
+  QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
+  classes. Each aggregate is timestamped with a virtual start time S
+  and a virtual finish time F, and scheduled according to its
+  timestamps. S and F are computed as a function of a system virtual
+  time function V. The classes within each aggregate are instead
+  scheduled with DRR.
+
+  To speed up operations, QFQ+ also divides aggregates into a limited
+  number of groups. Which group a class belongs to depends on the
+  ratio between the maximum packet length for the class and the weight
+  of the class. Groups have their own S and F. In the end, QFQ+
+  schedules groups, then aggregates within groups, then classes within
+  aggregates. See [1] and [2] for a full description.
+
   Virtual time computations.
 
   S, F and V are all computed in fixed point arithmetic with
@@ -76,26 +97,27 @@
 #define QFQ_MAX_SLOTS	32
 
 /*
- * Shifts used for class<->group mapping.  We allow class weights that are
- * in the range [1, 2^MAX_WSHIFT], and we try to map each class i to the
+ * Shifts used for aggregate<->group mapping.  We allow class weights that are
+ * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the
  * group with the smallest index that can support the L_i / r_i configured
- * for the class.
+ * for the classes in the aggregate.
  *
  * grp->index is the index of the group; and grp->slot_shift
  * is the shift for the corresponding (scaled) sigma_i.
  */
-#define QFQ_MAX_INDEX		19
-#define QFQ_MAX_WSHIFT		16
+#define QFQ_MAX_INDEX		24
+#define QFQ_MAX_WSHIFT		10
 
-#define	QFQ_MAX_WEIGHT		(1<<QFQ_MAX_WSHIFT)
-#define QFQ_MAX_WSUM		(2*QFQ_MAX_WEIGHT)
+#define	QFQ_MAX_WEIGHT		(1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */
+#define QFQ_MAX_WSUM		(64*QFQ_MAX_WEIGHT)
 
 #define FRAC_BITS		30	/* fixed point arithmetic */
 #define ONE_FP			(1UL << FRAC_BITS)
-#define IWSUM			(ONE_FP/QFQ_MAX_WSUM)
 
-#define QFQ_MTU_SHIFT		11
-#define QFQ_MIN_SLOT_SHIFT	(FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)
+#define QFQ_MTU_SHIFT		16	/* to support TSO/GSO */
+#define QFQ_MIN_LMAX		512	/* see qfq_slot_insert */
+
+#define QFQ_MAX_AGG_CLASSES	8 /* max num classes per aggregate allowed */
 
 /*
  * Possible group states.  These values are used as indexes for the bitmaps
@@ -105,6 +127,8 @@ enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
 
 struct qfq_group;
 
+struct qfq_aggregate;
+
 struct qfq_class {
 	struct Qdisc_class_common common;
 
@@ -113,9 +137,14 @@ struct qfq_class {
 
 	struct gnet_stats_basic_packed bstats;
 	struct gnet_stats_queue qstats;
-	struct gnet_stats_rate_est rate_est;
+	struct gnet_stats_rate_est64 rate_est;
 	struct Qdisc *qdisc;
+	struct list_head alist;		/* Link for active-classes list. */
+	struct qfq_aggregate *agg;	/* Parent aggregate. */
+	int deficit;			/* DRR deficit counter. */
+};
 
+struct qfq_aggregate {
 	struct hlist_node next;	/* Link for the slot list. */
 	u64 S, F;		/* flow timestamps (exact) */
 
@@ -126,8 +155,18 @@ struct qfq_class {
 	struct qfq_group *grp;
 
 	/* these are copied from the flowset. */
-	u32	inv_w;		/* ONE_FP/weight */
-	u32	lmax;		/* Max packet size for this flow. */
+	u32	class_weight; /* Weight of each class in this aggregate. */
+	/* Max pkt size for the classes in this aggregate, DRR quantum. */
+	int	lmax;
+
+	u32	inv_w;	    /* ONE_FP/(sum of weights of classes in aggr.). */
+	u32	budgetmax;  /* Max budget for this aggregate. */
+	u32	initial_budget, budget;     /* Initial and current budget. */
+
+	int		  num_classes;	/* Number of classes in this aggr. */
+	struct list_head  active;	/* DRR queue of active classes. */
+
+	struct hlist_node nonfull_next;	/* See nonfull_aggs in qfq_sched. */
 };
 
 struct qfq_group {
@@ -137,7 +176,7 @@ struct qfq_group {
 	unsigned int front;		/* Index of the front slot. */
 	unsigned long full_slots;	/* non-empty slots */
 
-	/* Array of RR lists of active classes. */
+	/* Array of RR lists of active aggregates. */
 	struct hlist_head slots[QFQ_MAX_SLOTS];
 };
 
@@ -145,13 +184,29 @@ struct qfq_sched {
 	struct tcf_proto *filter_list;
 	struct Qdisc_class_hash clhash;
 
-	u64		V;		/* Precise virtual time. */
-	u32		wsum;		/* weight sum */
+	u64			oldV, V;	/* Precise virtual times. */
+	struct qfq_aggregate	*in_serv_agg;   /* Aggregate being served. */
+	u32			num_active_agg; /* Num. of active aggregates */
+	u32			wsum;		/* weight sum */
+	u32			iwsum;		/* inverse weight sum */
 
 	unsigned long bitmaps[QFQ_MAX_STATE];	    /* Group bitmaps. */
 	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
+	u32 min_slot_shift;	/* Index of the group-0 bit in the bitmaps. */
+
+	u32 max_agg_classes;		/* Max number of classes per aggr. */
+	struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */
 };
 
+/*
+ * Possible reasons why the timestamps of an aggregate are updated
+ * enqueue: the aggregate switches from idle to active and must be scheduled
+ *	    for service
+ * requeue: the aggregate finishes its budget, so it stops being served and
+ *	    must be rescheduled for service
+ */
+enum update_reason {enqueue, requeue};
+
 static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
@@ -181,18 +236,18 @@ static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
  * index = log_2(maxlen/weight) but we need to apply the scaling.
  * This is used only once at flow creation.
  */
-static int qfq_calc_index(u32 inv_w, unsigned int maxlen)
+static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
 {
 	u64 slot_size = (u64)maxlen * inv_w;
 	unsigned long size_map;
 	int index = 0;
 
-	size_map = slot_size >> QFQ_MIN_SLOT_SHIFT;
+	size_map = slot_size >> min_slot_shift;
 	if (!size_map)
 		goto out;
 
 	index = __fls(size_map) + 1;	/* basically a log_2 */
-	index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));
+	index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));
 
 	if (index < 0)
 		index = 0;
@@ -203,14 +258,160 @@ out:
 	return index;
 }
 
+static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *);
+static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *,
+			     enum update_reason);
+
+/* Set lmax/weight of a fresh aggregate and list it among the non-full ones. */
+static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
+			 u32 lmax, u32 weight)
+{
+	agg->class_weight = weight;
+	agg->lmax = lmax;
+	INIT_LIST_HEAD(&agg->active);
+	hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
+}
+
+/* Look for a non-full aggregate with exactly this lmax and class weight. */
+static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q,
+					  u32 lmax, u32 weight)
+{
+	struct qfq_aggregate *cur;
+
+	hlist_for_each_entry(cur, &q->nonfull_aggs, nonfull_next)
+		if (cur->class_weight == weight && cur->lmax == lmax)
+			return cur;
+	return NULL;
+}
+
+
+/* Update aggregate as a function of the new number of classes. */
+static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
+			   int new_num_classes)
+{
+	u32 new_agg_weight;
+
+	if (new_num_classes == q->max_agg_classes) /* agg has become full */
+		hlist_del_init(&agg->nonfull_next);
+
+	if (agg->num_classes > new_num_classes &&
+	    new_num_classes == q->max_agg_classes - 1) /* agg no more full */
+		hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
+
+	/* The next assignment may let agg->initial_budget > agg->budgetmax
+	 * hold, we will take it into account in charge_actual_service().
+	 */
+	agg->budgetmax = new_num_classes * agg->lmax;
+	new_agg_weight = agg->class_weight * new_num_classes;
+	agg->inv_w = ONE_FP/new_agg_weight;
+
+	if (agg->grp == NULL) { /* group not assigned yet */
+		int i = qfq_calc_index(agg->inv_w, agg->budgetmax,
+				       q->min_slot_shift);
+		agg->grp = &q->groups[i];
+	}
+
+	/* signed delta: negative when the number of classes shrinks */
+	q->wsum +=
+		(int) agg->class_weight * (new_num_classes - agg->num_classes);
+	q->iwsum = ONE_FP / q->wsum;
+
+	agg->num_classes = new_num_classes; /* old count was needed above */
+}
+
+/* Add class to aggregate, activating the aggregate if that wakes it up. */
+static void qfq_add_to_agg(struct qfq_sched *q,
+			   struct qfq_aggregate *agg,
+			   struct qfq_class *cl)
+{
+	cl->agg = agg;
+	qfq_update_agg(q, agg, agg->num_classes + 1);
+
+	if (cl->qdisc->q.qlen == 0) /* idle class: nothing to schedule */
+		return;
+	list_add_tail(&cl->alist, &agg->active);
+	if (q->in_serv_agg != agg && /* agg was inactive: schedule it */
+	    list_first_entry(&agg->active, struct qfq_class, alist) == cl)
+		qfq_activate_agg(q, agg, enqueue);
+}
+
+static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *);
+
+/* Detach agg from the scheduler's accounting and free it. */
+static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+	/* hlist_del_init() leaves a node that is not hashed untouched */
+	hlist_del_init(&agg->nonfull_next);
+	q->wsum -= agg->class_weight;
+	if (q->wsum != 0)
+		q->iwsum = ONE_FP / q->wsum;
+	if (q->in_serv_agg == agg)
+		q->in_serv_agg = qfq_choose_next_agg(q);
+	kfree(agg);
+}
+
+/* Deschedule class from within its parent aggregate. */
+static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
+{
+	struct qfq_aggregate *parent = cl->agg;
+
+	/* leave the aggregate's RR queue; an empty queue idles the aggregate */
+	list_del(&cl->alist);
+	if (list_empty(&parent->active))
+		qfq_deactivate_agg(q, parent);
+}
+
+/* Remove class from its parent aggregate. */
+static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
+{
+	struct qfq_aggregate *parent = cl->agg;
+
+	cl->agg = NULL;
+	/* removing the only class empties the aggregate: destroy it */
+	if (parent->num_classes == 1)
+		qfq_destroy_agg(q, parent);
+	else
+		qfq_update_agg(q, parent, parent->num_classes - 1);
+}
+
+/* Deschedule class and remove it from its parent aggregate. */
+static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
+{
+	/* a backlogged class is active and must be descheduled first */
+	if (cl->qdisc->q.qlen != 0)
+		qfq_deactivate_class(q, cl);
+	qfq_rm_from_agg(q, cl);
+}
+
+/* Move class to a new aggregate, matching the new class weight and/or lmax */
+static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight,
+			   u32 lmax)
+{
+	struct qfq_sched *q = qdisc_priv(sch);
+	struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight);
+	if (lmax > (1UL << QFQ_MTU_SHIFT)) /* would index past q->groups[] */
+		return -EINVAL;
+	if (new_agg == NULL) { /* create new aggregate */
+		new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC);
+		if (new_agg == NULL)
+			return -ENOBUFS;
+		qfq_init_agg(q, new_agg, lmax, weight);
+	}
+	qfq_deact_rm_from_agg(q, cl);
+	qfq_add_to_agg(q, new_agg, cl);
+	return 0;
+}
+
 static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 			    struct nlattr **tca, unsigned long *arg)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
 	struct qfq_class *cl = (struct qfq_class *)*arg;
+	bool existing = false;
 	struct nlattr *tb[TCA_QFQ_MAX + 1];
+	struct qfq_aggregate *new_agg = NULL;
 	u32 weight, lmax, inv_w;
-	int i, err;
+	int err;
 	int delta_w;
 
 	if (tca[TCA_OPTIONS] == NULL) {
@@ -231,25 +432,32 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	} else
 		weight = 1;
 
-	inv_w = ONE_FP / weight;
-	weight = ONE_FP / inv_w;
-	delta_w = weight - (cl ? ONE_FP / cl->inv_w : 0);
-	if (q->wsum + delta_w > QFQ_MAX_WSUM) {
-		pr_notice("qfq: total weight out of range (%u + %u)\n",
-			  delta_w, q->wsum);
-		return -EINVAL;
-	}
-
 	if (tb[TCA_QFQ_LMAX]) {
 		lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
-		if (!lmax || lmax > (1UL << QFQ_MTU_SHIFT)) {
+		if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) {
 			pr_notice("qfq: invalid max length %u\n", lmax);
 			return -EINVAL;
 		}
 	} else
-		lmax = 1UL << QFQ_MTU_SHIFT;
+		lmax = psched_mtu(qdisc_dev(sch));
+
+	inv_w = ONE_FP / weight;
+	weight = ONE_FP / inv_w;
+
+	if (cl != NULL &&
+	    lmax == cl->agg->lmax &&
+	    weight == cl->agg->class_weight)
+		return 0; /* nothing to change */
+
+	delta_w = weight - (cl ? cl->agg->class_weight : 0);
+
+	if (q->wsum + delta_w > QFQ_MAX_WSUM) {
+		pr_notice("qfq: total weight out of range (%d + %u)\n",
+			  delta_w, q->wsum);
+		return -EINVAL;
+	}
 
-	if (cl != NULL) {
+	if (cl != NULL) { /* modify existing class */
 		if (tca[TCA_RATE]) {
 			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
 						    qdisc_root_sleeping_lock(sch),
@@ -257,27 +465,18 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 			if (err)
 				return err;
 		}
-
-		if (inv_w != cl->inv_w) {
-			sch_tree_lock(sch);
-			q->wsum += delta_w;
-			cl->inv_w = inv_w;
-			sch_tree_unlock(sch);
-		}
-		return 0;
+		existing = true;
+		goto set_change_agg;
 	}
 
+	/* create and init new class */
 	cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL);
 	if (cl == NULL)
 		return -ENOBUFS;
 
 	cl->refcnt = 1;
 	cl->common.classid = classid;
-	cl->lmax = lmax;
-	cl->inv_w = inv_w;
-	i = qfq_calc_index(cl->inv_w, cl->lmax);
-
-	cl->grp = &q->groups[i];
+	cl->deficit = lmax;
 
 	cl->qdisc = qdisc_create_dflt(sch->dev_queue,
 				      &pfifo_qdisc_ops, classid);
@@ -288,13 +487,9 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
 					qdisc_root_sleeping_lock(sch),
 					tca[TCA_RATE]);
-		if (err) {
-			qdisc_destroy(cl->qdisc);
-			kfree(cl);
-			return err;
-		}
+		if (err)
+			goto destroy_class;
 	}
-	q->wsum += weight;
 
 	sch_tree_lock(sch);
 	qdisc_class_hash_insert(&q->clhash, &cl->common);
@@ -302,19 +497,39 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 
 	qdisc_class_hash_grow(sch, &q->clhash);
 
+set_change_agg:
+	sch_tree_lock(sch);
+	new_agg = qfq_find_agg(q, lmax, weight);
+	if (new_agg == NULL) { /* create new aggregate */
+		sch_tree_unlock(sch);
+		new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
+		if (new_agg == NULL) {
+			err = -ENOBUFS;
+			gen_kill_estimator(&cl->bstats, &cl->rate_est);
+			goto destroy_class;
+		}
+		sch_tree_lock(sch);
+		qfq_init_agg(q, new_agg, lmax, weight);
+	}
+	if (existing)
+		qfq_deact_rm_from_agg(q, cl);
+	qfq_add_to_agg(q, new_agg, cl);
+	sch_tree_unlock(sch);
+
 	*arg = (unsigned long)cl;
 	return 0;
+
+destroy_class:
+	qdisc_destroy(cl->qdisc);
+	kfree(cl);
+	return err;
 }
 
 static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
 
-	if (cl->inv_w) {
-		q->wsum -= ONE_FP / cl->inv_w;
-		cl->inv_w = 0;
-	}
-
+	qfq_rm_from_agg(q, cl);
 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
 	qdisc_destroy(cl->qdisc);
 	kfree(cl);
@@ -429,8 +644,9 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
 	nest = nla_nest_start(skb, TCA_OPTIONS);
 	if (nest == NULL)
 		goto nla_put_failure;
-	NLA_PUT_U32(skb, TCA_QFQ_WEIGHT, ONE_FP/cl->inv_w);
-	NLA_PUT_U32(skb, TCA_QFQ_LMAX, cl->lmax);
+	if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) ||
+	    nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax))
+		goto nla_put_failure;
 	return nla_nest_end(skb, nest);
 
 nla_put_failure:
@@ -447,8 +663,8 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 	memset(&xstats, 0, sizeof(xstats));
 	cl->qdisc->qstats.qlen = cl->qdisc->q.qlen;
 
-	xstats.weight = ONE_FP/cl->inv_w;
-	xstats.lmax = cl->lmax;
+	xstats.weight = cl->agg->class_weight;
+	xstats.lmax = cl->agg->lmax;
 
 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
 	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
@@ -462,14 +678,13 @@ static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
 	struct qfq_class *cl;
-	struct hlist_node *n;
 	unsigned int i;
 
 	if (arg->stop)
 		return;
 
 	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
+		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
 			if (arg->count < arg->skip) {
 				arg->count++;
 				continue;
@@ -599,45 +814,111 @@ static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
  * perhaps
  *
 	old_V ^= q->V;
-	old_V >>= QFQ_MIN_SLOT_SHIFT;
+	old_V >>= q->min_slot_shift;
 	if (old_V) {
 		...
 	}
  *
  */
-static void qfq_make_eligible(struct qfq_sched *q, u64 old_V)
+static void qfq_make_eligible(struct qfq_sched *q)
 {
-	unsigned long vslot = q->V >> QFQ_MIN_SLOT_SHIFT;
-	unsigned long old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;
+	unsigned long vslot = q->V >> q->min_slot_shift;
+	unsigned long old_vslot = q->oldV >> q->min_slot_shift;
 
 	if (vslot != old_vslot) {
-		unsigned long mask = (1UL << fls(vslot ^ old_vslot)) - 1;
+		unsigned long mask;
+		int last_flip_pos = fls64(vslot ^ old_vslot);
+		/* fls64(): fls() takes an int and would drop flips above bit 31 */
+		if (last_flip_pos > 31) /* higher than the number of groups */
+			mask = ~0UL;    /* make all groups eligible */
+		else
+			mask = (1UL << last_flip_pos) - 1;
+
 		qfq_move_groups(q, mask, IR, ER);
 		qfq_move_groups(q, mask, IB, EB);
 	}
 }
 
-
 /*
- * XXX we should make sure that slot becomes less than 32.
- * This is guaranteed by the input values.
- * roundedS is always cl->S rounded on grp->slot_shift bits.
+ * The index of the slot in which the input aggregate agg is to be
+ * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2'
+ * and not a '-1' because the start time of the group may be moved
+ * backward by one slot after the aggregate has been inserted, and
+ * this would cause non-empty slots to be right-shifted by one
+ * position.
+ *
+ * QFQ+ fully satisfies this bound to the slot index if the parameters
+ * of the classes are not changed dynamically, and if QFQ+ never
+ * happens to postpone the service of agg unjustly, i.e., it never
+ * happens that the aggregate becomes backlogged and eligible, or just
+ * eligible, while an aggregate with a higher approximated finish time
+ * is being served. In particular, in this case QFQ+ guarantees that
+ * the timestamps of agg are low enough that the slot index is never
+ * higher than 2. Unfortunately, QFQ+ cannot provide the same
+ * guarantee if it happens to unjustly postpone the service of agg, or
+ * if the parameters of some class are changed.
+ *
+ * As for the first event, i.e., an out-of-order service, the
+ * upper bound to the slot index guaranteed by QFQ+ grows to
+ * 2 +
+ * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
+ * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1.
+ *
+ * The following function deals with this problem by backward-shifting
+ * the timestamps of agg, if needed, so as to guarantee that the slot
+ * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may
+ * cause the service of other aggregates to be postponed, yet the
+ * worst-case guarantees of these aggregates are not violated.  In
+ * fact, in case of no out-of-order service, the timestamps of agg
+ * would have been even lower than they are after the backward shift,
+ * because QFQ+ would have guaranteed a maximum value equal to 2 for
+ * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose
+ * service is postponed because of the backward-shift would have
+ * however waited for the service of agg before being served.
+ *
+ * The other event that may cause the slot index to be higher than 2
+ * for agg is a recent change of the parameters of some class. If the
+ * weight of a class is increased or the lmax (max_pkt_size) of the
+ * class is decreased, then a new aggregate with smaller slot size
+ * than the original parent aggregate of the class may happen to be
+ * activated. The activation of this aggregate should be properly
+ * delayed to when the service of the class has finished in the ideal
+ * system tracked by QFQ+. If the activation of the aggregate is not
+ * delayed to this reference time instant, then this aggregate may be
+ * unjustly served before other aggregates waiting for service. This
+ * may cause the above bound to the slot index to be violated for some
+ * of these unlucky aggregates.
+ *
+ * Instead of delaying the activation of the new aggregate, which is
+ * quite complex, the above-discussed capping of the slot index is
+ * used to handle also the consequences of a change of the parameters
+ * of a class.
  */
-static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl,
+static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
 			    u64 roundedS)
 {
 	u64 slot = (roundedS - grp->S) >> grp->slot_shift;
-	unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS;
+	unsigned int i; /* slot index in the bucket list */
+	/* Cap slot at QFQ_MAX_SLOTS - 2, pulling agg's S and F back to match. */
+	if (unlikely(slot > QFQ_MAX_SLOTS - 2)) {
+		u64 deltaS = roundedS - grp->S -
+			((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift);
+		agg->S -= deltaS;
+		agg->F -= deltaS;
+		slot = QFQ_MAX_SLOTS - 2;
+	}
 
-	hlist_add_head(&cl->next, &grp->slots[i]);
+	i = (grp->front + slot) % QFQ_MAX_SLOTS; /* buckets are circular */
+
+	hlist_add_head(&agg->next, &grp->slots[i]);
 	__set_bit(slot, &grp->full_slots);
 }
 
 /* Maybe introduce hlist_first_entry?? */
-static struct qfq_class *qfq_slot_head(struct qfq_group *grp)
+static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp)
 {
 	return hlist_entry(grp->slots[grp->front].first,
-			   struct qfq_class, next);
+			   struct qfq_aggregate, next); /* head of front bucket */
 }
 
 /*
@@ -645,20 +926,20 @@ static struct qfq_class *qfq_slot_head(struct qfq_group *grp)
  */
 static void qfq_front_slot_remove(struct qfq_group *grp)
 {
-	struct qfq_class *cl = qfq_slot_head(grp);
+	struct qfq_aggregate *agg = qfq_slot_head(grp);
 
-	BUG_ON(!cl);
-	hlist_del(&cl->next);
+	BUG_ON(!agg);
+	hlist_del(&agg->next);	/* unlink agg from the front bucket */
 	if (hlist_empty(&grp->slots[grp->front]))
 		__clear_bit(0, &grp->full_slots);
 }
 
 /*
- * Returns the first full queue in a group. As a side effect,
- * adjust the bucket list so the first non-empty bucket is at
- * position 0 in full_slots.
+ * Returns the first aggregate in the first non-empty bucket of the
+ * group. As a side effect, adjusts the bucket list so the first
+ * non-empty bucket is at position 0 in full_slots.
  */
-static struct qfq_class *qfq_slot_scan(struct qfq_group *grp)
+static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp)
 {
 	unsigned int i;
 
@@ -694,7 +975,7 @@ static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
 	grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
 }
 
-static void qfq_update_eligible(struct qfq_sched *q, u64 old_V)
+static void qfq_update_eligible(struct qfq_sched *q)
 {
 	struct qfq_group *grp;
 	unsigned long ineligible;
@@ -706,147 +987,239 @@ static void qfq_update_eligible(struct qfq_sched *q, u64 old_V)
 			if (qfq_gt(grp->S, q->V))
 				q->V = grp->S;
 		}
-		qfq_make_eligible(q, old_V);
+		qfq_make_eligible(q);
 	}
 }
 
-/* What is length of next packet in queue (0 if queue is empty) */
-static unsigned int qdisc_peek_len(struct Qdisc *sch)
-{
-	struct sk_buff *skb;
-
-	skb = sch->ops->peek(sch);
-	return skb ? qdisc_pkt_len(skb) : 0;
-}
-
-/*
- * Updates the class, returns true if also the group needs to be updated.
- */
-static bool qfq_update_class(struct qfq_group *grp, struct qfq_class *cl)
+/* Dequeue cl's peeked head packet (len bytes); update its DRR state in agg. */
+static void agg_dequeue(struct qfq_aggregate *agg,
+			struct qfq_class *cl, unsigned int len)
 {
-	unsigned int len = qdisc_peek_len(cl->qdisc);
+	qdisc_dequeue_peeked(cl->qdisc);
 
-	cl->S = cl->F;
-	if (!len)
-		qfq_front_slot_remove(grp);	/* queue is empty */
-	else {
-		u64 roundedS;
-
-		cl->F = cl->S + (u64)len * cl->inv_w;
-		roundedS = qfq_round_down(cl->S, grp->slot_shift);
-		if (roundedS == grp->S)
-			return false;
+	cl->deficit -= (int) len;	/* charge the packet to cl's deficit */
 
-		qfq_front_slot_remove(grp);
-		qfq_slot_insert(grp, cl, roundedS);
+	if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
+		list_del(&cl->alist);
+	else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) {
+		cl->deficit += agg->lmax;	/* refill by agg->lmax */
+		list_move_tail(&cl->alist, &agg->active); /* go to the back */
 	}
-
-	return true;
 }
 
-static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
+static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg,
+					   struct qfq_class **cl,
+					   unsigned int *len)
 {
-	struct qfq_sched *q = qdisc_priv(sch);
-	struct qfq_group *grp;
-	struct qfq_class *cl;
 	struct sk_buff *skb;
-	unsigned int len;
-	u64 old_V;
 
-	if (!q->bitmaps[ER])
-		return NULL;
-
-	grp = qfq_ffs(q, q->bitmaps[ER]);
-
-	cl = qfq_slot_head(grp);
-	skb = qdisc_dequeue_peeked(cl->qdisc);
-	if (!skb) {
+	*cl = list_first_entry(&agg->active, struct qfq_class, alist);
+	skb = (*cl)->qdisc->ops->peek((*cl)->qdisc); /* peek, do not dequeue */
+	if (skb == NULL)	/* *len is left untouched on this path */
 		WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n");
-		return NULL;
-	}
+	else
+		*len = qdisc_pkt_len(skb);
 
-	sch->q.qlen--;
-	qdisc_bstats_update(sch, skb);
-
-	old_V = q->V;
-	len = qdisc_pkt_len(skb);
-	q->V += (u64)len * IWSUM;
-	pr_debug("qfq dequeue: len %u F %lld now %lld\n",
-		 len, (unsigned long long) cl->F, (unsigned long long) q->V);
-
-	if (qfq_update_class(grp, cl)) {
-		u64 old_F = grp->F;
-
-		cl = qfq_slot_scan(grp);
-		if (!cl)
-			__clear_bit(grp->index, &q->bitmaps[ER]);
-		else {
-			u64 roundedS = qfq_round_down(cl->S, grp->slot_shift);
-			unsigned int s;
-
-			if (grp->S == roundedS)
-				goto skip_unblock;
-			grp->S = roundedS;
-			grp->F = roundedS + (2ULL << grp->slot_shift);
-			__clear_bit(grp->index, &q->bitmaps[ER]);
-			s = qfq_calc_state(q, grp);
-			__set_bit(grp->index, &q->bitmaps[s]);
-		}
-
-		qfq_unblock_groups(q, grp->index, old_F);
-	}
+	return skb;
+}
 
-skip_unblock:
-	qfq_update_eligible(q, old_V);
+/* Set agg->F from agg->S and the service agg actually received. */
+static inline void charge_actual_service(struct qfq_aggregate *agg)
+{
+	/* Compute the service received by the aggregate, taking into
+	 * account that, after decreasing the number of classes in
+	 * agg, it may happen that
+	 * agg->initial_budget - agg->budget > agg->budgetmax
+	 */
+	u32 service_received = min(agg->budgetmax,
+				   agg->initial_budget - agg->budget);
 
-	return skb;
+	agg->F = agg->S + (u64)service_received * agg->inv_w;
 }
 
-/*
- * Assign a reasonable start time for a new flow k in group i.
+/* Assign a reasonable start time (agg->S) for a new aggregate in group i.
  * Admissible values for \hat(F) are multiples of \sigma_i
  * no greater than V+\sigma_i . Larger values mean that
  * we had a wraparound so we consider the timestamp to be stale.
  *
  * If F is not stale and F >= V then we set S = F.
  * Otherwise we should assign S = V, but this may violate
- * the ordering in ER. So, if we have groups in ER, set S to
- * the F_j of the first group j which would be blocking us.
+ * the ordering in EB (see [2]). So, if we have groups in ER, set S
+ * to the F_j of the first group j which would be blocking us.
  * We are guaranteed not to move S backward because
  * otherwise our group i would still be blocked.
  */
-static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
+static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
 {
 	unsigned long mask;
 	u64 limit, roundedF;
-	int slot_shift = cl->grp->slot_shift;
+	int slot_shift = agg->grp->slot_shift;
 
-	roundedF = qfq_round_down(cl->F, slot_shift);
+	roundedF = qfq_round_down(agg->F, slot_shift);
 	limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
 
-	if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) {
+	if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) {
 		/* timestamp was stale */
-		mask = mask_from(q->bitmaps[ER], cl->grp->index);
+		mask = mask_from(q->bitmaps[ER], agg->grp->index);
 		if (mask) {
 			struct qfq_group *next = qfq_ffs(q, mask);
 			if (qfq_gt(roundedF, next->F)) {
-				cl->S = next->F;
+				if (qfq_gt(limit, next->F))
+					agg->S = next->F;
+				else /* cap S at limit (rounded V + one slot) */
+					agg->S = limit;
 				return;
 			}
 		}
-		cl->S = q->V;
+		agg->S = q->V;	/* stale timestamp: restart from V */
 	} else  /* timestamp is not stale */
-		cl->S = cl->F;
+		agg->S = agg->F;	/* resume from the previous finish time */
 }
 
-static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+/* Update the timestamps of agg before scheduling/rescheduling it for
+ * service.  In particular, assign to agg->F its maximum possible
+ * value, i.e., the virtual finish time with which the aggregate
+ * should be labeled if it used all its budget once in service.
+ */
+static inline void
+qfq_update_agg_ts(struct qfq_sched *q,
+		    struct qfq_aggregate *agg, enum update_reason reason)
+{
+	if (reason != requeue)
+		qfq_update_start(q, agg);
+	else /* requeue: the new start time is the previous finish time */
+		agg->S = agg->F;
+	/* F = S + budgetmax * inv_w: finish time if the whole budget is used */
+	agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
+}
+
+static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg);
+
+static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
+	struct qfq_aggregate *in_serv_agg = q->in_serv_agg;
+	struct qfq_class *cl;
+	struct sk_buff *skb = NULL;
+	/* next-packet len, 0 means no more active classes in in-service agg */
+	unsigned int len = 0;
+
+	if (in_serv_agg == NULL)
+		return NULL;
+
+	if (!list_empty(&in_serv_agg->active))
+		skb = qfq_peek_skb(in_serv_agg, &cl, &len);
+
+	/*
+	 * If there are no active classes in the in-service aggregate,
+	 * or if the aggregate has not enough budget to serve its next
+	 * class, then choose the next aggregate to serve.
+	 */
+	if (len == 0 || in_serv_agg->budget < len) {
+		charge_actual_service(in_serv_agg);
+
+		/* recharge the budget of the aggregate */
+		in_serv_agg->initial_budget = in_serv_agg->budget =
+			in_serv_agg->budgetmax;
+
+		if (!list_empty(&in_serv_agg->active)) {
+			/*
+			 * Still active: reschedule for
+			 * service. Possible optimization: if no other
+			 * aggregate is active, then there is no point
+			 * in rescheduling this aggregate, and we can
+			 * just keep it as the in-service one. This
+			 * should however be a corner case, and to
+			 * handle it, we would need to maintain an
+			 * extra num_active_aggs field.
+			 */
+			qfq_update_agg_ts(q, in_serv_agg, requeue);
+			qfq_schedule_agg(q, in_serv_agg);
+		} else if (sch->q.qlen == 0) { /* no aggregate to serve */
+			q->in_serv_agg = NULL;
+			return NULL;
+		}
+
+		/*
+		 * Other aggregates are assumed queued here: choose the next.
+		 * NOTE(review): a NULL result would be dereferenced below.
+		 */
+		in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q);
+		skb = qfq_peek_skb(in_serv_agg, &cl, &len);
+	}
+	if (!skb)
+		return NULL;
+
+	sch->q.qlen--;
+	qdisc_bstats_update(sch, skb);
+
+	agg_dequeue(in_serv_agg, cl, len);
+	/* If lmax is lowered, through qfq_change_class, for a class
+	 * owning pending packets with larger size than the new value
+	 * of lmax, then the following condition may hold.
+	 */
+	if (unlikely(in_serv_agg->budget < len))
+		in_serv_agg->budget = 0;
+	else
+		in_serv_agg->budget -= len;
+	/* advance the system virtual time by len scaled by q->iwsum */
+	q->V += (u64)len * q->iwsum;
+	pr_debug("qfq dequeue: len %u F %lld now %lld\n",
+		 len, (unsigned long long) in_serv_agg->F,
+		 (unsigned long long) q->V);
+
+	return skb;
+}
+
+static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
+{
 	struct qfq_group *grp;
+	struct qfq_aggregate *agg, *new_front_agg;
+	u64 old_F;
+	/* Returns the next aggregate to serve, or NULL if ER is empty. */
+	qfq_update_eligible(q);
+	q->oldV = q->V;	/* baseline for the next qfq_make_eligible() */
+
+	if (!q->bitmaps[ER])
+		return NULL;
+
+	grp = qfq_ffs(q, q->bitmaps[ER]);
+	old_F = grp->F;
+
+	agg = qfq_slot_head(grp);
+
+	/* agg starts to be served, remove it from schedule */
+	qfq_front_slot_remove(grp);
+
+	new_front_agg = qfq_slot_scan(grp);
+
+	if (new_front_agg == NULL) /* group is now inactive, remove from ER */
+		__clear_bit(grp->index, &q->bitmaps[ER]);
+	else {
+		u64 roundedS = qfq_round_down(new_front_agg->S,
+					      grp->slot_shift);
+		unsigned int s;
+
+		if (grp->S == roundedS)
+			return agg; /* group timestamps unchanged */
+		grp->S = roundedS;
+		grp->F = roundedS + (2ULL << grp->slot_shift);
+		__clear_bit(grp->index, &q->bitmaps[ER]);
+		s = qfq_calc_state(q, grp); /* re-file grp under its new state */
+		__set_bit(grp->index, &q->bitmaps[s]);
+	}
+
+	qfq_unblock_groups(q, grp->index, old_F);
+
+	return agg;
+}
+
+static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct qfq_sched *q = qdisc_priv(sch);
 	struct qfq_class *cl;
-	int err;
-	u64 roundedS;
-	int s;
+	struct qfq_aggregate *agg;
+	int err = 0;
 
 	cl = qfq_classify(skb, sch, &err);
 	if (cl == NULL) {
@@ -857,6 +1230,15 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	}
 	pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid);
 
+	if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) {
+		pr_debug("qfq: increasing maxpkt from %u to %u for class %u\n",
+			 cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid);
+		err = qfq_change_agg(sch, cl, cl->agg->class_weight,
+				     qdisc_pkt_len(skb));
+		if (err)
+			return qdisc_drop(skb, sch); /* don't leak skb */
+	}
+
 	err = qdisc_enqueue(skb, cl->qdisc);
 	if (unlikely(err != NET_XMIT_SUCCESS)) {
 		pr_debug("qfq_enqueue: enqueue failed %d\n", err);
@@ -870,21 +1252,44 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	bstats_update(&cl->bstats, skb);
 	++sch->q.qlen;
 
-	/* If the new skb is not the head of queue, then done here. */
-	if (cl->qdisc->q.qlen != 1)
+	agg = cl->agg;
+	/* if the queue was not empty, then done here */
+	if (cl->qdisc->q.qlen != 1) {
+		if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
+		    list_first_entry(&agg->active, struct qfq_class, alist)
+		    == cl && cl->deficit < qdisc_pkt_len(skb))
+			list_move_tail(&cl->alist, &agg->active);
+
 		return err;
+	}
+
+	/* schedule class for service within the aggregate */
+	cl->deficit = agg->lmax;
+	list_add_tail(&cl->alist, &agg->active);
+
+	if (list_first_entry(&agg->active, struct qfq_class, alist) != cl ||
+	    q->in_serv_agg == agg)
+		return err; /* non-empty or in service, nothing else to do */
+
+	qfq_activate_agg(q, agg, enqueue);
+
+	return err;
+}
 
-	/* If reach this point, queue q was idle */
-	grp = cl->grp;
-	qfq_update_start(q, cl);
+/*
+ * Schedule aggregate according to its timestamps.
+ */
+static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+	struct qfq_group *grp = agg->grp;
+	u64 roundedS;
+	int s;
 
-	/* compute new finish time and rounded start. */
-	cl->F = cl->S + (u64)qdisc_pkt_len(skb) * cl->inv_w;
-	roundedS = qfq_round_down(cl->S, grp->slot_shift);
+	roundedS = qfq_round_down(agg->S, grp->slot_shift);
 
 	/*
-	 * insert cl in the correct bucket.
-	 * If cl->S >= grp->S we don't need to adjust the
+	 * Insert agg in the correct bucket.
+	 * If agg->S >= grp->S we don't need to adjust the
 	 * bucket list and simply go to the insertion phase.
 	 * Otherwise grp->S is decreasing, we must make room
 	 * in the bucket list, and also recompute the group state.
@@ -892,15 +1297,16 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	 * was in ER make sure to adjust V.
 	 */
 	if (grp->full_slots) {
-		if (!qfq_gt(grp->S, cl->S))
+		if (!qfq_gt(grp->S, agg->S))
 			goto skip_update;
 
-		/* create a slot for this cl->S */
+		/* create a slot for this agg->S */
 		qfq_slot_rotate(grp, roundedS);
 		/* group was surely ineligible, remove */
 		__clear_bit(grp->index, &q->bitmaps[IR]);
 		__clear_bit(grp->index, &q->bitmaps[IB]);
-	} else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V))
+	} else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) &&
+		   q->in_serv_agg == NULL)
 		q->V = roundedS;
 
 	grp->S = roundedS;
@@ -910,48 +1316,68 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 	pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
 		 s, q->bitmaps[s],
-		 (unsigned long long) cl->S,
-		 (unsigned long long) cl->F,
+		 (unsigned long long) agg->S,
+		 (unsigned long long) agg->F,
 		 (unsigned long long) q->V);
 
 skip_update:
-	qfq_slot_insert(grp, cl, roundedS);
-
-	return err;
+	qfq_slot_insert(grp, agg, roundedS);
 }
 
 
+/* Recharge agg's budget, update its timestamps, then serve or schedule it. */
+static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
+			     enum update_reason reason)
+{
+	agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */
+
+	qfq_update_agg_ts(q, agg, reason);
+	if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */
+		q->in_serv_agg = agg; /* start serving this aggregate */
+		 /* update V: to be in service, agg must be eligible */
+		q->oldV = q->V = agg->S;
+	} else if (agg != q->in_serv_agg)
+		qfq_schedule_agg(q, agg); /* put agg in its group's slot */
+}
+
 static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
-			    struct qfq_class *cl)
+			    struct qfq_aggregate *agg)
 {
 	unsigned int i, offset;
 	u64 roundedS;
 
-	roundedS = qfq_round_down(cl->S, grp->slot_shift);
+	roundedS = qfq_round_down(agg->S, grp->slot_shift);
 	offset = (roundedS - grp->S) >> grp->slot_shift;
+	/* i: bucket array index that lies offset slots past grp->front */
 	i = (grp->front + offset) % QFQ_MAX_SLOTS;
 
-	hlist_del(&cl->next);
+	hlist_del(&agg->next);	/* unlink agg from its bucket */
 	if (hlist_empty(&grp->slots[i]))
 		__clear_bit(offset, &grp->full_slots);
 }
 
 /*
- * called to forcibly destroy a queue.
- * If the queue is not in the front bucket, or if it has
- * other queues in the front bucket, we can simply remove
- * the queue with no other side effects.
+ * Called to forcibly deschedule an aggregate.  If the aggregate is
+ * not in the front bucket, or if the latter has other aggregates in
+ * the front bucket, we can simply remove the aggregate with no other
+ * side effects.
  * Otherwise we must propagate the event up.
  */
-static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
+static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
 {
-	struct qfq_group *grp = cl->grp;
+	struct qfq_group *grp = agg->grp;
 	unsigned long mask;
 	u64 roundedS;
 	int s;
 
-	cl->F = cl->S;
-	qfq_slot_remove(q, grp, cl);
+	if (agg == q->in_serv_agg) {
+		charge_actual_service(agg);
+		q->in_serv_agg = qfq_choose_next_agg(q);
+		return;
+	}
+
+	agg->F = agg->S;
+	qfq_slot_remove(q, grp, agg);
 
 	if (!grp->full_slots) {
 		__clear_bit(grp->index, &q->bitmaps[IR]);
@@ -970,8 +1396,8 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
 		}
 		__clear_bit(grp->index, &q->bitmaps[ER]);
 	} else if (hlist_empty(&grp->slots[grp->front])) {
-		cl = qfq_slot_scan(grp);
-		roundedS = qfq_round_down(cl->S, grp->slot_shift);
+		agg = qfq_slot_scan(grp);
+		roundedS = qfq_round_down(agg->S, grp->slot_shift);
 		if (grp->S != roundedS) {
 			__clear_bit(grp->index, &q->bitmaps[ER]);
 			__clear_bit(grp->index, &q->bitmaps[IR]);
@@ -983,8 +1409,6 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
 			__set_bit(grp->index, &q->bitmaps[s]);
 		}
 	}
-
-	qfq_update_eligible(q, q->V);
 }
 
 static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
@@ -996,6 +1420,31 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
 		qfq_deactivate_class(q, cl);
 }
 
+static unsigned int qfq_drop_from_slot(struct qfq_sched *q,
+				       struct hlist_head *slot)
+{
+	struct qfq_aggregate *agg;
+	struct qfq_class *cl;
+	unsigned int len;
+	/* Drop one packet from the first class in slot able to; 0 if none. */
+	hlist_for_each_entry(agg, slot, next) {
+		list_for_each_entry(cl, &agg->active, alist) {
+			/* leaf qdiscs without a drop op are skipped */
+			if (!cl->qdisc->ops->drop)
+				continue;
+
+			len = cl->qdisc->ops->drop(cl->qdisc);
+			if (len > 0) {
+				if (cl->qdisc->q.qlen == 0)
+					qfq_deactivate_class(q, cl);
+				/* NOTE(review): lists may have changed; stop */
+				return len;
+			}
+		}
+	}
+	return 0;
+}
+
 static unsigned int qfq_drop(struct Qdisc *sch)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
@@ -1005,24 +1454,13 @@ static unsigned int qfq_drop(struct Qdisc *sch)
 	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
 		grp = &q->groups[i];
 		for (j = 0; j < QFQ_MAX_SLOTS; j++) {
-			struct qfq_class *cl;
-			struct hlist_node *n;
-
-			hlist_for_each_entry(cl, n, &grp->slots[j], next) {
-
-				if (!cl->qdisc->ops->drop)
-					continue;
-
-				len = cl->qdisc->ops->drop(cl->qdisc);
-				if (len > 0) {
-					sch->q.qlen--;
-					if (!cl->qdisc->q.qlen)
-						qfq_deactivate_class(q, cl);
-
-					return len;
-				}
+			len = qfq_drop_from_slot(q, &grp->slots[j]);
+			if (len > 0) {
+				sch->q.qlen--;
+				return len;
 			}
 		}
+
 	}
 
 	return 0;
@@ -1033,44 +1471,50 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
 	struct qfq_sched *q = qdisc_priv(sch);
 	struct qfq_group *grp;
 	int i, j, err;
+	u32 max_cl_shift, maxbudg_shift, max_classes;
 
 	err = qdisc_class_hash_init(&q->clhash);
 	if (err < 0)
 		return err;
 
+	if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES)
+		max_classes = QFQ_MAX_AGG_CLASSES;
+	else
+		max_classes = qdisc_dev(sch)->tx_queue_len + 1;
+	/* max_cl_shift = floor(log_2(max_classes)) */
+	max_cl_shift = __fls(max_classes);
+	q->max_agg_classes = 1<<max_cl_shift;
+
+	/* maxbudg_shift = log2(max_len * max_classes_per_agg) */
+	maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift;
+	q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX;
+
 	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
 		grp = &q->groups[i];
 		grp->index = i;
-		grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS
-				   - (QFQ_MAX_INDEX - i);
+		grp->slot_shift = q->min_slot_shift + i;
 		for (j = 0; j < QFQ_MAX_SLOTS; j++)
 			INIT_HLIST_HEAD(&grp->slots[j]);
 	}
 
+	INIT_HLIST_HEAD(&q->nonfull_aggs);
+
 	return 0;
 }
 
 static void qfq_reset_qdisc(struct Qdisc *sch)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
-	struct qfq_group *grp;
 	struct qfq_class *cl;
-	struct hlist_node *n, *tmp;
-	unsigned int i, j;
+	unsigned int i;	/* class hash bucket index */
 
-	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
-		grp = &q->groups[i];
-		for (j = 0; j < QFQ_MAX_SLOTS; j++) {
-			hlist_for_each_entry_safe(cl, n, tmp,
-						  &grp->slots[j], next) {
+	for (i = 0; i < q->clhash.hashsize; i++) {
+		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+			if (cl->qdisc->q.qlen > 0)	/* still backlogged */
 				qfq_deactivate_class(q, cl);
-			}
-		}
-	}
 
-	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
 			qdisc_reset(cl->qdisc);
+		}	/* every class is reset, backlogged or not */
 	}
 	sch->q.qlen = 0;
 }
@@ -1079,13 +1523,13 @@ static void qfq_destroy_qdisc(struct Qdisc *sch)
 {
 	struct qfq_sched *q = qdisc_priv(sch);
 	struct qfq_class *cl;
-	struct hlist_node *n, *next;
+	struct hlist_node *next;
 	unsigned int i;
 
 	tcf_destroy_chain(&q->filter_list);
 
 	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
+		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
 					  common.hnode) {
 			qfq_destroy_class(sch, cl);
 		}
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index a5cc3012cf4..633e32defdc 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -272,8 +272,9 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 	opts = nla_nest_start(skb, TCA_OPTIONS);
 	if (opts == NULL)
 		goto nla_put_failure;
-	NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
-	NLA_PUT_U32(skb, TCA_RED_MAX_P, q->parms.max_P);
+	if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) ||
+	    nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P))
+		goto nla_put_failure;
 	return nla_nest_end(skb, opts);
 
 nla_put_failure:
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index d7eea99333e..9b0f7093d97 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -220,7 +220,7 @@ static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_da
 
 static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)
 {
-	q->bins[slot].perturbation = net_random();
+	q->bins[slot].perturbation = prandom_u32();
 }
 
 static void sfb_swap_slot(struct sfb_sched_data *q)
@@ -381,7 +381,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		goto enqueue;
 	}
 
-	r = net_random() & SFB_MAX_PROB;
+	r = prandom_u32() & SFB_MAX_PROB;
 
 	if (unlikely(r < p_min)) {
 		if (unlikely(p_min > SFB_MAX_PROB / 2)) {
@@ -570,7 +570,10 @@ static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb)
 
 	sch->qstats.backlog = q->qdisc->qstats.backlog;
 	opts = nla_nest_start(skb, TCA_OPTIONS);
-	NLA_PUT(skb, TCA_SFB_PARMS, sizeof(opt), &opt);
+	if (opts == NULL)
+		goto nla_put_failure;
+	if (nla_put(skb, TCA_SFB_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
 	return nla_nest_end(skb, opts);
 
 nla_put_failure:
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 60d47180f04..1af2f73906d 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -237,10 +237,12 @@ static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
 }
 
 #define sfq_unlink(q, x, n, p)			\
-	n = q->slots[x].dep.next;		\
-	p = q->slots[x].dep.prev;		\
-	sfq_dep_head(q, p)->next = n;		\
-	sfq_dep_head(q, n)->prev = p
+	do {	/* one statement: safe in if/else */	\
+		n = q->slots[x].dep.next;	\
+		p = q->slots[x].dep.prev;	\
+		sfq_dep_head(q, p)->next = n;	\
+		sfq_dep_head(q, n)->prev = p;	\
+	} while (0)
 
 
 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
@@ -469,11 +471,15 @@ enqueue:
 	if (slot->qlen == 1) {		/* The flow is new */
 		if (q->tail == NULL) {	/* It is the first flow */
 			slot->next = x;
-			q->tail = slot;
 		} else {
 			slot->next = q->tail->next;
 			q->tail->next = x;
 		}
+		/* We put this flow at the end of our flow list.
+		 * This might sound unfair for a new flow to wait after old ones,
+		 * but we could end up servicing new flows only, and freeze old ones.
+		 */
+		q->tail = slot;
 		/* We could use a bigger initial quantum for new flows */
 		slot->allot = q->scaled_quantum;
 	}
@@ -623,7 +629,7 @@ static void sfq_perturbation(unsigned long arg)
 	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
 
 	spin_lock(root_lock);
-	q->perturbation = net_random();
+	q->perturbation = prandom_u32();
 	if (!q->filter_list && q->tail)
 		sfq_rehash(sch);
 	spin_unlock(root_lock);
@@ -692,7 +698,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 	del_timer(&q->perturb_timer);
 	if (q->perturb_period) {
 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
-		q->perturbation = net_random();
+		q->perturbation = prandom_u32();
 	}
 	sch_tree_unlock(sch);
 	kfree(p);
@@ -710,12 +716,7 @@ static void *sfq_alloc(size_t sz)
 
 static void sfq_free(void *addr)
 {
-	if (addr) {
-		if (is_vmalloc_addr(addr))
-			vfree(addr);
-		else
-			kfree(addr);
-	}
+	kvfree(addr);	/* replaces the open-coded vfree()/kfree() choice */
 }
 
 static void sfq_destroy(struct Qdisc *sch)
@@ -753,7 +754,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->quantum = psched_mtu(qdisc_dev(sch));
 	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 	q->perturb_period = 0;
-	q->perturbation = net_random();
+	q->perturbation = prandom_u32();
 
 	if (opt) {
 		int err = sfq_change(sch, opt);
@@ -808,7 +809,8 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
 	opt.flags	= q->flags;
 
-	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+		goto nla_put_failure;
 
 	return skb->len;
 
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index b8e156319d7..18ff6343370 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -19,6 +19,7 @@
 #include <linux/errno.h>
 #include <linux/skbuff.h>
 #include <net/netlink.h>
+#include <net/sch_generic.h>
 #include <net/pkt_sched.h>
 
 
@@ -100,31 +101,103 @@
 struct tbf_sched_data {
 /* Parameters */
 	u32		limit;		/* Maximal length of backlog: bytes */
-	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
-	u32		mtu;
 	u32		max_size;
-	struct qdisc_rate_table	*R_tab;
-	struct qdisc_rate_table	*P_tab;
+	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
+	s64		mtu;
+	struct psched_ratecfg rate;
+	struct psched_ratecfg peak;
 
 /* Variables */
-	long	tokens;			/* Current number of B tokens */
-	long	ptokens;		/* Current number of P tokens */
-	psched_time_t	t_c;		/* Time check-point */
+	s64	tokens;			/* Current number of B tokens */
+	s64	ptokens;		/* Current number of P tokens */
+	s64	t_c;			/* Time check-point */
 	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
 	struct qdisc_watchdog watchdog;	/* Watchdog timer */
 };
 
-#define L2T(q, L)   qdisc_l2t((q)->R_tab, L)
-#define L2T_P(q, L) qdisc_l2t((q)->P_tab, L)
+
+/* Time to Length, convert time in ns to length in bytes
+ * to determine how many bytes can be sent in the given time.
+ */
+static u64 psched_ns_t2l(const struct psched_ratecfg *r,
+			 u64 time_in_ns)
+{
+	/* The formula is :
+	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
+	 */
+	u64 len = time_in_ns * r->rate_bytes_ps;
+
+	do_div(len, NSEC_PER_SEC);
+
+	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
+		do_div(len, 53);
+		len = len * 48;
+	}
+
+	if (len > r->overhead)
+		len -= r->overhead;
+	else
+		len = 0;
+
+	return len;
+}
+
+/*
+ * Return length of individual segments of a gso packet,
+ * including all headers (MAC, IP, TCP/UDP)
+ */
+static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
+{
+	unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+	return hdr_len + skb_gso_transport_seglen(skb);
+}
+
+/* GSO packet is too big, segment it so that tbf can transmit
+ * each segment in time
+ */
+static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct tbf_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *segs, *nskb;
+	netdev_features_t features = netif_skb_features(skb);
+	int ret, nb;
+
+	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+
+	if (IS_ERR_OR_NULL(segs))
+		return qdisc_reshape_fail(skb, sch);
+
+	nb = 0;
+	while (segs) {
+		nskb = segs->next;
+		segs->next = NULL;
+		qdisc_skb_cb(segs)->pkt_len = segs->len;
+		ret = qdisc_enqueue(segs, q->qdisc);
+		if (ret != NET_XMIT_SUCCESS) {
+			if (net_xmit_drop_count(ret))
+				sch->qstats.drops++;
+		} else {
+			nb++;
+		}
+		segs = nskb;
+	}
+	sch->q.qlen += nb;
+	if (nb > 1)
+		qdisc_tree_decrease_qlen(sch, 1 - nb);
+	consume_skb(skb);
+	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
+}
 
 static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct tbf_sched_data *q = qdisc_priv(sch);
 	int ret;
 
-	if (qdisc_pkt_len(skb) > q->max_size)
+	if (qdisc_pkt_len(skb) > q->max_size) {
+		if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size)
+			return tbf_segment(skb, sch);
 		return qdisc_reshape_fail(skb, sch);
-
+	}
 	ret = qdisc_enqueue(skb, q->qdisc);
 	if (ret != NET_XMIT_SUCCESS) {
 		if (net_xmit_drop_count(ret))
@@ -148,6 +221,11 @@ static unsigned int tbf_drop(struct Qdisc *sch)
 	return len;
 }
 
+static bool tbf_peak_present(const struct tbf_sched_data *q)
+{
+	return q->peak.rate_bytes_ps;
+}
+
 static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
 {
 	struct tbf_sched_data *q = qdisc_priv(sch);
@@ -156,24 +234,24 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
 	skb = q->qdisc->ops->peek(q->qdisc);
 
 	if (skb) {
-		psched_time_t now;
-		long toks;
-		long ptoks = 0;
+		s64 now;
+		s64 toks;
+		s64 ptoks = 0;
 		unsigned int len = qdisc_pkt_len(skb);
 
-		now = psched_get_time();
-		toks = psched_tdiff_bounded(now, q->t_c, q->buffer);
+		now = ktime_to_ns(ktime_get());
+		toks = min_t(s64, now - q->t_c, q->buffer);
 
-		if (q->P_tab) {
+		if (tbf_peak_present(q)) {
 			ptoks = toks + q->ptokens;
-			if (ptoks > (long)q->mtu)
+			if (ptoks > q->mtu)
 				ptoks = q->mtu;
-			ptoks -= L2T_P(q, len);
+			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
 		}
 		toks += q->tokens;
-		if (toks > (long)q->buffer)
+		if (toks > q->buffer)
 			toks = q->buffer;
-		toks -= L2T(q, len);
+		toks -= (s64) psched_l2t_ns(&q->rate, len);
 
 		if ((toks|ptoks) >= 0) {
 			skb = qdisc_dequeue_peeked(q->qdisc);
@@ -189,8 +267,8 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
 			return skb;
 		}
 
-		qdisc_watchdog_schedule(&q->watchdog,
-					now + max_t(long, -toks, -ptoks));
+		qdisc_watchdog_schedule_ns(&q->watchdog,
+					   now + max_t(s64, -toks, -ptoks));
 
 		/* Maybe we have a shorter packet in the queue,
 		   which can be sent now. It sounds cool,
@@ -214,7 +292,7 @@ static void tbf_reset(struct Qdisc *sch)
 
 	qdisc_reset(q->qdisc);
 	sch->q.qlen = 0;
-	q->t_c = psched_get_time();
+	q->t_c = ktime_to_ns(ktime_get());
 	q->tokens = q->buffer;
 	q->ptokens = q->mtu;
 	qdisc_watchdog_cancel(&q->watchdog);
@@ -224,20 +302,26 @@ static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
 	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
 	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
+	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
+	[TCA_TBF_BURST] = { .type = NLA_U32 },
+	[TCA_TBF_PBURST] = { .type = NLA_U32 },
 };
 
 static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
 {
 	int err;
 	struct tbf_sched_data *q = qdisc_priv(sch);
-	struct nlattr *tb[TCA_TBF_PTAB + 1];
+	struct nlattr *tb[TCA_TBF_MAX + 1];
 	struct tc_tbf_qopt *qopt;
-	struct qdisc_rate_table *rtab = NULL;
-	struct qdisc_rate_table *ptab = NULL;
 	struct Qdisc *child = NULL;
-	int max_size, n;
+	struct psched_ratecfg rate;
+	struct psched_ratecfg peak;
+	u64 max_size;
+	s64 buffer, mtu;
+	u64 rate64 = 0, prate64 = 0;
 
-	err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy);
+	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
 	if (err < 0)
 		return err;
 
@@ -246,33 +330,59 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
 		goto done;
 
 	qopt = nla_data(tb[TCA_TBF_PARMS]);
-	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
-	if (rtab == NULL)
-		goto done;
+	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
+		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
+					      tb[TCA_TBF_RTAB]));
+
+	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
+			qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
+						      tb[TCA_TBF_PTAB]));
+
+	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
+	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);
+
+	if (tb[TCA_TBF_RATE64])
+		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
+	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);
+
+	if (tb[TCA_TBF_BURST]) {
+		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
+		buffer = psched_l2t_ns(&rate, max_size);
+	} else {
+		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
+	}
 
 	if (qopt->peakrate.rate) {
-		if (qopt->peakrate.rate > qopt->rate.rate)
-			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
-		if (ptab == NULL)
+		if (tb[TCA_TBF_PRATE64])
+			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
+		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
+		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
+			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
+					peak.rate_bytes_ps, rate.rate_bytes_ps);
+			err = -EINVAL;
 			goto done;
-	}
+		}
 
-	for (n = 0; n < 256; n++)
-		if (rtab->data[n] > qopt->buffer)
-			break;
-	max_size = (n << qopt->rate.cell_log) - 1;
-	if (ptab) {
-		int size;
-
-		for (n = 0; n < 256; n++)
-			if (ptab->data[n] > qopt->mtu)
-				break;
-		size = (n << qopt->peakrate.cell_log) - 1;
-		if (size < max_size)
-			max_size = size;
+		if (tb[TCA_TBF_PBURST]) {
+			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
+			max_size = min_t(u32, max_size, pburst);
+			mtu = psched_l2t_ns(&peak, pburst);
+		} else {
+			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
+		}
+	} else {
+		memset(&peak, 0, sizeof(peak));
 	}
-	if (max_size < 0)
+
+	if (max_size < psched_mtu(qdisc_dev(sch)))
+		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
+				    max_size, qdisc_dev(sch)->name,
+				    psched_mtu(qdisc_dev(sch)));
+
+	if (!max_size) {
+		err = -EINVAL;
 		goto done;
+	}
 
 	if (q->qdisc != &noop_qdisc) {
 		err = fifo_set_limit(q->qdisc, qopt->limit);
@@ -293,22 +403,24 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
 		q->qdisc = child;
 	}
 	q->limit = qopt->limit;
-	q->mtu = qopt->mtu;
+	if (tb[TCA_TBF_PBURST])
+		q->mtu = mtu;
+	else
+		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
 	q->max_size = max_size;
-	q->buffer = qopt->buffer;
+	if (tb[TCA_TBF_BURST])
+		q->buffer = buffer;
+	else
+		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
 	q->tokens = q->buffer;
 	q->ptokens = q->mtu;
 
-	swap(q->R_tab, rtab);
-	swap(q->P_tab, ptab);
+	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
+	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
 
 	sch_tree_unlock(sch);
 	err = 0;
 done:
-	if (rtab)
-		qdisc_put_rtab(rtab);
-	if (ptab)
-		qdisc_put_rtab(ptab);
 	return err;
 }
 
@@ -319,7 +431,7 @@ static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
 	if (opt == NULL)
 		return -EINVAL;
 
-	q->t_c = psched_get_time();
+	q->t_c = ktime_to_ns(ktime_get());
 	qdisc_watchdog_init(&q->watchdog, sch);
 	q->qdisc = &noop_qdisc;
 
@@ -331,12 +443,6 @@ static void tbf_destroy(struct Qdisc *sch)
 	struct tbf_sched_data *q = qdisc_priv(sch);
 
 	qdisc_watchdog_cancel(&q->watchdog);
-
-	if (q->P_tab)
-		qdisc_put_rtab(q->P_tab);
-	if (q->R_tab)
-		qdisc_put_rtab(q->R_tab);
-
 	qdisc_destroy(q->qdisc);
 }
 
@@ -352,17 +458,24 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
 		goto nla_put_failure;
 
 	opt.limit = q->limit;
-	opt.rate = q->R_tab->rate;
-	if (q->P_tab)
-		opt.peakrate = q->P_tab->rate;
+	psched_ratecfg_getrate(&opt.rate, &q->rate);
+	if (tbf_peak_present(q))
+		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
 	else
 		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
-	opt.mtu = q->mtu;
-	opt.buffer = q->buffer;
-	NLA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
+	opt.mtu = PSCHED_NS2TICKS(q->mtu);
+	opt.buffer = PSCHED_NS2TICKS(q->buffer);
+	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
+	    nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps))
+		goto nla_put_failure;
+	if (tbf_peak_present(q) &&
+	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
+	    nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps))
+		goto nla_put_failure;
 
-	nla_nest_end(skb, nest);
-	return skb->len;
+	return nla_nest_end(skb, nest);
 
 nla_put_failure:
 	nla_nest_cancel(skb, nest);
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 45326599fda..47416716294 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -67,7 +67,6 @@ struct teql_master {
 struct teql_sched_data {
 	struct Qdisc *next;
 	struct teql_master *m;
-	struct neighbour *ncache;
 	struct sk_buff_head q;
 };
 
@@ -88,9 +87,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		return NET_XMIT_SUCCESS;
 	}
 
-	kfree_skb(skb);
-	sch->qstats.drops++;
-	return NET_XMIT_DROP;
+	return qdisc_drop(skb, sch);
 }
 
 static struct sk_buff *
@@ -136,7 +133,6 @@ teql_reset(struct Qdisc *sch)
 
 	skb_queue_purge(&dat->q);
 	sch->q.qlen = 0;
-	teql_neigh_release(xchg(&dat->ncache, NULL));
 }
 
 static void
@@ -168,7 +164,6 @@ teql_destroy(struct Qdisc *sch)
 					}
 				}
 				skb_queue_purge(&dat->q);
-				teql_neigh_release(xchg(&dat->ncache, NULL));
 				break;
 			}
 
@@ -227,21 +222,25 @@ static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
 static int
 __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res,
 	       struct net_device *dev, struct netdev_queue *txq,
-	       struct neighbour *mn)
+	       struct dst_entry *dst)
 {
-	struct teql_sched_data *q = qdisc_priv(txq->qdisc);
-	struct neighbour *n = q->ncache;
+	struct neighbour *n;
+	int ret = 0;	/* result; kept distinct from the block-local err */
 
-	if (mn->tbl == NULL)
-		return -EINVAL;
-	if (n && n->tbl == mn->tbl &&
-	    memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) {
-		atomic_inc(&n->refcnt);
-	} else {
-		n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev);
-		if (IS_ERR(n))
-			return PTR_ERR(n);
+	n = dst_neigh_lookup_skb(dst, skb);
+	if (!n)
+		return -ENOENT;
+
+	if (dst->dev != dev) {
+		struct neighbour *mn;
+
+		mn = __neigh_lookup_errno(n->tbl, n->primary_key, dev);
+		neigh_release(n);
+		if (IS_ERR(mn))
+			return PTR_ERR(mn);
+		n = mn;
 	}
+
 	if (neigh_event_send(n, skb_res) == 0) {
 		int err;
 		char haddr[MAX_ADDR_LEN];
@@ -250,15 +249,13 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res,
 		err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr,
 				      NULL, skb->len);
 
-		if (err < 0) {
-			neigh_release(n);
-			return -EINVAL;
-		}
-		teql_neigh_release(xchg(&q->ncache, n));
-		return 0;
+		if (err < 0)
+			ret = -EINVAL;	/* report header-build failure */
+	} else {
+		ret = (skb_res == NULL) ? -EAGAIN : 1;
 	}
 	neigh_release(n);
-	return (skb_res == NULL) ? -EAGAIN : 1;
+	return ret;
 }
 
 static inline int teql_resolve(struct sk_buff *skb,
@@ -267,7 +264,6 @@ static inline int teql_resolve(struct sk_buff *skb,
 			       struct netdev_queue *txq)
 {
 	struct dst_entry *dst = skb_dst(skb);
-	struct neighbour *mn;
 	int res;
 
 	if (txq->qdisc == &noop_qdisc)
@@ -277,8 +273,7 @@ static inline int teql_resolve(struct sk_buff *skb,
 		return 0;
 
 	rcu_read_lock();
-	mn = dst_get_neighbour_noref(dst);
-	res = mn ? __teql_resolve(skb, skb_res, dev, txq, mn) : 0;
+	res = __teql_resolve(skb, skb_res, dev, txq, dst);
 	rcu_read_unlock();
 
 	return res;