Diffstat (limited to 'net/sched')
63 files changed, 20333 insertions, 8300 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 778b1e5a4b5..a1a8e29e5fc 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -2,10 +2,9 @@ # Traffic control configuration. # -menu "QoS and/or fair queueing" - -config NET_SCHED +menuconfig NET_SCHED bool "QoS and/or fair queueing" + select NET_SCH_FIFO ---help--- When the kernel has several packets to send out over a network device, it has to decide which ones to send first, which ones to @@ -25,7 +24,7 @@ config NET_SCHED To administer these schedulers, you'll need the user-level utilities from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>. That package also contains some documentation; for more, check out - <http://linux-net.osdl.org/index.php/Iproute2>. + <http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2>. This Quality of Service (QoS) support will enable you to use Differentiated Services (diffserv) and Resource Reservation Protocol @@ -42,62 +41,6 @@ config NET_SCHED if NET_SCHED -choice - prompt "Packet scheduler clock source" - default NET_SCH_CLK_GETTIMEOFDAY - ---help--- - Packet schedulers need a monotonic clock that increments at a static - rate. The kernel provides several suitable interfaces, each with - different properties: - - - high resolution (us or better) - - fast to read (minimal locking, no i/o access) - - synchronized on all processors - - handles cpu clock frequency changes - - but nothing provides all of the above. - -config NET_SCH_CLK_JIFFIES - bool "Timer interrupt" - ---help--- - Say Y here if you want to use the timer interrupt (jiffies) as clock - source. This clock source is fast, synchronized on all processors and - handles cpu clock frequency changes, but its resolution is too low - for accurate shaping except at very low speed. - -config NET_SCH_CLK_GETTIMEOFDAY - bool "gettimeofday" - ---help--- - Say Y here if you want to use gettimeofday as clock source. This clock - source has high resolution, is synchronized on all processors and - handles cpu clock frequency changes, but it is slow. - - Choose this if you need a high resolution clock source but can't use - the CPU's cycle counter. - -# don't allow on SMP x86 because they can have unsynchronized TSCs. -# gettimeofday is a good alternative -config NET_SCH_CLK_CPU - bool "CPU cycle counter" - depends on ((X86_TSC || X86_64) && !SMP) || ALPHA || SPARC64 || PPC64 || IA64 - ---help--- - Say Y here if you want to use the CPU's cycle counter as clock source. - This is a cheap and high resolution clock source, but on some - architectures it is not synchronized on all processors and doesn't - handle cpu clock frequency changes. - - The useable cycle counters are: - - x86/x86_64 - Timestamp Counter - alpha - Cycle Counter - sparc64 - %ticks register - ppc64 - Time base - ia64 - Interval Time Counter - - Choose this if your CPU's cycle counter is working properly. - -endchoice - comment "Queueing/Scheduling" config NET_SCH_CBQ @@ -149,7 +92,7 @@ config NET_SCH_ATM select classes of this queuing discipline. Each class maps the flow(s) it is handling to a given virtual circuit. - See the top of <file:net/sched/sch_atm.c>) for more details. + See the top of <file:net/sched/sch_atm.c> for more details. To compile this code as a module, choose M here: the module will be called sch_atm. @@ -163,6 +106,15 @@ config NET_SCH_PRIO To compile this code as a module, choose M here: the module will be called sch_prio. 
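The "Queueing/Scheduling" entries above (CBQ, ATM, PRIO and the new schedulers added below) are all packet schedulers built as sch_* modules that plug into the qdisc core. As a rough, illustrative sketch of what such a module looks like, not code from this patch: register_qdisc(), qdisc_enqueue_tail(), qdisc_dequeue_head() and qdisc_peek_head() are existing kernel helpers, the "sketch" names are invented, and the two-argument enqueue() prototype is the one used by the kernel generation this diff targets.

/* Illustrative skeleton of a queueing discipline module of the kind
 * listed above; it degenerates to a FIFO by reusing the generic
 * list helpers.  Configuration and statistics are omitted. */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>

static int sketch_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	/* a real scheduler would classify and reorder here */
	return qdisc_enqueue_tail(skb, sch);
}

static struct sk_buff *sketch_dequeue(struct Qdisc *sch)
{
	return qdisc_dequeue_head(sch);
}

static struct Qdisc_ops sketch_qdisc_ops = {
	.id		= "sketch",
	.priv_size	= 0,
	.enqueue	= sketch_enqueue,
	.dequeue	= sketch_dequeue,
	.peek		= qdisc_peek_head,
	.owner		= THIS_MODULE,
};

static int __init sketch_module_init(void)
{
	return register_qdisc(&sketch_qdisc_ops);
}

static void __exit sketch_module_exit(void)
{
	unregister_qdisc(&sketch_qdisc_ops);
}

module_init(sketch_module_init);
module_exit(sketch_module_exit);
MODULE_LICENSE("GPL");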
+config NET_SCH_MULTIQ + tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)" + ---help--- + Say Y here if you want to use an n-band queue packet scheduler + to support devices that have multiple hardware transmit queues. + + To compile this code as a module, choose M here: the + module will be called sch_multiq. + config NET_SCH_RED tristate "Random Early Detection (RED)" ---help--- @@ -174,11 +126,22 @@ config NET_SCH_RED To compile this code as a module, choose M here: the module will be called sch_red. +config NET_SCH_SFB + tristate "Stochastic Fair Blue (SFB)" + ---help--- + Say Y here if you want to use the Stochastic Fair Blue (SFB) + packet scheduling algorithm. + + See the top of <file:net/sched/sch_sfb.c> for more details. + + To compile this code as a module, choose M here: the + module will be called sch_sfb. + config NET_SCH_SFQ tristate "Stochastic Fairness Queueing (SFQ)" ---help--- Say Y here if you want to use the Stochastic Fairness Queueing (SFQ) - packet scheduling algorithm . + packet scheduling algorithm. See the top of <file:net/sched/sch_sfq.c> for more details. @@ -242,8 +205,112 @@ config NET_SCH_NETEM If unsure, say N. +config NET_SCH_DRR + tristate "Deficit Round Robin scheduler (DRR)" + help + Say Y here if you want to use the Deficit Round Robin (DRR) packet + scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_drr. + + If unsure, say N. + +config NET_SCH_MQPRIO + tristate "Multi-queue priority scheduler (MQPRIO)" + help + Say Y here if you want to use the Multi-queue Priority scheduler. + This scheduler allows QOS to be offloaded on NICs that have support + for offloading QOS schedulers. + + To compile this driver as a module, choose M here: the module will + be called sch_mqprio. + + If unsure, say N. + +config NET_SCH_CHOKE + tristate "CHOose and Keep responsive flow scheduler (CHOKE)" + help + Say Y here if you want to use the CHOKe packet scheduler (CHOose + and Keep for responsive flows, CHOose and Kill for unresponsive + flows). This is a variation of RED which trys to penalize flows + that monopolize the queue. + + To compile this code as a module, choose M here: the + module will be called sch_choke. + +config NET_SCH_QFQ + tristate "Quick Fair Queueing scheduler (QFQ)" + help + Say Y here if you want to use the Quick Fair Queueing Scheduler (QFQ) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_qfq. + + If unsure, say N. + +config NET_SCH_CODEL + tristate "Controlled Delay AQM (CODEL)" + help + Say Y here if you want to use the Controlled Delay (CODEL) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_codel. + + If unsure, say N. + +config NET_SCH_FQ_CODEL + tristate "Fair Queue Controlled Delay AQM (FQ_CODEL)" + help + Say Y here if you want to use the FQ Controlled Delay (FQ_CODEL) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_fq_codel. + + If unsure, say N. + +config NET_SCH_FQ + tristate "Fair Queue" + help + Say Y here if you want to use the FQ packet scheduling algorithm. + + FQ does flow separation, and is able to respect pacing requirements + set by TCP stack into sk->sk_pacing_rate (for localy generated + traffic) + + To compile this driver as a module, choose M here: the module + will be called sch_fq. + + If unsure, say N. 
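NET_SCH_CODEL and NET_SCH_FQ_CODEL above implement the published Controlled Delay algorithm: once the packet sojourn time has stayed above a small target for a full interval, packets are dropped at a rate that grows with the square root of the drop count (the next drop is scheduled interval/sqrt(count) after the previous one). The following stand-alone sketch of that control law uses the algorithm's commonly cited 5 ms target and 100 ms interval; nothing in it is taken from this patch.

/* Illustrative only: the CoDel control law that sch_codel and
 * sch_fq_codel implement.  Times in nanoseconds. */
#include <stdint.h>
#include <stdio.h>
#include <math.h>

#define CODEL_TARGET_NS   (5 * 1000 * 1000ULL)    /* 5 ms standing-queue target */
#define CODEL_INTERVAL_NS (100 * 1000 * 1000ULL)  /* 100 ms initial interval */

/* Next drop time after the count-th consecutive drop:
 * t_next = t_prev + interval / sqrt(count) */
static uint64_t codel_next_drop(uint64_t prev_drop_ns, uint32_t count)
{
	return prev_drop_ns + (uint64_t)(CODEL_INTERVAL_NS / sqrt((double)count));
}

int main(void)
{
	uint64_t t = 0;
	uint32_t count;

	for (count = 1; count <= 4; count++) {
		t = codel_next_drop(t, count);
		printf("drop %u scheduled at %.1f ms\n", count, t / 1e6);
	}
	/* prints 100.0, 170.7, 228.4, 278.4 ms: the drop rate ramps up
	 * as long as the sojourn time stays above the 5 ms target */
	return 0;
}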
+ +config NET_SCH_HHF + tristate "Heavy-Hitter Filter (HHF)" + help + Say Y here if you want to use the Heavy-Hitter Filter (HHF) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_hhf. + +config NET_SCH_PIE + tristate "Proportional Integral controller Enhanced (PIE) scheduler" + help + Say Y here if you want to use the Proportional Integral controller + Enhanced scheduler packet scheduling algorithm. + For more information, please see + http://tools.ietf.org/html/draft-pan-tsvwg-pie-00 + + To compile this driver as a module, choose M here: the module + will be called sch_pie. + + If unsure, say N. + config NET_SCH_INGRESS tristate "Ingress Qdisc" + depends on NET_CLS_ACT ---help--- Say Y here if you want to use classifiers for incoming packets. If unsure, say Y. @@ -251,6 +318,32 @@ config NET_SCH_INGRESS To compile this code as a module, choose M here: the module will be called sch_ingress. +config NET_SCH_PLUG + tristate "Plug network traffic until release (PLUG)" + ---help--- + + This queuing discipline allows userspace to plug/unplug a network + output queue, using the netlink interface. When it receives an + enqueue command it inserts a plug into the outbound queue that + causes following packets to enqueue until a dequeue command arrives + over netlink, causing the plug to be removed and resuming the normal + packet flow. + + This module also provides a generic "network output buffering" + functionality (aka output commit), wherein upon arrival of a dequeue + command, only packets up to the first plug are released for delivery. + The Remus HA project uses this module to enable speculative execution + of virtual machines by allowing the generated network output to be rolled + back if needed. + + For more information, please refer to http://wiki.xensource.com/xenwiki/Remus + + Say Y here if you are using this kernel for Xen dom0 and + want to protect Xen guests with Remus. + + To compile this code as a module, choose M here: the + module will be called sch_plug. + comment "Classification" config NET_CLS @@ -279,7 +372,8 @@ config NET_CLS_TCINDEX config NET_CLS_ROUTE4 tristate "Routing decision (ROUTE)" - select NET_CLS_ROUTE + depends on INET + select IP_ROUTE_CLASSID select NET_CLS ---help--- If you say Y here, you will be able to classify packets @@ -288,9 +382,6 @@ config NET_CLS_ROUTE4 To compile this code as a module, choose M here: the module will be called cls_route. -config NET_CLS_ROUTE - bool - config NET_CLS_FW tristate "Netfilter mark (FW)" select NET_CLS @@ -305,7 +396,7 @@ config NET_CLS_U32 tristate "Universal 32bit comparisons w/ hashing (U32)" select NET_CLS ---help--- - Say Y here to be able to classify packetes using a universal + Say Y here to be able to classify packets using a universal 32bit pieces based comparison scheme. To compile this code as a module, choose M here: the @@ -320,14 +411,13 @@ config CLS_U32_PERF config CLS_U32_MARK bool "Netfilter marks support" - depends on NET_CLS_U32 && NETFILTER + depends on NET_CLS_U32 ---help--- Say Y here to be able to use netfilter marks as u32 key. 
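The classifiers in this section (cls_route, cls_fw, cls_u32 and the new ones added below) all expose a classify() hook through tcf_proto_ops: it inspects the skb, fills in a struct tcf_result and returns a TC_ACT_* verdict, or a negative value to let the next filter in the chain try. A minimal sketch of that hook in the spirit of cls_fw's netfilter-mark matching follows; the function name, the 0x1 mark value and the 1:1 class id are invented for the example, and the prototype is the one used by the kernel generation this diff targets.

/* Illustrative only: the shape of a tcf_proto_ops->classify hook. */
#include <linux/skbuff.h>
#include <net/pkt_cls.h>

static int sketch_classify(struct sk_buff *skb, const struct tcf_proto *tp,
			   struct tcf_result *res)
{
	/* cls_fw-style idea: use the netfilter mark as the lookup key */
	if (skb->mark == 0x1) {
		res->class = 0;
		res->classid = TC_H_MAKE(1 << 16, 1);	/* class 1:1, made up */
		return TC_ACT_OK;	/* matched; res tells the qdisc which class */
	}
	return -1;	/* no match, fall through to the next classifier */
}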
config NET_CLS_RSVP tristate "IPv4 Resource Reservation Protocol (RSVP)" select NET_CLS - select NET_ESTIMATOR ---help--- The Resource Reservation Protocol (RSVP) permits end systems to request a minimum and maximum data flow rate for a connection; this @@ -342,18 +432,50 @@ config NET_CLS_RSVP config NET_CLS_RSVP6 tristate "IPv6 Resource Reservation Protocol (RSVP6)" select NET_CLS - select NET_ESTIMATOR ---help--- The Resource Reservation Protocol (RSVP) permits end systems to request a minimum and maximum data flow rate for a connection; this is important for real time data such as streaming sound or video. Say Y here if you want to be able to classify outgoing packets based - on their RSVP requests and you are using the IPv6. + on their RSVP requests and you are using the IPv6 protocol. To compile this code as a module, choose M here: the module will be called cls_rsvp6. +config NET_CLS_FLOW + tristate "Flow classifier" + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets based on + a configurable combination of packet keys. This is mostly useful + in combination with SFQ. + + To compile this code as a module, choose M here: the + module will be called cls_flow. + +config NET_CLS_CGROUP + tristate "Control Group Classifier" + select NET_CLS + select CGROUP_NET_CLASSID + depends on CGROUPS + ---help--- + Say Y here if you want to classify packets based on the control + cgroup of their process. + + To compile this code as a module, choose M here: the + module will be called cls_cgroup. + +config NET_CLS_BPF + tristate "BPF-based classifier" + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets based on + programmable BPF (JIT'ed) filters as an alternative to ematches. + + To compile this code as a module, choose M here: the module will + be called cls_bpf. + config NET_EMATCH bool "Extended Matches" select NET_CLS @@ -432,10 +554,28 @@ config NET_EMATCH_TEXT To compile this code as a module, choose M here: the module will be called em_text. +config NET_EMATCH_CANID + tristate "CAN Identifier" + depends on NET_EMATCH && (CAN=y || CAN=m) + ---help--- + Say Y here if you want to be able to classify CAN frames based + on CAN Identifier. + + To compile this code as a module, choose M here: the + module will be called em_canid. + +config NET_EMATCH_IPSET + tristate "IPset" + depends on NET_EMATCH && IP_SET + ---help--- + Say Y here if you want to be able to classify packets based on + ipset membership. + + To compile this code as a module, choose M here: the + module will be called em_ipset. + config NET_CLS_ACT bool "Actions" - depends on EXPERIMENTAL - select NET_ESTIMATOR ---help--- Say Y here if you want to use traffic control actions. Actions get attached to classifiers and are invoked after a successful @@ -454,7 +594,7 @@ config NET_ACT_POLICE module. To compile this code as a module, choose M here: the - module will be called police. + module will be called act_police. config NET_ACT_GACT tristate "Generic actions" @@ -464,7 +604,7 @@ config NET_ACT_GACT accepting packets. To compile this code as a module, choose M here: the - module will be called gact. + module will be called act_gact. config GACT_PROB bool "Probability support" @@ -480,17 +620,27 @@ config NET_ACT_MIRRED other devices. To compile this code as a module, choose M here: the - module will be called mirred. + module will be called act_mirred. 
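The act_* modules in this section hook into the action framework that the act_api.c changes further down rework: an action supplies a struct tc_action_ops with at least .act, .init and .dump (the new tcf_register_action() rejects anything less and fills in default .lookup and .walk handlers) and registers it together with a hash-table mask. A do-nothing sketch of that pattern follows; "sketch", the type number and SKETCH_TAB_MASK are invented for the example, while the prototypes mirror the act_api.c and act_csum.c code later in this diff.

/* Illustrative only: the registration pattern the act_* modules above
 * follow against the reworked tcf_register_action(ops, mask). */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/act_api.h>

#define SKETCH_TAB_MASK 15	/* hash-table mask, as act_csum uses */

static int tcf_sketch_act(struct sk_buff *skb, const struct tc_action *a,
			  struct tcf_result *res)
{
	return TC_ACT_OK;	/* accept every packet it is attached to */
}

static int tcf_sketch_init(struct net *net, struct nlattr *nla,
			   struct nlattr *est, struct tc_action *a,
			   int ovr, int bind)
{
	return 0;		/* a real action parses nla and sets up a->priv */
}

static int tcf_sketch_dump(struct sk_buff *skb, struct tc_action *a,
			   int bind, int ref)
{
	return skb->len;	/* a real action emits its TCA_*_PARMS here */
}

static struct tc_action_ops act_sketch_ops = {
	.kind	= "sketch",
	.type	= 99,		/* made up; real actions use their TCA_ID/TCA_ACT_* number */
	.owner	= THIS_MODULE,
	.act	= tcf_sketch_act,
	.dump	= tcf_sketch_dump,
	.init	= tcf_sketch_init,
};

static int __init sketch_init_module(void)
{
	return tcf_register_action(&act_sketch_ops, SKETCH_TAB_MASK);
}

static void __exit sketch_cleanup_module(void)
{
	tcf_unregister_action(&act_sketch_ops);
}

module_init(sketch_init_module);
module_exit(sketch_cleanup_module);
MODULE_LICENSE("GPL");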
config NET_ACT_IPT tristate "IPtables targets" depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES ---help--- - Say Y here to be able to invoke iptables targets after succesful + Say Y here to be able to invoke iptables targets after successful classification. To compile this code as a module, choose M here: the - module will be called ipt. + module will be called act_ipt. + +config NET_ACT_NAT + tristate "Stateless NAT" + depends on NET_CLS_ACT + ---help--- + Say Y here to do stateless NAT on IPv4 packets. You should use + netfilter for NAT unless you know what you are doing. + + To compile this code as a module, choose M here: the + module will be called act_nat. config NET_ACT_PEDIT tristate "Packet Editing" @@ -499,7 +649,7 @@ config NET_ACT_PEDIT Say Y here if you want to mangle the content of packets. To compile this code as a module, choose M here: the - module will be called pedit. + module will be called act_pedit. config NET_ACT_SIMP tristate "Simple Example (Debug)" @@ -513,17 +663,28 @@ config NET_ACT_SIMP If unsure, say N. To compile this code as a module, choose M here: the - module will be called simple. + module will be called act_simple. -config NET_CLS_POLICE - bool "Traffic Policing (obsolete)" - depends on NET_CLS_ACT!=y - select NET_ESTIMATOR - ---help--- - Say Y here if you want to do traffic policing, i.e. strict - bandwidth limiting. This option is obsoleted by the traffic - policer implemented as action, it stays here for compatibility - reasons. +config NET_ACT_SKBEDIT + tristate "SKB Editing" + depends on NET_CLS_ACT + ---help--- + Say Y here to change skb priority or queue_mapping settings. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_skbedit. + +config NET_ACT_CSUM + tristate "Checksum Updating" + depends on NET_CLS_ACT && INET + ---help--- + Say Y here to update some common checksum after some direct + packet alterations. + + To compile this code as a module, choose M here: the + module will be called act_csum. config NET_CLS_IND bool "Incoming device classification" @@ -533,14 +694,7 @@ config NET_CLS_IND classification based on the incoming device. This option is likely to disappear in favour of the metadata ematch. -config NET_ESTIMATOR - bool "Rate estimator" - ---help--- - Say Y here to allow using rate estimators to estimate the current - rate-of-flow for network devices, queues, etc. This module is - automaticaly selected if needed but can be selected manually for - statstical purposes. - endif # NET_SCHED -endmenu +config NET_SCH_FIFO + bool diff --git a/net/sched/Makefile b/net/sched/Makefile index 0f06aec6609..0a869a11f3e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -2,32 +2,47 @@ # Makefile for the Linux Traffic Control Unit. 
# -obj-y := sch_generic.o +obj-y := sch_generic.o sch_mq.o -obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o sch_blackhole.o +obj-$(CONFIG_NET_SCHED) += sch_api.o sch_blackhole.o obj-$(CONFIG_NET_CLS) += cls_api.o obj-$(CONFIG_NET_CLS_ACT) += act_api.o obj-$(CONFIG_NET_ACT_POLICE) += act_police.o -obj-$(CONFIG_NET_CLS_POLICE) += act_police.o obj-$(CONFIG_NET_ACT_GACT) += act_gact.o obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o +obj-$(CONFIG_NET_ACT_NAT) += act_nat.o obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o +obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o +obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o +obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o -obj-$(CONFIG_NET_SCH_HPFQ) += sch_hpfq.o obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o obj-$(CONFIG_NET_SCH_RED) += sch_red.o obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o +obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o +obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o +obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o +obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o +obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o +obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o +obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o +obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o +obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o +obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o +obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o +obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o + obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o obj-$(CONFIG_NET_CLS_FW) += cls_fw.o @@ -35,9 +50,14 @@ obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o +obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o +obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o +obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o obj-$(CONFIG_NET_EMATCH) += ematch.o obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o obj-$(CONFIG_NET_EMATCH_META) += em_meta.o obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o +obj-$(CONFIG_NET_EMATCH_CANID) += em_canid.o +obj-$(CONFIG_NET_EMATCH_IPSET) += em_ipset.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 792ce59940e..648778aef1a 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -11,204 +11,421 @@ * */ -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> +#include <linux/slab.h> #include <linux/skbuff.h> -#include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/kmod.h> +#include <linux/err.h> +#include <linux/module.h> +#include <net/net_namespace.h> #include <net/sock.h> #include <net/sch_generic.h> #include <net/act_api.h> +#include <net/netlink.h> -#if 0 /* control */ -#define DPRINTK(format, args...) 
printk(KERN_DEBUG format, ##args) -#else -#define DPRINTK(format, args...) -#endif -#if 0 /* data */ -#define D2PRINTK(format, args...) printk(KERN_DEBUG format, ##args) -#else -#define D2PRINTK(format, args...) -#endif +void tcf_hash_destroy(struct tc_action *a) +{ + struct tcf_common *p = a->priv; + struct tcf_hashinfo *hinfo = a->ops->hinfo; + + spin_lock_bh(&hinfo->lock); + hlist_del(&p->tcfc_head); + spin_unlock_bh(&hinfo->lock); + gen_kill_estimator(&p->tcfc_bstats, + &p->tcfc_rate_est); + /* + * gen_estimator est_timer() might access p->tcfc_lock + * or bstats, wait a RCU grace period before freeing p + */ + kfree_rcu(p, tcfc_rcu); +} +EXPORT_SYMBOL(tcf_hash_destroy); + +int tcf_hash_release(struct tc_action *a, int bind) +{ + struct tcf_common *p = a->priv; + int ret = 0; + + if (p) { + if (bind) + p->tcfc_bindcnt--; + else if (p->tcfc_bindcnt > 0) + return -EPERM; + + p->tcfc_refcnt--; + if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) { + if (a->ops->cleanup) + a->ops->cleanup(a, bind); + tcf_hash_destroy(a); + ret = 1; + } + } + return ret; +} +EXPORT_SYMBOL(tcf_hash_release); + +static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, + struct tc_action *a) +{ + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct hlist_head *head; + struct tcf_common *p; + int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; + struct nlattr *nest; + + spin_lock_bh(&hinfo->lock); + + s_i = cb->args[0]; + + for (i = 0; i < (hinfo->hmask + 1); i++) { + head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; + + hlist_for_each_entry_rcu(p, head, tcfc_head) { + index++; + if (index < s_i) + continue; + a->priv = p; + a->order = n_i; + + nest = nla_nest_start(skb, a->order); + if (nest == NULL) + goto nla_put_failure; + err = tcf_action_dump_1(skb, a, 0, 0); + if (err < 0) { + index--; + nlmsg_trim(skb, nest); + goto done; + } + nla_nest_end(skb, nest); + n_i++; + if (n_i >= TCA_ACT_MAX_PRIO) + goto done; + } + } +done: + spin_unlock_bh(&hinfo->lock); + if (n_i) + cb->args[0] += n_i; + return n_i; + +nla_put_failure: + nla_nest_cancel(skb, nest); + goto done; +} + +static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a) +{ + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct hlist_head *head; + struct hlist_node *n; + struct tcf_common *p; + struct nlattr *nest; + int i = 0, n_i = 0; + int ret = -EINVAL; + + nest = nla_nest_start(skb, a->order); + if (nest == NULL) + goto nla_put_failure; + if (nla_put_string(skb, TCA_KIND, a->ops->kind)) + goto nla_put_failure; + for (i = 0; i < (hinfo->hmask + 1); i++) { + head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; + hlist_for_each_entry_safe(p, n, head, tcfc_head) { + a->priv = p; + ret = tcf_hash_release(a, 0); + if (ret == ACT_P_DELETED) { + module_put(a->ops->owner); + n_i++; + } else if (ret < 0) + goto nla_put_failure; + } + } + if (nla_put_u32(skb, TCA_FCNT, n_i)) + goto nla_put_failure; + nla_nest_end(skb, nest); + + return n_i; +nla_put_failure: + nla_nest_cancel(skb, nest); + return ret; +} + +static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, + int type, struct tc_action *a) +{ + if (type == RTM_DELACTION) { + return tcf_del_walker(skb, a); + } else if (type == RTM_GETACTION) { + return tcf_dump_walker(skb, cb, a); + } else { + WARN(1, "tcf_generic_walker: unknown action %d\n", type); + return -EINVAL; + } +} + +static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) +{ + struct tcf_common *p = NULL; + struct hlist_head *head; + + spin_lock_bh(&hinfo->lock); + head = 
&hinfo->htab[tcf_hash(index, hinfo->hmask)]; + hlist_for_each_entry_rcu(p, head, tcfc_head) + if (p->tcfc_index == index) + break; + spin_unlock_bh(&hinfo->lock); + + return p; +} + +u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo) +{ + u32 val = hinfo->index; + + do { + if (++val == 0) + val = 1; + } while (tcf_hash_lookup(val, hinfo)); + + hinfo->index = val; + return val; +} +EXPORT_SYMBOL(tcf_hash_new_index); + +int tcf_hash_search(struct tc_action *a, u32 index) +{ + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tcf_common *p = tcf_hash_lookup(index, hinfo); + + if (p) { + a->priv = p; + return 1; + } + return 0; +} +EXPORT_SYMBOL(tcf_hash_search); + +int tcf_hash_check(u32 index, struct tc_action *a, int bind) +{ + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tcf_common *p = NULL; + if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) { + if (bind) + p->tcfc_bindcnt++; + p->tcfc_refcnt++; + a->priv = p; + return 1; + } + return 0; +} +EXPORT_SYMBOL(tcf_hash_check); + +void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) +{ + struct tcf_common *pc = a->priv; + if (est) + gen_kill_estimator(&pc->tcfc_bstats, + &pc->tcfc_rate_est); + kfree_rcu(pc, tcfc_rcu); +} +EXPORT_SYMBOL(tcf_hash_cleanup); + +int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, + int size, int bind) +{ + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tcf_common *p = kzalloc(size, GFP_KERNEL); + + if (unlikely(!p)) + return -ENOMEM; + p->tcfc_refcnt = 1; + if (bind) + p->tcfc_bindcnt = 1; + + spin_lock_init(&p->tcfc_lock); + INIT_HLIST_NODE(&p->tcfc_head); + p->tcfc_index = index ? index : tcf_hash_new_index(hinfo); + p->tcfc_tm.install = jiffies; + p->tcfc_tm.lastuse = jiffies; + if (est) { + int err = gen_new_estimator(&p->tcfc_bstats, &p->tcfc_rate_est, + &p->tcfc_lock, est); + if (err) { + kfree(p); + return err; + } + } -static struct tc_action_ops *act_base = NULL; + a->priv = (void *) p; + return 0; +} +EXPORT_SYMBOL(tcf_hash_create); + +void tcf_hash_insert(struct tc_action *a) +{ + struct tcf_common *p = a->priv; + struct tcf_hashinfo *hinfo = a->ops->hinfo; + unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); + + spin_lock_bh(&hinfo->lock); + hlist_add_head(&p->tcfc_head, &hinfo->htab[h]); + spin_unlock_bh(&hinfo->lock); +} +EXPORT_SYMBOL(tcf_hash_insert); + +static LIST_HEAD(act_base); static DEFINE_RWLOCK(act_mod_lock); -int tcf_register_action(struct tc_action_ops *act) +int tcf_register_action(struct tc_action_ops *act, unsigned int mask) { - struct tc_action_ops *a, **ap; + struct tc_action_ops *a; + int err; + + /* Must supply act, dump and init */ + if (!act->act || !act->dump || !act->init) + return -EINVAL; + + /* Supply defaults */ + if (!act->lookup) + act->lookup = tcf_hash_search; + if (!act->walk) + act->walk = tcf_generic_walker; + + act->hinfo = kmalloc(sizeof(struct tcf_hashinfo), GFP_KERNEL); + if (!act->hinfo) + return -ENOMEM; + err = tcf_hashinfo_init(act->hinfo, mask); + if (err) { + kfree(act->hinfo); + return err; + } write_lock(&act_mod_lock); - for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) { + list_for_each_entry(a, &act_base, head) { if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { write_unlock(&act_mod_lock); + tcf_hashinfo_destroy(act->hinfo); + kfree(act->hinfo); return -EEXIST; } } - act->next = NULL; - *ap = act; + list_add_tail(&act->head, &act_base); write_unlock(&act_mod_lock); return 0; } +EXPORT_SYMBOL(tcf_register_action); int tcf_unregister_action(struct tc_action_ops *act) { - 
struct tc_action_ops *a, **ap; + struct tc_action_ops *a; int err = -ENOENT; write_lock(&act_mod_lock); - for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) - if (a == act) + list_for_each_entry(a, &act_base, head) { + if (a == act) { + list_del(&act->head); + tcf_hashinfo_destroy(act->hinfo); + kfree(act->hinfo); + err = 0; break; - if (a) { - *ap = a->next; - a->next = NULL; - err = 0; + } } write_unlock(&act_mod_lock); return err; } +EXPORT_SYMBOL(tcf_unregister_action); /* lookup by name */ static struct tc_action_ops *tc_lookup_action_n(char *kind) { - struct tc_action_ops *a = NULL; + struct tc_action_ops *a, *res = NULL; if (kind) { read_lock(&act_mod_lock); - for (a = act_base; a; a = a->next) { + list_for_each_entry(a, &act_base, head) { if (strcmp(kind, a->kind) == 0) { - if (!try_module_get(a->owner)) { - read_unlock(&act_mod_lock); - return NULL; - } + if (try_module_get(a->owner)) + res = a; break; } } read_unlock(&act_mod_lock); } - return a; + return res; } -/* lookup by rtattr */ -static struct tc_action_ops *tc_lookup_action(struct rtattr *kind) +/* lookup by nlattr */ +static struct tc_action_ops *tc_lookup_action(struct nlattr *kind) { - struct tc_action_ops *a = NULL; + struct tc_action_ops *a, *res = NULL; if (kind) { read_lock(&act_mod_lock); - for (a = act_base; a; a = a->next) { - if (rtattr_strcmp(kind, a->kind) == 0) { - if (!try_module_get(a->owner)) { - read_unlock(&act_mod_lock); - return NULL; - } - break; - } - } - read_unlock(&act_mod_lock); - } - return a; -} - -#if 0 -/* lookup by id */ -static struct tc_action_ops *tc_lookup_action_id(u32 type) -{ - struct tc_action_ops *a = NULL; - - if (type) { - read_lock(&act_mod_lock); - for (a = act_base; a; a = a->next) { - if (a->type == type) { - if (!try_module_get(a->owner)) { - read_unlock(&act_mod_lock); - return NULL; - } + list_for_each_entry(a, &act_base, head) { + if (nla_strcmp(kind, a->kind) == 0) { + if (try_module_get(a->owner)) + res = a; break; } } read_unlock(&act_mod_lock); } - return a; + return res; } -#endif -int tcf_action_exec(struct sk_buff *skb, struct tc_action *act, - struct tcf_result *res) +int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions, + struct tcf_result *res) { - struct tc_action *a; + const struct tc_action *a; int ret = -1; if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); - D2PRINTK("(%p)tcf_action_exec: cleared TC_NCLS in %s out %s\n", - skb, skb->input_dev ? 
skb->input_dev->name : "xxx", - skb->dev->name); ret = TC_ACT_OK; goto exec_done; } - while ((a = act) != NULL) { + list_for_each_entry(a, actions, list) { repeat: - if (a->ops && a->ops->act) { - ret = a->ops->act(skb, a, res); - if (TC_MUNGED & skb->tc_verd) { - /* copied already, allow trampling */ - skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); - skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); - } - if (ret == TC_ACT_REPEAT) - goto repeat; /* we need a ttl - JHS */ - if (ret != TC_ACT_PIPE) - goto exec_done; + ret = a->ops->act(skb, a, res); + if (TC_MUNGED & skb->tc_verd) { + /* copied already, allow trampling */ + skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); + skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); } - act = a->next; + if (ret == TC_ACT_REPEAT) + goto repeat; /* we need a ttl - JHS */ + if (ret != TC_ACT_PIPE) + goto exec_done; } exec_done: return ret; } +EXPORT_SYMBOL(tcf_action_exec); -void tcf_action_destroy(struct tc_action *act, int bind) +int tcf_action_destroy(struct list_head *actions, int bind) { - struct tc_action *a; + struct tc_action *a, *tmp; + int ret = 0; - for (a = act; a; a = act) { - if (a->ops && a->ops->cleanup) { - DPRINTK("tcf_action_destroy destroying %p next %p\n", - a, a->next); - if (a->ops->cleanup(a, bind) == ACT_P_DELETED) - module_put(a->ops->owner); - act = act->next; - kfree(a); - } else { /*FIXME: Remove later - catch insertion bugs*/ - printk("tcf_action_destroy: BUG? destroying NULL ops\n"); - act = act->next; - kfree(a); - } + list_for_each_entry_safe(a, tmp, actions, list) { + ret = tcf_hash_release(a, bind); + if (ret == ACT_P_DELETED) + module_put(a->ops->owner); + else if (ret < 0) + return ret; + list_del(&a->list); + kfree(a); } + return ret; } int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - int err = -EINVAL; - - if (a->ops == NULL || a->ops->dump == NULL) - return err; return a->ops->dump(skb, a, bind, ref); } @@ -216,79 +433,84 @@ int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { int err = -EINVAL; - unsigned char *b = skb->tail; - struct rtattr *r; - - if (a->ops == NULL || a->ops->dump == NULL) - return err; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; - RTA_PUT(skb, TCA_KIND, IFNAMSIZ, a->ops->kind); + if (nla_put_string(skb, TCA_KIND, a->ops->kind)) + goto nla_put_failure; if (tcf_action_copy_stats(skb, a, 0)) - goto rtattr_failure; - r = (struct rtattr*) skb->tail; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); - if ((err = tcf_action_dump_old(skb, a, bind, ref)) > 0) { - r->rta_len = skb->tail - (u8*)r; + goto nla_put_failure; + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + err = tcf_action_dump_old(skb, a, bind, ref); + if (err > 0) { + nla_nest_end(skb, nest); return err; } -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } +EXPORT_SYMBOL(tcf_action_dump_1); int -tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) +tcf_action_dump(struct sk_buff *skb, struct list_head *actions, int bind, int ref) { struct tc_action *a; int err = -EINVAL; - unsigned char *b = skb->tail; - struct rtattr *r ; + struct nlattr *nest; - while ((a = act) != NULL) { - r = (struct rtattr*) skb->tail; - act = a->next; - RTA_PUT(skb, a->order, 0, NULL); + list_for_each_entry(a, actions, list) { + nest = nla_nest_start(skb, a->order); + if (nest == NULL) + goto nla_put_failure; err = tcf_action_dump_1(skb, a, bind, ref); if (err < 0) - goto 
rtattr_failure; - r->rta_len = skb->tail - (u8*)r; + goto errout; + nla_nest_end(skb, nest); } return 0; -rtattr_failure: - skb_trim(skb, b - skb->data); - return -err; +nla_put_failure: + err = -EINVAL; +errout: + nla_nest_cancel(skb, nest); + return err; } -struct tc_action *tcf_action_init_1(struct rtattr *rta, struct rtattr *est, - char *name, int ovr, int bind, int *err) +struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, + struct nlattr *est, char *name, int ovr, + int bind) { struct tc_action *a; struct tc_action_ops *a_o; char act_name[IFNAMSIZ]; - struct rtattr *tb[TCA_ACT_MAX+1]; - struct rtattr *kind; - - *err = -EINVAL; + struct nlattr *tb[TCA_ACT_MAX + 1]; + struct nlattr *kind; + int err; if (name == NULL) { - if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL); + if (err < 0) goto err_out; - kind = tb[TCA_ACT_KIND-1]; + err = -EINVAL; + kind = tb[TCA_ACT_KIND]; if (kind == NULL) goto err_out; - if (rtattr_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) + if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) goto err_out; } else { + err = -EINVAL; if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) goto err_out; } a_o = tc_lookup_action_n(act_name); if (a_o == NULL) { -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES rtnl_unlock(); request_module("act_%s", act_name); rtnl_lock(); @@ -302,37 +524,36 @@ struct tc_action *tcf_action_init_1(struct rtattr *rta, struct rtattr *est, * indicate this using -EAGAIN. */ if (a_o != NULL) { - *err = -EAGAIN; + err = -EAGAIN; goto err_mod; } #endif + err = -ENOENT; goto err_out; } - *err = -ENOMEM; - a = kmalloc(sizeof(*a), GFP_KERNEL); + err = -ENOMEM; + a = kzalloc(sizeof(*a), GFP_KERNEL); if (a == NULL) goto err_mod; - memset(a, 0, sizeof(*a)); + a->ops = a_o; + INIT_LIST_HEAD(&a->list); /* backward compatibility for policer */ if (name == NULL) - *err = a_o->init(tb[TCA_ACT_OPTIONS-1], est, a, ovr, bind); + err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, a, ovr, bind); else - *err = a_o->init(rta, est, a, ovr, bind); - if (*err < 0) + err = a_o->init(net, nla, est, a, ovr, bind); + if (err < 0) goto err_free; /* module count goes up only when brand new policy is created - if it exists and is only bound to in a_o->init() then - ACT_P_CREATED is not returned (a zero is). - */ - if (*err != ACT_P_CREATED) + * if it exists and is only bound to in a_o->init() then + * ACT_P_CREATED is not returned (a zero is). 
+ */ + if (err != ACT_P_CREATED) module_put(a_o->owner); - a->ops = a_o; - DPRINTK("tcf_action_init_1: successfull %s\n", act_name); - *err = 0; return a; err_free: @@ -340,39 +561,36 @@ err_free: err_mod: module_put(a_o->owner); err_out: - return NULL; + return ERR_PTR(err); } -struct tc_action *tcf_action_init(struct rtattr *rta, struct rtattr *est, - char *name, int ovr, int bind, int *err) +int tcf_action_init(struct net *net, struct nlattr *nla, + struct nlattr *est, char *name, int ovr, + int bind, struct list_head *actions) { - struct rtattr *tb[TCA_ACT_MAX_PRIO+1]; - struct tc_action *head = NULL, *act, *act_prev = NULL; + struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + struct tc_action *act; + int err; int i; - if (rtattr_parse_nested(tb, TCA_ACT_MAX_PRIO, rta) < 0) { - *err = -EINVAL; - return head; - } + err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); + if (err < 0) + return err; - for (i=0; i < TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_init_1(tb[i], est, name, ovr, bind, err); - if (act == NULL) + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { + act = tcf_action_init_1(net, tb[i], est, name, ovr, bind); + if (IS_ERR(act)) { + err = PTR_ERR(act); goto err; - act->order = i+1; - - if (head == NULL) - head = act; - else - act_prev->next = act; - act_prev = act; + } + act->order = i; + list_add_tail(&act->list, actions); } - return head; + return 0; err: - if (head != NULL) - tcf_action_destroy(head, bind); - return NULL; + tcf_action_destroy(actions, bind); + return err; } int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, @@ -380,36 +598,31 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, { int err = 0; struct gnet_dump d; - struct tcf_act_hdr *h = a->priv; - - if (h == NULL) + struct tcf_common *p = a->priv; + + if (p == NULL) goto errout; /* compat_mode being true specifies a call that is supposed - * to add additional backward compatiblity statistic TLVs. + * to add additional backward compatibility statistic TLVs. 
*/ if (compat_mode) { if (a->type == TCA_OLD_COMPAT) err = gnet_stats_start_copy_compat(skb, 0, - TCA_STATS, TCA_XSTATS, h->stats_lock, &d); + TCA_STATS, TCA_XSTATS, &p->tcfc_lock, &d); else return 0; } else err = gnet_stats_start_copy(skb, TCA_ACT_STATS, - h->stats_lock, &d); + &p->tcfc_lock, &d); if (err < 0) goto errout; - if (a->ops != NULL && a->ops->get_stats != NULL) - if (a->ops->get_stats(skb, a) < 0) - goto errout; - - if (gnet_stats_copy_basic(&d, &h->bstats) < 0 || -#ifdef CONFIG_NET_ESTIMATOR - gnet_stats_copy_rate_est(&d, &h->rate_est) < 0 || -#endif - gnet_stats_copy_queue(&d, &h->qstats) < 0) + if (gnet_stats_copy_basic(&d, &p->tcfc_bstats) < 0 || + gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, + &p->tcfc_rate_est) < 0 || + gnet_stats_copy_queue(&d, &p->tcfc_qstats) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) @@ -422,353 +635,354 @@ errout: } static int -tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, - u16 flags, int event, int bind, int ref) +tca_get_fill(struct sk_buff *skb, struct list_head *actions, u32 portid, u32 seq, + u16 flags, int event, int bind, int ref) { struct tcamsg *t; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; - struct rtattr *x; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; - nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); - - t = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*t), flags); + if (!nlh) + goto out_nlmsg_trim; + t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; - - x = (struct rtattr*) skb->tail; - RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); - if (tcf_action_dump(skb, a, bind, ref) < 0) - goto rtattr_failure; + nest = nla_nest_start(skb, TCA_ACT_TAB); + if (nest == NULL) + goto out_nlmsg_trim; + + if (tcf_action_dump(skb, actions, bind, ref) < 0) + goto out_nlmsg_trim; - x->rta_len = skb->tail - (u8*)x; - - nlh->nlmsg_len = skb->tail - b; + nla_nest_end(skb, nest); + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; -rtattr_failure: -nlmsg_failure: - skb_trim(skb, b - skb->data); +out_nlmsg_trim: + nlmsg_trim(skb, b); return -1; } static int -act_get_notify(u32 pid, struct nlmsghdr *n, struct tc_action *a, int event) +act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, + struct list_head *actions, int event) { struct sk_buff *skb; - int err = 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (tca_get_fill(skb, a, pid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { + if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { kfree_skb(skb); return -EINVAL; } - err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); - if (err > 0) - err = 0; - return err; + + return rtnl_unicast(skb, net, portid); +} + +static struct tc_action *create_a(int i) +{ + struct tc_action *act; + + act = kzalloc(sizeof(*act), GFP_KERNEL); + if (act == NULL) { + pr_debug("create_a: failed to alloc!\n"); + return NULL; + } + act->order = i; + INIT_LIST_HEAD(&act->list); + return act; } static struct tc_action * -tcf_action_get_1(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int *err) +tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid) { - struct rtattr *tb[TCA_ACT_MAX+1]; + struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action *a; int index; + int err; - *err = -EINVAL; - if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) - return NULL; + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL); + if (err < 0) + goto err_out; - if (tb[TCA_ACT_INDEX - 1] == NULL || - 
RTA_PAYLOAD(tb[TCA_ACT_INDEX - 1]) < sizeof(index)) - return NULL; - index = *(int *)RTA_DATA(tb[TCA_ACT_INDEX - 1]); + err = -EINVAL; + if (tb[TCA_ACT_INDEX] == NULL || + nla_len(tb[TCA_ACT_INDEX]) < sizeof(index)) + goto err_out; + index = nla_get_u32(tb[TCA_ACT_INDEX]); - *err = -ENOMEM; - a = kmalloc(sizeof(struct tc_action), GFP_KERNEL); + err = -ENOMEM; + a = create_a(0); if (a == NULL) - return NULL; - memset(a, 0, sizeof(struct tc_action)); + goto err_out; - *err = -EINVAL; - a->ops = tc_lookup_action(tb[TCA_ACT_KIND - 1]); - if (a->ops == NULL) + err = -EINVAL; + a->ops = tc_lookup_action(tb[TCA_ACT_KIND]); + if (a->ops == NULL) /* could happen in batch of actions */ goto err_free; - if (a->ops->lookup == NULL) - goto err_mod; - *err = -ENOENT; + err = -ENOENT; if (a->ops->lookup(a, index) == 0) goto err_mod; module_put(a->ops->owner); - *err = 0; return a; + err_mod: module_put(a->ops->owner); err_free: kfree(a); - return NULL; +err_out: + return ERR_PTR(err); } -static void cleanup_a(struct tc_action *act) +static void cleanup_a(struct list_head *actions) { - struct tc_action *a; + struct tc_action *a, *tmp; - for (a = act; a; a = act) { - act = a->next; + list_for_each_entry_safe(a, tmp, actions, list) { + list_del(&a->list); kfree(a); } } -static struct tc_action *create_a(int i) -{ - struct tc_action *act; - - act = kmalloc(sizeof(*act), GFP_KERNEL); - if (act == NULL) { - printk("create_a: failed to alloc!\n"); - return NULL; - } - memset(act, 0, sizeof(*act)); - act->order = i; - return act; -} - -static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid) +static int tca_action_flush(struct net *net, struct nlattr *nla, + struct nlmsghdr *n, u32 portid) { struct sk_buff *skb; unsigned char *b; struct nlmsghdr *nlh; struct tcamsg *t; struct netlink_callback dcb; - struct rtattr *x; - struct rtattr *tb[TCA_ACT_MAX+1]; - struct rtattr *kind; - struct tc_action *a = create_a(0); - int err = -EINVAL; - - if (a == NULL) { - printk("tca_action_flush: couldnt create tc_action\n"); - return err; - } + struct nlattr *nest; + struct nlattr *tb[TCA_ACT_MAX + 1]; + struct nlattr *kind; + struct tc_action a; + int err = -ENOMEM; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { - printk("tca_action_flush: failed skb alloc\n"); - kfree(a); - return -ENOBUFS; + pr_debug("tca_action_flush: failed skb alloc\n"); + return err; } - b = (unsigned char *)skb->tail; + b = skb_tail_pointer(skb); - if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL); + if (err < 0) goto err_out; - kind = tb[TCA_ACT_KIND-1]; - a->ops = tc_lookup_action(kind); - if (a->ops == NULL) + err = -EINVAL; + kind = tb[TCA_ACT_KIND]; + memset(&a, 0, sizeof(struct tc_action)); + INIT_LIST_HEAD(&a.list); + a.ops = tc_lookup_action(kind); + if (a.ops == NULL) /*some idjot trying to flush unknown action */ goto err_out; - nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t)); - t = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0); + if (!nlh) + goto out_module_put; + t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; - x = (struct rtattr *) skb->tail; - RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); + nest = nla_nest_start(skb, TCA_ACT_TAB); + if (nest == NULL) + goto out_module_put; - err = a->ops->walk(skb, &dcb, RTM_DELACTION, a); + err = a.ops->walk(skb, &dcb, RTM_DELACTION, &a); if (err < 0) - goto rtattr_failure; + goto out_module_put; + if (err == 0) + goto noflush_out; 
- x->rta_len = skb->tail - (u8 *) x; + nla_nest_end(skb, nest); - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; nlh->nlmsg_flags |= NLM_F_ROOT; - module_put(a->ops->owner); - kfree(a); - err = rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + module_put(a.ops->owner); + err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); if (err > 0) return 0; return err; -rtattr_failure: - module_put(a->ops->owner); -nlmsg_failure: +out_module_put: + module_put(a.ops->owner); err_out: +noflush_out: kfree_skb(skb); - kfree(a); return err; } static int -tca_action_gd(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int event) +tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, + u32 portid) { - int i, ret = 0; - struct rtattr *tb[TCA_ACT_MAX_PRIO+1]; - struct tc_action *head = NULL, *act, *act_prev = NULL; + int ret; + struct sk_buff *skb; - if (rtattr_parse_nested(tb, TCA_ACT_MAX_PRIO, rta) < 0) + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, + 0, 1) <= 0) { + kfree_skb(skb); return -EINVAL; + } - if (event == RTM_DELACTION && n->nlmsg_flags&NLM_F_ROOT) { - if (tb[0] != NULL && tb[1] == NULL) - return tca_action_flush(tb[0], n, pid); + /* now do the delete */ + ret = tcf_action_destroy(actions, 0); + if (ret < 0) { + kfree_skb(skb); + return ret; } - for (i=0; i < TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_get_1(tb[i], n, pid, &ret); - if (act == NULL) - goto err; - act->order = i+1; + ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); + if (ret > 0) + return 0; + return ret; +} + +static int +tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, + u32 portid, int event) +{ + int i, ret; + struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + struct tc_action *act; + LIST_HEAD(actions); + + ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); + if (ret < 0) + return ret; - if (head == NULL) - head = act; + if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) { + if (tb[1] != NULL) + return tca_action_flush(net, tb[1], n, portid); else - act_prev->next = act; - act_prev = act; + return -EINVAL; } - if (event == RTM_GETACTION) - ret = act_get_notify(pid, n, head, event); - else { /* delete */ - struct sk_buff *skb; - - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) { - ret = -ENOBUFS; + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { + act = tcf_action_get_1(tb[i], n, portid); + if (IS_ERR(act)) { + ret = PTR_ERR(act); goto err; } + act->order = i; + list_add_tail(&act->list, &actions); + } - if (tca_get_fill(skb, head, pid, n->nlmsg_seq, 0, event, - 0, 1) <= 0) { - kfree_skb(skb); - ret = -EINVAL; + if (event == RTM_GETACTION) + ret = act_get_notify(net, portid, n, &actions, event); + else { /* delete */ + ret = tcf_del_notify(net, n, &actions, portid); + if (ret) goto err; - } - - /* now do the delete */ - tcf_action_destroy(head, 0); - ret = rtnetlink_send(skb, pid, RTNLGRP_TC, - n->nlmsg_flags&NLM_F_ECHO); - if (ret > 0) - return 0; return ret; } err: - cleanup_a(head); + cleanup_a(&actions); return ret; } -static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event, - u16 flags) +static int +tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, + u32 portid) { - struct tcamsg *t; - struct nlmsghdr *nlh; struct sk_buff *skb; - struct rtattr *x; - unsigned char *b; int err = 0; skb = 
alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - b = (unsigned char *)skb->tail; - - nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); - t = NLMSG_DATA(nlh); - t->tca_family = AF_UNSPEC; - t->tca__pad1 = 0; - t->tca__pad2 = 0; - - x = (struct rtattr*) skb->tail; - RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); - - if (tcf_action_dump(skb, a, 0, 0) < 0) - goto rtattr_failure; + if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, + RTM_NEWACTION, 0, 0) <= 0) { + kfree_skb(skb); + return -EINVAL; + } - x->rta_len = skb->tail - (u8*)x; - - nlh->nlmsg_len = skb->tail - b; - NETLINK_CB(skb).dst_group = RTNLGRP_TC; - - err = rtnetlink_send(skb, pid, RTNLGRP_TC, flags&NLM_F_ECHO); + err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); if (err > 0) err = 0; return err; - -rtattr_failure: -nlmsg_failure: - skb_trim(skb, b - skb->data); - return -1; } - static int -tcf_action_add(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int ovr) +tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, + u32 portid, int ovr) { int ret = 0; - struct tc_action *act; - struct tc_action *a; - u32 seq = n->nlmsg_seq; + LIST_HEAD(actions); - act = tcf_action_init(rta, NULL, NULL, ovr, 0, &ret); - if (act == NULL) + ret = tcf_action_init(net, nla, NULL, NULL, ovr, 0, &actions); + if (ret) goto done; /* dump then free all the actions after update; inserted policy * stays intact - * */ - ret = tcf_add_notify(act, pid, seq, RTM_NEWACTION, n->nlmsg_flags); - for (a = act; a; a = act) { - act = a->next; - kfree(a); - } + */ + ret = tcf_add_notify(net, n, &actions, portid); + cleanup_a(&actions); done: return ret; } -static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n) { - struct rtattr **tca = arg; - u32 pid = skb ? NETLINK_CB(skb).pid : 0; + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_ACT_MAX + 1]; + u32 portid = skb ? 
NETLINK_CB(skb).portid : 0; int ret = 0, ovr = 0; - if (tca[TCA_ACT_TAB-1] == NULL) { - printk("tc_ctl_action: received NO action attribs\n"); + if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL); + if (ret < 0) + return ret; + + if (tca[TCA_ACT_TAB] == NULL) { + pr_notice("tc_ctl_action: received NO action attribs\n"); return -EINVAL; } - /* n->nlmsg_flags&NLM_F_CREATE - * */ + /* n->nlmsg_flags & NLM_F_CREATE */ switch (n->nlmsg_type) { case RTM_NEWACTION: /* we are going to assume all other flags - * imply create only if it doesnt exist + * imply create only if it doesn't exist * Note that CREATE | EXCL implies that * but since we want avoid ambiguity (eg when flags * is zero) then just set this */ - if (n->nlmsg_flags&NLM_F_REPLACE) + if (n->nlmsg_flags & NLM_F_REPLACE) ovr = 1; replay: - ret = tcf_action_add(tca[TCA_ACT_TAB-1], n, pid, ovr); + ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr); if (ret == -EAGAIN) goto replay; break; case RTM_DELACTION: - ret = tca_action_gd(tca[TCA_ACT_TAB-1], n, pid, RTM_DELACTION); + ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, + portid, RTM_DELACTION); break; case RTM_GETACTION: - ret = tca_action_gd(tca[TCA_ACT_TAB-1], n, pid, RTM_GETACTION); + ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, + portid, RTM_GETACTION); break; default: BUG(); @@ -777,120 +991,101 @@ replay: return ret; } -static char * -find_dump_kind(struct nlmsghdr *n) +static struct nlattr * +find_dump_kind(const struct nlmsghdr *n) { - struct rtattr *tb1, *tb2[TCA_ACT_MAX+1]; - struct rtattr *tb[TCA_ACT_MAX_PRIO + 1]; - struct rtattr *rta[TCAA_MAX + 1]; - struct rtattr *kind; - int min_len = NLMSG_LENGTH(sizeof(struct tcamsg)); - int attrlen = n->nlmsg_len - NLMSG_ALIGN(min_len); - struct rtattr *attr = (void *) n + NLMSG_ALIGN(min_len); + struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1]; + struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + struct nlattr *nla[TCAA_MAX + 1]; + struct nlattr *kind; - if (rtattr_parse(rta, TCAA_MAX, attr, attrlen) < 0) + if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX, NULL) < 0) return NULL; - tb1 = rta[TCA_ACT_TAB - 1]; + tb1 = nla[TCA_ACT_TAB]; if (tb1 == NULL) return NULL; - if (rtattr_parse(tb, TCA_ACT_MAX_PRIO, RTA_DATA(tb1), - NLMSG_ALIGN(RTA_PAYLOAD(tb1))) < 0) - return NULL; - if (tb[0] == NULL) + if (nla_parse(tb, TCA_ACT_MAX_PRIO, nla_data(tb1), + NLMSG_ALIGN(nla_len(tb1)), NULL) < 0) return NULL; - if (rtattr_parse(tb2, TCA_ACT_MAX, RTA_DATA(tb[0]), - RTA_PAYLOAD(tb[0])) < 0) + if (tb[1] == NULL) + return NULL; + if (nla_parse(tb2, TCA_ACT_MAX, nla_data(tb[1]), + nla_len(tb[1]), NULL) < 0) return NULL; - kind = tb2[TCA_ACT_KIND-1]; + kind = tb2[TCA_ACT_KIND]; - return (char *) RTA_DATA(kind); + return kind; } static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) { struct nlmsghdr *nlh; - unsigned char *b = skb->tail; - struct rtattr *x; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; struct tc_action_ops *a_o; struct tc_action a; int ret = 0; - struct tcamsg *t = (struct tcamsg *) NLMSG_DATA(cb->nlh); - char *kind = find_dump_kind(cb->nlh); + struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh); + struct nlattr *kind = find_dump_kind(cb->nlh); if (kind == NULL) { - printk("tc_dump_action: action bad kind\n"); + pr_info("tc_dump_action: action bad kind\n"); return 0; } - a_o = tc_lookup_action_n(kind); - if (a_o == NULL) { - printk("failed to find %s\n", kind); + a_o = 
tc_lookup_action(kind); + if (a_o == NULL) return 0; - } memset(&a, 0, sizeof(struct tc_action)); a.ops = a_o; - if (a_o->walk == NULL) { - printk("tc_dump_action: %s !capable of dumping table\n", kind); - goto rtattr_failure; - } - - nlh = NLMSG_PUT(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, - cb->nlh->nlmsg_type, sizeof(*t)); - t = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*t), 0); + if (!nlh) + goto out_module_put; + t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; - x = (struct rtattr *) skb->tail; - RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); + nest = nla_nest_start(skb, TCA_ACT_TAB); + if (nest == NULL) + goto out_module_put; ret = a_o->walk(skb, cb, RTM_GETACTION, &a); if (ret < 0) - goto rtattr_failure; + goto out_module_put; if (ret > 0) { - x->rta_len = skb->tail - (u8 *) x; + nla_nest_end(skb, nest); ret = skb->len; } else - skb_trim(skb, (u8*)x - skb->data); + nla_nest_cancel(skb, nest); - nlh->nlmsg_len = skb->tail - b; - if (NETLINK_CB(cb->skb).pid && ret) + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + if (NETLINK_CB(cb->skb).portid && ret) nlh->nlmsg_flags |= NLM_F_MULTI; module_put(a_o->owner); return skb->len; -rtattr_failure: -nlmsg_failure: +out_module_put: module_put(a_o->owner); - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return skb->len; } static int __init tc_action_init(void) { - struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC]; + rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action, + NULL); - if (link_p) { - link_p[RTM_NEWACTION-RTM_BASE].doit = tc_ctl_action; - link_p[RTM_DELACTION-RTM_BASE].doit = tc_ctl_action; - link_p[RTM_GETACTION-RTM_BASE].doit = tc_ctl_action; - link_p[RTM_GETACTION-RTM_BASE].dumpit = tc_dump_action; - } - - printk("TC classifier action (bugs to netdev@vger.kernel.org cc " - "hadi@cyberus.ca)\n"); return 0; } subsys_initcall(tc_action_init); - -EXPORT_SYMBOL(tcf_register_action); -EXPORT_SYMBOL(tcf_unregister_action); -EXPORT_SYMBOL(tcf_action_exec); -EXPORT_SYMBOL(tcf_action_dump_1); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c new file mode 100644 index 00000000000..edbf40dac70 --- /dev/null +++ b/net/sched/act_csum.c @@ -0,0 +1,584 @@ +/* + * Checksum updating actions + * + * Copyright (c) 2010 Gregoire Baron <baronchon@n7mm.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/spinlock.h> + +#include <linux/netlink.h> +#include <net/netlink.h> +#include <linux/rtnetlink.h> + +#include <linux/skbuff.h> + +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/icmp.h> +#include <linux/icmpv6.h> +#include <linux/igmp.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/ip6_checksum.h> + +#include <net/act_api.h> + +#include <linux/tc_act/tc_csum.h> +#include <net/tc_act/tc_csum.h> + +#define CSUM_TAB_MASK 15 + +static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { + [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, +}; + +static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_CSUM_MAX + 1]; + struct tc_csum *parm; + struct tcf_csum *p; + int ret = 0, err; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy); + if (err < 0) + return err; + + if (tb[TCA_CSUM_PARMS] == NULL) + return -EINVAL; + parm = nla_data(tb[TCA_CSUM_PARMS]); + + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + if (ret) + return ret; + ret = ACT_P_CREATED; + } else { + if (bind)/* dont override defaults */ + return 0; + tcf_hash_release(a, bind); + if (!ovr) + return -EEXIST; + } + + p = to_tcf_csum(a); + spin_lock_bh(&p->tcf_lock); + p->tcf_action = parm->action; + p->update_flags = parm->update_flags; + spin_unlock_bh(&p->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(a); + + return ret; +} + +/** + * tcf_csum_skb_nextlayer - Get next layer pointer + * @skb: sk_buff to use + * @ihl: previous summed headers length + * @ipl: complete packet length + * @jhl: next header length + * + * Check the expected next layer availability in the specified sk_buff. + * Return the next layer pointer if pass, NULL otherwise. 
+ */ +static void *tcf_csum_skb_nextlayer(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl, + unsigned int jhl) +{ + int ntkoff = skb_network_offset(skb); + int hl = ihl + jhl; + + if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) || + (skb_cloned(skb) && + !skb_clone_writable(skb, hl + ntkoff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + return NULL; + else + return (void *)(skb_network_header(skb) + ihl); +} + +static int tcf_csum_ipv4_icmp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct icmphdr *icmph; + + icmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmph)); + if (icmph == NULL) + return 0; + + icmph->checksum = 0; + skb->csum = csum_partial(icmph, ipl - ihl, 0); + icmph->checksum = csum_fold(skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_igmp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct igmphdr *igmph; + + igmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*igmph)); + if (igmph == NULL) + return 0; + + igmph->csum = 0; + skb->csum = csum_partial(igmph, ipl - ihl, 0); + igmph->csum = csum_fold(skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv6_icmp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct icmp6hdr *icmp6h; + const struct ipv6hdr *ip6h; + + icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h)); + if (icmp6h == NULL) + return 0; + + ip6h = ipv6_hdr(skb); + icmp6h->icmp6_cksum = 0; + skb->csum = csum_partial(icmp6h, ipl - ihl, 0); + icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + ipl - ihl, IPPROTO_ICMPV6, + skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_tcp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct tcphdr *tcph; + const struct iphdr *iph; + + tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); + if (tcph == NULL) + return 0; + + iph = ip_hdr(skb); + tcph->check = 0; + skb->csum = csum_partial(tcph, ipl - ihl, 0); + tcph->check = tcp_v4_check(ipl - ihl, + iph->saddr, iph->daddr, skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv6_tcp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct tcphdr *tcph; + const struct ipv6hdr *ip6h; + + tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); + if (tcph == NULL) + return 0; + + ip6h = ipv6_hdr(skb); + tcph->check = 0; + skb->csum = csum_partial(tcph, ipl - ihl, 0); + tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + ipl - ihl, IPPROTO_TCP, + skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_udp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl, int udplite) +{ + struct udphdr *udph; + const struct iphdr *iph; + u16 ul; + + /* + * Support both UDP and UDPLITE checksum algorithms, Don't use + * udph->len to get the real length without any protocol check, + * UDPLITE uses udph->len for another thing, + * Use iph->tot_len, or just ipl. 
+ */ + + udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph)); + if (udph == NULL) + return 0; + + iph = ip_hdr(skb); + ul = ntohs(udph->len); + + if (udplite || udph->check) { + + udph->check = 0; + + if (udplite) { + if (ul == 0) + skb->csum = csum_partial(udph, ipl - ihl, 0); + else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl)) + skb->csum = csum_partial(udph, ul, 0); + else + goto ignore_obscure_skb; + } else { + if (ul != ipl - ihl) + goto ignore_obscure_skb; + + skb->csum = csum_partial(udph, ul, 0); + } + + udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + ul, iph->protocol, + skb->csum); + + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + skb->ip_summed = CHECKSUM_NONE; + +ignore_obscure_skb: + return 1; +} + +static int tcf_csum_ipv6_udp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl, int udplite) +{ + struct udphdr *udph; + const struct ipv6hdr *ip6h; + u16 ul; + + /* + * Support both UDP and UDPLITE checksum algorithms, Don't use + * udph->len to get the real length without any protocol check, + * UDPLITE uses udph->len for another thing, + * Use ip6h->payload_len + sizeof(*ip6h) ... , or just ipl. + */ + + udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph)); + if (udph == NULL) + return 0; + + ip6h = ipv6_hdr(skb); + ul = ntohs(udph->len); + + udph->check = 0; + + if (udplite) { + if (ul == 0) + skb->csum = csum_partial(udph, ipl - ihl, 0); + + else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl)) + skb->csum = csum_partial(udph, ul, 0); + + else + goto ignore_obscure_skb; + } else { + if (ul != ipl - ihl) + goto ignore_obscure_skb; + + skb->csum = csum_partial(udph, ul, 0); + } + + udph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ul, + udplite ? IPPROTO_UDPLITE : IPPROTO_UDP, + skb->csum); + + if (!udph->check) + udph->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_NONE; + +ignore_obscure_skb: + return 1; +} + +static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) +{ + const struct iphdr *iph; + int ntkoff; + + ntkoff = skb_network_offset(skb); + + if (!pskb_may_pull(skb, sizeof(*iph) + ntkoff)) + goto fail; + + iph = ip_hdr(skb); + + switch (iph->frag_off & htons(IP_OFFSET) ? 
0 : iph->protocol) { + case IPPROTO_ICMP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) + if (!tcf_csum_ipv4_icmp(skb, iph->ihl * 4, + ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_IGMP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_IGMP) + if (!tcf_csum_ipv4_igmp(skb, iph->ihl * 4, + ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_TCP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) + if (!tcf_csum_ipv4_tcp(skb, iph->ihl * 4, + ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_UDP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) + if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4, + ntohs(iph->tot_len), 0)) + goto fail; + break; + case IPPROTO_UDPLITE: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) + if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4, + ntohs(iph->tot_len), 1)) + goto fail; + break; + } + + if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) { + if (skb_cloned(skb) && + !skb_clone_writable(skb, sizeof(*iph) + ntkoff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto fail; + + ip_send_check(ip_hdr(skb)); + } + + return 1; + +fail: + return 0; +} + +static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, + unsigned int ixhl, unsigned int *pl) +{ + int off, len, optlen; + unsigned char *xh = (void *)ip6xh; + + off = sizeof(*ip6xh); + len = ixhl - off; + + while (len > 1) { + switch (xh[off]) { + case IPV6_TLV_PAD1: + optlen = 1; + break; + case IPV6_TLV_JUMBO: + optlen = xh[off + 1] + 2; + if (optlen != 6 || len < 6 || (off & 3) != 2) + /* wrong jumbo option length/alignment */ + return 0; + *pl = ntohl(*(__be32 *)(xh + off + 2)); + goto done; + default: + optlen = xh[off + 1] + 2; + if (optlen > len) + /* ignore obscure options */ + goto done; + break; + } + off += optlen; + len -= optlen; + } + +done: + return 1; +} + +static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags) +{ + struct ipv6hdr *ip6h; + struct ipv6_opt_hdr *ip6xh; + unsigned int hl, ixhl; + unsigned int pl; + int ntkoff; + u8 nexthdr; + + ntkoff = skb_network_offset(skb); + + hl = sizeof(*ip6h); + + if (!pskb_may_pull(skb, hl + ntkoff)) + goto fail; + + ip6h = ipv6_hdr(skb); + + pl = ntohs(ip6h->payload_len); + nexthdr = ip6h->nexthdr; + + do { + switch (nexthdr) { + case NEXTHDR_FRAGMENT: + goto ignore_skb; + case NEXTHDR_ROUTING: + case NEXTHDR_HOP: + case NEXTHDR_DEST: + if (!pskb_may_pull(skb, hl + sizeof(*ip6xh) + ntkoff)) + goto fail; + ip6xh = (void *)(skb_network_header(skb) + hl); + ixhl = ipv6_optlen(ip6xh); + if (!pskb_may_pull(skb, hl + ixhl + ntkoff)) + goto fail; + ip6xh = (void *)(skb_network_header(skb) + hl); + if ((nexthdr == NEXTHDR_HOP) && + !(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl))) + goto fail; + nexthdr = ip6xh->nexthdr; + hl += ixhl; + break; + case IPPROTO_ICMPV6: + if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) + if (!tcf_csum_ipv6_icmp(skb, + hl, pl + sizeof(*ip6h))) + goto fail; + goto done; + case IPPROTO_TCP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) + if (!tcf_csum_ipv6_tcp(skb, + hl, pl + sizeof(*ip6h))) + goto fail; + goto done; + case IPPROTO_UDP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) + if (!tcf_csum_ipv6_udp(skb, hl, + pl + sizeof(*ip6h), 0)) + goto fail; + goto done; + case IPPROTO_UDPLITE: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) + if (!tcf_csum_ipv6_udp(skb, hl, + pl + sizeof(*ip6h), 1)) + goto fail; + goto done; + default: + goto ignore_skb; + } + } while (pskb_may_pull(skb, hl + 1 + ntkoff)); + +done: +ignore_skb: + return 1; + +fail: + return 0; +} + +static int tcf_csum(struct sk_buff *skb, + const struct tc_action 
*a, struct tcf_result *res) +{ + struct tcf_csum *p = a->priv; + int action; + u32 update_flags; + + spin_lock(&p->tcf_lock); + p->tcf_tm.lastuse = jiffies; + bstats_update(&p->tcf_bstats, skb); + action = p->tcf_action; + update_flags = p->update_flags; + spin_unlock(&p->tcf_lock); + + if (unlikely(action == TC_ACT_SHOT)) + goto drop; + + switch (skb->protocol) { + case cpu_to_be16(ETH_P_IP): + if (!tcf_csum_ipv4(skb, update_flags)) + goto drop; + break; + case cpu_to_be16(ETH_P_IPV6): + if (!tcf_csum_ipv6(skb, update_flags)) + goto drop; + break; + } + + return action; + +drop: + spin_lock(&p->tcf_lock); + p->tcf_qstats.drops++; + spin_unlock(&p->tcf_lock); + return TC_ACT_SHOT; +} + +static int tcf_csum_dump(struct sk_buff *skb, + struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_csum *p = a->priv; + struct tc_csum opt = { + .update_flags = p->update_flags, + .index = p->tcf_index, + .action = p->tcf_action, + .refcnt = p->tcf_refcnt - ref, + .bindcnt = p->tcf_bindcnt - bind, + }; + struct tcf_t t; + + if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + if (nla_put(skb, TCA_CSUM_TM, sizeof(t), &t)) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_csum_ops = { + .kind = "csum", + .type = TCA_ACT_CSUM, + .owner = THIS_MODULE, + .act = tcf_csum, + .dump = tcf_csum_dump, + .init = tcf_csum_init, +}; + +MODULE_DESCRIPTION("Checksum updating actions"); +MODULE_LICENSE("GPL"); + +static int __init csum_init_module(void) +{ + return tcf_register_action(&act_csum_ops, CSUM_TAB_MASK); +} + +static void __exit csum_cleanup_module(void) +{ + tcf_unregister_action(&act_csum_ops); +} + +module_init(csum_init_module); +module_exit(csum_cleanup_module); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index a1e68f78dcc..d6bcbd9f779 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -10,218 +10,197 @@ * */ -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/proc_fs.h> -#include <net/sock.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_gact.h> #include <net/tc_act/tc_gact.h> -/* use generic hash table */ -#define MY_TAB_SIZE 16 -#define MY_TAB_MASK 15 - -static u32 idx_gen; -static struct tcf_gact *tcf_gact_ht[MY_TAB_SIZE]; -static DEFINE_RWLOCK(gact_lock); - -/* ovewrride the defaults */ -#define tcf_st tcf_gact -#define tc_st tc_gact -#define tcf_t_lock gact_lock -#define tcf_ht tcf_gact_ht - -#define CONFIG_NET_ACT_INIT 1 -#include <net/pkt_act.h> +#define GACT_TAB_MASK 15 #ifdef CONFIG_GACT_PROB -static int gact_net_rand(struct tcf_gact *p) +static int gact_net_rand(struct tcf_gact *gact) { - if (net_random()%p->pval) - return p->action; - return p->paction; + if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval) + return gact->tcf_action; + return 
gact->tcfg_paction; } -static int gact_determ(struct tcf_gact *p) +static int gact_determ(struct tcf_gact *gact) { - if (p->bstats.packets%p->pval) - return p->action; - return p->paction; + if (!gact->tcfg_pval || gact->tcf_bstats.packets % gact->tcfg_pval) + return gact->tcf_action; + return gact->tcfg_paction; } -typedef int (*g_rand)(struct tcf_gact *p); -static g_rand gact_rand[MAX_RAND]= { NULL, gact_net_rand, gact_determ }; -#endif +typedef int (*g_rand)(struct tcf_gact *gact); +static g_rand gact_rand[MAX_RAND] = { NULL, gact_net_rand, gact_determ }; +#endif /* CONFIG_GACT_PROB */ + +static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { + [TCA_GACT_PARMS] = { .len = sizeof(struct tc_gact) }, + [TCA_GACT_PROB] = { .len = sizeof(struct tc_gact_p) }, +}; -static int tcf_gact_init(struct rtattr *rta, struct rtattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_gact_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { - struct rtattr *tb[TCA_GACT_MAX]; + struct nlattr *tb[TCA_GACT_MAX + 1]; struct tc_gact *parm; - struct tcf_gact *p; + struct tcf_gact *gact; int ret = 0; + int err; +#ifdef CONFIG_GACT_PROB + struct tc_gact_p *p_parm = NULL; +#endif - if (rta == NULL || rtattr_parse_nested(tb, TCA_GACT_MAX, rta) < 0) + if (nla == NULL) return -EINVAL; - if (tb[TCA_GACT_PARMS - 1] == NULL || - RTA_PAYLOAD(tb[TCA_GACT_PARMS - 1]) < sizeof(*parm)) + err = nla_parse_nested(tb, TCA_GACT_MAX, nla, gact_policy); + if (err < 0) + return err; + + if (tb[TCA_GACT_PARMS] == NULL) return -EINVAL; - parm = RTA_DATA(tb[TCA_GACT_PARMS - 1]); + parm = nla_data(tb[TCA_GACT_PARMS]); - if (tb[TCA_GACT_PROB-1] != NULL) -#ifdef CONFIG_GACT_PROB - if (RTA_PAYLOAD(tb[TCA_GACT_PROB-1]) < sizeof(struct tc_gact_p)) - return -EINVAL; -#else +#ifndef CONFIG_GACT_PROB + if (tb[TCA_GACT_PROB] != NULL) return -EOPNOTSUPP; +#else + if (tb[TCA_GACT_PROB]) { + p_parm = nla_data(tb[TCA_GACT_PROB]); + if (p_parm->ptype >= MAX_RAND) + return -EINVAL; + } #endif - p = tcf_hash_check(parm->index, a, ovr, bind); - if (p == NULL) { - p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind); - if (p == NULL) - return -ENOMEM; + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind); + if (ret) + return ret; ret = ACT_P_CREATED; } else { - if (!ovr) { - tcf_hash_release(p, bind); + if (bind)/* dont override defaults */ + return 0; + tcf_hash_release(a, bind); + if (!ovr) return -EEXIST; - } } - spin_lock_bh(&p->lock); - p->action = parm->action; + gact = to_gact(a); + + spin_lock_bh(&gact->tcf_lock); + gact->tcf_action = parm->action; #ifdef CONFIG_GACT_PROB - if (tb[TCA_GACT_PROB-1] != NULL) { - struct tc_gact_p *p_parm = RTA_DATA(tb[TCA_GACT_PROB-1]); - p->paction = p_parm->paction; - p->pval = p_parm->pval; - p->ptype = p_parm->ptype; + if (p_parm) { + gact->tcfg_paction = p_parm->paction; + gact->tcfg_pval = p_parm->pval; + gact->tcfg_ptype = p_parm->ptype; } #endif - spin_unlock_bh(&p->lock); + spin_unlock_bh(&gact->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(p); + tcf_hash_insert(a); return ret; } -static int -tcf_gact_cleanup(struct tc_action *a, int bind) -{ - struct tcf_gact *p = PRIV(a, gact); - - if (p != NULL) - return tcf_hash_release(p, bind); - return 0; -} - -static int -tcf_gact(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) +static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { - struct tcf_gact *p 
= PRIV(a, gact); + struct tcf_gact *gact = a->priv; int action = TC_ACT_SHOT; - spin_lock(&p->lock); + spin_lock(&gact->tcf_lock); #ifdef CONFIG_GACT_PROB - if (p->ptype && gact_rand[p->ptype] != NULL) - action = gact_rand[p->ptype](p); + if (gact->tcfg_ptype) + action = gact_rand[gact->tcfg_ptype](gact); else - action = p->action; + action = gact->tcf_action; #else - action = p->action; + action = gact->tcf_action; #endif - p->bstats.bytes += skb->len; - p->bstats.packets++; + gact->tcf_bstats.bytes += qdisc_pkt_len(skb); + gact->tcf_bstats.packets++; if (action == TC_ACT_SHOT) - p->qstats.drops++; - p->tm.lastuse = jiffies; - spin_unlock(&p->lock); + gact->tcf_qstats.drops++; + gact->tcf_tm.lastuse = jiffies; + spin_unlock(&gact->tcf_lock); return action; } -static int -tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - unsigned char *b = skb->tail; - struct tc_gact opt; - struct tcf_gact *p = PRIV(a, gact); + unsigned char *b = skb_tail_pointer(skb); + struct tcf_gact *gact = a->priv; + struct tc_gact opt = { + .index = gact->tcf_index, + .refcnt = gact->tcf_refcnt - ref, + .bindcnt = gact->tcf_bindcnt - bind, + .action = gact->tcf_action, + }; struct tcf_t t; - opt.index = p->index; - opt.refcnt = p->refcnt - ref; - opt.bindcnt = p->bindcnt - bind; - opt.action = p->action; - RTA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt); + if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; #ifdef CONFIG_GACT_PROB - if (p->ptype) { - struct tc_gact_p p_opt; - p_opt.paction = p->paction; - p_opt.pval = p->pval; - p_opt.ptype = p->ptype; - RTA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt); + if (gact->tcfg_ptype) { + struct tc_gact_p p_opt = { + .paction = gact->tcfg_paction, + .pval = gact->tcfg_pval, + .ptype = gact->tcfg_ptype, + }; + + if (nla_put(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt)) + goto nla_put_failure; } #endif - t.install = jiffies_to_clock_t(jiffies - p->tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); - t.expires = jiffies_to_clock_t(p->tm.expires); - RTA_PUT(skb, TCA_GACT_TM, sizeof(t), &t); + t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(gact->tcf_tm.expires); + if (nla_put(skb, TCA_GACT_TM, sizeof(t), &t)) + goto nla_put_failure; return skb->len; - rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } static struct tc_action_ops act_gact_ops = { .kind = "gact", .type = TCA_ACT_GACT, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_gact, .dump = tcf_gact_dump, - .cleanup = tcf_gact_cleanup, - .lookup = tcf_hash_search, .init = tcf_gact_init, - .walk = tcf_generic_walker }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); MODULE_DESCRIPTION("Generic Classifier actions"); MODULE_LICENSE("GPL"); -static int __init -gact_init_module(void) +static int __init gact_init_module(void) { #ifdef CONFIG_GACT_PROB - printk("GACT probability on\n"); + pr_info("GACT probability on\n"); #else - printk("GACT probability NOT on\n"); + pr_info("GACT probability NOT on\n"); #endif - return tcf_register_action(&act_gact_ops); + return tcf_register_action(&act_gact_ops, GACT_TAB_MASK); } -static void __exit -gact_cleanup_module(void) +static void __exit gact_cleanup_module(void) { tcf_unregister_action(&act_gact_ops); } diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c 
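[Editor's note, not part of the patch: the act_gact.c hunks above also harden the probabilistic modes. The old gact_net_rand()/gact_determ() computed "random % p->pval" unconditionally, which is a modulo by zero (undefined behaviour) when pval is 0; the patched versions check the divisor first and fall back to the base action. A minimal userspace sketch of that decision, with rand() standing in for prandom_u32() and made-up names (gact_model, gact_pick) purely for illustration — this is not kernel code:

	#include <stdlib.h>

	struct gact_model {
		int action;		/* base action */
		int paction;		/* alternate action, taken roughly 1/pval of the time */
		unsigned int pval;	/* 0 means the probabilistic mode is off */
	};

	static int gact_pick(const struct gact_model *g)
	{
		/* Mirrors the patched gact_net_rand(): guard against pval == 0,
		 * otherwise take the alternate action only when the sample hits 0. */
		if (!g->pval || (unsigned int)rand() % g->pval)
			return g->action;
		return g->paction;
	}

The act_ipt.c hunks follow.]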
index 39a22a3ffe7..8a64a0734ae 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -1,5 +1,5 @@ /* - * net/sched/ipt.c iptables target interface + * net/sched/ipt.c iptables target interface * *TODO: Add other tables. For now we only support the ipv4 table targets * @@ -8,180 +8,156 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Copyright: Jamal Hadi Salim (2002-4) + * Copyright: Jamal Hadi Salim (2002-13) */ -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/bitops.h> -#include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/proc_fs.h> -#include <linux/kmod.h> -#include <net/sock.h> +#include <linux/slab.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_ipt.h> #include <net/tc_act/tc_ipt.h> #include <linux/netfilter_ipv4/ip_tables.h> -/* use generic hash table */ -#define MY_TAB_SIZE 16 -#define MY_TAB_MASK 15 -static u32 idx_gen; -static struct tcf_ipt *tcf_ipt_ht[MY_TAB_SIZE]; -/* ipt hash table lock */ -static DEFINE_RWLOCK(ipt_lock); +#define IPT_TAB_MASK 15 -/* ovewrride the defaults */ -#define tcf_st tcf_ipt -#define tcf_t_lock ipt_lock -#define tcf_ht tcf_ipt_ht - -#define CONFIG_NET_ACT_INIT -#include <net/pkt_act.h> - -static int -ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook) +static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook) { - struct ipt_target *target; + struct xt_tgchk_param par; + struct xt_target *target; int ret = 0; - target = xt_find_target(AF_INET, t->u.user.name, t->u.user.revision); - if (!target) - return -ENOENT; + target = xt_request_find_target(AF_INET, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) + return PTR_ERR(target); - DPRINTK("ipt_init_target: found %s\n", target->name); t->u.kernel.target = target; - - if (t->u.kernel.target->checkentry - && !t->u.kernel.target->checkentry(table, NULL, t->data, - t->u.target_size - sizeof(*t), - hook)) { - DPRINTK("ipt_init_target: check failed for `%s'.\n", - t->u.kernel.target->name); + par.table = table; + par.entryinfo = NULL; + par.target = target; + par.targinfo = t->data; + par.hook_mask = hook; + par.family = NFPROTO_IPV4; + + ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); + if (ret < 0) { module_put(t->u.kernel.target->me); - ret = -EINVAL; + return ret; } - - return ret; + return 0; } -static void -ipt_destroy_target(struct ipt_entry_target *t) +static void ipt_destroy_target(struct xt_entry_target *t) { - if (t->u.kernel.target->destroy) - t->u.kernel.target->destroy(t->data, - t->u.target_size - sizeof(*t)); - module_put(t->u.kernel.target->me); + struct xt_tgdtor_param par = { + .target = t->u.kernel.target, + .targinfo = t->data, + }; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); } -static int -tcf_ipt_release(struct tcf_ipt *p, int bind) +static void tcf_ipt_release(struct tc_action *a, int bind) { - int ret = 0; - if (p) { - if (bind) - p->bindcnt--; - p->refcnt--; - if (p->bindcnt <= 0 && p->refcnt <= 0) { - ipt_destroy_target(p->t); - 
kfree(p->tname); - kfree(p->t); - tcf_hash_destroy(p); - ret = ACT_P_DELETED; - } - } - return ret; + struct tcf_ipt *ipt = to_ipt(a); + ipt_destroy_target(ipt->tcfi_t); + kfree(ipt->tcfi_tname); + kfree(ipt->tcfi_t); } -static int -tcf_ipt_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a, - int ovr, int bind) +static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { + [TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ }, + [TCA_IPT_HOOK] = { .type = NLA_U32 }, + [TCA_IPT_INDEX] = { .type = NLA_U32 }, + [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) }, +}; + +static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) { - struct rtattr *tb[TCA_IPT_MAX]; - struct tcf_ipt *p; - struct ipt_entry_target *td, *t; + struct nlattr *tb[TCA_IPT_MAX + 1]; + struct tcf_ipt *ipt; + struct xt_entry_target *td, *t; char *tname; int ret = 0, err; u32 hook = 0; u32 index = 0; - if (rta == NULL || rtattr_parse_nested(tb, TCA_IPT_MAX, rta) < 0) + if (nla == NULL) return -EINVAL; - if (tb[TCA_IPT_HOOK-1] == NULL || - RTA_PAYLOAD(tb[TCA_IPT_HOOK-1]) < sizeof(u32)) + err = nla_parse_nested(tb, TCA_IPT_MAX, nla, ipt_policy); + if (err < 0) + return err; + + if (tb[TCA_IPT_HOOK] == NULL) return -EINVAL; - if (tb[TCA_IPT_TARG-1] == NULL || - RTA_PAYLOAD(tb[TCA_IPT_TARG-1]) < sizeof(*t)) + if (tb[TCA_IPT_TARG] == NULL) return -EINVAL; - td = (struct ipt_entry_target *)RTA_DATA(tb[TCA_IPT_TARG-1]); - if (RTA_PAYLOAD(tb[TCA_IPT_TARG-1]) < td->u.target_size) + + td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]); + if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) return -EINVAL; - if (tb[TCA_IPT_INDEX-1] != NULL && - RTA_PAYLOAD(tb[TCA_IPT_INDEX-1]) >= sizeof(u32)) - index = *(u32 *)RTA_DATA(tb[TCA_IPT_INDEX-1]); + if (tb[TCA_IPT_INDEX] != NULL) + index = nla_get_u32(tb[TCA_IPT_INDEX]); - p = tcf_hash_check(index, a, ovr, bind); - if (p == NULL) { - p = tcf_hash_create(index, est, a, sizeof(*p), ovr, bind); - if (p == NULL) - return -ENOMEM; + if (!tcf_hash_check(index, a, bind) ) { + ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind); + if (ret) + return ret; ret = ACT_P_CREATED; } else { - if (!ovr) { - tcf_ipt_release(p, bind); + if (bind)/* dont override defaults */ + return 0; + tcf_hash_release(a, bind); + + if (!ovr) return -EEXIST; - } } + ipt = to_ipt(a); - hook = *(u32 *)RTA_DATA(tb[TCA_IPT_HOOK-1]); + hook = nla_get_u32(tb[TCA_IPT_HOOK]); err = -ENOMEM; tname = kmalloc(IFNAMSIZ, GFP_KERNEL); - if (tname == NULL) + if (unlikely(!tname)) goto err1; - if (tb[TCA_IPT_TABLE - 1] == NULL || - rtattr_strlcpy(tname, tb[TCA_IPT_TABLE-1], IFNAMSIZ) >= IFNAMSIZ) + if (tb[TCA_IPT_TABLE] == NULL || + nla_strlcpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ) strcpy(tname, "mangle"); - t = kmalloc(td->u.target_size, GFP_KERNEL); - if (t == NULL) + t = kmemdup(td, td->u.target_size, GFP_KERNEL); + if (unlikely(!t)) goto err2; - memcpy(t, td, td->u.target_size); - if ((err = ipt_init_target(t, tname, hook)) < 0) + err = ipt_init_target(t, tname, hook); + if (err < 0) goto err3; - spin_lock_bh(&p->lock); + spin_lock_bh(&ipt->tcf_lock); if (ret != ACT_P_CREATED) { - ipt_destroy_target(p->t); - kfree(p->tname); - kfree(p->t); + ipt_destroy_target(ipt->tcfi_t); + kfree(ipt->tcfi_tname); + kfree(ipt->tcfi_t); } - p->tname = tname; - p->t = t; - p->hook = hook; - spin_unlock_bh(&p->lock); + ipt->tcfi_tname = tname; + ipt->tcfi_t = t; + ipt->tcfi_hook = hook; + spin_unlock_bh(&ipt->tcf_lock); if (ret == 
ACT_P_CREATED) - tcf_hash_insert(p); + tcf_hash_insert(a); return ret; err3: @@ -189,108 +165,96 @@ err3: err2: kfree(tname); err1: - kfree(p); + if (ret == ACT_P_CREATED) + tcf_hash_cleanup(a, est); return err; } -static int -tcf_ipt_cleanup(struct tc_action *a, int bind) -{ - struct tcf_ipt *p = PRIV(a, ipt); - return tcf_ipt_release(p, bind); -} - -static int -tcf_ipt(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) +static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { int ret = 0, result = 0; - struct tcf_ipt *p = PRIV(a, ipt); + struct tcf_ipt *ipt = a->priv; + struct xt_action_param par; - if (skb_cloned(skb)) { - if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - return TC_ACT_UNSPEC; - } + if (skb_unclone(skb, GFP_ATOMIC)) + return TC_ACT_UNSPEC; - spin_lock(&p->lock); + spin_lock(&ipt->tcf_lock); - p->tm.lastuse = jiffies; - p->bstats.bytes += skb->len; - p->bstats.packets++; + ipt->tcf_tm.lastuse = jiffies; + bstats_update(&ipt->tcf_bstats, skb); /* yes, we have to worry about both in and out dev - worry later - danger - this API seems to have changed - from earlier kernels */ - - /* iptables targets take a double skb pointer in case the skb - * needs to be replaced. We don't own the skb, so this must not - * happen. The pskb_expand_head above should make sure of this */ - ret = p->t->u.kernel.target->target(&skb, skb->dev, NULL, - p->hook, p->t->data, NULL); + * worry later - danger - this API seems to have changed + * from earlier kernels + */ + par.in = skb->dev; + par.out = NULL; + par.hooknum = ipt->tcfi_hook; + par.target = ipt->tcfi_t->u.kernel.target; + par.targinfo = ipt->tcfi_t->data; + ret = par.target->target(skb, &par); + switch (ret) { case NF_ACCEPT: result = TC_ACT_OK; break; case NF_DROP: result = TC_ACT_SHOT; - p->qstats.drops++; + ipt->tcf_qstats.drops++; break; - case IPT_CONTINUE: + case XT_CONTINUE: result = TC_ACT_PIPE; break; default: - if (net_ratelimit()) - printk("Bogus netfilter code %d assume ACCEPT\n", ret); + net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n", + ret); result = TC_POLICE_OK; break; } - spin_unlock(&p->lock); + spin_unlock(&ipt->tcf_lock); return result; } -static int -tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - struct ipt_entry_target *t; + unsigned char *b = skb_tail_pointer(skb); + struct tcf_ipt *ipt = a->priv; + struct xt_entry_target *t; struct tcf_t tm; struct tc_cnt c; - unsigned char *b = skb->tail; - struct tcf_ipt *p = PRIV(a, ipt); /* for simple targets kernel size == user size - ** user name = target name - ** for foolproof you need to not assume this - */ - - t = kmalloc(p->t->u.user.target_size, GFP_ATOMIC); - if (t == NULL) - goto rtattr_failure; - - c.bindcnt = p->bindcnt - bind; - c.refcnt = p->refcnt - ref; - memcpy(t, p->t, p->t->u.user.target_size); - strcpy(t->u.user.name, p->t->u.kernel.target->name); - - DPRINTK("\ttcf_ipt_dump tablename %s length %d\n", p->tname, - strlen(p->tname)); - DPRINTK("\tdump target name %s size %d size user %d " - "data[0] %x data[1] %x\n", p->t->u.kernel.target->name, - p->t->u.target_size, p->t->u.user.target_size, - p->t->data[0], p->t->data[1]); - RTA_PUT(skb, TCA_IPT_TARG, p->t->u.user.target_size, t); - RTA_PUT(skb, TCA_IPT_INDEX, 4, &p->index); - RTA_PUT(skb, TCA_IPT_HOOK, 4, &p->hook); - RTA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c); - RTA_PUT(skb, TCA_IPT_TABLE, IFNAMSIZ, 
p->tname); - tm.install = jiffies_to_clock_t(jiffies - p->tm.install); - tm.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); - tm.expires = jiffies_to_clock_t(p->tm.expires); - RTA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm); + * user name = target name + * for foolproof you need to not assume this + */ + + t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC); + if (unlikely(!t)) + goto nla_put_failure; + + c.bindcnt = ipt->tcf_bindcnt - bind; + c.refcnt = ipt->tcf_refcnt - ref; + strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name); + + if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) || + nla_put_u32(skb, TCA_IPT_INDEX, ipt->tcf_index) || + nla_put_u32(skb, TCA_IPT_HOOK, ipt->tcfi_hook) || + nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) || + nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname)) + goto nla_put_failure; + tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install); + tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse); + tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires); + if (nla_put(skb, TCA_IPT_TM, sizeof (tm), &tm)) + goto nla_put_failure; kfree(t); return skb->len; - rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); kfree(t); return -1; } @@ -298,29 +262,48 @@ tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) static struct tc_action_ops act_ipt_ops = { .kind = "ipt", .type = TCA_ACT_IPT, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_ipt, .dump = tcf_ipt_dump, - .cleanup = tcf_ipt_cleanup, - .lookup = tcf_hash_search, + .cleanup = tcf_ipt_release, + .init = tcf_ipt_init, +}; + +static struct tc_action_ops act_xt_ops = { + .kind = "xt", + .type = TCA_ACT_XT, + .owner = THIS_MODULE, + .act = tcf_ipt, + .dump = tcf_ipt_dump, + .cleanup = tcf_ipt_release, .init = tcf_ipt_init, - .walk = tcf_generic_walker }; -MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); +MODULE_AUTHOR("Jamal Hadi Salim(2002-13)"); MODULE_DESCRIPTION("Iptables target actions"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("act_xt"); -static int __init -ipt_init_module(void) +static int __init ipt_init_module(void) { - return tcf_register_action(&act_ipt_ops); + int ret1, ret2; + + ret1 = tcf_register_action(&act_xt_ops, IPT_TAB_MASK); + if (ret1 < 0) + printk("Failed to load xt action\n"); + ret2 = tcf_register_action(&act_ipt_ops, IPT_TAB_MASK); + if (ret2 < 0) + printk("Failed to load ipt action\n"); + + if (ret1 < 0 && ret2 < 0) { + return ret1; + } else + return 0; } -static void __exit -ipt_cleanup_module(void) +static void __exit ipt_cleanup_module(void) { + tcf_unregister_action(&act_xt_ops); tcf_unregister_action(&act_ipt_ops); } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 4fcccbd5088..4f912c0e225 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -12,263 +12,253 @@ * */ -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/bitops.h> -#include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/proc_fs.h> -#include <net/sock.h> +#include <linux/gfp.h> +#include <net/net_namespace.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #include 
<linux/tc_act/tc_mirred.h> #include <net/tc_act/tc_mirred.h> -#include <linux/etherdevice.h> #include <linux/if_arp.h> +#define MIRRED_TAB_MASK 7 +static LIST_HEAD(mirred_list); -/* use generic hash table */ -#define MY_TAB_SIZE 8 -#define MY_TAB_MASK (MY_TAB_SIZE - 1) -static u32 idx_gen; -static struct tcf_mirred *tcf_mirred_ht[MY_TAB_SIZE]; -static DEFINE_RWLOCK(mirred_lock); - -/* ovewrride the defaults */ -#define tcf_st tcf_mirred -#define tc_st tc_mirred -#define tcf_t_lock mirred_lock -#define tcf_ht tcf_mirred_ht - -#define CONFIG_NET_ACT_INIT 1 -#include <net/pkt_act.h> - -static inline int -tcf_mirred_release(struct tcf_mirred *p, int bind) +static void tcf_mirred_release(struct tc_action *a, int bind) { - if (p) { - if (bind) - p->bindcnt--; - p->refcnt--; - if(!p->bindcnt && p->refcnt <= 0) { - dev_put(p->dev); - tcf_hash_destroy(p); - return 1; - } - } - return 0; + struct tcf_mirred *m = to_mirred(a); + list_del(&m->tcfm_list); + if (m->tcfm_dev) + dev_put(m->tcfm_dev); } -static int -tcf_mirred_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a, - int ovr, int bind) +static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { + [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, +}; + +static int tcf_mirred_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, int ovr, + int bind) { - struct rtattr *tb[TCA_MIRRED_MAX]; + struct nlattr *tb[TCA_MIRRED_MAX + 1]; struct tc_mirred *parm; - struct tcf_mirred *p; - struct net_device *dev = NULL; - int ret = 0; - int ok_push = 0; + struct tcf_mirred *m; + struct net_device *dev; + int ret, ok_push = 0; - if (rta == NULL || rtattr_parse_nested(tb, TCA_MIRRED_MAX, rta) < 0) + if (nla == NULL) return -EINVAL; - - if (tb[TCA_MIRRED_PARMS-1] == NULL || - RTA_PAYLOAD(tb[TCA_MIRRED_PARMS-1]) < sizeof(*parm)) + ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy); + if (ret < 0) + return ret; + if (tb[TCA_MIRRED_PARMS] == NULL) return -EINVAL; - parm = RTA_DATA(tb[TCA_MIRRED_PARMS-1]); - + parm = nla_data(tb[TCA_MIRRED_PARMS]); + switch (parm->eaction) { + case TCA_EGRESS_MIRROR: + case TCA_EGRESS_REDIR: + break; + default: + return -EINVAL; + } if (parm->ifindex) { - dev = __dev_get_by_index(parm->ifindex); + dev = __dev_get_by_index(net, parm->ifindex); if (dev == NULL) return -ENODEV; switch (dev->type) { - case ARPHRD_TUNNEL: - case ARPHRD_TUNNEL6: - case ARPHRD_SIT: - case ARPHRD_IPGRE: - case ARPHRD_VOID: - case ARPHRD_NONE: - ok_push = 0; - break; - default: - ok_push = 1; - break; + case ARPHRD_TUNNEL: + case ARPHRD_TUNNEL6: + case ARPHRD_SIT: + case ARPHRD_IPGRE: + case ARPHRD_VOID: + case ARPHRD_NONE: + ok_push = 0; + break; + default: + ok_push = 1; + break; } + } else { + dev = NULL; } - p = tcf_hash_check(parm->index, a, ovr, bind); - if (p == NULL) { - if (!parm->ifindex) + if (!tcf_hash_check(parm->index, a, bind)) { + if (dev == NULL) return -EINVAL; - p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind); - if (p == NULL) - return -ENOMEM; + ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind); + if (ret) + return ret; ret = ACT_P_CREATED; } else { if (!ovr) { - tcf_mirred_release(p, bind); + tcf_hash_release(a, bind); return -EEXIST; } } + m = to_mirred(a); - spin_lock_bh(&p->lock); - p->action = parm->action; - p->eaction = parm->eaction; - if (parm->ifindex) { - p->ifindex = parm->ifindex; + spin_lock_bh(&m->tcf_lock); + m->tcf_action = parm->action; + m->tcfm_eaction = parm->eaction; + if (dev != NULL) { + m->tcfm_ifindex = 
parm->ifindex; if (ret != ACT_P_CREATED) - dev_put(p->dev); - p->dev = dev; + dev_put(m->tcfm_dev); dev_hold(dev); - p->ok_push = ok_push; + m->tcfm_dev = dev; + m->tcfm_ok_push = ok_push; + } + spin_unlock_bh(&m->tcf_lock); + if (ret == ACT_P_CREATED) { + list_add(&m->tcfm_list, &mirred_list); + tcf_hash_insert(a); } - spin_unlock_bh(&p->lock); - if (ret == ACT_P_CREATED) - tcf_hash_insert(p); - DPRINTK("tcf_mirred_init index %d action %d eaction %d device %s " - "ifindex %d\n", parm->index, parm->action, parm->eaction, - dev->name, parm->ifindex); return ret; } -static int -tcf_mirred_cleanup(struct tc_action *a, int bind) +static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { - struct tcf_mirred *p = PRIV(a, mirred); - - if (p != NULL) - return tcf_mirred_release(p, bind); - return 0; -} - -static int -tcf_mirred(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) -{ - struct tcf_mirred *p = PRIV(a, mirred); + struct tcf_mirred *m = a->priv; struct net_device *dev; - struct sk_buff *skb2 = NULL; - u32 at = G_TC_AT(skb->tc_verd); - - spin_lock(&p->lock); - - dev = p->dev; - p->tm.lastuse = jiffies; - - if (!(dev->flags&IFF_UP) ) { - if (net_ratelimit()) - printk("mirred to Houston: device %s is gone!\n", - dev->name); -bad_mirred: - if (skb2 != NULL) - kfree_skb(skb2); - p->qstats.overlimits++; - p->bstats.bytes += skb->len; - p->bstats.packets++; - spin_unlock(&p->lock); - /* should we be asking for packet to be dropped? - * may make sense for redirect case only - */ - return TC_ACT_SHOT; + struct sk_buff *skb2; + u32 at; + int retval, err = 1; + + spin_lock(&m->tcf_lock); + m->tcf_tm.lastuse = jiffies; + bstats_update(&m->tcf_bstats, skb); + + dev = m->tcfm_dev; + if (!dev) { + printk_once(KERN_NOTICE "tc mirred: target device is gone\n"); + goto out; } - skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2 == NULL) - goto bad_mirred; - if (p->eaction != TCA_EGRESS_MIRROR && p->eaction != TCA_EGRESS_REDIR) { - if (net_ratelimit()) - printk("tcf_mirred unknown action %d\n", p->eaction); - goto bad_mirred; + if (!(dev->flags & IFF_UP)) { + net_notice_ratelimited("tc mirred to Houston: device %s is down\n", + dev->name); + goto out; } - p->bstats.bytes += skb2->len; - p->bstats.packets++; - if (!(at & AT_EGRESS)) - if (p->ok_push) + at = G_TC_AT(skb->tc_verd); + skb2 = skb_act_clone(skb, GFP_ATOMIC, m->tcf_action); + if (skb2 == NULL) + goto out; + + if (!(at & AT_EGRESS)) { + if (m->tcfm_ok_push) skb_push(skb2, skb2->dev->hard_header_len); + } /* mirror is always swallowed */ - if (p->eaction != TCA_EGRESS_MIRROR) + if (m->tcfm_eaction != TCA_EGRESS_MIRROR) skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at); + skb2->skb_iif = skb->dev->ifindex; skb2->dev = dev; - skb2->input_dev = skb->dev; - dev_queue_xmit(skb2); - spin_unlock(&p->lock); - return p->action; + err = dev_queue_xmit(skb2); + +out: + if (err) { + m->tcf_qstats.overlimits++; + if (m->tcfm_eaction != TCA_EGRESS_MIRROR) + retval = TC_ACT_SHOT; + else + retval = m->tcf_action; + } else + retval = m->tcf_action; + spin_unlock(&m->tcf_lock); + + return retval; } -static int -tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - unsigned char *b = skb->tail; - struct tc_mirred opt; - struct tcf_mirred *p = PRIV(a, mirred); + unsigned char *b = skb_tail_pointer(skb); + struct tcf_mirred *m = a->priv; + struct tc_mirred opt = { + .index = m->tcf_index, + .action = 
m->tcf_action, + .refcnt = m->tcf_refcnt - ref, + .bindcnt = m->tcf_bindcnt - bind, + .eaction = m->tcfm_eaction, + .ifindex = m->tcfm_ifindex, + }; struct tcf_t t; - opt.index = p->index; - opt.action = p->action; - opt.refcnt = p->refcnt - ref; - opt.bindcnt = p->bindcnt - bind; - opt.eaction = p->eaction; - opt.ifindex = p->ifindex; - DPRINTK("tcf_mirred_dump index %d action %d eaction %d ifindex %d\n", - p->index, p->action, p->eaction, p->ifindex); - RTA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt); - t.install = jiffies_to_clock_t(jiffies - p->tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); - t.expires = jiffies_to_clock_t(p->tm.expires); - RTA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t); + if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(m->tcf_tm.expires); + if (nla_put(skb, TCA_MIRRED_TM, sizeof(t), &t)) + goto nla_put_failure; return skb->len; - rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } +static int mirred_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct tcf_mirred *m; + + if (event == NETDEV_UNREGISTER) + list_for_each_entry(m, &mirred_list, tcfm_list) { + if (m->tcfm_dev == dev) { + dev_put(dev); + m->tcfm_dev = NULL; + } + } + + return NOTIFY_DONE; +} + +static struct notifier_block mirred_device_notifier = { + .notifier_call = mirred_device_event, +}; + static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .type = TCA_ACT_MIRRED, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_mirred, .dump = tcf_mirred_dump, - .cleanup = tcf_mirred_cleanup, - .lookup = tcf_hash_search, + .cleanup = tcf_mirred_release, .init = tcf_mirred_init, - .walk = tcf_generic_walker }; MODULE_AUTHOR("Jamal Hadi Salim(2002)"); MODULE_DESCRIPTION("Device Mirror/redirect actions"); MODULE_LICENSE("GPL"); -static int __init -mirred_init_module(void) +static int __init mirred_init_module(void) { - printk("Mirror/redirect action on\n"); - return tcf_register_action(&act_mirred_ops); + int err = register_netdevice_notifier(&mirred_device_notifier); + if (err) + return err; + + pr_info("Mirror/redirect action on\n"); + return tcf_register_action(&act_mirred_ops, MIRRED_TAB_MASK); } -static void __exit -mirred_cleanup_module(void) +static void __exit mirred_cleanup_module(void) { tcf_unregister_action(&act_mirred_ops); + unregister_netdevice_notifier(&mirred_device_notifier); } module_init(mirred_init_module); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c new file mode 100644 index 00000000000..270a030d5fd --- /dev/null +++ b/net/sched/act_nat.c @@ -0,0 +1,306 @@ +/* + * Stateless NAT actions + * + * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ */ + +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/tc_act/tc_nat.h> +#include <net/act_api.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/netlink.h> +#include <net/tc_act/tc_nat.h> +#include <net/tcp.h> +#include <net/udp.h> + + +#define NAT_TAB_MASK 15 + +static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { + [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) }, +}; + +static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_NAT_MAX + 1]; + struct tc_nat *parm; + int ret = 0, err; + struct tcf_nat *p; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_NAT_MAX, nla, nat_policy); + if (err < 0) + return err; + + if (tb[TCA_NAT_PARMS] == NULL) + return -EINVAL; + parm = nla_data(tb[TCA_NAT_PARMS]); + + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + if (ret) + return ret; + ret = ACT_P_CREATED; + } else { + if (bind) + return 0; + tcf_hash_release(a, bind); + if (!ovr) + return -EEXIST; + } + p = to_tcf_nat(a); + + spin_lock_bh(&p->tcf_lock); + p->old_addr = parm->old_addr; + p->new_addr = parm->new_addr; + p->mask = parm->mask; + p->flags = parm->flags; + + p->tcf_action = parm->action; + spin_unlock_bh(&p->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(a); + + return ret; +} + +static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_nat *p = a->priv; + struct iphdr *iph; + __be32 old_addr; + __be32 new_addr; + __be32 mask; + __be32 addr; + int egress; + int action; + int ihl; + int noff; + + spin_lock(&p->tcf_lock); + + p->tcf_tm.lastuse = jiffies; + old_addr = p->old_addr; + new_addr = p->new_addr; + mask = p->mask; + egress = p->flags & TCA_NAT_FLAG_EGRESS; + action = p->tcf_action; + + bstats_update(&p->tcf_bstats, skb); + + spin_unlock(&p->tcf_lock); + + if (unlikely(action == TC_ACT_SHOT)) + goto drop; + + noff = skb_network_offset(skb); + if (!pskb_may_pull(skb, sizeof(*iph) + noff)) + goto drop; + + iph = ip_hdr(skb); + + if (egress) + addr = iph->saddr; + else + addr = iph->daddr; + + if (!((old_addr ^ addr) & mask)) { + if (skb_cloned(skb) && + !skb_clone_writable(skb, sizeof(*iph) + noff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto drop; + + new_addr &= mask; + new_addr |= addr & ~mask; + + /* Rewrite IP header */ + iph = ip_hdr(skb); + if (egress) + iph->saddr = new_addr; + else + iph->daddr = new_addr; + + csum_replace4(&iph->check, addr, new_addr); + } else if ((iph->frag_off & htons(IP_OFFSET)) || + iph->protocol != IPPROTO_ICMP) { + goto out; + } + + ihl = iph->ihl * 4; + + /* It would be nice to share code with stateful NAT. */ + switch (iph->frag_off & htons(IP_OFFSET) ? 
0 : iph->protocol) { + case IPPROTO_TCP: + { + struct tcphdr *tcph; + + if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) || + (skb_cloned(skb) && + !skb_clone_writable(skb, ihl + sizeof(*tcph) + noff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto drop; + + tcph = (void *)(skb_network_header(skb) + ihl); + inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, 1); + break; + } + case IPPROTO_UDP: + { + struct udphdr *udph; + + if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) || + (skb_cloned(skb) && + !skb_clone_writable(skb, ihl + sizeof(*udph) + noff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto drop; + + udph = (void *)(skb_network_header(skb) + ihl); + if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace4(&udph->check, skb, addr, + new_addr, 1); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + break; + } + case IPPROTO_ICMP: + { + struct icmphdr *icmph; + + if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + noff)) + goto drop; + + icmph = (void *)(skb_network_header(skb) + ihl); + + if ((icmph->type != ICMP_DEST_UNREACH) && + (icmph->type != ICMP_TIME_EXCEEDED) && + (icmph->type != ICMP_PARAMETERPROB)) + break; + + if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) + + noff)) + goto drop; + + icmph = (void *)(skb_network_header(skb) + ihl); + iph = (void *)(icmph + 1); + if (egress) + addr = iph->daddr; + else + addr = iph->saddr; + + if ((old_addr ^ addr) & mask) + break; + + if (skb_cloned(skb) && + !skb_clone_writable(skb, ihl + sizeof(*icmph) + + sizeof(*iph) + noff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto drop; + + icmph = (void *)(skb_network_header(skb) + ihl); + iph = (void *)(icmph + 1); + + new_addr &= mask; + new_addr |= addr & ~mask; + + /* XXX Fix up the inner checksums. 
*/ + if (egress) + iph->daddr = new_addr; + else + iph->saddr = new_addr; + + inet_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr, + 0); + break; + } + default: + break; + } + +out: + return action; + +drop: + spin_lock(&p->tcf_lock); + p->tcf_qstats.drops++; + spin_unlock(&p->tcf_lock); + return TC_ACT_SHOT; +} + +static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_nat *p = a->priv; + struct tc_nat opt = { + .old_addr = p->old_addr, + .new_addr = p->new_addr, + .mask = p->mask, + .flags = p->flags, + + .index = p->tcf_index, + .action = p->tcf_action, + .refcnt = p->tcf_refcnt - ref, + .bindcnt = p->tcf_bindcnt - bind, + }; + struct tcf_t t; + + if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + if (nla_put(skb, TCA_NAT_TM, sizeof(t), &t)) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_nat_ops = { + .kind = "nat", + .type = TCA_ACT_NAT, + .owner = THIS_MODULE, + .act = tcf_nat, + .dump = tcf_nat_dump, + .init = tcf_nat_init, +}; + +MODULE_DESCRIPTION("Stateless NAT actions"); +MODULE_LICENSE("GPL"); + +static int __init nat_init_module(void) +{ + return tcf_register_action(&act_nat_ops, NAT_TAB_MASK); +} + +static void __exit nat_cleanup_module(void) +{ + tcf_unregister_action(&act_nat_ops); +} + +module_init(nat_init_module); +module_exit(nat_cleanup_module); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 1742a68e012..5f9bcb2e080 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -9,277 +9,231 @@ * Authors: Jamal Hadi Salim (2002-4) */ -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/bitops.h> -#include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/proc_fs.h> -#include <net/sock.h> +#include <linux/slab.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_pedit.h> #include <net/tc_act/tc_pedit.h> +#define PEDIT_TAB_MASK 15 -#define PEDIT_DEB 1 - -/* use generic hash table */ -#define MY_TAB_SIZE 16 -#define MY_TAB_MASK 15 -static u32 idx_gen; -static struct tcf_pedit *tcf_pedit_ht[MY_TAB_SIZE]; -static DEFINE_RWLOCK(pedit_lock); - -#define tcf_st tcf_pedit -#define tc_st tc_pedit -#define tcf_t_lock pedit_lock -#define tcf_ht tcf_pedit_ht - -#define CONFIG_NET_ACT_INIT 1 -#include <net/pkt_act.h> +static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { + [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, +}; -static int -tcf_pedit_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a, - int ovr, int bind) +static int tcf_pedit_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { - struct rtattr *tb[TCA_PEDIT_MAX]; + struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; - int ret = 0; + int ret = 0, err; struct tcf_pedit *p; struct tc_pedit_key *keys = 
NULL; int ksize; - if (rta == NULL || rtattr_parse_nested(tb, TCA_PEDIT_MAX, rta) < 0) + if (nla == NULL) return -EINVAL; - if (tb[TCA_PEDIT_PARMS - 1] == NULL || - RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm)) + err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy); + if (err < 0) + return err; + + if (tb[TCA_PEDIT_PARMS] == NULL) return -EINVAL; - parm = RTA_DATA(tb[TCA_PEDIT_PARMS-1]); + parm = nla_data(tb[TCA_PEDIT_PARMS]); ksize = parm->nkeys * sizeof(struct tc_pedit_key); - if (RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm) + ksize) + if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize) return -EINVAL; - p = tcf_hash_check(parm->index, a, ovr, bind); - if (p == NULL) { + if (!tcf_hash_check(parm->index, a, bind)) { if (!parm->nkeys) return -EINVAL; - p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind); - if (p == NULL) - return -ENOMEM; + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + if (ret) + return ret; + p = to_pedit(a); keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) { - kfree(p); + tcf_hash_cleanup(a, est); return -ENOMEM; } ret = ACT_P_CREATED; } else { - if (!ovr) { - tcf_hash_release(p, bind); + p = to_pedit(a); + tcf_hash_release(a, bind); + if (bind) + return 0; + if (!ovr) return -EEXIST; - } - if (p->nkeys && p->nkeys != parm->nkeys) { + + if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) return -ENOMEM; } } - spin_lock_bh(&p->lock); - p->flags = parm->flags; - p->action = parm->action; + spin_lock_bh(&p->tcf_lock); + p->tcfp_flags = parm->flags; + p->tcf_action = parm->action; if (keys) { - kfree(p->keys); - p->keys = keys; - p->nkeys = parm->nkeys; + kfree(p->tcfp_keys); + p->tcfp_keys = keys; + p->tcfp_nkeys = parm->nkeys; } - memcpy(p->keys, parm->keys, ksize); - spin_unlock_bh(&p->lock); + memcpy(p->tcfp_keys, parm->keys, ksize); + spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(p); + tcf_hash_insert(a); return ret; } -static int -tcf_pedit_cleanup(struct tc_action *a, int bind) +static void tcf_pedit_cleanup(struct tc_action *a, int bind) { - struct tcf_pedit *p = PRIV(a, pedit); - - if (p != NULL) { - struct tc_pedit_key *keys = p->keys; - if (tcf_hash_release(p, bind)) { - kfree(keys); - return 1; - } - } - return 0; + struct tcf_pedit *p = a->priv; + struct tc_pedit_key *keys = p->tcfp_keys; + kfree(keys); } -static int -tcf_pedit(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) +static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { - struct tcf_pedit *p = PRIV(a, pedit); + struct tcf_pedit *p = a->priv; int i, munged = 0; - u8 *pptr; + unsigned int off; - if (!(skb->tc_verd & TC_OK2MUNGE)) { - /* should we set skb->cloned? 
*/ - if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { - return p->action; - } - } + if (skb_unclone(skb, GFP_ATOMIC)) + return p->tcf_action; - pptr = skb->nh.raw; + off = skb_network_offset(skb); - spin_lock(&p->lock); + spin_lock(&p->tcf_lock); - p->tm.lastuse = jiffies; + p->tcf_tm.lastuse = jiffies; - if (p->nkeys > 0) { - struct tc_pedit_key *tkey = p->keys; + if (p->tcfp_nkeys > 0) { + struct tc_pedit_key *tkey = p->tcfp_keys; - for (i = p->nkeys; i > 0; i--, tkey++) { - u32 *ptr; + for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { + u32 *ptr, _data; int offset = tkey->off; if (tkey->offmask) { - if (skb->len > tkey->at) { - char *j = pptr + tkey->at; - offset += ((*j & tkey->offmask) >> - tkey->shift); - } else { + char *d, _d; + + d = skb_header_pointer(skb, off + tkey->at, 1, + &_d); + if (!d) goto bad; - } + offset += (*d & tkey->offmask) >> tkey->shift; } if (offset % 4) { - printk("offset must be on 32 bit boundaries\n"); + pr_info("tc filter pedit" + " offset must be on 32 bit boundaries\n"); goto bad; } - if (skb->len < 0 || (offset > 0 && offset > skb->len)) { - printk("offset %d cant exceed pkt length %d\n", + if (offset > 0 && offset > skb->len) { + pr_info("tc filter pedit" + " offset %d can't exceed pkt length %d\n", offset, skb->len); goto bad; } - ptr = (u32 *)(pptr+offset); + ptr = skb_header_pointer(skb, off + offset, 4, &_data); + if (!ptr) + goto bad; /* just do it, baby */ *ptr = ((*ptr & tkey->mask) ^ tkey->val); + if (ptr == &_data) + skb_store_bits(skb, off + offset, ptr, 4); munged++; } - + if (munged) skb->tc_verd = SET_TC_MUNGED(skb->tc_verd); goto done; - } else { - printk("pedit BUG: index %d\n",p->index); - } + } else + WARN(1, "pedit BUG: index %d\n", p->tcf_index); bad: - p->qstats.overlimits++; + p->tcf_qstats.overlimits++; done: - p->bstats.bytes += skb->len; - p->bstats.packets++; - spin_unlock(&p->lock); - return p->action; + bstats_update(&p->tcf_bstats, skb); + spin_unlock(&p->tcf_lock); + return p->tcf_action; } -static int -tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,int bind, int ref) +static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); + struct tcf_pedit *p = a->priv; struct tc_pedit *opt; - struct tcf_pedit *p = PRIV(a, pedit); struct tcf_t t; - int s; - - s = sizeof(*opt) + p->nkeys * sizeof(struct tc_pedit_key); + int s; + + s = sizeof(*opt) + p->tcfp_nkeys * sizeof(struct tc_pedit_key); /* netlink spinlocks held above us - must use ATOMIC */ - opt = kmalloc(s, GFP_ATOMIC); - if (opt == NULL) + opt = kzalloc(s, GFP_ATOMIC); + if (unlikely(!opt)) return -ENOBUFS; - memset(opt, 0, s); - - memcpy(opt->keys, p->keys, p->nkeys * sizeof(struct tc_pedit_key)); - opt->index = p->index; - opt->nkeys = p->nkeys; - opt->flags = p->flags; - opt->action = p->action; - opt->refcnt = p->refcnt - ref; - opt->bindcnt = p->bindcnt - bind; - - -#ifdef PEDIT_DEB - { - /* Debug - get rid of later */ - int i; - struct tc_pedit_key *key = opt->keys; - - for (i=0; i<opt->nkeys; i++, key++) { - printk( "\n key #%d",i); - printk( " at %d: val %08x mask %08x", - (unsigned int)key->off, - (unsigned int)key->val, - (unsigned int)key->mask); - } - } -#endif - RTA_PUT(skb, TCA_PEDIT_PARMS, s, opt); - t.install = jiffies_to_clock_t(jiffies - p->tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); - t.expires = jiffies_to_clock_t(p->tm.expires); - RTA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t); + memcpy(opt->keys, p->tcfp_keys, + 
p->tcfp_nkeys * sizeof(struct tc_pedit_key)); + opt->index = p->tcf_index; + opt->nkeys = p->tcfp_nkeys; + opt->flags = p->tcfp_flags; + opt->action = p->tcf_action; + opt->refcnt = p->tcf_refcnt - ref; + opt->bindcnt = p->tcf_bindcnt - bind; + + if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) + goto nla_put_failure; + t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + if (nla_put(skb, TCA_PEDIT_TM, sizeof(t), &t)) + goto nla_put_failure; kfree(opt); return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); kfree(opt); return -1; } -static -struct tc_action_ops act_pedit_ops = { +static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .type = TCA_ACT_PEDIT, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_pedit, .dump = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup, - .lookup = tcf_hash_search, .init = tcf_pedit_init, - .walk = tcf_generic_walker }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); MODULE_DESCRIPTION("Generic Packet Editor actions"); MODULE_LICENSE("GPL"); -static int __init -pedit_init_module(void) +static int __init pedit_init_module(void) { - return tcf_register_action(&act_pedit_ops); + return tcf_register_action(&act_pedit_ops, PEDIT_TAB_MASK); } -static void __exit -pedit_cleanup_module(void) +static void __exit pedit_cleanup_module(void) { tcf_unregister_action(&act_pedit_ops); } diff --git a/net/sched/act_police.c b/net/sched/act_police.c index fa877f8f652..0566e4606a4 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -10,368 +10,335 @@ * J Hadi Salim (action changes) */ -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> -#include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/init.h> -#include <net/sock.h> +#include <linux/slab.h> #include <net/act_api.h> - -#define L2T(p,L) ((p)->R_tab->data[(L)>>(p)->R_tab->rate.cell_log]) -#define L2T_P(p,L) ((p)->P_tab->data[(L)>>(p)->P_tab->rate.cell_log]) -#define PRIV(a) ((struct tcf_police *) (a)->priv) - -/* use generic hash table */ -#define MY_TAB_SIZE 16 -#define MY_TAB_MASK 15 -static u32 idx_gen; -static struct tcf_police *tcf_police_ht[MY_TAB_SIZE]; -/* Policer hash table lock */ -static DEFINE_RWLOCK(police_lock); +#include <net/netlink.h> + +struct tcf_police { + struct tcf_common common; + int tcfp_result; + u32 tcfp_ewma_rate; + s64 tcfp_burst; + u32 tcfp_mtu; + s64 tcfp_toks; + s64 tcfp_ptoks; + s64 tcfp_mtu_ptoks; + s64 tcfp_t_c; + struct psched_ratecfg rate; + bool rate_present; + struct psched_ratecfg peak; + bool peak_present; +}; +#define to_police(pc) \ + container_of(pc, struct tcf_police, common) + +#define POL_TAB_MASK 15 + +/* old policer structure from before tc actions */ +struct tc_police_compat { + u32 index; + int action; + u32 limit; + u32 burst; + u32 mtu; + struct tc_ratespec rate; + struct tc_ratespec peakrate; +}; /* Each policer is serialized by its individual spinlock */ -static __inline__ unsigned tcf_police_hash(u32 index) -{ - return index&0xF; -} - -static __inline__ struct tcf_police * 
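The pedit fast path above reduces each key to a masked xor on a 32-bit word at a validated, 4-byte-aligned offset. A minimal standalone sketch of that step (plain C99, illustrative names only; it mirrors the `*ptr = (*ptr & mask) ^ val` operation on the raw in-memory word, not the kernel's skb helpers):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct edit_key {
	uint32_t off;	/* byte offset into the buffer, must be 4-aligned */
	uint32_t val;	/* bits to xor in after masking */
	uint32_t mask;	/* bits to keep from the original word */
};

static int apply_edits(uint8_t *pkt, size_t len,
		       const struct edit_key *keys, size_t nkeys)
{
	for (size_t i = 0; i < nkeys; i++) {
		uint32_t word;

		/* mirror the kernel's alignment and bounds checks */
		if (keys[i].off % 4 || (size_t)keys[i].off + 4 > len)
			return -1;
		memcpy(&word, pkt + keys[i].off, 4);	/* unaligned-safe load */
		word = (word & keys[i].mask) ^ keys[i].val;
		memcpy(pkt + keys[i].off, &word, 4);	/* store the edited word */
	}
	return 0;
}

int main(void)
{
	uint8_t pkt[8] = { 0x45, 0x00, 0x00, 0x54, 0x11, 0x22, 0x33, 0x44 };
	/* replace the whole word at offset 4: mask keeps nothing, val is xored in */
	struct edit_key k = { .off = 4, .val = 0xdeadbeef, .mask = 0x0 };

	apply_edits(pkt, sizeof(pkt), &k, 1);
	for (int i = 0; i < 8; i++)
		printf("%02x ", pkt[i]);
	printf("\n");
	return 0;
}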
tcf_police_lookup(u32 index) +static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb, + int type, struct tc_action *a) { - struct tcf_police *p; - - read_lock(&police_lock); - for (p = tcf_police_ht[tcf_police_hash(index)]; p; p = p->next) { - if (p->index == index) - break; - } - read_unlock(&police_lock); - return p; -} - -#ifdef CONFIG_NET_CLS_ACT -static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, - int type, struct tc_action *a) -{ - struct tcf_police *p; + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct hlist_head *head; + struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; - struct rtattr *r; + struct nlattr *nest; - read_lock(&police_lock); + spin_lock_bh(&hinfo->lock); s_i = cb->args[0]; - for (i = 0; i < MY_TAB_SIZE; i++) { - p = tcf_police_ht[tcf_police_hash(i)]; + for (i = 0; i < (POL_TAB_MASK + 1); i++) { + head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)]; - for (; p; p = p->next) { + hlist_for_each_entry_rcu(p, head, tcfc_head) { index++; if (index < s_i) continue; a->priv = p; a->order = index; - r = (struct rtattr*) skb->tail; - RTA_PUT(skb, a->order, 0, NULL); + nest = nla_nest_start(skb, a->order); + if (nest == NULL) + goto nla_put_failure; if (type == RTM_DELACTION) err = tcf_action_dump_1(skb, a, 0, 1); else err = tcf_action_dump_1(skb, a, 0, 0); if (err < 0) { index--; - skb_trim(skb, (u8*)r - skb->data); + nla_nest_cancel(skb, nest); goto done; } - r->rta_len = skb->tail - (u8*)r; + nla_nest_end(skb, nest); n_i++; } } done: - read_unlock(&police_lock); + spin_unlock_bh(&hinfo->lock); if (n_i) cb->args[0] += n_i; return n_i; -rtattr_failure: - skb_trim(skb, (u8*)r - skb->data); +nla_put_failure: + nla_nest_cancel(skb, nest); goto done; } -static inline int -tcf_hash_search(struct tc_action *a, u32 index) -{ - struct tcf_police *p = tcf_police_lookup(index); - - if (p != NULL) { - a->priv = p; - return 1; - } else { - return 0; - } -} -#endif - -static inline u32 tcf_police_new_index(void) -{ - do { - if (++idx_gen == 0) - idx_gen = 1; - } while (tcf_police_lookup(idx_gen)); - - return idx_gen; -} - -void tcf_police_destroy(struct tcf_police *p) -{ - unsigned h = tcf_police_hash(p->index); - struct tcf_police **p1p; - - for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->next) { - if (*p1p == p) { - write_lock_bh(&police_lock); - *p1p = p->next; - write_unlock_bh(&police_lock); -#ifdef CONFIG_NET_ESTIMATOR - gen_kill_estimator(&p->bstats, &p->rate_est); -#endif - if (p->R_tab) - qdisc_put_rtab(p->R_tab); - if (p->P_tab) - qdisc_put_rtab(p->P_tab); - kfree(p); - return; - } - } - BUG_TRAP(0); -} +static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { + [TCA_POLICE_RATE] = { .len = TC_RTAB_SIZE }, + [TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE }, + [TCA_POLICE_AVRATE] = { .type = NLA_U32 }, + [TCA_POLICE_RESULT] = { .type = NLA_U32 }, +}; -#ifdef CONFIG_NET_CLS_ACT -static int tcf_act_police_locate(struct rtattr *rta, struct rtattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_act_police_locate(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { - unsigned h; + unsigned int h; int ret = 0, err; - struct rtattr *tb[TCA_POLICE_MAX]; + struct nlattr *tb[TCA_POLICE_MAX + 1]; struct tc_police *parm; - struct tcf_police *p; + struct tcf_police *police; struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; + struct tcf_hashinfo *hinfo = a->ops->hinfo; + int size; - if (rta == NULL || rtattr_parse_nested(tb, 
TCA_POLICE_MAX, rta) < 0) + if (nla == NULL) return -EINVAL; - if (tb[TCA_POLICE_TBF-1] == NULL || - RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]) != sizeof(*parm)) - return -EINVAL; - parm = RTA_DATA(tb[TCA_POLICE_TBF-1]); + err = nla_parse_nested(tb, TCA_POLICE_MAX, nla, police_policy); + if (err < 0) + return err; - if (tb[TCA_POLICE_RESULT-1] != NULL && - RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32)) + if (tb[TCA_POLICE_TBF] == NULL) return -EINVAL; - if (tb[TCA_POLICE_RESULT-1] != NULL && - RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32)) + size = nla_len(tb[TCA_POLICE_TBF]); + if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat)) return -EINVAL; - - if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) { - a->priv = p; - if (bind) { - p->bindcnt += 1; - p->refcnt += 1; + parm = nla_data(tb[TCA_POLICE_TBF]); + + if (parm->index) { + if (tcf_hash_search(a, parm->index)) { + police = to_police(a->priv); + if (bind) { + police->tcf_bindcnt += 1; + police->tcf_refcnt += 1; + return 0; + } + if (ovr) + goto override; + /* not replacing */ + return -EEXIST; } - if (ovr) - goto override; - return ret; } - p = kmalloc(sizeof(*p), GFP_KERNEL); - if (p == NULL) + police = kzalloc(sizeof(*police), GFP_KERNEL); + if (police == NULL) return -ENOMEM; - memset(p, 0, sizeof(*p)); - ret = ACT_P_CREATED; - p->refcnt = 1; - spin_lock_init(&p->lock); - p->stats_lock = &p->lock; + police->tcf_refcnt = 1; + spin_lock_init(&police->tcf_lock); if (bind) - p->bindcnt = 1; + police->tcf_bindcnt = 1; override: if (parm->rate.rate) { err = -ENOMEM; - R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1]); + R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE]); if (R_tab == NULL) goto failure; + if (parm->peakrate.rate) { P_tab = qdisc_get_rtab(&parm->peakrate, - tb[TCA_POLICE_PEAKRATE-1]); - if (p->P_tab == NULL) { - qdisc_put_rtab(R_tab); + tb[TCA_POLICE_PEAKRATE]); + if (P_tab == NULL) goto failure; - } } } + + spin_lock_bh(&police->tcf_lock); + if (est) { + err = gen_replace_estimator(&police->tcf_bstats, + &police->tcf_rate_est, + &police->tcf_lock, est); + if (err) + goto failure_unlock; + } else if (tb[TCA_POLICE_AVRATE] && + (ret == ACT_P_CREATED || + !gen_estimator_active(&police->tcf_bstats, + &police->tcf_rate_est))) { + err = -EINVAL; + goto failure_unlock; + } + /* No failure allowed after this point */ - spin_lock_bh(&p->lock); - if (R_tab != NULL) { - qdisc_put_rtab(p->R_tab); - p->R_tab = R_tab; + police->tcfp_mtu = parm->mtu; + if (police->tcfp_mtu == 0) { + police->tcfp_mtu = ~0; + if (R_tab) + police->tcfp_mtu = 255 << R_tab->rate.cell_log; } - if (P_tab != NULL) { - qdisc_put_rtab(p->P_tab); - p->P_tab = P_tab; + if (R_tab) { + police->rate_present = true; + psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0); + qdisc_put_rtab(R_tab); + } else { + police->rate_present = false; + } + if (P_tab) { + police->peak_present = true; + psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0); + qdisc_put_rtab(P_tab); + } else { + police->peak_present = false; } - if (tb[TCA_POLICE_RESULT-1]) - p->result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]); - p->toks = p->burst = parm->burst; - p->mtu = parm->mtu; - if (p->mtu == 0) { - p->mtu = ~0; - if (p->R_tab) - p->mtu = 255<<p->R_tab->rate.cell_log; + if (tb[TCA_POLICE_RESULT]) + police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); + police->tcfp_burst = PSCHED_TICKS2NS(parm->burst); + police->tcfp_toks = police->tcfp_burst; + if (police->peak_present) { + police->tcfp_mtu_ptoks = (s64) 
psched_l2t_ns(&police->peak, + police->tcfp_mtu); + police->tcfp_ptoks = police->tcfp_mtu_ptoks; } - if (p->P_tab) - p->ptoks = L2T_P(p, p->mtu); - p->action = parm->action; - -#ifdef CONFIG_NET_ESTIMATOR - if (tb[TCA_POLICE_AVRATE-1]) - p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]); - if (est) - gen_replace_estimator(&p->bstats, &p->rate_est, p->stats_lock, est); -#endif - - spin_unlock_bh(&p->lock); + police->tcf_action = parm->action; + + if (tb[TCA_POLICE_AVRATE]) + police->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); + + spin_unlock_bh(&police->tcf_lock); if (ret != ACT_P_CREATED) return ret; - PSCHED_GET_TIME(p->t_c); - p->index = parm->index ? : tcf_police_new_index(); - h = tcf_police_hash(p->index); - write_lock_bh(&police_lock); - p->next = tcf_police_ht[h]; - tcf_police_ht[h] = p; - write_unlock_bh(&police_lock); + police->tcfp_t_c = ktime_to_ns(ktime_get()); + police->tcf_index = parm->index ? parm->index : + tcf_hash_new_index(hinfo); + h = tcf_hash(police->tcf_index, POL_TAB_MASK); + spin_lock_bh(&hinfo->lock); + hlist_add_head(&police->tcf_head, &hinfo->htab[h]); + spin_unlock_bh(&hinfo->lock); - a->priv = p; + a->priv = police; return ret; +failure_unlock: + spin_unlock_bh(&police->tcf_lock); failure: + qdisc_put_rtab(P_tab); + qdisc_put_rtab(R_tab); if (ret == ACT_P_CREATED) - kfree(p); + kfree(police); return err; } -static int tcf_act_police_cleanup(struct tc_action *a, int bind) +static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { - struct tcf_police *p = PRIV(a); - - if (p != NULL) - return tcf_police_release(p, bind); - return 0; -} - -static int tcf_act_police(struct sk_buff *skb, struct tc_action *a, - struct tcf_result *res) -{ - psched_time_t now; - struct tcf_police *p = PRIV(a); - long toks; - long ptoks = 0; - - spin_lock(&p->lock); - - p->bstats.bytes += skb->len; - p->bstats.packets++; - -#ifdef CONFIG_NET_ESTIMATOR - if (p->ewma_rate && p->rate_est.bps >= p->ewma_rate) { - p->qstats.overlimits++; - spin_unlock(&p->lock); - return p->action; + struct tcf_police *police = a->priv; + s64 now; + s64 toks; + s64 ptoks = 0; + + spin_lock(&police->tcf_lock); + + bstats_update(&police->tcf_bstats, skb); + + if (police->tcfp_ewma_rate && + police->tcf_rate_est.bps >= police->tcfp_ewma_rate) { + police->tcf_qstats.overlimits++; + if (police->tcf_action == TC_ACT_SHOT) + police->tcf_qstats.drops++; + spin_unlock(&police->tcf_lock); + return police->tcf_action; } -#endif - if (skb->len <= p->mtu) { - if (p->R_tab == NULL) { - spin_unlock(&p->lock); - return p->result; + if (qdisc_pkt_len(skb) <= police->tcfp_mtu) { + if (!police->rate_present) { + spin_unlock(&police->tcf_lock); + return police->tcfp_result; } - PSCHED_GET_TIME(now); - - toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst); - - if (p->P_tab) { - ptoks = toks + p->ptoks; - if (ptoks > (long)L2T_P(p, p->mtu)) - ptoks = (long)L2T_P(p, p->mtu); - ptoks -= L2T_P(p, skb->len); + now = ktime_to_ns(ktime_get()); + toks = min_t(s64, now - police->tcfp_t_c, + police->tcfp_burst); + if (police->peak_present) { + ptoks = toks + police->tcfp_ptoks; + if (ptoks > police->tcfp_mtu_ptoks) + ptoks = police->tcfp_mtu_ptoks; + ptoks -= (s64) psched_l2t_ns(&police->peak, + qdisc_pkt_len(skb)); } - toks += p->toks; - if (toks > (long)p->burst) - toks = p->burst; - toks -= L2T(p, skb->len); - + toks += police->tcfp_toks; + if (toks > police->tcfp_burst) + toks = police->tcfp_burst; + toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb)); if ((toks|ptoks) 
>= 0) { - p->t_c = now; - p->toks = toks; - p->ptoks = ptoks; - spin_unlock(&p->lock); - return p->result; + police->tcfp_t_c = now; + police->tcfp_toks = toks; + police->tcfp_ptoks = ptoks; + spin_unlock(&police->tcf_lock); + return police->tcfp_result; } } - p->qstats.overlimits++; - spin_unlock(&p->lock); - return p->action; + police->tcf_qstats.overlimits++; + if (police->tcf_action == TC_ACT_SHOT) + police->tcf_qstats.drops++; + spin_unlock(&police->tcf_lock); + return police->tcf_action; } static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - unsigned char *b = skb->tail; - struct tc_police opt; - struct tcf_police *p = PRIV(a); - - opt.index = p->index; - opt.action = p->action; - opt.mtu = p->mtu; - opt.burst = p->burst; - opt.refcnt = p->refcnt - ref; - opt.bindcnt = p->bindcnt - bind; - if (p->R_tab) - opt.rate = p->R_tab->rate; - else - memset(&opt.rate, 0, sizeof(opt.rate)); - if (p->P_tab) - opt.peakrate = p->P_tab->rate; - else - memset(&opt.peakrate, 0, sizeof(opt.peakrate)); - RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); - if (p->result) - RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result); -#ifdef CONFIG_NET_ESTIMATOR - if (p->ewma_rate) - RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate); -#endif + unsigned char *b = skb_tail_pointer(skb); + struct tcf_police *police = a->priv; + struct tc_police opt = { + .index = police->tcf_index, + .action = police->tcf_action, + .mtu = police->tcfp_mtu, + .burst = PSCHED_NS2TICKS(police->tcfp_burst), + .refcnt = police->tcf_refcnt - ref, + .bindcnt = police->tcf_bindcnt - bind, + }; + + if (police->rate_present) + psched_ratecfg_getrate(&opt.rate, &police->rate); + if (police->peak_present) + psched_ratecfg_getrate(&opt.peakrate, &police->peak); + if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) + goto nla_put_failure; + if (police->tcfp_result && + nla_put_u32(skb, TCA_POLICE_RESULT, police->tcfp_result)) + goto nla_put_failure; + if (police->tcfp_ewma_rate && + nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate)) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } @@ -382,20 +349,17 @@ MODULE_LICENSE("GPL"); static struct tc_action_ops act_police_ops = { .kind = "police", .type = TCA_ID_POLICE, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_act_police, .dump = tcf_act_police_dump, - .cleanup = tcf_act_police_cleanup, - .lookup = tcf_hash_search, .init = tcf_act_police_locate, - .walk = tcf_generic_walker + .walk = tcf_act_police_walker }; static int __init police_init_module(void) { - return tcf_register_action(&act_police_ops); + return tcf_register_action(&act_police_ops, POL_TAB_MASK); } static void __exit @@ -406,199 +370,3 @@ police_cleanup_module(void) module_init(police_init_module); module_exit(police_cleanup_module); - -#else /* CONFIG_NET_CLS_ACT */ - -struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est) -{ - unsigned h; - struct tcf_police *p; - struct rtattr *tb[TCA_POLICE_MAX]; - struct tc_police *parm; - - if (rtattr_parse_nested(tb, TCA_POLICE_MAX, rta) < 0) - return NULL; - - if (tb[TCA_POLICE_TBF-1] == NULL || - RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]) != sizeof(*parm)) - return NULL; - - parm = RTA_DATA(tb[TCA_POLICE_TBF-1]); - - if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) { - p->refcnt++; - return p; - } - - p = kmalloc(sizeof(*p), GFP_KERNEL); - if (p == NULL) - return NULL; - - memset(p, 0, sizeof(*p)); - 
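The conform/exceed decision in the rewritten policer is a token bucket kept in nanoseconds of line time. Below is a self-contained userspace sketch of the same arithmetic under simplifying assumptions: single rate only (peak-rate tokens omitted), CLOCK_MONOTONIC standing in for ktime, and invented names rather than the kernel's psched helpers.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct policer {
	int64_t burst_ns;	/* bucket depth, in ns of transmission time */
	int64_t toks;		/* current tokens, in ns */
	int64_t t_c;		/* timestamp of the last conforming packet */
	uint64_t rate_bps;	/* configured rate, bits per second */
};

static int64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

/* nanoseconds needed to transmit len bytes at rate_bps */
static int64_t len_to_ns(const struct policer *p, uint32_t len)
{
	return (int64_t)len * 8 * 1000000000LL / (int64_t)p->rate_bps;
}

/* returns 1 if the packet conforms, 0 if it exceeds the rate */
static int police(struct policer *p, uint32_t len)
{
	int64_t now = now_ns();
	int64_t toks = now - p->t_c;

	if (toks > p->burst_ns)		/* idle time refills only up to the burst */
		toks = p->burst_ns;
	toks += p->toks;
	if (toks > p->burst_ns)
		toks = p->burst_ns;
	toks -= len_to_ns(p, len);	/* charge this packet's transmission time */

	if (toks >= 0) {		/* enough credit: consume it and conform */
		p->t_c = now;
		p->toks = toks;
		return 1;
	}
	return 0;			/* over the rate: leave state untouched */
}

int main(void)
{
	struct policer p = {
		.burst_ns = 10 * 1000000LL,	/* 10 ms worth of line time */
		.rate_bps = 10 * 1000000ULL,	/* 10 Mbit/s */
	};

	p.t_c = now_ns();
	p.toks = p.burst_ns;
	for (int i = 0; i < 20; i++)
		printf("packet %2d (1500B): %s\n", i,
		       police(&p, 1500) ? "conform" : "exceed");
	return 0;
}

Accounting directly in nanoseconds is what allows the rewritten action to precompute a rate configuration and drop the per-packet rate-table lookups the removed code relied on.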
p->refcnt = 1; - spin_lock_init(&p->lock); - p->stats_lock = &p->lock; - if (parm->rate.rate) { - p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1]); - if (p->R_tab == NULL) - goto failure; - if (parm->peakrate.rate) { - p->P_tab = qdisc_get_rtab(&parm->peakrate, - tb[TCA_POLICE_PEAKRATE-1]); - if (p->P_tab == NULL) - goto failure; - } - } - if (tb[TCA_POLICE_RESULT-1]) { - if (RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32)) - goto failure; - p->result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]); - } -#ifdef CONFIG_NET_ESTIMATOR - if (tb[TCA_POLICE_AVRATE-1]) { - if (RTA_PAYLOAD(tb[TCA_POLICE_AVRATE-1]) != sizeof(u32)) - goto failure; - p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]); - } -#endif - p->toks = p->burst = parm->burst; - p->mtu = parm->mtu; - if (p->mtu == 0) { - p->mtu = ~0; - if (p->R_tab) - p->mtu = 255<<p->R_tab->rate.cell_log; - } - if (p->P_tab) - p->ptoks = L2T_P(p, p->mtu); - PSCHED_GET_TIME(p->t_c); - p->index = parm->index ? : tcf_police_new_index(); - p->action = parm->action; -#ifdef CONFIG_NET_ESTIMATOR - if (est) - gen_new_estimator(&p->bstats, &p->rate_est, p->stats_lock, est); -#endif - h = tcf_police_hash(p->index); - write_lock_bh(&police_lock); - p->next = tcf_police_ht[h]; - tcf_police_ht[h] = p; - write_unlock_bh(&police_lock); - return p; - -failure: - if (p->R_tab) - qdisc_put_rtab(p->R_tab); - kfree(p); - return NULL; -} - -int tcf_police(struct sk_buff *skb, struct tcf_police *p) -{ - psched_time_t now; - long toks; - long ptoks = 0; - - spin_lock(&p->lock); - - p->bstats.bytes += skb->len; - p->bstats.packets++; - -#ifdef CONFIG_NET_ESTIMATOR - if (p->ewma_rate && p->rate_est.bps >= p->ewma_rate) { - p->qstats.overlimits++; - spin_unlock(&p->lock); - return p->action; - } -#endif - - if (skb->len <= p->mtu) { - if (p->R_tab == NULL) { - spin_unlock(&p->lock); - return p->result; - } - - PSCHED_GET_TIME(now); - - toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst); - - if (p->P_tab) { - ptoks = toks + p->ptoks; - if (ptoks > (long)L2T_P(p, p->mtu)) - ptoks = (long)L2T_P(p, p->mtu); - ptoks -= L2T_P(p, skb->len); - } - toks += p->toks; - if (toks > (long)p->burst) - toks = p->burst; - toks -= L2T(p, skb->len); - - if ((toks|ptoks) >= 0) { - p->t_c = now; - p->toks = toks; - p->ptoks = ptoks; - spin_unlock(&p->lock); - return p->result; - } - } - - p->qstats.overlimits++; - spin_unlock(&p->lock); - return p->action; -} -EXPORT_SYMBOL(tcf_police); - -int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p) -{ - unsigned char *b = skb->tail; - struct tc_police opt; - - opt.index = p->index; - opt.action = p->action; - opt.mtu = p->mtu; - opt.burst = p->burst; - if (p->R_tab) - opt.rate = p->R_tab->rate; - else - memset(&opt.rate, 0, sizeof(opt.rate)); - if (p->P_tab) - opt.peakrate = p->P_tab->rate; - else - memset(&opt.peakrate, 0, sizeof(opt.peakrate)); - RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); - if (p->result) - RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result); -#ifdef CONFIG_NET_ESTIMATOR - if (p->ewma_rate) - RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate); -#endif - return skb->len; - -rtattr_failure: - skb_trim(skb, b - skb->data); - return -1; -} - -int tcf_police_dump_stats(struct sk_buff *skb, struct tcf_police *p) -{ - struct gnet_dump d; - - if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, - TCA_XSTATS, p->stats_lock, &d) < 0) - goto errout; - - if (gnet_stats_copy_basic(&d, &p->bstats) < 0 || -#ifdef CONFIG_NET_ESTIMATOR - gnet_stats_copy_rate_est(&d, &p->rate_est) < 0 || -#endif - 
gnet_stats_copy_queue(&d, &p->qstats) < 0) - goto errout; - - if (gnet_stats_finish_copy(&d) < 0) - goto errout; - - return 0; - -errout: - return -1; -} - -#endif /* CONFIG_NET_CLS_ACT */ diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index e5f2e1f431e..992c2317ce8 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -6,69 +6,168 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Authors: Jamal Hadi Salim (2005) + * Authors: Jamal Hadi Salim (2005-8) * */ -#include <linux/config.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #define TCA_ACT_SIMP 22 -/* XXX: Hide all these common elements under some macro - * probably -*/ #include <linux/tc_act/tc_defact.h> #include <net/tc_act/tc_defact.h> -/* use generic hash table with 8 buckets */ -#define MY_TAB_SIZE 8 -#define MY_TAB_MASK (MY_TAB_SIZE - 1) -static u32 idx_gen; -static struct tcf_defact *tcf_simp_ht[MY_TAB_SIZE]; -static DEFINE_RWLOCK(simp_lock); +#define SIMP_TAB_MASK 7 -/* override the defaults */ -#define tcf_st tcf_defact -#define tc_st tc_defact -#define tcf_t_lock simp_lock -#define tcf_ht tcf_simp_ht +#define SIMP_MAX_DATA 32 +static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_defact *d = a->priv; + + spin_lock(&d->tcf_lock); + d->tcf_tm.lastuse = jiffies; + bstats_update(&d->tcf_bstats, skb); + + /* print policy string followed by _ then packet count + * Example if this was the 3rd packet and the string was "hello" + * then it would look like "hello_3" (without quotes) + */ + pr_info("simple: %s_%d\n", + (char *)d->tcfd_defdata, d->tcf_bstats.packets); + spin_unlock(&d->tcf_lock); + return d->tcf_action; +} + +static void tcf_simp_release(struct tc_action *a, int bind) +{ + struct tcf_defact *d = to_defact(a); + kfree(d->tcfd_defdata); +} + +static int alloc_defdata(struct tcf_defact *d, char *defdata) +{ + d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL); + if (unlikely(!d->tcfd_defdata)) + return -ENOMEM; + strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + return 0; +} + +static void reset_policy(struct tcf_defact *d, char *defdata, + struct tc_defact *p) +{ + spin_lock_bh(&d->tcf_lock); + d->tcf_action = p->action; + memset(d->tcfd_defdata, 0, SIMP_MAX_DATA); + strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + spin_unlock_bh(&d->tcf_lock); +} + +static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { + [TCA_DEF_PARMS] = { .len = sizeof(struct tc_defact) }, + [TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA }, +}; -#define CONFIG_NET_ACT_INIT 1 -#include <net/pkt_act.h> -#include <net/act_generic.h> +static int tcf_simp_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) +{ + struct nlattr *tb[TCA_DEF_MAX + 1]; + struct tc_defact *parm; + struct tcf_defact *d; + char *defdata; + int ret = 0, err; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_DEF_MAX, nla, simple_policy); + if (err < 0) + return err; + + if (tb[TCA_DEF_PARMS] == NULL) + return -EINVAL; + + if (tb[TCA_DEF_DATA] == NULL) + return -EINVAL; + + parm = nla_data(tb[TCA_DEF_PARMS]); + defdata = nla_data(tb[TCA_DEF_DATA]); + + if (!tcf_hash_check(parm->index, a, bind)) { + ret = 
tcf_hash_create(parm->index, est, a, sizeof(*d), bind); + if (ret) + return ret; + + d = to_defact(a); + ret = alloc_defdata(d, defdata); + if (ret < 0) { + tcf_hash_cleanup(a, est); + return ret; + } + d->tcf_action = parm->action; + ret = ACT_P_CREATED; + } else { + d = to_defact(a); -static int tcf_simp(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) + if (bind) + return 0; + tcf_hash_release(a, bind); + if (!ovr) + return -EEXIST; + + reset_policy(d, defdata, parm); + } + + if (ret == ACT_P_CREATED) + tcf_hash_insert(a); + return ret; +} + +static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) { - struct tcf_defact *p = PRIV(a, defact); - - spin_lock(&p->lock); - p->tm.lastuse = jiffies; - p->bstats.bytes += skb->len; - p->bstats.packets++; - - /* print policy string followed by _ then packet count - * Example if this was the 3rd packet and the string was "hello" - * then it would look like "hello_3" (without quotes) - **/ - printk("simple: %s_%d\n", (char *)p->defdata, p->bstats.packets); - spin_unlock(&p->lock); - return p->action; + unsigned char *b = skb_tail_pointer(skb); + struct tcf_defact *d = a->priv; + struct tc_defact opt = { + .index = d->tcf_index, + .refcnt = d->tcf_refcnt - ref, + .bindcnt = d->tcf_bindcnt - bind, + .action = d->tcf_action, + }; + struct tcf_t t; + + if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) || + nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata)) + goto nla_put_failure; + t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(d->tcf_tm.expires); + if (nla_put(skb, TCA_DEF_TM, sizeof(t), &t)) + goto nla_put_failure; + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; } static struct tc_action_ops act_simp_ops = { - .kind = "simple", - .type = TCA_ACT_SIMP, - .capab = TCA_CAP_NONE, - .owner = THIS_MODULE, - .act = tcf_simp, - tca_use_default_ops + .kind = "simple", + .type = TCA_ACT_SIMP, + .owner = THIS_MODULE, + .act = tcf_simp, + .dump = tcf_simp_dump, + .cleanup = tcf_simp_release, + .init = tcf_simp_init, }; MODULE_AUTHOR("Jamal Hadi Salim(2005)"); @@ -77,9 +176,10 @@ MODULE_LICENSE("GPL"); static int __init simp_init_module(void) { - int ret = tcf_register_action(&act_simp_ops); + int ret; + ret = tcf_register_action(&act_simp_ops, SIMP_TAB_MASK); if (!ret) - printk("Simple TC action Loaded\n"); + pr_info("Simple TC action Loaded\n"); return ret; } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c new file mode 100644 index 00000000000..fcfeeaf838b --- /dev/null +++ b/net/sched/act_skbedit.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2008, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, see <http://www.gnu.org/licenses/>. 
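The ->init() callbacks for pedit, simple and skbedit all share one create-or-reuse shape: look the action up by index, take a reference on bind, refuse a plain re-add, and reset parameters only on override. A loose sketch of that control flow over a fixed-size table (illustrative names; the kernel's tcf_hash_* helpers implement the same idea over a real hash table with locking):

#include <errno.h>
#include <stdio.h>

#define MAX_ACTIONS 16

struct action {
	unsigned int index;	/* 0 means "slot unused" */
	int refcnt;
	int bindcnt;
	int param;		/* stand-in for the action's configuration */
};

static struct action table[MAX_ACTIONS];

static struct action *lookup(unsigned int index)
{
	for (int i = 0; i < MAX_ACTIONS; i++)
		if (table[i].index == index)
			return &table[i];
	return NULL;
}

/* returns 1 on creation, 0 on reuse, negative errno on failure */
static int action_init(unsigned int index, int param, int ovr, int bind)
{
	struct action *a = lookup(index);

	if (!a) {				/* not found: create a new entry */
		a = lookup(0);
		if (!a)
			return -ENOMEM;
		a->index = index;
		a->refcnt = 1;
		a->bindcnt = bind ? 1 : 0;
		a->param = param;
		return 1;			/* like ACT_P_CREATED */
	}
	if (bind) {				/* a filter binds to the existing action */
		a->bindcnt++;
		a->refcnt++;
		return 0;
	}
	if (!ovr)				/* plain add of an existing index */
		return -EEXIST;
	a->param = param;			/* override: keep identity, reset config */
	return 0;
}

int main(void)
{
	printf("create #5: %d\n", action_init(5, 10, 0, 0));	/* 1 */
	printf("bind   #5: %d\n", action_init(5, 10, 0, 1));	/* 0 */
	printf("re-add #5: %d\n", action_init(5, 20, 0, 0));	/* -EEXIST */
	printf("ovr    #5: %d\n", action_init(5, 20, 1, 0));	/* 0 */
	return 0;
}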
+ * + * Author: Alexander Duyck <alexander.h.duyck@intel.com> + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> + +#include <linux/tc_act/tc_skbedit.h> +#include <net/tc_act/tc_skbedit.h> + +#define SKBEDIT_TAB_MASK 15 + +static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_skbedit *d = a->priv; + + spin_lock(&d->tcf_lock); + d->tcf_tm.lastuse = jiffies; + bstats_update(&d->tcf_bstats, skb); + + if (d->flags & SKBEDIT_F_PRIORITY) + skb->priority = d->priority; + if (d->flags & SKBEDIT_F_QUEUE_MAPPING && + skb->dev->real_num_tx_queues > d->queue_mapping) + skb_set_queue_mapping(skb, d->queue_mapping); + if (d->flags & SKBEDIT_F_MARK) + skb->mark = d->mark; + + spin_unlock(&d->tcf_lock); + return d->tcf_action; +} + +static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { + [TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) }, + [TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) }, + [TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) }, + [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, +}; + +static int tcf_skbedit_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) +{ + struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; + struct tc_skbedit *parm; + struct tcf_skbedit *d; + u32 flags = 0, *priority = NULL, *mark = NULL; + u16 *queue_mapping = NULL; + int ret = 0, err; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy); + if (err < 0) + return err; + + if (tb[TCA_SKBEDIT_PARMS] == NULL) + return -EINVAL; + + if (tb[TCA_SKBEDIT_PRIORITY] != NULL) { + flags |= SKBEDIT_F_PRIORITY; + priority = nla_data(tb[TCA_SKBEDIT_PRIORITY]); + } + + if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) { + flags |= SKBEDIT_F_QUEUE_MAPPING; + queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]); + } + + if (tb[TCA_SKBEDIT_MARK] != NULL) { + flags |= SKBEDIT_F_MARK; + mark = nla_data(tb[TCA_SKBEDIT_MARK]); + } + + if (!flags) + return -EINVAL; + + parm = nla_data(tb[TCA_SKBEDIT_PARMS]); + + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); + if (ret) + return ret; + + d = to_skbedit(a); + ret = ACT_P_CREATED; + } else { + d = to_skbedit(a); + if (bind) + return 0; + tcf_hash_release(a, bind); + if (!ovr) + return -EEXIST; + } + + spin_lock_bh(&d->tcf_lock); + + d->flags = flags; + if (flags & SKBEDIT_F_PRIORITY) + d->priority = *priority; + if (flags & SKBEDIT_F_QUEUE_MAPPING) + d->queue_mapping = *queue_mapping; + if (flags & SKBEDIT_F_MARK) + d->mark = *mark; + + d->tcf_action = parm->action; + + spin_unlock_bh(&d->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(a); + return ret; +} + +static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_skbedit *d = a->priv; + struct tc_skbedit opt = { + .index = d->tcf_index, + .refcnt = d->tcf_refcnt - ref, + .bindcnt = d->tcf_bindcnt - bind, + .action = d->tcf_action, + }; + struct tcf_t t; + + if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + if ((d->flags & SKBEDIT_F_PRIORITY) && + nla_put(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority), + &d->priority)) + goto nla_put_failure; + if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) && + nla_put(skb, TCA_SKBEDIT_QUEUE_MAPPING, + 
sizeof(d->queue_mapping), &d->queue_mapping)) + goto nla_put_failure; + if ((d->flags & SKBEDIT_F_MARK) && + nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark), + &d->mark)) + goto nla_put_failure; + t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(d->tcf_tm.expires); + if (nla_put(skb, TCA_SKBEDIT_TM, sizeof(t), &t)) + goto nla_put_failure; + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_skbedit_ops = { + .kind = "skbedit", + .type = TCA_ACT_SKBEDIT, + .owner = THIS_MODULE, + .act = tcf_skbedit, + .dump = tcf_skbedit_dump, + .init = tcf_skbedit_init, +}; + +MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>"); +MODULE_DESCRIPTION("SKB Editing"); +MODULE_LICENSE("GPL"); + +static int __init skbedit_init_module(void) +{ + return tcf_register_action(&act_skbedit_ops, SKBEDIT_TAB_MASK); +} + +static void __exit skbedit_cleanup_module(void) +{ + tcf_unregister_action(&act_skbedit_ops); +} + +module_init(skbedit_init_module); +module_exit(skbedit_cleanup_module); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index b4d89fbb378..45527e6b52d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -14,123 +14,110 @@ * */ -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> -#include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/kmod.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <net/net_namespace.h> #include <net/sock.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> -#if 0 /* control */ -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif - /* The list of all installed classifier types */ - -static struct tcf_proto_ops *tcf_proto_base; +static LIST_HEAD(tcf_proto_base); /* Protects list of registered TC modules. It is pure SMP lock. 
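Every ->init() above also begins by validating a nested block of type/length/value attributes against a policy that pins each attribute's expected size, rejecting anything malformed before touching state. The sketch below reproduces that validation step over a toy TLV layout; the attribute names, header format and policy table are assumptions, not the kernel's struct nlattr or nla_parse_nested().

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum { ATTR_UNSPEC, ATTR_PARMS, ATTR_MARK, ATTR_MAX = ATTR_MARK };

struct tlv {			/* toy attribute header: type, then len payload bytes */
	uint16_t type;
	uint16_t len;
};

static const uint16_t policy_len[ATTR_MAX + 1] = {
	[ATTR_PARMS] = 8,	/* fixed-size parameter struct */
	[ATTR_MARK]  = 4,	/* u32 */
};

/* fill tb[] with pointers to each attribute's payload, or fail */
static int parse_nested(const uint8_t *buf, size_t len,
			const uint8_t *tb[ATTR_MAX + 1])
{
	memset(tb, 0, sizeof(const uint8_t *) * (ATTR_MAX + 1));
	while (len >= sizeof(struct tlv)) {
		struct tlv h;

		memcpy(&h, buf, sizeof(h));
		if (sizeof(h) + h.len > len)
			return -EINVAL;			/* truncated attribute */
		if (h.type <= ATTR_MAX) {
			if (policy_len[h.type] && h.len != policy_len[h.type])
				return -EINVAL;		/* policy: wrong payload size */
			tb[h.type] = buf + sizeof(h);	/* remember the payload */
		}
		buf += sizeof(h) + h.len;		/* unknown types are skipped */
		len -= sizeof(h) + h.len;
	}
	return 0;
}

int main(void)
{
	uint8_t msg[sizeof(struct tlv) + 4];
	struct tlv h = { .type = ATTR_MARK, .len = 4 };
	uint32_t mark = 0x2a;
	const uint8_t *tb[ATTR_MAX + 1];

	memcpy(msg, &h, sizeof(h));
	memcpy(msg + sizeof(h), &mark, sizeof(mark));

	if (parse_nested(msg, sizeof(msg), tb) == 0 && tb[ATTR_MARK]) {
		uint32_t v;

		memcpy(&v, tb[ATTR_MARK], sizeof(v));
		printf("mark attribute present: %u\n", v);
	}
	if (!tb[ATTR_PARMS])
		printf("parms attribute missing -> would return -EINVAL\n");
	return 0;
}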
*/ static DEFINE_RWLOCK(cls_mod_lock); /* Find classifier type by string name */ -static struct tcf_proto_ops * tcf_proto_lookup_ops(struct rtattr *kind) +static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind) { - struct tcf_proto_ops *t = NULL; + const struct tcf_proto_ops *t, *res = NULL; if (kind) { read_lock(&cls_mod_lock); - for (t = tcf_proto_base; t; t = t->next) { - if (rtattr_strcmp(kind, t->kind) == 0) { - if (!try_module_get(t->owner)) - t = NULL; + list_for_each_entry(t, &tcf_proto_base, head) { + if (nla_strcmp(kind, t->kind) == 0) { + if (try_module_get(t->owner)) + res = t; break; } } read_unlock(&cls_mod_lock); } - return t; + return res; } /* Register(unregister) new classifier type */ int register_tcf_proto_ops(struct tcf_proto_ops *ops) { - struct tcf_proto_ops *t, **tp; + struct tcf_proto_ops *t; int rc = -EEXIST; write_lock(&cls_mod_lock); - for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next) + list_for_each_entry(t, &tcf_proto_base, head) if (!strcmp(ops->kind, t->kind)) goto out; - ops->next = NULL; - *tp = ops; + list_add_tail(&ops->head, &tcf_proto_base); rc = 0; out: write_unlock(&cls_mod_lock); return rc; } +EXPORT_SYMBOL(register_tcf_proto_ops); int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) { - struct tcf_proto_ops *t, **tp; + struct tcf_proto_ops *t; int rc = -ENOENT; write_lock(&cls_mod_lock); - for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) - if (t == ops) + list_for_each_entry(t, &tcf_proto_base, head) { + if (t == ops) { + list_del(&t->head); + rc = 0; break; - - if (!t) - goto out; - *tp = t->next; - rc = 0; -out: + } + } write_unlock(&cls_mod_lock); return rc; } +EXPORT_SYMBOL(unregister_tcf_proto_ops); -static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, - struct tcf_proto *tp, unsigned long fh, int event); +static int tfilter_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct tcf_proto *tp, + unsigned long fh, int event); /* Select new prio value from the range, managed by kernel. */ -static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp) +static inline u32 tcf_auto_prio(struct tcf_proto *tp) { - u32 first = TC_H_MAKE(0xC0000000U,0U); + u32 first = TC_H_MAKE(0xC0000000U, 0U); if (tp) - first = tp->prio-1; + first = tp->prio - 1; return first; } /* Add/change/delete/get a filter node */ -static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) { - struct rtattr **tca; + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; + spinlock_t *root_lock; struct tcmsg *t; u32 protocol; u32 prio; @@ -140,15 +127,23 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) struct Qdisc *q; struct tcf_proto **back, **chain; struct tcf_proto *tp; - struct tcf_proto_ops *tp_ops; - struct Qdisc_class_ops *cops; + const struct tcf_proto_ops *tp_ops; + const struct Qdisc_class_ops *cops; unsigned long cl; unsigned long fh; int err; + int tp_created = 0; + + if ((n->nlmsg_type != RTM_GETTFILTER) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; replay: - tca = arg; - t = NLMSG_DATA(n); + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL); + if (err < 0) + return err; + + t = nlmsg_data(n); protocol = TC_H_MIN(t->tcm_info); prio = TC_H_MAJ(t->tcm_info); nprio = prio; @@ -157,28 +152,37 @@ replay: if (prio == 0) { /* If no priority is given, user wants we allocated it. 
*/ - if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + if (n->nlmsg_type != RTM_NEWTFILTER || + !(n->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; - prio = TC_H_MAKE(0x80000000U,0U); + prio = TC_H_MAKE(0x80000000U, 0U); } /* Find head of filter chain. */ /* Find link */ - if ((dev = __dev_get_by_index(t->tcm_ifindex)) == NULL) + dev = __dev_get_by_index(net, t->tcm_ifindex); + if (dev == NULL) return -ENODEV; /* Find qdisc */ if (!parent) { - q = dev->qdisc_sleeping; + q = dev->qdisc; parent = q->handle; - } else if ((q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent))) == NULL) - return -EINVAL; + } else { + q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent)); + if (q == NULL) + return -EINVAL; + } /* Is it classful? */ - if ((cops = q->ops->cl_ops) == NULL) + cops = q->ops->cl_ops; + if (!cops) return -EINVAL; + if (cops->tcf_chain == NULL) + return -EOPNOTSUPP; + /* Do we search for filter, attached to class? */ if (TC_H_MIN(parent)) { cl = cops->get(q, parent); @@ -193,10 +197,11 @@ replay: goto errout; /* Check the chain for existence of proto-tcf with this priority */ - for (back = chain; (tp=*back) != NULL; back = &tp->next) { + for (back = chain; (tp = *back) != NULL; back = &tp->next) { if (tp->prio >= prio) { if (tp->prio == prio) { - if (!nprio || (tp->protocol != protocol && protocol)) + if (!nprio || + (tp->protocol != protocol && protocol)) goto errout; } else tp = NULL; @@ -204,31 +209,35 @@ replay: } } + root_lock = qdisc_root_sleeping_lock(q); + if (tp == NULL) { /* Proto-tcf does not exist, create new one */ - if (tca[TCA_KIND-1] == NULL || !protocol) + if (tca[TCA_KIND] == NULL || !protocol) goto errout; err = -ENOENT; - if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + if (n->nlmsg_type != RTM_NEWTFILTER || + !(n->nlmsg_flags & NLM_F_CREATE)) goto errout; /* Create new proto tcf */ err = -ENOBUFS; - if ((tp = kmalloc(sizeof(*tp), GFP_KERNEL)) == NULL) + tp = kzalloc(sizeof(*tp), GFP_KERNEL); + if (tp == NULL) goto errout; - err = -EINVAL; - tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND-1]); + err = -ENOENT; + tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]); if (tp_ops == NULL) { -#ifdef CONFIG_KMOD - struct rtattr *kind = tca[TCA_KIND-1]; +#ifdef CONFIG_MODULES + struct nlattr *kind = tca[TCA_KIND]; char name[IFNAMSIZ]; if (kind != NULL && - rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { + nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { rtnl_unlock(); request_module("cls_%s", name); rtnl_lock(); @@ -248,58 +257,60 @@ replay: kfree(tp); goto errout; } - memset(tp, 0, sizeof(*tp)); tp->ops = tp_ops; tp->protocol = protocol; - tp->prio = nprio ? : tcf_auto_prio(*back); + tp->prio = nprio ? 
: TC_H_MAJ(tcf_auto_prio(*back)); tp->q = q; tp->classify = tp_ops->classify; tp->classid = parent; - if ((err = tp_ops->init(tp)) != 0) { + + err = tp_ops->init(tp); + if (err != 0) { module_put(tp_ops->owner); kfree(tp); goto errout; } - qdisc_lock_tree(dev); - tp->next = *back; - *back = tp; - qdisc_unlock_tree(dev); + tp_created = 1; - } else if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], tp->ops->kind)) + } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) goto errout; fh = tp->ops->get(tp, t->tcm_handle); if (fh == 0) { if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { - qdisc_lock_tree(dev); + spin_lock_bh(root_lock); *back = tp->next; - qdisc_unlock_tree(dev); + spin_unlock_bh(root_lock); - tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER); + tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); tcf_destroy(tp); err = 0; goto errout; } err = -ENOENT; - if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + if (n->nlmsg_type != RTM_NEWTFILTER || + !(n->nlmsg_flags & NLM_F_CREATE)) goto errout; } else { switch (n->nlmsg_type) { - case RTM_NEWTFILTER: + case RTM_NEWTFILTER: err = -EEXIST; - if (n->nlmsg_flags&NLM_F_EXCL) + if (n->nlmsg_flags & NLM_F_EXCL) { + if (tp_created) + tcf_destroy(tp); goto errout; + } break; case RTM_DELTFILTER: err = tp->ops->delete(tp, fh); if (err == 0) - tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER); + tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); goto errout; case RTM_GETTFILTER: - err = tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); + err = tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); goto errout; default: err = -EINVAL; @@ -307,9 +318,20 @@ replay: } } - err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh); - if (err == 0) - tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); + err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, + n->nlmsg_flags & NLM_F_CREATE ? 
TCA_ACT_NOREPLACE : TCA_ACT_REPLACE); + if (err == 0) { + if (tp_created) { + spin_lock_bh(root_lock); + tp->next = *back; + *back = tp; + spin_unlock_bh(root_lock); + } + tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); + } else { + if (tp_created) + tcf_destroy(tp); + } errout: if (cl) @@ -320,96 +342,106 @@ errout: return err; } -static int -tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh, - u32 pid, u32 seq, u16 flags, int event) +static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, + unsigned long fh, u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); - nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); - tcm = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); + if (!nlh) + goto out_nlmsg_trim; + tcm = nlmsg_data(nlh); tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; - tcm->tcm__pad1 = 0; - tcm->tcm_ifindex = tp->q->dev->ifindex; + tcm->tcm__pad2 = 0; + tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex; tcm->tcm_parent = tp->classid; tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); - RTA_PUT(skb, TCA_KIND, IFNAMSIZ, tp->ops->kind); + if (nla_put_string(skb, TCA_KIND, tp->ops->kind)) + goto nla_put_failure; tcm->tcm_handle = fh; if (RTM_DELTFILTER != event) { tcm->tcm_handle = 0; - if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0) - goto rtattr_failure; + if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0) + goto nla_put_failure; } - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; -nlmsg_failure: -rtattr_failure: - skb_trim(skb, b - skb->data); +out_nlmsg_trim: +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, - struct tcf_proto *tp, unsigned long fh, int event) +static int tfilter_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct tcf_proto *tp, + unsigned long fh, int event) { struct sk_buff *skb; - u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + u32 portid = oskb ? 
NETLINK_CB(oskb).portid : 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) { + if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, 0, event) <= 0) { kfree_skb(skb); return -EINVAL; } - return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); } -struct tcf_dump_args -{ +struct tcf_dump_args { struct tcf_walker w; struct sk_buff *skb; struct netlink_callback *cb; }; -static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct tcf_walker *arg) +static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, + struct tcf_walker *arg) { - struct tcf_dump_args *a = (void*)arg; + struct tcf_dump_args *a = (void *)arg; + struct net *net = sock_net(a->skb->sk); - return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid, + return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); } +/* called with RTNL */ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); int t; int s_t; struct net_device *dev; struct Qdisc *q; struct tcf_proto *tp, **chain; - struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); + struct tcmsg *tcm = nlmsg_data(cb->nlh); unsigned long cl = 0; - struct Qdisc_class_ops *cops; + const struct Qdisc_class_ops *cops; struct tcf_dump_args arg; - if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; - if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) return skb->len; - read_lock_bh(&qdisc_tree_lock); if (!tcm->tcm_parent) - q = dev->qdisc_sleeping; + q = dev->qdisc; else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); if (!q) goto out; - if ((cops = q->ops->cl_ops) == NULL) + cops = q->ops->cl_ops; + if (!cops) + goto errout; + if (cops->tcf_chain == NULL) goto errout; if (TC_H_MIN(tcm->tcm_parent)) { cl = cops->get(q, tcm->tcm_parent); @@ -422,8 +454,9 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; - for (tp=*chain, t=0; tp; tp = tp->next, t++) { - if (t < s_t) continue; + for (tp = *chain, t = 0; tp; tp = tp->next, t++) { + if (t < s_t) + continue; if (TC_H_MAJ(tcm->tcm_info) && TC_H_MAJ(tcm->tcm_info) != tp->prio) continue; @@ -433,10 +466,11 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if (t > s_t) memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); if (cb->args[1] == 0) { - if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) { + if (tcf_fill_node(net, skb, tp, 0, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWTFILTER) <= 0) break; - } + cb->args[1] = 1; } if (tp->ops->walk == NULL) @@ -445,10 +479,10 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) arg.skb = skb; arg.cb = cb; arg.w.stop = 0; - arg.w.skip = cb->args[1]-1; + arg.w.skip = cb->args[1] - 1; arg.w.count = 0; tp->ops->walk(tp, &arg.w); - cb->args[1] = arg.w.count+1; + cb->args[1] = arg.w.count + 1; if (arg.w.stop) break; } @@ -459,185 +493,125 @@ errout: if (cl) cops->put(q, cl); out: - read_unlock_bh(&qdisc_tree_lock); - dev_put(dev); return skb->len; } -void -tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) +void 
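tc_dump_tfilter() resumes interrupted dumps: cb->args[] remembers how many entries earlier passes already delivered, and the walker skips that many before emitting more. A compact sketch of the same skip/count/stop protocol over a plain array (the names and the 3-entries-per-pass limit are invented for illustration):

#include <stdio.h>

struct walker {
	int skip;	/* entries already delivered in earlier passes */
	int count;	/* entries visited in this pass */
	int stop;	/* set when the receive buffer is "full" */
};

/* pretend the dump buffer only holds 3 entries per pass */
static int emit(int value, struct walker *w)
{
	if (w->count < w->skip) {	/* already sent in a previous pass */
		w->count++;
		return 0;
	}
	if (w->count - w->skip >= 3) {	/* buffer full: stop and resume later */
		w->stop = 1;
		return -1;
	}
	printf("  entry %d\n", value);
	w->count++;
	return 0;
}

static void walk(const int *items, int n, struct walker *w)
{
	for (int i = 0; i < n; i++)
		if (emit(items[i], w) < 0)
			return;
}

int main(void)
{
	int items[] = { 10, 11, 12, 13, 14, 15, 16 };
	int resumed_at = 0;		/* plays the role of cb->args[] */

	for (int pass = 1; ; pass++) {
		struct walker w = { .skip = resumed_at };

		printf("pass %d (skip %d):\n", pass, w.skip);
		walk(items, 7, &w);
		resumed_at = w.count;	/* remember progress for the next pass */
		if (!w.stop)
			break;		/* walked everything: dump is done */
	}
	return 0;
}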
tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - if (exts->action) { - tcf_action_destroy(exts->action, TCA_ACT_UNBIND); - exts->action = NULL; - } -#elif defined CONFIG_NET_CLS_POLICE - if (exts->police) { - tcf_police_release(exts->police, TCA_ACT_UNBIND); - exts->police = NULL; - } + tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND); + INIT_LIST_HEAD(&exts->actions); #endif } +EXPORT_SYMBOL(tcf_exts_destroy); - -int -tcf_exts_validate(struct tcf_proto *tp, struct rtattr **tb, - struct rtattr *rate_tlv, struct tcf_exts *exts, - struct tcf_ext_map *map) +int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, + struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr) { - memset(exts, 0, sizeof(*exts)); - #ifdef CONFIG_NET_CLS_ACT { - int err; struct tc_action *act; - if (map->police && tb[map->police-1]) { - act = tcf_action_init_1(tb[map->police-1], rate_tlv, "police", - TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err); - if (act == NULL) + INIT_LIST_HEAD(&exts->actions); + if (exts->police && tb[exts->police]) { + act = tcf_action_init_1(net, tb[exts->police], rate_tlv, + "police", ovr, + TCA_ACT_BIND); + if (IS_ERR(act)) + return PTR_ERR(act); + + act->type = exts->type = TCA_OLD_COMPAT; + list_add(&act->list, &exts->actions); + } else if (exts->action && tb[exts->action]) { + int err; + err = tcf_action_init(net, tb[exts->action], rate_tlv, + NULL, ovr, + TCA_ACT_BIND, &exts->actions); + if (err) return err; - - act->type = TCA_OLD_COMPAT; - exts->action = act; - } else if (map->action && tb[map->action-1]) { - act = tcf_action_init(tb[map->action-1], rate_tlv, NULL, - TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err); - if (act == NULL) - return err; - - exts->action = act; } } -#elif defined CONFIG_NET_CLS_POLICE - if (map->police && tb[map->police-1]) { - struct tcf_police *p; - - p = tcf_police_locate(tb[map->police-1], rate_tlv); - if (p == NULL) - return -EINVAL; - - exts->police = p; - } else if (map->action && tb[map->action-1]) - return -EOPNOTSUPP; #else - if ((map->action && tb[map->action-1]) || - (map->police && tb[map->police-1])) + if ((exts->action && tb[exts->action]) || + (exts->police && tb[exts->police])) return -EOPNOTSUPP; #endif return 0; } +EXPORT_SYMBOL(tcf_exts_validate); -void -tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, - struct tcf_exts *src) +void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, + struct tcf_exts *src) { #ifdef CONFIG_NET_CLS_ACT - if (src->action) { - struct tc_action *act; - tcf_tree_lock(tp); - act = xchg(&dst->action, src->action); - tcf_tree_unlock(tp); - if (act) - tcf_action_destroy(act, TCA_ACT_UNBIND); - } -#elif defined CONFIG_NET_CLS_POLICE - if (src->police) { - struct tcf_police *p; - tcf_tree_lock(tp); - p = xchg(&dst->police, src->police); - tcf_tree_unlock(tp); - if (p) - tcf_police_release(p, TCA_ACT_UNBIND); - } + LIST_HEAD(tmp); + tcf_tree_lock(tp); + list_splice_init(&dst->actions, &tmp); + list_splice(&src->actions, &dst->actions); + tcf_tree_unlock(tp); + tcf_action_destroy(&tmp, TCA_ACT_UNBIND); #endif } +EXPORT_SYMBOL(tcf_exts_change); -int -tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, - struct tcf_ext_map *map) +#define tcf_exts_first_act(ext) \ + list_first_entry(&(exts)->actions, struct tc_action, list) + +int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - if (map->action && exts->action) { + if (exts->action && !list_empty(&exts->actions)) { /* * again for backward compatible mode - 
we want * to work with both old and new modes of entering * tc data even if iproute2 was newer - jhs */ - struct rtattr * p_rta = (struct rtattr*) skb->tail; - - if (exts->action->type != TCA_OLD_COMPAT) { - RTA_PUT(skb, map->action, 0, NULL); - if (tcf_action_dump(skb, exts->action, 0, 0) < 0) - goto rtattr_failure; - p_rta->rta_len = skb->tail - (u8*)p_rta; - } else if (map->police) { - RTA_PUT(skb, map->police, 0, NULL); - if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0) - goto rtattr_failure; - p_rta->rta_len = skb->tail - (u8*)p_rta; + struct nlattr *nest; + if (exts->type != TCA_OLD_COMPAT) { + nest = nla_nest_start(skb, exts->action); + if (nest == NULL) + goto nla_put_failure; + if (tcf_action_dump(skb, &exts->actions, 0, 0) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); + } else if (exts->police) { + struct tc_action *act = tcf_exts_first_act(exts); + nest = nla_nest_start(skb, exts->police); + if (nest == NULL || !act) + goto nla_put_failure; + if (tcf_action_dump_old(skb, act, 0, 0) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); } } -#elif defined CONFIG_NET_CLS_POLICE - if (map->police && exts->police) { - struct rtattr * p_rta = (struct rtattr*) skb->tail; - - RTA_PUT(skb, map->police, 0, NULL); - - if (tcf_police_dump(skb, exts->police) < 0) - goto rtattr_failure; - - p_rta->rta_len = skb->tail - (u8*)p_rta; - } #endif return 0; -rtattr_failure: __attribute__ ((unused)) +nla_put_failure: __attribute__ ((unused)) return -1; } +EXPORT_SYMBOL(tcf_exts_dump); + -int -tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts, - struct tcf_ext_map *map) +int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - if (exts->action) - if (tcf_action_copy_stats(skb, exts->action, 1) < 0) - goto rtattr_failure; -#elif defined CONFIG_NET_CLS_POLICE - if (exts->police) - if (tcf_police_dump_stats(skb, exts->police) < 0) - goto rtattr_failure; + struct tc_action *a = tcf_exts_first_act(exts); + if (tcf_action_copy_stats(skb, a, 1) < 0) + return -1; #endif return 0; -rtattr_failure: __attribute__ ((unused)) - return -1; } +EXPORT_SYMBOL(tcf_exts_dump_stats); static int __init tc_filter_init(void) { - struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC]; + rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter, + tc_dump_tfilter, NULL); - /* Setup rtnetlink links. It is made here to avoid - exporting large number of public symbols. 
- */ - - if (link_p) { - link_p[RTM_NEWTFILTER-RTM_BASE].doit = tc_ctl_tfilter; - link_p[RTM_DELTFILTER-RTM_BASE].doit = tc_ctl_tfilter; - link_p[RTM_GETTFILTER-RTM_BASE].doit = tc_ctl_tfilter; - link_p[RTM_GETTFILTER-RTM_BASE].dumpit = tc_dump_tfilter; - } return 0; } subsys_initcall(tc_filter_init); - -EXPORT_SYMBOL(register_tcf_proto_ops); -EXPORT_SYMBOL(unregister_tcf_proto_ops); -EXPORT_SYMBOL(tcf_exts_validate); -EXPORT_SYMBOL(tcf_exts_destroy); -EXPORT_SYMBOL(tcf_exts_change); -EXPORT_SYMBOL(tcf_exts_dump); -EXPORT_SYMBOL(tcf_exts_dump_stats); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index dfb300bb6ba..0ae1813e3e9 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -9,27 +9,24 @@ * Authors: Thomas Graf <tgraf@suug.ch> */ -#include <linux/config.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> #include <linux/errno.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> +#include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> -struct basic_head -{ +struct basic_head { u32 hgenerator; struct list_head flist; }; -struct basic_filter -{ +struct basic_filter { u32 handle; struct tcf_exts exts; struct tcf_ematch_tree ematches; @@ -37,16 +34,11 @@ struct basic_filter struct list_head link; }; -static struct tcf_ext_map basic_ext_map = { - .action = TCA_BASIC_ACT, - .police = TCA_BASIC_POLICE -}; - -static int basic_classify(struct sk_buff *skb, struct tcf_proto *tp, +static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { int r; - struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct basic_filter *f; list_for_each_entry(f, &head->flist, link) { @@ -64,7 +56,7 @@ static int basic_classify(struct sk_buff *skb, struct tcf_proto *tp, static unsigned long basic_get(struct tcf_proto *tp, u32 handle) { unsigned long l = 0UL; - struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct basic_filter *f; if (head == NULL) @@ -83,11 +75,17 @@ static void basic_put(struct tcf_proto *tp, unsigned long f) static int basic_init(struct tcf_proto *tp) { + struct basic_head *head; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + INIT_LIST_HEAD(&head->flist); + tp->root = head; return 0; } -static inline void basic_delete_filter(struct tcf_proto *tp, - struct basic_filter *f) +static void basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f) { tcf_unbind_filter(tp, &f->res); tcf_exts_destroy(tp, &f->exts); @@ -97,18 +95,19 @@ static inline void basic_delete_filter(struct tcf_proto *tp, static void basic_destroy(struct tcf_proto *tp) { - struct basic_head *head = (struct basic_head *) xchg(&tp->root, NULL); + struct basic_head *head = tp->root; struct basic_filter *f, *n; - + list_for_each_entry_safe(f, n, &head->flist, link) { list_del(&f->link); basic_delete_filter(tp, f); } + kfree(head); } static int basic_delete(struct tcf_proto *tp, unsigned long arg) { - struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct basic_filter *t, *f = (struct basic_filter *) arg; list_for_each_entry(t, &head->flist, link) @@ -123,28 +122,31 @@ static int basic_delete(struct tcf_proto *tp, unsigned long arg) return -ENOENT; } -static inline int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f, - 
unsigned long base, struct rtattr **tb, - struct rtattr *est) +static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = { + [TCA_BASIC_CLASSID] = { .type = NLA_U32 }, + [TCA_BASIC_EMATCHES] = { .type = NLA_NESTED }, +}; + +static int basic_set_parms(struct net *net, struct tcf_proto *tp, + struct basic_filter *f, unsigned long base, + struct nlattr **tb, + struct nlattr *est, bool ovr) { - int err = -EINVAL; + int err; struct tcf_exts e; struct tcf_ematch_tree t; - if (tb[TCA_BASIC_CLASSID-1]) - if (RTA_PAYLOAD(tb[TCA_BASIC_CLASSID-1]) < sizeof(u32)) - return err; - - err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map); + tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); if (err < 0) return err; - err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES-1], &t); + err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &t); if (err < 0) goto errout; - if (tb[TCA_BASIC_CLASSID-1]) { - f->res.classid = *(u32*)RTA_DATA(tb[TCA_BASIC_CLASSID-1]); + if (tb[TCA_BASIC_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_BASIC_CLASSID]); tcf_bind_filter(tp, &f->res, base); } @@ -157,61 +159,54 @@ errout: return err; } -static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, unsigned long *arg) +static int basic_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, + struct nlattr **tca, unsigned long *arg, bool ovr) { - int err = -EINVAL; - struct basic_head *head = (struct basic_head *) tp->root; - struct rtattr *tb[TCA_BASIC_MAX]; + int err; + struct basic_head *head = tp->root; + struct nlattr *tb[TCA_BASIC_MAX + 1]; struct basic_filter *f = (struct basic_filter *) *arg; - if (tca[TCA_OPTIONS-1] == NULL) + if (tca[TCA_OPTIONS] == NULL) return -EINVAL; - if (rtattr_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS-1]) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS], + basic_policy); + if (err < 0) + return err; if (f != NULL) { if (handle && f->handle != handle) return -EINVAL; - return basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]); + return basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); } err = -ENOBUFS; - if (head == NULL) { - head = kmalloc(sizeof(*head), GFP_KERNEL); - if (head == NULL) - goto errout; - - memset(head, 0, sizeof(*head)); - INIT_LIST_HEAD(&head->flist); - tp->root = head; - } - - f = kmalloc(sizeof(*f), GFP_KERNEL); + f = kzalloc(sizeof(*f), GFP_KERNEL); if (f == NULL) goto errout; - memset(f, 0, sizeof(*f)); + tcf_exts_init(&f->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); err = -EINVAL; if (handle) f->handle = handle; else { - int i = 0x80000000; + unsigned int i = 0x80000000; do { if (++head->hgenerator == 0x7FFFFFFF) head->hgenerator = 1; } while (--i > 0 && basic_get(tp, head->hgenerator)); if (i <= 0) { - printk(KERN_ERR "Insufficient number of handles\n"); + pr_err("Insufficient number of handles\n"); goto errout; } f->handle = head->hgenerator; } - err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]); + err = basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); if (err < 0) goto errout; @@ -230,7 +225,7 @@ errout: static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct basic_filter *f; list_for_each_entry(f, &head->flist, link) { @@ -246,37 +241,42 @@ skip: } } -static int basic_dump(struct tcf_proto *tp, unsigned long fh, +static int 
basic_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct basic_filter *f = (struct basic_filter *) fh; - unsigned char *b = skb->tail; - struct rtattr *rta; + struct nlattr *nest; if (f == NULL) return skb->len; t->tcm_handle = f->handle; - rta = (struct rtattr *) b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; - if (f->res.classid) - RTA_PUT(skb, TCA_BASIC_CLASSID, sizeof(u32), &f->res.classid); + if (f->res.classid && + nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid)) + goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 || + if (tcf_exts_dump(skb, &f->exts) < 0 || tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) - goto rtattr_failure; + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts) < 0) + goto nla_put_failure; - rta->rta_len = (skb->tail - b); return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } -static struct tcf_proto_ops cls_basic_ops = { +static struct tcf_proto_ops cls_basic_ops __read_mostly = { .kind = "basic", .classify = basic_classify, .init = basic_init, @@ -295,7 +295,7 @@ static int __init init_basic(void) return register_tcf_proto_ops(&cls_basic_ops); } -static void __exit exit_basic(void) +static void __exit exit_basic(void) { unregister_tcf_proto_ops(&cls_basic_ops); } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c new file mode 100644 index 00000000000..13f64df2c71 --- /dev/null +++ b/net/sched/cls_bpf.c @@ -0,0 +1,382 @@ +/* + * Berkeley Packet Filter based traffic classifier + * + * Might be used to classify traffic through flexible, user-defined and + * possibly JIT-ed BPF filters for traffic control as an alternative to + * ematches. + * + * (C) 2013 Daniel Borkmann <dborkman@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/filter.h> +#include <net/rtnetlink.h> +#include <net/pkt_cls.h> +#include <net/sock.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); +MODULE_DESCRIPTION("TC BPF based classifier"); + +struct cls_bpf_head { + struct list_head plist; + u32 hgen; +}; + +struct cls_bpf_prog { + struct sk_filter *filter; + struct sock_filter *bpf_ops; + struct tcf_exts exts; + struct tcf_result res; + struct list_head link; + u32 handle; + u16 bpf_len; +}; + +static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { + [TCA_BPF_CLASSID] = { .type = NLA_U32 }, + [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, + [TCA_BPF_OPS] = { .type = NLA_BINARY, + .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, +}; + +static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog; + int ret; + + list_for_each_entry(prog, &head->plist, link) { + int filter_res = SK_RUN_FILTER(prog->filter, skb); + + if (filter_res == 0) + continue; + + *res = prog->res; + if (filter_res != -1) + res->classid = filter_res; + + ret = tcf_exts_exec(skb, &prog->exts, res); + if (ret < 0) + continue; + + return ret; + } + + return -1; +} + +static int cls_bpf_init(struct tcf_proto *tp) +{ + struct cls_bpf_head *head; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + + INIT_LIST_HEAD(&head->plist); + tp->root = head; + + return 0; +} + +static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) +{ + tcf_unbind_filter(tp, &prog->res); + tcf_exts_destroy(tp, &prog->exts); + + sk_unattached_filter_destroy(prog->filter); + + kfree(prog->bpf_ops); + kfree(prog); +} + +static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog, *todel = (struct cls_bpf_prog *) arg; + + list_for_each_entry(prog, &head->plist, link) { + if (prog == todel) { + tcf_tree_lock(tp); + list_del(&prog->link); + tcf_tree_unlock(tp); + + cls_bpf_delete_prog(tp, prog); + return 0; + } + } + + return -ENOENT; +} + +static void cls_bpf_destroy(struct tcf_proto *tp) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog, *tmp; + + list_for_each_entry_safe(prog, tmp, &head->plist, link) { + list_del(&prog->link); + cls_bpf_delete_prog(tp, prog); + } + + kfree(head); +} + +static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog; + unsigned long ret = 0UL; + + if (head == NULL) + return 0UL; + + list_for_each_entry(prog, &head->plist, link) { + if (prog->handle == handle) { + ret = (unsigned long) prog; + break; + } + } + + return ret; +} + +static void cls_bpf_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, + struct cls_bpf_prog *prog, + unsigned long base, struct nlattr **tb, + struct nlattr *est, bool ovr) +{ + struct sock_filter *bpf_ops, *bpf_old; + struct tcf_exts exts; + struct sock_fprog_kern tmp; + struct sk_filter *fp, *fp_old; + u16 bpf_size, bpf_len; + u32 classid; + int ret; + + if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID]) + return -EINVAL; + + tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); + ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); + if (ret < 0) + return ret; + + classid = 
nla_get_u32(tb[TCA_BPF_CLASSID]); + bpf_len = nla_get_u16(tb[TCA_BPF_OPS_LEN]); + if (bpf_len > BPF_MAXINSNS || bpf_len == 0) { + ret = -EINVAL; + goto errout; + } + + bpf_size = bpf_len * sizeof(*bpf_ops); + bpf_ops = kzalloc(bpf_size, GFP_KERNEL); + if (bpf_ops == NULL) { + ret = -ENOMEM; + goto errout; + } + + memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size); + + tmp.len = bpf_len; + tmp.filter = bpf_ops; + + ret = sk_unattached_filter_create(&fp, &tmp); + if (ret) + goto errout_free; + + tcf_tree_lock(tp); + fp_old = prog->filter; + bpf_old = prog->bpf_ops; + + prog->bpf_len = bpf_len; + prog->bpf_ops = bpf_ops; + prog->filter = fp; + prog->res.classid = classid; + tcf_tree_unlock(tp); + + tcf_bind_filter(tp, &prog->res, base); + tcf_exts_change(tp, &prog->exts, &exts); + + if (fp_old) + sk_unattached_filter_destroy(fp_old); + if (bpf_old) + kfree(bpf_old); + + return 0; + +errout_free: + kfree(bpf_ops); +errout: + tcf_exts_destroy(tp, &exts); + return ret; +} + +static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, + struct cls_bpf_head *head) +{ + unsigned int i = 0x80000000; + + do { + if (++head->hgen == 0x7FFFFFFF) + head->hgen = 1; + } while (--i > 0 && cls_bpf_get(tp, head->hgen)); + if (i == 0) + pr_err("Insufficient number of handles\n"); + + return i; +} + +static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, + u32 handle, struct nlattr **tca, + unsigned long *arg, bool ovr) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog = (struct cls_bpf_prog *) *arg; + struct nlattr *tb[TCA_BPF_MAX + 1]; + int ret; + + if (tca[TCA_OPTIONS] == NULL) + return -EINVAL; + + ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy); + if (ret < 0) + return ret; + + if (prog != NULL) { + if (handle && prog->handle != handle) + return -EINVAL; + return cls_bpf_modify_existing(net, tp, prog, base, tb, + tca[TCA_RATE], ovr); + } + + prog = kzalloc(sizeof(*prog), GFP_KERNEL); + if (prog == NULL) + return -ENOBUFS; + + tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE); + if (handle == 0) + prog->handle = cls_bpf_grab_new_handle(tp, head); + else + prog->handle = handle; + if (prog->handle == 0) { + ret = -EINVAL; + goto errout; + } + + ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr); + if (ret < 0) + goto errout; + + tcf_tree_lock(tp); + list_add(&prog->link, &head->plist); + tcf_tree_unlock(tp); + + *arg = (unsigned long) prog; + + return 0; +errout: + if (*arg == 0UL && prog) + kfree(prog); + + return ret; +} + +static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *tm) +{ + struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh; + struct nlattr *nest, *nla; + + if (prog == NULL) + return skb->len; + + tm->tcm_handle = prog->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) + goto nla_put_failure; + if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_len)) + goto nla_put_failure; + + nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_len * + sizeof(struct sock_filter)); + if (nla == NULL) + goto nla_put_failure; + + memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); + + if (tcf_exts_dump(skb, &prog->exts) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &prog->exts) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + 
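One thing worth spelling out about cls_bpf_classify() above: the classic BPF program loaded through TCA_BPF_OPS (at most BPF_MAXINSNS instructions, per the policy and the length check) is executed with SK_RUN_FILTER(), and its return value drives classification; 0 means no match, (u32)-1 means match but keep the classid configured via TCA_BPF_CLASSID, and any other value is used directly as the classid. A small stand-alone illustration; the printed "len,code jt jf k,..." string is only assumed to match what iproute2 accepts as raw classic BPF bytecode, so treat that detail as a sketch:

#include <stdio.h>
#include <linux/filter.h>

int main(void)
{
        /* A one-instruction program: "return 0x00010001", i.e. always match
         * and classify to 1:1.  Returning 0 instead would mean "no match";
         * returning 0xffffffff would mean "use the TCA_BPF_CLASSID value". */
        struct sock_filter prog[] = {
                BPF_STMT(BPF_RET | BPF_K, 0x00010001),
        };
        unsigned int i, len = sizeof(prog) / sizeof(prog[0]);

        printf("%u", len);
        for (i = 0; i < len; i++)
                printf(",%u %u %u %u", (unsigned)prog[i].code,
                       (unsigned)prog[i].jt, (unsigned)prog[i].jf,
                       (unsigned)prog[i].k);
        printf("\n");
        return 0;
}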
return -1; +} + +static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog; + + list_for_each_entry(prog, &head->plist, link) { + if (arg->count < arg->skip) + goto skip; + if (arg->fn(tp, (unsigned long) prog, arg) < 0) { + arg->stop = 1; + break; + } +skip: + arg->count++; + } +} + +static struct tcf_proto_ops cls_bpf_ops __read_mostly = { + .kind = "bpf", + .owner = THIS_MODULE, + .classify = cls_bpf_classify, + .init = cls_bpf_init, + .destroy = cls_bpf_destroy, + .get = cls_bpf_get, + .put = cls_bpf_put, + .change = cls_bpf_change, + .delete = cls_bpf_delete, + .walk = cls_bpf_walk, + .dump = cls_bpf_dump, +}; + +static int __init cls_bpf_init_mod(void) +{ + return register_tcf_proto_ops(&cls_bpf_ops); +} + +static void __exit cls_bpf_exit_mod(void) +{ + unregister_tcf_proto_ops(&cls_bpf_ops); +} + +module_init(cls_bpf_init_mod); +module_exit(cls_bpf_exit_mod); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c new file mode 100644 index 00000000000..cacf01bd04f --- /dev/null +++ b/net/sched/cls_cgroup.c @@ -0,0 +1,222 @@ +/* + * net/sched/cls_cgroup.c Control Group Classifier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf <tgraf@suug.ch> + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/skbuff.h> +#include <linux/rcupdate.h> +#include <net/rtnetlink.h> +#include <net/pkt_cls.h> +#include <net/sock.h> +#include <net/cls_cgroup.h> + +struct cls_cgroup_head { + u32 handle; + struct tcf_exts exts; + struct tcf_ematch_tree ematches; +}; + +static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + struct cls_cgroup_head *head = tp->root; + u32 classid; + + rcu_read_lock(); + classid = task_cls_state(current)->classid; + rcu_read_unlock(); + + /* + * Due to the nature of the classifier it is required to ignore all + * packets originating from softirq context as accessing `current' + * would lead to false results. + * + * This test assumes that all callers of dev_queue_xmit() explicitely + * disable bh. Knowing this, it is possible to detect softirq based + * calls by looking at the number of nested bh disable calls because + * softirqs always disables bh. + */ + if (in_serving_softirq()) { + /* If there is an sk_classid we'll use that. 
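On the classid itself: whether it is read from task_cls_state() or from skb->sk->sk_classid, cls_cgroup_classify() hands the 32-bit value straight to res->classid, so the number stored by the net_cls cgroup is expected to already encode a tc major:minor pair, upper and lower 16 bits respectively (the same split TC_H_MAKE() applies). A quick stand-alone sketch of that encoding, with made-up values:

#include <stdio.h>
#include <stdint.h>

static uint32_t tc_h_make(uint32_t maj, uint32_t min)
{
        /* same masking as the kernel's TC_H_MAKE() macro */
        return (maj & 0xFFFF0000u) | (min & 0x0000FFFFu);
}

int main(void)
{
        uint32_t classid = tc_h_make(0x10 << 16, 1);    /* tc class 10:1 */

        printf("net_cls classid 0x%08x -> class %x:%x\n",
               (unsigned)classid, (unsigned)(classid >> 16),
               (unsigned)(classid & 0xFFFF));
        return 0;
}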
*/ + if (!skb->sk) + return -1; + classid = skb->sk->sk_classid; + } + + if (!classid) + return -1; + + if (!tcf_em_tree_match(skb, &head->ematches, NULL)) + return -1; + + res->classid = classid; + res->class = 0; + return tcf_exts_exec(skb, &head->exts, res); +} + +static unsigned long cls_cgroup_get(struct tcf_proto *tp, u32 handle) +{ + return 0UL; +} + +static void cls_cgroup_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int cls_cgroup_init(struct tcf_proto *tp) +{ + return 0; +} + +static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = { + [TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED }, +}; + +static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, + u32 handle, struct nlattr **tca, + unsigned long *arg, bool ovr) +{ + struct nlattr *tb[TCA_CGROUP_MAX + 1]; + struct cls_cgroup_head *head = tp->root; + struct tcf_ematch_tree t; + struct tcf_exts e; + int err; + + if (!tca[TCA_OPTIONS]) + return -EINVAL; + + if (head == NULL) { + if (!handle) + return -EINVAL; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + + tcf_exts_init(&head->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + head->handle = handle; + + tcf_tree_lock(tp); + tp->root = head; + tcf_tree_unlock(tp); + } + + if (handle != head->handle) + return -ENOENT; + + err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS], + cgroup_policy); + if (err < 0) + return err; + + tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + if (err < 0) + return err; + + err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t); + if (err < 0) + return err; + + tcf_exts_change(tp, &head->exts, &e); + tcf_em_tree_change(tp, &head->ematches, &t); + + return 0; +} + +static void cls_cgroup_destroy(struct tcf_proto *tp) +{ + struct cls_cgroup_head *head = tp->root; + + if (head) { + tcf_exts_destroy(tp, &head->exts); + tcf_em_tree_destroy(tp, &head->ematches); + kfree(head); + } +} + +static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg) +{ + return -EOPNOTSUPP; +} + +static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct cls_cgroup_head *head = tp->root; + + if (arg->count < arg->skip) + goto skip; + + if (arg->fn(tp, (unsigned long) head, arg) < 0) { + arg->stop = 1; + return; + } +skip: + arg->count++; +} + +static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct cls_cgroup_head *head = tp->root; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + + t->tcm_handle = head->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + if (tcf_exts_dump(skb, &head->exts) < 0 || + tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &head->exts) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tcf_proto_ops cls_cgroup_ops __read_mostly = { + .kind = "cgroup", + .init = cls_cgroup_init, + .change = cls_cgroup_change, + .classify = cls_cgroup_classify, + .destroy = cls_cgroup_destroy, + .get = cls_cgroup_get, + .put = cls_cgroup_put, + .delete = cls_cgroup_delete, + .walk = cls_cgroup_walk, + .dump = cls_cgroup_dump, + .owner = THIS_MODULE, +}; + +static int __init init_cgroup_cls(void) +{ + return 
register_tcf_proto_ops(&cls_cgroup_ops); +} + +static void __exit exit_cgroup_cls(void) +{ + unregister_tcf_proto_ops(&cls_cgroup_ops); +} + +module_init(init_cgroup_cls); +module_exit(exit_cgroup_cls); +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c new file mode 100644 index 00000000000..35be16f7c19 --- /dev/null +++ b/net/sched/cls_flow.c @@ -0,0 +1,673 @@ +/* + * net/sched/cls_flow.c Generic flow classifier + * + * Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/jhash.h> +#include <linux/random.h> +#include <linux/pkt_cls.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/if_vlan.h> +#include <linux/slab.h> +#include <linux/module.h> + +#include <net/pkt_cls.h> +#include <net/ip.h> +#include <net/route.h> +#include <net/flow_keys.h> + +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#include <net/netfilter/nf_conntrack.h> +#endif + +struct flow_head { + struct list_head filters; +}; + +struct flow_filter { + struct list_head list; + struct tcf_exts exts; + struct tcf_ematch_tree ematches; + struct timer_list perturb_timer; + u32 perturb_period; + u32 handle; + + u32 nkeys; + u32 keymask; + u32 mode; + u32 mask; + u32 xor; + u32 rshift; + u32 addend; + u32 divisor; + u32 baseclass; + u32 hashrnd; +}; + +static inline u32 addr_fold(void *addr) +{ + unsigned long a = (unsigned long)addr; + + return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? 
a >> 32 : 0); +} + +static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow) +{ + if (flow->src) + return ntohl(flow->src); + return addr_fold(skb->sk); +} + +static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) +{ + if (flow->dst) + return ntohl(flow->dst); + return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol; +} + +static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow) +{ + return flow->ip_proto; +} + +static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) +{ + if (flow->ports) + return ntohs(flow->port16[0]); + + return addr_fold(skb->sk); +} + +static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) +{ + if (flow->ports) + return ntohs(flow->port16[1]); + + return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol; +} + +static u32 flow_get_iif(const struct sk_buff *skb) +{ + return skb->skb_iif; +} + +static u32 flow_get_priority(const struct sk_buff *skb) +{ + return skb->priority; +} + +static u32 flow_get_mark(const struct sk_buff *skb) +{ + return skb->mark; +} + +static u32 flow_get_nfct(const struct sk_buff *skb) +{ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + return addr_fold(skb->nfct); +#else + return 0; +#endif +} + +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#define CTTUPLE(skb, member) \ +({ \ + enum ip_conntrack_info ctinfo; \ + const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); \ + if (ct == NULL) \ + goto fallback; \ + ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member; \ +}) +#else +#define CTTUPLE(skb, member) \ +({ \ + goto fallback; \ + 0; \ +}) +#endif + +static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + return ntohl(CTTUPLE(skb, src.u3.ip)); + case htons(ETH_P_IPV6): + return ntohl(CTTUPLE(skb, src.u3.ip6[3])); + } +fallback: + return flow_get_src(skb, flow); +} + +static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + return ntohl(CTTUPLE(skb, dst.u3.ip)); + case htons(ETH_P_IPV6): + return ntohl(CTTUPLE(skb, dst.u3.ip6[3])); + } +fallback: + return flow_get_dst(skb, flow); +} + +static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) +{ + return ntohs(CTTUPLE(skb, src.u.all)); +fallback: + return flow_get_proto_src(skb, flow); +} + +static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) +{ + return ntohs(CTTUPLE(skb, dst.u.all)); +fallback: + return flow_get_proto_dst(skb, flow); +} + +static u32 flow_get_rtclassid(const struct sk_buff *skb) +{ +#ifdef CONFIG_IP_ROUTE_CLASSID + if (skb_dst(skb)) + return skb_dst(skb)->tclassid; +#endif + return 0; +} + +static u32 flow_get_skuid(const struct sk_buff *skb) +{ + if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { + kuid_t skuid = skb->sk->sk_socket->file->f_cred->fsuid; + return from_kuid(&init_user_ns, skuid); + } + return 0; +} + +static u32 flow_get_skgid(const struct sk_buff *skb) +{ + if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { + kgid_t skgid = skb->sk->sk_socket->file->f_cred->fsgid; + return from_kgid(&init_user_ns, skgid); + } + return 0; +} + +static u32 flow_get_vlan_tag(const struct sk_buff *skb) +{ + u16 uninitialized_var(tag); + + if (vlan_get_tag(skb, &tag) < 0) + return 0; + return tag & VLAN_VID_MASK; +} + 
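A note ahead of flow_classify() further down: in FLOW_MODE_HASH the selected keys are folded with jhash2() under a random seed that the perturb timer reshuffles periodically, while in FLOW_MODE_MAP a single key is masked, xored, right-shifted, offset by the addend, optionally reduced modulo the divisor, and added to the minor part of the baseclass. A user-space rendition of the map-mode arithmetic with invented parameter values (roughly what a "flow" filter in map mode with an "and 0xff" op, a divisor and a baseclass would compute; exact tc syntax not verified here):

#include <stdio.h>
#include <stdint.h>

struct flow_map {
        uint32_t mask, xor, rshift, addend, divisor, baseclass;
};

static uint32_t flow_map_classid(const struct flow_map *p, uint32_t key)
{
        uint32_t classid = (key & p->mask) ^ p->xor;

        classid = (classid >> p->rshift) + p->addend;
        if (p->divisor)
                classid %= p->divisor;
        /* TC_H_MAKE(baseclass, baseclass + classid) in the kernel code */
        return (p->baseclass & 0xFFFF0000u) |
               ((p->baseclass + classid) & 0x0000FFFFu);
}

int main(void)
{
        /* spread the low byte of the destination address over 1:1..1:100 */
        struct flow_map p = {
                .mask = 0xff, .xor = 0, .rshift = 0, .addend = 0,
                .divisor = 0x100, .baseclass = 0x00010001,      /* 1:1 */
        };
        uint32_t dst = 0xc0a80017;      /* 192.168.0.23, as FLOW_KEY_DST */
        uint32_t cid = flow_map_classid(&p, dst);

        printf("classid %x:%x\n", (unsigned)(cid >> 16),
               (unsigned)(cid & 0xFFFF));
        return 0;
}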
+static u32 flow_get_rxhash(struct sk_buff *skb) +{ + return skb_get_hash(skb); +} + +static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow) +{ + switch (key) { + case FLOW_KEY_SRC: + return flow_get_src(skb, flow); + case FLOW_KEY_DST: + return flow_get_dst(skb, flow); + case FLOW_KEY_PROTO: + return flow_get_proto(skb, flow); + case FLOW_KEY_PROTO_SRC: + return flow_get_proto_src(skb, flow); + case FLOW_KEY_PROTO_DST: + return flow_get_proto_dst(skb, flow); + case FLOW_KEY_IIF: + return flow_get_iif(skb); + case FLOW_KEY_PRIORITY: + return flow_get_priority(skb); + case FLOW_KEY_MARK: + return flow_get_mark(skb); + case FLOW_KEY_NFCT: + return flow_get_nfct(skb); + case FLOW_KEY_NFCT_SRC: + return flow_get_nfct_src(skb, flow); + case FLOW_KEY_NFCT_DST: + return flow_get_nfct_dst(skb, flow); + case FLOW_KEY_NFCT_PROTO_SRC: + return flow_get_nfct_proto_src(skb, flow); + case FLOW_KEY_NFCT_PROTO_DST: + return flow_get_nfct_proto_dst(skb, flow); + case FLOW_KEY_RTCLASSID: + return flow_get_rtclassid(skb); + case FLOW_KEY_SKUID: + return flow_get_skuid(skb); + case FLOW_KEY_SKGID: + return flow_get_skgid(skb); + case FLOW_KEY_VLAN_TAG: + return flow_get_vlan_tag(skb); + case FLOW_KEY_RXHASH: + return flow_get_rxhash(skb); + default: + WARN_ON(1); + return 0; + } +} + +#define FLOW_KEYS_NEEDED ((1 << FLOW_KEY_SRC) | \ + (1 << FLOW_KEY_DST) | \ + (1 << FLOW_KEY_PROTO) | \ + (1 << FLOW_KEY_PROTO_SRC) | \ + (1 << FLOW_KEY_PROTO_DST) | \ + (1 << FLOW_KEY_NFCT_SRC) | \ + (1 << FLOW_KEY_NFCT_DST) | \ + (1 << FLOW_KEY_NFCT_PROTO_SRC) | \ + (1 << FLOW_KEY_NFCT_PROTO_DST)) + +static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + struct flow_head *head = tp->root; + struct flow_filter *f; + u32 keymask; + u32 classid; + unsigned int n, key; + int r; + + list_for_each_entry(f, &head->filters, list) { + u32 keys[FLOW_KEY_MAX + 1]; + struct flow_keys flow_keys; + + if (!tcf_em_tree_match(skb, &f->ematches, NULL)) + continue; + + keymask = f->keymask; + if (keymask & FLOW_KEYS_NEEDED) + skb_flow_dissect(skb, &flow_keys); + + for (n = 0; n < f->nkeys; n++) { + key = ffs(keymask) - 1; + keymask &= ~(1 << key); + keys[n] = flow_key_get(skb, key, &flow_keys); + } + + if (f->mode == FLOW_MODE_HASH) + classid = jhash2(keys, f->nkeys, f->hashrnd); + else { + classid = keys[0]; + classid = (classid & f->mask) ^ f->xor; + classid = (classid >> f->rshift) + f->addend; + } + + if (f->divisor) + classid %= f->divisor; + + res->class = 0; + res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid); + + r = tcf_exts_exec(skb, &f->exts, res); + if (r < 0) + continue; + return r; + } + return -1; +} + +static void flow_perturbation(unsigned long arg) +{ + struct flow_filter *f = (struct flow_filter *)arg; + + get_random_bytes(&f->hashrnd, 4); + if (f->perturb_period) + mod_timer(&f->perturb_timer, jiffies + f->perturb_period); +} + +static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { + [TCA_FLOW_KEYS] = { .type = NLA_U32 }, + [TCA_FLOW_MODE] = { .type = NLA_U32 }, + [TCA_FLOW_BASECLASS] = { .type = NLA_U32 }, + [TCA_FLOW_RSHIFT] = { .type = NLA_U32 }, + [TCA_FLOW_ADDEND] = { .type = NLA_U32 }, + [TCA_FLOW_MASK] = { .type = NLA_U32 }, + [TCA_FLOW_XOR] = { .type = NLA_U32 }, + [TCA_FLOW_DIVISOR] = { .type = NLA_U32 }, + [TCA_FLOW_ACT] = { .type = NLA_NESTED }, + [TCA_FLOW_POLICE] = { .type = NLA_NESTED }, + [TCA_FLOW_EMATCHES] = { .type = NLA_NESTED }, + [TCA_FLOW_PERTURB] = { .type = NLA_U32 }, +}; + +static int flow_change(struct 
net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, + u32 handle, struct nlattr **tca, + unsigned long *arg, bool ovr) +{ + struct flow_head *head = tp->root; + struct flow_filter *f; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_FLOW_MAX + 1]; + struct tcf_exts e; + struct tcf_ematch_tree t; + unsigned int nkeys = 0; + unsigned int perturb_period = 0; + u32 baseclass = 0; + u32 keymask = 0; + u32 mode; + int err; + + if (opt == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy); + if (err < 0) + return err; + + if (tb[TCA_FLOW_BASECLASS]) { + baseclass = nla_get_u32(tb[TCA_FLOW_BASECLASS]); + if (TC_H_MIN(baseclass) == 0) + return -EINVAL; + } + + if (tb[TCA_FLOW_KEYS]) { + keymask = nla_get_u32(tb[TCA_FLOW_KEYS]); + + nkeys = hweight32(keymask); + if (nkeys == 0) + return -EINVAL; + + if (fls(keymask) - 1 > FLOW_KEY_MAX) + return -EOPNOTSUPP; + + if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) && + sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns) + return -EOPNOTSUPP; + } + + tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + if (err < 0) + return err; + + err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t); + if (err < 0) + goto err1; + + f = (struct flow_filter *)*arg; + if (f != NULL) { + err = -EINVAL; + if (f->handle != handle && handle) + goto err2; + + mode = f->mode; + if (tb[TCA_FLOW_MODE]) + mode = nla_get_u32(tb[TCA_FLOW_MODE]); + if (mode != FLOW_MODE_HASH && nkeys > 1) + goto err2; + + if (mode == FLOW_MODE_HASH) + perturb_period = f->perturb_period; + if (tb[TCA_FLOW_PERTURB]) { + if (mode != FLOW_MODE_HASH) + goto err2; + perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; + } + } else { + err = -EINVAL; + if (!handle) + goto err2; + if (!tb[TCA_FLOW_KEYS]) + goto err2; + + mode = FLOW_MODE_MAP; + if (tb[TCA_FLOW_MODE]) + mode = nla_get_u32(tb[TCA_FLOW_MODE]); + if (mode != FLOW_MODE_HASH && nkeys > 1) + goto err2; + + if (tb[TCA_FLOW_PERTURB]) { + if (mode != FLOW_MODE_HASH) + goto err2; + perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; + } + + if (TC_H_MAJ(baseclass) == 0) + baseclass = TC_H_MAKE(tp->q->handle, baseclass); + if (TC_H_MIN(baseclass) == 0) + baseclass = TC_H_MAKE(baseclass, 1); + + err = -ENOBUFS; + f = kzalloc(sizeof(*f), GFP_KERNEL); + if (f == NULL) + goto err2; + + f->handle = handle; + f->mask = ~0U; + tcf_exts_init(&f->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); + + get_random_bytes(&f->hashrnd, 4); + f->perturb_timer.function = flow_perturbation; + f->perturb_timer.data = (unsigned long)f; + init_timer_deferrable(&f->perturb_timer); + } + + tcf_exts_change(tp, &f->exts, &e); + tcf_em_tree_change(tp, &f->ematches, &t); + + tcf_tree_lock(tp); + + if (tb[TCA_FLOW_KEYS]) { + f->keymask = keymask; + f->nkeys = nkeys; + } + + f->mode = mode; + + if (tb[TCA_FLOW_MASK]) + f->mask = nla_get_u32(tb[TCA_FLOW_MASK]); + if (tb[TCA_FLOW_XOR]) + f->xor = nla_get_u32(tb[TCA_FLOW_XOR]); + if (tb[TCA_FLOW_RSHIFT]) + f->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]); + if (tb[TCA_FLOW_ADDEND]) + f->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]); + + if (tb[TCA_FLOW_DIVISOR]) + f->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]); + if (baseclass) + f->baseclass = baseclass; + + f->perturb_period = perturb_period; + del_timer(&f->perturb_timer); + if (perturb_period) + mod_timer(&f->perturb_timer, jiffies + perturb_period); + + if (*arg == 0) + list_add_tail(&f->list, &head->filters); + + tcf_tree_unlock(tp); + + *arg = 
(unsigned long)f; + return 0; + +err2: + tcf_em_tree_destroy(tp, &t); +err1: + tcf_exts_destroy(tp, &e); + return err; +} + +static void flow_destroy_filter(struct tcf_proto *tp, struct flow_filter *f) +{ + del_timer_sync(&f->perturb_timer); + tcf_exts_destroy(tp, &f->exts); + tcf_em_tree_destroy(tp, &f->ematches); + kfree(f); +} + +static int flow_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct flow_filter *f = (struct flow_filter *)arg; + + tcf_tree_lock(tp); + list_del(&f->list); + tcf_tree_unlock(tp); + flow_destroy_filter(tp, f); + return 0; +} + +static int flow_init(struct tcf_proto *tp) +{ + struct flow_head *head; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + INIT_LIST_HEAD(&head->filters); + tp->root = head; + return 0; +} + +static void flow_destroy(struct tcf_proto *tp) +{ + struct flow_head *head = tp->root; + struct flow_filter *f, *next; + + list_for_each_entry_safe(f, next, &head->filters, list) { + list_del(&f->list); + flow_destroy_filter(tp, f); + } + kfree(head); +} + +static unsigned long flow_get(struct tcf_proto *tp, u32 handle) +{ + struct flow_head *head = tp->root; + struct flow_filter *f; + + list_for_each_entry(f, &head->filters, list) + if (f->handle == handle) + return (unsigned long)f; + return 0; +} + +static void flow_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int flow_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct flow_filter *f = (struct flow_filter *)fh; + struct nlattr *nest; + + if (f == NULL) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_FLOW_KEYS, f->keymask) || + nla_put_u32(skb, TCA_FLOW_MODE, f->mode)) + goto nla_put_failure; + + if (f->mask != ~0 || f->xor != 0) { + if (nla_put_u32(skb, TCA_FLOW_MASK, f->mask) || + nla_put_u32(skb, TCA_FLOW_XOR, f->xor)) + goto nla_put_failure; + } + if (f->rshift && + nla_put_u32(skb, TCA_FLOW_RSHIFT, f->rshift)) + goto nla_put_failure; + if (f->addend && + nla_put_u32(skb, TCA_FLOW_ADDEND, f->addend)) + goto nla_put_failure; + + if (f->divisor && + nla_put_u32(skb, TCA_FLOW_DIVISOR, f->divisor)) + goto nla_put_failure; + if (f->baseclass && + nla_put_u32(skb, TCA_FLOW_BASECLASS, f->baseclass)) + goto nla_put_failure; + + if (f->perturb_period && + nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ)) + goto nla_put_failure; + + if (tcf_exts_dump(skb, &f->exts) < 0) + goto nla_put_failure; +#ifdef CONFIG_NET_EMATCH + if (f->ematches.hdr.nmatches && + tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0) + goto nla_put_failure; +#endif + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, nest); + return -1; +} + +static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct flow_head *head = tp->root; + struct flow_filter *f; + + list_for_each_entry(f, &head->filters, list) { + if (arg->count < arg->skip) + goto skip; + if (arg->fn(tp, (unsigned long)f, arg) < 0) { + arg->stop = 1; + break; + } +skip: + arg->count++; + } +} + +static struct tcf_proto_ops cls_flow_ops __read_mostly = { + .kind = "flow", + .classify = flow_classify, + .init = flow_init, + .destroy = flow_destroy, + .change = flow_change, + .delete = flow_delete, + .get = flow_get, + .put = flow_put, + .dump = flow_dump, + .walk = flow_walk, + .owner = 
THIS_MODULE, +}; + +static int __init cls_flow_init(void) +{ + return register_tcf_proto_ops(&cls_flow_ops); +} + +static void __exit cls_flow_exit(void) +{ + unregister_tcf_proto_ops(&cls_flow_ops); +} + +module_init(cls_flow_init); +module_exit(cls_flow_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("TC flow classifier"); diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 75470486e40..861b03ccfed 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -18,101 +18,56 @@ * */ -#include <linux/config.h> #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <linux/netfilter.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> +#include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> -#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *)) +#define HTSIZE 256 -struct fw_head -{ - struct fw_filter *ht[HTSIZE]; +struct fw_head { + u32 mask; + struct fw_filter *ht[HTSIZE]; }; -struct fw_filter -{ +struct fw_filter { struct fw_filter *next; u32 id; struct tcf_result res; #ifdef CONFIG_NET_CLS_IND - char indev[IFNAMSIZ]; + int ifindex; #endif /* CONFIG_NET_CLS_IND */ struct tcf_exts exts; }; -static struct tcf_ext_map fw_ext_map = { - .action = TCA_FW_ACT, - .police = TCA_FW_POLICE -}; - -static __inline__ int fw_hash(u32 handle) +static u32 fw_hash(u32 handle) { - if (HTSIZE == 4096) - return ((handle >> 24) & 0xFFF) ^ - ((handle >> 12) & 0xFFF) ^ - (handle & 0xFFF); - else if (HTSIZE == 2048) - return ((handle >> 22) & 0x7FF) ^ - ((handle >> 11) & 0x7FF) ^ - (handle & 0x7FF); - else if (HTSIZE == 1024) - return ((handle >> 20) & 0x3FF) ^ - ((handle >> 10) & 0x3FF) ^ - (handle & 0x3FF); - else if (HTSIZE == 512) - return (handle >> 27) ^ - ((handle >> 18) & 0x1FF) ^ - ((handle >> 9) & 0x1FF) ^ - (handle & 0x1FF); - else if (HTSIZE == 256) { - u8 *t = (u8 *) &handle; - return t[0] ^ t[1] ^ t[2] ^ t[3]; - } else - return handle & (HTSIZE - 1); + handle ^= (handle >> 16); + handle ^= (handle >> 8); + return handle % HTSIZE; } -static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, +static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct fw_head *head = (struct fw_head*)tp->root; + struct fw_head *head = tp->root; struct fw_filter *f; int r; -#ifdef CONFIG_NETFILTER - u32 id = skb->nfmark; -#else - u32 id = 0; -#endif + u32 id = skb->mark; if (head != NULL) { - for (f=head->ht[fw_hash(id)]; f; f=f->next) { + id &= head->mask; + for (f = head->ht[fw_hash(id)]; f; f = f->next) { if (f->id == id) { *res = f->res; #ifdef CONFIG_NET_CLS_IND - if (!tcf_match_indev(skb, f->indev)) + if (!tcf_match_indev(skb, f->ifindex)) continue; #endif /* CONFIG_NET_CLS_IND */ r = tcf_exts_exec(skb, &f->exts, res); @@ -124,7 +79,8 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, } } else { /* old method */ - if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id^tp->q->handle)))) { + if (id && (TC_H_MAJ(id) == 0 || + !(TC_H_MAJ(id ^ 
tp->q->handle)))) { res->classid = id; res->class = 0; return 0; @@ -136,13 +92,13 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, static unsigned long fw_get(struct tcf_proto *tp, u32 handle) { - struct fw_head *head = (struct fw_head*)tp->root; + struct fw_head *head = tp->root; struct fw_filter *f; if (head == NULL) return 0; - for (f=head->ht[fw_hash(handle)]; f; f=f->next) { + for (f = head->ht[fw_hash(handle)]; f; f = f->next) { if (f->id == handle) return (unsigned long)f; } @@ -158,8 +114,7 @@ static int fw_init(struct tcf_proto *tp) return 0; } -static inline void -fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f) +static void fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f) { tcf_unbind_filter(tp, &f->res); tcf_exts_destroy(tp, &f->exts); @@ -168,15 +123,15 @@ fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f) static void fw_destroy(struct tcf_proto *tp) { - struct fw_head *head = (struct fw_head*)xchg(&tp->root, NULL); + struct fw_head *head = tp->root; struct fw_filter *f; int h; if (head == NULL) return; - for (h=0; h<HTSIZE; h++) { - while ((f=head->ht[h]) != NULL) { + for (h = 0; h < HTSIZE; h++) { + while ((f = head->ht[h]) != NULL) { head->ht[h] = f->next; fw_delete_filter(tp, f); } @@ -186,14 +141,14 @@ static void fw_destroy(struct tcf_proto *tp) static int fw_delete(struct tcf_proto *tp, unsigned long arg) { - struct fw_head *head = (struct fw_head*)tp->root; - struct fw_filter *f = (struct fw_filter*)arg; + struct fw_head *head = tp->root; + struct fw_filter *f = (struct fw_filter *)arg; struct fw_filter **fp; if (head == NULL || f == NULL) goto out; - for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) { + for (fp = &head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) { if (*fp == f) { tcf_tree_lock(tp); *fp = f->next; @@ -206,33 +161,51 @@ out: return -EINVAL; } +static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { + [TCA_FW_CLASSID] = { .type = NLA_U32 }, + [TCA_FW_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, + [TCA_FW_MASK] = { .type = NLA_U32 }, +}; + static int -fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f, - struct rtattr **tb, struct rtattr **tca, unsigned long base) +fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, + struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr) { + struct fw_head *head = tp->root; struct tcf_exts e; + u32 mask; int err; - err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &fw_ext_map); + tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); if (err < 0) return err; - err = -EINVAL; - if (tb[TCA_FW_CLASSID-1]) { - if (RTA_PAYLOAD(tb[TCA_FW_CLASSID-1]) != sizeof(u32)) - goto errout; - f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]); + if (tb[TCA_FW_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]); tcf_bind_filter(tp, &f->res, base); } #ifdef CONFIG_NET_CLS_IND - if (tb[TCA_FW_INDEV-1]) { - err = tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV-1]); - if (err < 0) + if (tb[TCA_FW_INDEV]) { + int ret; + ret = tcf_change_indev(net, tb[TCA_FW_INDEV]); + if (ret < 0) { + err = ret; goto errout; + } + f->ifindex = ret; } #endif /* CONFIG_NET_CLS_IND */ + err = -EINVAL; + if (tb[TCA_FW_MASK]) { + mask = nla_get_u32(tb[TCA_FW_MASK]); + if (mask != head->mask) + goto errout; + } else if (head->mask != 0xFFFFFFFF) + goto errout; + tcf_exts_change(tp, &f->exts, &e); return 0; @@ -241,51 +214,57 @@ errout: return err; } -static int fw_change(struct 
tcf_proto *tp, unsigned long base, +static int fw_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, - unsigned long *arg) + struct nlattr **tca, + unsigned long *arg, bool ovr) { - struct fw_head *head = (struct fw_head*)tp->root; + struct fw_head *head = tp->root; struct fw_filter *f = (struct fw_filter *) *arg; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_FW_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_FW_MAX + 1]; int err; if (!opt) return handle ? -EINVAL : 0; - if (rtattr_parse_nested(tb, TCA_FW_MAX, opt) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy); + if (err < 0) + return err; if (f != NULL) { if (f->id != handle && handle) return -EINVAL; - return fw_change_attrs(tp, f, tb, tca, base); + return fw_change_attrs(net, tp, f, tb, tca, base, ovr); } if (!handle) return -EINVAL; if (head == NULL) { - head = kmalloc(sizeof(struct fw_head), GFP_KERNEL); + u32 mask = 0xFFFFFFFF; + if (tb[TCA_FW_MASK]) + mask = nla_get_u32(tb[TCA_FW_MASK]); + + head = kzalloc(sizeof(struct fw_head), GFP_KERNEL); if (head == NULL) return -ENOBUFS; - memset(head, 0, sizeof(*head)); + head->mask = mask; tcf_tree_lock(tp); tp->root = head; tcf_tree_unlock(tp); } - f = kmalloc(sizeof(struct fw_filter), GFP_KERNEL); + f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); if (f == NULL) return -ENOBUFS; - memset(f, 0, sizeof(*f)); + tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE); f->id = handle; - err = fw_change_attrs(tp, f, tb, tca, base); + err = fw_change_attrs(net, tp, f, tb, tca, base, ovr); if (err < 0) goto errout; @@ -304,7 +283,7 @@ errout: static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct fw_head *head = (struct fw_head*)tp->root; + struct fw_head *head = tp->root; int h; if (head == NULL) @@ -330,12 +309,13 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int fw_dump(struct tcf_proto *tp, unsigned long fh, +static int fw_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct fw_filter *f = (struct fw_filter*)fh; - unsigned char *b = skb->tail; - struct rtattr *rta; + struct fw_head *head = tp->root; + struct fw_filter *f = (struct fw_filter *)fh; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; if (f == NULL) return skb->len; @@ -345,33 +325,41 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh, if (!f->res.classid && !tcf_exts_is_available(&f->exts)) return skb->len; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; - if (f->res.classid) - RTA_PUT(skb, TCA_FW_CLASSID, 4, &f->res.classid); + if (f->res.classid && + nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid)) + goto nla_put_failure; #ifdef CONFIG_NET_CLS_IND - if (strlen(f->indev)) - RTA_PUT(skb, TCA_FW_INDEV, IFNAMSIZ, f->indev); + if (f->ifindex) { + struct net_device *dev; + dev = __dev_get_by_index(net, f->ifindex); + if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name)) + goto nla_put_failure; + } #endif /* CONFIG_NET_CLS_IND */ + if (head->mask != 0xFFFFFFFF && + nla_put_u32(skb, TCA_FW_MASK, head->mask)) + goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0) - goto rtattr_failure; + if (tcf_exts_dump(skb, &f->exts) < 0) + goto nla_put_failure; - rta->rta_len = skb->tail - b; + nla_nest_end(skb, nest); - if 
(tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0) - goto rtattr_failure; + if (tcf_exts_dump_stats(skb, &f->exts) < 0) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static struct tcf_proto_ops cls_fw_ops = { - .next = NULL, +static struct tcf_proto_ops cls_fw_ops __read_mostly = { .kind = "fw", .classify = fw_classify, .init = fw_init, @@ -390,7 +378,7 @@ static int __init init_fw(void) return register_tcf_proto_ops(&cls_fw_ops); } -static void __exit exit_fw(void) +static void __exit exit_fw(void) { unregister_tcf_proto_ops(&cls_fw_ops); } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 520ff716dab..dd9fc2523c7 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -10,61 +10,43 @@ */ #include <linux/module.h> -#include <linux/config.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> +#include <net/dst.h> +#include <net/route.h> +#include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> /* - 1. For now we assume that route tags < 256. - It allows to use direct table lookups, instead of hash tables. - 2. For now we assume that "from TAG" and "fromdev DEV" statements - are mutually exclusive. - 3. "to TAG from ANY" has higher priority, than "to ANY from XXX" + * 1. For now we assume that route tags < 256. + * It allows to use direct table lookups, instead of hash tables. + * 2. For now we assume that "from TAG" and "fromdev DEV" statements + * are mutually exclusive. + * 3. 
"to TAG from ANY" has higher priority, than "to ANY from XXX" */ -struct route4_fastmap -{ +struct route4_fastmap { struct route4_filter *filter; u32 id; int iif; }; -struct route4_head -{ +struct route4_head { struct route4_fastmap fastmap[16]; - struct route4_bucket *table[256+1]; + struct route4_bucket *table[256 + 1]; }; -struct route4_bucket -{ +struct route4_bucket { /* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */ - struct route4_filter *ht[16+16+1]; + struct route4_filter *ht[16 + 16 + 1]; }; -struct route4_filter -{ +struct route4_filter { struct route4_filter *next; u32 id; int iif; @@ -75,52 +57,50 @@ struct route4_filter struct route4_bucket *bkt; }; -#define ROUTE4_FAILURE ((struct route4_filter*)(-1L)) - -static struct tcf_ext_map route_ext_map = { - .police = TCA_ROUTE4_POLICE, - .action = TCA_ROUTE4_ACT -}; +#define ROUTE4_FAILURE ((struct route4_filter *)(-1L)) -static __inline__ int route4_fastmap_hash(u32 id, int iif) +static inline int route4_fastmap_hash(u32 id, int iif) { - return id&0xF; + return id & 0xF; } -static inline -void route4_reset_fastmap(struct net_device *dev, struct route4_head *head, u32 id) +static void +route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id) { - spin_lock_bh(&dev->queue_lock); + spinlock_t *root_lock = qdisc_root_sleeping_lock(q); + + spin_lock_bh(root_lock); memset(head->fastmap, 0, sizeof(head->fastmap)); - spin_unlock_bh(&dev->queue_lock); + spin_unlock_bh(root_lock); } -static void __inline__ +static void route4_set_fastmap(struct route4_head *head, u32 id, int iif, struct route4_filter *f) { int h = route4_fastmap_hash(id, iif); + head->fastmap[h].id = id; head->fastmap[h].iif = iif; head->fastmap[h].filter = f; } -static __inline__ int route4_hash_to(u32 id) +static inline int route4_hash_to(u32 id) { - return id&0xFF; + return id & 0xFF; } -static __inline__ int route4_hash_from(u32 id) +static inline int route4_hash_from(u32 id) { - return (id>>16)&0xF; + return (id >> 16) & 0xF; } -static __inline__ int route4_hash_iif(int iif) +static inline int route4_hash_iif(int iif) { - return 16 + ((iif>>16)&0xF); + return 16 + ((iif >> 16) & 0xF); } -static __inline__ int route4_hash_wild(void) +static inline int route4_hash_wild(void) { return 32; } @@ -140,24 +120,25 @@ static __inline__ int route4_hash_wild(void) return 0; \ } -static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp, +static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct route4_head *head = (struct route4_head*)tp->root; + struct route4_head *head = tp->root; struct dst_entry *dst; struct route4_bucket *b; struct route4_filter *f; u32 id, h; int iif, dont_cache = 0; - if ((dst = skb->dst) == NULL) + dst = skb_dst(skb); + if (!dst) goto failure; id = dst->tclassid; if (head == NULL) goto old_method; - iif = ((struct rtable*)dst)->fl.iif; + iif = inet_iif(skb); h = route4_fastmap_hash(id, iif); if (id == head->fastmap[h].id && @@ -173,7 +154,8 @@ static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp, h = route4_hash_to(id); restart: - if ((b = head->table[h]) != NULL) { + b = head->table[h]; + if (b) { for (f = b->ht[route4_hash_from(id)]; f; f = f->next) if (f->id == id) ROUTE4_APPLY_RESULT(); @@ -209,8 +191,9 @@ old_method: static inline u32 to_hash(u32 id) { - u32 h = id&0xFF; - if (id&0x8000) + u32 h = id & 0xFF; + + if (id & 0x8000) h += 256; return h; } @@ -223,17 +206,17 @@ static inline u32 from_hash(u32 id) if (!(id & 0x8000)) { if (id > 255) return 
256; - return id&0xF; + return id & 0xF; } - return 16 + (id&0xF); + return 16 + (id & 0xF); } static unsigned long route4_get(struct tcf_proto *tp, u32 handle) { - struct route4_head *head = (struct route4_head*)tp->root; + struct route4_head *head = tp->root; struct route4_bucket *b; struct route4_filter *f; - unsigned h1, h2; + unsigned int h1, h2; if (!head) return 0; @@ -242,11 +225,12 @@ static unsigned long route4_get(struct tcf_proto *tp, u32 handle) if (h1 > 256) return 0; - h2 = from_hash(handle>>16); + h2 = from_hash(handle >> 16); if (h2 > 32) return 0; - if ((b = head->table[h1]) != NULL) { + b = head->table[h1]; + if (b) { for (f = b->ht[h2]; f; f = f->next) if (f->handle == handle) return (unsigned long)f; @@ -263,7 +247,7 @@ static int route4_init(struct tcf_proto *tp) return 0; } -static inline void +static void route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f) { tcf_unbind_filter(tp, &f->res); @@ -273,17 +257,18 @@ route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f) static void route4_destroy(struct tcf_proto *tp) { - struct route4_head *head = xchg(&tp->root, NULL); + struct route4_head *head = tp->root; int h1, h2; if (head == NULL) return; - for (h1=0; h1<=256; h1++) { + for (h1 = 0; h1 <= 256; h1++) { struct route4_bucket *b; - if ((b = head->table[h1]) != NULL) { - for (h2=0; h2<=32; h2++) { + b = head->table[h1]; + if (b) { + for (h2 = 0; h2 <= 32; h2++) { struct route4_filter *f; while ((f = b->ht[h2]) != NULL) { @@ -299,9 +284,9 @@ static void route4_destroy(struct tcf_proto *tp) static int route4_delete(struct tcf_proto *tp, unsigned long arg) { - struct route4_head *head = (struct route4_head*)tp->root; - struct route4_filter **fp, *f = (struct route4_filter*)arg; - unsigned h = 0; + struct route4_head *head = tp->root; + struct route4_filter **fp, *f = (struct route4_filter *)arg; + unsigned int h = 0; struct route4_bucket *b; int i; @@ -311,18 +296,18 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg) h = f->handle; b = f->bkt; - for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) { + for (fp = &b->ht[from_hash(h >> 16)]; *fp; fp = &(*fp)->next) { if (*fp == f) { tcf_tree_lock(tp); *fp = f->next; tcf_tree_unlock(tp); - route4_reset_fastmap(tp->q->dev, head, f->id); + route4_reset_fastmap(tp->q, head, f->id); route4_delete_filter(tp, f); /* Strip tree */ - for (i=0; i<=32; i++) + for (i = 0; i <= 32; i++) if (b->ht[i]) return 0; @@ -338,9 +323,18 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg) return 0; } -static int route4_set_parms(struct tcf_proto *tp, unsigned long base, - struct route4_filter *f, u32 handle, struct route4_head *head, - struct rtattr **tb, struct rtattr *est, int new) +static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = { + [TCA_ROUTE4_CLASSID] = { .type = NLA_U32 }, + [TCA_ROUTE4_TO] = { .type = NLA_U32 }, + [TCA_ROUTE4_FROM] = { .type = NLA_U32 }, + [TCA_ROUTE4_IIF] = { .type = NLA_U32 }, +}; + +static int route4_set_parms(struct net *net, struct tcf_proto *tp, + unsigned long base, struct route4_filter *f, + u32 handle, struct route4_head *head, + struct nlattr **tb, struct nlattr *est, int new, + bool ovr) { int err; u32 id = 0, to = 0, nhandle = 0x8000; @@ -349,39 +343,30 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base, struct route4_bucket *b; struct tcf_exts e; - err = tcf_exts_validate(tp, tb, est, &e, &route_ext_map); + tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); + err = tcf_exts_validate(net, tp, tb, 
est, &e, ovr); if (err < 0) return err; err = -EINVAL; - if (tb[TCA_ROUTE4_CLASSID-1]) - if (RTA_PAYLOAD(tb[TCA_ROUTE4_CLASSID-1]) < sizeof(u32)) - goto errout; - - if (tb[TCA_ROUTE4_TO-1]) { + if (tb[TCA_ROUTE4_TO]) { if (new && handle & 0x8000) goto errout; - if (RTA_PAYLOAD(tb[TCA_ROUTE4_TO-1]) < sizeof(u32)) - goto errout; - to = *(u32*)RTA_DATA(tb[TCA_ROUTE4_TO-1]); + to = nla_get_u32(tb[TCA_ROUTE4_TO]); if (to > 0xFF) goto errout; nhandle = to; } - if (tb[TCA_ROUTE4_FROM-1]) { - if (tb[TCA_ROUTE4_IIF-1]) + if (tb[TCA_ROUTE4_FROM]) { + if (tb[TCA_ROUTE4_IIF]) goto errout; - if (RTA_PAYLOAD(tb[TCA_ROUTE4_FROM-1]) < sizeof(u32)) - goto errout; - id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_FROM-1]); + id = nla_get_u32(tb[TCA_ROUTE4_FROM]); if (id > 0xFF) goto errout; nhandle |= id << 16; - } else if (tb[TCA_ROUTE4_IIF-1]) { - if (RTA_PAYLOAD(tb[TCA_ROUTE4_IIF-1]) < sizeof(u32)) - goto errout; - id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_IIF-1]); + } else if (tb[TCA_ROUTE4_IIF]) { + id = nla_get_u32(tb[TCA_ROUTE4_IIF]); if (id > 0x7FFF) goto errout; nhandle |= (id | 0x8000) << 16; @@ -395,18 +380,19 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base, } h1 = to_hash(nhandle); - if ((b = head->table[h1]) == NULL) { + b = head->table[h1]; + if (!b) { err = -ENOBUFS; - b = kmalloc(sizeof(struct route4_bucket), GFP_KERNEL); + b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL); if (b == NULL) goto errout; - memset(b, 0, sizeof(*b)); tcf_tree_lock(tp); head->table[h1] = b; tcf_tree_unlock(tp); } else { unsigned int h2 = from_hash(nhandle >> 16); + err = -EEXIST; for (fp = b->ht[h2]; fp; fp = fp->next) if (fp->handle == f->handle) @@ -414,20 +400,20 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base, } tcf_tree_lock(tp); - if (tb[TCA_ROUTE4_TO-1]) + if (tb[TCA_ROUTE4_TO]) f->id = to; - if (tb[TCA_ROUTE4_FROM-1]) + if (tb[TCA_ROUTE4_FROM]) f->id = to | id<<16; - else if (tb[TCA_ROUTE4_IIF-1]) + else if (tb[TCA_ROUTE4_IIF]) f->iif = id; f->handle = nhandle; f->bkt = b; tcf_tree_unlock(tp); - if (tb[TCA_ROUTE4_CLASSID-1]) { - f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]); + if (tb[TCA_ROUTE4_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_ROUTE4_CLASSID]); tcf_bind_filter(tp, &f->res, base); } @@ -439,16 +425,17 @@ errout: return err; } -static int route4_change(struct tcf_proto *tp, unsigned long base, +static int route4_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, - unsigned long *arg) + struct nlattr **tca, + unsigned long *arg, bool ovr) { struct route4_head *head = tp->root; struct route4_filter *f, *f1, **fp; struct route4_bucket *b; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_ROUTE4_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_ROUTE4_MAX + 1]; unsigned int h, th; u32 old_handle = 0; int err; @@ -456,18 +443,20 @@ static int route4_change(struct tcf_proto *tp, unsigned long base, if (opt == NULL) return handle ? 
-EINVAL : 0; - if (rtattr_parse_nested(tb, TCA_ROUTE4_MAX, opt) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_ROUTE4_MAX, opt, route4_policy); + if (err < 0) + return err; - if ((f = (struct route4_filter*)*arg) != NULL) { + f = (struct route4_filter *)*arg; + if (f) { if (f->handle != handle && handle) return -EINVAL; if (f->bkt) old_handle = f->handle; - err = route4_set_parms(tp, base, f, handle, head, tb, - tca[TCA_RATE-1], 0); + err = route4_set_parms(net, tp, base, f, handle, head, tb, + tca[TCA_RATE], 0, ovr); if (err < 0) return err; @@ -476,29 +465,28 @@ static int route4_change(struct tcf_proto *tp, unsigned long base, err = -ENOBUFS; if (head == NULL) { - head = kmalloc(sizeof(struct route4_head), GFP_KERNEL); + head = kzalloc(sizeof(struct route4_head), GFP_KERNEL); if (head == NULL) goto errout; - memset(head, 0, sizeof(struct route4_head)); tcf_tree_lock(tp); tp->root = head; tcf_tree_unlock(tp); } - f = kmalloc(sizeof(struct route4_filter), GFP_KERNEL); + f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL); if (f == NULL) goto errout; - memset(f, 0, sizeof(*f)); - err = route4_set_parms(tp, base, f, handle, head, tb, - tca[TCA_RATE-1], 1); + tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); + err = route4_set_parms(net, tp, base, f, handle, head, tb, + tca[TCA_RATE], 1, ovr); if (err < 0) goto errout; reinsert: h = from_hash(f->handle >> 16); - for (fp = &f->bkt->ht[h]; (f1=*fp) != NULL; fp = &f1->next) + for (fp = &f->bkt->ht[h]; (f1 = *fp) != NULL; fp = &f1->next) if (f->handle < f1->handle) break; @@ -509,7 +497,8 @@ reinsert: if (old_handle && f->handle != old_handle) { th = to_hash(old_handle); h = from_hash(old_handle >> 16); - if ((b = head->table[th]) != NULL) { + b = head->table[th]; + if (b) { for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) { if (*fp == f) { *fp = f->next; @@ -520,7 +509,7 @@ reinsert: } tcf_tree_unlock(tp); - route4_reset_fastmap(tp->q->dev, head, f->id); + route4_reset_fastmap(tp->q, head, f->id); *arg = (unsigned long)f; return 0; @@ -532,7 +521,7 @@ errout: static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) { struct route4_head *head = tp->root; - unsigned h, h1; + unsigned int h, h1; if (head == NULL) arg->stop = 1; @@ -563,12 +552,12 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int route4_dump(struct tcf_proto *tp, unsigned long fh, +static int route4_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct route4_filter *f = (struct route4_filter*)fh; - unsigned char *b = skb->tail; - struct rtattr *rta; + struct route4_filter *f = (struct route4_filter *)fh; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; u32 id; if (f == NULL) @@ -576,40 +565,44 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh, t->tcm_handle = f->handle; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; - if (!(f->handle&0x8000)) { - id = f->id&0xFF; - RTA_PUT(skb, TCA_ROUTE4_TO, sizeof(id), &id); + if (!(f->handle & 0x8000)) { + id = f->id & 0xFF; + if (nla_put_u32(skb, TCA_ROUTE4_TO, id)) + goto nla_put_failure; } - if (f->handle&0x80000000) { - if ((f->handle>>16) != 0xFFFF) - RTA_PUT(skb, TCA_ROUTE4_IIF, sizeof(f->iif), &f->iif); + if (f->handle & 0x80000000) { + if ((f->handle >> 16) != 0xFFFF && + nla_put_u32(skb, TCA_ROUTE4_IIF, f->iif)) + goto nla_put_failure; } else { - id = f->id>>16; - RTA_PUT(skb, 
TCA_ROUTE4_FROM, sizeof(id), &id); + id = f->id >> 16; + if (nla_put_u32(skb, TCA_ROUTE4_FROM, id)) + goto nla_put_failure; } - if (f->res.classid) - RTA_PUT(skb, TCA_ROUTE4_CLASSID, 4, &f->res.classid); + if (f->res.classid && + nla_put_u32(skb, TCA_ROUTE4_CLASSID, f->res.classid)) + goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0) - goto rtattr_failure; + if (tcf_exts_dump(skb, &f->exts) < 0) + goto nla_put_failure; - rta->rta_len = skb->tail - b; + nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0) - goto rtattr_failure; + if (tcf_exts_dump_stats(skb, &f->exts) < 0) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static struct tcf_proto_ops cls_route4_ops = { - .next = NULL, +static struct tcf_proto_ops cls_route4_ops __read_mostly = { .kind = "route", .classify = route4_classify, .init = route4_init, diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c index ad2613790d8..cbb5e0d600f 100644 --- a/net/sched/cls_rsvp.c +++ b/net/sched/cls_rsvp.c @@ -10,28 +10,13 @@ */ #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> +#include <net/ip.h> +#include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 572f06be3b0..1020e233a5d 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -65,32 +65,28 @@ Well, as result, despite its simplicity, we get a pretty powerful classification engine. 
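The route4_dump() hunk above, like the rsvp, tcindex and u32 dump hunks further down, replaces the old RTA_PUT()/rtattr_failure pattern with nested netlink attributes. A minimal sketch of that dump pattern follows; it is not taken from the patch, and TCA_EXAMPLE_CLASSID is a made-up attribute id used only for illustration.

/*
 * Illustrative sketch, not part of the patch: the nested-attribute dump
 * pattern (nla_nest_start/nla_put_u32/nla_nest_end/nla_nest_cancel)
 * that the converted *_dump() callbacks rely on.
 */
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <net/netlink.h>

#define TCA_EXAMPLE_CLASSID 1	/* hypothetical attribute type */

static int example_dump(struct sk_buff *skb, u32 classid)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_OPTIONS);	/* open the container */
	if (nest == NULL)
		goto nla_put_failure;

	/* nla_put_u32() returns non-zero when the skb runs out of tailroom */
	if (classid && nla_put_u32(skb, TCA_EXAMPLE_CLASSID, classid))
		goto nla_put_failure;

	nla_nest_end(skb, nest);		/* patch the final nest length */
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);		/* trim partial output; no-op if nest is NULL */
	return -1;
}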
*/ -#include <linux/config.h> -struct rsvp_head -{ +struct rsvp_head { u32 tmap[256/32]; u32 hgenerator; u8 tgenerator; struct rsvp_session *ht[256]; }; -struct rsvp_session -{ +struct rsvp_session { struct rsvp_session *next; - u32 dst[RSVP_DST_LEN]; + __be32 dst[RSVP_DST_LEN]; struct tc_rsvp_gpi dpi; u8 protocol; u8 tunnelid; /* 16 (src,sport) hash slots, and one wildcard source slot */ - struct rsvp_filter *ht[16+1]; + struct rsvp_filter *ht[16 + 1]; }; -struct rsvp_filter -{ +struct rsvp_filter { struct rsvp_filter *next; - u32 src[RSVP_DST_LEN]; + __be32 src[RSVP_DST_LEN]; struct tc_rsvp_gpi spi; u8 tunnelhdr; @@ -101,28 +97,25 @@ struct rsvp_filter struct rsvp_session *sess; }; -static __inline__ unsigned hash_dst(u32 *dst, u8 protocol, u8 tunnelid) +static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid) { - unsigned h = dst[RSVP_DST_LEN-1]; + unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1]; + h ^= h>>16; h ^= h>>8; return (h ^ protocol ^ tunnelid) & 0xFF; } -static __inline__ unsigned hash_src(u32 *src) +static inline unsigned int hash_src(__be32 *src) { - unsigned h = src[RSVP_DST_LEN-1]; + unsigned int h = (__force __u32)src[RSVP_DST_LEN-1]; + h ^= h>>16; h ^= h>>8; h ^= h>>4; return h & 0xF; } -static struct tcf_ext_map rsvp_ext_map = { - .police = TCA_RSVP_POLICE, - .action = TCA_RSVP_ACT -}; - #define RSVP_APPLY_RESULT() \ { \ int r = tcf_exts_exec(skb, &f->exts, res); \ @@ -131,22 +124,30 @@ static struct tcf_ext_map rsvp_ext_map = { else if (r > 0) \ return r; \ } - -static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp, + +static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; + struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht; struct rsvp_session *s; struct rsvp_filter *f; - unsigned h1, h2; - u32 *dst, *src; + unsigned int h1, h2; + __be32 *dst, *src; u8 protocol; u8 tunnelid = 0; u8 *xprt; #if RSVP_DST_LEN == 4 - struct ipv6hdr *nhptr = skb->nh.ipv6h; + struct ipv6hdr *nhptr; + + if (!pskb_network_may_pull(skb, sizeof(*nhptr))) + return -1; + nhptr = ipv6_hdr(skb); #else - struct iphdr *nhptr = skb->nh.iph; + struct iphdr *nhptr; + + if (!pskb_network_may_pull(skb, sizeof(*nhptr))) + return -1; + nhptr = ip_hdr(skb); #endif restart: @@ -155,13 +156,13 @@ restart: src = &nhptr->saddr.s6_addr32[0]; dst = &nhptr->daddr.s6_addr32[0]; protocol = nhptr->nexthdr; - xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr); + xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr); #else src = &nhptr->saddr; dst = &nhptr->daddr; protocol = nhptr->protocol; - xprt = ((u8*)nhptr) + (nhptr->ihl<<2); - if (nhptr->frag_off&__constant_htons(IP_MF|IP_OFFSET)) + xprt = ((u8 *)nhptr) + (nhptr->ihl<<2); + if (ip_is_fragment(nhptr)) return -1; #endif @@ -169,23 +170,25 @@ restart: h2 = hash_src(src); for (s = sht[h1]; s; s = s->next) { - if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && + if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] && protocol == s->protocol && - !(s->dpi.mask & (*(u32*)(xprt+s->dpi.offset)^s->dpi.key)) + !(s->dpi.mask & + (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) && #if RSVP_DST_LEN == 4 - && dst[0] == s->dst[0] - && dst[1] == s->dst[1] - && dst[2] == s->dst[2] + dst[0] == s->dst[0] && + dst[1] == s->dst[1] && + dst[2] == s->dst[2] && #endif - && tunnelid == s->tunnelid) { + tunnelid == s->tunnelid) { for (f = s->ht[h2]; f; f = f->next) { - if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] && - !(f->spi.mask & 
(*(u32*)(xprt+f->spi.offset)^f->spi.key)) + if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] && + !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key)) #if RSVP_DST_LEN == 4 - && src[0] == f->src[0] - && src[1] == f->src[1] - && src[2] == f->src[2] + && + src[0] == f->src[0] && + src[1] == f->src[1] && + src[2] == f->src[2] #endif ) { *res = f->res; @@ -196,7 +199,7 @@ matched: return 0; tunnelid = f->res.classid; - nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr)); + nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr)); goto restart; } } @@ -215,11 +218,11 @@ matched: static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) { - struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; + struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht; struct rsvp_session *s; struct rsvp_filter *f; - unsigned h1 = handle&0xFF; - unsigned h2 = (handle>>8)&0xFF; + unsigned int h1 = handle & 0xFF; + unsigned int h2 = (handle >> 8) & 0xFF; if (h2 > 16) return 0; @@ -241,16 +244,15 @@ static int rsvp_init(struct tcf_proto *tp) { struct rsvp_head *data; - data = kmalloc(sizeof(struct rsvp_head), GFP_KERNEL); + data = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL); if (data) { - memset(data, 0, sizeof(struct rsvp_head)); tp->root = data; return 0; } return -ENOBUFS; } -static inline void +static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) { tcf_unbind_filter(tp, &f->res); @@ -269,13 +271,13 @@ static void rsvp_destroy(struct tcf_proto *tp) sht = data->ht; - for (h1=0; h1<256; h1++) { + for (h1 = 0; h1 < 256; h1++) { struct rsvp_session *s; while ((s = sht[h1]) != NULL) { sht[h1] = s->next; - for (h2=0; h2<=16; h2++) { + for (h2 = 0; h2 <= 16; h2++) { struct rsvp_filter *f; while ((f = s->ht[h2]) != NULL) { @@ -291,13 +293,13 @@ static void rsvp_destroy(struct tcf_proto *tp) static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) { - struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg; - unsigned h = f->handle; + struct rsvp_filter **fp, *f = (struct rsvp_filter *)arg; + unsigned int h = f->handle; struct rsvp_session **sp; struct rsvp_session *s = f->sess; int i; - for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) { + for (fp = &s->ht[(h >> 8) & 0xFF]; *fp; fp = &(*fp)->next) { if (*fp == f) { tcf_tree_lock(tp); *fp = f->next; @@ -306,12 +308,12 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) /* Strip tree */ - for (i=0; i<=16; i++) + for (i = 0; i <= 16; i++) if (s->ht[i]) return 0; /* OK, session has no flows */ - for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF]; + for (sp = &((struct rsvp_head *)tp->root)->ht[h & 0xFF]; *sp; sp = &(*sp)->next) { if (*sp == s) { tcf_tree_lock(tp); @@ -329,13 +331,14 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) return 0; } -static unsigned gen_handle(struct tcf_proto *tp, unsigned salt) +static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt) { struct rsvp_head *data = tp->root; int i = 0xFFFF; while (i-- > 0) { u32 h; + if ((data->hgenerator += 0x10000) == 0) data->hgenerator = 0x10000; h = data->hgenerator|salt; @@ -347,10 +350,10 @@ static unsigned gen_handle(struct tcf_proto *tp, unsigned salt) static int tunnel_bts(struct rsvp_head *data) { - int n = data->tgenerator>>5; - u32 b = 1<<(data->tgenerator&0x1F); - - if (data->tmap[n]&b) + int n = data->tgenerator >> 5; + u32 b = 1 << (data->tgenerator & 0x1F); + + if (data->tmap[n] & b) return 0; data->tmap[n] |= b; return 1; @@ -364,10 +367,10 @@ static void tunnel_recycle(struct 
rsvp_head *data) memset(tmap, 0, sizeof(tmap)); - for (h1=0; h1<256; h1++) { + for (h1 = 0; h1 < 256; h1++) { struct rsvp_session *s; for (s = sht[h1]; s; s = s->next) { - for (h2=0; h2<=16; h2++) { + for (h2 = 0; h2 <= 16; h2++) { struct rsvp_filter *f; for (f = s->ht[h2]; f; f = f->next) { @@ -387,8 +390,8 @@ static u32 gen_tunnel(struct rsvp_head *data) { int i, k; - for (k=0; k<2; k++) { - for (i=255; i>0; i--) { + for (k = 0; k < 2; k++) { + for (i = 255; i > 0; i--) { if (++data->tgenerator == 0) data->tgenerator = 1; if (tunnel_bts(data)) @@ -399,39 +402,52 @@ static u32 gen_tunnel(struct rsvp_head *data) return 0; } -static int rsvp_change(struct tcf_proto *tp, unsigned long base, +static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = { + [TCA_RSVP_CLASSID] = { .type = NLA_U32 }, + [TCA_RSVP_DST] = { .type = NLA_BINARY, + .len = RSVP_DST_LEN * sizeof(u32) }, + [TCA_RSVP_SRC] = { .type = NLA_BINARY, + .len = RSVP_DST_LEN * sizeof(u32) }, + [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) }, +}; + +static int rsvp_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, - unsigned long *arg) + struct nlattr **tca, + unsigned long *arg, bool ovr) { struct rsvp_head *data = tp->root; struct rsvp_filter *f, **fp; struct rsvp_session *s, **sp; struct tc_rsvp_pinfo *pinfo = NULL; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_RSVP_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_RSVP_MAX + 1]; struct tcf_exts e; - unsigned h1, h2; - u32 *dst; + unsigned int h1, h2; + __be32 *dst; int err; if (opt == NULL) return handle ? -EINVAL : 0; - if (rtattr_parse_nested(tb, TCA_RSVP_MAX, opt) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_RSVP_MAX, opt, rsvp_policy); + if (err < 0) + return err; - err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &rsvp_ext_map); + tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); if (err < 0) return err; - if ((f = (struct rsvp_filter*)*arg) != NULL) { + f = (struct rsvp_filter *)*arg; + if (f) { /* Node exists: adjust only classid */ if (f->handle != handle && handle) goto errout2; - if (tb[TCA_RSVP_CLASSID-1]) { - f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); + if (tb[TCA_RSVP_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); tcf_bind_filter(tp, &f->res, base); } @@ -443,42 +459,29 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base, err = -EINVAL; if (handle) goto errout2; - if (tb[TCA_RSVP_DST-1] == NULL) + if (tb[TCA_RSVP_DST] == NULL) goto errout2; err = -ENOBUFS; - f = kmalloc(sizeof(struct rsvp_filter), GFP_KERNEL); + f = kzalloc(sizeof(struct rsvp_filter), GFP_KERNEL); if (f == NULL) goto errout2; - memset(f, 0, sizeof(*f)); + tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); h2 = 16; - if (tb[TCA_RSVP_SRC-1]) { - err = -EINVAL; - if (RTA_PAYLOAD(tb[TCA_RSVP_SRC-1]) != sizeof(f->src)) - goto errout; - memcpy(f->src, RTA_DATA(tb[TCA_RSVP_SRC-1]), sizeof(f->src)); + if (tb[TCA_RSVP_SRC]) { + memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src)); h2 = hash_src(f->src); } - if (tb[TCA_RSVP_PINFO-1]) { - err = -EINVAL; - if (RTA_PAYLOAD(tb[TCA_RSVP_PINFO-1]) < sizeof(struct tc_rsvp_pinfo)) - goto errout; - pinfo = RTA_DATA(tb[TCA_RSVP_PINFO-1]); + if (tb[TCA_RSVP_PINFO]) { + pinfo = nla_data(tb[TCA_RSVP_PINFO]); f->spi = pinfo->spi; f->tunnelhdr = pinfo->tunnelhdr; } - if (tb[TCA_RSVP_CLASSID-1]) { - err = 
-EINVAL; - if (RTA_PAYLOAD(tb[TCA_RSVP_CLASSID-1]) != 4) - goto errout; - f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); - } + if (tb[TCA_RSVP_CLASSID]) + f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); - err = -EINVAL; - if (RTA_PAYLOAD(tb[TCA_RSVP_DST-1]) != sizeof(f->src)) - goto errout; - dst = RTA_DATA(tb[TCA_RSVP_DST-1]); + dst = nla_data(tb[TCA_RSVP_DST]); h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0); err = -ENOMEM; @@ -496,16 +499,16 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base, goto errout; } - for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) { + for (sp = &data->ht[h1]; (s = *sp) != NULL; sp = &s->next) { if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && pinfo && pinfo->protocol == s->protocol && - memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 + memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 && #if RSVP_DST_LEN == 4 - && dst[0] == s->dst[0] - && dst[1] == s->dst[1] - && dst[2] == s->dst[2] + dst[0] == s->dst[0] && + dst[1] == s->dst[1] && + dst[2] == s->dst[2] && #endif - && pinfo->tunnelid == s->tunnelid) { + pinfo->tunnelid == s->tunnelid) { insert: /* OK, we found appropriate session */ @@ -519,7 +522,7 @@ insert: tcf_exts_change(tp, &f->exts, &e); for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next) - if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask) + if (((*fp)->spi.mask & f->spi.mask) != f->spi.mask) break; f->next = *fp; wmb(); @@ -533,10 +536,9 @@ insert: /* No session found. Create new one. */ err = -ENOBUFS; - s = kmalloc(sizeof(struct rsvp_session), GFP_KERNEL); + s = kzalloc(sizeof(struct rsvp_session), GFP_KERNEL); if (s == NULL) goto errout; - memset(s, 0, sizeof(*s)); memcpy(s->dst, dst, sizeof(s->dst)); if (pinfo) { @@ -551,7 +553,7 @@ insert: s->next = *sp; wmb(); *sp = s; - + goto insert; errout: @@ -564,7 +566,7 @@ errout2: static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) { struct rsvp_head *head = tp->root; - unsigned h, h1; + unsigned int h, h1; if (arg->stop) return; @@ -592,13 +594,13 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, +static int rsvp_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct rsvp_filter *f = (struct rsvp_filter*)fh; + struct rsvp_filter *f = (struct rsvp_filter *)fh; struct rsvp_session *s; - unsigned char *b = skb->tail; - struct rtattr *rta; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; struct tc_rsvp_pinfo pinfo; if (f == NULL) @@ -607,39 +609,42 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, t->tcm_handle = f->handle; + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); - - RTA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst); + if (nla_put(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst)) + goto nla_put_failure; pinfo.dpi = s->dpi; pinfo.spi = f->spi; pinfo.protocol = s->protocol; pinfo.tunnelid = s->tunnelid; pinfo.tunnelhdr = f->tunnelhdr; pinfo.pad = 0; - RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); - if (f->res.classid) - RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid); - if (((f->handle>>8)&0xFF) != 16) - RTA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src); - - if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0) - goto rtattr_failure; - - rta->rta_len = skb->tail - b; - - if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0) - goto 
rtattr_failure; + if (nla_put(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo)) + goto nla_put_failure; + if (f->res.classid && + nla_put_u32(skb, TCA_RSVP_CLASSID, f->res.classid)) + goto nla_put_failure; + if (((f->handle >> 8) & 0xFF) != 16 && + nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src)) + goto nla_put_failure; + + if (tcf_exts_dump(skb, &f->exts) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts) < 0) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static struct tcf_proto_ops RSVP_OPS = { - .next = NULL, +static struct tcf_proto_ops RSVP_OPS __read_mostly = { .kind = RSVP_ID, .classify = rsvp_classify, .init = rsvp_init, @@ -658,7 +663,7 @@ static int __init init_rsvp(void) return register_tcf_proto_ops(&RSVP_OPS); } -static void __exit exit_rsvp(void) +static void __exit exit_rsvp(void) { unregister_tcf_proto_ops(&RSVP_OPS); } diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c index fde51f7848e..dd08aea2aee 100644 --- a/net/sched/cls_rsvp6.c +++ b/net/sched/cls_rsvp6.c @@ -10,31 +10,15 @@ */ #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> #include <linux/ipv6.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> #include <net/act_api.h> #include <net/pkt_cls.h> +#include <net/netlink.h> #define RSVP_DST_LEN 4 #define RSVP_ID "rsvp6" diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 9f921174c8a..c721cd4a469 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -4,23 +4,15 @@ * Written 1998,1999 by Werner Almesberger, EPFL ICA */ -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/errno.h> -#include <linux/netdevice.h> -#include <net/ip.h> +#include <linux/slab.h> #include <net/act_api.h> +#include <net/netlink.h> #include <net/pkt_cls.h> -#include <net/route.h> - - -/* - * Not quite sure if we need all the xchgs Alexey uses when accessing things. - * Can always add them later ... :) - */ /* * Passing parameters to the root seems to be done more awkwardly than really @@ -32,22 +24,6 @@ #define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ -#if 1 /* control */ -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif - -#if 0 /* data */ -#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define D2PRINTK(format,args...) 
-#endif - - -#define PRIV(tp) ((struct tcindex_data *) (tp)->root) - - struct tcindex_filter_result { struct tcf_exts exts; struct tcf_result res; @@ -71,11 +47,6 @@ struct tcindex_data { int fall_through; /* 0: only classify if explicit match */ }; -static struct tcf_ext_map tcindex_ext_map = { - .police = TCA_TCINDEX_POLICE, - .action = TCA_TCINDEX_ACT -}; - static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) { @@ -100,14 +71,15 @@ tcindex_lookup(struct tcindex_data *p, u16 key) } -static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp, +static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *f; int key = (skb->tc_index & p->mask) >> p->shift; - D2PRINTK("tcindex_classify(skb %p,tp %p,res %p),p %p\n",skb,tp,res,p); + pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n", + skb, tp, res, p); f = tcindex_lookup(p, key); if (!f) { @@ -115,11 +87,11 @@ static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp, return -1; res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), key); res->class = 0; - D2PRINTK("alg 0x%x\n",res->classid); + pr_debug("alg 0x%x\n", res->classid); return 0; } *res = f->res; - D2PRINTK("map 0x%x\n",res->classid); + pr_debug("map 0x%x\n", res->classid); return tcf_exts_exec(skb, &f->exts, res); } @@ -127,10 +99,10 @@ static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp, static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *r; - DPRINTK("tcindex_get(tp %p,handle 0x%08x)\n",tp,handle); + pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); if (p->perfect && handle >= p->alloc_hash) return 0; r = tcindex_lookup(p, handle); @@ -140,7 +112,7 @@ static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) static void tcindex_put(struct tcf_proto *tp, unsigned long f) { - DPRINTK("tcindex_put(tp %p,f 0x%lx)\n",tp,f); + pr_debug("tcindex_put(tp %p,f 0x%lx)\n", tp, f); } @@ -148,12 +120,11 @@ static int tcindex_init(struct tcf_proto *tp) { struct tcindex_data *p; - DPRINTK("tcindex_init(tp %p)\n",tp); - p = kmalloc(sizeof(struct tcindex_data),GFP_KERNEL); + pr_debug("tcindex_init(tp %p)\n", tp); + p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL); if (!p) return -ENOMEM; - memset(p, 0, sizeof(*p)); p->mask = 0xffff; p->hash = DEFAULT_HASH_SIZE; p->fall_through = 1; @@ -166,11 +137,11 @@ static int tcindex_init(struct tcf_proto *tp) static int __tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; struct tcindex_filter *f = NULL; - DPRINTK("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n",tp,arg,p,f); + pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n", tp, arg, p, f); if (p->perfect) { if (!r->res.class) return -ENOENT; @@ -209,10 +180,25 @@ valid_perfect_hash(struct tcindex_data *p) return p->hash > (p->mask >> p->shift); } +static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { + [TCA_TCINDEX_HASH] = { .type = NLA_U32 }, + [TCA_TCINDEX_MASK] = { .type = NLA_U16 }, + [TCA_TCINDEX_SHIFT] = { .type = NLA_U32 }, + [TCA_TCINDEX_FALL_THROUGH] = { .type = NLA_U32 }, + [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 }, +}; + +static void tcindex_filter_result_init(struct 
tcindex_filter_result *r) +{ + memset(r, 0, sizeof(*r)); + tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); +} + static int -tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, - struct tcindex_data *p, struct tcindex_filter_result *r, - struct rtattr **tb, struct rtattr *est) +tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, + u32 handle, struct tcindex_data *p, + struct tcindex_filter_result *r, struct nlattr **tb, + struct nlattr *est, bool ovr) { int err, balloc = 0; struct tcindex_filter_result new_filter_result, *old_r = r; @@ -221,36 +207,26 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_filter *f = NULL; /* make gcc behave */ struct tcf_exts e; - err = tcf_exts_validate(tp, tb, est, &e, &tcindex_ext_map); + tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); if (err < 0) return err; - + memcpy(&cp, p, sizeof(cp)); - memset(&new_filter_result, 0, sizeof(new_filter_result)); + tcindex_filter_result_init(&new_filter_result); + tcindex_filter_result_init(&cr); if (old_r) - memcpy(&cr, r, sizeof(cr)); - else - memset(&cr, 0, sizeof(cr)); + cr.res = r->res; - err = -EINVAL; - if (tb[TCA_TCINDEX_HASH-1]) { - if (RTA_PAYLOAD(tb[TCA_TCINDEX_HASH-1]) < sizeof(u32)) - goto errout; - cp.hash = *(u32 *) RTA_DATA(tb[TCA_TCINDEX_HASH-1]); - } + if (tb[TCA_TCINDEX_HASH]) + cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); - if (tb[TCA_TCINDEX_MASK-1]) { - if (RTA_PAYLOAD(tb[TCA_TCINDEX_MASK-1]) < sizeof(u16)) - goto errout; - cp.mask = *(u16 *) RTA_DATA(tb[TCA_TCINDEX_MASK-1]); - } + if (tb[TCA_TCINDEX_MASK]) + cp.mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); - if (tb[TCA_TCINDEX_SHIFT-1]) { - if (RTA_PAYLOAD(tb[TCA_TCINDEX_SHIFT-1]) < sizeof(u16)) - goto errout; - cp.shift = *(u16 *) RTA_DATA(tb[TCA_TCINDEX_SHIFT-1]); - } + if (tb[TCA_TCINDEX_SHIFT]) + cp.shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); err = -EBUSY; /* Hash already allocated, make sure that we still meet the @@ -264,19 +240,15 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, goto errout; err = -EINVAL; - if (tb[TCA_TCINDEX_FALL_THROUGH-1]) { - if (RTA_PAYLOAD(tb[TCA_TCINDEX_FALL_THROUGH-1]) < sizeof(u32)) - goto errout; - cp.fall_through = - *(u32 *) RTA_DATA(tb[TCA_TCINDEX_FALL_THROUGH-1]); - } + if (tb[TCA_TCINDEX_FALL_THROUGH]) + cp.fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); if (!cp.hash) { /* Hash not specified, use perfect hash if the upper limit * of the hashing index is below the threshold. 
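The tcindex code that follows keeps the existing table-sizing rule: the lookup key is (skb->tc_index & mask) >> shift, so when no hash size is given it uses a perfect table of (mask >> shift) + 1 slots if that stays under PERFECT_HASH_THRESHOLD, and otherwise falls back to DEFAULT_HASH_SIZE chained buckets. A small userspace sketch of that decision, assuming the mainline threshold value of 64, is shown below.

/*
 * Illustrative sketch only, not from the patch: how cls_tcindex sizes
 * its lookup table.  The largest possible key is (mask >> shift).
 */
#include <stdio.h>

#define PERFECT_HASH_THRESHOLD	64	/* assumed, as in mainline */
#define DEFAULT_HASH_SIZE	64	/* optimized for diffserv */

static unsigned int tcindex_table_size(unsigned short mask, int shift)
{
	unsigned int max_key = mask >> shift;

	if (max_key < PERFECT_HASH_THRESHOLD)
		return max_key + 1;	/* perfect hash: one slot per key */
	return DEFAULT_HASH_SIZE;	/* chained hash buckets otherwise */
}

int main(void)
{
	/* mask 0x00f0, shift 4 -> keys 0..15 -> 16-entry perfect table */
	printf("%u\n", tcindex_table_size(0x00f0, 4));
	/* mask 0xffff, shift 0 -> 65536 possible keys -> 64 buckets */
	printf("%u\n", tcindex_table_size(0xffff, 0));
	return 0;
}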
*/ if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD) - cp.hash = (cp.mask >> cp.shift)+1; + cp.hash = (cp.mask >> cp.shift) + 1; else cp.hash = DEFAULT_HASH_SIZE; } @@ -297,16 +269,19 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, err = -ENOMEM; if (!cp.perfect && !cp.h) { if (valid_perfect_hash(&cp)) { - cp.perfect = kmalloc(cp.hash * sizeof(*r), GFP_KERNEL); + int i; + + cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL); if (!cp.perfect) goto errout; - memset(cp.perfect, 0, cp.hash * sizeof(*r)); + for (i = 0; i < cp.hash; i++) + tcf_exts_init(&cp.perfect[i].exts, TCA_TCINDEX_ACT, + TCA_TCINDEX_POLICE); balloc = 1; } else { - cp.h = kmalloc(cp.hash * sizeof(f), GFP_KERNEL); + cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL); if (!cp.h) goto errout; - memset(cp.h, 0, cp.hash * sizeof(f)); balloc = 2; } } @@ -317,25 +292,27 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, r = tcindex_lookup(&cp, handle) ? : &new_filter_result; if (r == &new_filter_result) { - f = kmalloc(sizeof(*f), GFP_KERNEL); + f = kzalloc(sizeof(*f), GFP_KERNEL); if (!f) goto errout_alloc; - memset(f, 0, sizeof(*f)); - } + } - if (tb[TCA_TCINDEX_CLASSID-1]) { - cr.res.classid = *(u32 *) RTA_DATA(tb[TCA_TCINDEX_CLASSID-1]); + if (tb[TCA_TCINDEX_CLASSID]) { + cr.res.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]); tcf_bind_filter(tp, &cr.res, base); - } + } - tcf_exts_change(tp, &cr.exts, &e); + if (old_r) + tcf_exts_change(tp, &r->exts, &e); + else + tcf_exts_change(tp, &cr.exts, &e); tcf_tree_lock(tp); if (old_r && old_r != r) - memset(old_r, 0, sizeof(*old_r)); + tcindex_filter_result_init(old_r); memcpy(p, &cp, sizeof(cp)); - memcpy(r, &cr, sizeof(cr)); + r->res = cr.res; if (r == &new_filter_result) { struct tcindex_filter **fp; @@ -346,7 +323,7 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, for (fp = p->h+(handle % p->hash); *fp; fp = &(*fp)->next) /* nothing */; *fp = f; - } + } tcf_tree_unlock(tp); return 0; @@ -362,35 +339,39 @@ errout: } static int -tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, unsigned long *arg) +tcindex_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, + struct nlattr **tca, unsigned long *arg, bool ovr) { - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_TCINDEX_MAX]; - struct tcindex_data *p = PRIV(tp); + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_TCINDEX_MAX + 1]; + struct tcindex_data *p = tp->root; struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg; + int err; - DPRINTK("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," + pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," "p %p,r %p,*arg 0x%lx\n", tp, handle, tca, arg, opt, p, r, arg ? 
*arg : 0L); if (!opt) return 0; - if (rtattr_parse_nested(tb, TCA_TCINDEX_MAX, opt) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_TCINDEX_MAX, opt, tcindex_policy); + if (err < 0) + return err; - return tcindex_set_parms(tp, base, handle, p, r, tb, tca[TCA_RATE-1]); + return tcindex_set_parms(net, tp, base, handle, p, r, tb, + tca[TCA_RATE], ovr); } static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) { - struct tcindex_data *p = PRIV(tp); - struct tcindex_filter *f,*next; + struct tcindex_data *p = tp->root; + struct tcindex_filter *f, *next; int i; - DPRINTK("tcindex_walk(tp %p,walker %p),p %p\n",tp,walker,p); + pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p); if (p->perfect) { for (i = 0; i < p->hash; i++) { if (!p->perfect[i].res.class) @@ -412,7 +393,7 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) for (f = p->h[i]; f; f = next) { next = f->next; if (walker->count >= walker->skip) { - if (walker->fn(tp,(unsigned long) &f->result, + if (walker->fn(tp, (unsigned long) &f->result, walker) < 0) { walker->stop = 1; return; @@ -433,14 +414,14 @@ static int tcindex_destroy_element(struct tcf_proto *tp, static void tcindex_destroy(struct tcf_proto *tp) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcf_walker walker; - DPRINTK("tcindex_destroy(tp %p),p %p\n",tp,p); + pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); walker.count = 0; walker.skip = 0; walker.fn = &tcindex_destroy_element; - tcindex_walk(tp,&walker); + tcindex_walk(tp, &walker); kfree(p->perfect); kfree(p->h); kfree(p); @@ -448,27 +429,30 @@ static void tcindex_destroy(struct tcf_proto *tp) } -static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, +static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; - unsigned char *b = skb->tail; - struct rtattr *rta; - - DPRINTK("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n", - tp,fh,skb,t,p,r,b); - DPRINTK("p->perfect %p p->h %p\n",p->perfect,p->h); - rta = (struct rtattr *) b; - RTA_PUT(skb,TCA_OPTIONS,0,NULL); + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + + pr_debug("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n", + tp, fh, skb, t, p, r, b); + pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h); + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + if (!fh) { t->tcm_handle = ~0; /* whatever ... 
*/ - RTA_PUT(skb,TCA_TCINDEX_HASH,sizeof(p->hash),&p->hash); - RTA_PUT(skb,TCA_TCINDEX_MASK,sizeof(p->mask),&p->mask); - RTA_PUT(skb,TCA_TCINDEX_SHIFT,sizeof(p->shift),&p->shift); - RTA_PUT(skb,TCA_TCINDEX_FALL_THROUGH,sizeof(p->fall_through), - &p->fall_through); - rta->rta_len = skb->tail-b; + if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) || + nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) || + nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) || + nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through)) + goto nla_put_failure; + nla_nest_end(skb, nest); } else { if (p->perfect) { t->tcm_handle = r-p->perfect; @@ -485,27 +469,27 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, } } } - DPRINTK("handle = %d\n",t->tcm_handle); - if (r->res.class) - RTA_PUT(skb, TCA_TCINDEX_CLASSID, 4, &r->res.classid); + pr_debug("handle = %d\n", t->tcm_handle); + if (r->res.class && + nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid)) + goto nla_put_failure; - if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0) - goto rtattr_failure; - rta->rta_len = skb->tail-b; + if (tcf_exts_dump(skb, &r->exts) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0) - goto rtattr_failure; + if (tcf_exts_dump_stats(skb, &r->exts) < 0) + goto nla_put_failure; } - + return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static struct tcf_proto_ops cls_tcindex_ops = { - .next = NULL, +static struct tcf_proto_ops cls_tcindex_ops __read_mostly = { .kind = "tcindex", .classify = tcindex_classify, .init = tcindex_init, @@ -524,7 +508,7 @@ static int __init init_tcindex(void) return register_tcf_proto_ops(&cls_tcindex_ops); } -static void __exit exit_tcindex(void) +static void __exit exit_tcindex(void) { unregister_tcf_proto_ops(&cls_tcindex_ops); } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 2b670479dde..70c0be8d012 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -30,42 +30,26 @@ * nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro> */ -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/config.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> #include <linux/rtnetlink.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> +#include <linux/bitmap.h> +#include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> -struct tc_u_knode -{ +struct tc_u_knode { struct tc_u_knode *next; u32 handle; struct tc_u_hnode *ht_up; struct tcf_exts exts; #ifdef CONFIG_NET_CLS_IND - char indev[IFNAMSIZ]; + int ifindex; #endif u8 fshift; struct tcf_result res; @@ -79,49 +63,41 @@ struct tc_u_knode struct tc_u32_sel sel; }; -struct tc_u_hnode -{ +struct tc_u_hnode { struct tc_u_hnode *next; u32 handle; u32 prio; struct tc_u_common *tp_c; int refcnt; - unsigned divisor; + unsigned int divisor; struct tc_u_knode *ht[1]; }; -struct tc_u_common -{ - struct tc_u_common *next; +struct tc_u_common { struct tc_u_hnode *hlist; struct Qdisc *q; int refcnt; u32 
hgenerator; }; -static struct tcf_ext_map u32_ext_map = { - .action = TCA_U32_ACT, - .police = TCA_U32_POLICE -}; - -static struct tc_u_common *u32_list; - -static __inline__ unsigned u32_hash_fold(u32 key, struct tc_u32_sel *sel, u8 fshift) +static inline unsigned int u32_hash_fold(__be32 key, + const struct tc_u32_sel *sel, + u8 fshift) { - unsigned h = (key & sel->hmask)>>fshift; + unsigned int h = ntohl(key & sel->hmask) >> fshift; return h; } -static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) +static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct { struct tc_u_knode *knode; - u8 *ptr; + unsigned int off; } stack[TC_U32_MAXDEPTH]; - struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root; - u8 *ptr = skb->nh.raw; + struct tc_u_hnode *ht = tp->root; + unsigned int off = skb_network_offset(skb); struct tc_u_knode *n; int sdepth = 0; int off2 = 0; @@ -139,12 +115,12 @@ next_knode: struct tc_u32_key *key = n->sel.keys; #ifdef CONFIG_CLS_U32_PERF - n->pf->rcnt +=1; + n->pf->rcnt += 1; j = 0; #endif #ifdef CONFIG_CLS_U32_MARK - if ((skb->nfmark & n->mark.mask) != n->mark.val) { + if ((skb->mark & n->mark.mask) != n->mark.val) { n = n->next; goto next_knode; } else { @@ -152,30 +128,38 @@ next_knode: } #endif - for (i = n->sel.nkeys; i>0; i--, key++) { + for (i = n->sel.nkeys; i > 0; i--, key++) { + int toff = off + key->off + (off2 & key->offmask); + __be32 *data, hdata; + + if (skb_headroom(skb) + toff > INT_MAX) + goto out; - if ((*(u32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) { + data = skb_header_pointer(skb, toff, 4, &hdata); + if (!data) + goto out; + if ((*data ^ key->val) & key->mask) { n = n->next; goto next_knode; } #ifdef CONFIG_CLS_U32_PERF - n->pf->kcnts[j] +=1; + n->pf->kcnts[j] += 1; j++; #endif } if (n->ht_down == NULL) { check_terminal: - if (n->sel.flags&TC_U32_TERMINAL) { + if (n->sel.flags & TC_U32_TERMINAL) { *res = n->res; #ifdef CONFIG_NET_CLS_IND - if (!tcf_match_indev(skb, n->indev)) { + if (!tcf_match_indev(skb, n->ifindex)) { n = n->next; goto next_knode; } #endif #ifdef CONFIG_CLS_U32_PERF - n->pf->rhit +=1; + n->pf->rhit += 1; #endif r = tcf_exts_exec(skb, &n->exts, res); if (r < 0) { @@ -193,29 +177,45 @@ check_terminal: if (sdepth >= TC_U32_MAXDEPTH) goto deadloop; stack[sdepth].knode = n; - stack[sdepth].ptr = ptr; + stack[sdepth].off = off; sdepth++; ht = n->ht_down; sel = 0; - if (ht->divisor) - sel = ht->divisor&u32_hash_fold(*(u32*)(ptr+n->sel.hoff), &n->sel,n->fshift); - - if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT))) + if (ht->divisor) { + __be32 *data, hdata; + + data = skb_header_pointer(skb, off + n->sel.hoff, 4, + &hdata); + if (!data) + goto out; + sel = ht->divisor & u32_hash_fold(*data, &n->sel, + n->fshift); + } + if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT))) goto next_ht; - if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) { + if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) { off2 = n->sel.off + 3; - if (n->sel.flags&TC_U32_VAROFFSET) - off2 += ntohs(n->sel.offmask & *(u16*)(ptr+n->sel.offoff)) >>n->sel.offshift; + if (n->sel.flags & TC_U32_VAROFFSET) { + __be16 *data, hdata; + + data = skb_header_pointer(skb, + off + n->sel.offoff, + 2, &hdata); + if (!data) + goto out; + off2 += ntohs(n->sel.offmask & *data) >> + n->sel.offshift; + } off2 &= ~3; } - if (n->sel.flags&TC_U32_EAT) { - ptr += off2; + if (n->sel.flags & TC_U32_EAT) { + off += off2; off2 = 0; } - if (ptr < skb->tail) + if (off 
< skb->len) goto next_ht; } @@ -223,18 +223,18 @@ check_terminal: if (sdepth--) { n = stack[sdepth].knode; ht = n->ht_up; - ptr = stack[sdepth].ptr; + off = stack[sdepth].off; goto check_terminal; } +out: return -1; deadloop: - if (net_ratelimit()) - printk("cls_u32: dead loop\n"); + net_warn_ratelimited("cls_u32: dead loop\n"); return -1; } -static __inline__ struct tc_u_hnode * +static struct tc_u_hnode * u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) { struct tc_u_hnode *ht; @@ -246,10 +246,10 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) return ht; } -static __inline__ struct tc_u_knode * +static struct tc_u_knode * u32_lookup_key(struct tc_u_hnode *ht, u32 handle) { - unsigned sel; + unsigned int sel; struct tc_u_knode *n = NULL; sel = TC_U32_HASH(handle); @@ -294,7 +294,7 @@ static u32 gen_new_htid(struct tc_u_common *tp_c) do { if (++tp_c->hgenerator == 0x7FF) tp_c->hgenerator = 1; - } while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); + } while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0; } @@ -304,30 +304,25 @@ static int u32_init(struct tcf_proto *tp) struct tc_u_hnode *root_ht; struct tc_u_common *tp_c; - for (tp_c = u32_list; tp_c; tp_c = tp_c->next) - if (tp_c->q == tp->q) - break; + tp_c = tp->q->u32_node; - root_ht = kmalloc(sizeof(*root_ht), GFP_KERNEL); + root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL); if (root_ht == NULL) return -ENOBUFS; - memset(root_ht, 0, sizeof(*root_ht)); root_ht->divisor = 0; root_ht->refcnt++; root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000; root_ht->prio = tp->prio; if (tp_c == NULL) { - tp_c = kmalloc(sizeof(*tp_c), GFP_KERNEL); + tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL); if (tp_c == NULL) { kfree(root_ht); return -ENOBUFS; } - memset(tp_c, 0, sizeof(*tp_c)); tp_c->q = tp->q; - tp_c->next = u32_list; - u32_list = tp_c; + tp->q->u32_node = tp_c; } tp_c->refcnt++; @@ -347,14 +342,13 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) if (n->ht_down) n->ht_down->refcnt--; #ifdef CONFIG_CLS_U32_PERF - if (n) - kfree(n->pf); + kfree(n->pf); #endif kfree(n); return 0; } -static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) +static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) { struct tc_u_knode **kp; struct tc_u_hnode *ht = key->ht_up; @@ -371,16 +365,16 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) } } } - BUG_TRAP(0); + WARN_ON(1); return 0; } static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) { struct tc_u_knode *n; - unsigned h; + unsigned int h; - for (h=0; h<=ht->divisor; h++) { + for (h = 0; h <= ht->divisor; h++) { while ((n = ht->ht[h]) != NULL) { ht->ht[h] = n->next; @@ -394,7 +388,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) struct tc_u_common *tp_c = tp->data; struct tc_u_hnode **hn; - BUG_TRAP(!ht->refcnt); + WARN_ON(ht->refcnt); u32_clear_hnode(tp, ht); @@ -406,41 +400,37 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) } } - BUG_TRAP(0); + WARN_ON(1); return -ENOENT; } static void u32_destroy(struct tcf_proto *tp) { struct tc_u_common *tp_c = tp->data; - struct tc_u_hnode *root_ht = xchg(&tp->root, NULL); + struct tc_u_hnode *root_ht = tp->root; - BUG_TRAP(root_ht != NULL); + WARN_ON(root_ht == NULL); if (root_ht && --root_ht->refcnt == 0) u32_destroy_hnode(tp, root_ht); if (--tp_c->refcnt == 0) { struct tc_u_hnode *ht; - struct tc_u_common **tp_cp; - for 
(tp_cp = &u32_list; *tp_cp; tp_cp = &(*tp_cp)->next) { - if (*tp_cp == tp_c) { - *tp_cp = tp_c->next; - break; - } - } + tp->q->u32_node = NULL; - for (ht=tp_c->hlist; ht; ht = ht->next) + for (ht = tp_c->hlist; ht; ht = ht->next) { + ht->refcnt--; u32_clear_hnode(tp, ht); + } while ((ht = tp_c->hlist) != NULL) { tp_c->hlist = ht->next; - BUG_TRAP(ht->refcnt == 0); + WARN_ON(ht->refcnt != 0); kfree(ht); - }; + } kfree(tp_c); } @@ -450,52 +440,75 @@ static void u32_destroy(struct tcf_proto *tp) static int u32_delete(struct tcf_proto *tp, unsigned long arg) { - struct tc_u_hnode *ht = (struct tc_u_hnode*)arg; + struct tc_u_hnode *ht = (struct tc_u_hnode *)arg; if (ht == NULL) return 0; if (TC_U32_KEY(ht->handle)) - return u32_delete_key(tp, (struct tc_u_knode*)ht); + return u32_delete_key(tp, (struct tc_u_knode *)ht); if (tp->root == ht) return -EINVAL; - if (--ht->refcnt == 0) + if (ht->refcnt == 1) { + ht->refcnt--; u32_destroy_hnode(tp, ht); + } else { + return -EBUSY; + } return 0; } +#define NR_U32_NODE (1<<12) static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) { struct tc_u_knode *n; - unsigned i = 0x7FF; + unsigned long i; + unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long), + GFP_KERNEL); + if (!bitmap) + return handle | 0xFFF; + + for (n = ht->ht[TC_U32_HASH(handle)]; n; n = n->next) + set_bit(TC_U32_NODE(n->handle), bitmap); - for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next) - if (i < TC_U32_NODE(n->handle)) - i = TC_U32_NODE(n->handle); - i++; + i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800); + if (i >= NR_U32_NODE) + i = find_next_zero_bit(bitmap, NR_U32_NODE, 1); - return handle|(i>0xFFF ? 0xFFF : i); + kfree(bitmap); + return handle | (i >= NR_U32_NODE ? 0xFFF : i); } -static int u32_set_parms(struct tcf_proto *tp, unsigned long base, - struct tc_u_hnode *ht, - struct tc_u_knode *n, struct rtattr **tb, - struct rtattr *est) +static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { + [TCA_U32_CLASSID] = { .type = NLA_U32 }, + [TCA_U32_HASH] = { .type = NLA_U32 }, + [TCA_U32_LINK] = { .type = NLA_U32 }, + [TCA_U32_DIVISOR] = { .type = NLA_U32 }, + [TCA_U32_SEL] = { .len = sizeof(struct tc_u32_sel) }, + [TCA_U32_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, + [TCA_U32_MARK] = { .len = sizeof(struct tc_u32_mark) }, +}; + +static int u32_set_parms(struct net *net, struct tcf_proto *tp, + unsigned long base, struct tc_u_hnode *ht, + struct tc_u_knode *n, struct nlattr **tb, + struct nlattr *est, bool ovr) { int err; struct tcf_exts e; - err = tcf_exts_validate(tp, tb, est, &e, &u32_ext_map); + tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); if (err < 0) return err; err = -EINVAL; - if (tb[TCA_U32_LINK-1]) { - u32 handle = *(u32*)RTA_DATA(tb[TCA_U32_LINK-1]); - struct tc_u_hnode *ht_down = NULL; + if (tb[TCA_U32_LINK]) { + u32 handle = nla_get_u32(tb[TCA_U32_LINK]); + struct tc_u_hnode *ht_down = NULL, *ht_old; if (TC_U32_KEY(handle)) goto errout; @@ -509,22 +522,25 @@ static int u32_set_parms(struct tcf_proto *tp, unsigned long base, } tcf_tree_lock(tp); - ht_down = xchg(&n->ht_down, ht_down); + ht_old = n->ht_down; + n->ht_down = ht_down; tcf_tree_unlock(tp); - if (ht_down) - ht_down->refcnt--; + if (ht_old) + ht_old->refcnt--; } - if (tb[TCA_U32_CLASSID-1]) { - n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]); + if (tb[TCA_U32_CLASSID]) { + n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]); tcf_bind_filter(tp, &n->res, base); } #ifdef CONFIG_NET_CLS_IND - if 
(tb[TCA_U32_INDEV-1]) { - int err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV-1]); - if (err < 0) + if (tb[TCA_U32_INDEV]) { + int ret; + ret = tcf_change_indev(net, tb[TCA_U32_INDEV]); + if (ret < 0) goto errout; + n->ifindex = ret; } #endif tcf_exts_change(tp, &n->exts, &e); @@ -535,34 +551,38 @@ errout: return err; } -static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, - unsigned long *arg) +static int u32_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, + struct nlattr **tca, + unsigned long *arg, bool ovr) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; struct tc_u_knode *n; struct tc_u32_sel *s; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_U32_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_U32_MAX + 1]; u32 htid; int err; if (opt == NULL) return handle ? -EINVAL : 0; - if (rtattr_parse_nested(tb, TCA_U32_MAX, opt) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy); + if (err < 0) + return err; - if ((n = (struct tc_u_knode*)*arg) != NULL) { + n = (struct tc_u_knode *)*arg; + if (n) { if (TC_U32_KEY(n->handle) == 0) return -EINVAL; - return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE-1]); + return u32_set_parms(net, tp, base, n->ht_up, n, tb, + tca[TCA_RATE], ovr); } - if (tb[TCA_U32_DIVISOR-1]) { - unsigned divisor = *(unsigned*)RTA_DATA(tb[TCA_U32_DIVISOR-1]); + if (tb[TCA_U32_DIVISOR]) { + unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]); if (--divisor > 0x100) return -EINVAL; @@ -573,12 +593,11 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, if (handle == 0) return -ENOMEM; } - ht = kmalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL); + ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL); if (ht == NULL) return -ENOBUFS; - memset(ht, 0, sizeof(*ht) + divisor*sizeof(void*)); ht->tp_c = tp_c; - ht->refcnt = 0; + ht->refcnt = 1; ht->divisor = divisor; ht->handle = handle; ht->prio = tp->prio; @@ -588,8 +607,8 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, return 0; } - if (tb[TCA_U32_HASH-1]) { - htid = *(unsigned*)RTA_DATA(tb[TCA_U32_HASH-1]); + if (tb[TCA_U32_HASH]) { + htid = nla_get_u32(tb[TCA_U32_HASH]); if (TC_U32_HTID(htid) == TC_U32_ROOT) { ht = tp->root; htid = ht->handle; @@ -613,59 +632,40 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, } else handle = gen_new_kid(ht, htid); - if (tb[TCA_U32_SEL-1] == 0 || - RTA_PAYLOAD(tb[TCA_U32_SEL-1]) < sizeof(struct tc_u32_sel)) + if (tb[TCA_U32_SEL] == NULL) return -EINVAL; - s = RTA_DATA(tb[TCA_U32_SEL-1]); + s = nla_data(tb[TCA_U32_SEL]); - n = kmalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); + n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); if (n == NULL) return -ENOBUFS; - memset(n, 0, sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key)); #ifdef CONFIG_CLS_U32_PERF - n->pf = kmalloc(sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64), GFP_KERNEL); + n->pf = kzalloc(sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64), GFP_KERNEL); if (n->pf == NULL) { kfree(n); return -ENOBUFS; } - memset(n->pf, 0, sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64)); #endif memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); n->ht_up = ht; n->handle = handle; -{ - u8 i = 0; - u32 mask = s->hmask; - if (mask) { - while (!(mask & 1)) { - i++; - mask>>=1; - } - } - n->fshift = 
i; -} + n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; + tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); #ifdef CONFIG_CLS_U32_MARK - if (tb[TCA_U32_MARK-1]) { + if (tb[TCA_U32_MARK]) { struct tc_u32_mark *mark; - if (RTA_PAYLOAD(tb[TCA_U32_MARK-1]) < sizeof(struct tc_u32_mark)) { -#ifdef CONFIG_CLS_U32_PERF - kfree(n->pf); -#endif - kfree(n); - return -EINVAL; - } - mark = RTA_DATA(tb[TCA_U32_MARK-1]); + mark = nla_data(tb[TCA_U32_MARK]); memcpy(&n->mark, mark, sizeof(struct tc_u32_mark)); n->mark.success = 0; } #endif - err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE-1]); + err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr); if (err == 0) { struct tc_u_knode **ins; for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) @@ -673,15 +673,15 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, break; n->next = *ins; - wmb(); + tcf_tree_lock(tp); *ins = n; + tcf_tree_unlock(tp); *arg = (unsigned long)n; return 0; } #ifdef CONFIG_CLS_U32_PERF - if (n) - kfree(n->pf); + kfree(n->pf); #endif kfree(n); return err; @@ -692,7 +692,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; struct tc_u_knode *n; - unsigned h; + unsigned int h; if (arg->stop) return; @@ -723,70 +723,82 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int u32_dump(struct tcf_proto *tp, unsigned long fh, +static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct tc_u_knode *n = (struct tc_u_knode*)fh; - unsigned char *b = skb->tail; - struct rtattr *rta; + struct tc_u_knode *n = (struct tc_u_knode *)fh; + struct nlattr *nest; if (n == NULL) return skb->len; t->tcm_handle = n->handle; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; if (TC_U32_KEY(n->handle) == 0) { - struct tc_u_hnode *ht = (struct tc_u_hnode*)fh; - u32 divisor = ht->divisor+1; - RTA_PUT(skb, TCA_U32_DIVISOR, 4, &divisor); + struct tc_u_hnode *ht = (struct tc_u_hnode *)fh; + u32 divisor = ht->divisor + 1; + + if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor)) + goto nla_put_failure; } else { - RTA_PUT(skb, TCA_U32_SEL, - sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), - &n->sel); + if (nla_put(skb, TCA_U32_SEL, + sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), + &n->sel)) + goto nla_put_failure; if (n->ht_up) { u32 htid = n->handle & 0xFFFFF000; - RTA_PUT(skb, TCA_U32_HASH, 4, &htid); + if (nla_put_u32(skb, TCA_U32_HASH, htid)) + goto nla_put_failure; } - if (n->res.classid) - RTA_PUT(skb, TCA_U32_CLASSID, 4, &n->res.classid); - if (n->ht_down) - RTA_PUT(skb, TCA_U32_LINK, 4, &n->ht_down->handle); + if (n->res.classid && + nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid)) + goto nla_put_failure; + if (n->ht_down && + nla_put_u32(skb, TCA_U32_LINK, n->ht_down->handle)) + goto nla_put_failure; #ifdef CONFIG_CLS_U32_MARK - if (n->mark.val || n->mark.mask) - RTA_PUT(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark); + if ((n->mark.val || n->mark.mask) && + nla_put(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark)) + goto nla_put_failure; #endif - if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0) - goto rtattr_failure; + if (tcf_exts_dump(skb, &n->exts) < 0) + goto nla_put_failure; #ifdef CONFIG_NET_CLS_IND - if(strlen(n->indev)) - RTA_PUT(skb, TCA_U32_INDEV, IFNAMSIZ, n->indev); + if (n->ifindex) { + 
struct net_device *dev; + dev = __dev_get_by_index(net, n->ifindex); + if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name)) + goto nla_put_failure; + } #endif #ifdef CONFIG_CLS_U32_PERF - RTA_PUT(skb, TCA_U32_PCNT, - sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), - n->pf); + if (nla_put(skb, TCA_U32_PCNT, + sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), + n->pf)) + goto nla_put_failure; #endif } - rta->rta_len = skb->tail - b; + nla_nest_end(skb, nest); + if (TC_U32_KEY(n->handle)) - if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0) - goto rtattr_failure; + if (tcf_exts_dump_stats(skb, &n->exts) < 0) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } -static struct tcf_proto_ops cls_u32_ops = { - .next = NULL, +static struct tcf_proto_ops cls_u32_ops __read_mostly = { .kind = "u32", .classify = u32_classify, .init = u32_init, @@ -802,23 +814,20 @@ static struct tcf_proto_ops cls_u32_ops = { static int __init init_u32(void) { - printk("u32 classifier\n"); + pr_info("u32 classifier\n"); #ifdef CONFIG_CLS_U32_PERF - printk(" Perfomance counters on\n"); -#endif -#ifdef CONFIG_NET_CLS_POLICE - printk(" OLD policer on \n"); + pr_info(" Performance counters on\n"); #endif #ifdef CONFIG_NET_CLS_IND - printk(" input device check on \n"); + pr_info(" input device check on\n"); #endif #ifdef CONFIG_NET_CLS_ACT - printk(" Actions configured \n"); + pr_info(" Actions configured\n"); #endif return register_tcf_proto_ops(&cls_u32_ops); } -static void __exit exit_u32(void) +static void __exit exit_u32(void) { unregister_tcf_proto_ops(&cls_u32_ops); } diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c new file mode 100644 index 00000000000..bfd34e4c1af --- /dev/null +++ b/net/sched/em_canid.c @@ -0,0 +1,240 @@ +/* + * em_canid.c Ematch rule to match CAN frames according to their CAN IDs + * + * This program is free software; you can distribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Idea: Oliver Hartkopp <oliver.hartkopp@volkswagen.de> + * Copyright: (c) 2011 Czech Technical University in Prague + * (c) 2011 Volkswagen Group Research + * Authors: Michal Sojka <sojkam1@fel.cvut.cz> + * Pavel Pisa <pisa@cmp.felk.cvut.cz> + * Rostislav Lisovy <lisovy@gmail.cz> + * Funded by: Volkswagen Group Research + */ + +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <net/pkt_cls.h> +#include <linux/can.h> + +#define EM_CAN_RULES_MAX 500 + +struct canid_match { + /* For each SFF CAN ID (11 bit) there is one record in this bitfield */ + DECLARE_BITMAP(match_sff, (1 << CAN_SFF_ID_BITS)); + + int rules_count; + int sff_rules_count; + int eff_rules_count; + + /* + * Raw rules copied from netlink message; Used for sending + * information to userspace (when 'tc filter show' is invoked) + * AND when matching EFF frames + */ + struct can_filter rules_raw[]; +}; + +/** + * em_canid_get_id() - Extracts Can ID out of the sk_buff structure. 
+ */ +static canid_t em_canid_get_id(struct sk_buff *skb) +{ + /* CAN ID is stored within the data field */ + struct can_frame *cf = (struct can_frame *)skb->data; + + return cf->can_id; +} + +static void em_canid_sff_match_add(struct canid_match *cm, u32 can_id, + u32 can_mask) +{ + int i; + + /* + * Limit can_mask and can_id to SFF range to + * protect against write after end of array + */ + can_mask &= CAN_SFF_MASK; + can_id &= can_mask; + + /* Single frame */ + if (can_mask == CAN_SFF_MASK) { + set_bit(can_id, cm->match_sff); + return; + } + + /* All frames */ + if (can_mask == 0) { + bitmap_fill(cm->match_sff, (1 << CAN_SFF_ID_BITS)); + return; + } + + /* + * Individual frame filter. + * Add record (set bit to 1) for each ID that + * conforms particular rule + */ + for (i = 0; i < (1 << CAN_SFF_ID_BITS); i++) { + if ((i & can_mask) == can_id) + set_bit(i, cm->match_sff); + } +} + +static inline struct canid_match *em_canid_priv(struct tcf_ematch *m) +{ + return (struct canid_match *)m->data; +} + +static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m, + struct tcf_pkt_info *info) +{ + struct canid_match *cm = em_canid_priv(m); + canid_t can_id; + int match = 0; + int i; + const struct can_filter *lp; + + can_id = em_canid_get_id(skb); + + if (can_id & CAN_EFF_FLAG) { + for (i = 0, lp = cm->rules_raw; + i < cm->eff_rules_count; i++, lp++) { + if (!(((lp->can_id ^ can_id) & lp->can_mask))) { + match = 1; + break; + } + } + } else { /* SFF */ + can_id &= CAN_SFF_MASK; + match = (test_bit(can_id, cm->match_sff) ? 1 : 0); + } + + return match; +} + +static int em_canid_change(struct tcf_proto *tp, void *data, int len, + struct tcf_ematch *m) +{ + struct can_filter *conf = data; /* Array with rules */ + struct canid_match *cm; + struct canid_match *cm_old = (struct canid_match *)m->data; + int i; + + if (!len) + return -EINVAL; + + if (len % sizeof(struct can_filter)) + return -EINVAL; + + if (len > sizeof(struct can_filter) * EM_CAN_RULES_MAX) + return -EINVAL; + + cm = kzalloc(sizeof(struct canid_match) + len, GFP_KERNEL); + if (!cm) + return -ENOMEM; + + cm->rules_count = len / sizeof(struct can_filter); + + /* + * We need two for() loops for copying rules into two contiguous + * areas in rules_raw to process all eff rules with a simple loop. + * NB: The configuration interface supports sff and eff rules. + * We do not support filters here that match for the same can_id + * provided in a SFF and EFF frame (e.g. 0x123 / 0x80000123). + * For this (unusual case) two filters have to be specified. The + * SFF/EFF separation is done with the CAN_EFF_FLAG in the can_id. 
+ */ + + /* Fill rules_raw with EFF rules first */ + for (i = 0; i < cm->rules_count; i++) { + if (conf[i].can_id & CAN_EFF_FLAG) { + memcpy(cm->rules_raw + cm->eff_rules_count, + &conf[i], + sizeof(struct can_filter)); + + cm->eff_rules_count++; + } + } + + /* append SFF frame rules */ + for (i = 0; i < cm->rules_count; i++) { + if (!(conf[i].can_id & CAN_EFF_FLAG)) { + memcpy(cm->rules_raw + + cm->eff_rules_count + + cm->sff_rules_count, + &conf[i], sizeof(struct can_filter)); + + cm->sff_rules_count++; + + em_canid_sff_match_add(cm, + conf[i].can_id, conf[i].can_mask); + } + } + + m->datalen = sizeof(struct canid_match) + len; + m->data = (unsigned long)cm; + + if (cm_old != NULL) { + pr_err("canid: Configuring an existing ematch!\n"); + kfree(cm_old); + } + + return 0; +} + +static void em_canid_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +{ + struct canid_match *cm = em_canid_priv(m); + + kfree(cm); +} + +static int em_canid_dump(struct sk_buff *skb, struct tcf_ematch *m) +{ + struct canid_match *cm = em_canid_priv(m); + + /* + * When configuring this ematch 'rules_count' is set not to exceed + * 'rules_raw' array size + */ + if (nla_put_nohdr(skb, sizeof(struct can_filter) * cm->rules_count, + &cm->rules_raw) < 0) + return -EMSGSIZE; + + return 0; +} + +static struct tcf_ematch_ops em_canid_ops = { + .kind = TCF_EM_CANID, + .change = em_canid_change, + .match = em_canid_match, + .destroy = em_canid_destroy, + .dump = em_canid_dump, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_canid_ops.link) +}; + +static int __init init_em_canid(void) +{ + return tcf_em_register(&em_canid_ops); +} + +static void __exit exit_em_canid(void) +{ + tcf_em_unregister(&em_canid_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_canid); +module_exit(exit_em_canid); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_CANID); diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c index bf1f00f8b1b..1c8360a2752 100644 --- a/net/sched/em_cmp.c +++ b/net/sched/em_cmp.c @@ -9,12 +9,12 @@ * Authors: Thomas Graf <tgraf@suug.ch> */ -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/tc_ematch/tc_em_cmp.h> +#include <asm/unaligned.h> #include <net/pkt_cls.h> static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp) @@ -33,44 +33,41 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em, return 0; switch (cmp->align) { - case TCF_EM_ALIGN_U8: - val = *ptr; - break; - - case TCF_EM_ALIGN_U16: - val = *ptr << 8; - val |= *(ptr+1); - - if (cmp_needs_transformation(cmp)) - val = be16_to_cpu(val); - break; - - case TCF_EM_ALIGN_U32: - /* Worth checking boundries? The branching seems - * to get worse. Visit again. */ - val = *ptr << 24; - val |= *(ptr+1) << 16; - val |= *(ptr+2) << 8; - val |= *(ptr+3); - - if (cmp_needs_transformation(cmp)) - val = be32_to_cpu(val); - break; - - default: - return 0; + case TCF_EM_ALIGN_U8: + val = *ptr; + break; + + case TCF_EM_ALIGN_U16: + val = get_unaligned_be16(ptr); + + if (cmp_needs_transformation(cmp)) + val = be16_to_cpu(val); + break; + + case TCF_EM_ALIGN_U32: + /* Worth checking boundries? The branching seems + * to get worse. Visit again. 
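[Editor's illustration] Stepping back to the em_canid classifier above: SFF rules are pre-expanded into a per-ID bitmap at configuration time so that matching a standard frame is a single bit test, while EFF rules are kept verbatim and checked with a mask compare. Below is a minimal stand-alone sketch of that split, written as ordinary user-space C; the constants and a plain uint32_t array stand in for CAN_SFF_MASK/CAN_EFF_FLAG and the kernel bitmap helpers, and all names are illustrative rather than taken from the patch.

#include <stdint.h>
#include <stdio.h>

#define SFF_ID_BITS   11
#define SFF_MASK      0x7FFU          /* mirrors CAN_SFF_MASK */
#define EFF_FLAG      0x80000000U     /* mirrors CAN_EFF_FLAG */

struct rule { uint32_t id, mask; };

static uint32_t sff_map[(1 << SFF_ID_BITS) / 32]; /* one bit per SFF ID */

/* Pre-expand one SFF rule: set a bit for every ID that satisfies it. */
static void sff_rule_add(uint32_t id, uint32_t mask)
{
	mask &= SFF_MASK;
	id &= mask;
	for (uint32_t i = 0; i < (1 << SFF_ID_BITS); i++)
		if ((i & mask) == id)
			sff_map[i / 32] |= 1U << (i % 32);
}

/* Match: bitmap lookup for SFF frames, linear mask compare for EFF. */
static int canid_match(uint32_t can_id, const struct rule *eff, int n_eff)
{
	if (can_id & EFF_FLAG) {
		for (int i = 0; i < n_eff; i++)
			if (!((eff[i].id ^ can_id) & eff[i].mask))
				return 1;
		return 0;
	}
	can_id &= SFF_MASK;
	return (sff_map[can_id / 32] >> (can_id % 32)) & 1;
}

int main(void)
{
	struct rule eff[] = { { 0x80000123U, 0x1FFFFFFFU | EFF_FLAG } };

	sff_rule_add(0x120, 0x7F0);	/* accept SFF IDs 0x120..0x12f */

	printf("SFF 0x124 -> %d\n", canid_match(0x124, eff, 1));       /* 1 */
	printf("SFF 0x200 -> %d\n", canid_match(0x200, eff, 1));       /* 0 */
	printf("EFF 0x123 -> %d\n", canid_match(0x80000123U, eff, 1)); /* 1 */
	return 0;
}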
+ */ + val = get_unaligned_be32(ptr); + + if (cmp_needs_transformation(cmp)) + val = be32_to_cpu(val); + break; + + default: + return 0; } if (cmp->mask) val &= cmp->mask; switch (cmp->opnd) { - case TCF_EM_OPND_EQ: - return val == cmp->val; - case TCF_EM_OPND_LT: - return val < cmp->val; - case TCF_EM_OPND_GT: - return val > cmp->val; + case TCF_EM_OPND_EQ: + return val == cmp->val; + case TCF_EM_OPND_LT: + return val < cmp->val; + case TCF_EM_OPND_GT: + return val > cmp->val; } return 0; @@ -89,7 +86,7 @@ static int __init init_em_cmp(void) return tcf_em_register(&em_cmp_ops); } -static void __exit exit_em_cmp(void) +static void __exit exit_em_cmp(void) { tcf_em_unregister(&em_cmp_ops); } @@ -99,3 +96,4 @@ MODULE_LICENSE("GPL"); module_init(init_em_cmp); module_exit(exit_em_cmp); +MODULE_ALIAS_TCF_EMATCH(TCF_EM_CMP); diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c new file mode 100644 index 00000000000..527aeb7a3ff --- /dev/null +++ b/net/sched/em_ipset.c @@ -0,0 +1,136 @@ +/* + * net/sched/em_ipset.c ipset ematch + * + * Copyright (c) 2012 Florian Westphal <fw@strlen.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include <linux/gfp.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/netfilter/xt_set.h> +#include <linux/ipv6.h> +#include <net/ip.h> +#include <net/pkt_cls.h> + +static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len, + struct tcf_ematch *em) +{ + struct xt_set_info *set = data; + ip_set_id_t index; + struct net *net = dev_net(qdisc_dev(tp->q)); + + if (data_len != sizeof(*set)) + return -EINVAL; + + index = ip_set_nfnl_get_byindex(net, set->index); + if (index == IPSET_INVALID_ID) + return -ENOENT; + + em->datalen = sizeof(*set); + em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL); + if (em->data) + return 0; + + ip_set_nfnl_put(net, index); + return -ENOMEM; +} + +static void em_ipset_destroy(struct tcf_proto *p, struct tcf_ematch *em) +{ + const struct xt_set_info *set = (const void *) em->data; + if (set) { + ip_set_nfnl_put(dev_net(qdisc_dev(p->q)), set->index); + kfree((void *) em->data); + } +} + +static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + struct ip_set_adt_opt opt; + struct xt_action_param acpar; + const struct xt_set_info *set = (const void *) em->data; + struct net_device *dev, *indev = NULL; + int ret, network_offset; + + switch (skb->protocol) { + case htons(ETH_P_IP): + acpar.family = NFPROTO_IPV4; + if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) + return 0; + acpar.thoff = ip_hdrlen(skb); + break; + case htons(ETH_P_IPV6): + acpar.family = NFPROTO_IPV6; + if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) + return 0; + /* doesn't call ipv6_find_hdr() because ipset doesn't use thoff, yet */ + acpar.thoff = sizeof(struct ipv6hdr); + break; + default: + return 0; + } + + acpar.hooknum = 0; + + opt.family = acpar.family; + opt.dim = set->dim; + opt.flags = set->flags; + opt.cmdflags = 0; + opt.ext.timeout = ~0u; + + network_offset = skb_network_offset(skb); + skb_pull(skb, network_offset); + + dev = skb->dev; + + rcu_read_lock(); + + if (dev && skb->skb_iif) + indev = dev_get_by_index_rcu(dev_net(dev), skb->skb_iif); + + acpar.in = indev ? 
indev : dev; + acpar.out = dev; + + ret = ip_set_test(set->index, skb, &acpar, &opt); + + rcu_read_unlock(); + + skb_push(skb, network_offset); + return ret; +} + +static struct tcf_ematch_ops em_ipset_ops = { + .kind = TCF_EM_IPSET, + .change = em_ipset_change, + .destroy = em_ipset_destroy, + .match = em_ipset_match, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_ipset_ops.link) +}; + +static int __init init_em_ipset(void) +{ + return tcf_em_register(&em_ipset_ops); +} + +static void __exit exit_em_ipset(void) +{ + tcf_em_unregister(&em_ipset_ops); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_DESCRIPTION("TC extended match for IP sets"); + +module_init(init_em_ipset); +module_exit(exit_em_ipset); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_IPSET); diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 700844d49d7..9b8c0b0e60d 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -9,7 +9,7 @@ * Authors: Thomas Graf <tgraf@suug.ch> * * ========================================================================== - * + * * The metadata ematch compares two meta objects where each object * represents either a meta value stored in the kernel or a static * value provided by userspace. The objects are not provided by @@ -47,7 +47,7 @@ * on the meta type. Obviously, the length of the data must also * be provided for non-numeric types. * - * Additionaly, type dependant modifiers such as shift operators + * Additionally, type dependent modifiers such as shift operators * or mask may be applied to extend the functionaliy. As of now, * the variable length type supports shifting the byte string to * the right, eating up any number of octets and thus supporting @@ -55,10 +55,10 @@ * ppp0..9. * * NOTE: Certain meta values depend on other subsystems and are - * only available if that subsytem is enabled in the kernel. + * only available if that subsystem is enabled in the kernel. 
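[Editor's illustration] The em_meta description above mentions type-dependent modifiers (a shift and a mask) that are applied to a collected value before the left and right objects are compared with EQ/LT/GT. The stand-alone sketch below shows that idea for integer values, under the assumption that the shift is a right shift and the mask a bitwise AND, which is how the integer apply_extras helper in this file appears to treat them; the struct and function names here are invented for the example.

#include <stdint.h>
#include <stdio.h>

/* A collected meta value plus its configured modifiers (illustrative). */
struct meta_int {
	unsigned long value;   /* value collected from the kernel object  */
	unsigned int  shift;   /* optional right shift                    */
	unsigned long mask;    /* optional bitwise AND mask (0 = unused)  */
};

/* Apply the modifiers described in the em_meta header comment. */
static unsigned long meta_apply(const struct meta_int *m)
{
	unsigned long v = m->value;

	if (m->shift)
		v >>= m->shift;
	if (m->mask)
		v &= m->mask;
	return v;
}

/* EQ / LT / GT comparison of the two (modified) sides. */
enum opnd { OP_EQ, OP_LT, OP_GT };

static int meta_compare(const struct meta_int *l, const struct meta_int *r,
			enum opnd op)
{
	unsigned long lv = meta_apply(l), rv = meta_apply(r);

	switch (op) {
	case OP_EQ: return lv == rv;
	case OP_LT: return lv <  rv;
	case OP_GT: return lv >  rv;
	}
	return 0;
}

int main(void)
{
	/* e.g. compare the upper byte of a 16-bit field against 0x12 */
	struct meta_int lhs = { .value = 0x1234, .shift = 8, .mask = 0xff };
	struct meta_int rhs = { .value = 0x12 };

	printf("match: %d\n", meta_compare(&lhs, &rhs, OP_EQ)); /* 1 */
	return 0;
}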
*/ -#include <linux/config.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> @@ -66,27 +66,25 @@ #include <linux/string.h> #include <linux/skbuff.h> #include <linux/random.h> +#include <linux/if_vlan.h> #include <linux/tc_ematch/tc_em_meta.h> #include <net/dst.h> #include <net/route.h> #include <net/pkt_cls.h> #include <net/sock.h> -struct meta_obj -{ +struct meta_obj { unsigned long value; unsigned int len; }; -struct meta_value -{ +struct meta_value { struct tcf_meta_val hdr; unsigned long val; unsigned int len; }; -struct meta_match -{ +struct meta_match { struct meta_value lvalue; struct meta_value rvalue; }; @@ -171,6 +169,23 @@ META_COLLECTOR(var_dev) } /************************************************************************** + * vlan tag + **************************************************************************/ + +META_COLLECTOR(int_vlan_tag) +{ + unsigned short tag; + + tag = vlan_tx_tag_get(skb); + if (!tag && __vlan_get_tag(skb, &tag)) + *err = -1; + else + dst->value = tag; +} + + + +/************************************************************************** * skb attributes **************************************************************************/ @@ -205,17 +220,18 @@ META_COLLECTOR(int_maclen) dst->value = skb->mac_len; } +META_COLLECTOR(int_rxhash) +{ + dst->value = skb_get_hash(skb); +} + /************************************************************************** * Netfilter **************************************************************************/ -META_COLLECTOR(int_nfmark) +META_COLLECTOR(int_mark) { -#ifdef CONFIG_NETFILTER - dst->value = skb->nfmark; -#else - dst->value = 0; -#endif + dst->value = skb->mark; } /************************************************************************** @@ -233,11 +249,11 @@ META_COLLECTOR(int_tcindex) META_COLLECTOR(int_rtclassid) { - if (unlikely(skb->dst == NULL)) + if (unlikely(skb_dst(skb) == NULL)) *err = -1; else -#ifdef CONFIG_NET_CLS_ROUTE - dst->value = skb->dst->tclassid; +#ifdef CONFIG_IP_ROUTE_CLASSID + dst->value = skb_dst(skb)->tclassid; #else dst->value = 0; #endif @@ -245,217 +261,299 @@ META_COLLECTOR(int_rtclassid) META_COLLECTOR(int_rtiif) { - if (unlikely(skb->dst == NULL)) + if (unlikely(skb_rtable(skb) == NULL)) *err = -1; else - dst->value = ((struct rtable*) skb->dst)->fl.iif; + dst->value = inet_iif(skb); } /************************************************************************** * Socket Attributes **************************************************************************/ -#define SKIP_NONLOCAL(skb) \ - if (unlikely(skb->sk == NULL)) { \ - *err = -1; \ - return; \ - } +#define skip_nonlocal(skb) \ + (unlikely(skb->sk == NULL)) META_COLLECTOR(int_sk_family) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_family; } META_COLLECTOR(int_sk_state) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_state; } META_COLLECTOR(int_sk_reuse) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_reuse; } META_COLLECTOR(int_sk_bound_if) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } /* No error if bound_dev_if is 0, legal userspace check */ dst->value = skb->sk->sk_bound_dev_if; } META_COLLECTOR(var_sk_bound_if) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } - if (skb->sk->sk_bound_dev_if == 0) { + if (skb->sk->sk_bound_dev_if == 0) { 
dst->value = (unsigned long) "any"; dst->len = 3; - } else { + } else { struct net_device *dev; - - dev = dev_get_by_index(skb->sk->sk_bound_dev_if); + + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(skb->sk), + skb->sk->sk_bound_dev_if); *err = var_dev(dev, dst); - if (dev) - dev_put(dev); - } + rcu_read_unlock(); + } } META_COLLECTOR(int_sk_refcnt) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = atomic_read(&skb->sk->sk_refcnt); } META_COLLECTOR(int_sk_rcvbuf) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_rcvbuf; } META_COLLECTOR(int_sk_shutdown) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_shutdown; } META_COLLECTOR(int_sk_proto) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_protocol; } META_COLLECTOR(int_sk_type) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_type; } META_COLLECTOR(int_sk_rmem_alloc) { - SKIP_NONLOCAL(skb); - dst->value = atomic_read(&skb->sk->sk_rmem_alloc); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } + dst->value = sk_rmem_alloc_get(skb->sk); } META_COLLECTOR(int_sk_wmem_alloc) { - SKIP_NONLOCAL(skb); - dst->value = atomic_read(&skb->sk->sk_wmem_alloc); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } + dst->value = sk_wmem_alloc_get(skb->sk); } META_COLLECTOR(int_sk_omem_alloc) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = atomic_read(&skb->sk->sk_omem_alloc); } META_COLLECTOR(int_sk_rcv_qlen) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_receive_queue.qlen; } META_COLLECTOR(int_sk_snd_qlen) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_write_queue.qlen; } META_COLLECTOR(int_sk_wmem_queued) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_wmem_queued; } META_COLLECTOR(int_sk_fwd_alloc) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_forward_alloc; } META_COLLECTOR(int_sk_sndbuf) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_sndbuf; } META_COLLECTOR(int_sk_alloc) { - SKIP_NONLOCAL(skb); - dst->value = skb->sk->sk_allocation; -} - -META_COLLECTOR(int_sk_route_caps) -{ - SKIP_NONLOCAL(skb); - dst->value = skb->sk->sk_route_caps; + if (skip_nonlocal(skb)) { + *err = -1; + return; + } + dst->value = (__force int) skb->sk->sk_allocation; } META_COLLECTOR(int_sk_hash) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_hash; } META_COLLECTOR(int_sk_lingertime) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_lingertime / HZ; } META_COLLECTOR(int_sk_err_qlen) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_error_queue.qlen; } META_COLLECTOR(int_sk_ack_bl) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_ack_backlog; } META_COLLECTOR(int_sk_max_ack_bl) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_max_ack_backlog; } META_COLLECTOR(int_sk_prio) { - SKIP_NONLOCAL(skb); + if 
(skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_priority; } META_COLLECTOR(int_sk_rcvlowat) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_rcvlowat; } META_COLLECTOR(int_sk_rcvtimeo) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_rcvtimeo / HZ; } META_COLLECTOR(int_sk_sndtimeo) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_sndtimeo / HZ; } META_COLLECTOR(int_sk_sendmsg_off) { - SKIP_NONLOCAL(skb); - dst->value = skb->sk->sk_sndmsg_off; + if (skip_nonlocal(skb)) { + *err = -1; + return; + } + dst->value = skb->sk->sk_frag.offset; } META_COLLECTOR(int_sk_write_pend) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_write_pending; } @@ -463,8 +561,7 @@ META_COLLECTOR(int_sk_write_pend) * Meta value collectors assignment table **************************************************************************/ -struct meta_ops -{ +struct meta_ops { void (*get)(struct sk_buff *, struct tcf_pkt_info *, struct meta_value *, struct meta_obj *, int *); }; @@ -474,7 +571,7 @@ struct meta_ops /* Meta value operations table listing all meta value collectors and * assigns them to a type and meta id. */ -static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { +static struct meta_ops __meta_ops[TCF_META_TYPE_MAX + 1][TCF_META_ID_MAX + 1] = { [TCF_META_TYPE_VAR] = { [META_ID(DEV)] = META_FUNC(var_dev), [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if), @@ -491,7 +588,7 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(PKTLEN)] = META_FUNC(int_pktlen), [META_ID(DATALEN)] = META_FUNC(int_datalen), [META_ID(MACLEN)] = META_FUNC(int_maclen), - [META_ID(NFMARK)] = META_FUNC(int_nfmark), + [META_ID(NFMARK)] = META_FUNC(int_mark), [META_ID(TCINDEX)] = META_FUNC(int_tcindex), [META_ID(RTCLASSID)] = META_FUNC(int_rtclassid), [META_ID(RTIIF)] = META_FUNC(int_rtiif), @@ -514,7 +611,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(SK_ERR_QLEN)] = META_FUNC(int_sk_err_qlen), [META_ID(SK_FORWARD_ALLOCS)] = META_FUNC(int_sk_fwd_alloc), [META_ID(SK_ALLOCS)] = META_FUNC(int_sk_alloc), - [META_ID(SK_ROUTE_CAPS)] = META_FUNC(int_sk_route_caps), [META_ID(SK_HASH)] = META_FUNC(int_sk_hash), [META_ID(SK_LINGERTIME)] = META_FUNC(int_sk_lingertime), [META_ID(SK_ACK_BACKLOG)] = META_FUNC(int_sk_ack_bl), @@ -525,10 +621,12 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(SK_SNDTIMEO)] = META_FUNC(int_sk_sndtimeo), [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off), [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend), + [META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag), + [META_ID(RXHASH)] = META_FUNC(int_rxhash), } }; -static inline struct meta_ops * meta_ops(struct meta_value *val) +static inline struct meta_ops *meta_ops(struct meta_value *val) { return &__meta_ops[meta_type(val)][meta_id(val)]; } @@ -547,14 +645,13 @@ static int meta_var_compare(struct meta_obj *a, struct meta_obj *b) return r; } -static int meta_var_change(struct meta_value *dst, struct rtattr *rta) +static int meta_var_change(struct meta_value *dst, struct nlattr *nla) { - int len = RTA_PAYLOAD(rta); + int len = nla_len(nla); - dst->val = (unsigned long) kmalloc(len, GFP_KERNEL); + dst->val = (unsigned long)kmemdup(nla_data(nla), len, GFP_KERNEL); if (dst->val == 
0UL) return -ENOMEM; - memcpy((void *) dst->val, RTA_DATA(rta), len); dst->len = len; return 0; } @@ -575,11 +672,12 @@ static void meta_var_apply_extras(struct meta_value *v, static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv) { - if (v->val && v->len) - RTA_PUT(skb, tlv, v->len, (void *) v->val); + if (v->val && v->len && + nla_put(skb, tlv, v->len, (void *) v->val)) + goto nla_put_failure; return 0; -rtattr_failure: +nla_put_failure: return -1; } @@ -600,13 +698,13 @@ static int meta_int_compare(struct meta_obj *a, struct meta_obj *b) return 1; } -static int meta_int_change(struct meta_value *dst, struct rtattr *rta) +static int meta_int_change(struct meta_value *dst, struct nlattr *nla) { - if (RTA_PAYLOAD(rta) >= sizeof(unsigned long)) { - dst->val = *(unsigned long *) RTA_DATA(rta); + if (nla_len(nla) >= sizeof(unsigned long)) { + dst->val = *(unsigned long *) nla_data(nla); dst->len = sizeof(unsigned long); - } else if (RTA_PAYLOAD(rta) == sizeof(u32)) { - dst->val = *(u32 *) RTA_DATA(rta); + } else if (nla_len(nla) == sizeof(u32)) { + dst->val = nla_get_u32(nla); dst->len = sizeof(u32); } else return -EINVAL; @@ -626,16 +724,17 @@ static void meta_int_apply_extras(struct meta_value *v, static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv) { - if (v->len == sizeof(unsigned long)) - RTA_PUT(skb, tlv, sizeof(unsigned long), &v->val); - else if (v->len == sizeof(u32)) { - u32 d = v->val; - RTA_PUT(skb, tlv, sizeof(d), &d); + if (v->len == sizeof(unsigned long)) { + if (nla_put(skb, tlv, sizeof(unsigned long), &v->val)) + goto nla_put_failure; + } else if (v->len == sizeof(u32)) { + if (nla_put_u32(skb, tlv, v->val)) + goto nla_put_failure; } return 0; -rtattr_failure: +nla_put_failure: return -1; } @@ -643,16 +742,15 @@ rtattr_failure: * Type specific operations table **************************************************************************/ -struct meta_type_ops -{ +struct meta_type_ops { void (*destroy)(struct meta_value *); int (*compare)(struct meta_obj *, struct meta_obj *); - int (*change)(struct meta_value *, struct rtattr *); + int (*change)(struct meta_value *, struct nlattr *); void (*apply_extras)(struct meta_value *, struct meta_obj *); int (*dump)(struct sk_buff *, struct meta_value *, int); }; -static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = { +static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = { [TCF_META_TYPE_VAR] = { .destroy = meta_var_destroy, .compare = meta_var_compare, @@ -668,7 +766,7 @@ static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = { } }; -static inline struct meta_type_ops * meta_type_ops(struct meta_value *v) +static inline struct meta_type_ops *meta_type_ops(struct meta_value *v) { return &__meta_type_ops[meta_type(v)]; } @@ -677,8 +775,8 @@ static inline struct meta_type_ops * meta_type_ops(struct meta_value *v) * Core **************************************************************************/ -static inline int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info, - struct meta_value *v, struct meta_obj *dst) +static int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info, + struct meta_value *v, struct meta_obj *dst) { int err = 0; @@ -693,7 +791,7 @@ static inline int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info, return err; if (meta_type_ops(v)->apply_extras) - meta_type_ops(v)->apply_extras(v, dst); + meta_type_ops(v)->apply_extras(v, dst); return 0; } @@ -712,36 +810,38 @@ static int em_meta_match(struct sk_buff *skb, struct 
tcf_ematch *m, r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value); switch (meta->lvalue.hdr.op) { - case TCF_EM_OPND_EQ: - return !r; - case TCF_EM_OPND_LT: - return r < 0; - case TCF_EM_OPND_GT: - return r > 0; + case TCF_EM_OPND_EQ: + return !r; + case TCF_EM_OPND_LT: + return r < 0; + case TCF_EM_OPND_GT: + return r > 0; } return 0; } -static inline void meta_delete(struct meta_match *meta) +static void meta_delete(struct meta_match *meta) { - struct meta_type_ops *ops = meta_type_ops(&meta->lvalue); + if (meta) { + struct meta_type_ops *ops = meta_type_ops(&meta->lvalue); - if (ops && ops->destroy) { - ops->destroy(&meta->lvalue); - ops->destroy(&meta->rvalue); + if (ops && ops->destroy) { + ops->destroy(&meta->lvalue); + ops->destroy(&meta->rvalue); + } } kfree(meta); } -static inline int meta_change_data(struct meta_value *dst, struct rtattr *rta) +static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla) { - if (rta) { - if (RTA_PAYLOAD(rta) == 0) + if (nla) { + if (nla_len(nla) == 0) return -EINVAL; - return meta_type_ops(dst)->change(dst, rta); + return meta_type_ops(dst)->change(dst, nla); } return 0; @@ -749,24 +849,29 @@ static inline int meta_change_data(struct meta_value *dst, struct rtattr *rta) static inline int meta_is_supported(struct meta_value *val) { - return (!meta_id(val) || meta_ops(val)->get); + return !meta_id(val) || meta_ops(val)->get; } +static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = { + [TCA_EM_META_HDR] = { .len = sizeof(struct tcf_meta_hdr) }, +}; + static int em_meta_change(struct tcf_proto *tp, void *data, int len, struct tcf_ematch *m) { - int err = -EINVAL; - struct rtattr *tb[TCA_EM_META_MAX]; + int err; + struct nlattr *tb[TCA_EM_META_MAX + 1]; struct tcf_meta_hdr *hdr; struct meta_match *meta = NULL; - - if (rtattr_parse(tb, TCA_EM_META_MAX, data, len) < 0) + + err = nla_parse(tb, TCA_EM_META_MAX, data, len, meta_policy); + if (err < 0) goto errout; - if (tb[TCA_EM_META_HDR-1] == NULL || - RTA_PAYLOAD(tb[TCA_EM_META_HDR-1]) < sizeof(*hdr)) + err = -EINVAL; + if (tb[TCA_EM_META_HDR] == NULL) goto errout; - hdr = RTA_DATA(tb[TCA_EM_META_HDR-1]); + hdr = nla_data(tb[TCA_EM_META_HDR]); if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) || TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX || @@ -774,10 +879,11 @@ static int em_meta_change(struct tcf_proto *tp, void *data, int len, TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX) goto errout; - meta = kmalloc(sizeof(*meta), GFP_KERNEL); - if (meta == NULL) + meta = kzalloc(sizeof(*meta), GFP_KERNEL); + if (meta == NULL) { + err = -ENOMEM; goto errout; - memset(meta, 0, sizeof(*meta)); + } memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left)); memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right)); @@ -788,8 +894,8 @@ static int em_meta_change(struct tcf_proto *tp, void *data, int len, goto errout; } - if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE-1]) < 0 || - meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE-1]) < 0) + if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE]) < 0 || + meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE]) < 0) goto errout; m->datalen = sizeof(*meta); @@ -818,18 +924,19 @@ static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em) memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left)); memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right)); - RTA_PUT(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr); + if (nla_put(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr)) + goto nla_put_failure; 
ops = meta_type_ops(&meta->lvalue); if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 || ops->dump(skb, &meta->rvalue, TCA_EM_META_RVALUE) < 0) - goto rtattr_failure; + goto nla_put_failure; return 0; -rtattr_failure: +nla_put_failure: return -1; -} +} static struct tcf_ematch_ops em_meta_ops = { .kind = TCF_EM_META, @@ -846,7 +953,7 @@ static int __init init_em_meta(void) return tcf_em_register(&em_meta_ops); } -static void __exit exit_em_meta(void) +static void __exit exit_em_meta(void) { tcf_em_unregister(&em_meta_ops); } @@ -855,3 +962,5 @@ MODULE_LICENSE("GPL"); module_init(init_em_meta); module_exit(exit_em_meta); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_META); diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index 71ea926a9f0..a3bed07a008 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -9,22 +9,20 @@ * Authors: Thomas Graf <tgraf@suug.ch> */ -#include <linux/config.h> +#include <linux/gfp.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> #include <linux/skbuff.h> #include <linux/tc_ematch/tc_em_nbyte.h> #include <net/pkt_cls.h> -struct nbyte_data -{ +struct nbyte_data { struct tcf_em_nbyte hdr; char pattern[0]; }; - + static int em_nbyte_change(struct tcf_proto *tp, void *data, int data_len, struct tcf_ematch *em) { @@ -35,12 +33,10 @@ static int em_nbyte_change(struct tcf_proto *tp, void *data, int data_len, return -EINVAL; em->datalen = sizeof(*nbyte) + nbyte->len; - em->data = (unsigned long) kmalloc(em->datalen, GFP_KERNEL); + em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL); if (em->data == 0UL) return -ENOBUFS; - memcpy((void *) em->data, data, em->datalen); - return 0; } @@ -71,7 +67,7 @@ static int __init init_em_nbyte(void) return tcf_em_register(&em_nbyte_ops); } -static void __exit exit_em_nbyte(void) +static void __exit exit_em_nbyte(void) { tcf_em_unregister(&em_nbyte_ops); } @@ -80,3 +76,5 @@ MODULE_LICENSE("GPL"); module_init(init_em_nbyte); module_exit(exit_em_nbyte); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_NBYTE); diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 77beabc91fa..15d353d2e4b 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -9,19 +9,17 @@ * Authors: Thomas Graf <tgraf@suug.ch> */ -#include <linux/config.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> #include <linux/skbuff.h> #include <linux/textsearch.h> #include <linux/tc_ematch/tc_em_text.h> #include <net/pkt_cls.h> -struct text_match -{ +struct text_match { u16 from_offset; u16 to_offset; u8 from_layer; @@ -104,7 +102,8 @@ retry: static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m) { - textsearch_destroy(EM_TEXT_PRIV(m)->config); + if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) + textsearch_destroy(EM_TEXT_PRIV(m)->config); } static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m) @@ -120,13 +119,16 @@ static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m) conf.pattern_len = textsearch_get_pattern_len(tm->config); conf.pad = 0; - RTA_PUT_NOHDR(skb, sizeof(conf), &conf); - RTA_APPEND(skb, conf.pattern_len, textsearch_get_pattern(tm->config)); + if (nla_put_nohdr(skb, sizeof(conf), &conf) < 0) + goto nla_put_failure; + if (nla_append(skb, conf.pattern_len, + textsearch_get_pattern(tm->config)) < 0) + goto nla_put_failure; return 0; -rtattr_failure: +nla_put_failure: return -1; -} +} static struct tcf_ematch_ops 
em_text_ops = { .kind = TCF_EM_TEXT, @@ -143,7 +145,7 @@ static int __init init_em_text(void) return tcf_em_register(&em_text_ops); } -static void __exit exit_em_text(void) +static void __exit exit_em_text(void) { tcf_em_unregister(&em_text_ops); } @@ -152,3 +154,5 @@ MODULE_LICENSE("GPL"); module_init(init_em_text); module_exit(exit_em_text); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_TEXT); diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c index 34e7e51e601..797bdb88c01 100644 --- a/net/sched/em_u32.c +++ b/net/sched/em_u32.c @@ -12,7 +12,6 @@ * Based on net/sched/cls_u32.c */ -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> @@ -23,8 +22,8 @@ static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em, struct tcf_pkt_info *info) { struct tc_u32_key *key = (struct tc_u32_key *) em->data; - unsigned char *ptr = skb->nh.raw; - + const unsigned char *ptr = skb_network_header(skb); + if (info) { if (info->ptr) ptr = info->ptr; @@ -35,8 +34,8 @@ static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em, if (!tcf_valid_offset(skb, ptr, sizeof(u32))) return 0; - - return !(((*(u32*) ptr) ^ key->val) & key->mask); + + return !(((*(__be32 *) ptr) ^ key->val) & key->mask); } static struct tcf_ematch_ops em_u32_ops = { @@ -52,7 +51,7 @@ static int __init init_em_u32(void) return tcf_em_register(&em_u32_ops); } -static void __exit exit_em_u32(void) +static void __exit exit_em_u32(void) { tcf_em_unregister(&em_u32_ops); } @@ -61,3 +60,5 @@ MODULE_LICENSE("GPL"); module_init(init_em_u32); module_exit(exit_em_u32); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_U32); diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 5cb956b721e..3a633debb6d 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -37,12 +37,12 @@ * --------<-POP--------- * * where B is a virtual ematch referencing to sequence starting with B1. - * + * * ========================================================================== * * How to write an ematch in 60 seconds * ------------------------------------ - * + * * 1) Provide a matcher function: * static int my_match(struct sk_buff *skb, struct tcf_ematch *m, * struct tcf_pkt_info *info) @@ -71,7 +71,7 @@ * * static void __exit exit_my_ematch(void) * { - * return tcf_em_unregister(&my_ops); + * tcf_em_unregister(&my_ops); * } * * module_init(init_my_ematch); @@ -81,14 +81,11 @@ * open up a beer to watch the compilation going. */ -#include <linux/config.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> #include <linux/errno.h> -#include <linux/interrupt.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <net/pkt_cls.h> @@ -96,7 +93,7 @@ static LIST_HEAD(ematch_ops); static DEFINE_RWLOCK(ematch_mod_lock); -static inline struct tcf_ematch_ops * tcf_em_lookup(u16 kind) +static struct tcf_ematch_ops *tcf_em_lookup(u16 kind) { struct tcf_ematch_ops *e = NULL; @@ -116,7 +113,7 @@ static inline struct tcf_ematch_ops * tcf_em_lookup(u16 kind) /** * tcf_em_register - register an extended match - * + * * @ops: ematch operations lookup table * * This function must be called by ematches to announce their presence. @@ -145,6 +142,7 @@ errout: write_unlock(&ematch_mod_lock); return err; } +EXPORT_SYMBOL(tcf_em_register); /** * tcf_em_unregister - unregster and extended match @@ -157,27 +155,16 @@ errout: * * Returns -ENOENT if no matching ematch was found. 
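[Editor's illustration] The diagram at the top of ematch.c above describes how a sequence of ematches is walked with AND/OR relations (containers push/pop a nested sequence). A simplified stand-alone sketch of the flat case follows: each entry's result may be inverted, and the walk stops early when an AND sees a miss or an OR sees a hit. Container/jump handling and the kernel's evaluation stack are deliberately left out, and the names are illustrative, not the kernel's.

#include <stdio.h>

/* Relation of one ematch to the next one in the sequence. */
enum em_rel { REL_END, REL_AND, REL_OR };

struct em {
	int (*match)(void *ctx);  /* the ematch's match callback   */
	int inverted;             /* TCF_EM_INVERT-style flag      */
	enum em_rel rel;          /* relation to the next ematch   */
};

/* Short-circuit walk over a flat chain: "A and B or C" style. */
static int em_tree_match(const struct em *seq, int n, void *ctx)
{
	int res = 1;   /* an empty tree is treated as a match */

	for (int i = 0; i < n; i++) {
		res = seq[i].match(ctx);
		if (seq[i].inverted)
			res = !res;

		/* early end: AND with a miss, or OR with a hit */
		if (seq[i].rel == REL_AND && !res)
			break;
		if (seq[i].rel == REL_OR && res)
			break;
	}
	return res;
}

static int always(void *ctx) { (void)ctx; return 1; }
static int never(void *ctx)  { (void)ctx; return 0; }

int main(void)
{
	/* "not never and always"  ==>  1 */
	struct em seq[] = {
		{ never,  1, REL_AND },
		{ always, 0, REL_END },
	};
	printf("%d\n", em_tree_match(seq, 2, NULL));
	return 0;
}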
*/ -int tcf_em_unregister(struct tcf_ematch_ops *ops) +void tcf_em_unregister(struct tcf_ematch_ops *ops) { - int err = 0; - struct tcf_ematch_ops *e; - write_lock(&ematch_mod_lock); - list_for_each_entry(e, &ematch_ops, link) { - if (e == ops) { - list_del(&e->link); - goto out; - } - } - - err = -ENOENT; -out: + list_del(&ops->link); write_unlock(&ematch_mod_lock); - return err; } +EXPORT_SYMBOL(tcf_em_unregister); -static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree, - int index) +static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree, + int index) { return &tree->matches[index]; } @@ -185,11 +172,11 @@ static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree, static int tcf_em_validate(struct tcf_proto *tp, struct tcf_ematch_tree_hdr *tree_hdr, - struct tcf_ematch *em, struct rtattr *rta, int idx) + struct tcf_ematch *em, struct nlattr *nla, int idx) { int err = -EINVAL; - struct tcf_ematch_hdr *em_hdr = RTA_DATA(rta); - int data_len = RTA_PAYLOAD(rta) - sizeof(*em_hdr); + struct tcf_ematch_hdr *em_hdr = nla_data(nla); + int data_len = nla_len(nla) - sizeof(*em_hdr); void *data = (void *) em_hdr + sizeof(*em_hdr); if (!TCF_EM_REL_VALID(em_hdr->flags)) @@ -197,7 +184,8 @@ static int tcf_em_validate(struct tcf_proto *tp, if (em_hdr->kind == TCF_EM_CONTAINER) { /* Special ematch called "container", carries an index - * referencing an external ematch sequence. */ + * referencing an external ematch sequence. + */ u32 ref; if (data_len < sizeof(ref)) @@ -208,11 +196,12 @@ static int tcf_em_validate(struct tcf_proto *tp, goto errout; /* We do not allow backward jumps to avoid loops and jumps - * to our own position are of course illegal. */ + * to our own position are of course illegal. + */ if (ref <= idx) goto errout; - + em->data = ref; } else { /* Note: This lookup will increase the module refcnt @@ -221,16 +210,32 @@ static int tcf_em_validate(struct tcf_proto *tp, * which automatically releases the reference again, therefore * the module MUST not be given back under any circumstances * here. Be aware, the destroy function assumes that the - * module is held if the ops field is non zero. */ + * module is held if the ops field is non zero. + */ em->ops = tcf_em_lookup(em_hdr->kind); if (em->ops == NULL) { err = -ENOENT; +#ifdef CONFIG_MODULES + __rtnl_unlock(); + request_module("ematch-kind-%u", em_hdr->kind); + rtnl_lock(); + em->ops = tcf_em_lookup(em_hdr->kind); + if (em->ops) { + /* We dropped the RTNL mutex in order to + * perform the module load. Tell the caller + * to replay the request. + */ + module_put(em->ops->owner); + err = -EAGAIN; + } +#endif goto errout; } /* ematch module provides expected length of data, so we - * can do a basic sanity check. */ + * can do a basic sanity check. + */ if (em->ops->datalen && data_len < em->ops->datalen) goto errout; @@ -246,18 +251,18 @@ static int tcf_em_validate(struct tcf_proto *tp, * TCF_EM_SIMPLE may be specified stating that the * data only consists of a u32 integer and the module * does not expected a memory reference but rather - * the value carried. */ + * the value carried. 
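[Editor's illustration] To make the TCF_EM_SIMPLE case just described concrete, here is a stand-alone sketch of the two storage strategies: a "simple" ematch carries a u32 by value inside the unsigned long data word, while any other ematch owns a private copy of its configuration blob (kmemdup() in the kernel, malloc()/memcpy() here) and must release it on destroy. All identifiers are invented for the example.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

/*
 * Either a plain u32 stored by value (the "simple" case) or an owned
 * copy of the configuration blob, both squeezed into one unsigned long.
 */
struct ematch_data {
	int simple;
	unsigned long data;   /* value or pointer, depending on 'simple' */
	int datalen;
};

static int em_set(struct ematch_data *em, const void *cfg, int len, int simple)
{
	if (simple) {
		if (len < (int)sizeof(uint32_t))
			return -1;
		em->data = *(const uint32_t *)cfg;  /* carry the value itself */
	} else {
		void *copy = malloc(len);           /* kmemdup() stand-in */
		if (!copy)
			return -1;
		memcpy(copy, cfg, len);
		em->data = (unsigned long)copy;
	}
	em->simple = simple;
	em->datalen = len;
	return 0;
}

static void em_free(struct ematch_data *em)
{
	if (!em->simple)                 /* only the copied blob is owned */
		free((void *)em->data);
	em->data = 0;
}

int main(void)
{
	struct ematch_data em = { 0 };
	uint32_t v = 42;

	em_set(&em, &v, sizeof(v), 1);
	printf("simple value: %lu\n", em.data);
	em_free(&em);
	return 0;
}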
+ */ if (em_hdr->flags & TCF_EM_SIMPLE) { if (data_len < sizeof(u32)) goto errout; em->data = *(u32 *) data; } else { - void *v = kmalloc(data_len, GFP_KERNEL); + void *v = kmemdup(data, data_len, GFP_KERNEL); if (v == NULL) { err = -ENOBUFS; goto errout; } - memcpy(v, data, data_len); em->data = (unsigned long) v; } } @@ -272,15 +277,20 @@ errout: return err; } +static const struct nla_policy em_policy[TCA_EMATCH_TREE_MAX + 1] = { + [TCA_EMATCH_TREE_HDR] = { .len = sizeof(struct tcf_ematch_tree_hdr) }, + [TCA_EMATCH_TREE_LIST] = { .type = NLA_NESTED }, +}; + /** * tcf_em_tree_validate - validate ematch config TLV and build ematch tree * * @tp: classifier kind handle - * @rta: ematch tree configuration TLV + * @nla: ematch tree configuration TLV * @tree: destination ematch tree variable to store the resulting * ematch tree. * - * This function validates the given configuration TLV @rta and builds an + * This function validates the given configuration TLV @nla and builds an * ematch tree in @tree. The resulting tree must later be copied into * the private classifier data using tcf_em_tree_change(). You MUST NOT * provide the ematch tree variable of the private classifier data directly, @@ -288,63 +298,60 @@ errout: * * Returns a negative error code if the configuration TLV contains errors. */ -int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta, +int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla, struct tcf_ematch_tree *tree) { - int idx, list_len, matches_len, err = -EINVAL; - struct rtattr *tb[TCA_EMATCH_TREE_MAX]; - struct rtattr *rt_match, *rt_hdr, *rt_list; + int idx, list_len, matches_len, err; + struct nlattr *tb[TCA_EMATCH_TREE_MAX + 1]; + struct nlattr *rt_match, *rt_hdr, *rt_list; struct tcf_ematch_tree_hdr *tree_hdr; struct tcf_ematch *em; - if (!rta) { - memset(tree, 0, sizeof(*tree)); + memset(tree, 0, sizeof(*tree)); + if (!nla) return 0; - } - if (rtattr_parse_nested(tb, TCA_EMATCH_TREE_MAX, rta) < 0) + err = nla_parse_nested(tb, TCA_EMATCH_TREE_MAX, nla, em_policy); + if (err < 0) goto errout; - rt_hdr = tb[TCA_EMATCH_TREE_HDR-1]; - rt_list = tb[TCA_EMATCH_TREE_LIST-1]; + err = -EINVAL; + rt_hdr = tb[TCA_EMATCH_TREE_HDR]; + rt_list = tb[TCA_EMATCH_TREE_LIST]; if (rt_hdr == NULL || rt_list == NULL) goto errout; - if (RTA_PAYLOAD(rt_hdr) < sizeof(*tree_hdr) || - RTA_PAYLOAD(rt_list) < sizeof(*rt_match)) - goto errout; - - tree_hdr = RTA_DATA(rt_hdr); + tree_hdr = nla_data(rt_hdr); memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr)); - rt_match = RTA_DATA(rt_list); - list_len = RTA_PAYLOAD(rt_list); + rt_match = nla_data(rt_list); + list_len = nla_len(rt_list); matches_len = tree_hdr->nmatches * sizeof(*em); - tree->matches = kmalloc(matches_len, GFP_KERNEL); + tree->matches = kzalloc(matches_len, GFP_KERNEL); if (tree->matches == NULL) goto errout; - memset(tree->matches, 0, matches_len); - /* We do not use rtattr_parse_nested here because the maximum + /* We do not use nla_parse_nested here because the maximum * number of attributes is unknown. This saves us the allocation * for a tb buffer which would serve no purpose at all. - * + * * The array of rt attributes is parsed in the order as they are * provided, their type must be incremental from 1 to n. Even * if it does not serve any real purpose, a failure of sticking - * to this policy will result in parsing failure. */ - for (idx = 0; RTA_OK(rt_match, list_len); idx++) { + * to this policy will result in parsing failure. 
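[Editor's illustration] The ordering policy described above (the i-th attribute must carry type i + 1, and the total count must match the tree header) can be shown with a tiny stand-alone TLV walker. Note that the alignment padding real netlink attributes require is ignored here, and struct tlv merely stands in for struct nlattr.

#include <stdint.h>
#include <stdio.h>

/* Minimal TLV header, standing in for struct nlattr (type + length). */
struct tlv {
	uint16_t len;    /* total length including this header */
	uint16_t type;   /* 1-based, must increase by exactly 1 */
};

/*
 * Walk a buffer of TLVs and enforce the ordering policy described in
 * tcf_em_tree_validate(): the i-th attribute must carry type i + 1.
 * (No NLA_ALIGN-style padding is applied in this sketch.)
 */
static int check_order(const unsigned char *buf, int buflen, int nmatches)
{
	int idx = 0;

	while (buflen >= (int)sizeof(struct tlv)) {
		const struct tlv *a = (const struct tlv *)buf;

		if (a->len < sizeof(struct tlv) || a->len > buflen)
			return -1;              /* malformed */
		if (a->type != idx + 1)
			return -1;              /* out of order */
		if (idx >= nmatches)
			return -1;              /* more TLVs than announced */

		buf += a->len;
		buflen -= a->len;
		idx++;
	}
	return idx == nmatches ? 0 : -1;        /* count must match header */
}

int main(void)
{
	/* two empty attributes with types 1 and 2 */
	struct tlv ok[2] = { { sizeof(struct tlv), 1 },
			     { sizeof(struct tlv), 2 } };

	printf("%d\n", check_order((unsigned char *)ok, sizeof(ok), 2)); /* 0 */
	return 0;
}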
+ */ + for (idx = 0; nla_ok(rt_match, list_len); idx++) { err = -EINVAL; - if (rt_match->rta_type != (idx + 1)) + if (rt_match->nla_type != (idx + 1)) goto errout_abort; if (idx >= tree_hdr->nmatches) goto errout_abort; - if (RTA_PAYLOAD(rt_match) < sizeof(struct tcf_ematch_hdr)) + if (nla_len(rt_match) < sizeof(struct tcf_ematch_hdr)) goto errout_abort; em = tcf_em_get_match(tree, idx); @@ -353,13 +360,14 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta, if (err < 0) goto errout_abort; - rt_match = RTA_NEXT(rt_match, list_len); + rt_match = nla_next(rt_match, &list_len); } /* Check if the number of matches provided by userspace actually * complies with the array of matches. The number was used for * the validation of references and a mismatch could lead to - * undefined references during the matching process. */ + * undefined references during the matching process. + */ if (idx != tree_hdr->nmatches) { err = -EINVAL; goto errout_abort; @@ -373,6 +381,7 @@ errout_abort: tcf_em_tree_destroy(tp, tree); return err; } +EXPORT_SYMBOL(tcf_em_tree_validate); /** * tcf_em_tree_destroy - destroy an ematch tree @@ -397,15 +406,17 @@ void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree) if (em->ops) { if (em->ops->destroy) em->ops->destroy(tp, em); - else if (!tcf_em_is_simple(em) && em->data) + else if (!tcf_em_is_simple(em)) kfree((void *) em->data); module_put(em->ops->owner); } } - + tree->hdr.nmatches = 0; kfree(tree->matches); + tree->matches = NULL; } +EXPORT_SYMBOL(tcf_em_tree_destroy); /** * tcf_em_tree_dump - dump ematch tree into a rtnl message @@ -422,17 +433,24 @@ void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree) int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) { int i; - struct rtattr * top_start = (struct rtattr*) skb->tail; - struct rtattr * list_start; + u8 *tail; + struct nlattr *top_start; + struct nlattr *list_start; - RTA_PUT(skb, tlv, 0, NULL); - RTA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr); + top_start = nla_nest_start(skb, tlv); + if (top_start == NULL) + goto nla_put_failure; - list_start = (struct rtattr *) skb->tail; - RTA_PUT(skb, TCA_EMATCH_TREE_LIST, 0, NULL); + if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr)) + goto nla_put_failure; + list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST); + if (list_start == NULL) + goto nla_put_failure; + + tail = skb_tail_pointer(skb); for (i = 0; i < tree->hdr.nmatches; i++) { - struct rtattr *match_start = (struct rtattr*) skb->tail; + struct nlattr *match_start = (struct nlattr *)tail; struct tcf_ematch *em = tcf_em_get_match(tree, i); struct tcf_ematch_hdr em_hdr = { .kind = em->ops ? 
em->ops->kind : TCF_EM_CONTAINER, @@ -440,33 +458,37 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) .flags = em->flags }; - RTA_PUT(skb, i+1, sizeof(em_hdr), &em_hdr); + if (nla_put(skb, i + 1, sizeof(em_hdr), &em_hdr)) + goto nla_put_failure; if (em->ops && em->ops->dump) { if (em->ops->dump(skb, em) < 0) - goto rtattr_failure; + goto nla_put_failure; } else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) { u32 u = em->data; - RTA_PUT_NOHDR(skb, sizeof(u), &u); + nla_put_nohdr(skb, sizeof(u), &u); } else if (em->datalen > 0) - RTA_PUT_NOHDR(skb, em->datalen, (void *) em->data); + nla_put_nohdr(skb, em->datalen, (void *) em->data); - match_start->rta_len = skb->tail - (u8*) match_start; + tail = skb_tail_pointer(skb); + match_start->nla_len = tail - (u8 *)match_start; } - list_start->rta_len = skb->tail - (u8 *) list_start; - top_start->rta_len = skb->tail - (u8 *) top_start; + nla_nest_end(skb, list_start); + nla_nest_end(skb, top_start); return 0; -rtattr_failure: +nla_put_failure: return -1; } +EXPORT_SYMBOL(tcf_em_tree_dump); static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em, struct tcf_pkt_info *info) { int r = em->ops->match(skb, em, info); + return tcf_em_is_inverted(em) ? !r : r; } @@ -515,14 +537,7 @@ pop_stack: return res; stack_overflow: - if (net_ratelimit()) - printk("Local stack overflow, increase NET_EMATCH_STACK\n"); + net_warn_ratelimited("tc ematch: local stack overflow, increase NET_EMATCH_STACK\n"); return -1; } - -EXPORT_SYMBOL(tcf_em_register); -EXPORT_SYMBOL(tcf_em_unregister); -EXPORT_SYMBOL(tcf_em_tree_validate); -EXPORT_SYMBOL(tcf_em_tree_destroy); -EXPORT_SYMBOL(tcf_em_tree_dump); EXPORT_SYMBOL(__tcf_em_tree_match); diff --git a/net/sched/estimator.c b/net/sched/estimator.c deleted file mode 100644 index 5d3ae03e22a..00000000000 --- a/net/sched/estimator.c +++ /dev/null @@ -1,197 +0,0 @@ -/* - * net/sched/estimator.c Simple rate estimator. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - */ - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/jiffies.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> -#include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <linux/rtnetlink.h> -#include <linux/init.h> -#include <net/sock.h> -#include <net/pkt_sched.h> - -/* - This code is NOT intended to be used for statistics collection, - its purpose is to provide a base for statistical multiplexing - for controlled load service. - If you need only statistics, run a user level daemon which - periodically reads byte counters. - - Unfortunately, rate estimation is not a very easy task. - F.e. I did not find a simple way to estimate the current peak rate - and even failed to formulate the problem 8)8) - - So I preferred not to built an estimator into the scheduler, - but run this task separately. 
- Ideally, it should be kernel thread(s), but for now it runs - from timers, which puts apparent top bounds on the number of rated - flows, has minimal overhead on small, but is enough - to handle controlled load service, sets of aggregates. - - We measure rate over A=(1<<interval) seconds and evaluate EWMA: - - avrate = avrate*(1-W) + rate*W - - where W is chosen as negative power of 2: W = 2^(-ewma_log) - - The resulting time constant is: - - T = A/(-ln(1-W)) - - - NOTES. - - * The stored value for avbps is scaled by 2^5, so that maximal - rate is ~1Gbit, avpps is scaled by 2^10. - - * Minimal interval is HZ/4=250msec (it is the greatest common divisor - for HZ=100 and HZ=1024 8)), maximal interval - is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals - are too expensive, longer ones can be implemented - at user level painlessly. - */ - -#define EST_MAX_INTERVAL 5 - -struct qdisc_estimator -{ - struct qdisc_estimator *next; - struct tc_stats *stats; - spinlock_t *stats_lock; - unsigned interval; - int ewma_log; - u64 last_bytes; - u32 last_packets; - u32 avpps; - u32 avbps; -}; - -struct qdisc_estimator_head -{ - struct timer_list timer; - struct qdisc_estimator *list; -}; - -static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1]; - -/* Estimator array lock */ -static DEFINE_RWLOCK(est_lock); - -static void est_timer(unsigned long arg) -{ - int idx = (int)arg; - struct qdisc_estimator *e; - - read_lock(&est_lock); - for (e = elist[idx].list; e; e = e->next) { - struct tc_stats *st = e->stats; - u64 nbytes; - u32 npackets; - u32 rate; - - spin_lock(e->stats_lock); - nbytes = st->bytes; - npackets = st->packets; - rate = (nbytes - e->last_bytes)<<(7 - idx); - e->last_bytes = nbytes; - e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log; - st->bps = (e->avbps+0xF)>>5; - - rate = (npackets - e->last_packets)<<(12 - idx); - e->last_packets = npackets; - e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; - e->stats->pps = (e->avpps+0x1FF)>>10; - spin_unlock(e->stats_lock); - } - - mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4)); - read_unlock(&est_lock); -} - -int qdisc_new_estimator(struct tc_stats *stats, spinlock_t *stats_lock, struct rtattr *opt) -{ - struct qdisc_estimator *est; - struct tc_estimator *parm = RTA_DATA(opt); - - if (RTA_PAYLOAD(opt) < sizeof(*parm)) - return -EINVAL; - - if (parm->interval < -2 || parm->interval > 3) - return -EINVAL; - - est = kmalloc(sizeof(*est), GFP_KERNEL); - if (est == NULL) - return -ENOBUFS; - - memset(est, 0, sizeof(*est)); - est->interval = parm->interval + 2; - est->stats = stats; - est->stats_lock = stats_lock; - est->ewma_log = parm->ewma_log; - est->last_bytes = stats->bytes; - est->avbps = stats->bps<<5; - est->last_packets = stats->packets; - est->avpps = stats->pps<<10; - - est->next = elist[est->interval].list; - if (est->next == NULL) { - init_timer(&elist[est->interval].timer); - elist[est->interval].timer.data = est->interval; - elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4); - elist[est->interval].timer.function = est_timer; - add_timer(&elist[est->interval].timer); - } - write_lock_bh(&est_lock); - elist[est->interval].list = est; - write_unlock_bh(&est_lock); - return 0; -} - -void qdisc_kill_estimator(struct tc_stats *stats) -{ - int idx; - struct qdisc_estimator *est, **pest; - - for (idx=0; idx <= EST_MAX_INTERVAL; idx++) { - int killed = 0; - pest = &elist[idx].list; - while ((est=*pest) != NULL) { - if (est->stats != stats) { - pest = &est->next; - continue; - } - - 
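[Editor's illustration] The fixed-point EWMA documented in the (removed) estimator above can be exercised in isolation. The sketch below mirrors its arithmetic: the per-period byte delta is pre-scaled by 2^5 via the << (7 - idx) shift, folded into the average with W = 2^-ewma_log, and scaled back for reporting. The 1 MB/s input and the printed checkpoints are purely illustrative, and an arithmetic right shift is assumed, as in the original.

#include <stdio.h>
#include <stdint.h>

/*
 * Fixed-point EWMA as in the removed estimator:
 *   avrate = avrate + (rate - avrate) * W,  W = 2^(-ewma_log)
 * avbps is kept scaled by 2^5 so the >> ewma_log step keeps precision.
 */
struct est {
	uint64_t last_bytes;
	uint32_t avbps;        /* scaled by 2^5 */
	int idx;               /* period = 2^idx / 4 seconds */
	int ewma_log;
};

static uint32_t est_update(struct est *e, uint64_t nbytes)
{
	/* bytes/sec over the last period, pre-scaled by 2^5: << (7 - idx) */
	uint32_t rate = (uint32_t)((nbytes - e->last_bytes) << (7 - e->idx));

	e->last_bytes = nbytes;
	e->avbps += ((int32_t)rate - (int32_t)e->avbps) >> e->ewma_log;

	return (e->avbps + 0xF) >> 5;          /* back to plain bytes/sec */
}

int main(void)
{
	struct est e = { .idx = 2, .ewma_log = 3 };  /* 1 s period, W = 1/8 */
	uint64_t bytes = 0;

	/* constant 1 MB/s input: the EWMA converges towards 1000000 B/s */
	for (int t = 1; t <= 40; t++) {
		bytes += 1000000;
		uint32_t bps = est_update(&e, bytes);
		if (t % 10 == 0)
			printf("t=%2ds  est=%u B/s\n", t, bps);
	}
	return 0;
}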
write_lock_bh(&est_lock); - *pest = est->next; - write_unlock_bh(&est_lock); - - kfree(est); - killed++; - } - if (killed && elist[idx].list == NULL) - del_timer(&elist[idx].timer); - } -} - -EXPORT_SYMBOL(qdisc_kill_estimator); -EXPORT_SYMBOL(qdisc_new_estimator); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 31570b9a6e9..58bed7599db 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -15,39 +15,32 @@ * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support */ -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> -#include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kmod.h> #include <linux/list.h> -#include <linux/bitops.h> +#include <linux/hrtimer.h> +#include <linux/lockdep.h> +#include <linux/slab.h> +#include <net/net_namespace.h> #include <net/sock.h> +#include <net/netlink.h> #include <net/pkt_sched.h> -#include <asm/processor.h> -#include <asm/uaccess.h> -#include <asm/system.h> - -static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, +static int qdisc_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, u32 clid, struct Qdisc *old, struct Qdisc *new); -static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, - struct Qdisc *q, unsigned long cl, int event); +static int tclass_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct Qdisc *q, + unsigned long cl, int event); /* @@ -107,10 +100,9 @@ static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, Auxiliary routines: - ---requeue + ---peek - requeues once dequeued packet. It is used for non-standard or - just buggy devices, which can defer output even if dev->tbusy=0. 
+ like dequeue but without removing a packet from the queue ---reset @@ -143,7 +135,7 @@ static DEFINE_RWLOCK(qdisc_mod_lock); static struct Qdisc_ops *qdisc_base; -/* Register/uregister queueing discipline */ +/* Register/unregister queueing discipline */ int register_qdisc(struct Qdisc_ops *qops) { @@ -157,18 +149,37 @@ int register_qdisc(struct Qdisc_ops *qops) if (qops->enqueue == NULL) qops->enqueue = noop_qdisc_ops.enqueue; - if (qops->requeue == NULL) - qops->requeue = noop_qdisc_ops.requeue; + if (qops->peek == NULL) { + if (qops->dequeue == NULL) + qops->peek = noop_qdisc_ops.peek; + else + goto out_einval; + } if (qops->dequeue == NULL) qops->dequeue = noop_qdisc_ops.dequeue; + if (qops->cl_ops) { + const struct Qdisc_class_ops *cops = qops->cl_ops; + + if (!(cops->get && cops->put && cops->walk && cops->leaf)) + goto out_einval; + + if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf)) + goto out_einval; + } + qops->next = NULL; *qp = qops; rc = 0; out: write_unlock(&qdisc_mod_lock); return rc; + +out_einval: + rc = -EINVAL; + goto out; } +EXPORT_SYMBOL(register_qdisc); int unregister_qdisc(struct Qdisc_ops *qops) { @@ -176,7 +187,7 @@ int unregister_qdisc(struct Qdisc_ops *qops) int err = -ENOENT; write_lock(&qdisc_mod_lock); - for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) + for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) if (q == qops) break; if (q) { @@ -187,31 +198,118 @@ int unregister_qdisc(struct Qdisc_ops *qops) write_unlock(&qdisc_mod_lock); return err; } +EXPORT_SYMBOL(unregister_qdisc); + +/* Get default qdisc if not otherwise specified */ +void qdisc_get_default(char *name, size_t len) +{ + read_lock(&qdisc_mod_lock); + strlcpy(name, default_qdisc_ops->id, len); + read_unlock(&qdisc_mod_lock); +} + +static struct Qdisc_ops *qdisc_lookup_default(const char *name) +{ + struct Qdisc_ops *q = NULL; + + for (q = qdisc_base; q; q = q->next) { + if (!strcmp(name, q->id)) { + if (!try_module_get(q->owner)) + q = NULL; + break; + } + } + + return q; +} + +/* Set new default qdisc to use */ +int qdisc_set_default(const char *name) +{ + const struct Qdisc_ops *ops; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + write_lock(&qdisc_mod_lock); + ops = qdisc_lookup_default(name); + if (!ops) { + /* Not found, drop lock and try to load module */ + write_unlock(&qdisc_mod_lock); + request_module("sch_%s", name); + write_lock(&qdisc_mod_lock); + + ops = qdisc_lookup_default(name); + } + + if (ops) { + /* Set new default */ + module_put(default_qdisc_ops->owner); + default_qdisc_ops = ops; + } + write_unlock(&qdisc_mod_lock); + + return ops ? 0 : -ENOENT; +} /* We know handle. Find qdisc among all qdisc's attached to device (root qdisc, all its children, children of children etc.) 
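[Editor's illustration] qdisc_set_default() above follows a common kernel pattern: look the ops up by name and, if nothing is registered, drop the lock, try to load a module, and look again. Below is a stand-alone sketch of that flow with a stubbed-out loader standing in for request_module("sch_%s", name) and without any locking or module refcounting; every identifier in it is invented for the example.

#include <stdio.h>
#include <string.h>

/* A registered queueing discipline: just an id for this sketch. */
struct qdisc_ops {
	const char *id;
	struct qdisc_ops *next;
};

static struct qdisc_ops pfifo = { "pfifo_fast", NULL };
static struct qdisc_ops *qdisc_base = &pfifo;     /* registry head */

static struct qdisc_ops *lookup(const char *name)
{
	for (struct qdisc_ops *q = qdisc_base; q; q = q->next)
		if (!strcmp(q->id, name))
			return q;
	return NULL;
}

/* Stand-in for request_module("sch_%s", name): register fq on demand. */
static struct qdisc_ops fq = { "fq", NULL };
static void load_module(const char *name)
{
	if (!strcmp(name, "fq") && !lookup("fq")) {
		fq.next = qdisc_base;
		qdisc_base = &fq;
	}
}

/* Mirrors the shape of qdisc_set_default(): lookup, load, retry. */
static int set_default(const char *name, const struct qdisc_ops **def)
{
	const struct qdisc_ops *ops = lookup(name);

	if (!ops) {
		load_module(name);   /* the kernel drops the lock around this */
		ops = lookup(name);
	}
	if (ops)
		*def = ops;
	return ops ? 0 : -2;         /* -ENOENT in the kernel */
}

int main(void)
{
	const struct qdisc_ops *def = &pfifo;

	printf("set fq -> %d, default now %s\n",
	       set_default("fq", &def), def->id);
	return 0;
}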
*/ -struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) +static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) { struct Qdisc *q; - read_lock_bh(&qdisc_tree_lock); - list_for_each_entry(q, &dev->qdisc_list, list) { - if (q->handle == handle) { - read_unlock_bh(&qdisc_tree_lock); + if (!(root->flags & TCQ_F_BUILTIN) && + root->handle == handle) + return root; + + list_for_each_entry(q, &root->list, list) { + if (q->handle == handle) return q; - } } - read_unlock_bh(&qdisc_tree_lock); return NULL; } +void qdisc_list_add(struct Qdisc *q) +{ + if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { + struct Qdisc *root = qdisc_dev(q)->qdisc; + + WARN_ON_ONCE(root == &noop_qdisc); + list_add_tail(&q->list, &root->list); + } +} +EXPORT_SYMBOL(qdisc_list_add); + +void qdisc_list_del(struct Qdisc *q) +{ + if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) + list_del(&q->list); +} +EXPORT_SYMBOL(qdisc_list_del); + +struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) +{ + struct Qdisc *q; + + q = qdisc_match_from_root(dev->qdisc, handle); + if (q) + goto out; + + if (dev_ingress_queue(dev)) + q = qdisc_match_from_root( + dev_ingress_queue(dev)->qdisc_sleeping, + handle); +out: + return q; +} + static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) { unsigned long cl; struct Qdisc *leaf; - struct Qdisc_class_ops *cops = p->ops->cl_ops; + const struct Qdisc_class_ops *cops = p->ops->cl_ops; if (cops == NULL) return NULL; @@ -226,14 +324,14 @@ static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) /* Find queueing discipline by name */ -static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind) +static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) { struct Qdisc_ops *q = NULL; if (kind) { read_lock(&qdisc_mod_lock); for (q = qdisc_base; q; q = q->next) { - if (rtattr_strcmp(kind, q->id) == 0) { + if (nla_strcmp(kind, q->id) == 0) { if (!try_module_get(q->owner)) q = NULL; break; @@ -244,32 +342,76 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind) return q; } +/* The linklayer setting were not transferred from iproute2, in older + * versions, and the rate tables lookup systems have been dropped in + * the kernel. To keep backward compatible with older iproute2 tc + * utils, we detect the linklayer setting by detecting if the rate + * table were modified. + * + * For linklayer ATM table entries, the rate table will be aligned to + * 48 bytes, thus some table entries will contain the same value. The + * mpu (min packet unit) is also encoded into the old rate table, thus + * starting from the mpu, we find low and high table entries for + * mapping this cell. If these entries contain the same value, when + * the rate tables have been modified for linklayer ATM. + * + * This is done by rounding mpu to the nearest 48 bytes cell/entry, + * and then roundup to the next cell, calc the table entry one below, + * and compare. 
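A quick user-space check of the arithmetic described in the comment above, using typical iproute2 parameters; the numbers are illustrative only and not taken from the patch.

#include <stdio.h>

#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	/* Common values for an ATM-aligned rate table:
	 * mpu = 0, cell_log = 3 (8-byte buckets in the 256-entry rtab). */
	unsigned int mpu = 0, cell_log = 3;
	unsigned int low = ROUNDUP(mpu, 48);			/* 0  */
	unsigned int high = ROUNDUP(low + 1, 48);		/* 48 */
	unsigned int cell_low = low >> cell_log;		/* 0  */
	unsigned int cell_high = (high >> cell_log) - 1;	/* 5  */

	/* On an ATM link every packet of 1..48 bytes costs one 53-byte cell,
	 * so rtab[0]..rtab[5] hold the same transmit time and the kernel
	 * reports TC_LINKLAYER_ATM; on Ethernet the entries differ. */
	printf("compare rtab[%u] with rtab[%u]\n", cell_low, cell_high);
	return 0;
}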
+ */ +static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) +{ + int low = roundup(r->mpu, 48); + int high = roundup(low+1, 48); + int cell_low = low >> r->cell_log; + int cell_high = (high >> r->cell_log) - 1; + + /* rtab is too inaccurate at rates > 100Mbit/s */ + if ((r->rate > (100000000/8)) || (rtab[0] == 0)) { + pr_debug("TC linklayer: Giving up ATM detection\n"); + return TC_LINKLAYER_ETHERNET; + } + + if ((cell_high > cell_low) && (cell_high < 256) + && (rtab[cell_low] == rtab[cell_high])) { + pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n", + cell_low, cell_high, rtab[cell_high]); + return TC_LINKLAYER_ATM; + } + return TC_LINKLAYER_ETHERNET; +} + static struct qdisc_rate_table *qdisc_rtab_list; -struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab) +struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab) { struct qdisc_rate_table *rtab; + if (tab == NULL || r->rate == 0 || r->cell_log == 0 || + nla_len(tab) != TC_RTAB_SIZE) + return NULL; + for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { - if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) { + if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) && + !memcmp(&rtab->data, nla_data(tab), 1024)) { rtab->refcnt++; return rtab; } } - if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024) - return NULL; - rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); if (rtab) { rtab->rate = *r; rtab->refcnt = 1; - memcpy(rtab->data, RTA_DATA(tab), 1024); + memcpy(rtab->data, nla_data(tab), 1024); + if (r->linklayer == TC_LINKLAYER_UNAWARE) + r->linklayer = __detect_linklayer(r, rtab->data); rtab->next = qdisc_rtab_list; qdisc_rtab_list = rtab; } return rtab; } +EXPORT_SYMBOL(qdisc_get_rtab); void qdisc_put_rtab(struct qdisc_rate_table *tab) { @@ -278,7 +420,9 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab) if (!tab || --tab->refcnt) return; - for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) { + for (rtabp = &qdisc_rtab_list; + (rtab = *rtabp) != NULL; + rtabp = &rtab->next) { if (rtab == tab) { *rtabp = rtab->next; kfree(rtab); @@ -286,108 +430,441 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab) } } } +EXPORT_SYMBOL(qdisc_put_rtab); + +static LIST_HEAD(qdisc_stab_list); +static DEFINE_SPINLOCK(qdisc_stab_lock); + +static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = { + [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) }, + [TCA_STAB_DATA] = { .type = NLA_BINARY }, +}; + +static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt) +{ + struct nlattr *tb[TCA_STAB_MAX + 1]; + struct qdisc_size_table *stab; + struct tc_sizespec *s; + unsigned int tsize = 0; + u16 *tab = NULL; + int err; + err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy); + if (err < 0) + return ERR_PTR(err); + if (!tb[TCA_STAB_BASE]) + return ERR_PTR(-EINVAL); -/* Allocate an unique handle from space managed by kernel */ + s = nla_data(tb[TCA_STAB_BASE]); + if (s->tsize > 0) { + if (!tb[TCA_STAB_DATA]) + return ERR_PTR(-EINVAL); + tab = nla_data(tb[TCA_STAB_DATA]); + tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16); + } + + if (tsize != s->tsize || (!tab && tsize > 0)) + return ERR_PTR(-EINVAL); + + spin_lock(&qdisc_stab_lock); + + list_for_each_entry(stab, &qdisc_stab_list, list) { + if (memcmp(&stab->szopts, s, sizeof(*s))) + continue; + if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16))) + continue; + stab->refcnt++; + spin_unlock(&qdisc_stab_lock); + return stab; + } + + 
spin_unlock(&qdisc_stab_lock); + + stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL); + if (!stab) + return ERR_PTR(-ENOMEM); + + stab->refcnt = 1; + stab->szopts = *s; + if (tsize > 0) + memcpy(stab->data, tab, tsize * sizeof(u16)); + + spin_lock(&qdisc_stab_lock); + list_add_tail(&stab->list, &qdisc_stab_list); + spin_unlock(&qdisc_stab_lock); + + return stab; +} + +static void stab_kfree_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct qdisc_size_table, rcu)); +} + +void qdisc_put_stab(struct qdisc_size_table *tab) +{ + if (!tab) + return; + + spin_lock(&qdisc_stab_lock); + + if (--tab->refcnt == 0) { + list_del(&tab->list); + call_rcu_bh(&tab->rcu, stab_kfree_rcu); + } + + spin_unlock(&qdisc_stab_lock); +} +EXPORT_SYMBOL(qdisc_put_stab); + +static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_STAB); + if (nest == NULL) + goto nla_put_failure; + if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts)) + goto nla_put_failure; + nla_nest_end(skb, nest); + + return skb->len; + +nla_put_failure: + return -1; +} + +void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab) +{ + int pkt_len, slot; + + pkt_len = skb->len + stab->szopts.overhead; + if (unlikely(!stab->szopts.tsize)) + goto out; + + slot = pkt_len + stab->szopts.cell_align; + if (unlikely(slot < 0)) + slot = 0; + + slot >>= stab->szopts.cell_log; + if (likely(slot < stab->szopts.tsize)) + pkt_len = stab->data[slot]; + else + pkt_len = stab->data[stab->szopts.tsize - 1] * + (slot / stab->szopts.tsize) + + stab->data[slot % stab->szopts.tsize]; + + pkt_len <<= stab->szopts.size_log; +out: + if (unlikely(pkt_len < 1)) + pkt_len = 1; + qdisc_skb_cb(skb)->pkt_len = pkt_len; +} +EXPORT_SYMBOL(__qdisc_calculate_pkt_len); + +void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) +{ + if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { + pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", + txt, qdisc->ops->id, qdisc->handle >> 16); + qdisc->flags |= TCQ_F_WARN_NONWC; + } +} +EXPORT_SYMBOL(qdisc_warn_nonwc); + +static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) +{ + struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, + timer); + + qdisc_unthrottled(wd->qdisc); + __netif_schedule(qdisc_root(wd->qdisc)); + + return HRTIMER_NORESTART; +} + +void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +{ + hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + wd->timer.function = qdisc_watchdog; + wd->qdisc = qdisc; +} +EXPORT_SYMBOL(qdisc_watchdog_init); + +void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) +{ + if (test_bit(__QDISC_STATE_DEACTIVATED, + &qdisc_root_sleeping(wd->qdisc)->state)) + return; + + qdisc_throttled(wd->qdisc); + + hrtimer_start(&wd->timer, + ns_to_ktime(expires), + HRTIMER_MODE_ABS); +} +EXPORT_SYMBOL(qdisc_watchdog_schedule_ns); + +void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) +{ + hrtimer_cancel(&wd->timer); + qdisc_unthrottled(wd->qdisc); +} +EXPORT_SYMBOL(qdisc_watchdog_cancel); + +static struct hlist_head *qdisc_class_hash_alloc(unsigned int n) +{ + unsigned int size = n * sizeof(struct hlist_head), i; + struct hlist_head *h; + + if (size <= PAGE_SIZE) + h = kmalloc(size, GFP_KERNEL); + else + h = (struct hlist_head *) + __get_free_pages(GFP_KERNEL, get_order(size)); + + if (h != NULL) { + for (i = 0; i < n; i++) + INIT_HLIST_HEAD(&h[i]); + } + return h; +} + +static void 
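To show how the qdisc_watchdog helpers above are meant to be used, here is a rough sketch of a rate-limiting qdisc's dequeue path. Everything named example_*, and the next_send_ns bookkeeping, is invented for the illustration; only the watchdog calls and the generic queue helpers are real kernel API, and a real shaper would of course also advance next_send_ns when it releases a packet.

#include <linux/ktime.h>
#include <net/pkt_sched.h>

struct example_sched_data {
	struct qdisc_watchdog	watchdog;
	u64			next_send_ns;	/* invented pacing state */
};

static int example_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct example_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_init(&q->watchdog, sch);
	q->next_send_ns = 0;
	return 0;
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	struct example_sched_data *q = qdisc_priv(sch);
	u64 now = ktime_to_ns(ktime_get());

	if (now < q->next_send_ns) {
		/* Too early: arm the hrtimer.  When it fires, qdisc_watchdog()
		 * clears the throttled flag and __netif_schedule()s the root,
		 * so this dequeue runs again. */
		qdisc_watchdog_schedule_ns(&q->watchdog, q->next_send_ns);
		return NULL;
	}
	return qdisc_dequeue_head(sch);
}

static void example_reset(struct Qdisc *sch)
{
	struct example_sched_data *q = qdisc_priv(sch);

	qdisc_reset_queue(sch);
	qdisc_watchdog_cancel(&q->watchdog);
}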
qdisc_class_hash_free(struct hlist_head *h, unsigned int n) +{ + unsigned int size = n * sizeof(struct hlist_head); + + if (size <= PAGE_SIZE) + kfree(h); + else + free_pages((unsigned long)h, get_order(size)); +} + +void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash) +{ + struct Qdisc_class_common *cl; + struct hlist_node *next; + struct hlist_head *nhash, *ohash; + unsigned int nsize, nmask, osize; + unsigned int i, h; + + /* Rehash when load factor exceeds 0.75 */ + if (clhash->hashelems * 4 <= clhash->hashsize * 3) + return; + nsize = clhash->hashsize * 2; + nmask = nsize - 1; + nhash = qdisc_class_hash_alloc(nsize); + if (nhash == NULL) + return; + + ohash = clhash->hash; + osize = clhash->hashsize; + + sch_tree_lock(sch); + for (i = 0; i < osize; i++) { + hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) { + h = qdisc_class_hash(cl->classid, nmask); + hlist_add_head(&cl->hnode, &nhash[h]); + } + } + clhash->hash = nhash; + clhash->hashsize = nsize; + clhash->hashmask = nmask; + sch_tree_unlock(sch); + + qdisc_class_hash_free(ohash, osize); +} +EXPORT_SYMBOL(qdisc_class_hash_grow); + +int qdisc_class_hash_init(struct Qdisc_class_hash *clhash) +{ + unsigned int size = 4; + + clhash->hash = qdisc_class_hash_alloc(size); + if (clhash->hash == NULL) + return -ENOMEM; + clhash->hashsize = size; + clhash->hashmask = size - 1; + clhash->hashelems = 0; + return 0; +} +EXPORT_SYMBOL(qdisc_class_hash_init); + +void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash) +{ + qdisc_class_hash_free(clhash->hash, clhash->hashsize); +} +EXPORT_SYMBOL(qdisc_class_hash_destroy); + +void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash, + struct Qdisc_class_common *cl) +{ + unsigned int h; + + INIT_HLIST_NODE(&cl->hnode); + h = qdisc_class_hash(cl->classid, clhash->hashmask); + hlist_add_head(&cl->hnode, &clhash->hash[h]); + clhash->hashelems++; +} +EXPORT_SYMBOL(qdisc_class_hash_insert); + +void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash, + struct Qdisc_class_common *cl) +{ + hlist_del(&cl->hnode); + clhash->hashelems--; +} +EXPORT_SYMBOL(qdisc_class_hash_remove); + +/* Allocate an unique handle from space managed by kernel + * Possible range is [8000-FFFF]:0000 (0x8000 values) + */ static u32 qdisc_alloc_handle(struct net_device *dev) { - int i = 0x10000; + int i = 0x8000; static u32 autohandle = TC_H_MAKE(0x80000000U, 0); do { autohandle += TC_H_MAKE(0x10000U, 0); if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) autohandle = TC_H_MAKE(0x80000000U, 0); - } while (qdisc_lookup(dev, autohandle) && --i > 0); + if (!qdisc_lookup(dev, autohandle)) + return autohandle; + cond_resched(); + } while (--i > 0); - return i>0 ? 
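The class-hash helpers above are consumed by classful qdiscs roughly as follows; this is a sketch with invented example_* names, patterned on schedulers that embed a Qdisc_class_common in each class structure.

#include <net/pkt_sched.h>

struct example_class {
	struct Qdisc_class_common common;	/* classid + hash linkage */
	/* per-class scheduling state would follow */
};

struct example_sched {
	struct Qdisc_class_hash clhash;
};

/* ->init: one hash per qdisc, starting with 4 buckets */
static int example_cl_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct example_sched *q = qdisc_priv(sch);

	return qdisc_class_hash_init(&q->clhash);
}

/* classid -> class lookup used by ->get, ->change, ... */
static struct example_class *example_find(struct Qdisc *sch, u32 classid)
{
	struct example_sched *q = qdisc_priv(sch);
	struct Qdisc_class_common *clc;

	clc = qdisc_class_find(&q->clhash, classid);
	return clc ? container_of(clc, struct example_class, common) : NULL;
}

/* Adding a class: insert under the tree lock, then let the table grow;
 * qdisc_class_hash_grow() rehashes once the load factor exceeds 3/4. */
static void example_add_class(struct Qdisc *sch, struct example_class *cl,
			      u32 classid)
{
	struct example_sched *q = qdisc_priv(sch);

	cl->common.classid = classid;

	sch_tree_lock(sch);
	qdisc_class_hash_insert(&q->clhash, &cl->common);
	sch_tree_unlock(sch);

	qdisc_class_hash_grow(sch, &q->clhash);
}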
autohandle : 0; + return 0; } -/* Attach toplevel qdisc to device dev */ - -static struct Qdisc * -dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc) -{ - struct Qdisc *oqdisc; - - if (dev->flags & IFF_UP) - dev_deactivate(dev); - - qdisc_lock_tree(dev); - if (qdisc && qdisc->flags&TCQ_F_INGRESS) { - oqdisc = dev->qdisc_ingress; - /* Prune old scheduler */ - if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) { - /* delete */ - qdisc_reset(oqdisc); - dev->qdisc_ingress = NULL; - } else { /* new */ - dev->qdisc_ingress = qdisc; - } - - } else { - - oqdisc = dev->qdisc_sleeping; +void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) +{ + const struct Qdisc_class_ops *cops; + unsigned long cl; + u32 parentid; + int drops; - /* Prune old scheduler */ - if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) - qdisc_reset(oqdisc); + if (n == 0) + return; + drops = max_t(int, n, 0); + while ((parentid = sch->parent)) { + if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS)) + return; - /* ... and graft new one */ - if (qdisc == NULL) - qdisc = &noop_qdisc; - dev->qdisc_sleeping = qdisc; - dev->qdisc = &noop_qdisc; + sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid)); + if (sch == NULL) { + WARN_ON(parentid != TC_H_ROOT); + return; + } + cops = sch->ops->cl_ops; + if (cops->qlen_notify) { + cl = cops->get(sch, parentid); + cops->qlen_notify(sch, cl); + cops->put(sch, cl); + } + sch->q.qlen -= n; + sch->qstats.drops += drops; } +} +EXPORT_SYMBOL(qdisc_tree_decrease_qlen); - qdisc_unlock_tree(dev); - - if (dev->flags & IFF_UP) - dev_activate(dev); +static void notify_and_destroy(struct net *net, struct sk_buff *skb, + struct nlmsghdr *n, u32 clid, + struct Qdisc *old, struct Qdisc *new) +{ + if (new || old) + qdisc_notify(net, skb, n, clid, old, new); - return oqdisc; + if (old) + qdisc_destroy(old); } - /* Graft qdisc "new" to class "classid" of qdisc "parent" or - to device "dev". - - Old qdisc is not destroyed but returned in *old. + * to device "dev". + * + * When appropriate send a netlink notification using 'skb' + * and "n". + * + * On success, destroy old qdisc. */ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, - u32 classid, - struct Qdisc *new, struct Qdisc **old) + struct sk_buff *skb, struct nlmsghdr *n, u32 classid, + struct Qdisc *new, struct Qdisc *old) { + struct Qdisc *q = old; + struct net *net = dev_net(dev); int err = 0; - struct Qdisc *q = *old; + if (parent == NULL) { + unsigned int i, num_q, ingress; + + ingress = 0; + num_q = dev->num_tx_queues; + if ((q && q->flags & TCQ_F_INGRESS) || + (new && new->flags & TCQ_F_INGRESS)) { + num_q = 1; + ingress = 1; + if (!dev_ingress_queue(dev)) + return -ENOENT; + } + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + if (new && new->ops->attach) { + new->ops->attach(new); + num_q = 0; + } + + for (i = 0; i < num_q; i++) { + struct netdev_queue *dev_queue = dev_ingress_queue(dev); + + if (!ingress) + dev_queue = netdev_get_tx_queue(dev, i); - if (parent == NULL) { - if (q && q->flags&TCQ_F_INGRESS) { - *old = dev_graft_qdisc(dev, q); + old = dev_graft_qdisc(dev_queue, new); + if (new && i > 0) + atomic_inc(&new->refcnt); + + if (!ingress) + qdisc_destroy(old); + } + + if (!ingress) { + notify_and_destroy(net, skb, n, classid, + dev->qdisc, new); + if (new && !new->ops->attach) + atomic_inc(&new->refcnt); + dev->qdisc = new ? 
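qdisc_tree_decrease_qlen() exists for cases where packets leave a qdisc outside the normal dequeue path, for instance when a ->change handler shrinks the limit and has to throw queued packets away. A condensed sketch of that pattern follows; the function name is invented and per-byte backlog accounting is omitted for brevity.

static void example_shrink_queue(struct Qdisc *sch, u32 new_limit)
{
	unsigned int dropped = 0;
	struct sk_buff *skb;

	sch_tree_lock(sch);
	while (sch->q.qlen > new_limit &&
	       (skb = __skb_dequeue(&sch->q)) != NULL) {
		kfree_skb(skb);
		dropped++;
	}
	/* Ancestors still count the dropped packets in their q.qlen;
	 * walk up the tree, fix the counters and fire qlen_notify(). */
	qdisc_tree_decrease_qlen(sch, dropped);
	sch_tree_unlock(sch);
}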
: &noop_qdisc; } else { - *old = dev_graft_qdisc(dev, new); + notify_and_destroy(net, skb, n, classid, old, new); } - } else { - struct Qdisc_class_ops *cops = parent->ops->cl_ops; - err = -EINVAL; + if (dev->flags & IFF_UP) + dev_activate(dev); + } else { + const struct Qdisc_class_ops *cops = parent->ops->cl_ops; - if (cops) { + err = -EOPNOTSUPP; + if (cops && cops->graft) { unsigned long cl = cops->get(parent, classid); if (cl) { - err = cops->graft(parent, cl, new, old); - if (new) - new->parent = classid; + err = cops->graft(parent, cl, new, &old); cops->put(parent, cl); - } + } else + err = -ENOENT; } + if (!err) + notify_and_destroy(net, skb, n, classid, old, new); } return err; } +/* lockdep annotation is needed for ingress; egress gets it only for name */ +static struct lock_class_key qdisc_tx_lock; +static struct lock_class_key qdisc_rx_lock; + /* Allocate and initialize new qdisc. @@ -395,18 +872,21 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, */ static struct Qdisc * -qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) +qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, + struct Qdisc *p, u32 parent, u32 handle, + struct nlattr **tca, int *errp) { int err; - struct rtattr *kind = tca[TCA_KIND-1]; + struct nlattr *kind = tca[TCA_KIND]; struct Qdisc *sch; struct Qdisc_ops *ops; + struct qdisc_size_table *stab; ops = qdisc_lookup_ops(kind); -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES if (ops == NULL && kind != NULL) { char name[IFNAMSIZ]; - if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { + if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { /* We dropped the RTNL semaphore in order to * perform the module load. So, even if we * succeeded in loading the module we have to @@ -431,49 +911,66 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) } #endif - err = -EINVAL; + err = -ENOENT; if (ops == NULL) goto err_out; - sch = qdisc_alloc(dev, ops); + sch = qdisc_alloc(dev_queue, ops); if (IS_ERR(sch)) { err = PTR_ERR(sch); goto err_out2; } + sch->parent = parent; + if (handle == TC_H_INGRESS) { sch->flags |= TCQ_F_INGRESS; handle = TC_H_MAKE(TC_H_INGRESS, 0); - } else if (handle == 0) { - handle = qdisc_alloc_handle(dev); - err = -ENOMEM; - if (handle == 0) - goto err_out3; + lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock); + } else { + if (handle == 0) { + handle = qdisc_alloc_handle(dev); + err = -ENOMEM; + if (handle == 0) + goto err_out3; + } + lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock); + if (!netif_is_multiqueue(dev)) + sch->flags |= TCQ_F_ONETXQUEUE; } sch->handle = handle; - if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { -#ifdef CONFIG_NET_ESTIMATOR - if (tca[TCA_RATE-1]) { - err = gen_new_estimator(&sch->bstats, &sch->rate_est, - sch->stats_lock, - tca[TCA_RATE-1]); - if (err) { - /* - * Any broken qdiscs that would require - * a ops->reset() here? The qdisc was never - * in action so it shouldn't be necessary. 
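The rtattr-to-nlattr conversion visible throughout this file also changes what individual qdiscs do in their ->init/->change handlers: TCA_OPTIONS is now parsed with nla_parse_nested() against a policy. A hypothetical example follows; the attribute names and the private limit field are made up.

enum {
	TCA_EXAMPLE_UNSPEC,
	TCA_EXAMPLE_LIMIT,
	__TCA_EXAMPLE_MAX
};
#define TCA_EXAMPLE_MAX (__TCA_EXAMPLE_MAX - 1)

static const struct nla_policy example_policy[TCA_EXAMPLE_MAX + 1] = {
	[TCA_EXAMPLE_LIMIT] = { .type = NLA_U32 },
};

struct example_priv {
	u32 limit;
};

static int example_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct example_priv *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_EXAMPLE_MAX + 1];
	int err;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_EXAMPLE_MAX, opt, example_policy);
	if (err < 0)
		return err;

	if (tb[TCA_EXAMPLE_LIMIT])
		q->limit = nla_get_u32(tb[TCA_EXAMPLE_LIMIT]);
	return 0;
}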
- */ - if (ops->destroy) - ops->destroy(sch); - goto err_out3; + if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) { + if (tca[TCA_STAB]) { + stab = qdisc_get_stab(tca[TCA_STAB]); + if (IS_ERR(stab)) { + err = PTR_ERR(stab); + goto err_out4; } + rcu_assign_pointer(sch->stab, stab); } -#endif - qdisc_lock_tree(dev); - list_add_tail(&sch->list, &dev->qdisc_list); - qdisc_unlock_tree(dev); + if (tca[TCA_RATE]) { + spinlock_t *root_lock; + + err = -EOPNOTSUPP; + if (sch->flags & TCQ_F_MQROOT) + goto err_out4; + + if ((sch->parent != TC_H_ROOT) && + !(sch->flags & TCQ_F_INGRESS) && + (!p || !(p->flags & TCQ_F_MQROOT))) + root_lock = qdisc_root_sleeping_lock(sch); + else + root_lock = qdisc_lock(sch); + + err = gen_new_estimator(&sch->bstats, &sch->rate_est, + root_lock, tca[TCA_RATE]); + if (err) + goto err_out4; + } + + qdisc_list_add(sch); return sch; } @@ -485,30 +982,56 @@ err_out2: err_out: *errp = err; return NULL; + +err_out4: + /* + * Any broken qdiscs that would require a ops->reset() here? + * The qdisc was never in action so it shouldn't be necessary. + */ + qdisc_put_stab(rtnl_dereference(sch->stab)); + if (ops->destroy) + ops->destroy(sch); + goto err_out3; } -static int qdisc_change(struct Qdisc *sch, struct rtattr **tca) +static int qdisc_change(struct Qdisc *sch, struct nlattr **tca) { - if (tca[TCA_OPTIONS-1]) { - int err; + struct qdisc_size_table *ostab, *stab = NULL; + int err = 0; + if (tca[TCA_OPTIONS]) { if (sch->ops->change == NULL) return -EINVAL; - err = sch->ops->change(sch, tca[TCA_OPTIONS-1]); + err = sch->ops->change(sch, tca[TCA_OPTIONS]); if (err) return err; } -#ifdef CONFIG_NET_ESTIMATOR - if (tca[TCA_RATE-1]) + + if (tca[TCA_STAB]) { + stab = qdisc_get_stab(tca[TCA_STAB]); + if (IS_ERR(stab)) + return PTR_ERR(stab); + } + + ostab = rtnl_dereference(sch->stab); + rcu_assign_pointer(sch->stab, stab); + qdisc_put_stab(ostab); + + if (tca[TCA_RATE]) { + /* NB: ignores errors from replace_estimator + because change can't be undone. */ + if (sch->flags & TCQ_F_MQROOT) + goto out; gen_replace_estimator(&sch->bstats, &sch->rate_est, - sch->stats_lock, tca[TCA_RATE-1]); -#endif + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + } +out: return 0; } -struct check_loop_arg -{ - struct qdisc_walker w; +struct check_loop_arg { + struct qdisc_walker w; struct Qdisc *p; int depth; }; @@ -534,7 +1057,7 @@ static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) { struct Qdisc *leaf; - struct Qdisc_class_ops *cops = q->ops->cl_ops; + const struct Qdisc_class_ops *cops = q->ops->cl_ops; struct check_loop_arg *arg = (struct check_loop_arg *)w; leaf = cops->leaf(q, cl); @@ -550,30 +1073,42 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) * Delete/get qdisc. 
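For reference, the TCA_RATE handling above condenses to the following pattern, which classful qdiscs repeat per class for their own bstats/rate_est pairs. The function names are invented; the gen_*_estimator() calls mirror the ones made by qdisc_create() and qdisc_change().

static int example_attach_estimator(struct Qdisc *sch, struct nlattr **tca)
{
	if (!tca[TCA_RATE])
		return 0;
	return gen_new_estimator(&sch->bstats, &sch->rate_est,
				 qdisc_root_sleeping_lock(sch),
				 tca[TCA_RATE]);
}

static void example_replace_estimator(struct Qdisc *sch, struct nlattr **tca)
{
	if (tca[TCA_RATE])
		/* errors deliberately ignored: the change cannot be undone */
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
				      tca[TCA_RATE]);
}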
*/ -static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n) { - struct tcmsg *tcm = NLMSG_DATA(n); - struct rtattr **tca = arg; + struct net *net = sock_net(skb->sk); + struct tcmsg *tcm = nlmsg_data(n); + struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; - u32 clid = tcm->tcm_parent; + u32 clid; struct Qdisc *q = NULL; struct Qdisc *p = NULL; int err; - if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) + if ((n->nlmsg_type != RTM_GETQDISC) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); + if (err < 0) + return err; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) return -ENODEV; + clid = tcm->tcm_parent; if (clid) { if (clid != TC_H_ROOT) { if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { - if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) + p = qdisc_lookup(dev, TC_H_MAJ(clid)); + if (!p) return -ENOENT; q = qdisc_leaf(p, clid); - } else { /* ingress */ - q = dev->qdisc_ingress; - } + } else if (dev_ingress_queue(dev)) { + q = dev_ingress_queue(dev)->qdisc_sleeping; + } } else { - q = dev->qdisc_sleeping; + q = dev->qdisc; } if (!q) return -ENOENT; @@ -581,11 +1116,12 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) if (tcm->tcm_handle && q->handle != tcm->tcm_handle) return -EINVAL; } else { - if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) + q = qdisc_lookup(dev, tcm->tcm_handle); + if (!q) return -ENOENT; } - if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) return -EINVAL; if (n->nlmsg_type == RTM_DELQDISC) { @@ -593,54 +1129,59 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) return -EINVAL; if (q->handle == 0) return -ENOENT; - if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0) + err = qdisc_graft(dev, p, skb, n, clid, NULL, q); + if (err != 0) return err; - if (q) { - qdisc_notify(skb, n, clid, q, NULL); - spin_lock_bh(&dev->queue_lock); - qdisc_destroy(q); - spin_unlock_bh(&dev->queue_lock); - } } else { - qdisc_notify(skb, n, clid, NULL, q); + qdisc_notify(net, skb, n, clid, NULL, q); } return 0; } /* - Create/change qdisc. + * Create/change qdisc. */ -static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n) { + struct net *net = sock_net(skb->sk); struct tcmsg *tcm; - struct rtattr **tca; + struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; u32 clid; struct Qdisc *q, *p; int err; + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + replay: /* Reinit, just in case something touches this. 
*/ - tcm = NLMSG_DATA(n); - tca = arg; + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); + if (err < 0) + return err; + + tcm = nlmsg_data(n); clid = tcm->tcm_parent; q = p = NULL; - if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) return -ENODEV; + if (clid) { if (clid != TC_H_ROOT) { if (clid != TC_H_INGRESS) { - if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) + p = qdisc_lookup(dev, TC_H_MAJ(clid)); + if (!p) return -ENOENT; q = qdisc_leaf(p, clid); - } else { /*ingress */ - q = dev->qdisc_ingress; + } else if (dev_ingress_queue_create(dev)) { + q = dev_ingress_queue(dev)->qdisc_sleeping; } } else { - q = dev->qdisc_sleeping; + q = dev->qdisc; } /* It may be default qdisc, ignore it */ @@ -649,15 +1190,16 @@ replay: if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) { if (tcm->tcm_handle) { - if (q && !(n->nlmsg_flags&NLM_F_REPLACE)) + if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) return -EEXIST; if (TC_H_MIN(tcm->tcm_handle)) return -EINVAL; - if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) + q = qdisc_lookup(dev, tcm->tcm_handle); + if (!q) goto create_n_graft; - if (n->nlmsg_flags&NLM_F_EXCL) + if (n->nlmsg_flags & NLM_F_EXCL) return -EEXIST; - if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) return -EINVAL; if (q == p || (p && check_loop(q, p, 0))) @@ -665,7 +1207,7 @@ replay: atomic_inc(&q->refcnt); goto graft; } else { - if (q == NULL) + if (!q) goto create_n_graft; /* This magic test requires explanation. @@ -687,11 +1229,11 @@ replay: * For now we select create/graft, if * user gave KIND, which does not match existing. */ - if ((n->nlmsg_flags&NLM_F_CREATE) && - (n->nlmsg_flags&NLM_F_REPLACE) && - ((n->nlmsg_flags&NLM_F_EXCL) || - (tca[TCA_KIND-1] && - rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)))) + if ((n->nlmsg_flags & NLM_F_CREATE) && + (n->nlmsg_flags & NLM_F_REPLACE) && + ((n->nlmsg_flags & NLM_F_EXCL) || + (tca[TCA_KIND] && + nla_strcmp(tca[TCA_KIND], q->ops->id)))) goto create_n_graft; } } @@ -704,22 +1246,39 @@ replay: /* Change qdisc parameters */ if (q == NULL) return -ENOENT; - if (n->nlmsg_flags&NLM_F_EXCL) + if (n->nlmsg_flags & NLM_F_EXCL) return -EEXIST; - if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) return -EINVAL; err = qdisc_change(q, tca); if (err == 0) - qdisc_notify(skb, n, clid, NULL, q); + qdisc_notify(net, skb, n, clid, NULL, q); return err; create_n_graft: - if (!(n->nlmsg_flags&NLM_F_CREATE)) + if (!(n->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; - if (clid == TC_H_INGRESS) - q = qdisc_create(dev, tcm->tcm_parent, tca, &err); - else - q = qdisc_create(dev, tcm->tcm_handle, tca, &err); + if (clid == TC_H_INGRESS) { + if (dev_ingress_queue(dev)) + q = qdisc_create(dev, dev_ingress_queue(dev), p, + tcm->tcm_parent, tcm->tcm_parent, + tca, &err); + else + err = -ENOENT; + } else { + struct netdev_queue *dev_queue; + + if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue) + dev_queue = p->ops->cl_ops->select_queue(p, tcm); + else if (p) + dev_queue = p->dev_queue; + else + dev_queue = netdev_get_tx_queue(dev, 0); + + q = qdisc_create(dev, dev_queue, p, + tcm->tcm_parent, tcm->tcm_handle, + tca, &err); + } if (q == NULL) { if (err == -EAGAIN) goto replay; @@ -727,137 +1286,182 @@ create_n_graft: } graft: - if (1) { - struct Qdisc *old_q = NULL; - err = qdisc_graft(dev, p, clid, q, &old_q); - if 
(err) { - if (q) { - spin_lock_bh(&dev->queue_lock); - qdisc_destroy(q); - spin_unlock_bh(&dev->queue_lock); - } - return err; - } - qdisc_notify(skb, n, clid, old_q, q); - if (old_q) { - spin_lock_bh(&dev->queue_lock); - qdisc_destroy(old_q); - spin_unlock_bh(&dev->queue_lock); - } + err = qdisc_graft(dev, p, skb, n, clid, q, NULL); + if (err) { + if (q) + qdisc_destroy(q); + return err; } + return 0; } static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, - u32 pid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; + struct qdisc_size_table *stab; - nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); - tcm = NLMSG_DATA(nlh); + cond_resched(); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); + if (!nlh) + goto out_nlmsg_trim; + tcm = nlmsg_data(nlh); tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; - tcm->tcm_ifindex = q->dev->ifindex; + tcm->tcm_ifindex = qdisc_dev(q)->ifindex; tcm->tcm_parent = clid; tcm->tcm_handle = q->handle; tcm->tcm_info = atomic_read(&q->refcnt); - RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); + if (nla_put_string(skb, TCA_KIND, q->ops->id)) + goto nla_put_failure; if (q->ops->dump && q->ops->dump(q, skb) < 0) - goto rtattr_failure; + goto nla_put_failure; q->qstats.qlen = q->q.qlen; - if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, - TCA_XSTATS, q->stats_lock, &d) < 0) - goto rtattr_failure; + stab = rtnl_dereference(q->stab); + if (stab && qdisc_dump_stab(skb, stab) < 0) + goto nla_put_failure; + + if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, + qdisc_root_sleeping_lock(q), &d) < 0) + goto nla_put_failure; if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) - goto rtattr_failure; + goto nla_put_failure; if (gnet_stats_copy_basic(&d, &q->bstats) < 0 || -#ifdef CONFIG_NET_ESTIMATOR - gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || -#endif + gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 || gnet_stats_copy_queue(&d, &q->qstats) < 0) - goto rtattr_failure; - + goto nla_put_failure; + if (gnet_stats_finish_copy(&d) < 0) - goto rtattr_failure; - - nlh->nlmsg_len = skb->tail - b; + goto nla_put_failure; + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; -nlmsg_failure: -rtattr_failure: - skb_trim(skb, b - skb->data); +out_nlmsg_trim: +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, - u32 clid, struct Qdisc *old, struct Qdisc *new) +static bool tc_qdisc_dump_ignore(struct Qdisc *q) +{ + return (q->flags & TCQ_F_BUILTIN) ? true : false; +} + +static int qdisc_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, u32 clid, + struct Qdisc *old, struct Qdisc *new) { struct sk_buff *skb; - u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (old && old->handle) { - if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) + if (old && !tc_qdisc_dump_ignore(old)) { + if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, + 0, RTM_DELQDISC) < 0) goto err_out; } - if (new) { - if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? 
NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) + if (new && !tc_qdisc_dump_ignore(new)) { + if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, + old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) goto err_out; } if (skb->len) - return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); err_out: kfree_skb(skb); return -EINVAL; } +static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, + struct netlink_callback *cb, + int *q_idx_p, int s_q_idx) +{ + int ret = 0, q_idx = *q_idx_p; + struct Qdisc *q; + + if (!root) + return 0; + + q = root; + if (q_idx < s_q_idx) { + q_idx++; + } else { + if (!tc_qdisc_dump_ignore(q) && + tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) + goto done; + q_idx++; + } + list_for_each_entry(q, &root->list, list) { + if (q_idx < s_q_idx) { + q_idx++; + continue; + } + if (!tc_qdisc_dump_ignore(q) && + tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) + goto done; + q_idx++; + } + +out: + *q_idx_p = q_idx; + return ret; +done: + ret = -1; + goto out; +} + static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); int idx, q_idx; int s_idx, s_q_idx; struct net_device *dev; - struct Qdisc *q; s_idx = cb->args[0]; s_q_idx = q_idx = cb->args[1]; - read_lock(&dev_base_lock); - for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + + idx = 0; + ASSERT_RTNL(); + for_each_netdev(net, dev) { + struct netdev_queue *dev_queue; + if (idx < s_idx) - continue; + goto cont; if (idx > s_idx) s_q_idx = 0; - read_lock_bh(&qdisc_tree_lock); q_idx = 0; - list_for_each_entry(q, &dev->qdisc_list, list) { - if (q_idx < s_q_idx) { - q_idx++; - continue; - } - if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) { - read_unlock_bh(&qdisc_tree_lock); - goto done; - } - q_idx++; - } - read_unlock_bh(&qdisc_tree_lock); + + if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0) + goto done; + + dev_queue = dev_ingress_queue(dev); + if (dev_queue && + tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, + &q_idx, s_q_idx) < 0) + goto done; + +cont: + idx++; } done: - read_unlock(&dev_base_lock); - cb->args[0] = idx; cb->args[1] = q_idx; @@ -872,21 +1476,31 @@ done: -static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n) { - struct tcmsg *tcm = NLMSG_DATA(n); - struct rtattr **tca = arg; + struct net *net = sock_net(skb->sk); + struct tcmsg *tcm = nlmsg_data(n); + struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; struct Qdisc *q = NULL; - struct Qdisc_class_ops *cops; + const struct Qdisc_class_ops *cops; unsigned long cl = 0; unsigned long new_cl; - u32 pid = tcm->tcm_parent; - u32 clid = tcm->tcm_handle; - u32 qid = TC_H_MAJ(clid); + u32 portid; + u32 clid; + u32 qid; int err; - if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) + if ((n->nlmsg_type != RTM_GETTCLASS) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); + if (err < 0) + return err; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) return -ENODEV; /* @@ -904,8 +1518,12 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) /* Step 1. 
Determine qdisc handle X:0 */ - if (pid != TC_H_ROOT) { - u32 qid1 = TC_H_MAJ(pid); + portid = tcm->tcm_parent; + clid = tcm->tcm_handle; + qid = TC_H_MAJ(clid); + + if (portid != TC_H_ROOT) { + u32 qid1 = TC_H_MAJ(portid); if (qid && qid1) { /* If both majors are known, they must be identical. */ @@ -914,22 +1532,23 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) } else if (qid1) { qid = qid1; } else if (qid == 0) - qid = dev->qdisc_sleeping->handle; + qid = dev->qdisc->handle; /* Now qid is genuine qdisc handle consistent - both with parent and child. - - TC_H_MAJ(pid) still may be unspecified, complete it now. + * both with parent and child. + * + * TC_H_MAJ(portid) still may be unspecified, complete it now. */ - if (pid) - pid = TC_H_MAKE(qid, pid); + if (portid) + portid = TC_H_MAKE(qid, portid); } else { if (qid == 0) - qid = dev->qdisc_sleeping->handle; + qid = dev->qdisc->handle; } /* OK. Locate qdisc */ - if ((q = qdisc_lookup(dev, qid)) == NULL) + q = qdisc_lookup(dev, qid); + if (!q) return -ENOENT; /* An check that it supports classes */ @@ -939,7 +1558,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) /* Now try to get class */ if (clid == 0) { - if (pid == TC_H_ROOT) + if (portid == TC_H_ROOT) clid = qid; } else clid = TC_H_MAKE(qid, clid); @@ -949,22 +1568,25 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) if (cl == 0) { err = -ENOENT; - if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE)) + if (n->nlmsg_type != RTM_NEWTCLASS || + !(n->nlmsg_flags & NLM_F_CREATE)) goto out; } else { switch (n->nlmsg_type) { - case RTM_NEWTCLASS: + case RTM_NEWTCLASS: err = -EEXIST; - if (n->nlmsg_flags&NLM_F_EXCL) + if (n->nlmsg_flags & NLM_F_EXCL) goto out; break; case RTM_DELTCLASS: - err = cops->delete(q, cl); + err = -EOPNOTSUPP; + if (cops->delete) + err = cops->delete(q, cl); if (err == 0) - tclass_notify(skb, n, q, cl, RTM_DELTCLASS); + tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS); goto out; case RTM_GETTCLASS: - err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS); + err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS); goto out; default: err = -EINVAL; @@ -973,9 +1595,11 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) } new_cl = cl; - err = cops->change(q, clid, pid, tca, &new_cl); + err = -EOPNOTSUPP; + if (cops->change) + err = cops->change(q, clid, portid, tca, &new_cl); if (err == 0) - tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS); + tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS); out: if (cl) @@ -987,118 +1611,160 @@ out: static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, unsigned long cl, - u32 pid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; - struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; + const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; - nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); - tcm = NLMSG_DATA(nlh); + cond_resched(); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); + if (!nlh) + goto out_nlmsg_trim; + tcm = nlmsg_data(nlh); tcm->tcm_family = AF_UNSPEC; - tcm->tcm_ifindex = q->dev->ifindex; + tcm->tcm__pad1 = 0; + tcm->tcm__pad2 = 0; + tcm->tcm_ifindex = qdisc_dev(q)->ifindex; tcm->tcm_parent = q->handle; tcm->tcm_handle = q->handle; tcm->tcm_info = 0; - RTA_PUT(skb, TCA_KIND, IFNAMSIZ, 
q->ops->id); + if (nla_put_string(skb, TCA_KIND, q->ops->id)) + goto nla_put_failure; if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0) - goto rtattr_failure; + goto nla_put_failure; - if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, - TCA_XSTATS, q->stats_lock, &d) < 0) - goto rtattr_failure; + if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, + qdisc_root_sleeping_lock(q), &d) < 0) + goto nla_put_failure; if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) - goto rtattr_failure; + goto nla_put_failure; if (gnet_stats_finish_copy(&d) < 0) - goto rtattr_failure; + goto nla_put_failure; - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; -nlmsg_failure: -rtattr_failure: - skb_trim(skb, b - skb->data); +out_nlmsg_trim: +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, - struct Qdisc *q, unsigned long cl, int event) +static int tclass_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct Qdisc *q, + unsigned long cl, int event) { struct sk_buff *skb; - u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) { + if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) { kfree_skb(skb); return -EINVAL; } - return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); } -struct qdisc_dump_args -{ - struct qdisc_walker w; - struct sk_buff *skb; - struct netlink_callback *cb; +struct qdisc_dump_args { + struct qdisc_walker w; + struct sk_buff *skb; + struct netlink_callback *cb; }; static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) { struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; - return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid, + return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); } +static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, + struct tcmsg *tcm, struct netlink_callback *cb, + int *t_p, int s_t) +{ + struct qdisc_dump_args arg; + + if (tc_qdisc_dump_ignore(q) || + *t_p < s_t || !q->ops->cl_ops || + (tcm->tcm_parent && + TC_H_MAJ(tcm->tcm_parent) != q->handle)) { + (*t_p)++; + return 0; + } + if (*t_p > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + arg.w.fn = qdisc_class_dump; + arg.skb = skb; + arg.cb = cb; + arg.w.stop = 0; + arg.w.skip = cb->args[1]; + arg.w.count = 0; + q->ops->cl_ops->walk(q, &arg.w); + cb->args[1] = arg.w.count; + if (arg.w.stop) + return -1; + (*t_p)++; + return 0; +} + +static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, + struct tcmsg *tcm, struct netlink_callback *cb, + int *t_p, int s_t) +{ + struct Qdisc *q; + + if (!root) + return 0; + + if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0) + return -1; + + list_for_each_entry(q, &root->list, list) { + if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) + return -1; + } + + return 0; +} + static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) { - int t; - int s_t; + struct tcmsg *tcm = nlmsg_data(cb->nlh); + struct net *net = sock_net(skb->sk); + struct netdev_queue *dev_queue; struct net_device *dev; - struct Qdisc *q; - 
struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); - struct qdisc_dump_args arg; + int t, s_t; - if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return 0; - if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + dev = dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) return 0; s_t = cb->args[0]; t = 0; - read_lock_bh(&qdisc_tree_lock); - list_for_each_entry(q, &dev->qdisc_list, list) { - if (t < s_t || !q->ops->cl_ops || - (tcm->tcm_parent && - TC_H_MAJ(tcm->tcm_parent) != q->handle)) { - t++; - continue; - } - if (t > s_t) - memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); - arg.w.fn = qdisc_class_dump; - arg.skb = skb; - arg.cb = cb; - arg.w.stop = 0; - arg.w.skip = cb->args[1]; - arg.w.count = 0; - q->ops->cl_ops->walk(q, &arg.w); - cb->args[1] = arg.w.count; - if (arg.w.stop) - break; - t++; - } - read_unlock_bh(&qdisc_tree_lock); + if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0) + goto done; + + dev_queue = dev_ingress_queue(dev); + if (dev_queue && + tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, + &t, s_t) < 0) + goto done; +done: cb->args[0] = t; dev_put(dev); @@ -1106,181 +1772,164 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) } /* Main classifier routine: scans classifier chain attached - to this qdisc, (optionally) tests for protocol and asks - specific classifiers. + * to this qdisc, (optionally) tests for protocol and asks + * specific classifiers. */ -int tc_classify(struct sk_buff *skb, struct tcf_proto *tp, - struct tcf_result *res) +int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + __be16 protocol = skb->protocol; + int err; + + for (; tp; tp = tp->next) { + if (tp->protocol != protocol && + tp->protocol != htons(ETH_P_ALL)) + continue; + err = tp->classify(skb, tp, res); + + if (err >= 0) { +#ifdef CONFIG_NET_CLS_ACT + if (err != TC_ACT_RECLASSIFY && skb->tc_verd) + skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0); +#endif + return err; + } + } + return -1; +} +EXPORT_SYMBOL(tc_classify_compat); + +int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) { int err = 0; - u32 protocol = skb->protocol; #ifdef CONFIG_NET_CLS_ACT - struct tcf_proto *otp = tp; + const struct tcf_proto *otp = tp; reclassify: #endif - protocol = skb->protocol; - for ( ; tp; tp = tp->next) { - if ((tp->protocol == protocol || - tp->protocol == __constant_htons(ETH_P_ALL)) && - (err = tp->classify(skb, tp, res)) >= 0) { + err = tc_classify_compat(skb, tp, res); #ifdef CONFIG_NET_CLS_ACT - if ( TC_ACT_RECLASSIFY == err) { - __u32 verd = (__u32) G_TC_VERD(skb->tc_verd); - tp = otp; - - if (MAX_REC_LOOP < verd++) { - printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n", - tp->prio&0xffff, ntohs(tp->protocol)); - return TC_ACT_SHOT; - } - skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd); - goto reclassify; - } else { - if (skb->tc_verd) - skb->tc_verd = SET_TC_VERD(skb->tc_verd,0); - return err; - } -#else - - return err; -#endif + if (err == TC_ACT_RECLASSIFY) { + u32 verd = G_TC_VERD(skb->tc_verd); + tp = otp; + + if (verd++ >= MAX_REC_LOOP) { + net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n", + tp->q->ops->id, + tp->prio & 0xffff, + ntohs(tp->protocol)); + return TC_ACT_SHOT; } - + skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd); + goto reclassify; } - return -1; +#endif + return err; +} +EXPORT_SYMBOL(tc_classify); + +void 
tcf_destroy(struct tcf_proto *tp) +{ + tp->ops->destroy(tp); + module_put(tp->ops->owner); + kfree(tp); } -static int psched_us_per_tick = 1; -static int psched_tick_per_us = 1; +void tcf_destroy_chain(struct tcf_proto **fl) +{ + struct tcf_proto *tp; + + while ((tp = *fl) != NULL) { + *fl = tp->next; + tcf_destroy(tp); + } +} +EXPORT_SYMBOL(tcf_destroy_chain); #ifdef CONFIG_PROC_FS static int psched_show(struct seq_file *seq, void *v) { + struct timespec ts; + + hrtimer_get_res(CLOCK_MONOTONIC, &ts); seq_printf(seq, "%08x %08x %08x %08x\n", - psched_tick_per_us, psched_us_per_tick, - 1000000, HZ); + (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1), + 1000000, + (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts))); return 0; } static int psched_open(struct inode *inode, struct file *file) { - return single_open(file, psched_show, PDE(inode)->data); + return single_open(file, psched_show, NULL); } -static struct file_operations psched_fops = { +static const struct file_operations psched_fops = { .owner = THIS_MODULE, .open = psched_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, -}; -#endif +}; -#ifdef CONFIG_NET_SCH_CLK_CPU -psched_tdiff_t psched_clock_per_hz; -int psched_clock_scale; -EXPORT_SYMBOL(psched_clock_per_hz); -EXPORT_SYMBOL(psched_clock_scale); +static int __net_init psched_net_init(struct net *net) +{ + struct proc_dir_entry *e; -psched_time_t psched_time_base; -cycles_t psched_time_mark; -EXPORT_SYMBOL(psched_time_mark); -EXPORT_SYMBOL(psched_time_base); + e = proc_create("psched", 0, net->proc_net, &psched_fops); + if (e == NULL) + return -ENOMEM; -/* - * Periodically adjust psched_time_base to avoid overflow - * with 32-bit get_cycles(). Safe up to 4GHz CPU. - */ -static void psched_tick(unsigned long); -static DEFINE_TIMER(psched_timer, psched_tick, 0, 0); + return 0; +} -static void psched_tick(unsigned long dummy) +static void __net_exit psched_net_exit(struct net *net) { - if (sizeof(cycles_t) == sizeof(u32)) { - psched_time_t dummy_stamp; - PSCHED_GET_TIME(dummy_stamp); - psched_timer.expires = jiffies + 1*HZ; - add_timer(&psched_timer); - } + remove_proc_entry("psched", net->proc_net); } - -int __init psched_calibrate_clock(void) +#else +static int __net_init psched_net_init(struct net *net) { - psched_time_t stamp, stamp1; - struct timeval tv, tv1; - psched_tdiff_t delay; - long rdelay; - unsigned long stop; - - psched_tick(0); - stop = jiffies + HZ/10; - PSCHED_GET_TIME(stamp); - do_gettimeofday(&tv); - while (time_before(jiffies, stop)) { - barrier(); - cpu_relax(); - } - PSCHED_GET_TIME(stamp1); - do_gettimeofday(&tv1); - - delay = PSCHED_TDIFF(stamp1, stamp); - rdelay = tv1.tv_usec - tv.tv_usec; - rdelay += (tv1.tv_sec - tv.tv_sec)*1000000; - if (rdelay > delay) - return -1; - delay /= rdelay; - psched_tick_per_us = delay; - while ((delay>>=1) != 0) - psched_clock_scale++; - psched_us_per_tick = 1<<psched_clock_scale; - psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale; return 0; } -#endif -static int __init pktsched_init(void) +static void __net_exit psched_net_exit(struct net *net) { - struct rtnetlink_link *link_p; - -#ifdef CONFIG_NET_SCH_CLK_CPU - if (psched_calibrate_clock() < 0) - return -1; -#elif defined(CONFIG_NET_SCH_CLK_JIFFIES) - psched_tick_per_us = HZ<<PSCHED_JSCALE; - psched_us_per_tick = 1000000; +} #endif - link_p = rtnetlink_links[PF_UNSPEC]; +static struct pernet_operations psched_net_ops = { + .init = psched_net_init, + .exit = psched_net_exit, +}; - /* Setup rtnetlink links. 
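A sketch of how a classful qdisc's enqueue path would use tc_classify() shown above (which wraps tc_classify_compat() and bounds TC_ACT_RECLASSIFY loops at MAX_REC_LOOP). The example_* names and the example_find() lookup are assumptions; the result handling mirrors what existing classful schedulers do.

struct example_sched {
	struct tcf_proto *filter_list;	/* head of the attached classifiers */
	/* other scheduler state */
};

static struct example_class *example_classify(struct sk_buff *skb,
					      struct Qdisc *sch, int *qerr)
{
	struct example_sched *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result < 0)
		return NULL;		/* no filter matched */
#ifdef CONFIG_NET_CLS_ACT
	switch (result) {
	case TC_ACT_QUEUED:
	case TC_ACT_STOLEN:
		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		/* fall through */
	case TC_ACT_SHOT:
		return NULL;		/* packet consumed or dropped */
	}
#endif
	return example_find(sch, res.classid);	/* hypothetical classid lookup */
}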
It is made here to avoid - exporting large number of public symbols. - */ +static int __init pktsched_init(void) +{ + int err; - if (link_p) { - link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc; - link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc; - link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc; - link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc; - link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass; - link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass; - link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass; - link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass; + err = register_pernet_subsys(&psched_net_ops); + if (err) { + pr_err("pktsched_init: " + "cannot initialize per netns operations\n"); + return err; } + register_qdisc(&pfifo_fast_ops); register_qdisc(&pfifo_qdisc_ops); register_qdisc(&bfifo_qdisc_ops); - proc_net_fops_create("psched", 0, &psched_fops); + register_qdisc(&pfifo_head_drop_qdisc_ops); + register_qdisc(&mq_qdisc_ops); + + rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL); + rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL); return 0; } subsys_initcall(pktsched_init); - -EXPORT_SYMBOL(qdisc_lookup); -EXPORT_SYMBOL(qdisc_get_rtab); -EXPORT_SYMBOL(qdisc_put_rtab); -EXPORT_SYMBOL(register_qdisc); -EXPORT_SYMBOL(unregister_qdisc); -EXPORT_SYMBOL(tc_classify); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 93ebce40aca..8449b337f9e 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -2,37 +2,19 @@ /* Written 1998-2000 by Werner Almesberger, EPFL ICA */ - -#include <linux/config.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/init.h> +#include <linux/interrupt.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> -#include <linux/interrupt.h> #include <linux/atmdev.h> #include <linux/atmclip.h> -#include <linux/netdevice.h> #include <linux/rtnetlink.h> -#include <linux/file.h> /* for fput */ +#include <linux/file.h> /* for fput */ +#include <net/netlink.h> #include <net/pkt_sched.h> -#include <net/sock.h> - - -extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */ - -#if 0 /* control */ -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif - -#if 0 /* data */ -#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define D2PRINTK(format,args...) -#endif - /* * The ATM queuing discipline provides a framework for invoking classifiers @@ -55,24 +37,21 @@ extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */ * - should lock the flow while there is data in the queue (?) */ - -#define PRIV(sch) qdisc_priv(sch) #define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back)) - struct atm_flow_data { - struct Qdisc *q; /* FIFO, TBF, etc. */ + struct Qdisc *q; /* FIFO, TBF, etc. 
*/ struct tcf_proto *filter_list; - struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ - void (*old_pop)(struct atm_vcc *vcc,struct sk_buff *skb); /* chaining */ + struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ + void (*old_pop)(struct atm_vcc *vcc, + struct sk_buff *skb); /* chaining */ struct atm_qdisc_data *parent; /* parent qdisc */ struct socket *sock; /* for closing */ u32 classid; /* x:y type ID */ int ref; /* reference count */ - struct gnet_stats_basic bstats; + struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - spinlock_t *stats_lock; - struct atm_flow_data *next; + struct list_head list; struct atm_flow_data *excess; /* flow for excess traffic; NULL to set CLP instead */ int hdr_len; @@ -81,167 +60,142 @@ struct atm_flow_data { struct atm_qdisc_data { struct atm_flow_data link; /* unclassified skbs go here */ - struct atm_flow_data *flows; /* NB: "link" is also on this + struct list_head flows; /* NB: "link" is also on this list */ - struct tasklet_struct task; /* requeue tasklet */ + struct tasklet_struct task; /* dequeue tasklet */ }; - /* ------------------------- Class/flow operations ------------------------- */ - -static int find_flow(struct atm_qdisc_data *qdisc,struct atm_flow_data *flow) +static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid) { - struct atm_flow_data *walk; - - DPRINTK("find_flow(qdisc %p,flow %p)\n",qdisc,flow); - for (walk = qdisc->flows; walk; walk = walk->next) - if (walk == flow) return 1; - DPRINTK("find_flow: not found\n"); - return 0; -} - - -static __inline__ struct atm_flow_data *lookup_flow(struct Qdisc *sch, - u32 classid) -{ - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; - for (flow = p->flows; flow; flow = flow->next) - if (flow->classid == classid) break; - return flow; + list_for_each_entry(flow, &p->flows, list) { + if (flow->classid == classid) + return flow; + } + return NULL; } - -static int atm_tc_graft(struct Qdisc *sch,unsigned long arg, - struct Qdisc *new,struct Qdisc **old) +static int atm_tc_graft(struct Qdisc *sch, unsigned long arg, + struct Qdisc *new, struct Qdisc **old) { - struct atm_qdisc_data *p = PRIV(sch); - struct atm_flow_data *flow = (struct atm_flow_data *) arg; - - DPRINTK("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",sch, - p,flow,new,old); - if (!find_flow(p,flow)) return -EINVAL; - if (!new) new = &noop_qdisc; - *old = xchg(&flow->q,new); - if (*old) qdisc_reset(*old); - return 0; -} + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow = (struct atm_flow_data *)arg; + pr_debug("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n", + sch, p, flow, new, old); + if (list_empty(&flow->list)) + return -EINVAL; + if (!new) + new = &noop_qdisc; + *old = flow->q; + flow->q = new; + if (*old) + qdisc_reset(*old); + return 0; +} -static struct Qdisc *atm_tc_leaf(struct Qdisc *sch,unsigned long cl) +static struct Qdisc *atm_tc_leaf(struct Qdisc *sch, unsigned long cl) { - struct atm_flow_data *flow = (struct atm_flow_data *) cl; + struct atm_flow_data *flow = (struct atm_flow_data *)cl; - DPRINTK("atm_tc_leaf(sch %p,flow %p)\n",sch,flow); + pr_debug("atm_tc_leaf(sch %p,flow %p)\n", sch, flow); return flow ? 
flow->q : NULL; } - -static unsigned long atm_tc_get(struct Qdisc *sch,u32 classid) +static unsigned long atm_tc_get(struct Qdisc *sch, u32 classid) { - struct atm_qdisc_data *p __attribute__((unused)) = PRIV(sch); + struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch); struct atm_flow_data *flow; - DPRINTK("atm_tc_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid); - flow = lookup_flow(sch,classid); - if (flow) flow->ref++; - DPRINTK("atm_tc_get: flow %p\n",flow); - return (unsigned long) flow; + pr_debug("atm_tc_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid); + flow = lookup_flow(sch, classid); + if (flow) + flow->ref++; + pr_debug("atm_tc_get: flow %p\n", flow); + return (unsigned long)flow; } - static unsigned long atm_tc_bind_filter(struct Qdisc *sch, - unsigned long parent, u32 classid) + unsigned long parent, u32 classid) { - return atm_tc_get(sch,classid); -} - - -static void destroy_filters(struct atm_flow_data *flow) -{ - struct tcf_proto *filter; - - while ((filter = flow->filter_list)) { - DPRINTK("destroy_filters: destroying filter %p\n",filter); - flow->filter_list = filter->next; - tcf_destroy(filter); - } + return atm_tc_get(sch, classid); } - /* * atm_tc_put handles all destructions, including the ones that are explicitly * requested (atm_tc_destroy, etc.). The assumption here is that we never drop * anything that still seems to be in use. */ - static void atm_tc_put(struct Qdisc *sch, unsigned long cl) { - struct atm_qdisc_data *p = PRIV(sch); - struct atm_flow_data *flow = (struct atm_flow_data *) cl; - struct atm_flow_data **prev; - - DPRINTK("atm_tc_put(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); - if (--flow->ref) return; - DPRINTK("atm_tc_put: destroying\n"); - for (prev = &p->flows; *prev; prev = &(*prev)->next) - if (*prev == flow) break; - if (!*prev) { - printk(KERN_CRIT "atm_tc_put: class %p not found\n",flow); + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow = (struct atm_flow_data *)cl; + + pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); + if (--flow->ref) return; - } - *prev = flow->next; - DPRINTK("atm_tc_put: qdisc %p\n",flow->q); + pr_debug("atm_tc_put: destroying\n"); + list_del_init(&flow->list); + pr_debug("atm_tc_put: qdisc %p\n", flow->q); qdisc_destroy(flow->q); - destroy_filters(flow); + tcf_destroy_chain(&flow->filter_list); if (flow->sock) { - DPRINTK("atm_tc_put: f_count %d\n", - file_count(flow->sock->file)); + pr_debug("atm_tc_put: f_count %ld\n", + file_count(flow->sock->file)); flow->vcc->pop = flow->old_pop; sockfd_put(flow->sock); } - if (flow->excess) atm_tc_put(sch,(unsigned long) flow->excess); - if (flow != &p->link) kfree(flow); + if (flow->excess) + atm_tc_put(sch, (unsigned long)flow->excess); + if (flow != &p->link) + kfree(flow); /* * If flow == &p->link, the qdisc no longer works at this point and * needs to be removed. (By the caller of atm_tc_put.) 
 	 */
 }
 
-
-static void sch_atm_pop(struct atm_vcc *vcc,struct sk_buff *skb)
+static void sch_atm_pop(struct atm_vcc *vcc, struct sk_buff *skb)
 {
 	struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent;
 
-	D2PRINTK("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n",vcc,skb,p);
-	VCC2FLOW(vcc)->old_pop(vcc,skb);
+	pr_debug("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n", vcc, skb, p);
+	VCC2FLOW(vcc)->old_pop(vcc, skb);
 	tasklet_schedule(&p->task);
 }
 
 static const u8 llc_oui_ip[] = {
-	0xaa,		/* DSAP: non-ISO */
-	0xaa,		/* SSAP: non-ISO */
-	0x03,		/* Ctrl: Unnumbered Information Command PDU */
-	0x00,		/* OUI: EtherType */
+	0xaa,			/* DSAP: non-ISO */
+	0xaa,			/* SSAP: non-ISO */
+	0x03,			/* Ctrl: Unnumbered Information Command PDU */
+	0x00,			/* OUI: EtherType */
 	0x00, 0x00,
-	0x08, 0x00 };	/* Ethertype IP (0800) */
+	0x08, 0x00
+};			/* Ethertype IP (0800) */
+
+static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = {
+	[TCA_ATM_FD]		= { .type = NLA_U32 },
+	[TCA_ATM_EXCESS]	= { .type = NLA_U32 },
+};
 
 static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
-			 struct rtattr **tca, unsigned long *arg)
+			 struct nlattr **tca, unsigned long *arg)
 {
-	struct atm_qdisc_data *p = PRIV(sch);
-	struct atm_flow_data *flow = (struct atm_flow_data *) *arg;
+	struct atm_qdisc_data *p = qdisc_priv(sch);
+	struct atm_flow_data *flow = (struct atm_flow_data *)*arg;
 	struct atm_flow_data *excess = NULL;
-	struct rtattr *opt = tca[TCA_OPTIONS-1];
-	struct rtattr *tb[TCA_ATM_MAX];
+	struct nlattr *opt = tca[TCA_OPTIONS];
+	struct nlattr *tb[TCA_ATM_MAX + 1];
 	struct socket *sock;
-	int fd,error,hdr_len;
+	int fd, error, hdr_len;
 	void *hdr;
 
-	DPRINTK("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x,"
-		"flow %p,opt %p)\n",sch,p,classid,parent,flow,opt);
+	pr_debug("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x,"
+		"flow %p,opt %p)\n", sch, p, classid, parent, flow, opt);
 	/*
 	 * The concept of parents doesn't apply for this qdisc.
 	 */
@@ -254,210 +208,219 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
 	 * class needs to be removed and a new one added. (This may be changed
 	 * later.)
*/ - if (flow) return -EBUSY; - if (opt == NULL || rtattr_parse_nested(tb, TCA_ATM_MAX, opt)) + if (flow) + return -EBUSY; + if (opt == NULL) return -EINVAL; - if (!tb[TCA_ATM_FD-1] || RTA_PAYLOAD(tb[TCA_ATM_FD-1]) < sizeof(fd)) + + error = nla_parse_nested(tb, TCA_ATM_MAX, opt, atm_policy); + if (error < 0) + return error; + + if (!tb[TCA_ATM_FD]) return -EINVAL; - fd = *(int *) RTA_DATA(tb[TCA_ATM_FD-1]); - DPRINTK("atm_tc_change: fd %d\n",fd); - if (tb[TCA_ATM_HDR-1]) { - hdr_len = RTA_PAYLOAD(tb[TCA_ATM_HDR-1]); - hdr = RTA_DATA(tb[TCA_ATM_HDR-1]); - } - else { + fd = nla_get_u32(tb[TCA_ATM_FD]); + pr_debug("atm_tc_change: fd %d\n", fd); + if (tb[TCA_ATM_HDR]) { + hdr_len = nla_len(tb[TCA_ATM_HDR]); + hdr = nla_data(tb[TCA_ATM_HDR]); + } else { hdr_len = RFC1483LLC_LEN; - hdr = NULL; /* default LLC/SNAP for IP */ + hdr = NULL; /* default LLC/SNAP for IP */ } - if (!tb[TCA_ATM_EXCESS-1]) excess = NULL; + if (!tb[TCA_ATM_EXCESS]) + excess = NULL; else { - if (RTA_PAYLOAD(tb[TCA_ATM_EXCESS-1]) != sizeof(u32)) - return -EINVAL; - excess = (struct atm_flow_data *) atm_tc_get(sch, - *(u32 *) RTA_DATA(tb[TCA_ATM_EXCESS-1])); - if (!excess) return -ENOENT; + excess = (struct atm_flow_data *) + atm_tc_get(sch, nla_get_u32(tb[TCA_ATM_EXCESS])); + if (!excess) + return -ENOENT; } - DPRINTK("atm_tc_change: type %d, payload %d, hdr_len %d\n", - opt->rta_type,RTA_PAYLOAD(opt),hdr_len); - if (!(sock = sockfd_lookup(fd,&error))) return error; /* f_count++ */ - DPRINTK("atm_tc_change: f_count %d\n",file_count(sock->file)); - if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) { + pr_debug("atm_tc_change: type %d, payload %d, hdr_len %d\n", + opt->nla_type, nla_len(opt), hdr_len); + sock = sockfd_lookup(fd, &error); + if (!sock) + return error; /* f_count++ */ + pr_debug("atm_tc_change: f_count %ld\n", file_count(sock->file)); + if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) { error = -EPROTOTYPE; - goto err_out; + goto err_out; } /* @@@ should check if the socket is really operational or we'll crash on vcc->send */ if (classid) { if (TC_H_MAJ(classid ^ sch->handle)) { - DPRINTK("atm_tc_change: classid mismatch\n"); + pr_debug("atm_tc_change: classid mismatch\n"); error = -EINVAL; goto err_out; } - if (find_flow(p,flow)) { - error = -EEXIST; - goto err_out; - } - } - else { + } else { int i; unsigned long cl; for (i = 1; i < 0x8000; i++) { - classid = TC_H_MAKE(sch->handle,0x8000 | i); - if (!(cl = atm_tc_get(sch,classid))) break; - atm_tc_put(sch,cl); + classid = TC_H_MAKE(sch->handle, 0x8000 | i); + cl = atm_tc_get(sch, classid); + if (!cl) + break; + atm_tc_put(sch, cl); } } - DPRINTK("atm_tc_change: new id %x\n",classid); - flow = kmalloc(sizeof(struct atm_flow_data)+hdr_len,GFP_KERNEL); - DPRINTK("atm_tc_change: flow %p\n",flow); + pr_debug("atm_tc_change: new id %x\n", classid); + flow = kzalloc(sizeof(struct atm_flow_data) + hdr_len, GFP_KERNEL); + pr_debug("atm_tc_change: flow %p\n", flow); if (!flow) { error = -ENOBUFS; goto err_out; } - memset(flow,0,sizeof(*flow)); flow->filter_list = NULL; - if (!(flow->q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops))) + flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); + if (!flow->q) flow->q = &noop_qdisc; - DPRINTK("atm_tc_change: qdisc %p\n",flow->q); + pr_debug("atm_tc_change: qdisc %p\n", flow->q); flow->sock = sock; - flow->vcc = ATM_SD(sock); /* speedup */ + flow->vcc = ATM_SD(sock); /* speedup */ flow->vcc->user_back = flow; - DPRINTK("atm_tc_change: vcc %p\n",flow->vcc); + 
pr_debug("atm_tc_change: vcc %p\n", flow->vcc); flow->old_pop = flow->vcc->pop; flow->parent = p; flow->vcc->pop = sch_atm_pop; flow->classid = classid; flow->ref = 1; flow->excess = excess; - flow->next = p->link.next; - p->link.next = flow; + list_add(&flow->list, &p->link.list); flow->hdr_len = hdr_len; if (hdr) - memcpy(flow->hdr,hdr,hdr_len); + memcpy(flow->hdr, hdr, hdr_len); else - memcpy(flow->hdr,llc_oui_ip,sizeof(llc_oui_ip)); - *arg = (unsigned long) flow; + memcpy(flow->hdr, llc_oui_ip, sizeof(llc_oui_ip)); + *arg = (unsigned long)flow; return 0; err_out: - if (excess) atm_tc_put(sch,(unsigned long) excess); + if (excess) + atm_tc_put(sch, (unsigned long)excess); sockfd_put(sock); return error; } - -static int atm_tc_delete(struct Qdisc *sch,unsigned long arg) +static int atm_tc_delete(struct Qdisc *sch, unsigned long arg) { - struct atm_qdisc_data *p = PRIV(sch); - struct atm_flow_data *flow = (struct atm_flow_data *) arg; + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow = (struct atm_flow_data *)arg; - DPRINTK("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); - if (!find_flow(PRIV(sch),flow)) return -EINVAL; - if (flow->filter_list || flow == &p->link) return -EBUSY; + pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); + if (list_empty(&flow->list)) + return -EINVAL; + if (flow->filter_list || flow == &p->link) + return -EBUSY; /* * Reference count must be 2: one for "keepalive" (set at class * creation), and one for the reference held when calling delete. */ if (flow->ref < 2) { - printk(KERN_ERR "atm_tc_delete: flow->ref == %d\n",flow->ref); + pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref); return -EINVAL; } - if (flow->ref > 2) return -EBUSY; /* catch references via excess, etc.*/ - atm_tc_put(sch,arg); + if (flow->ref > 2) + return -EBUSY; /* catch references via excess, etc. */ + atm_tc_put(sch, arg); return 0; } - -static void atm_tc_walk(struct Qdisc *sch,struct qdisc_walker *walker) +static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; - DPRINTK("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker); - if (walker->stop) return; - for (flow = p->flows; flow; flow = flow->next) { - if (walker->count >= walker->skip) - if (walker->fn(sch,(unsigned long) flow,walker) < 0) { - walker->stop = 1; - break; - } + pr_debug("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); + if (walker->stop) + return; + list_for_each_entry(flow, &p->flows, list) { + if (walker->count >= walker->skip && + walker->fn(sch, (unsigned long)flow, walker) < 0) { + walker->stop = 1; + break; + } walker->count++; } } - -static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch,unsigned long cl) +static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch, unsigned long cl) { - struct atm_qdisc_data *p = PRIV(sch); - struct atm_flow_data *flow = (struct atm_flow_data *) cl; + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow = (struct atm_flow_data *)cl; - DPRINTK("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); - return flow ? &flow->filter_list : &p->link.filter_list; + pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); + return flow ? 
&flow->filter_list : &p->link.filter_list; } - /* --------------------------- Qdisc operations ---------------------------- */ - -static int atm_tc_enqueue(struct sk_buff *skb,struct Qdisc *sch) +static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); - struct atm_flow_data *flow = NULL ; /* @@@ */ + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow; struct tcf_result res; int result; int ret = NET_XMIT_POLICED; - D2PRINTK("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); - result = TC_POLICE_OK; /* be nice to gcc */ + pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); + result = TC_POLICE_OK; /* be nice to gcc */ + flow = NULL; if (TC_H_MAJ(skb->priority) != sch->handle || - !(flow = (struct atm_flow_data *) atm_tc_get(sch,skb->priority))) - for (flow = p->flows; flow; flow = flow->next) + !(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) { + list_for_each_entry(flow, &p->flows, list) { if (flow->filter_list) { - result = tc_classify(skb,flow->filter_list, - &res); - if (result < 0) continue; - flow = (struct atm_flow_data *) res.class; - if (!flow) flow = lookup_flow(sch,res.classid); - break; + result = tc_classify_compat(skb, + flow->filter_list, + &res); + if (result < 0) + continue; + flow = (struct atm_flow_data *)res.class; + if (!flow) + flow = lookup_flow(sch, res.classid); + goto done; } - if (!flow) flow = &p->link; - else { + } + flow = NULL; +done: + ; + } + if (!flow) { + flow = &p->link; + } else { if (flow->vcc) ATM_SKB(skb)->atm_options = flow->vcc->atm_options; - /*@@@ looks good ... but it's not supposed to work :-)*/ -#ifdef CONFIG_NET_CLS_POLICE + /*@@@ looks good ... but it's not supposed to work :-) */ +#ifdef CONFIG_NET_CLS_ACT switch (result) { - case TC_POLICE_SHOT: - kfree_skb(skb); - break; - case TC_POLICE_RECLASSIFY: - if (flow->excess) flow = flow->excess; - else { - ATM_SKB(skb)->atm_options |= - ATM_ATMOPT_CLP; - break; - } - /* fall through */ - case TC_POLICE_OK: - /* fall through */ - default: - break; + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + kfree_skb(skb); + return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + kfree_skb(skb); + goto drop; + case TC_POLICE_RECLASSIFY: + if (flow->excess) + flow = flow->excess; + else + ATM_SKB(skb)->atm_options |= ATM_ATMOPT_CLP; + break; } #endif } - if ( -#ifdef CONFIG_NET_CLS_POLICE - result == TC_POLICE_SHOT || -#endif - (ret = flow->q->enqueue(skb,flow->q)) != 0) { - sch->qstats.drops++; - if (flow) flow->qstats.drops++; + + ret = qdisc_enqueue(skb, flow->q); + if (ret != NET_XMIT_SUCCESS) { +drop: __maybe_unused + if (net_xmit_drop_count(ret)) { + sch->qstats.drops++; + if (flow) + flow->qstats.drops++; + } return ret; } - sch->bstats.bytes += skb->len; - sch->bstats.packets++; - flow->bstats.bytes += skb->len; - flow->bstats.packets++; /* * Okay, this may seem weird. We pretend we've dropped the packet if * it goes via ATM. The reason for this is that the outer qdisc @@ -469,13 +432,12 @@ static int atm_tc_enqueue(struct sk_buff *skb,struct Qdisc *sch) */ if (flow == &p->link) { sch->q.qlen++; - return 0; + return NET_XMIT_SUCCESS; } tasklet_schedule(&p->task); - return NET_XMIT_BYPASS; + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } - /* * Dequeue packets and send them over ATM. 
Note that we quite deliberately * avoid checking net_device's flow control here, simply because sch_atm @@ -483,195 +445,195 @@ static int atm_tc_enqueue(struct sk_buff *skb,struct Qdisc *sch) * non-ATM interfaces. */ - static void sch_atm_dequeue(unsigned long data) { - struct Qdisc *sch = (struct Qdisc *) data; - struct atm_qdisc_data *p = PRIV(sch); + struct Qdisc *sch = (struct Qdisc *)data; + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; struct sk_buff *skb; - D2PRINTK("sch_atm_dequeue(sch %p,[qdisc %p])\n",sch,p); - for (flow = p->link.next; flow; flow = flow->next) + pr_debug("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p); + list_for_each_entry(flow, &p->flows, list) { + if (flow == &p->link) + continue; /* * If traffic is properly shaped, this won't generate nasty * little bursts. Otherwise, it may ... (but that's okay) */ - while ((skb = flow->q->dequeue(flow->q))) { - if (!atm_may_send(flow->vcc,skb->truesize)) { - (void) flow->q->ops->requeue(skb,flow->q); + while ((skb = flow->q->ops->peek(flow->q))) { + if (!atm_may_send(flow->vcc, skb->truesize)) break; - } - D2PRINTK("atm_tc_dequeue: sending on class %p\n",flow); + + skb = qdisc_dequeue_peeked(flow->q); + if (unlikely(!skb)) + break; + + qdisc_bstats_update(sch, skb); + bstats_update(&flow->bstats, skb); + pr_debug("atm_tc_dequeue: sending on class %p\n", flow); /* remove any LL header somebody else has attached */ - skb_pull(skb,(char *) skb->nh.iph-(char *) skb->data); + skb_pull(skb, skb_network_offset(skb)); if (skb_headroom(skb) < flow->hdr_len) { struct sk_buff *new; - new = skb_realloc_headroom(skb,flow->hdr_len); + new = skb_realloc_headroom(skb, flow->hdr_len); dev_kfree_skb(skb); - if (!new) continue; + if (!new) + continue; skb = new; } - D2PRINTK("sch_atm_dequeue: ip %p, data %p\n", - skb->nh.iph,skb->data); + pr_debug("sch_atm_dequeue: ip %p, data %p\n", + skb_network_header(skb), skb->data); ATM_SKB(skb)->vcc = flow->vcc; - memcpy(skb_push(skb,flow->hdr_len),flow->hdr, - flow->hdr_len); + memcpy(skb_push(skb, flow->hdr_len), flow->hdr, + flow->hdr_len); atomic_add(skb->truesize, &sk_atm(flow->vcc)->sk_wmem_alloc); /* atm.atm_options are already set by atm_tc_enqueue */ - (void) flow->vcc->send(flow->vcc,skb); + flow->vcc->send(flow->vcc, skb); } + } } - static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct sk_buff *skb; - D2PRINTK("atm_tc_dequeue(sch %p,[qdisc %p])\n",sch,p); + pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p); tasklet_schedule(&p->task); - skb = p->link.q->dequeue(p->link.q); - if (skb) sch->q.qlen--; + skb = qdisc_dequeue_peeked(p->link.q); + if (skb) + sch->q.qlen--; return skb; } - -static int atm_tc_requeue(struct sk_buff *skb,struct Qdisc *sch) +static struct sk_buff *atm_tc_peek(struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); - int ret; - - D2PRINTK("atm_tc_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); - ret = p->link.q->ops->requeue(skb,p->link.q); - if (!ret) { - sch->q.qlen++; - sch->qstats.requeues++; - } else { - sch->qstats.drops++; - p->link.qstats.drops++; - } - return ret; -} + struct atm_qdisc_data *p = qdisc_priv(sch); + pr_debug("atm_tc_peek(sch %p,[qdisc %p])\n", sch, p); + + return p->link.q->ops->peek(p->link.q); +} static unsigned int atm_tc_drop(struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; unsigned int len; - DPRINTK("atm_tc_drop(sch 
%p,[qdisc %p])\n",sch,p); - for (flow = p->flows; flow; flow = flow->next) + pr_debug("atm_tc_drop(sch %p,[qdisc %p])\n", sch, p); + list_for_each_entry(flow, &p->flows, list) { if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q))) return len; + } return 0; } - -static int atm_tc_init(struct Qdisc *sch,struct rtattr *opt) +static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) { - struct atm_qdisc_data *p = PRIV(sch); - - DPRINTK("atm_tc_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); - p->flows = &p->link; - if(!(p->link.q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops))) + struct atm_qdisc_data *p = qdisc_priv(sch); + + pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); + INIT_LIST_HEAD(&p->flows); + INIT_LIST_HEAD(&p->link.list); + list_add(&p->link.list, &p->flows); + p->link.q = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, sch->handle); + if (!p->link.q) p->link.q = &noop_qdisc; - DPRINTK("atm_tc_init: link (%p) qdisc %p\n",&p->link,p->link.q); + pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); p->link.filter_list = NULL; p->link.vcc = NULL; p->link.sock = NULL; p->link.classid = sch->handle; p->link.ref = 1; - p->link.next = NULL; - tasklet_init(&p->task,sch_atm_dequeue,(unsigned long) sch); + tasklet_init(&p->task, sch_atm_dequeue, (unsigned long)sch); return 0; } - static void atm_tc_reset(struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; - DPRINTK("atm_tc_reset(sch %p,[qdisc %p])\n",sch,p); - for (flow = p->flows; flow; flow = flow->next) qdisc_reset(flow->q); + pr_debug("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p); + list_for_each_entry(flow, &p->flows, list) + qdisc_reset(flow->q); sch->q.qlen = 0; } - static void atm_tc_destroy(struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); - struct atm_flow_data *flow; + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow, *tmp; + + pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p); + list_for_each_entry(flow, &p->flows, list) + tcf_destroy_chain(&flow->filter_list); - DPRINTK("atm_tc_destroy(sch %p,[qdisc %p])\n",sch,p); - /* races ? 
*/ - while ((flow = p->flows)) { - destroy_filters(flow); + list_for_each_entry_safe(flow, tmp, &p->flows, list) { if (flow->ref > 1) - printk(KERN_ERR "atm_destroy: %p->ref = %d\n",flow, - flow->ref); - atm_tc_put(sch,(unsigned long) flow); - if (p->flows == flow) { - printk(KERN_ERR "atm_destroy: putting flow %p didn't " - "kill it\n",flow); - p->flows = flow->next; /* brute force */ - break; - } + pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref); + atm_tc_put(sch, (unsigned long)flow); } tasklet_kill(&p->task); } - static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, - struct sk_buff *skb, struct tcmsg *tcm) + struct sk_buff *skb, struct tcmsg *tcm) { - struct atm_qdisc_data *p = PRIV(sch); - struct atm_flow_data *flow = (struct atm_flow_data *) cl; - unsigned char *b = skb->tail; - struct rtattr *rta; - - DPRINTK("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n", - sch,p,flow,skb,tcm); - if (!find_flow(p,flow)) return -EINVAL; + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow = (struct atm_flow_data *)cl; + struct nlattr *nest; + + pr_debug("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n", + sch, p, flow, skb, tcm); + if (list_empty(&flow->list)) + return -EINVAL; tcm->tcm_handle = flow->classid; - rta = (struct rtattr *) b; - RTA_PUT(skb,TCA_OPTIONS,0,NULL); - RTA_PUT(skb,TCA_ATM_HDR,flow->hdr_len,flow->hdr); + tcm->tcm_info = flow->q->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + if (nla_put(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr)) + goto nla_put_failure; if (flow->vcc) { struct sockaddr_atmpvc pvc; int state; + memset(&pvc, 0, sizeof(pvc)); pvc.sap_family = AF_ATMPVC; pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1; pvc.sap_addr.vpi = flow->vcc->vpi; pvc.sap_addr.vci = flow->vcc->vci; - RTA_PUT(skb,TCA_ATM_ADDR,sizeof(pvc),&pvc); + if (nla_put(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc)) + goto nla_put_failure; state = ATM_VF2VS(flow->vcc->flags); - RTA_PUT(skb,TCA_ATM_STATE,sizeof(state),&state); + if (nla_put_u32(skb, TCA_ATM_STATE, state)) + goto nla_put_failure; } - if (flow->excess) - RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(u32),&flow->classid); - else { - static u32 zero; - - RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(zero),&zero); + if (flow->excess) { + if (nla_put_u32(skb, TCA_ATM_EXCESS, flow->classid)) + goto nla_put_failure; + } else { + if (nla_put_u32(skb, TCA_ATM_EXCESS, 0)) + goto nla_put_failure; } - rta->rta_len = skb->tail-b; - return skb->len; + return nla_nest_end(skb, nest); -rtattr_failure: - skb_trim(skb,b-skb->data); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } static int atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, - struct gnet_dump *d) + struct gnet_dump *d) { - struct atm_flow_data *flow = (struct atm_flow_data *) arg; + struct atm_flow_data *flow = (struct atm_flow_data *)arg; flow->qstats.qlen = flow->q->q.qlen; @@ -687,45 +649,42 @@ static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb) return 0; } -static struct Qdisc_class_ops atm_class_ops = { - .graft = atm_tc_graft, - .leaf = atm_tc_leaf, - .get = atm_tc_get, - .put = atm_tc_put, - .change = atm_tc_change, - .delete = atm_tc_delete, - .walk = atm_tc_walk, - .tcf_chain = atm_tc_find_tcf, - .bind_tcf = atm_tc_bind_filter, - .unbind_tcf = atm_tc_put, - .dump = atm_tc_dump_class, - .dump_stats = atm_tc_dump_class_stats, +static const struct Qdisc_class_ops atm_class_ops = { + .graft = atm_tc_graft, + .leaf = atm_tc_leaf, + .get = 
atm_tc_get, + .put = atm_tc_put, + .change = atm_tc_change, + .delete = atm_tc_delete, + .walk = atm_tc_walk, + .tcf_chain = atm_tc_find_tcf, + .bind_tcf = atm_tc_bind_filter, + .unbind_tcf = atm_tc_put, + .dump = atm_tc_dump_class, + .dump_stats = atm_tc_dump_class_stats, }; -static struct Qdisc_ops atm_qdisc_ops = { - .next = NULL, - .cl_ops = &atm_class_ops, - .id = "atm", - .priv_size = sizeof(struct atm_qdisc_data), - .enqueue = atm_tc_enqueue, - .dequeue = atm_tc_dequeue, - .requeue = atm_tc_requeue, - .drop = atm_tc_drop, - .init = atm_tc_init, - .reset = atm_tc_reset, - .destroy = atm_tc_destroy, - .change = NULL, - .dump = atm_tc_dump, - .owner = THIS_MODULE, +static struct Qdisc_ops atm_qdisc_ops __read_mostly = { + .cl_ops = &atm_class_ops, + .id = "atm", + .priv_size = sizeof(struct atm_qdisc_data), + .enqueue = atm_tc_enqueue, + .dequeue = atm_tc_dequeue, + .peek = atm_tc_peek, + .drop = atm_tc_drop, + .init = atm_tc_init, + .reset = atm_tc_reset, + .destroy = atm_tc_destroy, + .dump = atm_tc_dump, + .owner = THIS_MODULE, }; - static int __init atm_init(void) { return register_qdisc(&atm_qdisc_ops); } -static void __exit atm_exit(void) +static void __exit atm_exit(void) { unregister_qdisc(&atm_qdisc_ops); } diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c index 81f0b8346d1..094a874b48b 100644 --- a/net/sched/sch_blackhole.c +++ b/net/sched/sch_blackhole.c @@ -11,11 +11,9 @@ * Note: Quantum tunneling is not supported. */ -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> @@ -30,11 +28,12 @@ static struct sk_buff *blackhole_dequeue(struct Qdisc *sch) return NULL; } -static struct Qdisc_ops blackhole_qdisc_ops = { +static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = { .id = "blackhole", .priv_size = 0, .enqueue = blackhole_enqueue, .dequeue = blackhole_dequeue, + .peek = blackhole_dequeue, .owner = THIS_MODULE, }; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 6cd81708bf7..ead526467cc 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -10,30 +10,14 @@ * */ -#include <linux/config.h> #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> +#include <net/netlink.h> #include <net/pkt_sched.h> @@ -41,12 +25,12 @@ ======================================= Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource - Management Models for Packet Networks", + Management Models for Packet Networks", IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 - [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995 + [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995 - [3] Sally Floyd, "Notes on Class-Based Queueing: Setting + [3] Sally Floyd, "Notes on Class-Based Queueing: Setting Parameters", 1996 [4] Sally Floyd and Michael Speer, "Experimental Results @@ -60,12 +44,12 @@ the implementation is different. 
Particularly: --- The WRR algorithm is different. Our version looks more - reasonable (I hope) and works when quanta are allowed to be - less than MTU, which is always the case when real time classes - have small rates. Note, that the statement of [3] is - incomplete, delay may actually be estimated even if class - per-round allotment is less than MTU. Namely, if per-round - allotment is W*r_i, and r_1+...+r_k = r < 1 + reasonable (I hope) and works when quanta are allowed to be + less than MTU, which is always the case when real time classes + have small rates. Note, that the statement of [3] is + incomplete, delay may actually be estimated even if class + per-round allotment is less than MTU. Namely, if per-round + allotment is W*r_i, and r_1+...+r_k = r < 1 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B @@ -88,18 +72,16 @@ struct cbq_sched_data; -struct cbq_class -{ - struct cbq_class *next; /* hash table link */ +struct cbq_class { + struct Qdisc_class_common common; struct cbq_class *next_alive; /* next class with backlog in this priority band */ /* Parameters */ - u32 classid; unsigned char priority; /* class priority */ unsigned char priority2; /* priority to be used after overlimit */ unsigned char ewma_log; /* time constant for idle time calculation */ unsigned char ovl_strategy; -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT unsigned char police; #endif @@ -114,7 +96,7 @@ struct cbq_class /* Overlimit strategy parameters */ void (*overlimit)(struct cbq_class *cl); - long penalty; + psched_tdiff_t penalty; /* General scheduler (WRR) parameters */ long allot; @@ -145,11 +127,10 @@ struct cbq_class psched_time_t undertime; long avgidle; long deficit; /* Saved deficit for WRR */ - unsigned long penalized; - struct gnet_stats_basic bstats; + psched_time_t penalized; + struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; - spinlock_t *stats_lock; + struct gnet_stats_rate_est64 rate_est; struct tc_cbq_xstats xstats; struct tcf_proto *filter_list; @@ -157,22 +138,21 @@ struct cbq_class int refcnt; int filters; - struct cbq_class *defaults[TC_PRIO_MAX+1]; + struct cbq_class *defaults[TC_PRIO_MAX + 1]; }; -struct cbq_sched_data -{ - struct cbq_class *classes[16]; /* Hash table of all classes */ - int nclasses[TC_CBQ_MAXPRIO+1]; - unsigned quanta[TC_CBQ_MAXPRIO+1]; +struct cbq_sched_data { + struct Qdisc_class_hash clhash; /* Hash table of all classes */ + int nclasses[TC_CBQ_MAXPRIO + 1]; + unsigned int quanta[TC_CBQ_MAXPRIO + 1]; struct cbq_class link; - unsigned activemask; - struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes + unsigned int activemask; + struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes with backlog */ -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT struct cbq_class *rx_class; #endif struct cbq_class *tx_class; @@ -180,64 +160,58 @@ struct cbq_sched_data int tx_len; psched_time_t now; /* Cached timestamp */ psched_time_t now_rt; /* Cached real time */ - unsigned pmask; + unsigned int pmask; - struct timer_list delay_timer; - struct timer_list wd_timer; /* Watchdog timer, + struct hrtimer delay_timer; + struct qdisc_watchdog watchdog; /* Watchdog timer, started when CBQ has backlog, but cannot transmit just now */ - long wd_expires; + psched_tdiff_t wd_expires; int toplevel; u32 hgenerator; }; -#define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log]) - +#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len) -static __inline__ unsigned cbq_hash(u32 h) 
-{ - h ^= h>>8; - h ^= h>>4; - return h&0xF; -} - -static __inline__ struct cbq_class * +static inline struct cbq_class * cbq_class_lookup(struct cbq_sched_data *q, u32 classid) { - struct cbq_class *cl; + struct Qdisc_class_common *clc; - for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next) - if (cl->classid == classid) - return cl; - return NULL; + clc = qdisc_class_find(&q->clhash, classid); + if (clc == NULL) + return NULL; + return container_of(clc, struct cbq_class, common); } -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT static struct cbq_class * cbq_reclassify(struct sk_buff *skb, struct cbq_class *this) { - struct cbq_class *cl, *new; + struct cbq_class *cl; - for (cl = this->tparent; cl; cl = cl->tparent) - if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this) - return new; + for (cl = this->tparent; cl; cl = cl->tparent) { + struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT]; + if (new != NULL && new != this) + return new; + } return NULL; } #endif /* Classify packet. The procedure is pretty complicated, but - it allows us to combine link sharing and priority scheduling - transparently. - - Namely, you can put link sharing rules (f.e. route based) at root of CBQ, - so that it resolves to split nodes. Then packets are classified - by logical priority, or a more specific classifier may be attached - to the split node. + * it allows us to combine link sharing and priority scheduling + * transparently. + * + * Namely, you can put link sharing rules (f.e. route based) at root of CBQ, + * so that it resolves to split nodes. Then packets are classified + * by logical priority, or a more specific classifier may be attached + * to the split node. */ static struct cbq_class * @@ -253,11 +227,11 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) /* * Step 1. If skb->priority points to one of our classes, use it. */ - if (TC_H_MAJ(prio^sch->handle) == 0 && + if (TC_H_MAJ(prio ^ sch->handle) == 0 && (cl = cbq_class_lookup(q, prio)) != NULL) return cl; - *qerr = NET_XMIT_BYPASS; + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; for (;;) { int result = 0; defmap = head->defaults; @@ -265,35 +239,31 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) /* * Step 2+n. Apply classifier. */ - if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0) + if (!head->filter_list || + (result = tc_classify_compat(skb, head->filter_list, &res)) < 0) goto fallback; - if ((cl = (void*)res.class) == NULL) { + cl = (void *)res.class; + if (!cl) { if (TC_H_MAJ(res.classid)) cl = cbq_class_lookup(q, res.classid); - else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL) + else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL) cl = defmap[TC_PRIO_BESTEFFORT]; - if (cl == NULL || cl->level >= head->level) + if (cl == NULL) goto fallback; } - + if (cl->level >= head->level) + goto fallback; #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: - case TC_ACT_STOLEN: - *qerr = NET_XMIT_SUCCESS; + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; - } -#elif defined(CONFIG_NET_CLS_POLICE) - switch (result) { - case TC_POLICE_RECLASSIFY: + case TC_ACT_RECLASSIFY: return cbq_reclassify(skb, cl); - case TC_POLICE_SHOT: - return NULL; - default: - break; } #endif if (cl->level == 0) @@ -314,7 +284,7 @@ fallback: * Step 4. No success... 
*/ if (TC_H_MAJ(prio) == 0 && - !(cl = head->defaults[prio&TC_PRIO_MAX]) && + !(cl = head->defaults[prio & TC_PRIO_MAX]) && !(cl = head->defaults[TC_PRIO_BESTEFFORT])) return head; @@ -322,12 +292,12 @@ fallback: } /* - A packet has just been enqueued on the empty class. - cbq_activate_class adds it to the tail of active class list - of its priority band. + * A packet has just been enqueued on the empty class. + * cbq_activate_class adds it to the tail of active class list + * of its priority band. */ -static __inline__ void cbq_activate_class(struct cbq_class *cl) +static inline void cbq_activate_class(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); int prio = cl->cpriority; @@ -346,9 +316,9 @@ static __inline__ void cbq_activate_class(struct cbq_class *cl) } /* - Unlink class from active chain. - Note that this same procedure is done directly in cbq_dequeue* - during round-robin procedure. + * Unlink class from active chain. + * Note that this same procedure is done directly in cbq_dequeue* + * during round-robin procedure. */ static void cbq_deactivate_class(struct cbq_class *this) @@ -372,8 +342,6 @@ static void cbq_deactivate_class(struct cbq_class *this) return; } } - - cl = cl_prev->next_alive; return; } } while ((cl_prev = cl) != q->active[prio]); @@ -384,20 +352,20 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) { int toplevel = q->toplevel; - if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) { + if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) { psched_time_t now; psched_tdiff_t incr; - PSCHED_GET_TIME(now); - incr = PSCHED_TDIFF(now, q->now_rt); - PSCHED_TADD2(q->now, incr, now); + now = psched_get_time(); + incr = now - q->now_rt; + now = q->now + incr; do { - if (PSCHED_TLESS(cl->undertime, now)) { + if (cl->undertime < now) { q->toplevel = cl->level; return; } - } while ((cl=cl->borrow) != NULL && toplevel > cl->level); + } while ((cl = cl->borrow) != NULL && toplevel > cl->level); } } @@ -405,68 +373,36 @@ static int cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); - int len = skb->len; - int ret; + int uninitialized_var(ret); struct cbq_class *cl = cbq_classify(skb, sch, &ret); -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT q->rx_class = cl; #endif if (cl == NULL) { - if (ret == NET_XMIT_BYPASS) + if (ret & __NET_XMIT_BYPASS) sch->qstats.drops++; kfree_skb(skb); return ret; } -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT cl->q->__parent = sch; #endif - if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) { + ret = qdisc_enqueue(skb, cl->q); + if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; - sch->bstats.packets++; - sch->bstats.bytes+=len; cbq_mark_toplevel(q, cl); if (!cl->next_alive) cbq_activate_class(cl); return ret; } - sch->qstats.drops++; - cbq_mark_toplevel(q, cl); - cl->qstats.drops++; - return ret; -} - -static int -cbq_requeue(struct sk_buff *skb, struct Qdisc *sch) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl; - int ret; - - if ((cl = q->tx_class) == NULL) { - kfree_skb(skb); + if (net_xmit_drop_count(ret)) { sch->qstats.drops++; - return NET_XMIT_CN; - } - q->tx_class = NULL; - - cbq_mark_toplevel(q, cl); - -#ifdef CONFIG_NET_CLS_POLICE - q->rx_class = cl; - cl->q->__parent = sch; -#endif - if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) { - sch->q.qlen++; - sch->qstats.requeues++; - if (!cl->next_alive) - cbq_activate_class(cl); - return 0; + cbq_mark_toplevel(q, cl); + cl->qstats.drops++; } 
- sch->qstats.drops++; - cl->qstats.drops++; return ret; } @@ -477,17 +413,17 @@ cbq_requeue(struct sk_buff *skb, struct Qdisc *sch) static void cbq_ovl_classic(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now); + psched_tdiff_t delay = cl->undertime - q->now; if (!cl->delayed) { delay += cl->offtime; - /* - Class goes to sleep, so that it will have no - chance to work avgidle. Let's forgive it 8) - - BTW cbq-2.0 has a crap in this - place, apparently they forgot to shift it by cl->ewma_log. + /* + * Class goes to sleep, so that it will have no + * chance to work avgidle. Let's forgive it 8) + * + * BTW cbq-2.0 has a crap in this + * place, apparently they forgot to shift it by cl->ewma_log. */ if (cl->avgidle < 0) delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); @@ -495,7 +431,7 @@ static void cbq_ovl_classic(struct cbq_class *cl) cl->avgidle = cl->minidle; if (delay <= 0) delay = 1; - PSCHED_TADD2(q->now, delay, cl->undertime); + cl->undertime = q->now + delay; cl->xstats.overactions++; cl->delayed = 1; @@ -504,15 +440,15 @@ static void cbq_ovl_classic(struct cbq_class *cl) q->wd_expires = delay; /* Dirty work! We must schedule wakeups based on - real available rate, rather than leaf rate, - which may be tiny (even zero). + * real available rate, rather than leaf rate, + * which may be tiny (even zero). */ if (q->toplevel == TC_CBQ_MAXLEVEL) { struct cbq_class *b; psched_tdiff_t base_delay = q->wd_expires; for (b = cl->borrow; b; b = b->borrow) { - delay = PSCHED_TDIFF(b->undertime, q->now); + delay = b->undertime - q->now; if (delay < base_delay) { if (delay <= 0) delay = 1; @@ -525,7 +461,7 @@ static void cbq_ovl_classic(struct cbq_class *cl) } /* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when - they go overlimit + * they go overlimit */ static void cbq_ovl_rclassic(struct cbq_class *cl) @@ -550,27 +486,36 @@ static void cbq_ovl_rclassic(struct cbq_class *cl) static void cbq_ovl_delay(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now); + psched_tdiff_t delay = cl->undertime - q->now; + + if (test_bit(__QDISC_STATE_DEACTIVATED, + &qdisc_root_sleeping(cl->qdisc)->state)) + return; if (!cl->delayed) { - unsigned long sched = jiffies; + psched_time_t sched = q->now; + ktime_t expires; delay += cl->offtime; if (cl->avgidle < 0) delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); if (cl->avgidle < cl->minidle) cl->avgidle = cl->minidle; - PSCHED_TADD2(q->now, delay, cl->undertime); + cl->undertime = q->now + delay; if (delay > 0) { - sched += PSCHED_US2JIFFIE(delay) + cl->penalty; + sched += delay + cl->penalty; cl->penalized = sched; cl->cpriority = TC_CBQ_MAXPRIO; q->pmask |= (1<<TC_CBQ_MAXPRIO); - if (del_timer(&q->delay_timer) && - (long)(q->delay_timer.expires - sched) > 0) - q->delay_timer.expires = sched; - add_timer(&q->delay_timer); + + expires = ns_to_ktime(PSCHED_TICKS2NS(sched)); + if (hrtimer_try_to_cancel(&q->delay_timer) && + ktime_to_ns(ktime_sub( + hrtimer_get_expires(&q->delay_timer), + expires)) > 0) + hrtimer_set_expires(&q->delay_timer, expires); + hrtimer_restart(&q->delay_timer); cl->delayed = 1; cl->xstats.overactions++; return; @@ -587,7 +532,7 @@ static void cbq_ovl_lowprio(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - cl->penalized = jiffies + cl->penalty; + cl->penalized = q->now + cl->penalty; if (cl->cpriority != cl->priority2) 
{ cl->cpriority = cl->priority2; @@ -608,27 +553,19 @@ static void cbq_ovl_drop(struct cbq_class *cl) cbq_ovl_classic(cl); } -static void cbq_watchdog(unsigned long arg) -{ - struct Qdisc *sch = (struct Qdisc*)arg; - - sch->flags &= ~TCQ_F_THROTTLED; - netif_schedule(sch->dev); -} - -static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio) +static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio, + psched_time_t now) { struct cbq_class *cl; struct cbq_class *cl_prev = q->active[prio]; - unsigned long now = jiffies; - unsigned long sched = now; + psched_time_t sched = now; if (cl_prev == NULL) - return now; + return 0; do { cl = cl_prev->next_alive; - if ((long)(now - cl->penalized) > 0) { + if (now - cl->penalized > 0) { cl_prev->next_alive = cl->next_alive; cl->next_alive = NULL; cl->cpriority = cl->priority; @@ -644,30 +581,34 @@ static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio) } cl = cl_prev->next_alive; - } else if ((long)(sched - cl->penalized) > 0) + } else if (sched - cl->penalized > 0) sched = cl->penalized; } while ((cl_prev = cl) != q->active[prio]); - return (long)(sched - now); + return sched - now; } -static void cbq_undelay(unsigned long arg) +static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) { - struct Qdisc *sch = (struct Qdisc*)arg; - struct cbq_sched_data *q = qdisc_priv(sch); - long delay = 0; - unsigned pmask; + struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data, + delay_timer); + struct Qdisc *sch = q->watchdog.qdisc; + psched_time_t now; + psched_tdiff_t delay = 0; + unsigned int pmask; + + now = psched_get_time(); pmask = q->pmask; q->pmask = 0; while (pmask) { int prio = ffz(~pmask); - long tmp; + psched_tdiff_t tmp; pmask &= ~(1<<prio); - tmp = cbq_undelay_prio(q, prio); + tmp = cbq_undelay_prio(q, prio, now); if (tmp > 0) { q->pmask |= 1<<prio; if (tmp < delay || delay == 0) @@ -676,20 +617,21 @@ static void cbq_undelay(unsigned long arg) } if (delay) { - q->delay_timer.expires = jiffies + delay; - add_timer(&q->delay_timer); + ktime_t time; + + time = ktime_set(0, 0); + time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay)); + hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS); } - sch->flags &= ~TCQ_F_THROTTLED; - netif_schedule(sch->dev); + qdisc_unthrottled(sch); + __netif_schedule(qdisc_root(sch)); + return HRTIMER_NORESTART; } - -#ifdef CONFIG_NET_CLS_POLICE - +#ifdef CONFIG_NET_CLS_ACT static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) { - int len = skb->len; struct Qdisc *sch = child->__parent; struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = q->rx_class; @@ -697,21 +639,22 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) q->rx_class = NULL; if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) { + int ret; cbq_mark_toplevel(q, cl); q->rx_class = cl; cl->q->__parent = sch; - if (cl->q->enqueue(skb, cl->q) == 0) { + ret = qdisc_enqueue(skb, cl->q); + if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; - sch->bstats.packets++; - sch->bstats.bytes+=len; if (!cl->next_alive) cbq_activate_class(cl); return 0; } - sch->qstats.drops++; + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; return 0; } @@ -720,29 +663,29 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) } #endif -/* - It is mission critical procedure. - - We "regenerate" toplevel cutoff, if transmitting class - has backlog and it is not regulated. It is not part of - original CBQ description, but looks more reasonable. 
-   Probably, it is wrong. This question needs further investigation.
-*/
+/*
+ * It is mission critical procedure.
+ *
+ * We "regenerate" toplevel cutoff, if transmitting class
+ * has backlog and it is not regulated. It is not part of
+ * original CBQ description, but looks more reasonable.
+ * Probably, it is wrong. This question needs further investigation.
+ */
 
-static __inline__ void
+static inline void
 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
 		    struct cbq_class *borrowed)
 {
 	if (cl && q->toplevel >= borrowed->level) {
 		if (cl->q->q.qlen > 1) {
 			do {
-				if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
+				if (borrowed->undertime == PSCHED_PASTPERFECT) {
 					q->toplevel = borrowed->level;
 					return;
 				}
-			} while ((borrowed=borrowed->borrow) != NULL);
+			} while ((borrowed = borrowed->borrow) != NULL);
 		}
-#if 0
+#if 0
 	/* It is not necessary now. Uncommenting it
 	   will save CPU cycles, but decrease fairness.
 	 */
@@ -768,22 +711,22 @@ cbq_update(struct cbq_sched_data *q)
 		cl->bstats.bytes += len;
 
 		/*
-		   (now - last) is total time between packet right edges.
-		   (last_pktlen/rate) is "virtual" busy time, so that
-
-		   idle = (now - last) - last_pktlen/rate
+		 * (now - last) is total time between packet right edges.
+		 * (last_pktlen/rate) is "virtual" busy time, so that
+		 *
+		 *	idle = (now - last) - last_pktlen/rate
 		 */
 
-		idle = PSCHED_TDIFF(q->now, cl->last);
+		idle = q->now - cl->last;
 		if ((unsigned long)idle > 128*1024*1024) {
 			avgidle = cl->maxidle;
 		} else {
 			idle -= L2T(cl, len);
 
 /* true_avgidle := (1-W)*true_avgidle + W*idle,
-   where W=2^{-ewma_log}. But cl->avgidle is scaled:
-   cl->avgidle == true_avgidle/W,
-   hence:
+ * where W=2^{-ewma_log}. But cl->avgidle is scaled:
+ * cl->avgidle == true_avgidle/W,
+ * hence:
 */
 			avgidle += idle - (avgidle>>cl->ewma_log);
 		}
@@ -797,34 +740,32 @@ cbq_update(struct cbq_sched_data *q)
 			cl->avgidle = avgidle;
 
 		/* Calculate expected time, when this class
-		   will be allowed to send.
-		   It will occur, when:
-		   (1-W)*true_avgidle + W*delay = 0, i.e.
-		   idle = (1/W - 1)*(-true_avgidle)
-		   or
-		   idle = (1 - W)*(-cl->avgidle);
+		 * will be allowed to send.
+		 * It will occur, when:
+		 * (1-W)*true_avgidle + W*delay = 0, i.e.
+		 * idle = (1/W - 1)*(-true_avgidle)
+		 * or
+		 * idle = (1 - W)*(-cl->avgidle);
 		 */
 		idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
 
 		/*
-		   That is not all.
-		   To maintain the rate allocated to the class,
-		   we add to undertime virtual clock,
-		   necessary to complete transmitted packet.
-		   (len/phys_bandwidth has been already passed
-		   to the moment of cbq_update)
+		 * That is not all.
+		 * To maintain the rate allocated to the class,
+		 * we add to undertime virtual clock,
+		 * necessary to complete transmitted packet.
+ * (len/phys_bandwidth has been already passed + * to the moment of cbq_update) */ idle -= L2T(&q->link, len); idle += L2T(cl, len); - PSCHED_AUDIT_TDIFF(idle); - - PSCHED_TADD2(q->now, idle, cl->undertime); + cl->undertime = q->now + idle; } else { /* Underlimit */ - PSCHED_SET_PASTPERFECT(cl->undertime); + cl->undertime = PSCHED_PASTPERFECT; if (avgidle > cl->maxidle) cl->avgidle = cl->maxidle; else @@ -836,7 +777,7 @@ cbq_update(struct cbq_sched_data *q) cbq_update_toplevel(q, this, q->tx_borrowed); } -static __inline__ struct cbq_class * +static inline struct cbq_class * cbq_under_limit(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); @@ -845,38 +786,37 @@ cbq_under_limit(struct cbq_class *cl) if (cl->tparent == NULL) return cl; - if (PSCHED_IS_PASTPERFECT(cl->undertime) || - !PSCHED_TLESS(q->now, cl->undertime)) { + if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) { cl->delayed = 0; return cl; } do { /* It is very suspicious place. Now overlimit - action is generated for not bounded classes - only if link is completely congested. - Though it is in agree with ancestor-only paradigm, - it looks very stupid. Particularly, - it means that this chunk of code will either - never be called or result in strong amplification - of burstiness. Dangerous, silly, and, however, - no another solution exists. + * action is generated for not bounded classes + * only if link is completely congested. + * Though it is in agree with ancestor-only paradigm, + * it looks very stupid. Particularly, + * it means that this chunk of code will either + * never be called or result in strong amplification + * of burstiness. Dangerous, silly, and, however, + * no another solution exists. */ - if ((cl = cl->borrow) == NULL) { + cl = cl->borrow; + if (!cl) { this_cl->qstats.overlimits++; this_cl->overlimit(this_cl); return NULL; } if (cl->level > q->toplevel) return NULL; - } while (!PSCHED_IS_PASTPERFECT(cl->undertime) && - PSCHED_TLESS(q->now, cl->undertime)); + } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime); cl->delayed = 0; return cl; } -static __inline__ struct sk_buff * +static inline struct sk_buff * cbq_dequeue_prio(struct Qdisc *sch, int prio) { struct cbq_sched_data *q = qdisc_priv(sch); @@ -900,7 +840,7 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio) if (cl->deficit <= 0) { /* Class exhausted its allotment per - this round. Switch to the next one. + * this round. Switch to the next one. */ deficit = 1; cl->deficit += cl->quantum; @@ -910,13 +850,13 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio) skb = cl->q->dequeue(cl->q); /* Class did not give us any skb :-( - It could occur even if cl->q->q.qlen != 0 - f.e. if cl->q == "tbf" + * It could occur even if cl->q->q.qlen != 0 + * f.e. if cl->q == "tbf" */ if (skb == NULL) goto skip_class; - cl->deficit -= skb->len; + cl->deficit -= qdisc_pkt_len(skb); q->tx_class = cl; q->tx_borrowed = borrow; if (borrow != cl) { @@ -924,11 +864,11 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio) borrow->xstats.borrows++; cl->xstats.borrows++; #else - borrow->xstats.borrows += skb->len; - cl->xstats.borrows += skb->len; + borrow->xstats.borrows += qdisc_pkt_len(skb); + cl->xstats.borrows += qdisc_pkt_len(skb); #endif } - q->tx_len = skb->len; + q->tx_len = qdisc_pkt_len(skb); if (cl->deficit <= 0) { q->active[prio] = cl; @@ -940,7 +880,7 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio) skip_class: if (cl->q->q.qlen == 0 || prio != cl->cpriority) { /* Class is empty or penalized. 
- Unlink it from active chain. + * Unlink it from active chain. */ cl_prev->next_alive = cl->next_alive; cl->next_alive = NULL; @@ -979,14 +919,14 @@ next_class: return NULL; } -static __inline__ struct sk_buff * +static inline struct sk_buff * cbq_dequeue_1(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; - unsigned activemask; + unsigned int activemask; - activemask = q->activemask&0xFF; + activemask = q->activemask & 0xFF; while (activemask) { int prio = ffz(~activemask); activemask &= ~(1<<prio); @@ -1005,25 +945,28 @@ cbq_dequeue(struct Qdisc *sch) psched_time_t now; psched_tdiff_t incr; - PSCHED_GET_TIME(now); - incr = PSCHED_TDIFF(now, q->now_rt); + now = psched_get_time(); + incr = now - q->now_rt; if (q->tx_class) { psched_tdiff_t incr2; /* Time integrator. We calculate EOS time - by adding expected packet transmission time. - If real time is greater, we warp artificial clock, - so that: - - cbq_time = max(real_time, work); + * by adding expected packet transmission time. + * If real time is greater, we warp artificial clock, + * so that: + * + * cbq_time = max(real_time, work); */ incr2 = L2T(&q->link, q->tx_len); - PSCHED_TADD(q->now, incr2); + q->now += incr2; cbq_update(q); if ((incr -= incr2) < 0) incr = 0; + q->now += incr; + } else { + if (now > q->now) + q->now = now; } - PSCHED_TADD(q->now, incr); q->now_rt = now; for (;;) { @@ -1031,49 +974,47 @@ cbq_dequeue(struct Qdisc *sch) skb = cbq_dequeue_1(sch); if (skb) { + qdisc_bstats_update(sch, skb); sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(sch); return skb; } /* All the classes are overlimit. - - It is possible, if: - - 1. Scheduler is empty. - 2. Toplevel cutoff inhibited borrowing. - 3. Root class is overlimit. - - Reset 2d and 3d conditions and retry. - - Note, that NS and cbq-2.0 are buggy, peeking - an arbitrary class is appropriate for ancestor-only - sharing, but not for toplevel algorithm. - - Our version is better, but slower, because it requires - two passes, but it is unavoidable with top-level sharing. - */ + * + * It is possible, if: + * + * 1. Scheduler is empty. + * 2. Toplevel cutoff inhibited borrowing. + * 3. Root class is overlimit. + * + * Reset 2d and 3d conditions and retry. + * + * Note, that NS and cbq-2.0 are buggy, peeking + * an arbitrary class is appropriate for ancestor-only + * sharing, but not for toplevel algorithm. + * + * Our version is better, but slower, because it requires + * two passes, but it is unavoidable with top-level sharing. + */ if (q->toplevel == TC_CBQ_MAXLEVEL && - PSCHED_IS_PASTPERFECT(q->link.undertime)) + q->link.undertime == PSCHED_PASTPERFECT) break; q->toplevel = TC_CBQ_MAXLEVEL; - PSCHED_SET_PASTPERFECT(q->link.undertime); + q->link.undertime = PSCHED_PASTPERFECT; } /* No packets in scheduler or nobody wants to give them to us :-( - Sigh... start watchdog timer in the last case. */ + * Sigh... start watchdog timer in the last case. 
+ */ if (sch->q.qlen) { sch->qstats.overlimits++; - if (q->wd_expires) { - long delay = PSCHED_US2JIFFIE(q->wd_expires); - if (delay <= 0) - delay = 1; - mod_timer(&q->wd_timer, jiffies + delay); - sch->flags |= TCQ_F_THROTTLED; - } + if (q->wd_expires) + qdisc_watchdog_schedule(&q->watchdog, + now + q->wd_expires); } return NULL; } @@ -1089,36 +1030,39 @@ static void cbq_adjust_levels(struct cbq_class *this) int level = 0; struct cbq_class *cl; - if ((cl = this->children) != NULL) { + cl = this->children; + if (cl) { do { if (cl->level > level) level = cl->level; } while ((cl = cl->sibling) != this->children); } - this->level = level+1; + this->level = level + 1; } while ((this = this->tparent) != NULL); } static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) { struct cbq_class *cl; - unsigned h; + unsigned int h; if (q->quanta[prio] == 0) return; - for (h=0; h<16; h++) { - for (cl = q->classes[h]; cl; cl = cl->next) { + for (h = 0; h < q->clhash.hashsize; h++) { + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { /* BUGGGG... Beware! This expression suffer of - arithmetic overflows! + * arithmetic overflows! */ if (cl->priority == prio) { cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ q->quanta[prio]; } - if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) { - printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum); - cl->quantum = cl->qdisc->dev->mtu/2 + 1; + if (cl->quantum <= 0 || + cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) { + pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n", + cl->common.classid, cl->quantum); + cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1; } } } @@ -1128,29 +1072,30 @@ static void cbq_sync_defmap(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); struct cbq_class *split = cl->split; - unsigned h; + unsigned int h; int i; if (split == NULL) return; - for (i=0; i<=TC_PRIO_MAX; i++) { - if (split->defaults[i] == cl && !(cl->defmap&(1<<i))) + for (i = 0; i <= TC_PRIO_MAX; i++) { + if (split->defaults[i] == cl && !(cl->defmap & (1<<i))) split->defaults[i] = NULL; } - for (i=0; i<=TC_PRIO_MAX; i++) { + for (i = 0; i <= TC_PRIO_MAX; i++) { int level = split->level; if (split->defaults[i]) continue; - for (h=0; h<16; h++) { + for (h = 0; h < q->clhash.hashsize; h++) { struct cbq_class *c; - for (c = q->classes[h]; c; c = c->next) { + hlist_for_each_entry(c, &q->clhash.hash[h], + common.hnode) { if (c->split == split && c->level < level && - c->defmap&(1<<i)) { + c->defmap & (1<<i)) { split->defaults[i] = c; level = c->level; } @@ -1164,14 +1109,15 @@ static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 ma struct cbq_class *split = NULL; if (splitid == 0) { - if ((split = cl->split) == NULL) + split = cl->split; + if (!split) return; - splitid = split->classid; + splitid = split->common.classid; } - if (split == NULL || split->classid != splitid) { + if (split == NULL || split->common.classid != splitid) { for (split = cl->tparent; split; split = split->tparent) - if (split->classid == splitid) + if (split->common.classid == splitid) break; } @@ -1182,9 +1128,9 @@ static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 ma cl->defmap = 0; cbq_sync_defmap(cl); cl->split = split; - cl->defmap = def&mask; + cl->defmap = def & mask; } else - cl->defmap = (cl->defmap&~mask)|(def&mask); + cl->defmap = (cl->defmap & ~mask) | (def & mask); cbq_sync_defmap(cl); } @@ -1194,16 +1140,10 @@ static void 
cbq_unlink_class(struct cbq_class *this) struct cbq_class *cl, **clp; struct cbq_sched_data *q = qdisc_priv(this->qdisc); - for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) { - if (cl == this) { - *clp = cl->next; - cl->next = NULL; - break; - } - } + qdisc_class_hash_remove(&q->clhash, &this->common); if (this->tparent) { - clp=&this->sibling; + clp = &this->sibling; cl = *clp; do { if (cl == this) { @@ -1219,19 +1159,17 @@ static void cbq_unlink_class(struct cbq_class *this) this->tparent->children = NULL; } } else { - BUG_TRAP(this->sibling == this); + WARN_ON(this->sibling != this); } } static void cbq_link_class(struct cbq_class *this) { struct cbq_sched_data *q = qdisc_priv(this->qdisc); - unsigned h = cbq_hash(this->classid); struct cbq_class *parent = this->tparent; this->sibling = this; - this->next = q->classes[h]; - q->classes[h] = this; + qdisc_class_hash_insert(&q->clhash, &this->common); if (parent == NULL) return; @@ -1244,7 +1182,7 @@ static void cbq_link_class(struct cbq_class *this) } } -static unsigned int cbq_drop(struct Qdisc* sch) +static unsigned int cbq_drop(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl, *cl_head; @@ -1252,13 +1190,16 @@ static unsigned int cbq_drop(struct Qdisc* sch) unsigned int len; for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { - if ((cl_head = q->active[prio]) == NULL) + cl_head = q->active[prio]; + if (!cl_head) continue; cl = cl_head; do { if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) { sch->q.qlen--; + if (!cl->q->q.qlen) + cbq_deactivate_class(cl); return len; } } while ((cl = cl->next_alive) != cl_head); @@ -1267,32 +1208,32 @@ static unsigned int cbq_drop(struct Qdisc* sch) } static void -cbq_reset(struct Qdisc* sch) +cbq_reset(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl; int prio; - unsigned h; + unsigned int h; q->activemask = 0; q->pmask = 0; q->tx_class = NULL; q->tx_borrowed = NULL; - del_timer(&q->wd_timer); - del_timer(&q->delay_timer); + qdisc_watchdog_cancel(&q->watchdog); + hrtimer_cancel(&q->delay_timer); q->toplevel = TC_CBQ_MAXLEVEL; - PSCHED_GET_TIME(q->now); + q->now = psched_get_time(); q->now_rt = q->now; for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) q->active[prio] = NULL; - for (h = 0; h < 16; h++) { - for (cl = q->classes[h]; cl; cl = cl->next) { + for (h = 0; h < q->clhash.hashsize; h++) { + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { qdisc_reset(cl->q); cl->next_alive = NULL; - PSCHED_SET_PASTPERFECT(cl->undertime); + cl->undertime = PSCHED_PASTPERFECT; cl->avgidle = cl->maxidle; cl->deficit = cl->quantum; cl->cpriority = cl->priority; @@ -1304,21 +1245,21 @@ cbq_reset(struct Qdisc* sch) static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) { - if (lss->change&TCF_CBQ_LSS_FLAGS) { - cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; - cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; + if (lss->change & TCF_CBQ_LSS_FLAGS) { + cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; + cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? 
NULL : cl->tparent; } - if (lss->change&TCF_CBQ_LSS_EWMA) + if (lss->change & TCF_CBQ_LSS_EWMA) cl->ewma_log = lss->ewma_log; - if (lss->change&TCF_CBQ_LSS_AVPKT) + if (lss->change & TCF_CBQ_LSS_AVPKT) cl->avpkt = lss->avpkt; - if (lss->change&TCF_CBQ_LSS_MINIDLE) + if (lss->change & TCF_CBQ_LSS_MINIDLE) cl->minidle = -(long)lss->minidle; - if (lss->change&TCF_CBQ_LSS_MAXIDLE) { + if (lss->change & TCF_CBQ_LSS_MAXIDLE) { cl->maxidle = lss->maxidle; cl->avgidle = lss->maxidle; } - if (lss->change&TCF_CBQ_LSS_OFFTIME) + if (lss->change & TCF_CBQ_LSS_OFFTIME) cl->offtime = lss->offtime; return 0; } @@ -1346,10 +1287,10 @@ static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) if (wrr->weight) cl->weight = wrr->weight; if (wrr->priority) { - cl->priority = wrr->priority-1; + cl->priority = wrr->priority - 1; cl->cpriority = cl->priority; if (cl->priority >= cl->priority2) - cl->priority2 = TC_CBQ_MAXPRIO-1; + cl->priority2 = TC_CBQ_MAXPRIO - 1; } cbq_addprio(q, cl); @@ -1366,10 +1307,10 @@ static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) cl->overlimit = cbq_ovl_delay; break; case TC_CBQ_OVL_LOWPRIO: - if (ovl->priority2-1 >= TC_CBQ_MAXPRIO || - ovl->priority2-1 <= cl->priority) + if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO || + ovl->priority2 - 1 <= cl->priority) return -EINVAL; - cl->priority2 = ovl->priority2-1; + cl->priority2 = ovl->priority2 - 1; cl->overlimit = cbq_ovl_lowprio; break; case TC_CBQ_OVL_DROP: @@ -1381,11 +1322,11 @@ static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) default: return -EINVAL; } - cl->penalty = (ovl->penalty*HZ)/1000; + cl->penalty = ovl->penalty; return 0; } -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) { cl->police = p->police; @@ -1406,81 +1347,97 @@ static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) return 0; } -static int cbq_init(struct Qdisc *sch, struct rtattr *opt) +static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = { + [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) }, + [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) }, + [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) }, + [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) }, + [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) }, + [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) }, +}; + +static int cbq_init(struct Qdisc *sch, struct nlattr *opt) { struct cbq_sched_data *q = qdisc_priv(sch); - struct rtattr *tb[TCA_CBQ_MAX]; + struct nlattr *tb[TCA_CBQ_MAX + 1]; struct tc_ratespec *r; + int err; - if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 || - tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || - RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) - return -EINVAL; + err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy); + if (err < 0) + return err; - if (tb[TCA_CBQ_LSSOPT-1] && - RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) + if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL) return -EINVAL; - r = RTA_DATA(tb[TCA_CBQ_RATE-1]); + r = nla_data(tb[TCA_CBQ_RATE]); - if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL) + if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL) return -EINVAL; + err = qdisc_class_hash_init(&q->clhash); + if (err < 0) + goto put_rtab; + q->link.refcnt = 1; q->link.sibling = &q->link; - q->link.classid = 
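The new cbq_policy[] table lets nla_parse_nested() reject under-sized attributes declaratively, which is what allows the patch to drop the long series of open-coded RTA_PAYLOAD() length checks further down. The user-space sketch below mimics that idea with a minimum-length table indexed by attribute type; the attribute names and sizes are invented and carry no relation to the real netlink encoding.

#include <stdio.h>
#include <stddef.h>

enum { ATTR_UNSPEC, ATTR_RATE, ATTR_LSSOPT, ATTR_MAX = ATTR_LSSOPT };

struct attr { int type; size_t payload; };

/* Declarative policy: minimum payload size per attribute type. */
static const size_t policy_min_len[ATTR_MAX + 1] = {
	[ATTR_RATE]   = 12,	/* illustrative, roughly a tc_ratespec */
	[ATTR_LSSOPT] = 24,	/* illustrative, roughly a tc_cbq_lssopt */
};

static int parse(const struct attr *attrs, int n)
{
	for (int i = 0; i < n; i++) {
		int t = attrs[i].type;

		if (t < 0 || t > ATTR_MAX)
			continue;			/* unknown type: skip */
		if (attrs[i].payload < policy_min_len[t])
			return -1;			/* too short: reject */
	}
	return 0;
}

int main(void)
{
	struct attr ok[]  = { { ATTR_RATE, 12 }, { ATTR_LSSOPT, 24 } };
	struct attr bad[] = { { ATTR_RATE, 4 } };

	printf("ok=%d bad=%d\n", parse(ok, 2), parse(bad, 1));
	return 0;
}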
sch->handle; + q->link.common.classid = sch->handle; q->link.qdisc = sch; - if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + sch->handle); + if (!q->link.q) q->link.q = &noop_qdisc; - q->link.priority = TC_CBQ_MAXPRIO-1; - q->link.priority2 = TC_CBQ_MAXPRIO-1; - q->link.cpriority = TC_CBQ_MAXPRIO-1; + q->link.priority = TC_CBQ_MAXPRIO - 1; + q->link.priority2 = TC_CBQ_MAXPRIO - 1; + q->link.cpriority = TC_CBQ_MAXPRIO - 1; q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; q->link.overlimit = cbq_ovl_classic; - q->link.allot = psched_mtu(sch->dev); + q->link.allot = psched_mtu(qdisc_dev(sch)); q->link.quantum = q->link.allot; q->link.weight = q->link.R_tab->rate.rate; q->link.ewma_log = TC_CBQ_DEF_EWMA; q->link.avpkt = q->link.allot/2; q->link.minidle = -0x7FFFFFFF; - q->link.stats_lock = &sch->dev->queue_lock; - init_timer(&q->wd_timer); - q->wd_timer.data = (unsigned long)sch; - q->wd_timer.function = cbq_watchdog; - init_timer(&q->delay_timer); - q->delay_timer.data = (unsigned long)sch; + qdisc_watchdog_init(&q->watchdog, sch); + hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); q->delay_timer.function = cbq_undelay; q->toplevel = TC_CBQ_MAXLEVEL; - PSCHED_GET_TIME(q->now); + q->now = psched_get_time(); q->now_rt = q->now; cbq_link_class(&q->link); - if (tb[TCA_CBQ_LSSOPT-1]) - cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + if (tb[TCA_CBQ_LSSOPT]) + cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT])); cbq_addprio(q, &q->link); return 0; + +put_rtab: + qdisc_put_rtab(q->link.R_tab); + return err; } -static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); - RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); + if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate)) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_lssopt opt; opt.flags = 0; @@ -1495,83 +1452,89 @@ static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) opt.minidle = (u32)(-cl->minidle); opt.offtime = cl->offtime; opt.change = ~0; - RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_wrropt opt; + memset(&opt, 0, sizeof(opt)); opt.flags = 0; opt.allot = cl->allot; - opt.priority = cl->priority+1; - opt.cpriority = cl->cpriority+1; + opt.priority = cl->priority + 1; + opt.cpriority = cl->cpriority + 1; opt.weight = cl->weight; - RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + 
nlmsg_trim(skb, b); return -1; } -static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_ovl opt; opt.strategy = cl->ovl_strategy; - opt.priority2 = cl->priority2+1; + opt.priority2 = cl->priority2 + 1; opt.pad = 0; - opt.penalty = (cl->penalty*1000)/HZ; - RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); + opt.penalty = cl->penalty; + if (nla_put(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_fopt opt; if (cl->split || cl->defmap) { - opt.split = cl->split ? cl->split->classid : 0; + opt.split = cl->split ? cl->split->common.classid : 0; opt.defmap = cl->defmap; opt.defchange = ~0; - RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt)) + goto nla_put_failure; } return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -#ifdef CONFIG_NET_CLS_POLICE -static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) +#ifdef CONFIG_NET_CLS_ACT +static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_police opt; if (cl->police) { opt.police = cl->police; opt.__res1 = 0; opt.__res2 = 0; - RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt)) + goto nla_put_failure; } return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } #endif @@ -1582,7 +1545,7 @@ static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) cbq_dump_rate(skb, cl) < 0 || cbq_dump_wrr(skb, cl) < 0 || cbq_dump_ovl(skb, cl) < 0 || -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT cbq_dump_police(skb, cl) < 0 || #endif cbq_dump_fopt(skb, cl) < 0) @@ -1593,18 +1556,17 @@ static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct cbq_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb->tail; - struct rtattr *rta; + struct nlattr *nest; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; if (cbq_dump_attr(skb, &q->link) < 0) - goto rtattr_failure; - rta->rta_len = skb->tail - b; - return skb->len; + goto nla_put_failure; + return nla_nest_end(skb, nest); -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } @@ -1621,26 +1583,25 @@ static int cbq_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { - struct cbq_class *cl = (struct cbq_class*)arg; - unsigned char *b = skb->tail; - struct rtattr *rta; + struct cbq_class *cl = (struct cbq_class *)arg; + struct nlattr *nest; if (cl->tparent) - tcm->tcm_parent = cl->tparent->classid; + tcm->tcm_parent = cl->tparent->common.classid; else tcm->tcm_parent = TC_H_ROOT; - tcm->tcm_handle = cl->classid; + 
tcm->tcm_handle = cl->common.classid; tcm->tcm_info = cl->q->handle; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; if (cbq_dump_attr(skb, cl) < 0) - goto rtattr_failure; - rta->rta_len = skb->tail - b; - return skb->len; + goto nla_put_failure; + return nla_nest_end(skb, nest); -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } @@ -1649,19 +1610,17 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; cl->qstats.qlen = cl->q->q.qlen; cl->xstats.avgidle = cl->avgidle; cl->xstats.undertime = 0; - if (!PSCHED_IS_PASTPERFECT(cl->undertime)) - cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now); + if (cl->undertime != PSCHED_PASTPERFECT) + cl->xstats.undertime = cl->undertime - q->now; if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || -#ifdef CONFIG_NET_ESTIMATOR - gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || -#endif + gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, &cl->qstats) < 0) return -1; @@ -1671,36 +1630,42 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old) { - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; - if (cl) { - if (new == NULL) { - if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL) - return -ENOBUFS; - } else { -#ifdef CONFIG_NET_CLS_POLICE - if (cl->police == TC_POLICE_RECLASSIFY) - new->reshape_fail = cbq_reshape_fail; + if (new == NULL) { + new = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, cl->common.classid); + if (new == NULL) + return -ENOBUFS; + } else { +#ifdef CONFIG_NET_CLS_ACT + if (cl->police == TC_POLICE_RECLASSIFY) + new->reshape_fail = cbq_reshape_fail; #endif - } - sch_tree_lock(sch); - *old = cl->q; - cl->q = new; - sch->q.qlen -= (*old)->q.qlen; - qdisc_reset(*old); - sch_tree_unlock(sch); - - return 0; } - return -ENOENT; + sch_tree_lock(sch); + *old = cl->q; + cl->q = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class *)arg; + + return cl->q; } -static struct Qdisc * -cbq_leaf(struct Qdisc *sch, unsigned long arg) +static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg) { - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; - return cl ? 
cl->q : NULL; + if (cl->q->q.qlen == 0) + cbq_deactivate_class(cl); } static unsigned long cbq_get(struct Qdisc *sch, u32 classid) @@ -1715,40 +1680,28 @@ static unsigned long cbq_get(struct Qdisc *sch, u32 classid) return 0; } -static void cbq_destroy_filters(struct cbq_class *cl) -{ - struct tcf_proto *tp; - - while ((tp = cl->filter_list) != NULL) { - cl->filter_list = tp->next; - tcf_destroy(tp); - } -} - static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(sch); - BUG_TRAP(!cl->filters); + WARN_ON(cl->filters); - cbq_destroy_filters(cl); + tcf_destroy_chain(&cl->filter_list); qdisc_destroy(cl->q); qdisc_put_rtab(cl->R_tab); -#ifdef CONFIG_NET_ESTIMATOR gen_kill_estimator(&cl->bstats, &cl->rate_est); -#endif if (cl != &q->link) kfree(cl); } -static void -cbq_destroy(struct Qdisc* sch) +static void cbq_destroy(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); + struct hlist_node *next; struct cbq_class *cl; - unsigned h; + unsigned int h; -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT q->rx_class = NULL; #endif /* @@ -1756,32 +1709,31 @@ cbq_destroy(struct Qdisc* sch) * classes from root to leafs which means that filters can still * be bound to classes which have been destroyed already. --TGR '04 */ - for (h = 0; h < 16; h++) - for (cl = q->classes[h]; cl; cl = cl->next) - cbq_destroy_filters(cl); - - for (h = 0; h < 16; h++) { - struct cbq_class *next; - - for (cl = q->classes[h]; cl; cl = next) { - next = cl->next; + for (h = 0; h < q->clhash.hashsize; h++) { + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) + tcf_destroy_chain(&cl->filter_list); + } + for (h = 0; h < q->clhash.hashsize; h++) { + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h], + common.hnode) cbq_destroy_class(sch, cl); - } } + qdisc_class_hash_destroy(&q->clhash); } static void cbq_put(struct Qdisc *sch, unsigned long arg) { - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; if (--cl->refcnt == 0) { -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT + spinlock_t *root_lock = qdisc_root_sleeping_lock(sch); struct cbq_sched_data *q = qdisc_priv(sch); - spin_lock_bh(&sch->dev->queue_lock); + spin_lock_bh(root_lock); if (q->rx_class == cl) q->rx_class = NULL; - spin_unlock_bh(&sch->dev->queue_lock); + spin_unlock_bh(root_lock); #endif cbq_destroy_class(sch, cl); @@ -1789,61 +1741,51 @@ static void cbq_put(struct Qdisc *sch, unsigned long arg) } static int -cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca, +cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg) { int err; struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class*)*arg; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_CBQ_MAX]; + struct cbq_class *cl = (struct cbq_class *)*arg; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_CBQ_MAX + 1]; struct cbq_class *parent; struct qdisc_rate_table *rtab = NULL; - if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt)) - return -EINVAL; - - if (tb[TCA_CBQ_OVL_STRATEGY-1] && - RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl)) + if (opt == NULL) return -EINVAL; - if (tb[TCA_CBQ_FOPT-1] && - RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt)) - return -EINVAL; - - if (tb[TCA_CBQ_RATE-1] && - RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) - return -EINVAL; - - if 
(tb[TCA_CBQ_LSSOPT-1] && - RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) - return -EINVAL; - - if (tb[TCA_CBQ_WRROPT-1] && - RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt)) - return -EINVAL; - -#ifdef CONFIG_NET_CLS_POLICE - if (tb[TCA_CBQ_POLICE-1] && - RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police)) - return -EINVAL; -#endif + err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy); + if (err < 0) + return err; if (cl) { /* Check parent */ if (parentid) { - if (cl->tparent && cl->tparent->classid != parentid) + if (cl->tparent && + cl->tparent->common.classid != parentid) return -EINVAL; if (!cl->tparent && parentid != TC_H_ROOT) return -EINVAL; } - if (tb[TCA_CBQ_RATE-1]) { - rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + if (tb[TCA_CBQ_RATE]) { + rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), + tb[TCA_CBQ_RTAB]); if (rtab == NULL) return -EINVAL; } + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) { + qdisc_put_rtab(rtab); + return err; + } + } + /* Change class parameters */ sch_tree_lock(sch); @@ -1851,62 +1793,58 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **t cbq_deactivate_class(cl); if (rtab) { - rtab = xchg(&cl->R_tab, rtab); - qdisc_put_rtab(rtab); + qdisc_put_rtab(cl->R_tab); + cl->R_tab = rtab; } - if (tb[TCA_CBQ_LSSOPT-1]) - cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + if (tb[TCA_CBQ_LSSOPT]) + cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); - if (tb[TCA_CBQ_WRROPT-1]) { + if (tb[TCA_CBQ_WRROPT]) { cbq_rmprio(q, cl); - cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); + cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); } - if (tb[TCA_CBQ_OVL_STRATEGY-1]) - cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); + if (tb[TCA_CBQ_OVL_STRATEGY]) + cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY])); -#ifdef CONFIG_NET_CLS_POLICE - if (tb[TCA_CBQ_POLICE-1]) - cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); +#ifdef CONFIG_NET_CLS_ACT + if (tb[TCA_CBQ_POLICE]) + cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); #endif - if (tb[TCA_CBQ_FOPT-1]) - cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + if (tb[TCA_CBQ_FOPT]) + cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); if (cl->q->q.qlen) cbq_activate_class(cl); sch_tree_unlock(sch); -#ifdef CONFIG_NET_ESTIMATOR - if (tca[TCA_RATE-1]) - gen_replace_estimator(&cl->bstats, &cl->rate_est, - cl->stats_lock, tca[TCA_RATE-1]); -#endif return 0; } if (parentid == TC_H_ROOT) return -EINVAL; - if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || - tb[TCA_CBQ_LSSOPT-1] == NULL) + if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL || + tb[TCA_CBQ_LSSOPT] == NULL) return -EINVAL; - rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]); if (rtab == NULL) return -EINVAL; if (classid) { err = -EINVAL; - if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid)) + if (TC_H_MAJ(classid ^ sch->handle) || + cbq_class_lookup(q, classid)) goto failure; } else { int i; - classid = TC_H_MAKE(sch->handle,0x8000); + classid = TC_H_MAKE(sch->handle, 0x8000); - for (i=0; i<0x8000; i++) { + for (i = 0; i < 0x8000; i++) { if (++q->hgenerator >= 0x8000) q->hgenerator = 1; if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) @@ -1927,22 +1865,32 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct 
rtattr **t } err = -ENOBUFS; - cl = kmalloc(sizeof(*cl), GFP_KERNEL); + cl = kzalloc(sizeof(*cl), GFP_KERNEL); if (cl == NULL) goto failure; - memset(cl, 0, sizeof(*cl)); + + if (tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) { + kfree(cl); + goto failure; + } + } + cl->R_tab = rtab; rtab = NULL; cl->refcnt = 1; - if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); + if (!cl->q) cl->q = &noop_qdisc; - cl->classid = classid; + cl->common.classid = classid; cl->tparent = parent; cl->qdisc = sch; cl->allot = parent->allot; cl->quantum = cl->allot; cl->weight = cl->R_tab->rate.rate; - cl->stats_lock = &sch->dev->queue_lock; sch_tree_lock(sch); cbq_link_class(cl); @@ -1951,30 +1899,26 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **t cl->share = cl->tparent; cbq_adjust_levels(parent); cl->minidle = -0x7FFFFFFF; - cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); - cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); - if (cl->ewma_log==0) + cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); + cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); + if (cl->ewma_log == 0) cl->ewma_log = q->link.ewma_log; - if (cl->maxidle==0) + if (cl->maxidle == 0) cl->maxidle = q->link.maxidle; - if (cl->avpkt==0) + if (cl->avpkt == 0) cl->avpkt = q->link.avpkt; cl->overlimit = cbq_ovl_classic; - if (tb[TCA_CBQ_OVL_STRATEGY-1]) - cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); -#ifdef CONFIG_NET_CLS_POLICE - if (tb[TCA_CBQ_POLICE-1]) - cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); + if (tb[TCA_CBQ_OVL_STRATEGY]) + cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY])); +#ifdef CONFIG_NET_CLS_ACT + if (tb[TCA_CBQ_POLICE]) + cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); #endif - if (tb[TCA_CBQ_FOPT-1]) - cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + if (tb[TCA_CBQ_FOPT]) + cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); sch_tree_unlock(sch); -#ifdef CONFIG_NET_ESTIMATOR - if (tca[TCA_RATE-1]) - gen_new_estimator(&cl->bstats, &cl->rate_est, - cl->stats_lock, tca[TCA_RATE-1]); -#endif + qdisc_class_hash_grow(sch, &q->clhash); *arg = (unsigned long)cl; return 0; @@ -1987,13 +1931,18 @@ failure: static int cbq_delete(struct Qdisc *sch, unsigned long arg) { struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; + unsigned int qlen; if (cl->filters || cl->children || cl == &q->link) return -EBUSY; sch_tree_lock(sch); + qlen = cl->q->q.qlen; + qdisc_reset(cl->q); + qdisc_tree_decrease_qlen(cl->q, qlen); + if (cl->next_alive) cbq_deactivate_class(cl); @@ -2003,7 +1952,7 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) q->tx_class = NULL; q->tx_borrowed = NULL; } -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT if (q->rx_class == cl) q->rx_class = NULL; #endif @@ -2016,8 +1965,11 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) cbq_rmprio(q, cl); sch_tree_unlock(sch); - if (--cl->refcnt == 0) - cbq_destroy_class(sch, cl); + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). 
+ */ return 0; } @@ -2037,7 +1989,7 @@ static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, u32 classid) { struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *p = (struct cbq_class*)parent; + struct cbq_class *p = (struct cbq_class *)parent; struct cbq_class *cl = cbq_class_lookup(q, classid); if (cl) { @@ -2051,7 +2003,7 @@ static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) { - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; cl->filters--; } @@ -2059,15 +2011,14 @@ static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct cbq_sched_data *q = qdisc_priv(sch); - unsigned h; + struct cbq_class *cl; + unsigned int h; if (arg->stop) return; - for (h = 0; h < 16; h++) { - struct cbq_class *cl; - - for (cl = q->classes[h]; cl; cl = cl->next) { + for (h = 0; h < q->clhash.hashsize; h++) { + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; @@ -2081,9 +2032,10 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct Qdisc_class_ops cbq_class_ops = { +static const struct Qdisc_class_ops cbq_class_ops = { .graft = cbq_graft, .leaf = cbq_leaf, + .qlen_notify = cbq_qlen_notify, .get = cbq_get, .put = cbq_put, .change = cbq_change_class, @@ -2096,14 +2048,14 @@ static struct Qdisc_class_ops cbq_class_ops = { .dump_stats = cbq_dump_class_stats, }; -static struct Qdisc_ops cbq_qdisc_ops = { +static struct Qdisc_ops cbq_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &cbq_class_ops, .id = "cbq", .priv_size = sizeof(struct cbq_sched_data), .enqueue = cbq_enqueue, .dequeue = cbq_dequeue, - .requeue = cbq_requeue, + .peek = qdisc_peek_dequeued, .drop = cbq_drop, .init = cbq_init, .reset = cbq_reset, @@ -2118,7 +2070,7 @@ static int __init cbq_module_init(void) { return register_qdisc(&cbq_qdisc_ops); } -static void __exit cbq_module_exit(void) +static void __exit cbq_module_exit(void) { unregister_qdisc(&cbq_qdisc_ops); } diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c new file mode 100644 index 00000000000..ed30e436128 --- /dev/null +++ b/net/sched/sch_choke.c @@ -0,0 +1,632 @@ +/* + * net/sched/sch_choke.c CHOKE scheduler + * + * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com> + * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <net/pkt_sched.h> +#include <net/inet_ecn.h> +#include <net/red.h> +#include <net/flow_keys.h> + +/* + CHOKe stateless AQM for fair bandwidth allocation + ================================================= + + CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for + unresponsive flows) is a variant of RED that penalizes misbehaving flows but + maintains no flow state. The difference from RED is an additional step + during the enqueuing process. If average queue size is over the + low threshold (qmin), a packet is chosen at random from the queue. + If both the new and chosen packet are from the same flow, both + are dropped. 
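The description above reduces CHOKe to one extra step at enqueue time: once the RED average queue crosses qth_min, draw one queued packet at random and, if it belongs to the same flow as the arrival, drop both. The toy simulation below exercises only that matching step, with flows reduced to integers and the queue to a fixed array; nothing here is the kernel data path. Because flow 1 holds most of the slots it is the most likely to be hit, which is how CHOKe penalizes dominant flows without keeping per-flow state.

#include <stdio.h>
#include <stdlib.h>

#define QLEN 8

/* Toy replay of the CHOKe matching step: compare each arrival against one
 * randomly chosen queued packet and drop both on a flow match. */
int main(void)
{
	int queue[QLEN] = { 1, 2, 1, 3, 1, 2, 4, 1 };	/* queued flow ids */
	int arrivals[]  = { 1, 3, 5 };			/* flows of new packets */
	unsigned int i;

	srand(42);
	for (i = 0; i < sizeof(arrivals) / sizeof(arrivals[0]); i++) {
		int victim = rand() % QLEN;

		if (queue[victim] == arrivals[i])
			printf("flow %d: matched slot %d -> drop both\n",
			       arrivals[i], victim);
		else
			printf("flow %d: no match (slot %d is flow %d) -> normal RED path\n",
			       arrivals[i], victim, queue[victim]);
	}
	return 0;
}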
Unlike RED, CHOKe is not really a "classful" qdisc because it + needs to access packets in queue randomly. It has a minimal class + interface to allow overriding the builtin flow classifier with + filters. + + Source: + R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless + Active Queue Management Scheme for Approximating Fair Bandwidth Allocation", + IEEE INFOCOM, 2000. + + A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial + Characteristics", IEEE/ACM Transactions on Networking, 2004 + + */ + +/* Upper bound on size of sk_buff table (packets) */ +#define CHOKE_MAX_QUEUE (128*1024 - 1) + +struct choke_sched_data { +/* Parameters */ + u32 limit; + unsigned char flags; + + struct red_parms parms; + +/* Variables */ + struct red_vars vars; + struct tcf_proto *filter_list; + struct { + u32 prob_drop; /* Early probability drops */ + u32 prob_mark; /* Early probability marks */ + u32 forced_drop; /* Forced drops, qavg > max_thresh */ + u32 forced_mark; /* Forced marks, qavg > max_thresh */ + u32 pdrop; /* Drops due to queue limits */ + u32 other; /* Drops due to drop() calls */ + u32 matched; /* Drops to flow match */ + } stats; + + unsigned int head; + unsigned int tail; + + unsigned int tab_mask; /* size - 1 */ + + struct sk_buff **tab; +}; + +/* number of elements in queue including holes */ +static unsigned int choke_len(const struct choke_sched_data *q) +{ + return (q->tail - q->head) & q->tab_mask; +} + +/* Is ECN parameter configured */ +static int use_ecn(const struct choke_sched_data *q) +{ + return q->flags & TC_RED_ECN; +} + +/* Should packets over max just be dropped (versus marked) */ +static int use_harddrop(const struct choke_sched_data *q) +{ + return q->flags & TC_RED_HARDDROP; +} + +/* Move head pointer forward to skip over holes */ +static void choke_zap_head_holes(struct choke_sched_data *q) +{ + do { + q->head = (q->head + 1) & q->tab_mask; + if (q->head == q->tail) + break; + } while (q->tab[q->head] == NULL); +} + +/* Move tail pointer backwards to reuse holes */ +static void choke_zap_tail_holes(struct choke_sched_data *q) +{ + do { + q->tail = (q->tail - 1) & q->tab_mask; + if (q->head == q->tail) + break; + } while (q->tab[q->tail] == NULL); +} + +/* Drop packet from queue array by creating a "hole" */ +static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = q->tab[idx]; + + q->tab[idx] = NULL; + + if (idx == q->head) + choke_zap_head_holes(q); + if (idx == q->tail) + choke_zap_tail_holes(q); + + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_drop(skb, sch); + qdisc_tree_decrease_qlen(sch, 1); + --sch->q.qlen; +} + +struct choke_skb_cb { + u16 classid; + u8 keys_valid; + struct flow_keys keys; +}; + +static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb)); + return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static inline void choke_set_classid(struct sk_buff *skb, u16 classid) +{ + choke_skb_cb(skb)->classid = classid; +} + +static u16 choke_get_classid(const struct sk_buff *skb) +{ + return choke_skb_cb(skb)->classid; +} + +/* + * Compare flow of two packets + * Returns true only if source and destination address and port match. 
+ * false for special cases + */ +static bool choke_match_flow(struct sk_buff *skb1, + struct sk_buff *skb2) +{ + if (skb1->protocol != skb2->protocol) + return false; + + if (!choke_skb_cb(skb1)->keys_valid) { + choke_skb_cb(skb1)->keys_valid = 1; + skb_flow_dissect(skb1, &choke_skb_cb(skb1)->keys); + } + + if (!choke_skb_cb(skb2)->keys_valid) { + choke_skb_cb(skb2)->keys_valid = 1; + skb_flow_dissect(skb2, &choke_skb_cb(skb2)->keys); + } + + return !memcmp(&choke_skb_cb(skb1)->keys, + &choke_skb_cb(skb2)->keys, + sizeof(struct flow_keys)); +} + +/* + * Classify flow using either: + * 1. pre-existing classification result in skb + * 2. fast internal classification + * 3. use TC filter based classification + */ +static bool choke_classify(struct sk_buff *skb, + struct Qdisc *sch, int *qerr) + +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct tcf_result res; + int result; + + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return false; + } +#endif + choke_set_classid(skb, TC_H_MIN(res.classid)); + return true; + } + + return false; +} + +/* + * Select a packet at random from queue + * HACK: since queue can have holes from previous deletion; retry several + * times to find a random skb but then just give up and return the head + * Will return NULL if queue is empty (q->head == q->tail) + */ +static struct sk_buff *choke_peek_random(const struct choke_sched_data *q, + unsigned int *pidx) +{ + struct sk_buff *skb; + int retrys = 3; + + do { + *pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask; + skb = q->tab[*pidx]; + if (skb) + return skb; + } while (--retrys > 0); + + return q->tab[*pidx = q->head]; +} + +/* + * Compare new packet with random packet in queue + * returns true if matched and sets *pidx + */ +static bool choke_match_random(const struct choke_sched_data *q, + struct sk_buff *nskb, + unsigned int *pidx) +{ + struct sk_buff *oskb; + + if (q->head == q->tail) + return false; + + oskb = choke_peek_random(q, pidx); + if (q->filter_list) + return choke_get_classid(nskb) == choke_get_classid(oskb); + + return choke_match_flow(oskb, nskb); +} + +static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + const struct red_parms *p = &q->parms; + int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + + if (q->filter_list) { + /* If using external classifiers, get result and record it. */ + if (!choke_classify(skb, sch, &ret)) + goto other_drop; /* Packet was eaten by filter */ + } + + choke_skb_cb(skb)->keys_valid = 0; + /* Compute average queue usage (see RED) */ + q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen); + if (red_is_idling(&q->vars)) + red_end_of_idle_period(&q->vars); + + /* Is queue small? 
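choke_drop_by_idx() above leaves NULL holes in the packet table, and choke_peek_random() copes with them by retrying a few random probes before giving up and returning the head slot (which choke_zap_head_holes() keeps pointing at a real packet). The stand-alone sketch below reproduces just that probe-with-fallback logic over a small holey array; the 3-probe limit mirrors the hunk, everything else is illustrative.

#include <stdio.h>
#include <stdlib.h>

#define MASK 7					/* table of 8 slots, power of two */

/* Probe a few random slots of a holey ring; fall back to the head slot,
 * which the real code keeps hole-free via choke_zap_head_holes(). */
static const char *tab[MASK + 1] = {
	"pkt0", NULL, "pkt2", NULL, "pkt4", NULL, NULL, "pkt7"
};

static const char *peek_random(unsigned int head, unsigned int len,
			       unsigned int *pidx)
{
	int retries = 3;

	do {
		*pidx = (head + (unsigned int)rand() % len) & MASK;
		if (tab[*pidx])
			return tab[*pidx];
	} while (--retries > 0);

	*pidx = head;
	return tab[head];
}

int main(void)
{
	unsigned int idx;
	const char *pkt;

	srand(1);
	pkt = peek_random(0, 8, &idx);		/* head = 0, 8 slots in use */
	printf("picked slot %u: %s\n", idx, pkt ? pkt : "(hole)");
	return 0;
}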
*/ + if (q->vars.qavg <= p->qth_min) + q->vars.qcount = -1; + else { + unsigned int idx; + + /* Draw a packet at random from queue and compare flow */ + if (choke_match_random(q, skb, &idx)) { + q->stats.matched++; + choke_drop_by_idx(sch, idx); + goto congestion_drop; + } + + /* Queue is large, always mark/drop */ + if (q->vars.qavg > p->qth_max) { + q->vars.qcount = -1; + + sch->qstats.overlimits++; + if (use_harddrop(q) || !use_ecn(q) || + !INET_ECN_set_ce(skb)) { + q->stats.forced_drop++; + goto congestion_drop; + } + + q->stats.forced_mark++; + } else if (++q->vars.qcount) { + if (red_mark_probability(p, &q->vars, q->vars.qavg)) { + q->vars.qcount = 0; + q->vars.qR = red_random(p); + + sch->qstats.overlimits++; + if (!use_ecn(q) || !INET_ECN_set_ce(skb)) { + q->stats.prob_drop++; + goto congestion_drop; + } + + q->stats.prob_mark++; + } + } else + q->vars.qR = red_random(p); + } + + /* Admit new packet */ + if (sch->q.qlen < q->limit) { + q->tab[q->tail] = skb; + q->tail = (q->tail + 1) & q->tab_mask; + ++sch->q.qlen; + sch->qstats.backlog += qdisc_pkt_len(skb); + return NET_XMIT_SUCCESS; + } + + q->stats.pdrop++; + return qdisc_drop(skb, sch); + +congestion_drop: + qdisc_drop(skb, sch); + return NET_XMIT_CN; + +other_drop: + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; +} + +static struct sk_buff *choke_dequeue(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + if (q->head == q->tail) { + if (!red_is_idling(&q->vars)) + red_start_of_idle_period(&q->vars); + return NULL; + } + + skb = q->tab[q->head]; + q->tab[q->head] = NULL; + choke_zap_head_holes(q); + --sch->q.qlen; + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_bstats_update(sch, skb); + + return skb; +} + +static unsigned int choke_drop(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + unsigned int len; + + len = qdisc_queue_drop(sch); + if (len > 0) + q->stats.other++; + else { + if (!red_is_idling(&q->vars)) + red_start_of_idle_period(&q->vars); + } + + return len; +} + +static void choke_reset(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + + red_restart(&q->vars); +} + +static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = { + [TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) }, + [TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE }, + [TCA_CHOKE_MAX_P] = { .type = NLA_U32 }, +}; + + +static void choke_free(void *addr) +{ + kvfree(addr); +} + +static int choke_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_CHOKE_MAX + 1]; + const struct tc_red_qopt *ctl; + int err; + struct sk_buff **old = NULL; + unsigned int mask; + u32 max_P; + + if (opt == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy); + if (err < 0) + return err; + + if (tb[TCA_CHOKE_PARMS] == NULL || + tb[TCA_CHOKE_STAB] == NULL) + return -EINVAL; + + max_P = tb[TCA_CHOKE_MAX_P] ? 
nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0; + + ctl = nla_data(tb[TCA_CHOKE_PARMS]); + + if (ctl->limit > CHOKE_MAX_QUEUE) + return -EINVAL; + + mask = roundup_pow_of_two(ctl->limit + 1) - 1; + if (mask != q->tab_mask) { + struct sk_buff **ntab; + + ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), + GFP_KERNEL | __GFP_NOWARN); + if (!ntab) + ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *)); + if (!ntab) + return -ENOMEM; + + sch_tree_lock(sch); + old = q->tab; + if (old) { + unsigned int oqlen = sch->q.qlen, tail = 0; + + while (q->head != q->tail) { + struct sk_buff *skb = q->tab[q->head]; + + q->head = (q->head + 1) & q->tab_mask; + if (!skb) + continue; + if (tail < mask) { + ntab[tail++] = skb; + continue; + } + sch->qstats.backlog -= qdisc_pkt_len(skb); + --sch->q.qlen; + qdisc_drop(skb, sch); + } + qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen); + q->head = 0; + q->tail = tail; + } + + q->tab_mask = mask; + q->tab = ntab; + } else + sch_tree_lock(sch); + + q->flags = ctl->flags; + q->limit = ctl->limit; + + red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, + ctl->Plog, ctl->Scell_log, + nla_data(tb[TCA_CHOKE_STAB]), + max_P); + red_set_vars(&q->vars); + + if (q->head == q->tail) + red_end_of_idle_period(&q->vars); + + sch_tree_unlock(sch); + choke_free(old); + return 0; +} + +static int choke_init(struct Qdisc *sch, struct nlattr *opt) +{ + return choke_change(sch, opt); +} + +static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct nlattr *opts = NULL; + struct tc_red_qopt opt = { + .limit = q->limit, + .flags = q->flags, + .qth_min = q->parms.qth_min >> q->parms.Wlog, + .qth_max = q->parms.qth_max >> q->parms.Wlog, + .Wlog = q->parms.Wlog, + .Plog = q->parms.Plog, + .Scell_log = q->parms.Scell_log, + }; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) || + nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P)) + goto nla_put_failure; + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct tc_choke_xstats st = { + .early = q->stats.prob_drop + q->stats.forced_drop, + .marked = q->stats.prob_mark + q->stats.forced_mark, + .pdrop = q->stats.pdrop, + .other = q->stats.other, + .matched = q->stats.matched, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static void choke_destroy(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + choke_free(q->tab); +} + +static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long choke_get(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static void choke_put(struct Qdisc *q, unsigned long cl) +{ +} + +static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return 0; +} + +static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct choke_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static int choke_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + if (!arg->stop) { + if (arg->fn(sch, 1, arg) 
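When the queue limit changes, choke_change() above rebuilds the packet table: it walks the old ring from head to tail, skips the holes, keeps at most mask packets in the new table (one slot stays free so head == tail still means empty) and drops whatever does not fit. The stand-alone sketch below replays that compaction with strings standing in for sk_buffs; all sizes are invented.

#include <stdio.h>

int main(void)
{
	/* old ring of 8 slots with holes; 5 real "packets" */
	const char *old_tab[8] = { "a", NULL, "b", NULL, "c", "d", NULL, "e" };
	unsigned int old_mask = 7, head = 0;

	/* new table of 4 slots: at most new_mask = 3 packets are kept so a
	 * free slot remains to tell "empty" from "full" */
	const char *new_tab[4] = { NULL };
	unsigned int new_mask = 3, new_tail = 0, dropped = 0, n;

	for (n = 0; n <= old_mask; n++) {
		const char *skb = old_tab[(head + n) & old_mask];

		if (!skb)
			continue;		/* skip hole */
		if (new_tail < new_mask)
			new_tab[new_tail++] = skb;
		else
			dropped++;		/* no room in the new ring */
	}

	for (n = 0; n < new_tail; n++)
		printf("new_tab[%u] = %s\n", n, new_tab[n]);
	printf("kept %u, dropped %u\n", new_tail, dropped);
	return 0;
}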
< 0) { + arg->stop = 1; + return; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops choke_class_ops = { + .leaf = choke_leaf, + .get = choke_get, + .put = choke_put, + .tcf_chain = choke_find_tcf, + .bind_tcf = choke_bind, + .unbind_tcf = choke_put, + .dump = choke_dump_class, + .walk = choke_walk, +}; + +static struct sk_buff *choke_peek_head(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + + return (q->head != q->tail) ? q->tab[q->head] : NULL; +} + +static struct Qdisc_ops choke_qdisc_ops __read_mostly = { + .id = "choke", + .priv_size = sizeof(struct choke_sched_data), + + .enqueue = choke_enqueue, + .dequeue = choke_dequeue, + .peek = choke_peek_head, + .drop = choke_drop, + .init = choke_init, + .destroy = choke_destroy, + .reset = choke_reset, + .change = choke_change, + .dump = choke_dump, + .dump_stats = choke_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init choke_module_init(void) +{ + return register_qdisc(&choke_qdisc_ops); +} + +static void __exit choke_module_exit(void) +{ + unregister_qdisc(&choke_qdisc_ops); +} + +module_init(choke_module_init) +module_exit(choke_module_exit) + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c new file mode 100644 index 00000000000..2f9ab17db85 --- /dev/null +++ b/net/sched/sch_codel.c @@ -0,0 +1,276 @@ +/* + * Codel - The Controlled-Delay Active Queue Management algorithm + * + * Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com> + * Copyright (C) 2011-2012 Van Jacobson <van@pollere.net> + * + * Implemented on linux by : + * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net> + * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ * + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/prefetch.h> +#include <net/pkt_sched.h> +#include <net/codel.h> + + +#define DEFAULT_CODEL_LIMIT 1000 + +struct codel_sched_data { + struct codel_params params; + struct codel_vars vars; + struct codel_stats stats; + u32 drop_overlimit; +}; + +/* This is the specific function called from codel_dequeue() + * to dequeue a packet from queue. Note: backlog is handled in + * codel, we dont need to reduce it here. + */ +static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) +{ + struct sk_buff *skb = __skb_dequeue(&sch->q); + + prefetch(&skb->end); /* we'll need skb_shinfo() */ + return skb; +} + +static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue); + + /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, + * or HTB crashes. Defer it for next round. + */ + if (q->stats.drop_count && sch->q.qlen) { + qdisc_tree_decrease_qlen(sch, q->stats.drop_count); + q->stats.drop_count = 0; + } + if (skb) + qdisc_bstats_update(sch, skb); + return skb; +} + +static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct codel_sched_data *q; + + if (likely(qdisc_qlen(sch) < sch->limit)) { + codel_set_enqueue_time(skb); + return qdisc_enqueue_tail(skb, sch); + } + q = qdisc_priv(sch); + q->drop_overlimit++; + return qdisc_drop(skb, sch); +} + +static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { + [TCA_CODEL_TARGET] = { .type = NLA_U32 }, + [TCA_CODEL_LIMIT] = { .type = NLA_U32 }, + [TCA_CODEL_INTERVAL] = { .type = NLA_U32 }, + [TCA_CODEL_ECN] = { .type = NLA_U32 }, +}; + +static int codel_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_CODEL_MAX + 1]; + unsigned int qlen; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + + if (tb[TCA_CODEL_TARGET]) { + u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]); + + q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_CODEL_INTERVAL]) { + u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]); + + q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_CODEL_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]); + + if (tb[TCA_CODEL_ECN]) + q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]); + + qlen = sch->q.qlen; + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = __skb_dequeue(&sch->q); + + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_drop(skb, sch); + } + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + + sch_tree_unlock(sch); + return 0; +} + +static int codel_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct codel_sched_data *q = qdisc_priv(sch); + + sch->limit = DEFAULT_CODEL_LIMIT; + + codel_params_init(&q->params); + codel_vars_init(&q->vars); + codel_stats_init(&q->stats); + + if (opt) { + int err = codel_change(sch, opt); + + if (err) + return err; + } + + if (sch->limit >= 1) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; + + return 0; +} + +static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct nlattr 
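codel_change() above takes TARGET and INTERVAL from user space in microseconds and stores them in CoDel's internal fixed-point time unit via ((u64)us * NSEC_PER_USEC) >> CODEL_SHIFT, and codel_dump() converts back with codel_time_to_us(). The stand-alone sketch below shows that arithmetic in isolation; the CODEL_SHIFT value of 10 (one unit is roughly 1024 ns) is an assumption about include/net/codel.h, not something visible in this diff. The round trip loses about a microsecond to truncation, which is harmless at these time scales.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC	1000ULL
#define CODEL_SHIFT	10	/* assumed value; one codel unit ~= 1024 ns */

/* Mirrors the conversions used by codel_change()/codel_dump() above. */
static uint32_t us_to_codel(uint32_t us)
{
	return (uint32_t)(((uint64_t)us * NSEC_PER_USEC) >> CODEL_SHIFT);
}

static uint32_t codel_to_us(uint32_t t)
{
	return (uint32_t)(((uint64_t)t << CODEL_SHIFT) / NSEC_PER_USEC);
}

int main(void)
{
	uint32_t target_us = 5000;	/* the usual 5 ms CoDel target */
	uint32_t t = us_to_codel(target_us);

	printf("%u us -> %u codel units -> %u us\n",
	       target_us, t, codel_to_us(t));
	return 0;
}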
*opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CODEL_TARGET, + codel_time_to_us(q->params.target)) || + nla_put_u32(skb, TCA_CODEL_LIMIT, + sch->limit) || + nla_put_u32(skb, TCA_CODEL_INTERVAL, + codel_time_to_us(q->params.interval)) || + nla_put_u32(skb, TCA_CODEL_ECN, + q->params.ecn)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -1; +} + +static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + const struct codel_sched_data *q = qdisc_priv(sch); + struct tc_codel_xstats st = { + .maxpacket = q->stats.maxpacket, + .count = q->vars.count, + .lastcount = q->vars.lastcount, + .drop_overlimit = q->drop_overlimit, + .ldelay = codel_time_to_us(q->vars.ldelay), + .dropping = q->vars.dropping, + .ecn_mark = q->stats.ecn_mark, + }; + + if (q->vars.dropping) { + codel_tdiff_t delta = q->vars.drop_next - codel_get_time(); + + if (delta >= 0) + st.drop_next = codel_time_to_us(delta); + else + st.drop_next = -codel_time_to_us(-delta); + } + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static void codel_reset(struct Qdisc *sch) +{ + struct codel_sched_data *q = qdisc_priv(sch); + + qdisc_reset_queue(sch); + codel_vars_init(&q->vars); +} + +static struct Qdisc_ops codel_qdisc_ops __read_mostly = { + .id = "codel", + .priv_size = sizeof(struct codel_sched_data), + + .enqueue = codel_qdisc_enqueue, + .dequeue = codel_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .init = codel_init, + .reset = codel_reset, + .change = codel_change, + .dump = codel_dump, + .dump_stats = codel_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init codel_module_init(void) +{ + return register_qdisc(&codel_qdisc_ops); +} + +static void __exit codel_module_exit(void) +{ + unregister_qdisc(&codel_qdisc_ops); +} + +module_init(codel_module_init) +module_exit(codel_module_exit) + +MODULE_DESCRIPTION("Controlled Delay queue discipline"); +MODULE_AUTHOR("Dave Taht"); +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c new file mode 100644 index 00000000000..7bbbfe11219 --- /dev/null +++ b/net/sched/sch_drr.c @@ -0,0 +1,526 @@ +/* + * net/sched/sch_drr.c Deficit Round Robin scheduler + * + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/pkt_sched.h> +#include <net/sch_generic.h> +#include <net/pkt_sched.h> +#include <net/pkt_cls.h> + +struct drr_class { + struct Qdisc_class_common common; + unsigned int refcnt; + unsigned int filter_cnt; + + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; + struct gnet_stats_rate_est64 rate_est; + struct list_head alist; + struct Qdisc *qdisc; + + u32 quantum; + u32 deficit; +}; + +struct drr_sched { + struct list_head active; + struct tcf_proto *filter_list; + struct Qdisc_class_hash clhash; +}; + +static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid) +{ + struct drr_sched *q = qdisc_priv(sch); + struct Qdisc_class_common *clc; + + clc = qdisc_class_find(&q->clhash, classid); + if (clc == NULL) + return NULL; + return container_of(clc, struct drr_class, common); +} + +static void drr_purge_queue(struct drr_class *cl) +{ + unsigned int len = cl->qdisc->q.qlen; + + qdisc_reset(cl->qdisc); + qdisc_tree_decrease_qlen(cl->qdisc, len); +} + +static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = { + [TCA_DRR_QUANTUM] = { .type = NLA_U32 }, +}; + +static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct nlattr **tca, unsigned long *arg) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl = (struct drr_class *)*arg; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_DRR_MAX + 1]; + u32 quantum; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy); + if (err < 0) + return err; + + if (tb[TCA_DRR_QUANTUM]) { + quantum = nla_get_u32(tb[TCA_DRR_QUANTUM]); + if (quantum == 0) + return -EINVAL; + } else + quantum = psched_mtu(qdisc_dev(sch)); + + if (cl != NULL) { + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) + return err; + } + + sch_tree_lock(sch); + if (tb[TCA_DRR_QUANTUM]) + cl->quantum = quantum; + sch_tree_unlock(sch); + + return 0; + } + + cl = kzalloc(sizeof(struct drr_class), GFP_KERNEL); + if (cl == NULL) + return -ENOBUFS; + + cl->refcnt = 1; + cl->common.classid = classid; + cl->quantum = quantum; + cl->qdisc = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, classid); + if (cl->qdisc == NULL) + cl->qdisc = &noop_qdisc; + + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) { + qdisc_destroy(cl->qdisc); + kfree(cl); + return err; + } + } + + sch_tree_lock(sch); + qdisc_class_hash_insert(&q->clhash, &cl->common); + sch_tree_unlock(sch); + + qdisc_class_hash_grow(sch, &q->clhash); + + *arg = (unsigned long)cl; + return 0; +} + +static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl) +{ + gen_kill_estimator(&cl->bstats, &cl->rate_est); + qdisc_destroy(cl->qdisc); + kfree(cl); +} + +static int drr_delete_class(struct Qdisc *sch, unsigned long arg) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl = (struct drr_class *)arg; + + if (cl->filter_cnt > 0) + return -EBUSY; + + sch_tree_lock(sch); + + drr_purge_queue(cl); + qdisc_class_hash_remove(&q->clhash, &cl->common); + + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). 
+ */ + + sch_tree_unlock(sch); + return 0; +} + +static unsigned long drr_get_class(struct Qdisc *sch, u32 classid) +{ + struct drr_class *cl = drr_find_class(sch, classid); + + if (cl != NULL) + cl->refcnt++; + + return (unsigned long)cl; +} + +static void drr_put_class(struct Qdisc *sch, unsigned long arg) +{ + struct drr_class *cl = (struct drr_class *)arg; + + if (--cl->refcnt == 0) + drr_destroy_class(sch, cl); +} + +static struct tcf_proto **drr_tcf_chain(struct Qdisc *sch, unsigned long cl) +{ + struct drr_sched *q = qdisc_priv(sch); + + if (cl) + return NULL; + + return &q->filter_list; +} + +static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + struct drr_class *cl = drr_find_class(sch, classid); + + if (cl != NULL) + cl->filter_cnt++; + + return (unsigned long)cl; +} + +static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct drr_class *cl = (struct drr_class *)arg; + + cl->filter_cnt--; +} + +static int drr_graft_class(struct Qdisc *sch, unsigned long arg, + struct Qdisc *new, struct Qdisc **old) +{ + struct drr_class *cl = (struct drr_class *)arg; + + if (new == NULL) { + new = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, cl->common.classid); + if (new == NULL) + new = &noop_qdisc; + } + + sch_tree_lock(sch); + drr_purge_queue(cl); + *old = cl->qdisc; + cl->qdisc = new; + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc *drr_class_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct drr_class *cl = (struct drr_class *)arg; + + return cl->qdisc; +} + +static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg) +{ + struct drr_class *cl = (struct drr_class *)arg; + + if (cl->qdisc->q.qlen == 0) + list_del(&cl->alist); +} + +static int drr_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct drr_class *cl = (struct drr_class *)arg; + struct nlattr *nest; + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = cl->common.classid; + tcm->tcm_info = cl->qdisc->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_DRR_QUANTUM, cl->quantum)) + goto nla_put_failure; + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct drr_class *cl = (struct drr_class *)arg; + struct tc_drr_stats xstats; + + memset(&xstats, 0, sizeof(xstats)); + if (cl->qdisc->q.qlen) { + xstats.deficit = cl->deficit; + cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; + } + + if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) + return -1; + + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); +} + +static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + struct tcf_result res; + int 
result; + + if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { + cl = drr_find_class(sch, skb->priority); + if (cl != NULL) + return cl; + } + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NULL; + } +#endif + cl = (struct drr_class *)res.class; + if (cl == NULL) + cl = drr_find_class(sch, res.classid); + return cl; + } + return NULL; +} + +static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + int err = 0; + + cl = drr_classify(skb, sch, &err); + if (cl == NULL) { + if (err & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return err; + } + + err = qdisc_enqueue(skb, cl->qdisc); + if (unlikely(err != NET_XMIT_SUCCESS)) { + if (net_xmit_drop_count(err)) { + cl->qstats.drops++; + sch->qstats.drops++; + } + return err; + } + + if (cl->qdisc->q.qlen == 1) { + list_add_tail(&cl->alist, &q->active); + cl->deficit = cl->quantum; + } + + sch->q.qlen++; + return err; +} + +static struct sk_buff *drr_dequeue(struct Qdisc *sch) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + struct sk_buff *skb; + unsigned int len; + + if (list_empty(&q->active)) + goto out; + while (1) { + cl = list_first_entry(&q->active, struct drr_class, alist); + skb = cl->qdisc->ops->peek(cl->qdisc); + if (skb == NULL) { + qdisc_warn_nonwc(__func__, cl->qdisc); + goto out; + } + + len = qdisc_pkt_len(skb); + if (len <= cl->deficit) { + cl->deficit -= len; + skb = qdisc_dequeue_peeked(cl->qdisc); + if (cl->qdisc->q.qlen == 0) + list_del(&cl->alist); + + bstats_update(&cl->bstats, skb); + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + return skb; + } + + cl->deficit += cl->quantum; + list_move_tail(&cl->alist, &q->active); + } +out: + return NULL; +} + +static unsigned int drr_drop(struct Qdisc *sch) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + unsigned int len; + + list_for_each_entry(cl, &q->active, alist) { + if (cl->qdisc->ops->drop) { + len = cl->qdisc->ops->drop(cl->qdisc); + if (len > 0) { + sch->q.qlen--; + if (cl->qdisc->q.qlen == 0) + list_del(&cl->alist); + return len; + } + } + } + return 0; +} + +static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt) +{ + struct drr_sched *q = qdisc_priv(sch); + int err; + + err = qdisc_class_hash_init(&q->clhash); + if (err < 0) + return err; + INIT_LIST_HEAD(&q->active); + return 0; +} + +static void drr_reset_qdisc(struct Qdisc *sch) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + unsigned int i; + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { + if (cl->qdisc->q.qlen) + list_del(&cl->alist); + qdisc_reset(cl->qdisc); + } + } + sch->q.qlen = 0; +} + +static void drr_destroy_qdisc(struct Qdisc *sch) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + struct hlist_node *next; + unsigned int i; + + tcf_destroy_chain(&q->filter_list); + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], + common.hnode) + drr_destroy_class(sch, cl); + } + qdisc_class_hash_destroy(&q->clhash); +} + +static const struct Qdisc_class_ops drr_class_ops = { + .change = drr_change_class, + .delete = drr_delete_class, + .get = drr_get_class, + .put = drr_put_class, + 
.tcf_chain = drr_tcf_chain, + .bind_tcf = drr_bind_tcf, + .unbind_tcf = drr_unbind_tcf, + .graft = drr_graft_class, + .leaf = drr_class_leaf, + .qlen_notify = drr_qlen_notify, + .dump = drr_dump_class, + .dump_stats = drr_dump_class_stats, + .walk = drr_walk, +}; + +static struct Qdisc_ops drr_qdisc_ops __read_mostly = { + .cl_ops = &drr_class_ops, + .id = "drr", + .priv_size = sizeof(struct drr_sched), + .enqueue = drr_enqueue, + .dequeue = drr_dequeue, + .peek = qdisc_peek_dequeued, + .drop = drr_drop, + .init = drr_init_qdisc, + .reset = drr_reset_qdisc, + .destroy = drr_destroy_qdisc, + .owner = THIS_MODULE, +}; + +static int __init drr_init(void) +{ + return register_qdisc(&drr_qdisc_ops); +} + +static void __exit drr_exit(void) +{ + unregister_qdisc(&drr_qdisc_ops); +} + +module_init(drr_init); +module_exit(drr_exit); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 13e0e7b3856..49d6ef338b5 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -3,37 +3,20 @@ /* Written 1998-2000 by Werner Almesberger, EPFL ICA */ -#include <linux/config.h> #include <linux/module.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> -#include <linux/netdevice.h> /* for pkt_sched */ #include <linux/rtnetlink.h> +#include <linux/bitops.h> #include <net/pkt_sched.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <asm/byteorder.h> - -#if 0 /* control */ -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif - -#if 0 /* data */ -#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define D2PRINTK(format,args...) -#endif - - -#define PRIV(sch) ((struct dsmark_qdisc_data *) qdisc_priv(sch)) - - /* * classid class marking * ------- ----- ------- @@ -62,20 +45,9 @@ struct dsmark_qdisc_data { int set_tc_index; }; -static inline int dsmark_valid_indices(u16 indices) -{ - while (indices != 1) { - if (indices & 1) - return 0; - indices >>= 1; - } - - return 1; -} - static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index) { - return (index <= p->indices && index > 0); + return index <= p->indices && index > 0; } /* ------------------------- Class/flow operations ------------------------- */ @@ -83,35 +55,38 @@ static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index) static int dsmark_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); - DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n", - sch, p, new, old); + pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n", + __func__, sch, p, new, old); if (new == NULL) { - new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + sch->handle); if (new == NULL) new = &noop_qdisc; } sch_tree_lock(sch); - *old = xchg(&p->q, new); + *old = p->q; + p->q = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); qdisc_reset(*old); - sch->q.qlen = 0; sch_tree_unlock(sch); - return 0; + return 0; } static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg) { - return PRIV(sch)->q; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + return p->q; } static unsigned long dsmark_get(struct Qdisc *sch, u32 classid) { - DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n", - sch, PRIV(sch), classid); + 
pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", + __func__, sch, qdisc_priv(sch), classid); return TC_H_MIN(classid) + 1; } @@ -126,60 +101,73 @@ static void dsmark_put(struct Qdisc *sch, unsigned long cl) { } +static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = { + [TCA_DSMARK_INDICES] = { .type = NLA_U16 }, + [TCA_DSMARK_DEFAULT_INDEX] = { .type = NLA_U16 }, + [TCA_DSMARK_SET_TC_INDEX] = { .type = NLA_FLAG }, + [TCA_DSMARK_MASK] = { .type = NLA_U8 }, + [TCA_DSMARK_VALUE] = { .type = NLA_U8 }, +}; + static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, - struct rtattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg) { - struct dsmark_qdisc_data *p = PRIV(sch); - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_DSMARK_MAX]; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_DSMARK_MAX + 1]; int err = -EINVAL; u8 mask = 0; - DPRINTK("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x)," - "arg 0x%lx\n", sch, p, classid, parent, *arg); + pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n", + __func__, sch, p, classid, parent, *arg); if (!dsmark_valid_index(p, *arg)) { err = -ENOENT; - goto rtattr_failure; + goto errout; } - if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt)) - goto rtattr_failure; + if (!opt) + goto errout; + + err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy); + if (err < 0) + goto errout; + + if (tb[TCA_DSMARK_MASK]) + mask = nla_get_u8(tb[TCA_DSMARK_MASK]); - if (tb[TCA_DSMARK_MASK-1]) - mask = RTA_GET_U8(tb[TCA_DSMARK_MASK-1]); + if (tb[TCA_DSMARK_VALUE]) + p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]); - if (tb[TCA_DSMARK_VALUE-1]) - p->value[*arg-1] = RTA_GET_U8(tb[TCA_DSMARK_VALUE-1]); - - if (tb[TCA_DSMARK_MASK-1]) - p->mask[*arg-1] = mask; + if (tb[TCA_DSMARK_MASK]) + p->mask[*arg - 1] = mask; err = 0; -rtattr_failure: +errout: return err; } static int dsmark_delete(struct Qdisc *sch, unsigned long arg) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); if (!dsmark_valid_index(p, arg)) return -EINVAL; - - p->mask[arg-1] = 0xff; - p->value[arg-1] = 0; + + p->mask[arg - 1] = 0xff; + p->value[arg - 1] = 0; return 0; } -static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker) +static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); int i; - DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); + pr_debug("%s(sch %p,[qdisc %p],walker %p)\n", + __func__, sch, p, walker); if (walker->stop) return; @@ -188,45 +176,53 @@ static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker) if (p->mask[i] == 0xff && !p->value[i]) goto ignore; if (walker->count >= walker->skip) { - if (walker->fn(sch, i+1, walker) < 0) { + if (walker->fn(sch, i + 1, walker) < 0) { walker->stop = 1; break; } } -ignore: +ignore: walker->count++; - } + } } -static struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,unsigned long cl) +static inline struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch, + unsigned long cl) { - return &PRIV(sch)->filter_list; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + return &p->filter_list; } /* --------------------------- Qdisc operations ---------------------------- */ -static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) +static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - 
struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); int err; - D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); + pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p); if (p->set_tc_index) { - /* FIXME: Safe with non-linear skbs? --RR */ switch (skb->protocol) { - case __constant_htons(ETH_P_IP): - skb->tc_index = ipv4_get_dsfield(skb->nh.iph) - & ~INET_ECN_MASK; - break; - case __constant_htons(ETH_P_IPV6): - skb->tc_index = ipv6_get_dsfield(skb->nh.ipv6h) - & ~INET_ECN_MASK; - break; - default: - skb->tc_index = 0; - break; - }; + case htons(ETH_P_IP): + if (skb_cow_head(skb, sizeof(struct iphdr))) + goto drop; + + skb->tc_index = ipv4_get_dsfield(ip_hdr(skb)) + & ~INET_ECN_MASK; + break; + + case htons(ETH_P_IPV6): + if (skb_cow_head(skb, sizeof(struct ipv6hdr))) + goto drop; + + skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb)) + & ~INET_ECN_MASK; + break; + default: + skb->tc_index = 0; + break; + } } if (TC_H_MAJ(skb->priority) == sch->handle) @@ -235,111 +231,102 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) struct tcf_result res; int result = tc_classify(skb, p->filter_list, &res); - D2PRINTK("result %d class 0x%04x\n", result, res.classid); + pr_debug("result %d class 0x%04x\n", result, res.classid); switch (result) { -#ifdef CONFIG_NET_CLS_POLICE - case TC_POLICE_SHOT: - kfree_skb(skb); - sch->qstats.drops++; - return NET_XMIT_POLICED; -#if 0 - case TC_POLICE_RECLASSIFY: - /* FIXME: what to do here ??? */ +#ifdef CONFIG_NET_CLS_ACT + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + kfree_skb(skb); + return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + + case TC_ACT_SHOT: + goto drop; #endif -#endif - case TC_POLICE_OK: - skb->tc_index = TC_H_MIN(res.classid); - break; - case TC_POLICE_UNSPEC: - /* fall through */ - default: - if (p->default_index != NO_DEFAULT_INDEX) - skb->tc_index = p->default_index; - break; - }; + case TC_ACT_OK: + skb->tc_index = TC_H_MIN(res.classid); + break; + + default: + if (p->default_index != NO_DEFAULT_INDEX) + skb->tc_index = p->default_index; + break; + } } - err = p->q->enqueue(skb,p->q); + err = qdisc_enqueue(skb, p->q); if (err != NET_XMIT_SUCCESS) { - sch->qstats.drops++; + if (net_xmit_drop_count(err)) + sch->qstats.drops++; return err; } - sch->bstats.bytes += skb->len; - sch->bstats.packets++; sch->q.qlen++; return NET_XMIT_SUCCESS; + +drop: + qdisc_drop(skb, sch); + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); struct sk_buff *skb; u32 index; - D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n", sch, p); + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); skb = p->q->ops->dequeue(p->q); if (skb == NULL) return NULL; + qdisc_bstats_update(sch, skb); sch->q.qlen--; index = skb->tc_index & (p->indices - 1); - D2PRINTK("index %d->%d\n", skb->tc_index, index); + pr_debug("index %d->%d\n", skb->tc_index, index); switch (skb->protocol) { - case __constant_htons(ETH_P_IP): - ipv4_change_dsfield(skb->nh.iph, p->mask[index], - p->value[index]); - break; - case __constant_htons(ETH_P_IPV6): - ipv6_change_dsfield(skb->nh.ipv6h, p->mask[index], - p->value[index]); + case htons(ETH_P_IP): + ipv4_change_dsfield(ip_hdr(skb), p->mask[index], + p->value[index]); break; - default: - /* - * Only complain if a change was actually attempted. 
- * This way, we can send non-IP traffic through dsmark - * and don't need yet another qdisc as a bypass. - */ - if (p->mask[index] != 0xff || p->value[index]) - printk(KERN_WARNING "dsmark_dequeue: " - "unsupported protocol %d\n", - htons(skb->protocol)); + case htons(ETH_P_IPV6): + ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index], + p->value[index]); break; - }; + default: + /* + * Only complain if a change was actually attempted. + * This way, we can send non-IP traffic through dsmark + * and don't need yet another qdisc as a bypass. + */ + if (p->mask[index] != 0xff || p->value[index]) + pr_warn("%s: unsupported protocol %d\n", + __func__, ntohs(skb->protocol)); + break; + } return skb; } -static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch) +static struct sk_buff *dsmark_peek(struct Qdisc *sch) { - struct dsmark_qdisc_data *p = PRIV(sch); - int err; - - D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); - - err = p->q->ops->requeue(skb, p->q); - if (err != NET_XMIT_SUCCESS) { - sch->qstats.drops++; - return err; - } + struct dsmark_qdisc_data *p = qdisc_priv(sch); - sch->q.qlen++; - sch->qstats.requeues++; + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - return NET_XMIT_SUCCESS; + return p->q->ops->peek(p->q); } static unsigned int dsmark_drop(struct Qdisc *sch) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); unsigned int len; - - DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); + + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); if (p->q->ops->drop == NULL) return 0; @@ -351,26 +338,32 @@ static unsigned int dsmark_drop(struct Qdisc *sch) return len; } -static int dsmark_init(struct Qdisc *sch, struct rtattr *opt) +static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) { - struct dsmark_qdisc_data *p = PRIV(sch); - struct rtattr *tb[TCA_DSMARK_MAX]; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + struct nlattr *tb[TCA_DSMARK_MAX + 1]; int err = -EINVAL; u32 default_index = NO_DEFAULT_INDEX; u16 indices; u8 *mask; - DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); + pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt); - if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt) < 0) + if (!opt) goto errout; - indices = RTA_GET_U16(tb[TCA_DSMARK_INDICES-1]); - if (!indices || !dsmark_valid_indices(indices)) + err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy); + if (err < 0) goto errout; - if (tb[TCA_DSMARK_DEFAULT_INDEX-1]) - default_index = RTA_GET_U16(tb[TCA_DSMARK_DEFAULT_INDEX-1]); + err = -EINVAL; + indices = nla_get_u16(tb[TCA_DSMARK_INDICES]); + + if (hweight32(indices) != 1) + goto errout; + + if (tb[TCA_DSMARK_DEFAULT_INDEX]) + default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]); mask = kmalloc(indices * 2, GFP_KERNEL); if (mask == NULL) { @@ -386,42 +379,35 @@ static int dsmark_init(struct Qdisc *sch, struct rtattr *opt) p->indices = indices; p->default_index = default_index; - p->set_tc_index = RTA_GET_FLAG(tb[TCA_DSMARK_SET_TC_INDEX-1]); + p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]); - p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle); if (p->q == NULL) p->q = &noop_qdisc; - DPRINTK("dsmark_init: qdisc %p\n", p->q); + pr_debug("%s: qdisc %p\n", __func__, p->q); err = 0; errout: -rtattr_failure: return err; } static void dsmark_reset(struct Qdisc *sch) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = 
qdisc_priv(sch); - DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); qdisc_reset(p->q); sch->q.qlen = 0; } static void dsmark_destroy(struct Qdisc *sch) { - struct dsmark_qdisc_data *p = PRIV(sch); - struct tcf_proto *tp; - - DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p); + struct dsmark_qdisc_data *p = qdisc_priv(sch); - while (p->filter_list) { - tp = p->filter_list; - p->filter_list = tp->next; - tcf_destroy(tp); - } + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); + tcf_destroy_chain(&p->filter_list); qdisc_destroy(p->q); kfree(p->mask); } @@ -429,47 +415,58 @@ static void dsmark_destroy(struct Qdisc *sch) static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { - struct dsmark_qdisc_data *p = PRIV(sch); - struct rtattr *opts = NULL; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + struct nlattr *opts = NULL; - DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n", sch, p, cl); + pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl); if (!dsmark_valid_index(p, cl)) return -EINVAL; - tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl-1); + tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1); + tcm->tcm_info = p->q->handle; - opts = RTA_NEST(skb, TCA_OPTIONS); - RTA_PUT_U8(skb,TCA_DSMARK_MASK, p->mask[cl-1]); - RTA_PUT_U8(skb,TCA_DSMARK_VALUE, p->value[cl-1]); + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]) || + nla_put_u8(skb, TCA_DSMARK_VALUE, p->value[cl - 1])) + goto nla_put_failure; - return RTA_NEST_END(skb, opts); + return nla_nest_end(skb, opts); -rtattr_failure: - return RTA_NEST_CANCEL(skb, opts); +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; } static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) { - struct dsmark_qdisc_data *p = PRIV(sch); - struct rtattr *opts = NULL; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + struct nlattr *opts = NULL; - opts = RTA_NEST(skb, TCA_OPTIONS); - RTA_PUT_U16(skb, TCA_DSMARK_INDICES, p->indices); + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices)) + goto nla_put_failure; - if (p->default_index != NO_DEFAULT_INDEX) - RTA_PUT_U16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index); + if (p->default_index != NO_DEFAULT_INDEX && + nla_put_u16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index)) + goto nla_put_failure; - if (p->set_tc_index) - RTA_PUT_FLAG(skb, TCA_DSMARK_SET_TC_INDEX); + if (p->set_tc_index && + nla_put_flag(skb, TCA_DSMARK_SET_TC_INDEX)) + goto nla_put_failure; - return RTA_NEST_END(skb, opts); + return nla_nest_end(skb, opts); -rtattr_failure: - return RTA_NEST_CANCEL(skb, opts); +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; } -static struct Qdisc_class_ops dsmark_class_ops = { +static const struct Qdisc_class_ops dsmark_class_ops = { .graft = dsmark_graft, .leaf = dsmark_leaf, .get = dsmark_get, @@ -483,14 +480,14 @@ static struct Qdisc_class_ops dsmark_class_ops = { .dump = dsmark_dump_class, }; -static struct Qdisc_ops dsmark_qdisc_ops = { +static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &dsmark_class_ops, .id = "dsmark", .priv_size = sizeof(struct dsmark_qdisc_data), .enqueue = dsmark_enqueue, .dequeue = dsmark_dequeue, - .requeue = dsmark_requeue, + .peek = dsmark_peek, .drop = dsmark_drop, .init = 
dsmark_init, .reset = dsmark_reset, @@ -505,7 +502,7 @@ static int __init dsmark_module_init(void) return register_qdisc(&dsmark_qdisc_ops); } -static void __exit dsmark_module_exit(void) +static void __exit dsmark_module_exit(void) { unregister_qdisc(&dsmark_qdisc_ops); } diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 033083bf0e7..e15a9eb2908 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -9,83 +9,96 @@ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ -#include <linux/config.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> /* 1 band FIFO pseudo-"scheduler" */ -struct fifo_sched_data +static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - u32 limit; -}; + if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit)) + return qdisc_enqueue_tail(skb, sch); -static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) -{ - struct fifo_sched_data *q = qdisc_priv(sch); + return qdisc_reshape_fail(skb, sch); +} - if (likely(sch->qstats.backlog + skb->len <= q->limit)) +static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + if (likely(skb_queue_len(&sch->q) < sch->limit)) return qdisc_enqueue_tail(skb, sch); return qdisc_reshape_fail(skb, sch); } -static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct fifo_sched_data *q = qdisc_priv(sch); - - if (likely(skb_queue_len(&sch->q) < q->limit)) + if (likely(skb_queue_len(&sch->q) < sch->limit)) return qdisc_enqueue_tail(skb, sch); - return qdisc_reshape_fail(skb, sch); + /* queue full, remove one skb to fulfill the limit */ + __qdisc_queue_drop_head(sch, &sch->q); + sch->qstats.drops++; + qdisc_enqueue_tail(skb, sch); + + return NET_XMIT_CN; } -static int fifo_init(struct Qdisc *sch, struct rtattr *opt) +static int fifo_init(struct Qdisc *sch, struct nlattr *opt) { - struct fifo_sched_data *q = qdisc_priv(sch); + bool bypass; + bool is_bfifo = sch->ops == &bfifo_qdisc_ops; if (opt == NULL) { - u32 limit = sch->dev->tx_queue_len ? : 1; + u32 limit = qdisc_dev(sch)->tx_queue_len ? 
: 1; - if (sch->ops == &bfifo_qdisc_ops) - limit *= sch->dev->mtu; + if (is_bfifo) + limit *= psched_mtu(qdisc_dev(sch)); - q->limit = limit; + sch->limit = limit; } else { - struct tc_fifo_qopt *ctl = RTA_DATA(opt); + struct tc_fifo_qopt *ctl = nla_data(opt); - if (RTA_PAYLOAD(opt) < sizeof(*ctl)) + if (nla_len(opt) < sizeof(*ctl)) return -EINVAL; - q->limit = ctl->limit; + sch->limit = ctl->limit; } + if (is_bfifo) + bypass = sch->limit >= psched_mtu(qdisc_dev(sch)); + else + bypass = sch->limit >= 1; + + if (bypass) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) { - struct fifo_sched_data *q = qdisc_priv(sch); - struct tc_fifo_qopt opt = { .limit = q->limit }; + struct tc_fifo_qopt opt = { .limit = sch->limit }; - RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; -rtattr_failure: +nla_put_failure: return -1; } -struct Qdisc_ops pfifo_qdisc_ops = { +struct Qdisc_ops pfifo_qdisc_ops __read_mostly = { .id = "pfifo", - .priv_size = sizeof(struct fifo_sched_data), + .priv_size = 0, .enqueue = pfifo_enqueue, .dequeue = qdisc_dequeue_head, - .requeue = qdisc_requeue, + .peek = qdisc_peek_head, .drop = qdisc_queue_drop, .init = fifo_init, .reset = qdisc_reset_queue, @@ -93,13 +106,14 @@ struct Qdisc_ops pfifo_qdisc_ops = { .dump = fifo_dump, .owner = THIS_MODULE, }; +EXPORT_SYMBOL(pfifo_qdisc_ops); -struct Qdisc_ops bfifo_qdisc_ops = { +struct Qdisc_ops bfifo_qdisc_ops __read_mostly = { .id = "bfifo", - .priv_size = sizeof(struct fifo_sched_data), + .priv_size = 0, .enqueue = bfifo_enqueue, .dequeue = qdisc_dequeue_head, - .requeue = qdisc_requeue, + .peek = qdisc_peek_head, .drop = qdisc_queue_drop, .init = fifo_init, .reset = qdisc_reset_queue, @@ -107,6 +121,60 @@ struct Qdisc_ops bfifo_qdisc_ops = { .dump = fifo_dump, .owner = THIS_MODULE, }; - EXPORT_SYMBOL(bfifo_qdisc_ops); -EXPORT_SYMBOL(pfifo_qdisc_ops); + +struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = { + .id = "pfifo_head_drop", + .priv_size = 0, + .enqueue = pfifo_tail_enqueue, + .dequeue = qdisc_dequeue_head, + .peek = qdisc_peek_head, + .drop = qdisc_queue_drop_head, + .init = fifo_init, + .reset = qdisc_reset_queue, + .change = fifo_init, + .dump = fifo_dump, + .owner = THIS_MODULE, +}; + +/* Pass size change message down to embedded FIFO */ +int fifo_set_limit(struct Qdisc *q, unsigned int limit) +{ + struct nlattr *nla; + int ret = -ENOMEM; + + /* Hack to avoid sending change message to non-FIFO */ + if (strncmp(q->ops->id + 1, "fifo", 4) != 0) + return 0; + + nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); + if (nla) { + nla->nla_type = RTM_NEWQDISC; + nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt)); + ((struct tc_fifo_qopt *)nla_data(nla))->limit = limit; + + ret = q->ops->change(q, nla); + kfree(nla); + } + return ret; +} +EXPORT_SYMBOL(fifo_set_limit); + +struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, + unsigned int limit) +{ + struct Qdisc *q; + int err = -ENOMEM; + + q = qdisc_create_dflt(sch->dev_queue, ops, TC_H_MAKE(sch->handle, 1)); + if (q) { + err = fifo_set_limit(q, limit); + if (err < 0) { + qdisc_destroy(q); + q = NULL; + } + } + + return q ? 
: ERR_PTR(err); +} +EXPORT_SYMBOL(fifo_create_dflt); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c new file mode 100644 index 00000000000..ba32c2b005d --- /dev/null +++ b/net/sched/sch_fq.c @@ -0,0 +1,849 @@ +/* + * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing) + * + * Copyright (C) 2013 Eric Dumazet <edumazet@google.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Meant to be mostly used for localy generated traffic : + * Fast classification depends on skb->sk being set before reaching us. + * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash. + * All packets belonging to a socket are considered as a 'flow'. + * + * Flows are dynamically allocated and stored in a hash table of RB trees + * They are also part of one Round Robin 'queues' (new or old flows) + * + * Burst avoidance (aka pacing) capability : + * + * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a + * bunch of packets, and this packet scheduler adds delay between + * packets to respect rate limitation. + * + * enqueue() : + * - lookup one RB tree (out of 1024 or more) to find the flow. + * If non existent flow, create it, add it to the tree. + * Add skb to the per flow list of skb (fifo). + * - Use a special fifo for high prio packets + * + * dequeue() : serves flows in Round Robin + * Note : When a flow becomes empty, we do not immediately remove it from + * rb trees, for performance reasons (its expected to send additional packets, + * or SLAB cache will reuse socket for another flow) + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/hash.h> +#include <linux/prefetch.h> +#include <linux/vmalloc.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/sock.h> +#include <net/tcp_states.h> + +/* + * Per flow structure, dynamically allocated + */ +struct fq_flow { + struct sk_buff *head; /* list of skbs for this flow : first skb */ + union { + struct sk_buff *tail; /* last skb in the list */ + unsigned long age; /* jiffies when flow was emptied, for gc */ + }; + struct rb_node fq_node; /* anchor in fq_root[] trees */ + struct sock *sk; + int qlen; /* number of packets in flow queue */ + int credit; + u32 socket_hash; /* sk_hash */ + struct fq_flow *next; /* next pointer in RR lists, or &detached */ + + struct rb_node rate_node; /* anchor in q->delayed tree */ + u64 time_next_packet; +}; + +struct fq_flow_head { + struct fq_flow *first; + struct fq_flow *last; +}; + +struct fq_sched_data { + struct fq_flow_head new_flows; + + struct fq_flow_head old_flows; + + struct rb_root delayed; /* for rate limited flows */ + u64 time_next_delayed_flow; + + struct fq_flow internal; /* for non classified or high prio packets */ + u32 quantum; + u32 initial_quantum; + u32 flow_refill_delay; + u32 flow_max_rate; /* optional max rate per flow */ + u32 flow_plimit; /* max packets per flow */ + struct rb_root *fq_root; + u8 rate_enable; + u8 fq_trees_log; + + u32 flows; + u32 inactive_flows; + u32 throttled_flows; + + u64 stat_gc_flows; + u64 stat_internal_packets; + u64 
stat_tcp_retrans; + u64 stat_throttled; + u64 stat_flows_plimit; + u64 stat_pkts_too_long; + u64 stat_allocation_errors; + struct qdisc_watchdog watchdog; +}; + +/* special value to mark a detached flow (not on old/new list) */ +static struct fq_flow detached, throttled; + +static void fq_flow_set_detached(struct fq_flow *f) +{ + f->next = &detached; + f->age = jiffies; +} + +static bool fq_flow_is_detached(const struct fq_flow *f) +{ + return f->next == &detached; +} + +static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) +{ + struct rb_node **p = &q->delayed.rb_node, *parent = NULL; + + while (*p) { + struct fq_flow *aux; + + parent = *p; + aux = container_of(parent, struct fq_flow, rate_node); + if (f->time_next_packet >= aux->time_next_packet) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&f->rate_node, parent, p); + rb_insert_color(&f->rate_node, &q->delayed); + q->throttled_flows++; + q->stat_throttled++; + + f->next = &throttled; + if (q->time_next_delayed_flow > f->time_next_packet) + q->time_next_delayed_flow = f->time_next_packet; +} + + +static struct kmem_cache *fq_flow_cachep __read_mostly; + +static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow) +{ + if (head->first) + head->last->next = flow; + else + head->first = flow; + head->last = flow; + flow->next = NULL; +} + +/* limit number of collected flows per round */ +#define FQ_GC_MAX 8 +#define FQ_GC_AGE (3*HZ) + +static bool fq_gc_candidate(const struct fq_flow *f) +{ + return fq_flow_is_detached(f) && + time_after(jiffies, f->age + FQ_GC_AGE); +} + +static void fq_gc(struct fq_sched_data *q, + struct rb_root *root, + struct sock *sk) +{ + struct fq_flow *f, *tofree[FQ_GC_MAX]; + struct rb_node **p, *parent; + int fcnt = 0; + + p = &root->rb_node; + parent = NULL; + while (*p) { + parent = *p; + + f = container_of(parent, struct fq_flow, fq_node); + if (f->sk == sk) + break; + + if (fq_gc_candidate(f)) { + tofree[fcnt++] = f; + if (fcnt == FQ_GC_MAX) + break; + } + + if (f->sk > sk) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + + q->flows -= fcnt; + q->inactive_flows -= fcnt; + q->stat_gc_flows += fcnt; + while (fcnt) { + struct fq_flow *f = tofree[--fcnt]; + + rb_erase(&f->fq_node, root); + kmem_cache_free(fq_flow_cachep, f); + } +} + +static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) +{ + struct rb_node **p, *parent; + struct sock *sk = skb->sk; + struct rb_root *root; + struct fq_flow *f; + + /* warning: no starvation prevention... */ + if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) + return &q->internal; + + if (unlikely(!sk)) { + /* By forcing low order bit to 1, we make sure to not + * collide with a local flow (socket pointers are word aligned) + */ + sk = (struct sock *)(skb_get_hash(skb) | 1L); + } + + root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)]; + + if (q->flows >= (2U << q->fq_trees_log) && + q->inactive_flows > q->flows/2) + fq_gc(q, root, sk); + + p = &root->rb_node; + parent = NULL; + while (*p) { + parent = *p; + + f = container_of(parent, struct fq_flow, fq_node); + if (f->sk == sk) { + /* socket might have been reallocated, so check + * if its sk_hash is the same. 
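[Aside on fq_classify() above: when skb->sk is NULL (e.g. forwarded traffic), the code reuses the 32-bit packet hash as a stand-in socket pointer and forces its low bit to 1; real struct sock pointers are at least word aligned, so a tagged hash value can never collide with one. A tiny userspace illustration of that alignment assumption, with made-up values and plain C types rather than kernel code:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        long dummy_sock;                            /* stand-in for a struct sock */
        uintptr_t sk = (uintptr_t)&dummy_sock;      /* real sockets are word aligned */
        uint32_t rxhash = 0x9e3779b9u;              /* hypothetical skb hash */
        uintptr_t fake_sk = (uintptr_t)rxhash | 1;  /* low bit set, as in fq_classify() */

        printf("real sk low bit: %lu\n", (unsigned long)(sk & 1));      /* prints 0 */
        printf("fake sk low bit: %lu\n", (unsigned long)(fake_sk & 1)); /* prints 1 */
        return 0;
    }

Because the tag bit differs, hashed "flows" and socket-backed flows can share the same RB trees without ambiguity.]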
+ * It not, we need to refill credit with + * initial quantum + */ + if (unlikely(skb->sk && + f->socket_hash != sk->sk_hash)) { + f->credit = q->initial_quantum; + f->socket_hash = sk->sk_hash; + f->time_next_packet = 0ULL; + } + return f; + } + if (f->sk > sk) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + + f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!f)) { + q->stat_allocation_errors++; + return &q->internal; + } + fq_flow_set_detached(f); + f->sk = sk; + if (skb->sk) + f->socket_hash = sk->sk_hash; + f->credit = q->initial_quantum; + + rb_link_node(&f->fq_node, parent, p); + rb_insert_color(&f->fq_node, root); + + q->flows++; + q->inactive_flows++; + return f; +} + + +/* remove one skb from head of flow queue */ +static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) +{ + struct sk_buff *skb = flow->head; + + if (skb) { + flow->head = skb->next; + skb->next = NULL; + flow->qlen--; + sch->qstats.backlog -= qdisc_pkt_len(skb); + sch->q.qlen--; + } + return skb; +} + +/* We might add in the future detection of retransmits + * For the time being, just return false + */ +static bool skb_is_retransmit(struct sk_buff *skb) +{ + return false; +} + +/* add skb to flow queue + * flow queue is a linked list, kind of FIFO, except for TCP retransmits + * We special case tcp retransmits to be transmitted before other packets. + * We rely on fact that TCP retransmits are unlikely, so we do not waste + * a separate queue or a pointer. + * head-> [retrans pkt 1] + * [retrans pkt 2] + * [ normal pkt 1] + * [ normal pkt 2] + * [ normal pkt 3] + * tail-> [ normal pkt 4] + */ +static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) +{ + struct sk_buff *prev, *head = flow->head; + + skb->next = NULL; + if (!head) { + flow->head = skb; + flow->tail = skb; + return; + } + if (likely(!skb_is_retransmit(skb))) { + flow->tail->next = skb; + flow->tail = skb; + return; + } + + /* This skb is a tcp retransmit, + * find the last retrans packet in the queue + */ + prev = NULL; + while (skb_is_retransmit(head)) { + prev = head; + head = head->next; + if (!head) + break; + } + if (!prev) { /* no rtx packet in queue, become the new head */ + skb->next = flow->head; + flow->head = skb; + } else { + if (prev == flow->tail) + flow->tail = skb; + else + skb->next = prev->next; + prev->next = skb; + } +} + +static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct fq_flow *f; + + if (unlikely(sch->q.qlen >= sch->limit)) + return qdisc_drop(skb, sch); + + f = fq_classify(skb, q); + if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { + q->stat_flows_plimit++; + return qdisc_drop(skb, sch); + } + + f->qlen++; + if (skb_is_retransmit(skb)) + q->stat_tcp_retrans++; + sch->qstats.backlog += qdisc_pkt_len(skb); + if (fq_flow_is_detached(f)) { + fq_flow_add_tail(&q->new_flows, f); + if (time_after(jiffies, f->age + q->flow_refill_delay)) + f->credit = max_t(u32, f->credit, q->quantum); + q->inactive_flows--; + qdisc_unthrottled(sch); + } + + /* Note: this overwrites f->age */ + flow_queue_add(f, skb); + + if (unlikely(f == &q->internal)) { + q->stat_internal_packets++; + qdisc_unthrottled(sch); + } + sch->q.qlen++; + + return NET_XMIT_SUCCESS; +} + +static void fq_check_throttled(struct fq_sched_data *q, u64 now) +{ + struct rb_node *p; + + if (q->time_next_delayed_flow > now) + return; + + q->time_next_delayed_flow = ~0ULL; + while ((p = rb_first(&q->delayed)) != NULL) 
{ + struct fq_flow *f = container_of(p, struct fq_flow, rate_node); + + if (f->time_next_packet > now) { + q->time_next_delayed_flow = f->time_next_packet; + break; + } + rb_erase(p, &q->delayed); + q->throttled_flows--; + fq_flow_add_tail(&q->old_flows, f); + } +} + +static struct sk_buff *fq_dequeue(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + u64 now = ktime_to_ns(ktime_get()); + struct fq_flow_head *head; + struct sk_buff *skb; + struct fq_flow *f; + u32 rate; + + skb = fq_dequeue_head(sch, &q->internal); + if (skb) + goto out; + fq_check_throttled(q, now); +begin: + head = &q->new_flows; + if (!head->first) { + head = &q->old_flows; + if (!head->first) { + if (q->time_next_delayed_flow != ~0ULL) + qdisc_watchdog_schedule_ns(&q->watchdog, + q->time_next_delayed_flow); + return NULL; + } + } + f = head->first; + + if (f->credit <= 0) { + f->credit += q->quantum; + head->first = f->next; + fq_flow_add_tail(&q->old_flows, f); + goto begin; + } + + if (unlikely(f->head && now < f->time_next_packet)) { + head->first = f->next; + fq_flow_set_throttled(q, f); + goto begin; + } + + skb = fq_dequeue_head(sch, f); + if (!skb) { + head->first = f->next; + /* force a pass through old_flows to prevent starvation */ + if ((head == &q->new_flows) && q->old_flows.first) { + fq_flow_add_tail(&q->old_flows, f); + } else { + fq_flow_set_detached(f); + q->inactive_flows++; + } + goto begin; + } + prefetch(&skb->end); + f->time_next_packet = now; + f->credit -= qdisc_pkt_len(skb); + + if (f->credit > 0 || !q->rate_enable) + goto out; + + rate = q->flow_max_rate; + if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) + rate = min(skb->sk->sk_pacing_rate, rate); + + if (rate != ~0U) { + u32 plen = max(qdisc_pkt_len(skb), q->quantum); + u64 len = (u64)plen * NSEC_PER_SEC; + + if (likely(rate)) + do_div(len, rate); + /* Since socket rate can change later, + * clamp the delay to 125 ms. 
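[Aside on the pacing gap computed in fq_dequeue() above: it is plain bytes-to-nanoseconds arithmetic, gap = packet_len * NSEC_PER_SEC / sk_pacing_rate, capped at 125 ms. A minimal userspace sketch of the same calculation with hypothetical numbers (a 1514-byte packet paced at 12.5 MB/s, i.e. 100 Mbit/s); this is not kernel code:

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC  1000000000ULL
    #define NSEC_PER_MSEC 1000000ULL

    int main(void)
    {
        uint32_t plen = 1514;          /* packet length in bytes (assumed) */
        uint32_t rate = 12500000;      /* pacing rate in bytes/sec (assumed, ~100 Mbit/s) */

        /* same formula as fq_dequeue(): len = plen * NSEC_PER_SEC, then divide by rate */
        uint64_t gap = (uint64_t)plen * NSEC_PER_SEC / rate;

        /* sch_fq clamps the gap to 125 ms, since the socket rate can change later */
        if (gap > 125 * NSEC_PER_MSEC)
            gap = 125 * NSEC_PER_MSEC;

        printf("inter-packet gap: %llu ns (~%llu us)\n",
               (unsigned long long)gap, (unsigned long long)(gap / 1000));
        return 0;
    }

For these values the gap works out to roughly 121 microseconds between packets of the same flow.]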
+ * TODO: maybe segment the too big skb, as in commit + * e43ac79a4bc ("sch_tbf: segment too big GSO packets") + */ + if (unlikely(len > 125 * NSEC_PER_MSEC)) { + len = 125 * NSEC_PER_MSEC; + q->stat_pkts_too_long++; + } + + f->time_next_packet = now + len; + } +out: + qdisc_bstats_update(sch, skb); + qdisc_unthrottled(sch); + return skb; +} + +static void fq_reset(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct rb_root *root; + struct sk_buff *skb; + struct rb_node *p; + struct fq_flow *f; + unsigned int idx; + + while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL) + kfree_skb(skb); + + if (!q->fq_root) + return; + + for (idx = 0; idx < (1U << q->fq_trees_log); idx++) { + root = &q->fq_root[idx]; + while ((p = rb_first(root)) != NULL) { + f = container_of(p, struct fq_flow, fq_node); + rb_erase(p, root); + + while ((skb = fq_dequeue_head(sch, f)) != NULL) + kfree_skb(skb); + + kmem_cache_free(fq_flow_cachep, f); + } + } + q->new_flows.first = NULL; + q->old_flows.first = NULL; + q->delayed = RB_ROOT; + q->flows = 0; + q->inactive_flows = 0; + q->throttled_flows = 0; +} + +static void fq_rehash(struct fq_sched_data *q, + struct rb_root *old_array, u32 old_log, + struct rb_root *new_array, u32 new_log) +{ + struct rb_node *op, **np, *parent; + struct rb_root *oroot, *nroot; + struct fq_flow *of, *nf; + int fcnt = 0; + u32 idx; + + for (idx = 0; idx < (1U << old_log); idx++) { + oroot = &old_array[idx]; + while ((op = rb_first(oroot)) != NULL) { + rb_erase(op, oroot); + of = container_of(op, struct fq_flow, fq_node); + if (fq_gc_candidate(of)) { + fcnt++; + kmem_cache_free(fq_flow_cachep, of); + continue; + } + nroot = &new_array[hash_32((u32)(long)of->sk, new_log)]; + + np = &nroot->rb_node; + parent = NULL; + while (*np) { + parent = *np; + + nf = container_of(parent, struct fq_flow, fq_node); + BUG_ON(nf->sk == of->sk); + + if (nf->sk > of->sk) + np = &parent->rb_right; + else + np = &parent->rb_left; + } + + rb_link_node(&of->fq_node, parent, np); + rb_insert_color(&of->fq_node, nroot); + } + } + q->flows -= fcnt; + q->inactive_flows -= fcnt; + q->stat_gc_flows += fcnt; +} + +static void *fq_alloc_node(size_t sz, int node) +{ + void *ptr; + + ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node); + if (!ptr) + ptr = vmalloc_node(sz, node); + return ptr; +} + +static void fq_free(void *addr) +{ + kvfree(addr); +} + +static int fq_resize(struct Qdisc *sch, u32 log) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct rb_root *array; + void *old_fq_root; + u32 idx; + + if (q->fq_root && log == q->fq_trees_log) + return 0; + + /* If XPS was setup, we can allocate memory on right NUMA node */ + array = fq_alloc_node(sizeof(struct rb_root) << log, + netdev_queue_numa_node_read(sch->dev_queue)); + if (!array) + return -ENOMEM; + + for (idx = 0; idx < (1U << log); idx++) + array[idx] = RB_ROOT; + + sch_tree_lock(sch); + + old_fq_root = q->fq_root; + if (old_fq_root) + fq_rehash(q, old_fq_root, q->fq_trees_log, array, log); + + q->fq_root = array; + q->fq_trees_log = log; + + sch_tree_unlock(sch); + + fq_free(old_fq_root); + + return 0; +} + +static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { + [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, + [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, + 
[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, +}; + +static int fq_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_FQ_MAX + 1]; + int err, drop_count = 0; + u32 fq_log; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + + fq_log = q->fq_trees_log; + + if (tb[TCA_FQ_BUCKETS_LOG]) { + u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]); + + if (nval >= 1 && nval <= ilog2(256*1024)) + fq_log = nval; + else + err = -EINVAL; + } + if (tb[TCA_FQ_PLIMIT]) + sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]); + + if (tb[TCA_FQ_FLOW_PLIMIT]) + q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]); + + if (tb[TCA_FQ_QUANTUM]) + q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); + + if (tb[TCA_FQ_INITIAL_QUANTUM]) + q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); + + if (tb[TCA_FQ_FLOW_DEFAULT_RATE]) + pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", + nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE])); + + if (tb[TCA_FQ_FLOW_MAX_RATE]) + q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); + + if (tb[TCA_FQ_RATE_ENABLE]) { + u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); + + if (enable <= 1) + q->rate_enable = enable; + else + err = -EINVAL; + } + + if (tb[TCA_FQ_FLOW_REFILL_DELAY]) { + u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ; + + q->flow_refill_delay = usecs_to_jiffies(usecs_delay); + } + + if (!err) { + sch_tree_unlock(sch); + err = fq_resize(sch, fq_log); + sch_tree_lock(sch); + } + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = fq_dequeue(sch); + + if (!skb) + break; + kfree_skb(skb); + drop_count++; + } + qdisc_tree_decrease_qlen(sch, drop_count); + + sch_tree_unlock(sch); + return err; +} + +static void fq_destroy(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + + fq_reset(sch); + fq_free(q->fq_root); + qdisc_watchdog_cancel(&q->watchdog); +} + +static int fq_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_sched_data *q = qdisc_priv(sch); + int err; + + sch->limit = 10000; + q->flow_plimit = 100; + q->quantum = 2 * psched_mtu(qdisc_dev(sch)); + q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); + q->flow_refill_delay = msecs_to_jiffies(40); + q->flow_max_rate = ~0U; + q->rate_enable = 1; + q->new_flows.first = NULL; + q->old_flows.first = NULL; + q->delayed = RB_ROOT; + q->fq_root = NULL; + q->fq_trees_log = ilog2(1024); + qdisc_watchdog_init(&q->watchdog, sch); + + if (opt) + err = fq_change(sch, opt); + else + err = fq_resize(sch, q->fq_trees_log); + + return err; +} + +static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ + + if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || + nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || + nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || + nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || + nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) || + nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, + jiffies_to_usecs(q->flow_refill_delay)) || + nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + return 
-1; +} + +static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct fq_sched_data *q = qdisc_priv(sch); + u64 now = ktime_to_ns(ktime_get()); + struct tc_fq_qd_stats st = { + .gc_flows = q->stat_gc_flows, + .highprio_packets = q->stat_internal_packets, + .tcp_retrans = q->stat_tcp_retrans, + .throttled = q->stat_throttled, + .flows_plimit = q->stat_flows_plimit, + .pkts_too_long = q->stat_pkts_too_long, + .allocation_errors = q->stat_allocation_errors, + .flows = q->flows, + .inactive_flows = q->inactive_flows, + .throttled_flows = q->throttled_flows, + .time_next_delayed_flow = q->time_next_delayed_flow - now, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc_ops fq_qdisc_ops __read_mostly = { + .id = "fq", + .priv_size = sizeof(struct fq_sched_data), + + .enqueue = fq_enqueue, + .dequeue = fq_dequeue, + .peek = qdisc_peek_dequeued, + .init = fq_init, + .reset = fq_reset, + .destroy = fq_destroy, + .change = fq_change, + .dump = fq_dump, + .dump_stats = fq_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init fq_module_init(void) +{ + int ret; + + fq_flow_cachep = kmem_cache_create("fq_flow_cache", + sizeof(struct fq_flow), + 0, 0, NULL); + if (!fq_flow_cachep) + return -ENOMEM; + + ret = register_qdisc(&fq_qdisc_ops); + if (ret) + kmem_cache_destroy(fq_flow_cachep); + return ret; +} + +static void __exit fq_module_exit(void) +{ + unregister_qdisc(&fq_qdisc_ops); + kmem_cache_destroy(fq_flow_cachep); +} + +module_init(fq_module_init) +module_exit(fq_module_exit) +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c new file mode 100644 index 00000000000..063b726bf1f --- /dev/null +++ b/net/sched/sch_fq_codel.c @@ -0,0 +1,620 @@ +/* + * Fair Queue CoDel discipline + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/flow_keys.h> +#include <net/codel.h> + +/* Fair Queue CoDel. + * + * Principles : + * Packets are classified (internal classifier or external) on flows. + * This is a Stochastic model (as we use a hash, several flows + * might be hashed on same slot) + * Each flow has a CoDel managed queue. + * Flows are linked onto two (Round Robin) lists, + * so that new flows have priority on old ones. + * + * For a given flow, packets are not reordered (CoDel uses a FIFO) + * head drops only. + * ECN capability is on by default. 
+ * Low memory footprint (64 bytes per flow) + */ + +struct fq_codel_flow { + struct sk_buff *head; + struct sk_buff *tail; + struct list_head flowchain; + int deficit; + u32 dropped; /* number of drops (or ECN marks) on this flow */ + struct codel_vars cvars; +}; /* please try to keep this structure <= 64 bytes */ + +struct fq_codel_sched_data { + struct tcf_proto *filter_list; /* optional external classifier */ + struct fq_codel_flow *flows; /* Flows table [flows_cnt] */ + u32 *backlogs; /* backlog table [flows_cnt] */ + u32 flows_cnt; /* number of flows */ + u32 perturbation; /* hash perturbation */ + u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ + struct codel_params cparams; + struct codel_stats cstats; + u32 drop_overlimit; + u32 new_flow_count; + + struct list_head new_flows; /* list of new flows */ + struct list_head old_flows; /* list of old flows */ +}; + +static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, + const struct sk_buff *skb) +{ + struct flow_keys keys; + unsigned int hash; + + skb_flow_dissect(skb, &keys); + hash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src ^ keys.ip_proto, + (__force u32)keys.ports, q->perturbation); + return ((u64)hash * q->flows_cnt) >> 32; +} + +static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct tcf_result res; + int result; + + if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->flows_cnt) + return TC_H_MIN(skb->priority); + + if (!q->filter_list) + return fq_codel_hash(q, skb) + 1; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return 0; + } +#endif + if (TC_H_MIN(res.classid) <= q->flows_cnt) + return TC_H_MIN(res.classid); + } + return 0; +} + +/* helper functions : might be changed when/if skb use a standard list_head */ + +/* remove one skb from head of slot queue */ +static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow) +{ + struct sk_buff *skb = flow->head; + + flow->head = skb->next; + skb->next = NULL; + return skb; +} + +/* add skb to flow queue (tail add) */ +static inline void flow_queue_add(struct fq_codel_flow *flow, + struct sk_buff *skb) +{ + if (flow->head == NULL) + flow->head = skb; + else + flow->tail->next = skb; + flow->tail = skb; + skb->next = NULL; +} + +static unsigned int fq_codel_drop(struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + unsigned int maxbacklog = 0, idx = 0, i, len; + struct fq_codel_flow *flow; + + /* Queue is full! Find the fat flow and drop packet from it. + * This might sound expensive, but with 1024 flows, we scan + * 4KB of memory, and we dont need to handle a complex tree + * in fast path (packet queue/enqueue) with many cache misses. 
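[Aside on fq_codel_hash() above: it maps the 32-bit jhash onto one of flows_cnt buckets with a multiply-and-shift, ((u64)hash * flows_cnt) >> 32, which keeps the top bits of the product and avoids a division on the fast path. A small standalone C sketch of the same mapping; the sample hash values below are made up:

    #include <stdio.h>
    #include <stdint.h>

    /* Map a 32-bit hash onto [0, buckets) without a division,
     * mirroring ((u64)hash * q->flows_cnt) >> 32 in fq_codel_hash(). */
    static uint32_t reduce(uint32_t hash, uint32_t buckets)
    {
        return (uint32_t)(((uint64_t)hash * buckets) >> 32);
    }

    int main(void)
    {
        uint32_t buckets = 1024;    /* fq_codel's default flows_cnt */
        uint32_t samples[] = { 0x00000000u, 0x7fffffffu, 0xdeadbeefu, 0xffffffffu };

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
            printf("hash %08x -> bucket %u\n", samples[i],
                   reduce(samples[i], buckets));
        return 0;
    }

With a power-of-two bucket count such as 1024 this is simply the top bits of the hash, so the perturbation key keeps the distribution unpredictable to outsiders.]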
+ */ + for (i = 0; i < q->flows_cnt; i++) { + if (q->backlogs[i] > maxbacklog) { + maxbacklog = q->backlogs[i]; + idx = i; + } + } + flow = &q->flows[idx]; + skb = dequeue_head(flow); + len = qdisc_pkt_len(skb); + q->backlogs[idx] -= len; + kfree_skb(skb); + sch->q.qlen--; + sch->qstats.drops++; + sch->qstats.backlog -= len; + flow->dropped++; + return idx; +} + +static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + unsigned int idx; + struct fq_codel_flow *flow; + int uninitialized_var(ret); + + idx = fq_codel_classify(skb, sch, &ret); + if (idx == 0) { + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } + idx--; + + codel_set_enqueue_time(skb); + flow = &q->flows[idx]; + flow_queue_add(flow, skb); + q->backlogs[idx] += qdisc_pkt_len(skb); + sch->qstats.backlog += qdisc_pkt_len(skb); + + if (list_empty(&flow->flowchain)) { + list_add_tail(&flow->flowchain, &q->new_flows); + q->new_flow_count++; + flow->deficit = q->quantum; + flow->dropped = 0; + } + if (++sch->q.qlen <= sch->limit) + return NET_XMIT_SUCCESS; + + q->drop_overlimit++; + /* Return Congestion Notification only if we dropped a packet + * from this flow. + */ + if (fq_codel_drop(sch) == idx) + return NET_XMIT_CN; + + /* As we dropped a packet, better let upper stack know this */ + qdisc_tree_decrease_qlen(sch, 1); + return NET_XMIT_SUCCESS; +} + +/* This is the specific function called from codel_dequeue() + * to dequeue a packet from queue. Note: backlog is handled in + * codel, we dont need to reduce it here. + */ +static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct fq_codel_flow *flow; + struct sk_buff *skb = NULL; + + flow = container_of(vars, struct fq_codel_flow, cvars); + if (flow->head) { + skb = dequeue_head(flow); + q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb); + sch->q.qlen--; + } + return skb; +} + +static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + struct fq_codel_flow *flow; + struct list_head *head; + u32 prev_drop_count, prev_ecn_mark; + +begin: + head = &q->new_flows; + if (list_empty(head)) { + head = &q->old_flows; + if (list_empty(head)) + return NULL; + } + flow = list_first_entry(head, struct fq_codel_flow, flowchain); + + if (flow->deficit <= 0) { + flow->deficit += q->quantum; + list_move_tail(&flow->flowchain, &q->old_flows); + goto begin; + } + + prev_drop_count = q->cstats.drop_count; + prev_ecn_mark = q->cstats.ecn_mark; + + skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats, + dequeue); + + flow->dropped += q->cstats.drop_count - prev_drop_count; + flow->dropped += q->cstats.ecn_mark - prev_ecn_mark; + + if (!skb) { + /* force a pass through old_flows to prevent starvation */ + if ((head == &q->new_flows) && !list_empty(&q->old_flows)) + list_move_tail(&flow->flowchain, &q->old_flows); + else + list_del_init(&flow->flowchain); + goto begin; + } + qdisc_bstats_update(sch, skb); + flow->deficit -= qdisc_pkt_len(skb); + /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, + * or HTB crashes. Defer it for next round. 
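[Aside on the deficit bookkeeping in fq_codel_dequeue() above: it is classic deficit round robin. An active flow starts with one quantum of credit, each dequeued packet is charged against that credit, and once the credit goes non-positive the flow is re-credited one quantum and rotated to the tail of old_flows until it is positive again. A back-of-the-envelope model in plain C, where the quantum and packet size are assumed values:

    #include <stdio.h>

    int main(void)
    {
        int quantum = 1514;     /* one MTU worth of credit per rotation (assumed) */
        int deficit = quantum;  /* credit granted when the flow became active */
        int pkt = 9000;         /* a large GSO packet, hypothetical */
        int rounds = 0;

        deficit -= pkt;         /* fq_codel_dequeue(): deficit -= qdisc_pkt_len(skb) */
        while (deficit <= 0) {  /* flow is rotated and re-credited until positive */
            deficit += quantum;
            rounds++;
        }
        printf("flow sits out %d rotations, then resumes with %d bytes of credit\n",
               rounds, deficit);
        return 0;
    }

With a 1514-byte quantum, a single 9000-byte packet leaves the flow about 7.5 KB in debt, so it sits out five rotations before it may transmit again, which is how thin flows keep their latency advantage over bulk ones.]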
+ */ + if (q->cstats.drop_count && sch->q.qlen) { + qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); + q->cstats.drop_count = 0; + } + return skb; +} + +static void fq_codel_reset(struct Qdisc *sch) +{ + struct sk_buff *skb; + + while ((skb = fq_codel_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { + [TCA_FQ_CODEL_TARGET] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_LIMIT] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_INTERVAL] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 }, +}; + +static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy); + if (err < 0) + return err; + if (tb[TCA_FQ_CODEL_FLOWS]) { + if (q->flows) + return -EINVAL; + q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]); + if (!q->flows_cnt || + q->flows_cnt > 65536) + return -EINVAL; + } + sch_tree_lock(sch); + + if (tb[TCA_FQ_CODEL_TARGET]) { + u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]); + + q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_FQ_CODEL_INTERVAL]) { + u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); + + q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_FQ_CODEL_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]); + + if (tb[TCA_FQ_CODEL_ECN]) + q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]); + + if (tb[TCA_FQ_CODEL_QUANTUM]) + q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); + + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = fq_codel_dequeue(sch); + + kfree_skb(skb); + q->cstats.drop_count++; + } + qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); + q->cstats.drop_count = 0; + + sch_tree_unlock(sch); + return 0; +} + +static void *fq_codel_zalloc(size_t sz) +{ + void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); + + if (!ptr) + ptr = vzalloc(sz); + return ptr; +} + +static void fq_codel_free(void *addr) +{ + kvfree(addr); +} + +static void fq_codel_destroy(struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + fq_codel_free(q->backlogs); + fq_codel_free(q->flows); +} + +static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + int i; + + sch->limit = 10*1024; + q->flows_cnt = 1024; + q->quantum = psched_mtu(qdisc_dev(sch)); + q->perturbation = prandom_u32(); + INIT_LIST_HEAD(&q->new_flows); + INIT_LIST_HEAD(&q->old_flows); + codel_params_init(&q->cparams); + codel_stats_init(&q->cstats); + q->cparams.ecn = true; + + if (opt) { + int err = fq_codel_change(sch, opt); + if (err) + return err; + } + + if (!q->flows) { + q->flows = fq_codel_zalloc(q->flows_cnt * + sizeof(struct fq_codel_flow)); + if (!q->flows) + return -ENOMEM; + q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32)); + if (!q->backlogs) { + fq_codel_free(q->flows); + return -ENOMEM; + } + for (i = 0; i < q->flows_cnt; i++) { + struct fq_codel_flow *flow = q->flows + i; + + INIT_LIST_HEAD(&flow->flowchain); + codel_vars_init(&flow->cvars); + } + } + if (sch->limit >= 1) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; + return 0; +} + +static int fq_codel_dump(struct Qdisc *sch, struct sk_buff 
*skb) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET, + codel_time_to_us(q->cparams.target)) || + nla_put_u32(skb, TCA_FQ_CODEL_LIMIT, + sch->limit) || + nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL, + codel_time_to_us(q->cparams.interval)) || + nla_put_u32(skb, TCA_FQ_CODEL_ECN, + q->cparams.ecn) || + nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM, + q->quantum) || + nla_put_u32(skb, TCA_FQ_CODEL_FLOWS, + q->flows_cnt)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + return -1; +} + +static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct tc_fq_codel_xstats st = { + .type = TCA_FQ_CODEL_XSTATS_QDISC, + }; + struct list_head *pos; + + st.qdisc_stats.maxpacket = q->cstats.maxpacket; + st.qdisc_stats.drop_overlimit = q->drop_overlimit; + st.qdisc_stats.ecn_mark = q->cstats.ecn_mark; + st.qdisc_stats.new_flow_count = q->new_flow_count; + + list_for_each(pos, &q->new_flows) + st.qdisc_stats.new_flows_len++; + + list_for_each(pos, &q->old_flows) + st.qdisc_stats.old_flows_len++; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + /* we cannot bypass queue discipline anymore */ + sch->flags &= ~TCQ_F_CAN_BYPASS; + return 0; +} + +static void fq_codel_put(struct Qdisc *q, unsigned long cl) +{ +} + +static struct tcf_proto **fq_codel_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + u32 idx = cl - 1; + struct gnet_stats_queue qs = { 0 }; + struct tc_fq_codel_xstats xstats; + + if (idx < q->flows_cnt) { + const struct fq_codel_flow *flow = &q->flows[idx]; + const struct sk_buff *skb = flow->head; + + memset(&xstats, 0, sizeof(xstats)); + xstats.type = TCA_FQ_CODEL_XSTATS_CLASS; + xstats.class_stats.deficit = flow->deficit; + xstats.class_stats.ldelay = + codel_time_to_us(flow->cvars.ldelay); + xstats.class_stats.count = flow->cvars.count; + xstats.class_stats.lastcount = flow->cvars.lastcount; + xstats.class_stats.dropping = flow->cvars.dropping; + if (flow->cvars.dropping) { + codel_tdiff_t delta = flow->cvars.drop_next - + codel_get_time(); + + xstats.class_stats.drop_next = (delta >= 0) ? 
+ codel_time_to_us(delta) : + -codel_time_to_us(-delta); + } + while (skb) { + qs.qlen++; + skb = skb->next; + } + qs.backlog = q->backlogs[idx]; + qs.drops = flow->dropped; + } + if (gnet_stats_copy_queue(d, &qs) < 0) + return -1; + if (idx < q->flows_cnt) + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); + return 0; +} + +static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < q->flows_cnt; i++) { + if (list_empty(&q->flows[i].flowchain) || + arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops fq_codel_class_ops = { + .leaf = fq_codel_leaf, + .get = fq_codel_get, + .put = fq_codel_put, + .tcf_chain = fq_codel_find_tcf, + .bind_tcf = fq_codel_bind, + .unbind_tcf = fq_codel_put, + .dump = fq_codel_dump_class, + .dump_stats = fq_codel_dump_class_stats, + .walk = fq_codel_walk, +}; + +static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { + .cl_ops = &fq_codel_class_ops, + .id = "fq_codel", + .priv_size = sizeof(struct fq_codel_sched_data), + .enqueue = fq_codel_enqueue, + .dequeue = fq_codel_dequeue, + .peek = qdisc_peek_dequeued, + .drop = fq_codel_drop, + .init = fq_codel_init, + .reset = fq_codel_reset, + .destroy = fq_codel_destroy, + .change = fq_codel_change, + .dump = fq_codel_dump, + .dump_stats = fq_codel_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init fq_codel_module_init(void) +{ + return register_qdisc(&fq_codel_qdisc_ops); +} + +static void __exit fq_codel_module_exit(void) +{ + unregister_qdisc(&fq_codel_qdisc_ops); +} + +module_init(fq_codel_module_init) +module_exit(fq_codel_module_exit) +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 99ceb91f015..e1543b03e39 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -11,309 +11,402 @@ * - Ingress support */ -#include <asm/uaccess.h> -#include <asm/system.h> #include <linux/bitops.h> -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/rcupdate.h> #include <linux/list.h> -#include <net/sock.h> +#include <linux/slab.h> +#include <linux/if_vlan.h> +#include <net/sch_generic.h> #include <net/pkt_sched.h> +#include <net/dst.h> -/* Main transmission queue. */ - -/* Main qdisc structure lock. +/* Qdisc to use by default */ +const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops; +EXPORT_SYMBOL(default_qdisc_ops); - However, modifications - to data, participating in scheduling must be additionally - protected with dev->queue_lock spinlock. - - The idea is the following: - - enqueue, dequeue are serialized via top level device - spinlock dev->queue_lock. - - tree walking is protected by read_lock_bh(qdisc_tree_lock) - and this lock is used only in process context. - - updates to tree are made under rtnl semaphore or - from softirq context (__qdisc_destroy rcu-callback) - hence this lock needs local bh disabling. +/* Main transmission queue. 
*/ - qdisc_tree_lock must be grabbed BEFORE dev->queue_lock! +/* Modifications to data participating in scheduling must be protected with + * qdisc_lock(qdisc) spinlock. + * + * The idea is the following: + * - enqueue, dequeue are serialized via qdisc root lock + * - ingress filtering is also serialized via qdisc root lock + * - updates to tree and tree walking are only done under the rtnl mutex. */ -DEFINE_RWLOCK(qdisc_tree_lock); -void qdisc_lock_tree(struct net_device *dev) +static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { - write_lock_bh(&qdisc_tree_lock); - spin_lock_bh(&dev->queue_lock); + skb_dst_force(skb); + q->gso_skb = skb; + q->qstats.requeues++; + q->q.qlen++; /* it's still part of the queue */ + __netif_schedule(q); + + return 0; } -void qdisc_unlock_tree(struct net_device *dev) +static inline struct sk_buff *dequeue_skb(struct Qdisc *q) { - spin_unlock_bh(&dev->queue_lock); - write_unlock_bh(&qdisc_tree_lock); + struct sk_buff *skb = q->gso_skb; + const struct netdev_queue *txq = q->dev_queue; + + if (unlikely(skb)) { + /* check the reason of requeuing without tx lock first */ + txq = netdev_get_tx_queue(txq->dev, skb_get_queue_mapping(skb)); + if (!netif_xmit_frozen_or_stopped(txq)) { + q->gso_skb = NULL; + q->q.qlen--; + } else + skb = NULL; + } else { + if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) + skb = q->dequeue(q); + } + + return skb; } -/* - dev->queue_lock serializes queue accesses for this device - AND dev->qdisc pointer itself. +static inline int handle_dev_cpu_collision(struct sk_buff *skb, + struct netdev_queue *dev_queue, + struct Qdisc *q) +{ + int ret; - dev->xmit_lock serializes accesses to device driver. + if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) { + /* + * Same CPU holding the lock. It may be a transient + * configuration error, when hard_start_xmit() recurses. We + * detect it by checking xmit owner and drop the packet when + * deadloop is detected. Return OK to try the next skb. + */ + kfree_skb(skb); + net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n", + dev_queue->dev->name); + ret = qdisc_qlen(q); + } else { + /* + * Another cpu is holding lock, requeue & delay xmits for + * some time. + */ + __this_cpu_inc(softnet_data.cpu_collision); + ret = dev_requeue_skb(skb, q); + } + + return ret; +} - dev->queue_lock and dev->xmit_lock are mutually exclusive, - if one is grabbed, another must be free. +/* + * Transmit one skb, and handle the return status as required. Holding the + * __QDISC_STATE_RUNNING bit guarantees that only one CPU can execute this + * function. + * + * Returns to the caller: + * 0 - queue is empty or throttled. + * >0 - queue is not empty. */ +int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, + struct net_device *dev, struct netdev_queue *txq, + spinlock_t *root_lock) +{ + int ret = NETDEV_TX_BUSY; + + /* And release qdisc */ + spin_unlock(root_lock); + + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_xmit_frozen_or_stopped(txq)) + ret = dev_hard_start_xmit(skb, dev, txq); + HARD_TX_UNLOCK(dev, txq); -/* Kick device. - Note, that this procedure can be called by a watchdog timer, so that - we do not check dev->tbusy flag here. 
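+	/* Reacquire the qdisc root lock that was released above around the
+	 * driver transmit before acting on the driver's return status.
+	 */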
+ spin_lock(root_lock); + + if (dev_xmit_complete(ret)) { + /* Driver sent out skb successfully or skb was consumed */ + ret = qdisc_qlen(q); + } else if (ret == NETDEV_TX_LOCKED) { + /* Driver try lock failed */ + ret = handle_dev_cpu_collision(skb, txq, q); + } else { + /* Driver returned NETDEV_TX_BUSY - requeue skb */ + if (unlikely(ret != NETDEV_TX_BUSY)) + net_warn_ratelimited("BUG %s code %d qlen %d\n", + dev->name, ret, q->q.qlen); + + ret = dev_requeue_skb(skb, q); + } - Returns: 0 - queue is empty. - >0 - queue is not empty, but throttled. - <0 - queue is not empty. Device is throttled, if dev->tbusy != 0. + if (ret && netif_xmit_frozen_or_stopped(txq)) + ret = 0; - NOTE: Called under dev->queue_lock with locally disabled BH. -*/ + return ret; +} -int qdisc_restart(struct net_device *dev) +/* + * NOTE: Called under qdisc_lock(q) with locally disabled BH. + * + * __QDISC_STATE_RUNNING guarantees only one CPU can process + * this qdisc at a time. qdisc_lock(q) serializes queue accesses for + * this queue. + * + * netif_tx_lock serializes accesses to device driver. + * + * qdisc_lock(q) and netif_tx_lock are mutually exclusive, + * if one is grabbed, another must be free. + * + * Note, that this procedure can be called by a watchdog timer + * + * Returns to the caller: + * 0 - queue is empty or throttled. + * >0 - queue is not empty. + * + */ +static inline int qdisc_restart(struct Qdisc *q) { - struct Qdisc *q = dev->qdisc; + struct netdev_queue *txq; + struct net_device *dev; + spinlock_t *root_lock; struct sk_buff *skb; /* Dequeue packet */ - if ((skb = q->dequeue(q)) != NULL) { - unsigned nolock = (dev->features & NETIF_F_LLTX); + skb = dequeue_skb(q); + if (unlikely(!skb)) + return 0; + WARN_ON_ONCE(skb_dst_is_noref(skb)); + root_lock = qdisc_lock(q); + dev = qdisc_dev(q); + txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + + return sch_direct_xmit(skb, q, dev, txq, root_lock); +} + +void __qdisc_run(struct Qdisc *q) +{ + int quota = weight_p; + + while (qdisc_restart(q)) { /* - * When the driver has LLTX set it does its own locking - * in start_xmit. No need to add additional overhead by - * locking again. These checks are worth it because - * even uncongested locks can be quite expensive. - * The driver can do trylock like here too, in case - * of lock congestion it should return -1 and the packet - * will be requeued. + * Ordered by possible occurrence: Postpone processing if + * 1. we've exceeded packet quota + * 2. another process needs the CPU; */ - if (!nolock) { - if (!spin_trylock(&dev->xmit_lock)) { - collision: - /* So, someone grabbed the driver. */ - - /* It may be transient configuration error, - when hard_start_xmit() recurses. We detect - it by checking xmit owner and drop the - packet when deadloop is detected. - */ - if (dev->xmit_lock_owner == smp_processor_id()) { - kfree_skb(skb); - if (net_ratelimit()) - printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name); - return -1; - } - __get_cpu_var(netdev_rx_stat).cpu_collision++; - goto requeue; - } - /* Remember that the driver is grabbed by us. 
*/ - dev->xmit_lock_owner = smp_processor_id(); - } - - { - /* And release queue */ - spin_unlock(&dev->queue_lock); - - if (!netif_queue_stopped(dev)) { - int ret; - if (netdev_nit) - dev_queue_xmit_nit(skb, dev); - - ret = dev->hard_start_xmit(skb, dev); - if (ret == NETDEV_TX_OK) { - if (!nolock) { - dev->xmit_lock_owner = -1; - spin_unlock(&dev->xmit_lock); - } - spin_lock(&dev->queue_lock); - return -1; - } - if (ret == NETDEV_TX_LOCKED && nolock) { - spin_lock(&dev->queue_lock); - goto collision; - } - } - - /* NETDEV_TX_BUSY - we need to requeue */ - /* Release the driver */ - if (!nolock) { - dev->xmit_lock_owner = -1; - spin_unlock(&dev->xmit_lock); - } - spin_lock(&dev->queue_lock); - q = dev->qdisc; + if (--quota <= 0 || need_resched()) { + __netif_schedule(q); + break; } + } - /* Device kicked us out :( - This is possible in three cases: - - 0. driver is locked - 1. fastroute is enabled - 2. device cannot determine busy state - before start of transmission (f.e. dialout) - 3. device is buggy (ppp) - */ + qdisc_run_end(q); +} -requeue: - q->ops->requeue(skb, q); - netif_schedule(dev); - return 1; +unsigned long dev_trans_start(struct net_device *dev) +{ + unsigned long val, res; + unsigned int i; + + if (is_vlan_dev(dev)) + dev = vlan_dev_real_dev(dev); + res = dev->trans_start; + for (i = 0; i < dev->num_tx_queues; i++) { + val = netdev_get_tx_queue(dev, i)->trans_start; + if (val && time_after(val, res)) + res = val; } - BUG_ON((int) q->q.qlen < 0); - return q->q.qlen; + dev->trans_start = res; + + return res; } +EXPORT_SYMBOL(dev_trans_start); static void dev_watchdog(unsigned long arg) { struct net_device *dev = (struct net_device *)arg; - spin_lock(&dev->xmit_lock); - if (dev->qdisc != &noop_qdisc) { + netif_tx_lock(dev); + if (!qdisc_tx_is_noop(dev)) { if (netif_device_present(dev) && netif_running(dev) && netif_carrier_ok(dev)) { - if (netif_queue_stopped(dev) && - (jiffies - dev->trans_start) > dev->watchdog_timeo) { - printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n", dev->name); - dev->tx_timeout(dev); + int some_queue_timedout = 0; + unsigned int i; + unsigned long trans_start; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq; + + txq = netdev_get_tx_queue(dev, i); + /* + * old device drivers set dev->trans_start + */ + trans_start = txq->trans_start ? 
: dev->trans_start; + if (netif_xmit_stopped(txq) && + time_after(jiffies, (trans_start + + dev->watchdog_timeo))) { + some_queue_timedout = 1; + txq->trans_timeout++; + break; + } + } + + if (some_queue_timedout) { + WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n", + dev->name, netdev_drivername(dev), i); + dev->netdev_ops->ndo_tx_timeout(dev); } - if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo)) + if (!mod_timer(&dev->watchdog_timer, + round_jiffies(jiffies + + dev->watchdog_timeo))) dev_hold(dev); } } - spin_unlock(&dev->xmit_lock); + netif_tx_unlock(dev); dev_put(dev); } -static void dev_watchdog_init(struct net_device *dev) -{ - init_timer(&dev->watchdog_timer); - dev->watchdog_timer.data = (unsigned long)dev; - dev->watchdog_timer.function = dev_watchdog; -} - void __netdev_watchdog_up(struct net_device *dev) { - if (dev->tx_timeout) { + if (dev->netdev_ops->ndo_tx_timeout) { if (dev->watchdog_timeo <= 0) dev->watchdog_timeo = 5*HZ; - if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo)) + if (!mod_timer(&dev->watchdog_timer, + round_jiffies(jiffies + dev->watchdog_timeo))) dev_hold(dev); } } static void dev_watchdog_up(struct net_device *dev) { - spin_lock_bh(&dev->xmit_lock); __netdev_watchdog_up(dev); - spin_unlock_bh(&dev->xmit_lock); } static void dev_watchdog_down(struct net_device *dev) { - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); if (del_timer(&dev->watchdog_timer)) - __dev_put(dev); - spin_unlock_bh(&dev->xmit_lock); + dev_put(dev); + netif_tx_unlock_bh(dev); } +/** + * netif_carrier_on - set carrier + * @dev: network device + * + * Device has detected that carrier. + */ void netif_carrier_on(struct net_device *dev) { - if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) + if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) { + if (dev->reg_state == NETREG_UNINITIALIZED) + return; + atomic_inc(&dev->carrier_changes); linkwatch_fire_event(dev); - if (netif_running(dev)) - __netdev_watchdog_up(dev); + if (netif_running(dev)) + __netdev_watchdog_up(dev); + } } +EXPORT_SYMBOL(netif_carrier_on); +/** + * netif_carrier_off - clear carrier + * @dev: network device + * + * Device has detected loss of carrier. + */ void netif_carrier_off(struct net_device *dev) { - if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) + if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) { + if (dev->reg_state == NETREG_UNINITIALIZED) + return; + atomic_inc(&dev->carrier_changes); linkwatch_fire_event(dev); + } } +EXPORT_SYMBOL(netif_carrier_off); /* "NOOP" scheduler: the best scheduler, recommended for all interfaces under all circumstances. It is difficult to invent anything faster or cheaper. */ -static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc) +static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc) { kfree_skb(skb); return NET_XMIT_CN; } -static struct sk_buff *noop_dequeue(struct Qdisc * qdisc) +static struct sk_buff *noop_dequeue(struct Qdisc *qdisc) { return NULL; } -static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc) -{ - if (net_ratelimit()) - printk(KERN_DEBUG "%s deferred output. 
It is buggy.\n", - skb->dev->name); - kfree_skb(skb); - return NET_XMIT_CN; -} - -struct Qdisc_ops noop_qdisc_ops = { +struct Qdisc_ops noop_qdisc_ops __read_mostly = { .id = "noop", .priv_size = 0, .enqueue = noop_enqueue, .dequeue = noop_dequeue, - .requeue = noop_requeue, + .peek = noop_dequeue, .owner = THIS_MODULE, }; +static struct netdev_queue noop_netdev_queue = { + .qdisc = &noop_qdisc, + .qdisc_sleeping = &noop_qdisc, +}; + struct Qdisc noop_qdisc = { .enqueue = noop_enqueue, .dequeue = noop_dequeue, .flags = TCQ_F_BUILTIN, - .ops = &noop_qdisc_ops, + .ops = &noop_qdisc_ops, .list = LIST_HEAD_INIT(noop_qdisc.list), + .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), + .dev_queue = &noop_netdev_queue, + .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), }; +EXPORT_SYMBOL(noop_qdisc); -static struct Qdisc_ops noqueue_qdisc_ops = { +static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { .id = "noqueue", .priv_size = 0, .enqueue = noop_enqueue, .dequeue = noop_dequeue, - .requeue = noop_requeue, + .peek = noop_dequeue, .owner = THIS_MODULE, }; +static struct Qdisc noqueue_qdisc; +static struct netdev_queue noqueue_netdev_queue = { + .qdisc = &noqueue_qdisc, + .qdisc_sleeping = &noqueue_qdisc, +}; + static struct Qdisc noqueue_qdisc = { .enqueue = NULL, .dequeue = noop_dequeue, .flags = TCQ_F_BUILTIN, .ops = &noqueue_qdisc_ops, .list = LIST_HEAD_INIT(noqueue_qdisc.list), + .q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock), + .dev_queue = &noqueue_netdev_queue, + .busylock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock), }; -static const u8 prio2band[TC_PRIO_MAX+1] = - { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; +static const u8 prio2band[TC_PRIO_MAX + 1] = { + 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 +}; /* 3-band FIFO queue: old style, but should be a bit faster than generic prio+fifo combination. @@ -321,18 +414,38 @@ static const u8 prio2band[TC_PRIO_MAX+1] = #define PFIFO_FAST_BANDS 3 -static inline struct sk_buff_head *prio2list(struct sk_buff *skb, - struct Qdisc *qdisc) +/* + * Private data for a pfifo_fast scheduler containing: + * - queues for the three band + * - bitmap indicating which of the bands contain skbs + */ +struct pfifo_fast_priv { + u32 bitmap; + struct sk_buff_head q[PFIFO_FAST_BANDS]; +}; + +/* + * Convert a bitmap to the first band number where an skb is queued, where: + * bitmap=0 means there are no skbs on any band. + * bitmap=1 means there is an skb on band 0. + * bitmap=7 means there are skbs on all 3 bands, etc. 
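+ * bitmap=6 means bands 1 and 2 hold skbs, and bitmap2band[6] below gives
+ *   band 1, the lowest-numbered (and therefore highest priority) busy band.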
+ */ +static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; + +static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, + int band) { - struct sk_buff_head *list = qdisc_priv(qdisc); - return list + prio2band[skb->priority & TC_PRIO_MAX]; + return priv->q + band; } -static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) +static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc) { - struct sk_buff_head *list = prio2list(skb, qdisc); + if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { + int band = prio2band[skb->priority & TC_PRIO_MAX]; + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + struct sk_buff_head *list = band2list(priv, band); - if (skb_queue_len(list) < qdisc->dev->tx_queue_len) { + priv->bitmap |= (1 << band); qdisc->q.qlen++; return __qdisc_enqueue_tail(skb, qdisc, list); } @@ -340,35 +453,48 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) return qdisc_drop(skb, qdisc); } -static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc) +static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) { - int prio; - struct sk_buff_head *list = qdisc_priv(qdisc); + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + int band = bitmap2band[priv->bitmap]; - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { - if (!skb_queue_empty(list + prio)) { - qdisc->q.qlen--; - return __qdisc_dequeue_head(qdisc, list + prio); - } + if (likely(band >= 0)) { + struct sk_buff_head *list = band2list(priv, band); + struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list); + + qdisc->q.qlen--; + if (skb_queue_empty(list)) + priv->bitmap &= ~(1 << band); + + return skb; } return NULL; } -static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc) +static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) { - qdisc->q.qlen++; - return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc)); + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + int band = bitmap2band[priv->bitmap]; + + if (band >= 0) { + struct sk_buff_head *list = band2list(priv, band); + + return skb_peek(list); + } + + return NULL; } -static void pfifo_fast_reset(struct Qdisc* qdisc) +static void pfifo_fast_reset(struct Qdisc *qdisc) { int prio; - struct sk_buff_head *list = qdisc_priv(qdisc); + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __qdisc_reset_queue(qdisc, list + prio); + __qdisc_reset_queue(qdisc, band2list(priv, prio)); + priv->bitmap = 0; qdisc->qstats.backlog = 0; qdisc->q.qlen = 0; } @@ -377,77 +503,99 @@ static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) { struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; - memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1); - RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1); + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; -rtattr_failure: +nla_put_failure: return -1; } -static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt) +static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) { int prio; - struct sk_buff_head *list = qdisc_priv(qdisc); + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - skb_queue_head_init(list + prio); + skb_queue_head_init(band2list(priv, prio)); + /* Can by-pass the queue discipline */ + qdisc->flags |= TCQ_F_CAN_BYPASS; return 0; } -static struct Qdisc_ops pfifo_fast_ops = { +struct Qdisc_ops pfifo_fast_ops 
__read_mostly = { .id = "pfifo_fast", - .priv_size = PFIFO_FAST_BANDS * sizeof(struct sk_buff_head), + .priv_size = sizeof(struct pfifo_fast_priv), .enqueue = pfifo_fast_enqueue, .dequeue = pfifo_fast_dequeue, - .requeue = pfifo_fast_requeue, + .peek = pfifo_fast_peek, .init = pfifo_fast_init, .reset = pfifo_fast_reset, .dump = pfifo_fast_dump, .owner = THIS_MODULE, }; -struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops) +static struct lock_class_key qdisc_tx_busylock; + +struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, + const struct Qdisc_ops *ops) { void *p; struct Qdisc *sch; - unsigned int size; + unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size; int err = -ENOBUFS; + struct net_device *dev = dev_queue->dev; - /* ensure that the Qdisc and the private data are 32-byte aligned */ - size = QDISC_ALIGN(sizeof(*sch)); - size += ops->priv_size + (QDISC_ALIGNTO - 1); + p = kzalloc_node(size, GFP_KERNEL, + netdev_queue_numa_node_read(dev_queue)); - p = kmalloc(size, GFP_KERNEL); if (!p) goto errout; - memset(p, 0, size); sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); - sch->padded = (char *) sch - (char *) p; - + /* if we got non aligned memory, ask more and do alignment ourself */ + if (sch != p) { + kfree(p); + p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL, + netdev_queue_numa_node_read(dev_queue)); + if (!p) + goto errout; + sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); + sch->padded = (char *) sch - (char *) p; + } INIT_LIST_HEAD(&sch->list); skb_queue_head_init(&sch->q); + + spin_lock_init(&sch->busylock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + sch->ops = ops; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; - sch->dev = dev; + sch->dev_queue = dev_queue; dev_hold(dev); - sch->stats_lock = &dev->queue_lock; atomic_set(&sch->refcnt, 1); return sch; errout: - return ERR_PTR(-err); + return ERR_PTR(err); } -struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) +struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, + const struct Qdisc_ops *ops, + unsigned int parentid) { struct Qdisc *sch; - - sch = qdisc_alloc(dev, ops); + + if (!try_module_get(ops->owner)) + goto errout; + + sch = qdisc_alloc(dev_queue, ops); if (IS_ERR(sch)) goto errout; + sch->parent = parentid; if (!ops->init || ops->init(sch, NULL) == 0) return sch; @@ -456,172 +604,345 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) errout: return NULL; } +EXPORT_SYMBOL(qdisc_create_dflt); -/* Under dev->queue_lock and BH! */ +/* Under qdisc_lock(qdisc) and BH! 
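+ * Besides calling ops->reset(), this also frees any deferred gso_skb and
+ * zeroes q.qlen (see below).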
*/ void qdisc_reset(struct Qdisc *qdisc) { - struct Qdisc_ops *ops = qdisc->ops; + const struct Qdisc_ops *ops = qdisc->ops; if (ops->reset) ops->reset(qdisc); + + if (qdisc->gso_skb) { + kfree_skb(qdisc->gso_skb); + qdisc->gso_skb = NULL; + qdisc->q.qlen = 0; + } } +EXPORT_SYMBOL(qdisc_reset); -/* this is the rcu callback function to clean up a qdisc when there - * are no further references to it */ +static void qdisc_rcu_free(struct rcu_head *head) +{ + struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head); -static void __qdisc_destroy(struct rcu_head *head) + kfree((char *) qdisc - qdisc->padded); +} + +void qdisc_destroy(struct Qdisc *qdisc) { - struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu); - struct Qdisc_ops *ops = qdisc->ops; + const struct Qdisc_ops *ops = qdisc->ops; -#ifdef CONFIG_NET_ESTIMATOR - gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); + if (qdisc->flags & TCQ_F_BUILTIN || + !atomic_dec_and_test(&qdisc->refcnt)) + return; + +#ifdef CONFIG_NET_SCHED + qdisc_list_del(qdisc); + + qdisc_put_stab(rtnl_dereference(qdisc->stab)); #endif - write_lock(&qdisc_tree_lock); + gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); if (ops->reset) ops->reset(qdisc); if (ops->destroy) ops->destroy(qdisc); - write_unlock(&qdisc_tree_lock); + module_put(ops->owner); + dev_put(qdisc_dev(qdisc)); - dev_put(qdisc->dev); - kfree((char *) qdisc - qdisc->padded); + kfree_skb(qdisc->gso_skb); + /* + * gen_estimator est_timer() might access qdisc->q.lock, + * wait a RCU grace period before freeing qdisc. + */ + call_rcu(&qdisc->rcu_head, qdisc_rcu_free); } +EXPORT_SYMBOL(qdisc_destroy); -/* Under dev->queue_lock and BH! */ +/* Attach toplevel qdisc to device queue. */ +struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, + struct Qdisc *qdisc) +{ + struct Qdisc *oqdisc = dev_queue->qdisc_sleeping; + spinlock_t *root_lock; -void qdisc_destroy(struct Qdisc *qdisc) + root_lock = qdisc_lock(oqdisc); + spin_lock_bh(root_lock); + + /* Prune old scheduler */ + if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) + qdisc_reset(oqdisc); + + /* ... 
and graft new one */ + if (qdisc == NULL) + qdisc = &noop_qdisc; + dev_queue->qdisc_sleeping = qdisc; + rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc); + + spin_unlock_bh(root_lock); + + return oqdisc; +} +EXPORT_SYMBOL(dev_graft_qdisc); + +static void attach_one_default_qdisc(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_unused) { - struct list_head cql = LIST_HEAD_INIT(cql); - struct Qdisc *cq, *q, *n; + struct Qdisc *qdisc = &noqueue_qdisc; + + if (dev->tx_queue_len) { + qdisc = qdisc_create_dflt(dev_queue, + default_qdisc_ops, TC_H_ROOT); + if (!qdisc) { + netdev_info(dev, "activation failed\n"); + return; + } + if (!netif_is_multiqueue(dev)) + qdisc->flags |= TCQ_F_ONETXQUEUE; + } + dev_queue->qdisc_sleeping = qdisc; +} - if (qdisc->flags & TCQ_F_BUILTIN || - !atomic_dec_and_test(&qdisc->refcnt)) - return; +static void attach_default_qdiscs(struct net_device *dev) +{ + struct netdev_queue *txq; + struct Qdisc *qdisc; - if (!list_empty(&qdisc->list)) { - if (qdisc->ops->cl_ops == NULL) - list_del(&qdisc->list); - else - list_move(&qdisc->list, &cql); + txq = netdev_get_tx_queue(dev, 0); + + if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) { + netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); + dev->qdisc = txq->qdisc_sleeping; + atomic_inc(&dev->qdisc->refcnt); + } else { + qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT); + if (qdisc) { + dev->qdisc = qdisc; + qdisc->ops->attach(qdisc); + } } +} - /* unlink inner qdiscs from dev->qdisc_list immediately */ - list_for_each_entry(cq, &cql, list) - list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list) - if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) { - if (q->ops->cl_ops == NULL) - list_del_init(&q->list); - else - list_move_tail(&q->list, &cql); - } - list_for_each_entry_safe(cq, n, &cql, list) - list_del_init(&cq->list); +static void transition_one_qdisc(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_need_watchdog) +{ + struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping; + int *need_watchdog_p = _need_watchdog; + + if (!(new_qdisc->flags & TCQ_F_BUILTIN)) + clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state); - call_rcu(&qdisc->q_rcu, __qdisc_destroy); + rcu_assign_pointer(dev_queue->qdisc, new_qdisc); + if (need_watchdog_p && new_qdisc != &noqueue_qdisc) { + dev_queue->trans_start = 0; + *need_watchdog_p = 1; + } } void dev_activate(struct net_device *dev) { + int need_watchdog; + /* No queueing discipline is attached to device; - create default one i.e. 
pfifo_fast for devices, - which need queueing and noqueue_qdisc for - virtual interfaces + * create default one for devices, which need queueing + * and noqueue_qdisc for virtual interfaces */ - if (dev->qdisc_sleeping == &noop_qdisc) { - struct Qdisc *qdisc; - if (dev->tx_queue_len) { - qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops); - if (qdisc == NULL) { - printk(KERN_INFO "%s: activation failed\n", dev->name); - return; - } - write_lock_bh(&qdisc_tree_lock); - list_add_tail(&qdisc->list, &dev->qdisc_list); - write_unlock_bh(&qdisc_tree_lock); - } else { - qdisc = &noqueue_qdisc; - } - write_lock_bh(&qdisc_tree_lock); - dev->qdisc_sleeping = qdisc; - write_unlock_bh(&qdisc_tree_lock); - } + if (dev->qdisc == &noop_qdisc) + attach_default_qdiscs(dev); if (!netif_carrier_ok(dev)) /* Delay activation until next carrier-on event */ return; - spin_lock_bh(&dev->queue_lock); - rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping); - if (dev->qdisc != &noqueue_qdisc) { + need_watchdog = 0; + netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog); + if (dev_ingress_queue(dev)) + transition_one_qdisc(dev, dev_ingress_queue(dev), NULL); + + if (need_watchdog) { dev->trans_start = jiffies; dev_watchdog_up(dev); } - spin_unlock_bh(&dev->queue_lock); } +EXPORT_SYMBOL(dev_activate); -void dev_deactivate(struct net_device *dev) +static void dev_deactivate_queue(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_qdisc_default) { + struct Qdisc *qdisc_default = _qdisc_default; struct Qdisc *qdisc; - spin_lock_bh(&dev->queue_lock); - qdisc = dev->qdisc; - dev->qdisc = &noop_qdisc; + qdisc = dev_queue->qdisc; + if (qdisc) { + spin_lock_bh(qdisc_lock(qdisc)); + + if (!(qdisc->flags & TCQ_F_BUILTIN)) + set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); - qdisc_reset(qdisc); + rcu_assign_pointer(dev_queue->qdisc, qdisc_default); + qdisc_reset(qdisc); - spin_unlock_bh(&dev->queue_lock); + spin_unlock_bh(qdisc_lock(qdisc)); + } +} + +static bool some_qdisc_is_busy(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *dev_queue; + spinlock_t *root_lock; + struct Qdisc *q; + int val; + + dev_queue = netdev_get_tx_queue(dev, i); + q = dev_queue->qdisc_sleeping; + root_lock = qdisc_lock(q); + + spin_lock_bh(root_lock); - dev_watchdog_down(dev); + val = (qdisc_is_running(q) || + test_bit(__QDISC_STATE_SCHED, &q->state)); - while (test_bit(__LINK_STATE_SCHED, &dev->state)) - yield(); + spin_unlock_bh(root_lock); - spin_unlock_wait(&dev->xmit_lock); + if (val) + return true; + } + return false; +} + +/** + * dev_deactivate_many - deactivate transmissions on several devices + * @head: list of devices to deactivate + * + * This function returns only when all outstanding transmissions + * have completed, unless all devices are in dismantle phase. + */ +void dev_deactivate_many(struct list_head *head) +{ + struct net_device *dev; + bool sync_needed = false; + + list_for_each_entry(dev, head, close_list) { + netdev_for_each_tx_queue(dev, dev_deactivate_queue, + &noop_qdisc); + if (dev_ingress_queue(dev)) + dev_deactivate_queue(dev, dev_ingress_queue(dev), + &noop_qdisc); + + dev_watchdog_down(dev); + sync_needed |= !dev->dismantle; + } + + /* Wait for outstanding qdisc-less dev_queue_xmit calls. + * This is avoided if all devices are in dismantle phase : + * Caller will call synchronize_net() for us + */ + if (sync_needed) + synchronize_net(); + + /* Wait for outstanding qdisc_run calls. 
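+	 * some_qdisc_is_busy() reports a device busy while any of its tx
+	 * queues' qdiscs is still running or scheduled, so keep yielding
+	 * until every queue has drained.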
*/ + list_for_each_entry(dev, head, close_list) + while (some_qdisc_is_busy(dev)) + yield(); +} + +void dev_deactivate(struct net_device *dev) +{ + LIST_HEAD(single); + + list_add(&dev->close_list, &single); + dev_deactivate_many(&single); + list_del(&single); +} +EXPORT_SYMBOL(dev_deactivate); + +static void dev_init_scheduler_queue(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_qdisc) +{ + struct Qdisc *qdisc = _qdisc; + + dev_queue->qdisc = qdisc; + dev_queue->qdisc_sleeping = qdisc; } void dev_init_scheduler(struct net_device *dev) { - qdisc_lock_tree(dev); dev->qdisc = &noop_qdisc; - dev->qdisc_sleeping = &noop_qdisc; - INIT_LIST_HEAD(&dev->qdisc_list); - qdisc_unlock_tree(dev); + netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); + if (dev_ingress_queue(dev)) + dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); - dev_watchdog_init(dev); + setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev); } -void dev_shutdown(struct net_device *dev) +static void shutdown_scheduler_queue(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_qdisc_default) { - struct Qdisc *qdisc; + struct Qdisc *qdisc = dev_queue->qdisc_sleeping; + struct Qdisc *qdisc_default = _qdisc_default; + + if (qdisc) { + rcu_assign_pointer(dev_queue->qdisc, qdisc_default); + dev_queue->qdisc_sleeping = qdisc_default; - qdisc_lock_tree(dev); - qdisc = dev->qdisc_sleeping; - dev->qdisc = &noop_qdisc; - dev->qdisc_sleeping = &noop_qdisc; - qdisc_destroy(qdisc); -#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE) - if ((qdisc = dev->qdisc_ingress) != NULL) { - dev->qdisc_ingress = NULL; qdisc_destroy(qdisc); - } -#endif - BUG_TRAP(!timer_pending(&dev->watchdog_timer)); - qdisc_unlock_tree(dev); + } } -EXPORT_SYMBOL(__netdev_watchdog_up); -EXPORT_SYMBOL(netif_carrier_on); -EXPORT_SYMBOL(netif_carrier_off); -EXPORT_SYMBOL(noop_qdisc); -EXPORT_SYMBOL(noop_qdisc_ops); -EXPORT_SYMBOL(qdisc_create_dflt); -EXPORT_SYMBOL(qdisc_alloc); -EXPORT_SYMBOL(qdisc_destroy); -EXPORT_SYMBOL(qdisc_reset); -EXPORT_SYMBOL(qdisc_restart); -EXPORT_SYMBOL(qdisc_lock_tree); -EXPORT_SYMBOL(qdisc_unlock_tree); +void dev_shutdown(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); + if (dev_ingress_queue(dev)) + shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); + qdisc_destroy(dev->qdisc); + dev->qdisc = &noop_qdisc; + + WARN_ON(timer_pending(&dev->watchdog_timer)); +} + +void psched_ratecfg_precompute(struct psched_ratecfg *r, + const struct tc_ratespec *conf, + u64 rate64) +{ + memset(r, 0, sizeof(*r)); + r->overhead = conf->overhead; + r->rate_bytes_ps = max_t(u64, conf->rate, rate64); + r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); + r->mult = 1; + /* + * The deal here is to replace a divide by a reciprocal one + * in fast path (a reciprocal divide is a multiply and a shift) + * + * Normal formula would be : + * time_in_ns = (NSEC_PER_SEC * len) / rate_bps + * + * We compute mult/shift to use instead : + * time_in_ns = (len * mult) >> shift; + * + * We try to get the highest possible mult value for accuracy, + * but have to make sure no overflows will ever happen. 
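+ *
+ * For example, at 1 Gbit/s (rate_bytes_ps = 125,000,000) the loop below
+ * settles on mult = 2^31 and shift = 28, so a 1500 byte packet costs
+ * (1500 * 2^31) >> 28 = 12000 ns, exactly 1500 / 125e6 seconds.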
+ */ + if (r->rate_bytes_ps > 0) { + u64 factor = NSEC_PER_SEC; + + for (;;) { + r->mult = div64_u64(factor, r->rate_bytes_ps); + if (r->mult & (1U << 31) || factor & (1ULL << 63)) + break; + factor <<= 1; + r->shift++; + } + } +} +EXPORT_SYMBOL(psched_ratecfg_precompute); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 29a2dd9f302..12cbc09157f 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -18,11 +18,10 @@ * For all the glorious comments look at include/net/red.h */ -#include <linux/config.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> #include <net/red.h> @@ -33,16 +32,16 @@ struct gred_sched_data; struct gred_sched; -struct gred_sched_data -{ +struct gred_sched_data { u32 limit; /* HARD maximal queue length */ - u32 DP; /* the drop pramaters */ + u32 DP; /* the drop parameters */ u32 bytesin; /* bytes seen on virtualQ so far*/ u32 packetsin; /* packets seen on virtualQ so far*/ u32 backlog; /* bytes on the virtualQ */ u8 prio; /* the prio of this vq */ struct red_parms parms; + struct red_vars vars; struct red_stats stats; }; @@ -51,14 +50,13 @@ enum { GRED_RIO_MODE, }; -struct gred_sched -{ +struct gred_sched { struct gred_sched_data *tab[MAX_DPs]; unsigned long flags; u32 red_flags; u32 DPs; u32 def; - struct red_parms wred_set; + struct red_vars wred_set; }; static inline int gred_wred_mode(struct gred_sched *table) @@ -104,9 +102,8 @@ static inline int gred_wred_mode_check(struct Qdisc *sch) if (q == NULL) continue; - for (n = 0; n < table->DPs; n++) - if (table->tab[n] && table->tab[n] != q && - table->tab[n]->prio == q->prio) + for (n = i + 1; n < table->DPs; n++) + if (table->tab[n] && table->tab[n]->prio == q->prio) return 1; } @@ -128,17 +125,18 @@ static inline u16 tc_index_to_dp(struct sk_buff *skb) return skb->tc_index & GRED_VQ_MASK; } -static inline void gred_load_wred_set(struct gred_sched *table, +static inline void gred_load_wred_set(const struct gred_sched *table, struct gred_sched_data *q) { - q->parms.qavg = table->wred_set.qavg; - q->parms.qidlestart = table->wred_set.qidlestart; + q->vars.qavg = table->wred_set.qavg; + q->vars.qidlestart = table->wred_set.qidlestart; } static inline void gred_store_wred_set(struct gred_sched *table, struct gred_sched_data *q) { - table->wred_set.qavg = q->parms.qavg; + table->wred_set.qavg = q->vars.qavg; + table->wred_set.qidlestart = q->vars.qidlestart; } static inline int gred_use_ecn(struct gred_sched *t) @@ -151,85 +149,88 @@ static inline int gred_use_harddrop(struct gred_sched *t) return t->red_flags & TC_RED_HARDDROP; } -static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct gred_sched_data *q=NULL; - struct gred_sched *t= qdisc_priv(sch); + struct gred_sched_data *q = NULL; + struct gred_sched *t = qdisc_priv(sch); unsigned long qavg = 0; u16 dp = tc_index_to_dp(skb); - if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { + if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { dp = t->def; - if ((q = t->tab[dp]) == NULL) { + q = t->tab[dp]; + if (!q) { /* Pass through packets not assigned to a DP * if no default DP has been configured. This * allows for DP flows to be left untouched. */ - if (skb_queue_len(&sch->q) < sch->dev->tx_queue_len) + if (skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len) return qdisc_enqueue_tail(skb, sch); else goto drop; } - /* fix tc_index? 
--could be controvesial but needed for + /* fix tc_index? --could be controversial but needed for requeueing */ skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp; } - /* sum up all the qaves of prios <= to ours to get the new qave */ + /* sum up all the qaves of prios < ours to get the new qave */ if (!gred_wred_mode(t) && gred_rio_mode(t)) { int i; for (i = 0; i < t->DPs; i++) { if (t->tab[i] && t->tab[i]->prio < q->prio && - !red_is_idling(&t->tab[i]->parms)) - qavg +=t->tab[i]->parms.qavg; + !red_is_idling(&t->tab[i]->vars)) + qavg += t->tab[i]->vars.qavg; } } q->packetsin++; - q->bytesin += skb->len; + q->bytesin += qdisc_pkt_len(skb); if (gred_wred_mode(t)) gred_load_wred_set(t, q); - q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch)); + q->vars.qavg = red_calc_qavg(&q->parms, + &q->vars, + gred_backlog(t, q, sch)); - if (red_is_idling(&q->parms)) - red_end_of_idle_period(&q->parms); + if (red_is_idling(&q->vars)) + red_end_of_idle_period(&q->vars); if (gred_wred_mode(t)) gred_store_wred_set(t, q); - switch (red_action(&q->parms, q->parms.qavg + qavg)) { - case RED_DONT_MARK: - break; + switch (red_action(&q->parms, &q->vars, q->vars.qavg + qavg)) { + case RED_DONT_MARK: + break; - case RED_PROB_MARK: - sch->qstats.overlimits++; - if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { - q->stats.prob_drop++; - goto congestion_drop; - } + case RED_PROB_MARK: + sch->qstats.overlimits++; + if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { + q->stats.prob_drop++; + goto congestion_drop; + } - q->stats.prob_mark++; - break; + q->stats.prob_mark++; + break; - case RED_HARD_MARK: - sch->qstats.overlimits++; - if (gred_use_harddrop(t) || !gred_use_ecn(t) || - !INET_ECN_set_ce(skb)) { - q->stats.forced_drop++; - goto congestion_drop; - } - q->stats.forced_mark++; - break; + case RED_HARD_MARK: + sch->qstats.overlimits++; + if (gred_use_harddrop(t) || !gred_use_ecn(t) || + !INET_ECN_set_ce(skb)) { + q->stats.forced_drop++; + goto congestion_drop; + } + q->stats.forced_mark++; + break; } - if (q->backlog + skb->len <= q->limit) { - q->backlog += skb->len; + if (q->backlog + qdisc_pkt_len(skb) <= q->limit) { + q->backlog += qdisc_pkt_len(skb); return qdisc_enqueue_tail(skb, sch); } @@ -242,27 +243,7 @@ congestion_drop: return NET_XMIT_CN; } -static int gred_requeue(struct sk_buff *skb, struct Qdisc* sch) -{ - struct gred_sched *t = qdisc_priv(sch); - struct gred_sched_data *q; - u16 dp = tc_index_to_dp(skb); - - if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { - if (net_ratelimit()) - printk(KERN_WARNING "GRED: Unable to relocate VQ 0x%x " - "for requeue, screwing up backlog.\n", - tc_index_to_dp(skb)); - } else { - if (red_is_idling(&q->parms)) - red_end_of_idle_period(&q->parms); - q->backlog += skb->len; - } - - return qdisc_requeue(skb, sch); -} - -static struct sk_buff *gred_dequeue(struct Qdisc* sch) +static struct sk_buff *gred_dequeue(struct Qdisc *sch) { struct sk_buff *skb; struct gred_sched *t = qdisc_priv(sch); @@ -274,75 +255,74 @@ static struct sk_buff *gred_dequeue(struct Qdisc* sch) u16 dp = tc_index_to_dp(skb); if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { - if (net_ratelimit()) - printk(KERN_WARNING "GRED: Unable to relocate " - "VQ 0x%x after dequeue, screwing up " - "backlog.\n", tc_index_to_dp(skb)); + net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x after dequeue, screwing up backlog\n", + tc_index_to_dp(skb)); } else { - q->backlog -= skb->len; - - if (!q->backlog && !gred_wred_mode(t)) - red_start_of_idle_period(&q->parms); + q->backlog -= 
qdisc_pkt_len(skb); + + if (gred_wred_mode(t)) { + if (!sch->qstats.backlog) + red_start_of_idle_period(&t->wred_set); + } else { + if (!q->backlog) + red_start_of_idle_period(&q->vars); + } } return skb; } - if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) - red_start_of_idle_period(&t->wred_set); - return NULL; } -static unsigned int gred_drop(struct Qdisc* sch) +static unsigned int gred_drop(struct Qdisc *sch) { struct sk_buff *skb; struct gred_sched *t = qdisc_priv(sch); skb = qdisc_dequeue_tail(sch); if (skb) { - unsigned int len = skb->len; + unsigned int len = qdisc_pkt_len(skb); struct gred_sched_data *q; u16 dp = tc_index_to_dp(skb); if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { - if (net_ratelimit()) - printk(KERN_WARNING "GRED: Unable to relocate " - "VQ 0x%x while dropping, screwing up " - "backlog.\n", tc_index_to_dp(skb)); + net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x while dropping, screwing up backlog\n", + tc_index_to_dp(skb)); } else { q->backlog -= len; q->stats.other++; - if (!q->backlog && !gred_wred_mode(t)) - red_start_of_idle_period(&q->parms); + if (gred_wred_mode(t)) { + if (!sch->qstats.backlog) + red_start_of_idle_period(&t->wred_set); + } else { + if (!q->backlog) + red_start_of_idle_period(&q->vars); + } } qdisc_drop(skb, sch); return len; } - if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) - red_start_of_idle_period(&t->wred_set); - return 0; - } -static void gred_reset(struct Qdisc* sch) +static void gred_reset(struct Qdisc *sch) { int i; struct gred_sched *t = qdisc_priv(sch); qdisc_reset_queue(sch); - for (i = 0; i < t->DPs; i++) { + for (i = 0; i < t->DPs; i++) { struct gred_sched_data *q = t->tab[i]; if (!q) continue; - red_restart(&q->parms); + red_restart(&q->vars); q->backlog = 0; } } @@ -352,16 +332,16 @@ static inline void gred_destroy_vq(struct gred_sched_data *q) kfree(q); } -static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps) +static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_sopt *sopt; int i; - if (dps == NULL || RTA_PAYLOAD(dps) < sizeof(*sopt)) + if (dps == NULL) return -EINVAL; - sopt = RTA_DATA(dps); + sopt = nla_data(dps); if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs) return -EINVAL; @@ -390,66 +370,81 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps) for (i = table->DPs; i < MAX_DPs; i++) { if (table->tab[i]) { - printk(KERN_WARNING "GRED: Warning: Destroying " - "shadowed VQ 0x%x\n", i); + pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n", + i); gred_destroy_vq(table->tab[i]); table->tab[i] = NULL; - } + } } return 0; } static inline int gred_change_vq(struct Qdisc *sch, int dp, - struct tc_gred_qopt *ctl, int prio, u8 *stab) + struct tc_gred_qopt *ctl, int prio, + u8 *stab, u32 max_P, + struct gred_sched_data **prealloc) { struct gred_sched *table = qdisc_priv(sch); - struct gred_sched_data *q; + struct gred_sched_data *q = table->tab[dp]; - if (table->tab[dp] == NULL) { - table->tab[dp] = kmalloc(sizeof(*q), GFP_KERNEL); - if (table->tab[dp] == NULL) + if (!q) { + table->tab[dp] = q = *prealloc; + *prealloc = NULL; + if (!q) return -ENOMEM; - memset(table->tab[dp], 0, sizeof(*q)); } - q = table->tab[dp]; q->DP = dp; q->prio = prio; q->limit = ctl->limit; if (q->backlog == 0) - red_end_of_idle_period(&q->parms); + red_end_of_idle_period(&q->vars); red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, - 
ctl->Scell_log, stab); - + ctl->Scell_log, stab, max_P); + red_set_vars(&q->vars); return 0; } -static int gred_change(struct Qdisc *sch, struct rtattr *opt) +static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = { + [TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) }, + [TCA_GRED_STAB] = { .len = 256 }, + [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) }, + [TCA_GRED_MAX_P] = { .type = NLA_U32 }, +}; + +static int gred_change(struct Qdisc *sch, struct nlattr *opt) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_qopt *ctl; - struct rtattr *tb[TCA_GRED_MAX]; - int err = -EINVAL, prio = GRED_DEF_PRIO; + struct nlattr *tb[TCA_GRED_MAX + 1]; + int err, prio = GRED_DEF_PRIO; u8 *stab; + u32 max_P; + struct gred_sched_data *prealloc; - if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt)) + if (opt == NULL) return -EINVAL; - if (tb[TCA_GRED_PARMS-1] == NULL && tb[TCA_GRED_STAB-1] == NULL) + err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy); + if (err < 0) + return err; + + if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) return gred_change_table_def(sch, opt); - if (tb[TCA_GRED_PARMS-1] == NULL || - RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || - tb[TCA_GRED_STAB-1] == NULL || - RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) + if (tb[TCA_GRED_PARMS] == NULL || + tb[TCA_GRED_STAB] == NULL) return -EINVAL; - ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); - stab = RTA_DATA(tb[TCA_GRED_STAB-1]); + max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0; + + err = -EINVAL; + ctl = nla_data(tb[TCA_GRED_PARMS]); + stab = nla_data(tb[TCA_GRED_STAB]); if (ctl->DP >= table->DPs) goto errout; @@ -469,9 +464,10 @@ static int gred_change(struct Qdisc *sch, struct rtattr *opt) prio = ctl->prio; } + prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); sch_tree_lock(sch); - err = gred_change_vq(sch, ctl->DP, ctl, prio, stab); + err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc); if (err < 0) goto errout_locked; @@ -485,28 +481,35 @@ static int gred_change(struct Qdisc *sch, struct rtattr *opt) errout_locked: sch_tree_unlock(sch); + kfree(prealloc); errout: return err; } -static int gred_init(struct Qdisc *sch, struct rtattr *opt) +static int gred_init(struct Qdisc *sch, struct nlattr *opt) { - struct rtattr *tb[TCA_GRED_MAX]; + struct nlattr *tb[TCA_GRED_MAX + 1]; + int err; - if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt)) + if (opt == NULL) return -EINVAL; - if (tb[TCA_GRED_PARMS-1] || tb[TCA_GRED_STAB-1]) + err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy); + if (err < 0) + return err; + + if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) return -EINVAL; - return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]); + return gred_change_table_def(sch, tb[TCA_GRED_DPS]); } static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) { struct gred_sched *table = qdisc_priv(sch); - struct rtattr *parms, *opts = NULL; + struct nlattr *parms, *opts = NULL; int i; + u32 max_p[MAX_DPs]; struct tc_gred_sopt sopt = { .DPs = table->DPs, .def_DP = table->def, @@ -514,13 +517,28 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) .flags = table->red_flags, }; - opts = RTA_NEST(skb, TCA_OPTIONS); - RTA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt); - parms = RTA_NEST(skb, TCA_GRED_PARMS); + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + if (nla_put(skb, TCA_GRED_DPS, sizeof(sopt), &sopt)) + goto nla_put_failure; + + for (i = 0; i < MAX_DPs; i++) { + struct gred_sched_data *q = 
table->tab[i]; + + max_p[i] = q ? q->parms.max_P : 0; + } + if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p)) + goto nla_put_failure; + + parms = nla_nest_start(skb, TCA_GRED_PARMS); + if (parms == NULL) + goto nla_put_failure; for (i = 0; i < MAX_DPs; i++) { struct gred_sched_data *q = table->tab[i]; struct tc_gred_qopt opt; + unsigned long qavg; memset(&opt, 0, sizeof(opt)); @@ -549,24 +567,25 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) opt.packets = q->packetsin; opt.bytesin = q->bytesin; - if (gred_wred_mode(table)) { - q->parms.qidlestart = - table->tab[table->def]->parms.qidlestart; - q->parms.qavg = table->tab[table->def]->parms.qavg; - } + if (gred_wred_mode(table)) + gred_load_wred_set(table, q); - opt.qave = red_calc_qavg(&q->parms, q->parms.qavg); + qavg = red_calc_qavg(&q->parms, &q->vars, + q->vars.qavg >> q->parms.Wlog); + opt.qave = qavg >> q->parms.Wlog; append_opt: - RTA_APPEND(skb, sizeof(opt), &opt); + if (nla_append(skb, sizeof(opt), &opt) < 0) + goto nla_put_failure; } - RTA_NEST_END(skb, parms); + nla_nest_end(skb, parms); - return RTA_NEST_END(skb, opts); + return nla_nest_end(skb, opts); -rtattr_failure: - return RTA_NEST_CANCEL(skb, opts); +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; } static void gred_destroy(struct Qdisc *sch) @@ -580,12 +599,12 @@ static void gred_destroy(struct Qdisc *sch) } } -static struct Qdisc_ops gred_qdisc_ops = { +static struct Qdisc_ops gred_qdisc_ops __read_mostly = { .id = "gred", .priv_size = sizeof(struct gred_sched), .enqueue = gred_enqueue, .dequeue = gred_dequeue, - .requeue = gred_requeue, + .peek = qdisc_peek_head, .drop = gred_drop, .init = gred_init, .reset = gred_reset, diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 91132f6871d..ec8aeaac1dd 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -50,30 +50,24 @@ */ #include <linux/kernel.h> -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/errno.h> -#include <linux/jiffies.h> #include <linux/compiler.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/slab.h> -#include <linux/timer.h> #include <linux/list.h> #include <linux/rbtree.h> #include <linux/init.h> -#include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/pkt_sched.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> -#include <asm/system.h> #include <asm/div64.h> -#define HFSC_DEBUG 1 - /* * kernel internal service curve representation: * coordinates are given by 64 bit unsigned integers. @@ -83,12 +77,11 @@ * The service curve parameters are converted to the internal * representation. The slope values are scaled to avoid overflow. * the inverse slope values as well as the y-projection of the 1st - * segment are kept in order to to avoid 64-bit divide operations + * segment are kept in order to avoid 64-bit divide operations * that are expensive on 32-bit architectures. 
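 * (For instance rtsc_y2x(), used by init_vf()/update_vf() below, maps
 * service back to time via these precomputed inverse slopes, i.e. with
 * multiplies and shifts rather than divides.)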
*/ -struct internal_sc -{ +struct internal_sc { u64 sm1; /* scaled slope of the 1st segment */ u64 ism1; /* scaled inverse-slope of the 1st segment */ u64 dx; /* the x-projection of the 1st segment */ @@ -98,8 +91,7 @@ struct internal_sc }; /* runtime service curve */ -struct runtime_sc -{ +struct runtime_sc { u64 x; /* current starting position on x-axis */ u64 y; /* current starting position on y-axis */ u64 sm1; /* scaled slope of the 1st segment */ @@ -110,22 +102,19 @@ struct runtime_sc u64 ism2; /* scaled inverse-slope of the 2nd segment */ }; -enum hfsc_class_flags -{ +enum hfsc_class_flags { HFSC_RSC = 0x1, HFSC_FSC = 0x2, HFSC_USC = 0x4 }; -struct hfsc_class -{ - u32 classid; /* class id */ +struct hfsc_class { + struct Qdisc_class_common cl_common; unsigned int refcnt; /* usage count */ - struct gnet_stats_basic bstats; + struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; - spinlock_t *stats_lock; + struct gnet_stats_rate_est64 rate_est; unsigned int level; /* class level in hierarchy */ struct tcf_proto *filter_list; /* filter list */ unsigned int filter_cnt; /* filter count */ @@ -141,15 +130,14 @@ struct hfsc_class struct rb_node vt_node; /* parent's vt_tree member */ struct rb_root cf_tree; /* active children sorted by cl_f */ struct rb_node cf_node; /* parent's cf_heap member */ - struct list_head hlist; /* hash list member */ struct list_head dlist; /* drop list member */ u64 cl_total; /* total work in bytes */ u64 cl_cumul; /* cumulative work in bytes done by real-time criteria */ - u64 cl_d; /* deadline*/ - u64 cl_e; /* eligible time */ + u64 cl_d; /* deadline*/ + u64 cl_e; /* eligible time */ u64 cl_vt; /* virtual time */ u64 cl_f; /* time when this class will fit for link-sharing, max(myf, cfmin) */ @@ -167,7 +155,7 @@ struct hfsc_class u64 cl_vtoff; /* inter-period cumulative vt offset */ u64 cl_cvtmax; /* max child's vt in the last period */ u64 cl_cvtoff; /* cumulative cvtmax of all periods */ - u64 cl_pcvtoff; /* parent's cvtoff at initalization + u64 cl_pcvtoff; /* parent's cvtoff at initialization time */ struct internal_sc cl_rsc; /* internal real-time service curve */ @@ -184,45 +172,16 @@ struct hfsc_class unsigned long cl_nactive; /* number of active children */ }; -#define HFSC_HSIZE 16 - -struct hfsc_sched -{ +struct hfsc_sched { u16 defcls; /* default class id */ struct hfsc_class root; /* root class */ - struct list_head clhash[HFSC_HSIZE]; /* class hash */ + struct Qdisc_class_hash clhash; /* class hash */ struct rb_root eligible; /* eligible tree */ struct list_head droplist; /* active leaf class list (for dropping) */ - struct sk_buff_head requeue; /* requeued packet */ - struct timer_list wd_timer; /* watchdog timer */ + struct qdisc_watchdog watchdog; /* watchdog timer */ }; -/* - * macros - */ -#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY -#include <linux/time.h> -#undef PSCHED_GET_TIME -#define PSCHED_GET_TIME(stamp) \ -do { \ - struct timeval tv; \ - do_gettimeofday(&tv); \ - (stamp) = 1ULL * USEC_PER_SEC * tv.tv_sec + tv.tv_usec; \ -} while (0) -#endif - -#if HFSC_DEBUG -#define ASSERT(cond) \ -do { \ - if (unlikely(!(cond))) \ - printk("assertion %s failed at %s:%i (%s)\n", \ - #cond, __FILE__, __LINE__, __FUNCTION__); \ -} while (0) -#else -#define ASSERT(cond) -#endif /* HFSC_DEBUG */ - #define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */ @@ -285,7 +244,7 @@ static inline struct hfsc_class * eltree_get_minel(struct hfsc_sched *q) { struct rb_node *n; - + n = 
rb_first(&q->eligible); if (n == NULL) return NULL; @@ -408,31 +367,22 @@ cftree_update(struct hfsc_class *cl) * ism: (psched_us/byte) << ISM_SHIFT * dx: psched_us * - * Clock source resolution (CONFIG_NET_SCH_CLK_*) - * JIFFIES: for 48<=HZ<=1534 resolution is between 0.63us and 1.27us. - * CPU: resolution is between 0.5us and 1us. - * GETTIMEOFDAY: resolution is exactly 1us. + * The clock source resolution with ktime and PSCHED_SHIFT 10 is 1.024us. * * sm and ism are scaled in order to keep effective digits. * SM_SHIFT and ISM_SHIFT are selected to keep at least 4 effective * digits in decimal using the following table. * - * Note: We can afford the additional accuracy (altq hfsc keeps at most - * 3 effective digits) thanks to the fact that linux clock is bounded - * much more tightly. - * * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps * ------------+------------------------------------------------------- - * bytes/0.5us 6.25e-3 62.5e-3 625e-3 6250e-e 62500e-3 - * bytes/us 12.5e-3 125e-3 1250e-3 12500e-3 125000e-3 - * bytes/1.27us 15.875e-3 158.75e-3 1587.5e-3 15875e-3 158750e-3 + * bytes/1.024us 12.8e-3 128e-3 1280e-3 12800e-3 128000e-3 * - * 0.5us/byte 160 16 1.6 0.16 0.016 - * us/byte 80 8 0.8 0.08 0.008 - * 1.27us/byte 63 6.3 0.63 0.063 0.0063 + * 1.024us/byte 78.125 7.8125 0.78125 0.078125 0.0078125 + * + * So, for PSCHED_SHIFT 10 we need: SM_SHIFT 20, ISM_SHIFT 18. */ -#define SM_SHIFT 20 -#define ISM_SHIFT 18 +#define SM_SHIFT (30 - PSCHED_SHIFT) +#define ISM_SHIFT (8 + PSCHED_SHIFT) #define SM_MASK ((1ULL << SM_SHIFT) - 1) #define ISM_MASK ((1ULL << ISM_SHIFT) - 1) @@ -474,8 +424,8 @@ m2sm(u32 m) u64 sm; sm = ((u64)m << SM_SHIFT); - sm += PSCHED_JIFFIE2US(HZ) - 1; - do_div(sm, PSCHED_JIFFIE2US(HZ)); + sm += PSCHED_TICKS_PER_SEC - 1; + do_div(sm, PSCHED_TICKS_PER_SEC); return sm; } @@ -488,7 +438,7 @@ m2ism(u32 m) if (m == 0) ism = HT_INFINITY; else { - ism = ((u64)PSCHED_JIFFIE2US(HZ) << ISM_SHIFT); + ism = ((u64)PSCHED_TICKS_PER_SEC << ISM_SHIFT); ism += m - 1; do_div(ism, m); } @@ -501,7 +451,7 @@ d2dx(u32 d) { u64 dx; - dx = ((u64)d * PSCHED_JIFFIE2US(HZ)); + dx = ((u64)d * PSCHED_TICKS_PER_SEC); dx += USEC_PER_SEC - 1; do_div(dx, USEC_PER_SEC); return dx; @@ -513,7 +463,7 @@ sm2m(u64 sm) { u64 m; - m = (sm * PSCHED_JIFFIE2US(HZ)) >> SM_SHIFT; + m = (sm * PSCHED_TICKS_PER_SEC) >> SM_SHIFT; return (u32)m; } @@ -524,7 +474,7 @@ dx2d(u64 dx) u64 d; d = dx * USEC_PER_SEC; - do_div(d, PSCHED_JIFFIE2US(HZ)); + do_div(d, PSCHED_TICKS_PER_SEC); return (u32)d; } @@ -662,15 +612,12 @@ rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y) rtsc->y = y; rtsc->dx = dx; rtsc->dy = dy; - return; } static void init_ed(struct hfsc_class *cl, unsigned int next_len) { - u64 cur_time; - - PSCHED_GET_TIME(cur_time); + u64 cur_time = psched_get_time(); /* update the deadline curve */ rtsc_min(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); @@ -741,7 +688,7 @@ init_vf(struct hfsc_class *cl, unsigned int len) if (go_active) { n = rb_last(&cl->cl_parent->vt_tree); if (n != NULL) { - max_cl = rb_entry(n, struct hfsc_class,vt_node); + max_cl = rb_entry(n, struct hfsc_class, vt_node); /* * set vt to the average of the min and max * classes. 
if the parent's period didn't @@ -774,7 +721,7 @@ init_vf(struct hfsc_class *cl, unsigned int len) /* update the virtual curve */ vt = cl->cl_vt + cl->cl_vtoff; rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt, - cl->cl_total); + cl->cl_total); if (cl->cl_virtual.x == vt) { cl->cl_virtual.x -= cl->cl_vtoff; cl->cl_vtoff = 0; @@ -793,14 +740,14 @@ init_vf(struct hfsc_class *cl, unsigned int len) if (cl->cl_flags & HFSC_USC) { /* class has upper limit curve */ if (cur_time == 0) - PSCHED_GET_TIME(cur_time); + cur_time = psched_get_time(); /* update the ulimit curve */ rtsc_min(&cl->cl_ulimit, &cl->cl_usc, cur_time, - cl->cl_total); + cl->cl_total); /* compute myf */ cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, - cl->cl_total); + cl->cl_total); cl->cl_myfadj = 0; } } @@ -809,8 +756,8 @@ init_vf(struct hfsc_class *cl, unsigned int len) if (f != cl->cl_f) { cl->cl_f = f; cftree_update(cl); - update_cfmin(cl->cl_parent); } + update_cfmin(cl->cl_parent); } } @@ -854,7 +801,7 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) * update vt and f */ cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) - - cl->cl_vtoff + cl->cl_vtadj; + - cl->cl_vtoff + cl->cl_vtadj; /* * if vt of the class is smaller than cvtmin, @@ -871,7 +818,7 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) if (cl->cl_flags & HFSC_USC) { cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit, - cl->cl_total); + cl->cl_total); #if 0 /* * This code causes classes to stay way under their @@ -928,27 +875,19 @@ set_passive(struct hfsc_class *cl) */ } -/* - * hack to get length of first packet in queue. - */ static unsigned int qdisc_peek_len(struct Qdisc *sch) { struct sk_buff *skb; unsigned int len; - skb = sch->dequeue(sch); + skb = sch->ops->peek(sch); if (skb == NULL) { - if (net_ratelimit()) - printk("qdisc_peek_len: non work-conserving qdisc ?\n"); - return 0; - } - len = skb->len; - if (unlikely(sch->ops->requeue(skb, sch) != NET_XMIT_SUCCESS)) { - if (net_ratelimit()) - printk("qdisc_peek_len: failed to requeue\n"); + qdisc_warn_nonwc("qdisc_peek_len", sch); return 0; } + len = qdisc_pkt_len(skb); + return len; } @@ -958,11 +897,7 @@ hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl) unsigned int len = cl->qdisc->q.qlen; qdisc_reset(cl->qdisc); - if (len > 0) { - update_vf(cl, 0, 0); - set_passive(cl); - sch->q.qlen -= len; - } + qdisc_tree_decrease_qlen(cl->qdisc, len); } static void @@ -974,38 +909,28 @@ hfsc_adjust_levels(struct hfsc_class *cl) do { level = 0; list_for_each_entry(p, &cl->children, siblings) { - if (p->level > level) - level = p->level; + if (p->level >= level) + level = p->level + 1; } - cl->level = level + 1; + cl->level = level; } while ((cl = cl->cl_parent) != NULL); } -static inline unsigned int -hfsc_hash(u32 h) -{ - h ^= h >> 8; - h ^= h >> 4; - - return h & (HFSC_HSIZE - 1); -} - static inline struct hfsc_class * hfsc_find_class(u32 classid, struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); - struct hfsc_class *cl; + struct Qdisc_class_common *clc; - list_for_each_entry(cl, &q->clhash[hfsc_hash(classid)], hlist) { - if (cl->classid == classid) - return cl; - } - return NULL; + clc = qdisc_class_find(&q->clhash, classid); + if (clc == NULL) + return NULL; + return container_of(clc, struct hfsc_class, cl_common); } static void hfsc_change_rsc(struct hfsc_class *cl, struct tc_service_curve *rsc, - u64 cur_time) + u64 cur_time) { sc2isc(rsc, &cl->cl_rsc); rtsc_init(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); @@ -1027,60 +952,74 @@ hfsc_change_fsc(struct 
hfsc_class *cl, struct tc_service_curve *fsc) static void hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc, - u64 cur_time) + u64 cur_time) { sc2isc(usc, &cl->cl_usc); rtsc_init(&cl->cl_ulimit, &cl->cl_usc, cur_time, cl->cl_total); cl->cl_flags |= HFSC_USC; } +static const struct nla_policy hfsc_policy[TCA_HFSC_MAX + 1] = { + [TCA_HFSC_RSC] = { .len = sizeof(struct tc_service_curve) }, + [TCA_HFSC_FSC] = { .len = sizeof(struct tc_service_curve) }, + [TCA_HFSC_USC] = { .len = sizeof(struct tc_service_curve) }, +}; + static int hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - struct rtattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl = (struct hfsc_class *)*arg; struct hfsc_class *parent = NULL; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_HFSC_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_HFSC_MAX + 1]; struct tc_service_curve *rsc = NULL, *fsc = NULL, *usc = NULL; u64 cur_time; + int err; - if (opt == NULL || rtattr_parse_nested(tb, TCA_HFSC_MAX, opt)) + if (opt == NULL) return -EINVAL; - if (tb[TCA_HFSC_RSC-1]) { - if (RTA_PAYLOAD(tb[TCA_HFSC_RSC-1]) < sizeof(*rsc)) - return -EINVAL; - rsc = RTA_DATA(tb[TCA_HFSC_RSC-1]); + err = nla_parse_nested(tb, TCA_HFSC_MAX, opt, hfsc_policy); + if (err < 0) + return err; + + if (tb[TCA_HFSC_RSC]) { + rsc = nla_data(tb[TCA_HFSC_RSC]); if (rsc->m1 == 0 && rsc->m2 == 0) rsc = NULL; } - if (tb[TCA_HFSC_FSC-1]) { - if (RTA_PAYLOAD(tb[TCA_HFSC_FSC-1]) < sizeof(*fsc)) - return -EINVAL; - fsc = RTA_DATA(tb[TCA_HFSC_FSC-1]); + if (tb[TCA_HFSC_FSC]) { + fsc = nla_data(tb[TCA_HFSC_FSC]); if (fsc->m1 == 0 && fsc->m2 == 0) fsc = NULL; } - if (tb[TCA_HFSC_USC-1]) { - if (RTA_PAYLOAD(tb[TCA_HFSC_USC-1]) < sizeof(*usc)) - return -EINVAL; - usc = RTA_DATA(tb[TCA_HFSC_USC-1]); + if (tb[TCA_HFSC_USC]) { + usc = nla_data(tb[TCA_HFSC_USC]); if (usc->m1 == 0 && usc->m2 == 0) usc = NULL; } if (cl != NULL) { if (parentid) { - if (cl->cl_parent && cl->cl_parent->classid != parentid) + if (cl->cl_parent && + cl->cl_parent->cl_common.classid != parentid) return -EINVAL; if (cl->cl_parent == NULL && parentid != TC_H_ROOT) return -EINVAL; } - PSCHED_GET_TIME(cur_time); + cur_time = psched_get_time(); + + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) + return err; + } sch_tree_lock(sch); if (rsc != NULL) @@ -1098,11 +1037,6 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } sch_tree_unlock(sch); -#ifdef CONFIG_NET_ESTIMATOR - if (tca[TCA_RATE-1]) - gen_replace_estimator(&cl->bstats, &cl->rate_est, - cl->stats_lock, tca[TCA_RATE-1]); -#endif return 0; } @@ -1124,10 +1058,19 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (rsc == NULL && fsc == NULL) return -EINVAL; - cl = kmalloc(sizeof(struct hfsc_class), GFP_KERNEL); + cl = kzalloc(sizeof(struct hfsc_class), GFP_KERNEL); if (cl == NULL) return -ENOBUFS; - memset(cl, 0, sizeof(struct hfsc_class)); + + if (tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) { + kfree(cl); + return err; + } + } if (rsc != NULL) hfsc_change_rsc(cl, rsc, 0); @@ -1136,20 +1079,20 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (usc != NULL) hfsc_change_usc(cl, usc, 0); + cl->cl_common.classid = classid; cl->refcnt = 1; - cl->classid = classid; 
cl->sched = q; cl->cl_parent = parent; - cl->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + cl->qdisc = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, classid); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; - cl->stats_lock = &sch->dev->queue_lock; INIT_LIST_HEAD(&cl->children); cl->vt_tree = RB_ROOT; cl->cf_tree = RB_ROOT; sch_tree_lock(sch); - list_add_tail(&cl->hlist, &q->clhash[hfsc_hash(classid)]); + qdisc_class_hash_insert(&q->clhash, &cl->cl_common); list_add_tail(&cl->siblings, &parent->children); if (parent->level == 0) hfsc_purge_queue(sch, parent); @@ -1157,36 +1100,20 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->cl_pcvtoff = parent->cl_cvtoff; sch_tree_unlock(sch); -#ifdef CONFIG_NET_ESTIMATOR - if (tca[TCA_RATE-1]) - gen_new_estimator(&cl->bstats, &cl->rate_est, - cl->stats_lock, tca[TCA_RATE-1]); -#endif + qdisc_class_hash_grow(sch, &q->clhash); + *arg = (unsigned long)cl; return 0; } static void -hfsc_destroy_filters(struct tcf_proto **fl) -{ - struct tcf_proto *tp; - - while ((tp = *fl) != NULL) { - *fl = tp->next; - tcf_destroy(tp); - } -} - -static void hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl) { struct hfsc_sched *q = qdisc_priv(sch); - hfsc_destroy_filters(&cl->filter_list); + tcf_destroy_chain(&cl->filter_list); qdisc_destroy(cl->qdisc); -#ifdef CONFIG_NET_ESTIMATOR gen_kill_estimator(&cl->bstats, &cl->rate_est); -#endif if (cl != &q->root) kfree(cl); } @@ -1202,12 +1129,17 @@ hfsc_delete_class(struct Qdisc *sch, unsigned long arg) sch_tree_lock(sch); - list_del(&cl->hlist); list_del(&cl->siblings); hfsc_adjust_levels(cl->cl_parent); + hfsc_purge_queue(sch, cl); - if (--cl->refcnt == 0) - hfsc_destroy_class(sch, cl); + qdisc_class_hash_remove(&q->clhash, &cl->cl_common); + + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). 
+ */ sch_tree_unlock(sch); return 0; @@ -1217,7 +1149,7 @@ static struct hfsc_class * hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct hfsc_sched *q = qdisc_priv(sch); - struct hfsc_class *cl; + struct hfsc_class *head, *cl; struct tcf_result res; struct tcf_proto *tcf; int result; @@ -1227,24 +1159,26 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) if (cl->level == 0) return cl; - *qerr = NET_XMIT_BYPASS; + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + head = &q->root; tcf = q->root.filter_list; while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: - case TC_ACT_STOLEN: - *qerr = NET_XMIT_SUCCESS; - case TC_ACT_SHOT: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: return NULL; } -#elif defined(CONFIG_NET_CLS_POLICE) - if (result == TC_POLICE_SHOT) - return NULL; #endif - if ((cl = (struct hfsc_class *)res.class) == NULL) { - if ((cl = hfsc_find_class(res.classid, sch)) == NULL) + cl = (struct hfsc_class *)res.class; + if (!cl) { + cl = hfsc_find_class(res.classid, sch); + if (!cl) break; /* filter selected invalid classid */ + if (cl->level >= head->level) + break; /* filter may only point downwards */ } if (cl->level == 0) @@ -1252,6 +1186,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) /* apply inner filter chain */ tcf = cl->filter_list; + head = cl; } /* classification failed, try default class */ @@ -1264,23 +1199,23 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) static int hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, - struct Qdisc **old) + struct Qdisc **old) { struct hfsc_class *cl = (struct hfsc_class *)arg; - if (cl == NULL) - return -ENOENT; if (cl->level > 0) return -EINVAL; if (new == NULL) { - new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + cl->cl_common.classid); if (new == NULL) new = &noop_qdisc; } sch_tree_lock(sch); hfsc_purge_queue(sch, cl); - *old = xchg(&cl->qdisc, new); + *old = cl->qdisc; + cl->qdisc = new; sch_tree_unlock(sch); return 0; } @@ -1290,12 +1225,23 @@ hfsc_class_leaf(struct Qdisc *sch, unsigned long arg) { struct hfsc_class *cl = (struct hfsc_class *)arg; - if (cl != NULL && cl->level == 0) + if (cl->level == 0) return cl->qdisc; return NULL; } +static void +hfsc_qlen_notify(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl->qdisc->q.qlen == 0) { + update_vf(cl, 0, 0); + set_passive(cl); + } +} + static unsigned long hfsc_get_class(struct Qdisc *sch, u32 classid) { @@ -1359,57 +1305,59 @@ hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc) tsc.m1 = sm2m(sc->sm1); tsc.d = dx2d(sc->dx); tsc.m2 = sm2m(sc->sm2); - RTA_PUT(skb, attr, sizeof(tsc), &tsc); + if (nla_put(skb, attr, sizeof(tsc), &tsc)) + goto nla_put_failure; return skb->len; - rtattr_failure: + nla_put_failure: return -1; } -static inline int +static int hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl) { if ((cl->cl_flags & HFSC_RSC) && (hfsc_dump_sc(skb, TCA_HFSC_RSC, &cl->cl_rsc) < 0)) - goto rtattr_failure; + goto nla_put_failure; if ((cl->cl_flags & HFSC_FSC) && (hfsc_dump_sc(skb, TCA_HFSC_FSC, &cl->cl_fsc) < 0)) - goto rtattr_failure; + goto nla_put_failure; if ((cl->cl_flags & HFSC_USC) && (hfsc_dump_sc(skb, TCA_HFSC_USC, &cl->cl_usc) < 0)) - goto rtattr_failure; + goto nla_put_failure; return skb->len; - 
rtattr_failure: + nla_put_failure: return -1; } static int hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, - struct tcmsg *tcm) + struct tcmsg *tcm) { struct hfsc_class *cl = (struct hfsc_class *)arg; - unsigned char *b = skb->tail; - struct rtattr *rta = (struct rtattr *)b; + struct nlattr *nest; - tcm->tcm_parent = cl->cl_parent ? cl->cl_parent->classid : TC_H_ROOT; - tcm->tcm_handle = cl->classid; + tcm->tcm_parent = cl->cl_parent ? cl->cl_parent->cl_common.classid : + TC_H_ROOT; + tcm->tcm_handle = cl->cl_common.classid; if (cl->level == 0) tcm->tcm_info = cl->qdisc->handle; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; if (hfsc_dump_curves(skb, cl) < 0) - goto rtattr_failure; - rta->rta_len = skb->tail - b; - return skb->len; + goto nla_put_failure; + return nla_nest_end(skb, nest); - rtattr_failure: - skb_trim(skb, b - skb->data); - return -1; + nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; } static int @@ -1420,15 +1368,14 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct tc_hfsc_stats xstats; cl->qstats.qlen = cl->qdisc->q.qlen; + cl->qstats.backlog = cl->qdisc->qstats.backlog; xstats.level = cl->level; xstats.period = cl->cl_vtperiod; xstats.work = cl->cl_total; xstats.rtwork = cl->cl_cumul; if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || -#ifdef CONFIG_NET_ESTIMATOR - gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || -#endif + gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, &cl->qstats) < 0) return -1; @@ -1447,8 +1394,9 @@ hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) if (arg->stop) return; - for (i = 0; i < HFSC_HSIZE; i++) { - list_for_each_entry(cl, &q->clhash[i], hlist) { + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], + cl_common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; @@ -1463,85 +1411,69 @@ hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) } static void -hfsc_watchdog(unsigned long arg) -{ - struct Qdisc *sch = (struct Qdisc *)arg; - - sch->flags &= ~TCQ_F_THROTTLED; - netif_schedule(sch->dev); -} - -static void -hfsc_schedule_watchdog(struct Qdisc *sch, u64 cur_time) +hfsc_schedule_watchdog(struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl; u64 next_time = 0; - long delay; - if ((cl = eltree_get_minel(q)) != NULL) + cl = eltree_get_minel(q); + if (cl) next_time = cl->cl_e; if (q->root.cl_cfmin != 0) { if (next_time == 0 || next_time > q->root.cl_cfmin) next_time = q->root.cl_cfmin; } - ASSERT(next_time != 0); - delay = next_time - cur_time; - delay = PSCHED_US2JIFFIE(delay); - - sch->flags |= TCQ_F_THROTTLED; - mod_timer(&q->wd_timer, jiffies + delay); + WARN_ON(next_time == 0); + qdisc_watchdog_schedule(&q->watchdog, next_time); } static int -hfsc_init_qdisc(struct Qdisc *sch, struct rtattr *opt) +hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) { struct hfsc_sched *q = qdisc_priv(sch); struct tc_hfsc_qopt *qopt; - unsigned int i; + int err; - if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) + if (opt == NULL || nla_len(opt) < sizeof(*qopt)) return -EINVAL; - qopt = RTA_DATA(opt); - - sch->stats_lock = &sch->dev->queue_lock; + qopt = nla_data(opt); q->defcls = qopt->defcls; - for (i = 0; i < HFSC_HSIZE; i++) - INIT_LIST_HEAD(&q->clhash[i]); + err = qdisc_class_hash_init(&q->clhash); + if (err < 0) + return err; q->eligible = RB_ROOT; INIT_LIST_HEAD(&q->droplist); 
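One pattern worth calling out here: every dump routine touched by this patch (gred_dump earlier, hfsc_dump_class just above, hhf_dump further down) moves from the old RTA_PUT/RTA_NEST macros to the nla_* helpers with a single nla_put_failure unwind label. The condensed sketch below shows that idiom in isolation; the function name and the attribute id are hypothetical placeholders, and only the nla_nest_start/nla_put_u32/nla_nest_end/nla_nest_cancel calls are the real API (same headers as the surrounding files are assumed).

/* Hedged sketch of the nla_* dump idiom used throughout this patch. */
static int example_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct nlattr *opts;

	opts = nla_nest_start(skb, TCA_OPTIONS);	/* open nested TCA_OPTIONS */
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put_u32(skb, 1 /* hypothetical attr id */, 1000))
		goto nla_put_failure;			/* skb ran out of tailroom */

	return nla_nest_end(skb, opts);			/* patch nest length, return skb->len */

nla_put_failure:
	nla_nest_cancel(skb, opts);			/* trim any partially-added attrs */
	return -1;
}

The single cancel label is safe even when nla_nest_start() itself failed, because nla_nest_cancel() is a no-op for a NULL start pointer, which is why the converted functions in this patch do not special-case that path.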
- skb_queue_head_init(&q->requeue); + q->root.cl_common.classid = sch->handle; q->root.refcnt = 1; - q->root.classid = sch->handle; q->root.sched = q; - q->root.qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + sch->handle); if (q->root.qdisc == NULL) q->root.qdisc = &noop_qdisc; - q->root.stats_lock = &sch->dev->queue_lock; INIT_LIST_HEAD(&q->root.children); q->root.vt_tree = RB_ROOT; q->root.cf_tree = RB_ROOT; - list_add(&q->root.hlist, &q->clhash[hfsc_hash(q->root.classid)]); + qdisc_class_hash_insert(&q->clhash, &q->root.cl_common); + qdisc_class_hash_grow(sch, &q->clhash); - init_timer(&q->wd_timer); - q->wd_timer.function = hfsc_watchdog; - q->wd_timer.data = (unsigned long)sch; + qdisc_watchdog_init(&q->watchdog, sch); return 0; } static int -hfsc_change_qdisc(struct Qdisc *sch, struct rtattr *opt) +hfsc_change_qdisc(struct Qdisc *sch, struct nlattr *opt) { struct hfsc_sched *q = qdisc_priv(sch); struct tc_hfsc_qopt *qopt; - if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) + if (opt == NULL || nla_len(opt) < sizeof(*qopt)) return -EINVAL; - qopt = RTA_DATA(opt); + qopt = nla_data(opt); sch_tree_lock(sch); q->defcls = qopt->defcls; @@ -1591,15 +1523,13 @@ hfsc_reset_qdisc(struct Qdisc *sch) struct hfsc_class *cl; unsigned int i; - for (i = 0; i < HFSC_HSIZE; i++) { - list_for_each_entry(cl, &q->clhash[i], hlist) + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) hfsc_reset_class(cl); } - __skb_queue_purge(&q->requeue); q->eligible = RB_ROOT; INIT_LIST_HEAD(&q->droplist); - del_timer(&q->wd_timer); - sch->flags &= ~TCQ_F_THROTTLED; + qdisc_watchdog_cancel(&q->watchdog); sch->q.qlen = 0; } @@ -1607,30 +1537,45 @@ static void hfsc_destroy_qdisc(struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); - struct hfsc_class *cl, *next; + struct hlist_node *next; + struct hfsc_class *cl; unsigned int i; - for (i = 0; i < HFSC_HSIZE; i++) { - list_for_each_entry_safe(cl, next, &q->clhash[i], hlist) + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) + tcf_destroy_chain(&cl->filter_list); + } + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], + cl_common.hnode) hfsc_destroy_class(sch, cl); } - __skb_queue_purge(&q->requeue); - del_timer(&q->wd_timer); + qdisc_class_hash_destroy(&q->clhash); + qdisc_watchdog_cancel(&q->watchdog); } static int hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) { struct hfsc_sched *q = qdisc_priv(sch); - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_hfsc_qopt qopt; + struct hfsc_class *cl; + unsigned int i; + + sch->qstats.backlog = 0; + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) + sch->qstats.backlog += cl->qdisc->qstats.backlog; + } qopt.defcls = q->defcls; - RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) + goto nla_put_failure; return skb->len; - rtattr_failure: - skb_trim(skb, b - skb->data); + nla_put_failure: + nlmsg_trim(skb, b); return -1; } @@ -1638,32 +1583,28 @@ static int hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct hfsc_class *cl; - unsigned int len; - int err; + int uninitialized_var(err); cl = hfsc_classify(skb, sch, &err); if (cl == NULL) { - if (err == NET_XMIT_BYPASS) + if (err & __NET_XMIT_BYPASS) sch->qstats.drops++; 
kfree_skb(skb); return err; } - len = skb->len; - err = cl->qdisc->enqueue(skb, cl->qdisc); + err = qdisc_enqueue(skb, cl->qdisc); if (unlikely(err != NET_XMIT_SUCCESS)) { - cl->qstats.drops++; - sch->qstats.drops++; + if (net_xmit_drop_count(err)) { + cl->qstats.drops++; + sch->qstats.drops++; + } return err; } if (cl->qdisc->q.qlen == 1) - set_active(cl, len); + set_active(cl, qdisc_pkt_len(skb)); - cl->bstats.packets++; - cl->bstats.bytes += len; - sch->bstats.packets++; - sch->bstats.bytes += len; sch->q.qlen++; return NET_XMIT_SUCCESS; @@ -1681,17 +1622,16 @@ hfsc_dequeue(struct Qdisc *sch) if (sch->q.qlen == 0) return NULL; - if ((skb = __skb_dequeue(&q->requeue))) - goto out; - PSCHED_GET_TIME(cur_time); + cur_time = psched_get_time(); /* * if there are eligible classes, use real-time criteria. * find the class with the minimum deadline among * the eligible classes. */ - if ((cl = eltree_get_mindl(q, cur_time)) != NULL) { + cl = eltree_get_mindl(q, cur_time); + if (cl) { realtime = 1; } else { /* @@ -1701,21 +1641,21 @@ hfsc_dequeue(struct Qdisc *sch) cl = vttree_get_minvt(&q->root, cur_time); if (cl == NULL) { sch->qstats.overlimits++; - hfsc_schedule_watchdog(sch, cur_time); + hfsc_schedule_watchdog(sch); return NULL; } } - skb = cl->qdisc->dequeue(cl->qdisc); + skb = qdisc_dequeue_peeked(cl->qdisc); if (skb == NULL) { - if (net_ratelimit()) - printk("HFSC: Non-work-conserving qdisc ?\n"); + qdisc_warn_nonwc("HFSC", cl->qdisc); return NULL; } - update_vf(cl, skb->len, cur_time); + bstats_update(&cl->bstats, skb); + update_vf(cl, qdisc_pkt_len(skb), cur_time); if (realtime) - cl->cl_cumul += skb->len; + cl->cl_cumul += qdisc_pkt_len(skb); if (cl->qdisc->q.qlen != 0) { if (cl->cl_flags & HFSC_RSC) { @@ -1731,24 +1671,13 @@ hfsc_dequeue(struct Qdisc *sch) set_passive(cl); } - out: - sch->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(sch); + qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; } -static int -hfsc_requeue(struct sk_buff *skb, struct Qdisc *sch) -{ - struct hfsc_sched *q = qdisc_priv(sch); - - __skb_queue_head(&q->requeue, skb); - sch->q.qlen++; - sch->qstats.requeues++; - return NET_XMIT_SUCCESS; -} - static unsigned int hfsc_drop(struct Qdisc *sch) { @@ -1774,11 +1703,12 @@ hfsc_drop(struct Qdisc *sch) return 0; } -static struct Qdisc_class_ops hfsc_class_ops = { +static const struct Qdisc_class_ops hfsc_class_ops = { .change = hfsc_change_class, .delete = hfsc_delete_class, .graft = hfsc_graft_class, .leaf = hfsc_class_leaf, + .qlen_notify = hfsc_qlen_notify, .get = hfsc_get_class, .put = hfsc_put_class, .bind_tcf = hfsc_bind_tcf, @@ -1789,7 +1719,7 @@ static struct Qdisc_class_ops hfsc_class_ops = { .walk = hfsc_walk }; -static struct Qdisc_ops hfsc_qdisc_ops = { +static struct Qdisc_ops hfsc_qdisc_ops __read_mostly = { .id = "hfsc", .init = hfsc_init_qdisc, .change = hfsc_change_qdisc, @@ -1798,7 +1728,7 @@ static struct Qdisc_ops hfsc_qdisc_ops = { .dump = hfsc_dump_qdisc, .enqueue = hfsc_enqueue, .dequeue = hfsc_dequeue, - .requeue = hfsc_requeue, + .peek = qdisc_peek_dequeued, .drop = hfsc_drop, .cl_ops = &hfsc_class_ops, .priv_size = sizeof(struct hfsc_sched), diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c new file mode 100644 index 00000000000..d85b6812a7d --- /dev/null +++ b/net/sched/sch_hhf.c @@ -0,0 +1,740 @@ +/* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF) + * + * Copyright (C) 2013 Terry Lam <vtlam@google.com> + * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com> + */ + +#include <linux/jhash.h> +#include <linux/jiffies.h> 
+#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <net/flow_keys.h> +#include <net/pkt_sched.h> +#include <net/sock.h> + +/* Heavy-Hitter Filter (HHF) + * + * Principles : + * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter + * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified + * as heavy-hitter, it is immediately switched to the heavy-hitter bucket. + * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler, + * in which the heavy-hitter bucket is served with less weight. + * In other words, non-heavy-hitters (e.g., short bursts of critical traffic) + * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have + * higher share of bandwidth. + * + * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the + * following paper: + * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and + * Accounting", in ACM SIGCOMM, 2002. + * + * Conceptually, a multi-stage filter comprises k independent hash functions + * and k counter arrays. Packets are indexed into k counter arrays by k hash + * functions, respectively. The counters are then increased by the packet sizes. + * Therefore, + * - For a heavy-hitter flow: *all* of its k array counters must be large. + * - For a non-heavy-hitter flow: some of its k array counters can be large + * due to hash collision with other small flows; however, with high + * probability, not *all* k counters are large. + * + * By the design of the multi-stage filter algorithm, the false negative rate + * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is + * susceptible to false positives (non-heavy-hitters mistakenly classified as + * heavy-hitters). + * Therefore, we also implement the following optimizations to reduce false + * positives by avoiding unnecessary increment of the counter values: + * - Optimization O1: once a heavy-hitter is identified, its bytes are not + * accounted in the array counters. This technique is called "shielding" + * in Section 3.3.1 of [EV02]. + * - Optimization O2: conservative update of counters + * (Section 3.3.2 of [EV02]), + * New counter value = max {old counter value, + * smallest counter value + packet bytes} + * + * Finally, we refresh the counters periodically since otherwise the counter + * values will keep accumulating. + * + * Once a flow is classified as heavy-hitter, we also save its per-flow state + * in an exact-matching flow table so that its subsequent packets can be + * dispatched to the heavy-hitter bucket accordingly. + * + * + * At a high level, this qdisc works as follows: + * Given a packet p: + * - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching + * heavy-hitter flow table, denoted table T, then send p to the heavy-hitter + * bucket. + * - Otherwise, forward p to the multi-stage filter, denoted filter F + * + If F decides that p belongs to a non-heavy-hitter flow, then send p + * to the non-heavy-hitter bucket. + * + Otherwise, if F decides that p belongs to a new heavy-hitter flow, + * then set up a new flow entry for the flow-id of p in the table T and + * send p to the heavy-hitter bucket. + * + * In this implementation: + * - T is a fixed-size hash-table with 1024 entries. Hash collision is + * resolved by linked-list chaining. + * - F has four counter arrays, each array containing 1024 32-bit counters. + * That means 4 * 1024 * 32 bits = 16KB of memory. 
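Since the multi-stage filter is the heart of this new qdisc, here is a compact userspace sketch of the admission test described above: a flow is flagged as a heavy-hitter only if all k of the counters it hashes to would exceed the admission threshold, and otherwise the counters are raised conservatively (Optimization O2). The array sizes, threshold, and xor-folded 10-bit indexing mirror the description, but this is an illustration, not the kernel code that follows.

/*
 * Compact userspace sketch of the HHF admission test, not the kernel
 * implementation below. Assumed here: 4 arrays of 1024 counters, a
 * 128 KB admission threshold, and xor-folded 10-bit indexing.
 */
#include <stdbool.h>
#include <stdint.h>

#define K	4
#define SLOTS	1024			/* 10-bit index per array */
#define MASK	(SLOTS - 1)
#define ADMIT	(128 * 1024)		/* bytes before a flow counts as a HH */

static uint32_t counters[K][SLOTS];

static bool filter_update(uint32_t flow_hash, uint32_t pkt_len)
{
	uint32_t idx[K], xorsum = 0, min = UINT32_MAX;
	int i;

	/* derive K indexes from one 32-bit hash: three 10-bit chunks + their XOR */
	for (i = 0; i < K - 1; i++) {
		idx[i] = flow_hash & MASK;
		xorsum ^= idx[i];
		flow_hash >>= 10;
	}
	idx[K - 1] = (xorsum ^ flow_hash) & MASK;

	/* the smallest of the K incremented counters decides */
	for (i = 0; i < K; i++) {
		uint32_t val = counters[i][idx[i]] + pkt_len;
		if (val < min)
			min = val;
	}
	if (min > ADMIT)
		return true;	/* every counter is large: heavy-hitter (shielded, no update) */

	/* conservative update (O2): raise counters only up to the new minimum */
	for (i = 0; i < K; i++)
		if (counters[i][idx[i]] < min)
			counters[i][idx[i]] = min;
	return false;
}

With the default 40 ms reset interval this means a flow has to push all four of its counters past 128 KB within one interval, i.e. sustain roughly 25 Mbit/s, before it is demoted to the heavy-hitter bucket.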
+ * - Since each array in F contains 1024 counters, 10 bits are sufficient to + * index into each array. + * Hence, instead of having four hash functions, we chop the 32-bit + * skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is + * computed as XOR sum of those three chunks. + * - We need to clear the counter arrays periodically; however, directly + * memsetting 16KB of memory can lead to cache eviction and unwanted delay. + * So by representing each counter by a valid bit, we only need to reset + * 4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory. + * - The Deficit Round Robin engine is taken from fq_codel implementation + * (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to + * fq_codel_flow in fq_codel implementation. + * + */ + +/* Non-configurable parameters */ +#define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */ +#define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */ +#define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */ +#define HHF_BIT_MASK_LEN 10 /* masking 10 bits */ +#define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */ + +#define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */ +enum wdrr_bucket_idx { + WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */ + WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */ +}; + +#define hhf_time_before(a, b) \ + (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0)) + +/* Heavy-hitter per-flow state */ +struct hh_flow_state { + u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */ + u32 hit_timestamp; /* last time heavy-hitter was seen */ + struct list_head flowchain; /* chaining under hash collision */ +}; + +/* Weighted Deficit Round Robin (WDRR) scheduler */ +struct wdrr_bucket { + struct sk_buff *head; + struct sk_buff *tail; + struct list_head bucketchain; + int deficit; +}; + +struct hhf_sched_data { + struct wdrr_bucket buckets[WDRR_BUCKET_CNT]; + u32 perturbation; /* hash perturbation */ + u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ + u32 drop_overlimit; /* number of times max qdisc packet + * limit was hit + */ + struct list_head *hh_flows; /* table T (currently active HHs) */ + u32 hh_flows_limit; /* max active HH allocs */ + u32 hh_flows_overlimit; /* num of disallowed HH allocs */ + u32 hh_flows_total_cnt; /* total admitted HHs */ + u32 hh_flows_current_cnt; /* total current HHs */ + u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */ + u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays + * was reset + */ + unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits + * of hhf_arrays + */ + /* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */ + struct list_head new_buckets; /* list of new buckets */ + struct list_head old_buckets; /* list of old buckets */ + + /* Configurable HHF parameters */ + u32 hhf_reset_timeout; /* interval to reset counter + * arrays in filter F + * (default 40ms) + */ + u32 hhf_admit_bytes; /* counter thresh to classify as + * HH (default 128KB). + * With these default values, + * 128KB / 40ms = 25 Mbps + * i.e., we expect to capture HHs + * sending > 25 Mbps. + */ + u32 hhf_evict_timeout; /* aging threshold to evict idle + * HHs out of table T. This should + * be large enough to avoid + * reordering during HH eviction. 
+ * (default 1s) + */ + u32 hhf_non_hh_weight; /* WDRR weight for non-HHs + * (default 2, + * i.e., non-HH : HH = 2 : 1) + */ +}; + +static u32 hhf_time_stamp(void) +{ + return jiffies; +} + +static unsigned int skb_hash(const struct hhf_sched_data *q, + const struct sk_buff *skb) +{ + struct flow_keys keys; + unsigned int hash; + + if (skb->sk && skb->sk->sk_hash) + return skb->sk->sk_hash; + + skb_flow_dissect(skb, &keys); + hash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src ^ keys.ip_proto, + (__force u32)keys.ports, q->perturbation); + return hash; +} + +/* Looks up a heavy-hitter flow in a chaining list of table T. */ +static struct hh_flow_state *seek_list(const u32 hash, + struct list_head *head, + struct hhf_sched_data *q) +{ + struct hh_flow_state *flow, *next; + u32 now = hhf_time_stamp(); + + if (list_empty(head)) + return NULL; + + list_for_each_entry_safe(flow, next, head, flowchain) { + u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; + + if (hhf_time_before(prev, now)) { + /* Delete expired heavy-hitters, but preserve one entry + * to avoid kzalloc() when next time this slot is hit. + */ + if (list_is_last(&flow->flowchain, head)) + return NULL; + list_del(&flow->flowchain); + kfree(flow); + q->hh_flows_current_cnt--; + } else if (flow->hash_id == hash) { + return flow; + } + } + return NULL; +} + +/* Returns a flow state entry for a new heavy-hitter. Either reuses an expired + * entry or dynamically alloc a new entry. + */ +static struct hh_flow_state *alloc_new_hh(struct list_head *head, + struct hhf_sched_data *q) +{ + struct hh_flow_state *flow; + u32 now = hhf_time_stamp(); + + if (!list_empty(head)) { + /* Find an expired heavy-hitter flow entry. */ + list_for_each_entry(flow, head, flowchain) { + u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; + + if (hhf_time_before(prev, now)) + return flow; + } + } + + if (q->hh_flows_current_cnt >= q->hh_flows_limit) { + q->hh_flows_overlimit++; + return NULL; + } + /* Create new entry. */ + flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC); + if (!flow) + return NULL; + + q->hh_flows_current_cnt++; + INIT_LIST_HEAD(&flow->flowchain); + list_add_tail(&flow->flowchain, head); + + return flow; +} + +/* Assigns packets to WDRR buckets. Implements a multi-stage filter to + * classify heavy-hitters. + */ +static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + u32 tmp_hash, hash; + u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos; + struct hh_flow_state *flow; + u32 pkt_len, min_hhf_val; + int i; + u32 prev; + u32 now = hhf_time_stamp(); + + /* Reset the HHF counter arrays if this is the right time. */ + prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout; + if (hhf_time_before(prev, now)) { + for (i = 0; i < HHF_ARRAYS_CNT; i++) + bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN); + q->hhf_arrays_reset_timestamp = now; + } + + /* Get hashed flow-id of the skb. */ + hash = skb_hash(q, skb); + + /* Check if this packet belongs to an already established HH flow. */ + flow_pos = hash & HHF_BIT_MASK; + flow = seek_list(hash, &q->hh_flows[flow_pos], q); + if (flow) { /* found its HH flow */ + flow->hit_timestamp = now; + return WDRR_BUCKET_FOR_HH; + } + + /* Now pass the packet through the multi-stage filter. */ + tmp_hash = hash; + xorsum = 0; + for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) { + /* Split the skb_hash into three 10-bit chunks. 
*/ + filter_pos[i] = tmp_hash & HHF_BIT_MASK; + xorsum ^= filter_pos[i]; + tmp_hash >>= HHF_BIT_MASK_LEN; + } + /* The last chunk is computed as XOR sum of other chunks. */ + filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash; + + pkt_len = qdisc_pkt_len(skb); + min_hhf_val = ~0U; + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + u32 val; + + if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) { + q->hhf_arrays[i][filter_pos[i]] = 0; + __set_bit(filter_pos[i], q->hhf_valid_bits[i]); + } + + val = q->hhf_arrays[i][filter_pos[i]] + pkt_len; + if (min_hhf_val > val) + min_hhf_val = val; + } + + /* Found a new HH iff all counter values > HH admit threshold. */ + if (min_hhf_val > q->hhf_admit_bytes) { + /* Just captured a new heavy-hitter. */ + flow = alloc_new_hh(&q->hh_flows[flow_pos], q); + if (!flow) /* memory alloc problem */ + return WDRR_BUCKET_FOR_NON_HH; + flow->hash_id = hash; + flow->hit_timestamp = now; + q->hh_flows_total_cnt++; + + /* By returning without updating counters in q->hhf_arrays, + * we implicitly implement "shielding" (see Optimization O1). + */ + return WDRR_BUCKET_FOR_HH; + } + + /* Conservative update of HHF arrays (see Optimization O2). */ + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val) + q->hhf_arrays[i][filter_pos[i]] = min_hhf_val; + } + return WDRR_BUCKET_FOR_NON_HH; +} + +/* Removes one skb from head of bucket. */ +static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket) +{ + struct sk_buff *skb = bucket->head; + + bucket->head = skb->next; + skb->next = NULL; + return skb; +} + +/* Tail-adds skb to bucket. */ +static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) +{ + if (bucket->head == NULL) + bucket->head = skb; + else + bucket->tail->next = skb; + bucket->tail = skb; + skb->next = NULL; +} + +static unsigned int hhf_drop(struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct wdrr_bucket *bucket; + + /* Always try to drop from heavy-hitters first. */ + bucket = &q->buckets[WDRR_BUCKET_FOR_HH]; + if (!bucket->head) + bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH]; + + if (bucket->head) { + struct sk_buff *skb = dequeue_head(bucket); + + sch->q.qlen--; + sch->qstats.drops++; + sch->qstats.backlog -= qdisc_pkt_len(skb); + kfree_skb(skb); + } + + /* Return id of the bucket from which the packet was dropped. */ + return bucket - q->buckets; +} + +static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + enum wdrr_bucket_idx idx; + struct wdrr_bucket *bucket; + + idx = hhf_classify(skb, sch); + + bucket = &q->buckets[idx]; + bucket_add(bucket, skb); + sch->qstats.backlog += qdisc_pkt_len(skb); + + if (list_empty(&bucket->bucketchain)) { + unsigned int weight; + + /* The logic of new_buckets vs. old_buckets is the same as + * new_flows vs. old_flows in the implementation of fq_codel, + * i.e., short bursts of non-HHs should have strict priority. + */ + if (idx == WDRR_BUCKET_FOR_HH) { + /* Always move heavy-hitters to old bucket. */ + weight = 1; + list_add_tail(&bucket->bucketchain, &q->old_buckets); + } else { + weight = q->hhf_non_hh_weight; + list_add_tail(&bucket->bucketchain, &q->new_buckets); + } + bucket->deficit = weight * q->quantum; + } + if (++sch->q.qlen <= sch->limit) + return NET_XMIT_SUCCESS; + + q->drop_overlimit++; + /* Return Congestion Notification only if we dropped a packet from this + * bucket. 
+ */ + if (hhf_drop(sch) == idx) + return NET_XMIT_CN; + + /* As we dropped a packet, better let upper stack know this. */ + qdisc_tree_decrease_qlen(sch, 1); + return NET_XMIT_SUCCESS; +} + +static struct sk_buff *hhf_dequeue(struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = NULL; + struct wdrr_bucket *bucket; + struct list_head *head; + +begin: + head = &q->new_buckets; + if (list_empty(head)) { + head = &q->old_buckets; + if (list_empty(head)) + return NULL; + } + bucket = list_first_entry(head, struct wdrr_bucket, bucketchain); + + if (bucket->deficit <= 0) { + int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ? + 1 : q->hhf_non_hh_weight; + + bucket->deficit += weight * q->quantum; + list_move_tail(&bucket->bucketchain, &q->old_buckets); + goto begin; + } + + if (bucket->head) { + skb = dequeue_head(bucket); + sch->q.qlen--; + sch->qstats.backlog -= qdisc_pkt_len(skb); + } + + if (!skb) { + /* Force a pass through old_buckets to prevent starvation. */ + if ((head == &q->new_buckets) && !list_empty(&q->old_buckets)) + list_move_tail(&bucket->bucketchain, &q->old_buckets); + else + list_del_init(&bucket->bucketchain); + goto begin; + } + qdisc_bstats_update(sch, skb); + bucket->deficit -= qdisc_pkt_len(skb); + + return skb; +} + +static void hhf_reset(struct Qdisc *sch) +{ + struct sk_buff *skb; + + while ((skb = hhf_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static void *hhf_zalloc(size_t sz) +{ + void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); + + if (!ptr) + ptr = vzalloc(sz); + + return ptr; +} + +static void hhf_free(void *addr) +{ + kvfree(addr); +} + +static void hhf_destroy(struct Qdisc *sch) +{ + int i; + struct hhf_sched_data *q = qdisc_priv(sch); + + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + hhf_free(q->hhf_arrays[i]); + hhf_free(q->hhf_valid_bits[i]); + } + + for (i = 0; i < HH_FLOWS_CNT; i++) { + struct hh_flow_state *flow, *next; + struct list_head *head = &q->hh_flows[i]; + + if (list_empty(head)) + continue; + list_for_each_entry_safe(flow, next, head, flowchain) { + list_del(&flow->flowchain); + kfree(flow); + } + } + hhf_free(q->hh_flows); +} + +static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { + [TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 }, + [TCA_HHF_QUANTUM] = { .type = NLA_U32 }, + [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 }, + [TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 }, + [TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 }, + [TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 }, + [TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 }, +}; + +static int hhf_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_HHF_MAX + 1]; + unsigned int qlen; + int err; + u64 non_hh_quantum; + u32 new_quantum = q->quantum; + u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy); + if (err < 0) + return err; + + if (tb[TCA_HHF_QUANTUM]) + new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]); + + if (tb[TCA_HHF_NON_HH_WEIGHT]) + new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]); + + non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight; + if (non_hh_quantum > INT_MAX) + return -EINVAL; + + sch_tree_lock(sch); + + if (tb[TCA_HHF_BACKLOG_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]); + + q->quantum = new_quantum; + q->hhf_non_hh_weight = new_hhf_non_hh_weight; + + if (tb[TCA_HHF_HH_FLOWS_LIMIT]) + q->hh_flows_limit = 
nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]); + + if (tb[TCA_HHF_RESET_TIMEOUT]) { + u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]); + + q->hhf_reset_timeout = usecs_to_jiffies(us); + } + + if (tb[TCA_HHF_ADMIT_BYTES]) + q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]); + + if (tb[TCA_HHF_EVICT_TIMEOUT]) { + u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]); + + q->hhf_evict_timeout = usecs_to_jiffies(us); + } + + qlen = sch->q.qlen; + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = hhf_dequeue(sch); + + kfree_skb(skb); + } + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + + sch_tree_unlock(sch); + return 0; +} + +static int hhf_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + int i; + + sch->limit = 1000; + q->quantum = psched_mtu(qdisc_dev(sch)); + q->perturbation = prandom_u32(); + INIT_LIST_HEAD(&q->new_buckets); + INIT_LIST_HEAD(&q->old_buckets); + + /* Configurable HHF parameters */ + q->hhf_reset_timeout = HZ / 25; /* 40 ms */ + q->hhf_admit_bytes = 131072; /* 128 KB */ + q->hhf_evict_timeout = HZ; /* 1 sec */ + q->hhf_non_hh_weight = 2; + + if (opt) { + int err = hhf_change(sch, opt); + + if (err) + return err; + } + + if (!q->hh_flows) { + /* Initialize heavy-hitter flow table. */ + q->hh_flows = hhf_zalloc(HH_FLOWS_CNT * + sizeof(struct list_head)); + if (!q->hh_flows) + return -ENOMEM; + for (i = 0; i < HH_FLOWS_CNT; i++) + INIT_LIST_HEAD(&q->hh_flows[i]); + + /* Cap max active HHs at twice len of hh_flows table. */ + q->hh_flows_limit = 2 * HH_FLOWS_CNT; + q->hh_flows_overlimit = 0; + q->hh_flows_total_cnt = 0; + q->hh_flows_current_cnt = 0; + + /* Initialize heavy-hitter filter arrays. */ + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN * + sizeof(u32)); + if (!q->hhf_arrays[i]) { + hhf_destroy(sch); + return -ENOMEM; + } + } + q->hhf_arrays_reset_timestamp = hhf_time_stamp(); + + /* Initialize valid bits of heavy-hitter filter arrays. */ + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN / + BITS_PER_BYTE); + if (!q->hhf_valid_bits[i]) { + hhf_destroy(sch); + return -ENOMEM; + } + } + + /* Initialize Weighted DRR buckets. 
*/ + for (i = 0; i < WDRR_BUCKET_CNT; i++) { + struct wdrr_bucket *bucket = q->buckets + i; + + INIT_LIST_HEAD(&bucket->bucketchain); + } + } + + return 0; +} + +static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) || + nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) || + nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT, + jiffies_to_usecs(q->hhf_reset_timeout)) || + nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) || + nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT, + jiffies_to_usecs(q->hhf_evict_timeout)) || + nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + return -1; +} + +static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct tc_hhf_xstats st = { + .drop_overlimit = q->drop_overlimit, + .hh_overlimit = q->hh_flows_overlimit, + .hh_tot_count = q->hh_flows_total_cnt, + .hh_cur_count = q->hh_flows_current_cnt, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { + .id = "hhf", + .priv_size = sizeof(struct hhf_sched_data), + + .enqueue = hhf_enqueue, + .dequeue = hhf_dequeue, + .peek = qdisc_peek_dequeued, + .drop = hhf_drop, + .init = hhf_init, + .reset = hhf_reset, + .destroy = hhf_destroy, + .change = hhf_change, + .dump = hhf_dump, + .dump_stats = hhf_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init hhf_module_init(void) +{ + return register_qdisc(&hhf_qdisc_ops); +} + +static void __exit hhf_module_exit(void) +{ + unregister_qdisc(&hhf_qdisc_ops); +} + +module_init(hhf_module_init) +module_exit(hhf_module_exit) +MODULE_AUTHOR("Terry Lam"); +MODULE_AUTHOR("Nandita Dukkipati"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 3ec95df4a85..9f949abcace 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1,4 +1,4 @@ -/* vim: ts=8 sw=8 +/* * net/sched/sch_htb.c Hierarchical token bucket, feed tree version * * This program is free software; you can redistribute it and/or @@ -11,7 +11,7 @@ * Credits (in time order) for older HTB versions: * Stef Coene <stef.coene@docum.org> * HTB support at LARTC mailing list - * Ondrej Kraus, <krauso@barr.cz> + * Ondrej Kraus, <krauso@barr.cz> * found missing INIT_QDISC(htb) * Vladimir Smelhaus, Aamer Akhter, Bert Hubert * helped a lot to locate nasty class stall bug @@ -24,267 +24,169 @@ * Jiri Fojtasek * fixed requeue routine * and many others. thanks. 
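Before the sch_htb.c changes below, one more note on sch_hhf.c: its two buckets are serviced by the weighted deficit round robin described in that file's header comment. The fragment below is a hedged sketch of just that scheduling decision, with the default weights assumed (non-HH = 2, HH = 1) and quantum standing in for the device MTU; it is a simplification of hhf_dequeue(), not a drop-in replacement.

/* Simplified sketch of the weighted DRR decision used by the two HHF buckets. */
struct bucket {
	int deficit;
	int weight;	/* 2 for non-heavy-hitters, 1 for heavy-hitters */
};

static int wdrr_pick(struct bucket *b, int quantum, int head_pkt_len)
{
	/*
	 * A bucket may send while its deficit is positive; when the deficit
	 * runs out it is recharged with weight * quantum and rotated to the
	 * back of the old-buckets list, so non-HH traffic (weight 2) ends up
	 * with roughly twice the bandwidth of the heavy-hitter bucket.
	 */
	if (b->deficit <= 0) {
		b->deficit += b->weight * quantum;
		return 0;			/* rotate bucket, try the next one */
	}
	b->deficit -= head_pkt_len;		/* charge the packet being sent */
	return 1;				/* dequeue the head packet from this bucket */
}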
- * - * $Id: sch_htb.c,v 1.25 2003/12/07 11:08:25 devik Exp devik $ */ -#include <linux/config.h> #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> +#include <linux/moduleparam.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> #include <linux/list.h> #include <linux/compiler.h> -#include <net/sock.h> -#include <net/pkt_sched.h> #include <linux/rbtree.h> +#include <linux/workqueue.h> +#include <linux/slab.h> +#include <net/netlink.h> +#include <net/sch_generic.h> +#include <net/pkt_sched.h> /* HTB algorithm. Author: devik@cdi.cz ======================================================================== HTB is like TBF with multiple classes. It is also similar to CBQ because - it allows to assign priority to each class in hierarchy. + it allows to assign priority to each class in hierarchy. In fact it is another implementation of Floyd's formal sharing. Levels: - Each class is assigned level. Leaf has ALWAYS level 0 and root + Each class is assigned level. Leaf has ALWAYS level 0 and root classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level one less than their parent. */ -#define HTB_HSIZE 16 /* classid hash size */ -#define HTB_EWMAC 2 /* rate average over HTB_EWMAC*HTB_HSIZE sec */ -#undef HTB_DEBUG /* compile debugging support (activated by tc tool) */ -#define HTB_RATECM 1 /* whether to use rate computer */ -#define HTB_HYSTERESIS 1/* whether to use mode hysteresis for speedup */ -#define HTB_QLOCK(S) spin_lock_bh(&(S)->dev->queue_lock) -#define HTB_QUNLOCK(S) spin_unlock_bh(&(S)->dev->queue_lock) -#define HTB_VER 0x30011 /* major must be matched with number suplied by TC as version */ +static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */ +#define HTB_VER 0x30011 /* major must be matched with number suplied by TC as version */ #if HTB_VER >> 16 != TC_HTB_PROTOVER #error "Mismatched sch_htb.c and pkt_sch.h" #endif -/* debugging support; S is subsystem, these are defined: - 0 - netlink messages - 1 - enqueue - 2 - drop & requeue - 3 - dequeue main - 4 - dequeue one prio DRR part - 5 - dequeue class accounting - 6 - class overlimit status computation - 7 - hint tree - 8 - event queue - 10 - rate estimator - 11 - classifier - 12 - fast dequeue cache - - L is level; 0 = none, 1 = basic info, 2 = detailed, 3 = full - q->debug uint32 contains 16 2-bit fields one for subsystem starting - from LSB - */ -#ifdef HTB_DEBUG -#define HTB_DBG_COND(S,L) (((q->debug>>(2*S))&3) >= L) -#define HTB_DBG(S,L,FMT,ARG...) if (HTB_DBG_COND(S,L)) \ - printk(KERN_DEBUG FMT,##ARG) -#define HTB_CHCL(cl) BUG_TRAP((cl)->magic == HTB_CMAGIC) -#define HTB_PASSQ q, -#define HTB_ARGQ struct htb_sched *q, -#define static -#undef __inline__ -#define __inline__ -#undef inline -#define inline -#define HTB_CMAGIC 0xFEFAFEF1 -#define htb_safe_rb_erase(N,R) do { BUG_TRAP((N)->rb_color != -1); \ - if ((N)->rb_color == -1) break; \ - rb_erase(N,R); \ - (N)->rb_color = -1; } while (0) -#else -#define HTB_DBG_COND(S,L) (0) -#define HTB_DBG(S,L,FMT,ARG...) 
-#define HTB_PASSQ -#define HTB_ARGQ -#define HTB_CHCL(cl) -#define htb_safe_rb_erase(N,R) rb_erase(N,R) -#endif +/* Module parameter and sysfs export */ +module_param (htb_hysteresis, int, 0640); +MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate"); +static int htb_rate_est = 0; /* htb classes have a default rate estimator */ +module_param(htb_rate_est, int, 0640); +MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes"); /* used internaly to keep status of single class */ enum htb_cmode { - HTB_CANT_SEND, /* class can't send and can't borrow */ - HTB_MAY_BORROW, /* class can't send but may borrow */ - HTB_CAN_SEND /* class can send */ + HTB_CANT_SEND, /* class can't send and can't borrow */ + HTB_MAY_BORROW, /* class can't send but may borrow */ + HTB_CAN_SEND /* class can send */ }; -/* interior & leaf nodes; props specific to leaves are marked L: */ -struct htb_class -{ -#ifdef HTB_DEBUG - unsigned magic; -#endif - /* general class parameters */ - u32 classid; - struct gnet_stats_basic bstats; - struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; - struct tc_htb_xstats xstats;/* our special stats */ - int refcnt; /* usage count of this class */ - -#ifdef HTB_RATECM - /* rate measurement counters */ - unsigned long rate_bytes,sum_bytes; - unsigned long rate_packets,sum_packets; -#endif +struct htb_prio { + union { + struct rb_root row; + struct rb_root feed; + }; + struct rb_node *ptr; + /* When class changes from state 1->2 and disconnects from + * parent's feed then we lost ptr value and start from the + * first child again. Here we store classid of the + * last valid ptr (used when ptr is NULL). + */ + u32 last_ptr_id; +}; - /* topology */ - int level; /* our level (see above) */ - struct htb_class *parent; /* parent class */ - struct list_head hlist; /* classid hash list item */ - struct list_head sibling; /* sibling list item */ - struct list_head children; /* children list */ - - union { - struct htb_class_leaf { - struct Qdisc *q; - int prio; - int aprio; - int quantum; - int deficit[TC_HTB_MAXDEPTH]; - struct list_head drop_list; - } leaf; - struct htb_class_inner { - struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */ - struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */ - /* When class changes from state 1->2 and disconnects from - parent's feed then we lost ptr value and start from the - first child again. Here we store classid of the - last valid ptr (used when ptr is NULL). */ - u32 last_ptr_id[TC_HTB_NUMPRIO]; - } inner; - } un; - struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ - struct rb_node pq_node; /* node for event queue */ - unsigned long pq_key; /* the same type as jiffies global */ - - int prio_activity; /* for which prios are we active */ - enum htb_cmode cmode; /* current mode of the class */ - - /* class attached filters */ - struct tcf_proto *filter_list; - int filter_cnt; - - int warned; /* only one warning about non work conserving .. 
*/ - - /* token bucket parameters */ - struct qdisc_rate_table *rate; /* rate table of the class itself */ - struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */ - long buffer,cbuffer; /* token bucket depth/rate */ - long mbuffer; /* max wait time */ - long tokens,ctokens; /* current number of tokens */ - psched_time_t t_c; /* checkpoint time */ +/* interior & leaf nodes; props specific to leaves are marked L: + * To reduce false sharing, place mostly read fields at beginning, + * and mostly written ones at the end. + */ +struct htb_class { + struct Qdisc_class_common common; + struct psched_ratecfg rate; + struct psched_ratecfg ceil; + s64 buffer, cbuffer;/* token bucket depth/rate */ + s64 mbuffer; /* max wait time */ + u32 prio; /* these two are used only by leaves... */ + int quantum; /* but stored for parent-to-leaf return */ + + struct tcf_proto *filter_list; /* class attached filters */ + int filter_cnt; + int refcnt; /* usage count of this class */ + + int level; /* our level (see above) */ + unsigned int children; + struct htb_class *parent; /* parent class */ + + struct gnet_stats_rate_est64 rate_est; + + /* + * Written often fields + */ + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; + struct tc_htb_xstats xstats; /* our special stats */ + + /* token bucket parameters */ + s64 tokens, ctokens;/* current number of tokens */ + s64 t_c; /* checkpoint time */ + + union { + struct htb_class_leaf { + struct list_head drop_list; + int deficit[TC_HTB_MAXDEPTH]; + struct Qdisc *q; + } leaf; + struct htb_class_inner { + struct htb_prio clprio[TC_HTB_NUMPRIO]; + } inner; + } un; + s64 pq_key; + + int prio_activity; /* for which prios are we active */ + enum htb_cmode cmode; /* current mode of the class */ + struct rb_node pq_node; /* node for event queue */ + struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ }; -/* TODO: maybe compute rate when size is too large .. or drop ? 
*/ -static __inline__ long L2T(struct htb_class *cl,struct qdisc_rate_table *rate, - int size) -{ - int slot = size >> rate->rate.cell_log; - if (slot > 255) { - cl->xstats.giants++; - slot = 255; - } - return rate->data[slot]; -} +struct htb_level { + struct rb_root wait_pq; + struct htb_prio hprio[TC_HTB_NUMPRIO]; +}; -struct htb_sched -{ - struct list_head root; /* root classes list */ - struct list_head hash[HTB_HSIZE]; /* hashed by classid */ - struct list_head drops[TC_HTB_NUMPRIO]; /* active leaves (for drops) */ - - /* self list - roots of self generating tree */ - struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; - int row_mask[TC_HTB_MAXDEPTH]; - struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; - u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; - - /* self wait list - roots of wait PQs per row */ - struct rb_root wait_pq[TC_HTB_MAXDEPTH]; - - /* time of nearest event per level (row) */ - unsigned long near_ev_cache[TC_HTB_MAXDEPTH]; - - /* cached value of jiffies in dequeue */ - unsigned long jiffies; - - /* whether we hit non-work conserving class during this dequeue; we use */ - int nwc_hit; /* this to disable mindelay complaint in dequeue */ - - int defcls; /* class where unclassified flows go to */ - u32 debug; /* subsystem debug levels */ - - /* filters for qdisc itself */ - struct tcf_proto *filter_list; - int filter_cnt; - - int rate2quantum; /* quant = rate / rate2quantum */ - psched_time_t now; /* cached dequeue time */ - struct timer_list timer; /* send delay timer */ -#ifdef HTB_RATECM - struct timer_list rttim; /* rate computer timer */ - int recmp_bucket; /* which hash bucket to recompute next */ -#endif - - /* non shaped skbs; let them go directly thru */ - struct sk_buff_head direct_queue; - int direct_qlen; /* max qlen of above */ +struct htb_sched { + struct Qdisc_class_hash clhash; + int defcls; /* class where unclassified flows go to */ + int rate2quantum; /* quant = rate / rate2quantum */ - long direct_pkts; -}; + /* filters for qdisc itself */ + struct tcf_proto *filter_list; -/* compute hash of size HTB_HSIZE for given handle */ -static __inline__ int htb_hash(u32 h) -{ -#if HTB_HSIZE != 16 - #error "Declare new hash for your HTB_HSIZE" -#endif - h ^= h>>8; /* stolen from cbq_hash */ - h ^= h>>4; - return h & 0xf; -} +#define HTB_WARN_TOOMANYEVENTS 0x1 + unsigned int warned; /* only one warning */ + int direct_qlen; + struct work_struct work; + + /* non shaped skbs; let them go directly thru */ + struct sk_buff_head direct_queue; + long direct_pkts; + + struct qdisc_watchdog watchdog; + + s64 now; /* cached dequeue time */ + struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */ + + /* time of nearest event per level (row) */ + s64 near_ev_cache[TC_HTB_MAXDEPTH]; + + int row_mask[TC_HTB_MAXDEPTH]; + + struct htb_level hlevel[TC_HTB_MAXDEPTH]; +}; /* find class in global hash table using given handle */ -static __inline__ struct htb_class *htb_find(u32 handle, struct Qdisc *sch) +static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch) { struct htb_sched *q = qdisc_priv(sch); - struct list_head *p; - if (TC_H_MAJ(handle) != sch->handle) + struct Qdisc_class_common *clc; + + clc = qdisc_class_find(&q->clhash, handle); + if (clc == NULL) return NULL; - - list_for_each (p,q->hash+htb_hash(handle)) { - struct htb_class *cl = list_entry(p,struct htb_class,hlist); - if (cl->classid == handle) - return cl; - } - return NULL; + return container_of(clc, struct htb_class, common); } /** @@ -295,17 +197,14 @@ static __inline__ 
struct htb_class *htb_find(u32 handle, struct Qdisc *sch) * We allow direct class selection by classid in priority. The we examine * filters in qdisc and in inner nodes (if higher filter points to the inner * node). If we end up with classid MAJOR:0 we enqueue the skb into special - * internal fifo (direct). These packets then go directly thru. If we still - * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessfull + * internal fifo (direct). These packets then go directly thru. If we still + * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful * then finish and return direct queue. */ -#define HTB_DIRECT (struct htb_class*)-1 -static inline u32 htb_classid(struct htb_class *cl) -{ - return (cl && cl != HTB_DIRECT) ? cl->classid : TC_H_UNSPEC; -} +#define HTB_DIRECT ((struct htb_class *)-1L) -static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl; @@ -314,119 +213,72 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, in int result; /* allow to select class by setting skb->priority to valid classid; - note that nfmark can be used too by attaching filter fw with no - rules in it */ + * note that nfmark can be used too by attaching filter fw with no + * rules in it + */ if (skb->priority == sch->handle) - return HTB_DIRECT; /* X:0 (direct flow) selected */ - if ((cl = htb_find(skb->priority,sch)) != NULL && cl->level == 0) - return cl; + return HTB_DIRECT; /* X:0 (direct flow) selected */ + cl = htb_find(skb->priority, sch); + if (cl) { + if (cl->level == 0) + return cl; + /* Start with inner filter chain if a non-leaf class is selected */ + tcf = cl->filter_list; + } else { + tcf = q->filter_list; + } - *qerr = NET_XMIT_BYPASS; - tcf = q->filter_list; + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: - case TC_ACT_STOLEN: - *qerr = NET_XMIT_SUCCESS; + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; } -#elif defined(CONFIG_NET_CLS_POLICE) - if (result == TC_POLICE_SHOT) - return HTB_DIRECT; #endif - if ((cl = (void*)res.class) == NULL) { + cl = (void *)res.class; + if (!cl) { if (res.classid == sch->handle) - return HTB_DIRECT; /* X:0 (direct flow) */ - if ((cl = htb_find(res.classid,sch)) == NULL) - break; /* filter selected invalid classid */ + return HTB_DIRECT; /* X:0 (direct flow) */ + cl = htb_find(res.classid, sch); + if (!cl) + break; /* filter selected invalid classid */ } if (!cl->level) - return cl; /* we hit leaf; return it */ + return cl; /* we hit leaf; return it */ /* we have got inner class; apply inner filter chain */ tcf = cl->filter_list; } /* classification failed; try to use default class */ - cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle),q->defcls),sch); + cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch); if (!cl || cl->level) - return HTB_DIRECT; /* bad default .. this is safe bet */ + return HTB_DIRECT; /* bad default .. 
this is safe bet */ return cl; } -#ifdef HTB_DEBUG -static void htb_next_rb_node(struct rb_node **n); -#define HTB_DUMTREE(root,memb) if(root) { \ - struct rb_node *n = (root)->rb_node; \ - while (n->rb_left) n = n->rb_left; \ - while (n) { \ - struct htb_class *cl = rb_entry(n, struct htb_class, memb); \ - printk(" %x",cl->classid); htb_next_rb_node (&n); \ - } } - -static void htb_debug_dump (struct htb_sched *q) -{ - int i,p; - printk(KERN_DEBUG "htb*g j=%lu lj=%lu\n",jiffies,q->jiffies); - /* rows */ - for (i=TC_HTB_MAXDEPTH-1;i>=0;i--) { - printk(KERN_DEBUG "htb*r%d m=%x",i,q->row_mask[i]); - for (p=0;p<TC_HTB_NUMPRIO;p++) { - if (!q->row[i][p].rb_node) continue; - printk(" p%d:",p); - HTB_DUMTREE(q->row[i]+p,node[p]); - } - printk("\n"); - } - /* classes */ - for (i = 0; i < HTB_HSIZE; i++) { - struct list_head *l; - list_for_each (l,q->hash+i) { - struct htb_class *cl = list_entry(l,struct htb_class,hlist); - long diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer); - printk(KERN_DEBUG "htb*c%x m=%d t=%ld c=%ld pq=%lu df=%ld ql=%d " - "pa=%x f:", - cl->classid,cl->cmode,cl->tokens,cl->ctokens, - cl->pq_node.rb_color==-1?0:cl->pq_key,diff, - cl->level?0:cl->un.leaf.q->q.qlen,cl->prio_activity); - if (cl->level) - for (p=0;p<TC_HTB_NUMPRIO;p++) { - if (!cl->un.inner.feed[p].rb_node) continue; - printk(" p%d a=%x:",p,cl->un.inner.ptr[p]?rb_entry(cl->un.inner.ptr[p], struct htb_class,node[p])->classid:0); - HTB_DUMTREE(cl->un.inner.feed+p,node[p]); - } - printk("\n"); - } - } -} -#endif /** * htb_add_to_id_tree - adds class to the round robin list * * Routine adds class to the list (actually tree) sorted by classid. * Make sure that class is not already on such list for given prio. */ -static void htb_add_to_id_tree (HTB_ARGQ struct rb_root *root, - struct htb_class *cl,int prio) +static void htb_add_to_id_tree(struct rb_root *root, + struct htb_class *cl, int prio) { struct rb_node **p = &root->rb_node, *parent = NULL; - HTB_DBG(7,3,"htb_add_id_tree cl=%X prio=%d\n",cl->classid,prio); -#ifdef HTB_DEBUG - if (cl->node[prio].rb_color != -1) { BUG_TRAP(0); return; } - HTB_CHCL(cl); - if (*p) { - struct htb_class *x = rb_entry(*p,struct htb_class,node[prio]); - HTB_CHCL(x); - } -#endif + while (*p) { - struct htb_class *c; parent = *p; + struct htb_class *c; + parent = *p; c = rb_entry(parent, struct htb_class, node[prio]); - HTB_CHCL(c); - if (cl->classid > c->classid) + + if (cl->common.classid > c->common.classid) p = &parent->rb_right; - else + else p = &parent->rb_left; } rb_link_node(&cl->node[prio], parent, p); @@ -440,35 +292,30 @@ static void htb_add_to_id_tree (HTB_ARGQ struct rb_root *root, * change its mode in cl->pq_key microseconds. Make sure that class is not * already in the queue. 
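Editor's note: the rewritten htb_classify() above resolves a packet in a fixed order. An skb->priority equal to the qdisc handle (X:0) goes to the direct queue, a priority naming a leaf class is used as-is, otherwise the tc filter chain runs (starting from the selected inner class, if any), and on failure the MAJOR:defcls leaf is tried before falling back to the direct queue. A simplified userspace model of that order follows; toy_find() and the static class table are stand-ins for htb_find() and the class hash, and the filter walk is elided.

#include <stddef.h>
#include <stdint.h>

enum verdict { TO_DIRECT, TO_LEAF };

struct toy_class { uint32_t classid; int level; };

static struct toy_class classes[] = {
        { 0x00010001u, 0 },     /* leaf 1:1 */
        { 0x00010010u, 1 },     /* inner class 1:10 */
};

static struct toy_class *toy_find(uint32_t classid)
{
        for (size_t i = 0; i < sizeof(classes) / sizeof(classes[0]); i++)
                if (classes[i].classid == classid)
                        return &classes[i];
        return NULL;
}

static enum verdict toy_classify(uint32_t skb_priority, uint32_t handle,
                                 uint32_t defcls, struct toy_class **leaf)
{
        struct toy_class *cl;

        if (skb_priority == handle)             /* X:0 selects the direct queue */
                return TO_DIRECT;

        cl = toy_find(skb_priority);
        if (cl && cl->level == 0) {             /* skb->priority named a leaf */
                *leaf = cl;
                return TO_LEAF;
        }
        /* ...the real code runs the tc filter chain(s) here... */

        cl = toy_find((handle & 0xffff0000u) | defcls); /* MAJOR:defcls */
        if (cl && cl->level == 0) {
                *leaf = cl;
                return TO_LEAF;
        }
        return TO_DIRECT;                       /* bad default: safe bet */
}

int main(void)
{
        struct toy_class *leaf = NULL;

        /* an unclassified packet on qdisc 1:0 with defcls 1 ends up in leaf 1:1 */
        return toy_classify(0, 0x00010000u, 1, &leaf) != TO_LEAF;
}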
*/ -static void htb_add_to_wait_tree (struct htb_sched *q, - struct htb_class *cl,long delay,int debug_hint) +static void htb_add_to_wait_tree(struct htb_sched *q, + struct htb_class *cl, s64 delay) { - struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL; - HTB_DBG(7,3,"htb_add_wt cl=%X key=%lu\n",cl->classid,cl->pq_key); -#ifdef HTB_DEBUG - if (cl->pq_node.rb_color != -1) { BUG_TRAP(0); return; } - HTB_CHCL(cl); - if ((delay <= 0 || delay > cl->mbuffer) && net_ratelimit()) - printk(KERN_ERR "HTB: suspicious delay in wait_tree d=%ld cl=%X h=%d\n",delay,cl->classid,debug_hint); -#endif - cl->pq_key = q->jiffies + PSCHED_US2JIFFIE(delay); - if (cl->pq_key == q->jiffies) + struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL; + + cl->pq_key = q->now + delay; + if (cl->pq_key == q->now) cl->pq_key++; /* update the nearest event cache */ - if (time_after(q->near_ev_cache[cl->level], cl->pq_key)) + if (q->near_ev_cache[cl->level] > cl->pq_key) q->near_ev_cache[cl->level] = cl->pq_key; - + while (*p) { - struct htb_class *c; parent = *p; + struct htb_class *c; + parent = *p; c = rb_entry(parent, struct htb_class, pq_node); - if (time_after_eq(cl->pq_key, c->pq_key)) + if (cl->pq_key >= c->pq_key) p = &parent->rb_right; - else + else p = &parent->rb_left; } rb_link_node(&cl->pq_node, parent, p); - rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]); + rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq); } /** @@ -477,7 +324,7 @@ static void htb_add_to_wait_tree (struct htb_sched *q, * When we are past last key we return NULL. * Average complexity is 2 steps per call. */ -static void htb_next_rb_node(struct rb_node **n) +static inline void htb_next_rb_node(struct rb_node **n) { *n = rb_next(*n); } @@ -488,42 +335,53 @@ static void htb_next_rb_node(struct rb_node **n) * The class is added to row at priorities marked in mask. * It does nothing if mask == 0. */ -static inline void htb_add_class_to_row(struct htb_sched *q, - struct htb_class *cl,int mask) +static inline void htb_add_class_to_row(struct htb_sched *q, + struct htb_class *cl, int mask) { - HTB_DBG(7,2,"htb_addrow cl=%X mask=%X rmask=%X\n", - cl->classid,mask,q->row_mask[cl->level]); - HTB_CHCL(cl); q->row_mask[cl->level] |= mask; while (mask) { int prio = ffz(~mask); mask &= ~(1 << prio); - htb_add_to_id_tree(HTB_PASSQ q->row[cl->level]+prio,cl,prio); + htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio); } } +/* If this triggers, it is a bug in this code, but it need not be fatal */ +static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root) +{ + if (RB_EMPTY_NODE(rb)) { + WARN_ON(1); + } else { + rb_erase(rb, root); + RB_CLEAR_NODE(rb); + } +} + + /** * htb_remove_class_from_row - removes class from its row * * The class is removed from row at priorities marked in mask. * It does nothing if mask == 0. 
*/ -static __inline__ void htb_remove_class_from_row(struct htb_sched *q, - struct htb_class *cl,int mask) +static inline void htb_remove_class_from_row(struct htb_sched *q, + struct htb_class *cl, int mask) { int m = 0; - HTB_CHCL(cl); + struct htb_level *hlevel = &q->hlevel[cl->level]; + while (mask) { int prio = ffz(~mask); + struct htb_prio *hprio = &hlevel->hprio[prio]; + mask &= ~(1 << prio); - if (q->ptr[cl->level][prio] == cl->node+prio) - htb_next_rb_node(q->ptr[cl->level]+prio); - htb_safe_rb_erase(cl->node + prio,q->row[cl->level]+prio); - if (!q->row[cl->level][prio].rb_node) + if (hprio->ptr == cl->node + prio) + htb_next_rb_node(&hprio->ptr); + + htb_safe_rb_erase(cl->node + prio, &hprio->row); + if (!hprio->row.rb_node) m |= 1 << prio; } - HTB_DBG(7,2,"htb_delrow cl=%X mask=%X rmask=%X maskdel=%X\n", - cl->classid,mask,q->row_mask[cl->level],m); q->row_mask[cl->level] &= ~m; } @@ -531,115 +389,123 @@ static __inline__ void htb_remove_class_from_row(struct htb_sched *q, * htb_activate_prios - creates active classe's feed chain * * The class is connected to ancestors and/or appropriate rows - * for priorities it is participating on. cl->cmode must be new + * for priorities it is participating on. cl->cmode must be new * (activated) mode. It does nothing if cl->prio_activity == 0. */ -static void htb_activate_prios(struct htb_sched *q,struct htb_class *cl) +static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) { struct htb_class *p = cl->parent; - long m,mask = cl->prio_activity; - HTB_DBG(7,2,"htb_act_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode); - HTB_CHCL(cl); + long m, mask = cl->prio_activity; while (cl->cmode == HTB_MAY_BORROW && p && mask) { - HTB_CHCL(p); - m = mask; while (m) { + m = mask; + while (m) { int prio = ffz(~m); m &= ~(1 << prio); - - if (p->un.inner.feed[prio].rb_node) + + if (p->un.inner.clprio[prio].feed.rb_node) /* parent already has its feed in use so that - reset bit in mask as parent is already ok */ + * reset bit in mask as parent is already ok + */ mask &= ~(1 << prio); - - htb_add_to_id_tree(HTB_PASSQ p->un.inner.feed+prio,cl,prio); + + htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio); } - HTB_DBG(7,3,"htb_act_pr_aft p=%X pact=%X mask=%lX pmode=%d\n", - p->classid,p->prio_activity,mask,p->cmode); p->prio_activity |= mask; - cl = p; p = cl->parent; - HTB_CHCL(cl); + cl = p; + p = cl->parent; + } if (cl->cmode == HTB_CAN_SEND && mask) - htb_add_class_to_row(q,cl,mask); + htb_add_class_to_row(q, cl, mask); } /** * htb_deactivate_prios - remove class from feed chain * - * cl->cmode must represent old mode (before deactivation). It does + * cl->cmode must represent old mode (before deactivation). It does * nothing if cl->prio_activity == 0. Class is removed from all feed * chains and rows. 
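Editor's note: both the row and feed maintenance above iterate a priority bitmask with the ffz(~mask) idiom, i.e. "index of the lowest set bit of mask", clearing each bit as it is handled. A tiny userspace equivalent, using the GCC/Clang builtin in place of the kernel's ffz():

#include <stdio.h>

int main(void)
{
        unsigned int mask = 0x29;       /* priorities 0, 3 and 5 active */

        while (mask) {
                int prio = __builtin_ctz(mask); /* == ffz(~mask) in the kernel */

                mask &= ~(1u << prio);
                printf("handle priority %d\n", prio);
        }
        return 0;
}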
*/ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) { struct htb_class *p = cl->parent; - long m,mask = cl->prio_activity; - HTB_DBG(7,2,"htb_deact_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode); - HTB_CHCL(cl); + long m, mask = cl->prio_activity; while (cl->cmode == HTB_MAY_BORROW && p && mask) { - m = mask; mask = 0; + m = mask; + mask = 0; while (m) { int prio = ffz(~m); m &= ~(1 << prio); - - if (p->un.inner.ptr[prio] == cl->node+prio) { + + if (p->un.inner.clprio[prio].ptr == cl->node + prio) { /* we are removing child which is pointed to from - parent feed - forget the pointer but remember - classid */ - p->un.inner.last_ptr_id[prio] = cl->classid; - p->un.inner.ptr[prio] = NULL; + * parent feed - forget the pointer but remember + * classid + */ + p->un.inner.clprio[prio].last_ptr_id = cl->common.classid; + p->un.inner.clprio[prio].ptr = NULL; } - - htb_safe_rb_erase(cl->node + prio,p->un.inner.feed + prio); - - if (!p->un.inner.feed[prio].rb_node) + + htb_safe_rb_erase(cl->node + prio, + &p->un.inner.clprio[prio].feed); + + if (!p->un.inner.clprio[prio].feed.rb_node) mask |= 1 << prio; } - HTB_DBG(7,3,"htb_deact_pr_aft p=%X pact=%X mask=%lX pmode=%d\n", - p->classid,p->prio_activity,mask,p->cmode); + p->prio_activity &= ~mask; - cl = p; p = cl->parent; - HTB_CHCL(cl); + cl = p; + p = cl->parent; + } - if (cl->cmode == HTB_CAN_SEND && mask) - htb_remove_class_from_row(q,cl,mask); + if (cl->cmode == HTB_CAN_SEND && mask) + htb_remove_class_from_row(q, cl, mask); +} + +static inline s64 htb_lowater(const struct htb_class *cl) +{ + if (htb_hysteresis) + return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0; + else + return 0; } +static inline s64 htb_hiwater(const struct htb_class *cl) +{ + if (htb_hysteresis) + return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0; + else + return 0; +} + /** * htb_class_mode - computes and returns current class mode * * It computes cl's mode at time cl->t_c+diff and returns it. If mode * is not HTB_CAN_SEND then cl->pq_key is updated to time difference - * from now to time when cl will change its state. + * from now to time when cl will change its state. * Also it is worth to note that class mode doesn't change simply - * at cl->{c,}tokens == 0 but there can rather be hysteresis of + * at cl->{c,}tokens == 0 but there can rather be hysteresis of * 0 .. -cl->{c,}buffer range. It is meant to limit number of * mode transitions per time unit. The speed gain is about 1/6. */ -static __inline__ enum htb_cmode -htb_class_mode(struct htb_class *cl,long *diff) +static inline enum htb_cmode +htb_class_mode(struct htb_class *cl, s64 *diff) { - long toks; + s64 toks; - if ((toks = (cl->ctokens + *diff)) < ( -#if HTB_HYSTERESIS - cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : -#endif - 0)) { - *diff = -toks; - return HTB_CANT_SEND; - } - if ((toks = (cl->tokens + *diff)) >= ( -#if HTB_HYSTERESIS - cl->cmode == HTB_CAN_SEND ? -cl->buffer : -#endif - 0)) - return HTB_CAN_SEND; + if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) { + *diff = -toks; + return HTB_CANT_SEND; + } - *diff = -toks; - return HTB_MAY_BORROW; + if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl)) + return HTB_CAN_SEND; + + *diff = -toks; + return HTB_MAY_BORROW; } /** @@ -651,167 +517,118 @@ htb_class_mode(struct htb_class *cl,long *diff) * be different from old one and cl->pq_key has to be valid if changing * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree). 
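Editor's note: htb_class_mode() together with the new htb_lowater()/htb_hiwater() helpers decides among the three modes from the two token buckets; with the htb_hysteresis module parameter set, the thresholds move into the 0 .. -{c,}buffer band so a class does not flip modes on every packet. A userspace sketch of the same decision, with s64 replaced by int64_t and the module parameter modeled as a plain int:

#include <stdint.h>

enum toy_cmode { CANT_SEND, MAY_BORROW, CAN_SEND };

struct toy_class {
        int64_t tokens, ctokens;        /* rate and ceil buckets */
        int64_t buffer, cbuffer;        /* bucket depths */
        enum toy_cmode cmode;
};

static int hysteresis = 1;              /* models the htb_hysteresis parameter */

static int64_t lowater(const struct toy_class *cl)
{
        return hysteresis && cl->cmode != CANT_SEND ? -cl->cbuffer : 0;
}

static int64_t hiwater(const struct toy_class *cl)
{
        return hysteresis && cl->cmode == CAN_SEND ? -cl->buffer : 0;
}

/* 'diff' is the time credit since the last checkpoint; on a non-CAN_SEND
 * result it is overwritten with the shortfall, which the callers turn into
 * the delay queued on the level's wait_pq */
static enum toy_cmode class_mode(struct toy_class *cl, int64_t *diff)
{
        int64_t toks;

        if ((toks = cl->ctokens + *diff) < lowater(cl)) {
                *diff = -toks;
                return CANT_SEND;       /* over ceil: may not even borrow */
        }
        if ((toks = cl->tokens + *diff) >= hiwater(cl))
                return CAN_SEND;        /* within rate: sends on its own */

        *diff = -toks;
        return MAY_BORROW;              /* over rate, under ceil: must borrow */
}

int main(void)
{
        struct toy_class cl = { .tokens = -10, .ctokens = 50,
                                .buffer = 100, .cbuffer = 100,
                                .cmode = CAN_SEND };
        int64_t diff = 0;

        /* stays CAN_SEND thanks to hysteresis; would flip to MAY_BORROW
         * (and soon back) with hysteresis disabled */
        return class_mode(&cl, &diff) != CAN_SEND;
}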
*/ -static void -htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff) -{ - enum htb_cmode new_mode = htb_class_mode(cl,diff); - - HTB_CHCL(cl); - HTB_DBG(7,1,"htb_chging_clmode %d->%d cl=%X\n",cl->cmode,new_mode,cl->classid); +static void +htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) +{ + enum htb_cmode new_mode = htb_class_mode(cl, diff); if (new_mode == cl->cmode) - return; - - if (cl->prio_activity) { /* not necessary: speed optimization */ - if (cl->cmode != HTB_CANT_SEND) - htb_deactivate_prios(q,cl); + return; + + if (cl->prio_activity) { /* not necessary: speed optimization */ + if (cl->cmode != HTB_CANT_SEND) + htb_deactivate_prios(q, cl); cl->cmode = new_mode; - if (new_mode != HTB_CANT_SEND) - htb_activate_prios(q,cl); - } else + if (new_mode != HTB_CANT_SEND) + htb_activate_prios(q, cl); + } else cl->cmode = new_mode; } /** - * htb_activate - inserts leaf cl into appropriate active feeds + * htb_activate - inserts leaf cl into appropriate active feeds * * Routine learns (new) priority of leaf and activates feed chain * for the prio. It can be called on already active leaf safely. * It also adds leaf into droplist. */ -static __inline__ void htb_activate(struct htb_sched *q,struct htb_class *cl) +static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) { - BUG_TRAP(!cl->level && cl->un.leaf.q && cl->un.leaf.q->q.qlen); - HTB_CHCL(cl); + WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen); + if (!cl->prio_activity) { - cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio); - htb_activate_prios(q,cl); - list_add_tail(&cl->un.leaf.drop_list,q->drops+cl->un.leaf.aprio); + cl->prio_activity = 1 << cl->prio; + htb_activate_prios(q, cl); + list_add_tail(&cl->un.leaf.drop_list, + q->drops + cl->prio); } } /** - * htb_deactivate - remove leaf cl from active feeds + * htb_deactivate - remove leaf cl from active feeds * * Make sure that leaf is active. In the other words it can't be called * with non-active leaf. It also removes class from the drop list. 
*/ -static __inline__ void -htb_deactivate(struct htb_sched *q,struct htb_class *cl) +static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) { - BUG_TRAP(cl->prio_activity); - HTB_CHCL(cl); - htb_deactivate_prios(q,cl); + WARN_ON(!cl->prio_activity); + + htb_deactivate_prios(q, cl); cl->prio_activity = 0; list_del_init(&cl->un.leaf.drop_list); } static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - int ret; - struct htb_sched *q = qdisc_priv(sch); - struct htb_class *cl = htb_classify(skb,sch,&ret); - - if (cl == HTB_DIRECT) { - /* enqueue to helper queue */ - if (q->direct_queue.qlen < q->direct_qlen) { - __skb_queue_tail(&q->direct_queue, skb); - q->direct_pkts++; - } else { - kfree_skb(skb); - sch->qstats.drops++; - return NET_XMIT_DROP; - } + int uninitialized_var(ret); + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = htb_classify(skb, sch, &ret); + + if (cl == HTB_DIRECT) { + /* enqueue to helper queue */ + if (q->direct_queue.qlen < q->direct_qlen) { + __skb_queue_tail(&q->direct_queue, skb); + q->direct_pkts++; + } else { + return qdisc_drop(skb, sch); + } #ifdef CONFIG_NET_CLS_ACT - } else if (!cl) { - if (ret == NET_XMIT_BYPASS) - sch->qstats.drops++; - kfree_skb (skb); - return ret; + } else if (!cl) { + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; #endif - } else if (cl->un.leaf.q->enqueue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) { - sch->qstats.drops++; - cl->qstats.drops++; - return NET_XMIT_DROP; - } else { - cl->bstats.packets++; cl->bstats.bytes += skb->len; - htb_activate (q,cl); - } - - sch->q.qlen++; - sch->bstats.packets++; sch->bstats.bytes += skb->len; - HTB_DBG(1,1,"htb_enq_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb); - return NET_XMIT_SUCCESS; -} - -/* TODO: requeuing packet charges it to policers again !! 
*/ -static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch) -{ - struct htb_sched *q = qdisc_priv(sch); - int ret = NET_XMIT_SUCCESS; - struct htb_class *cl = htb_classify(skb,sch, &ret); - struct sk_buff *tskb; - - if (cl == HTB_DIRECT || !cl) { - /* enqueue to helper queue */ - if (q->direct_queue.qlen < q->direct_qlen && cl) { - __skb_queue_head(&q->direct_queue, skb); + } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) { + sch->qstats.drops++; + cl->qstats.drops++; + } + return ret; } else { - __skb_queue_head(&q->direct_queue, skb); - tskb = __skb_dequeue_tail(&q->direct_queue); - kfree_skb (tskb); - sch->qstats.drops++; - return NET_XMIT_CN; + htb_activate(q, cl); } - } else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) { - sch->qstats.drops++; - cl->qstats.drops++; - return NET_XMIT_DROP; - } else - htb_activate (q,cl); - - sch->q.qlen++; - sch->qstats.requeues++; - HTB_DBG(1,1,"htb_req_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb); - return NET_XMIT_SUCCESS; + + sch->q.qlen++; + return NET_XMIT_SUCCESS; } -static void htb_timer(unsigned long arg) +static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff) { - struct Qdisc *sch = (struct Qdisc*)arg; - sch->flags &= ~TCQ_F_THROTTLED; - wmb(); - netif_schedule(sch->dev); + s64 toks = diff + cl->tokens; + + if (toks > cl->buffer) + toks = cl->buffer; + toks -= (s64) psched_l2t_ns(&cl->rate, bytes); + if (toks <= -cl->mbuffer) + toks = 1 - cl->mbuffer; + + cl->tokens = toks; } -#ifdef HTB_RATECM -#define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0 -static void htb_rate_timer(unsigned long arg) +static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff) { - struct Qdisc *sch = (struct Qdisc*)arg; - struct htb_sched *q = qdisc_priv(sch); - struct list_head *p; - - /* lock queue so that we can muck with it */ - HTB_QLOCK(sch); - HTB_DBG(10,1,"htb_rttmr j=%ld\n",jiffies); - - q->rttim.expires = jiffies + HZ; - add_timer(&q->rttim); - - /* scan and recompute one bucket at time */ - if (++q->recmp_bucket >= HTB_HSIZE) - q->recmp_bucket = 0; - list_for_each (p,q->hash+q->recmp_bucket) { - struct htb_class *cl = list_entry(p,struct htb_class,hlist); - HTB_DBG(10,2,"htb_rttmr_cl cl=%X sbyte=%lu spkt=%lu\n", - cl->classid,cl->sum_bytes,cl->sum_packets); - RT_GEN (cl->sum_bytes,cl->rate_bytes); - RT_GEN (cl->sum_packets,cl->rate_packets); - } - HTB_QUNLOCK(sch); + s64 toks = diff + cl->ctokens; + + if (toks > cl->cbuffer) + toks = cl->cbuffer; + toks -= (s64) psched_l2t_ns(&cl->ceil, bytes); + if (toks <= -cl->mbuffer) + toks = 1 - cl->mbuffer; + + cl->ctokens = toks; } -#endif /** * htb_charge_class - charges amount "bytes" to leaf and ancestors @@ -824,68 +641,40 @@ static void htb_rate_timer(unsigned long arg) * CAN_SEND) because we can use more precise clock that event queue here. * In such case we remove class from event queue first. 
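Editor's note: htb_accnt_tokens() and htb_accnt_ctokens() above apply the same update to the rate and ceil buckets: credit the elapsed time, cap at the bucket depth, debit the transmission time of the packet, and never let the bucket sink below 1 - mbuffer. The sketch below models psched_l2t_ns() as a plain bytes-to-nanoseconds division; the kernel uses a precomputed multiply-and-shift that approximates the same value.

#include <stdint.h>

#define NSEC_PER_SEC 1000000000LL

/* time needed to send 'bytes' at 'rate_bps' bytes per second */
static int64_t l2t_ns(uint64_t rate_bps, unsigned int bytes)
{
        return (int64_t)((uint64_t)bytes * NSEC_PER_SEC / rate_bps);
}

/* one bucket update as in htb_accnt_tokens()/htb_accnt_ctokens();
 * 'diff' is the time since the last checkpoint, already capped at mbuffer */
static int64_t account(int64_t tokens, int64_t diff, int64_t buffer,
                       int64_t mbuffer, uint64_t rate_bps, unsigned int bytes)
{
        int64_t toks = diff + tokens;

        if (toks > buffer)              /* a bucket never holds more than its depth */
                toks = buffer;
        toks -= l2t_ns(rate_bps, bytes);        /* pay for this packet */
        if (toks <= -mbuffer)           /* and never sinks below 1 - mbuffer */
                toks = 1 - mbuffer;
        return toks;
}

int main(void)
{
        /* a 1500-byte packet at 1 MB/s costs 1.5 ms worth of tokens */
        return account(0, 0, 2 * NSEC_PER_SEC, 60 * NSEC_PER_SEC,
                       1000000, 1500) != -1500000;
}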
*/ -static void htb_charge_class(struct htb_sched *q,struct htb_class *cl, - int level,int bytes) -{ - long toks,diff; +static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, + int level, struct sk_buff *skb) +{ + int bytes = qdisc_pkt_len(skb); enum htb_cmode old_mode; - HTB_DBG(5,1,"htb_chrg_cl cl=%X lev=%d len=%d\n",cl->classid,level,bytes); - -#define HTB_ACCNT(T,B,R) toks = diff + cl->T; \ - if (toks > cl->B) toks = cl->B; \ - toks -= L2T(cl, cl->R, bytes); \ - if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \ - cl->T = toks + s64 diff; while (cl) { - HTB_CHCL(cl); - diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer); -#ifdef HTB_DEBUG - if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) { - if (net_ratelimit()) - printk(KERN_ERR "HTB: bad diff in charge, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n", - cl->classid, diff, -#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY - q->now.tv_sec * 1000000ULL + q->now.tv_usec, - cl->t_c.tv_sec * 1000000ULL + cl->t_c.tv_usec, -#else - (unsigned long long) q->now, - (unsigned long long) cl->t_c, -#endif - q->jiffies); - diff = 1000; - } -#endif + diff = min_t(s64, q->now - cl->t_c, cl->mbuffer); if (cl->level >= level) { - if (cl->level == level) cl->xstats.lends++; - HTB_ACCNT (tokens,buffer,rate); + if (cl->level == level) + cl->xstats.lends++; + htb_accnt_tokens(cl, bytes, diff); } else { cl->xstats.borrows++; - cl->tokens += diff; /* we moved t_c; update tokens */ + cl->tokens += diff; /* we moved t_c; update tokens */ } - HTB_ACCNT (ctokens,cbuffer,ceil); + htb_accnt_ctokens(cl, bytes, diff); cl->t_c = q->now; - HTB_DBG(5,2,"htb_chrg_clp cl=%X diff=%ld tok=%ld ctok=%ld\n",cl->classid,diff,cl->tokens,cl->ctokens); - old_mode = cl->cmode; diff = 0; - htb_change_class_mode(q,cl,&diff); + old_mode = cl->cmode; + diff = 0; + htb_change_class_mode(q, cl, &diff); if (old_mode != cl->cmode) { if (old_mode != HTB_CAN_SEND) - htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level); + htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq); if (cl->cmode != HTB_CAN_SEND) - htb_add_to_wait_tree (q,cl,diff,1); + htb_add_to_wait_tree(q, cl, diff); } - -#ifdef HTB_RATECM - /* update rate counters */ - cl->sum_bytes += bytes; cl->sum_packets++; -#endif - /* update byte stats except for leaves which are already updated */ - if (cl->level) { - cl->bstats.bytes += bytes; - cl->bstats.packets++; - } + /* update basic stats except for leaves which are already updated */ + if (cl->level) + bstats_update(&cl->bstats, skb); + cl = cl->parent; } } @@ -893,69 +682,66 @@ static void htb_charge_class(struct htb_sched *q,struct htb_class *cl, /** * htb_do_events - make mode changes to classes at the level * - * Scans event queue for pending events and applies them. Returns jiffies to - * next pending event (0 for no event in pq). - * Note: Aplied are events whose have cl->pq_key <= jiffies. + * Scans event queue for pending events and applies them. Returns time of + * next pending event (0 for no event in pq, q->now for too many events). + * Note: Applied are events whose have cl->pq_key <= q->now. 
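Editor's note: htb_charge_class() walks from the leaf to the root. Every ancestor is charged against its ceil bucket, but only ancestors at or above the level the packet was dequeued from are charged against their rate bucket and counted as lenders; the levels below only advance their checkpoint and count a borrow. A self-contained sketch of that walk over a parent-linked chain; the two cost helpers are placeholders for psched_l2t_ns() on the class rate/ceil, and the clamping and mode handling of the real code are elided.

#include <stdint.h>

struct toy_class {
        struct toy_class *parent;
        int level;                      /* 0 = leaf, root has the highest level */
        int64_t tokens, ctokens;        /* rate and ceil buckets (ns) */
        int64_t t_c;                    /* last checkpoint (ns) */
        unsigned long lends, borrows;
};

/* placeholder costs: ~1 Gbit/s rate, ~2 Gbit/s ceil (8 resp. 4 ns per byte) */
static int64_t rate_cost(unsigned int bytes) { return (int64_t)bytes * 8; }
static int64_t ceil_cost(unsigned int bytes) { return (int64_t)bytes * 4; }

/* ancestor walk as in htb_charge_class(); 'level' is the level the packet
 * was actually dequeued from, i.e. the lender's level */
static void charge(struct toy_class *cl, int level, unsigned int bytes,
                   int64_t now)
{
        for (; cl; cl = cl->parent) {
                int64_t diff = now - cl->t_c;   /* the kernel caps this at mbuffer */

                if (cl->level >= level) {
                        if (cl->level == level)
                                cl->lends++;    /* this class supplied the bandwidth */
                        cl->tokens += diff - rate_cost(bytes);
                } else {
                        cl->borrows++;          /* ran on borrowed bandwidth */
                        cl->tokens += diff;     /* just move the checkpoint */
                }
                cl->ctokens += diff - ceil_cost(bytes); /* ceil is always charged */
                cl->t_c = now;
        }
}

int main(void)
{
        struct toy_class root = { .level = 1 };
        struct toy_class leaf = { .parent = &root, .level = 0 };

        /* the packet went out at the root's level: the leaf borrows, the root lends */
        charge(&leaf, 1, 1500, 1000);
        return !(leaf.borrows == 1 && root.lends == 1);
}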
*/ -static long htb_do_events(struct htb_sched *q,int level) +static s64 htb_do_events(struct htb_sched *q, const int level, + unsigned long start) { - int i; - HTB_DBG(8,1,"htb_do_events l=%d root=%p rmask=%X\n", - level,q->wait_pq[level].rb_node,q->row_mask[level]); - for (i = 0; i < 500; i++) { + /* don't run for longer than 2 jiffies; 2 is used instead of + * 1 to simplify things when jiffy is going to be incremented + * too soon + */ + unsigned long stop_at = start + 2; + struct rb_root *wait_pq = &q->hlevel[level].wait_pq; + + while (time_before(jiffies, stop_at)) { struct htb_class *cl; - long diff; - struct rb_node *p = q->wait_pq[level].rb_node; - if (!p) return 0; - while (p->rb_left) p = p->rb_left; + s64 diff; + struct rb_node *p = rb_first(wait_pq); + + if (!p) + return 0; cl = rb_entry(p, struct htb_class, pq_node); - if (time_after(cl->pq_key, q->jiffies)) { - HTB_DBG(8,3,"htb_do_ev_ret delay=%ld\n",cl->pq_key - q->jiffies); - return cl->pq_key - q->jiffies; - } - htb_safe_rb_erase(p,q->wait_pq+level); - diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer); -#ifdef HTB_DEBUG - if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) { - if (net_ratelimit()) - printk(KERN_ERR "HTB: bad diff in events, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n", - cl->classid, diff, -#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY - q->now.tv_sec * 1000000ULL + q->now.tv_usec, - cl->t_c.tv_sec * 1000000ULL + cl->t_c.tv_usec, -#else - (unsigned long long) q->now, - (unsigned long long) cl->t_c, -#endif - q->jiffies); - diff = 1000; - } -#endif - htb_change_class_mode(q,cl,&diff); + if (cl->pq_key > q->now) + return cl->pq_key; + + htb_safe_rb_erase(p, wait_pq); + diff = min_t(s64, q->now - cl->t_c, cl->mbuffer); + htb_change_class_mode(q, cl, &diff); if (cl->cmode != HTB_CAN_SEND) - htb_add_to_wait_tree (q,cl,diff,2); + htb_add_to_wait_tree(q, cl, diff); } - if (net_ratelimit()) - printk(KERN_WARNING "htb: too many events !\n"); - return HZ/10; + + /* too much load - let's continue after a break for scheduling */ + if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) { + pr_warn("htb: too many events!\n"); + q->warned |= HTB_WARN_TOOMANYEVENTS; + } + + return q->now; } /* Returns class->node+prio from id-tree where classe's id is >= id. NULL - is no such one exists. */ -static struct rb_node * -htb_id_find_next_upper(int prio,struct rb_node *n,u32 id) + * is no such one exists. + */ +static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n, + u32 id) { struct rb_node *r = NULL; while (n) { - struct htb_class *cl = rb_entry(n,struct htb_class,node[prio]); - if (id == cl->classid) return n; - - if (id > cl->classid) { + struct htb_class *cl = + rb_entry(n, struct htb_class, node[prio]); + + if (id > cl->common.classid) { n = n->rb_right; - } else { + } else if (id < cl->common.classid) { r = n; n = n->rb_left; + } else { + return n; } } return r; @@ -966,226 +752,212 @@ htb_id_find_next_upper(int prio,struct rb_node *n,u32 id) * * Find leaf where current feed pointers points to. 
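Editor's note: the reworked htb_do_events() above processes at most about two jiffies worth of pending mode changes: events with pq_key <= q->now are applied, the time of the first future event is returned (0 if the queue is empty), and q->now is returned when the budget runs out so the caller reschedules immediately. A toy model with the wait_pq rbtree replaced by a sorted array, and jiffies held constant for brevity (the kernel re-reads it every pass):

#include <stdint.h>
#include <stdio.h>

#define BUDGET 2        /* jiffies: never spend more than ~2 ticks here */

struct toy_event { int64_t key; int classid; };  /* key mirrors cl->pq_key */

static int64_t do_events(struct toy_event *events, int *nr, int64_t now,
                         unsigned long jiffies_now, unsigned long start)
{
        while (jiffies_now - start < BUDGET) {
                if (*nr == 0)
                        return 0;               /* nothing pending at this level */
                if (events[0].key > now)
                        return events[0].key;   /* nearest event is in the future */

                printf("class %x changes mode\n", events[0].classid);
                /* pop the head; the real code recomputes the class mode and
                 * may re-insert it with a new key */
                for (int i = 1; i < *nr; i++)
                        events[i - 1] = events[i];
                (*nr)--;
        }
        return now;     /* budget exhausted: caller reschedules right away */
}

int main(void)
{
        struct toy_event ev[] = { { 100, 0x10001 }, { 250, 0x10002 } };
        int nr = 2;

        return do_events(ev, &nr, 200, 0, 0) != 250;
}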
*/ -static struct htb_class * -htb_lookup_leaf(HTB_ARGQ struct rb_root *tree,int prio,struct rb_node **pptr,u32 *pid) +static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio) { int i; struct { struct rb_node *root; struct rb_node **pptr; u32 *pid; - } stk[TC_HTB_MAXDEPTH],*sp = stk; - - BUG_TRAP(tree->rb_node); - sp->root = tree->rb_node; - sp->pptr = pptr; - sp->pid = pid; + } stk[TC_HTB_MAXDEPTH], *sp = stk; + + BUG_ON(!hprio->row.rb_node); + sp->root = hprio->row.rb_node; + sp->pptr = &hprio->ptr; + sp->pid = &hprio->last_ptr_id; for (i = 0; i < 65535; i++) { - HTB_DBG(4,2,"htb_lleaf ptr=%p pid=%X\n",*sp->pptr,*sp->pid); - - if (!*sp->pptr && *sp->pid) { - /* ptr was invalidated but id is valid - try to recover - the original or next ptr */ - *sp->pptr = htb_id_find_next_upper(prio,sp->root,*sp->pid); + if (!*sp->pptr && *sp->pid) { + /* ptr was invalidated but id is valid - try to recover + * the original or next ptr + */ + *sp->pptr = + htb_id_find_next_upper(prio, sp->root, *sp->pid); } - *sp->pid = 0; /* ptr is valid now so that remove this hint as it - can become out of date quickly */ - if (!*sp->pptr) { /* we are at right end; rewind & go up */ + *sp->pid = 0; /* ptr is valid now so that remove this hint as it + * can become out of date quickly + */ + if (!*sp->pptr) { /* we are at right end; rewind & go up */ *sp->pptr = sp->root; - while ((*sp->pptr)->rb_left) + while ((*sp->pptr)->rb_left) *sp->pptr = (*sp->pptr)->rb_left; if (sp > stk) { sp--; - BUG_TRAP(*sp->pptr); if(!*sp->pptr) return NULL; - htb_next_rb_node (sp->pptr); + if (!*sp->pptr) { + WARN_ON(1); + return NULL; + } + htb_next_rb_node(sp->pptr); } } else { struct htb_class *cl; - cl = rb_entry(*sp->pptr,struct htb_class,node[prio]); - HTB_CHCL(cl); - if (!cl->level) + struct htb_prio *clp; + + cl = rb_entry(*sp->pptr, struct htb_class, node[prio]); + if (!cl->level) return cl; - (++sp)->root = cl->un.inner.feed[prio].rb_node; - sp->pptr = cl->un.inner.ptr+prio; - sp->pid = cl->un.inner.last_ptr_id+prio; + clp = &cl->un.inner.clprio[prio]; + (++sp)->root = clp->feed.rb_node; + sp->pptr = &clp->ptr; + sp->pid = &clp->last_ptr_id; } } - BUG_TRAP(0); + WARN_ON(1); return NULL; } /* dequeues packet at given priority and level; call only if - you are sure that there is active class at prio/level */ -static struct sk_buff * -htb_dequeue_tree(struct htb_sched *q,int prio,int level) + * you are sure that there is active class at prio/level + */ +static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio, + const int level) { struct sk_buff *skb = NULL; - struct htb_class *cl,*start; + struct htb_class *cl, *start; + struct htb_level *hlevel = &q->hlevel[level]; + struct htb_prio *hprio = &hlevel->hprio[prio]; + /* look initial class up in the row */ - start = cl = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,prio, - q->ptr[level]+prio,q->last_ptr_id[level]+prio); - + start = cl = htb_lookup_leaf(hprio, prio); + do { next: - BUG_TRAP(cl); - if (!cl) return NULL; - HTB_DBG(4,1,"htb_deq_tr prio=%d lev=%d cl=%X defic=%d\n", - prio,level,cl->classid,cl->un.leaf.deficit[level]); + if (unlikely(!cl)) + return NULL; /* class can be empty - it is unlikely but can be true if leaf - qdisc drops packets in enqueue routine or if someone used - graft operation on the leaf since last dequeue; - simply deactivate and skip such class */ + * qdisc drops packets in enqueue routine or if someone used + * graft operation on the leaf since last dequeue; + * simply deactivate and skip such class + */ if 
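Editor's note: htb_id_find_next_upper() is the recovery path for the last_ptr_id hint. When the round-robin pointer was invalidated because the class it pointed at left the feed, the walk resumes at the smallest classid >= the remembered id instead of restarting from the first child. The same lower-bound search over a sorted array, as a userspace sketch:

#include <stdint.h>

/* classids[] is sorted ascending, like the per-prio rbtree keyed on classid;
 * return the index of the smallest classid >= id, or -1 if there is none */
static int find_next_upper(const uint32_t *classids, int n, uint32_t id)
{
        int lo = 0, hi = n, best = -1;

        while (lo < hi) {
                int mid = lo + (hi - lo) / 2;

                if (classids[mid] >= id) {
                        best = mid;
                        hi = mid;
                } else {
                        lo = mid + 1;
                }
        }
        return best;
}

int main(void)
{
        const uint32_t ids[] = { 0x10001, 0x10007 };    /* 1:1 and 1:7 remain */

        /* 1:3 was the current pointer and has been removed from the feed;
         * resuming from its remembered classid lands on 1:7 */
        return find_next_upper(ids, 2, 0x10003) != 1;
}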
(unlikely(cl->un.leaf.q->q.qlen == 0)) { struct htb_class *next; - htb_deactivate(q,cl); + htb_deactivate(q, cl); /* row/level might become empty */ if ((q->row_mask[level] & (1 << prio)) == 0) - return NULL; - - next = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio, - prio,q->ptr[level]+prio,q->last_ptr_id[level]+prio); + return NULL; + + next = htb_lookup_leaf(hprio, prio); - if (cl == start) /* fix start if we just deleted it */ + if (cl == start) /* fix start if we just deleted it */ start = next; cl = next; goto next; } - - if (likely((skb = cl->un.leaf.q->dequeue(cl->un.leaf.q)) != NULL)) + + skb = cl->un.leaf.q->dequeue(cl->un.leaf.q); + if (likely(skb != NULL)) break; - if (!cl->warned) { - printk(KERN_WARNING "htb: class %X isn't work conserving ?!\n",cl->classid); - cl->warned = 1; - } - q->nwc_hit++; - htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio); - cl = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,prio,q->ptr[level]+prio, - q->last_ptr_id[level]+prio); + + qdisc_warn_nonwc("htb", cl->un.leaf.q); + htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr: + &q->hlevel[0].hprio[prio].ptr); + cl = htb_lookup_leaf(hprio, prio); } while (cl != start); if (likely(skb != NULL)) { - if ((cl->un.leaf.deficit[level] -= skb->len) < 0) { - HTB_DBG(4,2,"htb_next_cl oldptr=%p quant_add=%d\n", - level?cl->parent->un.inner.ptr[prio]:q->ptr[0][prio],cl->un.leaf.quantum); - cl->un.leaf.deficit[level] += cl->un.leaf.quantum; - htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio); + bstats_update(&cl->bstats, skb); + cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb); + if (cl->un.leaf.deficit[level] < 0) { + cl->un.leaf.deficit[level] += cl->quantum; + htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr : + &q->hlevel[0].hprio[prio].ptr); } /* this used to be after charge_class but this constelation - gives us slightly better performance */ + * gives us slightly better performance + */ if (!cl->un.leaf.q->q.qlen) - htb_deactivate (q,cl); - htb_charge_class (q,cl,level,skb->len); + htb_deactivate(q, cl); + htb_charge_class(q, cl, level, skb); } return skb; } -static void htb_delay_by(struct Qdisc *sch,long delay) -{ - struct htb_sched *q = qdisc_priv(sch); - if (delay <= 0) delay = 1; - if (unlikely(delay > 5*HZ)) { - if (net_ratelimit()) - printk(KERN_INFO "HTB delay %ld > 5sec\n", delay); - delay = 5*HZ; - } - /* why don't use jiffies here ? because expires can be in past */ - mod_timer(&q->timer, q->jiffies + delay); - sch->flags |= TCQ_F_THROTTLED; - sch->qstats.overlimits++; - HTB_DBG(3,1,"htb_deq t_delay=%ld\n",delay); -} - static struct sk_buff *htb_dequeue(struct Qdisc *sch) { - struct sk_buff *skb = NULL; + struct sk_buff *skb; struct htb_sched *q = qdisc_priv(sch); int level; - long min_delay; -#ifdef HTB_DEBUG - int evs_used = 0; -#endif - - q->jiffies = jiffies; - HTB_DBG(3,1,"htb_deq dircnt=%d qlen=%d\n",skb_queue_len(&q->direct_queue), - sch->q.qlen); + s64 next_event; + unsigned long start_at; /* try to dequeue direct packets as high prio (!) 
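Editor's note: the dequeue path above applies deficit-round-robin accounting per level. Each dequeued packet is subtracted from the leaf's deficit, and once the deficit goes negative the leaf is granted another quantum and the round-robin pointer advances. A minimal sketch of that accounting (the quantum value is arbitrary, roughly one Ethernet frame):

#include <stdio.h>

struct toy_leaf { int deficit; int quantum; };

/* deficit accounting as in htb_dequeue_tree(): a leaf keeps sending until it
 * has used up its quantum at this level, then the RR pointer moves on */
static int send_one(struct toy_leaf *leaf, int pkt_len)
{
        leaf->deficit -= pkt_len;
        if (leaf->deficit < 0) {
                leaf->deficit += leaf->quantum;
                return 1;               /* caller advances to the next leaf */
        }
        return 0;                       /* same leaf keeps the turn */
}

int main(void)
{
        struct toy_leaf leaf = { .deficit = 0, .quantum = 1514 };
        int lens[] = { 1000, 1000, 200 };

        for (int i = 0; i < 3; i++)
                printf("sent %4d bytes, move on: %d\n",
                       lens[i], send_one(&leaf, lens[i]));
        return 0;
}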
to minimize cpu work */ - if ((skb = __skb_dequeue(&q->direct_queue)) != NULL) { - sch->flags &= ~TCQ_F_THROTTLED; + skb = __skb_dequeue(&q->direct_queue); + if (skb != NULL) { +ok: + qdisc_bstats_update(sch, skb); + qdisc_unthrottled(sch); sch->q.qlen--; return skb; } - if (!sch->q.qlen) goto fin; - PSCHED_GET_TIME(q->now); + if (!sch->q.qlen) + goto fin; + q->now = ktime_to_ns(ktime_get()); + start_at = jiffies; + + next_event = q->now + 5LLU * NSEC_PER_SEC; - min_delay = LONG_MAX; - q->nwc_hit = 0; for (level = 0; level < TC_HTB_MAXDEPTH; level++) { /* common case optimization - skip event handler quickly */ int m; - long delay; - if (time_after_eq(q->jiffies, q->near_ev_cache[level])) { - delay = htb_do_events(q,level); - q->near_ev_cache[level] = q->jiffies + (delay ? delay : HZ); -#ifdef HTB_DEBUG - evs_used++; -#endif - } else - delay = q->near_ev_cache[level] - q->jiffies; - - if (delay && min_delay > delay) - min_delay = delay; + s64 event = q->near_ev_cache[level]; + + if (q->now >= event) { + event = htb_do_events(q, level, start_at); + if (!event) + event = q->now + NSEC_PER_SEC; + q->near_ev_cache[level] = event; + } + + if (next_event > event) + next_event = event; + m = ~q->row_mask[level]; while (m != (int)(-1)) { - int prio = ffz (m); + int prio = ffz(m); + m |= 1 << prio; - skb = htb_dequeue_tree(q,prio,level); - if (likely(skb != NULL)) { - sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; - goto fin; - } + skb = htb_dequeue_tree(q, prio, level); + if (likely(skb != NULL)) + goto ok; } } -#ifdef HTB_DEBUG - if (!q->nwc_hit && min_delay >= 10*HZ && net_ratelimit()) { - if (min_delay == LONG_MAX) { - printk(KERN_ERR "HTB: dequeue bug (%d,%lu,%lu), report it please !\n", - evs_used,q->jiffies,jiffies); - htb_debug_dump(q); - } else - printk(KERN_WARNING "HTB: mindelay=%ld, some class has " - "too small rate\n",min_delay); + sch->qstats.overlimits++; + if (likely(next_event > q->now)) { + if (!test_bit(__QDISC_STATE_DEACTIVATED, + &qdisc_root_sleeping(q->watchdog.qdisc)->state)) { + ktime_t time = ns_to_ktime(next_event); + qdisc_throttled(q->watchdog.qdisc); + hrtimer_start(&q->watchdog.timer, time, + HRTIMER_MODE_ABS); + } + } else { + schedule_work(&q->work); } -#endif - htb_delay_by (sch,min_delay > 5*HZ ? 
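Editor's note: with the old jiffies-based htb_delay_by() gone, htb_dequeue() now computes the earliest pending event across all levels (bounded by 5 seconds, refreshed per level from htb_do_events(), defaulting to now + 1 s when a level has nothing queued) and either arms the qdisc watchdog hrtimer for that instant or, when event processing was cut short, schedules the work item to resume at once. The sketch below models only the "pick the earliest event and decide timer vs. work" step; the cache refresh and the hrtimer/workqueue calls are elided.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000LL
#define MAXDEPTH 8

enum action { ARM_TIMER, RUN_WORK };

/* near_ev[] holds the per-level nearest-event times (0 = level unused),
 * standing in for q->near_ev_cache[]; 'now' and the events are in ns */
static enum action pick_action(int64_t now, const int64_t near_ev[MAXDEPTH],
                               int64_t *next_event)
{
        *next_event = now + 5 * NSEC_PER_SEC;   /* never sleep longer than 5 s */

        for (int level = 0; level < MAXDEPTH; level++)
                if (near_ev[level] && near_ev[level] < *next_event)
                        *next_event = near_ev[level];

        /* a pending event equal to 'now' means do_events() ran out of budget */
        return *next_event > now ? ARM_TIMER : RUN_WORK;
}

int main(void)
{
        int64_t now = NSEC_PER_SEC, next;
        int64_t near_ev[MAXDEPTH] = { [0] = 3 * NSEC_PER_SEC,
                                      [1] = 2 * NSEC_PER_SEC };

        if (pick_action(now, near_ev, &next) == ARM_TIMER)
                printf("arm one-shot timer for t=%lld ns\n", (long long)next);
        return 0;
}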
5*HZ : min_delay); fin: - HTB_DBG(3,1,"htb_deq_end %s j=%lu skb=%p\n",sch->dev->name,q->jiffies,skb); return skb; } /* try to drop from each class (by prio) until one succeed */ -static unsigned int htb_drop(struct Qdisc* sch) +static unsigned int htb_drop(struct Qdisc *sch) { struct htb_sched *q = qdisc_priv(sch); int prio; for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) { struct list_head *p; - list_for_each (p,q->drops+prio) { + list_for_each(p, q->drops + prio) { struct htb_class *cl = list_entry(p, struct htb_class, un.leaf.drop_list); unsigned int len; - if (cl->un.leaf.q->ops->drop && - (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) { + if (cl->un.leaf.q->ops->drop && + (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) { sch->q.qlen--; if (!cl->un.leaf.q->q.qlen) - htb_deactivate (q,cl); + htb_deactivate(q, cl); return len; } } @@ -1195,91 +967,92 @@ static unsigned int htb_drop(struct Qdisc* sch) /* reset all classes */ /* always caled under BH & queue lock */ -static void htb_reset(struct Qdisc* sch) +static void htb_reset(struct Qdisc *sch) { struct htb_sched *q = qdisc_priv(sch); - int i; - HTB_DBG(0,1,"htb_reset sch=%p, handle=%X\n",sch,sch->handle); + struct htb_class *cl; + unsigned int i; - for (i = 0; i < HTB_HSIZE; i++) { - struct list_head *p; - list_for_each (p,q->hash+i) { - struct htb_class *cl = list_entry(p,struct htb_class,hlist); + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->level) - memset(&cl->un.inner,0,sizeof(cl->un.inner)); + memset(&cl->un.inner, 0, sizeof(cl->un.inner)); else { - if (cl->un.leaf.q) + if (cl->un.leaf.q) qdisc_reset(cl->un.leaf.q); INIT_LIST_HEAD(&cl->un.leaf.drop_list); } cl->prio_activity = 0; cl->cmode = HTB_CAN_SEND; -#ifdef HTB_DEBUG - cl->pq_node.rb_color = -1; - memset(cl->node,255,sizeof(cl->node)); -#endif } } - sch->flags &= ~TCQ_F_THROTTLED; - del_timer(&q->timer); + qdisc_watchdog_cancel(&q->watchdog); __skb_queue_purge(&q->direct_queue); sch->q.qlen = 0; - memset(q->row,0,sizeof(q->row)); - memset(q->row_mask,0,sizeof(q->row_mask)); - memset(q->wait_pq,0,sizeof(q->wait_pq)); - memset(q->ptr,0,sizeof(q->ptr)); + memset(q->hlevel, 0, sizeof(q->hlevel)); + memset(q->row_mask, 0, sizeof(q->row_mask)); for (i = 0; i < TC_HTB_NUMPRIO; i++) - INIT_LIST_HEAD(q->drops+i); + INIT_LIST_HEAD(q->drops + i); +} + +static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = { + [TCA_HTB_PARMS] = { .len = sizeof(struct tc_htb_opt) }, + [TCA_HTB_INIT] = { .len = sizeof(struct tc_htb_glob) }, + [TCA_HTB_CTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_HTB_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 }, + [TCA_HTB_RATE64] = { .type = NLA_U64 }, + [TCA_HTB_CEIL64] = { .type = NLA_U64 }, +}; + +static void htb_work_func(struct work_struct *work) +{ + struct htb_sched *q = container_of(work, struct htb_sched, work); + struct Qdisc *sch = q->watchdog.qdisc; + + __netif_schedule(qdisc_root(sch)); } -static int htb_init(struct Qdisc *sch, struct rtattr *opt) +static int htb_init(struct Qdisc *sch, struct nlattr *opt) { struct htb_sched *q = qdisc_priv(sch); - struct rtattr *tb[TCA_HTB_INIT]; + struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_glob *gopt; + int err; int i; -#ifdef HTB_DEBUG - printk(KERN_INFO "HTB init, kernel part version %d.%d\n", - HTB_VER >> 16,HTB_VER & 0xffff); -#endif - if (!opt || rtattr_parse_nested(tb, TCA_HTB_INIT, opt) || - tb[TCA_HTB_INIT-1] == NULL || - RTA_PAYLOAD(tb[TCA_HTB_INIT-1]) < 
sizeof(*gopt)) { - printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n"); + + if (!opt) return -EINVAL; - } - gopt = RTA_DATA(tb[TCA_HTB_INIT-1]); - if (gopt->version != HTB_VER >> 16) { - printk(KERN_ERR "HTB: need tc/htb version %d (minor is %d), you have %d\n", - HTB_VER >> 16,HTB_VER & 0xffff,gopt->version); + + err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy); + if (err < 0) + return err; + + if (!tb[TCA_HTB_INIT]) + return -EINVAL; + + gopt = nla_data(tb[TCA_HTB_INIT]); + if (gopt->version != HTB_VER >> 16) return -EINVAL; - } - q->debug = gopt->debug; - HTB_DBG(0,1,"htb_init sch=%p handle=%X r2q=%d\n",sch,sch->handle,gopt->rate2quantum); - INIT_LIST_HEAD(&q->root); - for (i = 0; i < HTB_HSIZE; i++) - INIT_LIST_HEAD(q->hash+i); + err = qdisc_class_hash_init(&q->clhash); + if (err < 0) + return err; for (i = 0; i < TC_HTB_NUMPRIO; i++) - INIT_LIST_HEAD(q->drops+i); + INIT_LIST_HEAD(q->drops + i); - init_timer(&q->timer); + qdisc_watchdog_init(&q->watchdog, sch); + INIT_WORK(&q->work, htb_work_func); skb_queue_head_init(&q->direct_queue); - q->direct_qlen = sch->dev->tx_queue_len; - if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ - q->direct_qlen = 2; - q->timer.function = htb_timer; - q->timer.data = (unsigned long)sch; - -#ifdef HTB_RATECM - init_timer(&q->rttim); - q->rttim.function = htb_rate_timer; - q->rttim.data = (unsigned long)sch; - q->rttim.expires = jiffies + HZ; - add_timer(&q->rttim); -#endif + if (tb[TCA_HTB_DIRECT_QLEN]) + q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]); + else { + q->direct_qlen = qdisc_dev(sch)->tx_queue_len; + if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ + q->direct_qlen = 2; + } if ((q->rate2quantum = gopt->rate2quantum) < 1) q->rate2quantum = 1; q->defcls = gopt->defcls; @@ -1290,89 +1063,89 @@ static int htb_init(struct Qdisc *sch, struct rtattr *opt) static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) { struct htb_sched *q = qdisc_priv(sch); - unsigned char *b = skb->tail; - struct rtattr *rta; + struct nlattr *nest; struct tc_htb_glob gopt; - HTB_DBG(0,1,"htb_dump sch=%p, handle=%X\n",sch,sch->handle); - HTB_QLOCK(sch); - gopt.direct_pkts = q->direct_pkts; -#ifdef HTB_DEBUG - if (HTB_DBG_COND(0,2)) - htb_debug_dump(q); -#endif + /* Its safe to not acquire qdisc lock. As we hold RTNL, + * no change can happen on the qdisc parameters. 
+ */ + + gopt.direct_pkts = q->direct_pkts; gopt.version = HTB_VER; gopt.rate2quantum = q->rate2quantum; gopt.defcls = q->defcls; - gopt.debug = q->debug; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); - RTA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt); - rta->rta_len = skb->tail - b; - HTB_QUNLOCK(sch); - return skb->len; -rtattr_failure: - HTB_QUNLOCK(sch); - skb_trim(skb, skb->tail - skb->data); + gopt.debug = 0; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) || + nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen)) + goto nla_put_failure; + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } static int htb_dump_class(struct Qdisc *sch, unsigned long arg, - struct sk_buff *skb, struct tcmsg *tcm) + struct sk_buff *skb, struct tcmsg *tcm) { -#ifdef HTB_DEBUG - struct htb_sched *q = qdisc_priv(sch); -#endif - struct htb_class *cl = (struct htb_class*)arg; - unsigned char *b = skb->tail; - struct rtattr *rta; + struct htb_class *cl = (struct htb_class *)arg; + struct nlattr *nest; struct tc_htb_opt opt; - HTB_DBG(0,1,"htb_dump_class handle=%X clid=%X\n",sch->handle,cl->classid); - - HTB_QLOCK(sch); - tcm->tcm_parent = cl->parent ? cl->parent->classid : TC_H_ROOT; - tcm->tcm_handle = cl->classid; + /* Its safe to not acquire qdisc lock. As we hold RTNL, + * no change can happen on the class parameters. + */ + tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT; + tcm->tcm_handle = cl->common.classid; if (!cl->level && cl->un.leaf.q) tcm->tcm_info = cl->un.leaf.q->handle; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); - - memset (&opt,0,sizeof(opt)); - - opt.rate = cl->rate->rate; opt.buffer = cl->buffer; - opt.ceil = cl->ceil->rate; opt.cbuffer = cl->cbuffer; - opt.quantum = cl->un.leaf.quantum; opt.prio = cl->un.leaf.prio; - opt.level = cl->level; - RTA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt); - rta->rta_len = skb->tail - b; - HTB_QUNLOCK(sch); - return skb->len; -rtattr_failure: - HTB_QUNLOCK(sch); - skb_trim(skb, b - skb->data); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + memset(&opt, 0, sizeof(opt)); + + psched_ratecfg_getrate(&opt.rate, &cl->rate); + opt.buffer = PSCHED_NS2TICKS(cl->buffer); + psched_ratecfg_getrate(&opt.ceil, &cl->ceil); + opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer); + opt.quantum = cl->quantum; + opt.prio = cl->prio; + opt.level = cl->level; + if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) && + nla_put_u64(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps)) + goto nla_put_failure; + if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) && + nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps)) + goto nla_put_failure; + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } static int -htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, - struct gnet_dump *d) +htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { - struct htb_class *cl = (struct htb_class*)arg; - -#ifdef HTB_RATECM - cl->rate_est.bps = cl->rate_bytes/(HTB_EWMAC*HTB_HSIZE); - cl->rate_est.pps = cl->rate_packets/(HTB_EWMAC*HTB_HSIZE); -#endif + struct htb_class *cl = (struct htb_class *)arg; if (!cl->level && cl->un.leaf.q) cl->qstats.qlen = cl->un.leaf.q->q.qlen; - cl->xstats.tokens = cl->tokens; - cl->xstats.ctokens = 
cl->ctokens; + cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens); + cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens); if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || - gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || + gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, &cl->qstats) < 0) return -1; @@ -1380,132 +1153,171 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, } static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, - struct Qdisc **old) + struct Qdisc **old) { - struct htb_class *cl = (struct htb_class*)arg; + struct htb_class *cl = (struct htb_class *)arg; - if (cl && !cl->level) { - if (new == NULL && (new = qdisc_create_dflt(sch->dev, - &pfifo_qdisc_ops)) == NULL) - return -ENOBUFS; - sch_tree_lock(sch); - if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) { - if (cl->prio_activity) - htb_deactivate (qdisc_priv(sch),cl); + if (cl->level) + return -EINVAL; + if (new == NULL && + (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + cl->common.classid)) == NULL) + return -ENOBUFS; - /* TODO: is it correct ? Why CBQ doesn't do it ? */ - sch->q.qlen -= (*old)->q.qlen; - qdisc_reset(*old); - } - sch_tree_unlock(sch); - return 0; + sch_tree_lock(sch); + *old = cl->un.leaf.q; + cl->un.leaf.q = new; + if (*old != NULL) { + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); } - return -ENOENT; + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct htb_class *cl = (struct htb_class *)arg; + return !cl->level ? cl->un.leaf.q : NULL; } -static struct Qdisc * htb_leaf(struct Qdisc *sch, unsigned long arg) +static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg) { - struct htb_class *cl = (struct htb_class*)arg; - return (cl && !cl->level) ? cl->un.leaf.q : NULL; + struct htb_class *cl = (struct htb_class *)arg; + + if (cl->un.leaf.q->q.qlen == 0) + htb_deactivate(qdisc_priv(sch), cl); } static unsigned long htb_get(struct Qdisc *sch, u32 classid) { -#ifdef HTB_DEBUG - struct htb_sched *q = qdisc_priv(sch); -#endif - struct htb_class *cl = htb_find(classid,sch); - HTB_DBG(0,1,"htb_get clid=%X q=%p cl=%p ref=%d\n",classid,q,cl,cl?cl->refcnt:0); - if (cl) + struct htb_class *cl = htb_find(classid, sch); + if (cl) cl->refcnt++; return (unsigned long)cl; } -static void htb_destroy_filters(struct tcf_proto **fl) +static inline int htb_parent_last_child(struct htb_class *cl) { - struct tcf_proto *tp; + if (!cl->parent) + /* the root class */ + return 0; + if (cl->parent->children > 1) + /* not the last child */ + return 0; + return 1; +} - while ((tp = *fl) != NULL) { - *fl = tp->next; - tcf_destroy(tp); - } +static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, + struct Qdisc *new_q) +{ + struct htb_class *parent = cl->parent; + + WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity); + + if (parent->cmode != HTB_CAN_SEND) + htb_safe_rb_erase(&parent->pq_node, + &q->hlevel[parent->level].wait_pq); + + parent->level = 0; + memset(&parent->un.inner, 0, sizeof(parent->un.inner)); + INIT_LIST_HEAD(&parent->un.leaf.drop_list); + parent->un.leaf.q = new_q ? 
new_q : &noop_qdisc; + parent->tokens = parent->buffer; + parent->ctokens = parent->cbuffer; + parent->t_c = ktime_to_ns(ktime_get()); + parent->cmode = HTB_CAN_SEND; } -static void htb_destroy_class(struct Qdisc* sch,struct htb_class *cl) +static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) { - struct htb_sched *q = qdisc_priv(sch); - HTB_DBG(0,1,"htb_destrycls clid=%X ref=%d\n", cl?cl->classid:0,cl?cl->refcnt:0); if (!cl->level) { - BUG_TRAP(cl->un.leaf.q); - sch->q.qlen -= cl->un.leaf.q->q.qlen; + WARN_ON(!cl->un.leaf.q); qdisc_destroy(cl->un.leaf.q); } - qdisc_put_rtab(cl->rate); - qdisc_put_rtab(cl->ceil); - - htb_destroy_filters (&cl->filter_list); - - while (!list_empty(&cl->children)) - htb_destroy_class (sch,list_entry(cl->children.next, - struct htb_class,sibling)); - - /* note: this delete may happen twice (see htb_delete) */ - list_del(&cl->hlist); - list_del(&cl->sibling); - - if (cl->prio_activity) - htb_deactivate (q,cl); - - if (cl->cmode != HTB_CAN_SEND) - htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level); - + gen_kill_estimator(&cl->bstats, &cl->rate_est); + tcf_destroy_chain(&cl->filter_list); kfree(cl); } -/* always caled under BH & queue lock */ -static void htb_destroy(struct Qdisc* sch) +static void htb_destroy(struct Qdisc *sch) { struct htb_sched *q = qdisc_priv(sch); - HTB_DBG(0,1,"htb_destroy q=%p\n",q); + struct hlist_node *next; + struct htb_class *cl; + unsigned int i; - del_timer_sync (&q->timer); -#ifdef HTB_RATECM - del_timer_sync (&q->rttim); -#endif + cancel_work_sync(&q->work); + qdisc_watchdog_cancel(&q->watchdog); /* This line used to be after htb_destroy_class call below - and surprisingly it worked in 2.4. But it must precede it - because filter need its target class alive to be able to call - unbind_filter on it (without Oops). */ - htb_destroy_filters(&q->filter_list); - - while (!list_empty(&q->root)) - htb_destroy_class (sch,list_entry(q->root.next, - struct htb_class,sibling)); + * and surprisingly it worked in 2.4. But it must precede it + * because filter need its target class alive to be able to call + * unbind_filter on it (without Oops). + */ + tcf_destroy_chain(&q->filter_list); + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) + tcf_destroy_chain(&cl->filter_list); + } + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], + common.hnode) + htb_destroy_class(sch, cl); + } + qdisc_class_hash_destroy(&q->clhash); __skb_queue_purge(&q->direct_queue); } static int htb_delete(struct Qdisc *sch, unsigned long arg) { struct htb_sched *q = qdisc_priv(sch); - struct htb_class *cl = (struct htb_class*)arg; - HTB_DBG(0,1,"htb_delete q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0); + struct htb_class *cl = (struct htb_class *)arg; + unsigned int qlen; + struct Qdisc *new_q = NULL; + int last_child = 0; - // TODO: why don't allow to delete subtree ? references ? does - // tc subsys quarantee us that in htb_destroy it holds no class - // refs so that we can remove children safely there ? - if (!list_empty(&cl->children) || cl->filter_cnt) + /* TODO: why don't allow to delete subtree ? references ? does + * tc subsys guarantee us that in htb_destroy it holds no class + * refs so that we can remove children safely there ? 
+ */ + if (cl->children || cl->filter_cnt) return -EBUSY; - + + if (!cl->level && htb_parent_last_child(cl)) { + new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + cl->parent->common.classid); + last_child = 1; + } + sch_tree_lock(sch); - + + if (!cl->level) { + qlen = cl->un.leaf.q->q.qlen; + qdisc_reset(cl->un.leaf.q); + qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen); + } + /* delete from hash and active; remainder in destroy_class */ - list_del_init(&cl->hlist); + qdisc_class_hash_remove(&q->clhash, &cl->common); + if (cl->parent) + cl->parent->children--; + if (cl->prio_activity) - htb_deactivate (q,cl); + htb_deactivate(q, cl); - if (--cl->refcnt == 0) - htb_destroy_class(sch,cl); + if (cl->cmode != HTB_CAN_SEND) + htb_safe_rb_erase(&cl->pq_node, + &q->hlevel[cl->level].wait_pq); + + if (last_child) + htb_parent_to_leaf(q, cl, new_q); + + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). + */ sch_tree_unlock(sch); return 0; @@ -1513,141 +1325,197 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) static void htb_put(struct Qdisc *sch, unsigned long arg) { -#ifdef HTB_DEBUG - struct htb_sched *q = qdisc_priv(sch); -#endif - struct htb_class *cl = (struct htb_class*)arg; - HTB_DBG(0,1,"htb_put q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0); + struct htb_class *cl = (struct htb_class *)arg; if (--cl->refcnt == 0) - htb_destroy_class(sch,cl); + htb_destroy_class(sch, cl); } -static int htb_change_class(struct Qdisc *sch, u32 classid, - u32 parentid, struct rtattr **tca, unsigned long *arg) +static int htb_change_class(struct Qdisc *sch, u32 classid, + u32 parentid, struct nlattr **tca, + unsigned long *arg) { int err = -EINVAL; struct htb_sched *q = qdisc_priv(sch); - struct htb_class *cl = (struct htb_class*)*arg,*parent; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct qdisc_rate_table *rtab = NULL, *ctab = NULL; - struct rtattr *tb[TCA_HTB_RTAB]; + struct htb_class *cl = (struct htb_class *)*arg, *parent; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_opt *hopt; + u64 rate64, ceil64; /* extract all subattrs from opt attr */ - if (!opt || rtattr_parse_nested(tb, TCA_HTB_RTAB, opt) || - tb[TCA_HTB_PARMS-1] == NULL || - RTA_PAYLOAD(tb[TCA_HTB_PARMS-1]) < sizeof(*hopt)) + if (!opt) goto failure; - - parent = parentid == TC_H_ROOT ? NULL : htb_find (parentid,sch); - hopt = RTA_DATA(tb[TCA_HTB_PARMS-1]); - HTB_DBG(0,1,"htb_chg cl=%p(%X), clid=%X, parid=%X, opt/prio=%d, rate=%u, buff=%d, quant=%d\n", cl,cl?cl->classid:0,classid,parentid,(int)hopt->prio,hopt->rate.rate,hopt->buffer,hopt->quantum); - rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB-1]); - ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB-1]); - if (!rtab || !ctab) goto failure; + err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy); + if (err < 0) + goto failure; - if (!cl) { /* new class */ + err = -EINVAL; + if (tb[TCA_HTB_PARMS] == NULL) + goto failure; + + parent = parentid == TC_H_ROOT ? 
NULL : htb_find(parentid, sch); + + hopt = nla_data(tb[TCA_HTB_PARMS]); + if (!hopt->rate.rate || !hopt->ceil.rate) + goto failure; + + /* Keeping backward compatible with rate_table based iproute2 tc */ + if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB])); + + if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB])); + + if (!cl) { /* new class */ struct Qdisc *new_q; + int prio; + struct { + struct nlattr nla; + struct gnet_estimator opt; + } est = { + .nla = { + .nla_len = nla_attr_size(sizeof(est.opt)), + .nla_type = TCA_RATE, + }, + .opt = { + /* 4s interval, 16s averaging constant */ + .interval = 2, + .ewma_log = 2, + }, + }; + /* check for valid classid */ - if (!classid || TC_H_MAJ(classid^sch->handle) || htb_find(classid,sch)) + if (!classid || TC_H_MAJ(classid ^ sch->handle) || + htb_find(classid, sch)) goto failure; /* check maximal depth */ if (parent && parent->parent && parent->parent->level < 2) { - printk(KERN_ERR "htb: tree is too deep\n"); + pr_err("htb: tree is too deep\n"); goto failure; } err = -ENOBUFS; - if ((cl = kmalloc(sizeof(*cl), GFP_KERNEL)) == NULL) + cl = kzalloc(sizeof(*cl), GFP_KERNEL); + if (!cl) goto failure; - - memset(cl, 0, sizeof(*cl)); + + if (htb_rate_est || tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE] ? : &est.nla); + if (err) { + kfree(cl); + goto failure; + } + } + cl->refcnt = 1; - INIT_LIST_HEAD(&cl->sibling); - INIT_LIST_HEAD(&cl->hlist); - INIT_LIST_HEAD(&cl->children); + cl->children = 0; INIT_LIST_HEAD(&cl->un.leaf.drop_list); -#ifdef HTB_DEBUG - cl->magic = HTB_CMAGIC; -#endif + RB_CLEAR_NODE(&cl->pq_node); + + for (prio = 0; prio < TC_HTB_NUMPRIO; prio++) + RB_CLEAR_NODE(&cl->node[prio]); /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL) - so that can't be used inside of sch_tree_lock - -- thanks to Karlis Peisenieks */ - new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + * so that can't be used inside of sch_tree_lock + * -- thanks to Karlis Peisenieks + */ + new_q = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, classid); sch_tree_lock(sch); if (parent && !parent->level) { + unsigned int qlen = parent->un.leaf.q->q.qlen; + /* turn parent into inner node */ - sch->q.qlen -= parent->un.leaf.q->q.qlen; - qdisc_destroy (parent->un.leaf.q); - if (parent->prio_activity) - htb_deactivate (q,parent); + qdisc_reset(parent->un.leaf.q); + qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen); + qdisc_destroy(parent->un.leaf.q); + if (parent->prio_activity) + htb_deactivate(q, parent); /* remove from evt list because of level change */ if (parent->cmode != HTB_CAN_SEND) { - htb_safe_rb_erase(&parent->pq_node,q->wait_pq /*+0*/); + htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq); parent->cmode = HTB_CAN_SEND; } parent->level = (parent->parent ? parent->parent->level - : TC_HTB_MAXDEPTH) - 1; - memset (&parent->un.inner,0,sizeof(parent->un.inner)); + : TC_HTB_MAXDEPTH) - 1; + memset(&parent->un.inner, 0, sizeof(parent->un.inner)); } /* leaf (we) needs elementary qdisc */ cl->un.leaf.q = new_q ? 
new_q : &noop_qdisc; - cl->classid = classid; cl->parent = parent; + cl->common.classid = classid; + cl->parent = parent; /* set class to be in HTB_CAN_SEND state */ - cl->tokens = hopt->buffer; - cl->ctokens = hopt->cbuffer; - cl->mbuffer = 60000000; /* 1min */ - PSCHED_GET_TIME(cl->t_c); + cl->tokens = PSCHED_TICKS2NS(hopt->buffer); + cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer); + cl->mbuffer = 60ULL * NSEC_PER_SEC; /* 1min */ + cl->t_c = ktime_to_ns(ktime_get()); cl->cmode = HTB_CAN_SEND; /* attach to the hash list and parent's family */ - list_add_tail(&cl->hlist, q->hash+htb_hash(classid)); - list_add_tail(&cl->sibling, parent ? &parent->children : &q->root); -#ifdef HTB_DEBUG - { - int i; - for (i = 0; i < TC_HTB_NUMPRIO; i++) cl->node[i].rb_color = -1; - cl->pq_node.rb_color = -1; + qdisc_class_hash_insert(&q->clhash, &cl->common); + if (parent) + parent->children++; + } else { + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) + return err; } -#endif - } else sch_tree_lock(sch); + sch_tree_lock(sch); + } + + rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0; + + ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0; + + psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64); + psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64); /* it used to be a nasty bug here, we have to check that node - is really leaf before changing cl->un.leaf ! */ + * is really leaf before changing cl->un.leaf ! + */ if (!cl->level) { - cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum; - if (!hopt->quantum && cl->un.leaf.quantum < 1000) { - printk(KERN_WARNING "HTB: quantum of class %X is small. Consider r2q change.\n", cl->classid); - cl->un.leaf.quantum = 1000; + u64 quantum = cl->rate.rate_bytes_ps; + + do_div(quantum, q->rate2quantum); + cl->quantum = min_t(u64, quantum, INT_MAX); + + if (!hopt->quantum && cl->quantum < 1000) { + pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n", + cl->common.classid); + cl->quantum = 1000; } - if (!hopt->quantum && cl->un.leaf.quantum > 200000) { - printk(KERN_WARNING "HTB: quantum of class %X is big. Consider r2q change.\n", cl->classid); - cl->un.leaf.quantum = 200000; + if (!hopt->quantum && cl->quantum > 200000) { + pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n", + cl->common.classid); + cl->quantum = 200000; } if (hopt->quantum) - cl->un.leaf.quantum = hopt->quantum; - if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO) - cl->un.leaf.prio = TC_HTB_NUMPRIO - 1; + cl->quantum = hopt->quantum; + if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO) + cl->prio = TC_HTB_NUMPRIO - 1; } - cl->buffer = hopt->buffer; - cl->cbuffer = hopt->cbuffer; - if (cl->rate) qdisc_put_rtab(cl->rate); cl->rate = rtab; - if (cl->ceil) qdisc_put_rtab(cl->ceil); cl->ceil = ctab; + cl->buffer = PSCHED_TICKS2NS(hopt->buffer); + cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer); + sch_tree_unlock(sch); + qdisc_class_hash_grow(sch, &q->clhash); + *arg = (unsigned long)cl; return 0; failure: - if (rtab) qdisc_put_rtab(rtab); - if (ctab) qdisc_put_rtab(ctab); return err; } @@ -1656,55 +1524,48 @@ static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg) struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)arg; struct tcf_proto **fl = cl ? 
&cl->filter_list : &q->filter_list; - HTB_DBG(0,2,"htb_tcf q=%p clid=%X fref=%d fl=%p\n",q,cl?cl->classid:0,cl?cl->filter_cnt:q->filter_cnt,*fl); + return fl; } static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent, - u32 classid) + u32 classid) { - struct htb_sched *q = qdisc_priv(sch); - struct htb_class *cl = htb_find (classid,sch); - HTB_DBG(0,2,"htb_bind q=%p clid=%X cl=%p fref=%d\n",q,classid,cl,cl?cl->filter_cnt:q->filter_cnt); + struct htb_class *cl = htb_find(classid, sch); + /*if (cl && !cl->level) return 0; - The line above used to be there to prevent attaching filters to - leaves. But at least tc_index filter uses this just to get class - for other reasons so that we have to allow for it. - ---- - 19.6.2002 As Werner explained it is ok - bind filter is just - another way to "lock" the class - unlike "get" this lock can - be broken by class during destroy IIUC. + * The line above used to be there to prevent attaching filters to + * leaves. But at least tc_index filter uses this just to get class + * for other reasons so that we have to allow for it. + * ---- + * 19.6.2002 As Werner explained it is ok - bind filter is just + * another way to "lock" the class - unlike "get" this lock can + * be broken by class during destroy IIUC. */ - if (cl) - cl->filter_cnt++; - else - q->filter_cnt++; + if (cl) + cl->filter_cnt++; return (unsigned long)cl; } static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg) { - struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)arg; - HTB_DBG(0,2,"htb_unbind q=%p cl=%p fref=%d\n",q,cl,cl?cl->filter_cnt:q->filter_cnt); - if (cl) - cl->filter_cnt--; - else - q->filter_cnt--; + + if (cl) + cl->filter_cnt--; } static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct htb_sched *q = qdisc_priv(sch); - int i; + struct htb_class *cl; + unsigned int i; if (arg->stop) return; - for (i = 0; i < HTB_HSIZE; i++) { - struct list_head *p; - list_for_each (p,q->hash+i) { - struct htb_class *cl = list_entry(p,struct htb_class,hlist); + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; @@ -1718,9 +1579,10 @@ static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct Qdisc_class_ops htb_class_ops = { +static const struct Qdisc_class_ops htb_class_ops = { .graft = htb_graft, .leaf = htb_leaf, + .qlen_notify = htb_qlen_notify, .get = htb_get, .put = htb_put, .change = htb_change_class, @@ -1733,31 +1595,30 @@ static struct Qdisc_class_ops htb_class_ops = { .dump_stats = htb_dump_class_stats, }; -static struct Qdisc_ops htb_qdisc_ops = { - .next = NULL, +static struct Qdisc_ops htb_qdisc_ops __read_mostly = { .cl_ops = &htb_class_ops, .id = "htb", .priv_size = sizeof(struct htb_sched), .enqueue = htb_enqueue, .dequeue = htb_dequeue, - .requeue = htb_requeue, + .peek = qdisc_peek_dequeued, .drop = htb_drop, .init = htb_init, .reset = htb_reset, .destroy = htb_destroy, - .change = NULL /* htb_change */, .dump = htb_dump, .owner = THIS_MODULE, }; static int __init htb_module_init(void) { - return register_qdisc(&htb_qdisc_ops); + return register_qdisc(&htb_qdisc_ops); } -static void __exit htb_module_exit(void) +static void __exit htb_module_exit(void) { - unregister_qdisc(&htb_qdisc_ops); + unregister_qdisc(&htb_qdisc_ops); } + module_init(htb_module_init) module_exit(htb_module_exit) MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_ingress.c 
b/net/sched/sch_ingress.c index 8edc32a6ad2..62871c14e1f 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -1,4 +1,4 @@ -/* net/sched/sch_ingress.c - Ingress qdisc +/* net/sched/sch_ingress.c - Ingress qdisc * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -7,430 +7,136 @@ * Authors: Jamal Hadi Salim 1999 */ -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> +#include <linux/list.h> #include <linux/skbuff.h> -#include <linux/netdevice.h> #include <linux/rtnetlink.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv6.h> -#include <linux/netfilter.h> -#include <linux/smp.h> +#include <net/netlink.h> #include <net/pkt_sched.h> -#include <asm/byteorder.h> -#include <asm/uaccess.h> -#include <linux/kmod.h> -#include <linux/stat.h> -#include <linux/interrupt.h> -#include <linux/list.h> - - -#undef DEBUG_INGRESS - -#ifdef DEBUG_INGRESS /* control */ -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif -#if 0 /* data */ -#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define D2PRINTK(format,args...) -#endif - - -#define PRIV(sch) qdisc_priv(sch) - - -/* Thanks to Doron Oz for this hack -*/ -#ifndef CONFIG_NET_CLS_ACT -#ifdef CONFIG_NETFILTER -static int nf_registered; -#endif -#endif struct ingress_qdisc_data { - struct Qdisc *q; struct tcf_proto *filter_list; }; - /* ------------------------- Class/flow operations ------------------------- */ - -static int ingress_graft(struct Qdisc *sch,unsigned long arg, - struct Qdisc *new,struct Qdisc **old) -{ -#ifdef DEBUG_INGRESS - struct ingress_qdisc_data *p = PRIV(sch); -#endif - - DPRINTK("ingress_graft(sch %p,[qdisc %p],new %p,old %p)\n", - sch, p, new, old); - DPRINTK("\n ingress_graft: You cannot add qdiscs to classes"); - return 1; -} - - static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; } - -static unsigned long ingress_get(struct Qdisc *sch,u32 classid) +static unsigned long ingress_get(struct Qdisc *sch, u32 classid) { -#ifdef DEBUG_INGRESS - struct ingress_qdisc_data *p = PRIV(sch); -#endif - DPRINTK("ingress_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid); return TC_H_MIN(classid) + 1; } - static unsigned long ingress_bind_filter(struct Qdisc *sch, - unsigned long parent, u32 classid) + unsigned long parent, u32 classid) { return ingress_get(sch, classid); } - static void ingress_put(struct Qdisc *sch, unsigned long cl) { } - -static int ingress_change(struct Qdisc *sch, u32 classid, u32 parent, - struct rtattr **tca, unsigned long *arg) -{ -#ifdef DEBUG_INGRESS - struct ingress_qdisc_data *p = PRIV(sch); -#endif - DPRINTK("ingress_change(sch %p,[qdisc %p],classid %x,parent %x)," - "arg 0x%lx\n", sch, p, classid, parent, *arg); - DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment"); - return 0; -} - - - -static void ingress_walk(struct Qdisc *sch,struct qdisc_walker *walker) +static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) { -#ifdef DEBUG_INGRESS - struct ingress_qdisc_data *p = PRIV(sch); -#endif - DPRINTK("ingress_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); - DPRINTK("No effect. 
sch_ingress doesn't maintain classes at the moment"); } - -static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch,unsigned long cl) +static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch, unsigned long cl) { - struct ingress_qdisc_data *p = PRIV(sch); + struct ingress_qdisc_data *p = qdisc_priv(sch); return &p->filter_list; } - /* --------------------------- Qdisc operations ---------------------------- */ - -static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch) +static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct ingress_qdisc_data *p = PRIV(sch); + struct ingress_qdisc_data *p = qdisc_priv(sch); struct tcf_result res; int result; - D2PRINTK("ingress_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); result = tc_classify(skb, p->filter_list, &res); - D2PRINTK("result %d class 0x%04x\n", result, res.classid); - /* - * Unlike normal "enqueue" functions, ingress_enqueue returns a - * firewall FW_* code. - */ -#ifdef CONFIG_NET_CLS_ACT - sch->bstats.packets++; - sch->bstats.bytes += skb->len; - switch (result) { - case TC_ACT_SHOT: - result = TC_ACT_SHOT; - sch->qstats.drops++; - break; - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - result = TC_ACT_STOLEN; - break; - case TC_ACT_RECLASSIFY: - case TC_ACT_OK: - case TC_ACT_UNSPEC: - default: - skb->tc_index = TC_H_MIN(res.classid); - result = TC_ACT_OK; - break; - }; -/* backward compat */ -#else -#ifdef CONFIG_NET_CLS_POLICE + + qdisc_bstats_update(sch, skb); switch (result) { - case TC_POLICE_SHOT: - result = NF_DROP; + case TC_ACT_SHOT: + result = TC_ACT_SHOT; sch->qstats.drops++; break; - case TC_POLICE_RECLASSIFY: /* DSCP remarking here ? */ - case TC_POLICE_OK: - case TC_POLICE_UNSPEC: - default: - sch->bstats.packets++; - sch->bstats.bytes += skb->len; - result = NF_ACCEPT; + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + result = TC_ACT_STOLEN; + break; + case TC_ACT_RECLASSIFY: + case TC_ACT_OK: + skb->tc_index = TC_H_MIN(res.classid); + default: + result = TC_ACT_OK; break; - }; - -#else - D2PRINTK("Overriding result to ACCEPT\n"); - result = NF_ACCEPT; - sch->bstats.packets++; - sch->bstats.bytes += skb->len; -#endif -#endif - - return result; -} - - -static struct sk_buff *ingress_dequeue(struct Qdisc *sch) -{ -/* - struct ingress_qdisc_data *p = PRIV(sch); - D2PRINTK("ingress_dequeue(sch %p,[qdisc %p])\n",sch,PRIV(p)); -*/ - return NULL; -} - - -static int ingress_requeue(struct sk_buff *skb,struct Qdisc *sch) -{ -/* - struct ingress_qdisc_data *p = PRIV(sch); - D2PRINTK("ingress_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,PRIV(p)); -*/ - return 0; -} - -static unsigned int ingress_drop(struct Qdisc *sch) -{ -#ifdef DEBUG_INGRESS - struct ingress_qdisc_data *p = PRIV(sch); -#endif - DPRINTK("ingress_drop(sch %p,[qdisc %p])\n", sch, p); - return 0; -} - -#ifndef CONFIG_NET_CLS_ACT -#ifdef CONFIG_NETFILTER -static unsigned int -ing_hook(unsigned int hook, struct sk_buff **pskb, - const struct net_device *indev, - const struct net_device *outdev, - int (*okfn)(struct sk_buff *)) -{ - - struct Qdisc *q; - struct sk_buff *skb = *pskb; - struct net_device *dev = skb->dev; - int fwres=NF_ACCEPT; - - DPRINTK("ing_hook: skb %s dev=%s len=%u\n", - skb->sk ? "(owned)" : "(unowned)", - skb->dev ? 
(*pskb)->dev->name : "(no dev)", - skb->len); - -/* -revisit later: Use a private since lock dev->queue_lock is also -used on the egress (might slow things for an iota) -*/ - - if (dev->qdisc_ingress) { - spin_lock(&dev->queue_lock); - if ((q = dev->qdisc_ingress) != NULL) - fwres = q->enqueue(skb, q); - spin_unlock(&dev->queue_lock); - } - - return fwres; -} - -/* after ipt_filter */ -static struct nf_hook_ops ing_ops = { - .hook = ing_hook, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, - .priority = NF_IP_PRI_FILTER + 1, -}; - -static struct nf_hook_ops ing6_ops = { - .hook = ing_hook, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_IP6_PRE_ROUTING, - .priority = NF_IP6_PRI_FILTER + 1, -}; - -#endif -#endif - -static int ingress_init(struct Qdisc *sch,struct rtattr *opt) -{ - struct ingress_qdisc_data *p = PRIV(sch); - -/* Make sure either netfilter or preferably CLS_ACT is -* compiled in */ -#ifndef CONFIG_NET_CLS_ACT -#ifndef CONFIG_NETFILTER - printk("You MUST compile classifier actions into the kernel\n"); - return -EINVAL; -#else - printk("Ingress scheduler: Classifier actions prefered over netfilter\n"); -#endif -#endif - -#ifndef CONFIG_NET_CLS_ACT -#ifdef CONFIG_NETFILTER - if (!nf_registered) { - if (nf_register_hook(&ing_ops) < 0) { - printk("ingress qdisc registration error \n"); - return -EINVAL; - } - nf_registered++; - - if (nf_register_hook(&ing6_ops) < 0) { - printk("IPv6 ingress qdisc registration error, " \ - "disabling IPv6 support.\n"); - } else - nf_registered++; } -#endif -#endif - - DPRINTK("ingress_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); - p->q = &noop_qdisc; - return 0; -} - -static void ingress_reset(struct Qdisc *sch) -{ - struct ingress_qdisc_data *p = PRIV(sch); - - DPRINTK("ingress_reset(sch %p,[qdisc %p])\n", sch, p); - -/* -#if 0 -*/ -/* for future use */ - qdisc_reset(p->q); -/* -#endif -*/ + return result; } /* ------------------------------------------------------------- */ - -/* ------------------------------------------------------------- */ - static void ingress_destroy(struct Qdisc *sch) { - struct ingress_qdisc_data *p = PRIV(sch); - struct tcf_proto *tp; + struct ingress_qdisc_data *p = qdisc_priv(sch); - DPRINTK("ingress_destroy(sch %p,[qdisc %p])\n", sch, p); - while (p->filter_list) { - tp = p->filter_list; - p->filter_list = tp->next; - tcf_destroy(tp); - } -#if 0 -/* for future use */ - qdisc_destroy(p->q); -#endif + tcf_destroy_chain(&p->filter_list); } - static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) { - unsigned char *b = skb->tail; - struct rtattr *rta; + struct nlattr *nest; - rta = (struct rtattr *) b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); - rta->rta_len = skb->tail - b; - return skb->len; + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + return nla_nest_end(skb, nest); -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } -static struct Qdisc_class_ops ingress_class_ops = { - .graft = ingress_graft, +static const struct Qdisc_class_ops ingress_class_ops = { .leaf = ingress_leaf, .get = ingress_get, .put = ingress_put, - .change = ingress_change, - .delete = NULL, .walk = ingress_walk, .tcf_chain = ingress_find_tcf, .bind_tcf = ingress_bind_filter, .unbind_tcf = ingress_put, - .dump = NULL, }; -static struct Qdisc_ops ingress_qdisc_ops = { - .next = NULL, +static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .cl_ops = &ingress_class_ops, .id = "ingress", .priv_size = 
sizeof(struct ingress_qdisc_data), .enqueue = ingress_enqueue, - .dequeue = ingress_dequeue, - .requeue = ingress_requeue, - .drop = ingress_drop, - .init = ingress_init, - .reset = ingress_reset, .destroy = ingress_destroy, - .change = NULL, .dump = ingress_dump, .owner = THIS_MODULE, }; static int __init ingress_module_init(void) { - int ret = 0; - - if ((ret = register_qdisc(&ingress_qdisc_ops)) < 0) { - printk("Unable to register Ingress qdisc\n"); - return ret; - } - - return ret; + return register_qdisc(&ingress_qdisc_ops); } -static void __exit ingress_module_exit(void) + +static void __exit ingress_module_exit(void) { unregister_qdisc(&ingress_qdisc_ops); -#ifndef CONFIG_NET_CLS_ACT -#ifdef CONFIG_NETFILTER - if (nf_registered) { - nf_unregister_hook(&ing_ops); - if (nf_registered > 1) - nf_unregister_hook(&ing6_ops); - } -#endif -#endif } + module_init(ingress_module_init) module_exit(ingress_module_exit) MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c new file mode 100644 index 00000000000..a8b2864a696 --- /dev/null +++ b/net/sched/sch_mq.c @@ -0,0 +1,248 @@ +/* + * net/sched/sch_mq.c Classful multiqueue dummy scheduler + * + * Copyright (c) 2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> + +struct mq_sched { + struct Qdisc **qdiscs; +}; + +static void mq_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct mq_sched *priv = qdisc_priv(sch); + unsigned int ntx; + + if (!priv->qdiscs) + return; + for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) + qdisc_destroy(priv->qdiscs[ntx]); + kfree(priv->qdiscs); +} + +static int mq_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct net_device *dev = qdisc_dev(sch); + struct mq_sched *priv = qdisc_priv(sch); + struct netdev_queue *dev_queue; + struct Qdisc *qdisc; + unsigned int ntx; + + if (sch->parent != TC_H_ROOT) + return -EOPNOTSUPP; + + if (!netif_is_multiqueue(dev)) + return -EOPNOTSUPP; + + /* pre-allocate qdiscs, attachment can't fail */ + priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), + GFP_KERNEL); + if (priv->qdiscs == NULL) + return -ENOMEM; + + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + dev_queue = netdev_get_tx_queue(dev, ntx); + qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(ntx + 1))); + if (qdisc == NULL) + goto err; + priv->qdiscs[ntx] = qdisc; + qdisc->flags |= TCQ_F_ONETXQUEUE; + } + + sch->flags |= TCQ_F_MQROOT; + return 0; + +err: + mq_destroy(sch); + return -ENOMEM; +} + +static void mq_attach(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct mq_sched *priv = qdisc_priv(sch); + struct Qdisc *qdisc, *old; + unsigned int ntx; + + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + qdisc = priv->qdiscs[ntx]; + old = dev_graft_qdisc(qdisc->dev_queue, qdisc); + if (old) + qdisc_destroy(old); +#ifdef CONFIG_NET_SCHED + if (ntx < dev->real_num_tx_queues) + qdisc_list_add(qdisc); +#endif + + } + kfree(priv->qdiscs); + priv->qdiscs = NULL; +} + +static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct 
net_device *dev = qdisc_dev(sch); + struct Qdisc *qdisc; + unsigned int ntx; + + sch->q.qlen = 0; + memset(&sch->bstats, 0, sizeof(sch->bstats)); + memset(&sch->qstats, 0, sizeof(sch->qstats)); + + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; + spin_lock_bh(qdisc_lock(qdisc)); + sch->q.qlen += qdisc->q.qlen; + sch->bstats.bytes += qdisc->bstats.bytes; + sch->bstats.packets += qdisc->bstats.packets; + sch->qstats.qlen += qdisc->qstats.qlen; + sch->qstats.backlog += qdisc->qstats.backlog; + sch->qstats.drops += qdisc->qstats.drops; + sch->qstats.requeues += qdisc->qstats.requeues; + sch->qstats.overlimits += qdisc->qstats.overlimits; + spin_unlock_bh(qdisc_lock(qdisc)); + } + return 0; +} + +static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned long ntx = cl - 1; + + if (ntx >= dev->num_tx_queues) + return NULL; + return netdev_get_tx_queue(dev, ntx); +} + +static struct netdev_queue *mq_select_queue(struct Qdisc *sch, + struct tcmsg *tcm) +{ + unsigned int ntx = TC_H_MIN(tcm->tcm_parent); + struct netdev_queue *dev_queue = mq_queue_get(sch, ntx); + + if (!dev_queue) { + struct net_device *dev = qdisc_dev(sch); + + return netdev_get_tx_queue(dev, 0); + } + return dev_queue; +} + +static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, + struct Qdisc **old) +{ + struct netdev_queue *dev_queue = mq_queue_get(sch, cl); + struct net_device *dev = qdisc_dev(sch); + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + *old = dev_graft_qdisc(dev_queue, new); + if (new) + new->flags |= TCQ_F_ONETXQUEUE; + if (dev->flags & IFF_UP) + dev_activate(dev); + return 0; +} + +static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl) +{ + struct netdev_queue *dev_queue = mq_queue_get(sch, cl); + + return dev_queue->qdisc_sleeping; +} + +static unsigned long mq_get(struct Qdisc *sch, u32 classid) +{ + unsigned int ntx = TC_H_MIN(classid); + + if (!mq_queue_get(sch, ntx)) + return 0; + return ntx; +} + +static void mq_put(struct Qdisc *sch, unsigned long cl) +{ +} + +static int mq_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct netdev_queue *dev_queue = mq_queue_get(sch, cl); + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle |= TC_H_MIN(cl); + tcm->tcm_info = dev_queue->qdisc_sleeping->handle; + return 0; +} + +static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct netdev_queue *dev_queue = mq_queue_get(sch, cl); + + sch = dev_queue->qdisc_sleeping; + sch->qstats.qlen = sch->q.qlen; + if (gnet_stats_copy_basic(d, &sch->bstats) < 0 || + gnet_stats_copy_queue(d, &sch->qstats) < 0) + return -1; + return 0; +} + +static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned int ntx; + + if (arg->stop) + return; + + arg->count = arg->skip; + for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { + if (arg->fn(sch, ntx + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops mq_class_ops = { + .select_queue = mq_select_queue, + .graft = mq_graft, + .leaf = mq_leaf, + .get = mq_get, + .put = mq_put, + .walk = mq_walk, + .dump = mq_dump_class, + .dump_stats = mq_dump_class_stats, +}; + +struct Qdisc_ops mq_qdisc_ops __read_mostly = { + .cl_ops = &mq_class_ops, + .id = "mq", + .priv_size = sizeof(struct mq_sched), + .init = mq_init, + 
.destroy = mq_destroy, + .attach = mq_attach, + .dump = mq_dump, + .owner = THIS_MODULE, +}; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c new file mode 100644 index 00000000000..6749e2f540d --- /dev/null +++ b/net/sched/sch_mqprio.c @@ -0,0 +1,426 @@ +/* + * net/sched/sch_mqprio.c + * + * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/module.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/sch_generic.h> + +struct mqprio_sched { + struct Qdisc **qdiscs; + int hw_owned; +}; + +static void mqprio_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct mqprio_sched *priv = qdisc_priv(sch); + unsigned int ntx; + + if (priv->qdiscs) { + for (ntx = 0; + ntx < dev->num_tx_queues && priv->qdiscs[ntx]; + ntx++) + qdisc_destroy(priv->qdiscs[ntx]); + kfree(priv->qdiscs); + } + + if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc) + dev->netdev_ops->ndo_setup_tc(dev, 0); + else + netdev_set_num_tc(dev, 0); +} + +static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) +{ + int i, j; + + /* Verify num_tc is not out of max range */ + if (qopt->num_tc > TC_MAX_QUEUE) + return -EINVAL; + + /* Verify priority mapping uses valid tcs */ + for (i = 0; i < TC_BITMASK + 1; i++) { + if (qopt->prio_tc_map[i] >= qopt->num_tc) + return -EINVAL; + } + + /* net_device does not support requested operation */ + if (qopt->hw && !dev->netdev_ops->ndo_setup_tc) + return -EINVAL; + + /* if hw owned qcount and qoffset are taken from LLD so + * no reason to verify them here + */ + if (qopt->hw) + return 0; + + for (i = 0; i < qopt->num_tc; i++) { + unsigned int last = qopt->offset[i] + qopt->count[i]; + + /* Verify the queue count is in tx range being equal to the + * real_num_tx_queues indicates the last queue is in use. 
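mqprio_parse_opt() validates the per-traffic-class queue ranges (continued just below): each {offset, count} range must lie inside real_num_tx_queues and the ranges must not overlap. The same checks as a hedged standalone sketch, with plain arrays instead of struct tc_mqprio_qopt and arbitrary example values:

#include <stdio.h>

/* Return 0 if the {offset,count} ranges form a valid, non-overlapping
 * partition of [0, real_num_tx_queues); -1 otherwise.
 */
static int validate_tc_queues(const unsigned int *offset,
			      const unsigned int *count,
			      int num_tc, unsigned int real_num_tx_queues)
{
	int i, j;

	for (i = 0; i < num_tc; i++) {
		unsigned int last = offset[i] + count[i];

		if (offset[i] >= real_num_tx_queues ||
		    !count[i] || last > real_num_tx_queues)
			return -1;

		/* Ranges must not overlap; like the kernel check, this
		 * effectively assumes offsets are given in ascending order.
		 */
		for (j = i + 1; j < num_tc; j++)
			if (last > offset[j])
				return -1;
	}
	return 0;
}

int main(void)
{
	/* e.g. 3 traffic classes over 8 tx queues: 0-3, 4-5, 6-7 */
	unsigned int offset[] = { 0, 4, 6 };
	unsigned int count[]  = { 4, 2, 2 };

	printf("valid: %s\n",
	       validate_tc_queues(offset, count, 3, 8) == 0 ? "yes" : "no");
	return 0;
}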
+ */ + if (qopt->offset[i] >= dev->real_num_tx_queues || + !qopt->count[i] || + last > dev->real_num_tx_queues) + return -EINVAL; + + /* Verify that the offset and counts do not overlap */ + for (j = i + 1; j < qopt->num_tc; j++) { + if (last > qopt->offset[j]) + return -EINVAL; + } + } + + return 0; +} + +static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct net_device *dev = qdisc_dev(sch); + struct mqprio_sched *priv = qdisc_priv(sch); + struct netdev_queue *dev_queue; + struct Qdisc *qdisc; + int i, err = -EOPNOTSUPP; + struct tc_mqprio_qopt *qopt = NULL; + + BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE); + BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK); + + if (sch->parent != TC_H_ROOT) + return -EOPNOTSUPP; + + if (!netif_is_multiqueue(dev)) + return -EOPNOTSUPP; + + if (!opt || nla_len(opt) < sizeof(*qopt)) + return -EINVAL; + + qopt = nla_data(opt); + if (mqprio_parse_opt(dev, qopt)) + return -EINVAL; + + /* pre-allocate qdisc, attachment can't fail */ + priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), + GFP_KERNEL); + if (priv->qdiscs == NULL) { + err = -ENOMEM; + goto err; + } + + for (i = 0; i < dev->num_tx_queues; i++) { + dev_queue = netdev_get_tx_queue(dev, i); + qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(i + 1))); + if (qdisc == NULL) { + err = -ENOMEM; + goto err; + } + priv->qdiscs[i] = qdisc; + qdisc->flags |= TCQ_F_ONETXQUEUE; + } + + /* If the mqprio options indicate that hardware should own + * the queue mapping then run ndo_setup_tc otherwise use the + * supplied and verified mapping + */ + if (qopt->hw) { + priv->hw_owned = 1; + err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc); + if (err) + goto err; + } else { + netdev_set_num_tc(dev, qopt->num_tc); + for (i = 0; i < qopt->num_tc; i++) + netdev_set_tc_queue(dev, i, + qopt->count[i], qopt->offset[i]); + } + + /* Always use supplied priority mappings */ + for (i = 0; i < TC_BITMASK + 1; i++) + netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]); + + sch->flags |= TCQ_F_MQROOT; + return 0; + +err: + mqprio_destroy(sch); + return err; +} + +static void mqprio_attach(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct mqprio_sched *priv = qdisc_priv(sch); + struct Qdisc *qdisc, *old; + unsigned int ntx; + + /* Attach underlying qdisc */ + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + qdisc = priv->qdiscs[ntx]; + old = dev_graft_qdisc(qdisc->dev_queue, qdisc); + if (old) + qdisc_destroy(old); + if (ntx < dev->real_num_tx_queues) + qdisc_list_add(qdisc); + } + kfree(priv->qdiscs); + priv->qdiscs = NULL; +} + +static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch, + unsigned long cl) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned long ntx = cl - 1 - netdev_get_num_tc(dev); + + if (ntx >= dev->num_tx_queues) + return NULL; + return netdev_get_tx_queue(dev, ntx); +} + +static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, + struct Qdisc **old) +{ + struct net_device *dev = qdisc_dev(sch); + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + + if (!dev_queue) + return -EINVAL; + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + *old = dev_graft_qdisc(dev_queue, new); + + if (new) + new->flags |= TCQ_F_ONETXQUEUE; + + if (dev->flags & IFF_UP) + dev_activate(dev); + + return 0; +} + +static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct net_device *dev = qdisc_dev(sch); + struct mqprio_sched *priv = qdisc_priv(sch); + 
unsigned char *b = skb_tail_pointer(skb); + struct tc_mqprio_qopt opt = { 0 }; + struct Qdisc *qdisc; + unsigned int i; + + sch->q.qlen = 0; + memset(&sch->bstats, 0, sizeof(sch->bstats)); + memset(&sch->qstats, 0, sizeof(sch->qstats)); + + for (i = 0; i < dev->num_tx_queues; i++) { + qdisc = netdev_get_tx_queue(dev, i)->qdisc; + spin_lock_bh(qdisc_lock(qdisc)); + sch->q.qlen += qdisc->q.qlen; + sch->bstats.bytes += qdisc->bstats.bytes; + sch->bstats.packets += qdisc->bstats.packets; + sch->qstats.qlen += qdisc->qstats.qlen; + sch->qstats.backlog += qdisc->qstats.backlog; + sch->qstats.drops += qdisc->qstats.drops; + sch->qstats.requeues += qdisc->qstats.requeues; + sch->qstats.overlimits += qdisc->qstats.overlimits; + spin_unlock_bh(qdisc_lock(qdisc)); + } + + opt.num_tc = netdev_get_num_tc(dev); + memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); + opt.hw = priv->hw_owned; + + for (i = 0; i < netdev_get_num_tc(dev); i++) { + opt.count[i] = dev->tc_to_txq[i].count; + opt.offset[i] = dev->tc_to_txq[i].offset; + } + + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; + + return skb->len; +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl) +{ + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + + if (!dev_queue) + return NULL; + + return dev_queue->qdisc_sleeping; +} + +static unsigned long mqprio_get(struct Qdisc *sch, u32 classid) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned int ntx = TC_H_MIN(classid); + + if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev)) + return 0; + return ntx; +} + +static void mqprio_put(struct Qdisc *sch, unsigned long cl) +{ +} + +static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct net_device *dev = qdisc_dev(sch); + + if (cl <= netdev_get_num_tc(dev)) { + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_info = 0; + } else { + int i; + struct netdev_queue *dev_queue; + + dev_queue = mqprio_queue_get(sch, cl); + tcm->tcm_parent = 0; + for (i = 0; i < netdev_get_num_tc(dev); i++) { + struct netdev_tc_txq tc = dev->tc_to_txq[i]; + int q_idx = cl - netdev_get_num_tc(dev); + + if (q_idx > tc.offset && + q_idx <= tc.offset + tc.count) { + tcm->tcm_parent = + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(i + 1)); + break; + } + } + tcm->tcm_info = dev_queue->qdisc_sleeping->handle; + } + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) + __releases(d->lock) + __acquires(d->lock) +{ + struct net_device *dev = qdisc_dev(sch); + + if (cl <= netdev_get_num_tc(dev)) { + int i; + struct Qdisc *qdisc; + struct gnet_stats_queue qstats = {0}; + struct gnet_stats_basic_packed bstats = {0}; + struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1]; + + /* Drop lock here it will be reclaimed before touching + * statistics this is required because the d->lock we + * hold here is the look on dev_queue->qdisc_sleeping + * also acquired below. 
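mqprio_dump() and mqprio_dump_class() above expose the two tables that drive this qdisc: prio_tc_map (priority to traffic class) and tc_to_txq (traffic class to a queue range). Resolving a packet priority to its tx queue range is just two lookups; a hedged sketch with hypothetical example values:

#include <stdio.h>

#define TC_BITMASK 15

struct tc_txq { unsigned int count, offset; };

int main(void)
{
	/* Hypothetical setup for 3 traffic classes over 8 queues:
	 * priorities 0-1 -> tc0, 2-3 -> tc1, everything else -> tc2.
	 */
	unsigned char prio_tc_map[TC_BITMASK + 1] = {
		0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
	};
	struct tc_txq tc_to_txq[3] = {
		{ .count = 4, .offset = 0 },	/* tc0 -> queues 0-3 */
		{ .count = 2, .offset = 4 },	/* tc1 -> queues 4-5 */
		{ .count = 2, .offset = 6 },	/* tc2 -> queues 6-7 */
	};
	unsigned int prio;

	for (prio = 0; prio <= 7; prio++) {
		unsigned int tc = prio_tc_map[prio & TC_BITMASK];
		struct tc_txq *t = &tc_to_txq[tc];

		printf("prio %u -> tc %u -> queues %u..%u\n",
		       prio, tc, t->offset, t->offset + t->count - 1);
	}
	return 0;
}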
+ */ + spin_unlock_bh(d->lock); + + for (i = tc.offset; i < tc.offset + tc.count; i++) { + qdisc = netdev_get_tx_queue(dev, i)->qdisc; + spin_lock_bh(qdisc_lock(qdisc)); + bstats.bytes += qdisc->bstats.bytes; + bstats.packets += qdisc->bstats.packets; + qstats.qlen += qdisc->qstats.qlen; + qstats.backlog += qdisc->qstats.backlog; + qstats.drops += qdisc->qstats.drops; + qstats.requeues += qdisc->qstats.requeues; + qstats.overlimits += qdisc->qstats.overlimits; + spin_unlock_bh(qdisc_lock(qdisc)); + } + /* Reclaim root sleeping lock before completing stats */ + spin_lock_bh(d->lock); + if (gnet_stats_copy_basic(d, &bstats) < 0 || + gnet_stats_copy_queue(d, &qstats) < 0) + return -1; + } else { + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + + sch = dev_queue->qdisc_sleeping; + sch->qstats.qlen = sch->q.qlen; + if (gnet_stats_copy_basic(d, &sch->bstats) < 0 || + gnet_stats_copy_queue(d, &sch->qstats) < 0) + return -1; + } + return 0; +} + +static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned long ntx; + + if (arg->stop) + return; + + /* Walk hierarchy with a virtual class per tc */ + arg->count = arg->skip; + for (ntx = arg->skip; + ntx < dev->num_tx_queues + netdev_get_num_tc(dev); + ntx++) { + if (arg->fn(sch, ntx + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops mqprio_class_ops = { + .graft = mqprio_graft, + .leaf = mqprio_leaf, + .get = mqprio_get, + .put = mqprio_put, + .walk = mqprio_walk, + .dump = mqprio_dump_class, + .dump_stats = mqprio_dump_class_stats, +}; + +static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = { + .cl_ops = &mqprio_class_ops, + .id = "mqprio", + .priv_size = sizeof(struct mqprio_sched), + .init = mqprio_init, + .destroy = mqprio_destroy, + .attach = mqprio_attach, + .dump = mqprio_dump, + .owner = THIS_MODULE, +}; + +static int __init mqprio_module_init(void) +{ + return register_qdisc(&mqprio_qdisc_ops); +} + +static void __exit mqprio_module_exit(void) +{ + unregister_qdisc(&mqprio_qdisc_ops); +} + +module_init(mqprio_module_init); +module_exit(mqprio_module_exit); + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c new file mode 100644 index 00000000000..afb050a735f --- /dev/null +++ b/net/sched/sch_multiq.c @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2008, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, see <http://www.gnu.org/licenses/>. 
+ * + * Author: Alexander Duyck <alexander.h.duyck@intel.com> + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> + + +struct multiq_sched_data { + u16 bands; + u16 max_bands; + u16 curband; + struct tcf_proto *filter_list; + struct Qdisc **queues; +}; + + +static struct Qdisc * +multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + u32 band; + struct tcf_result res; + int err; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + err = tc_classify(skb, q->filter_list, &res); +#ifdef CONFIG_NET_CLS_ACT + switch (err) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NULL; + } +#endif + band = skb_get_queue_mapping(skb); + + if (band >= q->bands) + return q->queues[0]; + + return q->queues[band]; +} + +static int +multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct Qdisc *qdisc; + int ret; + + qdisc = multiq_classify(skb, sch, &ret); +#ifdef CONFIG_NET_CLS_ACT + if (qdisc == NULL) { + + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } +#endif + + ret = qdisc_enqueue(skb, qdisc); + if (ret == NET_XMIT_SUCCESS) { + sch->q.qlen++; + return NET_XMIT_SUCCESS; + } + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + return ret; +} + +static struct sk_buff *multiq_dequeue(struct Qdisc *sch) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc; + struct sk_buff *skb; + int band; + + for (band = 0; band < q->bands; band++) { + /* cycle through bands to ensure fairness */ + q->curband++; + if (q->curband >= q->bands) + q->curband = 0; + + /* Check that target subqueue is available before + * pulling an skb to avoid head-of-line blocking. + */ + if (!netif_xmit_stopped( + netdev_get_tx_queue(qdisc_dev(sch), q->curband))) { + qdisc = q->queues[q->curband]; + skb = qdisc->dequeue(qdisc); + if (skb) { + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + return skb; + } + } + } + return NULL; + +} + +static struct sk_buff *multiq_peek(struct Qdisc *sch) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned int curband = q->curband; + struct Qdisc *qdisc; + struct sk_buff *skb; + int band; + + for (band = 0; band < q->bands; band++) { + /* cycle through bands to ensure fairness */ + curband++; + if (curband >= q->bands) + curband = 0; + + /* Check that target subqueue is available before + * pulling an skb to avoid head-of-line blocking. 
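multiq_dequeue() above advances curband round-robin and skips any band whose hardware tx queue is stopped, so one stalled subqueue cannot head-of-line block the others. A minimal userspace sketch of that selection loop, with a boolean array standing in for netif_xmit_stopped():

#include <stdbool.h>
#include <stdio.h>

#define BANDS 4

/* Pick the next band to service, starting after *curband and skipping
 * stopped bands; returns -1 if every band is stopped.
 */
static int pick_band(unsigned int *curband, const bool *stopped)
{
	int tries;

	for (tries = 0; tries < BANDS; tries++) {
		*curband = (*curband + 1) % BANDS;
		if (!stopped[*curband])
			return *curband;
	}
	return -1;
}

int main(void)
{
	bool stopped[BANDS] = { false, true, false, false };
	unsigned int curband = 0;
	int i;

	/* Band 1 is stalled; the other bands keep being served round-robin. */
	for (i = 0; i < 6; i++)
		printf("dequeue from band %d\n", pick_band(&curband, stopped));
	return 0;
}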
+ */ + if (!netif_xmit_stopped( + netdev_get_tx_queue(qdisc_dev(sch), curband))) { + qdisc = q->queues[curband]; + skb = qdisc->ops->peek(qdisc); + if (skb) + return skb; + } + } + return NULL; + +} + +static unsigned int multiq_drop(struct Qdisc *sch) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + int band; + unsigned int len; + struct Qdisc *qdisc; + + for (band = q->bands - 1; band >= 0; band--) { + qdisc = q->queues[band]; + if (qdisc->ops->drop) { + len = qdisc->ops->drop(qdisc); + if (len != 0) { + sch->q.qlen--; + return len; + } + } + } + return 0; +} + + +static void +multiq_reset(struct Qdisc *sch) +{ + u16 band; + struct multiq_sched_data *q = qdisc_priv(sch); + + for (band = 0; band < q->bands; band++) + qdisc_reset(q->queues[band]); + sch->q.qlen = 0; + q->curband = 0; +} + +static void +multiq_destroy(struct Qdisc *sch) +{ + int band; + struct multiq_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + for (band = 0; band < q->bands; band++) + qdisc_destroy(q->queues[band]); + + kfree(q->queues); +} + +static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + struct tc_multiq_qopt *qopt; + int i; + + if (!netif_is_multiqueue(qdisc_dev(sch))) + return -EOPNOTSUPP; + if (nla_len(opt) < sizeof(*qopt)) + return -EINVAL; + + qopt = nla_data(opt); + + qopt->bands = qdisc_dev(sch)->real_num_tx_queues; + + sch_tree_lock(sch); + q->bands = qopt->bands; + for (i = q->bands; i < q->max_bands; i++) { + if (q->queues[i] != &noop_qdisc) { + struct Qdisc *child = q->queues[i]; + q->queues[i] = &noop_qdisc; + qdisc_tree_decrease_qlen(child, child->q.qlen); + qdisc_destroy(child); + } + } + + sch_tree_unlock(sch); + + for (i = 0; i < q->bands; i++) { + if (q->queues[i] == &noop_qdisc) { + struct Qdisc *child, *old; + child = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, + i + 1)); + if (child) { + sch_tree_lock(sch); + old = q->queues[i]; + q->queues[i] = child; + + if (old != &noop_qdisc) { + qdisc_tree_decrease_qlen(old, + old->q.qlen); + qdisc_destroy(old); + } + sch_tree_unlock(sch); + } + } + } + return 0; +} + +static int multiq_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + int i, err; + + q->queues = NULL; + + if (opt == NULL) + return -EINVAL; + + q->max_bands = qdisc_dev(sch)->num_tx_queues; + + q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL); + if (!q->queues) + return -ENOBUFS; + for (i = 0; i < q->max_bands; i++) + q->queues[i] = &noop_qdisc; + + err = multiq_tune(sch, opt); + + if (err) + kfree(q->queues); + + return err; +} + +static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb_tail_pointer(skb); + struct tc_multiq_qopt opt; + + opt.bands = q->bands; + opt.max_bands = q->max_bands; + + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->queues[band]; + q->queues[band] = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc * +multiq_leaf(struct Qdisc *sch, 
unsigned long arg) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + return q->queues[band]; +} + +static unsigned long multiq_get(struct Qdisc *sch, u32 classid) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned long band = TC_H_MIN(classid); + + if (band - 1 >= q->bands) + return 0; + return band; +} + +static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return multiq_get(sch, classid); +} + + +static void multiq_put(struct Qdisc *q, unsigned long cl) +{ +} + +static int multiq_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + + tcm->tcm_handle |= TC_H_MIN(cl); + tcm->tcm_info = q->queues[cl - 1]->handle; + return 0; +} + +static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + struct Qdisc *cl_q; + + cl_q = q->queues[cl - 1]; + cl_q->qstats.qlen = cl_q->q.qlen; + if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 || + gnet_stats_copy_queue(d, &cl_q->qstats) < 0) + return -1; + + return 0; +} + +static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + int band; + + if (arg->stop) + return; + + for (band = 0; band < q->bands; band++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, band + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_proto **multiq_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static const struct Qdisc_class_ops multiq_class_ops = { + .graft = multiq_graft, + .leaf = multiq_leaf, + .get = multiq_get, + .put = multiq_put, + .walk = multiq_walk, + .tcf_chain = multiq_find_tcf, + .bind_tcf = multiq_bind, + .unbind_tcf = multiq_put, + .dump = multiq_dump_class, + .dump_stats = multiq_dump_class_stats, +}; + +static struct Qdisc_ops multiq_qdisc_ops __read_mostly = { + .next = NULL, + .cl_ops = &multiq_class_ops, + .id = "multiq", + .priv_size = sizeof(struct multiq_sched_data), + .enqueue = multiq_enqueue, + .dequeue = multiq_dequeue, + .peek = multiq_peek, + .drop = multiq_drop, + .init = multiq_init, + .reset = multiq_reset, + .destroy = multiq_destroy, + .change = multiq_tune, + .dump = multiq_dump, + .owner = THIS_MODULE, +}; + +static int __init multiq_module_init(void) +{ + return register_qdisc(&multiq_qdisc_ops); +} + +static void __exit multiq_module_exit(void) +{ + unregister_qdisc(&multiq_qdisc_ops); +} + +module_init(multiq_module_init) +module_exit(multiq_module_exit) + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index ba528320483..111d70fddae 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -4,28 +4,32 @@ * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. + * 2 of the License. * * Many of the algorithms and ideas for this came from - * NIST Net which is not copyrighted. + * NIST Net which is not copyrighted. 
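The walk callbacks in this diff (htb_walk, mq_walk, mqprio_walk and multiq_walk above) all follow the same qdisc_walker contract: skip the first arg->skip classes, invoke arg->fn for the rest, stop when fn returns a negative value, and keep arg->count in step so an interrupted dump can be resumed. A small sketch of the same iteration pattern over a plain array:

#include <stdio.h>

struct walker {
	int stop;
	int skip;
	int count;
	int (*fn)(unsigned long classid, struct walker *w);
};

static void walk(const unsigned long *classids, int n, struct walker *w)
{
	int i;

	if (w->stop)
		return;
	for (i = 0; i < n; i++) {
		if (w->count < w->skip) {	/* already dumped last time */
			w->count++;
			continue;
		}
		if (w->fn(classids[i], w) < 0) {
			w->stop = 1;
			break;
		}
		w->count++;
	}
}

static int print_class(unsigned long classid, struct walker *w)
{
	printf("class %lu\n", classid);
	return 0;
}

int main(void)
{
	unsigned long classids[] = { 1, 2, 3, 4, 5 };
	struct walker w = { .skip = 2, .fn = print_class };

	walk(classids, 5, &w);	/* prints classes 3, 4 and 5 */
	return 0;
}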
* * Authors: Stephen Hemminger <shemminger@osdl.org> * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro> */ -#include <linux/config.h> +#include <linux/mm.h> #include <linux/module.h> -#include <linux/bitops.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/vmalloc.h> #include <linux/rtnetlink.h> +#include <linux/reciprocal_div.h> +#include <linux/rbtree.h> +#include <net/netlink.h> #include <net/pkt_sched.h> +#include <net/inet_ecn.h> -#define VERSION "1.2" +#define VERSION "1.3" /* Network Emulation Queuing algorithm. ==================================== @@ -49,77 +53,285 @@ control either since that can be handled by using token bucket or other rate control. - The simulator is limited by the Linux timer resolution - and will create packet bursts on the HZ boundary (1ms). + Correlated Loss Generator models + + Added generation of correlated loss according to the + "Gilbert-Elliot" model, a 4-state markov model. + + References: + [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG + [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general + and intuitive loss model for packet networks and its implementation + in the Netem module in the Linux kernel", available in [1] + + Authors: Stefano Salsano <stefano.salsano at uniroma2.it + Fabio Ludovici <fabio.ludovici at yahoo.it> */ struct netem_sched_data { + /* internal t(ime)fifo qdisc uses t_root and sch->limit */ + struct rb_root t_root; + + /* optional qdisc for classful handling (NULL at netem init) */ struct Qdisc *qdisc; - struct timer_list timer; - u32 latency; + struct qdisc_watchdog watchdog; + + psched_tdiff_t latency; + psched_tdiff_t jitter; + u32 loss; + u32 ecn; u32 limit; u32 counter; u32 gap; - u32 jitter; u32 duplicate; u32 reorder; u32 corrupt; + u64 rate; + s32 packet_overhead; + u32 cell_size; + struct reciprocal_value cell_size_reciprocal; + s32 cell_overhead; struct crndstate { - unsigned long last; - unsigned long rho; + u32 last; + u32 rho; } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; struct disttable { u32 size; s16 table[0]; } *delay_dist; + + enum { + CLG_RANDOM, + CLG_4_STATES, + CLG_GILB_ELL, + } loss_model; + + enum { + TX_IN_GAP_PERIOD = 1, + TX_IN_BURST_PERIOD, + LOST_IN_GAP_PERIOD, + LOST_IN_BURST_PERIOD, + } _4_state_model; + + enum { + GOOD_STATE = 1, + BAD_STATE, + } GE_state_model; + + /* Correlated Loss Generation models */ + struct clgstate { + /* state of the Markov chain */ + u8 state; + + /* 4-states and Gilbert-Elliot models */ + u32 a1; /* p13 for 4-states or p for GE */ + u32 a2; /* p31 for 4-states or r for GE */ + u32 a3; /* p32 for 4-states or h for GE */ + u32 a4; /* p14 for 4-states or 1-k for GE */ + u32 a5; /* p23 used only in 4-states */ + } clg; + }; -/* Time stamp put into socket buffer control block */ +/* Time stamp put into socket buffer control block + * Only valid when skbs are in our internal t(ime)fifo queue. + */ struct netem_skb_cb { psched_time_t time_to_send; + ktime_t tstamp_save; }; +/* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp + * to hold a rb_node structure. + * + * If struct sk_buff layout is changed, the following checks will complain. 
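netem stores its rb_node by overlaying skb->next, skb->prev and skb->tstamp, and guards that trick with the BUILD_BUG_ON() offset checks that follow, so any sk_buff layout change breaks the build instead of corrupting memory. The same compile-time technique in standalone C11, using _Static_assert and hypothetical struct names:

#include <stddef.h>
#include <stdio.h>

struct node { struct node *left, *right; };	/* what we want to store */

struct buf {					/* hypothetical host struct */
	struct buf *next;
	struct buf *prev;
	long long tstamp;
	char payload[64];
};

/* Compile-time proof that next/prev/tstamp are adjacent and big enough
 * to hold a struct node, mirroring the BUILD_BUG_ON() checks below.
 */
_Static_assert(offsetof(struct buf, next) == 0, "next must be first");
_Static_assert(offsetof(struct buf, prev) ==
	       offsetof(struct buf, next) + sizeof(((struct buf *)0)->next),
	       "prev must follow next");
_Static_assert(offsetof(struct buf, tstamp) ==
	       offsetof(struct buf, prev) + sizeof(((struct buf *)0)->prev),
	       "tstamp must follow prev");
_Static_assert(sizeof(struct node) <=
	       sizeof(((struct buf *)0)->next) +
	       sizeof(((struct buf *)0)->prev) +
	       sizeof(((struct buf *)0)->tstamp),
	       "node must fit in the overlaid fields");

static struct node *buf_node(struct buf *b)
{
	return (struct node *)&b->next;		/* reuse the three fields */
}

int main(void)
{
	struct buf b;

	buf_node(&b)->left = buf_node(&b)->right = NULL;
	printf("node overlay at %p, buf at %p\n",
	       (void *)buf_node(&b), (void *)&b);
	return 0;
}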
+ */ +static struct rb_node *netem_rb_node(struct sk_buff *skb) +{ + BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0); + BUILD_BUG_ON(offsetof(struct sk_buff, prev) != + offsetof(struct sk_buff, next) + sizeof(skb->next)); + BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) != + offsetof(struct sk_buff, prev) + sizeof(skb->prev)); + BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) + + sizeof(skb->prev) + + sizeof(skb->tstamp)); + return (struct rb_node *)&skb->next; +} + +static struct sk_buff *netem_rb_to_skb(struct rb_node *rb) +{ + return (struct sk_buff *)rb; +} + +static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) +{ + /* we assume we can use skb next/prev/tstamp as storage for rb_node */ + qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb)); + return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data; +} + /* init_crandom - initialize correlated random number generator * Use entropy source for initial seed. */ static void init_crandom(struct crndstate *state, unsigned long rho) { state->rho = rho; - state->last = net_random(); + state->last = prandom_u32(); } /* get_crandom - correlated random number generator * Next number depends on last value. * rho is scaled to avoid floating point. */ -static unsigned long get_crandom(struct crndstate *state) +static u32 get_crandom(struct crndstate *state) { u64 value, rho; unsigned long answer; - if (state->rho == 0) /* no correllation */ - return net_random(); + if (state->rho == 0) /* no correlation */ + return prandom_u32(); - value = net_random(); + value = prandom_u32(); rho = (u64)state->rho + 1; answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; state->last = answer; return answer; } +/* loss_4state - 4-state model loss generator + * Generates losses according to the 4-state Markov chain adopted in + * the GI (General and Intuitive) loss model. + */ +static bool loss_4state(struct netem_sched_data *q) +{ + struct clgstate *clg = &q->clg; + u32 rnd = prandom_u32(); + + /* + * Makes a comparison between rnd and the transition + * probabilities outgoing from the current state, then decides the + * next state and if the next packet has to be transmitted or lost. 
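Both correlated loss generators (the 4-state chain whose switch follows, and loss_gilb_ell()) are small Markov chains stepped once per packet. As a feel for the behaviour, here is a hedged userspace simulation of the simpler two-state Gilbert-Elliot variant, with probabilities as doubles instead of the kernel's scaled u32 values and arbitrary example parameters:

#include <stdio.h>
#include <stdlib.h>

/* Two-state Gilbert-Elliot loss generator, mirroring loss_gilb_ell():
 * p   = P(good -> bad), r = P(bad -> good),
 * 1-k = loss probability in the good state, 1-h = loss prob in the bad state.
 */
struct gilbert { int bad; double p, r, k, h; };

static double frand(void) { return rand() / (double)RAND_MAX; }

static int ge_lose_packet(struct gilbert *g)
{
	if (g->bad) {
		if (frand() < g->r)
			g->bad = 0;
		return frand() > g->h;	/* lost with probability 1-h */
	}
	if (frand() < g->p)
		g->bad = 1;
	return frand() > g->k;		/* lost with probability 1-k */
}

int main(void)
{
	struct gilbert g = { .bad = 0, .p = 0.01, .r = 0.30, .k = 0.999, .h = 0.70 };
	long i, lost = 0, n = 1000000;
	double pbad = g.p / (g.p + g.r);
	double expect = (1.0 - pbad) * (1.0 - g.k) + pbad * (1.0 - g.h);

	for (i = 0; i < n; i++)
		lost += ge_lose_packet(&g);

	printf("simulated loss %.4f, expected ~%.4f\n", (double)lost / n, expect);
	return 0;
}

Losses come in bursts while the chain sits in the bad state, which is exactly what a single uncorrelated drop probability cannot reproduce.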
+ * The four states correspond to: + * TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period + * LOST_IN_BURST_PERIOD => isolated losses within a gap period + * LOST_IN_GAP_PERIOD => lost packets within a burst period + * TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period + */ + switch (clg->state) { + case TX_IN_GAP_PERIOD: + if (rnd < clg->a4) { + clg->state = LOST_IN_BURST_PERIOD; + return true; + } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) { + clg->state = LOST_IN_GAP_PERIOD; + return true; + } else if (clg->a1 + clg->a4 < rnd) { + clg->state = TX_IN_GAP_PERIOD; + } + + break; + case TX_IN_BURST_PERIOD: + if (rnd < clg->a5) { + clg->state = LOST_IN_GAP_PERIOD; + return true; + } else { + clg->state = TX_IN_BURST_PERIOD; + } + + break; + case LOST_IN_GAP_PERIOD: + if (rnd < clg->a3) + clg->state = TX_IN_BURST_PERIOD; + else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) { + clg->state = TX_IN_GAP_PERIOD; + } else if (clg->a2 + clg->a3 < rnd) { + clg->state = LOST_IN_GAP_PERIOD; + return true; + } + break; + case LOST_IN_BURST_PERIOD: + clg->state = TX_IN_GAP_PERIOD; + break; + } + + return false; +} + +/* loss_gilb_ell - Gilbert-Elliot model loss generator + * Generates losses according to the Gilbert-Elliot loss model or + * its special cases (Gilbert or Simple Gilbert) + * + * Makes a comparison between random number and the transition + * probabilities outgoing from the current state, then decides the + * next state. A second random number is extracted and the comparison + * with the loss probability of the current state decides if the next + * packet will be transmitted or lost. + */ +static bool loss_gilb_ell(struct netem_sched_data *q) +{ + struct clgstate *clg = &q->clg; + + switch (clg->state) { + case GOOD_STATE: + if (prandom_u32() < clg->a1) + clg->state = BAD_STATE; + if (prandom_u32() < clg->a4) + return true; + break; + case BAD_STATE: + if (prandom_u32() < clg->a2) + clg->state = GOOD_STATE; + if (prandom_u32() > clg->a3) + return true; + } + + return false; +} + +static bool loss_event(struct netem_sched_data *q) +{ + switch (q->loss_model) { + case CLG_RANDOM: + /* Random packet drop 0 => none, ~0 => all */ + return q->loss && q->loss >= get_crandom(&q->loss_cor); + + case CLG_4_STATES: + /* 4state loss model algorithm (used also for GI model) + * Extracts a value from the markov 4 state loss generator, + * if it is 1 drops a packet and if needed writes the event in + * the kernel logs + */ + return loss_4state(q); + + case CLG_GILB_ELL: + /* Gilbert-Elliot loss model algorithm + * Extracts a value from the Gilbert-Elliot loss generator, + * if it is 1 drops a packet and if needed writes the event in + * the kernel logs + */ + return loss_gilb_ell(q); + } + + return false; /* not reached */ +} + + /* tabledist - return a pseudo-randomly distributed value with mean mu and * std deviation sigma. Uses table lookup to approximate the desired * distribution, and a uniformly-distributed pseudo-random source. 
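In the table-less default case (just below), tabledist() falls back to spreading values uniformly over +/- sigma around mu, i.e. (rnd % (2*sigma)) - sigma + mu, so the measured standard deviation of that branch is sigma/sqrt(3) rather than sigma. A quick standalone check of the default branch:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Default (table-less) branch of tabledist(): uniform in [mu-sigma, mu+sigma). */
static long tabledist_uniform(long mu, long sigma)
{
	if (sigma == 0)
		return mu;
	return (rand() % (2 * sigma)) - sigma + mu;
}

int main(void)
{
	const long mu = 100000, sigma = 10000;	/* e.g. 100ms +/- 10ms, in us */
	const long n = 1000000;
	double sum = 0.0, sumsq = 0.0;
	long i;

	for (i = 0; i < n; i++) {
		long v = tabledist_uniform(mu, sigma);

		sum += v;
		sumsq += (double)v * v;
	}
	printf("mean %.0f (expect ~%ld), stddev %.0f (expect ~%.0f)\n",
	       sum / n, mu,
	       sqrt(sumsq / n - (sum / n) * (sum / n)), sigma / sqrt(3.0));
	return 0;
}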
*/ -static long tabledist(unsigned long mu, long sigma, - struct crndstate *state, const struct disttable *dist) +static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma, + struct crndstate *state, + const struct disttable *dist) { - long t, x; - unsigned long rnd; + psched_tdiff_t x; + long t; + u32 rnd; if (sigma == 0) return mu; @@ -127,7 +339,7 @@ static long tabledist(unsigned long mu, long sigma, rnd = get_crandom(state); /* default uniform distribution */ - if (dist == NULL) + if (dist == NULL) return (rnd % (2*sigma)) - sigma + mu; t = dist->table[rnd % dist->size]; @@ -140,6 +352,62 @@ static long tabledist(unsigned long mu, long sigma, return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; } +static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q) +{ + u64 ticks; + + len += q->packet_overhead; + + if (q->cell_size) { + u32 cells = reciprocal_divide(len, q->cell_size_reciprocal); + + if (len > cells * q->cell_size) /* extra cell needed for remainder */ + cells++; + len = cells * (q->cell_size + q->cell_overhead); + } + + ticks = (u64)len * NSEC_PER_SEC; + + do_div(ticks, q->rate); + return PSCHED_NS2TICKS(ticks); +} + +static void tfifo_reset(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct rb_node *p; + + while ((p = rb_first(&q->t_root))) { + struct sk_buff *skb = netem_rb_to_skb(p); + + rb_erase(p, &q->t_root); + skb->next = NULL; + skb->prev = NULL; + kfree_skb(skb); + } +} + +static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + psched_time_t tnext = netem_skb_cb(nskb)->time_to_send; + struct rb_node **p = &q->t_root.rb_node, *parent = NULL; + + while (*p) { + struct sk_buff *skb; + + parent = *p; + skb = netem_rb_to_skb(parent); + if (tnext >= netem_skb_cb(skb)->time_to_send) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(netem_rb_node(nskb), parent, p); + rb_insert_color(netem_rb_node(nskb), &q->t_root); + sch->q.qlen++; +} + /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. @@ -149,38 +417,45 @@ static long tabledist(unsigned long mu, long sigma, static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); - struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb; + /* We don't fill cb now as skb_unshare() may invalidate it */ + struct netem_skb_cb *cb; struct sk_buff *skb2; - int ret; int count = 1; - pr_debug("netem_enqueue skb=%p\n", skb); - /* Random duplication */ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) ++count; - /* Random packet drop 0 => none, ~0 => all */ - if (q->loss && q->loss >= get_crandom(&q->loss_cor)) - --count; - + /* Drop packet? */ + if (loss_event(q)) { + if (q->ecn && INET_ECN_set_ce(skb)) + sch->qstats.drops++; /* mark packet */ + else + --count; + } if (count == 0) { sch->qstats.drops++; kfree_skb(skb); - return NET_XMIT_DROP; + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } + /* If a delay is expected, orphan the skb. (orphaning usually takes + * place at TX completion time, so _before_ the link transit delay) + */ + if (q->latency || q->jitter) + skb_orphan_partial(skb); + /* * If we need to duplicate packet, then re-insert at top of the * qdisc tree, since parent queuer expects that only one * skb will be queued. 
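packet_len_2_sched_time() above turns the configured rate into a per-packet serialization delay: add the per-packet overhead, optionally round the length up to whole link-layer cells (ATM-style framing, for instance), then divide by the byte rate. A user-space sketch of the same calculation in plain nanoseconds; the reciprocal-divide optimization and the PSCHED tick conversion are left out, and the numbers in main() are only illustrative:

#include <stdint.h>
#include <stdio.h>

/* Return the transmission time in nanoseconds for one packet.
 * rate is in bytes per second, mirroring q->rate in the code above. */
static uint64_t tx_time_ns(unsigned int len, uint64_t rate,
			   int packet_overhead, unsigned int cell_size,
			   int cell_overhead)
{
	len += packet_overhead;

	if (cell_size) {
		/* round up to whole cells, each carrying its own overhead */
		unsigned int cells = (len + cell_size - 1) / cell_size;
		len = cells * (cell_size + cell_overhead);
	}

	return (uint64_t)len * 1000000000ull / rate;
}

int main(void)
{
	/* 1500-byte packet on a 1 Mbyte/s link carried in 48+5 byte cells */
	printf("%llu ns\n",
	       (unsigned long long)tx_time_ns(1500, 1000000, 0, 48, 5));
	return 0;
}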
*/ if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { - struct Qdisc *rootq = sch->dev->qdisc; + struct Qdisc *rootq = qdisc_root(sch); u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ q->duplicate = 0; - rootq->enqueue(skb2, rootq); + qdisc_enqueue_root(skb2, rootq); q->duplicate = dupsave; } @@ -191,73 +466,99 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) * do it now in software before we mangle it. */ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { - if (!(skb = skb_unshare(skb, GFP_ATOMIC)) - || (skb->ip_summed == CHECKSUM_HW - && skb_checksum_help(skb, 0))) { - sch->qstats.drops++; - return NET_XMIT_DROP; - } + if (!(skb = skb_unshare(skb, GFP_ATOMIC)) || + (skb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_help(skb))) + return qdisc_drop(skb, sch); - skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8); + skb->data[prandom_u32() % skb_headlen(skb)] ^= + 1<<(prandom_u32() % 8); } - if (q->gap == 0 /* not doing reordering */ - || q->counter < q->gap /* inside last reordering gap */ - || q->reorder < get_crandom(&q->reorder_cor)) { + if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) + return qdisc_reshape_fail(skb, sch); + + sch->qstats.backlog += qdisc_pkt_len(skb); + + cb = netem_skb_cb(skb); + if (q->gap == 0 || /* not doing reordering */ + q->counter < q->gap - 1 || /* inside last reordering gap */ + q->reorder < get_crandom(&q->reorder_cor)) { psched_time_t now; psched_tdiff_t delay; delay = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist); - PSCHED_GET_TIME(now); - PSCHED_TADD2(now, delay, cb->time_to_send); + now = psched_get_time(); + + if (q->rate) { + struct sk_buff *last; + + if (!skb_queue_empty(&sch->q)) + last = skb_peek_tail(&sch->q); + else + last = netem_rb_to_skb(rb_last(&q->t_root)); + if (last) { + /* + * Last packet in queue is reference point (now), + * calculate this time bonus and subtract + * from delay. + */ + delay -= netem_skb_cb(last)->time_to_send - now; + delay = max_t(psched_tdiff_t, 0, delay); + now = netem_skb_cb(last)->time_to_send; + } + + delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q); + } + + cb->time_to_send = now + delay; + cb->tstamp_save = skb->tstamp; ++q->counter; - ret = q->qdisc->enqueue(skb, q->qdisc); + tfifo_enqueue(skb, sch); } else { - /* + /* * Do re-ordering by putting one out of N packets at the front * of the queue. 
*/ - PSCHED_GET_TIME(cb->time_to_send); + cb->time_to_send = psched_get_time(); q->counter = 0; - ret = q->qdisc->ops->requeue(skb, q->qdisc); - } - - if (likely(ret == NET_XMIT_SUCCESS)) { - sch->q.qlen++; - sch->bstats.bytes += skb->len; - sch->bstats.packets++; - } else - sch->qstats.drops++; - pr_debug("netem: enqueue ret %d\n", ret); - return ret; -} - -/* Requeue packets but don't change time stamp */ -static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch) -{ - struct netem_sched_data *q = qdisc_priv(sch); - int ret; - - if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) { - sch->q.qlen++; + __skb_queue_head(&sch->q, skb); sch->qstats.requeues++; } - return ret; + return NET_XMIT_SUCCESS; } -static unsigned int netem_drop(struct Qdisc* sch) +static unsigned int netem_drop(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); unsigned int len; - if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) { - sch->q.qlen--; - sch->qstats.drops++; + len = qdisc_queue_drop(sch); + + if (!len) { + struct rb_node *p = rb_first(&q->t_root); + + if (p) { + struct sk_buff *skb = netem_rb_to_skb(p); + + rb_erase(p, &q->t_root); + sch->q.qlen--; + skb->next = NULL; + skb->prev = NULL; + len = qdisc_pkt_len(skb); + sch->qstats.backlog -= len; + kfree_skb(skb); + } } + if (!len && q->qdisc && q->qdisc->ops->drop) + len = q->qdisc->ops->drop(q->qdisc); + if (len) + sch->qstats.drops++; + return len; } @@ -265,301 +566,334 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; + struct rb_node *p; - skb = q->qdisc->dequeue(q->qdisc); + if (qdisc_is_throttled(sch)) + return NULL; + +tfifo_dequeue: + skb = __skb_dequeue(&sch->q); if (skb) { - const struct netem_skb_cb *cb - = (const struct netem_skb_cb *)skb->cb; - psched_time_t now; +deliver: + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_unthrottled(sch); + qdisc_bstats_update(sch, skb); + return skb; + } + p = rb_first(&q->t_root); + if (p) { + psched_time_t time_to_send; + + skb = netem_rb_to_skb(p); /* if more time remaining? */ - PSCHED_GET_TIME(now); + time_to_send = netem_skb_cb(skb)->time_to_send; + if (time_to_send <= psched_get_time()) { + rb_erase(p, &q->t_root); - if (PSCHED_TLESS(cb->time_to_send, now)) { - pr_debug("netem_dequeue: return skb=%p\n", skb); sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; - return skb; - } else { - psched_tdiff_t delay = PSCHED_TDIFF(cb->time_to_send, now); - - if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) { - sch->qstats.drops++; - - /* After this qlen is confused */ - printk(KERN_ERR "netem: queue discpline %s could not requeue\n", - q->qdisc->ops->id); - - sch->q.qlen--; + skb->next = NULL; + skb->prev = NULL; + skb->tstamp = netem_skb_cb(skb)->tstamp_save; + +#ifdef CONFIG_NET_CLS_ACT + /* + * If it's at ingress let's pretend the delay is + * from the network (tstamp will be updated). 
+ */ + if (G_TC_FROM(skb->tc_verd) & AT_INGRESS) + skb->tstamp.tv64 = 0; +#endif + + if (q->qdisc) { + int err = qdisc_enqueue(skb, q->qdisc); + + if (unlikely(err != NET_XMIT_SUCCESS)) { + if (net_xmit_drop_count(err)) { + sch->qstats.drops++; + qdisc_tree_decrease_qlen(sch, 1); + } + } + goto tfifo_dequeue; } + goto deliver; + } - mod_timer(&q->timer, jiffies + PSCHED_US2JIFFIE(delay)); - sch->flags |= TCQ_F_THROTTLED; + if (q->qdisc) { + skb = q->qdisc->ops->dequeue(q->qdisc); + if (skb) + goto deliver; } + qdisc_watchdog_schedule(&q->watchdog, time_to_send); } + if (q->qdisc) { + skb = q->qdisc->ops->dequeue(q->qdisc); + if (skb) + goto deliver; + } return NULL; } -static void netem_watchdog(unsigned long arg) -{ - struct Qdisc *sch = (struct Qdisc *)arg; - - pr_debug("netem_watchdog qlen=%d\n", sch->q.qlen); - sch->flags &= ~TCQ_F_THROTTLED; - netif_schedule(sch->dev); -} - static void netem_reset(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); - qdisc_reset(q->qdisc); - sch->q.qlen = 0; - sch->flags &= ~TCQ_F_THROTTLED; - del_timer_sync(&q->timer); + qdisc_reset_queue(sch); + tfifo_reset(sch); + if (q->qdisc) + qdisc_reset(q->qdisc); + qdisc_watchdog_cancel(&q->watchdog); } -/* Pass size change message down to embedded FIFO */ -static int set_fifo_limit(struct Qdisc *q, int limit) +static void dist_free(struct disttable *d) { - struct rtattr *rta; - int ret = -ENOMEM; - - /* Hack to avoid sending change message to non-FIFO */ - if (strncmp(q->ops->id + 1, "fifo", 4) != 0) - return 0; - - rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); - if (rta) { - rta->rta_type = RTM_NEWQDISC; - rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt)); - ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; - - ret = q->ops->change(q, rta); - kfree(rta); - } - return ret; + kvfree(d); } /* * Distribution data is a variable size payload containing * signed 16 bit values. 
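The distribution table loaded here feeds tabledist() further up: with no table the delay is uniform in [mu - sigma, mu + sigma); with a table, a random signed 16-bit entry t scales sigma, giving roughly mu + sigma * t / NETEM_DIST_SCALE (8192 in the kernel's pkt_sched.h). A small sketch of that sampling step; the three-entry table and the xorshift source are only for illustration, since real tables typically come from iproute2's distribution files and hold thousands of entries:

#include <stdint.h>
#include <stdio.h>

#define DIST_SCALE 8192   /* mirrors NETEM_DIST_SCALE */

static uint32_t u32rand(void)                   /* stand-in for get_crandom() */
{
	static uint32_t x = 2463534242u;
	x ^= x << 13; x ^= x >> 17; x ^= x << 5;
	return x;
}

/* Approximate form of tabledist(): mean mu, spread sigma, optional table. */
static long tabledist(long mu, long sigma, const int16_t *table, int size)
{
	uint32_t rnd;

	if (sigma == 0)
		return mu;

	rnd = u32rand();
	if (!table)                     /* uniform in [mu - sigma, mu + sigma) */
		return (long)(rnd % (2 * sigma)) - sigma + mu;

	/* table entries are samples of the target distribution, scaled by DIST_SCALE */
	return mu + (sigma * (long)table[rnd % size]) / DIST_SCALE;
}

int main(void)
{
	/* toy "distribution": most mass near the mean, one heavy positive tail sample */
	static const int16_t table[] = { -4096, 0, 12288 };

	for (int i = 0; i < 5; i++)
		printf("uniform: %ld us   table: %ld us\n",
		       tabledist(100000, 20000, NULL, 0),
		       tabledist(100000, 20000, table, 3));
	return 0;
}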
*/ -static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr) +static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) { struct netem_sched_data *q = qdisc_priv(sch); - unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16); - const __s16 *data = RTA_DATA(attr); + size_t n = nla_len(attr)/sizeof(__s16); + const __s16 *data = nla_data(attr); + spinlock_t *root_lock; struct disttable *d; int i; + size_t s; - if (n > 65536) + if (n > NETEM_DIST_MAX) return -EINVAL; - d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL); + s = sizeof(struct disttable) + n * sizeof(s16); + d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN); + if (!d) + d = vmalloc(s); if (!d) return -ENOMEM; d->size = n; for (i = 0; i < n; i++) d->table[i] = data[i]; - - spin_lock_bh(&sch->dev->queue_lock); - d = xchg(&q->delay_dist, d); - spin_unlock_bh(&sch->dev->queue_lock); - kfree(d); + root_lock = qdisc_root_sleeping_lock(sch); + + spin_lock_bh(root_lock); + swap(q->delay_dist, d); + spin_unlock_bh(root_lock); + + dist_free(d); return 0; } -static int get_correlation(struct Qdisc *sch, const struct rtattr *attr) +static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); - const struct tc_netem_corr *c = RTA_DATA(attr); - - if (RTA_PAYLOAD(attr) != sizeof(*c)) - return -EINVAL; + const struct tc_netem_corr *c = nla_data(attr); init_crandom(&q->delay_cor, c->delay_corr); init_crandom(&q->loss_cor, c->loss_corr); init_crandom(&q->dup_cor, c->dup_corr); - return 0; } -static int get_reorder(struct Qdisc *sch, const struct rtattr *attr) +static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); - const struct tc_netem_reorder *r = RTA_DATA(attr); - - if (RTA_PAYLOAD(attr) != sizeof(*r)) - return -EINVAL; + const struct tc_netem_reorder *r = nla_data(attr); q->reorder = r->probability; init_crandom(&q->reorder_cor, r->correlation); - return 0; } -static int get_corrupt(struct Qdisc *sch, const struct rtattr *attr) +static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); - const struct tc_netem_corrupt *r = RTA_DATA(attr); - - if (RTA_PAYLOAD(attr) != sizeof(*r)) - return -EINVAL; + const struct tc_netem_corrupt *r = nla_data(attr); q->corrupt = r->probability; init_crandom(&q->corrupt_cor, r->correlation); - return 0; } -/* Parse netlink message to set options */ -static int netem_change(struct Qdisc *sch, struct rtattr *opt) +static void get_rate(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); - struct tc_netem_qopt *qopt; - int ret; - - if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) - return -EINVAL; + const struct tc_netem_rate *r = nla_data(attr); + + q->rate = r->rate; + q->packet_overhead = r->packet_overhead; + q->cell_size = r->cell_size; + q->cell_overhead = r->cell_overhead; + if (q->cell_size) + q->cell_size_reciprocal = reciprocal_value(q->cell_size); + else + q->cell_size_reciprocal = (struct reciprocal_value) { 0 }; +} - qopt = RTA_DATA(opt); - ret = set_fifo_limit(q->qdisc, qopt->limit); - if (ret) { - pr_debug("netem: can't set fifo limit\n"); - return ret; - } - - q->latency = qopt->latency; - q->jitter = qopt->jitter; - q->limit = qopt->limit; - q->gap = qopt->gap; - q->counter = 0; - q->loss = qopt->loss; - q->duplicate = qopt->duplicate; +static int get_loss_clg(struct netem_sched_data *q, const struct nlattr 
*attr) +{ + const struct nlattr *la; + int rem; - /* for compatiablity with earlier versions. - * if gap is set, need to assume 100% probablity - */ - q->reorder = ~0; - - /* Handle nested options after initial queue options. - * Should have put all options in nested format but too late now. - */ - if (RTA_PAYLOAD(opt) > sizeof(*qopt)) { - struct rtattr *tb[TCA_NETEM_MAX]; - if (rtattr_parse(tb, TCA_NETEM_MAX, - RTA_DATA(opt) + sizeof(*qopt), - RTA_PAYLOAD(opt) - sizeof(*qopt))) - return -EINVAL; + nla_for_each_nested(la, attr, rem) { + u16 type = nla_type(la); - if (tb[TCA_NETEM_CORR-1]) { - ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]); - if (ret) - return ret; - } + switch (type) { + case NETEM_LOSS_GI: { + const struct tc_netem_gimodel *gi = nla_data(la); + + if (nla_len(la) < sizeof(struct tc_netem_gimodel)) { + pr_info("netem: incorrect gi model size\n"); + return -EINVAL; + } - if (tb[TCA_NETEM_DELAY_DIST-1]) { - ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]); - if (ret) - return ret; + q->loss_model = CLG_4_STATES; + + q->clg.state = TX_IN_GAP_PERIOD; + q->clg.a1 = gi->p13; + q->clg.a2 = gi->p31; + q->clg.a3 = gi->p32; + q->clg.a4 = gi->p14; + q->clg.a5 = gi->p23; + break; } - if (tb[TCA_NETEM_REORDER-1]) { - ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]); - if (ret) - return ret; + case NETEM_LOSS_GE: { + const struct tc_netem_gemodel *ge = nla_data(la); + + if (nla_len(la) < sizeof(struct tc_netem_gemodel)) { + pr_info("netem: incorrect ge model size\n"); + return -EINVAL; + } + + q->loss_model = CLG_GILB_ELL; + q->clg.state = GOOD_STATE; + q->clg.a1 = ge->p; + q->clg.a2 = ge->r; + q->clg.a3 = ge->h; + q->clg.a4 = ge->k1; + break; } - if (tb[TCA_NETEM_CORRUPT-1]) { - ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT-1]); - if (ret) - return ret; + default: + pr_info("netem: unknown loss type %u\n", type); + return -EINVAL; } } return 0; } -/* - * Special case version of FIFO queue for use by netem. 
- * It queues in order based on timestamps in skb's - */ -struct fifo_sched_data { - u32 limit; +static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { + [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) }, + [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) }, + [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) }, + [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) }, + [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, + [TCA_NETEM_ECN] = { .type = NLA_U32 }, + [TCA_NETEM_RATE64] = { .type = NLA_U64 }, }; -static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) +static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, + const struct nla_policy *policy, int len) { - struct fifo_sched_data *q = qdisc_priv(sch); - struct sk_buff_head *list = &sch->q; - const struct netem_skb_cb *ncb - = (const struct netem_skb_cb *)nskb->cb; - struct sk_buff *skb; + int nested_len = nla_len(nla) - NLA_ALIGN(len); - if (likely(skb_queue_len(list) < q->limit)) { - skb_queue_reverse_walk(list, skb) { - const struct netem_skb_cb *cb - = (const struct netem_skb_cb *)skb->cb; + if (nested_len < 0) { + pr_info("netem: invalid attributes len %d\n", nested_len); + return -EINVAL; + } - if (!PSCHED_TLESS(ncb->time_to_send, cb->time_to_send)) - break; - } + if (nested_len >= nla_attr_size(0)) + return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len), + nested_len, policy); - __skb_queue_after(list, skb, nskb); + memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); + return 0; +} - sch->qstats.backlog += nskb->len; - sch->bstats.bytes += nskb->len; - sch->bstats.packets++; +/* Parse netlink message to set options */ +static int netem_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_NETEM_MAX + 1]; + struct tc_netem_qopt *qopt; + struct clgstate old_clg; + int old_loss_model = CLG_RANDOM; + int ret; - return NET_XMIT_SUCCESS; + if (opt == NULL) + return -EINVAL; + + qopt = nla_data(opt); + ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt)); + if (ret < 0) + return ret; + + /* backup q->clg and q->loss_model */ + old_clg = q->clg; + old_loss_model = q->loss_model; + + if (tb[TCA_NETEM_LOSS]) { + ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]); + if (ret) { + q->loss_model = old_loss_model; + return ret; + } + } else { + q->loss_model = CLG_RANDOM; } - return qdisc_drop(nskb, sch); -} + if (tb[TCA_NETEM_DELAY_DIST]) { + ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); + if (ret) { + /* recover clg and loss_model, in case of + * q->clg and q->loss_model were modified + * in get_loss_clg() + */ + q->clg = old_clg; + q->loss_model = old_loss_model; + return ret; + } + } -static int tfifo_init(struct Qdisc *sch, struct rtattr *opt) -{ - struct fifo_sched_data *q = qdisc_priv(sch); + sch->limit = qopt->limit; - if (opt) { - struct tc_fifo_qopt *ctl = RTA_DATA(opt); - if (RTA_PAYLOAD(opt) < sizeof(*ctl)) - return -EINVAL; + q->latency = qopt->latency; + q->jitter = qopt->jitter; + q->limit = qopt->limit; + q->gap = qopt->gap; + q->counter = 0; + q->loss = qopt->loss; + q->duplicate = qopt->duplicate; - q->limit = ctl->limit; - } else - q->limit = max_t(u32, sch->dev->tx_queue_len, 1); + /* for compatibility with earlier versions. 
+ * if gap is set, need to assume 100% probability + */ + if (q->gap) + q->reorder = ~0; - return 0; -} + if (tb[TCA_NETEM_CORR]) + get_correlation(q, tb[TCA_NETEM_CORR]); -static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb) -{ - struct fifo_sched_data *q = qdisc_priv(sch); - struct tc_fifo_qopt opt = { .limit = q->limit }; + if (tb[TCA_NETEM_REORDER]) + get_reorder(q, tb[TCA_NETEM_REORDER]); - RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); - return skb->len; + if (tb[TCA_NETEM_CORRUPT]) + get_corrupt(q, tb[TCA_NETEM_CORRUPT]); -rtattr_failure: - return -1; -} + if (tb[TCA_NETEM_RATE]) + get_rate(q, tb[TCA_NETEM_RATE]); -static struct Qdisc_ops tfifo_qdisc_ops = { - .id = "tfifo", - .priv_size = sizeof(struct fifo_sched_data), - .enqueue = tfifo_enqueue, - .dequeue = qdisc_dequeue_head, - .requeue = qdisc_requeue, - .drop = qdisc_queue_drop, - .init = tfifo_init, - .reset = qdisc_reset_queue, - .change = tfifo_init, - .dump = tfifo_dump, -}; + if (tb[TCA_NETEM_RATE64]) + q->rate = max_t(u64, q->rate, + nla_get_u64(tb[TCA_NETEM_RATE64])); -static int netem_init(struct Qdisc *sch, struct rtattr *opt) + if (tb[TCA_NETEM_ECN]) + q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]); + + return ret; +} + +static int netem_init(struct Qdisc *sch, struct nlattr *opt) { struct netem_sched_data *q = qdisc_priv(sch); int ret; @@ -567,21 +901,12 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt) if (!opt) return -EINVAL; - init_timer(&q->timer); - q->timer.function = netem_watchdog; - q->timer.data = (unsigned long) sch; - - q->qdisc = qdisc_create_dflt(sch->dev, &tfifo_qdisc_ops); - if (!q->qdisc) { - pr_debug("netem: qdisc create failed\n"); - return -ENOMEM; - } + qdisc_watchdog_init(&q->watchdog, sch); + q->loss_model = CLG_RANDOM; ret = netem_change(sch, opt); - if (ret) { - pr_debug("netem: change failed\n"); - qdisc_destroy(q->qdisc); - } + if (ret) + pr_info("netem: change failed\n"); return ret; } @@ -589,20 +914,71 @@ static void netem_destroy(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); - del_timer_sync(&q->timer); - qdisc_destroy(q->qdisc); - kfree(q->delay_dist); + qdisc_watchdog_cancel(&q->watchdog); + if (q->qdisc) + qdisc_destroy(q->qdisc); + dist_free(q->delay_dist); +} + +static int dump_loss_model(const struct netem_sched_data *q, + struct sk_buff *skb) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_NETEM_LOSS); + if (nest == NULL) + goto nla_put_failure; + + switch (q->loss_model) { + case CLG_RANDOM: + /* legacy loss model */ + nla_nest_cancel(skb, nest); + return 0; /* no data */ + + case CLG_4_STATES: { + struct tc_netem_gimodel gi = { + .p13 = q->clg.a1, + .p31 = q->clg.a2, + .p32 = q->clg.a3, + .p14 = q->clg.a4, + .p23 = q->clg.a5, + }; + + if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi)) + goto nla_put_failure; + break; + } + case CLG_GILB_ELL: { + struct tc_netem_gemodel ge = { + .p = q->clg.a1, + .r = q->clg.a2, + .h = q->clg.a3, + .k1 = q->clg.a4, + }; + + if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge)) + goto nla_put_failure; + break; + } + } + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; } static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) { const struct netem_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb->tail; - struct rtattr *rta = (struct rtattr *) b; + struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb); struct tc_netem_qopt qopt; struct tc_netem_corr cor; struct tc_netem_reorder reorder; struct tc_netem_corrupt corrupt; + struct 
tc_netem_rate rate; qopt.latency = q->latency; qopt.jitter = q->jitter; @@ -610,27 +986,48 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) qopt.loss = q->loss; qopt.gap = q->gap; qopt.duplicate = q->duplicate; - RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) + goto nla_put_failure; cor.delay_corr = q->delay_cor.rho; cor.loss_corr = q->loss_cor.rho; cor.dup_corr = q->dup_cor.rho; - RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor); + if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor)) + goto nla_put_failure; reorder.probability = q->reorder; reorder.correlation = q->reorder_cor.rho; - RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder); + if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder)) + goto nla_put_failure; corrupt.probability = q->corrupt; corrupt.correlation = q->corrupt_cor.rho; - RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); + if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt)) + goto nla_put_failure; - rta->rta_len = skb->tail - b; + if (q->rate >= (1ULL << 32)) { + if (nla_put_u64(skb, TCA_NETEM_RATE64, q->rate)) + goto nla_put_failure; + rate.rate = ~0U; + } else { + rate.rate = q->rate; + } + rate.packet_overhead = q->packet_overhead; + rate.cell_size = q->cell_size; + rate.cell_overhead = q->cell_overhead; + if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate)) + goto nla_put_failure; - return skb->len; + if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn)) + goto nla_put_failure; -rtattr_failure: - skb_trim(skb, b - skb->data); + if (dump_loss_model(q, skb) != 0) + goto nla_put_failure; + + return nla_nest_end(skb, nla); + +nla_put_failure: + nlmsg_trim(skb, nla); return -1; } @@ -639,7 +1036,7 @@ static int netem_dump_class(struct Qdisc *sch, unsigned long cl, { struct netem_sched_data *q = qdisc_priv(sch); - if (cl != 1) /* only one class */ + if (cl != 1 || !q->qdisc) /* only one class */ return -ENOENT; tcm->tcm_handle |= TC_H_MIN(1); @@ -653,13 +1050,13 @@ static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, { struct netem_sched_data *q = qdisc_priv(sch); - if (new == NULL) - new = &noop_qdisc; - sch_tree_lock(sch); - *old = xchg(&q->qdisc, new); - qdisc_reset(*old); - sch->q.qlen = 0; + *old = q->qdisc; + q->qdisc = new; + if (*old) { + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + } sch_tree_unlock(sch); return 0; @@ -680,17 +1077,6 @@ static void netem_put(struct Qdisc *sch, unsigned long arg) { } -static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - struct rtattr **tca, unsigned long *arg) -{ - return -ENOSYS; -} - -static int netem_delete(struct Qdisc *sch, unsigned long arg) -{ - return -ENOSYS; -} - static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { @@ -703,30 +1089,22 @@ static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) } } -static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl) -{ - return NULL; -} - -static struct Qdisc_class_ops netem_class_ops = { +static const struct Qdisc_class_ops netem_class_ops = { .graft = netem_graft, .leaf = netem_leaf, .get = netem_get, .put = netem_put, - .change = netem_change_class, - .delete = netem_delete, .walk = netem_walk, - .tcf_chain = netem_find_tcf, .dump = netem_dump_class, }; -static struct Qdisc_ops netem_qdisc_ops = { +static struct Qdisc_ops netem_qdisc_ops __read_mostly = { .id = "netem", .cl_ops = &netem_class_ops, .priv_size = 
sizeof(struct netem_sched_data), .enqueue = netem_enqueue, .dequeue = netem_dequeue, - .requeue = netem_requeue, + .peek = qdisc_peek_dequeued, .drop = netem_drop, .init = netem_init, .reset = netem_reset, diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c new file mode 100644 index 00000000000..fefeeb73f15 --- /dev/null +++ b/net/sched/sch_pie.c @@ -0,0 +1,566 @@ +/* Copyright (C) 2013 Cisco Systems, Inc, 2013. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Author: Vijay Subramanian <vijaynsu@cisco.com> + * Author: Mythili Prabhu <mysuryan@cisco.com> + * + * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no> + * University of Oslo, Norway. + * + * References: + * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00 + * IEEE Conference on High Performance Switching and Routing 2013 : + * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem" + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <net/pkt_sched.h> +#include <net/inet_ecn.h> + +#define QUEUE_THRESHOLD 10000 +#define DQCOUNT_INVALID -1 +#define MAX_PROB 0xffffffff +#define PIE_SCALE 8 + +/* parameters used */ +struct pie_params { + psched_time_t target; /* user specified target delay in pschedtime */ + u32 tupdate; /* timer frequency (in jiffies) */ + u32 limit; /* number of packets that can be enqueued */ + u32 alpha; /* alpha and beta are between 0 and 32 */ + u32 beta; /* and are used for shift relative to 1 */ + bool ecn; /* true if ecn is enabled */ + bool bytemode; /* to scale drop early prob based on pkt size */ +}; + +/* variables used */ +struct pie_vars { + u32 prob; /* probability but scaled by u32 limit. 
*/ + psched_time_t burst_time; + psched_time_t qdelay; + psched_time_t qdelay_old; + u64 dq_count; /* measured in bytes */ + psched_time_t dq_tstamp; /* drain rate */ + u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ + u32 qlen_old; /* in bytes */ +}; + +/* statistics gathering */ +struct pie_stats { + u32 packets_in; /* total number of packets enqueued */ + u32 dropped; /* packets dropped due to pie_action */ + u32 overlimit; /* dropped due to lack of space in queue */ + u32 maxq; /* maximum queue size */ + u32 ecn_mark; /* packets marked with ECN */ +}; + +/* private data for the Qdisc */ +struct pie_sched_data { + struct pie_params params; + struct pie_vars vars; + struct pie_stats stats; + struct timer_list adapt_timer; +}; + +static void pie_params_init(struct pie_params *params) +{ + params->alpha = 2; + params->beta = 20; + params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */ + params->limit = 1000; /* default of 1000 packets */ + params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */ + params->ecn = false; + params->bytemode = false; +} + +static void pie_vars_init(struct pie_vars *vars) +{ + vars->dq_count = DQCOUNT_INVALID; + vars->avg_dq_rate = 0; + /* default of 100 ms in pschedtime */ + vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC); +} + +static bool drop_early(struct Qdisc *sch, u32 packet_size) +{ + struct pie_sched_data *q = qdisc_priv(sch); + u32 rnd; + u32 local_prob = q->vars.prob; + u32 mtu = psched_mtu(qdisc_dev(sch)); + + /* If there is still burst allowance left skip random early drop */ + if (q->vars.burst_time > 0) + return false; + + /* If current delay is less than half of target, and + * if drop prob is low already, disable early_drop + */ + if ((q->vars.qdelay < q->params.target / 2) + && (q->vars.prob < MAX_PROB / 5)) + return false; + + /* If we have fewer than 2 mtu-sized packets, disable drop_early, + * similar to min_th in RED + */ + if (sch->qstats.backlog < 2 * mtu) + return false; + + /* If bytemode is turned on, use packet size to compute new + * probablity. Smaller packets will have lower drop prob in this case + */ + if (q->params.bytemode && packet_size <= mtu) + local_prob = (local_prob / mtu) * packet_size; + else + local_prob = q->vars.prob; + + rnd = prandom_u32(); + if (rnd < local_prob) + return true; + + return false; +} + +static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct pie_sched_data *q = qdisc_priv(sch); + bool enqueue = false; + + if (unlikely(qdisc_qlen(sch) >= sch->limit)) { + q->stats.overlimit++; + goto out; + } + + if (!drop_early(sch, skb->len)) { + enqueue = true; + } else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) && + INET_ECN_set_ce(skb)) { + /* If packet is ecn capable, mark it if drop probability + * is lower than 10%, else drop it. 
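drop_early() above is the random-drop decision: once the safeguards pass (burst allowance spent, delay or probability not too low, queue at least two MTUs deep), the current drop probability is compared against a fresh random draw, optionally scaled down for small packets in byte mode. A condensed user-space sketch of just that comparison; MAX_PROB and the scaling mirror the code above, while the random source and the demo values are stand-ins:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_PROB 0xffffffffu

static uint32_t u32rand(void)                   /* xorshift32 stand-in for prandom_u32() */
{
	static uint32_t x = 88172645u;
	x ^= x << 13; x ^= x >> 17; x ^= x << 5;
	return x;
}

/* prob is the current PIE drop probability scaled to 0..MAX_PROB */
static bool drop_early(uint32_t prob, uint32_t packet_size, uint32_t mtu,
		       bool bytemode)
{
	uint32_t local_prob = prob;

	/* byte mode: a packet half the MTU long is dropped half as often */
	if (bytemode && packet_size <= mtu)
		local_prob = (prob / mtu) * packet_size;

	return u32rand() < local_prob;
}

int main(void)
{
	uint32_t prob = MAX_PROB / 10;   /* 10% drop probability */
	int drops_full = 0, drops_small = 0, n = 100000;

	for (int i = 0; i < n; i++) {
		drops_full  += drop_early(prob, 1500, 1500, true);
		drops_small += drop_early(prob,   64, 1500, true);
	}
	printf("1500B: %d drops, 64B: %d drops (of %d)\n",
	       drops_full, drops_small, n);
	return 0;
}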
+ */ + q->stats.ecn_mark++; + enqueue = true; + } + + /* we can enqueue the packet */ + if (enqueue) { + q->stats.packets_in++; + if (qdisc_qlen(sch) > q->stats.maxq) + q->stats.maxq = qdisc_qlen(sch); + + return qdisc_enqueue_tail(skb, sch); + } + +out: + q->stats.dropped++; + return qdisc_drop(skb, sch); +} + +static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { + [TCA_PIE_TARGET] = {.type = NLA_U32}, + [TCA_PIE_LIMIT] = {.type = NLA_U32}, + [TCA_PIE_TUPDATE] = {.type = NLA_U32}, + [TCA_PIE_ALPHA] = {.type = NLA_U32}, + [TCA_PIE_BETA] = {.type = NLA_U32}, + [TCA_PIE_ECN] = {.type = NLA_U32}, + [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, +}; + +static int pie_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct pie_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_PIE_MAX + 1]; + unsigned int qlen; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + + /* convert from microseconds to pschedtime */ + if (tb[TCA_PIE_TARGET]) { + /* target is in us */ + u32 target = nla_get_u32(tb[TCA_PIE_TARGET]); + + /* convert to pschedtime */ + q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); + } + + /* tupdate is in jiffies */ + if (tb[TCA_PIE_TUPDATE]) + q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); + + if (tb[TCA_PIE_LIMIT]) { + u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]); + + q->params.limit = limit; + sch->limit = limit; + } + + if (tb[TCA_PIE_ALPHA]) + q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]); + + if (tb[TCA_PIE_BETA]) + q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]); + + if (tb[TCA_PIE_ECN]) + q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]); + + if (tb[TCA_PIE_BYTEMODE]) + q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]); + + /* Drop excess packets if new limit is lower */ + qlen = sch->q.qlen; + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = __skb_dequeue(&sch->q); + + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_drop(skb, sch); + } + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + + sch_tree_unlock(sch); + return 0; +} + +static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) +{ + + struct pie_sched_data *q = qdisc_priv(sch); + int qlen = sch->qstats.backlog; /* current queue size in bytes */ + + /* If current queue is about 10 packets or more and dq_count is unset + * we have enough packets to calculate the drain rate. Save + * current time as dq_tstamp and start measurement cycle. + */ + if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) { + q->vars.dq_tstamp = psched_get_time(); + q->vars.dq_count = 0; + } + + /* Calculate the average drain rate from this value. If queue length + * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset + * the dq_count to -1 as we don't have enough packets to calculate the + * drain rate anymore The following if block is entered only when we + * have a substantial queue built up (QUEUE_THRESHOLD bytes or more) + * and we calculate the drain rate for the threshold here. dq_count is + * in bytes, time difference in psched_time, hence rate is in + * bytes/psched_time. 
+ */ + if (q->vars.dq_count != DQCOUNT_INVALID) { + q->vars.dq_count += skb->len; + + if (q->vars.dq_count >= QUEUE_THRESHOLD) { + psched_time_t now = psched_get_time(); + u32 dtime = now - q->vars.dq_tstamp; + u32 count = q->vars.dq_count << PIE_SCALE; + + if (dtime == 0) + return; + + count = count / dtime; + + if (q->vars.avg_dq_rate == 0) + q->vars.avg_dq_rate = count; + else + q->vars.avg_dq_rate = + (q->vars.avg_dq_rate - + (q->vars.avg_dq_rate >> 3)) + (count >> 3); + + /* If the queue has receded below the threshold, we hold + * on to the last drain rate calculated, else we reset + * dq_count to 0 to re-enter the if block when the next + * packet is dequeued + */ + if (qlen < QUEUE_THRESHOLD) + q->vars.dq_count = DQCOUNT_INVALID; + else { + q->vars.dq_count = 0; + q->vars.dq_tstamp = psched_get_time(); + } + + if (q->vars.burst_time > 0) { + if (q->vars.burst_time > dtime) + q->vars.burst_time -= dtime; + else + q->vars.burst_time = 0; + } + } + } +} + +static void calculate_probability(struct Qdisc *sch) +{ + struct pie_sched_data *q = qdisc_priv(sch); + u32 qlen = sch->qstats.backlog; /* queue size in bytes */ + psched_time_t qdelay = 0; /* in pschedtime */ + psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */ + s32 delta = 0; /* determines the change in probability */ + u32 oldprob; + u32 alpha, beta; + bool update_prob = true; + + q->vars.qdelay_old = q->vars.qdelay; + + if (q->vars.avg_dq_rate > 0) + qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate; + else + qdelay = 0; + + /* If qdelay is zero and qlen is not, it means qlen is very small, less + * than dequeue_rate, so we do not update probabilty in this round + */ + if (qdelay == 0 && qlen != 0) + update_prob = false; + + /* In the algorithm, alpha and beta are between 0 and 2 with typical + * value for alpha as 0.125. In this implementation, we use values 0-32 + * passed from user space to represent this. Also, alpha and beta have + * unit of HZ and need to be scaled before they can used to update + * probability. alpha/beta are updated locally below by 1) scaling them + * appropriately 2) scaling down by 16 to come to 0-2 range. + * Please see paper for details. + * + * We scale alpha and beta differently depending on whether we are in + * light, medium or high dropping mode. 
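The avg_dq_rate update above is a 1/8-weight exponentially weighted moving average: each new drain-rate measurement contributes one eighth, the running average keeps seven eighths, and the very first measurement seeds the average directly. A small sketch showing how the estimate tracks a rate change; the values are plain numbers rather than psched-scaled units:

#include <stdint.h>
#include <stdio.h>

/* Fold a new measurement into the running average, as in the code above:
 * avg = avg - avg/8 + sample/8  (integer arithmetic, weight 1/8). */
static uint32_t ewma8(uint32_t avg, uint32_t sample)
{
	return (avg - (avg >> 3)) + (sample >> 3);
}

int main(void)
{
	uint32_t avg = 0;

	/* drain rate jumps from 1000 to 4000 "bytes per tick" after 5 samples */
	for (int i = 0; i < 10; i++) {
		uint32_t sample = (i < 5) ? 1000 : 4000;
		avg = avg ? ewma8(avg, sample) : sample;   /* first sample seeds avg */
		printf("sample %2d: avg_dq_rate = %u\n", i, avg);
	}
	return 0;
}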
+ */ + if (q->vars.prob < MAX_PROB / 100) { + alpha = + (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; + beta = + (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; + } else if (q->vars.prob < MAX_PROB / 10) { + alpha = + (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; + beta = + (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; + } else { + alpha = + (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + beta = + (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + } + + /* alpha and beta should be between 0 and 32, in multiples of 1/16 */ + delta += alpha * ((qdelay - q->params.target)); + delta += beta * ((qdelay - qdelay_old)); + + oldprob = q->vars.prob; + + /* to ensure we increase probability in steps of no more than 2% */ + if (delta > (s32) (MAX_PROB / (100 / 2)) && + q->vars.prob >= MAX_PROB / 10) + delta = (MAX_PROB / 100) * 2; + + /* Non-linear drop: + * Tune drop probability to increase quickly for high delays(>= 250ms) + * 250ms is derived through experiments and provides error protection + */ + + if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) + delta += MAX_PROB / (100 / 2); + + q->vars.prob += delta; + + if (delta > 0) { + /* prevent overflow */ + if (q->vars.prob < oldprob) { + q->vars.prob = MAX_PROB; + /* Prevent normalization error. If probability is at + * maximum value already, we normalize it here, and + * skip the check to do a non-linear drop in the next + * section. + */ + update_prob = false; + } + } else { + /* prevent underflow */ + if (q->vars.prob > oldprob) + q->vars.prob = 0; + } + + /* Non-linear drop in probability: Reduce drop probability quickly if + * delay is 0 for 2 consecutive Tupdate periods. + */ + + if ((qdelay == 0) && (qdelay_old == 0) && update_prob) + q->vars.prob = (q->vars.prob * 98) / 100; + + q->vars.qdelay = qdelay; + q->vars.qlen_old = qlen; + + /* We restart the measurement cycle if the following conditions are met + * 1. If the delay has been low for 2 consecutive Tupdate periods + * 2. Calculated drop probability is zero + * 3. We have atleast one estimate for the avg_dq_rate ie., + * is a non-zero value + */ + if ((q->vars.qdelay < q->params.target / 2) && + (q->vars.qdelay_old < q->params.target / 2) && + (q->vars.prob == 0) && + (q->vars.avg_dq_rate > 0)) + pie_vars_init(&q->vars); +} + +static void pie_timer(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc *)arg; + struct pie_sched_data *q = qdisc_priv(sch); + spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + + spin_lock(root_lock); + calculate_probability(sch); + + /* reset the timer to fire after 'tupdate'. tupdate is in jiffies. 
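At its core, calculate_probability() above is a Proportional Integral controller on queueing delay: delta = alpha * (qdelay - target) + beta * (qdelay - qdelay_old) is added to the running drop probability, and the remaining branches (the 2% step limit, the 250 ms boost, the overflow and underflow clamps, the 2% decay on idle, the restart of the measurement cycle) are guards layered on top. A stripped-down sketch of just the PI step, using floating point instead of the kernel's scaled fixed point, so alpha and beta here are the 0..2 textbook values rather than the 0..32 netlink encoding; the sample delays are illustrative:

#include <stdio.h>

/* One PI update on the drop probability p in [0, 1].
 * qdelay and target are in seconds; alpha weights the distance from the
 * target, beta weights the trend since the last update. */
static double pie_update(double p, double qdelay, double qdelay_old,
			 double target, double alpha, double beta)
{
	p += alpha * (qdelay - target) + beta * (qdelay - qdelay_old);
	if (p < 0.0)
		p = 0.0;
	if (p > 1.0)
		p = 1.0;
	return p;
}

int main(void)
{
	double p = 0.0, qdelay_old = 0.0;
	/* queue delay overshoots a 20 ms target, then comes back down */
	double samples[] = { 0.030, 0.060, 0.060, 0.060, 0.040, 0.020 };

	for (int i = 0; i < 6; i++) {
		p = pie_update(p, samples[i], qdelay_old, 0.020, 0.125, 1.25);
		qdelay_old = samples[i];
		printf("qdelay=%.0f ms -> drop prob %.4f\n", samples[i] * 1e3, p);
	}
	return 0;
}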
*/ + if (q->params.tupdate) + mod_timer(&q->adapt_timer, jiffies + q->params.tupdate); + spin_unlock(root_lock); + +} + +static int pie_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct pie_sched_data *q = qdisc_priv(sch); + + pie_params_init(&q->params); + pie_vars_init(&q->vars); + sch->limit = q->params.limit; + + setup_timer(&q->adapt_timer, pie_timer, (unsigned long)sch); + mod_timer(&q->adapt_timer, jiffies + HZ / 2); + + if (opt) { + int err = pie_change(sch, opt); + + if (err) + return err; + } + + return 0; +} + +static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct pie_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + /* convert target from pschedtime to us */ + if (nla_put_u32(skb, TCA_PIE_TARGET, + ((u32) PSCHED_TICKS2NS(q->params.target)) / + NSEC_PER_USEC) || + nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) || + nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) || + nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || + nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || + nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || + nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -1; + +} + +static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct pie_sched_data *q = qdisc_priv(sch); + struct tc_pie_xstats st = { + .prob = q->vars.prob, + .delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) / + NSEC_PER_USEC, + /* unscale and return dq_rate in bytes per sec */ + .avg_dq_rate = q->vars.avg_dq_rate * + (PSCHED_TICKS_PER_SEC) >> PIE_SCALE, + .packets_in = q->stats.packets_in, + .overlimit = q->stats.overlimit, + .maxq = q->stats.maxq, + .dropped = q->stats.dropped, + .ecn_mark = q->stats.ecn_mark, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb; + skb = __qdisc_dequeue_head(sch, &sch->q); + + if (!skb) + return NULL; + + pie_process_dequeue(sch, skb); + return skb; +} + +static void pie_reset(struct Qdisc *sch) +{ + struct pie_sched_data *q = qdisc_priv(sch); + qdisc_reset_queue(sch); + pie_vars_init(&q->vars); +} + +static void pie_destroy(struct Qdisc *sch) +{ + struct pie_sched_data *q = qdisc_priv(sch); + q->params.tupdate = 0; + del_timer_sync(&q->adapt_timer); +} + +static struct Qdisc_ops pie_qdisc_ops __read_mostly = { + .id = "pie", + .priv_size = sizeof(struct pie_sched_data), + .enqueue = pie_qdisc_enqueue, + .dequeue = pie_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .init = pie_init, + .destroy = pie_destroy, + .reset = pie_reset, + .change = pie_change, + .dump = pie_dump, + .dump_stats = pie_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init pie_module_init(void) +{ + return register_qdisc(&pie_qdisc_ops); +} + +static void __exit pie_module_exit(void) +{ + unregister_qdisc(&pie_qdisc_ops); +} + +module_init(pie_module_init); +module_exit(pie_module_exit); + +MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler"); +MODULE_AUTHOR("Vijay Subramanian"); +MODULE_AUTHOR("Mythili Prabhu"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c new file mode 100644 index 00000000000..89f8fcf73f1 --- /dev/null +++ b/net/sched/sch_plug.c @@ -0,0 +1,233 @@ +/* + * sch_plug.c Queue traffic until an explicit release command + * + * This program is 
free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * There are two ways to use this qdisc: + * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating + * sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands. + * + * 2. For network output buffering (a.k.a output commit) functionality. + * Output commit property is commonly used by applications using checkpoint + * based fault-tolerance to ensure that the checkpoint from which a system + * is being restored is consistent w.r.t outside world. + * + * Consider for e.g. Remus - a Virtual Machine checkpointing system, + * wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated + * asynchronously to the backup host, while the VM continues executing the + * next epoch speculatively. + * + * The following is a typical sequence of output buffer operations: + * 1.At epoch i, start_buffer(i) + * 2. At end of epoch i (i.e. after 50ms): + * 2.1 Stop VM and take checkpoint(i). + * 2.2 start_buffer(i+1) and Resume VM + * 3. While speculatively executing epoch(i+1), asynchronously replicate + * checkpoint(i) to backup host. + * 4. When checkpoint_ack(i) is received from backup, release_buffer(i) + * Thus, this Qdisc would receive the following sequence of commands: + * TCQ_PLUG_BUFFER (epoch i) + * .. TCQ_PLUG_BUFFER (epoch i+1) + * ....TCQ_PLUG_RELEASE_ONE (epoch i) + * ......TCQ_PLUG_BUFFER (epoch i+2) + * ........ + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/pkt_sched.h> + +/* + * State of the queue, when used for network output buffering: + * + * plug(i+1) plug(i) head + * ------------------+--------------------+----------------> + * | | + * | | + * pkts_current_epoch| pkts_last_epoch |pkts_to_release + * ----------------->|<--------+--------->|+---------------> + * v v + * + */ + +struct plug_sched_data { + /* If true, the dequeue function releases all packets + * from head to end of the queue. The queue turns into + * a pass-through queue for newly arriving packets. + */ + bool unplug_indefinite; + + /* Queue Limit in bytes */ + u32 limit; + + /* Number of packets (output) from the current speculatively + * executing epoch. + */ + u32 pkts_current_epoch; + + /* Number of packets corresponding to the recently finished + * epoch. These will be released when we receive a + * TCQ_PLUG_RELEASE_ONE command. This command is typically + * issued after committing a checkpoint at the target. + */ + u32 pkts_last_epoch; + + /* + * Number of packets from the head of the queue, that can + * be released (committed checkpoint). + */ + u32 pkts_to_release; +}; + +static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct plug_sched_data *q = qdisc_priv(sch); + + if (likely(sch->qstats.backlog + skb->len <= q->limit)) { + if (!q->unplug_indefinite) + q->pkts_current_epoch++; + return qdisc_enqueue_tail(skb, sch); + } + + return qdisc_reshape_fail(skb, sch); +} + +static struct sk_buff *plug_dequeue(struct Qdisc *sch) +{ + struct plug_sched_data *q = qdisc_priv(sch); + + if (qdisc_is_throttled(sch)) + return NULL; + + if (!q->unplug_indefinite) { + if (!q->pkts_to_release) { + /* No more packets to dequeue. Block the queue + * and wait for the next release command. 
+ */ + qdisc_throttled(sch); + return NULL; + } + q->pkts_to_release--; + } + + return qdisc_dequeue_head(sch); +} + +static int plug_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct plug_sched_data *q = qdisc_priv(sch); + + q->pkts_current_epoch = 0; + q->pkts_last_epoch = 0; + q->pkts_to_release = 0; + q->unplug_indefinite = false; + + if (opt == NULL) { + /* We will set a default limit of 100 pkts (~150kB) + * in case tx_queue_len is not available. The + * default value is completely arbitrary. + */ + u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100; + q->limit = pkt_limit * psched_mtu(qdisc_dev(sch)); + } else { + struct tc_plug_qopt *ctl = nla_data(opt); + + if (nla_len(opt) < sizeof(*ctl)) + return -EINVAL; + + q->limit = ctl->limit; + } + + qdisc_throttled(sch); + return 0; +} + +/* Receives 4 types of messages: + * TCQ_PLUG_BUFFER: Inset a plug into the queue and + * buffer any incoming packets + * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head + * to beginning of the next plug. + * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. + * Stop buffering packets until the next TCQ_PLUG_BUFFER + * command is received (just act as a pass-thru queue). + * TCQ_PLUG_LIMIT: Increase/decrease queue size + */ +static int plug_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct plug_sched_data *q = qdisc_priv(sch); + struct tc_plug_qopt *msg; + + if (opt == NULL) + return -EINVAL; + + msg = nla_data(opt); + if (nla_len(opt) < sizeof(*msg)) + return -EINVAL; + + switch (msg->action) { + case TCQ_PLUG_BUFFER: + /* Save size of the current buffer */ + q->pkts_last_epoch = q->pkts_current_epoch; + q->pkts_current_epoch = 0; + if (q->unplug_indefinite) + qdisc_throttled(sch); + q->unplug_indefinite = false; + break; + case TCQ_PLUG_RELEASE_ONE: + /* Add packets from the last complete buffer to the + * packets to be released set. + */ + q->pkts_to_release += q->pkts_last_epoch; + q->pkts_last_epoch = 0; + qdisc_unthrottled(sch); + netif_schedule_queue(sch->dev_queue); + break; + case TCQ_PLUG_RELEASE_INDEFINITE: + q->unplug_indefinite = true; + q->pkts_to_release = 0; + q->pkts_last_epoch = 0; + q->pkts_current_epoch = 0; + qdisc_unthrottled(sch); + netif_schedule_queue(sch->dev_queue); + break; + case TCQ_PLUG_LIMIT: + /* Limit is supplied in bytes */ + q->limit = msg->limit; + break; + default: + return -EINVAL; + } + + return 0; +} + +static struct Qdisc_ops plug_qdisc_ops __read_mostly = { + .id = "plug", + .priv_size = sizeof(struct plug_sched_data), + .enqueue = plug_enqueue, + .dequeue = plug_dequeue, + .peek = qdisc_peek_head, + .init = plug_init, + .change = plug_change, + .owner = THIS_MODULE, +}; + +static int __init plug_module_init(void) +{ + return register_qdisc(&plug_qdisc_ops); +} + +static void __exit plug_module_exit(void) +{ + unregister_qdisc(&plug_qdisc_ops); +} +module_init(plug_module_init) +module_exit(plug_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 1641db33a99..79359b69ad8 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -7,39 +7,22 @@ * 2 of the License, or (at your option) any later version. 
* * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>: + * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>: * Init -- EINVAL when opt undefined */ -#include <linux/config.h> #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> +#include <net/netlink.h> #include <net/pkt_sched.h> -struct prio_sched_data -{ +struct prio_sched_data { int bands; struct tcf_proto *filter_list; u8 prio2band[TC_PRIO_MAX+1]; @@ -53,30 +36,29 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) struct prio_sched_data *q = qdisc_priv(sch); u32 band = skb->priority; struct tcf_result res; + int err; - *qerr = NET_XMIT_BYPASS; + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { + err = tc_classify(skb, q->filter_list, &res); #ifdef CONFIG_NET_CLS_ACT - switch (tc_classify(skb, q->filter_list, &res)) { + switch (err) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: - *qerr = NET_XMIT_SUCCESS; + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; - }; - - if (!q->filter_list ) { -#else - if (!q->filter_list || tc_classify(skb, q->filter_list, &res)) { + } #endif + if (!q->filter_list || err < 0) { if (TC_H_MAJ(band)) band = 0; - return q->queues[q->prio2band[band&TC_PRIO_MAX]]; + return q->queues[q->prio2band[band & TC_PRIO_MAX]]; } band = res.classid; } band = TC_H_MIN(band) - 1; - if (band > q->bands) + if (band >= q->bands) return q->queues[q->prio2band[0]]; return q->queues[band]; @@ -92,62 +74,47 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) #ifdef CONFIG_NET_CLS_ACT if (qdisc == NULL) { - if (ret == NET_XMIT_BYPASS) + if (ret & __NET_XMIT_BYPASS) sch->qstats.drops++; kfree_skb(skb); return ret; } #endif - if ((ret = qdisc->enqueue(skb, qdisc)) == NET_XMIT_SUCCESS) { - sch->bstats.bytes += skb->len; - sch->bstats.packets++; + ret = qdisc_enqueue(skb, qdisc); + if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; return NET_XMIT_SUCCESS; } - sch->qstats.drops++; - return ret; + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + return ret; } - -static int -prio_requeue(struct sk_buff *skb, struct Qdisc* sch) +static struct sk_buff *prio_peek(struct Qdisc *sch) { - struct Qdisc *qdisc; - int ret; - - qdisc = prio_classify(skb, sch, &ret); -#ifdef CONFIG_NET_CLS_ACT - if (qdisc == NULL) { - if (ret == NET_XMIT_BYPASS) - sch->qstats.drops++; - kfree_skb(skb); - return ret; - } -#endif + struct prio_sched_data *q = qdisc_priv(sch); + int prio; - if ((ret = qdisc->ops->requeue(skb, qdisc)) == NET_XMIT_SUCCESS) { - sch->q.qlen++; - sch->qstats.requeues++; - return 0; + for (prio = 0; prio < q->bands; prio++) { + struct Qdisc *qdisc = q->queues[prio]; + struct sk_buff *skb = qdisc->ops->peek(qdisc); + if (skb) + return skb; } - sch->qstats.drops++; - return NET_XMIT_DROP; + return NULL; } - -static struct sk_buff * -prio_dequeue(struct Qdisc* sch) +static struct sk_buff *prio_dequeue(struct Qdisc *sch) { - struct 
sk_buff *skb; struct prio_sched_data *q = qdisc_priv(sch); int prio; - struct Qdisc *qdisc; for (prio = 0; prio < q->bands; prio++) { - qdisc = q->queues[prio]; - skb = qdisc->dequeue(qdisc); + struct Qdisc *qdisc = q->queues[prio]; + struct sk_buff *skb = qdisc_dequeue_peeked(qdisc); if (skb) { + qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; } @@ -156,7 +123,7 @@ prio_dequeue(struct Qdisc* sch) } -static unsigned int prio_drop(struct Qdisc* sch) +static unsigned int prio_drop(struct Qdisc *sch) { struct prio_sched_data *q = qdisc_priv(sch); int prio; @@ -165,7 +132,7 @@ static unsigned int prio_drop(struct Qdisc* sch) for (prio = q->bands-1; prio >= 0; prio--) { qdisc = q->queues[prio]; - if ((len = qdisc->ops->drop(qdisc)) != 0) { + if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) { sch->q.qlen--; return len; } @@ -175,44 +142,41 @@ static unsigned int prio_drop(struct Qdisc* sch) static void -prio_reset(struct Qdisc* sch) +prio_reset(struct Qdisc *sch) { int prio; struct prio_sched_data *q = qdisc_priv(sch); - for (prio=0; prio<q->bands; prio++) + for (prio = 0; prio < q->bands; prio++) qdisc_reset(q->queues[prio]); sch->q.qlen = 0; } static void -prio_destroy(struct Qdisc* sch) +prio_destroy(struct Qdisc *sch) { int prio; struct prio_sched_data *q = qdisc_priv(sch); - struct tcf_proto *tp; - while ((tp = q->filter_list) != NULL) { - q->filter_list = tp->next; - tcf_destroy(tp); - } - - for (prio=0; prio<q->bands; prio++) + tcf_destroy_chain(&q->filter_list); + for (prio = 0; prio < q->bands; prio++) qdisc_destroy(q->queues[prio]); } -static int prio_tune(struct Qdisc *sch, struct rtattr *opt) +static int prio_tune(struct Qdisc *sch, struct nlattr *opt) { struct prio_sched_data *q = qdisc_priv(sch); - struct tc_prio_qopt *qopt = RTA_DATA(opt); + struct tc_prio_qopt *qopt; int i; - if (opt->rta_len < RTA_LENGTH(sizeof(*qopt))) + if (nla_len(opt) < sizeof(*qopt)) return -EINVAL; + qopt = nla_data(opt); + if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2) return -EINVAL; - for (i=0; i<=TC_PRIO_MAX; i++) { + for (i = 0; i <= TC_PRIO_MAX; i++) { if (qopt->priomap[i] >= qopt->bands) return -EINVAL; } @@ -221,23 +185,33 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt) q->bands = qopt->bands; memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); - for (i=q->bands; i<TCQ_PRIO_BANDS; i++) { - struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc); - if (child != &noop_qdisc) + for (i = q->bands; i < TCQ_PRIO_BANDS; i++) { + struct Qdisc *child = q->queues[i]; + q->queues[i] = &noop_qdisc; + if (child != &noop_qdisc) { + qdisc_tree_decrease_qlen(child, child->q.qlen); qdisc_destroy(child); + } } sch_tree_unlock(sch); - for (i=0; i<q->bands; i++) { + for (i = 0; i < q->bands; i++) { if (q->queues[i] == &noop_qdisc) { - struct Qdisc *child; - child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + struct Qdisc *child, *old; + + child = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, i + 1)); if (child) { sch_tree_lock(sch); - child = xchg(&q->queues[i], child); - - if (child != &noop_qdisc) - qdisc_destroy(child); + old = q->queues[i]; + q->queues[i] = child; + + if (old != &noop_qdisc) { + qdisc_tree_decrease_qlen(old, + old->q.qlen); + qdisc_destroy(old); + } sch_tree_unlock(sch); } } @@ -245,12 +219,12 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt) return 0; } -static int prio_init(struct Qdisc *sch, struct rtattr *opt) +static int prio_init(struct Qdisc *sch, struct nlattr *opt) { struct prio_sched_data *q = 
qdisc_priv(sch); int i; - for (i=0; i<TCQ_PRIO_BANDS; i++) + for (i = 0; i < TCQ_PRIO_BANDS; i++) q->queues[i] = &noop_qdisc; if (opt == NULL) { @@ -258,7 +232,7 @@ static int prio_init(struct Qdisc *sch, struct rtattr *opt) } else { int err; - if ((err= prio_tune(sch, opt)) != 0) + if ((err = prio_tune(sch, opt)) != 0) return err; } return 0; @@ -267,16 +241,19 @@ static int prio_init(struct Qdisc *sch, struct rtattr *opt) static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct prio_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_prio_qopt opt; opt.bands = q->bands; - memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1); - RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1); + + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; + return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } @@ -286,16 +263,13 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct prio_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; - if (band >= q->bands) - return -EINVAL; - if (new == NULL) new = &noop_qdisc; sch_tree_lock(sch); *old = q->queues[band]; q->queues[band] = new; - sch->q.qlen -= (*old)->q.qlen; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); qdisc_reset(*old); sch_tree_unlock(sch); @@ -308,9 +282,6 @@ prio_leaf(struct Qdisc *sch, unsigned long arg) struct prio_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; - if (band >= q->bands) - return NULL; - return q->queues[band]; } @@ -332,38 +303,30 @@ static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 clas static void prio_put(struct Qdisc *q, unsigned long cl) { - return; } -static int prio_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg) +static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, + struct tcmsg *tcm) { - unsigned long cl = *arg; struct prio_sched_data *q = qdisc_priv(sch); - if (cl - 1 > q->bands) - return -ENOENT; + tcm->tcm_handle |= TC_H_MIN(cl); + tcm->tcm_info = q->queues[cl-1]->handle; return 0; } -static int prio_delete(struct Qdisc *sch, unsigned long cl) +static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) { struct prio_sched_data *q = qdisc_priv(sch); - if (cl - 1 > q->bands) - return -ENOENT; - return 0; -} + struct Qdisc *cl_q; + cl_q = q->queues[cl - 1]; + cl_q->qstats.qlen = cl_q->q.qlen; + if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 || + gnet_stats_copy_queue(d, &cl_q->qstats) < 0) + return -1; -static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, - struct tcmsg *tcm) -{ - struct prio_sched_data *q = qdisc_priv(sch); - - if (cl - 1 > q->bands) - return -ENOENT; - tcm->tcm_handle |= TC_H_MIN(cl); - if (q->queues[cl-1]) - tcm->tcm_info = q->queues[cl-1]->handle; return 0; } @@ -380,7 +343,7 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) arg->count++; continue; } - if (arg->fn(sch, prio+1, arg) < 0) { + if (arg->fn(sch, prio + 1, arg) < 0) { arg->stop = 1; break; } @@ -388,7 +351,7 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto **prio_find_tcf(struct Qdisc *sch, unsigned long cl) { struct prio_sched_data *q = qdisc_priv(sch); @@ 
-397,28 +360,27 @@ static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl) return &q->filter_list; } -static struct Qdisc_class_ops prio_class_ops = { +static const struct Qdisc_class_ops prio_class_ops = { .graft = prio_graft, .leaf = prio_leaf, .get = prio_get, .put = prio_put, - .change = prio_change, - .delete = prio_delete, .walk = prio_walk, .tcf_chain = prio_find_tcf, .bind_tcf = prio_bind, .unbind_tcf = prio_put, .dump = prio_dump_class, + .dump_stats = prio_dump_class_stats, }; -static struct Qdisc_ops prio_qdisc_ops = { +static struct Qdisc_ops prio_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &prio_class_ops, .id = "prio", .priv_size = sizeof(struct prio_sched_data), .enqueue = prio_enqueue, .dequeue = prio_dequeue, - .requeue = prio_requeue, + .peek = prio_peek, .drop = prio_drop, .init = prio_init, .reset = prio_reset, @@ -433,7 +395,7 @@ static int __init prio_module_init(void) return register_qdisc(&prio_qdisc_ops); } -static void __exit prio_module_exit(void) +static void __exit prio_module_exit(void) { unregister_qdisc(&prio_qdisc_ops); } diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c new file mode 100644 index 00000000000..8056fb4e618 --- /dev/null +++ b/net/sched/sch_qfq.c @@ -0,0 +1,1582 @@ +/* + * net/sched/sch_qfq.c Quick Fair Queueing Plus Scheduler. + * + * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente. + * Copyright (c) 2012 Paolo Valente. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/bitops.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/pkt_sched.h> +#include <net/sch_generic.h> +#include <net/pkt_sched.h> +#include <net/pkt_cls.h> + + +/* Quick Fair Queueing Plus + ======================== + + Sources: + + [1] Paolo Valente, + "Reducing the Execution Time of Fair-Queueing Schedulers." + http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf + + Sources for QFQ: + + [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient + Packet Scheduling with Tight Bandwidth Distribution Guarantees." + + See also: + http://retis.sssup.it/~fabio/linux/qfq/ + */ + +/* + + QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES + classes. Each aggregate is timestamped with a virtual start time S + and a virtual finish time F, and scheduled according to its + timestamps. S and F are computed as a function of a system virtual + time function V. The classes within each aggregate are instead + scheduled with DRR. + + To speed up operations, QFQ+ divides also aggregates into a limited + number of groups. Which group a class belongs to depends on the + ratio between the maximum packet length for the class and the weight + of the class. Groups have their own S and F. In the end, QFQ+ + schedules groups, then aggregates within groups, then classes within + aggregates. See [1] and [2] for a full description. + + Virtual time computations. + + S, F and V are all computed in fixed point arithmetic with + FRAC_BITS decimal bits. + + QFQ_MAX_INDEX is the maximum index allowed for a group. We need + one bit per index. + QFQ_MAX_WSHIFT is the maximum power of two supported as a weight. 
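
   (Editor's illustrative aside, not in the original comment: with
   FRAC_BITS = 30 and QFQ_MAX_WSHIFT = 10 as defined below, class
   weights lie in [1, 1024] and the real value 1.0 is stored as
   ONE_FP = 1 << 30.  A weight w_i = 4 is kept as
   inv_w = ONE_FP / 4 = 1 << 28, so serving a 1500 byte packet
   advances the finish time by len * inv_w = 1500 << 28, i.e. by
   1500 / 4 = 375 virtual time units once the FRAC_BITS scaling is
   removed.)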
+ + The layout of the bits is as below: + + [ MTU_SHIFT ][ FRAC_BITS ] + [ MAX_INDEX ][ MIN_SLOT_SHIFT ] + ^.__grp->index = 0 + *.__grp->slot_shift + + where MIN_SLOT_SHIFT is derived by difference from the others. + + The max group index corresponds to Lmax/w_min, where + Lmax=1<<MTU_SHIFT, w_min = 1 . + From this, and knowing how many groups (MAX_INDEX) we want, + we can derive the shift corresponding to each group. + + Because we often need to compute + F = S + len/w_i and V = V + len/wsum + instead of storing w_i store the value + inv_w = (1<<FRAC_BITS)/w_i + so we can do F = S + len * inv_w * wsum. + We use W_TOT in the formulas so we can easily move between + static and adaptive weight sum. + + The per-scheduler-instance data contain all the data structures + for the scheduler: bitmaps and bucket lists. + + */ + +/* + * Maximum number of consecutive slots occupied by backlogged classes + * inside a group. + */ +#define QFQ_MAX_SLOTS 32 + +/* + * Shifts used for aggregate<->group mapping. We allow class weights that are + * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the + * group with the smallest index that can support the L_i / r_i configured + * for the classes in the aggregate. + * + * grp->index is the index of the group; and grp->slot_shift + * is the shift for the corresponding (scaled) sigma_i. + */ +#define QFQ_MAX_INDEX 24 +#define QFQ_MAX_WSHIFT 10 + +#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */ +#define QFQ_MAX_WSUM (64*QFQ_MAX_WEIGHT) + +#define FRAC_BITS 30 /* fixed point arithmetic */ +#define ONE_FP (1UL << FRAC_BITS) + +#define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */ +#define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */ + +#define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */ + +/* + * Possible group states. These values are used as indexes for the bitmaps + * array of struct qfq_queue. + */ +enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE }; + +struct qfq_group; + +struct qfq_aggregate; + +struct qfq_class { + struct Qdisc_class_common common; + + unsigned int refcnt; + unsigned int filter_cnt; + + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; + struct gnet_stats_rate_est64 rate_est; + struct Qdisc *qdisc; + struct list_head alist; /* Link for active-classes list. */ + struct qfq_aggregate *agg; /* Parent aggregate. */ + int deficit; /* DRR deficit counter. */ +}; + +struct qfq_aggregate { + struct hlist_node next; /* Link for the slot list. */ + u64 S, F; /* flow timestamps (exact) */ + + /* group we belong to. In principle we would need the index, + * which is log_2(lmax/weight), but we never reference it + * directly, only the group. + */ + struct qfq_group *grp; + + /* these are copied from the flowset. */ + u32 class_weight; /* Weight of each class in this aggregate. */ + /* Max pkt size for the classes in this aggregate, DRR quantum. */ + int lmax; + + u32 inv_w; /* ONE_FP/(sum of weights of classes in aggr.). */ + u32 budgetmax; /* Max budget for this aggregate. */ + u32 initial_budget, budget; /* Initial and current budget. */ + + int num_classes; /* Number of classes in this aggr. */ + struct list_head active; /* DRR queue of active classes. */ + + struct hlist_node nonfull_next; /* See nonfull_aggs in qfq_sched. */ +}; + +struct qfq_group { + u64 S, F; /* group timestamps (approx). */ + unsigned int slot_shift; /* Slot shift. */ + unsigned int index; /* Group index. */ + unsigned int front; /* Index of the front slot. 
*/ + unsigned long full_slots; /* non-empty slots */ + + /* Array of RR lists of active aggregates. */ + struct hlist_head slots[QFQ_MAX_SLOTS]; +}; + +struct qfq_sched { + struct tcf_proto *filter_list; + struct Qdisc_class_hash clhash; + + u64 oldV, V; /* Precise virtual times. */ + struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */ + u32 num_active_agg; /* Num. of active aggregates */ + u32 wsum; /* weight sum */ + u32 iwsum; /* inverse weight sum */ + + unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ + struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ + u32 min_slot_shift; /* Index of the group-0 bit in the bitmaps. */ + + u32 max_agg_classes; /* Max number of classes per aggr. */ + struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */ +}; + +/* + * Possible reasons why the timestamps of an aggregate are updated + * enqueue: the aggregate switches from idle to active and must scheduled + * for service + * requeue: the aggregate finishes its budget, so it stops being served and + * must be rescheduled for service + */ +enum update_reason {enqueue, requeue}; + +static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct Qdisc_class_common *clc; + + clc = qdisc_class_find(&q->clhash, classid); + if (clc == NULL) + return NULL; + return container_of(clc, struct qfq_class, common); +} + +static void qfq_purge_queue(struct qfq_class *cl) +{ + unsigned int len = cl->qdisc->q.qlen; + + qdisc_reset(cl->qdisc); + qdisc_tree_decrease_qlen(cl->qdisc, len); +} + +static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = { + [TCA_QFQ_WEIGHT] = { .type = NLA_U32 }, + [TCA_QFQ_LMAX] = { .type = NLA_U32 }, +}; + +/* + * Calculate a flow index, given its weight and maximum packet length. + * index = log_2(maxlen/weight) but we need to apply the scaling. + * This is used only once at flow creation. + */ +static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift) +{ + u64 slot_size = (u64)maxlen * inv_w; + unsigned long size_map; + int index = 0; + + size_map = slot_size >> min_slot_shift; + if (!size_map) + goto out; + + index = __fls(size_map) + 1; /* basically a log_2 */ + index -= !(slot_size - (1ULL << (index + min_slot_shift - 1))); + + if (index < 0) + index = 0; +out: + pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n", + (unsigned long) ONE_FP/inv_w, maxlen, index); + + return index; +} + +static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *); +static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *, + enum update_reason); + +static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg, + u32 lmax, u32 weight) +{ + INIT_LIST_HEAD(&agg->active); + hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); + + agg->lmax = lmax; + agg->class_weight = weight; +} + +static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q, + u32 lmax, u32 weight) +{ + struct qfq_aggregate *agg; + + hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next) + if (agg->lmax == lmax && agg->class_weight == weight) + return agg; + + return NULL; +} + + +/* Update aggregate as a function of the new number of classes. 
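 * (Editor's illustrative aside, not in the original comment: e.g. an
 * aggregate with lmax 1514 whose classes each have weight 2 gets, when
 * it grows from 2 to 3 classes, budgetmax = 3 * 1514 = 4542 and
 * inv_w = ONE_FP / (2 * 3), exactly as computed in qfq_update_agg()
 * below.)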
*/ +static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg, + int new_num_classes) +{ + u32 new_agg_weight; + + if (new_num_classes == q->max_agg_classes) + hlist_del_init(&agg->nonfull_next); + + if (agg->num_classes > new_num_classes && + new_num_classes == q->max_agg_classes - 1) /* agg no more full */ + hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); + + /* The next assignment may let + * agg->initial_budget > agg->budgetmax + * hold, we will take it into account in charge_actual_service(). + */ + agg->budgetmax = new_num_classes * agg->lmax; + new_agg_weight = agg->class_weight * new_num_classes; + agg->inv_w = ONE_FP/new_agg_weight; + + if (agg->grp == NULL) { + int i = qfq_calc_index(agg->inv_w, agg->budgetmax, + q->min_slot_shift); + agg->grp = &q->groups[i]; + } + + q->wsum += + (int) agg->class_weight * (new_num_classes - agg->num_classes); + q->iwsum = ONE_FP / q->wsum; + + agg->num_classes = new_num_classes; +} + +/* Add class to aggregate. */ +static void qfq_add_to_agg(struct qfq_sched *q, + struct qfq_aggregate *agg, + struct qfq_class *cl) +{ + cl->agg = agg; + + qfq_update_agg(q, agg, agg->num_classes+1); + if (cl->qdisc->q.qlen > 0) { /* adding an active class */ + list_add_tail(&cl->alist, &agg->active); + if (list_first_entry(&agg->active, struct qfq_class, alist) == + cl && q->in_serv_agg != agg) /* agg was inactive */ + qfq_activate_agg(q, agg, enqueue); /* schedule agg */ + } +} + +static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *); + +static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg) +{ + if (!hlist_unhashed(&agg->nonfull_next)) + hlist_del_init(&agg->nonfull_next); + q->wsum -= agg->class_weight; + if (q->wsum != 0) + q->iwsum = ONE_FP / q->wsum; + + if (q->in_serv_agg == agg) + q->in_serv_agg = qfq_choose_next_agg(q); + kfree(agg); +} + +/* Deschedule class from within its parent aggregate. */ +static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) +{ + struct qfq_aggregate *agg = cl->agg; + + + list_del(&cl->alist); /* remove from RR queue of the aggregate */ + if (list_empty(&agg->active)) /* agg is now inactive */ + qfq_deactivate_agg(q, agg); +} + +/* Remove class from its parent aggregate. */ +static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) +{ + struct qfq_aggregate *agg = cl->agg; + + cl->agg = NULL; + if (agg->num_classes == 1) { /* agg being emptied, destroy it */ + qfq_destroy_agg(q, agg); + return; + } + qfq_update_agg(q, agg, agg->num_classes-1); +} + +/* Deschedule class and remove it from its parent aggregate. 
*/ +static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) +{ + if (cl->qdisc->q.qlen > 0) /* class is active */ + qfq_deactivate_class(q, cl); + + qfq_rm_from_agg(q, cl); +} + +/* Move class to a new aggregate, matching the new class weight and/or lmax */ +static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight, + u32 lmax) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight); + + if (new_agg == NULL) { /* create new aggregate */ + new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC); + if (new_agg == NULL) + return -ENOBUFS; + qfq_init_agg(q, new_agg, lmax, weight); + } + qfq_deact_rm_from_agg(q, cl); + qfq_add_to_agg(q, new_agg, cl); + + return 0; +} + +static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct nlattr **tca, unsigned long *arg) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl = (struct qfq_class *)*arg; + bool existing = false; + struct nlattr *tb[TCA_QFQ_MAX + 1]; + struct qfq_aggregate *new_agg = NULL; + u32 weight, lmax, inv_w; + int err; + int delta_w; + + if (tca[TCA_OPTIONS] == NULL) { + pr_notice("qfq: no options\n"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy); + if (err < 0) + return err; + + if (tb[TCA_QFQ_WEIGHT]) { + weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]); + if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) { + pr_notice("qfq: invalid weight %u\n", weight); + return -EINVAL; + } + } else + weight = 1; + + if (tb[TCA_QFQ_LMAX]) { + lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); + if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { + pr_notice("qfq: invalid max length %u\n", lmax); + return -EINVAL; + } + } else + lmax = psched_mtu(qdisc_dev(sch)); + + inv_w = ONE_FP / weight; + weight = ONE_FP / inv_w; + + if (cl != NULL && + lmax == cl->agg->lmax && + weight == cl->agg->class_weight) + return 0; /* nothing to change */ + + delta_w = weight - (cl ? 
cl->agg->class_weight : 0); + + if (q->wsum + delta_w > QFQ_MAX_WSUM) { + pr_notice("qfq: total weight out of range (%d + %u)\n", + delta_w, q->wsum); + return -EINVAL; + } + + if (cl != NULL) { /* modify existing class */ + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) + return err; + } + existing = true; + goto set_change_agg; + } + + /* create and init new class */ + cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL); + if (cl == NULL) + return -ENOBUFS; + + cl->refcnt = 1; + cl->common.classid = classid; + cl->deficit = lmax; + + cl->qdisc = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, classid); + if (cl->qdisc == NULL) + cl->qdisc = &noop_qdisc; + + if (tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) + goto destroy_class; + } + + sch_tree_lock(sch); + qdisc_class_hash_insert(&q->clhash, &cl->common); + sch_tree_unlock(sch); + + qdisc_class_hash_grow(sch, &q->clhash); + +set_change_agg: + sch_tree_lock(sch); + new_agg = qfq_find_agg(q, lmax, weight); + if (new_agg == NULL) { /* create new aggregate */ + sch_tree_unlock(sch); + new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL); + if (new_agg == NULL) { + err = -ENOBUFS; + gen_kill_estimator(&cl->bstats, &cl->rate_est); + goto destroy_class; + } + sch_tree_lock(sch); + qfq_init_agg(q, new_agg, lmax, weight); + } + if (existing) + qfq_deact_rm_from_agg(q, cl); + qfq_add_to_agg(q, new_agg, cl); + sch_tree_unlock(sch); + + *arg = (unsigned long)cl; + return 0; + +destroy_class: + qdisc_destroy(cl->qdisc); + kfree(cl); + return err; +} + +static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) +{ + struct qfq_sched *q = qdisc_priv(sch); + + qfq_rm_from_agg(q, cl); + gen_kill_estimator(&cl->bstats, &cl->rate_est); + qdisc_destroy(cl->qdisc); + kfree(cl); +} + +static int qfq_delete_class(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl = (struct qfq_class *)arg; + + if (cl->filter_cnt > 0) + return -EBUSY; + + sch_tree_lock(sch); + + qfq_purge_queue(cl); + qdisc_class_hash_remove(&q->clhash, &cl->common); + + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). 
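 * (Editor's clarification, not in the original comment: the BUG_ON
 * above only asserts that the reference taken by cops->get() is still
 * held here; the actual qfq_destroy_class() runs later, from
 * qfq_put_class() below, when that last reference is released.)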
+ */ + + sch_tree_unlock(sch); + return 0; +} + +static unsigned long qfq_get_class(struct Qdisc *sch, u32 classid) +{ + struct qfq_class *cl = qfq_find_class(sch, classid); + + if (cl != NULL) + cl->refcnt++; + + return (unsigned long)cl; +} + +static void qfq_put_class(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + + if (--cl->refcnt == 0) + qfq_destroy_class(sch, cl); +} + +static struct tcf_proto **qfq_tcf_chain(struct Qdisc *sch, unsigned long cl) +{ + struct qfq_sched *q = qdisc_priv(sch); + + if (cl) + return NULL; + + return &q->filter_list; +} + +static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + struct qfq_class *cl = qfq_find_class(sch, classid); + + if (cl != NULL) + cl->filter_cnt++; + + return (unsigned long)cl; +} + +static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + + cl->filter_cnt--; +} + +static int qfq_graft_class(struct Qdisc *sch, unsigned long arg, + struct Qdisc *new, struct Qdisc **old) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + + if (new == NULL) { + new = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, cl->common.classid); + if (new == NULL) + new = &noop_qdisc; + } + + sch_tree_lock(sch); + qfq_purge_queue(cl); + *old = cl->qdisc; + cl->qdisc = new; + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + + return cl->qdisc; +} + +static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + struct nlattr *nest; + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = cl->common.classid; + tcm->tcm_info = cl->qdisc->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) || + nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax)) + goto nla_put_failure; + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + struct tc_qfq_stats xstats; + + memset(&xstats, 0, sizeof(xstats)); + cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; + + xstats.weight = cl->agg->class_weight; + xstats.lmax = cl->agg->lmax; + + if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) + return -1; + + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); +} + +static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl; + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl; + struct tcf_result res; + int result; + + if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { + pr_debug("qfq_classify: found %d\n", 
skb->priority); + cl = qfq_find_class(sch, skb->priority); + if (cl != NULL) + return cl; + } + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NULL; + } +#endif + cl = (struct qfq_class *)res.class; + if (cl == NULL) + cl = qfq_find_class(sch, res.classid); + return cl; + } + + return NULL; +} + +/* Generic comparison function, handling wraparound. */ +static inline int qfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + +/* Round a precise timestamp to its slotted value. */ +static inline u64 qfq_round_down(u64 ts, unsigned int shift) +{ + return ts & ~((1ULL << shift) - 1); +} + +/* return the pointer to the group with lowest index in the bitmap */ +static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, + unsigned long bitmap) +{ + int index = __ffs(bitmap); + return &q->groups[index]; +} +/* Calculate a mask to mimic what would be ffs_from(). */ +static inline unsigned long mask_from(unsigned long bitmap, int from) +{ + return bitmap & ~((1UL << from) - 1); +} + +/* + * The state computation relies on ER=0, IR=1, EB=2, IB=3 + * First compute eligibility comparing grp->S, q->V, + * then check if someone is blocking us and possibly add EB + */ +static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp) +{ + /* if S > V we are not eligible */ + unsigned int state = qfq_gt(grp->S, q->V); + unsigned long mask = mask_from(q->bitmaps[ER], grp->index); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (qfq_gt(grp->F, next->F)) + state |= EB; + } + + return state; +} + + +/* + * In principle + * q->bitmaps[dst] |= q->bitmaps[src] & mask; + * q->bitmaps[src] &= ~mask; + * but we should make sure that src != dst + */ +static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask, + int src, int dst) +{ + q->bitmaps[dst] |= q->bitmaps[src] & mask; + q->bitmaps[src] &= ~mask; +} + +static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F) +{ + unsigned long mask = mask_from(q->bitmaps[ER], index + 1); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (!qfq_gt(next->F, old_F)) + return; + } + + mask = (1UL << index) - 1; + qfq_move_groups(q, mask, EB, ER); + qfq_move_groups(q, mask, IB, IR); +} + +/* + * perhaps + * + old_V ^= q->V; + old_V >>= q->min_slot_shift; + if (old_V) { + ... + } + * + */ +static void qfq_make_eligible(struct qfq_sched *q) +{ + unsigned long vslot = q->V >> q->min_slot_shift; + unsigned long old_vslot = q->oldV >> q->min_slot_shift; + + if (vslot != old_vslot) { + unsigned long mask; + int last_flip_pos = fls(vslot ^ old_vslot); + + if (last_flip_pos > 31) /* higher than the number of groups */ + mask = ~0UL; /* make all groups eligible */ + else + mask = (1UL << last_flip_pos) - 1; + + qfq_move_groups(q, mask, IR, ER); + qfq_move_groups(q, mask, IB, EB); + } +} + +/* + * The index of the slot in which the input aggregate agg is to be + * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2' + * and not a '-1' because the start time of the group may be moved + * backward by one slot after the aggregate has been inserted, and + * this would cause non-empty slots to be right-shifted by one + * position. 
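 *
 * (Editor's illustrative note, not in the original comment: an
 * aggregate whose rounded start time lies k slot lengths ahead of the
 * group, i.e. roundedS - grp->S == k << grp->slot_shift, is linked
 * into bucket (grp->front + k) % QFQ_MAX_SLOTS; only when k would
 * exceed QFQ_MAX_SLOTS - 2 does qfq_slot_insert() below shift the
 * aggregate's timestamps backward to cap the index.)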
+ * + * QFQ+ fully satisfies this bound to the slot index if the parameters + * of the classes are not changed dynamically, and if QFQ+ never + * happens to postpone the service of agg unjustly, i.e., it never + * happens that the aggregate becomes backlogged and eligible, or just + * eligible, while an aggregate with a higher approximated finish time + * is being served. In particular, in this case QFQ+ guarantees that + * the timestamps of agg are low enough that the slot index is never + * higher than 2. Unfortunately, QFQ+ cannot provide the same + * guarantee if it happens to unjustly postpone the service of agg, or + * if the parameters of some class are changed. + * + * As for the first event, i.e., an out-of-order service, the + * upper bound to the slot index guaranteed by QFQ+ grows to + * 2 + + * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) * + * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1. + * + * The following function deals with this problem by backward-shifting + * the timestamps of agg, if needed, so as to guarantee that the slot + * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may + * cause the service of other aggregates to be postponed, yet the + * worst-case guarantees of these aggregates are not violated. In + * fact, in case of no out-of-order service, the timestamps of agg + * would have been even lower than they are after the backward shift, + * because QFQ+ would have guaranteed a maximum value equal to 2 for + * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose + * service is postponed because of the backward-shift would have + * however waited for the service of agg before being served. + * + * The other event that may cause the slot index to be higher than 2 + * for agg is a recent change of the parameters of some class. If the + * weight of a class is increased or the lmax (max_pkt_size) of the + * class is decreased, then a new aggregate with smaller slot size + * than the original parent aggregate of the class may happen to be + * activated. The activation of this aggregate should be properly + * delayed to when the service of the class has finished in the ideal + * system tracked by QFQ+. If the activation of the aggregate is not + * delayed to this reference time instant, then this aggregate may be + * unjustly served before other aggregates waiting for service. This + * may cause the above bound to the slot index to be violated for some + * of these unlucky aggregates. + * + * Instead of delaying the activation of the new aggregate, which is + * quite complex, the above-discussed capping of the slot index is + * used to handle also the consequences of a change of the parameters + * of a class. + */ +static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg, + u64 roundedS) +{ + u64 slot = (roundedS - grp->S) >> grp->slot_shift; + unsigned int i; /* slot index in the bucket list */ + + if (unlikely(slot > QFQ_MAX_SLOTS - 2)) { + u64 deltaS = roundedS - grp->S - + ((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift); + agg->S -= deltaS; + agg->F -= deltaS; + slot = QFQ_MAX_SLOTS - 2; + } + + i = (grp->front + slot) % QFQ_MAX_SLOTS; + + hlist_add_head(&agg->next, &grp->slots[i]); + __set_bit(slot, &grp->full_slots); +} + +/* Maybe introduce hlist_first_entry?? 
*/ +static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp) +{ + return hlist_entry(grp->slots[grp->front].first, + struct qfq_aggregate, next); +} + +/* + * remove the entry from the slot + */ +static void qfq_front_slot_remove(struct qfq_group *grp) +{ + struct qfq_aggregate *agg = qfq_slot_head(grp); + + BUG_ON(!agg); + hlist_del(&agg->next); + if (hlist_empty(&grp->slots[grp->front])) + __clear_bit(0, &grp->full_slots); +} + +/* + * Returns the first aggregate in the first non-empty bucket of the + * group. As a side effect, adjusts the bucket list so the first + * non-empty bucket is at position 0 in full_slots. + */ +static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp) +{ + unsigned int i; + + pr_debug("qfq slot_scan: grp %u full %#lx\n", + grp->index, grp->full_slots); + + if (grp->full_slots == 0) + return NULL; + + i = __ffs(grp->full_slots); /* zero based */ + if (i > 0) { + grp->front = (grp->front + i) % QFQ_MAX_SLOTS; + grp->full_slots >>= i; + } + + return qfq_slot_head(grp); +} + +/* + * adjust the bucket list. When the start time of a group decreases, + * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to + * move the objects. The mask of occupied slots must be shifted + * because we use ffs() to find the first non-empty slot. + * This covers decreases in the group's start time, but what about + * increases of the start time ? + * Here too we should make sure that i is less than 32 + */ +static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS) +{ + unsigned int i = (grp->S - roundedS) >> grp->slot_shift; + + grp->full_slots <<= i; + grp->front = (grp->front - i) % QFQ_MAX_SLOTS; +} + +static void qfq_update_eligible(struct qfq_sched *q) +{ + struct qfq_group *grp; + unsigned long ineligible; + + ineligible = q->bitmaps[IR] | q->bitmaps[IB]; + if (ineligible) { + if (!q->bitmaps[ER]) { + grp = qfq_ffs(q, ineligible); + if (qfq_gt(grp->S, q->V)) + q->V = grp->S; + } + qfq_make_eligible(q); + } +} + +/* Dequeue head packet of the head class in the DRR queue of the aggregate. */ +static void agg_dequeue(struct qfq_aggregate *agg, + struct qfq_class *cl, unsigned int len) +{ + qdisc_dequeue_peeked(cl->qdisc); + + cl->deficit -= (int) len; + + if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */ + list_del(&cl->alist); + else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) { + cl->deficit += agg->lmax; + list_move_tail(&cl->alist, &agg->active); + } +} + +static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg, + struct qfq_class **cl, + unsigned int *len) +{ + struct sk_buff *skb; + + *cl = list_first_entry(&agg->active, struct qfq_class, alist); + skb = (*cl)->qdisc->ops->peek((*cl)->qdisc); + if (skb == NULL) + WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n"); + else + *len = qdisc_pkt_len(skb); + + return skb; +} + +/* Update F according to the actual service received by the aggregate. */ +static inline void charge_actual_service(struct qfq_aggregate *agg) +{ + /* Compute the service received by the aggregate, taking into + * account that, after decreasing the number of classes in + * agg, it may happen that + * agg->initial_budget - agg->budget > agg->bugdetmax + */ + u32 service_received = min(agg->budgetmax, + agg->initial_budget - agg->budget); + + agg->F = agg->S + (u64)service_received * agg->inv_w; +} + +/* Assign a reasonable start time for a new aggregate in group i. + * Admissible values for \hat(F) are multiples of \sigma_i + * no greater than V+\sigma_i . 
Larger values mean that + * we had a wraparound so we consider the timestamp to be stale. + * + * If F is not stale and F >= V then we set S = F. + * Otherwise we should assign S = V, but this may violate + * the ordering in EB (see [2]). So, if we have groups in ER, + * set S to the F_j of the first group j which would be blocking us. + * We are guaranteed not to move S backward because + * otherwise our group i would still be blocked. + */ +static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg) +{ + unsigned long mask; + u64 limit, roundedF; + int slot_shift = agg->grp->slot_shift; + + roundedF = qfq_round_down(agg->F, slot_shift); + limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); + + if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) { + /* timestamp was stale */ + mask = mask_from(q->bitmaps[ER], agg->grp->index); + if (mask) { + struct qfq_group *next = qfq_ffs(q, mask); + if (qfq_gt(roundedF, next->F)) { + if (qfq_gt(limit, next->F)) + agg->S = next->F; + else /* preserve timestamp correctness */ + agg->S = limit; + return; + } + } + agg->S = q->V; + } else /* timestamp is not stale */ + agg->S = agg->F; +} + +/* Update the timestamps of agg before scheduling/rescheduling it for + * service. In particular, assign to agg->F its maximum possible + * value, i.e., the virtual finish time with which the aggregate + * should be labeled if it used all its budget once in service. + */ +static inline void +qfq_update_agg_ts(struct qfq_sched *q, + struct qfq_aggregate *agg, enum update_reason reason) +{ + if (reason != requeue) + qfq_update_start(q, agg); + else /* just charge agg for the service received */ + agg->S = agg->F; + + agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w; +} + +static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg); + +static struct sk_buff *qfq_dequeue(struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_aggregate *in_serv_agg = q->in_serv_agg; + struct qfq_class *cl; + struct sk_buff *skb = NULL; + /* next-packet len, 0 means no more active classes in in-service agg */ + unsigned int len = 0; + + if (in_serv_agg == NULL) + return NULL; + + if (!list_empty(&in_serv_agg->active)) + skb = qfq_peek_skb(in_serv_agg, &cl, &len); + + /* + * If there are no active classes in the in-service aggregate, + * or if the aggregate has not enough budget to serve its next + * class, then choose the next aggregate to serve. + */ + if (len == 0 || in_serv_agg->budget < len) { + charge_actual_service(in_serv_agg); + + /* recharge the budget of the aggregate */ + in_serv_agg->initial_budget = in_serv_agg->budget = + in_serv_agg->budgetmax; + + if (!list_empty(&in_serv_agg->active)) { + /* + * Still active: reschedule for + * service. Possible optimization: if no other + * aggregate is active, then there is no point + * in rescheduling this aggregate, and we can + * just keep it as the in-service one. This + * should be however a corner case, and to + * handle it, we would need to maintain an + * extra num_active_aggs field. + */ + qfq_update_agg_ts(q, in_serv_agg, requeue); + qfq_schedule_agg(q, in_serv_agg); + } else if (sch->q.qlen == 0) { /* no aggregate to serve */ + q->in_serv_agg = NULL; + return NULL; + } + + /* + * If we get here, there are other aggregates queued: + * choose the new aggregate to serve. 
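 * (Editor's illustrative note, not in the original comment: this
 * branch is reached, for instance, when an aggregate with budgetmax
 * 3000 has already transmitted 1700 bytes, so budget = 1300, and its
 * next head packet is 1500 bytes long; the aggregate is recharged and
 * rescheduled above, and qfq_choose_next_agg() below picks the
 * aggregate to serve next.)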
+ */ + in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q); + skb = qfq_peek_skb(in_serv_agg, &cl, &len); + } + if (!skb) + return NULL; + + sch->q.qlen--; + qdisc_bstats_update(sch, skb); + + agg_dequeue(in_serv_agg, cl, len); + /* If lmax is lowered, through qfq_change_class, for a class + * owning pending packets with larger size than the new value + * of lmax, then the following condition may hold. + */ + if (unlikely(in_serv_agg->budget < len)) + in_serv_agg->budget = 0; + else + in_serv_agg->budget -= len; + + q->V += (u64)len * q->iwsum; + pr_debug("qfq dequeue: len %u F %lld now %lld\n", + len, (unsigned long long) in_serv_agg->F, + (unsigned long long) q->V); + + return skb; +} + +static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q) +{ + struct qfq_group *grp; + struct qfq_aggregate *agg, *new_front_agg; + u64 old_F; + + qfq_update_eligible(q); + q->oldV = q->V; + + if (!q->bitmaps[ER]) + return NULL; + + grp = qfq_ffs(q, q->bitmaps[ER]); + old_F = grp->F; + + agg = qfq_slot_head(grp); + + /* agg starts to be served, remove it from schedule */ + qfq_front_slot_remove(grp); + + new_front_agg = qfq_slot_scan(grp); + + if (new_front_agg == NULL) /* group is now inactive, remove from ER */ + __clear_bit(grp->index, &q->bitmaps[ER]); + else { + u64 roundedS = qfq_round_down(new_front_agg->S, + grp->slot_shift); + unsigned int s; + + if (grp->S == roundedS) + return agg; + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + __clear_bit(grp->index, &q->bitmaps[ER]); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + } + + qfq_unblock_groups(q, grp->index, old_F); + + return agg; +} + +static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl; + struct qfq_aggregate *agg; + int err = 0; + + cl = qfq_classify(skb, sch, &err); + if (cl == NULL) { + if (err & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return err; + } + pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); + + if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) { + pr_debug("qfq: increasing maxpkt from %u to %u for class %u", + cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid); + err = qfq_change_agg(sch, cl, cl->agg->class_weight, + qdisc_pkt_len(skb)); + if (err) + return err; + } + + err = qdisc_enqueue(skb, cl->qdisc); + if (unlikely(err != NET_XMIT_SUCCESS)) { + pr_debug("qfq_enqueue: enqueue failed %d\n", err); + if (net_xmit_drop_count(err)) { + cl->qstats.drops++; + sch->qstats.drops++; + } + return err; + } + + bstats_update(&cl->bstats, skb); + ++sch->q.qlen; + + agg = cl->agg; + /* if the queue was not empty, then done here */ + if (cl->qdisc->q.qlen != 1) { + if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) && + list_first_entry(&agg->active, struct qfq_class, alist) + == cl && cl->deficit < qdisc_pkt_len(skb)) + list_move_tail(&cl->alist, &agg->active); + + return err; + } + + /* schedule class for service within the aggregate */ + cl->deficit = agg->lmax; + list_add_tail(&cl->alist, &agg->active); + + if (list_first_entry(&agg->active, struct qfq_class, alist) != cl || + q->in_serv_agg == agg) + return err; /* non-empty or in service, nothing else to do */ + + qfq_activate_agg(q, agg, enqueue); + + return err; +} + +/* + * Schedule aggregate according to its timestamps. 
+ */ +static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg) +{ + struct qfq_group *grp = agg->grp; + u64 roundedS; + int s; + + roundedS = qfq_round_down(agg->S, grp->slot_shift); + + /* + * Insert agg in the correct bucket. + * If agg->S >= grp->S we don't need to adjust the + * bucket list and simply go to the insertion phase. + * Otherwise grp->S is decreasing, we must make room + * in the bucket list, and also recompute the group state. + * Finally, if there were no flows in this group and nobody + * was in ER make sure to adjust V. + */ + if (grp->full_slots) { + if (!qfq_gt(grp->S, agg->S)) + goto skip_update; + + /* create a slot for this agg->S */ + qfq_slot_rotate(grp, roundedS); + /* group was surely ineligible, remove */ + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[IB]); + } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) && + q->in_serv_agg == NULL) + q->V = roundedS; + + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + + pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n", + s, q->bitmaps[s], + (unsigned long long) agg->S, + (unsigned long long) agg->F, + (unsigned long long) q->V); + +skip_update: + qfq_slot_insert(grp, agg, roundedS); +} + + +/* Update agg ts and schedule agg for service */ +static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg, + enum update_reason reason) +{ + agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */ + + qfq_update_agg_ts(q, agg, reason); + if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */ + q->in_serv_agg = agg; /* start serving this aggregate */ + /* update V: to be in service, agg must be eligible */ + q->oldV = q->V = agg->S; + } else if (agg != q->in_serv_agg) + qfq_schedule_agg(q, agg); +} + +static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, + struct qfq_aggregate *agg) +{ + unsigned int i, offset; + u64 roundedS; + + roundedS = qfq_round_down(agg->S, grp->slot_shift); + offset = (roundedS - grp->S) >> grp->slot_shift; + + i = (grp->front + offset) % QFQ_MAX_SLOTS; + + hlist_del(&agg->next); + if (hlist_empty(&grp->slots[i])) + __clear_bit(offset, &grp->full_slots); +} + +/* + * Called to forcibly deschedule an aggregate. If the aggregate is + * not in the front bucket, or if the latter has other aggregates in + * the front bucket, we can simply remove the aggregate with no other + * side effects. + * Otherwise we must propagate the event up. 
+ */ +static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg) +{ + struct qfq_group *grp = agg->grp; + unsigned long mask; + u64 roundedS; + int s; + + if (agg == q->in_serv_agg) { + charge_actual_service(agg); + q->in_serv_agg = qfq_choose_next_agg(q); + return; + } + + agg->F = agg->S; + qfq_slot_remove(q, grp, agg); + + if (!grp->full_slots) { + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[EB]); + __clear_bit(grp->index, &q->bitmaps[IB]); + + if (test_bit(grp->index, &q->bitmaps[ER]) && + !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) { + mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); + if (mask) + mask = ~((1UL << __fls(mask)) - 1); + else + mask = ~0UL; + qfq_move_groups(q, mask, EB, ER); + qfq_move_groups(q, mask, IB, IR); + } + __clear_bit(grp->index, &q->bitmaps[ER]); + } else if (hlist_empty(&grp->slots[grp->front])) { + agg = qfq_slot_scan(grp); + roundedS = qfq_round_down(agg->S, grp->slot_shift); + if (grp->S != roundedS) { + __clear_bit(grp->index, &q->bitmaps[ER]); + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[EB]); + __clear_bit(grp->index, &q->bitmaps[IB]); + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + } + } +} + +static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl = (struct qfq_class *)arg; + + if (cl->qdisc->q.qlen == 0) + qfq_deactivate_class(q, cl); +} + +static unsigned int qfq_drop_from_slot(struct qfq_sched *q, + struct hlist_head *slot) +{ + struct qfq_aggregate *agg; + struct qfq_class *cl; + unsigned int len; + + hlist_for_each_entry(agg, slot, next) { + list_for_each_entry(cl, &agg->active, alist) { + + if (!cl->qdisc->ops->drop) + continue; + + len = cl->qdisc->ops->drop(cl->qdisc); + if (len > 0) { + if (cl->qdisc->q.qlen == 0) + qfq_deactivate_class(q, cl); + + return len; + } + } + } + return 0; +} + +static unsigned int qfq_drop(struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_group *grp; + unsigned int i, j, len; + + for (i = 0; i <= QFQ_MAX_INDEX; i++) { + grp = &q->groups[i]; + for (j = 0; j < QFQ_MAX_SLOTS; j++) { + len = qfq_drop_from_slot(q, &grp->slots[j]); + if (len > 0) { + sch->q.qlen--; + return len; + } + } + + } + + return 0; +} + +static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_group *grp; + int i, j, err; + u32 max_cl_shift, maxbudg_shift, max_classes; + + err = qdisc_class_hash_init(&q->clhash); + if (err < 0) + return err; + + if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES) + max_classes = QFQ_MAX_AGG_CLASSES; + else + max_classes = qdisc_dev(sch)->tx_queue_len + 1; + /* max_cl_shift = floor(log_2(max_classes)) */ + max_cl_shift = __fls(max_classes); + q->max_agg_classes = 1<<max_cl_shift; + + /* maxbudg_shift = log2(max_len * max_classes_per_agg) */ + maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift; + q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX; + + for (i = 0; i <= QFQ_MAX_INDEX; i++) { + grp = &q->groups[i]; + grp->index = i; + grp->slot_shift = q->min_slot_shift + i; + for (j = 0; j < QFQ_MAX_SLOTS; j++) + INIT_HLIST_HEAD(&grp->slots[j]); + } + + INIT_HLIST_HEAD(&q->nonfull_aggs); + + return 0; +} + +static void qfq_reset_qdisc(struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl; + unsigned int i; + + for 
(i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { + if (cl->qdisc->q.qlen > 0) + qfq_deactivate_class(q, cl); + + qdisc_reset(cl->qdisc); + } + } + sch->q.qlen = 0; +} + +static void qfq_destroy_qdisc(struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl; + struct hlist_node *next; + unsigned int i; + + tcf_destroy_chain(&q->filter_list); + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], + common.hnode) { + qfq_destroy_class(sch, cl); + } + } + qdisc_class_hash_destroy(&q->clhash); +} + +static const struct Qdisc_class_ops qfq_class_ops = { + .change = qfq_change_class, + .delete = qfq_delete_class, + .get = qfq_get_class, + .put = qfq_put_class, + .tcf_chain = qfq_tcf_chain, + .bind_tcf = qfq_bind_tcf, + .unbind_tcf = qfq_unbind_tcf, + .graft = qfq_graft_class, + .leaf = qfq_class_leaf, + .qlen_notify = qfq_qlen_notify, + .dump = qfq_dump_class, + .dump_stats = qfq_dump_class_stats, + .walk = qfq_walk, +}; + +static struct Qdisc_ops qfq_qdisc_ops __read_mostly = { + .cl_ops = &qfq_class_ops, + .id = "qfq", + .priv_size = sizeof(struct qfq_sched), + .enqueue = qfq_enqueue, + .dequeue = qfq_dequeue, + .peek = qdisc_peek_dequeued, + .drop = qfq_drop, + .init = qfq_init_qdisc, + .reset = qfq_reset_qdisc, + .destroy = qfq_destroy_qdisc, + .owner = THIS_MODULE, +}; + +static int __init qfq_init(void) +{ + return register_qdisc(&qfq_qdisc_ops); +} + +static void __exit qfq_exit(void) +{ + unregister_qdisc(&qfq_qdisc_ops); +} + +module_init(qfq_init); +module_exit(qfq_exit); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index dccfa44c2d7..633e32defdc 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -14,11 +14,9 @@ * J Hadi Salim 980816: ECN support */ -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> @@ -38,12 +36,14 @@ if RED works correctly. 
*/ -struct red_sched_data -{ +struct red_sched_data { u32 limit; /* HARD maximal queue length */ unsigned char flags; + struct timer_list adapt_timer; struct red_parms parms; + struct red_vars vars; struct red_stats stats; + struct Qdisc *qdisc; }; static inline int red_use_ecn(struct red_sched_data *q) @@ -56,143 +56,208 @@ static inline int red_use_harddrop(struct red_sched_data *q) return q->flags & TC_RED_HARDDROP; } -static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); - - q->parms.qavg = red_calc_qavg(&q->parms, sch->qstats.backlog); - - if (red_is_idling(&q->parms)) - red_end_of_idle_period(&q->parms); - - switch (red_action(&q->parms, q->parms.qavg)) { - case RED_DONT_MARK: - break; - - case RED_PROB_MARK: - sch->qstats.overlimits++; - if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { - q->stats.prob_drop++; - goto congestion_drop; - } - - q->stats.prob_mark++; - break; - - case RED_HARD_MARK: - sch->qstats.overlimits++; - if (red_use_harddrop(q) || !red_use_ecn(q) || - !INET_ECN_set_ce(skb)) { - q->stats.forced_drop++; - goto congestion_drop; - } - - q->stats.forced_mark++; - break; + struct Qdisc *child = q->qdisc; + int ret; + + q->vars.qavg = red_calc_qavg(&q->parms, + &q->vars, + child->qstats.backlog); + + if (red_is_idling(&q->vars)) + red_end_of_idle_period(&q->vars); + + switch (red_action(&q->parms, &q->vars, q->vars.qavg)) { + case RED_DONT_MARK: + break; + + case RED_PROB_MARK: + sch->qstats.overlimits++; + if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { + q->stats.prob_drop++; + goto congestion_drop; + } + + q->stats.prob_mark++; + break; + + case RED_HARD_MARK: + sch->qstats.overlimits++; + if (red_use_harddrop(q) || !red_use_ecn(q) || + !INET_ECN_set_ce(skb)) { + q->stats.forced_drop++; + goto congestion_drop; + } + + q->stats.forced_mark++; + break; } - if (sch->qstats.backlog + skb->len <= q->limit) - return qdisc_enqueue_tail(skb, sch); - - q->stats.pdrop++; - return qdisc_drop(skb, sch); + ret = qdisc_enqueue(skb, child); + if (likely(ret == NET_XMIT_SUCCESS)) { + sch->q.qlen++; + } else if (net_xmit_drop_count(ret)) { + q->stats.pdrop++; + sch->qstats.drops++; + } + return ret; congestion_drop: qdisc_drop(skb, sch); return NET_XMIT_CN; } -static int red_requeue(struct sk_buff *skb, struct Qdisc* sch) +static struct sk_buff *red_dequeue(struct Qdisc *sch) { + struct sk_buff *skb; struct red_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; - if (red_is_idling(&q->parms)) - red_end_of_idle_period(&q->parms); - - return qdisc_requeue(skb, sch); + skb = child->dequeue(child); + if (skb) { + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + } else { + if (!red_is_idling(&q->vars)) + red_start_of_idle_period(&q->vars); + } + return skb; } -static struct sk_buff * red_dequeue(struct Qdisc* sch) +static struct sk_buff *red_peek(struct Qdisc *sch) { - struct sk_buff *skb; struct red_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; - skb = qdisc_dequeue_head(sch); - - if (skb == NULL && !red_is_idling(&q->parms)) - red_start_of_idle_period(&q->parms); - - return skb; + return child->ops->peek(child); } -static unsigned int red_drop(struct Qdisc* sch) +static unsigned int red_drop(struct Qdisc *sch) { - struct sk_buff *skb; struct red_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + unsigned int len; - skb = qdisc_dequeue_tail(sch); - if (skb) { - unsigned int len = skb->len; + if (child->ops->drop && 
(len = child->ops->drop(child)) > 0) { q->stats.other++; - qdisc_drop(skb, sch); + sch->qstats.drops++; + sch->q.qlen--; return len; } - if (!red_is_idling(&q->parms)) - red_start_of_idle_period(&q->parms); + if (!red_is_idling(&q->vars)) + red_start_of_idle_period(&q->vars); return 0; } -static void red_reset(struct Qdisc* sch) +static void red_reset(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); - qdisc_reset_queue(sch); - red_restart(&q->parms); + qdisc_reset(q->qdisc); + sch->q.qlen = 0; + red_restart(&q->vars); } -static int red_change(struct Qdisc *sch, struct rtattr *opt) +static void red_destroy(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); - struct rtattr *tb[TCA_RED_MAX]; + + del_timer_sync(&q->adapt_timer); + qdisc_destroy(q->qdisc); +} + +static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { + [TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) }, + [TCA_RED_STAB] = { .len = RED_STAB_SIZE }, + [TCA_RED_MAX_P] = { .type = NLA_U32 }, +}; + +static int red_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct red_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_RED_MAX + 1]; struct tc_red_qopt *ctl; + struct Qdisc *child = NULL; + int err; + u32 max_P; - if (opt == NULL || rtattr_parse_nested(tb, TCA_RED_MAX, opt)) + if (opt == NULL) return -EINVAL; - if (tb[TCA_RED_PARMS-1] == NULL || - RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) || - tb[TCA_RED_STAB-1] == NULL || - RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < RED_STAB_SIZE) + err = nla_parse_nested(tb, TCA_RED_MAX, opt, red_policy); + if (err < 0) + return err; + + if (tb[TCA_RED_PARMS] == NULL || + tb[TCA_RED_STAB] == NULL) return -EINVAL; - ctl = RTA_DATA(tb[TCA_RED_PARMS-1]); + max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0; + + ctl = nla_data(tb[TCA_RED_PARMS]); + + if (ctl->limit > 0) { + child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit); + if (IS_ERR(child)) + return PTR_ERR(child); + } sch_tree_lock(sch); q->flags = ctl->flags; q->limit = ctl->limit; + if (child) { + qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_destroy(q->qdisc); + q->qdisc = child; + } + + red_set_parms(&q->parms, + ctl->qth_min, ctl->qth_max, ctl->Wlog, + ctl->Plog, ctl->Scell_log, + nla_data(tb[TCA_RED_STAB]), + max_P); + red_set_vars(&q->vars); - red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, - ctl->Plog, ctl->Scell_log, - RTA_DATA(tb[TCA_RED_STAB-1])); + del_timer(&q->adapt_timer); + if (ctl->flags & TC_RED_ADAPTATIVE) + mod_timer(&q->adapt_timer, jiffies + HZ/2); - if (skb_queue_empty(&sch->q)) - red_end_of_idle_period(&q->parms); + if (!q->qdisc->q.qlen) + red_start_of_idle_period(&q->vars); sch_tree_unlock(sch); return 0; } -static int red_init(struct Qdisc* sch, struct rtattr *opt) +static inline void red_adaptative_timer(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc *)arg; + struct red_sched_data *q = qdisc_priv(sch); + spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + + spin_lock(root_lock); + red_adaptative_algo(&q->parms, &q->vars); + mod_timer(&q->adapt_timer, jiffies + HZ/2); + spin_unlock(root_lock); +} + +static int red_init(struct Qdisc *sch, struct nlattr *opt) { + struct red_sched_data *q = qdisc_priv(sch); + + q->qdisc = &noop_qdisc; + setup_timer(&q->adapt_timer, red_adaptative_timer, (unsigned long)sch); return red_change(sch, opt); } static int red_dump(struct Qdisc *sch, struct sk_buff *skb) { struct red_sched_data *q = qdisc_priv(sch); - struct rtattr *opts = NULL; + struct nlattr *opts = 
NULL; struct tc_red_qopt opt = { .limit = q->limit, .flags = q->flags, @@ -203,12 +268,18 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) .Scell_log = q->parms.Scell_log, }; - opts = RTA_NEST(skb, TCA_OPTIONS); - RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); - return RTA_NEST_END(skb, opts); - -rtattr_failure: - return RTA_NEST_CANCEL(skb, opts); + sch->qstats.backlog = q->qdisc->qstats.backlog; + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || + nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P)) + goto nla_put_failure; + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; } static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) @@ -224,15 +295,80 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) return gnet_stats_copy_app(d, &st, sizeof(st)); } -static struct Qdisc_ops red_qdisc_ops = { +static int red_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct red_sched_data *q = qdisc_priv(sch); + + tcm->tcm_handle |= TC_H_MIN(1); + tcm->tcm_info = q->qdisc->handle; + return 0; +} + +static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct red_sched_data *q = qdisc_priv(sch); + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->qdisc; + q->qdisc = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc *red_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct red_sched_data *q = qdisc_priv(sch); + return q->qdisc; +} + +static unsigned long red_get(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void red_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static void red_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + walker->count++; + } +} + +static const struct Qdisc_class_ops red_class_ops = { + .graft = red_graft, + .leaf = red_leaf, + .get = red_get, + .put = red_put, + .walk = red_walk, + .dump = red_dump_class, +}; + +static struct Qdisc_ops red_qdisc_ops __read_mostly = { .id = "red", .priv_size = sizeof(struct red_sched_data), + .cl_ops = &red_class_ops, .enqueue = red_enqueue, .dequeue = red_dequeue, - .requeue = red_requeue, + .peek = red_peek, .drop = red_drop, .init = red_init, .reset = red_reset, + .destroy = red_destroy, .change = red_change, .dump = red_dump, .dump_stats = red_dump_stats, diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c new file mode 100644 index 00000000000..9b0f7093d97 --- /dev/null +++ b/net/sched/sch_sfb.c @@ -0,0 +1,725 @@ +/* + * net/sched/sch_sfb.c Stochastic Fair Blue + * + * Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr> + * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * W. Feng, D. Kandlur, D. Saha, K. Shin. Blue: + * A New Class of Active Queue Management Algorithms. + * U. Michigan CSE-TR-387-99, April 1999. 
+ * + * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <net/ip.h> +#include <net/pkt_sched.h> +#include <net/inet_ecn.h> +#include <net/flow_keys.h> + +/* + * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level) + * This implementation uses L = 8 and N = 16 + * This permits us to split one 32bit hash (provided per packet by rxhash or + * external classifier) into 8 subhashes of 4 bits. + */ +#define SFB_BUCKET_SHIFT 4 +#define SFB_NUMBUCKETS (1 << SFB_BUCKET_SHIFT) /* N bins per Level */ +#define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1) +#define SFB_LEVELS (32 / SFB_BUCKET_SHIFT) /* L */ + +/* SFB algo uses a virtual queue, named "bin" */ +struct sfb_bucket { + u16 qlen; /* length of virtual queue */ + u16 p_mark; /* marking probability */ +}; + +/* We use a double buffering right before hash change + * (Section 4.4 of SFB reference : moving hash functions) + */ +struct sfb_bins { + u32 perturbation; /* jhash perturbation */ + struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS]; +}; + +struct sfb_sched_data { + struct Qdisc *qdisc; + struct tcf_proto *filter_list; + unsigned long rehash_interval; + unsigned long warmup_time; /* double buffering warmup time in jiffies */ + u32 max; + u32 bin_size; /* maximum queue length per bin */ + u32 increment; /* d1 */ + u32 decrement; /* d2 */ + u32 limit; /* HARD maximal queue length */ + u32 penalty_rate; + u32 penalty_burst; + u32 tokens_avail; + unsigned long rehash_time; + unsigned long token_time; + + u8 slot; /* current active bins (0 or 1) */ + bool double_buffering; + struct sfb_bins bins[2]; + + struct { + u32 earlydrop; + u32 penaltydrop; + u32 bucketdrop; + u32 queuedrop; + u32 childdrop; /* drops in child qdisc */ + u32 marked; /* ECN mark */ + } stats; +}; + +/* + * Each queued skb might be hashed on one or two bins + * We store in skb_cb the two hash values. + * (A zero value means double buffering was not used) + */ +struct sfb_skb_cb { + u32 hashes[2]; +}; + +static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb)); + return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data; +} + +/* + * If using 'internal' SFB flow classifier, hash comes from skb rxhash + * If using external classifier, hash comes from the classid. + */ +static u32 sfb_hash(const struct sk_buff *skb, u32 slot) +{ + return sfb_skb_cb(skb)->hashes[slot]; +} + +/* Probabilities are coded as Q0.16 fixed-point values, + * with 0xFFFF representing 65535/65536 (almost 1.0) + * Addition and subtraction are saturating in [0, 65535] + */ +static u32 prob_plus(u32 p1, u32 p2) +{ + u32 res = p1 + p2; + + return min_t(u32, res, SFB_MAX_PROB); +} + +static u32 prob_minus(u32 p1, u32 p2) +{ + return p1 > p2 ? 
p1 - p2 : 0; +} + +static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) +{ + int i; + struct sfb_bucket *b = &q->bins[slot].bins[0][0]; + + for (i = 0; i < SFB_LEVELS; i++) { + u32 hash = sfbhash & SFB_BUCKET_MASK; + + sfbhash >>= SFB_BUCKET_SHIFT; + if (b[hash].qlen < 0xFFFF) + b[hash].qlen++; + b += SFB_NUMBUCKETS; /* next level */ + } +} + +static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) +{ + u32 sfbhash; + + sfbhash = sfb_hash(skb, 0); + if (sfbhash) + increment_one_qlen(sfbhash, 0, q); + + sfbhash = sfb_hash(skb, 1); + if (sfbhash) + increment_one_qlen(sfbhash, 1, q); +} + +static void decrement_one_qlen(u32 sfbhash, u32 slot, + struct sfb_sched_data *q) +{ + int i; + struct sfb_bucket *b = &q->bins[slot].bins[0][0]; + + for (i = 0; i < SFB_LEVELS; i++) { + u32 hash = sfbhash & SFB_BUCKET_MASK; + + sfbhash >>= SFB_BUCKET_SHIFT; + if (b[hash].qlen > 0) + b[hash].qlen--; + b += SFB_NUMBUCKETS; /* next level */ + } +} + +static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) +{ + u32 sfbhash; + + sfbhash = sfb_hash(skb, 0); + if (sfbhash) + decrement_one_qlen(sfbhash, 0, q); + + sfbhash = sfb_hash(skb, 1); + if (sfbhash) + decrement_one_qlen(sfbhash, 1, q); +} + +static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q) +{ + b->p_mark = prob_minus(b->p_mark, q->decrement); +} + +static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q) +{ + b->p_mark = prob_plus(b->p_mark, q->increment); +} + +static void sfb_zero_all_buckets(struct sfb_sched_data *q) +{ + memset(&q->bins, 0, sizeof(q->bins)); +} + +/* + * compute max qlen, max p_mark, and avg p_mark + */ +static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q) +{ + int i; + u32 qlen = 0, prob = 0, totalpm = 0; + const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0]; + + for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) { + if (qlen < b->qlen) + qlen = b->qlen; + totalpm += b->p_mark; + if (prob < b->p_mark) + prob = b->p_mark; + b++; + } + *prob_r = prob; + *avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS); + return qlen; +} + + +static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q) +{ + q->bins[slot].perturbation = prandom_u32(); +} + +static void sfb_swap_slot(struct sfb_sched_data *q) +{ + sfb_init_perturbation(q->slot, q); + q->slot ^= 1; + q->double_buffering = false; +} + +/* Non elastic flows are allowed to use part of the bandwidth, expressed + * in "penalty_rate" packets per second, with "penalty_burst" burst + */ +static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q) +{ + if (q->penalty_rate == 0 || q->penalty_burst == 0) + return true; + + if (q->tokens_avail < 1) { + unsigned long age = min(10UL * HZ, jiffies - q->token_time); + + q->tokens_avail = (age * q->penalty_rate) / HZ; + if (q->tokens_avail > q->penalty_burst) + q->tokens_avail = q->penalty_burst; + q->token_time = jiffies; + if (q->tokens_avail < 1) + return true; + } + + q->tokens_avail--; + return false; +} + +static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q, + int *qerr, u32 *salt) +{ + struct tcf_result res; + int result; + + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return false; + } +#endif + *salt = TC_H_MIN(res.classid); + return true; + } + return false; +} + 
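/*
 * Illustrative user-space sketch (not part of this patch): a minimal model
 * of the per-level bin lookup that sfb_enqueue() performs below.  It assumes
 * the same constants as above (SFB_LEVELS = 8, SFB_BUCKET_SHIFT = 4), and
 * DEMO_MAX_PROB mirrors the Q0.16 cap used for p_mark (0xFFFF, almost 1.0).
 * All "demo_" names are invented for the example; build with any hosted C
 * compiler.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_LEVELS		8	/* 32-bit hash / 4 bits per level */
#define DEMO_BUCKET_SHIFT	4
#define DEMO_NUMBUCKETS		(1 << DEMO_BUCKET_SHIFT)
#define DEMO_BUCKET_MASK	(DEMO_NUMBUCKETS - 1)
#define DEMO_MAX_PROB		0xFFFFu	/* Q0.16: 0xFFFF is almost 1.0 */

struct demo_bucket {
	uint16_t qlen;		/* virtual queue length */
	uint16_t p_mark;	/* marking probability, Q0.16 */
};

/* Walk one bucket per level and report the minima, as sfb_enqueue() does. */
static void demo_lookup(struct demo_bucket bins[DEMO_LEVELS][DEMO_NUMBUCKETS],
			uint32_t hash, uint32_t *min_qlen, uint32_t *min_pmark)
{
	uint32_t qlen = ~0u, pmark = DEMO_MAX_PROB;
	int level;

	for (level = 0; level < DEMO_LEVELS; level++) {
		struct demo_bucket *b = &bins[level][hash & DEMO_BUCKET_MASK];

		hash >>= DEMO_BUCKET_SHIFT;	/* move on to the next 4-bit subhash */
		if (b->qlen < qlen)
			qlen = b->qlen;
		if (b->p_mark < pmark)
			pmark = b->p_mark;
	}
	*min_qlen = qlen;
	*min_pmark = pmark;
}

int main(void)
{
	static struct demo_bucket bins[DEMO_LEVELS][DEMO_NUMBUCKETS];
	uint32_t hash = 0x12345675u;
	uint32_t qlen, pmark;
	int level;

	/* A flow only looks "big" if every one of its buckets is loaded:
	 * load the matching bucket at each level so the minimum rises.
	 */
	for (level = 0; level < DEMO_LEVELS; level++)
		bins[level][(hash >> (level * DEMO_BUCKET_SHIFT)) & DEMO_BUCKET_MASK].qlen = 7;

	demo_lookup(bins, hash, &qlen, &pmark);
	printf("min qlen = %u, min p_mark = %u/65536\n", qlen, pmark);
	return 0;
}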
+static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + + struct sfb_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + int i; + u32 p_min = ~0; + u32 minqlen = ~0; + u32 r, slot, salt, sfbhash; + int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + struct flow_keys keys; + + if (unlikely(sch->q.qlen >= q->limit)) { + sch->qstats.overlimits++; + q->stats.queuedrop++; + goto drop; + } + + if (q->rehash_interval > 0) { + unsigned long limit = q->rehash_time + q->rehash_interval; + + if (unlikely(time_after(jiffies, limit))) { + sfb_swap_slot(q); + q->rehash_time = jiffies; + } else if (unlikely(!q->double_buffering && q->warmup_time > 0 && + time_after(jiffies, limit - q->warmup_time))) { + q->double_buffering = true; + } + } + + if (q->filter_list) { + /* If using external classifiers, get result and record it. */ + if (!sfb_classify(skb, q, &ret, &salt)) + goto other_drop; + keys.src = salt; + keys.dst = 0; + keys.ports = 0; + } else { + skb_flow_dissect(skb, &keys); + } + + slot = q->slot; + + sfbhash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src, + (__force u32)keys.ports, + q->bins[slot].perturbation); + if (!sfbhash) + sfbhash = 1; + sfb_skb_cb(skb)->hashes[slot] = sfbhash; + + for (i = 0; i < SFB_LEVELS; i++) { + u32 hash = sfbhash & SFB_BUCKET_MASK; + struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; + + sfbhash >>= SFB_BUCKET_SHIFT; + if (b->qlen == 0) + decrement_prob(b, q); + else if (b->qlen >= q->bin_size) + increment_prob(b, q); + if (minqlen > b->qlen) + minqlen = b->qlen; + if (p_min > b->p_mark) + p_min = b->p_mark; + } + + slot ^= 1; + sfb_skb_cb(skb)->hashes[slot] = 0; + + if (unlikely(minqlen >= q->max)) { + sch->qstats.overlimits++; + q->stats.bucketdrop++; + goto drop; + } + + if (unlikely(p_min >= SFB_MAX_PROB)) { + /* Inelastic flow */ + if (q->double_buffering) { + sfbhash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src, + (__force u32)keys.ports, + q->bins[slot].perturbation); + if (!sfbhash) + sfbhash = 1; + sfb_skb_cb(skb)->hashes[slot] = sfbhash; + + for (i = 0; i < SFB_LEVELS; i++) { + u32 hash = sfbhash & SFB_BUCKET_MASK; + struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; + + sfbhash >>= SFB_BUCKET_SHIFT; + if (b->qlen == 0) + decrement_prob(b, q); + else if (b->qlen >= q->bin_size) + increment_prob(b, q); + } + } + if (sfb_rate_limit(skb, q)) { + sch->qstats.overlimits++; + q->stats.penaltydrop++; + goto drop; + } + goto enqueue; + } + + r = prandom_u32() & SFB_MAX_PROB; + + if (unlikely(r < p_min)) { + if (unlikely(p_min > SFB_MAX_PROB / 2)) { + /* If we're marking that many packets, then either + * this flow is unresponsive, or we're badly congested. + * In either case, we want to start dropping packets. 
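 * (Editorial note: with r drawn uniformly from [0, SFB_MAX_PROB], the test
 * below drops with probability 2 * (p_min - 1/2), i.e. the drop rate ramps
 * linearly from 0 at p_min = 1/2 up to ~1 when p_min saturates.)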
+ */ + if (r < (p_min - SFB_MAX_PROB / 2) * 2) { + q->stats.earlydrop++; + goto drop; + } + } + if (INET_ECN_set_ce(skb)) { + q->stats.marked++; + } else { + q->stats.earlydrop++; + goto drop; + } + } + +enqueue: + ret = qdisc_enqueue(skb, child); + if (likely(ret == NET_XMIT_SUCCESS)) { + sch->q.qlen++; + increment_qlen(skb, q); + } else if (net_xmit_drop_count(ret)) { + q->stats.childdrop++; + sch->qstats.drops++; + } + return ret; + +drop: + qdisc_drop(skb, sch); + return NET_XMIT_CN; +other_drop: + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; +} + +static struct sk_buff *sfb_dequeue(struct Qdisc *sch) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + struct sk_buff *skb; + + skb = child->dequeue(q->qdisc); + + if (skb) { + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + decrement_qlen(skb, q); + } + + return skb; +} + +static struct sk_buff *sfb_peek(struct Qdisc *sch) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + + return child->ops->peek(child); +} + +/* No sfb_drop -- impossible since the child doesn't return the dropped skb. */ + +static void sfb_reset(struct Qdisc *sch) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + qdisc_reset(q->qdisc); + sch->q.qlen = 0; + q->slot = 0; + q->double_buffering = false; + sfb_zero_all_buckets(q); + sfb_init_perturbation(0, q); +} + +static void sfb_destroy(struct Qdisc *sch) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + qdisc_destroy(q->qdisc); +} + +static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = { + [TCA_SFB_PARMS] = { .len = sizeof(struct tc_sfb_qopt) }, +}; + +static const struct tc_sfb_qopt sfb_default_ops = { + .rehash_interval = 600 * MSEC_PER_SEC, + .warmup_time = 60 * MSEC_PER_SEC, + .limit = 0, + .max = 25, + .bin_size = 20, + .increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */ + .decrement = (SFB_MAX_PROB + 3000) / 6000, + .penalty_rate = 10, + .penalty_burst = 20, +}; + +static int sfb_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct Qdisc *child; + struct nlattr *tb[TCA_SFB_MAX + 1]; + const struct tc_sfb_qopt *ctl = &sfb_default_ops; + u32 limit; + int err; + + if (opt) { + err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy); + if (err < 0) + return -EINVAL; + + if (tb[TCA_SFB_PARMS] == NULL) + return -EINVAL; + + ctl = nla_data(tb[TCA_SFB_PARMS]); + } + + limit = ctl->limit; + if (limit == 0) + limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1); + + child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit); + if (IS_ERR(child)) + return PTR_ERR(child); + + sch_tree_lock(sch); + + qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_destroy(q->qdisc); + q->qdisc = child; + + q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval); + q->warmup_time = msecs_to_jiffies(ctl->warmup_time); + q->rehash_time = jiffies; + q->limit = limit; + q->increment = ctl->increment; + q->decrement = ctl->decrement; + q->max = ctl->max; + q->bin_size = ctl->bin_size; + q->penalty_rate = ctl->penalty_rate; + q->penalty_burst = ctl->penalty_burst; + q->tokens_avail = ctl->penalty_burst; + q->token_time = jiffies; + + q->slot = 0; + q->double_buffering = false; + sfb_zero_all_buckets(q); + sfb_init_perturbation(0, q); + sfb_init_perturbation(1, q); + + sch_tree_unlock(sch); + + return 0; +} + +static int sfb_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + 
q->qdisc = &noop_qdisc; + return sfb_change(sch, opt); +} + +static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + struct tc_sfb_qopt opt = { + .rehash_interval = jiffies_to_msecs(q->rehash_interval), + .warmup_time = jiffies_to_msecs(q->warmup_time), + .limit = q->limit, + .max = q->max, + .bin_size = q->bin_size, + .increment = q->increment, + .decrement = q->decrement, + .penalty_rate = q->penalty_rate, + .penalty_burst = q->penalty_burst, + }; + + sch->qstats.backlog = q->qdisc->qstats.backlog; + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + if (nla_put(skb, TCA_SFB_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct tc_sfb_xstats st = { + .earlydrop = q->stats.earlydrop, + .penaltydrop = q->stats.penaltydrop, + .bucketdrop = q->stats.bucketdrop, + .queuedrop = q->stats.queuedrop, + .childdrop = q->stats.childdrop, + .marked = q->stats.marked, + }; + + st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q); + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static int sfb_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + return -ENOSYS; +} + +static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->qdisc; + q->qdisc = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + return q->qdisc; +} + +static unsigned long sfb_get(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void sfb_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct nlattr **tca, unsigned long *arg) +{ + return -ENOSYS; +} + +static int sfb_delete(struct Qdisc *sch, unsigned long cl) +{ + return -ENOSYS; +} + +static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + walker->count++; + } +} + +static struct tcf_proto **sfb_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return 0; +} + + +static const struct Qdisc_class_ops sfb_class_ops = { + .graft = sfb_graft, + .leaf = sfb_leaf, + .get = sfb_get, + .put = sfb_put, + .change = sfb_change_class, + .delete = sfb_delete, + .walk = sfb_walk, + .tcf_chain = sfb_find_tcf, + .bind_tcf = sfb_bind, + .unbind_tcf = sfb_put, + .dump = sfb_dump_class, +}; + +static struct Qdisc_ops sfb_qdisc_ops __read_mostly = { + .id = "sfb", + .priv_size = sizeof(struct sfb_sched_data), + .cl_ops = &sfb_class_ops, + .enqueue = sfb_enqueue, + .dequeue = sfb_dequeue, + .peek = sfb_peek, + .init = sfb_init, + .reset = sfb_reset, + .destroy = sfb_destroy, + .change = sfb_change, + .dump = sfb_dump, + .dump_stats = 
sfb_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init sfb_module_init(void) +{ + return register_qdisc(&sfb_qdisc_ops); +} + +static void __exit sfb_module_exit(void) +{ + unregister_qdisc(&sfb_qdisc_ops); +} + +module_init(sfb_module_init) +module_exit(sfb_module_exit) + +MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline"); +MODULE_AUTHOR("Juliusz Chroboczek"); +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 86d8da0cbd0..1af2f73906d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -9,33 +9,22 @@ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ -#include <linux/config.h> #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> #include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> #include <linux/init.h> -#include <net/ip.h> -#include <linux/ipv6.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <net/netlink.h> #include <net/pkt_sched.h> +#include <net/flow_keys.h> +#include <net/red.h> /* Stochastic Fairness Queuing algorithm. @@ -54,7 +43,7 @@ Queuing using Deficit Round Robin", Proc. SIGCOMM 95. - This is not the thing that is usually called (W)FQ nowadays. + This is not the thing that is usually called (W)FQ nowadays. It does not use any timestamp mechanism, but instead processes queues in round-robin order. @@ -64,7 +53,7 @@ DRAWBACKS: - - "Stochastic" -> It is not 100% fair. + - "Stochastic" -> It is not 100% fair. When hash collisions occur, several flows are considered as one. - "Round-robin" -> It introduces larger delays than virtual clock @@ -78,122 +67,194 @@ SFQ is superior for this purpose. IMPLEMENTATION: - This implementation limits maximal queue length to 128; - maximal mtu to 2^15-1; number of hash buckets to 1024. - The only goal of this restrictions was that all data - fit into one 4K page :-). Struct sfq_sched_data is - organized in anti-cache manner: all the data for a bucket - are scattered over different locations. This is not good, - but it allowed me to put it into 4K. + This implementation limits : + - maximal queue length per flow to 127 packets. + - max mtu to 2^18-1; + - max 65408 flows, + - number of hash buckets to 65536. It is easy to increase these values, but not in flight. */ -#define SFQ_DEPTH 128 -#define SFQ_HASH_DIVISOR 1024 +#define SFQ_MAX_DEPTH 127 /* max number of packets per flow */ +#define SFQ_DEFAULT_FLOWS 128 +#define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */ +#define SFQ_EMPTY_SLOT 0xffff +#define SFQ_DEFAULT_HASH_DIVISOR 1024 -/* This type should contain at least SFQ_DEPTH*2 values */ -typedef unsigned char sfq_index; +/* We use 16 bits to store allot, and want to handle packets up to 64K + * Scale allot by 8 (1<<3) so that no overflow occurs. 
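 * (Editorial note: e.g. a 64KB GSO packet charges SFQ_ALLOT_SIZE(65536) =
 * 8192 scaled units against a slot's allot, well within the signed 16-bit
 * field.)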
+ */ +#define SFQ_ALLOT_SHIFT 3 +#define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT) -struct sfq_head -{ +/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */ +typedef u16 sfq_index; + +/* + * We dont use pointers to save space. + * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array + * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH] + * are 'pointers' to dep[] array + */ +struct sfq_head { sfq_index next; sfq_index prev; }; -struct sfq_sched_data -{ -/* Parameters */ - int perturb_period; - unsigned quantum; /* Allotment per round: MUST BE >= MTU */ - int limit; +struct sfq_slot { + struct sk_buff *skblist_next; + struct sk_buff *skblist_prev; + sfq_index qlen; /* number of skbs in skblist */ + sfq_index next; /* next slot in sfq RR chain */ + struct sfq_head dep; /* anchor in dep[] chains */ + unsigned short hash; /* hash value (index in ht[]) */ + short allot; /* credit for this slot */ + + unsigned int backlog; + struct red_vars vars; +}; -/* Variables */ +struct sfq_sched_data { +/* frequently used fields */ + int limit; /* limit of total number of packets in this qdisc */ + unsigned int divisor; /* number of slots in hash table */ + u8 headdrop; + u8 maxdepth; /* limit of packets per flow */ + + u32 perturbation; + u8 cur_depth; /* depth of longest slot */ + u8 flags; + unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ + struct tcf_proto *filter_list; + sfq_index *ht; /* Hash table ('divisor' slots) */ + struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ + + struct red_parms *red_parms; + struct tc_sfqred_stats stats; + struct sfq_slot *tail; /* current slot in round */ + + struct sfq_head dep[SFQ_MAX_DEPTH + 1]; + /* Linked lists of slots, indexed by depth + * dep[0] : list of unused flows + * dep[1] : list of flows with 1 packet + * dep[X] : list of flows with X packets + */ + + unsigned int maxflows; /* number of flows in flows array */ + int perturb_period; + unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ struct timer_list perturb_timer; - int perturbation; - sfq_index tail; /* Index of current slot in round */ - sfq_index max_depth; /* Maximal depth */ - - sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */ - sfq_index next[SFQ_DEPTH]; /* Active slots link */ - short allot[SFQ_DEPTH]; /* Current allotment per slot */ - unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */ - struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */ - struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */ }; -static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1) +/* + * sfq_head are either in a sfq_slot or in dep[] array + */ +static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val) { - int pert = q->perturbation; + if (val < SFQ_MAX_FLOWS) + return &q->slots[val].dep; + return &q->dep[val - SFQ_MAX_FLOWS]; +} - /* Have we any rotation primitives? If not, WHY? 
*/ - h ^= (h1<<pert) ^ (h1>>(0x1F - pert)); - h ^= h>>10; - return h & 0x3FF; +/* + * In order to be able to quickly rehash our queue when timer changes + * q->perturbation, we store flow_keys in skb->cb[] + */ +struct sfq_skb_cb { + struct flow_keys keys; +}; + +static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb)); + return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data; } -static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) +static unsigned int sfq_hash(const struct sfq_sched_data *q, + const struct sk_buff *skb) { - u32 h, h2; + const struct flow_keys *keys = &sfq_skb_cb(skb)->keys; + unsigned int hash; - switch (skb->protocol) { - case __constant_htons(ETH_P_IP): - { - struct iphdr *iph = skb->nh.iph; - h = iph->daddr; - h2 = iph->saddr^iph->protocol; - if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && - (iph->protocol == IPPROTO_TCP || - iph->protocol == IPPROTO_UDP || - iph->protocol == IPPROTO_SCTP || - iph->protocol == IPPROTO_DCCP || - iph->protocol == IPPROTO_ESP)) - h2 ^= *(((u32*)iph) + iph->ihl); - break; - } - case __constant_htons(ETH_P_IPV6): - { - struct ipv6hdr *iph = skb->nh.ipv6h; - h = iph->daddr.s6_addr32[3]; - h2 = iph->saddr.s6_addr32[3]^iph->nexthdr; - if (iph->nexthdr == IPPROTO_TCP || - iph->nexthdr == IPPROTO_UDP || - iph->nexthdr == IPPROTO_SCTP || - iph->nexthdr == IPPROTO_DCCP || - iph->nexthdr == IPPROTO_ESP) - h2 ^= *(u32*)&iph[1]; - break; + hash = jhash_3words((__force u32)keys->dst, + (__force u32)keys->src ^ keys->ip_proto, + (__force u32)keys->ports, q->perturbation); + return hash & (q->divisor - 1); +} + +static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + struct tcf_result res; + int result; + + if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->divisor) + return TC_H_MIN(skb->priority); + + if (!q->filter_list) { + skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys); + return sfq_hash(q, skb) + 1; } - default: - h = (u32)(unsigned long)skb->dst^skb->protocol; - h2 = (u32)(unsigned long)skb->sk; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return 0; + } +#endif + if (TC_H_MIN(res.classid) <= q->divisor) + return TC_H_MIN(res.classid); } - return sfq_fold_hash(q, h, h2); + return 0; } +/* + * x : slot number [0 .. 
SFQ_MAX_FLOWS - 1] + */ static inline void sfq_link(struct sfq_sched_data *q, sfq_index x) { sfq_index p, n; - int d = q->qs[x].qlen + SFQ_DEPTH; + struct sfq_slot *slot = &q->slots[x]; + int qlen = slot->qlen; - p = d; - n = q->dep[d].next; - q->dep[x].next = n; - q->dep[x].prev = p; - q->dep[p].next = q->dep[n].prev = x; + p = qlen + SFQ_MAX_FLOWS; + n = q->dep[qlen].next; + + slot->dep.next = n; + slot->dep.prev = p; + + q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */ + sfq_dep_head(q, n)->prev = x; } +#define sfq_unlink(q, x, n, p) \ + do { \ + n = q->slots[x].dep.next; \ + p = q->slots[x].dep.prev; \ + sfq_dep_head(q, p)->next = n; \ + sfq_dep_head(q, n)->prev = p; \ + } while (0) + + static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x) { sfq_index p, n; + int d; - n = q->dep[x].next; - p = q->dep[x].prev; - q->dep[p].next = n; - q->dep[n].prev = p; - - if (n == p && q->max_depth == q->qs[x].qlen + 1) - q->max_depth--; + sfq_unlink(q, x, n, p); + d = q->slots[x].qlen--; + if (n == p && q->cur_depth == d) + q->cur_depth--; sfq_link(q, x); } @@ -202,169 +263,285 @@ static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x) sfq_index p, n; int d; - n = q->dep[x].next; - p = q->dep[x].prev; - q->dep[p].next = n; - q->dep[n].prev = p; - d = q->qs[x].qlen; - if (q->max_depth < d) - q->max_depth = d; + sfq_unlink(q, x, n, p); + d = ++q->slots[x].qlen; + if (q->cur_depth < d) + q->cur_depth = d; sfq_link(q, x); } +/* helper functions : might be changed when/if skb use a standard list_head */ + +/* remove one skb from tail of slot queue */ +static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot) +{ + struct sk_buff *skb = slot->skblist_prev; + + slot->skblist_prev = skb->prev; + skb->prev->next = (struct sk_buff *)slot; + skb->next = skb->prev = NULL; + return skb; +} + +/* remove one skb from head of slot queue */ +static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot) +{ + struct sk_buff *skb = slot->skblist_next; + + slot->skblist_next = skb->next; + skb->next->prev = (struct sk_buff *)slot; + skb->next = skb->prev = NULL; + return skb; +} + +static inline void slot_queue_init(struct sfq_slot *slot) +{ + memset(slot, 0, sizeof(*slot)); + slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot; +} + +/* add skb to slot queue (tail add) */ +static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb) +{ + skb->prev = slot->skblist_prev; + skb->next = (struct sk_buff *)slot; + slot->skblist_prev->next = skb; + slot->skblist_prev = skb; +} + +#define slot_queue_walk(slot, skb) \ + for (skb = slot->skblist_next; \ + skb != (struct sk_buff *)slot; \ + skb = skb->next) + static unsigned int sfq_drop(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); - sfq_index d = q->max_depth; + sfq_index x, d = q->cur_depth; struct sk_buff *skb; unsigned int len; + struct sfq_slot *slot; - /* Queue is full! Find the longest slot and - drop a packet from it */ - + /* Queue is full! Find the longest slot and drop tail packet from it */ if (d > 1) { - sfq_index x = q->dep[d+SFQ_DEPTH].next; - skb = q->qs[x].prev; - len = skb->len; - __skb_unlink(skb, &q->qs[x]); - kfree_skb(skb); + x = q->dep[d].next; + slot = &q->slots[x]; +drop: + skb = q->headdrop ? 
slot_dequeue_head(slot) : slot_dequeue_tail(slot); + len = qdisc_pkt_len(skb); + slot->backlog -= len; sfq_dec(q, x); + kfree_skb(skb); sch->q.qlen--; sch->qstats.drops++; + sch->qstats.backlog -= len; return len; } if (d == 1) { /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ - d = q->next[q->tail]; - q->next[q->tail] = q->next[d]; - q->allot[q->next[d]] += q->quantum; - skb = q->qs[d].prev; - len = skb->len; - __skb_unlink(skb, &q->qs[d]); - kfree_skb(skb); - sfq_dec(q, d); - sch->q.qlen--; - q->ht[q->hash[d]] = SFQ_DEPTH; - sch->qstats.drops++; - return len; + x = q->tail->next; + slot = &q->slots[x]; + q->tail->next = slot->next; + q->ht[slot->hash] = SFQ_EMPTY_SLOT; + goto drop; } return 0; } +/* Is ECN parameter configured */ +static int sfq_prob_mark(const struct sfq_sched_data *q) +{ + return q->flags & TC_RED_ECN; +} + +/* Should packets over max threshold just be marked */ +static int sfq_hard_mark(const struct sfq_sched_data *q) +{ + return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN; +} + +static int sfq_headdrop(const struct sfq_sched_data *q) +{ + return q->headdrop; +} + static int -sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) +sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); - unsigned hash = sfq_hash(q, skb); - sfq_index x; + unsigned int hash; + sfq_index x, qlen; + struct sfq_slot *slot; + int uninitialized_var(ret); + struct sk_buff *head; + int delta; + + hash = sfq_classify(skb, sch, &ret); + if (hash == 0) { + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } + hash--; x = q->ht[hash]; - if (x == SFQ_DEPTH) { - q->ht[hash] = x = q->dep[SFQ_DEPTH].next; - q->hash[x] = hash; + slot = &q->slots[x]; + if (x == SFQ_EMPTY_SLOT) { + x = q->dep[0].next; /* get a free slot */ + if (x >= SFQ_MAX_FLOWS) + return qdisc_drop(skb, sch); + q->ht[hash] = x; + slot = &q->slots[x]; + slot->hash = hash; + slot->backlog = 0; /* should already be 0 anyway... 
*/ + red_set_vars(&slot->vars); + goto enqueue; } - __skb_queue_tail(&q->qs[x], skb); - sfq_inc(q, x); - if (q->qs[x].qlen == 1) { /* The flow is new */ - if (q->tail == SFQ_DEPTH) { /* It is the first flow */ - q->tail = x; - q->next[x] = x; - q->allot[x] = q->quantum; - } else { - q->next[x] = q->next[q->tail]; - q->next[q->tail] = x; - q->tail = x; + if (q->red_parms) { + slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms, + &slot->vars, + slot->backlog); + switch (red_action(q->red_parms, + &slot->vars, + slot->vars.qavg)) { + case RED_DONT_MARK: + break; + + case RED_PROB_MARK: + sch->qstats.overlimits++; + if (sfq_prob_mark(q)) { + /* We know we have at least one packet in queue */ + if (sfq_headdrop(q) && + INET_ECN_set_ce(slot->skblist_next)) { + q->stats.prob_mark_head++; + break; + } + if (INET_ECN_set_ce(skb)) { + q->stats.prob_mark++; + break; + } + } + q->stats.prob_drop++; + goto congestion_drop; + + case RED_HARD_MARK: + sch->qstats.overlimits++; + if (sfq_hard_mark(q)) { + /* We know we have at least one packet in queue */ + if (sfq_headdrop(q) && + INET_ECN_set_ce(slot->skblist_next)) { + q->stats.forced_mark_head++; + break; + } + if (INET_ECN_set_ce(skb)) { + q->stats.forced_mark++; + break; + } + } + q->stats.forced_drop++; + goto congestion_drop; } } - if (++sch->q.qlen < q->limit-1) { - sch->bstats.bytes += skb->len; - sch->bstats.packets++; - return 0; - } - sfq_drop(sch); - return NET_XMIT_CN; -} + if (slot->qlen >= q->maxdepth) { +congestion_drop: + if (!sfq_headdrop(q)) + return qdisc_drop(skb, sch); -static int -sfq_requeue(struct sk_buff *skb, struct Qdisc* sch) -{ - struct sfq_sched_data *q = qdisc_priv(sch); - unsigned hash = sfq_hash(q, skb); - sfq_index x; + /* We know we have at least one packet in queue */ + head = slot_dequeue_head(slot); + delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb); + sch->qstats.backlog -= delta; + slot->backlog -= delta; + qdisc_drop(head, sch); - x = q->ht[hash]; - if (x == SFQ_DEPTH) { - q->ht[hash] = x = q->dep[SFQ_DEPTH].next; - q->hash[x] = hash; + slot_queue_add(slot, skb); + return NET_XMIT_CN; } - __skb_queue_head(&q->qs[x], skb); + +enqueue: + sch->qstats.backlog += qdisc_pkt_len(skb); + slot->backlog += qdisc_pkt_len(skb); + slot_queue_add(slot, skb); sfq_inc(q, x); - if (q->qs[x].qlen == 1) { /* The flow is new */ - if (q->tail == SFQ_DEPTH) { /* It is the first flow */ - q->tail = x; - q->next[x] = x; - q->allot[x] = q->quantum; + if (slot->qlen == 1) { /* The flow is new */ + if (q->tail == NULL) { /* It is the first flow */ + slot->next = x; } else { - q->next[x] = q->next[q->tail]; - q->next[q->tail] = x; - q->tail = x; + slot->next = q->tail->next; + q->tail->next = x; } + /* We put this flow at the end of our flow list. + * This might sound unfair for a new flow to wait after old ones, + * but we could endup servicing new flows only, and freeze old ones. + */ + q->tail = slot; + /* We could use a bigger initial quantum for new flows */ + slot->allot = q->scaled_quantum; } - if (++sch->q.qlen < q->limit - 1) { - sch->qstats.requeues++; - return 0; - } + if (++sch->q.qlen <= q->limit) + return NET_XMIT_SUCCESS; - sch->qstats.drops++; + qlen = slot->qlen; sfq_drop(sch); - return NET_XMIT_CN; + /* Return Congestion Notification only if we dropped a packet + * from this flow. 
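 * (Editorial note: if sfq_drop() trimmed a different, longer flow instead,
 * this packet was still queued, so the enqueue reports success and the
 * parent is simply told about the one lost packet below.)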
+ */ + if (qlen != slot->qlen) + return NET_XMIT_CN; + + /* As we dropped a packet, better let upper stack know this */ + qdisc_tree_decrease_qlen(sch, 1); + return NET_XMIT_SUCCESS; } - - - static struct sk_buff * -sfq_dequeue(struct Qdisc* sch) +sfq_dequeue(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; - sfq_index a, old_a; + sfq_index a, next_a; + struct sfq_slot *slot; /* No active slots */ - if (q->tail == SFQ_DEPTH) + if (q->tail == NULL) return NULL; - a = old_a = q->next[q->tail]; - - /* Grab packet */ - skb = __skb_dequeue(&q->qs[a]); +next_slot: + a = q->tail->next; + slot = &q->slots[a]; + if (slot->allot <= 0) { + q->tail = slot; + slot->allot += q->scaled_quantum; + goto next_slot; + } + skb = slot_dequeue_head(slot); sfq_dec(q, a); + qdisc_bstats_update(sch, skb); sch->q.qlen--; - + sch->qstats.backlog -= qdisc_pkt_len(skb); + slot->backlog -= qdisc_pkt_len(skb); /* Is the slot empty? */ - if (q->qs[a].qlen == 0) { - q->ht[q->hash[a]] = SFQ_DEPTH; - a = q->next[a]; - if (a == old_a) { - q->tail = SFQ_DEPTH; + if (slot->qlen == 0) { + q->ht[slot->hash] = SFQ_EMPTY_SLOT; + next_a = slot->next; + if (a == next_a) { + q->tail = NULL; /* no more active slots */ return skb; } - q->next[q->tail] = a; - q->allot[a] += q->quantum; - } else if ((q->allot[a] -= skb->len) <= 0) { - q->tail = a; - a = q->next[a]; - q->allot[a] += q->quantum; + q->tail->next = next_a; + } else { + slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb)); } return skb; } static void -sfq_reset(struct Qdisc* sch) +sfq_reset(struct Qdisc *sch) { struct sk_buff *skb; @@ -372,113 +549,375 @@ sfq_reset(struct Qdisc* sch) kfree_skb(skb); } +/* + * When q->perturbation is changed, we rehash all queued skbs + * to avoid OOO (Out Of Order) effects. + * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change + * counters. 
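 * (Editorial note: packets that cannot be re-added -- no free slot left, or
 * the target slot already at maxdepth -- are freed here, and the parent
 * qdiscs are notified via qdisc_tree_decrease_qlen().)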
+ */ +static void sfq_rehash(struct Qdisc *sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + int i; + struct sfq_slot *slot; + struct sk_buff_head list; + int dropped = 0; + + __skb_queue_head_init(&list); + + for (i = 0; i < q->maxflows; i++) { + slot = &q->slots[i]; + if (!slot->qlen) + continue; + while (slot->qlen) { + skb = slot_dequeue_head(slot); + sfq_dec(q, i); + __skb_queue_tail(&list, skb); + } + slot->backlog = 0; + red_set_vars(&slot->vars); + q->ht[slot->hash] = SFQ_EMPTY_SLOT; + } + q->tail = NULL; + + while ((skb = __skb_dequeue(&list)) != NULL) { + unsigned int hash = sfq_hash(q, skb); + sfq_index x = q->ht[hash]; + + slot = &q->slots[x]; + if (x == SFQ_EMPTY_SLOT) { + x = q->dep[0].next; /* get a free slot */ + if (x >= SFQ_MAX_FLOWS) { +drop: sch->qstats.backlog -= qdisc_pkt_len(skb); + kfree_skb(skb); + dropped++; + continue; + } + q->ht[hash] = x; + slot = &q->slots[x]; + slot->hash = hash; + } + if (slot->qlen >= q->maxdepth) + goto drop; + slot_queue_add(slot, skb); + if (q->red_parms) + slot->vars.qavg = red_calc_qavg(q->red_parms, + &slot->vars, + slot->backlog); + slot->backlog += qdisc_pkt_len(skb); + sfq_inc(q, x); + if (slot->qlen == 1) { /* The flow is new */ + if (q->tail == NULL) { /* It is the first flow */ + slot->next = x; + } else { + slot->next = q->tail->next; + q->tail->next = x; + } + q->tail = slot; + slot->allot = q->scaled_quantum; + } + } + sch->q.qlen -= dropped; + qdisc_tree_decrease_qlen(sch, dropped); +} + static void sfq_perturbation(unsigned long arg) { - struct Qdisc *sch = (struct Qdisc*)arg; + struct Qdisc *sch = (struct Qdisc *)arg; struct sfq_sched_data *q = qdisc_priv(sch); + spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); - q->perturbation = net_random()&0x1F; + spin_lock(root_lock); + q->perturbation = prandom_u32(); + if (!q->filter_list && q->tail) + sfq_rehash(sch); + spin_unlock(root_lock); - if (q->perturb_period) { - q->perturb_timer.expires = jiffies + q->perturb_period; - add_timer(&q->perturb_timer); - } + if (q->perturb_period) + mod_timer(&q->perturb_timer, jiffies + q->perturb_period); } -static int sfq_change(struct Qdisc *sch, struct rtattr *opt) +static int sfq_change(struct Qdisc *sch, struct nlattr *opt) { struct sfq_sched_data *q = qdisc_priv(sch); - struct tc_sfq_qopt *ctl = RTA_DATA(opt); + struct tc_sfq_qopt *ctl = nla_data(opt); + struct tc_sfq_qopt_v1 *ctl_v1 = NULL; + unsigned int qlen; + struct red_parms *p = NULL; - if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) + if (opt->nla_len < nla_attr_size(sizeof(*ctl))) return -EINVAL; - + if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1))) + ctl_v1 = nla_data(opt); + if (ctl->divisor && + (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) + return -EINVAL; + if (ctl_v1 && ctl_v1->qth_min) { + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + } sch_tree_lock(sch); - q->quantum = ctl->quantum ? 
: psched_mtu(sch->dev); - q->perturb_period = ctl->perturb_period*HZ; - if (ctl->limit) - q->limit = min_t(u32, ctl->limit, SFQ_DEPTH); + if (ctl->quantum) { + q->quantum = ctl->quantum; + q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); + } + q->perturb_period = ctl->perturb_period * HZ; + if (ctl->flows) + q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); + if (ctl->divisor) { + q->divisor = ctl->divisor; + q->maxflows = min_t(u32, q->maxflows, q->divisor); + } + if (ctl_v1) { + if (ctl_v1->depth) + q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); + if (p) { + swap(q->red_parms, p); + red_set_parms(q->red_parms, + ctl_v1->qth_min, ctl_v1->qth_max, + ctl_v1->Wlog, + ctl_v1->Plog, ctl_v1->Scell_log, + NULL, + ctl_v1->max_P); + } + q->flags = ctl_v1->flags; + q->headdrop = ctl_v1->headdrop; + } + if (ctl->limit) { + q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows); + q->maxflows = min_t(u32, q->maxflows, q->limit); + } - while (sch->q.qlen >= q->limit-1) + qlen = sch->q.qlen; + while (sch->q.qlen > q->limit) sfq_drop(sch); + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); del_timer(&q->perturb_timer); if (q->perturb_period) { - q->perturb_timer.expires = jiffies + q->perturb_period; - add_timer(&q->perturb_timer); + mod_timer(&q->perturb_timer, jiffies + q->perturb_period); + q->perturbation = prandom_u32(); } sch_tree_unlock(sch); + kfree(p); return 0; } -static int sfq_init(struct Qdisc *sch, struct rtattr *opt) +static void *sfq_alloc(size_t sz) +{ + void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN); + + if (!ptr) + ptr = vmalloc(sz); + return ptr; +} + +static void sfq_free(void *addr) +{ + kvfree(addr); +} + +static void sfq_destroy(struct Qdisc *sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + q->perturb_period = 0; + del_timer_sync(&q->perturb_timer); + sfq_free(q->ht); + sfq_free(q->slots); + kfree(q->red_parms); +} + +static int sfq_init(struct Qdisc *sch, struct nlattr *opt) { struct sfq_sched_data *q = qdisc_priv(sch); int i; - init_timer(&q->perturb_timer); - q->perturb_timer.data = (unsigned long)sch; q->perturb_timer.function = sfq_perturbation; + q->perturb_timer.data = (unsigned long)sch; + init_timer_deferrable(&q->perturb_timer); - for (i=0; i<SFQ_HASH_DIVISOR; i++) - q->ht[i] = SFQ_DEPTH; - for (i=0; i<SFQ_DEPTH; i++) { - skb_queue_head_init(&q->qs[i]); - q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH; - q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH; + for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) { + q->dep[i].next = i + SFQ_MAX_FLOWS; + q->dep[i].prev = i + SFQ_MAX_FLOWS; } - q->limit = SFQ_DEPTH; - q->max_depth = 0; - q->tail = SFQ_DEPTH; - if (opt == NULL) { - q->quantum = psched_mtu(sch->dev); - q->perturb_period = 0; - } else { + + q->limit = SFQ_MAX_DEPTH; + q->maxdepth = SFQ_MAX_DEPTH; + q->cur_depth = 0; + q->tail = NULL; + q->divisor = SFQ_DEFAULT_HASH_DIVISOR; + q->maxflows = SFQ_DEFAULT_FLOWS; + q->quantum = psched_mtu(qdisc_dev(sch)); + q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); + q->perturb_period = 0; + q->perturbation = prandom_u32(); + + if (opt) { int err = sfq_change(sch, opt); if (err) return err; } - for (i=0; i<SFQ_DEPTH; i++) + + q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor); + q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows); + if (!q->ht || !q->slots) { + sfq_destroy(sch); + return -ENOMEM; + } + for (i = 0; i < q->divisor; i++) + q->ht[i] = SFQ_EMPTY_SLOT; + + for (i = 0; i < q->maxflows; i++) { + slot_queue_init(&q->slots[i]); sfq_link(q, i); + } + if (q->limit >= 1) + sch->flags 
|= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } -static void sfq_destroy(struct Qdisc *sch) +static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct sfq_sched_data *q = qdisc_priv(sch); - del_timer(&q->perturb_timer); + unsigned char *b = skb_tail_pointer(skb); + struct tc_sfq_qopt_v1 opt; + struct red_parms *p = q->red_parms; + + memset(&opt, 0, sizeof(opt)); + opt.v0.quantum = q->quantum; + opt.v0.perturb_period = q->perturb_period / HZ; + opt.v0.limit = q->limit; + opt.v0.divisor = q->divisor; + opt.v0.flows = q->maxflows; + opt.depth = q->maxdepth; + opt.headdrop = q->headdrop; + + if (p) { + opt.qth_min = p->qth_min >> p->Wlog; + opt.qth_max = p->qth_max >> p->Wlog; + opt.Wlog = p->Wlog; + opt.Plog = p->Plog; + opt.Scell_log = p->Scell_log; + opt.max_P = p->max_P; + } + memcpy(&opt.stats, &q->stats, sizeof(opt.stats)); + opt.flags = q->flags; + + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; } -static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) +static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long sfq_get(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + /* we cannot bypass queue discipline anymore */ + sch->flags &= ~TCQ_F_CAN_BYPASS; + return 0; +} + +static void sfq_put(struct Qdisc *q, unsigned long cl) +{ +} + +static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl) { struct sfq_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb->tail; - struct tc_sfq_qopt opt; - opt.quantum = q->quantum; - opt.perturb_period = q->perturb_period/HZ; + if (cl) + return NULL; + return &q->filter_list; +} - opt.limit = q->limit; - opt.divisor = SFQ_HASH_DIVISOR; - opt.flows = q->limit; +static int sfq_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} - RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); +static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + sfq_index idx = q->ht[cl - 1]; + struct gnet_stats_queue qs = { 0 }; + struct tc_sfq_xstats xstats = { 0 }; - return skb->len; + if (idx != SFQ_EMPTY_SLOT) { + const struct sfq_slot *slot = &q->slots[idx]; -rtattr_failure: - skb_trim(skb, b - skb->data); - return -1; + xstats.allot = slot->allot << SFQ_ALLOT_SHIFT; + qs.qlen = slot->qlen; + qs.backlog = slot->backlog; + } + if (gnet_stats_copy_queue(d, &qs) < 0) + return -1; + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); } -static struct Qdisc_ops sfq_qdisc_ops = { - .next = NULL, - .cl_ops = NULL, +static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < q->divisor; i++) { + if (q->ht[i] == SFQ_EMPTY_SLOT || + arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops sfq_class_ops = { + .leaf = sfq_leaf, + .get = sfq_get, + .put = sfq_put, + .tcf_chain = sfq_find_tcf, + .bind_tcf = sfq_bind, + .unbind_tcf = sfq_put, + .dump = sfq_dump_class, + .dump_stats = sfq_dump_class_stats, + .walk = sfq_walk, +}; + +static struct Qdisc_ops 
sfq_qdisc_ops __read_mostly = { + .cl_ops = &sfq_class_ops, .id = "sfq", .priv_size = sizeof(struct sfq_sched_data), .enqueue = sfq_enqueue, .dequeue = sfq_dequeue, - .requeue = sfq_requeue, + .peek = qdisc_peek_dequeued, .drop = sfq_drop, .init = sfq_init, .reset = sfq_reset, @@ -492,7 +931,7 @@ static int __init sfq_module_init(void) { return register_qdisc(&sfq_qdisc_ops); } -static void __exit sfq_module_exit(void) +static void __exit sfq_module_exit(void) { unregister_qdisc(&sfq_qdisc_ops); } diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index cb9711ea8c6..18ff6343370 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -12,30 +12,14 @@ * */ -#include <linux/config.h> #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/jiffies.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> +#include <net/netlink.h> +#include <net/sch_generic.h> #include <net/pkt_sched.h> @@ -114,129 +98,177 @@ changed the limit is not effective anymore. */ -struct tbf_sched_data -{ +struct tbf_sched_data { /* Parameters */ u32 limit; /* Maximal length of backlog: bytes */ - u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ - u32 mtu; u32 max_size; - struct qdisc_rate_table *R_tab; - struct qdisc_rate_table *P_tab; + s64 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ + s64 mtu; + struct psched_ratecfg rate; + struct psched_ratecfg peak; /* Variables */ - long tokens; /* Current number of B tokens */ - long ptokens; /* Current number of P tokens */ - psched_time_t t_c; /* Time check-point */ - struct timer_list wd_timer; /* Watchdog timer */ + s64 tokens; /* Current number of B tokens */ + s64 ptokens; /* Current number of P tokens */ + s64 t_c; /* Time check-point */ struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */ + struct qdisc_watchdog watchdog; /* Watchdog timer */ }; -#define L2T(q,L) ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log]) -#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log]) -static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) +/* Time to Length, convert time in ns to length in bytes + * to determinate how many bytes can be sent in given time. 
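 * (Editorial note: len = time_in_ns * rate_bytes_ps / NSEC_PER_SEC; for ATM
 * link layers the result is further scaled by 48/53, since only 48 of every
 * 53 bytes in a cell carry payload, and the configured overhead is then
 * subtracted.)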
+ */ +static u64 psched_ns_t2l(const struct psched_ratecfg *r, + u64 time_in_ns) { - struct tbf_sched_data *q = qdisc_priv(sch); - int ret; + /* The formula is : + * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC + */ + u64 len = time_in_ns * r->rate_bytes_ps; - if (skb->len > q->max_size) { - sch->qstats.drops++; -#ifdef CONFIG_NET_CLS_POLICE - if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch)) -#endif - kfree_skb(skb); + do_div(len, NSEC_PER_SEC); - return NET_XMIT_DROP; + if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) { + do_div(len, 53); + len = len * 48; } - if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) { - sch->qstats.drops++; - return ret; - } + if (len > r->overhead) + len -= r->overhead; + else + len = 0; - sch->q.qlen++; - sch->bstats.bytes += skb->len; - sch->bstats.packets++; - return 0; + return len; } -static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch) +/* + * Return length of individual segments of a gso packet, + * including all headers (MAC, IP, TCP/UDP) + */ +static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + return hdr_len + skb_gso_transport_seglen(skb); +} + +/* GSO packet is too big, segment it so that tbf can transmit + * each segment in time + */ +static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + struct sk_buff *segs, *nskb; + netdev_features_t features = netif_skb_features(skb); + int ret, nb; + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + + if (IS_ERR_OR_NULL(segs)) + return qdisc_reshape_fail(skb, sch); + + nb = 0; + while (segs) { + nskb = segs->next; + segs->next = NULL; + qdisc_skb_cb(segs)->pkt_len = segs->len; + ret = qdisc_enqueue(segs, q->qdisc); + if (ret != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + } else { + nb++; + } + segs = nskb; + } + sch->q.qlen += nb; + if (nb > 1) + qdisc_tree_decrease_qlen(sch, 1 - nb); + consume_skb(skb); + return nb > 0 ? 
NET_XMIT_SUCCESS : NET_XMIT_DROP; +} + +static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); int ret; - if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) { - sch->q.qlen++; - sch->qstats.requeues++; + if (qdisc_pkt_len(skb) > q->max_size) { + if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size) + return tbf_segment(skb, sch); + return qdisc_reshape_fail(skb, sch); + } + ret = qdisc_enqueue(skb, q->qdisc); + if (ret != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + return ret; } - return ret; + sch->q.qlen++; + return NET_XMIT_SUCCESS; } -static unsigned int tbf_drop(struct Qdisc* sch) +static unsigned int tbf_drop(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); - unsigned int len; + unsigned int len = 0; - if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) { + if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) { sch->q.qlen--; sch->qstats.drops++; } return len; } -static void tbf_watchdog(unsigned long arg) +static bool tbf_peak_present(const struct tbf_sched_data *q) { - struct Qdisc *sch = (struct Qdisc*)arg; - - sch->flags &= ~TCQ_F_THROTTLED; - netif_schedule(sch->dev); + return q->peak.rate_bytes_ps; } -static struct sk_buff *tbf_dequeue(struct Qdisc* sch) +static struct sk_buff *tbf_dequeue(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; - skb = q->qdisc->dequeue(q->qdisc); + skb = q->qdisc->ops->peek(q->qdisc); if (skb) { - psched_time_t now; - long toks, delay; - long ptoks = 0; - unsigned int len = skb->len; - - PSCHED_GET_TIME(now); + s64 now; + s64 toks; + s64 ptoks = 0; + unsigned int len = qdisc_pkt_len(skb); - toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer); + now = ktime_to_ns(ktime_get()); + toks = min_t(s64, now - q->t_c, q->buffer); - if (q->P_tab) { + if (tbf_peak_present(q)) { ptoks = toks + q->ptokens; - if (ptoks > (long)q->mtu) + if (ptoks > q->mtu) ptoks = q->mtu; - ptoks -= L2T_P(q, len); + ptoks -= (s64) psched_l2t_ns(&q->peak, len); } toks += q->tokens; - if (toks > (long)q->buffer) + if (toks > q->buffer) toks = q->buffer; - toks -= L2T(q, len); + toks -= (s64) psched_l2t_ns(&q->rate, len); if ((toks|ptoks) >= 0) { + skb = qdisc_dequeue_peeked(q->qdisc); + if (unlikely(!skb)) + return NULL; + q->t_c = now; q->tokens = toks; q->ptokens = ptoks; sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(sch); + qdisc_bstats_update(sch, skb); return skb; } - delay = PSCHED_US2JIFFIE(max_t(long, -toks, -ptoks)); - - if (delay == 0) - delay = 1; - - mod_timer(&q->wd_timer, jiffies+delay); + qdisc_watchdog_schedule_ns(&q->watchdog, + now + max_t(long, -toks, -ptoks)); /* Maybe we have a shorter packet in the queue, which can be sent now. It sounds cool, @@ -249,135 +281,158 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch) (cf. 
CSZ, HPFQ, HFSC) */ - if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) { - /* When requeue fails skb is dropped */ - sch->q.qlen--; - sch->qstats.drops++; - } - - sch->flags |= TCQ_F_THROTTLED; sch->qstats.overlimits++; } return NULL; } -static void tbf_reset(struct Qdisc* sch) +static void tbf_reset(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); qdisc_reset(q->qdisc); sch->q.qlen = 0; - PSCHED_GET_TIME(q->t_c); + q->t_c = ktime_to_ns(ktime_get()); q->tokens = q->buffer; q->ptokens = q->mtu; - sch->flags &= ~TCQ_F_THROTTLED; - del_timer(&q->wd_timer); + qdisc_watchdog_cancel(&q->watchdog); } -static struct Qdisc *tbf_create_dflt_qdisc(struct net_device *dev, u32 limit) -{ - struct Qdisc *q = qdisc_create_dflt(dev, &bfifo_qdisc_ops); - struct rtattr *rta; - int ret; - - if (q) { - rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); - if (rta) { - rta->rta_type = RTM_NEWQDISC; - rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt)); - ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; - - ret = q->ops->change(q, rta); - kfree(rta); - - if (ret == 0) - return q; - } - qdisc_destroy(q); - } - - return NULL; -} +static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = { + [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) }, + [TCA_TBF_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_TBF_RATE64] = { .type = NLA_U64 }, + [TCA_TBF_PRATE64] = { .type = NLA_U64 }, + [TCA_TBF_BURST] = { .type = NLA_U32 }, + [TCA_TBF_PBURST] = { .type = NLA_U32 }, +}; -static int tbf_change(struct Qdisc* sch, struct rtattr *opt) +static int tbf_change(struct Qdisc *sch, struct nlattr *opt) { - int err = -EINVAL; + int err; struct tbf_sched_data *q = qdisc_priv(sch); - struct rtattr *tb[TCA_TBF_PTAB]; + struct nlattr *tb[TCA_TBF_MAX + 1]; struct tc_tbf_qopt *qopt; - struct qdisc_rate_table *rtab = NULL; - struct qdisc_rate_table *ptab = NULL; struct Qdisc *child = NULL; - int max_size,n; - - if (rtattr_parse_nested(tb, TCA_TBF_PTAB, opt) || - tb[TCA_TBF_PARMS-1] == NULL || - RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt)) + struct psched_ratecfg rate; + struct psched_ratecfg peak; + u64 max_size; + s64 buffer, mtu; + u64 rate64 = 0, prate64 = 0; + + err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy); + if (err < 0) + return err; + + err = -EINVAL; + if (tb[TCA_TBF_PARMS] == NULL) goto done; - qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]); - rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]); - if (rtab == NULL) - goto done; + qopt = nla_data(tb[TCA_TBF_PARMS]); + if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&qopt->rate, + tb[TCA_TBF_RTAB])); + + if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate, + tb[TCA_TBF_PTAB])); + + buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U); + mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U); + + if (tb[TCA_TBF_RATE64]) + rate64 = nla_get_u64(tb[TCA_TBF_RATE64]); + psched_ratecfg_precompute(&rate, &qopt->rate, rate64); + + if (tb[TCA_TBF_BURST]) { + max_size = nla_get_u32(tb[TCA_TBF_BURST]); + buffer = psched_l2t_ns(&rate, max_size); + } else { + max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U); + } if (qopt->peakrate.rate) { - if (qopt->peakrate.rate > qopt->rate.rate) - ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]); - if (ptab == NULL) + if (tb[TCA_TBF_PRATE64]) + prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]); + 
psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64); + if (peak.rate_bytes_ps <= rate.rate_bytes_ps) { + pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n", + peak.rate_bytes_ps, rate.rate_bytes_ps); + err = -EINVAL; goto done; + } + + if (tb[TCA_TBF_PBURST]) { + u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]); + max_size = min_t(u32, max_size, pburst); + mtu = psched_l2t_ns(&peak, pburst); + } else { + max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu)); + } + } else { + memset(&peak, 0, sizeof(peak)); } - for (n = 0; n < 256; n++) - if (rtab->data[n] > qopt->buffer) break; - max_size = (n << qopt->rate.cell_log)-1; - if (ptab) { - int size; + if (max_size < psched_mtu(qdisc_dev(sch))) + pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n", + max_size, qdisc_dev(sch)->name, + psched_mtu(qdisc_dev(sch))); - for (n = 0; n < 256; n++) - if (ptab->data[n] > qopt->mtu) break; - size = (n << qopt->peakrate.cell_log)-1; - if (size < max_size) max_size = size; - } - if (max_size < 0) + if (!max_size) { + err = -EINVAL; goto done; + } - if (q->qdisc == &noop_qdisc) { - if ((child = tbf_create_dflt_qdisc(sch->dev, qopt->limit)) == NULL) + if (q->qdisc != &noop_qdisc) { + err = fifo_set_limit(q->qdisc, qopt->limit); + if (err) + goto done; + } else if (qopt->limit > 0) { + child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit); + if (IS_ERR(child)) { + err = PTR_ERR(child); goto done; + } } sch_tree_lock(sch); - if (child) q->qdisc = child; + if (child) { + qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_destroy(q->qdisc); + q->qdisc = child; + } q->limit = qopt->limit; - q->mtu = qopt->mtu; + if (tb[TCA_TBF_PBURST]) + q->mtu = mtu; + else + q->mtu = PSCHED_TICKS2NS(qopt->mtu); q->max_size = max_size; - q->buffer = qopt->buffer; + if (tb[TCA_TBF_BURST]) + q->buffer = buffer; + else + q->buffer = PSCHED_TICKS2NS(qopt->buffer); q->tokens = q->buffer; q->ptokens = q->mtu; - rtab = xchg(&q->R_tab, rtab); - ptab = xchg(&q->P_tab, ptab); + + memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg)); + memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg)); + sch_tree_unlock(sch); err = 0; done: - if (rtab) - qdisc_put_rtab(rtab); - if (ptab) - qdisc_put_rtab(ptab); return err; } -static int tbf_init(struct Qdisc* sch, struct rtattr *opt) +static int tbf_init(struct Qdisc *sch, struct nlattr *opt) { struct tbf_sched_data *q = qdisc_priv(sch); if (opt == NULL) return -EINVAL; - PSCHED_GET_TIME(q->t_c); - init_timer(&q->wd_timer); - q->wd_timer.function = tbf_watchdog; - q->wd_timer.data = (unsigned long)sch; - + q->t_c = ktime_to_ns(ktime_get()); + qdisc_watchdog_init(&q->watchdog, sch); q->qdisc = &noop_qdisc; return tbf_change(sch, opt); @@ -387,41 +442,43 @@ static void tbf_destroy(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); - del_timer(&q->wd_timer); - - if (q->P_tab) - qdisc_put_rtab(q->P_tab); - if (q->R_tab) - qdisc_put_rtab(q->R_tab); - + qdisc_watchdog_cancel(&q->watchdog); qdisc_destroy(q->qdisc); } static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tbf_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb->tail; - struct rtattr *rta; + struct nlattr *nest; struct tc_tbf_qopt opt; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + sch->qstats.backlog = q->qdisc->qstats.backlog; + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; opt.limit = q->limit; - opt.rate = q->R_tab->rate; - if (q->P_tab) - opt.peakrate = 
q->P_tab->rate; + psched_ratecfg_getrate(&opt.rate, &q->rate); + if (tbf_peak_present(q)) + psched_ratecfg_getrate(&opt.peakrate, &q->peak); else memset(&opt.peakrate, 0, sizeof(opt.peakrate)); - opt.mtu = q->mtu; - opt.buffer = q->buffer; - RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt); - rta->rta_len = skb->tail - b; - - return skb->len; - -rtattr_failure: - skb_trim(skb, b - skb->data); + opt.mtu = PSCHED_NS2TICKS(q->mtu); + opt.buffer = PSCHED_NS2TICKS(q->buffer); + if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + if (q->rate.rate_bytes_ps >= (1ULL << 32) && + nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps)) + goto nla_put_failure; + if (tbf_peak_present(q) && + q->peak.rate_bytes_ps >= (1ULL << 32) && + nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps)) + goto nla_put_failure; + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } @@ -430,9 +487,6 @@ static int tbf_dump_class(struct Qdisc *sch, unsigned long cl, { struct tbf_sched_data *q = qdisc_priv(sch); - if (cl != 1) /* only one class */ - return -ENOENT; - tcm->tcm_handle |= TC_H_MIN(1); tcm->tcm_info = q->qdisc->handle; @@ -448,9 +502,10 @@ static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, new = &noop_qdisc; sch_tree_lock(sch); - *old = xchg(&q->qdisc, new); + *old = q->qdisc; + q->qdisc = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); qdisc_reset(*old); - sch->q.qlen = 0; sch_tree_unlock(sch); return 0; @@ -471,17 +526,6 @@ static void tbf_put(struct Qdisc *sch, unsigned long arg) { } -static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - struct rtattr **tca, unsigned long *arg) -{ - return -ENOSYS; -} - -static int tbf_delete(struct Qdisc *sch, unsigned long arg) -{ - return -ENOSYS; -} - static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { @@ -494,32 +538,23 @@ static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker) } } -static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl) -{ - return NULL; -} - -static struct Qdisc_class_ops tbf_class_ops = -{ +static const struct Qdisc_class_ops tbf_class_ops = { .graft = tbf_graft, .leaf = tbf_leaf, .get = tbf_get, .put = tbf_put, - .change = tbf_change_class, - .delete = tbf_delete, .walk = tbf_walk, - .tcf_chain = tbf_find_tcf, .dump = tbf_dump_class, }; -static struct Qdisc_ops tbf_qdisc_ops = { +static struct Qdisc_ops tbf_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &tbf_class_ops, .id = "tbf", .priv_size = sizeof(struct tbf_sched_data), .enqueue = tbf_enqueue, .dequeue = tbf_dequeue, - .requeue = tbf_requeue, + .peek = qdisc_peek_dequeued, .drop = tbf_drop, .init = tbf_init, .reset = tbf_reset, diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 79b8ef34c6e..47416716294 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -9,31 +9,18 @@ */ #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> +#include <linux/slab.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> #include <linux/if_arp.h> -#include <linux/if_ether.h> -#include <linux/inet.h> #include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> #include <linux/init.h> -#include <net/ip.h> 
-#include <net/route.h> #include <linux/skbuff.h> #include <linux/moduleparam.h> -#include <net/sock.h> +#include <net/dst.h> +#include <net/neighbour.h> #include <net/pkt_sched.h> /* @@ -66,77 +53,73 @@ which will not break load balancing, though native slave traffic will have the highest priority. */ -struct teql_master -{ +struct teql_master { struct Qdisc_ops qops; struct net_device *dev; struct Qdisc *slaves; struct list_head master_list; - struct net_device_stats stats; + unsigned long tx_bytes; + unsigned long tx_packets; + unsigned long tx_errors; + unsigned long tx_dropped; }; -struct teql_sched_data -{ +struct teql_sched_data { struct Qdisc *next; struct teql_master *m; - struct neighbour *ncache; struct sk_buff_head q; }; -#define NEXT_SLAVE(q) (((struct teql_sched_data*)qdisc_priv(q))->next) +#define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next) -#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT|IFF_BROADCAST) +#define FMASK (IFF_BROADCAST | IFF_POINTOPOINT) /* "teql*" qdisc routines */ static int -teql_enqueue(struct sk_buff *skb, struct Qdisc* sch) +teql_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct net_device *dev = sch->dev; + struct net_device *dev = qdisc_dev(sch); struct teql_sched_data *q = qdisc_priv(sch); - __skb_queue_tail(&q->q, skb); - if (q->q.qlen <= dev->tx_queue_len) { - sch->bstats.bytes += skb->len; - sch->bstats.packets++; - return 0; + if (q->q.qlen < dev->tx_queue_len) { + __skb_queue_tail(&q->q, skb); + return NET_XMIT_SUCCESS; } - __skb_unlink(skb, &q->q); - kfree_skb(skb); - sch->qstats.drops++; - return NET_XMIT_DROP; -} - -static int -teql_requeue(struct sk_buff *skb, struct Qdisc* sch) -{ - struct teql_sched_data *q = qdisc_priv(sch); - - __skb_queue_head(&q->q, skb); - sch->qstats.requeues++; - return 0; + return qdisc_drop(skb, sch); } static struct sk_buff * -teql_dequeue(struct Qdisc* sch) +teql_dequeue(struct Qdisc *sch) { struct teql_sched_data *dat = qdisc_priv(sch); + struct netdev_queue *dat_queue; struct sk_buff *skb; skb = __skb_dequeue(&dat->q); + dat_queue = netdev_get_tx_queue(dat->m->dev, 0); if (skb == NULL) { - struct net_device *m = dat->m->dev->qdisc->dev; + struct net_device *m = qdisc_dev(dat_queue->qdisc); if (m) { dat->m->slaves = sch; netif_wake_queue(m); } + } else { + qdisc_bstats_update(sch, skb); } - sch->q.qlen = dat->q.qlen + dat->m->dev->qdisc->q.qlen; + sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen; return skb; } -static __inline__ void +static struct sk_buff * +teql_peek(struct Qdisc *sch) +{ + /* teql is meant to be used as root qdisc */ + return NULL; +} + +static inline void teql_neigh_release(struct neighbour *n) { if (n) @@ -144,23 +127,23 @@ teql_neigh_release(struct neighbour *n) } static void -teql_reset(struct Qdisc* sch) +teql_reset(struct Qdisc *sch) { struct teql_sched_data *dat = qdisc_priv(sch); skb_queue_purge(&dat->q); sch->q.qlen = 0; - teql_neigh_release(xchg(&dat->ncache, NULL)); } static void -teql_destroy(struct Qdisc* sch) +teql_destroy(struct Qdisc *sch) { struct Qdisc *q, *prev; struct teql_sched_data *dat = qdisc_priv(sch); struct teql_master *master = dat->m; - if ((prev = master->slaves) != NULL) { + prev = master->slaves; + if (prev) { do { q = NEXT_SLAVE(prev); if (q == sch) { @@ -168,25 +151,30 @@ teql_destroy(struct Qdisc* sch) if (q == master->slaves) { master->slaves = NEXT_SLAVE(q); if (q == master->slaves) { + struct netdev_queue *txq; + spinlock_t *root_lock; + + txq = netdev_get_tx_queue(master->dev, 0); master->slaves = NULL; - 
spin_lock_bh(&master->dev->queue_lock); - qdisc_reset(master->dev->qdisc); - spin_unlock_bh(&master->dev->queue_lock); + + root_lock = qdisc_root_sleeping_lock(txq->qdisc); + spin_lock_bh(root_lock); + qdisc_reset(txq->qdisc); + spin_unlock_bh(root_lock); } } skb_queue_purge(&dat->q); - teql_neigh_release(xchg(&dat->ncache, NULL)); break; } - + } while ((prev = q) != master->slaves); } } -static int teql_qdisc_init(struct Qdisc *sch, struct rtattr *opt) +static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt) { - struct net_device *dev = sch->dev; - struct teql_master *m = (struct teql_master*)sch->ops; + struct net_device *dev = qdisc_dev(sch); + struct teql_master *m = (struct teql_master *)sch->ops; struct teql_sched_data *q = qdisc_priv(sch); if (dev->hard_header_len > m->dev->hard_header_len) @@ -201,10 +189,13 @@ static int teql_qdisc_init(struct Qdisc *sch, struct rtattr *opt) if (m->slaves) { if (m->dev->flags & IFF_UP) { - if ((m->dev->flags&IFF_POINTOPOINT && !(dev->flags&IFF_POINTOPOINT)) - || (m->dev->flags&IFF_BROADCAST && !(dev->flags&IFF_BROADCAST)) - || (m->dev->flags&IFF_MULTICAST && !(dev->flags&IFF_MULTICAST)) - || dev->mtu < m->dev->mtu) + if ((m->dev->flags & IFF_POINTOPOINT && + !(dev->flags & IFF_POINTOPOINT)) || + (m->dev->flags & IFF_BROADCAST && + !(dev->flags & IFF_BROADCAST)) || + (m->dev->flags & IFF_MULTICAST && + !(dev->flags & IFF_MULTICAST)) || + dev->mtu < m->dev->mtu) return -EINVAL; } else { if (!(dev->flags&IFF_POINTOPOINT)) @@ -227,58 +218,74 @@ static int teql_qdisc_init(struct Qdisc *sch, struct rtattr *opt) return 0; } -/* "teql*" netdevice routines */ static int -__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev) +__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, + struct net_device *dev, struct netdev_queue *txq, + struct dst_entry *dst) { - struct teql_sched_data *q = qdisc_priv(dev->qdisc); - struct neighbour *mn = skb->dst->neighbour; - struct neighbour *n = q->ncache; + struct neighbour *n; + int err = 0; - if (mn->tbl == NULL) - return -EINVAL; - if (n && n->tbl == mn->tbl && - memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) { - atomic_inc(&n->refcnt); - } else { - n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev); - if (IS_ERR(n)) - return PTR_ERR(n); + n = dst_neigh_lookup_skb(dst, skb); + if (!n) + return -ENOENT; + + if (dst->dev != dev) { + struct neighbour *mn; + + mn = __neigh_lookup_errno(n->tbl, n->primary_key, dev); + neigh_release(n); + if (IS_ERR(mn)) + return PTR_ERR(mn); + n = mn; } + if (neigh_event_send(n, skb_res) == 0) { int err; - read_lock(&n->lock); - err = dev->hard_header(skb, dev, ntohs(skb->protocol), n->ha, NULL, skb->len); - read_unlock(&n->lock); - if (err < 0) { - neigh_release(n); - return -EINVAL; - } - teql_neigh_release(xchg(&q->ncache, n)); - return 0; + char haddr[MAX_ADDR_LEN]; + + neigh_ha_snapshot(haddr, n, dev); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr, + NULL, skb->len); + + if (err < 0) + err = -EINVAL; + } else { + err = (skb_res == NULL) ? -EAGAIN : 1; } neigh_release(n); - return (skb_res == NULL) ? 
-EAGAIN : 1; + return err; } -static __inline__ int -teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev) +static inline int teql_resolve(struct sk_buff *skb, + struct sk_buff *skb_res, + struct net_device *dev, + struct netdev_queue *txq) { - if (dev->hard_header == NULL || - skb->dst == NULL || - skb->dst->neighbour == NULL) + struct dst_entry *dst = skb_dst(skb); + int res; + + if (txq->qdisc == &noop_qdisc) + return -ENODEV; + + if (!dev->header_ops || !dst) return 0; - return __teql_resolve(skb, skb_res, dev); + + rcu_read_lock(); + res = __teql_resolve(skb, skb_res, dev, txq, dst); + rcu_read_unlock(); + + return res; } -static int teql_master_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev) { struct teql_master *master = netdev_priv(dev); struct Qdisc *start, *q; int busy; int nores; - int len = skb->len; + int subq = skb_get_queue_mapping(skb); struct sk_buff *skb_res = NULL; start = master->slaves; @@ -287,47 +294,51 @@ restart: nores = 0; busy = 0; - if ((q = start) == NULL) + q = start; + if (!q) goto drop; do { - struct net_device *slave = q->dev; - - if (slave->qdisc_sleeping != q) + struct net_device *slave = qdisc_dev(q); + struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0); + const struct net_device_ops *slave_ops = slave->netdev_ops; + + if (slave_txq->qdisc_sleeping != q) continue; - if (netif_queue_stopped(slave) || ! netif_running(slave)) { + if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) || + !netif_running(slave)) { busy = 1; continue; } - switch (teql_resolve(skb, skb_res, slave)) { + switch (teql_resolve(skb, skb_res, slave, slave_txq)) { case 0: - if (spin_trylock(&slave->xmit_lock)) { - slave->xmit_lock_owner = smp_processor_id(); - if (!netif_queue_stopped(slave) && - slave->hard_start_xmit(skb, slave) == 0) { - slave->xmit_lock_owner = -1; - spin_unlock(&slave->xmit_lock); + if (__netif_tx_trylock(slave_txq)) { + unsigned int length = qdisc_pkt_len(skb); + + if (!netif_xmit_frozen_or_stopped(slave_txq) && + slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) { + txq_trans_update(slave_txq); + __netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); netif_wake_queue(dev); - master->stats.tx_packets++; - master->stats.tx_bytes += len; - return 0; + master->tx_packets++; + master->tx_bytes += length; + return NETDEV_TX_OK; } - slave->xmit_lock_owner = -1; - spin_unlock(&slave->xmit_lock); + __netif_tx_unlock(slave_txq); } - if (netif_queue_stopped(dev)) + if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0))) busy = 1; break; case 1: master->slaves = NEXT_SLAVE(q); - return 0; + return NETDEV_TX_OK; default: nores = 1; break; } - __skb_pull(skb, skb->nh.raw - skb->data); + __skb_pull(skb, skb_network_offset(skb)); } while ((q = NEXT_SLAVE(q)) != start); if (nores && skb_res == NULL) { @@ -337,22 +348,22 @@ restart: if (busy) { netif_stop_queue(dev); - return 1; + return NETDEV_TX_BUSY; } - master->stats.tx_errors++; + master->tx_errors++; drop: - master->stats.tx_dropped++; + master->tx_dropped++; dev_kfree_skb(skb); - return 0; + return NETDEV_TX_OK; } static int teql_master_open(struct net_device *dev) { - struct Qdisc * q; + struct Qdisc *q; struct teql_master *m = netdev_priv(dev); int mtu = 0xFFFE; - unsigned flags = IFF_NOARP|IFF_MULTICAST; + unsigned int flags = IFF_NOARP | IFF_MULTICAST; if (m->slaves == NULL) return -EUNATCH; @@ -361,7 +372,7 @@ static int teql_master_open(struct net_device *dev) q = m->slaves; do { - 
struct net_device *slave = q->dev; + struct net_device *slave = qdisc_dev(q); if (slave == NULL) return -EUNATCH; @@ -395,10 +406,16 @@ static int teql_master_close(struct net_device *dev) return 0; } -static struct net_device_stats *teql_master_stats(struct net_device *dev) +static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) { struct teql_master *m = netdev_priv(dev); - return &m->stats; + + stats->tx_packets = m->tx_packets; + stats->tx_bytes = m->tx_bytes; + stats->tx_errors = m->tx_errors; + stats->tx_dropped = m->tx_dropped; + return stats; } static int teql_master_mtu(struct net_device *dev, int new_mtu) @@ -412,15 +429,23 @@ static int teql_master_mtu(struct net_device *dev, int new_mtu) q = m->slaves; if (q) { do { - if (new_mtu > q->dev->mtu) + if (new_mtu > qdisc_dev(q)->mtu) return -EINVAL; - } while ((q=NEXT_SLAVE(q)) != m->slaves); + } while ((q = NEXT_SLAVE(q)) != m->slaves); } dev->mtu = new_mtu; return 0; } +static const struct net_device_ops teql_netdev_ops = { + .ndo_open = teql_master_open, + .ndo_stop = teql_master_close, + .ndo_start_xmit = teql_master_xmit, + .ndo_get_stats64 = teql_master_stats64, + .ndo_change_mtu = teql_master_mtu, +}; + static __init void teql_master_setup(struct net_device *dev) { struct teql_master *master = netdev_priv(dev); @@ -428,26 +453,22 @@ static __init void teql_master_setup(struct net_device *dev) master->dev = dev; ops->priv_size = sizeof(struct teql_sched_data); - + ops->enqueue = teql_enqueue; ops->dequeue = teql_dequeue; - ops->requeue = teql_requeue; + ops->peek = teql_peek; ops->init = teql_qdisc_init; ops->reset = teql_reset; ops->destroy = teql_destroy; ops->owner = THIS_MODULE; - dev->open = teql_master_open; - dev->hard_start_xmit = teql_master_xmit; - dev->stop = teql_master_close; - dev->get_stats = teql_master_stats; - dev->change_mtu = teql_master_mtu; + dev->netdev_ops = &teql_netdev_ops; dev->type = ARPHRD_VOID; dev->mtu = 1500; dev->tx_queue_len = 100; dev->flags = IFF_NOARP; dev->hard_header_len = LL_MAX_HEADER; - SET_MODULE_OWNER(dev); + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } static LIST_HEAD(master_dev_list); @@ -492,7 +513,7 @@ static int __init teql_init(void) return i ? 0 : err; } -static void __exit teql_exit(void) +static void __exit teql_exit(void) { struct teql_master *master, *nxt; |
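
For reference, the token-bucket arithmetic behind the nanosecond-based tbf_dequeue() in the sch_tbf.c hunks above can be modelled outside the kernel. The sketch below is a minimal userspace approximation under stated assumptions: the tbf_model structure and the l2t_ns() and tbf_model_send() helpers are illustrative names, not kernel API, and l2t_ns() does a plain division per packet, whereas the kernel's psched_l2t_ns() works from a multiplier/shift pair precomputed by psched_ratecfg_precompute().

#include <stdint.h>

struct tbf_model {
	int64_t buffer;     /* cap on rate tokens, in ns */
	int64_t mtu;        /* cap on peak tokens, in ns */
	int64_t tokens;     /* current rate tokens, in ns */
	int64_t ptokens;    /* current peak tokens, in ns */
	int64_t t_c;        /* time of the last update, in ns */
	uint64_t rate_bps;  /* bytes per second, must be non-zero */
	uint64_t peak_bps;  /* bytes per second, 0 means "no peak rate" */
};

/* Time needed to send len bytes at bps bytes/sec, in ns (bps != 0). */
static int64_t l2t_ns(uint64_t bps, unsigned int len)
{
	return (int64_t)((uint64_t)len * 1000000000ULL / bps);
}

/*
 * Return 0 and charge the buckets if a len-byte packet may go out at
 * time now, or the number of ns to wait before retrying, mirroring the
 * (toks | ptoks) >= 0 test and the watchdog arming in tbf_dequeue().
 */
static int64_t tbf_model_send(struct tbf_model *q, unsigned int len, int64_t now)
{
	int64_t toks = now - q->t_c;
	int64_t ptoks = 0;

	/* elapsed time earns rate tokens, clamped to the burst buffer */
	if (toks > q->buffer)
		toks = q->buffer;

	if (q->peak_bps) {
		ptoks = toks + q->ptokens;
		if (ptoks > q->mtu)
			ptoks = q->mtu;
		ptoks -= l2t_ns(q->peak_bps, len);
	}

	toks += q->tokens;
	if (toks > q->buffer)
		toks = q->buffer;
	toks -= l2t_ns(q->rate_bps, len);

	if ((toks | ptoks) >= 0) {   /* both buckets non-negative: send now */
		q->t_c = now;
		q->tokens = toks;
		q->ptokens = ptoks;
		return 0;
	}
	/* wait until the more depleted bucket refills */
	return (-toks > -ptoks) ? -toks : -ptoks;
}

As in tbf_reset() above, a fresh instance would start with tokens = buffer, ptokens = mtu and t_c set to the current time; a positive return value corresponds to the delay passed to qdisc_watchdog_schedule_ns().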

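Similarly, the slave selection in teql_master_xmit() above reduces to a round-robin walk over a circular ring of slave devices, resuming after the slave that carried the previous packet. The sketch below models only that walk; the slave and master structures and the try_xmit() callback are illustrative stand-ins, and the neighbour resolution, tx locking and busy/requeue handling of the real function are omitted.

#include <stdbool.h>
#include <stddef.h>

struct slave {
	struct slave *next;   /* circular singly-linked ring of slaves */
	/* caller-supplied transmit attempt; returns true on success */
	bool (*try_xmit)(struct slave *s, const void *pkt, size_t len);
};

struct master {
	struct slave *slaves; /* current position in the ring */
};

/* Returns true if some slave accepted the packet, false if all refused. */
static bool master_xmit(struct master *m, const void *pkt, size_t len)
{
	struct slave *start = m->slaves;
	struct slave *q = start;

	if (!q)
		return false;

	do {
		if (q->try_xmit(q, pkt, len)) {
			/* like master->slaves = NEXT_SLAVE(q) in the kernel code */
			m->slaves = q->next;
			return true;
		}
	} while ((q = q->next) != start);

	return false;
}

Advancing master->slaves to the successor of the last successful slave is what spreads consecutive packets across the equalized devices.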