Diffstat (limited to 'net/sched')
63 files changed, 20333 insertions, 8300 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 778b1e5a4b5..a1a8e29e5fc 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -2,10 +2,9 @@ # Traffic control configuration. # -menu "QoS and/or fair queueing" - -config NET_SCHED +menuconfig NET_SCHED bool "QoS and/or fair queueing" + select NET_SCH_FIFO ---help--- When the kernel has several packets to send out over a network device, it has to decide which ones to send first, which ones to @@ -25,7 +24,7 @@ config NET_SCHED To administer these schedulers, you'll need the user-level utilities from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>. That package also contains some documentation; for more, check out - <http://linux-net.osdl.org/index.php/Iproute2>. + <http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2>. This Quality of Service (QoS) support will enable you to use Differentiated Services (diffserv) and Resource Reservation Protocol @@ -42,62 +41,6 @@ config NET_SCHED if NET_SCHED -choice - prompt "Packet scheduler clock source" - default NET_SCH_CLK_GETTIMEOFDAY - ---help--- - Packet schedulers need a monotonic clock that increments at a static - rate. The kernel provides several suitable interfaces, each with - different properties: - - - high resolution (us or better) - - fast to read (minimal locking, no i/o access) - - synchronized on all processors - - handles cpu clock frequency changes - - but nothing provides all of the above. - -config NET_SCH_CLK_JIFFIES - bool "Timer interrupt" - ---help--- - Say Y here if you want to use the timer interrupt (jiffies) as clock - source. This clock source is fast, synchronized on all processors and - handles cpu clock frequency changes, but its resolution is too low - for accurate shaping except at very low speed. - -config NET_SCH_CLK_GETTIMEOFDAY - bool "gettimeofday" - ---help--- - Say Y here if you want to use gettimeofday as clock source. This clock - source has high resolution, is synchronized on all processors and - handles cpu clock frequency changes, but it is slow. - - Choose this if you need a high resolution clock source but can't use - the CPU's cycle counter. - -# don't allow on SMP x86 because they can have unsynchronized TSCs. -# gettimeofday is a good alternative -config NET_SCH_CLK_CPU - bool "CPU cycle counter" - depends on ((X86_TSC || X86_64) && !SMP) || ALPHA || SPARC64 || PPC64 || IA64 - ---help--- - Say Y here if you want to use the CPU's cycle counter as clock source. - This is a cheap and high resolution clock source, but on some - architectures it is not synchronized on all processors and doesn't - handle cpu clock frequency changes. - - The useable cycle counters are: - - x86/x86_64 - Timestamp Counter - alpha - Cycle Counter - sparc64 - %ticks register - ppc64 - Time base - ia64 - Interval Time Counter - - Choose this if your CPU's cycle counter is working properly. - -endchoice - comment "Queueing/Scheduling" config NET_SCH_CBQ @@ -149,7 +92,7 @@ config NET_SCH_ATM select classes of this queuing discipline. Each class maps the flow(s) it is handling to a given virtual circuit. - See the top of <file:net/sched/sch_atm.c>) for more details. + See the top of <file:net/sched/sch_atm.c> for more details. To compile this code as a module, choose M here: the module will be called sch_atm. @@ -163,6 +106,15 @@ config NET_SCH_PRIO To compile this code as a module, choose M here: the module will be called sch_prio. 
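The "Queueing/Scheduling" entries above (CBQ, ATM, PRIO and the new schedulers added below) are all packet schedulers built as sch_* modules that plug into the qdisc core. As a rough, illustrative sketch of what such a module looks like, not code from this patch: register_qdisc(), qdisc_enqueue_tail(), qdisc_dequeue_head() and qdisc_peek_head() are existing kernel helpers, the "sketch" names are invented, and the two-argument enqueue() prototype is the one used by the kernel generation this diff targets.

/* Illustrative skeleton of a queueing discipline module of the kind
 * listed above; it degenerates to a FIFO by reusing the generic
 * list helpers.  Configuration and statistics are omitted. */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>

static int sketch_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	/* a real scheduler would classify and reorder here */
	return qdisc_enqueue_tail(skb, sch);
}

static struct sk_buff *sketch_dequeue(struct Qdisc *sch)
{
	return qdisc_dequeue_head(sch);
}

static struct Qdisc_ops sketch_qdisc_ops = {
	.id		= "sketch",
	.priv_size	= 0,
	.enqueue	= sketch_enqueue,
	.dequeue	= sketch_dequeue,
	.peek		= qdisc_peek_head,
	.owner		= THIS_MODULE,
};

static int __init sketch_module_init(void)
{
	return register_qdisc(&sketch_qdisc_ops);
}

static void __exit sketch_module_exit(void)
{
	unregister_qdisc(&sketch_qdisc_ops);
}

module_init(sketch_module_init);
module_exit(sketch_module_exit);
MODULE_LICENSE("GPL");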
+config NET_SCH_MULTIQ + tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)" + ---help--- + Say Y here if you want to use an n-band queue packet scheduler + to support devices that have multiple hardware transmit queues. + + To compile this code as a module, choose M here: the + module will be called sch_multiq. + config NET_SCH_RED tristate "Random Early Detection (RED)" ---help--- @@ -174,11 +126,22 @@ config NET_SCH_RED To compile this code as a module, choose M here: the module will be called sch_red. +config NET_SCH_SFB + tristate "Stochastic Fair Blue (SFB)" + ---help--- + Say Y here if you want to use the Stochastic Fair Blue (SFB) + packet scheduling algorithm. + + See the top of <file:net/sched/sch_sfb.c> for more details. + + To compile this code as a module, choose M here: the + module will be called sch_sfb. + config NET_SCH_SFQ tristate "Stochastic Fairness Queueing (SFQ)" ---help--- Say Y here if you want to use the Stochastic Fairness Queueing (SFQ) - packet scheduling algorithm . + packet scheduling algorithm. See the top of <file:net/sched/sch_sfq.c> for more details. @@ -242,8 +205,112 @@ config NET_SCH_NETEM If unsure, say N. +config NET_SCH_DRR + tristate "Deficit Round Robin scheduler (DRR)" + help + Say Y here if you want to use the Deficit Round Robin (DRR) packet + scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_drr. + + If unsure, say N. + +config NET_SCH_MQPRIO + tristate "Multi-queue priority scheduler (MQPRIO)" + help + Say Y here if you want to use the Multi-queue Priority scheduler. + This scheduler allows QOS to be offloaded on NICs that have support + for offloading QOS schedulers. + + To compile this driver as a module, choose M here: the module will + be called sch_mqprio. + + If unsure, say N. + +config NET_SCH_CHOKE + tristate "CHOose and Keep responsive flow scheduler (CHOKE)" + help + Say Y here if you want to use the CHOKe packet scheduler (CHOose + and Keep for responsive flows, CHOose and Kill for unresponsive + flows). This is a variation of RED which trys to penalize flows + that monopolize the queue. + + To compile this code as a module, choose M here: the + module will be called sch_choke. + +config NET_SCH_QFQ + tristate "Quick Fair Queueing scheduler (QFQ)" + help + Say Y here if you want to use the Quick Fair Queueing Scheduler (QFQ) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_qfq. + + If unsure, say N. + +config NET_SCH_CODEL + tristate "Controlled Delay AQM (CODEL)" + help + Say Y here if you want to use the Controlled Delay (CODEL) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_codel. + + If unsure, say N. + +config NET_SCH_FQ_CODEL + tristate "Fair Queue Controlled Delay AQM (FQ_CODEL)" + help + Say Y here if you want to use the FQ Controlled Delay (FQ_CODEL) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_fq_codel. + + If unsure, say N. + +config NET_SCH_FQ + tristate "Fair Queue" + help + Say Y here if you want to use the FQ packet scheduling algorithm. + + FQ does flow separation, and is able to respect pacing requirements + set by TCP stack into sk->sk_pacing_rate (for localy generated + traffic) + + To compile this driver as a module, choose M here: the module + will be called sch_fq. + + If unsure, say N. 
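NET_SCH_CODEL and NET_SCH_FQ_CODEL above implement the published Controlled Delay algorithm: once the packet sojourn time has stayed above a small target for a full interval, packets are dropped at a rate that grows with the square root of the drop count (the next drop is scheduled interval/sqrt(count) after the previous one). The following stand-alone sketch of that control law uses the algorithm's commonly cited 5 ms target and 100 ms interval; nothing in it is taken from this patch.

/* Illustrative only: the CoDel control law that sch_codel and
 * sch_fq_codel implement.  Times in nanoseconds. */
#include <stdint.h>
#include <stdio.h>
#include <math.h>

#define CODEL_TARGET_NS   (5 * 1000 * 1000ULL)    /* 5 ms standing-queue target */
#define CODEL_INTERVAL_NS (100 * 1000 * 1000ULL)  /* 100 ms initial interval */

/* Next drop time after the count-th consecutive drop:
 * t_next = t_prev + interval / sqrt(count) */
static uint64_t codel_next_drop(uint64_t prev_drop_ns, uint32_t count)
{
	return prev_drop_ns + (uint64_t)(CODEL_INTERVAL_NS / sqrt((double)count));
}

int main(void)
{
	uint64_t t = 0;
	uint32_t count;

	for (count = 1; count <= 4; count++) {
		t = codel_next_drop(t, count);
		printf("drop %u scheduled at %.1f ms\n", count, t / 1e6);
	}
	/* prints 100.0, 170.7, 228.4, 278.4 ms: the drop rate ramps up
	 * as long as the sojourn time stays above the 5 ms target */
	return 0;
}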
+ +config NET_SCH_HHF + tristate "Heavy-Hitter Filter (HHF)" + help + Say Y here if you want to use the Heavy-Hitter Filter (HHF) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_hhf. + +config NET_SCH_PIE + tristate "Proportional Integral controller Enhanced (PIE) scheduler" + help + Say Y here if you want to use the Proportional Integral controller + Enhanced scheduler packet scheduling algorithm. + For more information, please see + http://tools.ietf.org/html/draft-pan-tsvwg-pie-00 + + To compile this driver as a module, choose M here: the module + will be called sch_pie. + + If unsure, say N. + config NET_SCH_INGRESS tristate "Ingress Qdisc" + depends on NET_CLS_ACT ---help--- Say Y here if you want to use classifiers for incoming packets. If unsure, say Y. @@ -251,6 +318,32 @@ config NET_SCH_INGRESS To compile this code as a module, choose M here: the module will be called sch_ingress. +config NET_SCH_PLUG + tristate "Plug network traffic until release (PLUG)" + ---help--- + + This queuing discipline allows userspace to plug/unplug a network + output queue, using the netlink interface. When it receives an + enqueue command it inserts a plug into the outbound queue that + causes following packets to enqueue until a dequeue command arrives + over netlink, causing the plug to be removed and resuming the normal + packet flow. + + This module also provides a generic "network output buffering" + functionality (aka output commit), wherein upon arrival of a dequeue + command, only packets up to the first plug are released for delivery. + The Remus HA project uses this module to enable speculative execution + of virtual machines by allowing the generated network output to be rolled + back if needed. + + For more information, please refer to http://wiki.xensource.com/xenwiki/Remus + + Say Y here if you are using this kernel for Xen dom0 and + want to protect Xen guests with Remus. + + To compile this code as a module, choose M here: the + module will be called sch_plug. + comment "Classification" config NET_CLS @@ -279,7 +372,8 @@ config NET_CLS_TCINDEX config NET_CLS_ROUTE4 tristate "Routing decision (ROUTE)" - select NET_CLS_ROUTE + depends on INET + select IP_ROUTE_CLASSID select NET_CLS ---help--- If you say Y here, you will be able to classify packets @@ -288,9 +382,6 @@ config NET_CLS_ROUTE4 To compile this code as a module, choose M here: the module will be called cls_route. -config NET_CLS_ROUTE - bool - config NET_CLS_FW tristate "Netfilter mark (FW)" select NET_CLS @@ -305,7 +396,7 @@ config NET_CLS_U32 tristate "Universal 32bit comparisons w/ hashing (U32)" select NET_CLS ---help--- - Say Y here to be able to classify packetes using a universal + Say Y here to be able to classify packets using a universal 32bit pieces based comparison scheme. To compile this code as a module, choose M here: the @@ -320,14 +411,13 @@ config CLS_U32_PERF config CLS_U32_MARK bool "Netfilter marks support" - depends on NET_CLS_U32 && NETFILTER + depends on NET_CLS_U32 ---help--- Say Y here to be able to use netfilter marks as u32 key. 
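The classifiers in this section (cls_route, cls_fw, cls_u32 and the new ones added below) all expose a classify() hook through tcf_proto_ops: it inspects the skb, fills in a struct tcf_result and returns a TC_ACT_* verdict, or a negative value to let the next filter in the chain try. A minimal sketch of that hook in the spirit of cls_fw's netfilter-mark matching follows; the function name, the 0x1 mark value and the 1:1 class id are invented for the example, and the prototype is the one used by the kernel generation this diff targets.

/* Illustrative only: the shape of a tcf_proto_ops->classify hook. */
#include <linux/skbuff.h>
#include <net/pkt_cls.h>

static int sketch_classify(struct sk_buff *skb, const struct tcf_proto *tp,
			   struct tcf_result *res)
{
	/* cls_fw-style idea: use the netfilter mark as the lookup key */
	if (skb->mark == 0x1) {
		res->class = 0;
		res->classid = TC_H_MAKE(1 << 16, 1);	/* class 1:1, made up */
		return TC_ACT_OK;	/* matched; res tells the qdisc which class */
	}
	return -1;	/* no match, fall through to the next classifier */
}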
config NET_CLS_RSVP tristate "IPv4 Resource Reservation Protocol (RSVP)" select NET_CLS - select NET_ESTIMATOR ---help--- The Resource Reservation Protocol (RSVP) permits end systems to request a minimum and maximum data flow rate for a connection; this @@ -342,18 +432,50 @@ config NET_CLS_RSVP config NET_CLS_RSVP6 tristate "IPv6 Resource Reservation Protocol (RSVP6)" select NET_CLS - select NET_ESTIMATOR ---help--- The Resource Reservation Protocol (RSVP) permits end systems to request a minimum and maximum data flow rate for a connection; this is important for real time data such as streaming sound or video. Say Y here if you want to be able to classify outgoing packets based - on their RSVP requests and you are using the IPv6. + on their RSVP requests and you are using the IPv6 protocol. To compile this code as a module, choose M here: the module will be called cls_rsvp6. +config NET_CLS_FLOW + tristate "Flow classifier" + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets based on + a configurable combination of packet keys. This is mostly useful + in combination with SFQ. + + To compile this code as a module, choose M here: the + module will be called cls_flow. + +config NET_CLS_CGROUP + tristate "Control Group Classifier" + select NET_CLS + select CGROUP_NET_CLASSID + depends on CGROUPS + ---help--- + Say Y here if you want to classify packets based on the control + cgroup of their process. + + To compile this code as a module, choose M here: the + module will be called cls_cgroup. + +config NET_CLS_BPF + tristate "BPF-based classifier" + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets based on + programmable BPF (JIT'ed) filters as an alternative to ematches. + + To compile this code as a module, choose M here: the module will + be called cls_bpf. + config NET_EMATCH bool "Extended Matches" select NET_CLS @@ -432,10 +554,28 @@ config NET_EMATCH_TEXT To compile this code as a module, choose M here: the module will be called em_text. +config NET_EMATCH_CANID + tristate "CAN Identifier" + depends on NET_EMATCH && (CAN=y || CAN=m) + ---help--- + Say Y here if you want to be able to classify CAN frames based + on CAN Identifier. + + To compile this code as a module, choose M here: the + module will be called em_canid. + +config NET_EMATCH_IPSET + tristate "IPset" + depends on NET_EMATCH && IP_SET + ---help--- + Say Y here if you want to be able to classify packets based on + ipset membership. + + To compile this code as a module, choose M here: the + module will be called em_ipset. + config NET_CLS_ACT bool "Actions" - depends on EXPERIMENTAL - select NET_ESTIMATOR ---help--- Say Y here if you want to use traffic control actions. Actions get attached to classifiers and are invoked after a successful @@ -454,7 +594,7 @@ config NET_ACT_POLICE module. To compile this code as a module, choose M here: the - module will be called police. + module will be called act_police. config NET_ACT_GACT tristate "Generic actions" @@ -464,7 +604,7 @@ config NET_ACT_GACT accepting packets. To compile this code as a module, choose M here: the - module will be called gact. + module will be called act_gact. config GACT_PROB bool "Probability support" @@ -480,17 +620,27 @@ config NET_ACT_MIRRED other devices. To compile this code as a module, choose M here: the - module will be called mirred. + module will be called act_mirred. 
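The act_* modules in this section hook into the action framework that the act_api.c changes further down rework: an action supplies a struct tc_action_ops with at least .act, .init and .dump (the new tcf_register_action() rejects anything less and fills in default .lookup and .walk handlers) and registers it together with a hash-table mask. A do-nothing sketch of that pattern follows; "sketch", the type number and SKETCH_TAB_MASK are invented for the example, while the prototypes mirror the act_api.c and act_csum.c code later in this diff.

/* Illustrative only: the registration pattern the act_* modules above
 * follow against the reworked tcf_register_action(ops, mask). */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/act_api.h>

#define SKETCH_TAB_MASK 15	/* hash-table mask, as act_csum uses */

static int tcf_sketch_act(struct sk_buff *skb, const struct tc_action *a,
			  struct tcf_result *res)
{
	return TC_ACT_OK;	/* accept every packet it is attached to */
}

static int tcf_sketch_init(struct net *net, struct nlattr *nla,
			   struct nlattr *est, struct tc_action *a,
			   int ovr, int bind)
{
	return 0;		/* a real action parses nla and sets up a->priv */
}

static int tcf_sketch_dump(struct sk_buff *skb, struct tc_action *a,
			   int bind, int ref)
{
	return skb->len;	/* a real action emits its TCA_*_PARMS here */
}

static struct tc_action_ops act_sketch_ops = {
	.kind	= "sketch",
	.type	= 99,		/* made up; real actions use their TCA_ID/TCA_ACT_* number */
	.owner	= THIS_MODULE,
	.act	= tcf_sketch_act,
	.dump	= tcf_sketch_dump,
	.init	= tcf_sketch_init,
};

static int __init sketch_init_module(void)
{
	return tcf_register_action(&act_sketch_ops, SKETCH_TAB_MASK);
}

static void __exit sketch_cleanup_module(void)
{
	tcf_unregister_action(&act_sketch_ops);
}

module_init(sketch_init_module);
module_exit(sketch_cleanup_module);
MODULE_LICENSE("GPL");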
config NET_ACT_IPT tristate "IPtables targets" depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES ---help--- - Say Y here to be able to invoke iptables targets after succesful + Say Y here to be able to invoke iptables targets after successful classification. To compile this code as a module, choose M here: the - module will be called ipt. + module will be called act_ipt. + +config NET_ACT_NAT + tristate "Stateless NAT" + depends on NET_CLS_ACT + ---help--- + Say Y here to do stateless NAT on IPv4 packets. You should use + netfilter for NAT unless you know what you are doing. + + To compile this code as a module, choose M here: the + module will be called act_nat. config NET_ACT_PEDIT tristate "Packet Editing" @@ -499,7 +649,7 @@ config NET_ACT_PEDIT Say Y here if you want to mangle the content of packets. To compile this code as a module, choose M here: the - module will be called pedit. + module will be called act_pedit. config NET_ACT_SIMP tristate "Simple Example (Debug)" @@ -513,17 +663,28 @@ config NET_ACT_SIMP If unsure, say N. To compile this code as a module, choose M here: the - module will be called simple. + module will be called act_simple. -config NET_CLS_POLICE - bool "Traffic Policing (obsolete)" - depends on NET_CLS_ACT!=y - select NET_ESTIMATOR - ---help--- - Say Y here if you want to do traffic policing, i.e. strict - bandwidth limiting. This option is obsoleted by the traffic - policer implemented as action, it stays here for compatibility - reasons. +config NET_ACT_SKBEDIT + tristate "SKB Editing" + depends on NET_CLS_ACT + ---help--- + Say Y here to change skb priority or queue_mapping settings. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_skbedit. + +config NET_ACT_CSUM + tristate "Checksum Updating" + depends on NET_CLS_ACT && INET + ---help--- + Say Y here to update some common checksum after some direct + packet alterations. + + To compile this code as a module, choose M here: the + module will be called act_csum. config NET_CLS_IND bool "Incoming device classification" @@ -533,14 +694,7 @@ config NET_CLS_IND classification based on the incoming device. This option is likely to disappear in favour of the metadata ematch. -config NET_ESTIMATOR - bool "Rate estimator" - ---help--- - Say Y here to allow using rate estimators to estimate the current - rate-of-flow for network devices, queues, etc. This module is - automaticaly selected if needed but can be selected manually for - statstical purposes. - endif # NET_SCHED -endmenu +config NET_SCH_FIFO + bool diff --git a/net/sched/Makefile b/net/sched/Makefile index 0f06aec6609..0a869a11f3e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -2,32 +2,47 @@ # Makefile for the Linux Traffic Control Unit. 
# -obj-y := sch_generic.o +obj-y := sch_generic.o sch_mq.o -obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o sch_blackhole.o +obj-$(CONFIG_NET_SCHED) += sch_api.o sch_blackhole.o obj-$(CONFIG_NET_CLS) += cls_api.o obj-$(CONFIG_NET_CLS_ACT) += act_api.o obj-$(CONFIG_NET_ACT_POLICE) += act_police.o -obj-$(CONFIG_NET_CLS_POLICE) += act_police.o obj-$(CONFIG_NET_ACT_GACT) += act_gact.o obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o +obj-$(CONFIG_NET_ACT_NAT) += act_nat.o obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o +obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o +obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o +obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o -obj-$(CONFIG_NET_SCH_HPFQ) += sch_hpfq.o obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o obj-$(CONFIG_NET_SCH_RED) += sch_red.o obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o +obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o +obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o +obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o +obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o +obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o +obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o +obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o +obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o +obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o +obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o +obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o +obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o + obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o obj-$(CONFIG_NET_CLS_FW) += cls_fw.o @@ -35,9 +50,14 @@ obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o +obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o +obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o +obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o obj-$(CONFIG_NET_EMATCH) += ematch.o obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o obj-$(CONFIG_NET_EMATCH_META) += em_meta.o obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o +obj-$(CONFIG_NET_EMATCH_CANID) += em_canid.o +obj-$(CONFIG_NET_EMATCH_IPSET) += em_ipset.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 792ce59940e..648778aef1a 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -11,204 +11,421 @@ * */ -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> +#include <linux/slab.h> #include <linux/skbuff.h> -#include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/kmod.h> +#include <linux/err.h> +#include <linux/module.h> +#include <net/net_namespace.h> #include <net/sock.h> #include <net/sch_generic.h> #include <net/act_api.h> +#include <net/netlink.h> -#if 0 /* control */ -#define DPRINTK(format, args...) 
printk(KERN_DEBUG format, ##args) -#else -#define DPRINTK(format, args...) -#endif -#if 0 /* data */ -#define D2PRINTK(format, args...) printk(KERN_DEBUG format, ##args) -#else -#define D2PRINTK(format, args...) -#endif +void tcf_hash_destroy(struct tc_action *a) +{ + struct tcf_common *p = a->priv; + struct tcf_hashinfo *hinfo = a->ops->hinfo; + + spin_lock_bh(&hinfo->lock); + hlist_del(&p->tcfc_head); + spin_unlock_bh(&hinfo->lock); + gen_kill_estimator(&p->tcfc_bstats, + &p->tcfc_rate_est); + /* + * gen_estimator est_timer() might access p->tcfc_lock + * or bstats, wait a RCU grace period before freeing p + */ + kfree_rcu(p, tcfc_rcu); +} +EXPORT_SYMBOL(tcf_hash_destroy); + +int tcf_hash_release(struct tc_action *a, int bind) +{ + struct tcf_common *p = a->priv; + int ret = 0; + + if (p) { + if (bind) + p->tcfc_bindcnt--; + else if (p->tcfc_bindcnt > 0) + return -EPERM; + + p->tcfc_refcnt--; + if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) { + if (a->ops->cleanup) + a->ops->cleanup(a, bind); + tcf_hash_destroy(a); + ret = 1; + } + } + return ret; +} +EXPORT_SYMBOL(tcf_hash_release); + +static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, + struct tc_action *a) +{ + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct hlist_head *head; + struct tcf_common *p; + int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; + struct nlattr *nest; + + spin_lock_bh(&hinfo->lock); + + s_i = cb->args[0]; + + for (i = 0; i < (hinfo->hmask + 1); i++) { + head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; + + hlist_for_each_entry_rcu(p, head, tcfc_head) { + index++; + if (index < s_i) + continue; + a->priv = p; + a->order = n_i; + + nest = nla_nest_start(skb, a->order); + if (nest == NULL) + goto nla_put_failure; + err = tcf_action_dump_1(skb, a, 0, 0); + if (err < 0) { + index--; + nlmsg_trim(skb, nest); + goto done; + } + nla_nest_end(skb, nest); + n_i++; + if (n_i >= TCA_ACT_MAX_PRIO) + goto done; + } + } +done: + spin_unlock_bh(&hinfo->lock); + if (n_i) + cb->args[0] += n_i; + return n_i; + +nla_put_failure: + nla_nest_cancel(skb, nest); + goto done; +} + +static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a) +{ + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct hlist_head *head; + struct hlist_node *n; + struct tcf_common *p; + struct nlattr *nest; + int i = 0, n_i = 0; + int ret = -EINVAL; + + nest = nla_nest_start(skb, a->order); + if (nest == NULL) + goto nla_put_failure; + if (nla_put_string(skb, TCA_KIND, a->ops->kind)) + goto nla_put_failure; + for (i = 0; i < (hinfo->hmask + 1); i++) { + head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; + hlist_for_each_entry_safe(p, n, head, tcfc_head) { + a->priv = p; + ret = tcf_hash_release(a, 0); + if (ret == ACT_P_DELETED) { + module_put(a->ops->owner); + n_i++; + } else if (ret < 0) + goto nla_put_failure; + } + } + if (nla_put_u32(skb, TCA_FCNT, n_i)) + goto nla_put_failure; + nla_nest_end(skb, nest); + + return n_i; +nla_put_failure: + nla_nest_cancel(skb, nest); + return ret; +} + +static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, + int type, struct tc_action *a) +{ + if (type == RTM_DELACTION) { + return tcf_del_walker(skb, a); + } else if (type == RTM_GETACTION) { + return tcf_dump_walker(skb, cb, a); + } else { + WARN(1, "tcf_generic_walker: unknown action %d\n", type); + return -EINVAL; + } +} + +static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) +{ + struct tcf_common *p = NULL; + struct hlist_head *head; + + spin_lock_bh(&hinfo->lock); + head = 
&hinfo->htab[tcf_hash(index, hinfo->hmask)]; + hlist_for_each_entry_rcu(p, head, tcfc_head) + if (p->tcfc_index == index) + break; + spin_unlock_bh(&hinfo->lock); + + return p; +} + +u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo) +{ + u32 val = hinfo->index; + + do { + if (++val == 0) + val = 1; + } while (tcf_hash_lookup(val, hinfo)); + + hinfo->index = val; + return val; +} +EXPORT_SYMBOL(tcf_hash_new_index); + +int tcf_hash_search(struct tc_action *a, u32 index) +{ + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tcf_common *p = tcf_hash_lookup(index, hinfo); + + if (p) { + a->priv = p; + return 1; + } + return 0; +} +EXPORT_SYMBOL(tcf_hash_search); + +int tcf_hash_check(u32 index, struct tc_action *a, int bind) +{ + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tcf_common *p = NULL; + if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) { + if (bind) + p->tcfc_bindcnt++; + p->tcfc_refcnt++; + a->priv = p; + return 1; + } + return 0; +} +EXPORT_SYMBOL(tcf_hash_check); + +void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) +{ + struct tcf_common *pc = a->priv; + if (est) + gen_kill_estimator(&pc->tcfc_bstats, + &pc->tcfc_rate_est); + kfree_rcu(pc, tcfc_rcu); +} +EXPORT_SYMBOL(tcf_hash_cleanup); + +int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, + int size, int bind) +{ + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tcf_common *p = kzalloc(size, GFP_KERNEL); + + if (unlikely(!p)) + return -ENOMEM; + p->tcfc_refcnt = 1; + if (bind) + p->tcfc_bindcnt = 1; + + spin_lock_init(&p->tcfc_lock); + INIT_HLIST_NODE(&p->tcfc_head); + p->tcfc_index = index ? index : tcf_hash_new_index(hinfo); + p->tcfc_tm.install = jiffies; + p->tcfc_tm.lastuse = jiffies; + if (est) { + int err = gen_new_estimator(&p->tcfc_bstats, &p->tcfc_rate_est, + &p->tcfc_lock, est); + if (err) { + kfree(p); + return err; + } + } -static struct tc_action_ops *act_base = NULL; + a->priv = (void *) p; + return 0; +} +EXPORT_SYMBOL(tcf_hash_create); + +void tcf_hash_insert(struct tc_action *a) +{ + struct tcf_common *p = a->priv; + struct tcf_hashinfo *hinfo = a->ops->hinfo; + unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); + + spin_lock_bh(&hinfo->lock); + hlist_add_head(&p->tcfc_head, &hinfo->htab[h]); + spin_unlock_bh(&hinfo->lock); +} +EXPORT_SYMBOL(tcf_hash_insert); + +static LIST_HEAD(act_base); static DEFINE_RWLOCK(act_mod_lock); -int tcf_register_action(struct tc_action_ops *act) +int tcf_register_action(struct tc_action_ops *act, unsigned int mask) { - struct tc_action_ops *a, **ap; + struct tc_action_ops *a; + int err; + + /* Must supply act, dump and init */ + if (!act->act || !act->dump || !act->init) + return -EINVAL; + + /* Supply defaults */ + if (!act->lookup) + act->lookup = tcf_hash_search; + if (!act->walk) + act->walk = tcf_generic_walker; + + act->hinfo = kmalloc(sizeof(struct tcf_hashinfo), GFP_KERNEL); + if (!act->hinfo) + return -ENOMEM; + err = tcf_hashinfo_init(act->hinfo, mask); + if (err) { + kfree(act->hinfo); + return err; + } write_lock(&act_mod_lock); - for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) { + list_for_each_entry(a, &act_base, head) { if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { write_unlock(&act_mod_lock); + tcf_hashinfo_destroy(act->hinfo); + kfree(act->hinfo); return -EEXIST; } } - act->next = NULL; - *ap = act; + list_add_tail(&act->head, &act_base); write_unlock(&act_mod_lock); return 0; } +EXPORT_SYMBOL(tcf_register_action); int tcf_unregister_action(struct tc_action_ops *act) { - 
struct tc_action_ops *a, **ap; + struct tc_action_ops *a; int err = -ENOENT; write_lock(&act_mod_lock); - for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) - if (a == act) + list_for_each_entry(a, &act_base, head) { + if (a == act) { + list_del(&act->head); + tcf_hashinfo_destroy(act->hinfo); + kfree(act->hinfo); + err = 0; break; - if (a) { - *ap = a->next; - a->next = NULL; - err = 0; + } } write_unlock(&act_mod_lock); return err; } +EXPORT_SYMBOL(tcf_unregister_action); /* lookup by name */ static struct tc_action_ops *tc_lookup_action_n(char *kind) { - struct tc_action_ops *a = NULL; + struct tc_action_ops *a, *res = NULL; if (kind) { read_lock(&act_mod_lock); - for (a = act_base; a; a = a->next) { + list_for_each_entry(a, &act_base, head) { if (strcmp(kind, a->kind) == 0) { - if (!try_module_get(a->owner)) { - read_unlock(&act_mod_lock); - return NULL; - } + if (try_module_get(a->owner)) + res = a; break; } } read_unlock(&act_mod_lock); } - return a; + return res; } -/* lookup by rtattr */ -static struct tc_action_ops *tc_lookup_action(struct rtattr *kind) +/* lookup by nlattr */ +static struct tc_action_ops *tc_lookup_action(struct nlattr *kind) { - struct tc_action_ops *a = NULL; + struct tc_action_ops *a, *res = NULL; if (kind) { read_lock(&act_mod_lock); - for (a = act_base; a; a = a->next) { - if (rtattr_strcmp(kind, a->kind) == 0) { - if (!try_module_get(a->owner)) { - read_unlock(&act_mod_lock); - return NULL; - } - break; - } - } - read_unlock(&act_mod_lock); - } - return a; -} - -#if 0 -/* lookup by id */ -static struct tc_action_ops *tc_lookup_action_id(u32 type) -{ - struct tc_action_ops *a = NULL; - - if (type) { - read_lock(&act_mod_lock); - for (a = act_base; a; a = a->next) { - if (a->type == type) { - if (!try_module_get(a->owner)) { - read_unlock(&act_mod_lock); - return NULL; - } + list_for_each_entry(a, &act_base, head) { + if (nla_strcmp(kind, a->kind) == 0) { + if (try_module_get(a->owner)) + res = a; break; } } read_unlock(&act_mod_lock); } - return a; + return res; } -#endif -int tcf_action_exec(struct sk_buff *skb, struct tc_action *act, - struct tcf_result *res) +int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions, + struct tcf_result *res) { - struct tc_action *a; + const struct tc_action *a; int ret = -1; if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); - D2PRINTK("(%p)tcf_action_exec: cleared TC_NCLS in %s out %s\n", - skb, skb->input_dev ? 
skb->input_dev->name : "xxx", - skb->dev->name); ret = TC_ACT_OK; goto exec_done; } - while ((a = act) != NULL) { + list_for_each_entry(a, actions, list) { repeat: - if (a->ops && a->ops->act) { - ret = a->ops->act(skb, a, res); - if (TC_MUNGED & skb->tc_verd) { - /* copied already, allow trampling */ - skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); - skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); - } - if (ret == TC_ACT_REPEAT) - goto repeat; /* we need a ttl - JHS */ - if (ret != TC_ACT_PIPE) - goto exec_done; + ret = a->ops->act(skb, a, res); + if (TC_MUNGED & skb->tc_verd) { + /* copied already, allow trampling */ + skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); + skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); } - act = a->next; + if (ret == TC_ACT_REPEAT) + goto repeat; /* we need a ttl - JHS */ + if (ret != TC_ACT_PIPE) + goto exec_done; } exec_done: return ret; } +EXPORT_SYMBOL(tcf_action_exec); -void tcf_action_destroy(struct tc_action *act, int bind) +int tcf_action_destroy(struct list_head *actions, int bind) { - struct tc_action *a; + struct tc_action *a, *tmp; + int ret = 0; - for (a = act; a; a = act) { - if (a->ops && a->ops->cleanup) { - DPRINTK("tcf_action_destroy destroying %p next %p\n", - a, a->next); - if (a->ops->cleanup(a, bind) == ACT_P_DELETED) - module_put(a->ops->owner); - act = act->next; - kfree(a); - } else { /*FIXME: Remove later - catch insertion bugs*/ - printk("tcf_action_destroy: BUG? destroying NULL ops\n"); - act = act->next; - kfree(a); - } + list_for_each_entry_safe(a, tmp, actions, list) { + ret = tcf_hash_release(a, bind); + if (ret == ACT_P_DELETED) + module_put(a->ops->owner); + else if (ret < 0) + return ret; + list_del(&a->list); + kfree(a); } + return ret; } int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - int err = -EINVAL; - - if (a->ops == NULL || a->ops->dump == NULL) - return err; return a->ops->dump(skb, a, bind, ref); } @@ -216,79 +433,84 @@ int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { int err = -EINVAL; - unsigned char *b = skb->tail; - struct rtattr *r; - - if (a->ops == NULL || a->ops->dump == NULL) - return err; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; - RTA_PUT(skb, TCA_KIND, IFNAMSIZ, a->ops->kind); + if (nla_put_string(skb, TCA_KIND, a->ops->kind)) + goto nla_put_failure; if (tcf_action_copy_stats(skb, a, 0)) - goto rtattr_failure; - r = (struct rtattr*) skb->tail; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); - if ((err = tcf_action_dump_old(skb, a, bind, ref)) > 0) { - r->rta_len = skb->tail - (u8*)r; + goto nla_put_failure; + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + err = tcf_action_dump_old(skb, a, bind, ref); + if (err > 0) { + nla_nest_end(skb, nest); return err; } -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } +EXPORT_SYMBOL(tcf_action_dump_1); int -tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) +tcf_action_dump(struct sk_buff *skb, struct list_head *actions, int bind, int ref) { struct tc_action *a; int err = -EINVAL; - unsigned char *b = skb->tail; - struct rtattr *r ; + struct nlattr *nest; - while ((a = act) != NULL) { - r = (struct rtattr*) skb->tail; - act = a->next; - RTA_PUT(skb, a->order, 0, NULL); + list_for_each_entry(a, actions, list) { + nest = nla_nest_start(skb, a->order); + if (nest == NULL) + goto nla_put_failure; err = tcf_action_dump_1(skb, a, bind, ref); if (err < 0) - goto 
rtattr_failure; - r->rta_len = skb->tail - (u8*)r; + goto errout; + nla_nest_end(skb, nest); } return 0; -rtattr_failure: - skb_trim(skb, b - skb->data); - return -err; +nla_put_failure: + err = -EINVAL; +errout: + nla_nest_cancel(skb, nest); + return err; } -struct tc_action *tcf_action_init_1(struct rtattr *rta, struct rtattr *est, - char *name, int ovr, int bind, int *err) +struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, + struct nlattr *est, char *name, int ovr, + int bind) { struct tc_action *a; struct tc_action_ops *a_o; char act_name[IFNAMSIZ]; - struct rtattr *tb[TCA_ACT_MAX+1]; - struct rtattr *kind; - - *err = -EINVAL; + struct nlattr *tb[TCA_ACT_MAX + 1]; + struct nlattr *kind; + int err; if (name == NULL) { - if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL); + if (err < 0) goto err_out; - kind = tb[TCA_ACT_KIND-1]; + err = -EINVAL; + kind = tb[TCA_ACT_KIND]; if (kind == NULL) goto err_out; - if (rtattr_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) + if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) goto err_out; } else { + err = -EINVAL; if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) goto err_out; } a_o = tc_lookup_action_n(act_name); if (a_o == NULL) { -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES rtnl_unlock(); request_module("act_%s", act_name); rtnl_lock(); @@ -302,37 +524,36 @@ struct tc_action *tcf_action_init_1(struct rtattr *rta, struct rtattr *est, * indicate this using -EAGAIN. */ if (a_o != NULL) { - *err = -EAGAIN; + err = -EAGAIN; goto err_mod; } #endif + err = -ENOENT; goto err_out; } - *err = -ENOMEM; - a = kmalloc(sizeof(*a), GFP_KERNEL); + err = -ENOMEM; + a = kzalloc(sizeof(*a), GFP_KERNEL); if (a == NULL) goto err_mod; - memset(a, 0, sizeof(*a)); + a->ops = a_o; + INIT_LIST_HEAD(&a->list); /* backward compatibility for policer */ if (name == NULL) - *err = a_o->init(tb[TCA_ACT_OPTIONS-1], est, a, ovr, bind); + err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, a, ovr, bind); else - *err = a_o->init(rta, est, a, ovr, bind); - if (*err < 0) + err = a_o->init(net, nla, est, a, ovr, bind); + if (err < 0) goto err_free; /* module count goes up only when brand new policy is created - if it exists and is only bound to in a_o->init() then - ACT_P_CREATED is not returned (a zero is). - */ - if (*err != ACT_P_CREATED) + * if it exists and is only bound to in a_o->init() then + * ACT_P_CREATED is not returned (a zero is). 
+ */ + if (err != ACT_P_CREATED) module_put(a_o->owner); - a->ops = a_o; - DPRINTK("tcf_action_init_1: successfull %s\n", act_name); - *err = 0; return a; err_free: @@ -340,39 +561,36 @@ err_free: err_mod: module_put(a_o->owner); err_out: - return NULL; + return ERR_PTR(err); } -struct tc_action *tcf_action_init(struct rtattr *rta, struct rtattr *est, - char *name, int ovr, int bind, int *err) +int tcf_action_init(struct net *net, struct nlattr *nla, + struct nlattr *est, char *name, int ovr, + int bind, struct list_head *actions) { - struct rtattr *tb[TCA_ACT_MAX_PRIO+1]; - struct tc_action *head = NULL, *act, *act_prev = NULL; + struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + struct tc_action *act; + int err; int i; - if (rtattr_parse_nested(tb, TCA_ACT_MAX_PRIO, rta) < 0) { - *err = -EINVAL; - return head; - } + err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); + if (err < 0) + return err; - for (i=0; i < TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_init_1(tb[i], est, name, ovr, bind, err); - if (act == NULL) + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { + act = tcf_action_init_1(net, tb[i], est, name, ovr, bind); + if (IS_ERR(act)) { + err = PTR_ERR(act); goto err; - act->order = i+1; - - if (head == NULL) - head = act; - else - act_prev->next = act; - act_prev = act; + } + act->order = i; + list_add_tail(&act->list, actions); } - return head; + return 0; err: - if (head != NULL) - tcf_action_destroy(head, bind); - return NULL; + tcf_action_destroy(actions, bind); + return err; } int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, @@ -380,36 +598,31 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, { int err = 0; struct gnet_dump d; - struct tcf_act_hdr *h = a->priv; - - if (h == NULL) + struct tcf_common *p = a->priv; + + if (p == NULL) goto errout; /* compat_mode being true specifies a call that is supposed - * to add additional backward compatiblity statistic TLVs. + * to add additional backward compatibility statistic TLVs. 
*/ if (compat_mode) { if (a->type == TCA_OLD_COMPAT) err = gnet_stats_start_copy_compat(skb, 0, - TCA_STATS, TCA_XSTATS, h->stats_lock, &d); + TCA_STATS, TCA_XSTATS, &p->tcfc_lock, &d); else return 0; } else err = gnet_stats_start_copy(skb, TCA_ACT_STATS, - h->stats_lock, &d); + &p->tcfc_lock, &d); if (err < 0) goto errout; - if (a->ops != NULL && a->ops->get_stats != NULL) - if (a->ops->get_stats(skb, a) < 0) - goto errout; - - if (gnet_stats_copy_basic(&d, &h->bstats) < 0 || -#ifdef CONFIG_NET_ESTIMATOR - gnet_stats_copy_rate_est(&d, &h->rate_est) < 0 || -#endif - gnet_stats_copy_queue(&d, &h->qstats) < 0) + if (gnet_stats_copy_basic(&d, &p->tcfc_bstats) < 0 || + gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, + &p->tcfc_rate_est) < 0 || + gnet_stats_copy_queue(&d, &p->tcfc_qstats) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) @@ -422,353 +635,354 @@ errout: } static int -tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, - u16 flags, int event, int bind, int ref) +tca_get_fill(struct sk_buff *skb, struct list_head *actions, u32 portid, u32 seq, + u16 flags, int event, int bind, int ref) { struct tcamsg *t; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; - struct rtattr *x; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; - nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); - - t = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*t), flags); + if (!nlh) + goto out_nlmsg_trim; + t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; - - x = (struct rtattr*) skb->tail; - RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); - if (tcf_action_dump(skb, a, bind, ref) < 0) - goto rtattr_failure; + nest = nla_nest_start(skb, TCA_ACT_TAB); + if (nest == NULL) + goto out_nlmsg_trim; + + if (tcf_action_dump(skb, actions, bind, ref) < 0) + goto out_nlmsg_trim; - x->rta_len = skb->tail - (u8*)x; - - nlh->nlmsg_len = skb->tail - b; + nla_nest_end(skb, nest); + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; -rtattr_failure: -nlmsg_failure: - skb_trim(skb, b - skb->data); +out_nlmsg_trim: + nlmsg_trim(skb, b); return -1; } static int -act_get_notify(u32 pid, struct nlmsghdr *n, struct tc_action *a, int event) +act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, + struct list_head *actions, int event) { struct sk_buff *skb; - int err = 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (tca_get_fill(skb, a, pid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { + if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { kfree_skb(skb); return -EINVAL; } - err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); - if (err > 0) - err = 0; - return err; + + return rtnl_unicast(skb, net, portid); +} + +static struct tc_action *create_a(int i) +{ + struct tc_action *act; + + act = kzalloc(sizeof(*act), GFP_KERNEL); + if (act == NULL) { + pr_debug("create_a: failed to alloc!\n"); + return NULL; + } + act->order = i; + INIT_LIST_HEAD(&act->list); + return act; } static struct tc_action * -tcf_action_get_1(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int *err) +tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid) { - struct rtattr *tb[TCA_ACT_MAX+1]; + struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action *a; int index; + int err; - *err = -EINVAL; - if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) - return NULL; + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL); + if (err < 0) + goto err_out; - if (tb[TCA_ACT_INDEX - 1] == NULL || - 
RTA_PAYLOAD(tb[TCA_ACT_INDEX - 1]) < sizeof(index)) - return NULL; - index = *(int *)RTA_DATA(tb[TCA_ACT_INDEX - 1]); + err = -EINVAL; + if (tb[TCA_ACT_INDEX] == NULL || + nla_len(tb[TCA_ACT_INDEX]) < sizeof(index)) + goto err_out; + index = nla_get_u32(tb[TCA_ACT_INDEX]); - *err = -ENOMEM; - a = kmalloc(sizeof(struct tc_action), GFP_KERNEL); + err = -ENOMEM; + a = create_a(0); if (a == NULL) - return NULL; - memset(a, 0, sizeof(struct tc_action)); + goto err_out; - *err = -EINVAL; - a->ops = tc_lookup_action(tb[TCA_ACT_KIND - 1]); - if (a->ops == NULL) + err = -EINVAL; + a->ops = tc_lookup_action(tb[TCA_ACT_KIND]); + if (a->ops == NULL) /* could happen in batch of actions */ goto err_free; - if (a->ops->lookup == NULL) - goto err_mod; - *err = -ENOENT; + err = -ENOENT; if (a->ops->lookup(a, index) == 0) goto err_mod; module_put(a->ops->owner); - *err = 0; return a; + err_mod: module_put(a->ops->owner); err_free: kfree(a); - return NULL; +err_out: + return ERR_PTR(err); } -static void cleanup_a(struct tc_action *act) +static void cleanup_a(struct list_head *actions) { - struct tc_action *a; + struct tc_action *a, *tmp; - for (a = act; a; a = act) { - act = a->next; + list_for_each_entry_safe(a, tmp, actions, list) { + list_del(&a->list); kfree(a); } } -static struct tc_action *create_a(int i) -{ - struct tc_action *act; - - act = kmalloc(sizeof(*act), GFP_KERNEL); - if (act == NULL) { - printk("create_a: failed to alloc!\n"); - return NULL; - } - memset(act, 0, sizeof(*act)); - act->order = i; - return act; -} - -static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid) +static int tca_action_flush(struct net *net, struct nlattr *nla, + struct nlmsghdr *n, u32 portid) { struct sk_buff *skb; unsigned char *b; struct nlmsghdr *nlh; struct tcamsg *t; struct netlink_callback dcb; - struct rtattr *x; - struct rtattr *tb[TCA_ACT_MAX+1]; - struct rtattr *kind; - struct tc_action *a = create_a(0); - int err = -EINVAL; - - if (a == NULL) { - printk("tca_action_flush: couldnt create tc_action\n"); - return err; - } + struct nlattr *nest; + struct nlattr *tb[TCA_ACT_MAX + 1]; + struct nlattr *kind; + struct tc_action a; + int err = -ENOMEM; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { - printk("tca_action_flush: failed skb alloc\n"); - kfree(a); - return -ENOBUFS; + pr_debug("tca_action_flush: failed skb alloc\n"); + return err; } - b = (unsigned char *)skb->tail; + b = skb_tail_pointer(skb); - if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL); + if (err < 0) goto err_out; - kind = tb[TCA_ACT_KIND-1]; - a->ops = tc_lookup_action(kind); - if (a->ops == NULL) + err = -EINVAL; + kind = tb[TCA_ACT_KIND]; + memset(&a, 0, sizeof(struct tc_action)); + INIT_LIST_HEAD(&a.list); + a.ops = tc_lookup_action(kind); + if (a.ops == NULL) /*some idjot trying to flush unknown action */ goto err_out; - nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t)); - t = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0); + if (!nlh) + goto out_module_put; + t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; - x = (struct rtattr *) skb->tail; - RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); + nest = nla_nest_start(skb, TCA_ACT_TAB); + if (nest == NULL) + goto out_module_put; - err = a->ops->walk(skb, &dcb, RTM_DELACTION, a); + err = a.ops->walk(skb, &dcb, RTM_DELACTION, &a); if (err < 0) - goto rtattr_failure; + goto out_module_put; + if (err == 0) + goto noflush_out; 
- x->rta_len = skb->tail - (u8 *) x; + nla_nest_end(skb, nest); - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; nlh->nlmsg_flags |= NLM_F_ROOT; - module_put(a->ops->owner); - kfree(a); - err = rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + module_put(a.ops->owner); + err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); if (err > 0) return 0; return err; -rtattr_failure: - module_put(a->ops->owner); -nlmsg_failure: +out_module_put: + module_put(a.ops->owner); err_out: +noflush_out: kfree_skb(skb); - kfree(a); return err; } static int -tca_action_gd(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int event) +tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, + u32 portid) { - int i, ret = 0; - struct rtattr *tb[TCA_ACT_MAX_PRIO+1]; - struct tc_action *head = NULL, *act, *act_prev = NULL; + int ret; + struct sk_buff *skb; - if (rtattr_parse_nested(tb, TCA_ACT_MAX_PRIO, rta) < 0) + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, + 0, 1) <= 0) { + kfree_skb(skb); return -EINVAL; + } - if (event == RTM_DELACTION && n->nlmsg_flags&NLM_F_ROOT) { - if (tb[0] != NULL && tb[1] == NULL) - return tca_action_flush(tb[0], n, pid); + /* now do the delete */ + ret = tcf_action_destroy(actions, 0); + if (ret < 0) { + kfree_skb(skb); + return ret; } - for (i=0; i < TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_get_1(tb[i], n, pid, &ret); - if (act == NULL) - goto err; - act->order = i+1; + ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); + if (ret > 0) + return 0; + return ret; +} + +static int +tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, + u32 portid, int event) +{ + int i, ret; + struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + struct tc_action *act; + LIST_HEAD(actions); + + ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); + if (ret < 0) + return ret; - if (head == NULL) - head = act; + if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) { + if (tb[1] != NULL) + return tca_action_flush(net, tb[1], n, portid); else - act_prev->next = act; - act_prev = act; + return -EINVAL; } - if (event == RTM_GETACTION) - ret = act_get_notify(pid, n, head, event); - else { /* delete */ - struct sk_buff *skb; - - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) { - ret = -ENOBUFS; + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { + act = tcf_action_get_1(tb[i], n, portid); + if (IS_ERR(act)) { + ret = PTR_ERR(act); goto err; } + act->order = i; + list_add_tail(&act->list, &actions); + } - if (tca_get_fill(skb, head, pid, n->nlmsg_seq, 0, event, - 0, 1) <= 0) { - kfree_skb(skb); - ret = -EINVAL; + if (event == RTM_GETACTION) + ret = act_get_notify(net, portid, n, &actions, event); + else { /* delete */ + ret = tcf_del_notify(net, n, &actions, portid); + if (ret) goto err; - } - - /* now do the delete */ - tcf_action_destroy(head, 0); - ret = rtnetlink_send(skb, pid, RTNLGRP_TC, - n->nlmsg_flags&NLM_F_ECHO); - if (ret > 0) - return 0; return ret; } err: - cleanup_a(head); + cleanup_a(&actions); return ret; } -static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event, - u16 flags) +static int +tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, + u32 portid) { - struct tcamsg *t; - struct nlmsghdr *nlh; struct sk_buff *skb; - struct rtattr *x; - unsigned char *b; int err = 0; skb = 
alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - b = (unsigned char *)skb->tail; - - nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); - t = NLMSG_DATA(nlh); - t->tca_family = AF_UNSPEC; - t->tca__pad1 = 0; - t->tca__pad2 = 0; - - x = (struct rtattr*) skb->tail; - RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); - - if (tcf_action_dump(skb, a, 0, 0) < 0) - goto rtattr_failure; + if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, + RTM_NEWACTION, 0, 0) <= 0) { + kfree_skb(skb); + return -EINVAL; + } - x->rta_len = skb->tail - (u8*)x; - - nlh->nlmsg_len = skb->tail - b; - NETLINK_CB(skb).dst_group = RTNLGRP_TC; - - err = rtnetlink_send(skb, pid, RTNLGRP_TC, flags&NLM_F_ECHO); + err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); if (err > 0) err = 0; return err; - -rtattr_failure: -nlmsg_failure: - skb_trim(skb, b - skb->data); - return -1; } - static int -tcf_action_add(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int ovr) +tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, + u32 portid, int ovr) { int ret = 0; - struct tc_action *act; - struct tc_action *a; - u32 seq = n->nlmsg_seq; + LIST_HEAD(actions); - act = tcf_action_init(rta, NULL, NULL, ovr, 0, &ret); - if (act == NULL) + ret = tcf_action_init(net, nla, NULL, NULL, ovr, 0, &actions); + if (ret) goto done; /* dump then free all the actions after update; inserted policy * stays intact - * */ - ret = tcf_add_notify(act, pid, seq, RTM_NEWACTION, n->nlmsg_flags); - for (a = act; a; a = act) { - act = a->next; - kfree(a); - } + */ + ret = tcf_add_notify(net, n, &actions, portid); + cleanup_a(&actions); done: return ret; } -static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n) { - struct rtattr **tca = arg; - u32 pid = skb ? NETLINK_CB(skb).pid : 0; + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_ACT_MAX + 1]; + u32 portid = skb ? 
NETLINK_CB(skb).portid : 0; int ret = 0, ovr = 0; - if (tca[TCA_ACT_TAB-1] == NULL) { - printk("tc_ctl_action: received NO action attribs\n"); + if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL); + if (ret < 0) + return ret; + + if (tca[TCA_ACT_TAB] == NULL) { + pr_notice("tc_ctl_action: received NO action attribs\n"); return -EINVAL; } - /* n->nlmsg_flags&NLM_F_CREATE - * */ + /* n->nlmsg_flags & NLM_F_CREATE */ switch (n->nlmsg_type) { case RTM_NEWACTION: /* we are going to assume all other flags - * imply create only if it doesnt exist + * imply create only if it doesn't exist * Note that CREATE | EXCL implies that * but since we want avoid ambiguity (eg when flags * is zero) then just set this */ - if (n->nlmsg_flags&NLM_F_REPLACE) + if (n->nlmsg_flags & NLM_F_REPLACE) ovr = 1; replay: - ret = tcf_action_add(tca[TCA_ACT_TAB-1], n, pid, ovr); + ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr); if (ret == -EAGAIN) goto replay; break; case RTM_DELACTION: - ret = tca_action_gd(tca[TCA_ACT_TAB-1], n, pid, RTM_DELACTION); + ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, + portid, RTM_DELACTION); break; case RTM_GETACTION: - ret = tca_action_gd(tca[TCA_ACT_TAB-1], n, pid, RTM_GETACTION); + ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, + portid, RTM_GETACTION); break; default: BUG(); @@ -777,120 +991,101 @@ replay: return ret; } -static char * -find_dump_kind(struct nlmsghdr *n) +static struct nlattr * +find_dump_kind(const struct nlmsghdr *n) { - struct rtattr *tb1, *tb2[TCA_ACT_MAX+1]; - struct rtattr *tb[TCA_ACT_MAX_PRIO + 1]; - struct rtattr *rta[TCAA_MAX + 1]; - struct rtattr *kind; - int min_len = NLMSG_LENGTH(sizeof(struct tcamsg)); - int attrlen = n->nlmsg_len - NLMSG_ALIGN(min_len); - struct rtattr *attr = (void *) n + NLMSG_ALIGN(min_len); + struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1]; + struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + struct nlattr *nla[TCAA_MAX + 1]; + struct nlattr *kind; - if (rtattr_parse(rta, TCAA_MAX, attr, attrlen) < 0) + if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX, NULL) < 0) return NULL; - tb1 = rta[TCA_ACT_TAB - 1]; + tb1 = nla[TCA_ACT_TAB]; if (tb1 == NULL) return NULL; - if (rtattr_parse(tb, TCA_ACT_MAX_PRIO, RTA_DATA(tb1), - NLMSG_ALIGN(RTA_PAYLOAD(tb1))) < 0) - return NULL; - if (tb[0] == NULL) + if (nla_parse(tb, TCA_ACT_MAX_PRIO, nla_data(tb1), + NLMSG_ALIGN(nla_len(tb1)), NULL) < 0) return NULL; - if (rtattr_parse(tb2, TCA_ACT_MAX, RTA_DATA(tb[0]), - RTA_PAYLOAD(tb[0])) < 0) + if (tb[1] == NULL) + return NULL; + if (nla_parse(tb2, TCA_ACT_MAX, nla_data(tb[1]), + nla_len(tb[1]), NULL) < 0) return NULL; - kind = tb2[TCA_ACT_KIND-1]; + kind = tb2[TCA_ACT_KIND]; - return (char *) RTA_DATA(kind); + return kind; } static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) { struct nlmsghdr *nlh; - unsigned char *b = skb->tail; - struct rtattr *x; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; struct tc_action_ops *a_o; struct tc_action a; int ret = 0; - struct tcamsg *t = (struct tcamsg *) NLMSG_DATA(cb->nlh); - char *kind = find_dump_kind(cb->nlh); + struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh); + struct nlattr *kind = find_dump_kind(cb->nlh); if (kind == NULL) { - printk("tc_dump_action: action bad kind\n"); + pr_info("tc_dump_action: action bad kind\n"); return 0; } - a_o = tc_lookup_action_n(kind); - if (a_o == NULL) { - printk("failed to find %s\n", kind); + a_o = 
tc_lookup_action(kind); + if (a_o == NULL) return 0; - } memset(&a, 0, sizeof(struct tc_action)); a.ops = a_o; - if (a_o->walk == NULL) { - printk("tc_dump_action: %s !capable of dumping table\n", kind); - goto rtattr_failure; - } - - nlh = NLMSG_PUT(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, - cb->nlh->nlmsg_type, sizeof(*t)); - t = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*t), 0); + if (!nlh) + goto out_module_put; + t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; - x = (struct rtattr *) skb->tail; - RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); + nest = nla_nest_start(skb, TCA_ACT_TAB); + if (nest == NULL) + goto out_module_put; ret = a_o->walk(skb, cb, RTM_GETACTION, &a); if (ret < 0) - goto rtattr_failure; + goto out_module_put; if (ret > 0) { - x->rta_len = skb->tail - (u8 *) x; + nla_nest_end(skb, nest); ret = skb->len; } else - skb_trim(skb, (u8*)x - skb->data); + nla_nest_cancel(skb, nest); - nlh->nlmsg_len = skb->tail - b; - if (NETLINK_CB(cb->skb).pid && ret) + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + if (NETLINK_CB(cb->skb).portid && ret) nlh->nlmsg_flags |= NLM_F_MULTI; module_put(a_o->owner); return skb->len; -rtattr_failure: -nlmsg_failure: +out_module_put: module_put(a_o->owner); - skb_trim(skb, b - skb->data); + nlmsg_trim(skb, b); return skb->len; } static int __init tc_action_init(void) { - struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC]; + rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action, + NULL); - if (link_p) { - link_p[RTM_NEWACTION-RTM_BASE].doit = tc_ctl_action; - link_p[RTM_DELACTION-RTM_BASE].doit = tc_ctl_action; - link_p[RTM_GETACTION-RTM_BASE].doit = tc_ctl_action; - link_p[RTM_GETACTION-RTM_BASE].dumpit = tc_dump_action; - } - - printk("TC classifier action (bugs to netdev@vger.kernel.org cc " - "hadi@cyberus.ca)\n"); return 0; } subsys_initcall(tc_action_init); - -EXPORT_SYMBOL(tcf_register_action); -EXPORT_SYMBOL(tcf_unregister_action); -EXPORT_SYMBOL(tcf_action_exec); -EXPORT_SYMBOL(tcf_action_dump_1); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c new file mode 100644 index 00000000000..edbf40dac70 --- /dev/null +++ b/net/sched/act_csum.c @@ -0,0 +1,584 @@ +/* + * Checksum updating actions + * + * Copyright (c) 2010 Gregoire Baron <baronchon@n7mm.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/spinlock.h> + +#include <linux/netlink.h> +#include <net/netlink.h> +#include <linux/rtnetlink.h> + +#include <linux/skbuff.h> + +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/icmp.h> +#include <linux/icmpv6.h> +#include <linux/igmp.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/ip6_checksum.h> + +#include <net/act_api.h> + +#include <linux/tc_act/tc_csum.h> +#include <net/tc_act/tc_csum.h> + +#define CSUM_TAB_MASK 15 + +static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { + [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, +}; + +static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_CSUM_MAX + 1]; + struct tc_csum *parm; + struct tcf_csum *p; + int ret = 0, err; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy); + if (err < 0) + return err; + + if (tb[TCA_CSUM_PARMS] == NULL) + return -EINVAL; + parm = nla_data(tb[TCA_CSUM_PARMS]); + + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + if (ret) + return ret; + ret = ACT_P_CREATED; + } else { + if (bind)/* dont override defaults */ + return 0; + tcf_hash_release(a, bind); + if (!ovr) + return -EEXIST; + } + + p = to_tcf_csum(a); + spin_lock_bh(&p->tcf_lock); + p->tcf_action = parm->action; + p->update_flags = parm->update_flags; + spin_unlock_bh(&p->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(a); + + return ret; +} + +/** + * tcf_csum_skb_nextlayer - Get next layer pointer + * @skb: sk_buff to use + * @ihl: previous summed headers length + * @ipl: complete packet length + * @jhl: next header length + * + * Check the expected next layer availability in the specified sk_buff. + * Return the next layer pointer if pass, NULL otherwise. 
+ */ +static void *tcf_csum_skb_nextlayer(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl, + unsigned int jhl) +{ + int ntkoff = skb_network_offset(skb); + int hl = ihl + jhl; + + if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) || + (skb_cloned(skb) && + !skb_clone_writable(skb, hl + ntkoff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + return NULL; + else + return (void *)(skb_network_header(skb) + ihl); +} + +static int tcf_csum_ipv4_icmp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct icmphdr *icmph; + + icmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmph)); + if (icmph == NULL) + return 0; + + icmph->checksum = 0; + skb->csum = csum_partial(icmph, ipl - ihl, 0); + icmph->checksum = csum_fold(skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_igmp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct igmphdr *igmph; + + igmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*igmph)); + if (igmph == NULL) + return 0; + + igmph->csum = 0; + skb->csum = csum_partial(igmph, ipl - ihl, 0); + igmph->csum = csum_fold(skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv6_icmp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct icmp6hdr *icmp6h; + const struct ipv6hdr *ip6h; + + icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h)); + if (icmp6h == NULL) + return 0; + + ip6h = ipv6_hdr(skb); + icmp6h->icmp6_cksum = 0; + skb->csum = csum_partial(icmp6h, ipl - ihl, 0); + icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + ipl - ihl, IPPROTO_ICMPV6, + skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_tcp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct tcphdr *tcph; + const struct iphdr *iph; + + tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); + if (tcph == NULL) + return 0; + + iph = ip_hdr(skb); + tcph->check = 0; + skb->csum = csum_partial(tcph, ipl - ihl, 0); + tcph->check = tcp_v4_check(ipl - ihl, + iph->saddr, iph->daddr, skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv6_tcp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct tcphdr *tcph; + const struct ipv6hdr *ip6h; + + tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); + if (tcph == NULL) + return 0; + + ip6h = ipv6_hdr(skb); + tcph->check = 0; + skb->csum = csum_partial(tcph, ipl - ihl, 0); + tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + ipl - ihl, IPPROTO_TCP, + skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_udp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl, int udplite) +{ + struct udphdr *udph; + const struct iphdr *iph; + u16 ul; + + /* + * Support both UDP and UDPLITE checksum algorithms, Don't use + * udph->len to get the real length without any protocol check, + * UDPLITE uses udph->len for another thing, + * Use iph->tot_len, or just ipl. 
+ */ + + udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph)); + if (udph == NULL) + return 0; + + iph = ip_hdr(skb); + ul = ntohs(udph->len); + + if (udplite || udph->check) { + + udph->check = 0; + + if (udplite) { + if (ul == 0) + skb->csum = csum_partial(udph, ipl - ihl, 0); + else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl)) + skb->csum = csum_partial(udph, ul, 0); + else + goto ignore_obscure_skb; + } else { + if (ul != ipl - ihl) + goto ignore_obscure_skb; + + skb->csum = csum_partial(udph, ul, 0); + } + + udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + ul, iph->protocol, + skb->csum); + + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + skb->ip_summed = CHECKSUM_NONE; + +ignore_obscure_skb: + return 1; +} + +static int tcf_csum_ipv6_udp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl, int udplite) +{ + struct udphdr *udph; + const struct ipv6hdr *ip6h; + u16 ul; + + /* + * Support both UDP and UDPLITE checksum algorithms, Don't use + * udph->len to get the real length without any protocol check, + * UDPLITE uses udph->len for another thing, + * Use ip6h->payload_len + sizeof(*ip6h) ... , or just ipl. + */ + + udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph)); + if (udph == NULL) + return 0; + + ip6h = ipv6_hdr(skb); + ul = ntohs(udph->len); + + udph->check = 0; + + if (udplite) { + if (ul == 0) + skb->csum = csum_partial(udph, ipl - ihl, 0); + + else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl)) + skb->csum = csum_partial(udph, ul, 0); + + else + goto ignore_obscure_skb; + } else { + if (ul != ipl - ihl) + goto ignore_obscure_skb; + + skb->csum = csum_partial(udph, ul, 0); + } + + udph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ul, + udplite ? IPPROTO_UDPLITE : IPPROTO_UDP, + skb->csum); + + if (!udph->check) + udph->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_NONE; + +ignore_obscure_skb: + return 1; +} + +static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) +{ + const struct iphdr *iph; + int ntkoff; + + ntkoff = skb_network_offset(skb); + + if (!pskb_may_pull(skb, sizeof(*iph) + ntkoff)) + goto fail; + + iph = ip_hdr(skb); + + switch (iph->frag_off & htons(IP_OFFSET) ? 
0 : iph->protocol) { + case IPPROTO_ICMP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) + if (!tcf_csum_ipv4_icmp(skb, iph->ihl * 4, + ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_IGMP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_IGMP) + if (!tcf_csum_ipv4_igmp(skb, iph->ihl * 4, + ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_TCP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) + if (!tcf_csum_ipv4_tcp(skb, iph->ihl * 4, + ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_UDP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) + if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4, + ntohs(iph->tot_len), 0)) + goto fail; + break; + case IPPROTO_UDPLITE: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) + if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4, + ntohs(iph->tot_len), 1)) + goto fail; + break; + } + + if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) { + if (skb_cloned(skb) && + !skb_clone_writable(skb, sizeof(*iph) + ntkoff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto fail; + + ip_send_check(ip_hdr(skb)); + } + + return 1; + +fail: + return 0; +} + +static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, + unsigned int ixhl, unsigned int *pl) +{ + int off, len, optlen; + unsigned char *xh = (void *)ip6xh; + + off = sizeof(*ip6xh); + len = ixhl - off; + + while (len > 1) { + switch (xh[off]) { + case IPV6_TLV_PAD1: + optlen = 1; + break; + case IPV6_TLV_JUMBO: + optlen = xh[off + 1] + 2; + if (optlen != 6 || len < 6 || (off & 3) != 2) + /* wrong jumbo option length/alignment */ + return 0; + *pl = ntohl(*(__be32 *)(xh + off + 2)); + goto done; + default: + optlen = xh[off + 1] + 2; + if (optlen > len) + /* ignore obscure options */ + goto done; + break; + } + off += optlen; + len -= optlen; + } + +done: + return 1; +} + +static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags) +{ + struct ipv6hdr *ip6h; + struct ipv6_opt_hdr *ip6xh; + unsigned int hl, ixhl; + unsigned int pl; + int ntkoff; + u8 nexthdr; + + ntkoff = skb_network_offset(skb); + + hl = sizeof(*ip6h); + + if (!pskb_may_pull(skb, hl + ntkoff)) + goto fail; + + ip6h = ipv6_hdr(skb); + + pl = ntohs(ip6h->payload_len); + nexthdr = ip6h->nexthdr; + + do { + switch (nexthdr) { + case NEXTHDR_FRAGMENT: + goto ignore_skb; + case NEXTHDR_ROUTING: + case NEXTHDR_HOP: + case NEXTHDR_DEST: + if (!pskb_may_pull(skb, hl + sizeof(*ip6xh) + ntkoff)) + goto fail; + ip6xh = (void *)(skb_network_header(skb) + hl); + ixhl = ipv6_optlen(ip6xh); + if (!pskb_may_pull(skb, hl + ixhl + ntkoff)) + goto fail; + ip6xh = (void *)(skb_network_header(skb) + hl); + if ((nexthdr == NEXTHDR_HOP) && + !(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl))) + goto fail; + nexthdr = ip6xh->nexthdr; + hl += ixhl; + break; + case IPPROTO_ICMPV6: + if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) + if (!tcf_csum_ipv6_icmp(skb, + hl, pl + sizeof(*ip6h))) + goto fail; + goto done; + case IPPROTO_TCP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) + if (!tcf_csum_ipv6_tcp(skb, + hl, pl + sizeof(*ip6h))) + goto fail; + goto done; + case IPPROTO_UDP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) + if (!tcf_csum_ipv6_udp(skb, hl, + pl + sizeof(*ip6h), 0)) + goto fail; + goto done; + case IPPROTO_UDPLITE: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) + if (!tcf_csum_ipv6_udp(skb, hl, + pl + sizeof(*ip6h), 1)) + goto fail; + goto done; + default: + goto ignore_skb; + } + } while (pskb_may_pull(skb, hl + 1 + ntkoff)); + +done: +ignore_skb: + return 1; + +fail: + return 0; +} + +static int tcf_csum(struct sk_buff *skb, + const struct tc_action 
*a, struct tcf_result *res) +{ + struct tcf_csum *p = a->priv; + int action; + u32 update_flags; + + spin_lock(&p->tcf_lock); + p->tcf_tm.lastuse = jiffies; + bstats_update(&p->tcf_bstats, skb); + action = p->tcf_action; + update_flags = p->update_flags; + spin_unlock(&p->tcf_lock); + + if (unlikely(action == TC_ACT_SHOT)) + goto drop; + + switch (skb->protocol) { + case cpu_to_be16(ETH_P_IP): + if (!tcf_csum_ipv4(skb, update_flags)) + goto drop; + break; + case cpu_to_be16(ETH_P_IPV6): + if (!tcf_csum_ipv6(skb, update_flags)) + goto drop; + break; + } + + return action; + +drop: + spin_lock(&p->tcf_lock); + p->tcf_qstats.drops++; + spin_unlock(&p->tcf_lock); + return TC_ACT_SHOT; +} + +static int tcf_csum_dump(struct sk_buff *skb, + struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_csum *p = a->priv; + struct tc_csum opt = { + .update_flags = p->update_flags, + .index = p->tcf_index, + .action = p->tcf_action, + .refcnt = p->tcf_refcnt - ref, + .bindcnt = p->tcf_bindcnt - bind, + }; + struct tcf_t t; + + if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + if (nla_put(skb, TCA_CSUM_TM, sizeof(t), &t)) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_csum_ops = { + .kind = "csum", + .type = TCA_ACT_CSUM, + .owner = THIS_MODULE, + .act = tcf_csum, + .dump = tcf_csum_dump, + .init = tcf_csum_init, +}; + +MODULE_DESCRIPTION("Checksum updating actions"); +MODULE_LICENSE("GPL"); + +static int __init csum_init_module(void) +{ + return tcf_register_action(&act_csum_ops, CSUM_TAB_MASK); +} + +static void __exit csum_cleanup_module(void) +{ + tcf_unregister_action(&act_csum_ops); +} + +module_init(csum_init_module); +module_exit(csum_cleanup_module); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index a1e68f78dcc..d6bcbd9f779 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -10,218 +10,197 @@ * */ -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/proc_fs.h> -#include <net/sock.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_gact.h> #include <net/tc_act/tc_gact.h> -/* use generic hash table */ -#define MY_TAB_SIZE 16 -#define MY_TAB_MASK 15 - -static u32 idx_gen; -static struct tcf_gact *tcf_gact_ht[MY_TAB_SIZE]; -static DEFINE_RWLOCK(gact_lock); - -/* ovewrride the defaults */ -#define tcf_st tcf_gact -#define tc_st tc_gact -#define tcf_t_lock gact_lock -#define tcf_ht tcf_gact_ht - -#define CONFIG_NET_ACT_INIT 1 -#include <net/pkt_act.h> +#define GACT_TAB_MASK 15 #ifdef CONFIG_GACT_PROB -static int gact_net_rand(struct tcf_gact *p) +static int gact_net_rand(struct tcf_gact *gact) { - if (net_random()%p->pval) - return p->action; - return p->paction; + if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval) + return gact->tcf_action; + return 
gact->tcfg_paction; } -static int gact_determ(struct tcf_gact *p) +static int gact_determ(struct tcf_gact *gact) { - if (p->bstats.packets%p->pval) - return p->action; - return p->paction; + if (!gact->tcfg_pval || gact->tcf_bstats.packets % gact->tcfg_pval) + return gact->tcf_action; + return gact->tcfg_paction; } -typedef int (*g_rand)(struct tcf_gact *p); -static g_rand gact_rand[MAX_RAND]= { NULL, gact_net_rand, gact_determ }; -#endif +typedef int (*g_rand)(struct tcf_gact *gact); +static g_rand gact_rand[MAX_RAND] = { NULL, gact_net_rand, gact_determ }; +#endif /* CONFIG_GACT_PROB */ + +static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { + [TCA_GACT_PARMS] = { .len = sizeof(struct tc_gact) }, + [TCA_GACT_PROB] = { .len = sizeof(struct tc_gact_p) }, +}; -static int tcf_gact_init(struct rtattr *rta, struct rtattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_gact_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { - struct rtattr *tb[TCA_GACT_MAX]; + struct nlattr *tb[TCA_GACT_MAX + 1]; struct tc_gact *parm; - struct tcf_gact *p; + struct tcf_gact *gact; int ret = 0; + int err; +#ifdef CONFIG_GACT_PROB + struct tc_gact_p *p_parm = NULL; +#endif - if (rta == NULL || rtattr_parse_nested(tb, TCA_GACT_MAX, rta) < 0) + if (nla == NULL) return -EINVAL; - if (tb[TCA_GACT_PARMS - 1] == NULL || - RTA_PAYLOAD(tb[TCA_GACT_PARMS - 1]) < sizeof(*parm)) + err = nla_parse_nested(tb, TCA_GACT_MAX, nla, gact_policy); + if (err < 0) + return err; + + if (tb[TCA_GACT_PARMS] == NULL) return -EINVAL; - parm = RTA_DATA(tb[TCA_GACT_PARMS - 1]); + parm = nla_data(tb[TCA_GACT_PARMS]); - if (tb[TCA_GACT_PROB-1] != NULL) -#ifdef CONFIG_GACT_PROB - if (RTA_PAYLOAD(tb[TCA_GACT_PROB-1]) < sizeof(struct tc_gact_p)) - return -EINVAL; -#else +#ifndef CONFIG_GACT_PROB + if (tb[TCA_GACT_PROB] != NULL) return -EOPNOTSUPP; +#else + if (tb[TCA_GACT_PROB]) { + p_parm = nla_data(tb[TCA_GACT_PROB]); + if (p_parm->ptype >= MAX_RAND) + return -EINVAL; + } #endif - p = tcf_hash_check(parm->index, a, ovr, bind); - if (p == NULL) { - p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind); - if (p == NULL) - return -ENOMEM; + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind); + if (ret) + return ret; ret = ACT_P_CREATED; } else { - if (!ovr) { - tcf_hash_release(p, bind); + if (bind)/* dont override defaults */ + return 0; + tcf_hash_release(a, bind); + if (!ovr) return -EEXIST; - } } - spin_lock_bh(&p->lock); - p->action = parm->action; + gact = to_gact(a); + + spin_lock_bh(&gact->tcf_lock); + gact->tcf_action = parm->action; #ifdef CONFIG_GACT_PROB - if (tb[TCA_GACT_PROB-1] != NULL) { - struct tc_gact_p *p_parm = RTA_DATA(tb[TCA_GACT_PROB-1]); - p->paction = p_parm->paction; - p->pval = p_parm->pval; - p->ptype = p_parm->ptype; + if (p_parm) { + gact->tcfg_paction = p_parm->paction; + gact->tcfg_pval = p_parm->pval; + gact->tcfg_ptype = p_parm->ptype; } #endif - spin_unlock_bh(&p->lock); + spin_unlock_bh(&gact->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(p); + tcf_hash_insert(a); return ret; } -static int -tcf_gact_cleanup(struct tc_action *a, int bind) -{ - struct tcf_gact *p = PRIV(a, gact); - - if (p != NULL) - return tcf_hash_release(p, bind); - return 0; -} - -static int -tcf_gact(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) +static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { - struct tcf_gact *p 
= PRIV(a, gact); + struct tcf_gact *gact = a->priv; int action = TC_ACT_SHOT; - spin_lock(&p->lock); + spin_lock(&gact->tcf_lock); #ifdef CONFIG_GACT_PROB - if (p->ptype && gact_rand[p->ptype] != NULL) - action = gact_rand[p->ptype](p); + if (gact->tcfg_ptype) + action = gact_rand[gact->tcfg_ptype](gact); else - action = p->action; + action = gact->tcf_action; #else - action = p->action; + action = gact->tcf_action; #endif - p->bstats.bytes += skb->len; - p->bstats.packets++; + gact->tcf_bstats.bytes += qdisc_pkt_len(skb); + gact->tcf_bstats.packets++; if (action == TC_ACT_SHOT) - p->qstats.drops++; - p->tm.lastuse = jiffies; - spin_unlock(&p->lock); + gact->tcf_qstats.drops++; + gact->tcf_tm.lastuse = jiffies; + spin_unlock(&gact->tcf_lock); return action; } -static int -tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - unsigned char *b = skb->tail; - struct tc_gact opt; - struct tcf_gact *p = PRIV(a, gact); + unsigned char *b = skb_tail_pointer(skb); + struct tcf_gact *gact = a->priv; + struct tc_gact opt = { + .index = gact->tcf_index, + .refcnt = gact->tcf_refcnt - ref, + .bindcnt = gact->tcf_bindcnt - bind, + .action = gact->tcf_action, + }; struct tcf_t t; - opt.index = p->index; - opt.refcnt = p->refcnt - ref; - opt.bindcnt = p->bindcnt - bind; - opt.action = p->action; - RTA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt); + if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; #ifdef CONFIG_GACT_PROB - if (p->ptype) { - struct tc_gact_p p_opt; - p_opt.paction = p->paction; - p_opt.pval = p->pval; - p_opt.ptype = p->ptype; - RTA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt); + if (gact->tcfg_ptype) { + struct tc_gact_p p_opt = { + .paction = gact->tcfg_paction, + .pval = gact->tcfg_pval, + .ptype = gact->tcfg_ptype, + }; + + if (nla_put(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt)) + goto nla_put_failure; } #endif - t.install = jiffies_to_clock_t(jiffies - p->tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); - t.expires = jiffies_to_clock_t(p->tm.expires); - RTA_PUT(skb, TCA_GACT_TM, sizeof(t), &t); + t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(gact->tcf_tm.expires); + if (nla_put(skb, TCA_GACT_TM, sizeof(t), &t)) + goto nla_put_failure; return skb->len; - rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } static struct tc_action_ops act_gact_ops = { .kind = "gact", .type = TCA_ACT_GACT, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_gact, .dump = tcf_gact_dump, - .cleanup = tcf_gact_cleanup, - .lookup = tcf_hash_search, .init = tcf_gact_init, - .walk = tcf_generic_walker }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); MODULE_DESCRIPTION("Generic Classifier actions"); MODULE_LICENSE("GPL"); -static int __init -gact_init_module(void) +static int __init gact_init_module(void) { #ifdef CONFIG_GACT_PROB - printk("GACT probability on\n"); + pr_info("GACT probability on\n"); #else - printk("GACT probability NOT on\n"); + pr_info("GACT probability NOT on\n"); #endif - return tcf_register_action(&act_gact_ops); + return tcf_register_action(&act_gact_ops, GACT_TAB_MASK); } -static void __exit -gact_cleanup_module(void) +static void __exit gact_cleanup_module(void) { tcf_unregister_action(&act_gact_ops); } diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c 
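[Editor's note, not part of the patch: the act_gact.c hunks above also harden the probabilistic modes. The old gact_net_rand()/gact_determ() computed "random % p->pval" unconditionally, which is a modulo by zero (undefined behaviour) when pval is 0; the patched versions check the divisor first and fall back to the base action. A minimal userspace sketch of that decision, with rand() standing in for prandom_u32() and made-up names (gact_model, gact_pick) purely for illustration — this is not kernel code:

	#include <stdlib.h>

	struct gact_model {
		int action;		/* base action */
		int paction;		/* alternate action, taken roughly 1/pval of the time */
		unsigned int pval;	/* 0 means the probabilistic mode is off */
	};

	static int gact_pick(const struct gact_model *g)
	{
		/* Mirrors the patched gact_net_rand(): guard against pval == 0,
		 * otherwise take the alternate action only when the sample hits 0. */
		if (!g->pval || (unsigned int)rand() % g->pval)
			return g->action;
		return g->paction;
	}

The act_ipt.c hunks follow.]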
index 39a22a3ffe7..8a64a0734ae 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -1,5 +1,5 @@ /* - * net/sched/ipt.c iptables target interface + * net/sched/ipt.c iptables target interface * *TODO: Add other tables. For now we only support the ipv4 table targets * @@ -8,180 +8,156 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Copyright: Jamal Hadi Salim (2002-4) + * Copyright: Jamal Hadi Salim (2002-13) */ -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/bitops.h> -#include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/proc_fs.h> -#include <linux/kmod.h> -#include <net/sock.h> +#include <linux/slab.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_ipt.h> #include <net/tc_act/tc_ipt.h> #include <linux/netfilter_ipv4/ip_tables.h> -/* use generic hash table */ -#define MY_TAB_SIZE 16 -#define MY_TAB_MASK 15 -static u32 idx_gen; -static struct tcf_ipt *tcf_ipt_ht[MY_TAB_SIZE]; -/* ipt hash table lock */ -static DEFINE_RWLOCK(ipt_lock); +#define IPT_TAB_MASK 15 -/* ovewrride the defaults */ -#define tcf_st tcf_ipt -#define tcf_t_lock ipt_lock -#define tcf_ht tcf_ipt_ht - -#define CONFIG_NET_ACT_INIT -#include <net/pkt_act.h> - -static int -ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook) +static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook) { - struct ipt_target *target; + struct xt_tgchk_param par; + struct xt_target *target; int ret = 0; - target = xt_find_target(AF_INET, t->u.user.name, t->u.user.revision); - if (!target) - return -ENOENT; + target = xt_request_find_target(AF_INET, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) + return PTR_ERR(target); - DPRINTK("ipt_init_target: found %s\n", target->name); t->u.kernel.target = target; - - if (t->u.kernel.target->checkentry - && !t->u.kernel.target->checkentry(table, NULL, t->data, - t->u.target_size - sizeof(*t), - hook)) { - DPRINTK("ipt_init_target: check failed for `%s'.\n", - t->u.kernel.target->name); + par.table = table; + par.entryinfo = NULL; + par.target = target; + par.targinfo = t->data; + par.hook_mask = hook; + par.family = NFPROTO_IPV4; + + ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); + if (ret < 0) { module_put(t->u.kernel.target->me); - ret = -EINVAL; + return ret; } - - return ret; + return 0; } -static void -ipt_destroy_target(struct ipt_entry_target *t) +static void ipt_destroy_target(struct xt_entry_target *t) { - if (t->u.kernel.target->destroy) - t->u.kernel.target->destroy(t->data, - t->u.target_size - sizeof(*t)); - module_put(t->u.kernel.target->me); + struct xt_tgdtor_param par = { + .target = t->u.kernel.target, + .targinfo = t->data, + }; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); } -static int -tcf_ipt_release(struct tcf_ipt *p, int bind) +static void tcf_ipt_release(struct tc_action *a, int bind) { - int ret = 0; - if (p) { - if (bind) - p->bindcnt--; - p->refcnt--; - if (p->bindcnt <= 0 && p->refcnt <= 0) { - ipt_destroy_target(p->t); - 
kfree(p->tname); - kfree(p->t); - tcf_hash_destroy(p); - ret = ACT_P_DELETED; - } - } - return ret; + struct tcf_ipt *ipt = to_ipt(a); + ipt_destroy_target(ipt->tcfi_t); + kfree(ipt->tcfi_tname); + kfree(ipt->tcfi_t); } -static int -tcf_ipt_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a, - int ovr, int bind) +static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { + [TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ }, + [TCA_IPT_HOOK] = { .type = NLA_U32 }, + [TCA_IPT_INDEX] = { .type = NLA_U32 }, + [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) }, +}; + +static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) { - struct rtattr *tb[TCA_IPT_MAX]; - struct tcf_ipt *p; - struct ipt_entry_target *td, *t; + struct nlattr *tb[TCA_IPT_MAX + 1]; + struct tcf_ipt *ipt; + struct xt_entry_target *td, *t; char *tname; int ret = 0, err; u32 hook = 0; u32 index = 0; - if (rta == NULL || rtattr_parse_nested(tb, TCA_IPT_MAX, rta) < 0) + if (nla == NULL) return -EINVAL; - if (tb[TCA_IPT_HOOK-1] == NULL || - RTA_PAYLOAD(tb[TCA_IPT_HOOK-1]) < sizeof(u32)) + err = nla_parse_nested(tb, TCA_IPT_MAX, nla, ipt_policy); + if (err < 0) + return err; + + if (tb[TCA_IPT_HOOK] == NULL) return -EINVAL; - if (tb[TCA_IPT_TARG-1] == NULL || - RTA_PAYLOAD(tb[TCA_IPT_TARG-1]) < sizeof(*t)) + if (tb[TCA_IPT_TARG] == NULL) return -EINVAL; - td = (struct ipt_entry_target *)RTA_DATA(tb[TCA_IPT_TARG-1]); - if (RTA_PAYLOAD(tb[TCA_IPT_TARG-1]) < td->u.target_size) + + td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]); + if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) return -EINVAL; - if (tb[TCA_IPT_INDEX-1] != NULL && - RTA_PAYLOAD(tb[TCA_IPT_INDEX-1]) >= sizeof(u32)) - index = *(u32 *)RTA_DATA(tb[TCA_IPT_INDEX-1]); + if (tb[TCA_IPT_INDEX] != NULL) + index = nla_get_u32(tb[TCA_IPT_INDEX]); - p = tcf_hash_check(index, a, ovr, bind); - if (p == NULL) { - p = tcf_hash_create(index, est, a, sizeof(*p), ovr, bind); - if (p == NULL) - return -ENOMEM; + if (!tcf_hash_check(index, a, bind) ) { + ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind); + if (ret) + return ret; ret = ACT_P_CREATED; } else { - if (!ovr) { - tcf_ipt_release(p, bind); + if (bind)/* dont override defaults */ + return 0; + tcf_hash_release(a, bind); + + if (!ovr) return -EEXIST; - } } + ipt = to_ipt(a); - hook = *(u32 *)RTA_DATA(tb[TCA_IPT_HOOK-1]); + hook = nla_get_u32(tb[TCA_IPT_HOOK]); err = -ENOMEM; tname = kmalloc(IFNAMSIZ, GFP_KERNEL); - if (tname == NULL) + if (unlikely(!tname)) goto err1; - if (tb[TCA_IPT_TABLE - 1] == NULL || - rtattr_strlcpy(tname, tb[TCA_IPT_TABLE-1], IFNAMSIZ) >= IFNAMSIZ) + if (tb[TCA_IPT_TABLE] == NULL || + nla_strlcpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ) strcpy(tname, "mangle"); - t = kmalloc(td->u.target_size, GFP_KERNEL); - if (t == NULL) + t = kmemdup(td, td->u.target_size, GFP_KERNEL); + if (unlikely(!t)) goto err2; - memcpy(t, td, td->u.target_size); - if ((err = ipt_init_target(t, tname, hook)) < 0) + err = ipt_init_target(t, tname, hook); + if (err < 0) goto err3; - spin_lock_bh(&p->lock); + spin_lock_bh(&ipt->tcf_lock); if (ret != ACT_P_CREATED) { - ipt_destroy_target(p->t); - kfree(p->tname); - kfree(p->t); + ipt_destroy_target(ipt->tcfi_t); + kfree(ipt->tcfi_tname); + kfree(ipt->tcfi_t); } - p->tname = tname; - p->t = t; - p->hook = hook; - spin_unlock_bh(&p->lock); + ipt->tcfi_tname = tname; + ipt->tcfi_t = t; + ipt->tcfi_hook = hook; + spin_unlock_bh(&ipt->tcf_lock); if (ret == 
ACT_P_CREATED) - tcf_hash_insert(p); + tcf_hash_insert(a); return ret; err3: @@ -189,108 +165,96 @@ err3: err2: kfree(tname); err1: - kfree(p); + if (ret == ACT_P_CREATED) + tcf_hash_cleanup(a, est); return err; } -static int -tcf_ipt_cleanup(struct tc_action *a, int bind) -{ - struct tcf_ipt *p = PRIV(a, ipt); - return tcf_ipt_release(p, bind); -} - -static int -tcf_ipt(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) +static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { int ret = 0, result = 0; - struct tcf_ipt *p = PRIV(a, ipt); + struct tcf_ipt *ipt = a->priv; + struct xt_action_param par; - if (skb_cloned(skb)) { - if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - return TC_ACT_UNSPEC; - } + if (skb_unclone(skb, GFP_ATOMIC)) + return TC_ACT_UNSPEC; - spin_lock(&p->lock); + spin_lock(&ipt->tcf_lock); - p->tm.lastuse = jiffies; - p->bstats.bytes += skb->len; - p->bstats.packets++; + ipt->tcf_tm.lastuse = jiffies; + bstats_update(&ipt->tcf_bstats, skb); /* yes, we have to worry about both in and out dev - worry later - danger - this API seems to have changed - from earlier kernels */ - - /* iptables targets take a double skb pointer in case the skb - * needs to be replaced. We don't own the skb, so this must not - * happen. The pskb_expand_head above should make sure of this */ - ret = p->t->u.kernel.target->target(&skb, skb->dev, NULL, - p->hook, p->t->data, NULL); + * worry later - danger - this API seems to have changed + * from earlier kernels + */ + par.in = skb->dev; + par.out = NULL; + par.hooknum = ipt->tcfi_hook; + par.target = ipt->tcfi_t->u.kernel.target; + par.targinfo = ipt->tcfi_t->data; + ret = par.target->target(skb, &par); + switch (ret) { case NF_ACCEPT: result = TC_ACT_OK; break; case NF_DROP: result = TC_ACT_SHOT; - p->qstats.drops++; + ipt->tcf_qstats.drops++; break; - case IPT_CONTINUE: + case XT_CONTINUE: result = TC_ACT_PIPE; break; default: - if (net_ratelimit()) - printk("Bogus netfilter code %d assume ACCEPT\n", ret); + net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n", + ret); result = TC_POLICE_OK; break; } - spin_unlock(&p->lock); + spin_unlock(&ipt->tcf_lock); return result; } -static int -tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - struct ipt_entry_target *t; + unsigned char *b = skb_tail_pointer(skb); + struct tcf_ipt *ipt = a->priv; + struct xt_entry_target *t; struct tcf_t tm; struct tc_cnt c; - unsigned char *b = skb->tail; - struct tcf_ipt *p = PRIV(a, ipt); /* for simple targets kernel size == user size - ** user name = target name - ** for foolproof you need to not assume this - */ - - t = kmalloc(p->t->u.user.target_size, GFP_ATOMIC); - if (t == NULL) - goto rtattr_failure; - - c.bindcnt = p->bindcnt - bind; - c.refcnt = p->refcnt - ref; - memcpy(t, p->t, p->t->u.user.target_size); - strcpy(t->u.user.name, p->t->u.kernel.target->name); - - DPRINTK("\ttcf_ipt_dump tablename %s length %d\n", p->tname, - strlen(p->tname)); - DPRINTK("\tdump target name %s size %d size user %d " - "data[0] %x data[1] %x\n", p->t->u.kernel.target->name, - p->t->u.target_size, p->t->u.user.target_size, - p->t->data[0], p->t->data[1]); - RTA_PUT(skb, TCA_IPT_TARG, p->t->u.user.target_size, t); - RTA_PUT(skb, TCA_IPT_INDEX, 4, &p->index); - RTA_PUT(skb, TCA_IPT_HOOK, 4, &p->hook); - RTA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c); - RTA_PUT(skb, TCA_IPT_TABLE, IFNAMSIZ, 
p->tname); - tm.install = jiffies_to_clock_t(jiffies - p->tm.install); - tm.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); - tm.expires = jiffies_to_clock_t(p->tm.expires); - RTA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm); + * user name = target name + * for foolproof you need to not assume this + */ + + t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC); + if (unlikely(!t)) + goto nla_put_failure; + + c.bindcnt = ipt->tcf_bindcnt - bind; + c.refcnt = ipt->tcf_refcnt - ref; + strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name); + + if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) || + nla_put_u32(skb, TCA_IPT_INDEX, ipt->tcf_index) || + nla_put_u32(skb, TCA_IPT_HOOK, ipt->tcfi_hook) || + nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) || + nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname)) + goto nla_put_failure; + tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install); + tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse); + tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires); + if (nla_put(skb, TCA_IPT_TM, sizeof (tm), &tm)) + goto nla_put_failure; kfree(t); return skb->len; - rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); kfree(t); return -1; } @@ -298,29 +262,48 @@ tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) static struct tc_action_ops act_ipt_ops = { .kind = "ipt", .type = TCA_ACT_IPT, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_ipt, .dump = tcf_ipt_dump, - .cleanup = tcf_ipt_cleanup, - .lookup = tcf_hash_search, + .cleanup = tcf_ipt_release, + .init = tcf_ipt_init, +}; + +static struct tc_action_ops act_xt_ops = { + .kind = "xt", + .type = TCA_ACT_XT, + .owner = THIS_MODULE, + .act = tcf_ipt, + .dump = tcf_ipt_dump, + .cleanup = tcf_ipt_release, .init = tcf_ipt_init, - .walk = tcf_generic_walker }; -MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); +MODULE_AUTHOR("Jamal Hadi Salim(2002-13)"); MODULE_DESCRIPTION("Iptables target actions"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("act_xt"); -static int __init -ipt_init_module(void) +static int __init ipt_init_module(void) { - return tcf_register_action(&act_ipt_ops); + int ret1, ret2; + + ret1 = tcf_register_action(&act_xt_ops, IPT_TAB_MASK); + if (ret1 < 0) + printk("Failed to load xt action\n"); + ret2 = tcf_register_action(&act_ipt_ops, IPT_TAB_MASK); + if (ret2 < 0) + printk("Failed to load ipt action\n"); + + if (ret1 < 0 && ret2 < 0) { + return ret1; + } else + return 0; } -static void __exit -ipt_cleanup_module(void) +static void __exit ipt_cleanup_module(void) { + tcf_unregister_action(&act_xt_ops); tcf_unregister_action(&act_ipt_ops); } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 4fcccbd5088..4f912c0e225 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -12,263 +12,253 @@ * */ -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/bitops.h> -#include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/proc_fs.h> -#include <net/sock.h> +#include <linux/gfp.h> +#include <net/net_namespace.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #include 
<linux/tc_act/tc_mirred.h> #include <net/tc_act/tc_mirred.h> -#include <linux/etherdevice.h> #include <linux/if_arp.h> +#define MIRRED_TAB_MASK 7 +static LIST_HEAD(mirred_list); -/* use generic hash table */ -#define MY_TAB_SIZE 8 -#define MY_TAB_MASK (MY_TAB_SIZE - 1) -static u32 idx_gen; -static struct tcf_mirred *tcf_mirred_ht[MY_TAB_SIZE]; -static DEFINE_RWLOCK(mirred_lock); - -/* ovewrride the defaults */ -#define tcf_st tcf_mirred -#define tc_st tc_mirred -#define tcf_t_lock mirred_lock -#define tcf_ht tcf_mirred_ht - -#define CONFIG_NET_ACT_INIT 1 -#include <net/pkt_act.h> - -static inline int -tcf_mirred_release(struct tcf_mirred *p, int bind) +static void tcf_mirred_release(struct tc_action *a, int bind) { - if (p) { - if (bind) - p->bindcnt--; - p->refcnt--; - if(!p->bindcnt && p->refcnt <= 0) { - dev_put(p->dev); - tcf_hash_destroy(p); - return 1; - } - } - return 0; + struct tcf_mirred *m = to_mirred(a); + list_del(&m->tcfm_list); + if (m->tcfm_dev) + dev_put(m->tcfm_dev); } -static int -tcf_mirred_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a, - int ovr, int bind) +static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { + [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, +}; + +static int tcf_mirred_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, int ovr, + int bind) { - struct rtattr *tb[TCA_MIRRED_MAX]; + struct nlattr *tb[TCA_MIRRED_MAX + 1]; struct tc_mirred *parm; - struct tcf_mirred *p; - struct net_device *dev = NULL; - int ret = 0; - int ok_push = 0; + struct tcf_mirred *m; + struct net_device *dev; + int ret, ok_push = 0; - if (rta == NULL || rtattr_parse_nested(tb, TCA_MIRRED_MAX, rta) < 0) + if (nla == NULL) return -EINVAL; - - if (tb[TCA_MIRRED_PARMS-1] == NULL || - RTA_PAYLOAD(tb[TCA_MIRRED_PARMS-1]) < sizeof(*parm)) + ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy); + if (ret < 0) + return ret; + if (tb[TCA_MIRRED_PARMS] == NULL) return -EINVAL; - parm = RTA_DATA(tb[TCA_MIRRED_PARMS-1]); - + parm = nla_data(tb[TCA_MIRRED_PARMS]); + switch (parm->eaction) { + case TCA_EGRESS_MIRROR: + case TCA_EGRESS_REDIR: + break; + default: + return -EINVAL; + } if (parm->ifindex) { - dev = __dev_get_by_index(parm->ifindex); + dev = __dev_get_by_index(net, parm->ifindex); if (dev == NULL) return -ENODEV; switch (dev->type) { - case ARPHRD_TUNNEL: - case ARPHRD_TUNNEL6: - case ARPHRD_SIT: - case ARPHRD_IPGRE: - case ARPHRD_VOID: - case ARPHRD_NONE: - ok_push = 0; - break; - default: - ok_push = 1; - break; + case ARPHRD_TUNNEL: + case ARPHRD_TUNNEL6: + case ARPHRD_SIT: + case ARPHRD_IPGRE: + case ARPHRD_VOID: + case ARPHRD_NONE: + ok_push = 0; + break; + default: + ok_push = 1; + break; } + } else { + dev = NULL; } - p = tcf_hash_check(parm->index, a, ovr, bind); - if (p == NULL) { - if (!parm->ifindex) + if (!tcf_hash_check(parm->index, a, bind)) { + if (dev == NULL) return -EINVAL; - p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind); - if (p == NULL) - return -ENOMEM; + ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind); + if (ret) + return ret; ret = ACT_P_CREATED; } else { if (!ovr) { - tcf_mirred_release(p, bind); + tcf_hash_release(a, bind); return -EEXIST; } } + m = to_mirred(a); - spin_lock_bh(&p->lock); - p->action = parm->action; - p->eaction = parm->eaction; - if (parm->ifindex) { - p->ifindex = parm->ifindex; + spin_lock_bh(&m->tcf_lock); + m->tcf_action = parm->action; + m->tcfm_eaction = parm->eaction; + if (dev != NULL) { + m->tcfm_ifindex = 
parm->ifindex; if (ret != ACT_P_CREATED) - dev_put(p->dev); - p->dev = dev; + dev_put(m->tcfm_dev); dev_hold(dev); - p->ok_push = ok_push; + m->tcfm_dev = dev; + m->tcfm_ok_push = ok_push; + } + spin_unlock_bh(&m->tcf_lock); + if (ret == ACT_P_CREATED) { + list_add(&m->tcfm_list, &mirred_list); + tcf_hash_insert(a); } - spin_unlock_bh(&p->lock); - if (ret == ACT_P_CREATED) - tcf_hash_insert(p); - DPRINTK("tcf_mirred_init index %d action %d eaction %d device %s " - "ifindex %d\n", parm->index, parm->action, parm->eaction, - dev->name, parm->ifindex); return ret; } -static int -tcf_mirred_cleanup(struct tc_action *a, int bind) +static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { - struct tcf_mirred *p = PRIV(a, mirred); - - if (p != NULL) - return tcf_mirred_release(p, bind); - return 0; -} - -static int -tcf_mirred(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) -{ - struct tcf_mirred *p = PRIV(a, mirred); + struct tcf_mirred *m = a->priv; struct net_device *dev; - struct sk_buff *skb2 = NULL; - u32 at = G_TC_AT(skb->tc_verd); - - spin_lock(&p->lock); - - dev = p->dev; - p->tm.lastuse = jiffies; - - if (!(dev->flags&IFF_UP) ) { - if (net_ratelimit()) - printk("mirred to Houston: device %s is gone!\n", - dev->name); -bad_mirred: - if (skb2 != NULL) - kfree_skb(skb2); - p->qstats.overlimits++; - p->bstats.bytes += skb->len; - p->bstats.packets++; - spin_unlock(&p->lock); - /* should we be asking for packet to be dropped? - * may make sense for redirect case only - */ - return TC_ACT_SHOT; + struct sk_buff *skb2; + u32 at; + int retval, err = 1; + + spin_lock(&m->tcf_lock); + m->tcf_tm.lastuse = jiffies; + bstats_update(&m->tcf_bstats, skb); + + dev = m->tcfm_dev; + if (!dev) { + printk_once(KERN_NOTICE "tc mirred: target device is gone\n"); + goto out; } - skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2 == NULL) - goto bad_mirred; - if (p->eaction != TCA_EGRESS_MIRROR && p->eaction != TCA_EGRESS_REDIR) { - if (net_ratelimit()) - printk("tcf_mirred unknown action %d\n", p->eaction); - goto bad_mirred; + if (!(dev->flags & IFF_UP)) { + net_notice_ratelimited("tc mirred to Houston: device %s is down\n", + dev->name); + goto out; } - p->bstats.bytes += skb2->len; - p->bstats.packets++; - if (!(at & AT_EGRESS)) - if (p->ok_push) + at = G_TC_AT(skb->tc_verd); + skb2 = skb_act_clone(skb, GFP_ATOMIC, m->tcf_action); + if (skb2 == NULL) + goto out; + + if (!(at & AT_EGRESS)) { + if (m->tcfm_ok_push) skb_push(skb2, skb2->dev->hard_header_len); + } /* mirror is always swallowed */ - if (p->eaction != TCA_EGRESS_MIRROR) + if (m->tcfm_eaction != TCA_EGRESS_MIRROR) skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at); + skb2->skb_iif = skb->dev->ifindex; skb2->dev = dev; - skb2->input_dev = skb->dev; - dev_queue_xmit(skb2); - spin_unlock(&p->lock); - return p->action; + err = dev_queue_xmit(skb2); + +out: + if (err) { + m->tcf_qstats.overlimits++; + if (m->tcfm_eaction != TCA_EGRESS_MIRROR) + retval = TC_ACT_SHOT; + else + retval = m->tcf_action; + } else + retval = m->tcf_action; + spin_unlock(&m->tcf_lock); + + return retval; } -static int -tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - unsigned char *b = skb->tail; - struct tc_mirred opt; - struct tcf_mirred *p = PRIV(a, mirred); + unsigned char *b = skb_tail_pointer(skb); + struct tcf_mirred *m = a->priv; + struct tc_mirred opt = { + .index = m->tcf_index, + .action = 
m->tcf_action, + .refcnt = m->tcf_refcnt - ref, + .bindcnt = m->tcf_bindcnt - bind, + .eaction = m->tcfm_eaction, + .ifindex = m->tcfm_ifindex, + }; struct tcf_t t; - opt.index = p->index; - opt.action = p->action; - opt.refcnt = p->refcnt - ref; - opt.bindcnt = p->bindcnt - bind; - opt.eaction = p->eaction; - opt.ifindex = p->ifindex; - DPRINTK("tcf_mirred_dump index %d action %d eaction %d ifindex %d\n", - p->index, p->action, p->eaction, p->ifindex); - RTA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt); - t.install = jiffies_to_clock_t(jiffies - p->tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); - t.expires = jiffies_to_clock_t(p->tm.expires); - RTA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t); + if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(m->tcf_tm.expires); + if (nla_put(skb, TCA_MIRRED_TM, sizeof(t), &t)) + goto nla_put_failure; return skb->len; - rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } +static int mirred_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct tcf_mirred *m; + + if (event == NETDEV_UNREGISTER) + list_for_each_entry(m, &mirred_list, tcfm_list) { + if (m->tcfm_dev == dev) { + dev_put(dev); + m->tcfm_dev = NULL; + } + } + + return NOTIFY_DONE; +} + +static struct notifier_block mirred_device_notifier = { + .notifier_call = mirred_device_event, +}; + static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .type = TCA_ACT_MIRRED, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_mirred, .dump = tcf_mirred_dump, - .cleanup = tcf_mirred_cleanup, - .lookup = tcf_hash_search, + .cleanup = tcf_mirred_release, .init = tcf_mirred_init, - .walk = tcf_generic_walker }; MODULE_AUTHOR("Jamal Hadi Salim(2002)"); MODULE_DESCRIPTION("Device Mirror/redirect actions"); MODULE_LICENSE("GPL"); -static int __init -mirred_init_module(void) +static int __init mirred_init_module(void) { - printk("Mirror/redirect action on\n"); - return tcf_register_action(&act_mirred_ops); + int err = register_netdevice_notifier(&mirred_device_notifier); + if (err) + return err; + + pr_info("Mirror/redirect action on\n"); + return tcf_register_action(&act_mirred_ops, MIRRED_TAB_MASK); } -static void __exit -mirred_cleanup_module(void) +static void __exit mirred_cleanup_module(void) { tcf_unregister_action(&act_mirred_ops); + unregister_netdevice_notifier(&mirred_device_notifier); } module_init(mirred_init_module); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c new file mode 100644 index 00000000000..270a030d5fd --- /dev/null +++ b/net/sched/act_nat.c @@ -0,0 +1,306 @@ +/* + * Stateless NAT actions + * + * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ */ + +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/tc_act/tc_nat.h> +#include <net/act_api.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/netlink.h> +#include <net/tc_act/tc_nat.h> +#include <net/tcp.h> +#include <net/udp.h> + + +#define NAT_TAB_MASK 15 + +static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { + [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) }, +}; + +static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_NAT_MAX + 1]; + struct tc_nat *parm; + int ret = 0, err; + struct tcf_nat *p; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_NAT_MAX, nla, nat_policy); + if (err < 0) + return err; + + if (tb[TCA_NAT_PARMS] == NULL) + return -EINVAL; + parm = nla_data(tb[TCA_NAT_PARMS]); + + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + if (ret) + return ret; + ret = ACT_P_CREATED; + } else { + if (bind) + return 0; + tcf_hash_release(a, bind); + if (!ovr) + return -EEXIST; + } + p = to_tcf_nat(a); + + spin_lock_bh(&p->tcf_lock); + p->old_addr = parm->old_addr; + p->new_addr = parm->new_addr; + p->mask = parm->mask; + p->flags = parm->flags; + + p->tcf_action = parm->action; + spin_unlock_bh(&p->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(a); + + return ret; +} + +static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_nat *p = a->priv; + struct iphdr *iph; + __be32 old_addr; + __be32 new_addr; + __be32 mask; + __be32 addr; + int egress; + int action; + int ihl; + int noff; + + spin_lock(&p->tcf_lock); + + p->tcf_tm.lastuse = jiffies; + old_addr = p->old_addr; + new_addr = p->new_addr; + mask = p->mask; + egress = p->flags & TCA_NAT_FLAG_EGRESS; + action = p->tcf_action; + + bstats_update(&p->tcf_bstats, skb); + + spin_unlock(&p->tcf_lock); + + if (unlikely(action == TC_ACT_SHOT)) + goto drop; + + noff = skb_network_offset(skb); + if (!pskb_may_pull(skb, sizeof(*iph) + noff)) + goto drop; + + iph = ip_hdr(skb); + + if (egress) + addr = iph->saddr; + else + addr = iph->daddr; + + if (!((old_addr ^ addr) & mask)) { + if (skb_cloned(skb) && + !skb_clone_writable(skb, sizeof(*iph) + noff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto drop; + + new_addr &= mask; + new_addr |= addr & ~mask; + + /* Rewrite IP header */ + iph = ip_hdr(skb); + if (egress) + iph->saddr = new_addr; + else + iph->daddr = new_addr; + + csum_replace4(&iph->check, addr, new_addr); + } else if ((iph->frag_off & htons(IP_OFFSET)) || + iph->protocol != IPPROTO_ICMP) { + goto out; + } + + ihl = iph->ihl * 4; + + /* It would be nice to share code with stateful NAT. */ + switch (iph->frag_off & htons(IP_OFFSET) ? 
0 : iph->protocol) { + case IPPROTO_TCP: + { + struct tcphdr *tcph; + + if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) || + (skb_cloned(skb) && + !skb_clone_writable(skb, ihl + sizeof(*tcph) + noff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto drop; + + tcph = (void *)(skb_network_header(skb) + ihl); + inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, 1); + break; + } + case IPPROTO_UDP: + { + struct udphdr *udph; + + if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) || + (skb_cloned(skb) && + !skb_clone_writable(skb, ihl + sizeof(*udph) + noff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto drop; + + udph = (void *)(skb_network_header(skb) + ihl); + if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace4(&udph->check, skb, addr, + new_addr, 1); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + break; + } + case IPPROTO_ICMP: + { + struct icmphdr *icmph; + + if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + noff)) + goto drop; + + icmph = (void *)(skb_network_header(skb) + ihl); + + if ((icmph->type != ICMP_DEST_UNREACH) && + (icmph->type != ICMP_TIME_EXCEEDED) && + (icmph->type != ICMP_PARAMETERPROB)) + break; + + if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) + + noff)) + goto drop; + + icmph = (void *)(skb_network_header(skb) + ihl); + iph = (void *)(icmph + 1); + if (egress) + addr = iph->daddr; + else + addr = iph->saddr; + + if ((old_addr ^ addr) & mask) + break; + + if (skb_cloned(skb) && + !skb_clone_writable(skb, ihl + sizeof(*icmph) + + sizeof(*iph) + noff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto drop; + + icmph = (void *)(skb_network_header(skb) + ihl); + iph = (void *)(icmph + 1); + + new_addr &= mask; + new_addr |= addr & ~mask; + + /* XXX Fix up the inner checksums. 
*/ + if (egress) + iph->daddr = new_addr; + else + iph->saddr = new_addr; + + inet_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr, + 0); + break; + } + default: + break; + } + +out: + return action; + +drop: + spin_lock(&p->tcf_lock); + p->tcf_qstats.drops++; + spin_unlock(&p->tcf_lock); + return TC_ACT_SHOT; +} + +static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_nat *p = a->priv; + struct tc_nat opt = { + .old_addr = p->old_addr, + .new_addr = p->new_addr, + .mask = p->mask, + .flags = p->flags, + + .index = p->tcf_index, + .action = p->tcf_action, + .refcnt = p->tcf_refcnt - ref, + .bindcnt = p->tcf_bindcnt - bind, + }; + struct tcf_t t; + + if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + if (nla_put(skb, TCA_NAT_TM, sizeof(t), &t)) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_nat_ops = { + .kind = "nat", + .type = TCA_ACT_NAT, + .owner = THIS_MODULE, + .act = tcf_nat, + .dump = tcf_nat_dump, + .init = tcf_nat_init, +}; + +MODULE_DESCRIPTION("Stateless NAT actions"); +MODULE_LICENSE("GPL"); + +static int __init nat_init_module(void) +{ + return tcf_register_action(&act_nat_ops, NAT_TAB_MASK); +} + +static void __exit nat_cleanup_module(void) +{ + tcf_unregister_action(&act_nat_ops); +} + +module_init(nat_init_module); +module_exit(nat_cleanup_module); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 1742a68e012..5f9bcb2e080 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -9,277 +9,231 @@ * Authors: Jamal Hadi Salim (2002-4) */ -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/bitops.h> -#include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/proc_fs.h> -#include <net/sock.h> +#include <linux/slab.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_pedit.h> #include <net/tc_act/tc_pedit.h> +#define PEDIT_TAB_MASK 15 -#define PEDIT_DEB 1 - -/* use generic hash table */ -#define MY_TAB_SIZE 16 -#define MY_TAB_MASK 15 -static u32 idx_gen; -static struct tcf_pedit *tcf_pedit_ht[MY_TAB_SIZE]; -static DEFINE_RWLOCK(pedit_lock); - -#define tcf_st tcf_pedit -#define tc_st tc_pedit -#define tcf_t_lock pedit_lock -#define tcf_ht tcf_pedit_ht - -#define CONFIG_NET_ACT_INIT 1 -#include <net/pkt_act.h> +static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { + [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, +}; -static int -tcf_pedit_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a, - int ovr, int bind) +static int tcf_pedit_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { - struct rtattr *tb[TCA_PEDIT_MAX]; + struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; - int ret = 0; + int ret = 0, err; struct tcf_pedit *p; struct tc_pedit_key *keys = 
NULL; int ksize; - if (rta == NULL || rtattr_parse_nested(tb, TCA_PEDIT_MAX, rta) < 0) + if (nla == NULL) return -EINVAL; - if (tb[TCA_PEDIT_PARMS - 1] == NULL || - RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm)) + err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy); + if (err < 0) + return err; + + if (tb[TCA_PEDIT_PARMS] == NULL) return -EINVAL; - parm = RTA_DATA(tb[TCA_PEDIT_PARMS-1]); + parm = nla_data(tb[TCA_PEDIT_PARMS]); ksize = parm->nkeys * sizeof(struct tc_pedit_key); - if (RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm) + ksize) + if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize) return -EINVAL; - p = tcf_hash_check(parm->index, a, ovr, bind); - if (p == NULL) { + if (!tcf_hash_check(parm->index, a, bind)) { if (!parm->nkeys) return -EINVAL; - p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind); - if (p == NULL) - return -ENOMEM; + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + if (ret) + return ret; + p = to_pedit(a); keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) { - kfree(p); + tcf_hash_cleanup(a, est); return -ENOMEM; } ret = ACT_P_CREATED; } else { - if (!ovr) { - tcf_hash_release(p, bind); + p = to_pedit(a); + tcf_hash_release(a, bind); + if (bind) + return 0; + if (!ovr) return -EEXIST; - } - if (p->nkeys && p->nkeys != parm->nkeys) { + + if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) return -ENOMEM; } } - spin_lock_bh(&p->lock); - p->flags = parm->flags; - p->action = parm->action; + spin_lock_bh(&p->tcf_lock); + p->tcfp_flags = parm->flags; + p->tcf_action = parm->action; if (keys) { - kfree(p->keys); - p->keys = keys; - p->nkeys = parm->nkeys; + kfree(p->tcfp_keys); + p->tcfp_keys = keys; + p->tcfp_nkeys = parm->nkeys; } - memcpy(p->keys, parm->keys, ksize); - spin_unlock_bh(&p->lock); + memcpy(p->tcfp_keys, parm->keys, ksize); + spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(p); + tcf_hash_insert(a); return ret; } -static int -tcf_pedit_cleanup(struct tc_action *a, int bind) +static void tcf_pedit_cleanup(struct tc_action *a, int bind) { - struct tcf_pedit *p = PRIV(a, pedit); - - if (p != NULL) { - struct tc_pedit_key *keys = p->keys; - if (tcf_hash_release(p, bind)) { - kfree(keys); - return 1; - } - } - return 0; + struct tcf_pedit *p = a->priv; + struct tc_pedit_key *keys = p->tcfp_keys; + kfree(keys); } -static int -tcf_pedit(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) +static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { - struct tcf_pedit *p = PRIV(a, pedit); + struct tcf_pedit *p = a->priv; int i, munged = 0; - u8 *pptr; + unsigned int off; - if (!(skb->tc_verd & TC_OK2MUNGE)) { - /* should we set skb->cloned? 
*/ - if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { - return p->action; - } - } + if (skb_unclone(skb, GFP_ATOMIC)) + return p->tcf_action; - pptr = skb->nh.raw; + off = skb_network_offset(skb); - spin_lock(&p->lock); + spin_lock(&p->tcf_lock); - p->tm.lastuse = jiffies; + p->tcf_tm.lastuse = jiffies; - if (p->nkeys > 0) { - struct tc_pedit_key *tkey = p->keys; + if (p->tcfp_nkeys > 0) { + struct tc_pedit_key *tkey = p->tcfp_keys; - for (i = p->nkeys; i > 0; i--, tkey++) { - u32 *ptr; + for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { + u32 *ptr, _data; int offset = tkey->off; if (tkey->offmask) { - if (skb->len > tkey->at) { - char *j = pptr + tkey->at; - offset += ((*j & tkey->offmask) >> - tkey->shift); - } else { + char *d, _d; + + d = skb_header_pointer(skb, off + tkey->at, 1, + &_d); + if (!d) goto bad; - } + offset += (*d & tkey->offmask) >> tkey->shift; } if (offset % 4) { - printk("offset must be on 32 bit boundaries\n"); + pr_info("tc filter pedit" + " offset must be on 32 bit boundaries\n"); goto bad; } - if (skb->len < 0 || (offset > 0 && offset > skb->len)) { - printk("offset %d cant exceed pkt length %d\n", + if (offset > 0 && offset > skb->len) { + pr_info("tc filter pedit" + " offset %d can't exceed pkt length %d\n", offset, skb->len); goto bad; } - ptr = (u32 *)(pptr+offset); + ptr = skb_header_pointer(skb, off + offset, 4, &_data); + if (!ptr) + goto bad; /* just do it, baby */ *ptr = ((*ptr & tkey->mask) ^ tkey->val); + if (ptr == &_data) + skb_store_bits(skb, off + offset, ptr, 4); munged++; } - + if (munged) skb->tc_verd = SET_TC_MUNGED(skb->tc_verd); goto done; - } else { - printk("pedit BUG: index %d\n",p->index); - } + } else + WARN(1, "pedit BUG: index %d\n", p->tcf_index); bad: - p->qstats.overlimits++; + p->tcf_qstats.overlimits++; done: - p->bstats.bytes += skb->len; - p->bstats.packets++; - spin_unlock(&p->lock); - return p->action; + bstats_update(&p->tcf_bstats, skb); + spin_unlock(&p->tcf_lock); + return p->tcf_action; } -static int -tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,int bind, int ref) +static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); + struct tcf_pedit *p = a->priv; struct tc_pedit *opt; - struct tcf_pedit *p = PRIV(a, pedit); struct tcf_t t; - int s; - - s = sizeof(*opt) + p->nkeys * sizeof(struct tc_pedit_key); + int s; + + s = sizeof(*opt) + p->tcfp_nkeys * sizeof(struct tc_pedit_key); /* netlink spinlocks held above us - must use ATOMIC */ - opt = kmalloc(s, GFP_ATOMIC); - if (opt == NULL) + opt = kzalloc(s, GFP_ATOMIC); + if (unlikely(!opt)) return -ENOBUFS; - memset(opt, 0, s); - - memcpy(opt->keys, p->keys, p->nkeys * sizeof(struct tc_pedit_key)); - opt->index = p->index; - opt->nkeys = p->nkeys; - opt->flags = p->flags; - opt->action = p->action; - opt->refcnt = p->refcnt - ref; - opt->bindcnt = p->bindcnt - bind; - - -#ifdef PEDIT_DEB - { - /* Debug - get rid of later */ - int i; - struct tc_pedit_key *key = opt->keys; - - for (i=0; i<opt->nkeys; i++, key++) { - printk( "\n key #%d",i); - printk( " at %d: val %08x mask %08x", - (unsigned int)key->off, - (unsigned int)key->val, - (unsigned int)key->mask); - } - } -#endif - RTA_PUT(skb, TCA_PEDIT_PARMS, s, opt); - t.install = jiffies_to_clock_t(jiffies - p->tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); - t.expires = jiffies_to_clock_t(p->tm.expires); - RTA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t); + memcpy(opt->keys, p->tcfp_keys, + 
p->tcfp_nkeys * sizeof(struct tc_pedit_key)); + opt->index = p->tcf_index; + opt->nkeys = p->tcfp_nkeys; + opt->flags = p->tcfp_flags; + opt->action = p->tcf_action; + opt->refcnt = p->tcf_refcnt - ref; + opt->bindcnt = p->tcf_bindcnt - bind; + + if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) + goto nla_put_failure; + t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + if (nla_put(skb, TCA_PEDIT_TM, sizeof(t), &t)) + goto nla_put_failure; kfree(opt); return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); kfree(opt); return -1; } -static -struct tc_action_ops act_pedit_ops = { +static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .type = TCA_ACT_PEDIT, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_pedit, .dump = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup, - .lookup = tcf_hash_search, .init = tcf_pedit_init, - .walk = tcf_generic_walker }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); MODULE_DESCRIPTION("Generic Packet Editor actions"); MODULE_LICENSE("GPL"); -static int __init -pedit_init_module(void) +static int __init pedit_init_module(void) { - return tcf_register_action(&act_pedit_ops); + return tcf_register_action(&act_pedit_ops, PEDIT_TAB_MASK); } -static void __exit -pedit_cleanup_module(void) +static void __exit pedit_cleanup_module(void) { tcf_unregister_action(&act_pedit_ops); } diff --git a/net/sched/act_police.c b/net/sched/act_police.c index fa877f8f652..0566e4606a4 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -10,368 +10,335 @@ * J Hadi Salim (action changes) */ -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> -#include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/init.h> -#include <net/sock.h> +#include <linux/slab.h> #include <net/act_api.h> - -#define L2T(p,L) ((p)->R_tab->data[(L)>>(p)->R_tab->rate.cell_log]) -#define L2T_P(p,L) ((p)->P_tab->data[(L)>>(p)->P_tab->rate.cell_log]) -#define PRIV(a) ((struct tcf_police *) (a)->priv) - -/* use generic hash table */ -#define MY_TAB_SIZE 16 -#define MY_TAB_MASK 15 -static u32 idx_gen; -static struct tcf_police *tcf_police_ht[MY_TAB_SIZE]; -/* Policer hash table lock */ -static DEFINE_RWLOCK(police_lock); +#include <net/netlink.h> + +struct tcf_police { + struct tcf_common common; + int tcfp_result; + u32 tcfp_ewma_rate; + s64 tcfp_burst; + u32 tcfp_mtu; + s64 tcfp_toks; + s64 tcfp_ptoks; + s64 tcfp_mtu_ptoks; + s64 tcfp_t_c; + struct psched_ratecfg rate; + bool rate_present; + struct psched_ratecfg peak; + bool peak_present; +}; +#define to_police(pc) \ + container_of(pc, struct tcf_police, common) + +#define POL_TAB_MASK 15 + +/* old policer structure from before tc actions */ +struct tc_police_compat { + u32 index; + int action; + u32 limit; + u32 burst; + u32 mtu; + struct tc_ratespec rate; + struct tc_ratespec peakrate; +}; /* Each policer is serialized by its individual spinlock */ -static __inline__ unsigned tcf_police_hash(u32 index) -{ - return index&0xF; -} - -static __inline__ struct tcf_police * 
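The pedit fast path above reduces each key to a masked xor on a 32-bit word at a validated, 4-byte-aligned offset. A minimal standalone sketch of that step (plain C99, illustrative names only; it mirrors the `*ptr = (*ptr & mask) ^ val` operation on the raw in-memory word, not the kernel's skb helpers):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct edit_key {
	uint32_t off;	/* byte offset into the buffer, must be 4-aligned */
	uint32_t val;	/* bits to xor in after masking */
	uint32_t mask;	/* bits to keep from the original word */
};

static int apply_edits(uint8_t *pkt, size_t len,
		       const struct edit_key *keys, size_t nkeys)
{
	for (size_t i = 0; i < nkeys; i++) {
		uint32_t word;

		/* mirror the kernel's alignment and bounds checks */
		if (keys[i].off % 4 || (size_t)keys[i].off + 4 > len)
			return -1;
		memcpy(&word, pkt + keys[i].off, 4);	/* unaligned-safe load */
		word = (word & keys[i].mask) ^ keys[i].val;
		memcpy(pkt + keys[i].off, &word, 4);	/* store the edited word */
	}
	return 0;
}

int main(void)
{
	uint8_t pkt[8] = { 0x45, 0x00, 0x00, 0x54, 0x11, 0x22, 0x33, 0x44 };
	/* replace the whole word at offset 4: mask keeps nothing, val is xored in */
	struct edit_key k = { .off = 4, .val = 0xdeadbeef, .mask = 0x0 };

	apply_edits(pkt, sizeof(pkt), &k, 1);
	for (int i = 0; i < 8; i++)
		printf("%02x ", pkt[i]);
	printf("\n");
	return 0;
}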
tcf_police_lookup(u32 index) +static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb, + int type, struct tc_action *a) { - struct tcf_police *p; - - read_lock(&police_lock); - for (p = tcf_police_ht[tcf_police_hash(index)]; p; p = p->next) { - if (p->index == index) - break; - } - read_unlock(&police_lock); - return p; -} - -#ifdef CONFIG_NET_CLS_ACT -static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, - int type, struct tc_action *a) -{ - struct tcf_police *p; + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct hlist_head *head; + struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; - struct rtattr *r; + struct nlattr *nest; - read_lock(&police_lock); + spin_lock_bh(&hinfo->lock); s_i = cb->args[0]; - for (i = 0; i < MY_TAB_SIZE; i++) { - p = tcf_police_ht[tcf_police_hash(i)]; + for (i = 0; i < (POL_TAB_MASK + 1); i++) { + head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)]; - for (; p; p = p->next) { + hlist_for_each_entry_rcu(p, head, tcfc_head) { index++; if (index < s_i) continue; a->priv = p; a->order = index; - r = (struct rtattr*) skb->tail; - RTA_PUT(skb, a->order, 0, NULL); + nest = nla_nest_start(skb, a->order); + if (nest == NULL) + goto nla_put_failure; if (type == RTM_DELACTION) err = tcf_action_dump_1(skb, a, 0, 1); else err = tcf_action_dump_1(skb, a, 0, 0); if (err < 0) { index--; - skb_trim(skb, (u8*)r - skb->data); + nla_nest_cancel(skb, nest); goto done; } - r->rta_len = skb->tail - (u8*)r; + nla_nest_end(skb, nest); n_i++; } } done: - read_unlock(&police_lock); + spin_unlock_bh(&hinfo->lock); if (n_i) cb->args[0] += n_i; return n_i; -rtattr_failure: - skb_trim(skb, (u8*)r - skb->data); +nla_put_failure: + nla_nest_cancel(skb, nest); goto done; } -static inline int -tcf_hash_search(struct tc_action *a, u32 index) -{ - struct tcf_police *p = tcf_police_lookup(index); - - if (p != NULL) { - a->priv = p; - return 1; - } else { - return 0; - } -} -#endif - -static inline u32 tcf_police_new_index(void) -{ - do { - if (++idx_gen == 0) - idx_gen = 1; - } while (tcf_police_lookup(idx_gen)); - - return idx_gen; -} - -void tcf_police_destroy(struct tcf_police *p) -{ - unsigned h = tcf_police_hash(p->index); - struct tcf_police **p1p; - - for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->next) { - if (*p1p == p) { - write_lock_bh(&police_lock); - *p1p = p->next; - write_unlock_bh(&police_lock); -#ifdef CONFIG_NET_ESTIMATOR - gen_kill_estimator(&p->bstats, &p->rate_est); -#endif - if (p->R_tab) - qdisc_put_rtab(p->R_tab); - if (p->P_tab) - qdisc_put_rtab(p->P_tab); - kfree(p); - return; - } - } - BUG_TRAP(0); -} +static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { + [TCA_POLICE_RATE] = { .len = TC_RTAB_SIZE }, + [TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE }, + [TCA_POLICE_AVRATE] = { .type = NLA_U32 }, + [TCA_POLICE_RESULT] = { .type = NLA_U32 }, +}; -#ifdef CONFIG_NET_CLS_ACT -static int tcf_act_police_locate(struct rtattr *rta, struct rtattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_act_police_locate(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { - unsigned h; + unsigned int h; int ret = 0, err; - struct rtattr *tb[TCA_POLICE_MAX]; + struct nlattr *tb[TCA_POLICE_MAX + 1]; struct tc_police *parm; - struct tcf_police *p; + struct tcf_police *police; struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; + struct tcf_hashinfo *hinfo = a->ops->hinfo; + int size; - if (rta == NULL || rtattr_parse_nested(tb, 
TCA_POLICE_MAX, rta) < 0) + if (nla == NULL) return -EINVAL; - if (tb[TCA_POLICE_TBF-1] == NULL || - RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]) != sizeof(*parm)) - return -EINVAL; - parm = RTA_DATA(tb[TCA_POLICE_TBF-1]); + err = nla_parse_nested(tb, TCA_POLICE_MAX, nla, police_policy); + if (err < 0) + return err; - if (tb[TCA_POLICE_RESULT-1] != NULL && - RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32)) + if (tb[TCA_POLICE_TBF] == NULL) return -EINVAL; - if (tb[TCA_POLICE_RESULT-1] != NULL && - RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32)) + size = nla_len(tb[TCA_POLICE_TBF]); + if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat)) return -EINVAL; - - if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) { - a->priv = p; - if (bind) { - p->bindcnt += 1; - p->refcnt += 1; + parm = nla_data(tb[TCA_POLICE_TBF]); + + if (parm->index) { + if (tcf_hash_search(a, parm->index)) { + police = to_police(a->priv); + if (bind) { + police->tcf_bindcnt += 1; + police->tcf_refcnt += 1; + return 0; + } + if (ovr) + goto override; + /* not replacing */ + return -EEXIST; } - if (ovr) - goto override; - return ret; } - p = kmalloc(sizeof(*p), GFP_KERNEL); - if (p == NULL) + police = kzalloc(sizeof(*police), GFP_KERNEL); + if (police == NULL) return -ENOMEM; - memset(p, 0, sizeof(*p)); - ret = ACT_P_CREATED; - p->refcnt = 1; - spin_lock_init(&p->lock); - p->stats_lock = &p->lock; + police->tcf_refcnt = 1; + spin_lock_init(&police->tcf_lock); if (bind) - p->bindcnt = 1; + police->tcf_bindcnt = 1; override: if (parm->rate.rate) { err = -ENOMEM; - R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1]); + R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE]); if (R_tab == NULL) goto failure; + if (parm->peakrate.rate) { P_tab = qdisc_get_rtab(&parm->peakrate, - tb[TCA_POLICE_PEAKRATE-1]); - if (p->P_tab == NULL) { - qdisc_put_rtab(R_tab); + tb[TCA_POLICE_PEAKRATE]); + if (P_tab == NULL) goto failure; - } } } + + spin_lock_bh(&police->tcf_lock); + if (est) { + err = gen_replace_estimator(&police->tcf_bstats, + &police->tcf_rate_est, + &police->tcf_lock, est); + if (err) + goto failure_unlock; + } else if (tb[TCA_POLICE_AVRATE] && + (ret == ACT_P_CREATED || + !gen_estimator_active(&police->tcf_bstats, + &police->tcf_rate_est))) { + err = -EINVAL; + goto failure_unlock; + } + /* No failure allowed after this point */ - spin_lock_bh(&p->lock); - if (R_tab != NULL) { - qdisc_put_rtab(p->R_tab); - p->R_tab = R_tab; + police->tcfp_mtu = parm->mtu; + if (police->tcfp_mtu == 0) { + police->tcfp_mtu = ~0; + if (R_tab) + police->tcfp_mtu = 255 << R_tab->rate.cell_log; } - if (P_tab != NULL) { - qdisc_put_rtab(p->P_tab); - p->P_tab = P_tab; + if (R_tab) { + police->rate_present = true; + psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0); + qdisc_put_rtab(R_tab); + } else { + police->rate_present = false; + } + if (P_tab) { + police->peak_present = true; + psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0); + qdisc_put_rtab(P_tab); + } else { + police->peak_present = false; } - if (tb[TCA_POLICE_RESULT-1]) - p->result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]); - p->toks = p->burst = parm->burst; - p->mtu = parm->mtu; - if (p->mtu == 0) { - p->mtu = ~0; - if (p->R_tab) - p->mtu = 255<<p->R_tab->rate.cell_log; + if (tb[TCA_POLICE_RESULT]) + police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); + police->tcfp_burst = PSCHED_TICKS2NS(parm->burst); + police->tcfp_toks = police->tcfp_burst; + if (police->peak_present) { + police->tcfp_mtu_ptoks = (s64) 
psched_l2t_ns(&police->peak, + police->tcfp_mtu); + police->tcfp_ptoks = police->tcfp_mtu_ptoks; } - if (p->P_tab) - p->ptoks = L2T_P(p, p->mtu); - p->action = parm->action; - -#ifdef CONFIG_NET_ESTIMATOR - if (tb[TCA_POLICE_AVRATE-1]) - p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]); - if (est) - gen_replace_estimator(&p->bstats, &p->rate_est, p->stats_lock, est); -#endif - - spin_unlock_bh(&p->lock); + police->tcf_action = parm->action; + + if (tb[TCA_POLICE_AVRATE]) + police->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); + + spin_unlock_bh(&police->tcf_lock); if (ret != ACT_P_CREATED) return ret; - PSCHED_GET_TIME(p->t_c); - p->index = parm->index ? : tcf_police_new_index(); - h = tcf_police_hash(p->index); - write_lock_bh(&police_lock); - p->next = tcf_police_ht[h]; - tcf_police_ht[h] = p; - write_unlock_bh(&police_lock); + police->tcfp_t_c = ktime_to_ns(ktime_get()); + police->tcf_index = parm->index ? parm->index : + tcf_hash_new_index(hinfo); + h = tcf_hash(police->tcf_index, POL_TAB_MASK); + spin_lock_bh(&hinfo->lock); + hlist_add_head(&police->tcf_head, &hinfo->htab[h]); + spin_unlock_bh(&hinfo->lock); - a->priv = p; + a->priv = police; return ret; +failure_unlock: + spin_unlock_bh(&police->tcf_lock); failure: + qdisc_put_rtab(P_tab); + qdisc_put_rtab(R_tab); if (ret == ACT_P_CREATED) - kfree(p); + kfree(police); return err; } -static int tcf_act_police_cleanup(struct tc_action *a, int bind) +static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { - struct tcf_police *p = PRIV(a); - - if (p != NULL) - return tcf_police_release(p, bind); - return 0; -} - -static int tcf_act_police(struct sk_buff *skb, struct tc_action *a, - struct tcf_result *res) -{ - psched_time_t now; - struct tcf_police *p = PRIV(a); - long toks; - long ptoks = 0; - - spin_lock(&p->lock); - - p->bstats.bytes += skb->len; - p->bstats.packets++; - -#ifdef CONFIG_NET_ESTIMATOR - if (p->ewma_rate && p->rate_est.bps >= p->ewma_rate) { - p->qstats.overlimits++; - spin_unlock(&p->lock); - return p->action; + struct tcf_police *police = a->priv; + s64 now; + s64 toks; + s64 ptoks = 0; + + spin_lock(&police->tcf_lock); + + bstats_update(&police->tcf_bstats, skb); + + if (police->tcfp_ewma_rate && + police->tcf_rate_est.bps >= police->tcfp_ewma_rate) { + police->tcf_qstats.overlimits++; + if (police->tcf_action == TC_ACT_SHOT) + police->tcf_qstats.drops++; + spin_unlock(&police->tcf_lock); + return police->tcf_action; } -#endif - if (skb->len <= p->mtu) { - if (p->R_tab == NULL) { - spin_unlock(&p->lock); - return p->result; + if (qdisc_pkt_len(skb) <= police->tcfp_mtu) { + if (!police->rate_present) { + spin_unlock(&police->tcf_lock); + return police->tcfp_result; } - PSCHED_GET_TIME(now); - - toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst); - - if (p->P_tab) { - ptoks = toks + p->ptoks; - if (ptoks > (long)L2T_P(p, p->mtu)) - ptoks = (long)L2T_P(p, p->mtu); - ptoks -= L2T_P(p, skb->len); + now = ktime_to_ns(ktime_get()); + toks = min_t(s64, now - police->tcfp_t_c, + police->tcfp_burst); + if (police->peak_present) { + ptoks = toks + police->tcfp_ptoks; + if (ptoks > police->tcfp_mtu_ptoks) + ptoks = police->tcfp_mtu_ptoks; + ptoks -= (s64) psched_l2t_ns(&police->peak, + qdisc_pkt_len(skb)); } - toks += p->toks; - if (toks > (long)p->burst) - toks = p->burst; - toks -= L2T(p, skb->len); - + toks += police->tcfp_toks; + if (toks > police->tcfp_burst) + toks = police->tcfp_burst; + toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb)); if ((toks|ptoks) 
>= 0) { - p->t_c = now; - p->toks = toks; - p->ptoks = ptoks; - spin_unlock(&p->lock); - return p->result; + police->tcfp_t_c = now; + police->tcfp_toks = toks; + police->tcfp_ptoks = ptoks; + spin_unlock(&police->tcf_lock); + return police->tcfp_result; } } - p->qstats.overlimits++; - spin_unlock(&p->lock); - return p->action; + police->tcf_qstats.overlimits++; + if (police->tcf_action == TC_ACT_SHOT) + police->tcf_qstats.drops++; + spin_unlock(&police->tcf_lock); + return police->tcf_action; } static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - unsigned char *b = skb->tail; - struct tc_police opt; - struct tcf_police *p = PRIV(a); - - opt.index = p->index; - opt.action = p->action; - opt.mtu = p->mtu; - opt.burst = p->burst; - opt.refcnt = p->refcnt - ref; - opt.bindcnt = p->bindcnt - bind; - if (p->R_tab) - opt.rate = p->R_tab->rate; - else - memset(&opt.rate, 0, sizeof(opt.rate)); - if (p->P_tab) - opt.peakrate = p->P_tab->rate; - else - memset(&opt.peakrate, 0, sizeof(opt.peakrate)); - RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); - if (p->result) - RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result); -#ifdef CONFIG_NET_ESTIMATOR - if (p->ewma_rate) - RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate); -#endif + unsigned char *b = skb_tail_pointer(skb); + struct tcf_police *police = a->priv; + struct tc_police opt = { + .index = police->tcf_index, + .action = police->tcf_action, + .mtu = police->tcfp_mtu, + .burst = PSCHED_NS2TICKS(police->tcfp_burst), + .refcnt = police->tcf_refcnt - ref, + .bindcnt = police->tcf_bindcnt - bind, + }; + + if (police->rate_present) + psched_ratecfg_getrate(&opt.rate, &police->rate); + if (police->peak_present) + psched_ratecfg_getrate(&opt.peakrate, &police->peak); + if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) + goto nla_put_failure; + if (police->tcfp_result && + nla_put_u32(skb, TCA_POLICE_RESULT, police->tcfp_result)) + goto nla_put_failure; + if (police->tcfp_ewma_rate && + nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate)) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } @@ -382,20 +349,17 @@ MODULE_LICENSE("GPL"); static struct tc_action_ops act_police_ops = { .kind = "police", .type = TCA_ID_POLICE, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_act_police, .dump = tcf_act_police_dump, - .cleanup = tcf_act_police_cleanup, - .lookup = tcf_hash_search, .init = tcf_act_police_locate, - .walk = tcf_generic_walker + .walk = tcf_act_police_walker }; static int __init police_init_module(void) { - return tcf_register_action(&act_police_ops); + return tcf_register_action(&act_police_ops, POL_TAB_MASK); } static void __exit @@ -406,199 +370,3 @@ police_cleanup_module(void) module_init(police_init_module); module_exit(police_cleanup_module); - -#else /* CONFIG_NET_CLS_ACT */ - -struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est) -{ - unsigned h; - struct tcf_police *p; - struct rtattr *tb[TCA_POLICE_MAX]; - struct tc_police *parm; - - if (rtattr_parse_nested(tb, TCA_POLICE_MAX, rta) < 0) - return NULL; - - if (tb[TCA_POLICE_TBF-1] == NULL || - RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]) != sizeof(*parm)) - return NULL; - - parm = RTA_DATA(tb[TCA_POLICE_TBF-1]); - - if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) { - p->refcnt++; - return p; - } - - p = kmalloc(sizeof(*p), GFP_KERNEL); - if (p == NULL) - return NULL; - - memset(p, 0, sizeof(*p)); - 
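The conform/exceed decision in the rewritten policer is a token bucket kept in nanoseconds of line time. Below is a self-contained userspace sketch of the same arithmetic under simplifying assumptions: single rate only (peak-rate tokens omitted), CLOCK_MONOTONIC standing in for ktime, and invented names rather than the kernel's psched helpers.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct policer {
	int64_t burst_ns;	/* bucket depth, in ns of transmission time */
	int64_t toks;		/* current tokens, in ns */
	int64_t t_c;		/* timestamp of the last conforming packet */
	uint64_t rate_bps;	/* configured rate, bits per second */
};

static int64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

/* nanoseconds needed to transmit len bytes at rate_bps */
static int64_t len_to_ns(const struct policer *p, uint32_t len)
{
	return (int64_t)len * 8 * 1000000000LL / (int64_t)p->rate_bps;
}

/* returns 1 if the packet conforms, 0 if it exceeds the rate */
static int police(struct policer *p, uint32_t len)
{
	int64_t now = now_ns();
	int64_t toks = now - p->t_c;

	if (toks > p->burst_ns)		/* idle time refills only up to the burst */
		toks = p->burst_ns;
	toks += p->toks;
	if (toks > p->burst_ns)
		toks = p->burst_ns;
	toks -= len_to_ns(p, len);	/* charge this packet's transmission time */

	if (toks >= 0) {		/* enough credit: consume it and conform */
		p->t_c = now;
		p->toks = toks;
		return 1;
	}
	return 0;			/* over the rate: leave state untouched */
}

int main(void)
{
	struct policer p = {
		.burst_ns = 10 * 1000000LL,	/* 10 ms worth of line time */
		.rate_bps = 10 * 1000000ULL,	/* 10 Mbit/s */
	};

	p.t_c = now_ns();
	p.toks = p.burst_ns;
	for (int i = 0; i < 20; i++)
		printf("packet %2d (1500B): %s\n", i,
		       police(&p, 1500) ? "conform" : "exceed");
	return 0;
}

Accounting directly in nanoseconds is what allows the rewritten action to precompute a rate configuration and drop the per-packet rate-table lookups the removed code relied on.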
p->refcnt = 1; - spin_lock_init(&p->lock); - p->stats_lock = &p->lock; - if (parm->rate.rate) { - p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1]); - if (p->R_tab == NULL) - goto failure; - if (parm->peakrate.rate) { - p->P_tab = qdisc_get_rtab(&parm->peakrate, - tb[TCA_POLICE_PEAKRATE-1]); - if (p->P_tab == NULL) - goto failure; - } - } - if (tb[TCA_POLICE_RESULT-1]) { - if (RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32)) - goto failure; - p->result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]); - } -#ifdef CONFIG_NET_ESTIMATOR - if (tb[TCA_POLICE_AVRATE-1]) { - if (RTA_PAYLOAD(tb[TCA_POLICE_AVRATE-1]) != sizeof(u32)) - goto failure; - p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]); - } -#endif - p->toks = p->burst = parm->burst; - p->mtu = parm->mtu; - if (p->mtu == 0) { - p->mtu = ~0; - if (p->R_tab) - p->mtu = 255<<p->R_tab->rate.cell_log; - } - if (p->P_tab) - p->ptoks = L2T_P(p, p->mtu); - PSCHED_GET_TIME(p->t_c); - p->index = parm->index ? : tcf_police_new_index(); - p->action = parm->action; -#ifdef CONFIG_NET_ESTIMATOR - if (est) - gen_new_estimator(&p->bstats, &p->rate_est, p->stats_lock, est); -#endif - h = tcf_police_hash(p->index); - write_lock_bh(&police_lock); - p->next = tcf_police_ht[h]; - tcf_police_ht[h] = p; - write_unlock_bh(&police_lock); - return p; - -failure: - if (p->R_tab) - qdisc_put_rtab(p->R_tab); - kfree(p); - return NULL; -} - -int tcf_police(struct sk_buff *skb, struct tcf_police *p) -{ - psched_time_t now; - long toks; - long ptoks = 0; - - spin_lock(&p->lock); - - p->bstats.bytes += skb->len; - p->bstats.packets++; - -#ifdef CONFIG_NET_ESTIMATOR - if (p->ewma_rate && p->rate_est.bps >= p->ewma_rate) { - p->qstats.overlimits++; - spin_unlock(&p->lock); - return p->action; - } -#endif - - if (skb->len <= p->mtu) { - if (p->R_tab == NULL) { - spin_unlock(&p->lock); - return p->result; - } - - PSCHED_GET_TIME(now); - - toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst); - - if (p->P_tab) { - ptoks = toks + p->ptoks; - if (ptoks > (long)L2T_P(p, p->mtu)) - ptoks = (long)L2T_P(p, p->mtu); - ptoks -= L2T_P(p, skb->len); - } - toks += p->toks; - if (toks > (long)p->burst) - toks = p->burst; - toks -= L2T(p, skb->len); - - if ((toks|ptoks) >= 0) { - p->t_c = now; - p->toks = toks; - p->ptoks = ptoks; - spin_unlock(&p->lock); - return p->result; - } - } - - p->qstats.overlimits++; - spin_unlock(&p->lock); - return p->action; -} -EXPORT_SYMBOL(tcf_police); - -int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p) -{ - unsigned char *b = skb->tail; - struct tc_police opt; - - opt.index = p->index; - opt.action = p->action; - opt.mtu = p->mtu; - opt.burst = p->burst; - if (p->R_tab) - opt.rate = p->R_tab->rate; - else - memset(&opt.rate, 0, sizeof(opt.rate)); - if (p->P_tab) - opt.peakrate = p->P_tab->rate; - else - memset(&opt.peakrate, 0, sizeof(opt.peakrate)); - RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); - if (p->result) - RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result); -#ifdef CONFIG_NET_ESTIMATOR - if (p->ewma_rate) - RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate); -#endif - return skb->len; - -rtattr_failure: - skb_trim(skb, b - skb->data); - return -1; -} - -int tcf_police_dump_stats(struct sk_buff *skb, struct tcf_police *p) -{ - struct gnet_dump d; - - if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, - TCA_XSTATS, p->stats_lock, &d) < 0) - goto errout; - - if (gnet_stats_copy_basic(&d, &p->bstats) < 0 || -#ifdef CONFIG_NET_ESTIMATOR - gnet_stats_copy_rate_est(&d, &p->rate_est) < 0 || -#endif - 
gnet_stats_copy_queue(&d, &p->qstats) < 0) - goto errout; - - if (gnet_stats_finish_copy(&d) < 0) - goto errout; - - return 0; - -errout: - return -1; -} - -#endif /* CONFIG_NET_CLS_ACT */ diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index e5f2e1f431e..992c2317ce8 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -6,69 +6,168 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Authors: Jamal Hadi Salim (2005) + * Authors: Jamal Hadi Salim (2005-8) * */ -#include <linux/config.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #define TCA_ACT_SIMP 22 -/* XXX: Hide all these common elements under some macro - * probably -*/ #include <linux/tc_act/tc_defact.h> #include <net/tc_act/tc_defact.h> -/* use generic hash table with 8 buckets */ -#define MY_TAB_SIZE 8 -#define MY_TAB_MASK (MY_TAB_SIZE - 1) -static u32 idx_gen; -static struct tcf_defact *tcf_simp_ht[MY_TAB_SIZE]; -static DEFINE_RWLOCK(simp_lock); +#define SIMP_TAB_MASK 7 -/* override the defaults */ -#define tcf_st tcf_defact -#define tc_st tc_defact -#define tcf_t_lock simp_lock -#define tcf_ht tcf_simp_ht +#define SIMP_MAX_DATA 32 +static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_defact *d = a->priv; + + spin_lock(&d->tcf_lock); + d->tcf_tm.lastuse = jiffies; + bstats_update(&d->tcf_bstats, skb); + + /* print policy string followed by _ then packet count + * Example if this was the 3rd packet and the string was "hello" + * then it would look like "hello_3" (without quotes) + */ + pr_info("simple: %s_%d\n", + (char *)d->tcfd_defdata, d->tcf_bstats.packets); + spin_unlock(&d->tcf_lock); + return d->tcf_action; +} + +static void tcf_simp_release(struct tc_action *a, int bind) +{ + struct tcf_defact *d = to_defact(a); + kfree(d->tcfd_defdata); +} + +static int alloc_defdata(struct tcf_defact *d, char *defdata) +{ + d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL); + if (unlikely(!d->tcfd_defdata)) + return -ENOMEM; + strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + return 0; +} + +static void reset_policy(struct tcf_defact *d, char *defdata, + struct tc_defact *p) +{ + spin_lock_bh(&d->tcf_lock); + d->tcf_action = p->action; + memset(d->tcfd_defdata, 0, SIMP_MAX_DATA); + strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + spin_unlock_bh(&d->tcf_lock); +} + +static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { + [TCA_DEF_PARMS] = { .len = sizeof(struct tc_defact) }, + [TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA }, +}; -#define CONFIG_NET_ACT_INIT 1 -#include <net/pkt_act.h> -#include <net/act_generic.h> +static int tcf_simp_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) +{ + struct nlattr *tb[TCA_DEF_MAX + 1]; + struct tc_defact *parm; + struct tcf_defact *d; + char *defdata; + int ret = 0, err; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_DEF_MAX, nla, simple_policy); + if (err < 0) + return err; + + if (tb[TCA_DEF_PARMS] == NULL) + return -EINVAL; + + if (tb[TCA_DEF_DATA] == NULL) + return -EINVAL; + + parm = nla_data(tb[TCA_DEF_PARMS]); + defdata = nla_data(tb[TCA_DEF_DATA]); + + if (!tcf_hash_check(parm->index, a, bind)) { + ret = 
tcf_hash_create(parm->index, est, a, sizeof(*d), bind); + if (ret) + return ret; + + d = to_defact(a); + ret = alloc_defdata(d, defdata); + if (ret < 0) { + tcf_hash_cleanup(a, est); + return ret; + } + d->tcf_action = parm->action; + ret = ACT_P_CREATED; + } else { + d = to_defact(a); -static int tcf_simp(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) + if (bind) + return 0; + tcf_hash_release(a, bind); + if (!ovr) + return -EEXIST; + + reset_policy(d, defdata, parm); + } + + if (ret == ACT_P_CREATED) + tcf_hash_insert(a); + return ret; +} + +static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) { - struct tcf_defact *p = PRIV(a, defact); - - spin_lock(&p->lock); - p->tm.lastuse = jiffies; - p->bstats.bytes += skb->len; - p->bstats.packets++; - - /* print policy string followed by _ then packet count - * Example if this was the 3rd packet and the string was "hello" - * then it would look like "hello_3" (without quotes) - **/ - printk("simple: %s_%d\n", (char *)p->defdata, p->bstats.packets); - spin_unlock(&p->lock); - return p->action; + unsigned char *b = skb_tail_pointer(skb); + struct tcf_defact *d = a->priv; + struct tc_defact opt = { + .index = d->tcf_index, + .refcnt = d->tcf_refcnt - ref, + .bindcnt = d->tcf_bindcnt - bind, + .action = d->tcf_action, + }; + struct tcf_t t; + + if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) || + nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata)) + goto nla_put_failure; + t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(d->tcf_tm.expires); + if (nla_put(skb, TCA_DEF_TM, sizeof(t), &t)) + goto nla_put_failure; + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; } static struct tc_action_ops act_simp_ops = { - .kind = "simple", - .type = TCA_ACT_SIMP, - .capab = TCA_CAP_NONE, - .owner = THIS_MODULE, - .act = tcf_simp, - tca_use_default_ops + .kind = "simple", + .type = TCA_ACT_SIMP, + .owner = THIS_MODULE, + .act = tcf_simp, + .dump = tcf_simp_dump, + .cleanup = tcf_simp_release, + .init = tcf_simp_init, }; MODULE_AUTHOR("Jamal Hadi Salim(2005)"); @@ -77,9 +176,10 @@ MODULE_LICENSE("GPL"); static int __init simp_init_module(void) { - int ret = tcf_register_action(&act_simp_ops); + int ret; + ret = tcf_register_action(&act_simp_ops, SIMP_TAB_MASK); if (!ret) - printk("Simple TC action Loaded\n"); + pr_info("Simple TC action Loaded\n"); return ret; } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c new file mode 100644 index 00000000000..fcfeeaf838b --- /dev/null +++ b/net/sched/act_skbedit.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2008, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, see <http://www.gnu.org/licenses/>. 
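The ->init() callbacks for pedit, simple and skbedit all share one create-or-reuse shape: look the action up by index, take a reference on bind, refuse a plain re-add, and reset parameters only on override. A loose sketch of that control flow over a fixed-size table (illustrative names; the kernel's tcf_hash_* helpers implement the same idea over a real hash table with locking):

#include <errno.h>
#include <stdio.h>

#define MAX_ACTIONS 16

struct action {
	unsigned int index;	/* 0 means "slot unused" */
	int refcnt;
	int bindcnt;
	int param;		/* stand-in for the action's configuration */
};

static struct action table[MAX_ACTIONS];

static struct action *lookup(unsigned int index)
{
	for (int i = 0; i < MAX_ACTIONS; i++)
		if (table[i].index == index)
			return &table[i];
	return NULL;
}

/* returns 1 on creation, 0 on reuse, negative errno on failure */
static int action_init(unsigned int index, int param, int ovr, int bind)
{
	struct action *a = lookup(index);

	if (!a) {				/* not found: create a new entry */
		a = lookup(0);
		if (!a)
			return -ENOMEM;
		a->index = index;
		a->refcnt = 1;
		a->bindcnt = bind ? 1 : 0;
		a->param = param;
		return 1;			/* like ACT_P_CREATED */
	}
	if (bind) {				/* a filter binds to the existing action */
		a->bindcnt++;
		a->refcnt++;
		return 0;
	}
	if (!ovr)				/* plain add of an existing index */
		return -EEXIST;
	a->param = param;			/* override: keep identity, reset config */
	return 0;
}

int main(void)
{
	printf("create #5: %d\n", action_init(5, 10, 0, 0));	/* 1 */
	printf("bind   #5: %d\n", action_init(5, 10, 0, 1));	/* 0 */
	printf("re-add #5: %d\n", action_init(5, 20, 0, 0));	/* -EEXIST */
	printf("ovr    #5: %d\n", action_init(5, 20, 1, 0));	/* 0 */
	return 0;
}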
+ * + * Author: Alexander Duyck <alexander.h.duyck@intel.com> + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> + +#include <linux/tc_act/tc_skbedit.h> +#include <net/tc_act/tc_skbedit.h> + +#define SKBEDIT_TAB_MASK 15 + +static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_skbedit *d = a->priv; + + spin_lock(&d->tcf_lock); + d->tcf_tm.lastuse = jiffies; + bstats_update(&d->tcf_bstats, skb); + + if (d->flags & SKBEDIT_F_PRIORITY) + skb->priority = d->priority; + if (d->flags & SKBEDIT_F_QUEUE_MAPPING && + skb->dev->real_num_tx_queues > d->queue_mapping) + skb_set_queue_mapping(skb, d->queue_mapping); + if (d->flags & SKBEDIT_F_MARK) + skb->mark = d->mark; + + spin_unlock(&d->tcf_lock); + return d->tcf_action; +} + +static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { + [TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) }, + [TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) }, + [TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) }, + [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, +}; + +static int tcf_skbedit_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) +{ + struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; + struct tc_skbedit *parm; + struct tcf_skbedit *d; + u32 flags = 0, *priority = NULL, *mark = NULL; + u16 *queue_mapping = NULL; + int ret = 0, err; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy); + if (err < 0) + return err; + + if (tb[TCA_SKBEDIT_PARMS] == NULL) + return -EINVAL; + + if (tb[TCA_SKBEDIT_PRIORITY] != NULL) { + flags |= SKBEDIT_F_PRIORITY; + priority = nla_data(tb[TCA_SKBEDIT_PRIORITY]); + } + + if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) { + flags |= SKBEDIT_F_QUEUE_MAPPING; + queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]); + } + + if (tb[TCA_SKBEDIT_MARK] != NULL) { + flags |= SKBEDIT_F_MARK; + mark = nla_data(tb[TCA_SKBEDIT_MARK]); + } + + if (!flags) + return -EINVAL; + + parm = nla_data(tb[TCA_SKBEDIT_PARMS]); + + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); + if (ret) + return ret; + + d = to_skbedit(a); + ret = ACT_P_CREATED; + } else { + d = to_skbedit(a); + if (bind) + return 0; + tcf_hash_release(a, bind); + if (!ovr) + return -EEXIST; + } + + spin_lock_bh(&d->tcf_lock); + + d->flags = flags; + if (flags & SKBEDIT_F_PRIORITY) + d->priority = *priority; + if (flags & SKBEDIT_F_QUEUE_MAPPING) + d->queue_mapping = *queue_mapping; + if (flags & SKBEDIT_F_MARK) + d->mark = *mark; + + d->tcf_action = parm->action; + + spin_unlock_bh(&d->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(a); + return ret; +} + +static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_skbedit *d = a->priv; + struct tc_skbedit opt = { + .index = d->tcf_index, + .refcnt = d->tcf_refcnt - ref, + .bindcnt = d->tcf_bindcnt - bind, + .action = d->tcf_action, + }; + struct tcf_t t; + + if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + if ((d->flags & SKBEDIT_F_PRIORITY) && + nla_put(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority), + &d->priority)) + goto nla_put_failure; + if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) && + nla_put(skb, TCA_SKBEDIT_QUEUE_MAPPING, + 
sizeof(d->queue_mapping), &d->queue_mapping)) + goto nla_put_failure; + if ((d->flags & SKBEDIT_F_MARK) && + nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark), + &d->mark)) + goto nla_put_failure; + t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(d->tcf_tm.expires); + if (nla_put(skb, TCA_SKBEDIT_TM, sizeof(t), &t)) + goto nla_put_failure; + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_skbedit_ops = { + .kind = "skbedit", + .type = TCA_ACT_SKBEDIT, + .owner = THIS_MODULE, + .act = tcf_skbedit, + .dump = tcf_skbedit_dump, + .init = tcf_skbedit_init, +}; + +MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>"); +MODULE_DESCRIPTION("SKB Editing"); +MODULE_LICENSE("GPL"); + +static int __init skbedit_init_module(void) +{ + return tcf_register_action(&act_skbedit_ops, SKBEDIT_TAB_MASK); +} + +static void __exit skbedit_cleanup_module(void) +{ + tcf_unregister_action(&act_skbedit_ops); +} + +module_init(skbedit_init_module); +module_exit(skbedit_cleanup_module); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index b4d89fbb378..45527e6b52d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -14,123 +14,110 @@ * */ -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> -#include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/kmod.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <net/net_namespace.h> #include <net/sock.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> -#if 0 /* control */ -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif - /* The list of all installed classifier types */ - -static struct tcf_proto_ops *tcf_proto_base; +static LIST_HEAD(tcf_proto_base); /* Protects list of registered TC modules. It is pure SMP lock. 
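Every ->init() above also begins by validating a nested block of type/length/value attributes against a policy that pins each attribute's expected size, rejecting anything malformed before touching state. The sketch below reproduces that validation step over a toy TLV layout; the attribute names, header format and policy table are assumptions, not the kernel's struct nlattr or nla_parse_nested().

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum { ATTR_UNSPEC, ATTR_PARMS, ATTR_MARK, ATTR_MAX = ATTR_MARK };

struct tlv {			/* toy attribute header: type, then len payload bytes */
	uint16_t type;
	uint16_t len;
};

static const uint16_t policy_len[ATTR_MAX + 1] = {
	[ATTR_PARMS] = 8,	/* fixed-size parameter struct */
	[ATTR_MARK]  = 4,	/* u32 */
};

/* fill tb[] with pointers to each attribute's payload, or fail */
static int parse_nested(const uint8_t *buf, size_t len,
			const uint8_t *tb[ATTR_MAX + 1])
{
	memset(tb, 0, sizeof(const uint8_t *) * (ATTR_MAX + 1));
	while (len >= sizeof(struct tlv)) {
		struct tlv h;

		memcpy(&h, buf, sizeof(h));
		if (sizeof(h) + h.len > len)
			return -EINVAL;			/* truncated attribute */
		if (h.type <= ATTR_MAX) {
			if (policy_len[h.type] && h.len != policy_len[h.type])
				return -EINVAL;		/* policy: wrong payload size */
			tb[h.type] = buf + sizeof(h);	/* remember the payload */
		}
		buf += sizeof(h) + h.len;		/* unknown types are skipped */
		len -= sizeof(h) + h.len;
	}
	return 0;
}

int main(void)
{
	uint8_t msg[sizeof(struct tlv) + 4];
	struct tlv h = { .type = ATTR_MARK, .len = 4 };
	uint32_t mark = 0x2a;
	const uint8_t *tb[ATTR_MAX + 1];

	memcpy(msg, &h, sizeof(h));
	memcpy(msg + sizeof(h), &mark, sizeof(mark));

	if (parse_nested(msg, sizeof(msg), tb) == 0 && tb[ATTR_MARK]) {
		uint32_t v;

		memcpy(&v, tb[ATTR_MARK], sizeof(v));
		printf("mark attribute present: %u\n", v);
	}
	if (!tb[ATTR_PARMS])
		printf("parms attribute missing -> would return -EINVAL\n");
	return 0;
}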
*/ static DEFINE_RWLOCK(cls_mod_lock); /* Find classifier type by string name */ -static struct tcf_proto_ops * tcf_proto_lookup_ops(struct rtattr *kind) +static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind) { - struct tcf_proto_ops *t = NULL; + const struct tcf_proto_ops *t, *res = NULL; if (kind) { read_lock(&cls_mod_lock); - for (t = tcf_proto_base; t; t = t->next) { - if (rtattr_strcmp(kind, t->kind) == 0) { - if (!try_module_get(t->owner)) - t = NULL; + list_for_each_entry(t, &tcf_proto_base, head) { + if (nla_strcmp(kind, t->kind) == 0) { + if (try_module_get(t->owner)) + res = t; break; } } read_unlock(&cls_mod_lock); } - return t; + return res; } /* Register(unregister) new classifier type */ int register_tcf_proto_ops(struct tcf_proto_ops *ops) { - struct tcf_proto_ops *t, **tp; + struct tcf_proto_ops *t; int rc = -EEXIST; write_lock(&cls_mod_lock); - for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next) + list_for_each_entry(t, &tcf_proto_base, head) if (!strcmp(ops->kind, t->kind)) goto out; - ops->next = NULL; - *tp = ops; + list_add_tail(&ops->head, &tcf_proto_base); rc = 0; out: write_unlock(&cls_mod_lock); return rc; } +EXPORT_SYMBOL(register_tcf_proto_ops); int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) { - struct tcf_proto_ops *t, **tp; + struct tcf_proto_ops *t; int rc = -ENOENT; write_lock(&cls_mod_lock); - for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) - if (t == ops) + list_for_each_entry(t, &tcf_proto_base, head) { + if (t == ops) { + list_del(&t->head); + rc = 0; break; - - if (!t) - goto out; - *tp = t->next; - rc = 0; -out: + } + } write_unlock(&cls_mod_lock); return rc; } +EXPORT_SYMBOL(unregister_tcf_proto_ops); -static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, - struct tcf_proto *tp, unsigned long fh, int event); +static int tfilter_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct tcf_proto *tp, + unsigned long fh, int event); /* Select new prio value from the range, managed by kernel. */ -static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp) +static inline u32 tcf_auto_prio(struct tcf_proto *tp) { - u32 first = TC_H_MAKE(0xC0000000U,0U); + u32 first = TC_H_MAKE(0xC0000000U, 0U); if (tp) - first = tp->prio-1; + first = tp->prio - 1; return first; } /* Add/change/delete/get a filter node */ -static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) { - struct rtattr **tca; + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; + spinlock_t *root_lock; struct tcmsg *t; u32 protocol; u32 prio; @@ -140,15 +127,23 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) struct Qdisc *q; struct tcf_proto **back, **chain; struct tcf_proto *tp; - struct tcf_proto_ops *tp_ops; - struct Qdisc_class_ops *cops; + const struct tcf_proto_ops *tp_ops; + const struct Qdisc_class_ops *cops; unsigned long cl; unsigned long fh; int err; + int tp_created = 0; + + if ((n->nlmsg_type != RTM_GETTFILTER) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; replay: - tca = arg; - t = NLMSG_DATA(n); + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL); + if (err < 0) + return err; + + t = nlmsg_data(n); protocol = TC_H_MIN(t->tcm_info); prio = TC_H_MAJ(t->tcm_info); nprio = prio; @@ -157,28 +152,37 @@ replay: if (prio == 0) { /* If no priority is given, user wants we allocated it. 
*/ - if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + if (n->nlmsg_type != RTM_NEWTFILTER || + !(n->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; - prio = TC_H_MAKE(0x80000000U,0U); + prio = TC_H_MAKE(0x80000000U, 0U); } /* Find head of filter chain. */ /* Find link */ - if ((dev = __dev_get_by_index(t->tcm_ifindex)) == NULL) + dev = __dev_get_by_index(net, t->tcm_ifindex); + if (dev == NULL) return -ENODEV; /* Find qdisc */ if (!parent) { - q = dev->qdisc_sleeping; + q = dev->qdisc; parent = q->handle; - } else if ((q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent))) == NULL) - return -EINVAL; + } else { + q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent)); + if (q == NULL) + return -EINVAL; + } /* Is it classful? */ - if ((cops = q->ops->cl_ops) == NULL) + cops = q->ops->cl_ops; + if (!cops) return -EINVAL; + if (cops->tcf_chain == NULL) + return -EOPNOTSUPP; + /* Do we search for filter, attached to class? */ if (TC_H_MIN(parent)) { cl = cops->get(q, parent); @@ -193,10 +197,11 @@ replay: goto errout; /* Check the chain for existence of proto-tcf with this priority */ - for (back = chain; (tp=*back) != NULL; back = &tp->next) { + for (back = chain; (tp = *back) != NULL; back = &tp->next) { if (tp->prio >= prio) { if (tp->prio == prio) { - if (!nprio || (tp->protocol != protocol && protocol)) + if (!nprio || + (tp->protocol != protocol && protocol)) goto errout; } else tp = NULL; @@ -204,31 +209,35 @@ replay: } } + root_lock = qdisc_root_sleeping_lock(q); + if (tp == NULL) { /* Proto-tcf does not exist, create new one */ - if (tca[TCA_KIND-1] == NULL || !protocol) + if (tca[TCA_KIND] == NULL || !protocol) goto errout; err = -ENOENT; - if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + if (n->nlmsg_type != RTM_NEWTFILTER || + !(n->nlmsg_flags & NLM_F_CREATE)) goto errout; /* Create new proto tcf */ err = -ENOBUFS; - if ((tp = kmalloc(sizeof(*tp), GFP_KERNEL)) == NULL) + tp = kzalloc(sizeof(*tp), GFP_KERNEL); + if (tp == NULL) goto errout; - err = -EINVAL; - tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND-1]); + err = -ENOENT; + tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]); if (tp_ops == NULL) { -#ifdef CONFIG_KMOD - struct rtattr *kind = tca[TCA_KIND-1]; +#ifdef CONFIG_MODULES + struct nlattr *kind = tca[TCA_KIND]; char name[IFNAMSIZ]; if (kind != NULL && - rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { + nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { rtnl_unlock(); request_module("cls_%s", name); rtnl_lock(); @@ -248,58 +257,60 @@ replay: kfree(tp); goto errout; } - memset(tp, 0, sizeof(*tp)); tp->ops = tp_ops; tp->protocol = protocol; - tp->prio = nprio ? : tcf_auto_prio(*back); + tp->prio = nprio ? 
: TC_H_MAJ(tcf_auto_prio(*back)); tp->q = q; tp->classify = tp_ops->classify; tp->classid = parent; - if ((err = tp_ops->init(tp)) != 0) { + + err = tp_ops->init(tp); + if (err != 0) { module_put(tp_ops->owner); kfree(tp); goto errout; } - qdisc_lock_tree(dev); - tp->next = *back; - *back = tp; - qdisc_unlock_tree(dev); + tp_created = 1; - } else if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], tp->ops->kind)) + } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) goto errout; fh = tp->ops->get(tp, t->tcm_handle); if (fh == 0) { if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { - qdisc_lock_tree(dev); + spin_lock_bh(root_lock); *back = tp->next; - qdisc_unlock_tree(dev); + spin_unlock_bh(root_lock); - tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER); + tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); tcf_destroy(tp); err = 0; goto errout; } err = -ENOENT; - if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + if (n->nlmsg_type != RTM_NEWTFILTER || + !(n->nlmsg_flags & NLM_F_CREATE)) goto errout; } else { switch (n->nlmsg_type) { - case RTM_NEWTFILTER: + case RTM_NEWTFILTER: err = -EEXIST; - if (n->nlmsg_flags&NLM_F_EXCL) + if (n->nlmsg_flags & NLM_F_EXCL) { + if (tp_created) + tcf_destroy(tp); goto errout; + } break; case RTM_DELTFILTER: err = tp->ops->delete(tp, fh); if (err == 0) - tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER); + tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); goto errout; case RTM_GETTFILTER: - err = tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); + err = tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); goto errout; default: err = -EINVAL; @@ -307,9 +318,20 @@ replay: } } - err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh); - if (err == 0) - tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); + err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, + n->nlmsg_flags & NLM_F_CREATE ? 
TCA_ACT_NOREPLACE : TCA_ACT_REPLACE); + if (err == 0) { + if (tp_created) { + spin_lock_bh(root_lock); + tp->next = *back; + *back = tp; + spin_unlock_bh(root_lock); + } + tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); + } else { + if (tp_created) + tcf_destroy(tp); + } errout: if (cl) @@ -320,96 +342,106 @@ errout: return err; } -static int -tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh, - u32 pid, u32 seq, u16 flags, int event) +static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, + unsigned long fh, u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); - nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); - tcm = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); + if (!nlh) + goto out_nlmsg_trim; + tcm = nlmsg_data(nlh); tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; - tcm->tcm__pad1 = 0; - tcm->tcm_ifindex = tp->q->dev->ifindex; + tcm->tcm__pad2 = 0; + tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex; tcm->tcm_parent = tp->classid; tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); - RTA_PUT(skb, TCA_KIND, IFNAMSIZ, tp->ops->kind); + if (nla_put_string(skb, TCA_KIND, tp->ops->kind)) + goto nla_put_failure; tcm->tcm_handle = fh; if (RTM_DELTFILTER != event) { tcm->tcm_handle = 0; - if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0) - goto rtattr_failure; + if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0) + goto nla_put_failure; } - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; -nlmsg_failure: -rtattr_failure: - skb_trim(skb, b - skb->data); +out_nlmsg_trim: +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, - struct tcf_proto *tp, unsigned long fh, int event) +static int tfilter_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct tcf_proto *tp, + unsigned long fh, int event) { struct sk_buff *skb; - u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + u32 portid = oskb ? 
NETLINK_CB(oskb).portid : 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) { + if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, 0, event) <= 0) { kfree_skb(skb); return -EINVAL; } - return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); } -struct tcf_dump_args -{ +struct tcf_dump_args { struct tcf_walker w; struct sk_buff *skb; struct netlink_callback *cb; }; -static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct tcf_walker *arg) +static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, + struct tcf_walker *arg) { - struct tcf_dump_args *a = (void*)arg; + struct tcf_dump_args *a = (void *)arg; + struct net *net = sock_net(a->skb->sk); - return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid, + return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); } +/* called with RTNL */ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); int t; int s_t; struct net_device *dev; struct Qdisc *q; struct tcf_proto *tp, **chain; - struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); + struct tcmsg *tcm = nlmsg_data(cb->nlh); unsigned long cl = 0; - struct Qdisc_class_ops *cops; + const struct Qdisc_class_ops *cops; struct tcf_dump_args arg; - if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; - if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) return skb->len; - read_lock_bh(&qdisc_tree_lock); if (!tcm->tcm_parent) - q = dev->qdisc_sleeping; + q = dev->qdisc; else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); if (!q) goto out; - if ((cops = q->ops->cl_ops) == NULL) + cops = q->ops->cl_ops; + if (!cops) + goto errout; + if (cops->tcf_chain == NULL) goto errout; if (TC_H_MIN(tcm->tcm_parent)) { cl = cops->get(q, tcm->tcm_parent); @@ -422,8 +454,9 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; - for (tp=*chain, t=0; tp; tp = tp->next, t++) { - if (t < s_t) continue; + for (tp = *chain, t = 0; tp; tp = tp->next, t++) { + if (t < s_t) + continue; if (TC_H_MAJ(tcm->tcm_info) && TC_H_MAJ(tcm->tcm_info) != tp->prio) continue; @@ -433,10 +466,11 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if (t > s_t) memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); if (cb->args[1] == 0) { - if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) { + if (tcf_fill_node(net, skb, tp, 0, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWTFILTER) <= 0) break; - } + cb->args[1] = 1; } if (tp->ops->walk == NULL) @@ -445,10 +479,10 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) arg.skb = skb; arg.cb = cb; arg.w.stop = 0; - arg.w.skip = cb->args[1]-1; + arg.w.skip = cb->args[1] - 1; arg.w.count = 0; tp->ops->walk(tp, &arg.w); - cb->args[1] = arg.w.count+1; + cb->args[1] = arg.w.count + 1; if (arg.w.stop) break; } @@ -459,185 +493,125 @@ errout: if (cl) cops->put(q, cl); out: - read_unlock_bh(&qdisc_tree_lock); - dev_put(dev); return skb->len; } -void -tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) +void 
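tc_dump_tfilter() resumes interrupted dumps: cb->args[] remembers how many entries earlier passes already delivered, and the walker skips that many before emitting more. A compact sketch of the same skip/count/stop protocol over a plain array (the names and the 3-entries-per-pass limit are invented for illustration):

#include <stdio.h>

struct walker {
	int skip;	/* entries already delivered in earlier passes */
	int count;	/* entries visited in this pass */
	int stop;	/* set when the receive buffer is "full" */
};

/* pretend the dump buffer only holds 3 entries per pass */
static int emit(int value, struct walker *w)
{
	if (w->count < w->skip) {	/* already sent in a previous pass */
		w->count++;
		return 0;
	}
	if (w->count - w->skip >= 3) {	/* buffer full: stop and resume later */
		w->stop = 1;
		return -1;
	}
	printf("  entry %d\n", value);
	w->count++;
	return 0;
}

static void walk(const int *items, int n, struct walker *w)
{
	for (int i = 0; i < n; i++)
		if (emit(items[i], w) < 0)
			return;
}

int main(void)
{
	int items[] = { 10, 11, 12, 13, 14, 15, 16 };
	int resumed_at = 0;		/* plays the role of cb->args[] */

	for (int pass = 1; ; pass++) {
		struct walker w = { .skip = resumed_at };

		printf("pass %d (skip %d):\n", pass, w.skip);
		walk(items, 7, &w);
		resumed_at = w.count;	/* remember progress for the next pass */
		if (!w.stop)
			break;		/* walked everything: dump is done */
	}
	return 0;
}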
tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - if (exts->action) { - tcf_action_destroy(exts->action, TCA_ACT_UNBIND); - exts->action = NULL; - } -#elif defined CONFIG_NET_CLS_POLICE - if (exts->police) { - tcf_police_release(exts->police, TCA_ACT_UNBIND); - exts->police = NULL; - } + tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND); + INIT_LIST_HEAD(&exts->actions); #endif } +EXPORT_SYMBOL(tcf_exts_destroy); - -int -tcf_exts_validate(struct tcf_proto *tp, struct rtattr **tb, - struct rtattr *rate_tlv, struct tcf_exts *exts, - struct tcf_ext_map *map) +int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, + struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr) { - memset(exts, 0, sizeof(*exts)); - #ifdef CONFIG_NET_CLS_ACT { - int err; struct tc_action *act; - if (map->police && tb[map->police-1]) { - act = tcf_action_init_1(tb[map->police-1], rate_tlv, "police", - TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err); - if (act == NULL) + INIT_LIST_HEAD(&exts->actions); + if (exts->police && tb[exts->police]) { + act = tcf_action_init_1(net, tb[exts->police], rate_tlv, + "police", ovr, + TCA_ACT_BIND); + if (IS_ERR(act)) + return PTR_ERR(act); + + act->type = exts->type = TCA_OLD_COMPAT; + list_add(&act->list, &exts->actions); + } else if (exts->action && tb[exts->action]) { + int err; + err = tcf_action_init(net, tb[exts->action], rate_tlv, + NULL, ovr, + TCA_ACT_BIND, &exts->actions); + if (err) return err; - - act->type = TCA_OLD_COMPAT; - exts->action = act; - } else if (map->action && tb[map->action-1]) { - act = tcf_action_init(tb[map->action-1], rate_tlv, NULL, - TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err); - if (act == NULL) - return err; - - exts->action = act; } } -#elif defined CONFIG_NET_CLS_POLICE - if (map->police && tb[map->police-1]) { - struct tcf_police *p; - - p = tcf_police_locate(tb[map->police-1], rate_tlv); - if (p == NULL) - return -EINVAL; - - exts->police = p; - } else if (map->action && tb[map->action-1]) - return -EOPNOTSUPP; #else - if ((map->action && tb[map->action-1]) || - (map->police && tb[map->police-1])) + if ((exts->action && tb[exts->action]) || + (exts->police && tb[exts->police])) return -EOPNOTSUPP; #endif return 0; } +EXPORT_SYMBOL(tcf_exts_validate); -void -tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, - struct tcf_exts *src) +void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, + struct tcf_exts *src) { #ifdef CONFIG_NET_CLS_ACT - if (src->action) { - struct tc_action *act; - tcf_tree_lock(tp); - act = xchg(&dst->action, src->action); - tcf_tree_unlock(tp); - if (act) - tcf_action_destroy(act, TCA_ACT_UNBIND); - } -#elif defined CONFIG_NET_CLS_POLICE - if (src->police) { - struct tcf_police *p; - tcf_tree_lock(tp); - p = xchg(&dst->police, src->police); - tcf_tree_unlock(tp); - if (p) - tcf_police_release(p, TCA_ACT_UNBIND); - } + LIST_HEAD(tmp); + tcf_tree_lock(tp); + list_splice_init(&dst->actions, &tmp); + list_splice(&src->actions, &dst->actions); + tcf_tree_unlock(tp); + tcf_action_destroy(&tmp, TCA_ACT_UNBIND); #endif } +EXPORT_SYMBOL(tcf_exts_change); -int -tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, - struct tcf_ext_map *map) +#define tcf_exts_first_act(ext) \ + list_first_entry(&(exts)->actions, struct tc_action, list) + +int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - if (map->action && exts->action) { + if (exts->action && !list_empty(&exts->actions)) { /* * again for backward compatible mode - 
we want * to work with both old and new modes of entering * tc data even if iproute2 was newer - jhs */ - struct rtattr * p_rta = (struct rtattr*) skb->tail; - - if (exts->action->type != TCA_OLD_COMPAT) { - RTA_PUT(skb, map->action, 0, NULL); - if (tcf_action_dump(skb, exts->action, 0, 0) < 0) - goto rtattr_failure; - p_rta->rta_len = skb->tail - (u8*)p_rta; - } else if (map->police) { - RTA_PUT(skb, map->police, 0, NULL); - if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0) - goto rtattr_failure; - p_rta->rta_len = skb->tail - (u8*)p_rta; + struct nlattr *nest; + if (exts->type != TCA_OLD_COMPAT) { + nest = nla_nest_start(skb, exts->action); + if (nest == NULL) + goto nla_put_failure; + if (tcf_action_dump(skb, &exts->actions, 0, 0) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); + } else if (exts->police) { + struct tc_action *act = tcf_exts_first_act(exts); + nest = nla_nest_start(skb, exts->police); + if (nest == NULL || !act) + goto nla_put_failure; + if (tcf_action_dump_old(skb, act, 0, 0) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); } } -#elif defined CONFIG_NET_CLS_POLICE - if (map->police && exts->police) { - struct rtattr * p_rta = (struct rtattr*) skb->tail; - - RTA_PUT(skb, map->police, 0, NULL); - - if (tcf_police_dump(skb, exts->police) < 0) - goto rtattr_failure; - - p_rta->rta_len = skb->tail - (u8*)p_rta; - } #endif return 0; -rtattr_failure: __attribute__ ((unused)) +nla_put_failure: __attribute__ ((unused)) return -1; } +EXPORT_SYMBOL(tcf_exts_dump); + -int -tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts, - struct tcf_ext_map *map) +int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - if (exts->action) - if (tcf_action_copy_stats(skb, exts->action, 1) < 0) - goto rtattr_failure; -#elif defined CONFIG_NET_CLS_POLICE - if (exts->police) - if (tcf_police_dump_stats(skb, exts->police) < 0) - goto rtattr_failure; + struct tc_action *a = tcf_exts_first_act(exts); + if (tcf_action_copy_stats(skb, a, 1) < 0) + return -1; #endif return 0; -rtattr_failure: __attribute__ ((unused)) - return -1; } +EXPORT_SYMBOL(tcf_exts_dump_stats); static int __init tc_filter_init(void) { - struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC]; + rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter, + tc_dump_tfilter, NULL); - /* Setup rtnetlink links. It is made here to avoid - exporting large number of public symbols. 
- */ - - if (link_p) { - link_p[RTM_NEWTFILTER-RTM_BASE].doit = tc_ctl_tfilter; - link_p[RTM_DELTFILTER-RTM_BASE].doit = tc_ctl_tfilter; - link_p[RTM_GETTFILTER-RTM_BASE].doit = tc_ctl_tfilter; - link_p[RTM_GETTFILTER-RTM_BASE].dumpit = tc_dump_tfilter; - } return 0; } subsys_initcall(tc_filter_init); - -EXPORT_SYMBOL(register_tcf_proto_ops); -EXPORT_SYMBOL(unregister_tcf_proto_ops); -EXPORT_SYMBOL(tcf_exts_validate); -EXPORT_SYMBOL(tcf_exts_destroy); -EXPORT_SYMBOL(tcf_exts_change); -EXPORT_SYMBOL(tcf_exts_dump); -EXPORT_SYMBOL(tcf_exts_dump_stats); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index dfb300bb6ba..0ae1813e3e9 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -9,27 +9,24 @@ * Authors: Thomas Graf <tgraf@suug.ch> */ -#include <linux/config.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> #include <linux/errno.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> +#include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> -struct basic_head -{ +struct basic_head { u32 hgenerator; struct list_head flist; }; -struct basic_filter -{ +struct basic_filter { u32 handle; struct tcf_exts exts; struct tcf_ematch_tree ematches; @@ -37,16 +34,11 @@ struct basic_filter struct list_head link; }; -static struct tcf_ext_map basic_ext_map = { - .action = TCA_BASIC_ACT, - .police = TCA_BASIC_POLICE -}; - -static int basic_classify(struct sk_buff *skb, struct tcf_proto *tp, +static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { int r; - struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct basic_filter *f; list_for_each_entry(f, &head->flist, link) { @@ -64,7 +56,7 @@ static int basic_classify(struct sk_buff *skb, struct tcf_proto *tp, static unsigned long basic_get(struct tcf_proto *tp, u32 handle) { unsigned long l = 0UL; - struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct basic_filter *f; if (head == NULL) @@ -83,11 +75,17 @@ static void basic_put(struct tcf_proto *tp, unsigned long f) static int basic_init(struct tcf_proto *tp) { + struct basic_head *head; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + INIT_LIST_HEAD(&head->flist); + tp->root = head; return 0; } -static inline void basic_delete_filter(struct tcf_proto *tp, - struct basic_filter *f) +static void basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f) { tcf_unbind_filter(tp, &f->res); tcf_exts_destroy(tp, &f->exts); @@ -97,18 +95,19 @@ static inline void basic_delete_filter(struct tcf_proto *tp, static void basic_destroy(struct tcf_proto *tp) { - struct basic_head *head = (struct basic_head *) xchg(&tp->root, NULL); + struct basic_head *head = tp->root; struct basic_filter *f, *n; - + list_for_each_entry_safe(f, n, &head->flist, link) { list_del(&f->link); basic_delete_filter(tp, f); } + kfree(head); } static int basic_delete(struct tcf_proto *tp, unsigned long arg) { - struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct basic_filter *t, *f = (struct basic_filter *) arg; list_for_each_entry(t, &head->flist, link) @@ -123,28 +122,31 @@ static int basic_delete(struct tcf_proto *tp, unsigned long arg) return -ENOENT; } -static inline int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f, - 
unsigned long base, struct rtattr **tb, - struct rtattr *est) +static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = { + [TCA_BASIC_CLASSID] = { .type = NLA_U32 }, + [TCA_BASIC_EMATCHES] = { .type = NLA_NESTED }, +}; + +static int basic_set_parms(struct net *net, struct tcf_proto *tp, + struct basic_filter *f, unsigned long base, + struct nlattr **tb, + struct nlattr *est, bool ovr) { - int err = -EINVAL; + int err; struct tcf_exts e; struct tcf_ematch_tree t; - if (tb[TCA_BASIC_CLASSID-1]) - if (RTA_PAYLOAD(tb[TCA_BASIC_CLASSID-1]) < sizeof(u32)) - return err; - - err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map); + tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); if (err < 0) return err; - err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES-1], &t); + err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &t); if (err < 0) goto errout; - if (tb[TCA_BASIC_CLASSID-1]) { - f->res.classid = *(u32*)RTA_DATA(tb[TCA_BASIC_CLASSID-1]); + if (tb[TCA_BASIC_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_BASIC_CLASSID]); tcf_bind_filter(tp, &f->res, base); } @@ -157,61 +159,54 @@ errout: return err; } -static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, unsigned long *arg) +static int basic_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, + struct nlattr **tca, unsigned long *arg, bool ovr) { - int err = -EINVAL; - struct basic_head *head = (struct basic_head *) tp->root; - struct rtattr *tb[TCA_BASIC_MAX]; + int err; + struct basic_head *head = tp->root; + struct nlattr *tb[TCA_BASIC_MAX + 1]; struct basic_filter *f = (struct basic_filter *) *arg; - if (tca[TCA_OPTIONS-1] == NULL) + if (tca[TCA_OPTIONS] == NULL) return -EINVAL; - if (rtattr_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS-1]) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS], + basic_policy); + if (err < 0) + return err; if (f != NULL) { if (handle && f->handle != handle) return -EINVAL; - return basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]); + return basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); } err = -ENOBUFS; - if (head == NULL) { - head = kmalloc(sizeof(*head), GFP_KERNEL); - if (head == NULL) - goto errout; - - memset(head, 0, sizeof(*head)); - INIT_LIST_HEAD(&head->flist); - tp->root = head; - } - - f = kmalloc(sizeof(*f), GFP_KERNEL); + f = kzalloc(sizeof(*f), GFP_KERNEL); if (f == NULL) goto errout; - memset(f, 0, sizeof(*f)); + tcf_exts_init(&f->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); err = -EINVAL; if (handle) f->handle = handle; else { - int i = 0x80000000; + unsigned int i = 0x80000000; do { if (++head->hgenerator == 0x7FFFFFFF) head->hgenerator = 1; } while (--i > 0 && basic_get(tp, head->hgenerator)); if (i <= 0) { - printk(KERN_ERR "Insufficient number of handles\n"); + pr_err("Insufficient number of handles\n"); goto errout; } f->handle = head->hgenerator; } - err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]); + err = basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); if (err < 0) goto errout; @@ -230,7 +225,7 @@ errout: static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct basic_filter *f; list_for_each_entry(f, &head->flist, link) { @@ -246,37 +241,42 @@ skip: } } -static int basic_dump(struct tcf_proto *tp, unsigned long fh, +static int 
basic_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct basic_filter *f = (struct basic_filter *) fh; - unsigned char *b = skb->tail; - struct rtattr *rta; + struct nlattr *nest; if (f == NULL) return skb->len; t->tcm_handle = f->handle; - rta = (struct rtattr *) b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; - if (f->res.classid) - RTA_PUT(skb, TCA_BASIC_CLASSID, sizeof(u32), &f->res.classid); + if (f->res.classid && + nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid)) + goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 || + if (tcf_exts_dump(skb, &f->exts) < 0 || tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) - goto rtattr_failure; + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts) < 0) + goto nla_put_failure; - rta->rta_len = (skb->tail - b); return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } -static struct tcf_proto_ops cls_basic_ops = { +static struct tcf_proto_ops cls_basic_ops __read_mostly = { .kind = "basic", .classify = basic_classify, .init = basic_init, @@ -295,7 +295,7 @@ static int __init init_basic(void) return register_tcf_proto_ops(&cls_basic_ops); } -static void __exit exit_basic(void) +static void __exit exit_basic(void) { unregister_tcf_proto_ops(&cls_basic_ops); } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c new file mode 100644 index 00000000000..13f64df2c71 --- /dev/null +++ b/net/sched/cls_bpf.c @@ -0,0 +1,382 @@ +/* + * Berkeley Packet Filter based traffic classifier + * + * Might be used to classify traffic through flexible, user-defined and + * possibly JIT-ed BPF filters for traffic control as an alternative to + * ematches. + * + * (C) 2013 Daniel Borkmann <dborkman@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/filter.h> +#include <net/rtnetlink.h> +#include <net/pkt_cls.h> +#include <net/sock.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); +MODULE_DESCRIPTION("TC BPF based classifier"); + +struct cls_bpf_head { + struct list_head plist; + u32 hgen; +}; + +struct cls_bpf_prog { + struct sk_filter *filter; + struct sock_filter *bpf_ops; + struct tcf_exts exts; + struct tcf_result res; + struct list_head link; + u32 handle; + u16 bpf_len; +}; + +static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { + [TCA_BPF_CLASSID] = { .type = NLA_U32 }, + [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, + [TCA_BPF_OPS] = { .type = NLA_BINARY, + .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, +}; + +static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog; + int ret; + + list_for_each_entry(prog, &head->plist, link) { + int filter_res = SK_RUN_FILTER(prog->filter, skb); + + if (filter_res == 0) + continue; + + *res = prog->res; + if (filter_res != -1) + res->classid = filter_res; + + ret = tcf_exts_exec(skb, &prog->exts, res); + if (ret < 0) + continue; + + return ret; + } + + return -1; +} + +static int cls_bpf_init(struct tcf_proto *tp) +{ + struct cls_bpf_head *head; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + + INIT_LIST_HEAD(&head->plist); + tp->root = head; + + return 0; +} + +static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) +{ + tcf_unbind_filter(tp, &prog->res); + tcf_exts_destroy(tp, &prog->exts); + + sk_unattached_filter_destroy(prog->filter); + + kfree(prog->bpf_ops); + kfree(prog); +} + +static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog, *todel = (struct cls_bpf_prog *) arg; + + list_for_each_entry(prog, &head->plist, link) { + if (prog == todel) { + tcf_tree_lock(tp); + list_del(&prog->link); + tcf_tree_unlock(tp); + + cls_bpf_delete_prog(tp, prog); + return 0; + } + } + + return -ENOENT; +} + +static void cls_bpf_destroy(struct tcf_proto *tp) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog, *tmp; + + list_for_each_entry_safe(prog, tmp, &head->plist, link) { + list_del(&prog->link); + cls_bpf_delete_prog(tp, prog); + } + + kfree(head); +} + +static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog; + unsigned long ret = 0UL; + + if (head == NULL) + return 0UL; + + list_for_each_entry(prog, &head->plist, link) { + if (prog->handle == handle) { + ret = (unsigned long) prog; + break; + } + } + + return ret; +} + +static void cls_bpf_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, + struct cls_bpf_prog *prog, + unsigned long base, struct nlattr **tb, + struct nlattr *est, bool ovr) +{ + struct sock_filter *bpf_ops, *bpf_old; + struct tcf_exts exts; + struct sock_fprog_kern tmp; + struct sk_filter *fp, *fp_old; + u16 bpf_size, bpf_len; + u32 classid; + int ret; + + if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID]) + return -EINVAL; + + tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); + ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); + if (ret < 0) + return ret; + + classid = 
nla_get_u32(tb[TCA_BPF_CLASSID]); + bpf_len = nla_get_u16(tb[TCA_BPF_OPS_LEN]); + if (bpf_len > BPF_MAXINSNS || bpf_len == 0) { + ret = -EINVAL; + goto errout; + } + + bpf_size = bpf_len * sizeof(*bpf_ops); + bpf_ops = kzalloc(bpf_size, GFP_KERNEL); + if (bpf_ops == NULL) { + ret = -ENOMEM; + goto errout; + } + + memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size); + + tmp.len = bpf_len; + tmp.filter = bpf_ops; + + ret = sk_unattached_filter_create(&fp, &tmp); + if (ret) + goto errout_free; + + tcf_tree_lock(tp); + fp_old = prog->filter; + bpf_old = prog->bpf_ops; + + prog->bpf_len = bpf_len; + prog->bpf_ops = bpf_ops; + prog->filter = fp; + prog->res.classid = classid; + tcf_tree_unlock(tp); + + tcf_bind_filter(tp, &prog->res, base); + tcf_exts_change(tp, &prog->exts, &exts); + + if (fp_old) + sk_unattached_filter_destroy(fp_old); + if (bpf_old) + kfree(bpf_old); + + return 0; + +errout_free: + kfree(bpf_ops); +errout: + tcf_exts_destroy(tp, &exts); + return ret; +} + +static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, + struct cls_bpf_head *head) +{ + unsigned int i = 0x80000000; + + do { + if (++head->hgen == 0x7FFFFFFF) + head->hgen = 1; + } while (--i > 0 && cls_bpf_get(tp, head->hgen)); + if (i == 0) + pr_err("Insufficient number of handles\n"); + + return i; +} + +static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, + u32 handle, struct nlattr **tca, + unsigned long *arg, bool ovr) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog = (struct cls_bpf_prog *) *arg; + struct nlattr *tb[TCA_BPF_MAX + 1]; + int ret; + + if (tca[TCA_OPTIONS] == NULL) + return -EINVAL; + + ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy); + if (ret < 0) + return ret; + + if (prog != NULL) { + if (handle && prog->handle != handle) + return -EINVAL; + return cls_bpf_modify_existing(net, tp, prog, base, tb, + tca[TCA_RATE], ovr); + } + + prog = kzalloc(sizeof(*prog), GFP_KERNEL); + if (prog == NULL) + return -ENOBUFS; + + tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE); + if (handle == 0) + prog->handle = cls_bpf_grab_new_handle(tp, head); + else + prog->handle = handle; + if (prog->handle == 0) { + ret = -EINVAL; + goto errout; + } + + ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr); + if (ret < 0) + goto errout; + + tcf_tree_lock(tp); + list_add(&prog->link, &head->plist); + tcf_tree_unlock(tp); + + *arg = (unsigned long) prog; + + return 0; +errout: + if (*arg == 0UL && prog) + kfree(prog); + + return ret; +} + +static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *tm) +{ + struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh; + struct nlattr *nest, *nla; + + if (prog == NULL) + return skb->len; + + tm->tcm_handle = prog->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) + goto nla_put_failure; + if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_len)) + goto nla_put_failure; + + nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_len * + sizeof(struct sock_filter)); + if (nla == NULL) + goto nla_put_failure; + + memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); + + if (tcf_exts_dump(skb, &prog->exts) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &prog->exts) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + 
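One thing worth spelling out about cls_bpf_classify() above: the classic BPF program loaded through TCA_BPF_OPS (at most BPF_MAXINSNS instructions, per the policy and the length check) is executed with SK_RUN_FILTER(), and its return value drives classification; 0 means no match, (u32)-1 means match but keep the classid configured via TCA_BPF_CLASSID, and any other value is used directly as the classid. A small stand-alone illustration; the printed "len,code jt jf k,..." string is only assumed to match what iproute2 accepts as raw classic BPF bytecode, so treat that detail as a sketch:

#include <stdio.h>
#include <linux/filter.h>

int main(void)
{
        /* A one-instruction program: "return 0x00010001", i.e. always match
         * and classify to 1:1.  Returning 0 instead would mean "no match";
         * returning 0xffffffff would mean "use the TCA_BPF_CLASSID value". */
        struct sock_filter prog[] = {
                BPF_STMT(BPF_RET | BPF_K, 0x00010001),
        };
        unsigned int i, len = sizeof(prog) / sizeof(prog[0]);

        printf("%u", len);
        for (i = 0; i < len; i++)
                printf(",%u %u %u %u", (unsigned)prog[i].code,
                       (unsigned)prog[i].jt, (unsigned)prog[i].jf,
                       (unsigned)prog[i].k);
        printf("\n");
        return 0;
}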
return -1; +} + +static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog; + + list_for_each_entry(prog, &head->plist, link) { + if (arg->count < arg->skip) + goto skip; + if (arg->fn(tp, (unsigned long) prog, arg) < 0) { + arg->stop = 1; + break; + } +skip: + arg->count++; + } +} + +static struct tcf_proto_ops cls_bpf_ops __read_mostly = { + .kind = "bpf", + .owner = THIS_MODULE, + .classify = cls_bpf_classify, + .init = cls_bpf_init, + .destroy = cls_bpf_destroy, + .get = cls_bpf_get, + .put = cls_bpf_put, + .change = cls_bpf_change, + .delete = cls_bpf_delete, + .walk = cls_bpf_walk, + .dump = cls_bpf_dump, +}; + +static int __init cls_bpf_init_mod(void) +{ + return register_tcf_proto_ops(&cls_bpf_ops); +} + +static void __exit cls_bpf_exit_mod(void) +{ + unregister_tcf_proto_ops(&cls_bpf_ops); +} + +module_init(cls_bpf_init_mod); +module_exit(cls_bpf_exit_mod); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c new file mode 100644 index 00000000000..cacf01bd04f --- /dev/null +++ b/net/sched/cls_cgroup.c @@ -0,0 +1,222 @@ +/* + * net/sched/cls_cgroup.c Control Group Classifier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf <tgraf@suug.ch> + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/skbuff.h> +#include <linux/rcupdate.h> +#include <net/rtnetlink.h> +#include <net/pkt_cls.h> +#include <net/sock.h> +#include <net/cls_cgroup.h> + +struct cls_cgroup_head { + u32 handle; + struct tcf_exts exts; + struct tcf_ematch_tree ematches; +}; + +static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + struct cls_cgroup_head *head = tp->root; + u32 classid; + + rcu_read_lock(); + classid = task_cls_state(current)->classid; + rcu_read_unlock(); + + /* + * Due to the nature of the classifier it is required to ignore all + * packets originating from softirq context as accessing `current' + * would lead to false results. + * + * This test assumes that all callers of dev_queue_xmit() explicitely + * disable bh. Knowing this, it is possible to detect softirq based + * calls by looking at the number of nested bh disable calls because + * softirqs always disables bh. + */ + if (in_serving_softirq()) { + /* If there is an sk_classid we'll use that. 
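On the classid itself: whether it is read from task_cls_state() or from skb->sk->sk_classid, cls_cgroup_classify() hands the 32-bit value straight to res->classid, so the number stored by the net_cls cgroup is expected to already encode a tc major:minor pair, upper and lower 16 bits respectively (the same split TC_H_MAKE() applies). A quick stand-alone sketch of that encoding, with made-up values:

#include <stdio.h>
#include <stdint.h>

static uint32_t tc_h_make(uint32_t maj, uint32_t min)
{
        /* same masking as the kernel's TC_H_MAKE() macro */
        return (maj & 0xFFFF0000u) | (min & 0x0000FFFFu);
}

int main(void)
{
        uint32_t classid = tc_h_make(0x10 << 16, 1);    /* tc class 10:1 */

        printf("net_cls classid 0x%08x -> class %x:%x\n",
               (unsigned)classid, (unsigned)(classid >> 16),
               (unsigned)(classid & 0xFFFF));
        return 0;
}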
*/ + if (!skb->sk) + return -1; + classid = skb->sk->sk_classid; + } + + if (!classid) + return -1; + + if (!tcf_em_tree_match(skb, &head->ematches, NULL)) + return -1; + + res->classid = classid; + res->class = 0; + return tcf_exts_exec(skb, &head->exts, res); +} + +static unsigned long cls_cgroup_get(struct tcf_proto *tp, u32 handle) +{ + return 0UL; +} + +static void cls_cgroup_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int cls_cgroup_init(struct tcf_proto *tp) +{ + return 0; +} + +static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = { + [TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED }, +}; + +static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, + u32 handle, struct nlattr **tca, + unsigned long *arg, bool ovr) +{ + struct nlattr *tb[TCA_CGROUP_MAX + 1]; + struct cls_cgroup_head *head = tp->root; + struct tcf_ematch_tree t; + struct tcf_exts e; + int err; + + if (!tca[TCA_OPTIONS]) + return -EINVAL; + + if (head == NULL) { + if (!handle) + return -EINVAL; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + + tcf_exts_init(&head->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + head->handle = handle; + + tcf_tree_lock(tp); + tp->root = head; + tcf_tree_unlock(tp); + } + + if (handle != head->handle) + return -ENOENT; + + err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS], + cgroup_policy); + if (err < 0) + return err; + + tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + if (err < 0) + return err; + + err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t); + if (err < 0) + return err; + + tcf_exts_change(tp, &head->exts, &e); + tcf_em_tree_change(tp, &head->ematches, &t); + + return 0; +} + +static void cls_cgroup_destroy(struct tcf_proto *tp) +{ + struct cls_cgroup_head *head = tp->root; + + if (head) { + tcf_exts_destroy(tp, &head->exts); + tcf_em_tree_destroy(tp, &head->ematches); + kfree(head); + } +} + +static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg) +{ + return -EOPNOTSUPP; +} + +static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct cls_cgroup_head *head = tp->root; + + if (arg->count < arg->skip) + goto skip; + + if (arg->fn(tp, (unsigned long) head, arg) < 0) { + arg->stop = 1; + return; + } +skip: + arg->count++; +} + +static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct cls_cgroup_head *head = tp->root; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + + t->tcm_handle = head->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + if (tcf_exts_dump(skb, &head->exts) < 0 || + tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &head->exts) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tcf_proto_ops cls_cgroup_ops __read_mostly = { + .kind = "cgroup", + .init = cls_cgroup_init, + .change = cls_cgroup_change, + .classify = cls_cgroup_classify, + .destroy = cls_cgroup_destroy, + .get = cls_cgroup_get, + .put = cls_cgroup_put, + .delete = cls_cgroup_delete, + .walk = cls_cgroup_walk, + .dump = cls_cgroup_dump, + .owner = THIS_MODULE, +}; + +static int __init init_cgroup_cls(void) +{ + return 
register_tcf_proto_ops(&cls_cgroup_ops); +} + +static void __exit exit_cgroup_cls(void) +{ + unregister_tcf_proto_ops(&cls_cgroup_ops); +} + +module_init(init_cgroup_cls); +module_exit(exit_cgroup_cls); +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c new file mode 100644 index 00000000000..35be16f7c19 --- /dev/null +++ b/net/sched/cls_flow.c @@ -0,0 +1,673 @@ +/* + * net/sched/cls_flow.c Generic flow classifier + * + * Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/jhash.h> +#include <linux/random.h> +#include <linux/pkt_cls.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/if_vlan.h> +#include <linux/slab.h> +#include <linux/module.h> + +#include <net/pkt_cls.h> +#include <net/ip.h> +#include <net/route.h> +#include <net/flow_keys.h> + +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#include <net/netfilter/nf_conntrack.h> +#endif + +struct flow_head { + struct list_head filters; +}; + +struct flow_filter { + struct list_head list; + struct tcf_exts exts; + struct tcf_ematch_tree ematches; + struct timer_list perturb_timer; + u32 perturb_period; + u32 handle; + + u32 nkeys; + u32 keymask; + u32 mode; + u32 mask; + u32 xor; + u32 rshift; + u32 addend; + u32 divisor; + u32 baseclass; + u32 hashrnd; +}; + +static inline u32 addr_fold(void *addr) +{ + unsigned long a = (unsigned long)addr; + + return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? 
a >> 32 : 0); +} + +static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow) +{ + if (flow->src) + return ntohl(flow->src); + return addr_fold(skb->sk); +} + +static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) +{ + if (flow->dst) + return ntohl(flow->dst); + return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol; +} + +static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow) +{ + return flow->ip_proto; +} + +static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) +{ + if (flow->ports) + return ntohs(flow->port16[0]); + + return addr_fold(skb->sk); +} + +static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) +{ + if (flow->ports) + return ntohs(flow->port16[1]); + + return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol; +} + +static u32 flow_get_iif(const struct sk_buff *skb) +{ + return skb->skb_iif; +} + +static u32 flow_get_priority(const struct sk_buff *skb) +{ + return skb->priority; +} + +static u32 flow_get_mark(const struct sk_buff *skb) +{ + return skb->mark; +} + +static u32 flow_get_nfct(const struct sk_buff *skb) +{ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + return addr_fold(skb->nfct); +#else + return 0; +#endif +} + +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#define CTTUPLE(skb, member) \ +({ \ + enum ip_conntrack_info ctinfo; \ + const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); \ + if (ct == NULL) \ + goto fallback; \ + ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member; \ +}) +#else +#define CTTUPLE(skb, member) \ +({ \ + goto fallback; \ + 0; \ +}) +#endif + +static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + return ntohl(CTTUPLE(skb, src.u3.ip)); + case htons(ETH_P_IPV6): + return ntohl(CTTUPLE(skb, src.u3.ip6[3])); + } +fallback: + return flow_get_src(skb, flow); +} + +static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + return ntohl(CTTUPLE(skb, dst.u3.ip)); + case htons(ETH_P_IPV6): + return ntohl(CTTUPLE(skb, dst.u3.ip6[3])); + } +fallback: + return flow_get_dst(skb, flow); +} + +static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) +{ + return ntohs(CTTUPLE(skb, src.u.all)); +fallback: + return flow_get_proto_src(skb, flow); +} + +static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) +{ + return ntohs(CTTUPLE(skb, dst.u.all)); +fallback: + return flow_get_proto_dst(skb, flow); +} + +static u32 flow_get_rtclassid(const struct sk_buff *skb) +{ +#ifdef CONFIG_IP_ROUTE_CLASSID + if (skb_dst(skb)) + return skb_dst(skb)->tclassid; +#endif + return 0; +} + +static u32 flow_get_skuid(const struct sk_buff *skb) +{ + if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { + kuid_t skuid = skb->sk->sk_socket->file->f_cred->fsuid; + return from_kuid(&init_user_ns, skuid); + } + return 0; +} + +static u32 flow_get_skgid(const struct sk_buff *skb) +{ + if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { + kgid_t skgid = skb->sk->sk_socket->file->f_cred->fsgid; + return from_kgid(&init_user_ns, skgid); + } + return 0; +} + +static u32 flow_get_vlan_tag(const struct sk_buff *skb) +{ + u16 uninitialized_var(tag); + + if (vlan_get_tag(skb, &tag) < 0) + return 0; + return tag & VLAN_VID_MASK; +} + 
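A note ahead of flow_classify() further down: in FLOW_MODE_HASH the selected keys are folded with jhash2() under a random seed that the perturb timer reshuffles periodically, while in FLOW_MODE_MAP a single key is masked, xored, right-shifted, offset by the addend, optionally reduced modulo the divisor, and added to the minor part of the baseclass. A user-space rendition of the map-mode arithmetic with invented parameter values (roughly what a "flow" filter in map mode with an "and 0xff" op, a divisor and a baseclass would compute; exact tc syntax not verified here):

#include <stdio.h>
#include <stdint.h>

struct flow_map {
        uint32_t mask, xor, rshift, addend, divisor, baseclass;
};

static uint32_t flow_map_classid(const struct flow_map *p, uint32_t key)
{
        uint32_t classid = (key & p->mask) ^ p->xor;

        classid = (classid >> p->rshift) + p->addend;
        if (p->divisor)
                classid %= p->divisor;
        /* TC_H_MAKE(baseclass, baseclass + classid) in the kernel code */
        return (p->baseclass & 0xFFFF0000u) |
               ((p->baseclass + classid) & 0x0000FFFFu);
}

int main(void)
{
        /* spread the low byte of the destination address over 1:1..1:100 */
        struct flow_map p = {
                .mask = 0xff, .xor = 0, .rshift = 0, .addend = 0,
                .divisor = 0x100, .baseclass = 0x00010001,      /* 1:1 */
        };
        uint32_t dst = 0xc0a80017;      /* 192.168.0.23, as FLOW_KEY_DST */
        uint32_t cid = flow_map_classid(&p, dst);

        printf("classid %x:%x\n", (unsigned)(cid >> 16),
               (unsigned)(cid & 0xFFFF));
        return 0;
}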
+static u32 flow_get_rxhash(struct sk_buff *skb) +{ + return skb_get_hash(skb); +} + +static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow) +{ + switch (key) { + case FLOW_KEY_SRC: + return flow_get_src(skb, flow); + case FLOW_KEY_DST: + return flow_get_dst(skb, flow); + case FLOW_KEY_PROTO: + return flow_get_proto(skb, flow); + case FLOW_KEY_PROTO_SRC: + return flow_get_proto_src(skb, flow); + case FLOW_KEY_PROTO_DST: + return flow_get_proto_dst(skb, flow); + case FLOW_KEY_IIF: + return flow_get_iif(skb); + case FLOW_KEY_PRIORITY: + return flow_get_priority(skb); + case FLOW_KEY_MARK: + return flow_get_mark(skb); + case FLOW_KEY_NFCT: + return flow_get_nfct(skb); + case FLOW_KEY_NFCT_SRC: + return flow_get_nfct_src(skb, flow); + case FLOW_KEY_NFCT_DST: + return flow_get_nfct_dst(skb, flow); + case FLOW_KEY_NFCT_PROTO_SRC: + return flow_get_nfct_proto_src(skb, flow); + case FLOW_KEY_NFCT_PROTO_DST: + return flow_get_nfct_proto_dst(skb, flow); + case FLOW_KEY_RTCLASSID: + return flow_get_rtclassid(skb); + case FLOW_KEY_SKUID: + return flow_get_skuid(skb); + case FLOW_KEY_SKGID: + return flow_get_skgid(skb); + case FLOW_KEY_VLAN_TAG: + return flow_get_vlan_tag(skb); + case FLOW_KEY_RXHASH: + return flow_get_rxhash(skb); + default: + WARN_ON(1); + return 0; + } +} + +#define FLOW_KEYS_NEEDED ((1 << FLOW_KEY_SRC) | \ + (1 << FLOW_KEY_DST) | \ + (1 << FLOW_KEY_PROTO) | \ + (1 << FLOW_KEY_PROTO_SRC) | \ + (1 << FLOW_KEY_PROTO_DST) | \ + (1 << FLOW_KEY_NFCT_SRC) | \ + (1 << FLOW_KEY_NFCT_DST) | \ + (1 << FLOW_KEY_NFCT_PROTO_SRC) | \ + (1 << FLOW_KEY_NFCT_PROTO_DST)) + +static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + struct flow_head *head = tp->root; + struct flow_filter *f; + u32 keymask; + u32 classid; + unsigned int n, key; + int r; + + list_for_each_entry(f, &head->filters, list) { + u32 keys[FLOW_KEY_MAX + 1]; + struct flow_keys flow_keys; + + if (!tcf_em_tree_match(skb, &f->ematches, NULL)) + continue; + + keymask = f->keymask; + if (keymask & FLOW_KEYS_NEEDED) + skb_flow_dissect(skb, &flow_keys); + + for (n = 0; n < f->nkeys; n++) { + key = ffs(keymask) - 1; + keymask &= ~(1 << key); + keys[n] = flow_key_get(skb, key, &flow_keys); + } + + if (f->mode == FLOW_MODE_HASH) + classid = jhash2(keys, f->nkeys, f->hashrnd); + else { + classid = keys[0]; + classid = (classid & f->mask) ^ f->xor; + classid = (classid >> f->rshift) + f->addend; + } + + if (f->divisor) + classid %= f->divisor; + + res->class = 0; + res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid); + + r = tcf_exts_exec(skb, &f->exts, res); + if (r < 0) + continue; + return r; + } + return -1; +} + +static void flow_perturbation(unsigned long arg) +{ + struct flow_filter *f = (struct flow_filter *)arg; + + get_random_bytes(&f->hashrnd, 4); + if (f->perturb_period) + mod_timer(&f->perturb_timer, jiffies + f->perturb_period); +} + +static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { + [TCA_FLOW_KEYS] = { .type = NLA_U32 }, + [TCA_FLOW_MODE] = { .type = NLA_U32 }, + [TCA_FLOW_BASECLASS] = { .type = NLA_U32 }, + [TCA_FLOW_RSHIFT] = { .type = NLA_U32 }, + [TCA_FLOW_ADDEND] = { .type = NLA_U32 }, + [TCA_FLOW_MASK] = { .type = NLA_U32 }, + [TCA_FLOW_XOR] = { .type = NLA_U32 }, + [TCA_FLOW_DIVISOR] = { .type = NLA_U32 }, + [TCA_FLOW_ACT] = { .type = NLA_NESTED }, + [TCA_FLOW_POLICE] = { .type = NLA_NESTED }, + [TCA_FLOW_EMATCHES] = { .type = NLA_NESTED }, + [TCA_FLOW_PERTURB] = { .type = NLA_U32 }, +}; + +static int flow_change(struct 
net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, + u32 handle, struct nlattr **tca, + unsigned long *arg, bool ovr) +{ + struct flow_head *head = tp->root; + struct flow_filter *f; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_FLOW_MAX + 1]; + struct tcf_exts e; + struct tcf_ematch_tree t; + unsigned int nkeys = 0; + unsigned int perturb_period = 0; + u32 baseclass = 0; + u32 keymask = 0; + u32 mode; + int err; + + if (opt == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy); + if (err < 0) + return err; + + if (tb[TCA_FLOW_BASECLASS]) { + baseclass = nla_get_u32(tb[TCA_FLOW_BASECLASS]); + if (TC_H_MIN(baseclass) == 0) + return -EINVAL; + } + + if (tb[TCA_FLOW_KEYS]) { + keymask = nla_get_u32(tb[TCA_FLOW_KEYS]); + + nkeys = hweight32(keymask); + if (nkeys == 0) + return -EINVAL; + + if (fls(keymask) - 1 > FLOW_KEY_MAX) + return -EOPNOTSUPP; + + if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) && + sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns) + return -EOPNOTSUPP; + } + + tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + if (err < 0) + return err; + + err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t); + if (err < 0) + goto err1; + + f = (struct flow_filter *)*arg; + if (f != NULL) { + err = -EINVAL; + if (f->handle != handle && handle) + goto err2; + + mode = f->mode; + if (tb[TCA_FLOW_MODE]) + mode = nla_get_u32(tb[TCA_FLOW_MODE]); + if (mode != FLOW_MODE_HASH && nkeys > 1) + goto err2; + + if (mode == FLOW_MODE_HASH) + perturb_period = f->perturb_period; + if (tb[TCA_FLOW_PERTURB]) { + if (mode != FLOW_MODE_HASH) + goto err2; + perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; + } + } else { + err = -EINVAL; + if (!handle) + goto err2; + if (!tb[TCA_FLOW_KEYS]) + goto err2; + + mode = FLOW_MODE_MAP; + if (tb[TCA_FLOW_MODE]) + mode = nla_get_u32(tb[TCA_FLOW_MODE]); + if (mode != FLOW_MODE_HASH && nkeys > 1) + goto err2; + + if (tb[TCA_FLOW_PERTURB]) { + if (mode != FLOW_MODE_HASH) + goto err2; + perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; + } + + if (TC_H_MAJ(baseclass) == 0) + baseclass = TC_H_MAKE(tp->q->handle, baseclass); + if (TC_H_MIN(baseclass) == 0) + baseclass = TC_H_MAKE(baseclass, 1); + + err = -ENOBUFS; + f = kzalloc(sizeof(*f), GFP_KERNEL); + if (f == NULL) + goto err2; + + f->handle = handle; + f->mask = ~0U; + tcf_exts_init(&f->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); + + get_random_bytes(&f->hashrnd, 4); + f->perturb_timer.function = flow_perturbation; + f->perturb_timer.data = (unsigned long)f; + init_timer_deferrable(&f->perturb_timer); + } + + tcf_exts_change(tp, &f->exts, &e); + tcf_em_tree_change(tp, &f->ematches, &t); + + tcf_tree_lock(tp); + + if (tb[TCA_FLOW_KEYS]) { + f->keymask = keymask; + f->nkeys = nkeys; + } + + f->mode = mode; + + if (tb[TCA_FLOW_MASK]) + f->mask = nla_get_u32(tb[TCA_FLOW_MASK]); + if (tb[TCA_FLOW_XOR]) + f->xor = nla_get_u32(tb[TCA_FLOW_XOR]); + if (tb[TCA_FLOW_RSHIFT]) + f->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]); + if (tb[TCA_FLOW_ADDEND]) + f->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]); + + if (tb[TCA_FLOW_DIVISOR]) + f->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]); + if (baseclass) + f->baseclass = baseclass; + + f->perturb_period = perturb_period; + del_timer(&f->perturb_timer); + if (perturb_period) + mod_timer(&f->perturb_timer, jiffies + perturb_period); + + if (*arg == 0) + list_add_tail(&f->list, &head->filters); + + tcf_tree_unlock(tp); + + *arg = 
(unsigned long)f; + return 0; + +err2: + tcf_em_tree_destroy(tp, &t); +err1: + tcf_exts_destroy(tp, &e); + return err; +} + +static void flow_destroy_filter(struct tcf_proto *tp, struct flow_filter *f) +{ + del_timer_sync(&f->perturb_timer); + tcf_exts_destroy(tp, &f->exts); + tcf_em_tree_destroy(tp, &f->ematches); + kfree(f); +} + +static int flow_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct flow_filter *f = (struct flow_filter *)arg; + + tcf_tree_lock(tp); + list_del(&f->list); + tcf_tree_unlock(tp); + flow_destroy_filter(tp, f); + return 0; +} + +static int flow_init(struct tcf_proto *tp) +{ + struct flow_head *head; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + INIT_LIST_HEAD(&head->filters); + tp->root = head; + return 0; +} + +static void flow_destroy(struct tcf_proto *tp) +{ + struct flow_head *head = tp->root; + struct flow_filter *f, *next; + + list_for_each_entry_safe(f, next, &head->filters, list) { + list_del(&f->list); + flow_destroy_filter(tp, f); + } + kfree(head); +} + +static unsigned long flow_get(struct tcf_proto *tp, u32 handle) +{ + struct flow_head *head = tp->root; + struct flow_filter *f; + + list_for_each_entry(f, &head->filters, list) + if (f->handle == handle) + return (unsigned long)f; + return 0; +} + +static void flow_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int flow_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct flow_filter *f = (struct flow_filter *)fh; + struct nlattr *nest; + + if (f == NULL) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_FLOW_KEYS, f->keymask) || + nla_put_u32(skb, TCA_FLOW_MODE, f->mode)) + goto nla_put_failure; + + if (f->mask != ~0 || f->xor != 0) { + if (nla_put_u32(skb, TCA_FLOW_MASK, f->mask) || + nla_put_u32(skb, TCA_FLOW_XOR, f->xor)) + goto nla_put_failure; + } + if (f->rshift && + nla_put_u32(skb, TCA_FLOW_RSHIFT, f->rshift)) + goto nla_put_failure; + if (f->addend && + nla_put_u32(skb, TCA_FLOW_ADDEND, f->addend)) + goto nla_put_failure; + + if (f->divisor && + nla_put_u32(skb, TCA_FLOW_DIVISOR, f->divisor)) + goto nla_put_failure; + if (f->baseclass && + nla_put_u32(skb, TCA_FLOW_BASECLASS, f->baseclass)) + goto nla_put_failure; + + if (f->perturb_period && + nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ)) + goto nla_put_failure; + + if (tcf_exts_dump(skb, &f->exts) < 0) + goto nla_put_failure; +#ifdef CONFIG_NET_EMATCH + if (f->ematches.hdr.nmatches && + tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0) + goto nla_put_failure; +#endif + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, nest); + return -1; +} + +static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct flow_head *head = tp->root; + struct flow_filter *f; + + list_for_each_entry(f, &head->filters, list) { + if (arg->count < arg->skip) + goto skip; + if (arg->fn(tp, (unsigned long)f, arg) < 0) { + arg->stop = 1; + break; + } +skip: + arg->count++; + } +} + +static struct tcf_proto_ops cls_flow_ops __read_mostly = { + .kind = "flow", + .classify = flow_classify, + .init = flow_init, + .destroy = flow_destroy, + .change = flow_change, + .delete = flow_delete, + .get = flow_get, + .put = flow_put, + .dump = flow_dump, + .walk = flow_walk, + .owner = 
THIS_MODULE, +}; + +static int __init cls_flow_init(void) +{ + return register_tcf_proto_ops(&cls_flow_ops); +} + +static void __exit cls_flow_exit(void) +{ + unregister_tcf_proto_ops(&cls_flow_ops); +} + +module_init(cls_flow_init); +module_exit(cls_flow_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("TC flow classifier"); diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 75470486e40..861b03ccfed 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -18,101 +18,56 @@ * */ -#include <linux/config.h> #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <linux/netfilter.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> +#include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> -#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *)) +#define HTSIZE 256 -struct fw_head -{ - struct fw_filter *ht[HTSIZE]; +struct fw_head { + u32 mask; + struct fw_filter *ht[HTSIZE]; }; -struct fw_filter -{ +struct fw_filter { struct fw_filter *next; u32 id; struct tcf_result res; #ifdef CONFIG_NET_CLS_IND - char indev[IFNAMSIZ]; + int ifindex; #endif /* CONFIG_NET_CLS_IND */ struct tcf_exts exts; }; -static struct tcf_ext_map fw_ext_map = { - .action = TCA_FW_ACT, - .police = TCA_FW_POLICE -}; - -static __inline__ int fw_hash(u32 handle) +static u32 fw_hash(u32 handle) { - if (HTSIZE == 4096) - return ((handle >> 24) & 0xFFF) ^ - ((handle >> 12) & 0xFFF) ^ - (handle & 0xFFF); - else if (HTSIZE == 2048) - return ((handle >> 22) & 0x7FF) ^ - ((handle >> 11) & 0x7FF) ^ - (handle & 0x7FF); - else if (HTSIZE == 1024) - return ((handle >> 20) & 0x3FF) ^ - ((handle >> 10) & 0x3FF) ^ - (handle & 0x3FF); - else if (HTSIZE == 512) - return (handle >> 27) ^ - ((handle >> 18) & 0x1FF) ^ - ((handle >> 9) & 0x1FF) ^ - (handle & 0x1FF); - else if (HTSIZE == 256) { - u8 *t = (u8 *) &handle; - return t[0] ^ t[1] ^ t[2] ^ t[3]; - } else - return handle & (HTSIZE - 1); + handle ^= (handle >> 16); + handle ^= (handle >> 8); + return handle % HTSIZE; } -static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, +static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct fw_head *head = (struct fw_head*)tp->root; + struct fw_head *head = tp->root; struct fw_filter *f; int r; -#ifdef CONFIG_NETFILTER - u32 id = skb->nfmark; -#else - u32 id = 0; -#endif + u32 id = skb->mark; if (head != NULL) { - for (f=head->ht[fw_hash(id)]; f; f=f->next) { + id &= head->mask; + for (f = head->ht[fw_hash(id)]; f; f = f->next) { if (f->id == id) { *res = f->res; #ifdef CONFIG_NET_CLS_IND - if (!tcf_match_indev(skb, f->indev)) + if (!tcf_match_indev(skb, f->ifindex)) continue; #endif /* CONFIG_NET_CLS_IND */ r = tcf_exts_exec(skb, &f->exts, res); @@ -124,7 +79,8 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, } } else { /* old method */ - if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id^tp->q->handle)))) { + if (id && (TC_H_MAJ(id) == 0 || + !(TC_H_MAJ(id ^ 
tp->q->handle)))) { res->classid = id; res->class = 0; return 0; @@ -136,13 +92,13 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, static unsigned long fw_get(struct tcf_proto *tp, u32 handle) { - struct fw_head *head = (struct fw_head*)tp->root; + struct fw_head *head = tp->root; struct fw_filter *f; if (head == NULL) return 0; - for (f=head->ht[fw_hash(handle)]; f; f=f->next) { + for (f = head->ht[fw_hash(handle)]; f; f = f->next) { if (f->id == handle) return (unsigned long)f; } @@ -158,8 +114,7 @@ static int fw_init(struct tcf_proto *tp) return 0; } -static inline void -fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f) +static void fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f) { tcf_unbind_filter(tp, &f->res); tcf_exts_destroy(tp, &f->exts); @@ -168,15 +123,15 @@ fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f) static void fw_destroy(struct tcf_proto *tp) { - struct fw_head *head = (struct fw_head*)xchg(&tp->root, NULL); + struct fw_head *head = tp->root; struct fw_filter *f; int h; if (head == NULL) return; - for (h=0; h<HTSIZE; h++) { - while ((f=head->ht[h]) != NULL) { + for (h = 0; h < HTSIZE; h++) { + while ((f = head->ht[h]) != NULL) { head->ht[h] = f->next; fw_delete_filter(tp, f); } @@ -186,14 +141,14 @@ static void fw_destroy(struct tcf_proto *tp) static int fw_delete(struct tcf_proto *tp, unsigned long arg) { - struct fw_head *head = (struct fw_head*)tp->root; - struct fw_filter *f = (struct fw_filter*)arg; + struct fw_head *head = tp->root; + struct fw_filter *f = (struct fw_filter *)arg; struct fw_filter **fp; if (head == NULL || f == NULL) goto out; - for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) { + for (fp = &head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) { if (*fp == f) { tcf_tree_lock(tp); *fp = f->next; @@ -206,33 +161,51 @@ out: return -EINVAL; } +static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { + [TCA_FW_CLASSID] = { .type = NLA_U32 }, + [TCA_FW_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, + [TCA_FW_MASK] = { .type = NLA_U32 }, +}; + static int -fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f, - struct rtattr **tb, struct rtattr **tca, unsigned long base) +fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, + struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr) { + struct fw_head *head = tp->root; struct tcf_exts e; + u32 mask; int err; - err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &fw_ext_map); + tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); if (err < 0) return err; - err = -EINVAL; - if (tb[TCA_FW_CLASSID-1]) { - if (RTA_PAYLOAD(tb[TCA_FW_CLASSID-1]) != sizeof(u32)) - goto errout; - f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]); + if (tb[TCA_FW_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]); tcf_bind_filter(tp, &f->res, base); } #ifdef CONFIG_NET_CLS_IND - if (tb[TCA_FW_INDEV-1]) { - err = tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV-1]); - if (err < 0) + if (tb[TCA_FW_INDEV]) { + int ret; + ret = tcf_change_indev(net, tb[TCA_FW_INDEV]); + if (ret < 0) { + err = ret; goto errout; + } + f->ifindex = ret; } #endif /* CONFIG_NET_CLS_IND */ + err = -EINVAL; + if (tb[TCA_FW_MASK]) { + mask = nla_get_u32(tb[TCA_FW_MASK]); + if (mask != head->mask) + goto errout; + } else if (head->mask != 0xFFFFFFFF) + goto errout; + tcf_exts_change(tp, &f->exts, &e); return 0; @@ -241,51 +214,57 @@ errout: return err; } -static int fw_change(struct 
tcf_proto *tp, unsigned long base, +static int fw_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, - unsigned long *arg) + struct nlattr **tca, + unsigned long *arg, bool ovr) { - struct fw_head *head = (struct fw_head*)tp->root; + struct fw_head *head = tp->root; struct fw_filter *f = (struct fw_filter *) *arg; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_FW_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_FW_MAX + 1]; int err; if (!opt) return handle ? -EINVAL : 0; - if (rtattr_parse_nested(tb, TCA_FW_MAX, opt) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy); + if (err < 0) + return err; if (f != NULL) { if (f->id != handle && handle) return -EINVAL; - return fw_change_attrs(tp, f, tb, tca, base); + return fw_change_attrs(net, tp, f, tb, tca, base, ovr); } if (!handle) return -EINVAL; if (head == NULL) { - head = kmalloc(sizeof(struct fw_head), GFP_KERNEL); + u32 mask = 0xFFFFFFFF; + if (tb[TCA_FW_MASK]) + mask = nla_get_u32(tb[TCA_FW_MASK]); + + head = kzalloc(sizeof(struct fw_head), GFP_KERNEL); if (head == NULL) return -ENOBUFS; - memset(head, 0, sizeof(*head)); + head->mask = mask; tcf_tree_lock(tp); tp->root = head; tcf_tree_unlock(tp); } - f = kmalloc(sizeof(struct fw_filter), GFP_KERNEL); + f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); if (f == NULL) return -ENOBUFS; - memset(f, 0, sizeof(*f)); + tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE); f->id = handle; - err = fw_change_attrs(tp, f, tb, tca, base); + err = fw_change_attrs(net, tp, f, tb, tca, base, ovr); if (err < 0) goto errout; @@ -304,7 +283,7 @@ errout: static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct fw_head *head = (struct fw_head*)tp->root; + struct fw_head *head = tp->root; int h; if (head == NULL) @@ -330,12 +309,13 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int fw_dump(struct tcf_proto *tp, unsigned long fh, +static int fw_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct fw_filter *f = (struct fw_filter*)fh; - unsigned char *b = skb->tail; - struct rtattr *rta; + struct fw_head *head = tp->root; + struct fw_filter *f = (struct fw_filter *)fh; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; if (f == NULL) return skb->len; @@ -345,33 +325,41 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh, if (!f->res.classid && !tcf_exts_is_available(&f->exts)) return skb->len; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; - if (f->res.classid) - RTA_PUT(skb, TCA_FW_CLASSID, 4, &f->res.classid); + if (f->res.classid && + nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid)) + goto nla_put_failure; #ifdef CONFIG_NET_CLS_IND - if (strlen(f->indev)) - RTA_PUT(skb, TCA_FW_INDEV, IFNAMSIZ, f->indev); + if (f->ifindex) { + struct net_device *dev; + dev = __dev_get_by_index(net, f->ifindex); + if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name)) + goto nla_put_failure; + } #endif /* CONFIG_NET_CLS_IND */ + if (head->mask != 0xFFFFFFFF && + nla_put_u32(skb, TCA_FW_MASK, head->mask)) + goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0) - goto rtattr_failure; + if (tcf_exts_dump(skb, &f->exts) < 0) + goto nla_put_failure; - rta->rta_len = skb->tail - b; + nla_nest_end(skb, nest); - if 
(tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0) - goto rtattr_failure; + if (tcf_exts_dump_stats(skb, &f->exts) < 0) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static struct tcf_proto_ops cls_fw_ops = { - .next = NULL, +static struct tcf_proto_ops cls_fw_ops __read_mostly = { .kind = "fw", .classify = fw_classify, .init = fw_init, @@ -390,7 +378,7 @@ static int __init init_fw(void) return register_tcf_proto_ops(&cls_fw_ops); } -static void __exit exit_fw(void) +static void __exit exit_fw(void) { unregister_tcf_proto_ops(&cls_fw_ops); } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 520ff716dab..dd9fc2523c7 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -10,61 +10,43 @@ */ #include <linux/module.h> -#include <linux/config.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> +#include <net/dst.h> +#include <net/route.h> +#include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> /* - 1. For now we assume that route tags < 256. - It allows to use direct table lookups, instead of hash tables. - 2. For now we assume that "from TAG" and "fromdev DEV" statements - are mutually exclusive. - 3. "to TAG from ANY" has higher priority, than "to ANY from XXX" + * 1. For now we assume that route tags < 256. + * It allows to use direct table lookups, instead of hash tables. + * 2. For now we assume that "from TAG" and "fromdev DEV" statements + * are mutually exclusive. + * 3. 
"to TAG from ANY" has higher priority, than "to ANY from XXX" */ -struct route4_fastmap -{ +struct route4_fastmap { struct route4_filter *filter; u32 id; int iif; }; -struct route4_head -{ +struct route4_head { struct route4_fastmap fastmap[16]; - struct route4_bucket *table[256+1]; + struct route4_bucket *table[256 + 1]; }; -struct route4_bucket -{ +struct route4_bucket { /* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */ - struct route4_filter *ht[16+16+1]; + struct route4_filter *ht[16 + 16 + 1]; }; -struct route4_filter -{ +struct route4_filter { struct route4_filter *next; u32 id; int iif; @@ -75,52 +57,50 @@ struct route4_filter struct route4_bucket *bkt; }; -#define ROUTE4_FAILURE ((struct route4_filter*)(-1L)) - -static struct tcf_ext_map route_ext_map = { - .police = TCA_ROUTE4_POLICE, - .action = TCA_ROUTE4_ACT -}; +#define ROUTE4_FAILURE ((struct route4_filter *)(-1L)) -static __inline__ int route4_fastmap_hash(u32 id, int iif) +static inline int route4_fastmap_hash(u32 id, int iif) { - return id&0xF; + return id & 0xF; } -static inline -void route4_reset_fastmap(struct net_device *dev, struct route4_head *head, u32 id) +static void +route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id) { - spin_lock_bh(&dev->queue_lock); + spinlock_t *root_lock = qdisc_root_sleeping_lock(q); + + spin_lock_bh(root_lock); memset(head->fastmap, 0, sizeof(head->fastmap)); - spin_unlock_bh(&dev->queue_lock); + spin_unlock_bh(root_lock); } -static void __inline__ +static void route4_set_fastmap(struct route4_head *head, u32 id, int iif, struct route4_filter *f) { int h = route4_fastmap_hash(id, iif); + head->fastmap[h].id = id; head->fastmap[h].iif = iif; head->fastmap[h].filter = f; } -static __inline__ int route4_hash_to(u32 id) +static inline int route4_hash_to(u32 id) { - return id&0xFF; + return id & 0xFF; } -static __inline__ int route4_hash_from(u32 id) +static inline int route4_hash_from(u32 id) { - return (id>>16)&0xF; + return (id >> 16) & 0xF; } -static __inline__ int route4_hash_iif(int iif) +static inline int route4_hash_iif(int iif) { - return 16 + ((iif>>16)&0xF); + return 16 + ((iif >> 16) & 0xF); } -static __inline__ int route4_hash_wild(void) +static inline int route4_hash_wild(void) { return 32; } @@ -140,24 +120,25 @@ static __inline__ int route4_hash_wild(void) return 0; \ } -static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp, +static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct route4_head *head = (struct route4_head*)tp->root; + struct route4_head *head = tp->root; struct dst_entry *dst; struct route4_bucket *b; struct route4_filter *f; u32 id, h; int iif, dont_cache = 0; - if ((dst = skb->dst) == NULL) + dst = skb_dst(skb); + if (!dst) goto failure; id = dst->tclassid; if (head == NULL) goto old_method; - iif = ((struct rtable*)dst)->fl.iif; + iif = inet_iif(skb); h = route4_fastmap_hash(id, iif); if (id == head->fastmap[h].id && @@ -173,7 +154,8 @@ static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp, h = route4_hash_to(id); restart: - if ((b = head->table[h]) != NULL) { + b = head->table[h]; + if (b) { for (f = b->ht[route4_hash_from(id)]; f; f = f->next) if (f->id == id) ROUTE4_APPLY_RESULT(); @@ -209,8 +191,9 @@ old_method: static inline u32 to_hash(u32 id) { - u32 h = id&0xFF; - if (id&0x8000) + u32 h = id & 0xFF; + + if (id & 0x8000) h += 256; return h; } @@ -223,17 +206,17 @@ static inline u32 from_hash(u32 id) if (!(id & 0x8000)) { if (id > 255) return 
256; - return id&0xF; + return id & 0xF; } - return 16 + (id&0xF); + return 16 + (id & 0xF); } static unsigned long route4_get(struct tcf_proto *tp, u32 handle) { - struct route4_head *head = (struct route4_head*)tp->root; + struct route4_head *head = tp->root; struct route4_bucket *b; struct route4_filter *f; - unsigned h1, h2; + unsigned int h1, h2; if (!head) return 0; @@ -242,11 +225,12 @@ static unsigned long route4_get(struct tcf_proto *tp, u32 handle) if (h1 > 256) return 0; - h2 = from_hash(handle>>16); + h2 = from_hash(handle >> 16); if (h2 > 32) return 0; - if ((b = head->table[h1]) != NULL) { + b = head->table[h1]; + if (b) { for (f = b->ht[h2]; f; f = f->next) if (f->handle == handle) return (unsigned long)f; @@ -263,7 +247,7 @@ static int route4_init(struct tcf_proto *tp) return 0; } -static inline void +static void route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f) { tcf_unbind_filter(tp, &f->res); @@ -273,17 +257,18 @@ route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f) static void route4_destroy(struct tcf_proto *tp) { - struct route4_head *head = xchg(&tp->root, NULL); + struct route4_head *head = tp->root; int h1, h2; if (head == NULL) return; - for (h1=0; h1<=256; h1++) { + for (h1 = 0; h1 <= 256; h1++) { struct route4_bucket *b; - if ((b = head->table[h1]) != NULL) { - for (h2=0; h2<=32; h2++) { + b = head->table[h1]; + if (b) { + for (h2 = 0; h2 <= 32; h2++) { struct route4_filter *f; while ((f = b->ht[h2]) != NULL) { @@ -299,9 +284,9 @@ static void route4_destroy(struct tcf_proto *tp) static int route4_delete(struct tcf_proto *tp, unsigned long arg) { - struct route4_head *head = (struct route4_head*)tp->root; - struct route4_filter **fp, *f = (struct route4_filter*)arg; - unsigned h = 0; + struct route4_head *head = tp->root; + struct route4_filter **fp, *f = (struct route4_filter *)arg; + unsigned int h = 0; struct route4_bucket *b; int i; @@ -311,18 +296,18 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg) h = f->handle; b = f->bkt; - for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) { + for (fp = &b->ht[from_hash(h >> 16)]; *fp; fp = &(*fp)->next) { if (*fp == f) { tcf_tree_lock(tp); *fp = f->next; tcf_tree_unlock(tp); - route4_reset_fastmap(tp->q->dev, head, f->id); + route4_reset_fastmap(tp->q, head, f->id); route4_delete_filter(tp, f); /* Strip tree */ - for (i=0; i<=32; i++) + for (i = 0; i <= 32; i++) if (b->ht[i]) return 0; @@ -338,9 +323,18 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg) return 0; } -static int route4_set_parms(struct tcf_proto *tp, unsigned long base, - struct route4_filter *f, u32 handle, struct route4_head *head, - struct rtattr **tb, struct rtattr *est, int new) +static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = { + [TCA_ROUTE4_CLASSID] = { .type = NLA_U32 }, + [TCA_ROUTE4_TO] = { .type = NLA_U32 }, + [TCA_ROUTE4_FROM] = { .type = NLA_U32 }, + [TCA_ROUTE4_IIF] = { .type = NLA_U32 }, +}; + +static int route4_set_parms(struct net *net, struct tcf_proto *tp, + unsigned long base, struct route4_filter *f, + u32 handle, struct route4_head *head, + struct nlattr **tb, struct nlattr *est, int new, + bool ovr) { int err; u32 id = 0, to = 0, nhandle = 0x8000; @@ -349,39 +343,30 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base, struct route4_bucket *b; struct tcf_exts e; - err = tcf_exts_validate(tp, tb, est, &e, &route_ext_map); + tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); + err = tcf_exts_validate(net, tp, tb, 
est, &e, ovr); if (err < 0) return err; err = -EINVAL; - if (tb[TCA_ROUTE4_CLASSID-1]) - if (RTA_PAYLOAD(tb[TCA_ROUTE4_CLASSID-1]) < sizeof(u32)) - goto errout; - - if (tb[TCA_ROUTE4_TO-1]) { + if (tb[TCA_ROUTE4_TO]) { if (new && handle & 0x8000) goto errout; - if (RTA_PAYLOAD(tb[TCA_ROUTE4_TO-1]) < sizeof(u32)) - goto errout; - to = *(u32*)RTA_DATA(tb[TCA_ROUTE4_TO-1]); + to = nla_get_u32(tb[TCA_ROUTE4_TO]); if (to > 0xFF) goto errout; nhandle = to; } - if (tb[TCA_ROUTE4_FROM-1]) { - if (tb[TCA_ROUTE4_IIF-1]) + if (tb[TCA_ROUTE4_FROM]) { + if (tb[TCA_ROUTE4_IIF]) goto errout; - if (RTA_PAYLOAD(tb[TCA_ROUTE4_FROM-1]) < sizeof(u32)) - goto errout; - id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_FROM-1]); + id = nla_get_u32(tb[TCA_ROUTE4_FROM]); if (id > 0xFF) goto errout; nhandle |= id << 16; - } else if (tb[TCA_ROUTE4_IIF-1]) { - if (RTA_PAYLOAD(tb[TCA_ROUTE4_IIF-1]) < sizeof(u32)) - goto errout; - id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_IIF-1]); + } else if (tb[TCA_ROUTE4_IIF]) { + id = nla_get_u32(tb[TCA_ROUTE4_IIF]); if (id > 0x7FFF) goto errout; nhandle |= (id | 0x8000) << 16; @@ -395,18 +380,19 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base, } h1 = to_hash(nhandle); - if ((b = head->table[h1]) == NULL) { + b = head->table[h1]; + if (!b) { err = -ENOBUFS; - b = kmalloc(sizeof(struct route4_bucket), GFP_KERNEL); + b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL); if (b == NULL) goto errout; - memset(b, 0, sizeof(*b)); tcf_tree_lock(tp); head->table[h1] = b; tcf_tree_unlock(tp); } else { unsigned int h2 = from_hash(nhandle >> 16); + err = -EEXIST; for (fp = b->ht[h2]; fp; fp = fp->next) if (fp->handle == f->handle) @@ -414,20 +400,20 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base, } tcf_tree_lock(tp); - if (tb[TCA_ROUTE4_TO-1]) + if (tb[TCA_ROUTE4_TO]) f->id = to; - if (tb[TCA_ROUTE4_FROM-1]) + if (tb[TCA_ROUTE4_FROM]) f->id = to | id<<16; - else if (tb[TCA_ROUTE4_IIF-1]) + else if (tb[TCA_ROUTE4_IIF]) f->iif = id; f->handle = nhandle; f->bkt = b; tcf_tree_unlock(tp); - if (tb[TCA_ROUTE4_CLASSID-1]) { - f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]); + if (tb[TCA_ROUTE4_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_ROUTE4_CLASSID]); tcf_bind_filter(tp, &f->res, base); } @@ -439,16 +425,17 @@ errout: return err; } -static int route4_change(struct tcf_proto *tp, unsigned long base, +static int route4_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, - unsigned long *arg) + struct nlattr **tca, + unsigned long *arg, bool ovr) { struct route4_head *head = tp->root; struct route4_filter *f, *f1, **fp; struct route4_bucket *b; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_ROUTE4_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_ROUTE4_MAX + 1]; unsigned int h, th; u32 old_handle = 0; int err; @@ -456,18 +443,20 @@ static int route4_change(struct tcf_proto *tp, unsigned long base, if (opt == NULL) return handle ? 
-EINVAL : 0; - if (rtattr_parse_nested(tb, TCA_ROUTE4_MAX, opt) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_ROUTE4_MAX, opt, route4_policy); + if (err < 0) + return err; - if ((f = (struct route4_filter*)*arg) != NULL) { + f = (struct route4_filter *)*arg; + if (f) { if (f->handle != handle && handle) return -EINVAL; if (f->bkt) old_handle = f->handle; - err = route4_set_parms(tp, base, f, handle, head, tb, - tca[TCA_RATE-1], 0); + err = route4_set_parms(net, tp, base, f, handle, head, tb, + tca[TCA_RATE], 0, ovr); if (err < 0) return err; @@ -476,29 +465,28 @@ static int route4_change(struct tcf_proto *tp, unsigned long base, err = -ENOBUFS; if (head == NULL) { - head = kmalloc(sizeof(struct route4_head), GFP_KERNEL); + head = kzalloc(sizeof(struct route4_head), GFP_KERNEL); if (head == NULL) goto errout; - memset(head, 0, sizeof(struct route4_head)); tcf_tree_lock(tp); tp->root = head; tcf_tree_unlock(tp); } - f = kmalloc(sizeof(struct route4_filter), GFP_KERNEL); + f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL); if (f == NULL) goto errout; - memset(f, 0, sizeof(*f)); - err = route4_set_parms(tp, base, f, handle, head, tb, - tca[TCA_RATE-1], 1); + tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); + err = route4_set_parms(net, tp, base, f, handle, head, tb, + tca[TCA_RATE], 1, ovr); if (err < 0) goto errout; reinsert: h = from_hash(f->handle >> 16); - for (fp = &f->bkt->ht[h]; (f1=*fp) != NULL; fp = &f1->next) + for (fp = &f->bkt->ht[h]; (f1 = *fp) != NULL; fp = &f1->next) if (f->handle < f1->handle) break; @@ -509,7 +497,8 @@ reinsert: if (old_handle && f->handle != old_handle) { th = to_hash(old_handle); h = from_hash(old_handle >> 16); - if ((b = head->table[th]) != NULL) { + b = head->table[th]; + if (b) { for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) { if (*fp == f) { *fp = f->next; @@ -520,7 +509,7 @@ reinsert: } tcf_tree_unlock(tp); - route4_reset_fastmap(tp->q->dev, head, f->id); + route4_reset_fastmap(tp->q, head, f->id); *arg = (unsigned long)f; return 0; @@ -532,7 +521,7 @@ errout: static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) { struct route4_head *head = tp->root; - unsigned h, h1; + unsigned int h, h1; if (head == NULL) arg->stop = 1; @@ -563,12 +552,12 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int route4_dump(struct tcf_proto *tp, unsigned long fh, +static int route4_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct route4_filter *f = (struct route4_filter*)fh; - unsigned char *b = skb->tail; - struct rtattr *rta; + struct route4_filter *f = (struct route4_filter *)fh; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; u32 id; if (f == NULL) @@ -576,40 +565,44 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh, t->tcm_handle = f->handle; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; - if (!(f->handle&0x8000)) { - id = f->id&0xFF; - RTA_PUT(skb, TCA_ROUTE4_TO, sizeof(id), &id); + if (!(f->handle & 0x8000)) { + id = f->id & 0xFF; + if (nla_put_u32(skb, TCA_ROUTE4_TO, id)) + goto nla_put_failure; } - if (f->handle&0x80000000) { - if ((f->handle>>16) != 0xFFFF) - RTA_PUT(skb, TCA_ROUTE4_IIF, sizeof(f->iif), &f->iif); + if (f->handle & 0x80000000) { + if ((f->handle >> 16) != 0xFFFF && + nla_put_u32(skb, TCA_ROUTE4_IIF, f->iif)) + goto nla_put_failure; } else { - id = f->id>>16; - RTA_PUT(skb, 
TCA_ROUTE4_FROM, sizeof(id), &id); + id = f->id >> 16; + if (nla_put_u32(skb, TCA_ROUTE4_FROM, id)) + goto nla_put_failure; } - if (f->res.classid) - RTA_PUT(skb, TCA_ROUTE4_CLASSID, 4, &f->res.classid); + if (f->res.classid && + nla_put_u32(skb, TCA_ROUTE4_CLASSID, f->res.classid)) + goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0) - goto rtattr_failure; + if (tcf_exts_dump(skb, &f->exts) < 0) + goto nla_put_failure; - rta->rta_len = skb->tail - b; + nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0) - goto rtattr_failure; + if (tcf_exts_dump_stats(skb, &f->exts) < 0) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static struct tcf_proto_ops cls_route4_ops = { - .next = NULL, +static struct tcf_proto_ops cls_route4_ops __read_mostly = { .kind = "route", .classify = route4_classify, .init = route4_init, diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c index ad2613790d8..cbb5e0d600f 100644 --- a/net/sched/cls_rsvp.c +++ b/net/sched/cls_rsvp.c @@ -10,28 +10,13 @@ */ #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> +#include <net/ip.h> +#include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 572f06be3b0..1020e233a5d 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -65,32 +65,28 @@ Well, as result, despite its simplicity, we get a pretty powerful classification engine. 
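The route4_dump() hunk above, like the rsvp, tcindex and u32 dump hunks further down, replaces the old RTA_PUT()/rtattr_failure pattern with nested netlink attributes. A minimal sketch of that dump pattern follows; it is not taken from the patch, and TCA_EXAMPLE_CLASSID is a made-up attribute id used only for illustration.

/*
 * Illustrative sketch, not part of the patch: the nested-attribute dump
 * pattern (nla_nest_start/nla_put_u32/nla_nest_end/nla_nest_cancel)
 * that the converted *_dump() callbacks rely on.
 */
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <net/netlink.h>

#define TCA_EXAMPLE_CLASSID 1	/* hypothetical attribute type */

static int example_dump(struct sk_buff *skb, u32 classid)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_OPTIONS);	/* open the container */
	if (nest == NULL)
		goto nla_put_failure;

	/* nla_put_u32() returns non-zero when the skb runs out of tailroom */
	if (classid && nla_put_u32(skb, TCA_EXAMPLE_CLASSID, classid))
		goto nla_put_failure;

	nla_nest_end(skb, nest);		/* patch the final nest length */
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);		/* trim partial output; no-op if nest is NULL */
	return -1;
}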
*/ -#include <linux/config.h> -struct rsvp_head -{ +struct rsvp_head { u32 tmap[256/32]; u32 hgenerator; u8 tgenerator; struct rsvp_session *ht[256]; }; -struct rsvp_session -{ +struct rsvp_session { struct rsvp_session *next; - u32 dst[RSVP_DST_LEN]; + __be32 dst[RSVP_DST_LEN]; struct tc_rsvp_gpi dpi; u8 protocol; u8 tunnelid; /* 16 (src,sport) hash slots, and one wildcard source slot */ - struct rsvp_filter *ht[16+1]; + struct rsvp_filter *ht[16 + 1]; }; -struct rsvp_filter -{ +struct rsvp_filter { struct rsvp_filter *next; - u32 src[RSVP_DST_LEN]; + __be32 src[RSVP_DST_LEN]; struct tc_rsvp_gpi spi; u8 tunnelhdr; @@ -101,28 +97,25 @@ struct rsvp_filter struct rsvp_session *sess; }; -static __inline__ unsigned hash_dst(u32 *dst, u8 protocol, u8 tunnelid) +static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid) { - unsigned h = dst[RSVP_DST_LEN-1]; + unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1]; + h ^= h>>16; h ^= h>>8; return (h ^ protocol ^ tunnelid) & 0xFF; } -static __inline__ unsigned hash_src(u32 *src) +static inline unsigned int hash_src(__be32 *src) { - unsigned h = src[RSVP_DST_LEN-1]; + unsigned int h = (__force __u32)src[RSVP_DST_LEN-1]; + h ^= h>>16; h ^= h>>8; h ^= h>>4; return h & 0xF; } -static struct tcf_ext_map rsvp_ext_map = { - .police = TCA_RSVP_POLICE, - .action = TCA_RSVP_ACT -}; - #define RSVP_APPLY_RESULT() \ { \ int r = tcf_exts_exec(skb, &f->exts, res); \ @@ -131,22 +124,30 @@ static struct tcf_ext_map rsvp_ext_map = { else if (r > 0) \ return r; \ } - -static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp, + +static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; + struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht; struct rsvp_session *s; struct rsvp_filter *f; - unsigned h1, h2; - u32 *dst, *src; + unsigned int h1, h2; + __be32 *dst, *src; u8 protocol; u8 tunnelid = 0; u8 *xprt; #if RSVP_DST_LEN == 4 - struct ipv6hdr *nhptr = skb->nh.ipv6h; + struct ipv6hdr *nhptr; + + if (!pskb_network_may_pull(skb, sizeof(*nhptr))) + return -1; + nhptr = ipv6_hdr(skb); #else - struct iphdr *nhptr = skb->nh.iph; + struct iphdr *nhptr; + + if (!pskb_network_may_pull(skb, sizeof(*nhptr))) + return -1; + nhptr = ip_hdr(skb); #endif restart: @@ -155,13 +156,13 @@ restart: src = &nhptr->saddr.s6_addr32[0]; dst = &nhptr->daddr.s6_addr32[0]; protocol = nhptr->nexthdr; - xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr); + xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr); #else src = &nhptr->saddr; dst = &nhptr->daddr; protocol = nhptr->protocol; - xprt = ((u8*)nhptr) + (nhptr->ihl<<2); - if (nhptr->frag_off&__constant_htons(IP_MF|IP_OFFSET)) + xprt = ((u8 *)nhptr) + (nhptr->ihl<<2); + if (ip_is_fragment(nhptr)) return -1; #endif @@ -169,23 +170,25 @@ restart: h2 = hash_src(src); for (s = sht[h1]; s; s = s->next) { - if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && + if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] && protocol == s->protocol && - !(s->dpi.mask & (*(u32*)(xprt+s->dpi.offset)^s->dpi.key)) + !(s->dpi.mask & + (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) && #if RSVP_DST_LEN == 4 - && dst[0] == s->dst[0] - && dst[1] == s->dst[1] - && dst[2] == s->dst[2] + dst[0] == s->dst[0] && + dst[1] == s->dst[1] && + dst[2] == s->dst[2] && #endif - && tunnelid == s->tunnelid) { + tunnelid == s->tunnelid) { for (f = s->ht[h2]; f; f = f->next) { - if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] && - !(f->spi.mask & 
(*(u32*)(xprt+f->spi.offset)^f->spi.key)) + if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] && + !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key)) #if RSVP_DST_LEN == 4 - && src[0] == f->src[0] - && src[1] == f->src[1] - && src[2] == f->src[2] + && + src[0] == f->src[0] && + src[1] == f->src[1] && + src[2] == f->src[2] #endif ) { *res = f->res; @@ -196,7 +199,7 @@ matched: return 0; tunnelid = f->res.classid; - nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr)); + nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr)); goto restart; } } @@ -215,11 +218,11 @@ matched: static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) { - struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; + struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht; struct rsvp_session *s; struct rsvp_filter *f; - unsigned h1 = handle&0xFF; - unsigned h2 = (handle>>8)&0xFF; + unsigned int h1 = handle & 0xFF; + unsigned int h2 = (handle >> 8) & 0xFF; if (h2 > 16) return 0; @@ -241,16 +244,15 @@ static int rsvp_init(struct tcf_proto *tp) { struct rsvp_head *data; - data = kmalloc(sizeof(struct rsvp_head), GFP_KERNEL); + data = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL); if (data) { - memset(data, 0, sizeof(struct rsvp_head)); tp->root = data; return 0; } return -ENOBUFS; } -static inline void +static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) { tcf_unbind_filter(tp, &f->res); @@ -269,13 +271,13 @@ static void rsvp_destroy(struct tcf_proto *tp) sht = data->ht; - for (h1=0; h1<256; h1++) { + for (h1 = 0; h1 < 256; h1++) { struct rsvp_session *s; while ((s = sht[h1]) != NULL) { sht[h1] = s->next; - for (h2=0; h2<=16; h2++) { + for (h2 = 0; h2 <= 16; h2++) { struct rsvp_filter *f; while ((f = s->ht[h2]) != NULL) { @@ -291,13 +293,13 @@ static void rsvp_destroy(struct tcf_proto *tp) static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) { - struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg; - unsigned h = f->handle; + struct rsvp_filter **fp, *f = (struct rsvp_filter *)arg; + unsigned int h = f->handle; struct rsvp_session **sp; struct rsvp_session *s = f->sess; int i; - for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) { + for (fp = &s->ht[(h >> 8) & 0xFF]; *fp; fp = &(*fp)->next) { if (*fp == f) { tcf_tree_lock(tp); *fp = f->next; @@ -306,12 +308,12 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) /* Strip tree */ - for (i=0; i<=16; i++) + for (i = 0; i <= 16; i++) if (s->ht[i]) return 0; /* OK, session has no flows */ - for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF]; + for (sp = &((struct rsvp_head *)tp->root)->ht[h & 0xFF]; *sp; sp = &(*sp)->next) { if (*sp == s) { tcf_tree_lock(tp); @@ -329,13 +331,14 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) return 0; } -static unsigned gen_handle(struct tcf_proto *tp, unsigned salt) +static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt) { struct rsvp_head *data = tp->root; int i = 0xFFFF; while (i-- > 0) { u32 h; + if ((data->hgenerator += 0x10000) == 0) data->hgenerator = 0x10000; h = data->hgenerator|salt; @@ -347,10 +350,10 @@ static unsigned gen_handle(struct tcf_proto *tp, unsigned salt) static int tunnel_bts(struct rsvp_head *data) { - int n = data->tgenerator>>5; - u32 b = 1<<(data->tgenerator&0x1F); - - if (data->tmap[n]&b) + int n = data->tgenerator >> 5; + u32 b = 1 << (data->tgenerator & 0x1F); + + if (data->tmap[n] & b) return 0; data->tmap[n] |= b; return 1; @@ -364,10 +367,10 @@ static void tunnel_recycle(struct 
rsvp_head *data) memset(tmap, 0, sizeof(tmap)); - for (h1=0; h1<256; h1++) { + for (h1 = 0; h1 < 256; h1++) { struct rsvp_session *s; for (s = sht[h1]; s; s = s->next) { - for (h2=0; h2<=16; h2++) { + for (h2 = 0; h2 <= 16; h2++) { struct rsvp_filter *f; for (f = s->ht[h2]; f; f = f->next) { @@ -387,8 +390,8 @@ static u32 gen_tunnel(struct rsvp_head *data) { int i, k; - for (k=0; k<2; k++) { - for (i=255; i>0; i--) { + for (k = 0; k < 2; k++) { + for (i = 255; i > 0; i--) { if (++data->tgenerator == 0) data->tgenerator = 1; if (tunnel_bts(data)) @@ -399,39 +402,52 @@ static u32 gen_tunnel(struct rsvp_head *data) return 0; } -static int rsvp_change(struct tcf_proto *tp, unsigned long base, +static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = { + [TCA_RSVP_CLASSID] = { .type = NLA_U32 }, + [TCA_RSVP_DST] = { .type = NLA_BINARY, + .len = RSVP_DST_LEN * sizeof(u32) }, + [TCA_RSVP_SRC] = { .type = NLA_BINARY, + .len = RSVP_DST_LEN * sizeof(u32) }, + [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) }, +}; + +static int rsvp_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, - unsigned long *arg) + struct nlattr **tca, + unsigned long *arg, bool ovr) { struct rsvp_head *data = tp->root; struct rsvp_filter *f, **fp; struct rsvp_session *s, **sp; struct tc_rsvp_pinfo *pinfo = NULL; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_RSVP_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_RSVP_MAX + 1]; struct tcf_exts e; - unsigned h1, h2; - u32 *dst; + unsigned int h1, h2; + __be32 *dst; int err; if (opt == NULL) return handle ? -EINVAL : 0; - if (rtattr_parse_nested(tb, TCA_RSVP_MAX, opt) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_RSVP_MAX, opt, rsvp_policy); + if (err < 0) + return err; - err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &rsvp_ext_map); + tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); if (err < 0) return err; - if ((f = (struct rsvp_filter*)*arg) != NULL) { + f = (struct rsvp_filter *)*arg; + if (f) { /* Node exists: adjust only classid */ if (f->handle != handle && handle) goto errout2; - if (tb[TCA_RSVP_CLASSID-1]) { - f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); + if (tb[TCA_RSVP_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); tcf_bind_filter(tp, &f->res, base); } @@ -443,42 +459,29 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base, err = -EINVAL; if (handle) goto errout2; - if (tb[TCA_RSVP_DST-1] == NULL) + if (tb[TCA_RSVP_DST] == NULL) goto errout2; err = -ENOBUFS; - f = kmalloc(sizeof(struct rsvp_filter), GFP_KERNEL); + f = kzalloc(sizeof(struct rsvp_filter), GFP_KERNEL); if (f == NULL) goto errout2; - memset(f, 0, sizeof(*f)); + tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); h2 = 16; - if (tb[TCA_RSVP_SRC-1]) { - err = -EINVAL; - if (RTA_PAYLOAD(tb[TCA_RSVP_SRC-1]) != sizeof(f->src)) - goto errout; - memcpy(f->src, RTA_DATA(tb[TCA_RSVP_SRC-1]), sizeof(f->src)); + if (tb[TCA_RSVP_SRC]) { + memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src)); h2 = hash_src(f->src); } - if (tb[TCA_RSVP_PINFO-1]) { - err = -EINVAL; - if (RTA_PAYLOAD(tb[TCA_RSVP_PINFO-1]) < sizeof(struct tc_rsvp_pinfo)) - goto errout; - pinfo = RTA_DATA(tb[TCA_RSVP_PINFO-1]); + if (tb[TCA_RSVP_PINFO]) { + pinfo = nla_data(tb[TCA_RSVP_PINFO]); f->spi = pinfo->spi; f->tunnelhdr = pinfo->tunnelhdr; } - if (tb[TCA_RSVP_CLASSID-1]) { - err = 
-EINVAL; - if (RTA_PAYLOAD(tb[TCA_RSVP_CLASSID-1]) != 4) - goto errout; - f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); - } + if (tb[TCA_RSVP_CLASSID]) + f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); - err = -EINVAL; - if (RTA_PAYLOAD(tb[TCA_RSVP_DST-1]) != sizeof(f->src)) - goto errout; - dst = RTA_DATA(tb[TCA_RSVP_DST-1]); + dst = nla_data(tb[TCA_RSVP_DST]); h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0); err = -ENOMEM; @@ -496,16 +499,16 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base, goto errout; } - for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) { + for (sp = &data->ht[h1]; (s = *sp) != NULL; sp = &s->next) { if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && pinfo && pinfo->protocol == s->protocol && - memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 + memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 && #if RSVP_DST_LEN == 4 - && dst[0] == s->dst[0] - && dst[1] == s->dst[1] - && dst[2] == s->dst[2] + dst[0] == s->dst[0] && + dst[1] == s->dst[1] && + dst[2] == s->dst[2] && #endif - && pinfo->tunnelid == s->tunnelid) { + pinfo->tunnelid == s->tunnelid) { insert: /* OK, we found appropriate session */ @@ -519,7 +522,7 @@ insert: tcf_exts_change(tp, &f->exts, &e); for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next) - if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask) + if (((*fp)->spi.mask & f->spi.mask) != f->spi.mask) break; f->next = *fp; wmb(); @@ -533,10 +536,9 @@ insert: /* No session found. Create new one. */ err = -ENOBUFS; - s = kmalloc(sizeof(struct rsvp_session), GFP_KERNEL); + s = kzalloc(sizeof(struct rsvp_session), GFP_KERNEL); if (s == NULL) goto errout; - memset(s, 0, sizeof(*s)); memcpy(s->dst, dst, sizeof(s->dst)); if (pinfo) { @@ -551,7 +553,7 @@ insert: s->next = *sp; wmb(); *sp = s; - + goto insert; errout: @@ -564,7 +566,7 @@ errout2: static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) { struct rsvp_head *head = tp->root; - unsigned h, h1; + unsigned int h, h1; if (arg->stop) return; @@ -592,13 +594,13 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, +static int rsvp_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct rsvp_filter *f = (struct rsvp_filter*)fh; + struct rsvp_filter *f = (struct rsvp_filter *)fh; struct rsvp_session *s; - unsigned char *b = skb->tail; - struct rtattr *rta; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; struct tc_rsvp_pinfo pinfo; if (f == NULL) @@ -607,39 +609,42 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, t->tcm_handle = f->handle; + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); - - RTA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst); + if (nla_put(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst)) + goto nla_put_failure; pinfo.dpi = s->dpi; pinfo.spi = f->spi; pinfo.protocol = s->protocol; pinfo.tunnelid = s->tunnelid; pinfo.tunnelhdr = f->tunnelhdr; pinfo.pad = 0; - RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); - if (f->res.classid) - RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid); - if (((f->handle>>8)&0xFF) != 16) - RTA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src); - - if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0) - goto rtattr_failure; - - rta->rta_len = skb->tail - b; - - if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0) - goto 
rtattr_failure; + if (nla_put(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo)) + goto nla_put_failure; + if (f->res.classid && + nla_put_u32(skb, TCA_RSVP_CLASSID, f->res.classid)) + goto nla_put_failure; + if (((f->handle >> 8) & 0xFF) != 16 && + nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src)) + goto nla_put_failure; + + if (tcf_exts_dump(skb, &f->exts) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts) < 0) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static struct tcf_proto_ops RSVP_OPS = { - .next = NULL, +static struct tcf_proto_ops RSVP_OPS __read_mostly = { .kind = RSVP_ID, .classify = rsvp_classify, .init = rsvp_init, @@ -658,7 +663,7 @@ static int __init init_rsvp(void) return register_tcf_proto_ops(&RSVP_OPS); } -static void __exit exit_rsvp(void) +static void __exit exit_rsvp(void) { unregister_tcf_proto_ops(&RSVP_OPS); } diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c index fde51f7848e..dd08aea2aee 100644 --- a/net/sched/cls_rsvp6.c +++ b/net/sched/cls_rsvp6.c @@ -10,31 +10,15 @@ */ #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> #include <linux/ipv6.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> #include <net/act_api.h> #include <net/pkt_cls.h> +#include <net/netlink.h> #define RSVP_DST_LEN 4 #define RSVP_ID "rsvp6" diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 9f921174c8a..c721cd4a469 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -4,23 +4,15 @@ * Written 1998,1999 by Werner Almesberger, EPFL ICA */ -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/errno.h> -#include <linux/netdevice.h> -#include <net/ip.h> +#include <linux/slab.h> #include <net/act_api.h> +#include <net/netlink.h> #include <net/pkt_cls.h> -#include <net/route.h> - - -/* - * Not quite sure if we need all the xchgs Alexey uses when accessing things. - * Can always add them later ... :) - */ /* * Passing parameters to the root seems to be done more awkwardly than really @@ -32,22 +24,6 @@ #define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ -#if 1 /* control */ -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif - -#if 0 /* data */ -#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define D2PRINTK(format,args...) 
-#endif - - -#define PRIV(tp) ((struct tcindex_data *) (tp)->root) - - struct tcindex_filter_result { struct tcf_exts exts; struct tcf_result res; @@ -71,11 +47,6 @@ struct tcindex_data { int fall_through; /* 0: only classify if explicit match */ }; -static struct tcf_ext_map tcindex_ext_map = { - .police = TCA_TCINDEX_POLICE, - .action = TCA_TCINDEX_ACT -}; - static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) { @@ -100,14 +71,15 @@ tcindex_lookup(struct tcindex_data *p, u16 key) } -static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp, +static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *f; int key = (skb->tc_index & p->mask) >> p->shift; - D2PRINTK("tcindex_classify(skb %p,tp %p,res %p),p %p\n",skb,tp,res,p); + pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n", + skb, tp, res, p); f = tcindex_lookup(p, key); if (!f) { @@ -115,11 +87,11 @@ static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp, return -1; res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), key); res->class = 0; - D2PRINTK("alg 0x%x\n",res->classid); + pr_debug("alg 0x%x\n", res->classid); return 0; } *res = f->res; - D2PRINTK("map 0x%x\n",res->classid); + pr_debug("map 0x%x\n", res->classid); return tcf_exts_exec(skb, &f->exts, res); } @@ -127,10 +99,10 @@ static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp, static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *r; - DPRINTK("tcindex_get(tp %p,handle 0x%08x)\n",tp,handle); + pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); if (p->perfect && handle >= p->alloc_hash) return 0; r = tcindex_lookup(p, handle); @@ -140,7 +112,7 @@ static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) static void tcindex_put(struct tcf_proto *tp, unsigned long f) { - DPRINTK("tcindex_put(tp %p,f 0x%lx)\n",tp,f); + pr_debug("tcindex_put(tp %p,f 0x%lx)\n", tp, f); } @@ -148,12 +120,11 @@ static int tcindex_init(struct tcf_proto *tp) { struct tcindex_data *p; - DPRINTK("tcindex_init(tp %p)\n",tp); - p = kmalloc(sizeof(struct tcindex_data),GFP_KERNEL); + pr_debug("tcindex_init(tp %p)\n", tp); + p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL); if (!p) return -ENOMEM; - memset(p, 0, sizeof(*p)); p->mask = 0xffff; p->hash = DEFAULT_HASH_SIZE; p->fall_through = 1; @@ -166,11 +137,11 @@ static int tcindex_init(struct tcf_proto *tp) static int __tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; struct tcindex_filter *f = NULL; - DPRINTK("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n",tp,arg,p,f); + pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n", tp, arg, p, f); if (p->perfect) { if (!r->res.class) return -ENOENT; @@ -209,10 +180,25 @@ valid_perfect_hash(struct tcindex_data *p) return p->hash > (p->mask >> p->shift); } +static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { + [TCA_TCINDEX_HASH] = { .type = NLA_U32 }, + [TCA_TCINDEX_MASK] = { .type = NLA_U16 }, + [TCA_TCINDEX_SHIFT] = { .type = NLA_U32 }, + [TCA_TCINDEX_FALL_THROUGH] = { .type = NLA_U32 }, + [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 }, +}; + +static void tcindex_filter_result_init(struct 
tcindex_filter_result *r) +{ + memset(r, 0, sizeof(*r)); + tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); +} + static int -tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, - struct tcindex_data *p, struct tcindex_filter_result *r, - struct rtattr **tb, struct rtattr *est) +tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, + u32 handle, struct tcindex_data *p, + struct tcindex_filter_result *r, struct nlattr **tb, + struct nlattr *est, bool ovr) { int err, balloc = 0; struct tcindex_filter_result new_filter_result, *old_r = r; @@ -221,36 +207,26 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_filter *f = NULL; /* make gcc behave */ struct tcf_exts e; - err = tcf_exts_validate(tp, tb, est, &e, &tcindex_ext_map); + tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); if (err < 0) return err; - + memcpy(&cp, p, sizeof(cp)); - memset(&new_filter_result, 0, sizeof(new_filter_result)); + tcindex_filter_result_init(&new_filter_result); + tcindex_filter_result_init(&cr); if (old_r) - memcpy(&cr, r, sizeof(cr)); - else - memset(&cr, 0, sizeof(cr)); + cr.res = r->res; - err = -EINVAL; - if (tb[TCA_TCINDEX_HASH-1]) { - if (RTA_PAYLOAD(tb[TCA_TCINDEX_HASH-1]) < sizeof(u32)) - goto errout; - cp.hash = *(u32 *) RTA_DATA(tb[TCA_TCINDEX_HASH-1]); - } + if (tb[TCA_TCINDEX_HASH]) + cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); - if (tb[TCA_TCINDEX_MASK-1]) { - if (RTA_PAYLOAD(tb[TCA_TCINDEX_MASK-1]) < sizeof(u16)) - goto errout; - cp.mask = *(u16 *) RTA_DATA(tb[TCA_TCINDEX_MASK-1]); - } + if (tb[TCA_TCINDEX_MASK]) + cp.mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); - if (tb[TCA_TCINDEX_SHIFT-1]) { - if (RTA_PAYLOAD(tb[TCA_TCINDEX_SHIFT-1]) < sizeof(u16)) - goto errout; - cp.shift = *(u16 *) RTA_DATA(tb[TCA_TCINDEX_SHIFT-1]); - } + if (tb[TCA_TCINDEX_SHIFT]) + cp.shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); err = -EBUSY; /* Hash already allocated, make sure that we still meet the @@ -264,19 +240,15 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, goto errout; err = -EINVAL; - if (tb[TCA_TCINDEX_FALL_THROUGH-1]) { - if (RTA_PAYLOAD(tb[TCA_TCINDEX_FALL_THROUGH-1]) < sizeof(u32)) - goto errout; - cp.fall_through = - *(u32 *) RTA_DATA(tb[TCA_TCINDEX_FALL_THROUGH-1]); - } + if (tb[TCA_TCINDEX_FALL_THROUGH]) + cp.fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); if (!cp.hash) { /* Hash not specified, use perfect hash if the upper limit * of the hashing index is below the threshold. 
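The tcindex code that follows keeps the existing table-sizing rule: the lookup key is (skb->tc_index & mask) >> shift, so when no hash size is given it uses a perfect table of (mask >> shift) + 1 slots if that stays under PERFECT_HASH_THRESHOLD, and otherwise falls back to DEFAULT_HASH_SIZE chained buckets. A small userspace sketch of that decision, assuming the mainline threshold value of 64, is shown below.

/*
 * Illustrative sketch only, not from the patch: how cls_tcindex sizes
 * its lookup table.  The largest possible key is (mask >> shift).
 */
#include <stdio.h>

#define PERFECT_HASH_THRESHOLD	64	/* assumed, as in mainline */
#define DEFAULT_HASH_SIZE	64	/* optimized for diffserv */

static unsigned int tcindex_table_size(unsigned short mask, int shift)
{
	unsigned int max_key = mask >> shift;

	if (max_key < PERFECT_HASH_THRESHOLD)
		return max_key + 1;	/* perfect hash: one slot per key */
	return DEFAULT_HASH_SIZE;	/* chained hash buckets otherwise */
}

int main(void)
{
	/* mask 0x00f0, shift 4 -> keys 0..15 -> 16-entry perfect table */
	printf("%u\n", tcindex_table_size(0x00f0, 4));
	/* mask 0xffff, shift 0 -> 65536 possible keys -> 64 buckets */
	printf("%u\n", tcindex_table_size(0xffff, 0));
	return 0;
}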
*/ if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD) - cp.hash = (cp.mask >> cp.shift)+1; + cp.hash = (cp.mask >> cp.shift) + 1; else cp.hash = DEFAULT_HASH_SIZE; } @@ -297,16 +269,19 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, err = -ENOMEM; if (!cp.perfect && !cp.h) { if (valid_perfect_hash(&cp)) { - cp.perfect = kmalloc(cp.hash * sizeof(*r), GFP_KERNEL); + int i; + + cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL); if (!cp.perfect) goto errout; - memset(cp.perfect, 0, cp.hash * sizeof(*r)); + for (i = 0; i < cp.hash; i++) + tcf_exts_init(&cp.perfect[i].exts, TCA_TCINDEX_ACT, + TCA_TCINDEX_POLICE); balloc = 1; } else { - cp.h = kmalloc(cp.hash * sizeof(f), GFP_KERNEL); + cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL); if (!cp.h) goto errout; - memset(cp.h, 0, cp.hash * sizeof(f)); balloc = 2; } } @@ -317,25 +292,27 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, r = tcindex_lookup(&cp, handle) ? : &new_filter_result; if (r == &new_filter_result) { - f = kmalloc(sizeof(*f), GFP_KERNEL); + f = kzalloc(sizeof(*f), GFP_KERNEL); if (!f) goto errout_alloc; - memset(f, 0, sizeof(*f)); - } + } - if (tb[TCA_TCINDEX_CLASSID-1]) { - cr.res.classid = *(u32 *) RTA_DATA(tb[TCA_TCINDEX_CLASSID-1]); + if (tb[TCA_TCINDEX_CLASSID]) { + cr.res.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]); tcf_bind_filter(tp, &cr.res, base); - } + } - tcf_exts_change(tp, &cr.exts, &e); + if (old_r) + tcf_exts_change(tp, &r->exts, &e); + else + tcf_exts_change(tp, &cr.exts, &e); tcf_tree_lock(tp); if (old_r && old_r != r) - memset(old_r, 0, sizeof(*old_r)); + tcindex_filter_result_init(old_r); memcpy(p, &cp, sizeof(cp)); - memcpy(r, &cr, sizeof(cr)); + r->res = cr.res; if (r == &new_filter_result) { struct tcindex_filter **fp; @@ -346,7 +323,7 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, for (fp = p->h+(handle % p->hash); *fp; fp = &(*fp)->next) /* nothing */; *fp = f; - } + } tcf_tree_unlock(tp); return 0; @@ -362,35 +339,39 @@ errout: } static int -tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, unsigned long *arg) +tcindex_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, + struct nlattr **tca, unsigned long *arg, bool ovr) { - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_TCINDEX_MAX]; - struct tcindex_data *p = PRIV(tp); + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_TCINDEX_MAX + 1]; + struct tcindex_data *p = tp->root; struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg; + int err; - DPRINTK("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," + pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," "p %p,r %p,*arg 0x%lx\n", tp, handle, tca, arg, opt, p, r, arg ? 
*arg : 0L); if (!opt) return 0; - if (rtattr_parse_nested(tb, TCA_TCINDEX_MAX, opt) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_TCINDEX_MAX, opt, tcindex_policy); + if (err < 0) + return err; - return tcindex_set_parms(tp, base, handle, p, r, tb, tca[TCA_RATE-1]); + return tcindex_set_parms(net, tp, base, handle, p, r, tb, + tca[TCA_RATE], ovr); } static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) { - struct tcindex_data *p = PRIV(tp); - struct tcindex_filter *f,*next; + struct tcindex_data *p = tp->root; + struct tcindex_filter *f, *next; int i; - DPRINTK("tcindex_walk(tp %p,walker %p),p %p\n",tp,walker,p); + pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p); if (p->perfect) { for (i = 0; i < p->hash; i++) { if (!p->perfect[i].res.class) @@ -412,7 +393,7 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) for (f = p->h[i]; f; f = next) { next = f->next; if (walker->count >= walker->skip) { - if (walker->fn(tp,(unsigned long) &f->result, + if (walker->fn(tp, (unsigned long) &f->result, walker) < 0) { walker->stop = 1; return; @@ -433,14 +414,14 @@ static int tcindex_destroy_element(struct tcf_proto *tp, static void tcindex_destroy(struct tcf_proto *tp) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcf_walker walker; - DPRINTK("tcindex_destroy(tp %p),p %p\n",tp,p); + pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); walker.count = 0; walker.skip = 0; walker.fn = &tcindex_destroy_element; - tcindex_walk(tp,&walker); + tcindex_walk(tp, &walker); kfree(p->perfect); kfree(p->h); kfree(p); @@ -448,27 +429,30 @@ static void tcindex_destroy(struct tcf_proto *tp) } -static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, +static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; - unsigned char *b = skb->tail; - struct rtattr *rta; - - DPRINTK("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n", - tp,fh,skb,t,p,r,b); - DPRINTK("p->perfect %p p->h %p\n",p->perfect,p->h); - rta = (struct rtattr *) b; - RTA_PUT(skb,TCA_OPTIONS,0,NULL); + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + + pr_debug("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n", + tp, fh, skb, t, p, r, b); + pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h); + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + if (!fh) { t->tcm_handle = ~0; /* whatever ... 
*/ - RTA_PUT(skb,TCA_TCINDEX_HASH,sizeof(p->hash),&p->hash); - RTA_PUT(skb,TCA_TCINDEX_MASK,sizeof(p->mask),&p->mask); - RTA_PUT(skb,TCA_TCINDEX_SHIFT,sizeof(p->shift),&p->shift); - RTA_PUT(skb,TCA_TCINDEX_FALL_THROUGH,sizeof(p->fall_through), - &p->fall_through); - rta->rta_len = skb->tail-b; + if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) || + nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) || + nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) || + nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through)) + goto nla_put_failure; + nla_nest_end(skb, nest); } else { if (p->perfect) { t->tcm_handle = r-p->perfect; @@ -485,27 +469,27 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, } } } - DPRINTK("handle = %d\n",t->tcm_handle); - if (r->res.class) - RTA_PUT(skb, TCA_TCINDEX_CLASSID, 4, &r->res.classid); + pr_debug("handle = %d\n", t->tcm_handle); + if (r->res.class && + nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid)) + goto nla_put_failure; - if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0) - goto rtattr_failure; - rta->rta_len = skb->tail-b; + if (tcf_exts_dump(skb, &r->exts) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0) - goto rtattr_failure; + if (tcf_exts_dump_stats(skb, &r->exts) < 0) + goto nla_put_failure; } - + return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static struct tcf_proto_ops cls_tcindex_ops = { - .next = NULL, +static struct tcf_proto_ops cls_tcindex_ops __read_mostly = { .kind = "tcindex", .classify = tcindex_classify, .init = tcindex_init, @@ -524,7 +508,7 @@ static int __init init_tcindex(void) return register_tcf_proto_ops(&cls_tcindex_ops); } -static void __exit exit_tcindex(void) +static void __exit exit_tcindex(void) { unregister_tcf_proto_ops(&cls_tcindex_ops); } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 2b670479dde..70c0be8d012 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -30,42 +30,26 @@ * nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro> */ -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/config.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> #include <linux/rtnetlink.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> +#include <linux/bitmap.h> +#include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> -struct tc_u_knode -{ +struct tc_u_knode { struct tc_u_knode *next; u32 handle; struct tc_u_hnode *ht_up; struct tcf_exts exts; #ifdef CONFIG_NET_CLS_IND - char indev[IFNAMSIZ]; + int ifindex; #endif u8 fshift; struct tcf_result res; @@ -79,49 +63,41 @@ struct tc_u_knode struct tc_u32_sel sel; }; -struct tc_u_hnode -{ +struct tc_u_hnode { struct tc_u_hnode *next; u32 handle; u32 prio; struct tc_u_common *tp_c; int refcnt; - unsigned divisor; + unsigned int divisor; struct tc_u_knode *ht[1]; }; -struct tc_u_common -{ - struct tc_u_common *next; +struct tc_u_common { struct tc_u_hnode *hlist; struct Qdisc *q; int refcnt; u32 
hgenerator; }; -static struct tcf_ext_map u32_ext_map = { - .action = TCA_U32_ACT, - .police = TCA_U32_POLICE -}; - -static struct tc_u_common *u32_list; - -static __inline__ unsigned u32_hash_fold(u32 key, struct tc_u32_sel *sel, u8 fshift) +static inline unsigned int u32_hash_fold(__be32 key, + const struct tc_u32_sel *sel, + u8 fshift) { - unsigned h = (key & sel->hmask)>>fshift; + unsigned int h = ntohl(key & sel->hmask) >> fshift; return h; } -static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) +static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct { struct tc_u_knode *knode; - u8 *ptr; + unsigned int off; } stack[TC_U32_MAXDEPTH]; - struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root; - u8 *ptr = skb->nh.raw; + struct tc_u_hnode *ht = tp->root; + unsigned int off = skb_network_offset(skb); struct tc_u_knode *n; int sdepth = 0; int off2 = 0; @@ -139,12 +115,12 @@ next_knode: struct tc_u32_key *key = n->sel.keys; #ifdef CONFIG_CLS_U32_PERF - n->pf->rcnt +=1; + n->pf->rcnt += 1; j = 0; #endif #ifdef CONFIG_CLS_U32_MARK - if ((skb->nfmark & n->mark.mask) != n->mark.val) { + if ((skb->mark & n->mark.mask) != n->mark.val) { n = n->next; goto next_knode; } else { @@ -152,30 +128,38 @@ next_knode: } #endif - for (i = n->sel.nkeys; i>0; i--, key++) { + for (i = n->sel.nkeys; i > 0; i--, key++) { + int toff = off + key->off + (off2 & key->offmask); + __be32 *data, hdata; + + if (skb_headroom(skb) + toff > INT_MAX) + goto out; - if ((*(u32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) { + data = skb_header_pointer(skb, toff, 4, &hdata); + if (!data) + goto out; + if ((*data ^ key->val) & key->mask) { n = n->next; goto next_knode; } #ifdef CONFIG_CLS_U32_PERF - n->pf->kcnts[j] +=1; + n->pf->kcnts[j] += 1; j++; #endif } if (n->ht_down == NULL) { check_terminal: - if (n->sel.flags&TC_U32_TERMINAL) { + if (n->sel.flags & TC_U32_TERMINAL) { *res = n->res; #ifdef CONFIG_NET_CLS_IND - if (!tcf_match_indev(skb, n->indev)) { + if (!tcf_match_indev(skb, n->ifindex)) { n = n->next; goto next_knode; } #endif #ifdef CONFIG_CLS_U32_PERF - n->pf->rhit +=1; + n->pf->rhit += 1; #endif r = tcf_exts_exec(skb, &n->exts, res); if (r < 0) { @@ -193,29 +177,45 @@ check_terminal: if (sdepth >= TC_U32_MAXDEPTH) goto deadloop; stack[sdepth].knode = n; - stack[sdepth].ptr = ptr; + stack[sdepth].off = off; sdepth++; ht = n->ht_down; sel = 0; - if (ht->divisor) - sel = ht->divisor&u32_hash_fold(*(u32*)(ptr+n->sel.hoff), &n->sel,n->fshift); - - if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT))) + if (ht->divisor) { + __be32 *data, hdata; + + data = skb_header_pointer(skb, off + n->sel.hoff, 4, + &hdata); + if (!data) + goto out; + sel = ht->divisor & u32_hash_fold(*data, &n->sel, + n->fshift); + } + if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT))) goto next_ht; - if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) { + if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) { off2 = n->sel.off + 3; - if (n->sel.flags&TC_U32_VAROFFSET) - off2 += ntohs(n->sel.offmask & *(u16*)(ptr+n->sel.offoff)) >>n->sel.offshift; + if (n->sel.flags & TC_U32_VAROFFSET) { + __be16 *data, hdata; + + data = skb_header_pointer(skb, + off + n->sel.offoff, + 2, &hdata); + if (!data) + goto out; + off2 += ntohs(n->sel.offmask & *data) >> + n->sel.offshift; + } off2 &= ~3; } - if (n->sel.flags&TC_U32_EAT) { - ptr += off2; + if (n->sel.flags & TC_U32_EAT) { + off += off2; off2 = 0; } - if (ptr < skb->tail) + if (off 
< skb->len) goto next_ht; } @@ -223,18 +223,18 @@ check_terminal: if (sdepth--) { n = stack[sdepth].knode; ht = n->ht_up; - ptr = stack[sdepth].ptr; + off = stack[sdepth].off; goto check_terminal; } +out: return -1; deadloop: - if (net_ratelimit()) - printk("cls_u32: dead loop\n"); + net_warn_ratelimited("cls_u32: dead loop\n"); return -1; } -static __inline__ struct tc_u_hnode * +static struct tc_u_hnode * u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) { struct tc_u_hnode *ht; @@ -246,10 +246,10 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) return ht; } -static __inline__ struct tc_u_knode * +static struct tc_u_knode * u32_lookup_key(struct tc_u_hnode *ht, u32 handle) { - unsigned sel; + unsigned int sel; struct tc_u_knode *n = NULL; sel = TC_U32_HASH(handle); @@ -294,7 +294,7 @@ static u32 gen_new_htid(struct tc_u_common *tp_c) do { if (++tp_c->hgenerator == 0x7FF) tp_c->hgenerator = 1; - } while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); + } while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0; } @@ -304,30 +304,25 @@ static int u32_init(struct tcf_proto *tp) struct tc_u_hnode *root_ht; struct tc_u_common *tp_c; - for (tp_c = u32_list; tp_c; tp_c = tp_c->next) - if (tp_c->q == tp->q) - break; + tp_c = tp->q->u32_node; - root_ht = kmalloc(sizeof(*root_ht), GFP_KERNEL); + root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL); if (root_ht == NULL) return -ENOBUFS; - memset(root_ht, 0, sizeof(*root_ht)); root_ht->divisor = 0; root_ht->refcnt++; root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000; root_ht->prio = tp->prio; if (tp_c == NULL) { - tp_c = kmalloc(sizeof(*tp_c), GFP_KERNEL); + tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL); if (tp_c == NULL) { kfree(root_ht); return -ENOBUFS; } - memset(tp_c, 0, sizeof(*tp_c)); tp_c->q = tp->q; - tp_c->next = u32_list; - u32_list = tp_c; + tp->q->u32_node = tp_c; } tp_c->refcnt++; @@ -347,14 +342,13 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) if (n->ht_down) n->ht_down->refcnt--; #ifdef CONFIG_CLS_U32_PERF - if (n) - kfree(n->pf); + kfree(n->pf); #endif kfree(n); return 0; } -static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) +static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) { struct tc_u_knode **kp; struct tc_u_hnode *ht = key->ht_up; @@ -371,16 +365,16 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) } } } - BUG_TRAP(0); + WARN_ON(1); return 0; } static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) { struct tc_u_knode *n; - unsigned h; + unsigned int h; - for (h=0; h<=ht->divisor; h++) { + for (h = 0; h <= ht->divisor; h++) { while ((n = ht->ht[h]) != NULL) { ht->ht[h] = n->next; @@ -394,7 +388,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) struct tc_u_common *tp_c = tp->data; struct tc_u_hnode **hn; - BUG_TRAP(!ht->refcnt); + WARN_ON(ht->refcnt); u32_clear_hnode(tp, ht); @@ -406,41 +400,37 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) } } - BUG_TRAP(0); + WARN_ON(1); return -ENOENT; } static void u32_destroy(struct tcf_proto *tp) { struct tc_u_common *tp_c = tp->data; - struct tc_u_hnode *root_ht = xchg(&tp->root, NULL); + struct tc_u_hnode *root_ht = tp->root; - BUG_TRAP(root_ht != NULL); + WARN_ON(root_ht == NULL); if (root_ht && --root_ht->refcnt == 0) u32_destroy_hnode(tp, root_ht); if (--tp_c->refcnt == 0) { struct tc_u_hnode *ht; - struct tc_u_common **tp_cp; - for 
(tp_cp = &u32_list; *tp_cp; tp_cp = &(*tp_cp)->next) { - if (*tp_cp == tp_c) { - *tp_cp = tp_c->next; - break; - } - } + tp->q->u32_node = NULL; - for (ht=tp_c->hlist; ht; ht = ht->next) + for (ht = tp_c->hlist; ht; ht = ht->next) { + ht->refcnt--; u32_clear_hnode(tp, ht); + } while ((ht = tp_c->hlist) != NULL) { tp_c->hlist = ht->next; - BUG_TRAP(ht->refcnt == 0); + WARN_ON(ht->refcnt != 0); kfree(ht); - }; + } kfree(tp_c); } @@ -450,52 +440,75 @@ static void u32_destroy(struct tcf_proto *tp) static int u32_delete(struct tcf_proto *tp, unsigned long arg) { - struct tc_u_hnode *ht = (struct tc_u_hnode*)arg; + struct tc_u_hnode *ht = (struct tc_u_hnode *)arg; if (ht == NULL) return 0; if (TC_U32_KEY(ht->handle)) - return u32_delete_key(tp, (struct tc_u_knode*)ht); + return u32_delete_key(tp, (struct tc_u_knode *)ht); if (tp->root == ht) return -EINVAL; - if (--ht->refcnt == 0) + if (ht->refcnt == 1) { + ht->refcnt--; u32_destroy_hnode(tp, ht); + } else { + return -EBUSY; + } return 0; } +#define NR_U32_NODE (1<<12) static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) { struct tc_u_knode *n; - unsigned i = 0x7FF; + unsigned long i; + unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long), + GFP_KERNEL); + if (!bitmap) + return handle | 0xFFF; + + for (n = ht->ht[TC_U32_HASH(handle)]; n; n = n->next) + set_bit(TC_U32_NODE(n->handle), bitmap); - for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next) - if (i < TC_U32_NODE(n->handle)) - i = TC_U32_NODE(n->handle); - i++; + i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800); + if (i >= NR_U32_NODE) + i = find_next_zero_bit(bitmap, NR_U32_NODE, 1); - return handle|(i>0xFFF ? 0xFFF : i); + kfree(bitmap); + return handle | (i >= NR_U32_NODE ? 0xFFF : i); } -static int u32_set_parms(struct tcf_proto *tp, unsigned long base, - struct tc_u_hnode *ht, - struct tc_u_knode *n, struct rtattr **tb, - struct rtattr *est) +static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { + [TCA_U32_CLASSID] = { .type = NLA_U32 }, + [TCA_U32_HASH] = { .type = NLA_U32 }, + [TCA_U32_LINK] = { .type = NLA_U32 }, + [TCA_U32_DIVISOR] = { .type = NLA_U32 }, + [TCA_U32_SEL] = { .len = sizeof(struct tc_u32_sel) }, + [TCA_U32_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, + [TCA_U32_MARK] = { .len = sizeof(struct tc_u32_mark) }, +}; + +static int u32_set_parms(struct net *net, struct tcf_proto *tp, + unsigned long base, struct tc_u_hnode *ht, + struct tc_u_knode *n, struct nlattr **tb, + struct nlattr *est, bool ovr) { int err; struct tcf_exts e; - err = tcf_exts_validate(tp, tb, est, &e, &u32_ext_map); + tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); if (err < 0) return err; err = -EINVAL; - if (tb[TCA_U32_LINK-1]) { - u32 handle = *(u32*)RTA_DATA(tb[TCA_U32_LINK-1]); - struct tc_u_hnode *ht_down = NULL; + if (tb[TCA_U32_LINK]) { + u32 handle = nla_get_u32(tb[TCA_U32_LINK]); + struct tc_u_hnode *ht_down = NULL, *ht_old; if (TC_U32_KEY(handle)) goto errout; @@ -509,22 +522,25 @@ static int u32_set_parms(struct tcf_proto *tp, unsigned long base, } tcf_tree_lock(tp); - ht_down = xchg(&n->ht_down, ht_down); + ht_old = n->ht_down; + n->ht_down = ht_down; tcf_tree_unlock(tp); - if (ht_down) - ht_down->refcnt--; + if (ht_old) + ht_old->refcnt--; } - if (tb[TCA_U32_CLASSID-1]) { - n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]); + if (tb[TCA_U32_CLASSID]) { + n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]); tcf_bind_filter(tp, &n->res, base); } #ifdef CONFIG_NET_CLS_IND - if 
(tb[TCA_U32_INDEV-1]) { - int err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV-1]); - if (err < 0) + if (tb[TCA_U32_INDEV]) { + int ret; + ret = tcf_change_indev(net, tb[TCA_U32_INDEV]); + if (ret < 0) goto errout; + n->ifindex = ret; } #endif tcf_exts_change(tp, &n->exts, &e); @@ -535,34 +551,38 @@ errout: return err; } -static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, - unsigned long *arg) +static int u32_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, + struct nlattr **tca, + unsigned long *arg, bool ovr) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; struct tc_u_knode *n; struct tc_u32_sel *s; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_U32_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_U32_MAX + 1]; u32 htid; int err; if (opt == NULL) return handle ? -EINVAL : 0; - if (rtattr_parse_nested(tb, TCA_U32_MAX, opt) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy); + if (err < 0) + return err; - if ((n = (struct tc_u_knode*)*arg) != NULL) { + n = (struct tc_u_knode *)*arg; + if (n) { if (TC_U32_KEY(n->handle) == 0) return -EINVAL; - return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE-1]); + return u32_set_parms(net, tp, base, n->ht_up, n, tb, + tca[TCA_RATE], ovr); } - if (tb[TCA_U32_DIVISOR-1]) { - unsigned divisor = *(unsigned*)RTA_DATA(tb[TCA_U32_DIVISOR-1]); + if (tb[TCA_U32_DIVISOR]) { + unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]); if (--divisor > 0x100) return -EINVAL; @@ -573,12 +593,11 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, if (handle == 0) return -ENOMEM; } - ht = kmalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL); + ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL); if (ht == NULL) return -ENOBUFS; - memset(ht, 0, sizeof(*ht) + divisor*sizeof(void*)); ht->tp_c = tp_c; - ht->refcnt = 0; + ht->refcnt = 1; ht->divisor = divisor; ht->handle = handle; ht->prio = tp->prio; @@ -588,8 +607,8 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, return 0; } - if (tb[TCA_U32_HASH-1]) { - htid = *(unsigned*)RTA_DATA(tb[TCA_U32_HASH-1]); + if (tb[TCA_U32_HASH]) { + htid = nla_get_u32(tb[TCA_U32_HASH]); if (TC_U32_HTID(htid) == TC_U32_ROOT) { ht = tp->root; htid = ht->handle; @@ -613,59 +632,40 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, } else handle = gen_new_kid(ht, htid); - if (tb[TCA_U32_SEL-1] == 0 || - RTA_PAYLOAD(tb[TCA_U32_SEL-1]) < sizeof(struct tc_u32_sel)) + if (tb[TCA_U32_SEL] == NULL) return -EINVAL; - s = RTA_DATA(tb[TCA_U32_SEL-1]); + s = nla_data(tb[TCA_U32_SEL]); - n = kmalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); + n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); if (n == NULL) return -ENOBUFS; - memset(n, 0, sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key)); #ifdef CONFIG_CLS_U32_PERF - n->pf = kmalloc(sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64), GFP_KERNEL); + n->pf = kzalloc(sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64), GFP_KERNEL); if (n->pf == NULL) { kfree(n); return -ENOBUFS; } - memset(n->pf, 0, sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64)); #endif memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); n->ht_up = ht; n->handle = handle; -{ - u8 i = 0; - u32 mask = s->hmask; - if (mask) { - while (!(mask & 1)) { - i++; - mask>>=1; - } - } - n->fshift = 
i; -} + n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; + tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); #ifdef CONFIG_CLS_U32_MARK - if (tb[TCA_U32_MARK-1]) { + if (tb[TCA_U32_MARK]) { struct tc_u32_mark *mark; - if (RTA_PAYLOAD(tb[TCA_U32_MARK-1]) < sizeof(struct tc_u32_mark)) { -#ifdef CONFIG_CLS_U32_PERF - kfree(n->pf); -#endif - kfree(n); - return -EINVAL; - } - mark = RTA_DATA(tb[TCA_U32_MARK-1]); + mark = nla_data(tb[TCA_U32_MARK]); memcpy(&n->mark, mark, sizeof(struct tc_u32_mark)); n->mark.success = 0; } #endif - err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE-1]); + err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr); if (err == 0) { struct tc_u_knode **ins; for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) @@ -673,15 +673,15 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, break; n->next = *ins; - wmb(); + tcf_tree_lock(tp); *ins = n; + tcf_tree_unlock(tp); *arg = (unsigned long)n; return 0; } #ifdef CONFIG_CLS_U32_PERF - if (n) - kfree(n->pf); + kfree(n->pf); #endif kfree(n); return err; @@ -692,7 +692,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; struct tc_u_knode *n; - unsigned h; + unsigned int h; if (arg->stop) return; @@ -723,70 +723,82 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int u32_dump(struct tcf_proto *tp, unsigned long fh, +static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct tc_u_knode *n = (struct tc_u_knode*)fh; - unsigned char *b = skb->tail; - struct rtattr *rta; + struct tc_u_knode *n = (struct tc_u_knode *)fh; + struct nlattr *nest; if (n == NULL) return skb->len; t->tcm_handle = n->handle; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; if (TC_U32_KEY(n->handle) == 0) { - struct tc_u_hnode *ht = (struct tc_u_hnode*)fh; - u32 divisor = ht->divisor+1; - RTA_PUT(skb, TCA_U32_DIVISOR, 4, &divisor); + struct tc_u_hnode *ht = (struct tc_u_hnode *)fh; + u32 divisor = ht->divisor + 1; + + if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor)) + goto nla_put_failure; } else { - RTA_PUT(skb, TCA_U32_SEL, - sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), - &n->sel); + if (nla_put(skb, TCA_U32_SEL, + sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), + &n->sel)) + goto nla_put_failure; if (n->ht_up) { u32 htid = n->handle & 0xFFFFF000; - RTA_PUT(skb, TCA_U32_HASH, 4, &htid); + if (nla_put_u32(skb, TCA_U32_HASH, htid)) + goto nla_put_failure; } - if (n->res.classid) - RTA_PUT(skb, TCA_U32_CLASSID, 4, &n->res.classid); - if (n->ht_down) - RTA_PUT(skb, TCA_U32_LINK, 4, &n->ht_down->handle); + if (n->res.classid && + nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid)) + goto nla_put_failure; + if (n->ht_down && + nla_put_u32(skb, TCA_U32_LINK, n->ht_down->handle)) + goto nla_put_failure; #ifdef CONFIG_CLS_U32_MARK - if (n->mark.val || n->mark.mask) - RTA_PUT(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark); + if ((n->mark.val || n->mark.mask) && + nla_put(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark)) + goto nla_put_failure; #endif - if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0) - goto rtattr_failure; + if (tcf_exts_dump(skb, &n->exts) < 0) + goto nla_put_failure; #ifdef CONFIG_NET_CLS_IND - if(strlen(n->indev)) - RTA_PUT(skb, TCA_U32_INDEV, IFNAMSIZ, n->indev); + if (n->ifindex) { + 
struct net_device *dev; + dev = __dev_get_by_index(net, n->ifindex); + if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name)) + goto nla_put_failure; + } #endif #ifdef CONFIG_CLS_U32_PERF - RTA_PUT(skb, TCA_U32_PCNT, - sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), - n->pf); + if (nla_put(skb, TCA_U32_PCNT, + sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), + n->pf)) + goto nla_put_failure; #endif } - rta->rta_len = skb->tail - b; + nla_nest_end(skb, nest); + if (TC_U32_KEY(n->handle)) - if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0) - goto rtattr_failure; + if (tcf_exts_dump_stats(skb, &n->exts) < 0) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } -static struct tcf_proto_ops cls_u32_ops = { - .next = NULL, +static struct tcf_proto_ops cls_u32_ops __read_mostly = { .kind = "u32", .classify = u32_classify, .init = u32_init, @@ -802,23 +814,20 @@ static struct tcf_proto_ops cls_u32_ops = { static int __init init_u32(void) { - printk("u32 classifier\n"); + pr_info("u32 classifier\n"); #ifdef CONFIG_CLS_U32_PERF - printk(" Perfomance counters on\n"); -#endif -#ifdef CONFIG_NET_CLS_POLICE - printk(" OLD policer on \n"); + pr_info(" Performance counters on\n"); #endif #ifdef CONFIG_NET_CLS_IND - printk(" input device check on \n"); + pr_info(" input device check on\n"); #endif #ifdef CONFIG_NET_CLS_ACT - printk(" Actions configured \n"); + pr_info(" Actions configured\n"); #endif return register_tcf_proto_ops(&cls_u32_ops); } -static void __exit exit_u32(void) +static void __exit exit_u32(void) { unregister_tcf_proto_ops(&cls_u32_ops); } diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c new file mode 100644 index 00000000000..bfd34e4c1af --- /dev/null +++ b/net/sched/em_canid.c @@ -0,0 +1,240 @@ +/* + * em_canid.c Ematch rule to match CAN frames according to their CAN IDs + * + * This program is free software; you can distribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Idea: Oliver Hartkopp <oliver.hartkopp@volkswagen.de> + * Copyright: (c) 2011 Czech Technical University in Prague + * (c) 2011 Volkswagen Group Research + * Authors: Michal Sojka <sojkam1@fel.cvut.cz> + * Pavel Pisa <pisa@cmp.felk.cvut.cz> + * Rostislav Lisovy <lisovy@gmail.cz> + * Funded by: Volkswagen Group Research + */ + +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <net/pkt_cls.h> +#include <linux/can.h> + +#define EM_CAN_RULES_MAX 500 + +struct canid_match { + /* For each SFF CAN ID (11 bit) there is one record in this bitfield */ + DECLARE_BITMAP(match_sff, (1 << CAN_SFF_ID_BITS)); + + int rules_count; + int sff_rules_count; + int eff_rules_count; + + /* + * Raw rules copied from netlink message; Used for sending + * information to userspace (when 'tc filter show' is invoked) + * AND when matching EFF frames + */ + struct can_filter rules_raw[]; +}; + +/** + * em_canid_get_id() - Extracts Can ID out of the sk_buff structure. 
+ */ +static canid_t em_canid_get_id(struct sk_buff *skb) +{ + /* CAN ID is stored within the data field */ + struct can_frame *cf = (struct can_frame *)skb->data; + + return cf->can_id; +} + +static void em_canid_sff_match_add(struct canid_match *cm, u32 can_id, + u32 can_mask) +{ + int i; + + /* + * Limit can_mask and can_id to SFF range to + * protect against write after end of array + */ + can_mask &= CAN_SFF_MASK; + can_id &= can_mask; + + /* Single frame */ + if (can_mask == CAN_SFF_MASK) { + set_bit(can_id, cm->match_sff); + return; + } + + /* All frames */ + if (can_mask == 0) { + bitmap_fill(cm->match_sff, (1 << CAN_SFF_ID_BITS)); + return; + } + + /* + * Individual frame filter. + * Add record (set bit to 1) for each ID that + * conforms particular rule + */ + for (i = 0; i < (1 << CAN_SFF_ID_BITS); i++) { + if ((i & can_mask) == can_id) + set_bit(i, cm->match_sff); + } +} + +static inline struct canid_match *em_canid_priv(struct tcf_ematch *m) +{ + return (struct canid_match *)m->data; +} + +static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m, + struct tcf_pkt_info *info) +{ + struct canid_match *cm = em_canid_priv(m); + canid_t can_id; + int match = 0; + int i; + const struct can_filter *lp; + + can_id = em_canid_get_id(skb); + + if (can_id & CAN_EFF_FLAG) { + for (i = 0, lp = cm->rules_raw; + i < cm->eff_rules_count; i++, lp++) { + if (!(((lp->can_id ^ can_id) & lp->can_mask))) { + match = 1; + break; + } + } + } else { /* SFF */ + can_id &= CAN_SFF_MASK; + match = (test_bit(can_id, cm->match_sff) ? 1 : 0); + } + + return match; +} + +static int em_canid_change(struct tcf_proto *tp, void *data, int len, + struct tcf_ematch *m) +{ + struct can_filter *conf = data; /* Array with rules */ + struct canid_match *cm; + struct canid_match *cm_old = (struct canid_match *)m->data; + int i; + + if (!len) + return -EINVAL; + + if (len % sizeof(struct can_filter)) + return -EINVAL; + + if (len > sizeof(struct can_filter) * EM_CAN_RULES_MAX) + return -EINVAL; + + cm = kzalloc(sizeof(struct canid_match) + len, GFP_KERNEL); + if (!cm) + return -ENOMEM; + + cm->rules_count = len / sizeof(struct can_filter); + + /* + * We need two for() loops for copying rules into two contiguous + * areas in rules_raw to process all eff rules with a simple loop. + * NB: The configuration interface supports sff and eff rules. + * We do not support filters here that match for the same can_id + * provided in a SFF and EFF frame (e.g. 0x123 / 0x80000123). + * For this (unusual case) two filters have to be specified. The + * SFF/EFF separation is done with the CAN_EFF_FLAG in the can_id. 
+ */ + + /* Fill rules_raw with EFF rules first */ + for (i = 0; i < cm->rules_count; i++) { + if (conf[i].can_id & CAN_EFF_FLAG) { + memcpy(cm->rules_raw + cm->eff_rules_count, + &conf[i], + sizeof(struct can_filter)); + + cm->eff_rules_count++; + } + } + + /* append SFF frame rules */ + for (i = 0; i < cm->rules_count; i++) { + if (!(conf[i].can_id & CAN_EFF_FLAG)) { + memcpy(cm->rules_raw + + cm->eff_rules_count + + cm->sff_rules_count, + &conf[i], sizeof(struct can_filter)); + + cm->sff_rules_count++; + + em_canid_sff_match_add(cm, + conf[i].can_id, conf[i].can_mask); + } + } + + m->datalen = sizeof(struct canid_match) + len; + m->data = (unsigned long)cm; + + if (cm_old != NULL) { + pr_err("canid: Configuring an existing ematch!\n"); + kfree(cm_old); + } + + return 0; +} + +static void em_canid_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +{ + struct canid_match *cm = em_canid_priv(m); + + kfree(cm); +} + +static int em_canid_dump(struct sk_buff *skb, struct tcf_ematch *m) +{ + struct canid_match *cm = em_canid_priv(m); + + /* + * When configuring this ematch 'rules_count' is set not to exceed + * 'rules_raw' array size + */ + if (nla_put_nohdr(skb, sizeof(struct can_filter) * cm->rules_count, + &cm->rules_raw) < 0) + return -EMSGSIZE; + + return 0; +} + +static struct tcf_ematch_ops em_canid_ops = { + .kind = TCF_EM_CANID, + .change = em_canid_change, + .match = em_canid_match, + .destroy = em_canid_destroy, + .dump = em_canid_dump, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_canid_ops.link) +}; + +static int __init init_em_canid(void) +{ + return tcf_em_register(&em_canid_ops); +} + +static void __exit exit_em_canid(void) +{ + tcf_em_unregister(&em_canid_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_canid); +module_exit(exit_em_canid); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_CANID); diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c index bf1f00f8b1b..1c8360a2752 100644 --- a/net/sched/em_cmp.c +++ b/net/sched/em_cmp.c @@ -9,12 +9,12 @@ * Authors: Thomas Graf <tgraf@suug.ch> */ -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/tc_ematch/tc_em_cmp.h> +#include <asm/unaligned.h> #include <net/pkt_cls.h> static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp) @@ -33,44 +33,41 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em, return 0; switch (cmp->align) { - case TCF_EM_ALIGN_U8: - val = *ptr; - break; - - case TCF_EM_ALIGN_U16: - val = *ptr << 8; - val |= *(ptr+1); - - if (cmp_needs_transformation(cmp)) - val = be16_to_cpu(val); - break; - - case TCF_EM_ALIGN_U32: - /* Worth checking boundries? The branching seems - * to get worse. Visit again. */ - val = *ptr << 24; - val |= *(ptr+1) << 16; - val |= *(ptr+2) << 8; - val |= *(ptr+3); - - if (cmp_needs_transformation(cmp)) - val = be32_to_cpu(val); - break; - - default: - return 0; + case TCF_EM_ALIGN_U8: + val = *ptr; + break; + + case TCF_EM_ALIGN_U16: + val = get_unaligned_be16(ptr); + + if (cmp_needs_transformation(cmp)) + val = be16_to_cpu(val); + break; + + case TCF_EM_ALIGN_U32: + /* Worth checking boundries? The branching seems + * to get worse. Visit again. 
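[Editor's illustration] Stepping back to the em_canid classifier above: SFF rules are pre-expanded into a per-ID bitmap at configuration time so that matching a standard frame is a single bit test, while EFF rules are kept verbatim and checked with a mask compare. Below is a minimal stand-alone sketch of that split, written as ordinary user-space C; the constants and a plain uint32_t array stand in for CAN_SFF_MASK/CAN_EFF_FLAG and the kernel bitmap helpers, and all names are illustrative rather than taken from the patch.

#include <stdint.h>
#include <stdio.h>

#define SFF_ID_BITS   11
#define SFF_MASK      0x7FFU          /* mirrors CAN_SFF_MASK */
#define EFF_FLAG      0x80000000U     /* mirrors CAN_EFF_FLAG */

struct rule { uint32_t id, mask; };

static uint32_t sff_map[(1 << SFF_ID_BITS) / 32]; /* one bit per SFF ID */

/* Pre-expand one SFF rule: set a bit for every ID that satisfies it. */
static void sff_rule_add(uint32_t id, uint32_t mask)
{
	mask &= SFF_MASK;
	id &= mask;
	for (uint32_t i = 0; i < (1 << SFF_ID_BITS); i++)
		if ((i & mask) == id)
			sff_map[i / 32] |= 1U << (i % 32);
}

/* Match: bitmap lookup for SFF frames, linear mask compare for EFF. */
static int canid_match(uint32_t can_id, const struct rule *eff, int n_eff)
{
	if (can_id & EFF_FLAG) {
		for (int i = 0; i < n_eff; i++)
			if (!((eff[i].id ^ can_id) & eff[i].mask))
				return 1;
		return 0;
	}
	can_id &= SFF_MASK;
	return (sff_map[can_id / 32] >> (can_id % 32)) & 1;
}

int main(void)
{
	struct rule eff[] = { { 0x80000123U, 0x1FFFFFFFU | EFF_FLAG } };

	sff_rule_add(0x120, 0x7F0);	/* accept SFF IDs 0x120..0x12f */

	printf("SFF 0x124 -> %d\n", canid_match(0x124, eff, 1));       /* 1 */
	printf("SFF 0x200 -> %d\n", canid_match(0x200, eff, 1));       /* 0 */
	printf("EFF 0x123 -> %d\n", canid_match(0x80000123U, eff, 1)); /* 1 */
	return 0;
}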
+ */ + val = get_unaligned_be32(ptr); + + if (cmp_needs_transformation(cmp)) + val = be32_to_cpu(val); + break; + + default: + return 0; } if (cmp->mask) val &= cmp->mask; switch (cmp->opnd) { - case TCF_EM_OPND_EQ: - return val == cmp->val; - case TCF_EM_OPND_LT: - return val < cmp->val; - case TCF_EM_OPND_GT: - return val > cmp->val; + case TCF_EM_OPND_EQ: + return val == cmp->val; + case TCF_EM_OPND_LT: + return val < cmp->val; + case TCF_EM_OPND_GT: + return val > cmp->val; } return 0; @@ -89,7 +86,7 @@ static int __init init_em_cmp(void) return tcf_em_register(&em_cmp_ops); } -static void __exit exit_em_cmp(void) +static void __exit exit_em_cmp(void) { tcf_em_unregister(&em_cmp_ops); } @@ -99,3 +96,4 @@ MODULE_LICENSE("GPL"); module_init(init_em_cmp); module_exit(exit_em_cmp); +MODULE_ALIAS_TCF_EMATCH(TCF_EM_CMP); diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c new file mode 100644 index 00000000000..527aeb7a3ff --- /dev/null +++ b/net/sched/em_ipset.c @@ -0,0 +1,136 @@ +/* + * net/sched/em_ipset.c ipset ematch + * + * Copyright (c) 2012 Florian Westphal <fw@strlen.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include <linux/gfp.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/netfilter/xt_set.h> +#include <linux/ipv6.h> +#include <net/ip.h> +#include <net/pkt_cls.h> + +static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len, + struct tcf_ematch *em) +{ + struct xt_set_info *set = data; + ip_set_id_t index; + struct net *net = dev_net(qdisc_dev(tp->q)); + + if (data_len != sizeof(*set)) + return -EINVAL; + + index = ip_set_nfnl_get_byindex(net, set->index); + if (index == IPSET_INVALID_ID) + return -ENOENT; + + em->datalen = sizeof(*set); + em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL); + if (em->data) + return 0; + + ip_set_nfnl_put(net, index); + return -ENOMEM; +} + +static void em_ipset_destroy(struct tcf_proto *p, struct tcf_ematch *em) +{ + const struct xt_set_info *set = (const void *) em->data; + if (set) { + ip_set_nfnl_put(dev_net(qdisc_dev(p->q)), set->index); + kfree((void *) em->data); + } +} + +static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + struct ip_set_adt_opt opt; + struct xt_action_param acpar; + const struct xt_set_info *set = (const void *) em->data; + struct net_device *dev, *indev = NULL; + int ret, network_offset; + + switch (skb->protocol) { + case htons(ETH_P_IP): + acpar.family = NFPROTO_IPV4; + if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) + return 0; + acpar.thoff = ip_hdrlen(skb); + break; + case htons(ETH_P_IPV6): + acpar.family = NFPROTO_IPV6; + if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) + return 0; + /* doesn't call ipv6_find_hdr() because ipset doesn't use thoff, yet */ + acpar.thoff = sizeof(struct ipv6hdr); + break; + default: + return 0; + } + + acpar.hooknum = 0; + + opt.family = acpar.family; + opt.dim = set->dim; + opt.flags = set->flags; + opt.cmdflags = 0; + opt.ext.timeout = ~0u; + + network_offset = skb_network_offset(skb); + skb_pull(skb, network_offset); + + dev = skb->dev; + + rcu_read_lock(); + + if (dev && skb->skb_iif) + indev = dev_get_by_index_rcu(dev_net(dev), skb->skb_iif); + + acpar.in = indev ? 
indev : dev; + acpar.out = dev; + + ret = ip_set_test(set->index, skb, &acpar, &opt); + + rcu_read_unlock(); + + skb_push(skb, network_offset); + return ret; +} + +static struct tcf_ematch_ops em_ipset_ops = { + .kind = TCF_EM_IPSET, + .change = em_ipset_change, + .destroy = em_ipset_destroy, + .match = em_ipset_match, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_ipset_ops.link) +}; + +static int __init init_em_ipset(void) +{ + return tcf_em_register(&em_ipset_ops); +} + +static void __exit exit_em_ipset(void) +{ + tcf_em_unregister(&em_ipset_ops); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_DESCRIPTION("TC extended match for IP sets"); + +module_init(init_em_ipset); +module_exit(exit_em_ipset); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_IPSET); diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 700844d49d7..9b8c0b0e60d 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -9,7 +9,7 @@ * Authors: Thomas Graf <tgraf@suug.ch> * * ========================================================================== - * + * * The metadata ematch compares two meta objects where each object * represents either a meta value stored in the kernel or a static * value provided by userspace. The objects are not provided by @@ -47,7 +47,7 @@ * on the meta type. Obviously, the length of the data must also * be provided for non-numeric types. * - * Additionaly, type dependant modifiers such as shift operators + * Additionally, type dependent modifiers such as shift operators * or mask may be applied to extend the functionaliy. As of now, * the variable length type supports shifting the byte string to * the right, eating up any number of octets and thus supporting @@ -55,10 +55,10 @@ * ppp0..9. * * NOTE: Certain meta values depend on other subsystems and are - * only available if that subsytem is enabled in the kernel. + * only available if that subsystem is enabled in the kernel. 
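[Editor's illustration] The em_meta description above mentions type-dependent modifiers (a shift and a mask) that are applied to a collected value before the left and right objects are compared with EQ/LT/GT. The stand-alone sketch below shows that idea for integer values, under the assumption that the shift is a right shift and the mask a bitwise AND, which is how the integer apply_extras helper in this file appears to treat them; the struct and function names here are invented for the example.

#include <stdint.h>
#include <stdio.h>

/* A collected meta value plus its configured modifiers (illustrative). */
struct meta_int {
	unsigned long value;   /* value collected from the kernel object  */
	unsigned int  shift;   /* optional right shift                    */
	unsigned long mask;    /* optional bitwise AND mask (0 = unused)  */
};

/* Apply the modifiers described in the em_meta header comment. */
static unsigned long meta_apply(const struct meta_int *m)
{
	unsigned long v = m->value;

	if (m->shift)
		v >>= m->shift;
	if (m->mask)
		v &= m->mask;
	return v;
}

/* EQ / LT / GT comparison of the two (modified) sides. */
enum opnd { OP_EQ, OP_LT, OP_GT };

static int meta_compare(const struct meta_int *l, const struct meta_int *r,
			enum opnd op)
{
	unsigned long lv = meta_apply(l), rv = meta_apply(r);

	switch (op) {
	case OP_EQ: return lv == rv;
	case OP_LT: return lv <  rv;
	case OP_GT: return lv >  rv;
	}
	return 0;
}

int main(void)
{
	/* e.g. compare the upper byte of a 16-bit field against 0x12 */
	struct meta_int lhs = { .value = 0x1234, .shift = 8, .mask = 0xff };
	struct meta_int rhs = { .value = 0x12 };

	printf("match: %d\n", meta_compare(&lhs, &rhs, OP_EQ)); /* 1 */
	return 0;
}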
*/ -#include <linux/config.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> @@ -66,27 +66,25 @@ #include <linux/string.h> #include <linux/skbuff.h> #include <linux/random.h> +#include <linux/if_vlan.h> #include <linux/tc_ematch/tc_em_meta.h> #include <net/dst.h> #include <net/route.h> #include <net/pkt_cls.h> #include <net/sock.h> -struct meta_obj -{ +struct meta_obj { unsigned long value; unsigned int len; }; -struct meta_value -{ +struct meta_value { struct tcf_meta_val hdr; unsigned long val; unsigned int len; }; -struct meta_match -{ +struct meta_match { struct meta_value lvalue; struct meta_value rvalue; }; @@ -171,6 +169,23 @@ META_COLLECTOR(var_dev) } /************************************************************************** + * vlan tag + **************************************************************************/ + +META_COLLECTOR(int_vlan_tag) +{ + unsigned short tag; + + tag = vlan_tx_tag_get(skb); + if (!tag && __vlan_get_tag(skb, &tag)) + *err = -1; + else + dst->value = tag; +} + + + +/************************************************************************** * skb attributes **************************************************************************/ @@ -205,17 +220,18 @@ META_COLLECTOR(int_maclen) dst->value = skb->mac_len; } +META_COLLECTOR(int_rxhash) +{ + dst->value = skb_get_hash(skb); +} + /************************************************************************** * Netfilter **************************************************************************/ -META_COLLECTOR(int_nfmark) +META_COLLECTOR(int_mark) { -#ifdef CONFIG_NETFILTER - dst->value = skb->nfmark; -#else - dst->value = 0; -#endif + dst->value = skb->mark; } /************************************************************************** @@ -233,11 +249,11 @@ META_COLLECTOR(int_tcindex) META_COLLECTOR(int_rtclassid) { - if (unlikely(skb->dst == NULL)) + if (unlikely(skb_dst(skb) == NULL)) *err = -1; else -#ifdef CONFIG_NET_CLS_ROUTE - dst->value = skb->dst->tclassid; +#ifdef CONFIG_IP_ROUTE_CLASSID + dst->value = skb_dst(skb)->tclassid; #else dst->value = 0; #endif @@ -245,217 +261,299 @@ META_COLLECTOR(int_rtclassid) META_COLLECTOR(int_rtiif) { - if (unlikely(skb->dst == NULL)) + if (unlikely(skb_rtable(skb) == NULL)) *err = -1; else - dst->value = ((struct rtable*) skb->dst)->fl.iif; + dst->value = inet_iif(skb); } /************************************************************************** * Socket Attributes **************************************************************************/ -#define SKIP_NONLOCAL(skb) \ - if (unlikely(skb->sk == NULL)) { \ - *err = -1; \ - return; \ - } +#define skip_nonlocal(skb) \ + (unlikely(skb->sk == NULL)) META_COLLECTOR(int_sk_family) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_family; } META_COLLECTOR(int_sk_state) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_state; } META_COLLECTOR(int_sk_reuse) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_reuse; } META_COLLECTOR(int_sk_bound_if) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } /* No error if bound_dev_if is 0, legal userspace check */ dst->value = skb->sk->sk_bound_dev_if; } META_COLLECTOR(var_sk_bound_if) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } - if (skb->sk->sk_bound_dev_if == 0) { + if (skb->sk->sk_bound_dev_if == 0) { 
dst->value = (unsigned long) "any"; dst->len = 3; - } else { + } else { struct net_device *dev; - - dev = dev_get_by_index(skb->sk->sk_bound_dev_if); + + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(skb->sk), + skb->sk->sk_bound_dev_if); *err = var_dev(dev, dst); - if (dev) - dev_put(dev); - } + rcu_read_unlock(); + } } META_COLLECTOR(int_sk_refcnt) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = atomic_read(&skb->sk->sk_refcnt); } META_COLLECTOR(int_sk_rcvbuf) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_rcvbuf; } META_COLLECTOR(int_sk_shutdown) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_shutdown; } META_COLLECTOR(int_sk_proto) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_protocol; } META_COLLECTOR(int_sk_type) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_type; } META_COLLECTOR(int_sk_rmem_alloc) { - SKIP_NONLOCAL(skb); - dst->value = atomic_read(&skb->sk->sk_rmem_alloc); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } + dst->value = sk_rmem_alloc_get(skb->sk); } META_COLLECTOR(int_sk_wmem_alloc) { - SKIP_NONLOCAL(skb); - dst->value = atomic_read(&skb->sk->sk_wmem_alloc); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } + dst->value = sk_wmem_alloc_get(skb->sk); } META_COLLECTOR(int_sk_omem_alloc) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = atomic_read(&skb->sk->sk_omem_alloc); } META_COLLECTOR(int_sk_rcv_qlen) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_receive_queue.qlen; } META_COLLECTOR(int_sk_snd_qlen) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_write_queue.qlen; } META_COLLECTOR(int_sk_wmem_queued) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_wmem_queued; } META_COLLECTOR(int_sk_fwd_alloc) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_forward_alloc; } META_COLLECTOR(int_sk_sndbuf) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_sndbuf; } META_COLLECTOR(int_sk_alloc) { - SKIP_NONLOCAL(skb); - dst->value = skb->sk->sk_allocation; -} - -META_COLLECTOR(int_sk_route_caps) -{ - SKIP_NONLOCAL(skb); - dst->value = skb->sk->sk_route_caps; + if (skip_nonlocal(skb)) { + *err = -1; + return; + } + dst->value = (__force int) skb->sk->sk_allocation; } META_COLLECTOR(int_sk_hash) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_hash; } META_COLLECTOR(int_sk_lingertime) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_lingertime / HZ; } META_COLLECTOR(int_sk_err_qlen) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_error_queue.qlen; } META_COLLECTOR(int_sk_ack_bl) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_ack_backlog; } META_COLLECTOR(int_sk_max_ack_bl) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_max_ack_backlog; } META_COLLECTOR(int_sk_prio) { - SKIP_NONLOCAL(skb); + if 
(skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_priority; } META_COLLECTOR(int_sk_rcvlowat) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_rcvlowat; } META_COLLECTOR(int_sk_rcvtimeo) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_rcvtimeo / HZ; } META_COLLECTOR(int_sk_sndtimeo) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_sndtimeo / HZ; } META_COLLECTOR(int_sk_sendmsg_off) { - SKIP_NONLOCAL(skb); - dst->value = skb->sk->sk_sndmsg_off; + if (skip_nonlocal(skb)) { + *err = -1; + return; + } + dst->value = skb->sk->sk_frag.offset; } META_COLLECTOR(int_sk_write_pend) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_write_pending; } @@ -463,8 +561,7 @@ META_COLLECTOR(int_sk_write_pend) * Meta value collectors assignment table **************************************************************************/ -struct meta_ops -{ +struct meta_ops { void (*get)(struct sk_buff *, struct tcf_pkt_info *, struct meta_value *, struct meta_obj *, int *); }; @@ -474,7 +571,7 @@ struct meta_ops /* Meta value operations table listing all meta value collectors and * assigns them to a type and meta id. */ -static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { +static struct meta_ops __meta_ops[TCF_META_TYPE_MAX + 1][TCF_META_ID_MAX + 1] = { [TCF_META_TYPE_VAR] = { [META_ID(DEV)] = META_FUNC(var_dev), [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if), @@ -491,7 +588,7 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(PKTLEN)] = META_FUNC(int_pktlen), [META_ID(DATALEN)] = META_FUNC(int_datalen), [META_ID(MACLEN)] = META_FUNC(int_maclen), - [META_ID(NFMARK)] = META_FUNC(int_nfmark), + [META_ID(NFMARK)] = META_FUNC(int_mark), [META_ID(TCINDEX)] = META_FUNC(int_tcindex), [META_ID(RTCLASSID)] = META_FUNC(int_rtclassid), [META_ID(RTIIF)] = META_FUNC(int_rtiif), @@ -514,7 +611,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(SK_ERR_QLEN)] = META_FUNC(int_sk_err_qlen), [META_ID(SK_FORWARD_ALLOCS)] = META_FUNC(int_sk_fwd_alloc), [META_ID(SK_ALLOCS)] = META_FUNC(int_sk_alloc), - [META_ID(SK_ROUTE_CAPS)] = META_FUNC(int_sk_route_caps), [META_ID(SK_HASH)] = META_FUNC(int_sk_hash), [META_ID(SK_LINGERTIME)] = META_FUNC(int_sk_lingertime), [META_ID(SK_ACK_BACKLOG)] = META_FUNC(int_sk_ack_bl), @@ -525,10 +621,12 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(SK_SNDTIMEO)] = META_FUNC(int_sk_sndtimeo), [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off), [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend), + [META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag), + [META_ID(RXHASH)] = META_FUNC(int_rxhash), } }; -static inline struct meta_ops * meta_ops(struct meta_value *val) +static inline struct meta_ops *meta_ops(struct meta_value *val) { return &__meta_ops[meta_type(val)][meta_id(val)]; } @@ -547,14 +645,13 @@ static int meta_var_compare(struct meta_obj *a, struct meta_obj *b) return r; } -static int meta_var_change(struct meta_value *dst, struct rtattr *rta) +static int meta_var_change(struct meta_value *dst, struct nlattr *nla) { - int len = RTA_PAYLOAD(rta); + int len = nla_len(nla); - dst->val = (unsigned long) kmalloc(len, GFP_KERNEL); + dst->val = (unsigned long)kmemdup(nla_data(nla), len, GFP_KERNEL); if (dst->val == 
0UL) return -ENOMEM; - memcpy((void *) dst->val, RTA_DATA(rta), len); dst->len = len; return 0; } @@ -575,11 +672,12 @@ static void meta_var_apply_extras(struct meta_value *v, static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv) { - if (v->val && v->len) - RTA_PUT(skb, tlv, v->len, (void *) v->val); + if (v->val && v->len && + nla_put(skb, tlv, v->len, (void *) v->val)) + goto nla_put_failure; return 0; -rtattr_failure: +nla_put_failure: return -1; } @@ -600,13 +698,13 @@ static int meta_int_compare(struct meta_obj *a, struct meta_obj *b) return 1; } -static int meta_int_change(struct meta_value *dst, struct rtattr *rta) +static int meta_int_change(struct meta_value *dst, struct nlattr *nla) { - if (RTA_PAYLOAD(rta) >= sizeof(unsigned long)) { - dst->val = *(unsigned long *) RTA_DATA(rta); + if (nla_len(nla) >= sizeof(unsigned long)) { + dst->val = *(unsigned long *) nla_data(nla); dst->len = sizeof(unsigned long); - } else if (RTA_PAYLOAD(rta) == sizeof(u32)) { - dst->val = *(u32 *) RTA_DATA(rta); + } else if (nla_len(nla) == sizeof(u32)) { + dst->val = nla_get_u32(nla); dst->len = sizeof(u32); } else return -EINVAL; @@ -626,16 +724,17 @@ static void meta_int_apply_extras(struct meta_value *v, static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv) { - if (v->len == sizeof(unsigned long)) - RTA_PUT(skb, tlv, sizeof(unsigned long), &v->val); - else if (v->len == sizeof(u32)) { - u32 d = v->val; - RTA_PUT(skb, tlv, sizeof(d), &d); + if (v->len == sizeof(unsigned long)) { + if (nla_put(skb, tlv, sizeof(unsigned long), &v->val)) + goto nla_put_failure; + } else if (v->len == sizeof(u32)) { + if (nla_put_u32(skb, tlv, v->val)) + goto nla_put_failure; } return 0; -rtattr_failure: +nla_put_failure: return -1; } @@ -643,16 +742,15 @@ rtattr_failure: * Type specific operations table **************************************************************************/ -struct meta_type_ops -{ +struct meta_type_ops { void (*destroy)(struct meta_value *); int (*compare)(struct meta_obj *, struct meta_obj *); - int (*change)(struct meta_value *, struct rtattr *); + int (*change)(struct meta_value *, struct nlattr *); void (*apply_extras)(struct meta_value *, struct meta_obj *); int (*dump)(struct sk_buff *, struct meta_value *, int); }; -static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = { +static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = { [TCF_META_TYPE_VAR] = { .destroy = meta_var_destroy, .compare = meta_var_compare, @@ -668,7 +766,7 @@ static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = { } }; -static inline struct meta_type_ops * meta_type_ops(struct meta_value *v) +static inline struct meta_type_ops *meta_type_ops(struct meta_value *v) { return &__meta_type_ops[meta_type(v)]; } @@ -677,8 +775,8 @@ static inline struct meta_type_ops * meta_type_ops(struct meta_value *v) * Core **************************************************************************/ -static inline int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info, - struct meta_value *v, struct meta_obj *dst) +static int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info, + struct meta_value *v, struct meta_obj *dst) { int err = 0; @@ -693,7 +791,7 @@ static inline int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info, return err; if (meta_type_ops(v)->apply_extras) - meta_type_ops(v)->apply_extras(v, dst); + meta_type_ops(v)->apply_extras(v, dst); return 0; } @@ -712,36 +810,38 @@ static int em_meta_match(struct sk_buff *skb, struct 
tcf_ematch *m, r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value); switch (meta->lvalue.hdr.op) { - case TCF_EM_OPND_EQ: - return !r; - case TCF_EM_OPND_LT: - return r < 0; - case TCF_EM_OPND_GT: - return r > 0; + case TCF_EM_OPND_EQ: + return !r; + case TCF_EM_OPND_LT: + return r < 0; + case TCF_EM_OPND_GT: + return r > 0; } return 0; } -static inline void meta_delete(struct meta_match *meta) +static void meta_delete(struct meta_match *meta) { - struct meta_type_ops *ops = meta_type_ops(&meta->lvalue); + if (meta) { + struct meta_type_ops *ops = meta_type_ops(&meta->lvalue); - if (ops && ops->destroy) { - ops->destroy(&meta->lvalue); - ops->destroy(&meta->rvalue); + if (ops && ops->destroy) { + ops->destroy(&meta->lvalue); + ops->destroy(&meta->rvalue); + } } kfree(meta); } -static inline int meta_change_data(struct meta_value *dst, struct rtattr *rta) +static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla) { - if (rta) { - if (RTA_PAYLOAD(rta) == 0) + if (nla) { + if (nla_len(nla) == 0) return -EINVAL; - return meta_type_ops(dst)->change(dst, rta); + return meta_type_ops(dst)->change(dst, nla); } return 0; @@ -749,24 +849,29 @@ static inline int meta_change_data(struct meta_value *dst, struct rtattr *rta) static inline int meta_is_supported(struct meta_value *val) { - return (!meta_id(val) || meta_ops(val)->get); + return !meta_id(val) || meta_ops(val)->get; } +static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = { + [TCA_EM_META_HDR] = { .len = sizeof(struct tcf_meta_hdr) }, +}; + static int em_meta_change(struct tcf_proto *tp, void *data, int len, struct tcf_ematch *m) { - int err = -EINVAL; - struct rtattr *tb[TCA_EM_META_MAX]; + int err; + struct nlattr *tb[TCA_EM_META_MAX + 1]; struct tcf_meta_hdr *hdr; struct meta_match *meta = NULL; - - if (rtattr_parse(tb, TCA_EM_META_MAX, data, len) < 0) + + err = nla_parse(tb, TCA_EM_META_MAX, data, len, meta_policy); + if (err < 0) goto errout; - if (tb[TCA_EM_META_HDR-1] == NULL || - RTA_PAYLOAD(tb[TCA_EM_META_HDR-1]) < sizeof(*hdr)) + err = -EINVAL; + if (tb[TCA_EM_META_HDR] == NULL) goto errout; - hdr = RTA_DATA(tb[TCA_EM_META_HDR-1]); + hdr = nla_data(tb[TCA_EM_META_HDR]); if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) || TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX || @@ -774,10 +879,11 @@ static int em_meta_change(struct tcf_proto *tp, void *data, int len, TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX) goto errout; - meta = kmalloc(sizeof(*meta), GFP_KERNEL); - if (meta == NULL) + meta = kzalloc(sizeof(*meta), GFP_KERNEL); + if (meta == NULL) { + err = -ENOMEM; goto errout; - memset(meta, 0, sizeof(*meta)); + } memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left)); memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right)); @@ -788,8 +894,8 @@ static int em_meta_change(struct tcf_proto *tp, void *data, int len, goto errout; } - if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE-1]) < 0 || - meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE-1]) < 0) + if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE]) < 0 || + meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE]) < 0) goto errout; m->datalen = sizeof(*meta); @@ -818,18 +924,19 @@ static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em) memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left)); memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right)); - RTA_PUT(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr); + if (nla_put(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr)) + goto nla_put_failure; 
ops = meta_type_ops(&meta->lvalue); if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 || ops->dump(skb, &meta->rvalue, TCA_EM_META_RVALUE) < 0) - goto rtattr_failure; + goto nla_put_failure; return 0; -rtattr_failure: +nla_put_failure: return -1; -} +} static struct tcf_ematch_ops em_meta_ops = { .kind = TCF_EM_META, @@ -846,7 +953,7 @@ static int __init init_em_meta(void) return tcf_em_register(&em_meta_ops); } -static void __exit exit_em_meta(void) +static void __exit exit_em_meta(void) { tcf_em_unregister(&em_meta_ops); } @@ -855,3 +962,5 @@ MODULE_LICENSE("GPL"); module_init(init_em_meta); module_exit(exit_em_meta); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_META); diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index 71ea926a9f0..a3bed07a008 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -9,22 +9,20 @@ * Authors: Thomas Graf <tgraf@suug.ch> */ -#include <linux/config.h> +#include <linux/gfp.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> #include <linux/skbuff.h> #include <linux/tc_ematch/tc_em_nbyte.h> #include <net/pkt_cls.h> -struct nbyte_data -{ +struct nbyte_data { struct tcf_em_nbyte hdr; char pattern[0]; }; - + static int em_nbyte_change(struct tcf_proto *tp, void *data, int data_len, struct tcf_ematch *em) { @@ -35,12 +33,10 @@ static int em_nbyte_change(struct tcf_proto *tp, void *data, int data_len, return -EINVAL; em->datalen = sizeof(*nbyte) + nbyte->len; - em->data = (unsigned long) kmalloc(em->datalen, GFP_KERNEL); + em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL); if (em->data == 0UL) return -ENOBUFS; - memcpy((void *) em->data, data, em->datalen); - return 0; } @@ -71,7 +67,7 @@ static int __init init_em_nbyte(void) return tcf_em_register(&em_nbyte_ops); } -static void __exit exit_em_nbyte(void) +static void __exit exit_em_nbyte(void) { tcf_em_unregister(&em_nbyte_ops); } @@ -80,3 +76,5 @@ MODULE_LICENSE("GPL"); module_init(init_em_nbyte); module_exit(exit_em_nbyte); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_NBYTE); diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 77beabc91fa..15d353d2e4b 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -9,19 +9,17 @@ * Authors: Thomas Graf <tgraf@suug.ch> */ -#include <linux/config.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> #include <linux/skbuff.h> #include <linux/textsearch.h> #include <linux/tc_ematch/tc_em_text.h> #include <net/pkt_cls.h> -struct text_match -{ +struct text_match { u16 from_offset; u16 to_offset; u8 from_layer; @@ -104,7 +102,8 @@ retry: static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m) { - textsearch_destroy(EM_TEXT_PRIV(m)->config); + if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) + textsearch_destroy(EM_TEXT_PRIV(m)->config); } static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m) @@ -120,13 +119,16 @@ static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m) conf.pattern_len = textsearch_get_pattern_len(tm->config); conf.pad = 0; - RTA_PUT_NOHDR(skb, sizeof(conf), &conf); - RTA_APPEND(skb, conf.pattern_len, textsearch_get_pattern(tm->config)); + if (nla_put_nohdr(skb, sizeof(conf), &conf) < 0) + goto nla_put_failure; + if (nla_append(skb, conf.pattern_len, + textsearch_get_pattern(tm->config)) < 0) + goto nla_put_failure; return 0; -rtattr_failure: +nla_put_failure: return -1; -} +} static struct tcf_ematch_ops 
em_text_ops = { .kind = TCF_EM_TEXT, @@ -143,7 +145,7 @@ static int __init init_em_text(void) return tcf_em_register(&em_text_ops); } -static void __exit exit_em_text(void) +static void __exit exit_em_text(void) { tcf_em_unregister(&em_text_ops); } @@ -152,3 +154,5 @@ MODULE_LICENSE("GPL"); module_init(init_em_text); module_exit(exit_em_text); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_TEXT); diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c index 34e7e51e601..797bdb88c01 100644 --- a/net/sched/em_u32.c +++ b/net/sched/em_u32.c @@ -12,7 +12,6 @@ * Based on net/sched/cls_u32.c */ -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> @@ -23,8 +22,8 @@ static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em, struct tcf_pkt_info *info) { struct tc_u32_key *key = (struct tc_u32_key *) em->data; - unsigned char *ptr = skb->nh.raw; - + const unsigned char *ptr = skb_network_header(skb); + if (info) { if (info->ptr) ptr = info->ptr; @@ -35,8 +34,8 @@ static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em, if (!tcf_valid_offset(skb, ptr, sizeof(u32))) return 0; - - return !(((*(u32*) ptr) ^ key->val) & key->mask); + + return !(((*(__be32 *) ptr) ^ key->val) & key->mask); } static struct tcf_ematch_ops em_u32_ops = { @@ -52,7 +51,7 @@ static int __init init_em_u32(void) return tcf_em_register(&em_u32_ops); } -static void __exit exit_em_u32(void) +static void __exit exit_em_u32(void) { tcf_em_unregister(&em_u32_ops); } @@ -61,3 +60,5 @@ MODULE_LICENSE("GPL"); module_init(init_em_u32); module_exit(exit_em_u32); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_U32); diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 5cb956b721e..3a633debb6d 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -37,12 +37,12 @@ * --------<-POP--------- * * where B is a virtual ematch referencing to sequence starting with B1. - * + * * ========================================================================== * * How to write an ematch in 60 seconds * ------------------------------------ - * + * * 1) Provide a matcher function: * static int my_match(struct sk_buff *skb, struct tcf_ematch *m, * struct tcf_pkt_info *info) @@ -71,7 +71,7 @@ * * static void __exit exit_my_ematch(void) * { - * return tcf_em_unregister(&my_ops); + * tcf_em_unregister(&my_ops); * } * * module_init(init_my_ematch); @@ -81,14 +81,11 @@ * open up a beer to watch the compilation going. */ -#include <linux/config.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> #include <linux/errno.h> -#include <linux/interrupt.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <net/pkt_cls.h> @@ -96,7 +93,7 @@ static LIST_HEAD(ematch_ops); static DEFINE_RWLOCK(ematch_mod_lock); -static inline struct tcf_ematch_ops * tcf_em_lookup(u16 kind) +static struct tcf_ematch_ops *tcf_em_lookup(u16 kind) { struct tcf_ematch_ops *e = NULL; @@ -116,7 +113,7 @@ static inline struct tcf_ematch_ops * tcf_em_lookup(u16 kind) /** * tcf_em_register - register an extended match - * + * * @ops: ematch operations lookup table * * This function must be called by ematches to announce their presence. @@ -145,6 +142,7 @@ errout: write_unlock(&ematch_mod_lock); return err; } +EXPORT_SYMBOL(tcf_em_register); /** * tcf_em_unregister - unregster and extended match @@ -157,27 +155,16 @@ errout: * * Returns -ENOENT if no matching ematch was found. 
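[Editor's illustration] The diagram at the top of ematch.c above describes how a sequence of ematches is walked with AND/OR relations (containers push/pop a nested sequence). A simplified stand-alone sketch of the flat case follows: each entry's result may be inverted, and the walk stops early when an AND sees a miss or an OR sees a hit. Container/jump handling and the kernel's evaluation stack are deliberately left out, and the names are illustrative, not the kernel's.

#include <stdio.h>

/* Relation of one ematch to the next one in the sequence. */
enum em_rel { REL_END, REL_AND, REL_OR };

struct em {
	int (*match)(void *ctx);  /* the ematch's match callback   */
	int inverted;             /* TCF_EM_INVERT-style flag      */
	enum em_rel rel;          /* relation to the next ematch   */
};

/* Short-circuit walk over a flat chain: "A and B or C" style. */
static int em_tree_match(const struct em *seq, int n, void *ctx)
{
	int res = 1;   /* an empty tree is treated as a match */

	for (int i = 0; i < n; i++) {
		res = seq[i].match(ctx);
		if (seq[i].inverted)
			res = !res;

		/* early end: AND with a miss, or OR with a hit */
		if (seq[i].rel == REL_AND && !res)
			break;
		if (seq[i].rel == REL_OR && res)
			break;
	}
	return res;
}

static int always(void *ctx) { (void)ctx; return 1; }
static int never(void *ctx)  { (void)ctx; return 0; }

int main(void)
{
	/* "not never and always"  ==>  1 */
	struct em seq[] = {
		{ never,  1, REL_AND },
		{ always, 0, REL_END },
	};
	printf("%d\n", em_tree_match(seq, 2, NULL));
	return 0;
}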
*/ -int tcf_em_unregister(struct tcf_ematch_ops *ops) +void tcf_em_unregister(struct tcf_ematch_ops *ops) { - int err = 0; - struct tcf_ematch_ops *e; - write_lock(&ematch_mod_lock); - list_for_each_entry(e, &ematch_ops, link) { - if (e == ops) { - list_del(&e->link); - goto out; - } - } - - err = -ENOENT; -out: + list_del(&ops->link); write_unlock(&ematch_mod_lock); - return err; } +EXPORT_SYMBOL(tcf_em_unregister); -static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree, - int index) +static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree, + int index) { return &tree->matches[index]; } @@ -185,11 +172,11 @@ static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree, static int tcf_em_validate(struct tcf_proto *tp, struct tcf_ematch_tree_hdr *tree_hdr, - struct tcf_ematch *em, struct rtattr *rta, int idx) + struct tcf_ematch *em, struct nlattr *nla, int idx) { int err = -EINVAL; - struct tcf_ematch_hdr *em_hdr = RTA_DATA(rta); - int data_len = RTA_PAYLOAD(rta) - sizeof(*em_hdr); + struct tcf_ematch_hdr *em_hdr = nla_data(nla); + int data_len = nla_len(nla) - sizeof(*em_hdr); void *data = (void *) em_hdr + sizeof(*em_hdr); if (!TCF_EM_REL_VALID(em_hdr->flags)) @@ -197,7 +184,8 @@ static int tcf_em_validate(struct tcf_proto *tp, if (em_hdr->kind == TCF_EM_CONTAINER) { /* Special ematch called "container", carries an index - * referencing an external ematch sequence. */ + * referencing an external ematch sequence. + */ u32 ref; if (data_len < sizeof(ref)) @@ -208,11 +196,12 @@ static int tcf_em_validate(struct tcf_proto *tp, goto errout; /* We do not allow backward jumps to avoid loops and jumps - * to our own position are of course illegal. */ + * to our own position are of course illegal. + */ if (ref <= idx) goto errout; - + em->data = ref; } else { /* Note: This lookup will increase the module refcnt @@ -221,16 +210,32 @@ static int tcf_em_validate(struct tcf_proto *tp, * which automatically releases the reference again, therefore * the module MUST not be given back under any circumstances * here. Be aware, the destroy function assumes that the - * module is held if the ops field is non zero. */ + * module is held if the ops field is non zero. + */ em->ops = tcf_em_lookup(em_hdr->kind); if (em->ops == NULL) { err = -ENOENT; +#ifdef CONFIG_MODULES + __rtnl_unlock(); + request_module("ematch-kind-%u", em_hdr->kind); + rtnl_lock(); + em->ops = tcf_em_lookup(em_hdr->kind); + if (em->ops) { + /* We dropped the RTNL mutex in order to + * perform the module load. Tell the caller + * to replay the request. + */ + module_put(em->ops->owner); + err = -EAGAIN; + } +#endif goto errout; } /* ematch module provides expected length of data, so we - * can do a basic sanity check. */ + * can do a basic sanity check. + */ if (em->ops->datalen && data_len < em->ops->datalen) goto errout; @@ -246,18 +251,18 @@ static int tcf_em_validate(struct tcf_proto *tp, * TCF_EM_SIMPLE may be specified stating that the * data only consists of a u32 integer and the module * does not expected a memory reference but rather - * the value carried. */ + * the value carried. 
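[Editor's illustration] To make the TCF_EM_SIMPLE case just described concrete, here is a stand-alone sketch of the two storage strategies: a "simple" ematch carries a u32 by value inside the unsigned long data word, while any other ematch owns a private copy of its configuration blob (kmemdup() in the kernel, malloc()/memcpy() here) and must release it on destroy. All identifiers are invented for the example.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

/*
 * Either a plain u32 stored by value (the "simple" case) or an owned
 * copy of the configuration blob, both squeezed into one unsigned long.
 */
struct ematch_data {
	int simple;
	unsigned long data;   /* value or pointer, depending on 'simple' */
	int datalen;
};

static int em_set(struct ematch_data *em, const void *cfg, int len, int simple)
{
	if (simple) {
		if (len < (int)sizeof(uint32_t))
			return -1;
		em->data = *(const uint32_t *)cfg;  /* carry the value itself */
	} else {
		void *copy = malloc(len);           /* kmemdup() stand-in */
		if (!copy)
			return -1;
		memcpy(copy, cfg, len);
		em->data = (unsigned long)copy;
	}
	em->simple = simple;
	em->datalen = len;
	return 0;
}

static void em_free(struct ematch_data *em)
{
	if (!em->simple)                 /* only the copied blob is owned */
		free((void *)em->data);
	em->data = 0;
}

int main(void)
{
	struct ematch_data em = { 0 };
	uint32_t v = 42;

	em_set(&em, &v, sizeof(v), 1);
	printf("simple value: %lu\n", em.data);
	em_free(&em);
	return 0;
}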
+ */ if (em_hdr->flags & TCF_EM_SIMPLE) { if (data_len < sizeof(u32)) goto errout; em->data = *(u32 *) data; } else { - void *v = kmalloc(data_len, GFP_KERNEL); + void *v = kmemdup(data, data_len, GFP_KERNEL); if (v == NULL) { err = -ENOBUFS; goto errout; } - memcpy(v, data, data_len); em->data = (unsigned long) v; } } @@ -272,15 +277,20 @@ errout: return err; } +static const struct nla_policy em_policy[TCA_EMATCH_TREE_MAX + 1] = { + [TCA_EMATCH_TREE_HDR] = { .len = sizeof(struct tcf_ematch_tree_hdr) }, + [TCA_EMATCH_TREE_LIST] = { .type = NLA_NESTED }, +}; + /** * tcf_em_tree_validate - validate ematch config TLV and build ematch tree * * @tp: classifier kind handle - * @rta: ematch tree configuration TLV + * @nla: ematch tree configuration TLV * @tree: destination ematch tree variable to store the resulting * ematch tree. * - * This function validates the given configuration TLV @rta and builds an + * This function validates the given configuration TLV @nla and builds an * ematch tree in @tree. The resulting tree must later be copied into * the private classifier data using tcf_em_tree_change(). You MUST NOT * provide the ematch tree variable of the private classifier data directly, @@ -288,63 +298,60 @@ errout: * * Returns a negative error code if the configuration TLV contains errors. */ -int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta, +int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla, struct tcf_ematch_tree *tree) { - int idx, list_len, matches_len, err = -EINVAL; - struct rtattr *tb[TCA_EMATCH_TREE_MAX]; - struct rtattr *rt_match, *rt_hdr, *rt_list; + int idx, list_len, matches_len, err; + struct nlattr *tb[TCA_EMATCH_TREE_MAX + 1]; + struct nlattr *rt_match, *rt_hdr, *rt_list; struct tcf_ematch_tree_hdr *tree_hdr; struct tcf_ematch *em; - if (!rta) { - memset(tree, 0, sizeof(*tree)); + memset(tree, 0, sizeof(*tree)); + if (!nla) return 0; - } - if (rtattr_parse_nested(tb, TCA_EMATCH_TREE_MAX, rta) < 0) + err = nla_parse_nested(tb, TCA_EMATCH_TREE_MAX, nla, em_policy); + if (err < 0) goto errout; - rt_hdr = tb[TCA_EMATCH_TREE_HDR-1]; - rt_list = tb[TCA_EMATCH_TREE_LIST-1]; + err = -EINVAL; + rt_hdr = tb[TCA_EMATCH_TREE_HDR]; + rt_list = tb[TCA_EMATCH_TREE_LIST]; if (rt_hdr == NULL || rt_list == NULL) goto errout; - if (RTA_PAYLOAD(rt_hdr) < sizeof(*tree_hdr) || - RTA_PAYLOAD(rt_list) < sizeof(*rt_match)) - goto errout; - - tree_hdr = RTA_DATA(rt_hdr); + tree_hdr = nla_data(rt_hdr); memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr)); - rt_match = RTA_DATA(rt_list); - list_len = RTA_PAYLOAD(rt_list); + rt_match = nla_data(rt_list); + list_len = nla_len(rt_list); matches_len = tree_hdr->nmatches * sizeof(*em); - tree->matches = kmalloc(matches_len, GFP_KERNEL); + tree->matches = kzalloc(matches_len, GFP_KERNEL); if (tree->matches == NULL) goto errout; - memset(tree->matches, 0, matches_len); - /* We do not use rtattr_parse_nested here because the maximum + /* We do not use nla_parse_nested here because the maximum * number of attributes is unknown. This saves us the allocation * for a tb buffer which would serve no purpose at all. - * + * * The array of rt attributes is parsed in the order as they are * provided, their type must be incremental from 1 to n. Even * if it does not serve any real purpose, a failure of sticking - * to this policy will result in parsing failure. */ - for (idx = 0; RTA_OK(rt_match, list_len); idx++) { + * to this policy will result in parsing failure. 
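[Editor's illustration] The ordering policy described above (the i-th attribute must carry type i + 1, and the total count must match the tree header) can be shown with a tiny stand-alone TLV walker. Note that the alignment padding real netlink attributes require is ignored here, and struct tlv merely stands in for struct nlattr.

#include <stdint.h>
#include <stdio.h>

/* Minimal TLV header, standing in for struct nlattr (type + length). */
struct tlv {
	uint16_t len;    /* total length including this header */
	uint16_t type;   /* 1-based, must increase by exactly 1 */
};

/*
 * Walk a buffer of TLVs and enforce the ordering policy described in
 * tcf_em_tree_validate(): the i-th attribute must carry type i + 1.
 * (No NLA_ALIGN-style padding is applied in this sketch.)
 */
static int check_order(const unsigned char *buf, int buflen, int nmatches)
{
	int idx = 0;

	while (buflen >= (int)sizeof(struct tlv)) {
		const struct tlv *a = (const struct tlv *)buf;

		if (a->len < sizeof(struct tlv) || a->len > buflen)
			return -1;              /* malformed */
		if (a->type != idx + 1)
			return -1;              /* out of order */
		if (idx >= nmatches)
			return -1;              /* more TLVs than announced */

		buf += a->len;
		buflen -= a->len;
		idx++;
	}
	return idx == nmatches ? 0 : -1;        /* count must match header */
}

int main(void)
{
	/* two empty attributes with types 1 and 2 */
	struct tlv ok[2] = { { sizeof(struct tlv), 1 },
			     { sizeof(struct tlv), 2 } };

	printf("%d\n", check_order((unsigned char *)ok, sizeof(ok), 2)); /* 0 */
	return 0;
}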
+ */ + for (idx = 0; nla_ok(rt_match, list_len); idx++) { err = -EINVAL; - if (rt_match->rta_type != (idx + 1)) + if (rt_match->nla_type != (idx + 1)) goto errout_abort; if (idx >= tree_hdr->nmatches) goto errout_abort; - if (RTA_PAYLOAD(rt_match) < sizeof(struct tcf_ematch_hdr)) + if (nla_len(rt_match) < sizeof(struct tcf_ematch_hdr)) goto errout_abort; em = tcf_em_get_match(tree, idx); @@ -353,13 +360,14 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta, if (err < 0) goto errout_abort; - rt_match = RTA_NEXT(rt_match, list_len); + rt_match = nla_next(rt_match, &list_len); } /* Check if the number of matches provided by userspace actually * complies with the array of matches. The number was used for * the validation of references and a mismatch could lead to - * undefined references during the matching process. */ + * undefined references during the matching process. + */ if (idx != tree_hdr->nmatches) { err = -EINVAL; goto errout_abort; @@ -373,6 +381,7 @@ errout_abort: tcf_em_tree_destroy(tp, tree); return err; } +EXPORT_SYMBOL(tcf_em_tree_validate); /** * tcf_em_tree_destroy - destroy an ematch tree @@ -397,15 +406,17 @@ void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree) if (em->ops) { if (em->ops->destroy) em->ops->destroy(tp, em); - else if (!tcf_em_is_simple(em) && em->data) + else if (!tcf_em_is_simple(em)) kfree((void *) em->data); module_put(em->ops->owner); } } - + tree->hdr.nmatches = 0; kfree(tree->matches); + tree->matches = NULL; } +EXPORT_SYMBOL(tcf_em_tree_destroy); /** * tcf_em_tree_dump - dump ematch tree into a rtnl message @@ -422,17 +433,24 @@ void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree) int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) { int i; - struct rtattr * top_start = (struct rtattr*) skb->tail; - struct rtattr * list_start; + u8 *tail; + struct nlattr *top_start; + struct nlattr *list_start; - RTA_PUT(skb, tlv, 0, NULL); - RTA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr); + top_start = nla_nest_start(skb, tlv); + if (top_start == NULL) + goto nla_put_failure; - list_start = (struct rtattr *) skb->tail; - RTA_PUT(skb, TCA_EMATCH_TREE_LIST, 0, NULL); + if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr)) + goto nla_put_failure; + list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST); + if (list_start == NULL) + goto nla_put_failure; + + tail = skb_tail_pointer(skb); for (i = 0; i < tree->hdr.nmatches; i++) { - struct rtattr *match_start = (struct rtattr*) skb->tail; + struct nlattr *match_start = (struct nlattr *)tail; struct tcf_ematch *em = tcf_em_get_match(tree, i); struct tcf_ematch_hdr em_hdr = { .kind = em->ops ? 
em->ops->kind : TCF_EM_CONTAINER, @@ -440,33 +458,37 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) .flags = em->flags }; - RTA_PUT(skb, i+1, sizeof(em_hdr), &em_hdr); + if (nla_put(skb, i + 1, sizeof(em_hdr), &em_hdr)) + goto nla_put_failure; if (em->ops && em->ops->dump) { if (em->ops->dump(skb, em) < 0) - goto rtattr_failure; + goto nla_put_failure; } else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) { u32 u = em->data; - RTA_PUT_NOHDR(skb, sizeof(u), &u); + nla_put_nohdr(skb, sizeof(u), &u); } else if (em->datalen > 0) - RTA_PUT_NOHDR(skb, em->datalen, (void *) em->data); + nla_put_nohdr(skb, em->datalen, (void *) em->data); - match_start->rta_len = skb->tail - (u8*) match_start; + tail = skb_tail_pointer(skb); + match_start->nla_len = tail - (u8 *)match_start; } - list_start->rta_len = skb->tail - (u8 *) list_start; - top_start->rta_len = skb->tail - (u8 *) top_start; + nla_nest_end(skb, list_start); + nla_nest_end(skb, top_start); return 0; -rtattr_failure: +nla_put_failure: return -1; } +EXPORT_SYMBOL(tcf_em_tree_dump); static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em, struct tcf_pkt_info *info) { int r = em->ops->match(skb, em, info); + return tcf_em_is_inverted(em) ? !r : r; } @@ -515,14 +537,7 @@ pop_stack: return res; stack_overflow: - if (net_ratelimit()) - printk("Local stack overflow, increase NET_EMATCH_STACK\n"); + net_warn_ratelimited("tc ematch: local stack overflow, increase NET_EMATCH_STACK\n"); return -1; } - -EXPORT_SYMBOL(tcf_em_register); -EXPORT_SYMBOL(tcf_em_unregister); -EXPORT_SYMBOL(tcf_em_tree_validate); -EXPORT_SYMBOL(tcf_em_tree_destroy); -EXPORT_SYMBOL(tcf_em_tree_dump); EXPORT_SYMBOL(__tcf_em_tree_match); diff --git a/net/sched/estimator.c b/net/sched/estimator.c deleted file mode 100644 index 5d3ae03e22a..00000000000 --- a/net/sched/estimator.c +++ /dev/null @@ -1,197 +0,0 @@ -/* - * net/sched/estimator.c Simple rate estimator. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - */ - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/jiffies.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> -#include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <linux/rtnetlink.h> -#include <linux/init.h> -#include <net/sock.h> -#include <net/pkt_sched.h> - -/* - This code is NOT intended to be used for statistics collection, - its purpose is to provide a base for statistical multiplexing - for controlled load service. - If you need only statistics, run a user level daemon which - periodically reads byte counters. - - Unfortunately, rate estimation is not a very easy task. - F.e. I did not find a simple way to estimate the current peak rate - and even failed to formulate the problem 8)8) - - So I preferred not to built an estimator into the scheduler, - but run this task separately. 
- Ideally, it should be kernel thread(s), but for now it runs - from timers, which puts apparent top bounds on the number of rated - flows, has minimal overhead on small, but is enough - to handle controlled load service, sets of aggregates. - - We measure rate over A=(1<<interval) seconds and evaluate EWMA: - - avrate = avrate*(1-W) + rate*W - - where W is chosen as negative power of 2: W = 2^(-ewma_log) - - The resulting time constant is: - - T = A/(-ln(1-W)) - - - NOTES. - - * The stored value for avbps is scaled by 2^5, so that maximal - rate is ~1Gbit, avpps is scaled by 2^10. - - * Minimal interval is HZ/4=250msec (it is the greatest common divisor - for HZ=100 and HZ=1024 8)), maximal interval - is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals - are too expensive, longer ones can be implemented - at user level painlessly. - */ - -#define EST_MAX_INTERVAL 5 - -struct qdisc_estimator -{ - struct qdisc_estimator *next; - struct tc_stats *stats; - spinlock_t *stats_lock; - unsigned interval; - int ewma_log; - u64 last_bytes; - u32 last_packets; - u32 avpps; - u32 avbps; -}; - -struct qdisc_estimator_head -{ - struct timer_list timer; - struct qdisc_estimator *list; -}; - -static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1]; - -/* Estimator array lock */ -static DEFINE_RWLOCK(est_lock); - -static void est_timer(unsigned long arg) -{ - int idx = (int)arg; - struct qdisc_estimator *e; - - read_lock(&est_lock); - for (e = elist[idx].list; e; e = e->next) { - struct tc_stats *st = e->stats; - u64 nbytes; - u32 npackets; - u32 rate; - - spin_lock(e->stats_lock); - nbytes = st->bytes; - npackets = st->packets; - rate = (nbytes - e->last_bytes)<<(7 - idx); - e->last_bytes = nbytes; - e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log; - st->bps = (e->avbps+0xF)>>5; - - rate = (npackets - e->last_packets)<<(12 - idx); - e->last_packets = npackets; - e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; - e->stats->pps = (e->avpps+0x1FF)>>10; - spin_unlock(e->stats_lock); - } - - mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4)); - read_unlock(&est_lock); -} - -int qdisc_new_estimator(struct tc_stats *stats, spinlock_t *stats_lock, struct rtattr *opt) -{ - struct qdisc_estimator *est; - struct tc_estimator *parm = RTA_DATA(opt); - - if (RTA_PAYLOAD(opt) < sizeof(*parm)) - return -EINVAL; - - if (parm->interval < -2 || parm->interval > 3) - return -EINVAL; - - est = kmalloc(sizeof(*est), GFP_KERNEL); - if (est == NULL) - return -ENOBUFS; - - memset(est, 0, sizeof(*est)); - est->interval = parm->interval + 2; - est->stats = stats; - est->stats_lock = stats_lock; - est->ewma_log = parm->ewma_log; - est->last_bytes = stats->bytes; - est->avbps = stats->bps<<5; - est->last_packets = stats->packets; - est->avpps = stats->pps<<10; - - est->next = elist[est->interval].list; - if (est->next == NULL) { - init_timer(&elist[est->interval].timer); - elist[est->interval].timer.data = est->interval; - elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4); - elist[est->interval].timer.function = est_timer; - add_timer(&elist[est->interval].timer); - } - write_lock_bh(&est_lock); - elist[est->interval].list = est; - write_unlock_bh(&est_lock); - return 0; -} - -void qdisc_kill_estimator(struct tc_stats *stats) -{ - int idx; - struct qdisc_estimator *est, **pest; - - for (idx=0; idx <= EST_MAX_INTERVAL; idx++) { - int killed = 0; - pest = &elist[idx].list; - while ((est=*pest) != NULL) { - if (est->stats != stats) { - pest = &est->next; - continue; - } - - 
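[Editor's illustration] The fixed-point EWMA documented in the (removed) estimator above can be exercised in isolation. The sketch below mirrors its arithmetic: the per-period byte delta is pre-scaled by 2^5 via the << (7 - idx) shift, folded into the average with W = 2^-ewma_log, and scaled back for reporting. The 1 MB/s input and the printed checkpoints are purely illustrative, and an arithmetic right shift is assumed, as in the original.

#include <stdio.h>
#include <stdint.h>

/*
 * Fixed-point EWMA as in the removed estimator:
 *   avrate = avrate + (rate - avrate) * W,  W = 2^(-ewma_log)
 * avbps is kept scaled by 2^5 so the >> ewma_log step keeps precision.
 */
struct est {
	uint64_t last_bytes;
	uint32_t avbps;        /* scaled by 2^5 */
	int idx;               /* period = 2^idx / 4 seconds */
	int ewma_log;
};

static uint32_t est_update(struct est *e, uint64_t nbytes)
{
	/* bytes/sec over the last period, pre-scaled by 2^5: << (7 - idx) */
	uint32_t rate = (uint32_t)((nbytes - e->last_bytes) << (7 - e->idx));

	e->last_bytes = nbytes;
	e->avbps += ((int32_t)rate - (int32_t)e->avbps) >> e->ewma_log;

	return (e->avbps + 0xF) >> 5;          /* back to plain bytes/sec */
}

int main(void)
{
	struct est e = { .idx = 2, .ewma_log = 3 };  /* 1 s period, W = 1/8 */
	uint64_t bytes = 0;

	/* constant 1 MB/s input: the EWMA converges towards 1000000 B/s */
	for (int t = 1; t <= 40; t++) {
		bytes += 1000000;
		uint32_t bps = est_update(&e, bytes);
		if (t % 10 == 0)
			printf("t=%2ds  est=%u B/s\n", t, bps);
	}
	return 0;
}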
write_lock_bh(&est_lock); - *pest = est->next; - write_unlock_bh(&est_lock); - - kfree(est); - killed++; - } - if (killed && elist[idx].list == NULL) - del_timer(&elist[idx].timer); - } -} - -EXPORT_SYMBOL(qdisc_kill_estimator); -EXPORT_SYMBOL(qdisc_new_estimator); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 31570b9a6e9..58bed7599db 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -15,39 +15,32 @@ * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support */ -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> -#include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kmod.h> #include <linux/list.h> -#include <linux/bitops.h> +#include <linux/hrtimer.h> +#include <linux/lockdep.h> +#include <linux/slab.h> +#include <net/net_namespace.h> #include <net/sock.h> +#include <net/netlink.h> #include <net/pkt_sched.h> -#include <asm/processor.h> -#include <asm/uaccess.h> -#include <asm/system.h> - -static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, +static int qdisc_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, u32 clid, struct Qdisc *old, struct Qdisc *new); -static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, - struct Qdisc *q, unsigned long cl, int event); +static int tclass_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct Qdisc *q, + unsigned long cl, int event); /* @@ -107,10 +100,9 @@ static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, Auxiliary routines: - ---requeue + ---peek - requeues once dequeued packet. It is used for non-standard or - just buggy devices, which can defer output even if dev->tbusy=0. 
+ like dequeue but without removing a packet from the queue ---reset @@ -143,7 +135,7 @@ static DEFINE_RWLOCK(qdisc_mod_lock); static struct Qdisc_ops *qdisc_base; -/* Register/uregister queueing discipline */ +/* Register/unregister queueing discipline */ int register_qdisc(struct Qdisc_ops *qops) { @@ -157,18 +149,37 @@ int register_qdisc(struct Qdisc_ops *qops) if (qops->enqueue == NULL) qops->enqueue = noop_qdisc_ops.enqueue; - if (qops->requeue == NULL) - qops->requeue = noop_qdisc_ops.requeue; + if (qops->peek == NULL) { + if (qops->dequeue == NULL) + qops->peek = noop_qdisc_ops.peek; + else + goto out_einval; + } if (qops->dequeue == NULL) qops->dequeue = noop_qdisc_ops.dequeue; + if (qops->cl_ops) { + const struct Qdisc_class_ops *cops = qops->cl_ops; + + if (!(cops->get && cops->put && cops->walk && cops->leaf)) + goto out_einval; + + if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf)) + goto out_einval; + } + qops->next = NULL; *qp = qops; rc = 0; out: write_unlock(&qdisc_mod_lock); return rc; + +out_einval: + rc = -EINVAL; + goto out; } +EXPORT_SYMBOL(register_qdisc); int unregister_qdisc(struct Qdisc_ops *qops) { @@ -176,7 +187,7 @@ int unregister_qdisc(struct Qdisc_ops *qops) int err = -ENOENT; write_lock(&qdisc_mod_lock); - for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) + for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) if (q == qops) break; if (q) { @@ -187,31 +198,118 @@ int unregister_qdisc(struct Qdisc_ops *qops) write_unlock(&qdisc_mod_lock); return err; } +EXPORT_SYMBOL(unregister_qdisc); + +/* Get default qdisc if not otherwise specified */ +void qdisc_get_default(char *name, size_t len) +{ + read_lock(&qdisc_mod_lock); + strlcpy(name, default_qdisc_ops->id, len); + read_unlock(&qdisc_mod_lock); +} + +static struct Qdisc_ops *qdisc_lookup_default(const char *name) +{ + struct Qdisc_ops *q = NULL; + + for (q = qdisc_base; q; q = q->next) { + if (!strcmp(name, q->id)) { + if (!try_module_get(q->owner)) + q = NULL; + break; + } + } + + return q; +} + +/* Set new default qdisc to use */ +int qdisc_set_default(const char *name) +{ + const struct Qdisc_ops *ops; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + write_lock(&qdisc_mod_lock); + ops = qdisc_lookup_default(name); + if (!ops) { + /* Not found, drop lock and try to load module */ + write_unlock(&qdisc_mod_lock); + request_module("sch_%s", name); + write_lock(&qdisc_mod_lock); + + ops = qdisc_lookup_default(name); + } + + if (ops) { + /* Set new default */ + module_put(default_qdisc_ops->owner); + default_qdisc_ops = ops; + } + write_unlock(&qdisc_mod_lock); + + return ops ? 0 : -ENOENT; +} /* We know handle. Find qdisc among all qdisc's attached to device (root qdisc, all its children, children of children etc.) 
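[Editor's illustration] qdisc_set_default() above follows a common kernel pattern: look the ops up by name and, if nothing is registered, drop the lock, try to load a module, and look again. Below is a stand-alone sketch of that flow with a stubbed-out loader standing in for request_module("sch_%s", name) and without any locking or module refcounting; every identifier in it is invented for the example.

#include <stdio.h>
#include <string.h>

/* A registered queueing discipline: just an id for this sketch. */
struct qdisc_ops {
	const char *id;
	struct qdisc_ops *next;
};

static struct qdisc_ops pfifo = { "pfifo_fast", NULL };
static struct qdisc_ops *qdisc_base = &pfifo;     /* registry head */

static struct qdisc_ops *lookup(const char *name)
{
	for (struct qdisc_ops *q = qdisc_base; q; q = q->next)
		if (!strcmp(q->id, name))
			return q;
	return NULL;
}

/* Stand-in for request_module("sch_%s", name): register fq on demand. */
static struct qdisc_ops fq = { "fq", NULL };
static void load_module(const char *name)
{
	if (!strcmp(name, "fq") && !lookup("fq")) {
		fq.next = qdisc_base;
		qdisc_base = &fq;
	}
}

/* Mirrors the shape of qdisc_set_default(): lookup, load, retry. */
static int set_default(const char *name, const struct qdisc_ops **def)
{
	const struct qdisc_ops *ops = lookup(name);

	if (!ops) {
		load_module(name);   /* the kernel drops the lock around this */
		ops = lookup(name);
	}
	if (ops)
		*def = ops;
	return ops ? 0 : -2;         /* -ENOENT in the kernel */
}

int main(void)
{
	const struct qdisc_ops *def = &pfifo;

	printf("set fq -> %d, default now %s\n",
	       set_default("fq", &def), def->id);
	return 0;
}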
*/ -struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) +static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) { struct Qdisc *q; - read_lock_bh(&qdisc_tree_lock); - list_for_each_entry(q, &dev->qdisc_list, list) { - if (q->handle == handle) { - read_unlock_bh(&qdisc_tree_lock); + if (!(root->flags & TCQ_F_BUILTIN) && + root->handle == handle) + return root; + + list_for_each_entry(q, &root->list, list) { + if (q->handle == handle) return q; - } } - read_unlock_bh(&qdisc_tree_lock); return NULL; } +void qdisc_list_add(struct Qdisc *q) +{ + if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { + struct Qdisc *root = qdisc_dev(q)->qdisc; + + WARN_ON_ONCE(root == &noop_qdisc); + list_add_tail(&q->list, &root->list); + } +} +EXPORT_SYMBOL(qdisc_list_add); + +void qdisc_list_del(struct Qdisc *q) +{ + if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) + list_del(&q->list); +} +EXPORT_SYMBOL(qdisc_list_del); + +struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) +{ + struct Qdisc *q; + + q = qdisc_match_from_root(dev->qdisc, handle); + if (q) + goto out; + + if (dev_ingress_queue(dev)) + q = qdisc_match_from_root( + dev_ingress_queue(dev)->qdisc_sleeping, + handle); +out: + return q; +} + static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) { unsigned long cl; struct Qdisc *leaf; - struct Qdisc_class_ops *cops = p->ops->cl_ops; + const struct Qdisc_class_ops *cops = p->ops->cl_ops; if (cops == NULL) return NULL; @@ -226,14 +324,14 @@ static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) /* Find queueing discipline by name */ -static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind) +static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) { struct Qdisc_ops *q = NULL; if (kind) { read_lock(&qdisc_mod_lock); for (q = qdisc_base; q; q = q->next) { - if (rtattr_strcmp(kind, q->id) == 0) { + if (nla_strcmp(kind, q->id) == 0) { if (!try_module_get(q->owner)) q = NULL; break; @@ -244,32 +342,76 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind) return q; } +/* The linklayer setting were not transferred from iproute2, in older + * versions, and the rate tables lookup systems have been dropped in + * the kernel. To keep backward compatible with older iproute2 tc + * utils, we detect the linklayer setting by detecting if the rate + * table were modified. + * + * For linklayer ATM table entries, the rate table will be aligned to + * 48 bytes, thus some table entries will contain the same value. The + * mpu (min packet unit) is also encoded into the old rate table, thus + * starting from the mpu, we find low and high table entries for + * mapping this cell. If these entries contain the same value, when + * the rate tables have been modified for linklayer ATM. + * + * This is done by rounding mpu to the nearest 48 bytes cell/entry, + * and then roundup to the next cell, calc the table entry one below, + * and compare. 
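A quick user-space check of the arithmetic described in the comment above, using typical iproute2 parameters; the numbers are illustrative only and not taken from the patch.

#include <stdio.h>

#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	/* Common values for an ATM-aligned rate table:
	 * mpu = 0, cell_log = 3 (8-byte buckets in the 256-entry rtab). */
	unsigned int mpu = 0, cell_log = 3;
	unsigned int low = ROUNDUP(mpu, 48);			/* 0  */
	unsigned int high = ROUNDUP(low + 1, 48);		/* 48 */
	unsigned int cell_low = low >> cell_log;		/* 0  */
	unsigned int cell_high = (high >> cell_log) - 1;	/* 5  */

	/* On an ATM link every packet of 1..48 bytes costs one 53-byte cell,
	 * so rtab[0]..rtab[5] hold the same transmit time and the kernel
	 * reports TC_LINKLAYER_ATM; on Ethernet the entries differ. */
	printf("compare rtab[%u] with rtab[%u]\n", cell_low, cell_high);
	return 0;
}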
+ */ +static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) +{ + int low = roundup(r->mpu, 48); + int high = roundup(low+1, 48); + int cell_low = low >> r->cell_log; + int cell_high = (high >> r->cell_log) - 1; + + /* rtab is too inaccurate at rates > 100Mbit/s */ + if ((r->rate > (100000000/8)) || (rtab[0] == 0)) { + pr_debug("TC linklayer: Giving up ATM detection\n"); + return TC_LINKLAYER_ETHERNET; + } + + if ((cell_high > cell_low) && (cell_high < 256) + && (rtab[cell_low] == rtab[cell_high])) { + pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n", + cell_low, cell_high, rtab[cell_high]); + return TC_LINKLAYER_ATM; + } + return TC_LINKLAYER_ETHERNET; +} + static struct qdisc_rate_table *qdisc_rtab_list; -struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab) +struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab) { struct qdisc_rate_table *rtab; + if (tab == NULL || r->rate == 0 || r->cell_log == 0 || + nla_len(tab) != TC_RTAB_SIZE) + return NULL; + for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { - if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) { + if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) && + !memcmp(&rtab->data, nla_data(tab), 1024)) { rtab->refcnt++; return rtab; } } - if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024) - return NULL; - rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); if (rtab) { rtab->rate = *r; rtab->refcnt = 1; - memcpy(rtab->data, RTA_DATA(tab), 1024); + memcpy(rtab->data, nla_data(tab), 1024); + if (r->linklayer == TC_LINKLAYER_UNAWARE) + r->linklayer = __detect_linklayer(r, rtab->data); rtab->next = qdisc_rtab_list; qdisc_rtab_list = rtab; } return rtab; } +EXPORT_SYMBOL(qdisc_get_rtab); void qdisc_put_rtab(struct qdisc_rate_table *tab) { @@ -278,7 +420,9 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab) if (!tab || --tab->refcnt) return; - for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) { + for (rtabp = &qdisc_rtab_list; + (rtab = *rtabp) != NULL; + rtabp = &rtab->next) { if (rtab == tab) { *rtabp = rtab->next; kfree(rtab); @@ -286,108 +430,441 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab) } } } +EXPORT_SYMBOL(qdisc_put_rtab); + +static LIST_HEAD(qdisc_stab_list); +static DEFINE_SPINLOCK(qdisc_stab_lock); + +static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = { + [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) }, + [TCA_STAB_DATA] = { .type = NLA_BINARY }, +}; + +static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt) +{ + struct nlattr *tb[TCA_STAB_MAX + 1]; + struct qdisc_size_table *stab; + struct tc_sizespec *s; + unsigned int tsize = 0; + u16 *tab = NULL; + int err; + err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy); + if (err < 0) + return ERR_PTR(err); + if (!tb[TCA_STAB_BASE]) + return ERR_PTR(-EINVAL); -/* Allocate an unique handle from space managed by kernel */ + s = nla_data(tb[TCA_STAB_BASE]); + if (s->tsize > 0) { + if (!tb[TCA_STAB_DATA]) + return ERR_PTR(-EINVAL); + tab = nla_data(tb[TCA_STAB_DATA]); + tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16); + } + + if (tsize != s->tsize || (!tab && tsize > 0)) + return ERR_PTR(-EINVAL); + + spin_lock(&qdisc_stab_lock); + + list_for_each_entry(stab, &qdisc_stab_list, list) { + if (memcmp(&stab->szopts, s, sizeof(*s))) + continue; + if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16))) + continue; + stab->refcnt++; + spin_unlock(&qdisc_stab_lock); + return stab; + } + + 
spin_unlock(&qdisc_stab_lock); + + stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL); + if (!stab) + return ERR_PTR(-ENOMEM); + + stab->refcnt = 1; + stab->szopts = *s; + if (tsize > 0) + memcpy(stab->data, tab, tsize * sizeof(u16)); + + spin_lock(&qdisc_stab_lock); + list_add_tail(&stab->list, &qdisc_stab_list); + spin_unlock(&qdisc_stab_lock); + + return stab; +} + +static void stab_kfree_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct qdisc_size_table, rcu)); +} + +void qdisc_put_stab(struct qdisc_size_table *tab) +{ + if (!tab) + return; + + spin_lock(&qdisc_stab_lock); + + if (--tab->refcnt == 0) { + list_del(&tab->list); + call_rcu_bh(&tab->rcu, stab_kfree_rcu); + } + + spin_unlock(&qdisc_stab_lock); +} +EXPORT_SYMBOL(qdisc_put_stab); + +static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_STAB); + if (nest == NULL) + goto nla_put_failure; + if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts)) + goto nla_put_failure; + nla_nest_end(skb, nest); + + return skb->len; + +nla_put_failure: + return -1; +} + +void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab) +{ + int pkt_len, slot; + + pkt_len = skb->len + stab->szopts.overhead; + if (unlikely(!stab->szopts.tsize)) + goto out; + + slot = pkt_len + stab->szopts.cell_align; + if (unlikely(slot < 0)) + slot = 0; + + slot >>= stab->szopts.cell_log; + if (likely(slot < stab->szopts.tsize)) + pkt_len = stab->data[slot]; + else + pkt_len = stab->data[stab->szopts.tsize - 1] * + (slot / stab->szopts.tsize) + + stab->data[slot % stab->szopts.tsize]; + + pkt_len <<= stab->szopts.size_log; +out: + if (unlikely(pkt_len < 1)) + pkt_len = 1; + qdisc_skb_cb(skb)->pkt_len = pkt_len; +} +EXPORT_SYMBOL(__qdisc_calculate_pkt_len); + +void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) +{ + if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { + pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", + txt, qdisc->ops->id, qdisc->handle >> 16); + qdisc->flags |= TCQ_F_WARN_NONWC; + } +} +EXPORT_SYMBOL(qdisc_warn_nonwc); + +static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) +{ + struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, + timer); + + qdisc_unthrottled(wd->qdisc); + __netif_schedule(qdisc_root(wd->qdisc)); + + return HRTIMER_NORESTART; +} + +void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +{ + hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + wd->timer.function = qdisc_watchdog; + wd->qdisc = qdisc; +} +EXPORT_SYMBOL(qdisc_watchdog_init); + +void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) +{ + if (test_bit(__QDISC_STATE_DEACTIVATED, + &qdisc_root_sleeping(wd->qdisc)->state)) + return; + + qdisc_throttled(wd->qdisc); + + hrtimer_start(&wd->timer, + ns_to_ktime(expires), + HRTIMER_MODE_ABS); +} +EXPORT_SYMBOL(qdisc_watchdog_schedule_ns); + +void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) +{ + hrtimer_cancel(&wd->timer); + qdisc_unthrottled(wd->qdisc); +} +EXPORT_SYMBOL(qdisc_watchdog_cancel); + +static struct hlist_head *qdisc_class_hash_alloc(unsigned int n) +{ + unsigned int size = n * sizeof(struct hlist_head), i; + struct hlist_head *h; + + if (size <= PAGE_SIZE) + h = kmalloc(size, GFP_KERNEL); + else + h = (struct hlist_head *) + __get_free_pages(GFP_KERNEL, get_order(size)); + + if (h != NULL) { + for (i = 0; i < n; i++) + INIT_HLIST_HEAD(&h[i]); + } + return h; +} + +static void 
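To show how the qdisc_watchdog helpers above are meant to be used, here is a rough sketch of a rate-limiting qdisc's dequeue path. Everything named example_*, and the next_send_ns bookkeeping, is invented for the illustration; only the watchdog calls and the generic queue helpers are real kernel API, and a real shaper would of course also advance next_send_ns when it releases a packet.

#include <linux/ktime.h>
#include <net/pkt_sched.h>

struct example_sched_data {
	struct qdisc_watchdog	watchdog;
	u64			next_send_ns;	/* invented pacing state */
};

static int example_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct example_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_init(&q->watchdog, sch);
	q->next_send_ns = 0;
	return 0;
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	struct example_sched_data *q = qdisc_priv(sch);
	u64 now = ktime_to_ns(ktime_get());

	if (now < q->next_send_ns) {
		/* Too early: arm the hrtimer.  When it fires, qdisc_watchdog()
		 * clears the throttled flag and __netif_schedule()s the root,
		 * so this dequeue runs again. */
		qdisc_watchdog_schedule_ns(&q->watchdog, q->next_send_ns);
		return NULL;
	}
	return qdisc_dequeue_head(sch);
}

static void example_reset(struct Qdisc *sch)
{
	struct example_sched_data *q = qdisc_priv(sch);

	qdisc_reset_queue(sch);
	qdisc_watchdog_cancel(&q->watchdog);
}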
qdisc_class_hash_free(struct hlist_head *h, unsigned int n) +{ + unsigned int size = n * sizeof(struct hlist_head); + + if (size <= PAGE_SIZE) + kfree(h); + else + free_pages((unsigned long)h, get_order(size)); +} + +void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash) +{ + struct Qdisc_class_common *cl; + struct hlist_node *next; + struct hlist_head *nhash, *ohash; + unsigned int nsize, nmask, osize; + unsigned int i, h; + + /* Rehash when load factor exceeds 0.75 */ + if (clhash->hashelems * 4 <= clhash->hashsize * 3) + return; + nsize = clhash->hashsize * 2; + nmask = nsize - 1; + nhash = qdisc_class_hash_alloc(nsize); + if (nhash == NULL) + return; + + ohash = clhash->hash; + osize = clhash->hashsize; + + sch_tree_lock(sch); + for (i = 0; i < osize; i++) { + hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) { + h = qdisc_class_hash(cl->classid, nmask); + hlist_add_head(&cl->hnode, &nhash[h]); + } + } + clhash->hash = nhash; + clhash->hashsize = nsize; + clhash->hashmask = nmask; + sch_tree_unlock(sch); + + qdisc_class_hash_free(ohash, osize); +} +EXPORT_SYMBOL(qdisc_class_hash_grow); + +int qdisc_class_hash_init(struct Qdisc_class_hash *clhash) +{ + unsigned int size = 4; + + clhash->hash = qdisc_class_hash_alloc(size); + if (clhash->hash == NULL) + return -ENOMEM; + clhash->hashsize = size; + clhash->hashmask = size - 1; + clhash->hashelems = 0; + return 0; +} +EXPORT_SYMBOL(qdisc_class_hash_init); + +void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash) +{ + qdisc_class_hash_free(clhash->hash, clhash->hashsize); +} +EXPORT_SYMBOL(qdisc_class_hash_destroy); + +void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash, + struct Qdisc_class_common *cl) +{ + unsigned int h; + + INIT_HLIST_NODE(&cl->hnode); + h = qdisc_class_hash(cl->classid, clhash->hashmask); + hlist_add_head(&cl->hnode, &clhash->hash[h]); + clhash->hashelems++; +} +EXPORT_SYMBOL(qdisc_class_hash_insert); + +void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash, + struct Qdisc_class_common *cl) +{ + hlist_del(&cl->hnode); + clhash->hashelems--; +} +EXPORT_SYMBOL(qdisc_class_hash_remove); + +/* Allocate an unique handle from space managed by kernel + * Possible range is [8000-FFFF]:0000 (0x8000 values) + */ static u32 qdisc_alloc_handle(struct net_device *dev) { - int i = 0x10000; + int i = 0x8000; static u32 autohandle = TC_H_MAKE(0x80000000U, 0); do { autohandle += TC_H_MAKE(0x10000U, 0); if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) autohandle = TC_H_MAKE(0x80000000U, 0); - } while (qdisc_lookup(dev, autohandle) && --i > 0); + if (!qdisc_lookup(dev, autohandle)) + return autohandle; + cond_resched(); + } while (--i > 0); - return i>0 ? 
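The class-hash helpers above are consumed by classful qdiscs roughly as follows; this is a sketch with invented example_* names, patterned on schedulers that embed a Qdisc_class_common in each class structure.

#include <net/pkt_sched.h>

struct example_class {
	struct Qdisc_class_common common;	/* classid + hash linkage */
	/* per-class scheduling state would follow */
};

struct example_sched {
	struct Qdisc_class_hash clhash;
};

/* ->init: one hash per qdisc, starting with 4 buckets */
static int example_cl_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct example_sched *q = qdisc_priv(sch);

	return qdisc_class_hash_init(&q->clhash);
}

/* classid -> class lookup used by ->get, ->change, ... */
static struct example_class *example_find(struct Qdisc *sch, u32 classid)
{
	struct example_sched *q = qdisc_priv(sch);
	struct Qdisc_class_common *clc;

	clc = qdisc_class_find(&q->clhash, classid);
	return clc ? container_of(clc, struct example_class, common) : NULL;
}

/* Adding a class: insert under the tree lock, then let the table grow;
 * qdisc_class_hash_grow() rehashes once the load factor exceeds 3/4. */
static void example_add_class(struct Qdisc *sch, struct example_class *cl,
			      u32 classid)
{
	struct example_sched *q = qdisc_priv(sch);

	cl->common.classid = classid;

	sch_tree_lock(sch);
	qdisc_class_hash_insert(&q->clhash, &cl->common);
	sch_tree_unlock(sch);

	qdisc_class_hash_grow(sch, &q->clhash);
}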
autohandle : 0; + return 0; } -/* Attach toplevel qdisc to device dev */ - -static struct Qdisc * -dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc) -{ - struct Qdisc *oqdisc; - - if (dev->flags & IFF_UP) - dev_deactivate(dev); - - qdisc_lock_tree(dev); - if (qdisc && qdisc->flags&TCQ_F_INGRESS) { - oqdisc = dev->qdisc_ingress; - /* Prune old scheduler */ - if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) { - /* delete */ - qdisc_reset(oqdisc); - dev->qdisc_ingress = NULL; - } else { /* new */ - dev->qdisc_ingress = qdisc; - } - - } else { - - oqdisc = dev->qdisc_sleeping; +void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) +{ + const struct Qdisc_class_ops *cops; + unsigned long cl; + u32 parentid; + int drops; - /* Prune old scheduler */ - if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) - qdisc_reset(oqdisc); + if (n == 0) + return; + drops = max_t(int, n, 0); + while ((parentid = sch->parent)) { + if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS)) + return; - /* ... and graft new one */ - if (qdisc == NULL) - qdisc = &noop_qdisc; - dev->qdisc_sleeping = qdisc; - dev->qdisc = &noop_qdisc; + sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid)); + if (sch == NULL) { + WARN_ON(parentid != TC_H_ROOT); + return; + } + cops = sch->ops->cl_ops; + if (cops->qlen_notify) { + cl = cops->get(sch, parentid); + cops->qlen_notify(sch, cl); + cops->put(sch, cl); + } + sch->q.qlen -= n; + sch->qstats.drops += drops; } +} +EXPORT_SYMBOL(qdisc_tree_decrease_qlen); - qdisc_unlock_tree(dev); - - if (dev->flags & IFF_UP) - dev_activate(dev); +static void notify_and_destroy(struct net *net, struct sk_buff *skb, + struct nlmsghdr *n, u32 clid, + struct Qdisc *old, struct Qdisc *new) +{ + if (new || old) + qdisc_notify(net, skb, n, clid, old, new); - return oqdisc; + if (old) + qdisc_destroy(old); } - /* Graft qdisc "new" to class "classid" of qdisc "parent" or - to device "dev". - - Old qdisc is not destroyed but returned in *old. + * to device "dev". + * + * When appropriate send a netlink notification using 'skb' + * and "n". + * + * On success, destroy old qdisc. */ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, - u32 classid, - struct Qdisc *new, struct Qdisc **old) + struct sk_buff *skb, struct nlmsghdr *n, u32 classid, + struct Qdisc *new, struct Qdisc *old) { + struct Qdisc *q = old; + struct net *net = dev_net(dev); int err = 0; - struct Qdisc *q = *old; + if (parent == NULL) { + unsigned int i, num_q, ingress; + + ingress = 0; + num_q = dev->num_tx_queues; + if ((q && q->flags & TCQ_F_INGRESS) || + (new && new->flags & TCQ_F_INGRESS)) { + num_q = 1; + ingress = 1; + if (!dev_ingress_queue(dev)) + return -ENOENT; + } + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + if (new && new->ops->attach) { + new->ops->attach(new); + num_q = 0; + } + + for (i = 0; i < num_q; i++) { + struct netdev_queue *dev_queue = dev_ingress_queue(dev); + + if (!ingress) + dev_queue = netdev_get_tx_queue(dev, i); - if (parent == NULL) { - if (q && q->flags&TCQ_F_INGRESS) { - *old = dev_graft_qdisc(dev, q); + old = dev_graft_qdisc(dev_queue, new); + if (new && i > 0) + atomic_inc(&new->refcnt); + + if (!ingress) + qdisc_destroy(old); + } + + if (!ingress) { + notify_and_destroy(net, skb, n, classid, + dev->qdisc, new); + if (new && !new->ops->attach) + atomic_inc(&new->refcnt); + dev->qdisc = new ? 
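qdisc_tree_decrease_qlen() exists for cases where packets leave a qdisc outside the normal dequeue path, for instance when a ->change handler shrinks the limit and has to throw queued packets away. A condensed sketch of that pattern follows; the function name is invented and per-byte backlog accounting is omitted for brevity.

static void example_shrink_queue(struct Qdisc *sch, u32 new_limit)
{
	unsigned int dropped = 0;
	struct sk_buff *skb;

	sch_tree_lock(sch);
	while (sch->q.qlen > new_limit &&
	       (skb = __skb_dequeue(&sch->q)) != NULL) {
		kfree_skb(skb);
		dropped++;
	}
	/* Ancestors still count the dropped packets in their q.qlen;
	 * walk up the tree, fix the counters and fire qlen_notify(). */
	qdisc_tree_decrease_qlen(sch, dropped);
	sch_tree_unlock(sch);
}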
: &noop_qdisc; } else { - *old = dev_graft_qdisc(dev, new); + notify_and_destroy(net, skb, n, classid, old, new); } - } else { - struct Qdisc_class_ops *cops = parent->ops->cl_ops; - err = -EINVAL; + if (dev->flags & IFF_UP) + dev_activate(dev); + } else { + const struct Qdisc_class_ops *cops = parent->ops->cl_ops; - if (cops) { + err = -EOPNOTSUPP; + if (cops && cops->graft) { unsigned long cl = cops->get(parent, classid); if (cl) { - err = cops->graft(parent, cl, new, old); - if (new) - new->parent = classid; + err = cops->graft(parent, cl, new, &old); cops->put(parent, cl); - } + } else + err = -ENOENT; } + if (!err) + notify_and_destroy(net, skb, n, classid, old, new); } return err; } +/* lockdep annotation is needed for ingress; egress gets it only for name */ +static struct lock_class_key qdisc_tx_lock; +static struct lock_class_key qdisc_rx_lock; + /* Allocate and initialize new qdisc. @@ -395,18 +872,21 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, */ static struct Qdisc * -qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) +qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, + struct Qdisc *p, u32 parent, u32 handle, + struct nlattr **tca, int *errp) { int err; - struct rtattr *kind = tca[TCA_KIND-1]; + struct nlattr *kind = tca[TCA_KIND]; struct Qdisc *sch; struct Qdisc_ops *ops; + struct qdisc_size_table *stab; ops = qdisc_lookup_ops(kind); -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES if (ops == NULL && kind != NULL) { char name[IFNAMSIZ]; - if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { + if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { /* We dropped the RTNL semaphore in order to * perform the module load. So, even if we * succeeded in loading the module we have to @@ -431,49 +911,66 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) } #endif - err = -EINVAL; + err = -ENOENT; if (ops == NULL) goto err_out; - sch = qdisc_alloc(dev, ops); + sch = qdisc_alloc(dev_queue, ops); if (IS_ERR(sch)) { err = PTR_ERR(sch); goto err_out2; } + sch->parent = parent; + if (handle == TC_H_INGRESS) { sch->flags |= TCQ_F_INGRESS; handle = TC_H_MAKE(TC_H_INGRESS, 0); - } else if (handle == 0) { - handle = qdisc_alloc_handle(dev); - err = -ENOMEM; - if (handle == 0) - goto err_out3; + lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock); + } else { + if (handle == 0) { + handle = qdisc_alloc_handle(dev); + err = -ENOMEM; + if (handle == 0) + goto err_out3; + } + lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock); + if (!netif_is_multiqueue(dev)) + sch->flags |= TCQ_F_ONETXQUEUE; } sch->handle = handle; - if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { -#ifdef CONFIG_NET_ESTIMATOR - if (tca[TCA_RATE-1]) { - err = gen_new_estimator(&sch->bstats, &sch->rate_est, - sch->stats_lock, - tca[TCA_RATE-1]); - if (err) { - /* - * Any broken qdiscs that would require - * a ops->reset() here? The qdisc was never - * in action so it shouldn't be necessary. 
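The rtattr-to-nlattr conversion visible throughout this file also changes what individual qdiscs do in their ->init/->change handlers: TCA_OPTIONS is now parsed with nla_parse_nested() against a policy. A hypothetical example follows; the attribute names and the private limit field are made up.

enum {
	TCA_EXAMPLE_UNSPEC,
	TCA_EXAMPLE_LIMIT,
	__TCA_EXAMPLE_MAX
};
#define TCA_EXAMPLE_MAX (__TCA_EXAMPLE_MAX - 1)

static const struct nla_policy example_policy[TCA_EXAMPLE_MAX + 1] = {
	[TCA_EXAMPLE_LIMIT] = { .type = NLA_U32 },
};

struct example_priv {
	u32 limit;
};

static int example_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct example_priv *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_EXAMPLE_MAX + 1];
	int err;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_EXAMPLE_MAX, opt, example_policy);
	if (err < 0)
		return err;

	if (tb[TCA_EXAMPLE_LIMIT])
		q->limit = nla_get_u32(tb[TCA_EXAMPLE_LIMIT]);
	return 0;
}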
- */ - if (ops->destroy) - ops->destroy(sch); - goto err_out3; + if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) { + if (tca[TCA_STAB]) { + stab = qdisc_get_stab(tca[TCA_STAB]); + if (IS_ERR(stab)) { + err = PTR_ERR(stab); + goto err_out4; } + rcu_assign_pointer(sch->stab, stab); } -#endif - qdisc_lock_tree(dev); - list_add_tail(&sch->list, &dev->qdisc_list); - qdisc_unlock_tree(dev); + if (tca[TCA_RATE]) { + spinlock_t *root_lock; + + err = -EOPNOTSUPP; + if (sch->flags & TCQ_F_MQROOT) + goto err_out4; + + if ((sch->parent != TC_H_ROOT) && + !(sch->flags & TCQ_F_INGRESS) && + (!p || !(p->flags & TCQ_F_MQROOT))) + root_lock = qdisc_root_sleeping_lock(sch); + else + root_lock = qdisc_lock(sch); + + err = gen_new_estimator(&sch->bstats, &sch->rate_est, + root_lock, tca[TCA_RATE]); + if (err) + goto err_out4; + } + + qdisc_list_add(sch); return sch; } @@ -485,30 +982,56 @@ err_out2: err_out: *errp = err; return NULL; + +err_out4: + /* + * Any broken qdiscs that would require a ops->reset() here? + * The qdisc was never in action so it shouldn't be necessary. + */ + qdisc_put_stab(rtnl_dereference(sch->stab)); + if (ops->destroy) + ops->destroy(sch); + goto err_out3; } -static int qdisc_change(struct Qdisc *sch, struct rtattr **tca) +static int qdisc_change(struct Qdisc *sch, struct nlattr **tca) { - if (tca[TCA_OPTIONS-1]) { - int err; + struct qdisc_size_table *ostab, *stab = NULL; + int err = 0; + if (tca[TCA_OPTIONS]) { if (sch->ops->change == NULL) return -EINVAL; - err = sch->ops->change(sch, tca[TCA_OPTIONS-1]); + err = sch->ops->change(sch, tca[TCA_OPTIONS]); if (err) return err; } -#ifdef CONFIG_NET_ESTIMATOR - if (tca[TCA_RATE-1]) + + if (tca[TCA_STAB]) { + stab = qdisc_get_stab(tca[TCA_STAB]); + if (IS_ERR(stab)) + return PTR_ERR(stab); + } + + ostab = rtnl_dereference(sch->stab); + rcu_assign_pointer(sch->stab, stab); + qdisc_put_stab(ostab); + + if (tca[TCA_RATE]) { + /* NB: ignores errors from replace_estimator + because change can't be undone. */ + if (sch->flags & TCQ_F_MQROOT) + goto out; gen_replace_estimator(&sch->bstats, &sch->rate_est, - sch->stats_lock, tca[TCA_RATE-1]); -#endif + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + } +out: return 0; } -struct check_loop_arg -{ - struct qdisc_walker w; +struct check_loop_arg { + struct qdisc_walker w; struct Qdisc *p; int depth; }; @@ -534,7 +1057,7 @@ static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) { struct Qdisc *leaf; - struct Qdisc_class_ops *cops = q->ops->cl_ops; + const struct Qdisc_class_ops *cops = q->ops->cl_ops; struct check_loop_arg *arg = (struct check_loop_arg *)w; leaf = cops->leaf(q, cl); @@ -550,30 +1073,42 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) * Delete/get qdisc. 
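For reference, the TCA_RATE handling above condenses to the following pattern, which classful qdiscs repeat per class for their own bstats/rate_est pairs. The function names are invented; the gen_*_estimator() calls mirror the ones made by qdisc_create() and qdisc_change().

static int example_attach_estimator(struct Qdisc *sch, struct nlattr **tca)
{
	if (!tca[TCA_RATE])
		return 0;
	return gen_new_estimator(&sch->bstats, &sch->rate_est,
				 qdisc_root_sleeping_lock(sch),
				 tca[TCA_RATE]);
}

static void example_replace_estimator(struct Qdisc *sch, struct nlattr **tca)
{
	if (tca[TCA_RATE])
		/* errors deliberately ignored: the change cannot be undone */
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
				      tca[TCA_RATE]);
}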
*/ -static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n) { - struct tcmsg *tcm = NLMSG_DATA(n); - struct rtattr **tca = arg; + struct net *net = sock_net(skb->sk); + struct tcmsg *tcm = nlmsg_data(n); + struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; - u32 clid = tcm->tcm_parent; + u32 clid; struct Qdisc *q = NULL; struct Qdisc *p = NULL; int err; - if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) + if ((n->nlmsg_type != RTM_GETQDISC) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); + if (err < 0) + return err; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) return -ENODEV; + clid = tcm->tcm_parent; if (clid) { if (clid != TC_H_ROOT) { if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { - if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) + p = qdisc_lookup(dev, TC_H_MAJ(clid)); + if (!p) return -ENOENT; q = qdisc_leaf(p, clid); - } else { /* ingress */ - q = dev->qdisc_ingress; - } + } else if (dev_ingress_queue(dev)) { + q = dev_ingress_queue(dev)->qdisc_sleeping; + } } else { - q = dev->qdisc_sleeping; + q = dev->qdisc; } if (!q) return -ENOENT; @@ -581,11 +1116,12 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) if (tcm->tcm_handle && q->handle != tcm->tcm_handle) return -EINVAL; } else { - if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) + q = qdisc_lookup(dev, tcm->tcm_handle); + if (!q) return -ENOENT; } - if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) return -EINVAL; if (n->nlmsg_type == RTM_DELQDISC) { @@ -593,54 +1129,59 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) return -EINVAL; if (q->handle == 0) return -ENOENT; - if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0) + err = qdisc_graft(dev, p, skb, n, clid, NULL, q); + if (err != 0) return err; - if (q) { - qdisc_notify(skb, n, clid, q, NULL); - spin_lock_bh(&dev->queue_lock); - qdisc_destroy(q); - spin_unlock_bh(&dev->queue_lock); - } } else { - qdisc_notify(skb, n, clid, NULL, q); + qdisc_notify(net, skb, n, clid, NULL, q); } return 0; } /* - Create/change qdisc. + * Create/change qdisc. */ -static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n) { + struct net *net = sock_net(skb->sk); struct tcmsg *tcm; - struct rtattr **tca; + struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; u32 clid; struct Qdisc *q, *p; int err; + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + replay: /* Reinit, just in case something touches this. 
*/ - tcm = NLMSG_DATA(n); - tca = arg; + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); + if (err < 0) + return err; + + tcm = nlmsg_data(n); clid = tcm->tcm_parent; q = p = NULL; - if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) return -ENODEV; + if (clid) { if (clid != TC_H_ROOT) { if (clid != TC_H_INGRESS) { - if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) + p = qdisc_lookup(dev, TC_H_MAJ(clid)); + if (!p) return -ENOENT; q = qdisc_leaf(p, clid); - } else { /*ingress */ - q = dev->qdisc_ingress; + } else if (dev_ingress_queue_create(dev)) { + q = dev_ingress_queue(dev)->qdisc_sleeping; } } else { - q = dev->qdisc_sleeping; + q = dev->qdisc; } /* It may be default qdisc, ignore it */ @@ -649,15 +1190,16 @@ replay: if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) { if (tcm->tcm_handle) { - if (q && !(n->nlmsg_flags&NLM_F_REPLACE)) + if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) return -EEXIST; if (TC_H_MIN(tcm->tcm_handle)) return -EINVAL; - if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) + q = qdisc_lookup(dev, tcm->tcm_handle); + if (!q) goto create_n_graft; - if (n->nlmsg_flags&NLM_F_EXCL) + if (n->nlmsg_flags & NLM_F_EXCL) return -EEXIST; - if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) return -EINVAL; if (q == p || (p && check_loop(q, p, 0))) @@ -665,7 +1207,7 @@ replay: atomic_inc(&q->refcnt); goto graft; } else { - if (q == NULL) + if (!q) goto create_n_graft; /* This magic test requires explanation. @@ -687,11 +1229,11 @@ replay: * For now we select create/graft, if * user gave KIND, which does not match existing. */ - if ((n->nlmsg_flags&NLM_F_CREATE) && - (n->nlmsg_flags&NLM_F_REPLACE) && - ((n->nlmsg_flags&NLM_F_EXCL) || - (tca[TCA_KIND-1] && - rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)))) + if ((n->nlmsg_flags & NLM_F_CREATE) && + (n->nlmsg_flags & NLM_F_REPLACE) && + ((n->nlmsg_flags & NLM_F_EXCL) || + (tca[TCA_KIND] && + nla_strcmp(tca[TCA_KIND], q->ops->id)))) goto create_n_graft; } } @@ -704,22 +1246,39 @@ replay: /* Change qdisc parameters */ if (q == NULL) return -ENOENT; - if (n->nlmsg_flags&NLM_F_EXCL) + if (n->nlmsg_flags & NLM_F_EXCL) return -EEXIST; - if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) return -EINVAL; err = qdisc_change(q, tca); if (err == 0) - qdisc_notify(skb, n, clid, NULL, q); + qdisc_notify(net, skb, n, clid, NULL, q); return err; create_n_graft: - if (!(n->nlmsg_flags&NLM_F_CREATE)) + if (!(n->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; - if (clid == TC_H_INGRESS) - q = qdisc_create(dev, tcm->tcm_parent, tca, &err); - else - q = qdisc_create(dev, tcm->tcm_handle, tca, &err); + if (clid == TC_H_INGRESS) { + if (dev_ingress_queue(dev)) + q = qdisc_create(dev, dev_ingress_queue(dev), p, + tcm->tcm_parent, tcm->tcm_parent, + tca, &err); + else + err = -ENOENT; + } else { + struct netdev_queue *dev_queue; + + if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue) + dev_queue = p->ops->cl_ops->select_queue(p, tcm); + else if (p) + dev_queue = p->dev_queue; + else + dev_queue = netdev_get_tx_queue(dev, 0); + + q = qdisc_create(dev, dev_queue, p, + tcm->tcm_parent, tcm->tcm_handle, + tca, &err); + } if (q == NULL) { if (err == -EAGAIN) goto replay; @@ -727,137 +1286,182 @@ create_n_graft: } graft: - if (1) { - struct Qdisc *old_q = NULL; - err = qdisc_graft(dev, p, clid, q, &old_q); - if 
(err) { - if (q) { - spin_lock_bh(&dev->queue_lock); - qdisc_destroy(q); - spin_unlock_bh(&dev->queue_lock); - } - return err; - } - qdisc_notify(skb, n, clid, old_q, q); - if (old_q) { - spin_lock_bh(&dev->queue_lock); - qdisc_destroy(old_q); - spin_unlock_bh(&dev->queue_lock); - } + err = qdisc_graft(dev, p, skb, n, clid, q, NULL); + if (err) { + if (q) + qdisc_destroy(q); + return err; } + return 0; } static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, - u32 pid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; + struct qdisc_size_table *stab; - nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); - tcm = NLMSG_DATA(nlh); + cond_resched(); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); + if (!nlh) + goto out_nlmsg_trim; + tcm = nlmsg_data(nlh); tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; - tcm->tcm_ifindex = q->dev->ifindex; + tcm->tcm_ifindex = qdisc_dev(q)->ifindex; tcm->tcm_parent = clid; tcm->tcm_handle = q->handle; tcm->tcm_info = atomic_read(&q->refcnt); - RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); + if (nla_put_string(skb, TCA_KIND, q->ops->id)) + goto nla_put_failure; if (q->ops->dump && q->ops->dump(q, skb) < 0) - goto rtattr_failure; + goto nla_put_failure; q->qstats.qlen = q->q.qlen; - if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, - TCA_XSTATS, q->stats_lock, &d) < 0) - goto rtattr_failure; + stab = rtnl_dereference(q->stab); + if (stab && qdisc_dump_stab(skb, stab) < 0) + goto nla_put_failure; + + if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, + qdisc_root_sleeping_lock(q), &d) < 0) + goto nla_put_failure; if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) - goto rtattr_failure; + goto nla_put_failure; if (gnet_stats_copy_basic(&d, &q->bstats) < 0 || -#ifdef CONFIG_NET_ESTIMATOR - gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || -#endif + gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 || gnet_stats_copy_queue(&d, &q->qstats) < 0) - goto rtattr_failure; - + goto nla_put_failure; + if (gnet_stats_finish_copy(&d) < 0) - goto rtattr_failure; - - nlh->nlmsg_len = skb->tail - b; + goto nla_put_failure; + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; -nlmsg_failure: -rtattr_failure: - skb_trim(skb, b - skb->data); +out_nlmsg_trim: +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, - u32 clid, struct Qdisc *old, struct Qdisc *new) +static bool tc_qdisc_dump_ignore(struct Qdisc *q) +{ + return (q->flags & TCQ_F_BUILTIN) ? true : false; +} + +static int qdisc_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, u32 clid, + struct Qdisc *old, struct Qdisc *new) { struct sk_buff *skb; - u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (old && old->handle) { - if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) + if (old && !tc_qdisc_dump_ignore(old)) { + if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, + 0, RTM_DELQDISC) < 0) goto err_out; } - if (new) { - if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? 
NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) + if (new && !tc_qdisc_dump_ignore(new)) { + if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, + old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) goto err_out; } if (skb->len) - return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); err_out: kfree_skb(skb); return -EINVAL; } +static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, + struct netlink_callback *cb, + int *q_idx_p, int s_q_idx) +{ + int ret = 0, q_idx = *q_idx_p; + struct Qdisc *q; + + if (!root) + return 0; + + q = root; + if (q_idx < s_q_idx) { + q_idx++; + } else { + if (!tc_qdisc_dump_ignore(q) && + tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) + goto done; + q_idx++; + } + list_for_each_entry(q, &root->list, list) { + if (q_idx < s_q_idx) { + q_idx++; + continue; + } + if (!tc_qdisc_dump_ignore(q) && + tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) + goto done; + q_idx++; + } + +out: + *q_idx_p = q_idx; + return ret; +done: + ret = -1; + goto out; +} + static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); int idx, q_idx; int s_idx, s_q_idx; struct net_device *dev; - struct Qdisc *q; s_idx = cb->args[0]; s_q_idx = q_idx = cb->args[1]; - read_lock(&dev_base_lock); - for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + + idx = 0; + ASSERT_RTNL(); + for_each_netdev(net, dev) { + struct netdev_queue *dev_queue; + if (idx < s_idx) - continue; + goto cont; if (idx > s_idx) s_q_idx = 0; - read_lock_bh(&qdisc_tree_lock); q_idx = 0; - list_for_each_entry(q, &dev->qdisc_list, list) { - if (q_idx < s_q_idx) { - q_idx++; - continue; - } - if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) { - read_unlock_bh(&qdisc_tree_lock); - goto done; - } - q_idx++; - } - read_unlock_bh(&qdisc_tree_lock); + + if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0) + goto done; + + dev_queue = dev_ingress_queue(dev); + if (dev_queue && + tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, + &q_idx, s_q_idx) < 0) + goto done; + +cont: + idx++; } done: - read_unlock(&dev_base_lock); - cb->args[0] = idx; cb->args[1] = q_idx; @@ -872,21 +1476,31 @@ done: -static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n) { - struct tcmsg *tcm = NLMSG_DATA(n); - struct rtattr **tca = arg; + struct net *net = sock_net(skb->sk); + struct tcmsg *tcm = nlmsg_data(n); + struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; struct Qdisc *q = NULL; - struct Qdisc_class_ops *cops; + const struct Qdisc_class_ops *cops; unsigned long cl = 0; unsigned long new_cl; - u32 pid = tcm->tcm_parent; - u32 clid = tcm->tcm_handle; - u32 qid = TC_H_MAJ(clid); + u32 portid; + u32 clid; + u32 qid; int err; - if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) + if ((n->nlmsg_type != RTM_GETTCLASS) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); + if (err < 0) + return err; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) return -ENODEV; /* @@ -904,8 +1518,12 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) /* Step 1. 
Determine qdisc handle X:0 */ - if (pid != TC_H_ROOT) { - u32 qid1 = TC_H_MAJ(pid); + portid = tcm->tcm_parent; + clid = tcm->tcm_handle; + qid = TC_H_MAJ(clid); + + if (portid != TC_H_ROOT) { + u32 qid1 = TC_H_MAJ(portid); if (qid && qid1) { /* If both majors are known, they must be identical. */ @@ -914,22 +1532,23 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) } else if (qid1) { qid = qid1; } else if (qid == 0) - qid = dev->qdisc_sleeping->handle; + qid = dev->qdisc->handle; /* Now qid is genuine qdisc handle consistent - both with parent and child. - - TC_H_MAJ(pid) still may be unspecified, complete it now. + * both with parent and child. + * + * TC_H_MAJ(portid) still may be unspecified, complete it now. */ - if (pid) - pid = TC_H_MAKE(qid, pid); + if (portid) + portid = TC_H_MAKE(qid, portid); } else { if (qid == 0) - qid = dev->qdisc_sleeping->handle; + qid = dev->qdisc->handle; } /* OK. Locate qdisc */ - if ((q = qdisc_lookup(dev, qid)) == NULL) + q = qdisc_lookup(dev, qid); + if (!q) return -ENOENT; /* An check that it supports classes */ @@ -939,7 +1558,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) /* Now try to get class */ if (clid == 0) { - if (pid == TC_H_ROOT) + if (portid == TC_H_ROOT) clid = qid; } else clid = TC_H_MAKE(qid, clid); @@ -949,22 +1568,25 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) if (cl == 0) { err = -ENOENT; - if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE)) + if (n->nlmsg_type != RTM_NEWTCLASS || + !(n->nlmsg_flags & NLM_F_CREATE)) goto out; } else { switch (n->nlmsg_type) { - case RTM_NEWTCLASS: + case RTM_NEWTCLASS: err = -EEXIST; - if (n->nlmsg_flags&NLM_F_EXCL) + if (n->nlmsg_flags & NLM_F_EXCL) goto out; break; case RTM_DELTCLASS: - err = cops->delete(q, cl); + err = -EOPNOTSUPP; + if (cops->delete) + err = cops->delete(q, cl); if (err == 0) - tclass_notify(skb, n, q, cl, RTM_DELTCLASS); + tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS); goto out; case RTM_GETTCLASS: - err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS); + err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS); goto out; default: err = -EINVAL; @@ -973,9 +1595,11 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) } new_cl = cl; - err = cops->change(q, clid, pid, tca, &new_cl); + err = -EOPNOTSUPP; + if (cops->change) + err = cops->change(q, clid, portid, tca, &new_cl); if (err == 0) - tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS); + tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS); out: if (cl) @@ -987,118 +1611,160 @@ out: static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, unsigned long cl, - u32 pid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; - struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; + const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; - nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); - tcm = NLMSG_DATA(nlh); + cond_resched(); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); + if (!nlh) + goto out_nlmsg_trim; + tcm = nlmsg_data(nlh); tcm->tcm_family = AF_UNSPEC; - tcm->tcm_ifindex = q->dev->ifindex; + tcm->tcm__pad1 = 0; + tcm->tcm__pad2 = 0; + tcm->tcm_ifindex = qdisc_dev(q)->ifindex; tcm->tcm_parent = q->handle; tcm->tcm_handle = q->handle; tcm->tcm_info = 0; - RTA_PUT(skb, TCA_KIND, IFNAMSIZ, 
q->ops->id); + if (nla_put_string(skb, TCA_KIND, q->ops->id)) + goto nla_put_failure; if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0) - goto rtattr_failure; + goto nla_put_failure; - if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, - TCA_XSTATS, q->stats_lock, &d) < 0) - goto rtattr_failure; + if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, + qdisc_root_sleeping_lock(q), &d) < 0) + goto nla_put_failure; if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) - goto rtattr_failure; + goto nla_put_failure; if (gnet_stats_finish_copy(&d) < 0) - goto rtattr_failure; + goto nla_put_failure; - nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; -nlmsg_failure: -rtattr_failure: - skb_trim(skb, b - skb->data); +out_nlmsg_trim: +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, - struct Qdisc *q, unsigned long cl, int event) +static int tclass_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct Qdisc *q, + unsigned long cl, int event) { struct sk_buff *skb; - u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) { + if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) { kfree_skb(skb); return -EINVAL; } - return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); } -struct qdisc_dump_args -{ - struct qdisc_walker w; - struct sk_buff *skb; - struct netlink_callback *cb; +struct qdisc_dump_args { + struct qdisc_walker w; + struct sk_buff *skb; + struct netlink_callback *cb; }; static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) { struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; - return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid, + return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); } +static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, + struct tcmsg *tcm, struct netlink_callback *cb, + int *t_p, int s_t) +{ + struct qdisc_dump_args arg; + + if (tc_qdisc_dump_ignore(q) || + *t_p < s_t || !q->ops->cl_ops || + (tcm->tcm_parent && + TC_H_MAJ(tcm->tcm_parent) != q->handle)) { + (*t_p)++; + return 0; + } + if (*t_p > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + arg.w.fn = qdisc_class_dump; + arg.skb = skb; + arg.cb = cb; + arg.w.stop = 0; + arg.w.skip = cb->args[1]; + arg.w.count = 0; + q->ops->cl_ops->walk(q, &arg.w); + cb->args[1] = arg.w.count; + if (arg.w.stop) + return -1; + (*t_p)++; + return 0; +} + +static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, + struct tcmsg *tcm, struct netlink_callback *cb, + int *t_p, int s_t) +{ + struct Qdisc *q; + + if (!root) + return 0; + + if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0) + return -1; + + list_for_each_entry(q, &root->list, list) { + if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) + return -1; + } + + return 0; +} + static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) { - int t; - int s_t; + struct tcmsg *tcm = nlmsg_data(cb->nlh); + struct net *net = sock_net(skb->sk); + struct netdev_queue *dev_queue; struct net_device *dev; - struct Qdisc *q; - 
struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); - struct qdisc_dump_args arg; + int t, s_t; - if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return 0; - if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + dev = dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) return 0; s_t = cb->args[0]; t = 0; - read_lock_bh(&qdisc_tree_lock); - list_for_each_entry(q, &dev->qdisc_list, list) { - if (t < s_t || !q->ops->cl_ops || - (tcm->tcm_parent && - TC_H_MAJ(tcm->tcm_parent) != q->handle)) { - t++; - continue; - } - if (t > s_t) - memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); - arg.w.fn = qdisc_class_dump; - arg.skb = skb; - arg.cb = cb; - arg.w.stop = 0; - arg.w.skip = cb->args[1]; - arg.w.count = 0; - q->ops->cl_ops->walk(q, &arg.w); - cb->args[1] = arg.w.count; - if (arg.w.stop) - break; - t++; - } - read_unlock_bh(&qdisc_tree_lock); + if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0) + goto done; + + dev_queue = dev_ingress_queue(dev); + if (dev_queue && + tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, + &t, s_t) < 0) + goto done; +done: cb->args[0] = t; dev_put(dev); @@ -1106,181 +1772,164 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) } /* Main classifier routine: scans classifier chain attached - to this qdisc, (optionally) tests for protocol and asks - specific classifiers. + * to this qdisc, (optionally) tests for protocol and asks + * specific classifiers. */ -int tc_classify(struct sk_buff *skb, struct tcf_proto *tp, - struct tcf_result *res) +int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + __be16 protocol = skb->protocol; + int err; + + for (; tp; tp = tp->next) { + if (tp->protocol != protocol && + tp->protocol != htons(ETH_P_ALL)) + continue; + err = tp->classify(skb, tp, res); + + if (err >= 0) { +#ifdef CONFIG_NET_CLS_ACT + if (err != TC_ACT_RECLASSIFY && skb->tc_verd) + skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0); +#endif + return err; + } + } + return -1; +} +EXPORT_SYMBOL(tc_classify_compat); + +int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) { int err = 0; - u32 protocol = skb->protocol; #ifdef CONFIG_NET_CLS_ACT - struct tcf_proto *otp = tp; + const struct tcf_proto *otp = tp; reclassify: #endif - protocol = skb->protocol; - for ( ; tp; tp = tp->next) { - if ((tp->protocol == protocol || - tp->protocol == __constant_htons(ETH_P_ALL)) && - (err = tp->classify(skb, tp, res)) >= 0) { + err = tc_classify_compat(skb, tp, res); #ifdef CONFIG_NET_CLS_ACT - if ( TC_ACT_RECLASSIFY == err) { - __u32 verd = (__u32) G_TC_VERD(skb->tc_verd); - tp = otp; - - if (MAX_REC_LOOP < verd++) { - printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n", - tp->prio&0xffff, ntohs(tp->protocol)); - return TC_ACT_SHOT; - } - skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd); - goto reclassify; - } else { - if (skb->tc_verd) - skb->tc_verd = SET_TC_VERD(skb->tc_verd,0); - return err; - } -#else - - return err; -#endif + if (err == TC_ACT_RECLASSIFY) { + u32 verd = G_TC_VERD(skb->tc_verd); + tp = otp; + + if (verd++ >= MAX_REC_LOOP) { + net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n", + tp->q->ops->id, + tp->prio & 0xffff, + ntohs(tp->protocol)); + return TC_ACT_SHOT; } - + skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd); + goto reclassify; } - return -1; +#endif + return err; +} +EXPORT_SYMBOL(tc_classify); + +void 
tcf_destroy(struct tcf_proto *tp) +{ + tp->ops->destroy(tp); + module_put(tp->ops->owner); + kfree(tp); } -static int psched_us_per_tick = 1; -static int psched_tick_per_us = 1; +void tcf_destroy_chain(struct tcf_proto **fl) +{ + struct tcf_proto *tp; + + while ((tp = *fl) != NULL) { + *fl = tp->next; + tcf_destroy(tp); + } +} +EXPORT_SYMBOL(tcf_destroy_chain); #ifdef CONFIG_PROC_FS static int psched_show(struct seq_file *seq, void *v) { + struct timespec ts; + + hrtimer_get_res(CLOCK_MONOTONIC, &ts); seq_printf(seq, "%08x %08x %08x %08x\n", - psched_tick_per_us, psched_us_per_tick, - 1000000, HZ); + (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1), + 1000000, + (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts))); return 0; } static int psched_open(struct inode *inode, struct file *file) { - return single_open(file, psched_show, PDE(inode)->data); + return single_open(file, psched_show, NULL); } -static struct file_operations psched_fops = { +static const struct file_operations psched_fops = { .owner = THIS_MODULE, .open = psched_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, -}; -#endif +}; -#ifdef CONFIG_NET_SCH_CLK_CPU -psched_tdiff_t psched_clock_per_hz; -int psched_clock_scale; -EXPORT_SYMBOL(psched_clock_per_hz); -EXPORT_SYMBOL(psched_clock_scale); +static int __net_init psched_net_init(struct net *net) +{ + struct proc_dir_entry *e; -psched_time_t psched_time_base; -cycles_t psched_time_mark; -EXPORT_SYMBOL(psched_time_mark); -EXPORT_SYMBOL(psched_time_base); + e = proc_create("psched", 0, net->proc_net, &psched_fops); + if (e == NULL) + return -ENOMEM; -/* - * Periodically adjust psched_time_base to avoid overflow - * with 32-bit get_cycles(). Safe up to 4GHz CPU. - */ -static void psched_tick(unsigned long); -static DEFINE_TIMER(psched_timer, psched_tick, 0, 0); + return 0; +} -static void psched_tick(unsigned long dummy) +static void __net_exit psched_net_exit(struct net *net) { - if (sizeof(cycles_t) == sizeof(u32)) { - psched_time_t dummy_stamp; - PSCHED_GET_TIME(dummy_stamp); - psched_timer.expires = jiffies + 1*HZ; - add_timer(&psched_timer); - } + remove_proc_entry("psched", net->proc_net); } - -int __init psched_calibrate_clock(void) +#else +static int __net_init psched_net_init(struct net *net) { - psched_time_t stamp, stamp1; - struct timeval tv, tv1; - psched_tdiff_t delay; - long rdelay; - unsigned long stop; - - psched_tick(0); - stop = jiffies + HZ/10; - PSCHED_GET_TIME(stamp); - do_gettimeofday(&tv); - while (time_before(jiffies, stop)) { - barrier(); - cpu_relax(); - } - PSCHED_GET_TIME(stamp1); - do_gettimeofday(&tv1); - - delay = PSCHED_TDIFF(stamp1, stamp); - rdelay = tv1.tv_usec - tv.tv_usec; - rdelay += (tv1.tv_sec - tv.tv_sec)*1000000; - if (rdelay > delay) - return -1; - delay /= rdelay; - psched_tick_per_us = delay; - while ((delay>>=1) != 0) - psched_clock_scale++; - psched_us_per_tick = 1<<psched_clock_scale; - psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale; return 0; } -#endif -static int __init pktsched_init(void) +static void __net_exit psched_net_exit(struct net *net) { - struct rtnetlink_link *link_p; - -#ifdef CONFIG_NET_SCH_CLK_CPU - if (psched_calibrate_clock() < 0) - return -1; -#elif defined(CONFIG_NET_SCH_CLK_JIFFIES) - psched_tick_per_us = HZ<<PSCHED_JSCALE; - psched_us_per_tick = 1000000; +} #endif - link_p = rtnetlink_links[PF_UNSPEC]; +static struct pernet_operations psched_net_ops = { + .init = psched_net_init, + .exit = psched_net_exit, +}; - /* Setup rtnetlink links. 
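A sketch of how a classful qdisc's enqueue path would use tc_classify() shown above (which wraps tc_classify_compat() and bounds TC_ACT_RECLASSIFY loops at MAX_REC_LOOP). The example_* names and the example_find() lookup are assumptions; the result handling mirrors what existing classful schedulers do.

struct example_sched {
	struct tcf_proto *filter_list;	/* head of the attached classifiers */
	/* other scheduler state */
};

static struct example_class *example_classify(struct sk_buff *skb,
					      struct Qdisc *sch, int *qerr)
{
	struct example_sched *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result < 0)
		return NULL;		/* no filter matched */
#ifdef CONFIG_NET_CLS_ACT
	switch (result) {
	case TC_ACT_QUEUED:
	case TC_ACT_STOLEN:
		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		/* fall through */
	case TC_ACT_SHOT:
		return NULL;		/* packet consumed or dropped */
	}
#endif
	return example_find(sch, res.classid);	/* hypothetical classid lookup */
}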
It is made here to avoid - exporting large number of public symbols. - */ +static int __init pktsched_init(void) +{ + int err; - if (link_p) { - link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc; - link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc; - link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc; - link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc; - link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass; - link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass; - link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass; - link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass; + err = register_pernet_subsys(&psched_net_ops); + if (err) { + pr_err("pktsched_init: " + "cannot initialize per netns operations\n"); + return err; } + register_qdisc(&pfifo_fast_ops); register_qdisc(&pfifo_qdisc_ops); register_qdisc(&bfifo_qdisc_ops); - proc_net_fops_create("psched", 0, &psched_fops); + register_qdisc(&pfifo_head_drop_qdisc_ops); + register_qdisc(&mq_qdisc_ops); + + rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL); + rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL); return 0; } subsys_initcall(pktsched_init); - -EXPORT_SYMBOL(qdisc_lookup); -EXPORT_SYMBOL(qdisc_get_rtab); -EXPORT_SYMBOL(qdisc_put_rtab); -EXPORT_SYMBOL(register_qdisc); -EXPORT_SYMBOL(unregister_qdisc); -EXPORT_SYMBOL(tc_classify); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 93ebce40aca..8449b337f9e 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -2,37 +2,19 @@ /* Written 1998-2000 by Werner Almesberger, EPFL ICA */ - -#include <linux/config.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/init.h> +#include <linux/interrupt.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> -#include <linux/interrupt.h> #include <linux/atmdev.h> #include <linux/atmclip.h> -#include <linux/netdevice.h> #include <linux/rtnetlink.h> -#include <linux/file.h> /* for fput */ +#include <linux/file.h> /* for fput */ +#include <net/netlink.h> #include <net/pkt_sched.h> -#include <net/sock.h> - - -extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */ - -#if 0 /* control */ -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif - -#if 0 /* data */ -#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define D2PRINTK(format,args...) -#endif - /* * The ATM queuing discipline provides a framework for invoking classifiers @@ -55,24 +37,21 @@ extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */ * - should lock the flow while there is data in the queue (?) */ - -#define PRIV(sch) qdisc_priv(sch) #define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back)) - struct atm_flow_data { - struct Qdisc *q; /* FIFO, TBF, etc. */ + struct Qdisc *q; /* FIFO, TBF, etc. 
*/ struct tcf_proto *filter_list; - struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ - void (*old_pop)(struct atm_vcc *vcc,struct sk_buff *skb); /* chaining */ + struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ + void (*old_pop)(struct atm_vcc *vcc, + struct sk_buff *skb); /* chaining */ struct atm_qdisc_data *parent; /* parent qdisc */ struct socket *sock; /* for closing */ u32 classid; /* x:y type ID */ int ref; /* reference count */ - struct gnet_stats_basic bstats; + struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - spinlock_t *stats_lock; - struct atm_flow_data *next; + struct list_head list; struct atm_flow_data *excess; /* flow for excess traffic; NULL to set CLP instead */ int hdr_len; @@ -81,167 +60,142 @@ struct atm_flow_data { struct atm_qdisc_data { struct atm_flow_data link; /* unclassified skbs go here */ - struct atm_flow_data *flows; /* NB: "link" is also on this + struct list_head flows; /* NB: "link" is also on this list */ - struct tasklet_struct task; /* requeue tasklet */ + struct tasklet_struct task; /* dequeue tasklet */ }; - /* ------------------------- Class/flow operations ------------------------- */ - -static int find_flow(struct atm_qdisc_data *qdisc,struct atm_flow_data *flow) +static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid) { - struct atm_flow_data *walk; - - DPRINTK("find_flow(qdisc %p,flow %p)\n",qdisc,flow); - for (walk = qdisc->flows; walk; walk = walk->next) - if (walk == flow) return 1; - DPRINTK("find_flow: not found\n"); - return 0; -} - - -static __inline__ struct atm_flow_data *lookup_flow(struct Qdisc *sch, - u32 classid) -{ - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; - for (flow = p->flows; flow; flow = flow->next) - if (flow->classid == classid) break; - return flow; + list_for_each_entry(flow, &p->flows, list) { + if (flow->classid == classid) + return flow; + } + return NULL; } - -static int atm_tc_graft(struct Qdisc *sch,unsigned long arg, - struct Qdisc *new,struct Qdisc **old) +static int atm_tc_graft(struct Qdisc *sch, unsigned long arg, + struct Qdisc *new, struct Qdisc **old) { - struct atm_qdisc_data *p = PRIV(sch); - struct atm_flow_data *flow = (struct atm_flow_data *) arg; - - DPRINTK("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",sch, - p,flow,new,old); - if (!find_flow(p,flow)) return -EINVAL; - if (!new) new = &noop_qdisc; - *old = xchg(&flow->q,new); - if (*old) qdisc_reset(*old); - return 0; -} + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow = (struct atm_flow_data *)arg; + pr_debug("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n", + sch, p, flow, new, old); + if (list_empty(&flow->list)) + return -EINVAL; + if (!new) + new = &noop_qdisc; + *old = flow->q; + flow->q = new; + if (*old) + qdisc_reset(*old); + return 0; +} -static struct Qdisc *atm_tc_leaf(struct Qdisc *sch,unsigned long cl) +static struct Qdisc *atm_tc_leaf(struct Qdisc *sch, unsigned long cl) { - struct atm_flow_data *flow = (struct atm_flow_data *) cl; + struct atm_flow_data *flow = (struct atm_flow_data *)cl; - DPRINTK("atm_tc_leaf(sch %p,flow %p)\n",sch,flow); + pr_debug("atm_tc_leaf(sch %p,flow %p)\n", sch, flow); return flow ? 
flow->q : NULL; } - -static unsigned long atm_tc_get(struct Qdisc *sch,u32 classid) +static unsigned long atm_tc_get(struct Qdisc *sch, u32 classid) { - struct atm_qdisc_data *p __attribute__((unused)) = PRIV(sch); + struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch); struct atm_flow_data *flow; - DPRINTK("atm_tc_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid); - flow = lookup_flow(sch,classid); - if (flow) flow->ref++; - DPRINTK("atm_tc_get: flow %p\n",flow); - return (unsigned long) flow; + pr_debug("atm_tc_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid); + flow = lookup_flow(sch, classid); + if (flow) + flow->ref++; + pr_debug("atm_tc_get: flow %p\n", flow); + return (unsigned long)flow; } - static unsigned long atm_tc_bind_filter(struct Qdisc *sch, - unsigned long parent, u32 classid) + unsigned long parent, u32 classid) { - return atm_tc_get(sch,classid); -} - - -static void destroy_filters(struct atm_flow_data *flow) -{ - struct tcf_proto *filter; - - while ((filter = flow->filter_list)) { - DPRINTK("destroy_filters: destroying filter %p\n",filter); - flow->filter_list = filter->next; - tcf_destroy(filter); - } + return atm_tc_get(sch, classid); } - /* * atm_tc_put handles all destructions, including the ones that are explicitly * requested (atm_tc_destroy, etc.). The assumption here is that we never drop * anything that still seems to be in use. */ - static void atm_tc_put(struct Qdisc *sch, unsigned long cl) { - struct atm_qdisc_data *p = PRIV(sch); - struct atm_flow_data *flow = (struct atm_flow_data *) cl; - struct atm_flow_data **prev; - - DPRINTK("atm_tc_put(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); - if (--flow->ref) return; - DPRINTK("atm_tc_put: destroying\n"); - for (prev = &p->flows; *prev; prev = &(*prev)->next) - if (*prev == flow) break; - if (!*prev) { - printk(KERN_CRIT "atm_tc_put: class %p not found\n",flow); + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow = (struct atm_flow_data *)cl; + + pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); + if (--flow->ref) return; - } - *prev = flow->next; - DPRINTK("atm_tc_put: qdisc %p\n",flow->q); + pr_debug("atm_tc_put: destroying\n"); + list_del_init(&flow->list); + pr_debug("atm_tc_put: qdisc %p\n", flow->q); qdisc_destroy(flow->q); - destroy_filters(flow); + tcf_destroy_chain(&flow->filter_list); if (flow->sock) { - DPRINTK("atm_tc_put: f_count %d\n", - file_count(flow->sock->file)); + pr_debug("atm_tc_put: f_count %ld\n", + file_count(flow->sock->file)); flow->vcc->pop = flow->old_pop; sockfd_put(flow->sock); } - if (flow->excess) atm_tc_put(sch,(unsigned long) flow->excess); - if (flow != &p->link) kfree(flow); + if (flow->excess) + atm_tc_put(sch, (unsigned long)flow->excess); + if (flow != &p->link) + kfree(flow); /* * If flow == &p->link, the qdisc no longer works at this point and * needs to be removed. (By the caller of atm_tc_put.) 
 	 */
 }
 
-
-static void sch_atm_pop(struct atm_vcc *vcc,struct sk_buff *skb)
+static void sch_atm_pop(struct atm_vcc *vcc, struct sk_buff *skb)
 {
 	struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent;
 
-	D2PRINTK("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n",vcc,skb,p);
-	VCC2FLOW(vcc)->old_pop(vcc,skb);
+	pr_debug("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n", vcc, skb, p);
+	VCC2FLOW(vcc)->old_pop(vcc, skb);
 	tasklet_schedule(&p->task);
 }
 
 static const u8 llc_oui_ip[] = {
-	0xaa,		/* DSAP: non-ISO */
-	0xaa,		/* SSAP: non-ISO */
-	0x03,		/* Ctrl: Unnumbered Information Command PDU */
-	0x00,		/* OUI: EtherType */
+	0xaa,			/* DSAP: non-ISO */
+	0xaa,			/* SSAP: non-ISO */
+	0x03,			/* Ctrl: Unnumbered Information Command PDU */
+	0x00,			/* OUI: EtherType */
 	0x00, 0x00,
-	0x08, 0x00 };	/* Ethertype IP (0800) */
+	0x08, 0x00
+};			/* Ethertype IP (0800) */
+
+static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = {
+	[TCA_ATM_FD]		= { .type = NLA_U32 },
+	[TCA_ATM_EXCESS]	= { .type = NLA_U32 },
+};
 
 static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
-			 struct rtattr **tca, unsigned long *arg)
+			 struct nlattr **tca, unsigned long *arg)
 {
-	struct atm_qdisc_data *p = PRIV(sch);
-	struct atm_flow_data *flow = (struct atm_flow_data *) *arg;
+	struct atm_qdisc_data *p = qdisc_priv(sch);
+	struct atm_flow_data *flow = (struct atm_flow_data *)*arg;
 	struct atm_flow_data *excess = NULL;
-	struct rtattr *opt = tca[TCA_OPTIONS-1];
-	struct rtattr *tb[TCA_ATM_MAX];
+	struct nlattr *opt = tca[TCA_OPTIONS];
+	struct nlattr *tb[TCA_ATM_MAX + 1];
 	struct socket *sock;
-	int fd,error,hdr_len;
+	int fd, error, hdr_len;
 	void *hdr;
 
-	DPRINTK("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x,"
-		"flow %p,opt %p)\n",sch,p,classid,parent,flow,opt);
+	pr_debug("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x,"
+		"flow %p,opt %p)\n", sch, p, classid, parent, flow, opt);
 	/*
 	 * The concept of parents doesn't apply for this qdisc.
 	 */
@@ -254,210 +208,219 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
 	 * class needs to be removed and a new one added. (This may be changed
 	 * later.)
*/ - if (flow) return -EBUSY; - if (opt == NULL || rtattr_parse_nested(tb, TCA_ATM_MAX, opt)) + if (flow) + return -EBUSY; + if (opt == NULL) return -EINVAL; - if (!tb[TCA_ATM_FD-1] || RTA_PAYLOAD(tb[TCA_ATM_FD-1]) < sizeof(fd)) + + error = nla_parse_nested(tb, TCA_ATM_MAX, opt, atm_policy); + if (error < 0) + return error; + + if (!tb[TCA_ATM_FD]) return -EINVAL; - fd = *(int *) RTA_DATA(tb[TCA_ATM_FD-1]); - DPRINTK("atm_tc_change: fd %d\n",fd); - if (tb[TCA_ATM_HDR-1]) { - hdr_len = RTA_PAYLOAD(tb[TCA_ATM_HDR-1]); - hdr = RTA_DATA(tb[TCA_ATM_HDR-1]); - } - else { + fd = nla_get_u32(tb[TCA_ATM_FD]); + pr_debug("atm_tc_change: fd %d\n", fd); + if (tb[TCA_ATM_HDR]) { + hdr_len = nla_len(tb[TCA_ATM_HDR]); + hdr = nla_data(tb[TCA_ATM_HDR]); + } else { hdr_len = RFC1483LLC_LEN; - hdr = NULL; /* default LLC/SNAP for IP */ + hdr = NULL; /* default LLC/SNAP for IP */ } - if (!tb[TCA_ATM_EXCESS-1]) excess = NULL; + if (!tb[TCA_ATM_EXCESS]) + excess = NULL; else { - if (RTA_PAYLOAD(tb[TCA_ATM_EXCESS-1]) != sizeof(u32)) - return -EINVAL; - excess = (struct atm_flow_data *) atm_tc_get(sch, - *(u32 *) RTA_DATA(tb[TCA_ATM_EXCESS-1])); - if (!excess) return -ENOENT; + excess = (struct atm_flow_data *) + atm_tc_get(sch, nla_get_u32(tb[TCA_ATM_EXCESS])); + if (!excess) + return -ENOENT; } - DPRINTK("atm_tc_change: type %d, payload %d, hdr_len %d\n", - opt->rta_type,RTA_PAYLOAD(opt),hdr_len); - if (!(sock = sockfd_lookup(fd,&error))) return error; /* f_count++ */ - DPRINTK("atm_tc_change: f_count %d\n",file_count(sock->file)); - if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) { + pr_debug("atm_tc_change: type %d, payload %d, hdr_len %d\n", + opt->nla_type, nla_len(opt), hdr_len); + sock = sockfd_lookup(fd, &error); + if (!sock) + return error; /* f_count++ */ + pr_debug("atm_tc_change: f_count %ld\n", file_count(sock->file)); + if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) { error = -EPROTOTYPE; - goto err_out; + goto err_out; } /* @@@ should check if the socket is really operational or we'll crash on vcc->send */ if (classid) { if (TC_H_MAJ(classid ^ sch->handle)) { - DPRINTK("atm_tc_change: classid mismatch\n"); + pr_debug("atm_tc_change: classid mismatch\n"); error = -EINVAL; goto err_out; } - if (find_flow(p,flow)) { - error = -EEXIST; - goto err_out; - } - } - else { + } else { int i; unsigned long cl; for (i = 1; i < 0x8000; i++) { - classid = TC_H_MAKE(sch->handle,0x8000 | i); - if (!(cl = atm_tc_get(sch,classid))) break; - atm_tc_put(sch,cl); + classid = TC_H_MAKE(sch->handle, 0x8000 | i); + cl = atm_tc_get(sch, classid); + if (!cl) + break; + atm_tc_put(sch, cl); } } - DPRINTK("atm_tc_change: new id %x\n",classid); - flow = kmalloc(sizeof(struct atm_flow_data)+hdr_len,GFP_KERNEL); - DPRINTK("atm_tc_change: flow %p\n",flow); + pr_debug("atm_tc_change: new id %x\n", classid); + flow = kzalloc(sizeof(struct atm_flow_data) + hdr_len, GFP_KERNEL); + pr_debug("atm_tc_change: flow %p\n", flow); if (!flow) { error = -ENOBUFS; goto err_out; } - memset(flow,0,sizeof(*flow)); flow->filter_list = NULL; - if (!(flow->q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops))) + flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); + if (!flow->q) flow->q = &noop_qdisc; - DPRINTK("atm_tc_change: qdisc %p\n",flow->q); + pr_debug("atm_tc_change: qdisc %p\n", flow->q); flow->sock = sock; - flow->vcc = ATM_SD(sock); /* speedup */ + flow->vcc = ATM_SD(sock); /* speedup */ flow->vcc->user_back = flow; - DPRINTK("atm_tc_change: vcc %p\n",flow->vcc); + 
pr_debug("atm_tc_change: vcc %p\n", flow->vcc); flow->old_pop = flow->vcc->pop; flow->parent = p; flow->vcc->pop = sch_atm_pop; flow->classid = classid; flow->ref = 1; flow->excess = excess; - flow->next = p->link.next; - p->link.next = flow; + list_add(&flow->list, &p->link.list); flow->hdr_len = hdr_len; if (hdr) - memcpy(flow->hdr,hdr,hdr_len); + memcpy(flow->hdr, hdr, hdr_len); else - memcpy(flow->hdr,llc_oui_ip,sizeof(llc_oui_ip)); - *arg = (unsigned long) flow; + memcpy(flow->hdr, llc_oui_ip, sizeof(llc_oui_ip)); + *arg = (unsigned long)flow; return 0; err_out: - if (excess) atm_tc_put(sch,(unsigned long) excess); + if (excess) + atm_tc_put(sch, (unsigned long)excess); sockfd_put(sock); return error; } - -static int atm_tc_delete(struct Qdisc *sch,unsigned long arg) +static int atm_tc_delete(struct Qdisc *sch, unsigned long arg) { - struct atm_qdisc_data *p = PRIV(sch); - struct atm_flow_data *flow = (struct atm_flow_data *) arg; + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow = (struct atm_flow_data *)arg; - DPRINTK("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); - if (!find_flow(PRIV(sch),flow)) return -EINVAL; - if (flow->filter_list || flow == &p->link) return -EBUSY; + pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); + if (list_empty(&flow->list)) + return -EINVAL; + if (flow->filter_list || flow == &p->link) + return -EBUSY; /* * Reference count must be 2: one for "keepalive" (set at class * creation), and one for the reference held when calling delete. */ if (flow->ref < 2) { - printk(KERN_ERR "atm_tc_delete: flow->ref == %d\n",flow->ref); + pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref); return -EINVAL; } - if (flow->ref > 2) return -EBUSY; /* catch references via excess, etc.*/ - atm_tc_put(sch,arg); + if (flow->ref > 2) + return -EBUSY; /* catch references via excess, etc. */ + atm_tc_put(sch, arg); return 0; } - -static void atm_tc_walk(struct Qdisc *sch,struct qdisc_walker *walker) +static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; - DPRINTK("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker); - if (walker->stop) return; - for (flow = p->flows; flow; flow = flow->next) { - if (walker->count >= walker->skip) - if (walker->fn(sch,(unsigned long) flow,walker) < 0) { - walker->stop = 1; - break; - } + pr_debug("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); + if (walker->stop) + return; + list_for_each_entry(flow, &p->flows, list) { + if (walker->count >= walker->skip && + walker->fn(sch, (unsigned long)flow, walker) < 0) { + walker->stop = 1; + break; + } walker->count++; } } - -static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch,unsigned long cl) +static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch, unsigned long cl) { - struct atm_qdisc_data *p = PRIV(sch); - struct atm_flow_data *flow = (struct atm_flow_data *) cl; + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow = (struct atm_flow_data *)cl; - DPRINTK("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); - return flow ? &flow->filter_list : &p->link.filter_list; + pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); + return flow ? 
&flow->filter_list : &p->link.filter_list; } - /* --------------------------- Qdisc operations ---------------------------- */ - -static int atm_tc_enqueue(struct sk_buff *skb,struct Qdisc *sch) +static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); - struct atm_flow_data *flow = NULL ; /* @@@ */ + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow; struct tcf_result res; int result; int ret = NET_XMIT_POLICED; - D2PRINTK("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); - result = TC_POLICE_OK; /* be nice to gcc */ + pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); + result = TC_POLICE_OK; /* be nice to gcc */ + flow = NULL; if (TC_H_MAJ(skb->priority) != sch->handle || - !(flow = (struct atm_flow_data *) atm_tc_get(sch,skb->priority))) - for (flow = p->flows; flow; flow = flow->next) + !(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) { + list_for_each_entry(flow, &p->flows, list) { if (flow->filter_list) { - result = tc_classify(skb,flow->filter_list, - &res); - if (result < 0) continue; - flow = (struct atm_flow_data *) res.class; - if (!flow) flow = lookup_flow(sch,res.classid); - break; + result = tc_classify_compat(skb, + flow->filter_list, + &res); + if (result < 0) + continue; + flow = (struct atm_flow_data *)res.class; + if (!flow) + flow = lookup_flow(sch, res.classid); + goto done; } - if (!flow) flow = &p->link; - else { + } + flow = NULL; +done: + ; + } + if (!flow) { + flow = &p->link; + } else { if (flow->vcc) ATM_SKB(skb)->atm_options = flow->vcc->atm_options; - /*@@@ looks good ... but it's not supposed to work :-)*/ -#ifdef CONFIG_NET_CLS_POLICE + /*@@@ looks good ... but it's not supposed to work :-) */ +#ifdef CONFIG_NET_CLS_ACT switch (result) { - case TC_POLICE_SHOT: - kfree_skb(skb); - break; - case TC_POLICE_RECLASSIFY: - if (flow->excess) flow = flow->excess; - else { - ATM_SKB(skb)->atm_options |= - ATM_ATMOPT_CLP; - break; - } - /* fall through */ - case TC_POLICE_OK: - /* fall through */ - default: - break; + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + kfree_skb(skb); + return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + kfree_skb(skb); + goto drop; + case TC_POLICE_RECLASSIFY: + if (flow->excess) + flow = flow->excess; + else + ATM_SKB(skb)->atm_options |= ATM_ATMOPT_CLP; + break; } #endif } - if ( -#ifdef CONFIG_NET_CLS_POLICE - result == TC_POLICE_SHOT || -#endif - (ret = flow->q->enqueue(skb,flow->q)) != 0) { - sch->qstats.drops++; - if (flow) flow->qstats.drops++; + + ret = qdisc_enqueue(skb, flow->q); + if (ret != NET_XMIT_SUCCESS) { +drop: __maybe_unused + if (net_xmit_drop_count(ret)) { + sch->qstats.drops++; + if (flow) + flow->qstats.drops++; + } return ret; } - sch->bstats.bytes += skb->len; - sch->bstats.packets++; - flow->bstats.bytes += skb->len; - flow->bstats.packets++; /* * Okay, this may seem weird. We pretend we've dropped the packet if * it goes via ATM. The reason for this is that the outer qdisc @@ -469,13 +432,12 @@ static int atm_tc_enqueue(struct sk_buff *skb,struct Qdisc *sch) */ if (flow == &p->link) { sch->q.qlen++; - return 0; + return NET_XMIT_SUCCESS; } tasklet_schedule(&p->task); - return NET_XMIT_BYPASS; + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } - /* * Dequeue packets and send them over ATM. 
Note that we quite deliberately * avoid checking net_device's flow control here, simply because sch_atm @@ -483,195 +445,195 @@ static int atm_tc_enqueue(struct sk_buff *skb,struct Qdisc *sch) * non-ATM interfaces. */ - static void sch_atm_dequeue(unsigned long data) { - struct Qdisc *sch = (struct Qdisc *) data; - struct atm_qdisc_data *p = PRIV(sch); + struct Qdisc *sch = (struct Qdisc *)data; + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; struct sk_buff *skb; - D2PRINTK("sch_atm_dequeue(sch %p,[qdisc %p])\n",sch,p); - for (flow = p->link.next; flow; flow = flow->next) + pr_debug("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p); + list_for_each_entry(flow, &p->flows, list) { + if (flow == &p->link) + continue; /* * If traffic is properly shaped, this won't generate nasty * little bursts. Otherwise, it may ... (but that's okay) */ - while ((skb = flow->q->dequeue(flow->q))) { - if (!atm_may_send(flow->vcc,skb->truesize)) { - (void) flow->q->ops->requeue(skb,flow->q); + while ((skb = flow->q->ops->peek(flow->q))) { + if (!atm_may_send(flow->vcc, skb->truesize)) break; - } - D2PRINTK("atm_tc_dequeue: sending on class %p\n",flow); + + skb = qdisc_dequeue_peeked(flow->q); + if (unlikely(!skb)) + break; + + qdisc_bstats_update(sch, skb); + bstats_update(&flow->bstats, skb); + pr_debug("atm_tc_dequeue: sending on class %p\n", flow); /* remove any LL header somebody else has attached */ - skb_pull(skb,(char *) skb->nh.iph-(char *) skb->data); + skb_pull(skb, skb_network_offset(skb)); if (skb_headroom(skb) < flow->hdr_len) { struct sk_buff *new; - new = skb_realloc_headroom(skb,flow->hdr_len); + new = skb_realloc_headroom(skb, flow->hdr_len); dev_kfree_skb(skb); - if (!new) continue; + if (!new) + continue; skb = new; } - D2PRINTK("sch_atm_dequeue: ip %p, data %p\n", - skb->nh.iph,skb->data); + pr_debug("sch_atm_dequeue: ip %p, data %p\n", + skb_network_header(skb), skb->data); ATM_SKB(skb)->vcc = flow->vcc; - memcpy(skb_push(skb,flow->hdr_len),flow->hdr, - flow->hdr_len); + memcpy(skb_push(skb, flow->hdr_len), flow->hdr, + flow->hdr_len); atomic_add(skb->truesize, &sk_atm(flow->vcc)->sk_wmem_alloc); /* atm.atm_options are already set by atm_tc_enqueue */ - (void) flow->vcc->send(flow->vcc,skb); + flow->vcc->send(flow->vcc, skb); } + } } - static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct sk_buff *skb; - D2PRINTK("atm_tc_dequeue(sch %p,[qdisc %p])\n",sch,p); + pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p); tasklet_schedule(&p->task); - skb = p->link.q->dequeue(p->link.q); - if (skb) sch->q.qlen--; + skb = qdisc_dequeue_peeked(p->link.q); + if (skb) + sch->q.qlen--; return skb; } - -static int atm_tc_requeue(struct sk_buff *skb,struct Qdisc *sch) +static struct sk_buff *atm_tc_peek(struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); - int ret; - - D2PRINTK("atm_tc_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); - ret = p->link.q->ops->requeue(skb,p->link.q); - if (!ret) { - sch->q.qlen++; - sch->qstats.requeues++; - } else { - sch->qstats.drops++; - p->link.qstats.drops++; - } - return ret; -} + struct atm_qdisc_data *p = qdisc_priv(sch); + pr_debug("atm_tc_peek(sch %p,[qdisc %p])\n", sch, p); + + return p->link.q->ops->peek(p->link.q); +} static unsigned int atm_tc_drop(struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; unsigned int len; - DPRINTK("atm_tc_drop(sch 
%p,[qdisc %p])\n",sch,p); - for (flow = p->flows; flow; flow = flow->next) + pr_debug("atm_tc_drop(sch %p,[qdisc %p])\n", sch, p); + list_for_each_entry(flow, &p->flows, list) { if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q))) return len; + } return 0; } - -static int atm_tc_init(struct Qdisc *sch,struct rtattr *opt) +static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) { - struct atm_qdisc_data *p = PRIV(sch); - - DPRINTK("atm_tc_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); - p->flows = &p->link; - if(!(p->link.q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops))) + struct atm_qdisc_data *p = qdisc_priv(sch); + + pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); + INIT_LIST_HEAD(&p->flows); + INIT_LIST_HEAD(&p->link.list); + list_add(&p->link.list, &p->flows); + p->link.q = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, sch->handle); + if (!p->link.q) p->link.q = &noop_qdisc; - DPRINTK("atm_tc_init: link (%p) qdisc %p\n",&p->link,p->link.q); + pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); p->link.filter_list = NULL; p->link.vcc = NULL; p->link.sock = NULL; p->link.classid = sch->handle; p->link.ref = 1; - p->link.next = NULL; - tasklet_init(&p->task,sch_atm_dequeue,(unsigned long) sch); + tasklet_init(&p->task, sch_atm_dequeue, (unsigned long)sch); return 0; } - static void atm_tc_reset(struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; - DPRINTK("atm_tc_reset(sch %p,[qdisc %p])\n",sch,p); - for (flow = p->flows; flow; flow = flow->next) qdisc_reset(flow->q); + pr_debug("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p); + list_for_each_entry(flow, &p->flows, list) + qdisc_reset(flow->q); sch->q.qlen = 0; } - static void atm_tc_destroy(struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); - struct atm_flow_data *flow; + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow, *tmp; + + pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p); + list_for_each_entry(flow, &p->flows, list) + tcf_destroy_chain(&flow->filter_list); - DPRINTK("atm_tc_destroy(sch %p,[qdisc %p])\n",sch,p); - /* races ? 
*/ - while ((flow = p->flows)) { - destroy_filters(flow); + list_for_each_entry_safe(flow, tmp, &p->flows, list) { if (flow->ref > 1) - printk(KERN_ERR "atm_destroy: %p->ref = %d\n",flow, - flow->ref); - atm_tc_put(sch,(unsigned long) flow); - if (p->flows == flow) { - printk(KERN_ERR "atm_destroy: putting flow %p didn't " - "kill it\n",flow); - p->flows = flow->next; /* brute force */ - break; - } + pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref); + atm_tc_put(sch, (unsigned long)flow); } tasklet_kill(&p->task); } - static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, - struct sk_buff *skb, struct tcmsg *tcm) + struct sk_buff *skb, struct tcmsg *tcm) { - struct atm_qdisc_data *p = PRIV(sch); - struct atm_flow_data *flow = (struct atm_flow_data *) cl; - unsigned char *b = skb->tail; - struct rtattr *rta; - - DPRINTK("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n", - sch,p,flow,skb,tcm); - if (!find_flow(p,flow)) return -EINVAL; + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow = (struct atm_flow_data *)cl; + struct nlattr *nest; + + pr_debug("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n", + sch, p, flow, skb, tcm); + if (list_empty(&flow->list)) + return -EINVAL; tcm->tcm_handle = flow->classid; - rta = (struct rtattr *) b; - RTA_PUT(skb,TCA_OPTIONS,0,NULL); - RTA_PUT(skb,TCA_ATM_HDR,flow->hdr_len,flow->hdr); + tcm->tcm_info = flow->q->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + if (nla_put(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr)) + goto nla_put_failure; if (flow->vcc) { struct sockaddr_atmpvc pvc; int state; + memset(&pvc, 0, sizeof(pvc)); pvc.sap_family = AF_ATMPVC; pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1; pvc.sap_addr.vpi = flow->vcc->vpi; pvc.sap_addr.vci = flow->vcc->vci; - RTA_PUT(skb,TCA_ATM_ADDR,sizeof(pvc),&pvc); + if (nla_put(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc)) + goto nla_put_failure; state = ATM_VF2VS(flow->vcc->flags); - RTA_PUT(skb,TCA_ATM_STATE,sizeof(state),&state); + if (nla_put_u32(skb, TCA_ATM_STATE, state)) + goto nla_put_failure; } - if (flow->excess) - RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(u32),&flow->classid); - else { - static u32 zero; - - RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(zero),&zero); + if (flow->excess) { + if (nla_put_u32(skb, TCA_ATM_EXCESS, flow->classid)) + goto nla_put_failure; + } else { + if (nla_put_u32(skb, TCA_ATM_EXCESS, 0)) + goto nla_put_failure; } - rta->rta_len = skb->tail-b; - return skb->len; + return nla_nest_end(skb, nest); -rtattr_failure: - skb_trim(skb,b-skb->data); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } static int atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, - struct gnet_dump *d) + struct gnet_dump *d) { - struct atm_flow_data *flow = (struct atm_flow_data *) arg; + struct atm_flow_data *flow = (struct atm_flow_data *)arg; flow->qstats.qlen = flow->q->q.qlen; @@ -687,45 +649,42 @@ static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb) return 0; } -static struct Qdisc_class_ops atm_class_ops = { - .graft = atm_tc_graft, - .leaf = atm_tc_leaf, - .get = atm_tc_get, - .put = atm_tc_put, - .change = atm_tc_change, - .delete = atm_tc_delete, - .walk = atm_tc_walk, - .tcf_chain = atm_tc_find_tcf, - .bind_tcf = atm_tc_bind_filter, - .unbind_tcf = atm_tc_put, - .dump = atm_tc_dump_class, - .dump_stats = atm_tc_dump_class_stats, +static const struct Qdisc_class_ops atm_class_ops = { + .graft = atm_tc_graft, + .leaf = atm_tc_leaf, + .get = 
atm_tc_get, + .put = atm_tc_put, + .change = atm_tc_change, + .delete = atm_tc_delete, + .walk = atm_tc_walk, + .tcf_chain = atm_tc_find_tcf, + .bind_tcf = atm_tc_bind_filter, + .unbind_tcf = atm_tc_put, + .dump = atm_tc_dump_class, + .dump_stats = atm_tc_dump_class_stats, }; -static struct Qdisc_ops atm_qdisc_ops = { - .next = NULL, - .cl_ops = &atm_class_ops, - .id = "atm", - .priv_size = sizeof(struct atm_qdisc_data), - .enqueue = atm_tc_enqueue, - .dequeue = atm_tc_dequeue, - .requeue = atm_tc_requeue, - .drop = atm_tc_drop, - .init = atm_tc_init, - .reset = atm_tc_reset, - .destroy = atm_tc_destroy, - .change = NULL, - .dump = atm_tc_dump, - .owner = THIS_MODULE, +static struct Qdisc_ops atm_qdisc_ops __read_mostly = { + .cl_ops = &atm_class_ops, + .id = "atm", + .priv_size = sizeof(struct atm_qdisc_data), + .enqueue = atm_tc_enqueue, + .dequeue = atm_tc_dequeue, + .peek = atm_tc_peek, + .drop = atm_tc_drop, + .init = atm_tc_init, + .reset = atm_tc_reset, + .destroy = atm_tc_destroy, + .dump = atm_tc_dump, + .owner = THIS_MODULE, }; - static int __init atm_init(void) { return register_qdisc(&atm_qdisc_ops); } -static void __exit atm_exit(void) +static void __exit atm_exit(void) { unregister_qdisc(&atm_qdisc_ops); } diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c index 81f0b8346d1..094a874b48b 100644 --- a/net/sched/sch_blackhole.c +++ b/net/sched/sch_blackhole.c @@ -11,11 +11,9 @@ * Note: Quantum tunneling is not supported. */ -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> @@ -30,11 +28,12 @@ static struct sk_buff *blackhole_dequeue(struct Qdisc *sch) return NULL; } -static struct Qdisc_ops blackhole_qdisc_ops = { +static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = { .id = "blackhole", .priv_size = 0, .enqueue = blackhole_enqueue, .dequeue = blackhole_dequeue, + .peek = blackhole_dequeue, .owner = THIS_MODULE, }; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 6cd81708bf7..ead526467cc 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -10,30 +10,14 @@ * */ -#include <linux/config.h> #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> +#include <net/netlink.h> #include <net/pkt_sched.h> @@ -41,12 +25,12 @@ ======================================= Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource - Management Models for Packet Networks", + Management Models for Packet Networks", IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 - [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995 + [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995 - [3] Sally Floyd, "Notes on Class-Based Queueing: Setting + [3] Sally Floyd, "Notes on Class-Based Queueing: Setting Parameters", 1996 [4] Sally Floyd and Michael Speer, "Experimental Results @@ -60,12 +44,12 @@ the implementation is different. 
Particularly: --- The WRR algorithm is different. Our version looks more - reasonable (I hope) and works when quanta are allowed to be - less than MTU, which is always the case when real time classes - have small rates. Note, that the statement of [3] is - incomplete, delay may actually be estimated even if class - per-round allotment is less than MTU. Namely, if per-round - allotment is W*r_i, and r_1+...+r_k = r < 1 + reasonable (I hope) and works when quanta are allowed to be + less than MTU, which is always the case when real time classes + have small rates. Note, that the statement of [3] is + incomplete, delay may actually be estimated even if class + per-round allotment is less than MTU. Namely, if per-round + allotment is W*r_i, and r_1+...+r_k = r < 1 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B @@ -88,18 +72,16 @@ struct cbq_sched_data; -struct cbq_class -{ - struct cbq_class *next; /* hash table link */ +struct cbq_class { + struct Qdisc_class_common common; struct cbq_class *next_alive; /* next class with backlog in this priority band */ /* Parameters */ - u32 classid; unsigned char priority; /* class priority */ unsigned char priority2; /* priority to be used after overlimit */ unsigned char ewma_log; /* time constant for idle time calculation */ unsigned char ovl_strategy; -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT unsigned char police; #endif @@ -114,7 +96,7 @@ struct cbq_class /* Overlimit strategy parameters */ void (*overlimit)(struct cbq_class *cl); - long penalty; + psched_tdiff_t penalty; /* General scheduler (WRR) parameters */ long allot; @@ -145,11 +127,10 @@ struct cbq_class psched_time_t undertime; long avgidle; long deficit; /* Saved deficit for WRR */ - unsigned long penalized; - struct gnet_stats_basic bstats; + psched_time_t penalized; + struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; - spinlock_t *stats_lock; + struct gnet_stats_rate_est64 rate_est; struct tc_cbq_xstats xstats; struct tcf_proto *filter_list; @@ -157,22 +138,21 @@ struct cbq_class int refcnt; int filters; - struct cbq_class *defaults[TC_PRIO_MAX+1]; + struct cbq_class *defaults[TC_PRIO_MAX + 1]; }; -struct cbq_sched_data -{ - struct cbq_class *classes[16]; /* Hash table of all classes */ - int nclasses[TC_CBQ_MAXPRIO+1]; - unsigned quanta[TC_CBQ_MAXPRIO+1]; +struct cbq_sched_data { + struct Qdisc_class_hash clhash; /* Hash table of all classes */ + int nclasses[TC_CBQ_MAXPRIO + 1]; + unsigned int quanta[TC_CBQ_MAXPRIO + 1]; struct cbq_class link; - unsigned activemask; - struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes + unsigned int activemask; + struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes with backlog */ -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT struct cbq_class *rx_class; #endif struct cbq_class *tx_class; @@ -180,64 +160,58 @@ struct cbq_sched_data int tx_len; psched_time_t now; /* Cached timestamp */ psched_time_t now_rt; /* Cached real time */ - unsigned pmask; + unsigned int pmask; - struct timer_list delay_timer; - struct timer_list wd_timer; /* Watchdog timer, + struct hrtimer delay_timer; + struct qdisc_watchdog watchdog; /* Watchdog timer, started when CBQ has backlog, but cannot transmit just now */ - long wd_expires; + psched_tdiff_t wd_expires; int toplevel; u32 hgenerator; }; -#define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log]) - +#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len) -static __inline__ unsigned cbq_hash(u32 h) 
-{ - h ^= h>>8; - h ^= h>>4; - return h&0xF; -} - -static __inline__ struct cbq_class * +static inline struct cbq_class * cbq_class_lookup(struct cbq_sched_data *q, u32 classid) { - struct cbq_class *cl; + struct Qdisc_class_common *clc; - for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next) - if (cl->classid == classid) - return cl; - return NULL; + clc = qdisc_class_find(&q->clhash, classid); + if (clc == NULL) + return NULL; + return container_of(clc, struct cbq_class, common); } -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT static struct cbq_class * cbq_reclassify(struct sk_buff *skb, struct cbq_class *this) { - struct cbq_class *cl, *new; + struct cbq_class *cl; - for (cl = this->tparent; cl; cl = cl->tparent) - if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this) - return new; + for (cl = this->tparent; cl; cl = cl->tparent) { + struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT]; + if (new != NULL && new != this) + return new; + } return NULL; } #endif /* Classify packet. The procedure is pretty complicated, but - it allows us to combine link sharing and priority scheduling - transparently. - - Namely, you can put link sharing rules (f.e. route based) at root of CBQ, - so that it resolves to split nodes. Then packets are classified - by logical priority, or a more specific classifier may be attached - to the split node. + * it allows us to combine link sharing and priority scheduling + * transparently. + * + * Namely, you can put link sharing rules (f.e. route based) at root of CBQ, + * so that it resolves to split nodes. Then packets are classified + * by logical priority, or a more specific classifier may be attached + * to the split node. */ static struct cbq_class * @@ -253,11 +227,11 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) /* * Step 1. If skb->priority points to one of our classes, use it. */ - if (TC_H_MAJ(prio^sch->handle) == 0 && + if (TC_H_MAJ(prio ^ sch->handle) == 0 && (cl = cbq_class_lookup(q, prio)) != NULL) return cl; - *qerr = NET_XMIT_BYPASS; + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; for (;;) { int result = 0; defmap = head->defaults; @@ -265,35 +239,31 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) /* * Step 2+n. Apply classifier. */ - if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0) + if (!head->filter_list || + (result = tc_classify_compat(skb, head->filter_list, &res)) < 0) goto fallback; - if ((cl = (void*)res.class) == NULL) { + cl = (void *)res.class; + if (!cl) { if (TC_H_MAJ(res.classid)) cl = cbq_class_lookup(q, res.classid); - else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL) + else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL) cl = defmap[TC_PRIO_BESTEFFORT]; - if (cl == NULL || cl->level >= head->level) + if (cl == NULL) goto fallback; } - + if (cl->level >= head->level) + goto fallback; #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: - case TC_ACT_STOLEN: - *qerr = NET_XMIT_SUCCESS; + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; - } -#elif defined(CONFIG_NET_CLS_POLICE) - switch (result) { - case TC_POLICE_RECLASSIFY: + case TC_ACT_RECLASSIFY: return cbq_reclassify(skb, cl); - case TC_POLICE_SHOT: - return NULL; - default: - break; } #endif if (cl->level == 0) @@ -314,7 +284,7 @@ fallback: * Step 4. No success... 
*/ if (TC_H_MAJ(prio) == 0 && - !(cl = head->defaults[prio&TC_PRIO_MAX]) && + !(cl = head->defaults[prio & TC_PRIO_MAX]) && !(cl = head->defaults[TC_PRIO_BESTEFFORT])) return head; @@ -322,12 +292,12 @@ fallback: } /* - A packet has just been enqueued on the empty class. - cbq_activate_class adds it to the tail of active class list - of its priority band. + * A packet has just been enqueued on the empty class. + * cbq_activate_class adds it to the tail of active class list + * of its priority band. */ -static __inline__ void cbq_activate_class(struct cbq_class *cl) +static inline void cbq_activate_class(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); int prio = cl->cpriority; @@ -346,9 +316,9 @@ static __inline__ void cbq_activate_class(struct cbq_class *cl) } /* - Unlink class from active chain. - Note that this same procedure is done directly in cbq_dequeue* - during round-robin procedure. + * Unlink class from active chain. + * Note that this same procedure is done directly in cbq_dequeue* + * during round-robin procedure. */ static void cbq_deactivate_class(struct cbq_class *this) @@ -372,8 +342,6 @@ static void cbq_deactivate_class(struct cbq_class *this) return; } } - - cl = cl_prev->next_alive; return; } } while ((cl_prev = cl) != q->active[prio]); @@ -384,20 +352,20 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) { int toplevel = q->toplevel; - if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) { + if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) { psched_time_t now; psched_tdiff_t incr; - PSCHED_GET_TIME(now); - incr = PSCHED_TDIFF(now, q->now_rt); - PSCHED_TADD2(q->now, incr, now); + now = psched_get_time(); + incr = now - q->now_rt; + now = q->now + incr; do { - if (PSCHED_TLESS(cl->undertime, now)) { + if (cl->undertime < now) { q->toplevel = cl->level; return; } - } while ((cl=cl->borrow) != NULL && toplevel > cl->level); + } while ((cl = cl->borrow) != NULL && toplevel > cl->level); } } @@ -405,68 +373,36 @@ static int cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); - int len = skb->len; - int ret; + int uninitialized_var(ret); struct cbq_class *cl = cbq_classify(skb, sch, &ret); -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT q->rx_class = cl; #endif if (cl == NULL) { - if (ret == NET_XMIT_BYPASS) + if (ret & __NET_XMIT_BYPASS) sch->qstats.drops++; kfree_skb(skb); return ret; } -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT cl->q->__parent = sch; #endif - if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) { + ret = qdisc_enqueue(skb, cl->q); + if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; - sch->bstats.packets++; - sch->bstats.bytes+=len; cbq_mark_toplevel(q, cl); if (!cl->next_alive) cbq_activate_class(cl); return ret; } - sch->qstats.drops++; - cbq_mark_toplevel(q, cl); - cl->qstats.drops++; - return ret; -} - -static int -cbq_requeue(struct sk_buff *skb, struct Qdisc *sch) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl; - int ret; - - if ((cl = q->tx_class) == NULL) { - kfree_skb(skb); + if (net_xmit_drop_count(ret)) { sch->qstats.drops++; - return NET_XMIT_CN; - } - q->tx_class = NULL; - - cbq_mark_toplevel(q, cl); - -#ifdef CONFIG_NET_CLS_POLICE - q->rx_class = cl; - cl->q->__parent = sch; -#endif - if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) { - sch->q.qlen++; - sch->qstats.requeues++; - if (!cl->next_alive) - cbq_activate_class(cl); - return 0; + cbq_mark_toplevel(q, cl); + cl->qstats.drops++; } 
- sch->qstats.drops++; - cl->qstats.drops++; return ret; } @@ -477,17 +413,17 @@ cbq_requeue(struct sk_buff *skb, struct Qdisc *sch) static void cbq_ovl_classic(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now); + psched_tdiff_t delay = cl->undertime - q->now; if (!cl->delayed) { delay += cl->offtime; - /* - Class goes to sleep, so that it will have no - chance to work avgidle. Let's forgive it 8) - - BTW cbq-2.0 has a crap in this - place, apparently they forgot to shift it by cl->ewma_log. + /* + * Class goes to sleep, so that it will have no + * chance to work avgidle. Let's forgive it 8) + * + * BTW cbq-2.0 has a crap in this + * place, apparently they forgot to shift it by cl->ewma_log. */ if (cl->avgidle < 0) delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); @@ -495,7 +431,7 @@ static void cbq_ovl_classic(struct cbq_class *cl) cl->avgidle = cl->minidle; if (delay <= 0) delay = 1; - PSCHED_TADD2(q->now, delay, cl->undertime); + cl->undertime = q->now + delay; cl->xstats.overactions++; cl->delayed = 1; @@ -504,15 +440,15 @@ static void cbq_ovl_classic(struct cbq_class *cl) q->wd_expires = delay; /* Dirty work! We must schedule wakeups based on - real available rate, rather than leaf rate, - which may be tiny (even zero). + * real available rate, rather than leaf rate, + * which may be tiny (even zero). */ if (q->toplevel == TC_CBQ_MAXLEVEL) { struct cbq_class *b; psched_tdiff_t base_delay = q->wd_expires; for (b = cl->borrow; b; b = b->borrow) { - delay = PSCHED_TDIFF(b->undertime, q->now); + delay = b->undertime - q->now; if (delay < base_delay) { if (delay <= 0) delay = 1; @@ -525,7 +461,7 @@ static void cbq_ovl_classic(struct cbq_class *cl) } /* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when - they go overlimit + * they go overlimit */ static void cbq_ovl_rclassic(struct cbq_class *cl) @@ -550,27 +486,36 @@ static void cbq_ovl_rclassic(struct cbq_class *cl) static void cbq_ovl_delay(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now); + psched_tdiff_t delay = cl->undertime - q->now; + + if (test_bit(__QDISC_STATE_DEACTIVATED, + &qdisc_root_sleeping(cl->qdisc)->state)) + return; if (!cl->delayed) { - unsigned long sched = jiffies; + psched_time_t sched = q->now; + ktime_t expires; delay += cl->offtime; if (cl->avgidle < 0) delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); if (cl->avgidle < cl->minidle) cl->avgidle = cl->minidle; - PSCHED_TADD2(q->now, delay, cl->undertime); + cl->undertime = q->now + delay; if (delay > 0) { - sched += PSCHED_US2JIFFIE(delay) + cl->penalty; + sched += delay + cl->penalty; cl->penalized = sched; cl->cpriority = TC_CBQ_MAXPRIO; q->pmask |= (1<<TC_CBQ_MAXPRIO); - if (del_timer(&q->delay_timer) && - (long)(q->delay_timer.expires - sched) > 0) - q->delay_timer.expires = sched; - add_timer(&q->delay_timer); + + expires = ns_to_ktime(PSCHED_TICKS2NS(sched)); + if (hrtimer_try_to_cancel(&q->delay_timer) && + ktime_to_ns(ktime_sub( + hrtimer_get_expires(&q->delay_timer), + expires)) > 0) + hrtimer_set_expires(&q->delay_timer, expires); + hrtimer_restart(&q->delay_timer); cl->delayed = 1; cl->xstats.overactions++; return; @@ -587,7 +532,7 @@ static void cbq_ovl_lowprio(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - cl->penalized = jiffies + cl->penalty; + cl->penalized = q->now + cl->penalty; if (cl->cpriority != cl->priority2) 
{ cl->cpriority = cl->priority2; @@ -608,27 +553,19 @@ static void cbq_ovl_drop(struct cbq_class *cl) cbq_ovl_classic(cl); } -static void cbq_watchdog(unsigned long arg) -{ - struct Qdisc *sch = (struct Qdisc*)arg; - - sch->flags &= ~TCQ_F_THROTTLED; - netif_schedule(sch->dev); -} - -static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio) +static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio, + psched_time_t now) { struct cbq_class *cl; struct cbq_class *cl_prev = q->active[prio]; - unsigned long now = jiffies; - unsigned long sched = now; + psched_time_t sched = now; if (cl_prev == NULL) - return now; + return 0; do { cl = cl_prev->next_alive; - if ((long)(now - cl->penalized) > 0) { + if (now - cl->penalized > 0) { cl_prev->next_alive = cl->next_alive; cl->next_alive = NULL; cl->cpriority = cl->priority; @@ -644,30 +581,34 @@ static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio) } cl = cl_prev->next_alive; - } else if ((long)(sched - cl->penalized) > 0) + } else if (sched - cl->penalized > 0) sched = cl->penalized; } while ((cl_prev = cl) != q->active[prio]); - return (long)(sched - now); + return sched - now; } -static void cbq_undelay(unsigned long arg) +static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) { - struct Qdisc *sch = (struct Qdisc*)arg; - struct cbq_sched_data *q = qdisc_priv(sch); - long delay = 0; - unsigned pmask; + struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data, + delay_timer); + struct Qdisc *sch = q->watchdog.qdisc; + psched_time_t now; + psched_tdiff_t delay = 0; + unsigned int pmask; + + now = psched_get_time(); pmask = q->pmask; q->pmask = 0; while (pmask) { int prio = ffz(~pmask); - long tmp; + psched_tdiff_t tmp; pmask &= ~(1<<prio); - tmp = cbq_undelay_prio(q, prio); + tmp = cbq_undelay_prio(q, prio, now); if (tmp > 0) { q->pmask |= 1<<prio; if (tmp < delay || delay == 0) @@ -676,20 +617,21 @@ static void cbq_undelay(unsigned long arg) } if (delay) { - q->delay_timer.expires = jiffies + delay; - add_timer(&q->delay_timer); + ktime_t time; + + time = ktime_set(0, 0); + time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay)); + hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS); } - sch->flags &= ~TCQ_F_THROTTLED; - netif_schedule(sch->dev); + qdisc_unthrottled(sch); + __netif_schedule(qdisc_root(sch)); + return HRTIMER_NORESTART; } - -#ifdef CONFIG_NET_CLS_POLICE - +#ifdef CONFIG_NET_CLS_ACT static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) { - int len = skb->len; struct Qdisc *sch = child->__parent; struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = q->rx_class; @@ -697,21 +639,22 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) q->rx_class = NULL; if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) { + int ret; cbq_mark_toplevel(q, cl); q->rx_class = cl; cl->q->__parent = sch; - if (cl->q->enqueue(skb, cl->q) == 0) { + ret = qdisc_enqueue(skb, cl->q); + if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; - sch->bstats.packets++; - sch->bstats.bytes+=len; if (!cl->next_alive) cbq_activate_class(cl); return 0; } - sch->qstats.drops++; + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; return 0; } @@ -720,29 +663,29 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) } #endif -/* - It is mission critical procedure. - - We "regenerate" toplevel cutoff, if transmitting class - has backlog and it is not regulated. It is not part of - original CBQ description, but looks more reasonable. 
-   Probably, it is wrong. This question needs further investigation.
-*/
+/*
+ * It is mission critical procedure.
+ *
+ * We "regenerate" toplevel cutoff, if transmitting class
+ * has backlog and it is not regulated. It is not part of
+ * original CBQ description, but looks more reasonable.
+ * Probably, it is wrong. This question needs further investigation.
+ */
 
-static __inline__ void
+static inline void
 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
 		    struct cbq_class *borrowed)
 {
 	if (cl && q->toplevel >= borrowed->level) {
 		if (cl->q->q.qlen > 1) {
 			do {
-				if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
+				if (borrowed->undertime == PSCHED_PASTPERFECT) {
 					q->toplevel = borrowed->level;
 					return;
 				}
-			} while ((borrowed=borrowed->borrow) != NULL);
+			} while ((borrowed = borrowed->borrow) != NULL);
 		}
-#if 0
+#if 0
 	/* It is not necessary now. Uncommenting it
 	   will save CPU cycles, but decrease fairness.
 	 */
@@ -768,22 +711,22 @@ cbq_update(struct cbq_sched_data *q)
 		cl->bstats.bytes += len;
 
 		/*
-		   (now - last) is total time between packet right edges.
-		   (last_pktlen/rate) is "virtual" busy time, so that
-
-		   idle = (now - last) - last_pktlen/rate
+		 * (now - last) is total time between packet right edges.
+		 * (last_pktlen/rate) is "virtual" busy time, so that
+		 *
+		 *	idle = (now - last) - last_pktlen/rate
 		 */
 
-		idle = PSCHED_TDIFF(q->now, cl->last);
+		idle = q->now - cl->last;
 		if ((unsigned long)idle > 128*1024*1024) {
 			avgidle = cl->maxidle;
 		} else {
 			idle -= L2T(cl, len);
 
 /* true_avgidle := (1-W)*true_avgidle + W*idle,
-   where W=2^{-ewma_log}. But cl->avgidle is scaled:
-   cl->avgidle == true_avgidle/W,
-   hence:
+ * where W=2^{-ewma_log}. But cl->avgidle is scaled:
+ * cl->avgidle == true_avgidle/W,
+ * hence:
 */
 			avgidle += idle - (avgidle>>cl->ewma_log);
 		}
@@ -797,34 +740,32 @@ cbq_update(struct cbq_sched_data *q)
 			cl->avgidle = avgidle;
 
 		/* Calculate expected time, when this class
-		   will be allowed to send.
-		   It will occur, when:
-		   (1-W)*true_avgidle + W*delay = 0, i.e.
-		   idle = (1/W - 1)*(-true_avgidle)
-		   or
-		   idle = (1 - W)*(-cl->avgidle);
+		 * will be allowed to send.
+		 * It will occur, when:
+		 * (1-W)*true_avgidle + W*delay = 0, i.e.
+		 * idle = (1/W - 1)*(-true_avgidle)
+		 * or
+		 * idle = (1 - W)*(-cl->avgidle);
 		 */
 		idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
 
 		/*
-		   That is not all.
-		   To maintain the rate allocated to the class,
-		   we add to undertime virtual clock,
-		   necessary to complete transmitted packet.
-		   (len/phys_bandwidth has been already passed
-		   to the moment of cbq_update)
+		 * That is not all.
+		 * To maintain the rate allocated to the class,
+		 * we add to undertime virtual clock,
+		 * necessary to complete transmitted packet.
+ * (len/phys_bandwidth has been already passed + * to the moment of cbq_update) */ idle -= L2T(&q->link, len); idle += L2T(cl, len); - PSCHED_AUDIT_TDIFF(idle); - - PSCHED_TADD2(q->now, idle, cl->undertime); + cl->undertime = q->now + idle; } else { /* Underlimit */ - PSCHED_SET_PASTPERFECT(cl->undertime); + cl->undertime = PSCHED_PASTPERFECT; if (avgidle > cl->maxidle) cl->avgidle = cl->maxidle; else @@ -836,7 +777,7 @@ cbq_update(struct cbq_sched_data *q) cbq_update_toplevel(q, this, q->tx_borrowed); } -static __inline__ struct cbq_class * +static inline struct cbq_class * cbq_under_limit(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); @@ -845,38 +786,37 @@ cbq_under_limit(struct cbq_class *cl) if (cl->tparent == NULL) return cl; - if (PSCHED_IS_PASTPERFECT(cl->undertime) || - !PSCHED_TLESS(q->now, cl->undertime)) { + if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) { cl->delayed = 0; return cl; } do { /* It is very suspicious place. Now overlimit - action is generated for not bounded classes - only if link is completely congested. - Though it is in agree with ancestor-only paradigm, - it looks very stupid. Particularly, - it means that this chunk of code will either - never be called or result in strong amplification - of burstiness. Dangerous, silly, and, however, - no another solution exists. + * action is generated for not bounded classes + * only if link is completely congested. + * Though it is in agree with ancestor-only paradigm, + * it looks very stupid. Particularly, + * it means that this chunk of code will either + * never be called or result in strong amplification + * of burstiness. Dangerous, silly, and, however, + * no another solution exists. */ - if ((cl = cl->borrow) == NULL) { + cl = cl->borrow; + if (!cl) { this_cl->qstats.overlimits++; this_cl->overlimit(this_cl); return NULL; } if (cl->level > q->toplevel) return NULL; - } while (!PSCHED_IS_PASTPERFECT(cl->undertime) && - PSCHED_TLESS(q->now, cl->undertime)); + } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime); cl->delayed = 0; return cl; } -static __inline__ struct sk_buff * +static inline struct sk_buff * cbq_dequeue_prio(struct Qdisc *sch, int prio) { struct cbq_sched_data *q = qdisc_priv(sch); @@ -900,7 +840,7 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio) if (cl->deficit <= 0) { /* Class exhausted its allotment per - this round. Switch to the next one. + * this round. Switch to the next one. */ deficit = 1; cl->deficit += cl->quantum; @@ -910,13 +850,13 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio) skb = cl->q->dequeue(cl->q); /* Class did not give us any skb :-( - It could occur even if cl->q->q.qlen != 0 - f.e. if cl->q == "tbf" + * It could occur even if cl->q->q.qlen != 0 + * f.e. if cl->q == "tbf" */ if (skb == NULL) goto skip_class; - cl->deficit -= skb->len; + cl->deficit -= qdisc_pkt_len(skb); q->tx_class = cl; q->tx_borrowed = borrow; if (borrow != cl) { @@ -924,11 +864,11 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio) borrow->xstats.borrows++; cl->xstats.borrows++; #else - borrow->xstats.borrows += skb->len; - cl->xstats.borrows += skb->len; + borrow->xstats.borrows += qdisc_pkt_len(skb); + cl->xstats.borrows += qdisc_pkt_len(skb); #endif } - q->tx_len = skb->len; + q->tx_len = qdisc_pkt_len(skb); if (cl->deficit <= 0) { q->active[prio] = cl; @@ -940,7 +880,7 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio) skip_class: if (cl->q->q.qlen == 0 || prio != cl->cpriority) { /* Class is empty or penalized. 
- Unlink it from active chain. + * Unlink it from active chain. */ cl_prev->next_alive = cl->next_alive; cl->next_alive = NULL; @@ -979,14 +919,14 @@ next_class: return NULL; } -static __inline__ struct sk_buff * +static inline struct sk_buff * cbq_dequeue_1(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; - unsigned activemask; + unsigned int activemask; - activemask = q->activemask&0xFF; + activemask = q->activemask & 0xFF; while (activemask) { int prio = ffz(~activemask); activemask &= ~(1<<prio); @@ -1005,25 +945,28 @@ cbq_dequeue(struct Qdisc *sch) psched_time_t now; psched_tdiff_t incr; - PSCHED_GET_TIME(now); - incr = PSCHED_TDIFF(now, q->now_rt); + now = psched_get_time(); + incr = now - q->now_rt; if (q->tx_class) { psched_tdiff_t incr2; /* Time integrator. We calculate EOS time - by adding expected packet transmission time. - If real time is greater, we warp artificial clock, - so that: - - cbq_time = max(real_time, work); + * by adding expected packet transmission time. + * If real time is greater, we warp artificial clock, + * so that: + * + * cbq_time = max(real_time, work); */ incr2 = L2T(&q->link, q->tx_len); - PSCHED_TADD(q->now, incr2); + q->now += incr2; cbq_update(q); if ((incr -= incr2) < 0) incr = 0; + q->now += incr; + } else { + if (now > q->now) + q->now = now; } - PSCHED_TADD(q->now, incr); q->now_rt = now; for (;;) { @@ -1031,49 +974,47 @@ cbq_dequeue(struct Qdisc *sch) skb = cbq_dequeue_1(sch); if (skb) { + qdisc_bstats_update(sch, skb); sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(sch); return skb; } /* All the classes are overlimit. - - It is possible, if: - - 1. Scheduler is empty. - 2. Toplevel cutoff inhibited borrowing. - 3. Root class is overlimit. - - Reset 2d and 3d conditions and retry. - - Note, that NS and cbq-2.0 are buggy, peeking - an arbitrary class is appropriate for ancestor-only - sharing, but not for toplevel algorithm. - - Our version is better, but slower, because it requires - two passes, but it is unavoidable with top-level sharing. - */ + * + * It is possible, if: + * + * 1. Scheduler is empty. + * 2. Toplevel cutoff inhibited borrowing. + * 3. Root class is overlimit. + * + * Reset 2d and 3d conditions and retry. + * + * Note, that NS and cbq-2.0 are buggy, peeking + * an arbitrary class is appropriate for ancestor-only + * sharing, but not for toplevel algorithm. + * + * Our version is better, but slower, because it requires + * two passes, but it is unavoidable with top-level sharing. + */ if (q->toplevel == TC_CBQ_MAXLEVEL && - PSCHED_IS_PASTPERFECT(q->link.undertime)) + q->link.undertime == PSCHED_PASTPERFECT) break; q->toplevel = TC_CBQ_MAXLEVEL; - PSCHED_SET_PASTPERFECT(q->link.undertime); + q->link.undertime = PSCHED_PASTPERFECT; } /* No packets in scheduler or nobody wants to give them to us :-( - Sigh... start watchdog timer in the last case. */ + * Sigh... start watchdog timer in the last case. 
+ */ if (sch->q.qlen) { sch->qstats.overlimits++; - if (q->wd_expires) { - long delay = PSCHED_US2JIFFIE(q->wd_expires); - if (delay <= 0) - delay = 1; - mod_timer(&q->wd_timer, jiffies + delay); - sch->flags |= TCQ_F_THROTTLED; - } + if (q->wd_expires) + qdisc_watchdog_schedule(&q->watchdog, + now + q->wd_expires); } return NULL; } @@ -1089,36 +1030,39 @@ static void cbq_adjust_levels(struct cbq_class *this) int level = 0; struct cbq_class *cl; - if ((cl = this->children) != NULL) { + cl = this->children; + if (cl) { do { if (cl->level > level) level = cl->level; } while ((cl = cl->sibling) != this->children); } - this->level = level+1; + this->level = level + 1; } while ((this = this->tparent) != NULL); } static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) { struct cbq_class *cl; - unsigned h; + unsigned int h; if (q->quanta[prio] == 0) return; - for (h=0; h<16; h++) { - for (cl = q->classes[h]; cl; cl = cl->next) { + for (h = 0; h < q->clhash.hashsize; h++) { + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { /* BUGGGG... Beware! This expression suffer of - arithmetic overflows! + * arithmetic overflows! */ if (cl->priority == prio) { cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ q->quanta[prio]; } - if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) { - printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum); - cl->quantum = cl->qdisc->dev->mtu/2 + 1; + if (cl->quantum <= 0 || + cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) { + pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n", + cl->common.classid, cl->quantum); + cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1; } } } @@ -1128,29 +1072,30 @@ static void cbq_sync_defmap(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); struct cbq_class *split = cl->split; - unsigned h; + unsigned int h; int i; if (split == NULL) return; - for (i=0; i<=TC_PRIO_MAX; i++) { - if (split->defaults[i] == cl && !(cl->defmap&(1<<i))) + for (i = 0; i <= TC_PRIO_MAX; i++) { + if (split->defaults[i] == cl && !(cl->defmap & (1<<i))) split->defaults[i] = NULL; } - for (i=0; i<=TC_PRIO_MAX; i++) { + for (i = 0; i <= TC_PRIO_MAX; i++) { int level = split->level; if (split->defaults[i]) continue; - for (h=0; h<16; h++) { + for (h = 0; h < q->clhash.hashsize; h++) { struct cbq_class *c; - for (c = q->classes[h]; c; c = c->next) { + hlist_for_each_entry(c, &q->clhash.hash[h], + common.hnode) { if (c->split == split && c->level < level && - c->defmap&(1<<i)) { + c->defmap & (1<<i)) { split->defaults[i] = c; level = c->level; } @@ -1164,14 +1109,15 @@ static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 ma struct cbq_class *split = NULL; if (splitid == 0) { - if ((split = cl->split) == NULL) + split = cl->split; + if (!split) return; - splitid = split->classid; + splitid = split->common.classid; } - if (split == NULL || split->classid != splitid) { + if (split == NULL || split->common.classid != splitid) { for (split = cl->tparent; split; split = split->tparent) - if (split->classid == splitid) + if (split->common.classid == splitid) break; } @@ -1182,9 +1128,9 @@ static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 ma cl->defmap = 0; cbq_sync_defmap(cl); cl->split = split; - cl->defmap = def&mask; + cl->defmap = def & mask; } else - cl->defmap = (cl->defmap&~mask)|(def&mask); + cl->defmap = (cl->defmap & ~mask) | (def & mask); cbq_sync_defmap(cl); } @@ -1194,16 +1140,10 @@ static void 
cbq_unlink_class(struct cbq_class *this) struct cbq_class *cl, **clp; struct cbq_sched_data *q = qdisc_priv(this->qdisc); - for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) { - if (cl == this) { - *clp = cl->next; - cl->next = NULL; - break; - } - } + qdisc_class_hash_remove(&q->clhash, &this->common); if (this->tparent) { - clp=&this->sibling; + clp = &this->sibling; cl = *clp; do { if (cl == this) { @@ -1219,19 +1159,17 @@ static void cbq_unlink_class(struct cbq_class *this) this->tparent->children = NULL; } } else { - BUG_TRAP(this->sibling == this); + WARN_ON(this->sibling != this); } } static void cbq_link_class(struct cbq_class *this) { struct cbq_sched_data *q = qdisc_priv(this->qdisc); - unsigned h = cbq_hash(this->classid); struct cbq_class *parent = this->tparent; this->sibling = this; - this->next = q->classes[h]; - q->classes[h] = this; + qdisc_class_hash_insert(&q->clhash, &this->common); if (parent == NULL) return; @@ -1244,7 +1182,7 @@ static void cbq_link_class(struct cbq_class *this) } } -static unsigned int cbq_drop(struct Qdisc* sch) +static unsigned int cbq_drop(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl, *cl_head; @@ -1252,13 +1190,16 @@ static unsigned int cbq_drop(struct Qdisc* sch) unsigned int len; for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { - if ((cl_head = q->active[prio]) == NULL) + cl_head = q->active[prio]; + if (!cl_head) continue; cl = cl_head; do { if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) { sch->q.qlen--; + if (!cl->q->q.qlen) + cbq_deactivate_class(cl); return len; } } while ((cl = cl->next_alive) != cl_head); @@ -1267,32 +1208,32 @@ static unsigned int cbq_drop(struct Qdisc* sch) } static void -cbq_reset(struct Qdisc* sch) +cbq_reset(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl; int prio; - unsigned h; + unsigned int h; q->activemask = 0; q->pmask = 0; q->tx_class = NULL; q->tx_borrowed = NULL; - del_timer(&q->wd_timer); - del_timer(&q->delay_timer); + qdisc_watchdog_cancel(&q->watchdog); + hrtimer_cancel(&q->delay_timer); q->toplevel = TC_CBQ_MAXLEVEL; - PSCHED_GET_TIME(q->now); + q->now = psched_get_time(); q->now_rt = q->now; for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) q->active[prio] = NULL; - for (h = 0; h < 16; h++) { - for (cl = q->classes[h]; cl; cl = cl->next) { + for (h = 0; h < q->clhash.hashsize; h++) { + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { qdisc_reset(cl->q); cl->next_alive = NULL; - PSCHED_SET_PASTPERFECT(cl->undertime); + cl->undertime = PSCHED_PASTPERFECT; cl->avgidle = cl->maxidle; cl->deficit = cl->quantum; cl->cpriority = cl->priority; @@ -1304,21 +1245,21 @@ cbq_reset(struct Qdisc* sch) static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) { - if (lss->change&TCF_CBQ_LSS_FLAGS) { - cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; - cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; + if (lss->change & TCF_CBQ_LSS_FLAGS) { + cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; + cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? 
NULL : cl->tparent; } - if (lss->change&TCF_CBQ_LSS_EWMA) + if (lss->change & TCF_CBQ_LSS_EWMA) cl->ewma_log = lss->ewma_log; - if (lss->change&TCF_CBQ_LSS_AVPKT) + if (lss->change & TCF_CBQ_LSS_AVPKT) cl->avpkt = lss->avpkt; - if (lss->change&TCF_CBQ_LSS_MINIDLE) + if (lss->change & TCF_CBQ_LSS_MINIDLE) cl->minidle = -(long)lss->minidle; - if (lss->change&TCF_CBQ_LSS_MAXIDLE) { + if (lss->change & TCF_CBQ_LSS_MAXIDLE) { cl->maxidle = lss->maxidle; cl->avgidle = lss->maxidle; } - if (lss->change&TCF_CBQ_LSS_OFFTIME) + if (lss->change & TCF_CBQ_LSS_OFFTIME) cl->offtime = lss->offtime; return 0; } @@ -1346,10 +1287,10 @@ static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) if (wrr->weight) cl->weight = wrr->weight; if (wrr->priority) { - cl->priority = wrr->priority-1; + cl->priority = wrr->priority - 1; cl->cpriority = cl->priority; if (cl->priority >= cl->priority2) - cl->priority2 = TC_CBQ_MAXPRIO-1; + cl->priority2 = TC_CBQ_MAXPRIO - 1; } cbq_addprio(q, cl); @@ -1366,10 +1307,10 @@ static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) cl->overlimit = cbq_ovl_delay; break; case TC_CBQ_OVL_LOWPRIO: - if (ovl->priority2-1 >= TC_CBQ_MAXPRIO || - ovl->priority2-1 <= cl->priority) + if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO || + ovl->priority2 - 1 <= cl->priority) return -EINVAL; - cl->priority2 = ovl->priority2-1; + cl->priority2 = ovl->priority2 - 1; cl->overlimit = cbq_ovl_lowprio; break; case TC_CBQ_OVL_DROP: @@ -1381,11 +1322,11 @@ static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) default: return -EINVAL; } - cl->penalty = (ovl->penalty*HZ)/1000; + cl->penalty = ovl->penalty; return 0; } -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) { cl->police = p->police; @@ -1406,81 +1347,97 @@ static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) return 0; } -static int cbq_init(struct Qdisc *sch, struct rtattr *opt) +static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = { + [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) }, + [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) }, + [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) }, + [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) }, + [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) }, + [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) }, +}; + +static int cbq_init(struct Qdisc *sch, struct nlattr *opt) { struct cbq_sched_data *q = qdisc_priv(sch); - struct rtattr *tb[TCA_CBQ_MAX]; + struct nlattr *tb[TCA_CBQ_MAX + 1]; struct tc_ratespec *r; + int err; - if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 || - tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || - RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) - return -EINVAL; + err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy); + if (err < 0) + return err; - if (tb[TCA_CBQ_LSSOPT-1] && - RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) + if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL) return -EINVAL; - r = RTA_DATA(tb[TCA_CBQ_RATE-1]); + r = nla_data(tb[TCA_CBQ_RATE]); - if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL) + if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL) return -EINVAL; + err = qdisc_class_hash_init(&q->clhash); + if (err < 0) + goto put_rtab; + q->link.refcnt = 1; q->link.sibling = &q->link; - q->link.classid = 
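The new cbq_policy[] table lets nla_parse_nested() reject under-sized attributes declaratively, which is what allows the patch to drop the long series of open-coded RTA_PAYLOAD() length checks further down. The user-space sketch below mimics that idea with a minimum-length table indexed by attribute type; the attribute names and sizes are invented and carry no relation to the real netlink encoding.

#include <stdio.h>
#include <stddef.h>

enum { ATTR_UNSPEC, ATTR_RATE, ATTR_LSSOPT, ATTR_MAX = ATTR_LSSOPT };

struct attr { int type; size_t payload; };

/* Declarative policy: minimum payload size per attribute type. */
static const size_t policy_min_len[ATTR_MAX + 1] = {
	[ATTR_RATE]   = 12,	/* illustrative, roughly a tc_ratespec */
	[ATTR_LSSOPT] = 24,	/* illustrative, roughly a tc_cbq_lssopt */
};

static int parse(const struct attr *attrs, int n)
{
	for (int i = 0; i < n; i++) {
		int t = attrs[i].type;

		if (t < 0 || t > ATTR_MAX)
			continue;			/* unknown type: skip */
		if (attrs[i].payload < policy_min_len[t])
			return -1;			/* too short: reject */
	}
	return 0;
}

int main(void)
{
	struct attr ok[]  = { { ATTR_RATE, 12 }, { ATTR_LSSOPT, 24 } };
	struct attr bad[] = { { ATTR_RATE, 4 } };

	printf("ok=%d bad=%d\n", parse(ok, 2), parse(bad, 1));
	return 0;
}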
sch->handle; + q->link.common.classid = sch->handle; q->link.qdisc = sch; - if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + sch->handle); + if (!q->link.q) q->link.q = &noop_qdisc; - q->link.priority = TC_CBQ_MAXPRIO-1; - q->link.priority2 = TC_CBQ_MAXPRIO-1; - q->link.cpriority = TC_CBQ_MAXPRIO-1; + q->link.priority = TC_CBQ_MAXPRIO - 1; + q->link.priority2 = TC_CBQ_MAXPRIO - 1; + q->link.cpriority = TC_CBQ_MAXPRIO - 1; q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; q->link.overlimit = cbq_ovl_classic; - q->link.allot = psched_mtu(sch->dev); + q->link.allot = psched_mtu(qdisc_dev(sch)); q->link.quantum = q->link.allot; q->link.weight = q->link.R_tab->rate.rate; q->link.ewma_log = TC_CBQ_DEF_EWMA; q->link.avpkt = q->link.allot/2; q->link.minidle = -0x7FFFFFFF; - q->link.stats_lock = &sch->dev->queue_lock; - init_timer(&q->wd_timer); - q->wd_timer.data = (unsigned long)sch; - q->wd_timer.function = cbq_watchdog; - init_timer(&q->delay_timer); - q->delay_timer.data = (unsigned long)sch; + qdisc_watchdog_init(&q->watchdog, sch); + hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); q->delay_timer.function = cbq_undelay; q->toplevel = TC_CBQ_MAXLEVEL; - PSCHED_GET_TIME(q->now); + q->now = psched_get_time(); q->now_rt = q->now; cbq_link_class(&q->link); - if (tb[TCA_CBQ_LSSOPT-1]) - cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + if (tb[TCA_CBQ_LSSOPT]) + cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT])); cbq_addprio(q, &q->link); return 0; + +put_rtab: + qdisc_put_rtab(q->link.R_tab); + return err; } -static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); - RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); + if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate)) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_lssopt opt; opt.flags = 0; @@ -1495,83 +1452,89 @@ static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) opt.minidle = (u32)(-cl->minidle); opt.offtime = cl->offtime; opt.change = ~0; - RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_wrropt opt; + memset(&opt, 0, sizeof(opt)); opt.flags = 0; opt.allot = cl->allot; - opt.priority = cl->priority+1; - opt.cpriority = cl->cpriority+1; + opt.priority = cl->priority + 1; + opt.cpriority = cl->cpriority + 1; opt.weight = cl->weight; - RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + 
nlmsg_trim(skb, b); return -1; } -static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_ovl opt; opt.strategy = cl->ovl_strategy; - opt.priority2 = cl->priority2+1; + opt.priority2 = cl->priority2 + 1; opt.pad = 0; - opt.penalty = (cl->penalty*1000)/HZ; - RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); + opt.penalty = cl->penalty; + if (nla_put(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_fopt opt; if (cl->split || cl->defmap) { - opt.split = cl->split ? cl->split->classid : 0; + opt.split = cl->split ? cl->split->common.classid : 0; opt.defmap = cl->defmap; opt.defchange = ~0; - RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt)) + goto nla_put_failure; } return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } -#ifdef CONFIG_NET_CLS_POLICE -static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) +#ifdef CONFIG_NET_CLS_ACT +static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) { - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_police opt; if (cl->police) { opt.police = cl->police; opt.__res1 = 0; opt.__res2 = 0; - RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt)) + goto nla_put_failure; } return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } #endif @@ -1582,7 +1545,7 @@ static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) cbq_dump_rate(skb, cl) < 0 || cbq_dump_wrr(skb, cl) < 0 || cbq_dump_ovl(skb, cl) < 0 || -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT cbq_dump_police(skb, cl) < 0 || #endif cbq_dump_fopt(skb, cl) < 0) @@ -1593,18 +1556,17 @@ static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct cbq_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb->tail; - struct rtattr *rta; + struct nlattr *nest; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; if (cbq_dump_attr(skb, &q->link) < 0) - goto rtattr_failure; - rta->rta_len = skb->tail - b; - return skb->len; + goto nla_put_failure; + return nla_nest_end(skb, nest); -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } @@ -1621,26 +1583,25 @@ static int cbq_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { - struct cbq_class *cl = (struct cbq_class*)arg; - unsigned char *b = skb->tail; - struct rtattr *rta; + struct cbq_class *cl = (struct cbq_class *)arg; + struct nlattr *nest; if (cl->tparent) - tcm->tcm_parent = cl->tparent->classid; + tcm->tcm_parent = cl->tparent->common.classid; else tcm->tcm_parent = TC_H_ROOT; - tcm->tcm_handle = cl->classid; + 
tcm->tcm_handle = cl->common.classid; tcm->tcm_info = cl->q->handle; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; if (cbq_dump_attr(skb, cl) < 0) - goto rtattr_failure; - rta->rta_len = skb->tail - b; - return skb->len; + goto nla_put_failure; + return nla_nest_end(skb, nest); -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } @@ -1649,19 +1610,17 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; cl->qstats.qlen = cl->q->q.qlen; cl->xstats.avgidle = cl->avgidle; cl->xstats.undertime = 0; - if (!PSCHED_IS_PASTPERFECT(cl->undertime)) - cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now); + if (cl->undertime != PSCHED_PASTPERFECT) + cl->xstats.undertime = cl->undertime - q->now; if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || -#ifdef CONFIG_NET_ESTIMATOR - gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || -#endif + gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, &cl->qstats) < 0) return -1; @@ -1671,36 +1630,42 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old) { - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; - if (cl) { - if (new == NULL) { - if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL) - return -ENOBUFS; - } else { -#ifdef CONFIG_NET_CLS_POLICE - if (cl->police == TC_POLICE_RECLASSIFY) - new->reshape_fail = cbq_reshape_fail; + if (new == NULL) { + new = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, cl->common.classid); + if (new == NULL) + return -ENOBUFS; + } else { +#ifdef CONFIG_NET_CLS_ACT + if (cl->police == TC_POLICE_RECLASSIFY) + new->reshape_fail = cbq_reshape_fail; #endif - } - sch_tree_lock(sch); - *old = cl->q; - cl->q = new; - sch->q.qlen -= (*old)->q.qlen; - qdisc_reset(*old); - sch_tree_unlock(sch); - - return 0; } - return -ENOENT; + sch_tree_lock(sch); + *old = cl->q; + cl->q = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class *)arg; + + return cl->q; } -static struct Qdisc * -cbq_leaf(struct Qdisc *sch, unsigned long arg) +static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg) { - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; - return cl ? 
cl->q : NULL; + if (cl->q->q.qlen == 0) + cbq_deactivate_class(cl); } static unsigned long cbq_get(struct Qdisc *sch, u32 classid) @@ -1715,40 +1680,28 @@ static unsigned long cbq_get(struct Qdisc *sch, u32 classid) return 0; } -static void cbq_destroy_filters(struct cbq_class *cl) -{ - struct tcf_proto *tp; - - while ((tp = cl->filter_list) != NULL) { - cl->filter_list = tp->next; - tcf_destroy(tp); - } -} - static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(sch); - BUG_TRAP(!cl->filters); + WARN_ON(cl->filters); - cbq_destroy_filters(cl); + tcf_destroy_chain(&cl->filter_list); qdisc_destroy(cl->q); qdisc_put_rtab(cl->R_tab); -#ifdef CONFIG_NET_ESTIMATOR gen_kill_estimator(&cl->bstats, &cl->rate_est); -#endif if (cl != &q->link) kfree(cl); } -static void -cbq_destroy(struct Qdisc* sch) +static void cbq_destroy(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); + struct hlist_node *next; struct cbq_class *cl; - unsigned h; + unsigned int h; -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT q->rx_class = NULL; #endif /* @@ -1756,32 +1709,31 @@ cbq_destroy(struct Qdisc* sch) * classes from root to leafs which means that filters can still * be bound to classes which have been destroyed already. --TGR '04 */ - for (h = 0; h < 16; h++) - for (cl = q->classes[h]; cl; cl = cl->next) - cbq_destroy_filters(cl); - - for (h = 0; h < 16; h++) { - struct cbq_class *next; - - for (cl = q->classes[h]; cl; cl = next) { - next = cl->next; + for (h = 0; h < q->clhash.hashsize; h++) { + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) + tcf_destroy_chain(&cl->filter_list); + } + for (h = 0; h < q->clhash.hashsize; h++) { + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h], + common.hnode) cbq_destroy_class(sch, cl); - } } + qdisc_class_hash_destroy(&q->clhash); } static void cbq_put(struct Qdisc *sch, unsigned long arg) { - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; if (--cl->refcnt == 0) { -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT + spinlock_t *root_lock = qdisc_root_sleeping_lock(sch); struct cbq_sched_data *q = qdisc_priv(sch); - spin_lock_bh(&sch->dev->queue_lock); + spin_lock_bh(root_lock); if (q->rx_class == cl) q->rx_class = NULL; - spin_unlock_bh(&sch->dev->queue_lock); + spin_unlock_bh(root_lock); #endif cbq_destroy_class(sch, cl); @@ -1789,61 +1741,51 @@ static void cbq_put(struct Qdisc *sch, unsigned long arg) } static int -cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca, +cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg) { int err; struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class*)*arg; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_CBQ_MAX]; + struct cbq_class *cl = (struct cbq_class *)*arg; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_CBQ_MAX + 1]; struct cbq_class *parent; struct qdisc_rate_table *rtab = NULL; - if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt)) - return -EINVAL; - - if (tb[TCA_CBQ_OVL_STRATEGY-1] && - RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl)) + if (opt == NULL) return -EINVAL; - if (tb[TCA_CBQ_FOPT-1] && - RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt)) - return -EINVAL; - - if (tb[TCA_CBQ_RATE-1] && - RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) - return -EINVAL; - - if 
(tb[TCA_CBQ_LSSOPT-1] && - RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) - return -EINVAL; - - if (tb[TCA_CBQ_WRROPT-1] && - RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt)) - return -EINVAL; - -#ifdef CONFIG_NET_CLS_POLICE - if (tb[TCA_CBQ_POLICE-1] && - RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police)) - return -EINVAL; -#endif + err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy); + if (err < 0) + return err; if (cl) { /* Check parent */ if (parentid) { - if (cl->tparent && cl->tparent->classid != parentid) + if (cl->tparent && + cl->tparent->common.classid != parentid) return -EINVAL; if (!cl->tparent && parentid != TC_H_ROOT) return -EINVAL; } - if (tb[TCA_CBQ_RATE-1]) { - rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + if (tb[TCA_CBQ_RATE]) { + rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), + tb[TCA_CBQ_RTAB]); if (rtab == NULL) return -EINVAL; } + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) { + qdisc_put_rtab(rtab); + return err; + } + } + /* Change class parameters */ sch_tree_lock(sch); @@ -1851,62 +1793,58 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **t cbq_deactivate_class(cl); if (rtab) { - rtab = xchg(&cl->R_tab, rtab); - qdisc_put_rtab(rtab); + qdisc_put_rtab(cl->R_tab); + cl->R_tab = rtab; } - if (tb[TCA_CBQ_LSSOPT-1]) - cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + if (tb[TCA_CBQ_LSSOPT]) + cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); - if (tb[TCA_CBQ_WRROPT-1]) { + if (tb[TCA_CBQ_WRROPT]) { cbq_rmprio(q, cl); - cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); + cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); } - if (tb[TCA_CBQ_OVL_STRATEGY-1]) - cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); + if (tb[TCA_CBQ_OVL_STRATEGY]) + cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY])); -#ifdef CONFIG_NET_CLS_POLICE - if (tb[TCA_CBQ_POLICE-1]) - cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); +#ifdef CONFIG_NET_CLS_ACT + if (tb[TCA_CBQ_POLICE]) + cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); #endif - if (tb[TCA_CBQ_FOPT-1]) - cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + if (tb[TCA_CBQ_FOPT]) + cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); if (cl->q->q.qlen) cbq_activate_class(cl); sch_tree_unlock(sch); -#ifdef CONFIG_NET_ESTIMATOR - if (tca[TCA_RATE-1]) - gen_replace_estimator(&cl->bstats, &cl->rate_est, - cl->stats_lock, tca[TCA_RATE-1]); -#endif return 0; } if (parentid == TC_H_ROOT) return -EINVAL; - if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || - tb[TCA_CBQ_LSSOPT-1] == NULL) + if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL || + tb[TCA_CBQ_LSSOPT] == NULL) return -EINVAL; - rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]); if (rtab == NULL) return -EINVAL; if (classid) { err = -EINVAL; - if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid)) + if (TC_H_MAJ(classid ^ sch->handle) || + cbq_class_lookup(q, classid)) goto failure; } else { int i; - classid = TC_H_MAKE(sch->handle,0x8000); + classid = TC_H_MAKE(sch->handle, 0x8000); - for (i=0; i<0x8000; i++) { + for (i = 0; i < 0x8000; i++) { if (++q->hgenerator >= 0x8000) q->hgenerator = 1; if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) @@ -1927,22 +1865,32 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct 
rtattr **t } err = -ENOBUFS; - cl = kmalloc(sizeof(*cl), GFP_KERNEL); + cl = kzalloc(sizeof(*cl), GFP_KERNEL); if (cl == NULL) goto failure; - memset(cl, 0, sizeof(*cl)); + + if (tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) { + kfree(cl); + goto failure; + } + } + cl->R_tab = rtab; rtab = NULL; cl->refcnt = 1; - if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); + if (!cl->q) cl->q = &noop_qdisc; - cl->classid = classid; + cl->common.classid = classid; cl->tparent = parent; cl->qdisc = sch; cl->allot = parent->allot; cl->quantum = cl->allot; cl->weight = cl->R_tab->rate.rate; - cl->stats_lock = &sch->dev->queue_lock; sch_tree_lock(sch); cbq_link_class(cl); @@ -1951,30 +1899,26 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **t cl->share = cl->tparent; cbq_adjust_levels(parent); cl->minidle = -0x7FFFFFFF; - cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); - cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); - if (cl->ewma_log==0) + cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); + cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); + if (cl->ewma_log == 0) cl->ewma_log = q->link.ewma_log; - if (cl->maxidle==0) + if (cl->maxidle == 0) cl->maxidle = q->link.maxidle; - if (cl->avpkt==0) + if (cl->avpkt == 0) cl->avpkt = q->link.avpkt; cl->overlimit = cbq_ovl_classic; - if (tb[TCA_CBQ_OVL_STRATEGY-1]) - cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); -#ifdef CONFIG_NET_CLS_POLICE - if (tb[TCA_CBQ_POLICE-1]) - cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); + if (tb[TCA_CBQ_OVL_STRATEGY]) + cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY])); +#ifdef CONFIG_NET_CLS_ACT + if (tb[TCA_CBQ_POLICE]) + cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); #endif - if (tb[TCA_CBQ_FOPT-1]) - cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + if (tb[TCA_CBQ_FOPT]) + cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); sch_tree_unlock(sch); -#ifdef CONFIG_NET_ESTIMATOR - if (tca[TCA_RATE-1]) - gen_new_estimator(&cl->bstats, &cl->rate_est, - cl->stats_lock, tca[TCA_RATE-1]); -#endif + qdisc_class_hash_grow(sch, &q->clhash); *arg = (unsigned long)cl; return 0; @@ -1987,13 +1931,18 @@ failure: static int cbq_delete(struct Qdisc *sch, unsigned long arg) { struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; + unsigned int qlen; if (cl->filters || cl->children || cl == &q->link) return -EBUSY; sch_tree_lock(sch); + qlen = cl->q->q.qlen; + qdisc_reset(cl->q); + qdisc_tree_decrease_qlen(cl->q, qlen); + if (cl->next_alive) cbq_deactivate_class(cl); @@ -2003,7 +1952,7 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) q->tx_class = NULL; q->tx_borrowed = NULL; } -#ifdef CONFIG_NET_CLS_POLICE +#ifdef CONFIG_NET_CLS_ACT if (q->rx_class == cl) q->rx_class = NULL; #endif @@ -2016,8 +1965,11 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) cbq_rmprio(q, cl); sch_tree_unlock(sch); - if (--cl->refcnt == 0) - cbq_destroy_class(sch, cl); + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). 
+ */ return 0; } @@ -2037,7 +1989,7 @@ static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, u32 classid) { struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *p = (struct cbq_class*)parent; + struct cbq_class *p = (struct cbq_class *)parent; struct cbq_class *cl = cbq_class_lookup(q, classid); if (cl) { @@ -2051,7 +2003,7 @@ static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) { - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; cl->filters--; } @@ -2059,15 +2011,14 @@ static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct cbq_sched_data *q = qdisc_priv(sch); - unsigned h; + struct cbq_class *cl; + unsigned int h; if (arg->stop) return; - for (h = 0; h < 16; h++) { - struct cbq_class *cl; - - for (cl = q->classes[h]; cl; cl = cl->next) { + for (h = 0; h < q->clhash.hashsize; h++) { + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; @@ -2081,9 +2032,10 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct Qdisc_class_ops cbq_class_ops = { +static const struct Qdisc_class_ops cbq_class_ops = { .graft = cbq_graft, .leaf = cbq_leaf, + .qlen_notify = cbq_qlen_notify, .get = cbq_get, .put = cbq_put, .change = cbq_change_class, @@ -2096,14 +2048,14 @@ static struct Qdisc_class_ops cbq_class_ops = { .dump_stats = cbq_dump_class_stats, }; -static struct Qdisc_ops cbq_qdisc_ops = { +static struct Qdisc_ops cbq_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &cbq_class_ops, .id = "cbq", .priv_size = sizeof(struct cbq_sched_data), .enqueue = cbq_enqueue, .dequeue = cbq_dequeue, - .requeue = cbq_requeue, + .peek = qdisc_peek_dequeued, .drop = cbq_drop, .init = cbq_init, .reset = cbq_reset, @@ -2118,7 +2070,7 @@ static int __init cbq_module_init(void) { return register_qdisc(&cbq_qdisc_ops); } -static void __exit cbq_module_exit(void) +static void __exit cbq_module_exit(void) { unregister_qdisc(&cbq_qdisc_ops); } diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c new file mode 100644 index 00000000000..ed30e436128 --- /dev/null +++ b/net/sched/sch_choke.c @@ -0,0 +1,632 @@ +/* + * net/sched/sch_choke.c CHOKE scheduler + * + * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com> + * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <net/pkt_sched.h> +#include <net/inet_ecn.h> +#include <net/red.h> +#include <net/flow_keys.h> + +/* + CHOKe stateless AQM for fair bandwidth allocation + ================================================= + + CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for + unresponsive flows) is a variant of RED that penalizes misbehaving flows but + maintains no flow state. The difference from RED is an additional step + during the enqueuing process. If average queue size is over the + low threshold (qmin), a packet is chosen at random from the queue. + If both the new and chosen packet are from the same flow, both + are dropped. 
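The description above reduces CHOKe to one extra step at enqueue time: once the RED average queue crosses qth_min, draw one queued packet at random and, if it belongs to the same flow as the arrival, drop both. The toy simulation below exercises only that matching step, with flows reduced to integers and the queue to a fixed array; nothing here is the kernel data path. Because flow 1 holds most of the slots it is the most likely to be hit, which is how CHOKe penalizes dominant flows without keeping per-flow state.

#include <stdio.h>
#include <stdlib.h>

#define QLEN 8

/* Toy replay of the CHOKe matching step: compare each arrival against one
 * randomly chosen queued packet and drop both on a flow match. */
int main(void)
{
	int queue[QLEN] = { 1, 2, 1, 3, 1, 2, 4, 1 };	/* queued flow ids */
	int arrivals[]  = { 1, 3, 5 };			/* flows of new packets */
	unsigned int i;

	srand(42);
	for (i = 0; i < sizeof(arrivals) / sizeof(arrivals[0]); i++) {
		int victim = rand() % QLEN;

		if (queue[victim] == arrivals[i])
			printf("flow %d: matched slot %d -> drop both\n",
			       arrivals[i], victim);
		else
			printf("flow %d: no match (slot %d is flow %d) -> normal RED path\n",
			       arrivals[i], victim, queue[victim]);
	}
	return 0;
}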
Unlike RED, CHOKe is not really a "classful" qdisc because it + needs to access packets in queue randomly. It has a minimal class + interface to allow overriding the builtin flow classifier with + filters. + + Source: + R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless + Active Queue Management Scheme for Approximating Fair Bandwidth Allocation", + IEEE INFOCOM, 2000. + + A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial + Characteristics", IEEE/ACM Transactions on Networking, 2004 + + */ + +/* Upper bound on size of sk_buff table (packets) */ +#define CHOKE_MAX_QUEUE (128*1024 - 1) + +struct choke_sched_data { +/* Parameters */ + u32 limit; + unsigned char flags; + + struct red_parms parms; + +/* Variables */ + struct red_vars vars; + struct tcf_proto *filter_list; + struct { + u32 prob_drop; /* Early probability drops */ + u32 prob_mark; /* Early probability marks */ + u32 forced_drop; /* Forced drops, qavg > max_thresh */ + u32 forced_mark; /* Forced marks, qavg > max_thresh */ + u32 pdrop; /* Drops due to queue limits */ + u32 other; /* Drops due to drop() calls */ + u32 matched; /* Drops to flow match */ + } stats; + + unsigned int head; + unsigned int tail; + + unsigned int tab_mask; /* size - 1 */ + + struct sk_buff **tab; +}; + +/* number of elements in queue including holes */ +static unsigned int choke_len(const struct choke_sched_data *q) +{ + return (q->tail - q->head) & q->tab_mask; +} + +/* Is ECN parameter configured */ +static int use_ecn(const struct choke_sched_data *q) +{ + return q->flags & TC_RED_ECN; +} + +/* Should packets over max just be dropped (versus marked) */ +static int use_harddrop(const struct choke_sched_data *q) +{ + return q->flags & TC_RED_HARDDROP; +} + +/* Move head pointer forward to skip over holes */ +static void choke_zap_head_holes(struct choke_sched_data *q) +{ + do { + q->head = (q->head + 1) & q->tab_mask; + if (q->head == q->tail) + break; + } while (q->tab[q->head] == NULL); +} + +/* Move tail pointer backwards to reuse holes */ +static void choke_zap_tail_holes(struct choke_sched_data *q) +{ + do { + q->tail = (q->tail - 1) & q->tab_mask; + if (q->head == q->tail) + break; + } while (q->tab[q->tail] == NULL); +} + +/* Drop packet from queue array by creating a "hole" */ +static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = q->tab[idx]; + + q->tab[idx] = NULL; + + if (idx == q->head) + choke_zap_head_holes(q); + if (idx == q->tail) + choke_zap_tail_holes(q); + + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_drop(skb, sch); + qdisc_tree_decrease_qlen(sch, 1); + --sch->q.qlen; +} + +struct choke_skb_cb { + u16 classid; + u8 keys_valid; + struct flow_keys keys; +}; + +static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb)); + return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static inline void choke_set_classid(struct sk_buff *skb, u16 classid) +{ + choke_skb_cb(skb)->classid = classid; +} + +static u16 choke_get_classid(const struct sk_buff *skb) +{ + return choke_skb_cb(skb)->classid; +} + +/* + * Compare flow of two packets + * Returns true only if source and destination address and port match. 
+ * false for special cases + */ +static bool choke_match_flow(struct sk_buff *skb1, + struct sk_buff *skb2) +{ + if (skb1->protocol != skb2->protocol) + return false; + + if (!choke_skb_cb(skb1)->keys_valid) { + choke_skb_cb(skb1)->keys_valid = 1; + skb_flow_dissect(skb1, &choke_skb_cb(skb1)->keys); + } + + if (!choke_skb_cb(skb2)->keys_valid) { + choke_skb_cb(skb2)->keys_valid = 1; + skb_flow_dissect(skb2, &choke_skb_cb(skb2)->keys); + } + + return !memcmp(&choke_skb_cb(skb1)->keys, + &choke_skb_cb(skb2)->keys, + sizeof(struct flow_keys)); +} + +/* + * Classify flow using either: + * 1. pre-existing classification result in skb + * 2. fast internal classification + * 3. use TC filter based classification + */ +static bool choke_classify(struct sk_buff *skb, + struct Qdisc *sch, int *qerr) + +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct tcf_result res; + int result; + + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return false; + } +#endif + choke_set_classid(skb, TC_H_MIN(res.classid)); + return true; + } + + return false; +} + +/* + * Select a packet at random from queue + * HACK: since queue can have holes from previous deletion; retry several + * times to find a random skb but then just give up and return the head + * Will return NULL if queue is empty (q->head == q->tail) + */ +static struct sk_buff *choke_peek_random(const struct choke_sched_data *q, + unsigned int *pidx) +{ + struct sk_buff *skb; + int retrys = 3; + + do { + *pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask; + skb = q->tab[*pidx]; + if (skb) + return skb; + } while (--retrys > 0); + + return q->tab[*pidx = q->head]; +} + +/* + * Compare new packet with random packet in queue + * returns true if matched and sets *pidx + */ +static bool choke_match_random(const struct choke_sched_data *q, + struct sk_buff *nskb, + unsigned int *pidx) +{ + struct sk_buff *oskb; + + if (q->head == q->tail) + return false; + + oskb = choke_peek_random(q, pidx); + if (q->filter_list) + return choke_get_classid(nskb) == choke_get_classid(oskb); + + return choke_match_flow(oskb, nskb); +} + +static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + const struct red_parms *p = &q->parms; + int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + + if (q->filter_list) { + /* If using external classifiers, get result and record it. */ + if (!choke_classify(skb, sch, &ret)) + goto other_drop; /* Packet was eaten by filter */ + } + + choke_skb_cb(skb)->keys_valid = 0; + /* Compute average queue usage (see RED) */ + q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen); + if (red_is_idling(&q->vars)) + red_end_of_idle_period(&q->vars); + + /* Is queue small? 
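choke_drop_by_idx() above leaves NULL holes in the packet table, and choke_peek_random() copes with them by retrying a few random probes before giving up and returning the head slot (which choke_zap_head_holes() keeps pointing at a real packet). The stand-alone sketch below reproduces just that probe-with-fallback logic over a small holey array; the 3-probe limit mirrors the hunk, everything else is illustrative.

#include <stdio.h>
#include <stdlib.h>

#define MASK 7					/* table of 8 slots, power of two */

/* Probe a few random slots of a holey ring; fall back to the head slot,
 * which the real code keeps hole-free via choke_zap_head_holes(). */
static const char *tab[MASK + 1] = {
	"pkt0", NULL, "pkt2", NULL, "pkt4", NULL, NULL, "pkt7"
};

static const char *peek_random(unsigned int head, unsigned int len,
			       unsigned int *pidx)
{
	int retries = 3;

	do {
		*pidx = (head + (unsigned int)rand() % len) & MASK;
		if (tab[*pidx])
			return tab[*pidx];
	} while (--retries > 0);

	*pidx = head;
	return tab[head];
}

int main(void)
{
	unsigned int idx;
	const char *pkt;

	srand(1);
	pkt = peek_random(0, 8, &idx);		/* head = 0, 8 slots in use */
	printf("picked slot %u: %s\n", idx, pkt ? pkt : "(hole)");
	return 0;
}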
*/ + if (q->vars.qavg <= p->qth_min) + q->vars.qcount = -1; + else { + unsigned int idx; + + /* Draw a packet at random from queue and compare flow */ + if (choke_match_random(q, skb, &idx)) { + q->stats.matched++; + choke_drop_by_idx(sch, idx); + goto congestion_drop; + } + + /* Queue is large, always mark/drop */ + if (q->vars.qavg > p->qth_max) { + q->vars.qcount = -1; + + sch->qstats.overlimits++; + if (use_harddrop(q) || !use_ecn(q) || + !INET_ECN_set_ce(skb)) { + q->stats.forced_drop++; + goto congestion_drop; + } + + q->stats.forced_mark++; + } else if (++q->vars.qcount) { + if (red_mark_probability(p, &q->vars, q->vars.qavg)) { + q->vars.qcount = 0; + q->vars.qR = red_random(p); + + sch->qstats.overlimits++; + if (!use_ecn(q) || !INET_ECN_set_ce(skb)) { + q->stats.prob_drop++; + goto congestion_drop; + } + + q->stats.prob_mark++; + } + } else + q->vars.qR = red_random(p); + } + + /* Admit new packet */ + if (sch->q.qlen < q->limit) { + q->tab[q->tail] = skb; + q->tail = (q->tail + 1) & q->tab_mask; + ++sch->q.qlen; + sch->qstats.backlog += qdisc_pkt_len(skb); + return NET_XMIT_SUCCESS; + } + + q->stats.pdrop++; + return qdisc_drop(skb, sch); + +congestion_drop: + qdisc_drop(skb, sch); + return NET_XMIT_CN; + +other_drop: + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; +} + +static struct sk_buff *choke_dequeue(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + if (q->head == q->tail) { + if (!red_is_idling(&q->vars)) + red_start_of_idle_period(&q->vars); + return NULL; + } + + skb = q->tab[q->head]; + q->tab[q->head] = NULL; + choke_zap_head_holes(q); + --sch->q.qlen; + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_bstats_update(sch, skb); + + return skb; +} + +static unsigned int choke_drop(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + unsigned int len; + + len = qdisc_queue_drop(sch); + if (len > 0) + q->stats.other++; + else { + if (!red_is_idling(&q->vars)) + red_start_of_idle_period(&q->vars); + } + + return len; +} + +static void choke_reset(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + + red_restart(&q->vars); +} + +static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = { + [TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) }, + [TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE }, + [TCA_CHOKE_MAX_P] = { .type = NLA_U32 }, +}; + + +static void choke_free(void *addr) +{ + kvfree(addr); +} + +static int choke_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_CHOKE_MAX + 1]; + const struct tc_red_qopt *ctl; + int err; + struct sk_buff **old = NULL; + unsigned int mask; + u32 max_P; + + if (opt == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy); + if (err < 0) + return err; + + if (tb[TCA_CHOKE_PARMS] == NULL || + tb[TCA_CHOKE_STAB] == NULL) + return -EINVAL; + + max_P = tb[TCA_CHOKE_MAX_P] ? 
nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0; + + ctl = nla_data(tb[TCA_CHOKE_PARMS]); + + if (ctl->limit > CHOKE_MAX_QUEUE) + return -EINVAL; + + mask = roundup_pow_of_two(ctl->limit + 1) - 1; + if (mask != q->tab_mask) { + struct sk_buff **ntab; + + ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), + GFP_KERNEL | __GFP_NOWARN); + if (!ntab) + ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *)); + if (!ntab) + return -ENOMEM; + + sch_tree_lock(sch); + old = q->tab; + if (old) { + unsigned int oqlen = sch->q.qlen, tail = 0; + + while (q->head != q->tail) { + struct sk_buff *skb = q->tab[q->head]; + + q->head = (q->head + 1) & q->tab_mask; + if (!skb) + continue; + if (tail < mask) { + ntab[tail++] = skb; + continue; + } + sch->qstats.backlog -= qdisc_pkt_len(skb); + --sch->q.qlen; + qdisc_drop(skb, sch); + } + qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen); + q->head = 0; + q->tail = tail; + } + + q->tab_mask = mask; + q->tab = ntab; + } else + sch_tree_lock(sch); + + q->flags = ctl->flags; + q->limit = ctl->limit; + + red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, + ctl->Plog, ctl->Scell_log, + nla_data(tb[TCA_CHOKE_STAB]), + max_P); + red_set_vars(&q->vars); + + if (q->head == q->tail) + red_end_of_idle_period(&q->vars); + + sch_tree_unlock(sch); + choke_free(old); + return 0; +} + +static int choke_init(struct Qdisc *sch, struct nlattr *opt) +{ + return choke_change(sch, opt); +} + +static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct nlattr *opts = NULL; + struct tc_red_qopt opt = { + .limit = q->limit, + .flags = q->flags, + .qth_min = q->parms.qth_min >> q->parms.Wlog, + .qth_max = q->parms.qth_max >> q->parms.Wlog, + .Wlog = q->parms.Wlog, + .Plog = q->parms.Plog, + .Scell_log = q->parms.Scell_log, + }; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) || + nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P)) + goto nla_put_failure; + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct tc_choke_xstats st = { + .early = q->stats.prob_drop + q->stats.forced_drop, + .marked = q->stats.prob_mark + q->stats.forced_mark, + .pdrop = q->stats.pdrop, + .other = q->stats.other, + .matched = q->stats.matched, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static void choke_destroy(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + choke_free(q->tab); +} + +static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long choke_get(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static void choke_put(struct Qdisc *q, unsigned long cl) +{ +} + +static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return 0; +} + +static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct choke_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static int choke_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + if (!arg->stop) { + if (arg->fn(sch, 1, arg) 
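When the queue limit changes, choke_change() above rebuilds the packet table: it walks the old ring from head to tail, skips the holes, keeps at most mask packets in the new table (one slot stays free so head == tail still means empty) and drops whatever does not fit. The stand-alone sketch below replays that compaction with strings standing in for sk_buffs; all sizes are invented.

#include <stdio.h>

int main(void)
{
	/* old ring of 8 slots with holes; 5 real "packets" */
	const char *old_tab[8] = { "a", NULL, "b", NULL, "c", "d", NULL, "e" };
	unsigned int old_mask = 7, head = 0;

	/* new table of 4 slots: at most new_mask = 3 packets are kept so a
	 * free slot remains to tell "empty" from "full" */
	const char *new_tab[4] = { NULL };
	unsigned int new_mask = 3, new_tail = 0, dropped = 0, n;

	for (n = 0; n <= old_mask; n++) {
		const char *skb = old_tab[(head + n) & old_mask];

		if (!skb)
			continue;		/* skip hole */
		if (new_tail < new_mask)
			new_tab[new_tail++] = skb;
		else
			dropped++;		/* no room in the new ring */
	}

	for (n = 0; n < new_tail; n++)
		printf("new_tab[%u] = %s\n", n, new_tab[n]);
	printf("kept %u, dropped %u\n", new_tail, dropped);
	return 0;
}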
< 0) { + arg->stop = 1; + return; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops choke_class_ops = { + .leaf = choke_leaf, + .get = choke_get, + .put = choke_put, + .tcf_chain = choke_find_tcf, + .bind_tcf = choke_bind, + .unbind_tcf = choke_put, + .dump = choke_dump_class, + .walk = choke_walk, +}; + +static struct sk_buff *choke_peek_head(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + + return (q->head != q->tail) ? q->tab[q->head] : NULL; +} + +static struct Qdisc_ops choke_qdisc_ops __read_mostly = { + .id = "choke", + .priv_size = sizeof(struct choke_sched_data), + + .enqueue = choke_enqueue, + .dequeue = choke_dequeue, + .peek = choke_peek_head, + .drop = choke_drop, + .init = choke_init, + .destroy = choke_destroy, + .reset = choke_reset, + .change = choke_change, + .dump = choke_dump, + .dump_stats = choke_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init choke_module_init(void) +{ + return register_qdisc(&choke_qdisc_ops); +} + +static void __exit choke_module_exit(void) +{ + unregister_qdisc(&choke_qdisc_ops); +} + +module_init(choke_module_init) +module_exit(choke_module_exit) + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c new file mode 100644 index 00000000000..2f9ab17db85 --- /dev/null +++ b/net/sched/sch_codel.c @@ -0,0 +1,276 @@ +/* + * Codel - The Controlled-Delay Active Queue Management algorithm + * + * Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com> + * Copyright (C) 2011-2012 Van Jacobson <van@pollere.net> + * + * Implemented on linux by : + * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net> + * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ * + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/prefetch.h> +#include <net/pkt_sched.h> +#include <net/codel.h> + + +#define DEFAULT_CODEL_LIMIT 1000 + +struct codel_sched_data { + struct codel_params params; + struct codel_vars vars; + struct codel_stats stats; + u32 drop_overlimit; +}; + +/* This is the specific function called from codel_dequeue() + * to dequeue a packet from queue. Note: backlog is handled in + * codel, we dont need to reduce it here. + */ +static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) +{ + struct sk_buff *skb = __skb_dequeue(&sch->q); + + prefetch(&skb->end); /* we'll need skb_shinfo() */ + return skb; +} + +static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue); + + /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, + * or HTB crashes. Defer it for next round. + */ + if (q->stats.drop_count && sch->q.qlen) { + qdisc_tree_decrease_qlen(sch, q->stats.drop_count); + q->stats.drop_count = 0; + } + if (skb) + qdisc_bstats_update(sch, skb); + return skb; +} + +static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct codel_sched_data *q; + + if (likely(qdisc_qlen(sch) < sch->limit)) { + codel_set_enqueue_time(skb); + return qdisc_enqueue_tail(skb, sch); + } + q = qdisc_priv(sch); + q->drop_overlimit++; + return qdisc_drop(skb, sch); +} + +static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { + [TCA_CODEL_TARGET] = { .type = NLA_U32 }, + [TCA_CODEL_LIMIT] = { .type = NLA_U32 }, + [TCA_CODEL_INTERVAL] = { .type = NLA_U32 }, + [TCA_CODEL_ECN] = { .type = NLA_U32 }, +}; + +static int codel_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_CODEL_MAX + 1]; + unsigned int qlen; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + + if (tb[TCA_CODEL_TARGET]) { + u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]); + + q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_CODEL_INTERVAL]) { + u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]); + + q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_CODEL_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]); + + if (tb[TCA_CODEL_ECN]) + q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]); + + qlen = sch->q.qlen; + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = __skb_dequeue(&sch->q); + + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_drop(skb, sch); + } + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + + sch_tree_unlock(sch); + return 0; +} + +static int codel_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct codel_sched_data *q = qdisc_priv(sch); + + sch->limit = DEFAULT_CODEL_LIMIT; + + codel_params_init(&q->params); + codel_vars_init(&q->vars); + codel_stats_init(&q->stats); + + if (opt) { + int err = codel_change(sch, opt); + + if (err) + return err; + } + + if (sch->limit >= 1) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; + + return 0; +} + +static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct nlattr 
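codel_change() above takes TARGET and INTERVAL from user space in microseconds and stores them in CoDel's internal fixed-point time unit via ((u64)us * NSEC_PER_USEC) >> CODEL_SHIFT, and codel_dump() converts back with codel_time_to_us(). The stand-alone sketch below shows that arithmetic in isolation; the CODEL_SHIFT value of 10 (one unit is roughly 1024 ns) is an assumption about include/net/codel.h, not something visible in this diff. The round trip loses about a microsecond to truncation, which is harmless at these time scales.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC	1000ULL
#define CODEL_SHIFT	10	/* assumed value; one codel unit ~= 1024 ns */

/* Mirrors the conversions used by codel_change()/codel_dump() above. */
static uint32_t us_to_codel(uint32_t us)
{
	return (uint32_t)(((uint64_t)us * NSEC_PER_USEC) >> CODEL_SHIFT);
}

static uint32_t codel_to_us(uint32_t t)
{
	return (uint32_t)(((uint64_t)t << CODEL_SHIFT) / NSEC_PER_USEC);
}

int main(void)
{
	uint32_t target_us = 5000;	/* the usual 5 ms CoDel target */
	uint32_t t = us_to_codel(target_us);

	printf("%u us -> %u codel units -> %u us\n",
	       target_us, t, codel_to_us(t));
	return 0;
}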
*opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CODEL_TARGET, + codel_time_to_us(q->params.target)) || + nla_put_u32(skb, TCA_CODEL_LIMIT, + sch->limit) || + nla_put_u32(skb, TCA_CODEL_INTERVAL, + codel_time_to_us(q->params.interval)) || + nla_put_u32(skb, TCA_CODEL_ECN, + q->params.ecn)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -1; +} + +static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + const struct codel_sched_data *q = qdisc_priv(sch); + struct tc_codel_xstats st = { + .maxpacket = q->stats.maxpacket, + .count = q->vars.count, + .lastcount = q->vars.lastcount, + .drop_overlimit = q->drop_overlimit, + .ldelay = codel_time_to_us(q->vars.ldelay), + .dropping = q->vars.dropping, + .ecn_mark = q->stats.ecn_mark, + }; + + if (q->vars.dropping) { + codel_tdiff_t delta = q->vars.drop_next - codel_get_time(); + + if (delta >= 0) + st.drop_next = codel_time_to_us(delta); + else + st.drop_next = -codel_time_to_us(-delta); + } + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static void codel_reset(struct Qdisc *sch) +{ + struct codel_sched_data *q = qdisc_priv(sch); + + qdisc_reset_queue(sch); + codel_vars_init(&q->vars); +} + +static struct Qdisc_ops codel_qdisc_ops __read_mostly = { + .id = "codel", + .priv_size = sizeof(struct codel_sched_data), + + .enqueue = codel_qdisc_enqueue, + .dequeue = codel_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .init = codel_init, + .reset = codel_reset, + .change = codel_change, + .dump = codel_dump, + .dump_stats = codel_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init codel_module_init(void) +{ + return register_qdisc(&codel_qdisc_ops); +} + +static void __exit codel_module_exit(void) +{ + unregister_qdisc(&codel_qdisc_ops); +} + +module_init(codel_module_init) +module_exit(codel_module_exit) + +MODULE_DESCRIPTION("Controlled Delay queue discipline"); +MODULE_AUTHOR("Dave Taht"); +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c new file mode 100644 index 00000000000..7bbbfe11219 --- /dev/null +++ b/net/sched/sch_drr.c @@ -0,0 +1,526 @@ +/* + * net/sched/sch_drr.c Deficit Round Robin scheduler + * + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/pkt_sched.h> +#include <net/sch_generic.h> +#include <net/pkt_sched.h> +#include <net/pkt_cls.h> + +struct drr_class { + struct Qdisc_class_common common; + unsigned int refcnt; + unsigned int filter_cnt; + + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; + struct gnet_stats_rate_est64 rate_est; + struct list_head alist; + struct Qdisc *qdisc; + + u32 quantum; + u32 deficit; +}; + +struct drr_sched { + struct list_head active; + struct tcf_proto *filter_list; + struct Qdisc_class_hash clhash; +}; + +static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid) +{ + struct drr_sched *q = qdisc_priv(sch); + struct Qdisc_class_common *clc; + + clc = qdisc_class_find(&q->clhash, classid); + if (clc == NULL) + return NULL; + return container_of(clc, struct drr_class, common); +} + +static void drr_purge_queue(struct drr_class *cl) +{ + unsigned int len = cl->qdisc->q.qlen; + + qdisc_reset(cl->qdisc); + qdisc_tree_decrease_qlen(cl->qdisc, len); +} + +static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = { + [TCA_DRR_QUANTUM] = { .type = NLA_U32 }, +}; + +static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct nlattr **tca, unsigned long *arg) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl = (struct drr_class *)*arg; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_DRR_MAX + 1]; + u32 quantum; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy); + if (err < 0) + return err; + + if (tb[TCA_DRR_QUANTUM]) { + quantum = nla_get_u32(tb[TCA_DRR_QUANTUM]); + if (quantum == 0) + return -EINVAL; + } else + quantum = psched_mtu(qdisc_dev(sch)); + + if (cl != NULL) { + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) + return err; + } + + sch_tree_lock(sch); + if (tb[TCA_DRR_QUANTUM]) + cl->quantum = quantum; + sch_tree_unlock(sch); + + return 0; + } + + cl = kzalloc(sizeof(struct drr_class), GFP_KERNEL); + if (cl == NULL) + return -ENOBUFS; + + cl->refcnt = 1; + cl->common.classid = classid; + cl->quantum = quantum; + cl->qdisc = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, classid); + if (cl->qdisc == NULL) + cl->qdisc = &noop_qdisc; + + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) { + qdisc_destroy(cl->qdisc); + kfree(cl); + return err; + } + } + + sch_tree_lock(sch); + qdisc_class_hash_insert(&q->clhash, &cl->common); + sch_tree_unlock(sch); + + qdisc_class_hash_grow(sch, &q->clhash); + + *arg = (unsigned long)cl; + return 0; +} + +static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl) +{ + gen_kill_estimator(&cl->bstats, &cl->rate_est); + qdisc_destroy(cl->qdisc); + kfree(cl); +} + +static int drr_delete_class(struct Qdisc *sch, unsigned long arg) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl = (struct drr_class *)arg; + + if (cl->filter_cnt > 0) + return -EBUSY; + + sch_tree_lock(sch); + + drr_purge_queue(cl); + qdisc_class_hash_remove(&q->clhash, &cl->common); + + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). 
+ */ + + sch_tree_unlock(sch); + return 0; +} + +static unsigned long drr_get_class(struct Qdisc *sch, u32 classid) +{ + struct drr_class *cl = drr_find_class(sch, classid); + + if (cl != NULL) + cl->refcnt++; + + return (unsigned long)cl; +} + +static void drr_put_class(struct Qdisc *sch, unsigned long arg) +{ + struct drr_class *cl = (struct drr_class *)arg; + + if (--cl->refcnt == 0) + drr_destroy_class(sch, cl); +} + +static struct tcf_proto **drr_tcf_chain(struct Qdisc *sch, unsigned long cl) +{ + struct drr_sched *q = qdisc_priv(sch); + + if (cl) + return NULL; + + return &q->filter_list; +} + +static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + struct drr_class *cl = drr_find_class(sch, classid); + + if (cl != NULL) + cl->filter_cnt++; + + return (unsigned long)cl; +} + +static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct drr_class *cl = (struct drr_class *)arg; + + cl->filter_cnt--; +} + +static int drr_graft_class(struct Qdisc *sch, unsigned long arg, + struct Qdisc *new, struct Qdisc **old) +{ + struct drr_class *cl = (struct drr_class *)arg; + + if (new == NULL) { + new = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, cl->common.classid); + if (new == NULL) + new = &noop_qdisc; + } + + sch_tree_lock(sch); + drr_purge_queue(cl); + *old = cl->qdisc; + cl->qdisc = new; + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc *drr_class_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct drr_class *cl = (struct drr_class *)arg; + + return cl->qdisc; +} + +static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg) +{ + struct drr_class *cl = (struct drr_class *)arg; + + if (cl->qdisc->q.qlen == 0) + list_del(&cl->alist); +} + +static int drr_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct drr_class *cl = (struct drr_class *)arg; + struct nlattr *nest; + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = cl->common.classid; + tcm->tcm_info = cl->qdisc->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_DRR_QUANTUM, cl->quantum)) + goto nla_put_failure; + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct drr_class *cl = (struct drr_class *)arg; + struct tc_drr_stats xstats; + + memset(&xstats, 0, sizeof(xstats)); + if (cl->qdisc->q.qlen) { + xstats.deficit = cl->deficit; + cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; + } + + if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) + return -1; + + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); +} + +static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + struct tcf_result res; + int 
result; + + if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { + cl = drr_find_class(sch, skb->priority); + if (cl != NULL) + return cl; + } + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NULL; + } +#endif + cl = (struct drr_class *)res.class; + if (cl == NULL) + cl = drr_find_class(sch, res.classid); + return cl; + } + return NULL; +} + +static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + int err = 0; + + cl = drr_classify(skb, sch, &err); + if (cl == NULL) { + if (err & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return err; + } + + err = qdisc_enqueue(skb, cl->qdisc); + if (unlikely(err != NET_XMIT_SUCCESS)) { + if (net_xmit_drop_count(err)) { + cl->qstats.drops++; + sch->qstats.drops++; + } + return err; + } + + if (cl->qdisc->q.qlen == 1) { + list_add_tail(&cl->alist, &q->active); + cl->deficit = cl->quantum; + } + + sch->q.qlen++; + return err; +} + +static struct sk_buff *drr_dequeue(struct Qdisc *sch) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + struct sk_buff *skb; + unsigned int len; + + if (list_empty(&q->active)) + goto out; + while (1) { + cl = list_first_entry(&q->active, struct drr_class, alist); + skb = cl->qdisc->ops->peek(cl->qdisc); + if (skb == NULL) { + qdisc_warn_nonwc(__func__, cl->qdisc); + goto out; + } + + len = qdisc_pkt_len(skb); + if (len <= cl->deficit) { + cl->deficit -= len; + skb = qdisc_dequeue_peeked(cl->qdisc); + if (cl->qdisc->q.qlen == 0) + list_del(&cl->alist); + + bstats_update(&cl->bstats, skb); + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + return skb; + } + + cl->deficit += cl->quantum; + list_move_tail(&cl->alist, &q->active); + } +out: + return NULL; +} + +static unsigned int drr_drop(struct Qdisc *sch) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + unsigned int len; + + list_for_each_entry(cl, &q->active, alist) { + if (cl->qdisc->ops->drop) { + len = cl->qdisc->ops->drop(cl->qdisc); + if (len > 0) { + sch->q.qlen--; + if (cl->qdisc->q.qlen == 0) + list_del(&cl->alist); + return len; + } + } + } + return 0; +} + +static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt) +{ + struct drr_sched *q = qdisc_priv(sch); + int err; + + err = qdisc_class_hash_init(&q->clhash); + if (err < 0) + return err; + INIT_LIST_HEAD(&q->active); + return 0; +} + +static void drr_reset_qdisc(struct Qdisc *sch) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + unsigned int i; + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { + if (cl->qdisc->q.qlen) + list_del(&cl->alist); + qdisc_reset(cl->qdisc); + } + } + sch->q.qlen = 0; +} + +static void drr_destroy_qdisc(struct Qdisc *sch) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + struct hlist_node *next; + unsigned int i; + + tcf_destroy_chain(&q->filter_list); + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], + common.hnode) + drr_destroy_class(sch, cl); + } + qdisc_class_hash_destroy(&q->clhash); +} + +static const struct Qdisc_class_ops drr_class_ops = { + .change = drr_change_class, + .delete = drr_delete_class, + .get = drr_get_class, + .put = drr_put_class, + 
.tcf_chain = drr_tcf_chain, + .bind_tcf = drr_bind_tcf, + .unbind_tcf = drr_unbind_tcf, + .graft = drr_graft_class, + .leaf = drr_class_leaf, + .qlen_notify = drr_qlen_notify, + .dump = drr_dump_class, + .dump_stats = drr_dump_class_stats, + .walk = drr_walk, +}; + +static struct Qdisc_ops drr_qdisc_ops __read_mostly = { + .cl_ops = &drr_class_ops, + .id = "drr", + .priv_size = sizeof(struct drr_sched), + .enqueue = drr_enqueue, + .dequeue = drr_dequeue, + .peek = qdisc_peek_dequeued, + .drop = drr_drop, + .init = drr_init_qdisc, + .reset = drr_reset_qdisc, + .destroy = drr_destroy_qdisc, + .owner = THIS_MODULE, +}; + +static int __init drr_init(void) +{ + return register_qdisc(&drr_qdisc_ops); +} + +static void __exit drr_exit(void) +{ + unregister_qdisc(&drr_qdisc_ops); +} + +module_init(drr_init); +module_exit(drr_exit); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 13e0e7b3856..49d6ef338b5 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -3,37 +3,20 @@ /* Written 1998-2000 by Werner Almesberger, EPFL ICA */ -#include <linux/config.h> #include <linux/module.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> -#include <linux/netdevice.h> /* for pkt_sched */ #include <linux/rtnetlink.h> +#include <linux/bitops.h> #include <net/pkt_sched.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <asm/byteorder.h> - -#if 0 /* control */ -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif - -#if 0 /* data */ -#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define D2PRINTK(format,args...) -#endif - - -#define PRIV(sch) ((struct dsmark_qdisc_data *) qdisc_priv(sch)) - - /* * classid class marking * ------- ----- ------- @@ -62,20 +45,9 @@ struct dsmark_qdisc_data { int set_tc_index; }; -static inline int dsmark_valid_indices(u16 indices) -{ - while (indices != 1) { - if (indices & 1) - return 0; - indices >>= 1; - } - - return 1; -} - static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index) { - return (index <= p->indices && index > 0); + return index <= p->indices && index > 0; } /* ------------------------- Class/flow operations ------------------------- */ @@ -83,35 +55,38 @@ static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index) static int dsmark_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); - DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n", - sch, p, new, old); + pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n", + __func__, sch, p, new, old); if (new == NULL) { - new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + sch->handle); if (new == NULL) new = &noop_qdisc; } sch_tree_lock(sch); - *old = xchg(&p->q, new); + *old = p->q; + p->q = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); qdisc_reset(*old); - sch->q.qlen = 0; sch_tree_unlock(sch); - return 0; + return 0; } static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg) { - return PRIV(sch)->q; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + return p->q; } static unsigned long dsmark_get(struct Qdisc *sch, u32 classid) { - DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n", - sch, PRIV(sch), classid); + 
pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", + __func__, sch, qdisc_priv(sch), classid); return TC_H_MIN(classid) + 1; } @@ -126,60 +101,73 @@ static void dsmark_put(struct Qdisc *sch, unsigned long cl) { } +static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = { + [TCA_DSMARK_INDICES] = { .type = NLA_U16 }, + [TCA_DSMARK_DEFAULT_INDEX] = { .type = NLA_U16 }, + [TCA_DSMARK_SET_TC_INDEX] = { .type = NLA_FLAG }, + [TCA_DSMARK_MASK] = { .type = NLA_U8 }, + [TCA_DSMARK_VALUE] = { .type = NLA_U8 }, +}; + static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, - struct rtattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg) { - struct dsmark_qdisc_data *p = PRIV(sch); - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_DSMARK_MAX]; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_DSMARK_MAX + 1]; int err = -EINVAL; u8 mask = 0; - DPRINTK("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x)," - "arg 0x%lx\n", sch, p, classid, parent, *arg); + pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n", + __func__, sch, p, classid, parent, *arg); if (!dsmark_valid_index(p, *arg)) { err = -ENOENT; - goto rtattr_failure; + goto errout; } - if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt)) - goto rtattr_failure; + if (!opt) + goto errout; + + err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy); + if (err < 0) + goto errout; + + if (tb[TCA_DSMARK_MASK]) + mask = nla_get_u8(tb[TCA_DSMARK_MASK]); - if (tb[TCA_DSMARK_MASK-1]) - mask = RTA_GET_U8(tb[TCA_DSMARK_MASK-1]); + if (tb[TCA_DSMARK_VALUE]) + p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]); - if (tb[TCA_DSMARK_VALUE-1]) - p->value[*arg-1] = RTA_GET_U8(tb[TCA_DSMARK_VALUE-1]); - - if (tb[TCA_DSMARK_MASK-1]) - p->mask[*arg-1] = mask; + if (tb[TCA_DSMARK_MASK]) + p->mask[*arg - 1] = mask; err = 0; -rtattr_failure: +errout: return err; } static int dsmark_delete(struct Qdisc *sch, unsigned long arg) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); if (!dsmark_valid_index(p, arg)) return -EINVAL; - - p->mask[arg-1] = 0xff; - p->value[arg-1] = 0; + + p->mask[arg - 1] = 0xff; + p->value[arg - 1] = 0; return 0; } -static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker) +static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); int i; - DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); + pr_debug("%s(sch %p,[qdisc %p],walker %p)\n", + __func__, sch, p, walker); if (walker->stop) return; @@ -188,45 +176,53 @@ static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker) if (p->mask[i] == 0xff && !p->value[i]) goto ignore; if (walker->count >= walker->skip) { - if (walker->fn(sch, i+1, walker) < 0) { + if (walker->fn(sch, i + 1, walker) < 0) { walker->stop = 1; break; } } -ignore: +ignore: walker->count++; - } + } } -static struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,unsigned long cl) +static inline struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch, + unsigned long cl) { - return &PRIV(sch)->filter_list; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + return &p->filter_list; } /* --------------------------- Qdisc operations ---------------------------- */ -static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) +static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - 
struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); int err; - D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); + pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p); if (p->set_tc_index) { - /* FIXME: Safe with non-linear skbs? --RR */ switch (skb->protocol) { - case __constant_htons(ETH_P_IP): - skb->tc_index = ipv4_get_dsfield(skb->nh.iph) - & ~INET_ECN_MASK; - break; - case __constant_htons(ETH_P_IPV6): - skb->tc_index = ipv6_get_dsfield(skb->nh.ipv6h) - & ~INET_ECN_MASK; - break; - default: - skb->tc_index = 0; - break; - }; + case htons(ETH_P_IP): + if (skb_cow_head(skb, sizeof(struct iphdr))) + goto drop; + + skb->tc_index = ipv4_get_dsfield(ip_hdr(skb)) + & ~INET_ECN_MASK; + break; + + case htons(ETH_P_IPV6): + if (skb_cow_head(skb, sizeof(struct ipv6hdr))) + goto drop; + + skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb)) + & ~INET_ECN_MASK; + break; + default: + skb->tc_index = 0; + break; + } } if (TC_H_MAJ(skb->priority) == sch->handle) @@ -235,111 +231,102 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) struct tcf_result res; int result = tc_classify(skb, p->filter_list, &res); - D2PRINTK("result %d class 0x%04x\n", result, res.classid); + pr_debug("result %d class 0x%04x\n", result, res.classid); switch (result) { -#ifdef CONFIG_NET_CLS_POLICE - case TC_POLICE_SHOT: - kfree_skb(skb); - sch->qstats.drops++; - return NET_XMIT_POLICED; -#if 0 - case TC_POLICE_RECLASSIFY: - /* FIXME: what to do here ??? */ +#ifdef CONFIG_NET_CLS_ACT + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + kfree_skb(skb); + return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + + case TC_ACT_SHOT: + goto drop; #endif -#endif - case TC_POLICE_OK: - skb->tc_index = TC_H_MIN(res.classid); - break; - case TC_POLICE_UNSPEC: - /* fall through */ - default: - if (p->default_index != NO_DEFAULT_INDEX) - skb->tc_index = p->default_index; - break; - }; + case TC_ACT_OK: + skb->tc_index = TC_H_MIN(res.classid); + break; + + default: + if (p->default_index != NO_DEFAULT_INDEX) + skb->tc_index = p->default_index; + break; + } } - err = p->q->enqueue(skb,p->q); + err = qdisc_enqueue(skb, p->q); if (err != NET_XMIT_SUCCESS) { - sch->qstats.drops++; + if (net_xmit_drop_count(err)) + sch->qstats.drops++; return err; } - sch->bstats.bytes += skb->len; - sch->bstats.packets++; sch->q.qlen++; return NET_XMIT_SUCCESS; + +drop: + qdisc_drop(skb, sch); + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); struct sk_buff *skb; u32 index; - D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n", sch, p); + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); skb = p->q->ops->dequeue(p->q); if (skb == NULL) return NULL; + qdisc_bstats_update(sch, skb); sch->q.qlen--; index = skb->tc_index & (p->indices - 1); - D2PRINTK("index %d->%d\n", skb->tc_index, index); + pr_debug("index %d->%d\n", skb->tc_index, index); switch (skb->protocol) { - case __constant_htons(ETH_P_IP): - ipv4_change_dsfield(skb->nh.iph, p->mask[index], - p->value[index]); - break; - case __constant_htons(ETH_P_IPV6): - ipv6_change_dsfield(skb->nh.ipv6h, p->mask[index], - p->value[index]); + case htons(ETH_P_IP): + ipv4_change_dsfield(ip_hdr(skb), p->mask[index], + p->value[index]); break; - default: - /* - * Only complain if a change was actually attempted. 
- * This way, we can send non-IP traffic through dsmark - * and don't need yet another qdisc as a bypass. - */ - if (p->mask[index] != 0xff || p->value[index]) - printk(KERN_WARNING "dsmark_dequeue: " - "unsupported protocol %d\n", - htons(skb->protocol)); + case htons(ETH_P_IPV6): + ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index], + p->value[index]); break; - }; + default: + /* + * Only complain if a change was actually attempted. + * This way, we can send non-IP traffic through dsmark + * and don't need yet another qdisc as a bypass. + */ + if (p->mask[index] != 0xff || p->value[index]) + pr_warn("%s: unsupported protocol %d\n", + __func__, ntohs(skb->protocol)); + break; + } return skb; } -static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch) +static struct sk_buff *dsmark_peek(struct Qdisc *sch) { - struct dsmark_qdisc_data *p = PRIV(sch); - int err; - - D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); - - err = p->q->ops->requeue(skb, p->q); - if (err != NET_XMIT_SUCCESS) { - sch->qstats.drops++; - return err; - } + struct dsmark_qdisc_data *p = qdisc_priv(sch); - sch->q.qlen++; - sch->qstats.requeues++; + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - return NET_XMIT_SUCCESS; + return p->q->ops->peek(p->q); } static unsigned int dsmark_drop(struct Qdisc *sch) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); unsigned int len; - - DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); + + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); if (p->q->ops->drop == NULL) return 0; @@ -351,26 +338,32 @@ static unsigned int dsmark_drop(struct Qdisc *sch) return len; } -static int dsmark_init(struct Qdisc *sch, struct rtattr *opt) +static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) { - struct dsmark_qdisc_data *p = PRIV(sch); - struct rtattr *tb[TCA_DSMARK_MAX]; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + struct nlattr *tb[TCA_DSMARK_MAX + 1]; int err = -EINVAL; u32 default_index = NO_DEFAULT_INDEX; u16 indices; u8 *mask; - DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); + pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt); - if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt) < 0) + if (!opt) goto errout; - indices = RTA_GET_U16(tb[TCA_DSMARK_INDICES-1]); - if (!indices || !dsmark_valid_indices(indices)) + err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy); + if (err < 0) goto errout; - if (tb[TCA_DSMARK_DEFAULT_INDEX-1]) - default_index = RTA_GET_U16(tb[TCA_DSMARK_DEFAULT_INDEX-1]); + err = -EINVAL; + indices = nla_get_u16(tb[TCA_DSMARK_INDICES]); + + if (hweight32(indices) != 1) + goto errout; + + if (tb[TCA_DSMARK_DEFAULT_INDEX]) + default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]); mask = kmalloc(indices * 2, GFP_KERNEL); if (mask == NULL) { @@ -386,42 +379,35 @@ static int dsmark_init(struct Qdisc *sch, struct rtattr *opt) p->indices = indices; p->default_index = default_index; - p->set_tc_index = RTA_GET_FLAG(tb[TCA_DSMARK_SET_TC_INDEX-1]); + p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]); - p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle); if (p->q == NULL) p->q = &noop_qdisc; - DPRINTK("dsmark_init: qdisc %p\n", p->q); + pr_debug("%s: qdisc %p\n", __func__, p->q); err = 0; errout: -rtattr_failure: return err; } static void dsmark_reset(struct Qdisc *sch) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = 
qdisc_priv(sch); - DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); qdisc_reset(p->q); sch->q.qlen = 0; } static void dsmark_destroy(struct Qdisc *sch) { - struct dsmark_qdisc_data *p = PRIV(sch); - struct tcf_proto *tp; - - DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p); + struct dsmark_qdisc_data *p = qdisc_priv(sch); - while (p->filter_list) { - tp = p->filter_list; - p->filter_list = tp->next; - tcf_destroy(tp); - } + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); + tcf_destroy_chain(&p->filter_list); qdisc_destroy(p->q); kfree(p->mask); } @@ -429,47 +415,58 @@ static void dsmark_destroy(struct Qdisc *sch) static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { - struct dsmark_qdisc_data *p = PRIV(sch); - struct rtattr *opts = NULL; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + struct nlattr *opts = NULL; - DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n", sch, p, cl); + pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl); if (!dsmark_valid_index(p, cl)) return -EINVAL; - tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl-1); + tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1); + tcm->tcm_info = p->q->handle; - opts = RTA_NEST(skb, TCA_OPTIONS); - RTA_PUT_U8(skb,TCA_DSMARK_MASK, p->mask[cl-1]); - RTA_PUT_U8(skb,TCA_DSMARK_VALUE, p->value[cl-1]); + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]) || + nla_put_u8(skb, TCA_DSMARK_VALUE, p->value[cl - 1])) + goto nla_put_failure; - return RTA_NEST_END(skb, opts); + return nla_nest_end(skb, opts); -rtattr_failure: - return RTA_NEST_CANCEL(skb, opts); +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; } static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) { - struct dsmark_qdisc_data *p = PRIV(sch); - struct rtattr *opts = NULL; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + struct nlattr *opts = NULL; - opts = RTA_NEST(skb, TCA_OPTIONS); - RTA_PUT_U16(skb, TCA_DSMARK_INDICES, p->indices); + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices)) + goto nla_put_failure; - if (p->default_index != NO_DEFAULT_INDEX) - RTA_PUT_U16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index); + if (p->default_index != NO_DEFAULT_INDEX && + nla_put_u16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index)) + goto nla_put_failure; - if (p->set_tc_index) - RTA_PUT_FLAG(skb, TCA_DSMARK_SET_TC_INDEX); + if (p->set_tc_index && + nla_put_flag(skb, TCA_DSMARK_SET_TC_INDEX)) + goto nla_put_failure; - return RTA_NEST_END(skb, opts); + return nla_nest_end(skb, opts); -rtattr_failure: - return RTA_NEST_CANCEL(skb, opts); +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; } -static struct Qdisc_class_ops dsmark_class_ops = { +static const struct Qdisc_class_ops dsmark_class_ops = { .graft = dsmark_graft, .leaf = dsmark_leaf, .get = dsmark_get, @@ -483,14 +480,14 @@ static struct Qdisc_class_ops dsmark_class_ops = { .dump = dsmark_dump_class, }; -static struct Qdisc_ops dsmark_qdisc_ops = { +static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &dsmark_class_ops, .id = "dsmark", .priv_size = sizeof(struct dsmark_qdisc_data), .enqueue = dsmark_enqueue, .dequeue = dsmark_dequeue, - .requeue = dsmark_requeue, + .peek = dsmark_peek, .drop = dsmark_drop, .init = 
dsmark_init, .reset = dsmark_reset, @@ -505,7 +502,7 @@ static int __init dsmark_module_init(void) return register_qdisc(&dsmark_qdisc_ops); } -static void __exit dsmark_module_exit(void) +static void __exit dsmark_module_exit(void) { unregister_qdisc(&dsmark_qdisc_ops); } diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 033083bf0e7..e15a9eb2908 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -9,83 +9,96 @@ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ -#include <linux/config.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> /* 1 band FIFO pseudo-"scheduler" */ -struct fifo_sched_data +static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - u32 limit; -}; + if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit)) + return qdisc_enqueue_tail(skb, sch); -static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) -{ - struct fifo_sched_data *q = qdisc_priv(sch); + return qdisc_reshape_fail(skb, sch); +} - if (likely(sch->qstats.backlog + skb->len <= q->limit)) +static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + if (likely(skb_queue_len(&sch->q) < sch->limit)) return qdisc_enqueue_tail(skb, sch); return qdisc_reshape_fail(skb, sch); } -static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct fifo_sched_data *q = qdisc_priv(sch); - - if (likely(skb_queue_len(&sch->q) < q->limit)) + if (likely(skb_queue_len(&sch->q) < sch->limit)) return qdisc_enqueue_tail(skb, sch); - return qdisc_reshape_fail(skb, sch); + /* queue full, remove one skb to fulfill the limit */ + __qdisc_queue_drop_head(sch, &sch->q); + sch->qstats.drops++; + qdisc_enqueue_tail(skb, sch); + + return NET_XMIT_CN; } -static int fifo_init(struct Qdisc *sch, struct rtattr *opt) +static int fifo_init(struct Qdisc *sch, struct nlattr *opt) { - struct fifo_sched_data *q = qdisc_priv(sch); + bool bypass; + bool is_bfifo = sch->ops == &bfifo_qdisc_ops; if (opt == NULL) { - u32 limit = sch->dev->tx_queue_len ? : 1; + u32 limit = qdisc_dev(sch)->tx_queue_len ? 
: 1; - if (sch->ops == &bfifo_qdisc_ops) - limit *= sch->dev->mtu; + if (is_bfifo) + limit *= psched_mtu(qdisc_dev(sch)); - q->limit = limit; + sch->limit = limit; } else { - struct tc_fifo_qopt *ctl = RTA_DATA(opt); + struct tc_fifo_qopt *ctl = nla_data(opt); - if (RTA_PAYLOAD(opt) < sizeof(*ctl)) + if (nla_len(opt) < sizeof(*ctl)) return -EINVAL; - q->limit = ctl->limit; + sch->limit = ctl->limit; } + if (is_bfifo) + bypass = sch->limit >= psched_mtu(qdisc_dev(sch)); + else + bypass = sch->limit >= 1; + + if (bypass) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) { - struct fifo_sched_data *q = qdisc_priv(sch); - struct tc_fifo_qopt opt = { .limit = q->limit }; + struct tc_fifo_qopt opt = { .limit = sch->limit }; - RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; -rtattr_failure: +nla_put_failure: return -1; } -struct Qdisc_ops pfifo_qdisc_ops = { +struct Qdisc_ops pfifo_qdisc_ops __read_mostly = { .id = "pfifo", - .priv_size = sizeof(struct fifo_sched_data), + .priv_size = 0, .enqueue = pfifo_enqueue, .dequeue = qdisc_dequeue_head, - .requeue = qdisc_requeue, + .peek = qdisc_peek_head, .drop = qdisc_queue_drop, .init = fifo_init, .reset = qdisc_reset_queue, @@ -93,13 +106,14 @@ struct Qdisc_ops pfifo_qdisc_ops = { .dump = fifo_dump, .owner = THIS_MODULE, }; +EXPORT_SYMBOL(pfifo_qdisc_ops); -struct Qdisc_ops bfifo_qdisc_ops = { +struct Qdisc_ops bfifo_qdisc_ops __read_mostly = { .id = "bfifo", - .priv_size = sizeof(struct fifo_sched_data), + .priv_size = 0, .enqueue = bfifo_enqueue, .dequeue = qdisc_dequeue_head, - .requeue = qdisc_requeue, + .peek = qdisc_peek_head, .drop = qdisc_queue_drop, .init = fifo_init, .reset = qdisc_reset_queue, @@ -107,6 +121,60 @@ struct Qdisc_ops bfifo_qdisc_ops = { .dump = fifo_dump, .owner = THIS_MODULE, }; - EXPORT_SYMBOL(bfifo_qdisc_ops); -EXPORT_SYMBOL(pfifo_qdisc_ops); + +struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = { + .id = "pfifo_head_drop", + .priv_size = 0, + .enqueue = pfifo_tail_enqueue, + .dequeue = qdisc_dequeue_head, + .peek = qdisc_peek_head, + .drop = qdisc_queue_drop_head, + .init = fifo_init, + .reset = qdisc_reset_queue, + .change = fifo_init, + .dump = fifo_dump, + .owner = THIS_MODULE, +}; + +/* Pass size change message down to embedded FIFO */ +int fifo_set_limit(struct Qdisc *q, unsigned int limit) +{ + struct nlattr *nla; + int ret = -ENOMEM; + + /* Hack to avoid sending change message to non-FIFO */ + if (strncmp(q->ops->id + 1, "fifo", 4) != 0) + return 0; + + nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); + if (nla) { + nla->nla_type = RTM_NEWQDISC; + nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt)); + ((struct tc_fifo_qopt *)nla_data(nla))->limit = limit; + + ret = q->ops->change(q, nla); + kfree(nla); + } + return ret; +} +EXPORT_SYMBOL(fifo_set_limit); + +struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, + unsigned int limit) +{ + struct Qdisc *q; + int err = -ENOMEM; + + q = qdisc_create_dflt(sch->dev_queue, ops, TC_H_MAKE(sch->handle, 1)); + if (q) { + err = fifo_set_limit(q, limit); + if (err < 0) { + qdisc_destroy(q); + q = NULL; + } + } + + return q ? 
: ERR_PTR(err); +} +EXPORT_SYMBOL(fifo_create_dflt); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c new file mode 100644 index 00000000000..ba32c2b005d --- /dev/null +++ b/net/sched/sch_fq.c @@ -0,0 +1,849 @@ +/* + * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing) + * + * Copyright (C) 2013 Eric Dumazet <edumazet@google.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Meant to be mostly used for localy generated traffic : + * Fast classification depends on skb->sk being set before reaching us. + * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash. + * All packets belonging to a socket are considered as a 'flow'. + * + * Flows are dynamically allocated and stored in a hash table of RB trees + * They are also part of one Round Robin 'queues' (new or old flows) + * + * Burst avoidance (aka pacing) capability : + * + * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a + * bunch of packets, and this packet scheduler adds delay between + * packets to respect rate limitation. + * + * enqueue() : + * - lookup one RB tree (out of 1024 or more) to find the flow. + * If non existent flow, create it, add it to the tree. + * Add skb to the per flow list of skb (fifo). + * - Use a special fifo for high prio packets + * + * dequeue() : serves flows in Round Robin + * Note : When a flow becomes empty, we do not immediately remove it from + * rb trees, for performance reasons (its expected to send additional packets, + * or SLAB cache will reuse socket for another flow) + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/hash.h> +#include <linux/prefetch.h> +#include <linux/vmalloc.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/sock.h> +#include <net/tcp_states.h> + +/* + * Per flow structure, dynamically allocated + */ +struct fq_flow { + struct sk_buff *head; /* list of skbs for this flow : first skb */ + union { + struct sk_buff *tail; /* last skb in the list */ + unsigned long age; /* jiffies when flow was emptied, for gc */ + }; + struct rb_node fq_node; /* anchor in fq_root[] trees */ + struct sock *sk; + int qlen; /* number of packets in flow queue */ + int credit; + u32 socket_hash; /* sk_hash */ + struct fq_flow *next; /* next pointer in RR lists, or &detached */ + + struct rb_node rate_node; /* anchor in q->delayed tree */ + u64 time_next_packet; +}; + +struct fq_flow_head { + struct fq_flow *first; + struct fq_flow *last; +}; + +struct fq_sched_data { + struct fq_flow_head new_flows; + + struct fq_flow_head old_flows; + + struct rb_root delayed; /* for rate limited flows */ + u64 time_next_delayed_flow; + + struct fq_flow internal; /* for non classified or high prio packets */ + u32 quantum; + u32 initial_quantum; + u32 flow_refill_delay; + u32 flow_max_rate; /* optional max rate per flow */ + u32 flow_plimit; /* max packets per flow */ + struct rb_root *fq_root; + u8 rate_enable; + u8 fq_trees_log; + + u32 flows; + u32 inactive_flows; + u32 throttled_flows; + + u64 stat_gc_flows; + u64 stat_internal_packets; + u64 
stat_tcp_retrans; + u64 stat_throttled; + u64 stat_flows_plimit; + u64 stat_pkts_too_long; + u64 stat_allocation_errors; + struct qdisc_watchdog watchdog; +}; + +/* special value to mark a detached flow (not on old/new list) */ +static struct fq_flow detached, throttled; + +static void fq_flow_set_detached(struct fq_flow *f) +{ + f->next = &detached; + f->age = jiffies; +} + +static bool fq_flow_is_detached(const struct fq_flow *f) +{ + return f->next == &detached; +} + +static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) +{ + struct rb_node **p = &q->delayed.rb_node, *parent = NULL; + + while (*p) { + struct fq_flow *aux; + + parent = *p; + aux = container_of(parent, struct fq_flow, rate_node); + if (f->time_next_packet >= aux->time_next_packet) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&f->rate_node, parent, p); + rb_insert_color(&f->rate_node, &q->delayed); + q->throttled_flows++; + q->stat_throttled++; + + f->next = &throttled; + if (q->time_next_delayed_flow > f->time_next_packet) + q->time_next_delayed_flow = f->time_next_packet; +} + + +static struct kmem_cache *fq_flow_cachep __read_mostly; + +static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow) +{ + if (head->first) + head->last->next = flow; + else + head->first = flow; + head->last = flow; + flow->next = NULL; +} + +/* limit number of collected flows per round */ +#define FQ_GC_MAX 8 +#define FQ_GC_AGE (3*HZ) + +static bool fq_gc_candidate(const struct fq_flow *f) +{ + return fq_flow_is_detached(f) && + time_after(jiffies, f->age + FQ_GC_AGE); +} + +static void fq_gc(struct fq_sched_data *q, + struct rb_root *root, + struct sock *sk) +{ + struct fq_flow *f, *tofree[FQ_GC_MAX]; + struct rb_node **p, *parent; + int fcnt = 0; + + p = &root->rb_node; + parent = NULL; + while (*p) { + parent = *p; + + f = container_of(parent, struct fq_flow, fq_node); + if (f->sk == sk) + break; + + if (fq_gc_candidate(f)) { + tofree[fcnt++] = f; + if (fcnt == FQ_GC_MAX) + break; + } + + if (f->sk > sk) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + + q->flows -= fcnt; + q->inactive_flows -= fcnt; + q->stat_gc_flows += fcnt; + while (fcnt) { + struct fq_flow *f = tofree[--fcnt]; + + rb_erase(&f->fq_node, root); + kmem_cache_free(fq_flow_cachep, f); + } +} + +static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) +{ + struct rb_node **p, *parent; + struct sock *sk = skb->sk; + struct rb_root *root; + struct fq_flow *f; + + /* warning: no starvation prevention... */ + if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) + return &q->internal; + + if (unlikely(!sk)) { + /* By forcing low order bit to 1, we make sure to not + * collide with a local flow (socket pointers are word aligned) + */ + sk = (struct sock *)(skb_get_hash(skb) | 1L); + } + + root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)]; + + if (q->flows >= (2U << q->fq_trees_log) && + q->inactive_flows > q->flows/2) + fq_gc(q, root, sk); + + p = &root->rb_node; + parent = NULL; + while (*p) { + parent = *p; + + f = container_of(parent, struct fq_flow, fq_node); + if (f->sk == sk) { + /* socket might have been reallocated, so check + * if its sk_hash is the same. 
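[Aside on fq_classify() above: when skb->sk is NULL (e.g. forwarded traffic), the code reuses the 32-bit packet hash as a stand-in socket pointer and forces its low bit to 1; real struct sock pointers are at least word aligned, so a tagged hash value can never collide with one. A tiny userspace illustration of that alignment assumption, with made-up values and plain C types rather than kernel code:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        long dummy_sock;                            /* stand-in for a struct sock */
        uintptr_t sk = (uintptr_t)&dummy_sock;      /* real sockets are word aligned */
        uint32_t rxhash = 0x9e3779b9u;              /* hypothetical skb hash */
        uintptr_t fake_sk = (uintptr_t)rxhash | 1;  /* low bit set, as in fq_classify() */

        printf("real sk low bit: %lu\n", (unsigned long)(sk & 1));      /* prints 0 */
        printf("fake sk low bit: %lu\n", (unsigned long)(fake_sk & 1)); /* prints 1 */
        return 0;
    }

Because the tag bit differs, hashed "flows" and socket-backed flows can share the same RB trees without ambiguity.]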
+ * It not, we need to refill credit with + * initial quantum + */ + if (unlikely(skb->sk && + f->socket_hash != sk->sk_hash)) { + f->credit = q->initial_quantum; + f->socket_hash = sk->sk_hash; + f->time_next_packet = 0ULL; + } + return f; + } + if (f->sk > sk) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + + f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!f)) { + q->stat_allocation_errors++; + return &q->internal; + } + fq_flow_set_detached(f); + f->sk = sk; + if (skb->sk) + f->socket_hash = sk->sk_hash; + f->credit = q->initial_quantum; + + rb_link_node(&f->fq_node, parent, p); + rb_insert_color(&f->fq_node, root); + + q->flows++; + q->inactive_flows++; + return f; +} + + +/* remove one skb from head of flow queue */ +static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) +{ + struct sk_buff *skb = flow->head; + + if (skb) { + flow->head = skb->next; + skb->next = NULL; + flow->qlen--; + sch->qstats.backlog -= qdisc_pkt_len(skb); + sch->q.qlen--; + } + return skb; +} + +/* We might add in the future detection of retransmits + * For the time being, just return false + */ +static bool skb_is_retransmit(struct sk_buff *skb) +{ + return false; +} + +/* add skb to flow queue + * flow queue is a linked list, kind of FIFO, except for TCP retransmits + * We special case tcp retransmits to be transmitted before other packets. + * We rely on fact that TCP retransmits are unlikely, so we do not waste + * a separate queue or a pointer. + * head-> [retrans pkt 1] + * [retrans pkt 2] + * [ normal pkt 1] + * [ normal pkt 2] + * [ normal pkt 3] + * tail-> [ normal pkt 4] + */ +static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) +{ + struct sk_buff *prev, *head = flow->head; + + skb->next = NULL; + if (!head) { + flow->head = skb; + flow->tail = skb; + return; + } + if (likely(!skb_is_retransmit(skb))) { + flow->tail->next = skb; + flow->tail = skb; + return; + } + + /* This skb is a tcp retransmit, + * find the last retrans packet in the queue + */ + prev = NULL; + while (skb_is_retransmit(head)) { + prev = head; + head = head->next; + if (!head) + break; + } + if (!prev) { /* no rtx packet in queue, become the new head */ + skb->next = flow->head; + flow->head = skb; + } else { + if (prev == flow->tail) + flow->tail = skb; + else + skb->next = prev->next; + prev->next = skb; + } +} + +static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct fq_flow *f; + + if (unlikely(sch->q.qlen >= sch->limit)) + return qdisc_drop(skb, sch); + + f = fq_classify(skb, q); + if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { + q->stat_flows_plimit++; + return qdisc_drop(skb, sch); + } + + f->qlen++; + if (skb_is_retransmit(skb)) + q->stat_tcp_retrans++; + sch->qstats.backlog += qdisc_pkt_len(skb); + if (fq_flow_is_detached(f)) { + fq_flow_add_tail(&q->new_flows, f); + if (time_after(jiffies, f->age + q->flow_refill_delay)) + f->credit = max_t(u32, f->credit, q->quantum); + q->inactive_flows--; + qdisc_unthrottled(sch); + } + + /* Note: this overwrites f->age */ + flow_queue_add(f, skb); + + if (unlikely(f == &q->internal)) { + q->stat_internal_packets++; + qdisc_unthrottled(sch); + } + sch->q.qlen++; + + return NET_XMIT_SUCCESS; +} + +static void fq_check_throttled(struct fq_sched_data *q, u64 now) +{ + struct rb_node *p; + + if (q->time_next_delayed_flow > now) + return; + + q->time_next_delayed_flow = ~0ULL; + while ((p = rb_first(&q->delayed)) != NULL) 
{ + struct fq_flow *f = container_of(p, struct fq_flow, rate_node); + + if (f->time_next_packet > now) { + q->time_next_delayed_flow = f->time_next_packet; + break; + } + rb_erase(p, &q->delayed); + q->throttled_flows--; + fq_flow_add_tail(&q->old_flows, f); + } +} + +static struct sk_buff *fq_dequeue(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + u64 now = ktime_to_ns(ktime_get()); + struct fq_flow_head *head; + struct sk_buff *skb; + struct fq_flow *f; + u32 rate; + + skb = fq_dequeue_head(sch, &q->internal); + if (skb) + goto out; + fq_check_throttled(q, now); +begin: + head = &q->new_flows; + if (!head->first) { + head = &q->old_flows; + if (!head->first) { + if (q->time_next_delayed_flow != ~0ULL) + qdisc_watchdog_schedule_ns(&q->watchdog, + q->time_next_delayed_flow); + return NULL; + } + } + f = head->first; + + if (f->credit <= 0) { + f->credit += q->quantum; + head->first = f->next; + fq_flow_add_tail(&q->old_flows, f); + goto begin; + } + + if (unlikely(f->head && now < f->time_next_packet)) { + head->first = f->next; + fq_flow_set_throttled(q, f); + goto begin; + } + + skb = fq_dequeue_head(sch, f); + if (!skb) { + head->first = f->next; + /* force a pass through old_flows to prevent starvation */ + if ((head == &q->new_flows) && q->old_flows.first) { + fq_flow_add_tail(&q->old_flows, f); + } else { + fq_flow_set_detached(f); + q->inactive_flows++; + } + goto begin; + } + prefetch(&skb->end); + f->time_next_packet = now; + f->credit -= qdisc_pkt_len(skb); + + if (f->credit > 0 || !q->rate_enable) + goto out; + + rate = q->flow_max_rate; + if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) + rate = min(skb->sk->sk_pacing_rate, rate); + + if (rate != ~0U) { + u32 plen = max(qdisc_pkt_len(skb), q->quantum); + u64 len = (u64)plen * NSEC_PER_SEC; + + if (likely(rate)) + do_div(len, rate); + /* Since socket rate can change later, + * clamp the delay to 125 ms. 
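[Aside on the pacing gap computed in fq_dequeue() above: it is plain bytes-to-nanoseconds arithmetic, gap = packet_len * NSEC_PER_SEC / sk_pacing_rate, capped at 125 ms. A minimal userspace sketch of the same calculation with hypothetical numbers (a 1514-byte packet paced at 12.5 MB/s, i.e. 100 Mbit/s); this is not kernel code:

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC  1000000000ULL
    #define NSEC_PER_MSEC 1000000ULL

    int main(void)
    {
        uint32_t plen = 1514;          /* packet length in bytes (assumed) */
        uint32_t rate = 12500000;      /* pacing rate in bytes/sec (assumed, ~100 Mbit/s) */

        /* same formula as fq_dequeue(): len = plen * NSEC_PER_SEC, then divide by rate */
        uint64_t gap = (uint64_t)plen * NSEC_PER_SEC / rate;

        /* sch_fq clamps the gap to 125 ms, since the socket rate can change later */
        if (gap > 125 * NSEC_PER_MSEC)
            gap = 125 * NSEC_PER_MSEC;

        printf("inter-packet gap: %llu ns (~%llu us)\n",
               (unsigned long long)gap, (unsigned long long)(gap / 1000));
        return 0;
    }

For these values the gap works out to roughly 121 microseconds between packets of the same flow.]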
+ * TODO: maybe segment the too big skb, as in commit + * e43ac79a4bc ("sch_tbf: segment too big GSO packets") + */ + if (unlikely(len > 125 * NSEC_PER_MSEC)) { + len = 125 * NSEC_PER_MSEC; + q->stat_pkts_too_long++; + } + + f->time_next_packet = now + len; + } +out: + qdisc_bstats_update(sch, skb); + qdisc_unthrottled(sch); + return skb; +} + +static void fq_reset(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct rb_root *root; + struct sk_buff *skb; + struct rb_node *p; + struct fq_flow *f; + unsigned int idx; + + while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL) + kfree_skb(skb); + + if (!q->fq_root) + return; + + for (idx = 0; idx < (1U << q->fq_trees_log); idx++) { + root = &q->fq_root[idx]; + while ((p = rb_first(root)) != NULL) { + f = container_of(p, struct fq_flow, fq_node); + rb_erase(p, root); + + while ((skb = fq_dequeue_head(sch, f)) != NULL) + kfree_skb(skb); + + kmem_cache_free(fq_flow_cachep, f); + } + } + q->new_flows.first = NULL; + q->old_flows.first = NULL; + q->delayed = RB_ROOT; + q->flows = 0; + q->inactive_flows = 0; + q->throttled_flows = 0; +} + +static void fq_rehash(struct fq_sched_data *q, + struct rb_root *old_array, u32 old_log, + struct rb_root *new_array, u32 new_log) +{ + struct rb_node *op, **np, *parent; + struct rb_root *oroot, *nroot; + struct fq_flow *of, *nf; + int fcnt = 0; + u32 idx; + + for (idx = 0; idx < (1U << old_log); idx++) { + oroot = &old_array[idx]; + while ((op = rb_first(oroot)) != NULL) { + rb_erase(op, oroot); + of = container_of(op, struct fq_flow, fq_node); + if (fq_gc_candidate(of)) { + fcnt++; + kmem_cache_free(fq_flow_cachep, of); + continue; + } + nroot = &new_array[hash_32((u32)(long)of->sk, new_log)]; + + np = &nroot->rb_node; + parent = NULL; + while (*np) { + parent = *np; + + nf = container_of(parent, struct fq_flow, fq_node); + BUG_ON(nf->sk == of->sk); + + if (nf->sk > of->sk) + np = &parent->rb_right; + else + np = &parent->rb_left; + } + + rb_link_node(&of->fq_node, parent, np); + rb_insert_color(&of->fq_node, nroot); + } + } + q->flows -= fcnt; + q->inactive_flows -= fcnt; + q->stat_gc_flows += fcnt; +} + +static void *fq_alloc_node(size_t sz, int node) +{ + void *ptr; + + ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node); + if (!ptr) + ptr = vmalloc_node(sz, node); + return ptr; +} + +static void fq_free(void *addr) +{ + kvfree(addr); +} + +static int fq_resize(struct Qdisc *sch, u32 log) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct rb_root *array; + void *old_fq_root; + u32 idx; + + if (q->fq_root && log == q->fq_trees_log) + return 0; + + /* If XPS was setup, we can allocate memory on right NUMA node */ + array = fq_alloc_node(sizeof(struct rb_root) << log, + netdev_queue_numa_node_read(sch->dev_queue)); + if (!array) + return -ENOMEM; + + for (idx = 0; idx < (1U << log); idx++) + array[idx] = RB_ROOT; + + sch_tree_lock(sch); + + old_fq_root = q->fq_root; + if (old_fq_root) + fq_rehash(q, old_fq_root, q->fq_trees_log, array, log); + + q->fq_root = array; + q->fq_trees_log = log; + + sch_tree_unlock(sch); + + fq_free(old_fq_root); + + return 0; +} + +static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { + [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, + [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, + 
[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, +}; + +static int fq_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_FQ_MAX + 1]; + int err, drop_count = 0; + u32 fq_log; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + + fq_log = q->fq_trees_log; + + if (tb[TCA_FQ_BUCKETS_LOG]) { + u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]); + + if (nval >= 1 && nval <= ilog2(256*1024)) + fq_log = nval; + else + err = -EINVAL; + } + if (tb[TCA_FQ_PLIMIT]) + sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]); + + if (tb[TCA_FQ_FLOW_PLIMIT]) + q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]); + + if (tb[TCA_FQ_QUANTUM]) + q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); + + if (tb[TCA_FQ_INITIAL_QUANTUM]) + q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); + + if (tb[TCA_FQ_FLOW_DEFAULT_RATE]) + pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", + nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE])); + + if (tb[TCA_FQ_FLOW_MAX_RATE]) + q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); + + if (tb[TCA_FQ_RATE_ENABLE]) { + u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); + + if (enable <= 1) + q->rate_enable = enable; + else + err = -EINVAL; + } + + if (tb[TCA_FQ_FLOW_REFILL_DELAY]) { + u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ; + + q->flow_refill_delay = usecs_to_jiffies(usecs_delay); + } + + if (!err) { + sch_tree_unlock(sch); + err = fq_resize(sch, fq_log); + sch_tree_lock(sch); + } + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = fq_dequeue(sch); + + if (!skb) + break; + kfree_skb(skb); + drop_count++; + } + qdisc_tree_decrease_qlen(sch, drop_count); + + sch_tree_unlock(sch); + return err; +} + +static void fq_destroy(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + + fq_reset(sch); + fq_free(q->fq_root); + qdisc_watchdog_cancel(&q->watchdog); +} + +static int fq_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_sched_data *q = qdisc_priv(sch); + int err; + + sch->limit = 10000; + q->flow_plimit = 100; + q->quantum = 2 * psched_mtu(qdisc_dev(sch)); + q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); + q->flow_refill_delay = msecs_to_jiffies(40); + q->flow_max_rate = ~0U; + q->rate_enable = 1; + q->new_flows.first = NULL; + q->old_flows.first = NULL; + q->delayed = RB_ROOT; + q->fq_root = NULL; + q->fq_trees_log = ilog2(1024); + qdisc_watchdog_init(&q->watchdog, sch); + + if (opt) + err = fq_change(sch, opt); + else + err = fq_resize(sch, q->fq_trees_log); + + return err; +} + +static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ + + if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || + nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || + nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || + nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || + nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) || + nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, + jiffies_to_usecs(q->flow_refill_delay)) || + nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + return 
-1; +} + +static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct fq_sched_data *q = qdisc_priv(sch); + u64 now = ktime_to_ns(ktime_get()); + struct tc_fq_qd_stats st = { + .gc_flows = q->stat_gc_flows, + .highprio_packets = q->stat_internal_packets, + .tcp_retrans = q->stat_tcp_retrans, + .throttled = q->stat_throttled, + .flows_plimit = q->stat_flows_plimit, + .pkts_too_long = q->stat_pkts_too_long, + .allocation_errors = q->stat_allocation_errors, + .flows = q->flows, + .inactive_flows = q->inactive_flows, + .throttled_flows = q->throttled_flows, + .time_next_delayed_flow = q->time_next_delayed_flow - now, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc_ops fq_qdisc_ops __read_mostly = { + .id = "fq", + .priv_size = sizeof(struct fq_sched_data), + + .enqueue = fq_enqueue, + .dequeue = fq_dequeue, + .peek = qdisc_peek_dequeued, + .init = fq_init, + .reset = fq_reset, + .destroy = fq_destroy, + .change = fq_change, + .dump = fq_dump, + .dump_stats = fq_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init fq_module_init(void) +{ + int ret; + + fq_flow_cachep = kmem_cache_create("fq_flow_cache", + sizeof(struct fq_flow), + 0, 0, NULL); + if (!fq_flow_cachep) + return -ENOMEM; + + ret = register_qdisc(&fq_qdisc_ops); + if (ret) + kmem_cache_destroy(fq_flow_cachep); + return ret; +} + +static void __exit fq_module_exit(void) +{ + unregister_qdisc(&fq_qdisc_ops); + kmem_cache_destroy(fq_flow_cachep); +} + +module_init(fq_module_init) +module_exit(fq_module_exit) +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c new file mode 100644 index 00000000000..063b726bf1f --- /dev/null +++ b/net/sched/sch_fq_codel.c @@ -0,0 +1,620 @@ +/* + * Fair Queue CoDel discipline + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/flow_keys.h> +#include <net/codel.h> + +/* Fair Queue CoDel. + * + * Principles : + * Packets are classified (internal classifier or external) on flows. + * This is a Stochastic model (as we use a hash, several flows + * might be hashed on same slot) + * Each flow has a CoDel managed queue. + * Flows are linked onto two (Round Robin) lists, + * so that new flows have priority on old ones. + * + * For a given flow, packets are not reordered (CoDel uses a FIFO) + * head drops only. + * ECN capability is on by default. 
+ * Low memory footprint (64 bytes per flow) + */ + +struct fq_codel_flow { + struct sk_buff *head; + struct sk_buff *tail; + struct list_head flowchain; + int deficit; + u32 dropped; /* number of drops (or ECN marks) on this flow */ + struct codel_vars cvars; +}; /* please try to keep this structure <= 64 bytes */ + +struct fq_codel_sched_data { + struct tcf_proto *filter_list; /* optional external classifier */ + struct fq_codel_flow *flows; /* Flows table [flows_cnt] */ + u32 *backlogs; /* backlog table [flows_cnt] */ + u32 flows_cnt; /* number of flows */ + u32 perturbation; /* hash perturbation */ + u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ + struct codel_params cparams; + struct codel_stats cstats; + u32 drop_overlimit; + u32 new_flow_count; + + struct list_head new_flows; /* list of new flows */ + struct list_head old_flows; /* list of old flows */ +}; + +static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, + const struct sk_buff *skb) +{ + struct flow_keys keys; + unsigned int hash; + + skb_flow_dissect(skb, &keys); + hash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src ^ keys.ip_proto, + (__force u32)keys.ports, q->perturbation); + return ((u64)hash * q->flows_cnt) >> 32; +} + +static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct tcf_result res; + int result; + + if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->flows_cnt) + return TC_H_MIN(skb->priority); + + if (!q->filter_list) + return fq_codel_hash(q, skb) + 1; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return 0; + } +#endif + if (TC_H_MIN(res.classid) <= q->flows_cnt) + return TC_H_MIN(res.classid); + } + return 0; +} + +/* helper functions : might be changed when/if skb use a standard list_head */ + +/* remove one skb from head of slot queue */ +static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow) +{ + struct sk_buff *skb = flow->head; + + flow->head = skb->next; + skb->next = NULL; + return skb; +} + +/* add skb to flow queue (tail add) */ +static inline void flow_queue_add(struct fq_codel_flow *flow, + struct sk_buff *skb) +{ + if (flow->head == NULL) + flow->head = skb; + else + flow->tail->next = skb; + flow->tail = skb; + skb->next = NULL; +} + +static unsigned int fq_codel_drop(struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + unsigned int maxbacklog = 0, idx = 0, i, len; + struct fq_codel_flow *flow; + + /* Queue is full! Find the fat flow and drop packet from it. + * This might sound expensive, but with 1024 flows, we scan + * 4KB of memory, and we dont need to handle a complex tree + * in fast path (packet queue/enqueue) with many cache misses. 
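[Aside on fq_codel_hash() above: it maps the 32-bit jhash onto one of flows_cnt buckets with a multiply-and-shift, ((u64)hash * flows_cnt) >> 32, which keeps the top bits of the product and avoids a division on the fast path. A small standalone C sketch of the same mapping; the sample hash values below are made up:

    #include <stdio.h>
    #include <stdint.h>

    /* Map a 32-bit hash onto [0, buckets) without a division,
     * mirroring ((u64)hash * q->flows_cnt) >> 32 in fq_codel_hash(). */
    static uint32_t reduce(uint32_t hash, uint32_t buckets)
    {
        return (uint32_t)(((uint64_t)hash * buckets) >> 32);
    }

    int main(void)
    {
        uint32_t buckets = 1024;    /* fq_codel's default flows_cnt */
        uint32_t samples[] = { 0x00000000u, 0x7fffffffu, 0xdeadbeefu, 0xffffffffu };

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
            printf("hash %08x -> bucket %u\n", samples[i],
                   reduce(samples[i], buckets));
        return 0;
    }

With a power-of-two bucket count such as 1024 this is simply the top bits of the hash, so the perturbation key keeps the distribution unpredictable to outsiders.]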
+ */ + for (i = 0; i < q->flows_cnt; i++) { + if (q->backlogs[i] > maxbacklog) { + maxbacklog = q->backlogs[i]; + idx = i; + } + } + flow = &q->flows[idx]; + skb = dequeue_head(flow); + len = qdisc_pkt_len(skb); + q->backlogs[idx] -= len; + kfree_skb(skb); + sch->q.qlen--; + sch->qstats.drops++; + sch->qstats.backlog -= len; + flow->dropped++; + return idx; +} + +static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + unsigned int idx; + struct fq_codel_flow *flow; + int uninitialized_var(ret); + + idx = fq_codel_classify(skb, sch, &ret); + if (idx == 0) { + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } + idx--; + + codel_set_enqueue_time(skb); + flow = &q->flows[idx]; + flow_queue_add(flow, skb); + q->backlogs[idx] += qdisc_pkt_len(skb); + sch->qstats.backlog += qdisc_pkt_len(skb); + + if (list_empty(&flow->flowchain)) { + list_add_tail(&flow->flowchain, &q->new_flows); + q->new_flow_count++; + flow->deficit = q->quantum; + flow->dropped = 0; + } + if (++sch->q.qlen <= sch->limit) + return NET_XMIT_SUCCESS; + + q->drop_overlimit++; + /* Return Congestion Notification only if we dropped a packet + * from this flow. + */ + if (fq_codel_drop(sch) == idx) + return NET_XMIT_CN; + + /* As we dropped a packet, better let upper stack know this */ + qdisc_tree_decrease_qlen(sch, 1); + return NET_XMIT_SUCCESS; +} + +/* This is the specific function called from codel_dequeue() + * to dequeue a packet from queue. Note: backlog is handled in + * codel, we dont need to reduce it here. + */ +static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct fq_codel_flow *flow; + struct sk_buff *skb = NULL; + + flow = container_of(vars, struct fq_codel_flow, cvars); + if (flow->head) { + skb = dequeue_head(flow); + q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb); + sch->q.qlen--; + } + return skb; +} + +static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + struct fq_codel_flow *flow; + struct list_head *head; + u32 prev_drop_count, prev_ecn_mark; + +begin: + head = &q->new_flows; + if (list_empty(head)) { + head = &q->old_flows; + if (list_empty(head)) + return NULL; + } + flow = list_first_entry(head, struct fq_codel_flow, flowchain); + + if (flow->deficit <= 0) { + flow->deficit += q->quantum; + list_move_tail(&flow->flowchain, &q->old_flows); + goto begin; + } + + prev_drop_count = q->cstats.drop_count; + prev_ecn_mark = q->cstats.ecn_mark; + + skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats, + dequeue); + + flow->dropped += q->cstats.drop_count - prev_drop_count; + flow->dropped += q->cstats.ecn_mark - prev_ecn_mark; + + if (!skb) { + /* force a pass through old_flows to prevent starvation */ + if ((head == &q->new_flows) && !list_empty(&q->old_flows)) + list_move_tail(&flow->flowchain, &q->old_flows); + else + list_del_init(&flow->flowchain); + goto begin; + } + qdisc_bstats_update(sch, skb); + flow->deficit -= qdisc_pkt_len(skb); + /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, + * or HTB crashes. Defer it for next round. 
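[Aside on the deficit bookkeeping in fq_codel_dequeue() above: it is classic deficit round robin. An active flow starts with one quantum of credit, each dequeued packet is charged against that credit, and once the credit goes non-positive the flow is re-credited one quantum and rotated to the tail of old_flows until it is positive again. A back-of-the-envelope model in plain C, where the quantum and packet size are assumed values:

    #include <stdio.h>

    int main(void)
    {
        int quantum = 1514;     /* one MTU worth of credit per rotation (assumed) */
        int deficit = quantum;  /* credit granted when the flow became active */
        int pkt = 9000;         /* a large GSO packet, hypothetical */
        int rounds = 0;

        deficit -= pkt;         /* fq_codel_dequeue(): deficit -= qdisc_pkt_len(skb) */
        while (deficit <= 0) {  /* flow is rotated and re-credited until positive */
            deficit += quantum;
            rounds++;
        }
        printf("flow sits out %d rotations, then resumes with %d bytes of credit\n",
               rounds, deficit);
        return 0;
    }

With a 1514-byte quantum, a single 9000-byte packet leaves the flow about 7.5 KB in debt, so it sits out five rotations before it may transmit again, which is how thin flows keep their latency advantage over bulk ones.]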
+ */ + if (q->cstats.drop_count && sch->q.qlen) { + qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); + q->cstats.drop_count = 0; + } + return skb; +} + +static void fq_codel_reset(struct Qdisc *sch) +{ + struct sk_buff *skb; + + while ((skb = fq_codel_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { + [TCA_FQ_CODEL_TARGET] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_LIMIT] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_INTERVAL] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 }, +}; + +static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy); + if (err < 0) + return err; + if (tb[TCA_FQ_CODEL_FLOWS]) { + if (q->flows) + return -EINVAL; + q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]); + if (!q->flows_cnt || + q->flows_cnt > 65536) + return -EINVAL; + } + sch_tree_lock(sch); + + if (tb[TCA_FQ_CODEL_TARGET]) { + u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]); + + q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_FQ_CODEL_INTERVAL]) { + u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); + + q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_FQ_CODEL_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]); + + if (tb[TCA_FQ_CODEL_ECN]) + q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]); + + if (tb[TCA_FQ_CODEL_QUANTUM]) + q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); + + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = fq_codel_dequeue(sch); + + kfree_skb(skb); + q->cstats.drop_count++; + } + qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); + q->cstats.drop_count = 0; + + sch_tree_unlock(sch); + return 0; +} + +static void *fq_codel_zalloc(size_t sz) +{ + void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); + + if (!ptr) + ptr = vzalloc(sz); + return ptr; +} + +static void fq_codel_free(void *addr) +{ + kvfree(addr); +} + +static void fq_codel_destroy(struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + fq_codel_free(q->backlogs); + fq_codel_free(q->flows); +} + +static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + int i; + + sch->limit = 10*1024; + q->flows_cnt = 1024; + q->quantum = psched_mtu(qdisc_dev(sch)); + q->perturbation = prandom_u32(); + INIT_LIST_HEAD(&q->new_flows); + INIT_LIST_HEAD(&q->old_flows); + codel_params_init(&q->cparams); + codel_stats_init(&q->cstats); + q->cparams.ecn = true; + + if (opt) { + int err = fq_codel_change(sch, opt); + if (err) + return err; + } + + if (!q->flows) { + q->flows = fq_codel_zalloc(q->flows_cnt * + sizeof(struct fq_codel_flow)); + if (!q->flows) + return -ENOMEM; + q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32)); + if (!q->backlogs) { + fq_codel_free(q->flows); + return -ENOMEM; + } + for (i = 0; i < q->flows_cnt; i++) { + struct fq_codel_flow *flow = q->flows + i; + + INIT_LIST_HEAD(&flow->flowchain); + codel_vars_init(&flow->cvars); + } + } + if (sch->limit >= 1) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; + return 0; +} + +static int fq_codel_dump(struct Qdisc *sch, struct sk_buff 
*skb) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET, + codel_time_to_us(q->cparams.target)) || + nla_put_u32(skb, TCA_FQ_CODEL_LIMIT, + sch->limit) || + nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL, + codel_time_to_us(q->cparams.interval)) || + nla_put_u32(skb, TCA_FQ_CODEL_ECN, + q->cparams.ecn) || + nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM, + q->quantum) || + nla_put_u32(skb, TCA_FQ_CODEL_FLOWS, + q->flows_cnt)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + return -1; +} + +static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct tc_fq_codel_xstats st = { + .type = TCA_FQ_CODEL_XSTATS_QDISC, + }; + struct list_head *pos; + + st.qdisc_stats.maxpacket = q->cstats.maxpacket; + st.qdisc_stats.drop_overlimit = q->drop_overlimit; + st.qdisc_stats.ecn_mark = q->cstats.ecn_mark; + st.qdisc_stats.new_flow_count = q->new_flow_count; + + list_for_each(pos, &q->new_flows) + st.qdisc_stats.new_flows_len++; + + list_for_each(pos, &q->old_flows) + st.qdisc_stats.old_flows_len++; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + /* we cannot bypass queue discipline anymore */ + sch->flags &= ~TCQ_F_CAN_BYPASS; + return 0; +} + +static void fq_codel_put(struct Qdisc *q, unsigned long cl) +{ +} + +static struct tcf_proto **fq_codel_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + u32 idx = cl - 1; + struct gnet_stats_queue qs = { 0 }; + struct tc_fq_codel_xstats xstats; + + if (idx < q->flows_cnt) { + const struct fq_codel_flow *flow = &q->flows[idx]; + const struct sk_buff *skb = flow->head; + + memset(&xstats, 0, sizeof(xstats)); + xstats.type = TCA_FQ_CODEL_XSTATS_CLASS; + xstats.class_stats.deficit = flow->deficit; + xstats.class_stats.ldelay = + codel_time_to_us(flow->cvars.ldelay); + xstats.class_stats.count = flow->cvars.count; + xstats.class_stats.lastcount = flow->cvars.lastcount; + xstats.class_stats.dropping = flow->cvars.dropping; + if (flow->cvars.dropping) { + codel_tdiff_t delta = flow->cvars.drop_next - + codel_get_time(); + + xstats.class_stats.drop_next = (delta >= 0) ? 
+ codel_time_to_us(delta) : + -codel_time_to_us(-delta); + } + while (skb) { + qs.qlen++; + skb = skb->next; + } + qs.backlog = q->backlogs[idx]; + qs.drops = flow->dropped; + } + if (gnet_stats_copy_queue(d, &qs) < 0) + return -1; + if (idx < q->flows_cnt) + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); + return 0; +} + +static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < q->flows_cnt; i++) { + if (list_empty(&q->flows[i].flowchain) || + arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops fq_codel_class_ops = { + .leaf = fq_codel_leaf, + .get = fq_codel_get, + .put = fq_codel_put, + .tcf_chain = fq_codel_find_tcf, + .bind_tcf = fq_codel_bind, + .unbind_tcf = fq_codel_put, + .dump = fq_codel_dump_class, + .dump_stats = fq_codel_dump_class_stats, + .walk = fq_codel_walk, +}; + +static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { + .cl_ops = &fq_codel_class_ops, + .id = "fq_codel", + .priv_size = sizeof(struct fq_codel_sched_data), + .enqueue = fq_codel_enqueue, + .dequeue = fq_codel_dequeue, + .peek = qdisc_peek_dequeued, + .drop = fq_codel_drop, + .init = fq_codel_init, + .reset = fq_codel_reset, + .destroy = fq_codel_destroy, + .change = fq_codel_change, + .dump = fq_codel_dump, + .dump_stats = fq_codel_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init fq_codel_module_init(void) +{ + return register_qdisc(&fq_codel_qdisc_ops); +} + +static void __exit fq_codel_module_exit(void) +{ + unregister_qdisc(&fq_codel_qdisc_ops); +} + +module_init(fq_codel_module_init) +module_exit(fq_codel_module_exit) +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 99ceb91f015..e1543b03e39 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -11,309 +11,402 @@ * - Ingress support */ -#include <asm/uaccess.h> -#include <asm/system.h> #include <linux/bitops.h> -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/rcupdate.h> #include <linux/list.h> -#include <net/sock.h> +#include <linux/slab.h> +#include <linux/if_vlan.h> +#include <net/sch_generic.h> #include <net/pkt_sched.h> +#include <net/dst.h> -/* Main transmission queue. */ - -/* Main qdisc structure lock. +/* Qdisc to use by default */ +const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops; +EXPORT_SYMBOL(default_qdisc_ops); - However, modifications - to data, participating in scheduling must be additionally - protected with dev->queue_lock spinlock. - - The idea is the following: - - enqueue, dequeue are serialized via top level device - spinlock dev->queue_lock. - - tree walking is protected by read_lock_bh(qdisc_tree_lock) - and this lock is used only in process context. - - updates to tree are made under rtnl semaphore or - from softirq context (__qdisc_destroy rcu-callback) - hence this lock needs local bh disabling. +/* Main transmission queue. 
*/ - qdisc_tree_lock must be grabbed BEFORE dev->queue_lock! +/* Modifications to data participating in scheduling must be protected with + * qdisc_lock(qdisc) spinlock. + * + * The idea is the following: + * - enqueue, dequeue are serialized via qdisc root lock + * - ingress filtering is also serialized via qdisc root lock + * - updates to tree and tree walking are only done under the rtnl mutex. */ -DEFINE_RWLOCK(qdisc_tree_lock); -void qdisc_lock_tree(struct net_device *dev) +static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { - write_lock_bh(&qdisc_tree_lock); - spin_lock_bh(&dev->queue_lock); + skb_dst_force(skb); + q->gso_skb = skb; + q->qstats.requeues++; + q->q.qlen++; /* it's still part of the queue */ + __netif_schedule(q); + + return 0; } -void qdisc_unlock_tree(struct net_device *dev) +static inline struct sk_buff *dequeue_skb(struct Qdisc *q) { - spin_unlock_bh(&dev->queue_lock); - write_unlock_bh(&qdisc_tree_lock); + struct sk_buff *skb = q->gso_skb; + const struct netdev_queue *txq = q->dev_queue; + + if (unlikely(skb)) { + /* check the reason of requeuing without tx lock first */ + txq = netdev_get_tx_queue(txq->dev, skb_get_queue_mapping(skb)); + if (!netif_xmit_frozen_or_stopped(txq)) { + q->gso_skb = NULL; + q->q.qlen--; + } else + skb = NULL; + } else { + if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) + skb = q->dequeue(q); + } + + return skb; } -/* - dev->queue_lock serializes queue accesses for this device - AND dev->qdisc pointer itself. +static inline int handle_dev_cpu_collision(struct sk_buff *skb, + struct netdev_queue *dev_queue, + struct Qdisc *q) +{ + int ret; - dev->xmit_lock serializes accesses to device driver. + if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) { + /* + * Same CPU holding the lock. It may be a transient + * configuration error, when hard_start_xmit() recurses. We + * detect it by checking xmit owner and drop the packet when + * deadloop is detected. Return OK to try the next skb. + */ + kfree_skb(skb); + net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n", + dev_queue->dev->name); + ret = qdisc_qlen(q); + } else { + /* + * Another cpu is holding lock, requeue & delay xmits for + * some time. + */ + __this_cpu_inc(softnet_data.cpu_collision); + ret = dev_requeue_skb(skb, q); + } + + return ret; +} - dev->queue_lock and dev->xmit_lock are mutually exclusive, - if one is grabbed, another must be free. +/* + * Transmit one skb, and handle the return status as required. Holding the + * __QDISC_STATE_RUNNING bit guarantees that only one CPU can execute this + * function. + * + * Returns to the caller: + * 0 - queue is empty or throttled. + * >0 - queue is not empty. */ +int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, + struct net_device *dev, struct netdev_queue *txq, + spinlock_t *root_lock) +{ + int ret = NETDEV_TX_BUSY; + + /* And release qdisc */ + spin_unlock(root_lock); + + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_xmit_frozen_or_stopped(txq)) + ret = dev_hard_start_xmit(skb, dev, txq); + HARD_TX_UNLOCK(dev, txq); -/* Kick device. - Note, that this procedure can be called by a watchdog timer, so that - we do not check dev->tbusy flag here. 
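+	/* Reacquire the qdisc root lock that was released above around the
+	 * driver transmit before acting on the driver's return status.
+	 */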
+ spin_lock(root_lock); + + if (dev_xmit_complete(ret)) { + /* Driver sent out skb successfully or skb was consumed */ + ret = qdisc_qlen(q); + } else if (ret == NETDEV_TX_LOCKED) { + /* Driver try lock failed */ + ret = handle_dev_cpu_collision(skb, txq, q); + } else { + /* Driver returned NETDEV_TX_BUSY - requeue skb */ + if (unlikely(ret != NETDEV_TX_BUSY)) + net_warn_ratelimited("BUG %s code %d qlen %d\n", + dev->name, ret, q->q.qlen); + + ret = dev_requeue_skb(skb, q); + } - Returns: 0 - queue is empty. - >0 - queue is not empty, but throttled. - <0 - queue is not empty. Device is throttled, if dev->tbusy != 0. + if (ret && netif_xmit_frozen_or_stopped(txq)) + ret = 0; - NOTE: Called under dev->queue_lock with locally disabled BH. -*/ + return ret; +} -int qdisc_restart(struct net_device *dev) +/* + * NOTE: Called under qdisc_lock(q) with locally disabled BH. + * + * __QDISC_STATE_RUNNING guarantees only one CPU can process + * this qdisc at a time. qdisc_lock(q) serializes queue accesses for + * this queue. + * + * netif_tx_lock serializes accesses to device driver. + * + * qdisc_lock(q) and netif_tx_lock are mutually exclusive, + * if one is grabbed, another must be free. + * + * Note, that this procedure can be called by a watchdog timer + * + * Returns to the caller: + * 0 - queue is empty or throttled. + * >0 - queue is not empty. + * + */ +static inline int qdisc_restart(struct Qdisc *q) { - struct Qdisc *q = dev->qdisc; + struct netdev_queue *txq; + struct net_device *dev; + spinlock_t *root_lock; struct sk_buff *skb; /* Dequeue packet */ - if ((skb = q->dequeue(q)) != NULL) { - unsigned nolock = (dev->features & NETIF_F_LLTX); + skb = dequeue_skb(q); + if (unlikely(!skb)) + return 0; + WARN_ON_ONCE(skb_dst_is_noref(skb)); + root_lock = qdisc_lock(q); + dev = qdisc_dev(q); + txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + + return sch_direct_xmit(skb, q, dev, txq, root_lock); +} + +void __qdisc_run(struct Qdisc *q) +{ + int quota = weight_p; + + while (qdisc_restart(q)) { /* - * When the driver has LLTX set it does its own locking - * in start_xmit. No need to add additional overhead by - * locking again. These checks are worth it because - * even uncongested locks can be quite expensive. - * The driver can do trylock like here too, in case - * of lock congestion it should return -1 and the packet - * will be requeued. + * Ordered by possible occurrence: Postpone processing if + * 1. we've exceeded packet quota + * 2. another process needs the CPU; */ - if (!nolock) { - if (!spin_trylock(&dev->xmit_lock)) { - collision: - /* So, someone grabbed the driver. */ - - /* It may be transient configuration error, - when hard_start_xmit() recurses. We detect - it by checking xmit owner and drop the - packet when deadloop is detected. - */ - if (dev->xmit_lock_owner == smp_processor_id()) { - kfree_skb(skb); - if (net_ratelimit()) - printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name); - return -1; - } - __get_cpu_var(netdev_rx_stat).cpu_collision++; - goto requeue; - } - /* Remember that the driver is grabbed by us. 
*/ - dev->xmit_lock_owner = smp_processor_id(); - } - - { - /* And release queue */ - spin_unlock(&dev->queue_lock); - - if (!netif_queue_stopped(dev)) { - int ret; - if (netdev_nit) - dev_queue_xmit_nit(skb, dev); - - ret = dev->hard_start_xmit(skb, dev); - if (ret == NETDEV_TX_OK) { - if (!nolock) { - dev->xmit_lock_owner = -1; - spin_unlock(&dev->xmit_lock); - } - spin_lock(&dev->queue_lock); - return -1; - } - if (ret == NETDEV_TX_LOCKED && nolock) { - spin_lock(&dev->queue_lock); - goto collision; - } - } - - /* NETDEV_TX_BUSY - we need to requeue */ - /* Release the driver */ - if (!nolock) { - dev->xmit_lock_owner = -1; - spin_unlock(&dev->xmit_lock); - } - spin_lock(&dev->queue_lock); - q = dev->qdisc; + if (--quota <= 0 || need_resched()) { + __netif_schedule(q); + break; } + } - /* Device kicked us out :( - This is possible in three cases: - - 0. driver is locked - 1. fastroute is enabled - 2. device cannot determine busy state - before start of transmission (f.e. dialout) - 3. device is buggy (ppp) - */ + qdisc_run_end(q); +} -requeue: - q->ops->requeue(skb, q); - netif_schedule(dev); - return 1; +unsigned long dev_trans_start(struct net_device *dev) +{ + unsigned long val, res; + unsigned int i; + + if (is_vlan_dev(dev)) + dev = vlan_dev_real_dev(dev); + res = dev->trans_start; + for (i = 0; i < dev->num_tx_queues; i++) { + val = netdev_get_tx_queue(dev, i)->trans_start; + if (val && time_after(val, res)) + res = val; } - BUG_ON((int) q->q.qlen < 0); - return q->q.qlen; + dev->trans_start = res; + + return res; } +EXPORT_SYMBOL(dev_trans_start); static void dev_watchdog(unsigned long arg) { struct net_device *dev = (struct net_device *)arg; - spin_lock(&dev->xmit_lock); - if (dev->qdisc != &noop_qdisc) { + netif_tx_lock(dev); + if (!qdisc_tx_is_noop(dev)) { if (netif_device_present(dev) && netif_running(dev) && netif_carrier_ok(dev)) { - if (netif_queue_stopped(dev) && - (jiffies - dev->trans_start) > dev->watchdog_timeo) { - printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n", dev->name); - dev->tx_timeout(dev); + int some_queue_timedout = 0; + unsigned int i; + unsigned long trans_start; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq; + + txq = netdev_get_tx_queue(dev, i); + /* + * old device drivers set dev->trans_start + */ + trans_start = txq->trans_start ? 
: dev->trans_start; + if (netif_xmit_stopped(txq) && + time_after(jiffies, (trans_start + + dev->watchdog_timeo))) { + some_queue_timedout = 1; + txq->trans_timeout++; + break; + } + } + + if (some_queue_timedout) { + WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n", + dev->name, netdev_drivername(dev), i); + dev->netdev_ops->ndo_tx_timeout(dev); } - if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo)) + if (!mod_timer(&dev->watchdog_timer, + round_jiffies(jiffies + + dev->watchdog_timeo))) dev_hold(dev); } } - spin_unlock(&dev->xmit_lock); + netif_tx_unlock(dev); dev_put(dev); } -static void dev_watchdog_init(struct net_device *dev) -{ - init_timer(&dev->watchdog_timer); - dev->watchdog_timer.data = (unsigned long)dev; - dev->watchdog_timer.function = dev_watchdog; -} - void __netdev_watchdog_up(struct net_device *dev) { - if (dev->tx_timeout) { + if (dev->netdev_ops->ndo_tx_timeout) { if (dev->watchdog_timeo <= 0) dev->watchdog_timeo = 5*HZ; - if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo)) + if (!mod_timer(&dev->watchdog_timer, + round_jiffies(jiffies + dev->watchdog_timeo))) dev_hold(dev); } } static void dev_watchdog_up(struct net_device *dev) { - spin_lock_bh(&dev->xmit_lock); __netdev_watchdog_up(dev); - spin_unlock_bh(&dev->xmit_lock); } static void dev_watchdog_down(struct net_device *dev) { - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); if (del_timer(&dev->watchdog_timer)) - __dev_put(dev); - spin_unlock_bh(&dev->xmit_lock); + dev_put(dev); + netif_tx_unlock_bh(dev); } +/** + * netif_carrier_on - set carrier + * @dev: network device + * + * Device has detected that carrier. + */ void netif_carrier_on(struct net_device *dev) { - if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) + if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) { + if (dev->reg_state == NETREG_UNINITIALIZED) + return; + atomic_inc(&dev->carrier_changes); linkwatch_fire_event(dev); - if (netif_running(dev)) - __netdev_watchdog_up(dev); + if (netif_running(dev)) + __netdev_watchdog_up(dev); + } } +EXPORT_SYMBOL(netif_carrier_on); +/** + * netif_carrier_off - clear carrier + * @dev: network device + * + * Device has detected loss of carrier. + */ void netif_carrier_off(struct net_device *dev) { - if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) + if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) { + if (dev->reg_state == NETREG_UNINITIALIZED) + return; + atomic_inc(&dev->carrier_changes); linkwatch_fire_event(dev); + } } +EXPORT_SYMBOL(netif_carrier_off); /* "NOOP" scheduler: the best scheduler, recommended for all interfaces under all circumstances. It is difficult to invent anything faster or cheaper. */ -static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc) +static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc) { kfree_skb(skb); return NET_XMIT_CN; } -static struct sk_buff *noop_dequeue(struct Qdisc * qdisc) +static struct sk_buff *noop_dequeue(struct Qdisc *qdisc) { return NULL; } -static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc) -{ - if (net_ratelimit()) - printk(KERN_DEBUG "%s deferred output. 
It is buggy.\n", - skb->dev->name); - kfree_skb(skb); - return NET_XMIT_CN; -} - -struct Qdisc_ops noop_qdisc_ops = { +struct Qdisc_ops noop_qdisc_ops __read_mostly = { .id = "noop", .priv_size = 0, .enqueue = noop_enqueue, .dequeue = noop_dequeue, - .requeue = noop_requeue, + .peek = noop_dequeue, .owner = THIS_MODULE, }; +static struct netdev_queue noop_netdev_queue = { + .qdisc = &noop_qdisc, + .qdisc_sleeping = &noop_qdisc, +}; + struct Qdisc noop_qdisc = { .enqueue = noop_enqueue, .dequeue = noop_dequeue, .flags = TCQ_F_BUILTIN, - .ops = &noop_qdisc_ops, + .ops = &noop_qdisc_ops, .list = LIST_HEAD_INIT(noop_qdisc.list), + .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), + .dev_queue = &noop_netdev_queue, + .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), }; +EXPORT_SYMBOL(noop_qdisc); -static struct Qdisc_ops noqueue_qdisc_ops = { +static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { .id = "noqueue", .priv_size = 0, .enqueue = noop_enqueue, .dequeue = noop_dequeue, - .requeue = noop_requeue, + .peek = noop_dequeue, .owner = THIS_MODULE, }; +static struct Qdisc noqueue_qdisc; +static struct netdev_queue noqueue_netdev_queue = { + .qdisc = &noqueue_qdisc, + .qdisc_sleeping = &noqueue_qdisc, +}; + static struct Qdisc noqueue_qdisc = { .enqueue = NULL, .dequeue = noop_dequeue, .flags = TCQ_F_BUILTIN, .ops = &noqueue_qdisc_ops, .list = LIST_HEAD_INIT(noqueue_qdisc.list), + .q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock), + .dev_queue = &noqueue_netdev_queue, + .busylock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock), }; -static const u8 prio2band[TC_PRIO_MAX+1] = - { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; +static const u8 prio2band[TC_PRIO_MAX + 1] = { + 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 +}; /* 3-band FIFO queue: old style, but should be a bit faster than generic prio+fifo combination. @@ -321,18 +414,38 @@ static const u8 prio2band[TC_PRIO_MAX+1] = #define PFIFO_FAST_BANDS 3 -static inline struct sk_buff_head *prio2list(struct sk_buff *skb, - struct Qdisc *qdisc) +/* + * Private data for a pfifo_fast scheduler containing: + * - queues for the three band + * - bitmap indicating which of the bands contain skbs + */ +struct pfifo_fast_priv { + u32 bitmap; + struct sk_buff_head q[PFIFO_FAST_BANDS]; +}; + +/* + * Convert a bitmap to the first band number where an skb is queued, where: + * bitmap=0 means there are no skbs on any band. + * bitmap=1 means there is an skb on band 0. + * bitmap=7 means there are skbs on all 3 bands, etc. 
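+ * bitmap=6 means bands 1 and 2 hold skbs, and bitmap2band[6] below gives
+ *   band 1, the lowest-numbered (and therefore highest priority) busy band.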
+ */ +static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; + +static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, + int band) { - struct sk_buff_head *list = qdisc_priv(qdisc); - return list + prio2band[skb->priority & TC_PRIO_MAX]; + return priv->q + band; } -static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) +static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc) { - struct sk_buff_head *list = prio2list(skb, qdisc); + if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { + int band = prio2band[skb->priority & TC_PRIO_MAX]; + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + struct sk_buff_head *list = band2list(priv, band); - if (skb_queue_len(list) < qdisc->dev->tx_queue_len) { + priv->bitmap |= (1 << band); qdisc->q.qlen++; return __qdisc_enqueue_tail(skb, qdisc, list); } @@ -340,35 +453,48 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) return qdisc_drop(skb, qdisc); } -static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc) +static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) { - int prio; - struct sk_buff_head *list = qdisc_priv(qdisc); + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + int band = bitmap2band[priv->bitmap]; - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { - if (!skb_queue_empty(list + prio)) { - qdisc->q.qlen--; - return __qdisc_dequeue_head(qdisc, list + prio); - } + if (likely(band >= 0)) { + struct sk_buff_head *list = band2list(priv, band); + struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list); + + qdisc->q.qlen--; + if (skb_queue_empty(list)) + priv->bitmap &= ~(1 << band); + + return skb; } return NULL; } -static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc) +static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) { - qdisc->q.qlen++; - return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc)); + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + int band = bitmap2band[priv->bitmap]; + + if (band >= 0) { + struct sk_buff_head *list = band2list(priv, band); + + return skb_peek(list); + } + + return NULL; } -static void pfifo_fast_reset(struct Qdisc* qdisc) +static void pfifo_fast_reset(struct Qdisc *qdisc) { int prio; - struct sk_buff_head *list = qdisc_priv(qdisc); + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __qdisc_reset_queue(qdisc, list + prio); + __qdisc_reset_queue(qdisc, band2list(priv, prio)); + priv->bitmap = 0; qdisc->qstats.backlog = 0; qdisc->q.qlen = 0; } @@ -377,77 +503,99 @@ static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) { struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; - memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1); - RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1); + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; -rtattr_failure: +nla_put_failure: return -1; } -static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt) +static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) { int prio; - struct sk_buff_head *list = qdisc_priv(qdisc); + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - skb_queue_head_init(list + prio); + skb_queue_head_init(band2list(priv, prio)); + /* Can by-pass the queue discipline */ + qdisc->flags |= TCQ_F_CAN_BYPASS; return 0; } -static struct Qdisc_ops pfifo_fast_ops = { +struct Qdisc_ops pfifo_fast_ops 
__read_mostly = { .id = "pfifo_fast", - .priv_size = PFIFO_FAST_BANDS * sizeof(struct sk_buff_head), + .priv_size = sizeof(struct pfifo_fast_priv), .enqueue = pfifo_fast_enqueue, .dequeue = pfifo_fast_dequeue, - .requeue = pfifo_fast_requeue, + .peek = pfifo_fast_peek, .init = pfifo_fast_init, .reset = pfifo_fast_reset, .dump = pfifo_fast_dump, .owner = THIS_MODULE, }; -struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops) +static struct lock_class_key qdisc_tx_busylock; + +struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, + const struct Qdisc_ops *ops) { void *p; struct Qdisc *sch; - unsigned int size; + unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size; int err = -ENOBUFS; + struct net_device *dev = dev_queue->dev; - /* ensure that the Qdisc and the private data are 32-byte aligned */ - size = QDISC_ALIGN(sizeof(*sch)); - size += ops->priv_size + (QDISC_ALIGNTO - 1); + p = kzalloc_node(size, GFP_KERNEL, + netdev_queue_numa_node_read(dev_queue)); - p = kmalloc(size, GFP_KERNEL); if (!p) goto errout; - memset(p, 0, size); sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); - sch->padded = (char *) sch - (char *) p; - + /* if we got non aligned memory, ask more and do alignment ourself */ + if (sch != p) { + kfree(p); + p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL, + netdev_queue_numa_node_read(dev_queue)); + if (!p) + goto errout; + sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); + sch->padded = (char *) sch - (char *) p; + } INIT_LIST_HEAD(&sch->list); skb_queue_head_init(&sch->q); + + spin_lock_init(&sch->busylock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + sch->ops = ops; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; - sch->dev = dev; + sch->dev_queue = dev_queue; dev_hold(dev); - sch->stats_lock = &dev->queue_lock; atomic_set(&sch->refcnt, 1); return sch; errout: - return ERR_PTR(-err); + return ERR_PTR(err); } -struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) +struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, + const struct Qdisc_ops *ops, + unsigned int parentid) { struct Qdisc *sch; - - sch = qdisc_alloc(dev, ops); + + if (!try_module_get(ops->owner)) + goto errout; + + sch = qdisc_alloc(dev_queue, ops); if (IS_ERR(sch)) goto errout; + sch->parent = parentid; if (!ops->init || ops->init(sch, NULL) == 0) return sch; @@ -456,172 +604,345 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) errout: return NULL; } +EXPORT_SYMBOL(qdisc_create_dflt); -/* Under dev->queue_lock and BH! */ +/* Under qdisc_lock(qdisc) and BH! 
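+ * Besides calling ops->reset(), this also frees any deferred gso_skb and
+ * zeroes q.qlen (see below).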
*/ void qdisc_reset(struct Qdisc *qdisc) { - struct Qdisc_ops *ops = qdisc->ops; + const struct Qdisc_ops *ops = qdisc->ops; if (ops->reset) ops->reset(qdisc); + + if (qdisc->gso_skb) { + kfree_skb(qdisc->gso_skb); + qdisc->gso_skb = NULL; + qdisc->q.qlen = 0; + } } +EXPORT_SYMBOL(qdisc_reset); -/* this is the rcu callback function to clean up a qdisc when there - * are no further references to it */ +static void qdisc_rcu_free(struct rcu_head *head) +{ + struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head); -static void __qdisc_destroy(struct rcu_head *head) + kfree((char *) qdisc - qdisc->padded); +} + +void qdisc_destroy(struct Qdisc *qdisc) { - struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu); - struct Qdisc_ops *ops = qdisc->ops; + const struct Qdisc_ops *ops = qdisc->ops; -#ifdef CONFIG_NET_ESTIMATOR - gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); + if (qdisc->flags & TCQ_F_BUILTIN || + !atomic_dec_and_test(&qdisc->refcnt)) + return; + +#ifdef CONFIG_NET_SCHED + qdisc_list_del(qdisc); + + qdisc_put_stab(rtnl_dereference(qdisc->stab)); #endif - write_lock(&qdisc_tree_lock); + gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); if (ops->reset) ops->reset(qdisc); if (ops->destroy) ops->destroy(qdisc); - write_unlock(&qdisc_tree_lock); + module_put(ops->owner); + dev_put(qdisc_dev(qdisc)); - dev_put(qdisc->dev); - kfree((char *) qdisc - qdisc->padded); + kfree_skb(qdisc->gso_skb); + /* + * gen_estimator est_timer() might access qdisc->q.lock, + * wait a RCU grace period before freeing qdisc. + */ + call_rcu(&qdisc->rcu_head, qdisc_rcu_free); } +EXPORT_SYMBOL(qdisc_destroy); -/* Under dev->queue_lock and BH! */ +/* Attach toplevel qdisc to device queue. */ +struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, + struct Qdisc *qdisc) +{ + struct Qdisc *oqdisc = dev_queue->qdisc_sleeping; + spinlock_t *root_lock; -void qdisc_destroy(struct Qdisc *qdisc) + root_lock = qdisc_lock(oqdisc); + spin_lock_bh(root_lock); + + /* Prune old scheduler */ + if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) + qdisc_reset(oqdisc); + + /* ... 
and graft new one */ + if (qdisc == NULL) + qdisc = &noop_qdisc; + dev_queue->qdisc_sleeping = qdisc; + rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc); + + spin_unlock_bh(root_lock); + + return oqdisc; +} +EXPORT_SYMBOL(dev_graft_qdisc); + +static void attach_one_default_qdisc(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_unused) { - struct list_head cql = LIST_HEAD_INIT(cql); - struct Qdisc *cq, *q, *n; + struct Qdisc *qdisc = &noqueue_qdisc; + + if (dev->tx_queue_len) { + qdisc = qdisc_create_dflt(dev_queue, + default_qdisc_ops, TC_H_ROOT); + if (!qdisc) { + netdev_info(dev, "activation failed\n"); + return; + } + if (!netif_is_multiqueue(dev)) + qdisc->flags |= TCQ_F_ONETXQUEUE; + } + dev_queue->qdisc_sleeping = qdisc; +} - if (qdisc->flags & TCQ_F_BUILTIN || - !atomic_dec_and_test(&qdisc->refcnt)) - return; +static void attach_default_qdiscs(struct net_device *dev) +{ + struct netdev_queue *txq; + struct Qdisc *qdisc; - if (!list_empty(&qdisc->list)) { - if (qdisc->ops->cl_ops == NULL) - list_del(&qdisc->list); - else - list_move(&qdisc->list, &cql); + txq = netdev_get_tx_queue(dev, 0); + + if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) { + netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); + dev->qdisc = txq->qdisc_sleeping; + atomic_inc(&dev->qdisc->refcnt); + } else { + qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT); + if (qdisc) { + dev->qdisc = qdisc; + qdisc->ops->attach(qdisc); + } } +} - /* unlink inner qdiscs from dev->qdisc_list immediately */ - list_for_each_entry(cq, &cql, list) - list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list) - if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) { - if (q->ops->cl_ops == NULL) - list_del_init(&q->list); - else - list_move_tail(&q->list, &cql); - } - list_for_each_entry_safe(cq, n, &cql, list) - list_del_init(&cq->list); +static void transition_one_qdisc(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_need_watchdog) +{ + struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping; + int *need_watchdog_p = _need_watchdog; + + if (!(new_qdisc->flags & TCQ_F_BUILTIN)) + clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state); - call_rcu(&qdisc->q_rcu, __qdisc_destroy); + rcu_assign_pointer(dev_queue->qdisc, new_qdisc); + if (need_watchdog_p && new_qdisc != &noqueue_qdisc) { + dev_queue->trans_start = 0; + *need_watchdog_p = 1; + } } void dev_activate(struct net_device *dev) { + int need_watchdog; + /* No queueing discipline is attached to device; - create default one i.e. 
pfifo_fast for devices, - which need queueing and noqueue_qdisc for - virtual interfaces + * create default one for devices, which need queueing + * and noqueue_qdisc for virtual interfaces */ - if (dev->qdisc_sleeping == &noop_qdisc) { - struct Qdisc *qdisc; - if (dev->tx_queue_len) { - qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops); - if (qdisc == NULL) { - printk(KERN_INFO "%s: activation failed\n", dev->name); - return; - } - write_lock_bh(&qdisc_tree_lock); - list_add_tail(&qdisc->list, &dev->qdisc_list); - write_unlock_bh(&qdisc_tree_lock); - } else { - qdisc = &noqueue_qdisc; - } - write_lock_bh(&qdisc_tree_lock); - dev->qdisc_sleeping = qdisc; - write_unlock_bh(&qdisc_tree_lock); - } + if (dev->qdisc == &noop_qdisc) + attach_default_qdiscs(dev); if (!netif_carrier_ok(dev)) /* Delay activation until next carrier-on event */ return; - spin_lock_bh(&dev->queue_lock); - rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping); - if (dev->qdisc != &noqueue_qdisc) { + need_watchdog = 0; + netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog); + if (dev_ingress_queue(dev)) + transition_one_qdisc(dev, dev_ingress_queue(dev), NULL); + + if (need_watchdog) { dev->trans_start = jiffies; dev_watchdog_up(dev); } - spin_unlock_bh(&dev->queue_lock); } +EXPORT_SYMBOL(dev_activate); -void dev_deactivate(struct net_device *dev) +static void dev_deactivate_queue(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_qdisc_default) { + struct Qdisc *qdisc_default = _qdisc_default; struct Qdisc *qdisc; - spin_lock_bh(&dev->queue_lock); - qdisc = dev->qdisc; - dev->qdisc = &noop_qdisc; + qdisc = dev_queue->qdisc; + if (qdisc) { + spin_lock_bh(qdisc_lock(qdisc)); + + if (!(qdisc->flags & TCQ_F_BUILTIN)) + set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); - qdisc_reset(qdisc); + rcu_assign_pointer(dev_queue->qdisc, qdisc_default); + qdisc_reset(qdisc); - spin_unlock_bh(&dev->queue_lock); + spin_unlock_bh(qdisc_lock(qdisc)); + } +} + +static bool some_qdisc_is_busy(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *dev_queue; + spinlock_t *root_lock; + struct Qdisc *q; + int val; + + dev_queue = netdev_get_tx_queue(dev, i); + q = dev_queue->qdisc_sleeping; + root_lock = qdisc_lock(q); + + spin_lock_bh(root_lock); - dev_watchdog_down(dev); + val = (qdisc_is_running(q) || + test_bit(__QDISC_STATE_SCHED, &q->state)); - while (test_bit(__LINK_STATE_SCHED, &dev->state)) - yield(); + spin_unlock_bh(root_lock); - spin_unlock_wait(&dev->xmit_lock); + if (val) + return true; + } + return false; +} + +/** + * dev_deactivate_many - deactivate transmissions on several devices + * @head: list of devices to deactivate + * + * This function returns only when all outstanding transmissions + * have completed, unless all devices are in dismantle phase. + */ +void dev_deactivate_many(struct list_head *head) +{ + struct net_device *dev; + bool sync_needed = false; + + list_for_each_entry(dev, head, close_list) { + netdev_for_each_tx_queue(dev, dev_deactivate_queue, + &noop_qdisc); + if (dev_ingress_queue(dev)) + dev_deactivate_queue(dev, dev_ingress_queue(dev), + &noop_qdisc); + + dev_watchdog_down(dev); + sync_needed |= !dev->dismantle; + } + + /* Wait for outstanding qdisc-less dev_queue_xmit calls. + * This is avoided if all devices are in dismantle phase : + * Caller will call synchronize_net() for us + */ + if (sync_needed) + synchronize_net(); + + /* Wait for outstanding qdisc_run calls. 
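+	 * some_qdisc_is_busy() reports a device busy while any of its tx
+	 * queues' qdiscs is still running or scheduled, so keep yielding
+	 * until every queue has drained.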
*/ + list_for_each_entry(dev, head, close_list) + while (some_qdisc_is_busy(dev)) + yield(); +} + +void dev_deactivate(struct net_device *dev) +{ + LIST_HEAD(single); + + list_add(&dev->close_list, &single); + dev_deactivate_many(&single); + list_del(&single); +} +EXPORT_SYMBOL(dev_deactivate); + +static void dev_init_scheduler_queue(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_qdisc) +{ + struct Qdisc *qdisc = _qdisc; + + dev_queue->qdisc = qdisc; + dev_queue->qdisc_sleeping = qdisc; } void dev_init_scheduler(struct net_device *dev) { - qdisc_lock_tree(dev); dev->qdisc = &noop_qdisc; - dev->qdisc_sleeping = &noop_qdisc; - INIT_LIST_HEAD(&dev->qdisc_list); - qdisc_unlock_tree(dev); + netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); + if (dev_ingress_queue(dev)) + dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); - dev_watchdog_init(dev); + setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev); } -void dev_shutdown(struct net_device *dev) +static void shutdown_scheduler_queue(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_qdisc_default) { - struct Qdisc *qdisc; + struct Qdisc *qdisc = dev_queue->qdisc_sleeping; + struct Qdisc *qdisc_default = _qdisc_default; + + if (qdisc) { + rcu_assign_pointer(dev_queue->qdisc, qdisc_default); + dev_queue->qdisc_sleeping = qdisc_default; - qdisc_lock_tree(dev); - qdisc = dev->qdisc_sleeping; - dev->qdisc = &noop_qdisc; - dev->qdisc_sleeping = &noop_qdisc; - qdisc_destroy(qdisc); -#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE) - if ((qdisc = dev->qdisc_ingress) != NULL) { - dev->qdisc_ingress = NULL; qdisc_destroy(qdisc); - } -#endif - BUG_TRAP(!timer_pending(&dev->watchdog_timer)); - qdisc_unlock_tree(dev); + } } -EXPORT_SYMBOL(__netdev_watchdog_up); -EXPORT_SYMBOL(netif_carrier_on); -EXPORT_SYMBOL(netif_carrier_off); -EXPORT_SYMBOL(noop_qdisc); -EXPORT_SYMBOL(noop_qdisc_ops); -EXPORT_SYMBOL(qdisc_create_dflt); -EXPORT_SYMBOL(qdisc_alloc); -EXPORT_SYMBOL(qdisc_destroy); -EXPORT_SYMBOL(qdisc_reset); -EXPORT_SYMBOL(qdisc_restart); -EXPORT_SYMBOL(qdisc_lock_tree); -EXPORT_SYMBOL(qdisc_unlock_tree); +void dev_shutdown(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); + if (dev_ingress_queue(dev)) + shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); + qdisc_destroy(dev->qdisc); + dev->qdisc = &noop_qdisc; + + WARN_ON(timer_pending(&dev->watchdog_timer)); +} + +void psched_ratecfg_precompute(struct psched_ratecfg *r, + const struct tc_ratespec *conf, + u64 rate64) +{ + memset(r, 0, sizeof(*r)); + r->overhead = conf->overhead; + r->rate_bytes_ps = max_t(u64, conf->rate, rate64); + r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); + r->mult = 1; + /* + * The deal here is to replace a divide by a reciprocal one + * in fast path (a reciprocal divide is a multiply and a shift) + * + * Normal formula would be : + * time_in_ns = (NSEC_PER_SEC * len) / rate_bps + * + * We compute mult/shift to use instead : + * time_in_ns = (len * mult) >> shift; + * + * We try to get the highest possible mult value for accuracy, + * but have to make sure no overflows will ever happen. 
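+ *
+ * For example, at 1 Gbit/s (rate_bytes_ps = 125,000,000) the loop below
+ * settles on mult = 2^31 and shift = 28, so a 1500 byte packet costs
+ * (1500 * 2^31) >> 28 = 12000 ns, exactly 1500 / 125e6 seconds.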
+ */ + if (r->rate_bytes_ps > 0) { + u64 factor = NSEC_PER_SEC; + + for (;;) { + r->mult = div64_u64(factor, r->rate_bytes_ps); + if (r->mult & (1U << 31) || factor & (1ULL << 63)) + break; + factor <<= 1; + r->shift++; + } + } +} +EXPORT_SYMBOL(psched_ratecfg_precompute); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 29a2dd9f302..12cbc09157f 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -18,11 +18,10 @@ * For all the glorious comments look at include/net/red.h */ -#include <linux/config.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> #include <net/red.h> @@ -33,16 +32,16 @@ struct gred_sched_data; struct gred_sched; -struct gred_sched_data -{ +struct gred_sched_data { u32 limit; /* HARD maximal queue length */ - u32 DP; /* the drop pramaters */ + u32 DP; /* the drop parameters */ u32 bytesin; /* bytes seen on virtualQ so far*/ u32 packetsin; /* packets seen on virtualQ so far*/ u32 backlog; /* bytes on the virtualQ */ u8 prio; /* the prio of this vq */ struct red_parms parms; + struct red_vars vars; struct red_stats stats; }; @@ -51,14 +50,13 @@ enum { GRED_RIO_MODE, }; -struct gred_sched -{ +struct gred_sched { struct gred_sched_data *tab[MAX_DPs]; unsigned long flags; u32 red_flags; u32 DPs; u32 def; - struct red_parms wred_set; + struct red_vars wred_set; }; static inline int gred_wred_mode(struct gred_sched *table) @@ -104,9 +102,8 @@ static inline int gred_wred_mode_check(struct Qdisc *sch) if (q == NULL) continue; - for (n = 0; n < table->DPs; n++) - if (table->tab[n] && table->tab[n] != q && - table->tab[n]->prio == q->prio) + for (n = i + 1; n < table->DPs; n++) + if (table->tab[n] && table->tab[n]->prio == q->prio) return 1; } @@ -128,17 +125,18 @@ static inline u16 tc_index_to_dp(struct sk_buff *skb) return skb->tc_index & GRED_VQ_MASK; } -static inline void gred_load_wred_set(struct gred_sched *table, +static inline void gred_load_wred_set(const struct gred_sched *table, struct gred_sched_data *q) { - q->parms.qavg = table->wred_set.qavg; - q->parms.qidlestart = table->wred_set.qidlestart; + q->vars.qavg = table->wred_set.qavg; + q->vars.qidlestart = table->wred_set.qidlestart; } static inline void gred_store_wred_set(struct gred_sched *table, struct gred_sched_data *q) { - table->wred_set.qavg = q->parms.qavg; + table->wred_set.qavg = q->vars.qavg; + table->wred_set.qidlestart = q->vars.qidlestart; } static inline int gred_use_ecn(struct gred_sched *t) @@ -151,85 +149,88 @@ static inline int gred_use_harddrop(struct gred_sched *t) return t->red_flags & TC_RED_HARDDROP; } -static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct gred_sched_data *q=NULL; - struct gred_sched *t= qdisc_priv(sch); + struct gred_sched_data *q = NULL; + struct gred_sched *t = qdisc_priv(sch); unsigned long qavg = 0; u16 dp = tc_index_to_dp(skb); - if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { + if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { dp = t->def; - if ((q = t->tab[dp]) == NULL) { + q = t->tab[dp]; + if (!q) { /* Pass through packets not assigned to a DP * if no default DP has been configured. This * allows for DP flows to be left untouched. */ - if (skb_queue_len(&sch->q) < sch->dev->tx_queue_len) + if (skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len) return qdisc_enqueue_tail(skb, sch); else goto drop; } - /* fix tc_index? 
--could be controvesial but needed for + /* fix tc_index? --could be controversial but needed for requeueing */ skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp; } - /* sum up all the qaves of prios <= to ours to get the new qave */ + /* sum up all the qaves of prios < ours to get the new qave */ if (!gred_wred_mode(t) && gred_rio_mode(t)) { int i; for (i = 0; i < t->DPs; i++) { if (t->tab[i] && t->tab[i]->prio < q->prio && - !red_is_idling(&t->tab[i]->parms)) - qavg +=t->tab[i]->parms.qavg; + !red_is_idling(&t->tab[i]->vars)) + qavg += t->tab[i]->vars.qavg; } } q->packetsin++; - q->bytesin += skb->len; + q->bytesin += qdisc_pkt_len(skb); if (gred_wred_mode(t)) gred_load_wred_set(t, q); - q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch)); + q->vars.qavg = red_calc_qavg(&q->parms, + &q->vars, + gred_backlog(t, q, sch)); - if (red_is_idling(&q->parms)) - red_end_of_idle_period(&q->parms); + if (red_is_idling(&q->vars)) + red_end_of_idle_period(&q->vars); if (gred_wred_mode(t)) gred_store_wred_set(t, q); - switch (red_action(&q->parms, q->parms.qavg + qavg)) { - case RED_DONT_MARK: - break; + switch (red_action(&q->parms, &q->vars, q->vars.qavg + qavg)) { + case RED_DONT_MARK: + break; - case RED_PROB_MARK: - sch->qstats.overlimits++; - if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { - q->stats.prob_drop++; - goto congestion_drop; - } + case RED_PROB_MARK: + sch->qstats.overlimits++; + if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { + q->stats.prob_drop++; + goto congestion_drop; + } - q->stats.prob_mark++; - break; + q->stats.prob_mark++; + break; - case RED_HARD_MARK: - sch->qstats.overlimits++; - if (gred_use_harddrop(t) || !gred_use_ecn(t) || - !INET_ECN_set_ce(skb)) { - q->stats.forced_drop++; - goto congestion_drop; - } - q->stats.forced_mark++; - break; + case RED_HARD_MARK: + sch->qstats.overlimits++; + if (gred_use_harddrop(t) || !gred_use_ecn(t) || + !INET_ECN_set_ce(skb)) { + q->stats.forced_drop++; + goto congestion_drop; + } + q->stats.forced_mark++; + break; } - if (q->backlog + skb->len <= q->limit) { - q->backlog += skb->len; + if (q->backlog + qdisc_pkt_len(skb) <= q->limit) { + q->backlog += qdisc_pkt_len(skb); return qdisc_enqueue_tail(skb, sch); } @@ -242,27 +243,7 @@ congestion_drop: return NET_XMIT_CN; } -static int gred_requeue(struct sk_buff *skb, struct Qdisc* sch) -{ - struct gred_sched *t = qdisc_priv(sch); - struct gred_sched_data *q; - u16 dp = tc_index_to_dp(skb); - - if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { - if (net_ratelimit()) - printk(KERN_WARNING "GRED: Unable to relocate VQ 0x%x " - "for requeue, screwing up backlog.\n", - tc_index_to_dp(skb)); - } else { - if (red_is_idling(&q->parms)) - red_end_of_idle_period(&q->parms); - q->backlog += skb->len; - } - - return qdisc_requeue(skb, sch); -} - -static struct sk_buff *gred_dequeue(struct Qdisc* sch) +static struct sk_buff *gred_dequeue(struct Qdisc *sch) { struct sk_buff *skb; struct gred_sched *t = qdisc_priv(sch); @@ -274,75 +255,74 @@ static struct sk_buff *gred_dequeue(struct Qdisc* sch) u16 dp = tc_index_to_dp(skb); if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { - if (net_ratelimit()) - printk(KERN_WARNING "GRED: Unable to relocate " - "VQ 0x%x after dequeue, screwing up " - "backlog.\n", tc_index_to_dp(skb)); + net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x after dequeue, screwing up backlog\n", + tc_index_to_dp(skb)); } else { - q->backlog -= skb->len; - - if (!q->backlog && !gred_wred_mode(t)) - red_start_of_idle_period(&q->parms); + q->backlog -= 
qdisc_pkt_len(skb); + + if (gred_wred_mode(t)) { + if (!sch->qstats.backlog) + red_start_of_idle_period(&t->wred_set); + } else { + if (!q->backlog) + red_start_of_idle_period(&q->vars); + } } return skb; } - if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) - red_start_of_idle_period(&t->wred_set); - return NULL; } -static unsigned int gred_drop(struct Qdisc* sch) +static unsigned int gred_drop(struct Qdisc *sch) { struct sk_buff *skb; struct gred_sched *t = qdisc_priv(sch); skb = qdisc_dequeue_tail(sch); if (skb) { - unsigned int len = skb->len; + unsigned int len = qdisc_pkt_len(skb); struct gred_sched_data *q; u16 dp = tc_index_to_dp(skb); if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { - if (net_ratelimit()) - printk(KERN_WARNING "GRED: Unable to relocate " - "VQ 0x%x while dropping, screwing up " - "backlog.\n", tc_index_to_dp(skb)); + net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x while dropping, screwing up backlog\n", + tc_index_to_dp(skb)); } else { q->backlog -= len; q->stats.other++; - if (!q->backlog && !gred_wred_mode(t)) - red_start_of_idle_period(&q->parms); + if (gred_wred_mode(t)) { + if (!sch->qstats.backlog) + red_start_of_idle_period(&t->wred_set); + } else { + if (!q->backlog) + red_start_of_idle_period(&q->vars); + } } qdisc_drop(skb, sch); return len; } - if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) - red_start_of_idle_period(&t->wred_set); - return 0; - } -static void gred_reset(struct Qdisc* sch) +static void gred_reset(struct Qdisc *sch) { int i; struct gred_sched *t = qdisc_priv(sch); qdisc_reset_queue(sch); - for (i = 0; i < t->DPs; i++) { + for (i = 0; i < t->DPs; i++) { struct gred_sched_data *q = t->tab[i]; if (!q) continue; - red_restart(&q->parms); + red_restart(&q->vars); q->backlog = 0; } } @@ -352,16 +332,16 @@ static inline void gred_destroy_vq(struct gred_sched_data *q) kfree(q); } -static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps) +static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_sopt *sopt; int i; - if (dps == NULL || RTA_PAYLOAD(dps) < sizeof(*sopt)) + if (dps == NULL) return -EINVAL; - sopt = RTA_DATA(dps); + sopt = nla_data(dps); if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs) return -EINVAL; @@ -390,66 +370,81 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps) for (i = table->DPs; i < MAX_DPs; i++) { if (table->tab[i]) { - printk(KERN_WARNING "GRED: Warning: Destroying " - "shadowed VQ 0x%x\n", i); + pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n", + i); gred_destroy_vq(table->tab[i]); table->tab[i] = NULL; - } + } } return 0; } static inline int gred_change_vq(struct Qdisc *sch, int dp, - struct tc_gred_qopt *ctl, int prio, u8 *stab) + struct tc_gred_qopt *ctl, int prio, + u8 *stab, u32 max_P, + struct gred_sched_data **prealloc) { struct gred_sched *table = qdisc_priv(sch); - struct gred_sched_data *q; + struct gred_sched_data *q = table->tab[dp]; - if (table->tab[dp] == NULL) { - table->tab[dp] = kmalloc(sizeof(*q), GFP_KERNEL); - if (table->tab[dp] == NULL) + if (!q) { + table->tab[dp] = q = *prealloc; + *prealloc = NULL; + if (!q) return -ENOMEM; - memset(table->tab[dp], 0, sizeof(*q)); } - q = table->tab[dp]; q->DP = dp; q->prio = prio; q->limit = ctl->limit; if (q->backlog == 0) - red_end_of_idle_period(&q->parms); + red_end_of_idle_period(&q->vars); red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, - 
ctl->Scell_log, stab); - + ctl->Scell_log, stab, max_P); + red_set_vars(&q->vars); return 0; } -static int gred_change(struct Qdisc *sch, struct rtattr *opt) +static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = { + [TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) }, + [TCA_GRED_STAB] = { .len = 256 }, + [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) }, + [TCA_GRED_MAX_P] = { .type = NLA_U32 }, +}; + +static int gred_change(struct Qdisc *sch, struct nlattr *opt) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_qopt *ctl; - struct rtattr *tb[TCA_GRED_MAX]; - int err = -EINVAL, prio = GRED_DEF_PRIO; + struct nlattr *tb[TCA_GRED_MAX + 1]; + int err, prio = GRED_DEF_PRIO; u8 *stab; + u32 max_P; + struct gred_sched_data *prealloc; - if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt)) + if (opt == NULL) return -EINVAL; - if (tb[TCA_GRED_PARMS-1] == NULL && tb[TCA_GRED_STAB-1] == NULL) + err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy); + if (err < 0) + return err; + + if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) return gred_change_table_def(sch, opt); - if (tb[TCA_GRED_PARMS-1] == NULL || - RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || - tb[TCA_GRED_STAB-1] == NULL || - RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) + if (tb[TCA_GRED_PARMS] == NULL || + tb[TCA_GRED_STAB] == NULL) return -EINVAL; - ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); - stab = RTA_DATA(tb[TCA_GRED_STAB-1]); + max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0; + + err = -EINVAL; + ctl = nla_data(tb[TCA_GRED_PARMS]); + stab = nla_data(tb[TCA_GRED_STAB]); if (ctl->DP >= table->DPs) goto errout; @@ -469,9 +464,10 @@ static int gred_change(struct Qdisc *sch, struct rtattr *opt) prio = ctl->prio; } + prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); sch_tree_lock(sch); - err = gred_change_vq(sch, ctl->DP, ctl, prio, stab); + err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc); if (err < 0) goto errout_locked; @@ -485,28 +481,35 @@ static int gred_change(struct Qdisc *sch, struct rtattr *opt) errout_locked: sch_tree_unlock(sch); + kfree(prealloc); errout: return err; } -static int gred_init(struct Qdisc *sch, struct rtattr *opt) +static int gred_init(struct Qdisc *sch, struct nlattr *opt) { - struct rtattr *tb[TCA_GRED_MAX]; + struct nlattr *tb[TCA_GRED_MAX + 1]; + int err; - if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt)) + if (opt == NULL) return -EINVAL; - if (tb[TCA_GRED_PARMS-1] || tb[TCA_GRED_STAB-1]) + err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy); + if (err < 0) + return err; + + if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) return -EINVAL; - return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]); + return gred_change_table_def(sch, tb[TCA_GRED_DPS]); } static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) { struct gred_sched *table = qdisc_priv(sch); - struct rtattr *parms, *opts = NULL; + struct nlattr *parms, *opts = NULL; int i; + u32 max_p[MAX_DPs]; struct tc_gred_sopt sopt = { .DPs = table->DPs, .def_DP = table->def, @@ -514,13 +517,28 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) .flags = table->red_flags, }; - opts = RTA_NEST(skb, TCA_OPTIONS); - RTA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt); - parms = RTA_NEST(skb, TCA_GRED_PARMS); + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + if (nla_put(skb, TCA_GRED_DPS, sizeof(sopt), &sopt)) + goto nla_put_failure; + + for (i = 0; i < MAX_DPs; i++) { + struct gred_sched_data *q = 
table->tab[i]; + + max_p[i] = q ? q->parms.max_P : 0; + } + if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p)) + goto nla_put_failure; + + parms = nla_nest_start(skb, TCA_GRED_PARMS); + if (parms == NULL) + goto nla_put_failure; for (i = 0; i < MAX_DPs; i++) { struct gred_sched_data *q = table->tab[i]; struct tc_gred_qopt opt; + unsigned long qavg; memset(&opt, 0, sizeof(opt)); @@ -549,24 +567,25 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) opt.packets = q->packetsin; opt.bytesin = q->bytesin; - if (gred_wred_mode(table)) { - q->parms.qidlestart = - table->tab[table->def]->parms.qidlestart; - q->parms.qavg = table->tab[table->def]->parms.qavg; - } + if (gred_wred_mode(table)) + gred_load_wred_set(table, q); - opt.qave = red_calc_qavg(&q->parms, q->parms.qavg); + qavg = red_calc_qavg(&q->parms, &q->vars, + q->vars.qavg >> q->parms.Wlog); + opt.qave = qavg >> q->parms.Wlog; append_opt: - RTA_APPEND(skb, sizeof(opt), &opt); + if (nla_append(skb, sizeof(opt), &opt) < 0) + goto nla_put_failure; } - RTA_NEST_END(skb, parms); + nla_nest_end(skb, parms); - return RTA_NEST_END(skb, opts); + return nla_nest_end(skb, opts); -rtattr_failure: - return RTA_NEST_CANCEL(skb, opts); +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; } static void gred_destroy(struct Qdisc *sch) @@ -580,12 +599,12 @@ static void gred_destroy(struct Qdisc *sch) } } -static struct Qdisc_ops gred_qdisc_ops = { +static struct Qdisc_ops gred_qdisc_ops __read_mostly = { .id = "gred", .priv_size = sizeof(struct gred_sched), .enqueue = gred_enqueue, .dequeue = gred_dequeue, - .requeue = gred_requeue, + .peek = qdisc_peek_head, .drop = gred_drop, .init = gred_init, .reset = gred_reset, diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 91132f6871d..ec8aeaac1dd 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -50,30 +50,24 @@ */ #include <linux/kernel.h> -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/errno.h> -#include <linux/jiffies.h> #include <linux/compiler.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/slab.h> -#include <linux/timer.h> #include <linux/list.h> #include <linux/rbtree.h> #include <linux/init.h> -#include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/pkt_sched.h> +#include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> -#include <asm/system.h> #include <asm/div64.h> -#define HFSC_DEBUG 1 - /* * kernel internal service curve representation: * coordinates are given by 64 bit unsigned integers. @@ -83,12 +77,11 @@ * The service curve parameters are converted to the internal * representation. The slope values are scaled to avoid overflow. * the inverse slope values as well as the y-projection of the 1st - * segment are kept in order to to avoid 64-bit divide operations + * segment are kept in order to avoid 64-bit divide operations * that are expensive on 32-bit architectures. 
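 * (For instance rtsc_y2x(), used by init_vf()/update_vf() below, maps
 * service back to time via these precomputed inverse slopes, i.e. with
 * multiplies and shifts rather than divides.)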
*/ -struct internal_sc -{ +struct internal_sc { u64 sm1; /* scaled slope of the 1st segment */ u64 ism1; /* scaled inverse-slope of the 1st segment */ u64 dx; /* the x-projection of the 1st segment */ @@ -98,8 +91,7 @@ struct internal_sc }; /* runtime service curve */ -struct runtime_sc -{ +struct runtime_sc { u64 x; /* current starting position on x-axis */ u64 y; /* current starting position on y-axis */ u64 sm1; /* scaled slope of the 1st segment */ @@ -110,22 +102,19 @@ struct runtime_sc u64 ism2; /* scaled inverse-slope of the 2nd segment */ }; -enum hfsc_class_flags -{ +enum hfsc_class_flags { HFSC_RSC = 0x1, HFSC_FSC = 0x2, HFSC_USC = 0x4 }; -struct hfsc_class -{ - u32 classid; /* class id */ +struct hfsc_class { + struct Qdisc_class_common cl_common; unsigned int refcnt; /* usage count */ - struct gnet_stats_basic bstats; + struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; - spinlock_t *stats_lock; + struct gnet_stats_rate_est64 rate_est; unsigned int level; /* class level in hierarchy */ struct tcf_proto *filter_list; /* filter list */ unsigned int filter_cnt; /* filter count */ @@ -141,15 +130,14 @@ struct hfsc_class struct rb_node vt_node; /* parent's vt_tree member */ struct rb_root cf_tree; /* active children sorted by cl_f */ struct rb_node cf_node; /* parent's cf_heap member */ - struct list_head hlist; /* hash list member */ struct list_head dlist; /* drop list member */ u64 cl_total; /* total work in bytes */ u64 cl_cumul; /* cumulative work in bytes done by real-time criteria */ - u64 cl_d; /* deadline*/ - u64 cl_e; /* eligible time */ + u64 cl_d; /* deadline*/ + u64 cl_e; /* eligible time */ u64 cl_vt; /* virtual time */ u64 cl_f; /* time when this class will fit for link-sharing, max(myf, cfmin) */ @@ -167,7 +155,7 @@ struct hfsc_class u64 cl_vtoff; /* inter-period cumulative vt offset */ u64 cl_cvtmax; /* max child's vt in the last period */ u64 cl_cvtoff; /* cumulative cvtmax of all periods */ - u64 cl_pcvtoff; /* parent's cvtoff at initalization + u64 cl_pcvtoff; /* parent's cvtoff at initialization time */ struct internal_sc cl_rsc; /* internal real-time service curve */ @@ -184,45 +172,16 @@ struct hfsc_class unsigned long cl_nactive; /* number of active children */ }; -#define HFSC_HSIZE 16 - -struct hfsc_sched -{ +struct hfsc_sched { u16 defcls; /* default class id */ struct hfsc_class root; /* root class */ - struct list_head clhash[HFSC_HSIZE]; /* class hash */ + struct Qdisc_class_hash clhash; /* class hash */ struct rb_root eligible; /* eligible tree */ struct list_head droplist; /* active leaf class list (for dropping) */ - struct sk_buff_head requeue; /* requeued packet */ - struct timer_list wd_timer; /* watchdog timer */ + struct qdisc_watchdog watchdog; /* watchdog timer */ }; -/* - * macros - */ -#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY -#include <linux/time.h> -#undef PSCHED_GET_TIME -#define PSCHED_GET_TIME(stamp) \ -do { \ - struct timeval tv; \ - do_gettimeofday(&tv); \ - (stamp) = 1ULL * USEC_PER_SEC * tv.tv_sec + tv.tv_usec; \ -} while (0) -#endif - -#if HFSC_DEBUG -#define ASSERT(cond) \ -do { \ - if (unlikely(!(cond))) \ - printk("assertion %s failed at %s:%i (%s)\n", \ - #cond, __FILE__, __LINE__, __FUNCTION__); \ -} while (0) -#else -#define ASSERT(cond) -#endif /* HFSC_DEBUG */ - #define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */ @@ -285,7 +244,7 @@ static inline struct hfsc_class * eltree_get_minel(struct hfsc_sched *q) { struct rb_node *n; - + n = 
rb_first(&q->eligible); if (n == NULL) return NULL; @@ -408,31 +367,22 @@ cftree_update(struct hfsc_class *cl) * ism: (psched_us/byte) << ISM_SHIFT * dx: psched_us * - * Clock source resolution (CONFIG_NET_SCH_CLK_*) - * JIFFIES: for 48<=HZ<=1534 resolution is between 0.63us and 1.27us. - * CPU: resolution is between 0.5us and 1us. - * GETTIMEOFDAY: resolution is exactly 1us. + * The clock source resolution with ktime and PSCHED_SHIFT 10 is 1.024us. * * sm and ism are scaled in order to keep effective digits. * SM_SHIFT and ISM_SHIFT are selected to keep at least 4 effective * digits in decimal using the following table. * - * Note: We can afford the additional accuracy (altq hfsc keeps at most - * 3 effective digits) thanks to the fact that linux clock is bounded - * much more tightly. - * * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps * ------------+------------------------------------------------------- - * bytes/0.5us 6.25e-3 62.5e-3 625e-3 6250e-e 62500e-3 - * bytes/us 12.5e-3 125e-3 1250e-3 12500e-3 125000e-3 - * bytes/1.27us 15.875e-3 158.75e-3 1587.5e-3 15875e-3 158750e-3 + * bytes/1.024us 12.8e-3 128e-3 1280e-3 12800e-3 128000e-3 * - * 0.5us/byte 160 16 1.6 0.16 0.016 - * us/byte 80 8 0.8 0.08 0.008 - * 1.27us/byte 63 6.3 0.63 0.063 0.0063 + * 1.024us/byte 78.125 7.8125 0.78125 0.078125 0.0078125 + * + * So, for PSCHED_SHIFT 10 we need: SM_SHIFT 20, ISM_SHIFT 18. */ -#define SM_SHIFT 20 -#define ISM_SHIFT 18 +#define SM_SHIFT (30 - PSCHED_SHIFT) +#define ISM_SHIFT (8 + PSCHED_SHIFT) #define SM_MASK ((1ULL << SM_SHIFT) - 1) #define ISM_MASK ((1ULL << ISM_SHIFT) - 1) @@ -474,8 +424,8 @@ m2sm(u32 m) u64 sm; sm = ((u64)m << SM_SHIFT); - sm += PSCHED_JIFFIE2US(HZ) - 1; - do_div(sm, PSCHED_JIFFIE2US(HZ)); + sm += PSCHED_TICKS_PER_SEC - 1; + do_div(sm, PSCHED_TICKS_PER_SEC); return sm; } @@ -488,7 +438,7 @@ m2ism(u32 m) if (m == 0) ism = HT_INFINITY; else { - ism = ((u64)PSCHED_JIFFIE2US(HZ) << ISM_SHIFT); + ism = ((u64)PSCHED_TICKS_PER_SEC << ISM_SHIFT); ism += m - 1; do_div(ism, m); } @@ -501,7 +451,7 @@ d2dx(u32 d) { u64 dx; - dx = ((u64)d * PSCHED_JIFFIE2US(HZ)); + dx = ((u64)d * PSCHED_TICKS_PER_SEC); dx += USEC_PER_SEC - 1; do_div(dx, USEC_PER_SEC); return dx; @@ -513,7 +463,7 @@ sm2m(u64 sm) { u64 m; - m = (sm * PSCHED_JIFFIE2US(HZ)) >> SM_SHIFT; + m = (sm * PSCHED_TICKS_PER_SEC) >> SM_SHIFT; return (u32)m; } @@ -524,7 +474,7 @@ dx2d(u64 dx) u64 d; d = dx * USEC_PER_SEC; - do_div(d, PSCHED_JIFFIE2US(HZ)); + do_div(d, PSCHED_TICKS_PER_SEC); return (u32)d; } @@ -662,15 +612,12 @@ rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y) rtsc->y = y; rtsc->dx = dx; rtsc->dy = dy; - return; } static void init_ed(struct hfsc_class *cl, unsigned int next_len) { - u64 cur_time; - - PSCHED_GET_TIME(cur_time); + u64 cur_time = psched_get_time(); /* update the deadline curve */ rtsc_min(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); @@ -741,7 +688,7 @@ init_vf(struct hfsc_class *cl, unsigned int len) if (go_active) { n = rb_last(&cl->cl_parent->vt_tree); if (n != NULL) { - max_cl = rb_entry(n, struct hfsc_class,vt_node); + max_cl = rb_entry(n, struct hfsc_class, vt_node); /* * set vt to the average of the min and max * classes. 
if the parent's period didn't @@ -774,7 +721,7 @@ init_vf(struct hfsc_class *cl, unsigned int len) /* update the virtual curve */ vt = cl->cl_vt + cl->cl_vtoff; rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt, - cl->cl_total); + cl->cl_total); if (cl->cl_virtual.x == vt) { cl->cl_virtual.x -= cl->cl_vtoff; cl->cl_vtoff = 0; @@ -793,14 +740,14 @@ init_vf(struct hfsc_class *cl, unsigned int len) if (cl->cl_flags & HFSC_USC) { /* class has upper limit curve */ if (cur_time == 0) - PSCHED_GET_TIME(cur_time); + cur_time = psched_get_time(); /* update the ulimit curve */ rtsc_min(&cl->cl_ulimit, &cl->cl_usc, cur_time, - cl->cl_total); + cl->cl_total); /* compute myf */ cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, - cl->cl_total); + cl->cl_total); cl->cl_myfadj = 0; } } @@ -809,8 +756,8 @@ init_vf(struct hfsc_class *cl, unsigned int len) if (f != cl->cl_f) { cl->cl_f = f; cftree_update(cl); - update_cfmin(cl->cl_parent); } + update_cfmin(cl->cl_parent); } } @@ -854,7 +801,7 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) * update vt and f */ cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) - - cl->cl_vtoff + cl->cl_vtadj; + - cl->cl_vtoff + cl->cl_vtadj; /* * if vt of the class is smaller than cvtmin, @@ -871,7 +818,7 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) if (cl->cl_flags & HFSC_USC) { cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit, - cl->cl_total); + cl->cl_total); #if 0 /* * This code causes classes to stay way under their @@ -928,27 +875,19 @@ set_passive(struct hfsc_class *cl) */ } -/* - * hack to get length of first packet in queue. - */ static unsigned int qdisc_peek_len(struct Qdisc *sch) { struct sk_buff *skb; unsigned int len; - skb = sch->dequeue(sch); + skb = sch->ops->peek(sch); if (skb == NULL) { - if (net_ratelimit()) - printk("qdisc_peek_len: non work-conserving qdisc ?\n"); - return 0; - } - len = skb->len; - if (unlikely(sch->ops->requeue(skb, sch) != NET_XMIT_SUCCESS)) { - if (net_ratelimit()) - printk("qdisc_peek_len: failed to requeue\n"); + qdisc_warn_nonwc("qdisc_peek_len", sch); return 0; } + len = qdisc_pkt_len(skb); + return len; } @@ -958,11 +897,7 @@ hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl) unsigned int len = cl->qdisc->q.qlen; qdisc_reset(cl->qdisc); - if (len > 0) { - update_vf(cl, 0, 0); - set_passive(cl); - sch->q.qlen -= len; - } + qdisc_tree_decrease_qlen(cl->qdisc, len); } static void @@ -974,38 +909,28 @@ hfsc_adjust_levels(struct hfsc_class *cl) do { level = 0; list_for_each_entry(p, &cl->children, siblings) { - if (p->level > level) - level = p->level; + if (p->level >= level) + level = p->level + 1; } - cl->level = level + 1; + cl->level = level; } while ((cl = cl->cl_parent) != NULL); } -static inline unsigned int -hfsc_hash(u32 h) -{ - h ^= h >> 8; - h ^= h >> 4; - - return h & (HFSC_HSIZE - 1); -} - static inline struct hfsc_class * hfsc_find_class(u32 classid, struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); - struct hfsc_class *cl; + struct Qdisc_class_common *clc; - list_for_each_entry(cl, &q->clhash[hfsc_hash(classid)], hlist) { - if (cl->classid == classid) - return cl; - } - return NULL; + clc = qdisc_class_find(&q->clhash, classid); + if (clc == NULL) + return NULL; + return container_of(clc, struct hfsc_class, cl_common); } static void hfsc_change_rsc(struct hfsc_class *cl, struct tc_service_curve *rsc, - u64 cur_time) + u64 cur_time) { sc2isc(rsc, &cl->cl_rsc); rtsc_init(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); @@ -1027,60 +952,74 @@ hfsc_change_fsc(struct 
hfsc_class *cl, struct tc_service_curve *fsc) static void hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc, - u64 cur_time) + u64 cur_time) { sc2isc(usc, &cl->cl_usc); rtsc_init(&cl->cl_ulimit, &cl->cl_usc, cur_time, cl->cl_total); cl->cl_flags |= HFSC_USC; } +static const struct nla_policy hfsc_policy[TCA_HFSC_MAX + 1] = { + [TCA_HFSC_RSC] = { .len = sizeof(struct tc_service_curve) }, + [TCA_HFSC_FSC] = { .len = sizeof(struct tc_service_curve) }, + [TCA_HFSC_USC] = { .len = sizeof(struct tc_service_curve) }, +}; + static int hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - struct rtattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl = (struct hfsc_class *)*arg; struct hfsc_class *parent = NULL; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_HFSC_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_HFSC_MAX + 1]; struct tc_service_curve *rsc = NULL, *fsc = NULL, *usc = NULL; u64 cur_time; + int err; - if (opt == NULL || rtattr_parse_nested(tb, TCA_HFSC_MAX, opt)) + if (opt == NULL) return -EINVAL; - if (tb[TCA_HFSC_RSC-1]) { - if (RTA_PAYLOAD(tb[TCA_HFSC_RSC-1]) < sizeof(*rsc)) - return -EINVAL; - rsc = RTA_DATA(tb[TCA_HFSC_RSC-1]); + err = nla_parse_nested(tb, TCA_HFSC_MAX, opt, hfsc_policy); + if (err < 0) + return err; + + if (tb[TCA_HFSC_RSC]) { + rsc = nla_data(tb[TCA_HFSC_RSC]); if (rsc->m1 == 0 && rsc->m2 == 0) rsc = NULL; } - if (tb[TCA_HFSC_FSC-1]) { - if (RTA_PAYLOAD(tb[TCA_HFSC_FSC-1]) < sizeof(*fsc)) - return -EINVAL; - fsc = RTA_DATA(tb[TCA_HFSC_FSC-1]); + if (tb[TCA_HFSC_FSC]) { + fsc = nla_data(tb[TCA_HFSC_FSC]); if (fsc->m1 == 0 && fsc->m2 == 0) fsc = NULL; } - if (tb[TCA_HFSC_USC-1]) { - if (RTA_PAYLOAD(tb[TCA_HFSC_USC-1]) < sizeof(*usc)) - return -EINVAL; - usc = RTA_DATA(tb[TCA_HFSC_USC-1]); + if (tb[TCA_HFSC_USC]) { + usc = nla_data(tb[TCA_HFSC_USC]); if (usc->m1 == 0 && usc->m2 == 0) usc = NULL; } if (cl != NULL) { if (parentid) { - if (cl->cl_parent && cl->cl_parent->classid != parentid) + if (cl->cl_parent && + cl->cl_parent->cl_common.classid != parentid) return -EINVAL; if (cl->cl_parent == NULL && parentid != TC_H_ROOT) return -EINVAL; } - PSCHED_GET_TIME(cur_time); + cur_time = psched_get_time(); + + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) + return err; + } sch_tree_lock(sch); if (rsc != NULL) @@ -1098,11 +1037,6 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } sch_tree_unlock(sch); -#ifdef CONFIG_NET_ESTIMATOR - if (tca[TCA_RATE-1]) - gen_replace_estimator(&cl->bstats, &cl->rate_est, - cl->stats_lock, tca[TCA_RATE-1]); -#endif return 0; } @@ -1124,10 +1058,19 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (rsc == NULL && fsc == NULL) return -EINVAL; - cl = kmalloc(sizeof(struct hfsc_class), GFP_KERNEL); + cl = kzalloc(sizeof(struct hfsc_class), GFP_KERNEL); if (cl == NULL) return -ENOBUFS; - memset(cl, 0, sizeof(struct hfsc_class)); + + if (tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) { + kfree(cl); + return err; + } + } if (rsc != NULL) hfsc_change_rsc(cl, rsc, 0); @@ -1136,20 +1079,20 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (usc != NULL) hfsc_change_usc(cl, usc, 0); + cl->cl_common.classid = classid; cl->refcnt = 1; - cl->classid = classid; 
cl->sched = q; cl->cl_parent = parent; - cl->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + cl->qdisc = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, classid); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; - cl->stats_lock = &sch->dev->queue_lock; INIT_LIST_HEAD(&cl->children); cl->vt_tree = RB_ROOT; cl->cf_tree = RB_ROOT; sch_tree_lock(sch); - list_add_tail(&cl->hlist, &q->clhash[hfsc_hash(classid)]); + qdisc_class_hash_insert(&q->clhash, &cl->cl_common); list_add_tail(&cl->siblings, &parent->children); if (parent->level == 0) hfsc_purge_queue(sch, parent); @@ -1157,36 +1100,20 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->cl_pcvtoff = parent->cl_cvtoff; sch_tree_unlock(sch); -#ifdef CONFIG_NET_ESTIMATOR - if (tca[TCA_RATE-1]) - gen_new_estimator(&cl->bstats, &cl->rate_est, - cl->stats_lock, tca[TCA_RATE-1]); -#endif + qdisc_class_hash_grow(sch, &q->clhash); + *arg = (unsigned long)cl; return 0; } static void -hfsc_destroy_filters(struct tcf_proto **fl) -{ - struct tcf_proto *tp; - - while ((tp = *fl) != NULL) { - *fl = tp->next; - tcf_destroy(tp); - } -} - -static void hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl) { struct hfsc_sched *q = qdisc_priv(sch); - hfsc_destroy_filters(&cl->filter_list); + tcf_destroy_chain(&cl->filter_list); qdisc_destroy(cl->qdisc); -#ifdef CONFIG_NET_ESTIMATOR gen_kill_estimator(&cl->bstats, &cl->rate_est); -#endif if (cl != &q->root) kfree(cl); } @@ -1202,12 +1129,17 @@ hfsc_delete_class(struct Qdisc *sch, unsigned long arg) sch_tree_lock(sch); - list_del(&cl->hlist); list_del(&cl->siblings); hfsc_adjust_levels(cl->cl_parent); + hfsc_purge_queue(sch, cl); - if (--cl->refcnt == 0) - hfsc_destroy_class(sch, cl); + qdisc_class_hash_remove(&q->clhash, &cl->cl_common); + + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). 
+ */ sch_tree_unlock(sch); return 0; @@ -1217,7 +1149,7 @@ static struct hfsc_class * hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct hfsc_sched *q = qdisc_priv(sch); - struct hfsc_class *cl; + struct hfsc_class *head, *cl; struct tcf_result res; struct tcf_proto *tcf; int result; @@ -1227,24 +1159,26 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) if (cl->level == 0) return cl; - *qerr = NET_XMIT_BYPASS; + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + head = &q->root; tcf = q->root.filter_list; while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: - case TC_ACT_STOLEN: - *qerr = NET_XMIT_SUCCESS; - case TC_ACT_SHOT: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: return NULL; } -#elif defined(CONFIG_NET_CLS_POLICE) - if (result == TC_POLICE_SHOT) - return NULL; #endif - if ((cl = (struct hfsc_class *)res.class) == NULL) { - if ((cl = hfsc_find_class(res.classid, sch)) == NULL) + cl = (struct hfsc_class *)res.class; + if (!cl) { + cl = hfsc_find_class(res.classid, sch); + if (!cl) break; /* filter selected invalid classid */ + if (cl->level >= head->level) + break; /* filter may only point downwards */ } if (cl->level == 0) @@ -1252,6 +1186,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) /* apply inner filter chain */ tcf = cl->filter_list; + head = cl; } /* classification failed, try default class */ @@ -1264,23 +1199,23 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) static int hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, - struct Qdisc **old) + struct Qdisc **old) { struct hfsc_class *cl = (struct hfsc_class *)arg; - if (cl == NULL) - return -ENOENT; if (cl->level > 0) return -EINVAL; if (new == NULL) { - new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + cl->cl_common.classid); if (new == NULL) new = &noop_qdisc; } sch_tree_lock(sch); hfsc_purge_queue(sch, cl); - *old = xchg(&cl->qdisc, new); + *old = cl->qdisc; + cl->qdisc = new; sch_tree_unlock(sch); return 0; } @@ -1290,12 +1225,23 @@ hfsc_class_leaf(struct Qdisc *sch, unsigned long arg) { struct hfsc_class *cl = (struct hfsc_class *)arg; - if (cl != NULL && cl->level == 0) + if (cl->level == 0) return cl->qdisc; return NULL; } +static void +hfsc_qlen_notify(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl->qdisc->q.qlen == 0) { + update_vf(cl, 0, 0); + set_passive(cl); + } +} + static unsigned long hfsc_get_class(struct Qdisc *sch, u32 classid) { @@ -1359,57 +1305,59 @@ hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc) tsc.m1 = sm2m(sc->sm1); tsc.d = dx2d(sc->dx); tsc.m2 = sm2m(sc->sm2); - RTA_PUT(skb, attr, sizeof(tsc), &tsc); + if (nla_put(skb, attr, sizeof(tsc), &tsc)) + goto nla_put_failure; return skb->len; - rtattr_failure: + nla_put_failure: return -1; } -static inline int +static int hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl) { if ((cl->cl_flags & HFSC_RSC) && (hfsc_dump_sc(skb, TCA_HFSC_RSC, &cl->cl_rsc) < 0)) - goto rtattr_failure; + goto nla_put_failure; if ((cl->cl_flags & HFSC_FSC) && (hfsc_dump_sc(skb, TCA_HFSC_FSC, &cl->cl_fsc) < 0)) - goto rtattr_failure; + goto nla_put_failure; if ((cl->cl_flags & HFSC_USC) && (hfsc_dump_sc(skb, TCA_HFSC_USC, &cl->cl_usc) < 0)) - goto rtattr_failure; + goto nla_put_failure; return skb->len; - 
rtattr_failure: + nla_put_failure: return -1; } static int hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, - struct tcmsg *tcm) + struct tcmsg *tcm) { struct hfsc_class *cl = (struct hfsc_class *)arg; - unsigned char *b = skb->tail; - struct rtattr *rta = (struct rtattr *)b; + struct nlattr *nest; - tcm->tcm_parent = cl->cl_parent ? cl->cl_parent->classid : TC_H_ROOT; - tcm->tcm_handle = cl->classid; + tcm->tcm_parent = cl->cl_parent ? cl->cl_parent->cl_common.classid : + TC_H_ROOT; + tcm->tcm_handle = cl->cl_common.classid; if (cl->level == 0) tcm->tcm_info = cl->qdisc->handle; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; if (hfsc_dump_curves(skb, cl) < 0) - goto rtattr_failure; - rta->rta_len = skb->tail - b; - return skb->len; + goto nla_put_failure; + return nla_nest_end(skb, nest); - rtattr_failure: - skb_trim(skb, b - skb->data); - return -1; + nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; } static int @@ -1420,15 +1368,14 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct tc_hfsc_stats xstats; cl->qstats.qlen = cl->qdisc->q.qlen; + cl->qstats.backlog = cl->qdisc->qstats.backlog; xstats.level = cl->level; xstats.period = cl->cl_vtperiod; xstats.work = cl->cl_total; xstats.rtwork = cl->cl_cumul; if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || -#ifdef CONFIG_NET_ESTIMATOR - gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || -#endif + gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, &cl->qstats) < 0) return -1; @@ -1447,8 +1394,9 @@ hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) if (arg->stop) return; - for (i = 0; i < HFSC_HSIZE; i++) { - list_for_each_entry(cl, &q->clhash[i], hlist) { + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], + cl_common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; @@ -1463,85 +1411,69 @@ hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) } static void -hfsc_watchdog(unsigned long arg) -{ - struct Qdisc *sch = (struct Qdisc *)arg; - - sch->flags &= ~TCQ_F_THROTTLED; - netif_schedule(sch->dev); -} - -static void -hfsc_schedule_watchdog(struct Qdisc *sch, u64 cur_time) +hfsc_schedule_watchdog(struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl; u64 next_time = 0; - long delay; - if ((cl = eltree_get_minel(q)) != NULL) + cl = eltree_get_minel(q); + if (cl) next_time = cl->cl_e; if (q->root.cl_cfmin != 0) { if (next_time == 0 || next_time > q->root.cl_cfmin) next_time = q->root.cl_cfmin; } - ASSERT(next_time != 0); - delay = next_time - cur_time; - delay = PSCHED_US2JIFFIE(delay); - - sch->flags |= TCQ_F_THROTTLED; - mod_timer(&q->wd_timer, jiffies + delay); + WARN_ON(next_time == 0); + qdisc_watchdog_schedule(&q->watchdog, next_time); } static int -hfsc_init_qdisc(struct Qdisc *sch, struct rtattr *opt) +hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) { struct hfsc_sched *q = qdisc_priv(sch); struct tc_hfsc_qopt *qopt; - unsigned int i; + int err; - if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) + if (opt == NULL || nla_len(opt) < sizeof(*qopt)) return -EINVAL; - qopt = RTA_DATA(opt); - - sch->stats_lock = &sch->dev->queue_lock; + qopt = nla_data(opt); q->defcls = qopt->defcls; - for (i = 0; i < HFSC_HSIZE; i++) - INIT_LIST_HEAD(&q->clhash[i]); + err = qdisc_class_hash_init(&q->clhash); + if (err < 0) + return err; q->eligible = RB_ROOT; INIT_LIST_HEAD(&q->droplist); 
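One pattern worth calling out here: every dump routine touched by this patch (gred_dump earlier, hfsc_dump_class just above, hhf_dump further down) moves from the old RTA_PUT/RTA_NEST macros to the nla_* helpers with a single nla_put_failure unwind label. The condensed sketch below shows that idiom in isolation; the function name and the attribute id are hypothetical placeholders, and only the nla_nest_start/nla_put_u32/nla_nest_end/nla_nest_cancel calls are the real API (same headers as the surrounding files are assumed).

/* Hedged sketch of the nla_* dump idiom used throughout this patch. */
static int example_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct nlattr *opts;

	opts = nla_nest_start(skb, TCA_OPTIONS);	/* open nested TCA_OPTIONS */
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put_u32(skb, 1 /* hypothetical attr id */, 1000))
		goto nla_put_failure;			/* skb ran out of tailroom */

	return nla_nest_end(skb, opts);			/* patch nest length, return skb->len */

nla_put_failure:
	nla_nest_cancel(skb, opts);			/* trim any partially-added attrs */
	return -1;
}

The single cancel label is safe even when nla_nest_start() itself failed, because nla_nest_cancel() is a no-op for a NULL start pointer, which is why the converted functions in this patch do not special-case that path.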
- skb_queue_head_init(&q->requeue); + q->root.cl_common.classid = sch->handle; q->root.refcnt = 1; - q->root.classid = sch->handle; q->root.sched = q; - q->root.qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + sch->handle); if (q->root.qdisc == NULL) q->root.qdisc = &noop_qdisc; - q->root.stats_lock = &sch->dev->queue_lock; INIT_LIST_HEAD(&q->root.children); q->root.vt_tree = RB_ROOT; q->root.cf_tree = RB_ROOT; - list_add(&q->root.hlist, &q->clhash[hfsc_hash(q->root.classid)]); + qdisc_class_hash_insert(&q->clhash, &q->root.cl_common); + qdisc_class_hash_grow(sch, &q->clhash); - init_timer(&q->wd_timer); - q->wd_timer.function = hfsc_watchdog; - q->wd_timer.data = (unsigned long)sch; + qdisc_watchdog_init(&q->watchdog, sch); return 0; } static int -hfsc_change_qdisc(struct Qdisc *sch, struct rtattr *opt) +hfsc_change_qdisc(struct Qdisc *sch, struct nlattr *opt) { struct hfsc_sched *q = qdisc_priv(sch); struct tc_hfsc_qopt *qopt; - if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) + if (opt == NULL || nla_len(opt) < sizeof(*qopt)) return -EINVAL; - qopt = RTA_DATA(opt); + qopt = nla_data(opt); sch_tree_lock(sch); q->defcls = qopt->defcls; @@ -1591,15 +1523,13 @@ hfsc_reset_qdisc(struct Qdisc *sch) struct hfsc_class *cl; unsigned int i; - for (i = 0; i < HFSC_HSIZE; i++) { - list_for_each_entry(cl, &q->clhash[i], hlist) + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) hfsc_reset_class(cl); } - __skb_queue_purge(&q->requeue); q->eligible = RB_ROOT; INIT_LIST_HEAD(&q->droplist); - del_timer(&q->wd_timer); - sch->flags &= ~TCQ_F_THROTTLED; + qdisc_watchdog_cancel(&q->watchdog); sch->q.qlen = 0; } @@ -1607,30 +1537,45 @@ static void hfsc_destroy_qdisc(struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); - struct hfsc_class *cl, *next; + struct hlist_node *next; + struct hfsc_class *cl; unsigned int i; - for (i = 0; i < HFSC_HSIZE; i++) { - list_for_each_entry_safe(cl, next, &q->clhash[i], hlist) + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) + tcf_destroy_chain(&cl->filter_list); + } + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], + cl_common.hnode) hfsc_destroy_class(sch, cl); } - __skb_queue_purge(&q->requeue); - del_timer(&q->wd_timer); + qdisc_class_hash_destroy(&q->clhash); + qdisc_watchdog_cancel(&q->watchdog); } static int hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) { struct hfsc_sched *q = qdisc_priv(sch); - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_hfsc_qopt qopt; + struct hfsc_class *cl; + unsigned int i; + + sch->qstats.backlog = 0; + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) + sch->qstats.backlog += cl->qdisc->qstats.backlog; + } qopt.defcls = q->defcls; - RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) + goto nla_put_failure; return skb->len; - rtattr_failure: - skb_trim(skb, b - skb->data); + nla_put_failure: + nlmsg_trim(skb, b); return -1; } @@ -1638,32 +1583,28 @@ static int hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct hfsc_class *cl; - unsigned int len; - int err; + int uninitialized_var(err); cl = hfsc_classify(skb, sch, &err); if (cl == NULL) { - if (err == NET_XMIT_BYPASS) + if (err & __NET_XMIT_BYPASS) sch->qstats.drops++; 
kfree_skb(skb); return err; } - len = skb->len; - err = cl->qdisc->enqueue(skb, cl->qdisc); + err = qdisc_enqueue(skb, cl->qdisc); if (unlikely(err != NET_XMIT_SUCCESS)) { - cl->qstats.drops++; - sch->qstats.drops++; + if (net_xmit_drop_count(err)) { + cl->qstats.drops++; + sch->qstats.drops++; + } return err; } if (cl->qdisc->q.qlen == 1) - set_active(cl, len); + set_active(cl, qdisc_pkt_len(skb)); - cl->bstats.packets++; - cl->bstats.bytes += len; - sch->bstats.packets++; - sch->bstats.bytes += len; sch->q.qlen++; return NET_XMIT_SUCCESS; @@ -1681,17 +1622,16 @@ hfsc_dequeue(struct Qdisc *sch) if (sch->q.qlen == 0) return NULL; - if ((skb = __skb_dequeue(&q->requeue))) - goto out; - PSCHED_GET_TIME(cur_time); + cur_time = psched_get_time(); /* * if there are eligible classes, use real-time criteria. * find the class with the minimum deadline among * the eligible classes. */ - if ((cl = eltree_get_mindl(q, cur_time)) != NULL) { + cl = eltree_get_mindl(q, cur_time); + if (cl) { realtime = 1; } else { /* @@ -1701,21 +1641,21 @@ hfsc_dequeue(struct Qdisc *sch) cl = vttree_get_minvt(&q->root, cur_time); if (cl == NULL) { sch->qstats.overlimits++; - hfsc_schedule_watchdog(sch, cur_time); + hfsc_schedule_watchdog(sch); return NULL; } } - skb = cl->qdisc->dequeue(cl->qdisc); + skb = qdisc_dequeue_peeked(cl->qdisc); if (skb == NULL) { - if (net_ratelimit()) - printk("HFSC: Non-work-conserving qdisc ?\n"); + qdisc_warn_nonwc("HFSC", cl->qdisc); return NULL; } - update_vf(cl, skb->len, cur_time); + bstats_update(&cl->bstats, skb); + update_vf(cl, qdisc_pkt_len(skb), cur_time); if (realtime) - cl->cl_cumul += skb->len; + cl->cl_cumul += qdisc_pkt_len(skb); if (cl->qdisc->q.qlen != 0) { if (cl->cl_flags & HFSC_RSC) { @@ -1731,24 +1671,13 @@ hfsc_dequeue(struct Qdisc *sch) set_passive(cl); } - out: - sch->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(sch); + qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; } -static int -hfsc_requeue(struct sk_buff *skb, struct Qdisc *sch) -{ - struct hfsc_sched *q = qdisc_priv(sch); - - __skb_queue_head(&q->requeue, skb); - sch->q.qlen++; - sch->qstats.requeues++; - return NET_XMIT_SUCCESS; -} - static unsigned int hfsc_drop(struct Qdisc *sch) { @@ -1774,11 +1703,12 @@ hfsc_drop(struct Qdisc *sch) return 0; } -static struct Qdisc_class_ops hfsc_class_ops = { +static const struct Qdisc_class_ops hfsc_class_ops = { .change = hfsc_change_class, .delete = hfsc_delete_class, .graft = hfsc_graft_class, .leaf = hfsc_class_leaf, + .qlen_notify = hfsc_qlen_notify, .get = hfsc_get_class, .put = hfsc_put_class, .bind_tcf = hfsc_bind_tcf, @@ -1789,7 +1719,7 @@ static struct Qdisc_class_ops hfsc_class_ops = { .walk = hfsc_walk }; -static struct Qdisc_ops hfsc_qdisc_ops = { +static struct Qdisc_ops hfsc_qdisc_ops __read_mostly = { .id = "hfsc", .init = hfsc_init_qdisc, .change = hfsc_change_qdisc, @@ -1798,7 +1728,7 @@ static struct Qdisc_ops hfsc_qdisc_ops = { .dump = hfsc_dump_qdisc, .enqueue = hfsc_enqueue, .dequeue = hfsc_dequeue, - .requeue = hfsc_requeue, + .peek = qdisc_peek_dequeued, .drop = hfsc_drop, .cl_ops = &hfsc_class_ops, .priv_size = sizeof(struct hfsc_sched), diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c new file mode 100644 index 00000000000..d85b6812a7d --- /dev/null +++ b/net/sched/sch_hhf.c @@ -0,0 +1,740 @@ +/* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF) + * + * Copyright (C) 2013 Terry Lam <vtlam@google.com> + * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com> + */ + +#include <linux/jhash.h> +#include <linux/jiffies.h> 
+#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <net/flow_keys.h> +#include <net/pkt_sched.h> +#include <net/sock.h> + +/* Heavy-Hitter Filter (HHF) + * + * Principles : + * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter + * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified + * as heavy-hitter, it is immediately switched to the heavy-hitter bucket. + * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler, + * in which the heavy-hitter bucket is served with less weight. + * In other words, non-heavy-hitters (e.g., short bursts of critical traffic) + * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have + * higher share of bandwidth. + * + * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the + * following paper: + * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and + * Accounting", in ACM SIGCOMM, 2002. + * + * Conceptually, a multi-stage filter comprises k independent hash functions + * and k counter arrays. Packets are indexed into k counter arrays by k hash + * functions, respectively. The counters are then increased by the packet sizes. + * Therefore, + * - For a heavy-hitter flow: *all* of its k array counters must be large. + * - For a non-heavy-hitter flow: some of its k array counters can be large + * due to hash collision with other small flows; however, with high + * probability, not *all* k counters are large. + * + * By the design of the multi-stage filter algorithm, the false negative rate + * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is + * susceptible to false positives (non-heavy-hitters mistakenly classified as + * heavy-hitters). + * Therefore, we also implement the following optimizations to reduce false + * positives by avoiding unnecessary increment of the counter values: + * - Optimization O1: once a heavy-hitter is identified, its bytes are not + * accounted in the array counters. This technique is called "shielding" + * in Section 3.3.1 of [EV02]. + * - Optimization O2: conservative update of counters + * (Section 3.3.2 of [EV02]), + * New counter value = max {old counter value, + * smallest counter value + packet bytes} + * + * Finally, we refresh the counters periodically since otherwise the counter + * values will keep accumulating. + * + * Once a flow is classified as heavy-hitter, we also save its per-flow state + * in an exact-matching flow table so that its subsequent packets can be + * dispatched to the heavy-hitter bucket accordingly. + * + * + * At a high level, this qdisc works as follows: + * Given a packet p: + * - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching + * heavy-hitter flow table, denoted table T, then send p to the heavy-hitter + * bucket. + * - Otherwise, forward p to the multi-stage filter, denoted filter F + * + If F decides that p belongs to a non-heavy-hitter flow, then send p + * to the non-heavy-hitter bucket. + * + Otherwise, if F decides that p belongs to a new heavy-hitter flow, + * then set up a new flow entry for the flow-id of p in the table T and + * send p to the heavy-hitter bucket. + * + * In this implementation: + * - T is a fixed-size hash-table with 1024 entries. Hash collision is + * resolved by linked-list chaining. + * - F has four counter arrays, each array containing 1024 32-bit counters. + * That means 4 * 1024 * 32 bits = 16KB of memory. 
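Since the multi-stage filter is the heart of this new qdisc, here is a compact userspace sketch of the admission test described above: a flow is flagged as a heavy-hitter only if all k of the counters it hashes to would exceed the admission threshold, and otherwise the counters are raised conservatively (Optimization O2). The array sizes, threshold, and xor-folded 10-bit indexing mirror the description, but this is an illustration, not the kernel code that follows.

/*
 * Compact userspace sketch of the HHF admission test, not the kernel
 * implementation below. Assumed here: 4 arrays of 1024 counters, a
 * 128 KB admission threshold, and xor-folded 10-bit indexing.
 */
#include <stdbool.h>
#include <stdint.h>

#define K	4
#define SLOTS	1024			/* 10-bit index per array */
#define MASK	(SLOTS - 1)
#define ADMIT	(128 * 1024)		/* bytes before a flow counts as a HH */

static uint32_t counters[K][SLOTS];

static bool filter_update(uint32_t flow_hash, uint32_t pkt_len)
{
	uint32_t idx[K], xorsum = 0, min = UINT32_MAX;
	int i;

	/* derive K indexes from one 32-bit hash: three 10-bit chunks + their XOR */
	for (i = 0; i < K - 1; i++) {
		idx[i] = flow_hash & MASK;
		xorsum ^= idx[i];
		flow_hash >>= 10;
	}
	idx[K - 1] = (xorsum ^ flow_hash) & MASK;

	/* the smallest of the K incremented counters decides */
	for (i = 0; i < K; i++) {
		uint32_t val = counters[i][idx[i]] + pkt_len;
		if (val < min)
			min = val;
	}
	if (min > ADMIT)
		return true;	/* every counter is large: heavy-hitter (shielded, no update) */

	/* conservative update (O2): raise counters only up to the new minimum */
	for (i = 0; i < K; i++)
		if (counters[i][idx[i]] < min)
			counters[i][idx[i]] = min;
	return false;
}

With the default 40 ms reset interval this means a flow has to push all four of its counters past 128 KB within one interval, i.e. sustain roughly 25 Mbit/s, before it is demoted to the heavy-hitter bucket.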
+ * - Since each array in F contains 1024 counters, 10 bits are sufficient to + * index into each array. + * Hence, instead of having four hash functions, we chop the 32-bit + * skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is + * computed as XOR sum of those three chunks. + * - We need to clear the counter arrays periodically; however, directly + * memsetting 16KB of memory can lead to cache eviction and unwanted delay. + * So by representing each counter by a valid bit, we only need to reset + * 4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory. + * - The Deficit Round Robin engine is taken from fq_codel implementation + * (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to + * fq_codel_flow in fq_codel implementation. + * + */ + +/* Non-configurable parameters */ +#define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */ +#define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */ +#define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */ +#define HHF_BIT_MASK_LEN 10 /* masking 10 bits */ +#define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */ + +#define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */ +enum wdrr_bucket_idx { + WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */ + WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */ +}; + +#define hhf_time_before(a, b) \ + (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0)) + +/* Heavy-hitter per-flow state */ +struct hh_flow_state { + u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */ + u32 hit_timestamp; /* last time heavy-hitter was seen */ + struct list_head flowchain; /* chaining under hash collision */ +}; + +/* Weighted Deficit Round Robin (WDRR) scheduler */ +struct wdrr_bucket { + struct sk_buff *head; + struct sk_buff *tail; + struct list_head bucketchain; + int deficit; +}; + +struct hhf_sched_data { + struct wdrr_bucket buckets[WDRR_BUCKET_CNT]; + u32 perturbation; /* hash perturbation */ + u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ + u32 drop_overlimit; /* number of times max qdisc packet + * limit was hit + */ + struct list_head *hh_flows; /* table T (currently active HHs) */ + u32 hh_flows_limit; /* max active HH allocs */ + u32 hh_flows_overlimit; /* num of disallowed HH allocs */ + u32 hh_flows_total_cnt; /* total admitted HHs */ + u32 hh_flows_current_cnt; /* total current HHs */ + u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */ + u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays + * was reset + */ + unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits + * of hhf_arrays + */ + /* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */ + struct list_head new_buckets; /* list of new buckets */ + struct list_head old_buckets; /* list of old buckets */ + + /* Configurable HHF parameters */ + u32 hhf_reset_timeout; /* interval to reset counter + * arrays in filter F + * (default 40ms) + */ + u32 hhf_admit_bytes; /* counter thresh to classify as + * HH (default 128KB). + * With these default values, + * 128KB / 40ms = 25 Mbps + * i.e., we expect to capture HHs + * sending > 25 Mbps. + */ + u32 hhf_evict_timeout; /* aging threshold to evict idle + * HHs out of table T. This should + * be large enough to avoid + * reordering during HH eviction. 
+ * (default 1s) + */ + u32 hhf_non_hh_weight; /* WDRR weight for non-HHs + * (default 2, + * i.e., non-HH : HH = 2 : 1) + */ +}; + +static u32 hhf_time_stamp(void) +{ + return jiffies; +} + +static unsigned int skb_hash(const struct hhf_sched_data *q, + const struct sk_buff *skb) +{ + struct flow_keys keys; + unsigned int hash; + + if (skb->sk && skb->sk->sk_hash) + return skb->sk->sk_hash; + + skb_flow_dissect(skb, &keys); + hash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src ^ keys.ip_proto, + (__force u32)keys.ports, q->perturbation); + return hash; +} + +/* Looks up a heavy-hitter flow in a chaining list of table T. */ +static struct hh_flow_state *seek_list(const u32 hash, + struct list_head *head, + struct hhf_sched_data *q) +{ + struct hh_flow_state *flow, *next; + u32 now = hhf_time_stamp(); + + if (list_empty(head)) + return NULL; + + list_for_each_entry_safe(flow, next, head, flowchain) { + u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; + + if (hhf_time_before(prev, now)) { + /* Delete expired heavy-hitters, but preserve one entry + * to avoid kzalloc() when next time this slot is hit. + */ + if (list_is_last(&flow->flowchain, head)) + return NULL; + list_del(&flow->flowchain); + kfree(flow); + q->hh_flows_current_cnt--; + } else if (flow->hash_id == hash) { + return flow; + } + } + return NULL; +} + +/* Returns a flow state entry for a new heavy-hitter. Either reuses an expired + * entry or dynamically alloc a new entry. + */ +static struct hh_flow_state *alloc_new_hh(struct list_head *head, + struct hhf_sched_data *q) +{ + struct hh_flow_state *flow; + u32 now = hhf_time_stamp(); + + if (!list_empty(head)) { + /* Find an expired heavy-hitter flow entry. */ + list_for_each_entry(flow, head, flowchain) { + u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; + + if (hhf_time_before(prev, now)) + return flow; + } + } + + if (q->hh_flows_current_cnt >= q->hh_flows_limit) { + q->hh_flows_overlimit++; + return NULL; + } + /* Create new entry. */ + flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC); + if (!flow) + return NULL; + + q->hh_flows_current_cnt++; + INIT_LIST_HEAD(&flow->flowchain); + list_add_tail(&flow->flowchain, head); + + return flow; +} + +/* Assigns packets to WDRR buckets. Implements a multi-stage filter to + * classify heavy-hitters. + */ +static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + u32 tmp_hash, hash; + u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos; + struct hh_flow_state *flow; + u32 pkt_len, min_hhf_val; + int i; + u32 prev; + u32 now = hhf_time_stamp(); + + /* Reset the HHF counter arrays if this is the right time. */ + prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout; + if (hhf_time_before(prev, now)) { + for (i = 0; i < HHF_ARRAYS_CNT; i++) + bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN); + q->hhf_arrays_reset_timestamp = now; + } + + /* Get hashed flow-id of the skb. */ + hash = skb_hash(q, skb); + + /* Check if this packet belongs to an already established HH flow. */ + flow_pos = hash & HHF_BIT_MASK; + flow = seek_list(hash, &q->hh_flows[flow_pos], q); + if (flow) { /* found its HH flow */ + flow->hit_timestamp = now; + return WDRR_BUCKET_FOR_HH; + } + + /* Now pass the packet through the multi-stage filter. */ + tmp_hash = hash; + xorsum = 0; + for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) { + /* Split the skb_hash into three 10-bit chunks. 
*/ + filter_pos[i] = tmp_hash & HHF_BIT_MASK; + xorsum ^= filter_pos[i]; + tmp_hash >>= HHF_BIT_MASK_LEN; + } + /* The last chunk is computed as XOR sum of other chunks. */ + filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash; + + pkt_len = qdisc_pkt_len(skb); + min_hhf_val = ~0U; + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + u32 val; + + if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) { + q->hhf_arrays[i][filter_pos[i]] = 0; + __set_bit(filter_pos[i], q->hhf_valid_bits[i]); + } + + val = q->hhf_arrays[i][filter_pos[i]] + pkt_len; + if (min_hhf_val > val) + min_hhf_val = val; + } + + /* Found a new HH iff all counter values > HH admit threshold. */ + if (min_hhf_val > q->hhf_admit_bytes) { + /* Just captured a new heavy-hitter. */ + flow = alloc_new_hh(&q->hh_flows[flow_pos], q); + if (!flow) /* memory alloc problem */ + return WDRR_BUCKET_FOR_NON_HH; + flow->hash_id = hash; + flow->hit_timestamp = now; + q->hh_flows_total_cnt++; + + /* By returning without updating counters in q->hhf_arrays, + * we implicitly implement "shielding" (see Optimization O1). + */ + return WDRR_BUCKET_FOR_HH; + } + + /* Conservative update of HHF arrays (see Optimization O2). */ + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val) + q->hhf_arrays[i][filter_pos[i]] = min_hhf_val; + } + return WDRR_BUCKET_FOR_NON_HH; +} + +/* Removes one skb from head of bucket. */ +static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket) +{ + struct sk_buff *skb = bucket->head; + + bucket->head = skb->next; + skb->next = NULL; + return skb; +} + +/* Tail-adds skb to bucket. */ +static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) +{ + if (bucket->head == NULL) + bucket->head = skb; + else + bucket->tail->next = skb; + bucket->tail = skb; + skb->next = NULL; +} + +static unsigned int hhf_drop(struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct wdrr_bucket *bucket; + + /* Always try to drop from heavy-hitters first. */ + bucket = &q->buckets[WDRR_BUCKET_FOR_HH]; + if (!bucket->head) + bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH]; + + if (bucket->head) { + struct sk_buff *skb = dequeue_head(bucket); + + sch->q.qlen--; + sch->qstats.drops++; + sch->qstats.backlog -= qdisc_pkt_len(skb); + kfree_skb(skb); + } + + /* Return id of the bucket from which the packet was dropped. */ + return bucket - q->buckets; +} + +static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + enum wdrr_bucket_idx idx; + struct wdrr_bucket *bucket; + + idx = hhf_classify(skb, sch); + + bucket = &q->buckets[idx]; + bucket_add(bucket, skb); + sch->qstats.backlog += qdisc_pkt_len(skb); + + if (list_empty(&bucket->bucketchain)) { + unsigned int weight; + + /* The logic of new_buckets vs. old_buckets is the same as + * new_flows vs. old_flows in the implementation of fq_codel, + * i.e., short bursts of non-HHs should have strict priority. + */ + if (idx == WDRR_BUCKET_FOR_HH) { + /* Always move heavy-hitters to old bucket. */ + weight = 1; + list_add_tail(&bucket->bucketchain, &q->old_buckets); + } else { + weight = q->hhf_non_hh_weight; + list_add_tail(&bucket->bucketchain, &q->new_buckets); + } + bucket->deficit = weight * q->quantum; + } + if (++sch->q.qlen <= sch->limit) + return NET_XMIT_SUCCESS; + + q->drop_overlimit++; + /* Return Congestion Notification only if we dropped a packet from this + * bucket. 
+ */ + if (hhf_drop(sch) == idx) + return NET_XMIT_CN; + + /* As we dropped a packet, better let upper stack know this. */ + qdisc_tree_decrease_qlen(sch, 1); + return NET_XMIT_SUCCESS; +} + +static struct sk_buff *hhf_dequeue(struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = NULL; + struct wdrr_bucket *bucket; + struct list_head *head; + +begin: + head = &q->new_buckets; + if (list_empty(head)) { + head = &q->old_buckets; + if (list_empty(head)) + return NULL; + } + bucket = list_first_entry(head, struct wdrr_bucket, bucketchain); + + if (bucket->deficit <= 0) { + int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ? + 1 : q->hhf_non_hh_weight; + + bucket->deficit += weight * q->quantum; + list_move_tail(&bucket->bucketchain, &q->old_buckets); + goto begin; + } + + if (bucket->head) { + skb = dequeue_head(bucket); + sch->q.qlen--; + sch->qstats.backlog -= qdisc_pkt_len(skb); + } + + if (!skb) { + /* Force a pass through old_buckets to prevent starvation. */ + if ((head == &q->new_buckets) && !list_empty(&q->old_buckets)) + list_move_tail(&bucket->bucketchain, &q->old_buckets); + else + list_del_init(&bucket->bucketchain); + goto begin; + } + qdisc_bstats_update(sch, skb); + bucket->deficit -= qdisc_pkt_len(skb); + + return skb; +} + +static void hhf_reset(struct Qdisc *sch) +{ + struct sk_buff *skb; + + while ((skb = hhf_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static void *hhf_zalloc(size_t sz) +{ + void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); + + if (!ptr) + ptr = vzalloc(sz); + + return ptr; +} + +static void hhf_free(void *addr) +{ + kvfree(addr); +} + +static void hhf_destroy(struct Qdisc *sch) +{ + int i; + struct hhf_sched_data *q = qdisc_priv(sch); + + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + hhf_free(q->hhf_arrays[i]); + hhf_free(q->hhf_valid_bits[i]); + } + + for (i = 0; i < HH_FLOWS_CNT; i++) { + struct hh_flow_state *flow, *next; + struct list_head *head = &q->hh_flows[i]; + + if (list_empty(head)) + continue; + list_for_each_entry_safe(flow, next, head, flowchain) { + list_del(&flow->flowchain); + kfree(flow); + } + } + hhf_free(q->hh_flows); +} + +static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { + [TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 }, + [TCA_HHF_QUANTUM] = { .type = NLA_U32 }, + [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 }, + [TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 }, + [TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 }, + [TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 }, + [TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 }, +}; + +static int hhf_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_HHF_MAX + 1]; + unsigned int qlen; + int err; + u64 non_hh_quantum; + u32 new_quantum = q->quantum; + u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy); + if (err < 0) + return err; + + if (tb[TCA_HHF_QUANTUM]) + new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]); + + if (tb[TCA_HHF_NON_HH_WEIGHT]) + new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]); + + non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight; + if (non_hh_quantum > INT_MAX) + return -EINVAL; + + sch_tree_lock(sch); + + if (tb[TCA_HHF_BACKLOG_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]); + + q->quantum = new_quantum; + q->hhf_non_hh_weight = new_hhf_non_hh_weight; + + if (tb[TCA_HHF_HH_FLOWS_LIMIT]) + q->hh_flows_limit = 
nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]); + + if (tb[TCA_HHF_RESET_TIMEOUT]) { + u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]); + + q->hhf_reset_timeout = usecs_to_jiffies(us); + } + + if (tb[TCA_HHF_ADMIT_BYTES]) + q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]); + + if (tb[TCA_HHF_EVICT_TIMEOUT]) { + u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]); + + q->hhf_evict_timeout = usecs_to_jiffies(us); + } + + qlen = sch->q.qlen; + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = hhf_dequeue(sch); + + kfree_skb(skb); + } + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + + sch_tree_unlock(sch); + return 0; +} + +static int hhf_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + int i; + + sch->limit = 1000; + q->quantum = psched_mtu(qdisc_dev(sch)); + q->perturbation = prandom_u32(); + INIT_LIST_HEAD(&q->new_buckets); + INIT_LIST_HEAD(&q->old_buckets); + + /* Configurable HHF parameters */ + q->hhf_reset_timeout = HZ / 25; /* 40 ms */ + q->hhf_admit_bytes = 131072; /* 128 KB */ + q->hhf_evict_timeout = HZ; /* 1 sec */ + q->hhf_non_hh_weight = 2; + + if (opt) { + int err = hhf_change(sch, opt); + + if (err) + return err; + } + + if (!q->hh_flows) { + /* Initialize heavy-hitter flow table. */ + q->hh_flows = hhf_zalloc(HH_FLOWS_CNT * + sizeof(struct list_head)); + if (!q->hh_flows) + return -ENOMEM; + for (i = 0; i < HH_FLOWS_CNT; i++) + INIT_LIST_HEAD(&q->hh_flows[i]); + + /* Cap max active HHs at twice len of hh_flows table. */ + q->hh_flows_limit = 2 * HH_FLOWS_CNT; + q->hh_flows_overlimit = 0; + q->hh_flows_total_cnt = 0; + q->hh_flows_current_cnt = 0; + + /* Initialize heavy-hitter filter arrays. */ + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN * + sizeof(u32)); + if (!q->hhf_arrays[i]) { + hhf_destroy(sch); + return -ENOMEM; + } + } + q->hhf_arrays_reset_timestamp = hhf_time_stamp(); + + /* Initialize valid bits of heavy-hitter filter arrays. */ + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN / + BITS_PER_BYTE); + if (!q->hhf_valid_bits[i]) { + hhf_destroy(sch); + return -ENOMEM; + } + } + + /* Initialize Weighted DRR buckets. 
*/ + for (i = 0; i < WDRR_BUCKET_CNT; i++) { + struct wdrr_bucket *bucket = q->buckets + i; + + INIT_LIST_HEAD(&bucket->bucketchain); + } + } + + return 0; +} + +static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) || + nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) || + nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT, + jiffies_to_usecs(q->hhf_reset_timeout)) || + nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) || + nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT, + jiffies_to_usecs(q->hhf_evict_timeout)) || + nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + return -1; +} + +static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct tc_hhf_xstats st = { + .drop_overlimit = q->drop_overlimit, + .hh_overlimit = q->hh_flows_overlimit, + .hh_tot_count = q->hh_flows_total_cnt, + .hh_cur_count = q->hh_flows_current_cnt, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { + .id = "hhf", + .priv_size = sizeof(struct hhf_sched_data), + + .enqueue = hhf_enqueue, + .dequeue = hhf_dequeue, + .peek = qdisc_peek_dequeued, + .drop = hhf_drop, + .init = hhf_init, + .reset = hhf_reset, + .destroy = hhf_destroy, + .change = hhf_change, + .dump = hhf_dump, + .dump_stats = hhf_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init hhf_module_init(void) +{ + return register_qdisc(&hhf_qdisc_ops); +} + +static void __exit hhf_module_exit(void) +{ + unregister_qdisc(&hhf_qdisc_ops); +} + +module_init(hhf_module_init) +module_exit(hhf_module_exit) +MODULE_AUTHOR("Terry Lam"); +MODULE_AUTHOR("Nandita Dukkipati"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 3ec95df4a85..9f949abcace 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1,4 +1,4 @@ -/* vim: ts=8 sw=8 +/* * net/sched/sch_htb.c Hierarchical token bucket, feed tree version * * This program is free software; you can redistribute it and/or @@ -11,7 +11,7 @@ * Credits (in time order) for older HTB versions: * Stef Coene <stef.coene@docum.org> * HTB support at LARTC mailing list - * Ondrej Kraus, <krauso@barr.cz> + * Ondrej Kraus, <krauso@barr.cz> * found missing INIT_QDISC(htb) * Vladimir Smelhaus, Aamer Akhter, Bert Hubert * helped a lot to locate nasty class stall bug @@ -24,267 +24,169 @@ * Jiri Fojtasek * fixed requeue routine * and many others. thanks. 
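Before the sch_htb.c changes below, one more note on sch_hhf.c: its two buckets are serviced by the weighted deficit round robin described in that file's header comment. The fragment below is a hedged sketch of just that scheduling decision, with the default weights assumed (non-HH = 2, HH = 1) and quantum standing in for the device MTU; it is a simplification of hhf_dequeue(), not a drop-in replacement.

/* Simplified sketch of the weighted DRR decision used by the two HHF buckets. */
struct bucket {
	int deficit;
	int weight;	/* 2 for non-heavy-hitters, 1 for heavy-hitters */
};

static int wdrr_pick(struct bucket *b, int quantum, int head_pkt_len)
{
	/*
	 * A bucket may send while its deficit is positive; when the deficit
	 * runs out it is recharged with weight * quantum and rotated to the
	 * back of the old-buckets list, so non-HH traffic (weight 2) ends up
	 * with roughly twice the bandwidth of the heavy-hitter bucket.
	 */
	if (b->deficit <= 0) {
		b->deficit += b->weight * quantum;
		return 0;			/* rotate bucket, try the next one */
	}
	b->deficit -= head_pkt_len;		/* charge the packet being sent */
	return 1;				/* dequeue the head packet from this bucket */
}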
- * - * $Id: sch_htb.c,v 1.25 2003/12/07 11:08:25 devik Exp devik $ */ -#include <linux/config.h> #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> +#include <linux/moduleparam.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> #include <linux/list.h> #include <linux/compiler.h> -#include <net/sock.h> -#include <net/pkt_sched.h> #include <linux/rbtree.h> +#include <linux/workqueue.h> +#include <linux/slab.h> +#include <net/netlink.h> +#include <net/sch_generic.h> +#include <net/pkt_sched.h> /* HTB algorithm. Author: devik@cdi.cz ======================================================================== HTB is like TBF with multiple classes. It is also similar to CBQ because - it allows to assign priority to each class in hierarchy. + it allows to assign priority to each class in hierarchy. In fact it is another implementation of Floyd's formal sharing. Levels: - Each class is assigned level. Leaf has ALWAYS level 0 and root + Each class is assigned level. Leaf has ALWAYS level 0 and root classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level one less than their parent. */ -#define HTB_HSIZE 16 /* classid hash size */ -#define HTB_EWMAC 2 /* rate average over HTB_EWMAC*HTB_HSIZE sec */ -#undef HTB_DEBUG /* compile debugging support (activated by tc tool) */ -#define HTB_RATECM 1 /* whether to use rate computer */ -#define HTB_HYSTERESIS 1/* whether to use mode hysteresis for speedup */ -#define HTB_QLOCK(S) spin_lock_bh(&(S)->dev->queue_lock) -#define HTB_QUNLOCK(S) spin_unlock_bh(&(S)->dev->queue_lock) -#define HTB_VER 0x30011 /* major must be matched with number suplied by TC as version */ +static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */ +#define HTB_VER 0x30011 /* major must be matched with number suplied by TC as version */ #if HTB_VER >> 16 != TC_HTB_PROTOVER #error "Mismatched sch_htb.c and pkt_sch.h" #endif -/* debugging support; S is subsystem, these are defined: - 0 - netlink messages - 1 - enqueue - 2 - drop & requeue - 3 - dequeue main - 4 - dequeue one prio DRR part - 5 - dequeue class accounting - 6 - class overlimit status computation - 7 - hint tree - 8 - event queue - 10 - rate estimator - 11 - classifier - 12 - fast dequeue cache - - L is level; 0 = none, 1 = basic info, 2 = detailed, 3 = full - q->debug uint32 contains 16 2-bit fields one for subsystem starting - from LSB - */ -#ifdef HTB_DEBUG -#define HTB_DBG_COND(S,L) (((q->debug>>(2*S))&3) >= L) -#define HTB_DBG(S,L,FMT,ARG...) if (HTB_DBG_COND(S,L)) \ - printk(KERN_DEBUG FMT,##ARG) -#define HTB_CHCL(cl) BUG_TRAP((cl)->magic == HTB_CMAGIC) -#define HTB_PASSQ q, -#define HTB_ARGQ struct htb_sched *q, -#define static -#undef __inline__ -#define __inline__ -#undef inline -#define inline -#define HTB_CMAGIC 0xFEFAFEF1 -#define htb_safe_rb_erase(N,R) do { BUG_TRAP((N)->rb_color != -1); \ - if ((N)->rb_color == -1) break; \ - rb_erase(N,R); \ - (N)->rb_color = -1; } while (0) -#else -#define HTB_DBG_COND(S,L) (0) -#define HTB_DBG(S,L,FMT,ARG...) 
-#define HTB_PASSQ -#define HTB_ARGQ -#define HTB_CHCL(cl) -#define htb_safe_rb_erase(N,R) rb_erase(N,R) -#endif +/* Module parameter and sysfs export */ +module_param (htb_hysteresis, int, 0640); +MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate"); +static int htb_rate_est = 0; /* htb classes have a default rate estimator */ +module_param(htb_rate_est, int, 0640); +MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes"); /* used internaly to keep status of single class */ enum htb_cmode { - HTB_CANT_SEND, /* class can't send and can't borrow */ - HTB_MAY_BORROW, /* class can't send but may borrow */ - HTB_CAN_SEND /* class can send */ + HTB_CANT_SEND, /* class can't send and can't borrow */ + HTB_MAY_BORROW, /* class can't send but may borrow */ + HTB_CAN_SEND /* class can send */ }; -/* interior & leaf nodes; props specific to leaves are marked L: */ -struct htb_class -{ -#ifdef HTB_DEBUG - unsigned magic; -#endif - /* general class parameters */ - u32 classid; - struct gnet_stats_basic bstats; - struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; - struct tc_htb_xstats xstats;/* our special stats */ - int refcnt; /* usage count of this class */ - -#ifdef HTB_RATECM - /* rate measurement counters */ - unsigned long rate_bytes,sum_bytes; - unsigned long rate_packets,sum_packets; -#endif +struct htb_prio { + union { + struct rb_root row; + struct rb_root feed; + }; + struct rb_node *ptr; + /* When class changes from state 1->2 and disconnects from + * parent's feed then we lost ptr value and start from the + * first child again. Here we store classid of the + * last valid ptr (used when ptr is NULL). + */ + u32 last_ptr_id; +}; - /* topology */ - int level; /* our level (see above) */ - struct htb_class *parent; /* parent class */ - struct list_head hlist; /* classid hash list item */ - struct list_head sibling; /* sibling list item */ - struct list_head children; /* children list */ - - union { - struct htb_class_leaf { - struct Qdisc *q; - int prio; - int aprio; - int quantum; - int deficit[TC_HTB_MAXDEPTH]; - struct list_head drop_list; - } leaf; - struct htb_class_inner { - struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */ - struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */ - /* When class changes from state 1->2 and disconnects from - parent's feed then we lost ptr value and start from the - first child again. Here we store classid of the - last valid ptr (used when ptr is NULL). */ - u32 last_ptr_id[TC_HTB_NUMPRIO]; - } inner; - } un; - struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ - struct rb_node pq_node; /* node for event queue */ - unsigned long pq_key; /* the same type as jiffies global */ - - int prio_activity; /* for which prios are we active */ - enum htb_cmode cmode; /* current mode of the class */ - - /* class attached filters */ - struct tcf_proto *filter_list; - int filter_cnt; - - int warned; /* only one warning about non work conserving .. 
*/ - - /* token bucket parameters */ - struct qdisc_rate_table *rate; /* rate table of the class itself */ - struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */ - long buffer,cbuffer; /* token bucket depth/rate */ - long mbuffer; /* max wait time */ - long tokens,ctokens; /* current number of tokens */ - psched_time_t t_c; /* checkpoint time */ +/* interior & leaf nodes; props specific to leaves are marked L: + * To reduce false sharing, place mostly read fields at beginning, + * and mostly written ones at the end. + */ +struct htb_class { + struct Qdisc_class_common common; + struct psched_ratecfg rate; + struct psched_ratecfg ceil; + s64 buffer, cbuffer;/* token bucket depth/rate */ + s64 mbuffer; /* max wait time */ + u32 prio; /* these two are used only by leaves... */ + int quantum; /* but stored for parent-to-leaf return */ + + struct tcf_proto *filter_list; /* class attached filters */ + int filter_cnt; + int refcnt; /* usage count of this class */ + + int level; /* our level (see above) */ + unsigned int children; + struct htb_class *parent; /* parent class */ + + struct gnet_stats_rate_est64 rate_est; + + /* + * Written often fields + */ + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; + struct tc_htb_xstats xstats; /* our special stats */ + + /* token bucket parameters */ + s64 tokens, ctokens;/* current number of tokens */ + s64 t_c; /* checkpoint time */ + + union { + struct htb_class_leaf { + struct list_head drop_list; + int deficit[TC_HTB_MAXDEPTH]; + struct Qdisc *q; + } leaf; + struct htb_class_inner { + struct htb_prio clprio[TC_HTB_NUMPRIO]; + } inner; + } un; + s64 pq_key; + + int prio_activity; /* for which prios are we active */ + enum htb_cmode cmode; /* current mode of the class */ + struct rb_node pq_node; /* node for event queue */ + struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ }; -/* TODO: maybe compute rate when size is too large .. or drop ? 
*/ -static __inline__ long L2T(struct htb_class *cl,struct qdisc_rate_table *rate, - int size) -{ - int slot = size >> rate->rate.cell_log; - if (slot > 255) { - cl->xstats.giants++; - slot = 255; - } - return rate->data[slot]; -} +struct htb_level { + struct rb_root wait_pq; + struct htb_prio hprio[TC_HTB_NUMPRIO]; +}; -struct htb_sched -{ - struct list_head root; /* root classes list */ - struct list_head hash[HTB_HSIZE]; /* hashed by classid */ - struct list_head drops[TC_HTB_NUMPRIO]; /* active leaves (for drops) */ - - /* self list - roots of self generating tree */ - struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; - int row_mask[TC_HTB_MAXDEPTH]; - struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; - u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; - - /* self wait list - roots of wait PQs per row */ - struct rb_root wait_pq[TC_HTB_MAXDEPTH]; - - /* time of nearest event per level (row) */ - unsigned long near_ev_cache[TC_HTB_MAXDEPTH]; - - /* cached value of jiffies in dequeue */ - unsigned long jiffies; - - /* whether we hit non-work conserving class during this dequeue; we use */ - int nwc_hit; /* this to disable mindelay complaint in dequeue */ - - int defcls; /* class where unclassified flows go to */ - u32 debug; /* subsystem debug levels */ - - /* filters for qdisc itself */ - struct tcf_proto *filter_list; - int filter_cnt; - - int rate2quantum; /* quant = rate / rate2quantum */ - psched_time_t now; /* cached dequeue time */ - struct timer_list timer; /* send delay timer */ -#ifdef HTB_RATECM - struct timer_list rttim; /* rate computer timer */ - int recmp_bucket; /* which hash bucket to recompute next */ -#endif - - /* non shaped skbs; let them go directly thru */ - struct sk_buff_head direct_queue; - int direct_qlen; /* max qlen of above */ +struct htb_sched { + struct Qdisc_class_hash clhash; + int defcls; /* class where unclassified flows go to */ + int rate2quantum; /* quant = rate / rate2quantum */ - long direct_pkts; -}; + /* filters for qdisc itself */ + struct tcf_proto *filter_list; -/* compute hash of size HTB_HSIZE for given handle */ -static __inline__ int htb_hash(u32 h) -{ -#if HTB_HSIZE != 16 - #error "Declare new hash for your HTB_HSIZE" -#endif - h ^= h>>8; /* stolen from cbq_hash */ - h ^= h>>4; - return h & 0xf; -} +#define HTB_WARN_TOOMANYEVENTS 0x1 + unsigned int warned; /* only one warning */ + int direct_qlen; + struct work_struct work; + + /* non shaped skbs; let them go directly thru */ + struct sk_buff_head direct_queue; + long direct_pkts; + + struct qdisc_watchdog watchdog; + + s64 now; /* cached dequeue time */ + struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */ + + /* time of nearest event per level (row) */ + s64 near_ev_cache[TC_HTB_MAXDEPTH]; + + int row_mask[TC_HTB_MAXDEPTH]; + + struct htb_level hlevel[TC_HTB_MAXDEPTH]; +}; /* find class in global hash table using given handle */ -static __inline__ struct htb_class *htb_find(u32 handle, struct Qdisc *sch) +static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch) { struct htb_sched *q = qdisc_priv(sch); - struct list_head *p; - if (TC_H_MAJ(handle) != sch->handle) + struct Qdisc_class_common *clc; + + clc = qdisc_class_find(&q->clhash, handle); + if (clc == NULL) return NULL; - - list_for_each (p,q->hash+htb_hash(handle)) { - struct htb_class *cl = list_entry(p,struct htb_class,hlist); - if (cl->classid == handle) - return cl; - } - return NULL; + return container_of(clc, struct htb_class, common); } /** @@ -295,17 +197,14 @@ static __inline__ 
struct htb_class *htb_find(u32 handle, struct Qdisc *sch) * We allow direct class selection by classid in priority. The we examine * filters in qdisc and in inner nodes (if higher filter points to the inner * node). If we end up with classid MAJOR:0 we enqueue the skb into special - * internal fifo (direct). These packets then go directly thru. If we still - * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessfull + * internal fifo (direct). These packets then go directly thru. If we still + * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful * then finish and return direct queue. */ -#define HTB_DIRECT (struct htb_class*)-1 -static inline u32 htb_classid(struct htb_class *cl) -{ - return (cl && cl != HTB_DIRECT) ? cl->classid : TC_H_UNSPEC; -} +#define HTB_DIRECT ((struct htb_class *)-1L) -static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl; @@ -314,119 +213,72 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, in int result; /* allow to select class by setting skb->priority to valid classid; - note that nfmark can be used too by attaching filter fw with no - rules in it */ + * note that nfmark can be used too by attaching filter fw with no + * rules in it + */ if (skb->priority == sch->handle) - return HTB_DIRECT; /* X:0 (direct flow) selected */ - if ((cl = htb_find(skb->priority,sch)) != NULL && cl->level == 0) - return cl; + return HTB_DIRECT; /* X:0 (direct flow) selected */ + cl = htb_find(skb->priority, sch); + if (cl) { + if (cl->level == 0) + return cl; + /* Start with inner filter chain if a non-leaf class is selected */ + tcf = cl->filter_list; + } else { + tcf = q->filter_list; + } - *qerr = NET_XMIT_BYPASS; - tcf = q->filter_list; + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: - case TC_ACT_STOLEN: - *qerr = NET_XMIT_SUCCESS; + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; } -#elif defined(CONFIG_NET_CLS_POLICE) - if (result == TC_POLICE_SHOT) - return HTB_DIRECT; #endif - if ((cl = (void*)res.class) == NULL) { + cl = (void *)res.class; + if (!cl) { if (res.classid == sch->handle) - return HTB_DIRECT; /* X:0 (direct flow) */ - if ((cl = htb_find(res.classid,sch)) == NULL) - break; /* filter selected invalid classid */ + return HTB_DIRECT; /* X:0 (direct flow) */ + cl = htb_find(res.classid, sch); + if (!cl) + break; /* filter selected invalid classid */ } if (!cl->level) - return cl; /* we hit leaf; return it */ + return cl; /* we hit leaf; return it */ /* we have got inner class; apply inner filter chain */ tcf = cl->filter_list; } /* classification failed; try to use default class */ - cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle),q->defcls),sch); + cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch); if (!cl || cl->level) - return HTB_DIRECT; /* bad default .. this is safe bet */ + return HTB_DIRECT; /* bad default .. 
this is safe bet */ return cl; } -#ifdef HTB_DEBUG -static void htb_next_rb_node(struct rb_node **n); -#define HTB_DUMTREE(root,memb) if(root) { \ - struct rb_node *n = (root)->rb_node; \ - while (n->rb_left) n = n->rb_left; \ - while (n) { \ - struct htb_class *cl = rb_entry(n, struct htb_class, memb); \ - printk(" %x",cl->classid); htb_next_rb_node (&n); \ - } } - -static void htb_debug_dump (struct htb_sched *q) -{ - int i,p; - printk(KERN_DEBUG "htb*g j=%lu lj=%lu\n",jiffies,q->jiffies); - /* rows */ - for (i=TC_HTB_MAXDEPTH-1;i>=0;i--) { - printk(KERN_DEBUG "htb*r%d m=%x",i,q->row_mask[i]); - for (p=0;p<TC_HTB_NUMPRIO;p++) { - if (!q->row[i][p].rb_node) continue; - printk(" p%d:",p); - HTB_DUMTREE(q->row[i]+p,node[p]); - } - printk("\n"); - } - /* classes */ - for (i = 0; i < HTB_HSIZE; i++) { - struct list_head *l; - list_for_each (l,q->hash+i) { - struct htb_class *cl = list_entry(l,struct htb_class,hlist); - long diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer); - printk(KERN_DEBUG "htb*c%x m=%d t=%ld c=%ld pq=%lu df=%ld ql=%d " - "pa=%x f:", - cl->classid,cl->cmode,cl->tokens,cl->ctokens, - cl->pq_node.rb_color==-1?0:cl->pq_key,diff, - cl->level?0:cl->un.leaf.q->q.qlen,cl->prio_activity); - if (cl->level) - for (p=0;p<TC_HTB_NUMPRIO;p++) { - if (!cl->un.inner.feed[p].rb_node) continue; - printk(" p%d a=%x:",p,cl->un.inner.ptr[p]?rb_entry(cl->un.inner.ptr[p], struct htb_class,node[p])->classid:0); - HTB_DUMTREE(cl->un.inner.feed+p,node[p]); - } - printk("\n"); - } - } -} -#endif /** * htb_add_to_id_tree - adds class to the round robin list * * Routine adds class to the list (actually tree) sorted by classid. * Make sure that class is not already on such list for given prio. */ -static void htb_add_to_id_tree (HTB_ARGQ struct rb_root *root, - struct htb_class *cl,int prio) +static void htb_add_to_id_tree(struct rb_root *root, + struct htb_class *cl, int prio) { struct rb_node **p = &root->rb_node, *parent = NULL; - HTB_DBG(7,3,"htb_add_id_tree cl=%X prio=%d\n",cl->classid,prio); -#ifdef HTB_DEBUG - if (cl->node[prio].rb_color != -1) { BUG_TRAP(0); return; } - HTB_CHCL(cl); - if (*p) { - struct htb_class *x = rb_entry(*p,struct htb_class,node[prio]); - HTB_CHCL(x); - } -#endif + while (*p) { - struct htb_class *c; parent = *p; + struct htb_class *c; + parent = *p; c = rb_entry(parent, struct htb_class, node[prio]); - HTB_CHCL(c); - if (cl->classid > c->classid) + + if (cl->common.classid > c->common.classid) p = &parent->rb_right; - else + else p = &parent->rb_left; } rb_link_node(&cl->node[prio], parent, p); @@ -440,35 +292,30 @@ static void htb_add_to_id_tree (HTB_ARGQ struct rb_root *root, * change its mode in cl->pq_key microseconds. Make sure that class is not * already in the queue. 
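Editor's note: the rewritten htb_classify() above resolves a packet in a fixed order. An skb->priority equal to the qdisc handle (X:0) goes to the direct queue, a priority naming a leaf class is used as-is, otherwise the tc filter chain runs (starting from the selected inner class, if any), and on failure the MAJOR:defcls leaf is tried before falling back to the direct queue. A simplified userspace model of that order follows; toy_find() and the static class table are stand-ins for htb_find() and the class hash, and the filter walk is elided.

#include <stddef.h>
#include <stdint.h>

enum verdict { TO_DIRECT, TO_LEAF };

struct toy_class { uint32_t classid; int level; };

static struct toy_class classes[] = {
        { 0x00010001u, 0 },     /* leaf 1:1 */
        { 0x00010010u, 1 },     /* inner class 1:10 */
};

static struct toy_class *toy_find(uint32_t classid)
{
        for (size_t i = 0; i < sizeof(classes) / sizeof(classes[0]); i++)
                if (classes[i].classid == classid)
                        return &classes[i];
        return NULL;
}

static enum verdict toy_classify(uint32_t skb_priority, uint32_t handle,
                                 uint32_t defcls, struct toy_class **leaf)
{
        struct toy_class *cl;

        if (skb_priority == handle)             /* X:0 selects the direct queue */
                return TO_DIRECT;

        cl = toy_find(skb_priority);
        if (cl && cl->level == 0) {             /* skb->priority named a leaf */
                *leaf = cl;
                return TO_LEAF;
        }
        /* ...the real code runs the tc filter chain(s) here... */

        cl = toy_find((handle & 0xffff0000u) | defcls); /* MAJOR:defcls */
        if (cl && cl->level == 0) {
                *leaf = cl;
                return TO_LEAF;
        }
        return TO_DIRECT;                       /* bad default: safe bet */
}

int main(void)
{
        struct toy_class *leaf = NULL;

        /* an unclassified packet on qdisc 1:0 with defcls 1 ends up in leaf 1:1 */
        return toy_classify(0, 0x00010000u, 1, &leaf) != TO_LEAF;
}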
*/ -static void htb_add_to_wait_tree (struct htb_sched *q, - struct htb_class *cl,long delay,int debug_hint) +static void htb_add_to_wait_tree(struct htb_sched *q, + struct htb_class *cl, s64 delay) { - struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL; - HTB_DBG(7,3,"htb_add_wt cl=%X key=%lu\n",cl->classid,cl->pq_key); -#ifdef HTB_DEBUG - if (cl->pq_node.rb_color != -1) { BUG_TRAP(0); return; } - HTB_CHCL(cl); - if ((delay <= 0 || delay > cl->mbuffer) && net_ratelimit()) - printk(KERN_ERR "HTB: suspicious delay in wait_tree d=%ld cl=%X h=%d\n",delay,cl->classid,debug_hint); -#endif - cl->pq_key = q->jiffies + PSCHED_US2JIFFIE(delay); - if (cl->pq_key == q->jiffies) + struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL; + + cl->pq_key = q->now + delay; + if (cl->pq_key == q->now) cl->pq_key++; /* update the nearest event cache */ - if (time_after(q->near_ev_cache[cl->level], cl->pq_key)) + if (q->near_ev_cache[cl->level] > cl->pq_key) q->near_ev_cache[cl->level] = cl->pq_key; - + while (*p) { - struct htb_class *c; parent = *p; + struct htb_class *c; + parent = *p; c = rb_entry(parent, struct htb_class, pq_node); - if (time_after_eq(cl->pq_key, c->pq_key)) + if (cl->pq_key >= c->pq_key) p = &parent->rb_right; - else + else p = &parent->rb_left; } rb_link_node(&cl->pq_node, parent, p); - rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]); + rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq); } /** @@ -477,7 +324,7 @@ static void htb_add_to_wait_tree (struct htb_sched *q, * When we are past last key we return NULL. * Average complexity is 2 steps per call. */ -static void htb_next_rb_node(struct rb_node **n) +static inline void htb_next_rb_node(struct rb_node **n) { *n = rb_next(*n); } @@ -488,42 +335,53 @@ static void htb_next_rb_node(struct rb_node **n) * The class is added to row at priorities marked in mask. * It does nothing if mask == 0. */ -static inline void htb_add_class_to_row(struct htb_sched *q, - struct htb_class *cl,int mask) +static inline void htb_add_class_to_row(struct htb_sched *q, + struct htb_class *cl, int mask) { - HTB_DBG(7,2,"htb_addrow cl=%X mask=%X rmask=%X\n", - cl->classid,mask,q->row_mask[cl->level]); - HTB_CHCL(cl); q->row_mask[cl->level] |= mask; while (mask) { int prio = ffz(~mask); mask &= ~(1 << prio); - htb_add_to_id_tree(HTB_PASSQ q->row[cl->level]+prio,cl,prio); + htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio); } } +/* If this triggers, it is a bug in this code, but it need not be fatal */ +static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root) +{ + if (RB_EMPTY_NODE(rb)) { + WARN_ON(1); + } else { + rb_erase(rb, root); + RB_CLEAR_NODE(rb); + } +} + + /** * htb_remove_class_from_row - removes class from its row * * The class is removed from row at priorities marked in mask. * It does nothing if mask == 0. 
*/ -static __inline__ void htb_remove_class_from_row(struct htb_sched *q, - struct htb_class *cl,int mask) +static inline void htb_remove_class_from_row(struct htb_sched *q, + struct htb_class *cl, int mask) { int m = 0; - HTB_CHCL(cl); + struct htb_level *hlevel = &q->hlevel[cl->level]; + while (mask) { int prio = ffz(~mask); + struct htb_prio *hprio = &hlevel->hprio[prio]; + mask &= ~(1 << prio); - if (q->ptr[cl->level][prio] == cl->node+prio) - htb_next_rb_node(q->ptr[cl->level]+prio); - htb_safe_rb_erase(cl->node + prio,q->row[cl->level]+prio); - if (!q->row[cl->level][prio].rb_node) + if (hprio->ptr == cl->node + prio) + htb_next_rb_node(&hprio->ptr); + + htb_safe_rb_erase(cl->node + prio, &hprio->row); + if (!hprio->row.rb_node) m |= 1 << prio; } - HTB_DBG(7,2,"htb_delrow cl=%X mask=%X rmask=%X maskdel=%X\n", - cl->classid,mask,q->row_mask[cl->level],m); q->row_mask[cl->level] &= ~m; } @@ -531,115 +389,123 @@ static __inline__ void htb_remove_class_from_row(struct htb_sched *q, * htb_activate_prios - creates active classe's feed chain * * The class is connected to ancestors and/or appropriate rows - * for priorities it is participating on. cl->cmode must be new + * for priorities it is participating on. cl->cmode must be new * (activated) mode. It does nothing if cl->prio_activity == 0. */ -static void htb_activate_prios(struct htb_sched *q,struct htb_class *cl) +static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) { struct htb_class *p = cl->parent; - long m,mask = cl->prio_activity; - HTB_DBG(7,2,"htb_act_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode); - HTB_CHCL(cl); + long m, mask = cl->prio_activity; while (cl->cmode == HTB_MAY_BORROW && p && mask) { - HTB_CHCL(p); - m = mask; while (m) { + m = mask; + while (m) { int prio = ffz(~m); m &= ~(1 << prio); - - if (p->un.inner.feed[prio].rb_node) + + if (p->un.inner.clprio[prio].feed.rb_node) /* parent already has its feed in use so that - reset bit in mask as parent is already ok */ + * reset bit in mask as parent is already ok + */ mask &= ~(1 << prio); - - htb_add_to_id_tree(HTB_PASSQ p->un.inner.feed+prio,cl,prio); + + htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio); } - HTB_DBG(7,3,"htb_act_pr_aft p=%X pact=%X mask=%lX pmode=%d\n", - p->classid,p->prio_activity,mask,p->cmode); p->prio_activity |= mask; - cl = p; p = cl->parent; - HTB_CHCL(cl); + cl = p; + p = cl->parent; + } if (cl->cmode == HTB_CAN_SEND && mask) - htb_add_class_to_row(q,cl,mask); + htb_add_class_to_row(q, cl, mask); } /** * htb_deactivate_prios - remove class from feed chain * - * cl->cmode must represent old mode (before deactivation). It does + * cl->cmode must represent old mode (before deactivation). It does * nothing if cl->prio_activity == 0. Class is removed from all feed * chains and rows. 
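Editor's note: both the row and feed maintenance above iterate a priority bitmask with the ffz(~mask) idiom, i.e. "index of the lowest set bit of mask", clearing each bit as it is handled. A tiny userspace equivalent, using the GCC/Clang builtin in place of the kernel's ffz():

#include <stdio.h>

int main(void)
{
        unsigned int mask = 0x29;       /* priorities 0, 3 and 5 active */

        while (mask) {
                int prio = __builtin_ctz(mask); /* == ffz(~mask) in the kernel */

                mask &= ~(1u << prio);
                printf("handle priority %d\n", prio);
        }
        return 0;
}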
*/ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) { struct htb_class *p = cl->parent; - long m,mask = cl->prio_activity; - HTB_DBG(7,2,"htb_deact_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode); - HTB_CHCL(cl); + long m, mask = cl->prio_activity; while (cl->cmode == HTB_MAY_BORROW && p && mask) { - m = mask; mask = 0; + m = mask; + mask = 0; while (m) { int prio = ffz(~m); m &= ~(1 << prio); - - if (p->un.inner.ptr[prio] == cl->node+prio) { + + if (p->un.inner.clprio[prio].ptr == cl->node + prio) { /* we are removing child which is pointed to from - parent feed - forget the pointer but remember - classid */ - p->un.inner.last_ptr_id[prio] = cl->classid; - p->un.inner.ptr[prio] = NULL; + * parent feed - forget the pointer but remember + * classid + */ + p->un.inner.clprio[prio].last_ptr_id = cl->common.classid; + p->un.inner.clprio[prio].ptr = NULL; } - - htb_safe_rb_erase(cl->node + prio,p->un.inner.feed + prio); - - if (!p->un.inner.feed[prio].rb_node) + + htb_safe_rb_erase(cl->node + prio, + &p->un.inner.clprio[prio].feed); + + if (!p->un.inner.clprio[prio].feed.rb_node) mask |= 1 << prio; } - HTB_DBG(7,3,"htb_deact_pr_aft p=%X pact=%X mask=%lX pmode=%d\n", - p->classid,p->prio_activity,mask,p->cmode); + p->prio_activity &= ~mask; - cl = p; p = cl->parent; - HTB_CHCL(cl); + cl = p; + p = cl->parent; + } - if (cl->cmode == HTB_CAN_SEND && mask) - htb_remove_class_from_row(q,cl,mask); + if (cl->cmode == HTB_CAN_SEND && mask) + htb_remove_class_from_row(q, cl, mask); +} + +static inline s64 htb_lowater(const struct htb_class *cl) +{ + if (htb_hysteresis) + return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0; + else + return 0; } +static inline s64 htb_hiwater(const struct htb_class *cl) +{ + if (htb_hysteresis) + return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0; + else + return 0; +} + /** * htb_class_mode - computes and returns current class mode * * It computes cl's mode at time cl->t_c+diff and returns it. If mode * is not HTB_CAN_SEND then cl->pq_key is updated to time difference - * from now to time when cl will change its state. + * from now to time when cl will change its state. * Also it is worth to note that class mode doesn't change simply - * at cl->{c,}tokens == 0 but there can rather be hysteresis of + * at cl->{c,}tokens == 0 but there can rather be hysteresis of * 0 .. -cl->{c,}buffer range. It is meant to limit number of * mode transitions per time unit. The speed gain is about 1/6. */ -static __inline__ enum htb_cmode -htb_class_mode(struct htb_class *cl,long *diff) +static inline enum htb_cmode +htb_class_mode(struct htb_class *cl, s64 *diff) { - long toks; + s64 toks; - if ((toks = (cl->ctokens + *diff)) < ( -#if HTB_HYSTERESIS - cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : -#endif - 0)) { - *diff = -toks; - return HTB_CANT_SEND; - } - if ((toks = (cl->tokens + *diff)) >= ( -#if HTB_HYSTERESIS - cl->cmode == HTB_CAN_SEND ? -cl->buffer : -#endif - 0)) - return HTB_CAN_SEND; + if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) { + *diff = -toks; + return HTB_CANT_SEND; + } - *diff = -toks; - return HTB_MAY_BORROW; + if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl)) + return HTB_CAN_SEND; + + *diff = -toks; + return HTB_MAY_BORROW; } /** @@ -651,167 +517,118 @@ htb_class_mode(struct htb_class *cl,long *diff) * be different from old one and cl->pq_key has to be valid if changing * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree). 
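Editor's note: htb_class_mode() together with the new htb_lowater()/htb_hiwater() helpers decides among the three modes from the two token buckets; with the htb_hysteresis module parameter set, the thresholds move into the 0 .. -{c,}buffer band so a class does not flip modes on every packet. A userspace sketch of the same decision, with s64 replaced by int64_t and the module parameter modeled as a plain int:

#include <stdint.h>

enum toy_cmode { CANT_SEND, MAY_BORROW, CAN_SEND };

struct toy_class {
        int64_t tokens, ctokens;        /* rate and ceil buckets */
        int64_t buffer, cbuffer;        /* bucket depths */
        enum toy_cmode cmode;
};

static int hysteresis = 1;              /* models the htb_hysteresis parameter */

static int64_t lowater(const struct toy_class *cl)
{
        return hysteresis && cl->cmode != CANT_SEND ? -cl->cbuffer : 0;
}

static int64_t hiwater(const struct toy_class *cl)
{
        return hysteresis && cl->cmode == CAN_SEND ? -cl->buffer : 0;
}

/* 'diff' is the time credit since the last checkpoint; on a non-CAN_SEND
 * result it is overwritten with the shortfall, which the callers turn into
 * the delay queued on the level's wait_pq */
static enum toy_cmode class_mode(struct toy_class *cl, int64_t *diff)
{
        int64_t toks;

        if ((toks = cl->ctokens + *diff) < lowater(cl)) {
                *diff = -toks;
                return CANT_SEND;       /* over ceil: may not even borrow */
        }
        if ((toks = cl->tokens + *diff) >= hiwater(cl))
                return CAN_SEND;        /* within rate: sends on its own */

        *diff = -toks;
        return MAY_BORROW;              /* over rate, under ceil: must borrow */
}

int main(void)
{
        struct toy_class cl = { .tokens = -10, .ctokens = 50,
                                .buffer = 100, .cbuffer = 100,
                                .cmode = CAN_SEND };
        int64_t diff = 0;

        /* stays CAN_SEND thanks to hysteresis; would flip to MAY_BORROW
         * (and soon back) with hysteresis disabled */
        return class_mode(&cl, &diff) != CAN_SEND;
}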
*/ -static void -htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff) -{ - enum htb_cmode new_mode = htb_class_mode(cl,diff); - - HTB_CHCL(cl); - HTB_DBG(7,1,"htb_chging_clmode %d->%d cl=%X\n",cl->cmode,new_mode,cl->classid); +static void +htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) +{ + enum htb_cmode new_mode = htb_class_mode(cl, diff); if (new_mode == cl->cmode) - return; - - if (cl->prio_activity) { /* not necessary: speed optimization */ - if (cl->cmode != HTB_CANT_SEND) - htb_deactivate_prios(q,cl); + return; + + if (cl->prio_activity) { /* not necessary: speed optimization */ + if (cl->cmode != HTB_CANT_SEND) + htb_deactivate_prios(q, cl); cl->cmode = new_mode; - if (new_mode != HTB_CANT_SEND) - htb_activate_prios(q,cl); - } else + if (new_mode != HTB_CANT_SEND) + htb_activate_prios(q, cl); + } else cl->cmode = new_mode; } /** - * htb_activate - inserts leaf cl into appropriate active feeds + * htb_activate - inserts leaf cl into appropriate active feeds * * Routine learns (new) priority of leaf and activates feed chain * for the prio. It can be called on already active leaf safely. * It also adds leaf into droplist. */ -static __inline__ void htb_activate(struct htb_sched *q,struct htb_class *cl) +static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) { - BUG_TRAP(!cl->level && cl->un.leaf.q && cl->un.leaf.q->q.qlen); - HTB_CHCL(cl); + WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen); + if (!cl->prio_activity) { - cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio); - htb_activate_prios(q,cl); - list_add_tail(&cl->un.leaf.drop_list,q->drops+cl->un.leaf.aprio); + cl->prio_activity = 1 << cl->prio; + htb_activate_prios(q, cl); + list_add_tail(&cl->un.leaf.drop_list, + q->drops + cl->prio); } } /** - * htb_deactivate - remove leaf cl from active feeds + * htb_deactivate - remove leaf cl from active feeds * * Make sure that leaf is active. In the other words it can't be called * with non-active leaf. It also removes class from the drop list. 
*/ -static __inline__ void -htb_deactivate(struct htb_sched *q,struct htb_class *cl) +static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) { - BUG_TRAP(cl->prio_activity); - HTB_CHCL(cl); - htb_deactivate_prios(q,cl); + WARN_ON(!cl->prio_activity); + + htb_deactivate_prios(q, cl); cl->prio_activity = 0; list_del_init(&cl->un.leaf.drop_list); } static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - int ret; - struct htb_sched *q = qdisc_priv(sch); - struct htb_class *cl = htb_classify(skb,sch,&ret); - - if (cl == HTB_DIRECT) { - /* enqueue to helper queue */ - if (q->direct_queue.qlen < q->direct_qlen) { - __skb_queue_tail(&q->direct_queue, skb); - q->direct_pkts++; - } else { - kfree_skb(skb); - sch->qstats.drops++; - return NET_XMIT_DROP; - } + int uninitialized_var(ret); + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = htb_classify(skb, sch, &ret); + + if (cl == HTB_DIRECT) { + /* enqueue to helper queue */ + if (q->direct_queue.qlen < q->direct_qlen) { + __skb_queue_tail(&q->direct_queue, skb); + q->direct_pkts++; + } else { + return qdisc_drop(skb, sch); + } #ifdef CONFIG_NET_CLS_ACT - } else if (!cl) { - if (ret == NET_XMIT_BYPASS) - sch->qstats.drops++; - kfree_skb (skb); - return ret; + } else if (!cl) { + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; #endif - } else if (cl->un.leaf.q->enqueue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) { - sch->qstats.drops++; - cl->qstats.drops++; - return NET_XMIT_DROP; - } else { - cl->bstats.packets++; cl->bstats.bytes += skb->len; - htb_activate (q,cl); - } - - sch->q.qlen++; - sch->bstats.packets++; sch->bstats.bytes += skb->len; - HTB_DBG(1,1,"htb_enq_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb); - return NET_XMIT_SUCCESS; -} - -/* TODO: requeuing packet charges it to policers again !! 
*/ -static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch) -{ - struct htb_sched *q = qdisc_priv(sch); - int ret = NET_XMIT_SUCCESS; - struct htb_class *cl = htb_classify(skb,sch, &ret); - struct sk_buff *tskb; - - if (cl == HTB_DIRECT || !cl) { - /* enqueue to helper queue */ - if (q->direct_queue.qlen < q->direct_qlen && cl) { - __skb_queue_head(&q->direct_queue, skb); + } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) { + sch->qstats.drops++; + cl->qstats.drops++; + } + return ret; } else { - __skb_queue_head(&q->direct_queue, skb); - tskb = __skb_dequeue_tail(&q->direct_queue); - kfree_skb (tskb); - sch->qstats.drops++; - return NET_XMIT_CN; + htb_activate(q, cl); } - } else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) { - sch->qstats.drops++; - cl->qstats.drops++; - return NET_XMIT_DROP; - } else - htb_activate (q,cl); - - sch->q.qlen++; - sch->qstats.requeues++; - HTB_DBG(1,1,"htb_req_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb); - return NET_XMIT_SUCCESS; + + sch->q.qlen++; + return NET_XMIT_SUCCESS; } -static void htb_timer(unsigned long arg) +static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff) { - struct Qdisc *sch = (struct Qdisc*)arg; - sch->flags &= ~TCQ_F_THROTTLED; - wmb(); - netif_schedule(sch->dev); + s64 toks = diff + cl->tokens; + + if (toks > cl->buffer) + toks = cl->buffer; + toks -= (s64) psched_l2t_ns(&cl->rate, bytes); + if (toks <= -cl->mbuffer) + toks = 1 - cl->mbuffer; + + cl->tokens = toks; } -#ifdef HTB_RATECM -#define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0 -static void htb_rate_timer(unsigned long arg) +static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff) { - struct Qdisc *sch = (struct Qdisc*)arg; - struct htb_sched *q = qdisc_priv(sch); - struct list_head *p; - - /* lock queue so that we can muck with it */ - HTB_QLOCK(sch); - HTB_DBG(10,1,"htb_rttmr j=%ld\n",jiffies); - - q->rttim.expires = jiffies + HZ; - add_timer(&q->rttim); - - /* scan and recompute one bucket at time */ - if (++q->recmp_bucket >= HTB_HSIZE) - q->recmp_bucket = 0; - list_for_each (p,q->hash+q->recmp_bucket) { - struct htb_class *cl = list_entry(p,struct htb_class,hlist); - HTB_DBG(10,2,"htb_rttmr_cl cl=%X sbyte=%lu spkt=%lu\n", - cl->classid,cl->sum_bytes,cl->sum_packets); - RT_GEN (cl->sum_bytes,cl->rate_bytes); - RT_GEN (cl->sum_packets,cl->rate_packets); - } - HTB_QUNLOCK(sch); + s64 toks = diff + cl->ctokens; + + if (toks > cl->cbuffer) + toks = cl->cbuffer; + toks -= (s64) psched_l2t_ns(&cl->ceil, bytes); + if (toks <= -cl->mbuffer) + toks = 1 - cl->mbuffer; + + cl->ctokens = toks; } -#endif /** * htb_charge_class - charges amount "bytes" to leaf and ancestors @@ -824,68 +641,40 @@ static void htb_rate_timer(unsigned long arg) * CAN_SEND) because we can use more precise clock that event queue here. * In such case we remove class from event queue first. 
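Editor's note: htb_accnt_tokens() and htb_accnt_ctokens() above apply the same update to the rate and ceil buckets: credit the elapsed time, cap at the bucket depth, debit the transmission time of the packet, and never let the bucket sink below 1 - mbuffer. The sketch below models psched_l2t_ns() as a plain bytes-to-nanoseconds division; the kernel uses a precomputed multiply-and-shift that approximates the same value.

#include <stdint.h>

#define NSEC_PER_SEC 1000000000LL

/* time needed to send 'bytes' at 'rate_bps' bytes per second */
static int64_t l2t_ns(uint64_t rate_bps, unsigned int bytes)
{
        return (int64_t)((uint64_t)bytes * NSEC_PER_SEC / rate_bps);
}

/* one bucket update as in htb_accnt_tokens()/htb_accnt_ctokens();
 * 'diff' is the time since the last checkpoint, already capped at mbuffer */
static int64_t account(int64_t tokens, int64_t diff, int64_t buffer,
                       int64_t mbuffer, uint64_t rate_bps, unsigned int bytes)
{
        int64_t toks = diff + tokens;

        if (toks > buffer)              /* a bucket never holds more than its depth */
                toks = buffer;
        toks -= l2t_ns(rate_bps, bytes);        /* pay for this packet */
        if (toks <= -mbuffer)           /* and never sinks below 1 - mbuffer */
                toks = 1 - mbuffer;
        return toks;
}

int main(void)
{
        /* a 1500-byte packet at 1 MB/s costs 1.5 ms worth of tokens */
        return account(0, 0, 2 * NSEC_PER_SEC, 60 * NSEC_PER_SEC,
                       1000000, 1500) != -1500000;
}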
*/ -static void htb_charge_class(struct htb_sched *q,struct htb_class *cl, - int level,int bytes) -{ - long toks,diff; +static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, + int level, struct sk_buff *skb) +{ + int bytes = qdisc_pkt_len(skb); enum htb_cmode old_mode; - HTB_DBG(5,1,"htb_chrg_cl cl=%X lev=%d len=%d\n",cl->classid,level,bytes); - -#define HTB_ACCNT(T,B,R) toks = diff + cl->T; \ - if (toks > cl->B) toks = cl->B; \ - toks -= L2T(cl, cl->R, bytes); \ - if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \ - cl->T = toks + s64 diff; while (cl) { - HTB_CHCL(cl); - diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer); -#ifdef HTB_DEBUG - if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) { - if (net_ratelimit()) - printk(KERN_ERR "HTB: bad diff in charge, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n", - cl->classid, diff, -#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY - q->now.tv_sec * 1000000ULL + q->now.tv_usec, - cl->t_c.tv_sec * 1000000ULL + cl->t_c.tv_usec, -#else - (unsigned long long) q->now, - (unsigned long long) cl->t_c, -#endif - q->jiffies); - diff = 1000; - } -#endif + diff = min_t(s64, q->now - cl->t_c, cl->mbuffer); if (cl->level >= level) { - if (cl->level == level) cl->xstats.lends++; - HTB_ACCNT (tokens,buffer,rate); + if (cl->level == level) + cl->xstats.lends++; + htb_accnt_tokens(cl, bytes, diff); } else { cl->xstats.borrows++; - cl->tokens += diff; /* we moved t_c; update tokens */ + cl->tokens += diff; /* we moved t_c; update tokens */ } - HTB_ACCNT (ctokens,cbuffer,ceil); + htb_accnt_ctokens(cl, bytes, diff); cl->t_c = q->now; - HTB_DBG(5,2,"htb_chrg_clp cl=%X diff=%ld tok=%ld ctok=%ld\n",cl->classid,diff,cl->tokens,cl->ctokens); - old_mode = cl->cmode; diff = 0; - htb_change_class_mode(q,cl,&diff); + old_mode = cl->cmode; + diff = 0; + htb_change_class_mode(q, cl, &diff); if (old_mode != cl->cmode) { if (old_mode != HTB_CAN_SEND) - htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level); + htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq); if (cl->cmode != HTB_CAN_SEND) - htb_add_to_wait_tree (q,cl,diff,1); + htb_add_to_wait_tree(q, cl, diff); } - -#ifdef HTB_RATECM - /* update rate counters */ - cl->sum_bytes += bytes; cl->sum_packets++; -#endif - /* update byte stats except for leaves which are already updated */ - if (cl->level) { - cl->bstats.bytes += bytes; - cl->bstats.packets++; - } + /* update basic stats except for leaves which are already updated */ + if (cl->level) + bstats_update(&cl->bstats, skb); + cl = cl->parent; } } @@ -893,69 +682,66 @@ static void htb_charge_class(struct htb_sched *q,struct htb_class *cl, /** * htb_do_events - make mode changes to classes at the level * - * Scans event queue for pending events and applies them. Returns jiffies to - * next pending event (0 for no event in pq). - * Note: Aplied are events whose have cl->pq_key <= jiffies. + * Scans event queue for pending events and applies them. Returns time of + * next pending event (0 for no event in pq, q->now for too many events). + * Note: Applied are events whose have cl->pq_key <= q->now. 
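Editor's note: htb_charge_class() walks from the leaf to the root. Every ancestor is charged against its ceil bucket, but only ancestors at or above the level the packet was dequeued from are charged against their rate bucket and counted as lenders; the levels below only advance their checkpoint and count a borrow. A self-contained sketch of that walk over a parent-linked chain; the two cost helpers are placeholders for psched_l2t_ns() on the class rate/ceil, and the clamping and mode handling of the real code are elided.

#include <stdint.h>

struct toy_class {
        struct toy_class *parent;
        int level;                      /* 0 = leaf, root has the highest level */
        int64_t tokens, ctokens;        /* rate and ceil buckets (ns) */
        int64_t t_c;                    /* last checkpoint (ns) */
        unsigned long lends, borrows;
};

/* placeholder costs: ~1 Gbit/s rate, ~2 Gbit/s ceil (8 resp. 4 ns per byte) */
static int64_t rate_cost(unsigned int bytes) { return (int64_t)bytes * 8; }
static int64_t ceil_cost(unsigned int bytes) { return (int64_t)bytes * 4; }

/* ancestor walk as in htb_charge_class(); 'level' is the level the packet
 * was actually dequeued from, i.e. the lender's level */
static void charge(struct toy_class *cl, int level, unsigned int bytes,
                   int64_t now)
{
        for (; cl; cl = cl->parent) {
                int64_t diff = now - cl->t_c;   /* the kernel caps this at mbuffer */

                if (cl->level >= level) {
                        if (cl->level == level)
                                cl->lends++;    /* this class supplied the bandwidth */
                        cl->tokens += diff - rate_cost(bytes);
                } else {
                        cl->borrows++;          /* ran on borrowed bandwidth */
                        cl->tokens += diff;     /* just move the checkpoint */
                }
                cl->ctokens += diff - ceil_cost(bytes); /* ceil is always charged */
                cl->t_c = now;
        }
}

int main(void)
{
        struct toy_class root = { .level = 1 };
        struct toy_class leaf = { .parent = &root, .level = 0 };

        /* the packet went out at the root's level: the leaf borrows, the root lends */
        charge(&leaf, 1, 1500, 1000);
        return !(leaf.borrows == 1 && root.lends == 1);
}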
*/ -static long htb_do_events(struct htb_sched *q,int level) +static s64 htb_do_events(struct htb_sched *q, const int level, + unsigned long start) { - int i; - HTB_DBG(8,1,"htb_do_events l=%d root=%p rmask=%X\n", - level,q->wait_pq[level].rb_node,q->row_mask[level]); - for (i = 0; i < 500; i++) { + /* don't run for longer than 2 jiffies; 2 is used instead of + * 1 to simplify things when jiffy is going to be incremented + * too soon + */ + unsigned long stop_at = start + 2; + struct rb_root *wait_pq = &q->hlevel[level].wait_pq; + + while (time_before(jiffies, stop_at)) { struct htb_class *cl; - long diff; - struct rb_node *p = q->wait_pq[level].rb_node; - if (!p) return 0; - while (p->rb_left) p = p->rb_left; + s64 diff; + struct rb_node *p = rb_first(wait_pq); + + if (!p) + return 0; cl = rb_entry(p, struct htb_class, pq_node); - if (time_after(cl->pq_key, q->jiffies)) { - HTB_DBG(8,3,"htb_do_ev_ret delay=%ld\n",cl->pq_key - q->jiffies); - return cl->pq_key - q->jiffies; - } - htb_safe_rb_erase(p,q->wait_pq+level); - diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer); -#ifdef HTB_DEBUG - if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) { - if (net_ratelimit()) - printk(KERN_ERR "HTB: bad diff in events, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n", - cl->classid, diff, -#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY - q->now.tv_sec * 1000000ULL + q->now.tv_usec, - cl->t_c.tv_sec * 1000000ULL + cl->t_c.tv_usec, -#else - (unsigned long long) q->now, - (unsigned long long) cl->t_c, -#endif - q->jiffies); - diff = 1000; - } -#endif - htb_change_class_mode(q,cl,&diff); + if (cl->pq_key > q->now) + return cl->pq_key; + + htb_safe_rb_erase(p, wait_pq); + diff = min_t(s64, q->now - cl->t_c, cl->mbuffer); + htb_change_class_mode(q, cl, &diff); if (cl->cmode != HTB_CAN_SEND) - htb_add_to_wait_tree (q,cl,diff,2); + htb_add_to_wait_tree(q, cl, diff); } - if (net_ratelimit()) - printk(KERN_WARNING "htb: too many events !\n"); - return HZ/10; + + /* too much load - let's continue after a break for scheduling */ + if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) { + pr_warn("htb: too many events!\n"); + q->warned |= HTB_WARN_TOOMANYEVENTS; + } + + return q->now; } /* Returns class->node+prio from id-tree where classe's id is >= id. NULL - is no such one exists. */ -static struct rb_node * -htb_id_find_next_upper(int prio,struct rb_node *n,u32 id) + * is no such one exists. + */ +static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n, + u32 id) { struct rb_node *r = NULL; while (n) { - struct htb_class *cl = rb_entry(n,struct htb_class,node[prio]); - if (id == cl->classid) return n; - - if (id > cl->classid) { + struct htb_class *cl = + rb_entry(n, struct htb_class, node[prio]); + + if (id > cl->common.classid) { n = n->rb_right; - } else { + } else if (id < cl->common.classid) { r = n; n = n->rb_left; + } else { + return n; } } return r; @@ -966,226 +752,212 @@ htb_id_find_next_upper(int prio,struct rb_node *n,u32 id) * * Find leaf where current feed pointers points to. 
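Editor's note: the reworked htb_do_events() above processes at most about two jiffies worth of pending mode changes: events with pq_key <= q->now are applied, the time of the first future event is returned (0 if the queue is empty), and q->now is returned when the budget runs out so the caller reschedules immediately. A toy model with the wait_pq rbtree replaced by a sorted array, and jiffies held constant for brevity (the kernel re-reads it every pass):

#include <stdint.h>
#include <stdio.h>

#define BUDGET 2        /* jiffies: never spend more than ~2 ticks here */

struct toy_event { int64_t key; int classid; };  /* key mirrors cl->pq_key */

static int64_t do_events(struct toy_event *events, int *nr, int64_t now,
                         unsigned long jiffies_now, unsigned long start)
{
        while (jiffies_now - start < BUDGET) {
                if (*nr == 0)
                        return 0;               /* nothing pending at this level */
                if (events[0].key > now)
                        return events[0].key;   /* nearest event is in the future */

                printf("class %x changes mode\n", events[0].classid);
                /* pop the head; the real code recomputes the class mode and
                 * may re-insert it with a new key */
                for (int i = 1; i < *nr; i++)
                        events[i - 1] = events[i];
                (*nr)--;
        }
        return now;     /* budget exhausted: caller reschedules right away */
}

int main(void)
{
        struct toy_event ev[] = { { 100, 0x10001 }, { 250, 0x10002 } };
        int nr = 2;

        return do_events(ev, &nr, 200, 0, 0) != 250;
}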
*/ -static struct htb_class * -htb_lookup_leaf(HTB_ARGQ struct rb_root *tree,int prio,struct rb_node **pptr,u32 *pid) +static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio) { int i; struct { struct rb_node *root; struct rb_node **pptr; u32 *pid; - } stk[TC_HTB_MAXDEPTH],*sp = stk; - - BUG_TRAP(tree->rb_node); - sp->root = tree->rb_node; - sp->pptr = pptr; - sp->pid = pid; + } stk[TC_HTB_MAXDEPTH], *sp = stk; + + BUG_ON(!hprio->row.rb_node); + sp->root = hprio->row.rb_node; + sp->pptr = &hprio->ptr; + sp->pid = &hprio->last_ptr_id; for (i = 0; i < 65535; i++) { - HTB_DBG(4,2,"htb_lleaf ptr=%p pid=%X\n",*sp->pptr,*sp->pid); - - if (!*sp->pptr && *sp->pid) { - /* ptr was invalidated but id is valid - try to recover - the original or next ptr */ - *sp->pptr = htb_id_find_next_upper(prio,sp->root,*sp->pid); + if (!*sp->pptr && *sp->pid) { + /* ptr was invalidated but id is valid - try to recover + * the original or next ptr + */ + *sp->pptr = + htb_id_find_next_upper(prio, sp->root, *sp->pid); } - *sp->pid = 0; /* ptr is valid now so that remove this hint as it - can become out of date quickly */ - if (!*sp->pptr) { /* we are at right end; rewind & go up */ + *sp->pid = 0; /* ptr is valid now so that remove this hint as it + * can become out of date quickly + */ + if (!*sp->pptr) { /* we are at right end; rewind & go up */ *sp->pptr = sp->root; - while ((*sp->pptr)->rb_left) + while ((*sp->pptr)->rb_left) *sp->pptr = (*sp->pptr)->rb_left; if (sp > stk) { sp--; - BUG_TRAP(*sp->pptr); if(!*sp->pptr) return NULL; - htb_next_rb_node (sp->pptr); + if (!*sp->pptr) { + WARN_ON(1); + return NULL; + } + htb_next_rb_node(sp->pptr); } } else { struct htb_class *cl; - cl = rb_entry(*sp->pptr,struct htb_class,node[prio]); - HTB_CHCL(cl); - if (!cl->level) + struct htb_prio *clp; + + cl = rb_entry(*sp->pptr, struct htb_class, node[prio]); + if (!cl->level) return cl; - (++sp)->root = cl->un.inner.feed[prio].rb_node; - sp->pptr = cl->un.inner.ptr+prio; - sp->pid = cl->un.inner.last_ptr_id+prio; + clp = &cl->un.inner.clprio[prio]; + (++sp)->root = clp->feed.rb_node; + sp->pptr = &clp->ptr; + sp->pid = &clp->last_ptr_id; } } - BUG_TRAP(0); + WARN_ON(1); return NULL; } /* dequeues packet at given priority and level; call only if - you are sure that there is active class at prio/level */ -static struct sk_buff * -htb_dequeue_tree(struct htb_sched *q,int prio,int level) + * you are sure that there is active class at prio/level + */ +static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio, + const int level) { struct sk_buff *skb = NULL; - struct htb_class *cl,*start; + struct htb_class *cl, *start; + struct htb_level *hlevel = &q->hlevel[level]; + struct htb_prio *hprio = &hlevel->hprio[prio]; + /* look initial class up in the row */ - start = cl = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,prio, - q->ptr[level]+prio,q->last_ptr_id[level]+prio); - + start = cl = htb_lookup_leaf(hprio, prio); + do { next: - BUG_TRAP(cl); - if (!cl) return NULL; - HTB_DBG(4,1,"htb_deq_tr prio=%d lev=%d cl=%X defic=%d\n", - prio,level,cl->classid,cl->un.leaf.deficit[level]); + if (unlikely(!cl)) + return NULL; /* class can be empty - it is unlikely but can be true if leaf - qdisc drops packets in enqueue routine or if someone used - graft operation on the leaf since last dequeue; - simply deactivate and skip such class */ + * qdisc drops packets in enqueue routine or if someone used + * graft operation on the leaf since last dequeue; + * simply deactivate and skip such class + */ if 
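Editor's note: htb_id_find_next_upper() is the recovery path for the last_ptr_id hint. When the round-robin pointer was invalidated because the class it pointed at left the feed, the walk resumes at the smallest classid >= the remembered id instead of restarting from the first child. The same lower-bound search over a sorted array, as a userspace sketch:

#include <stdint.h>

/* classids[] is sorted ascending, like the per-prio rbtree keyed on classid;
 * return the index of the smallest classid >= id, or -1 if there is none */
static int find_next_upper(const uint32_t *classids, int n, uint32_t id)
{
        int lo = 0, hi = n, best = -1;

        while (lo < hi) {
                int mid = lo + (hi - lo) / 2;

                if (classids[mid] >= id) {
                        best = mid;
                        hi = mid;
                } else {
                        lo = mid + 1;
                }
        }
        return best;
}

int main(void)
{
        const uint32_t ids[] = { 0x10001, 0x10007 };    /* 1:1 and 1:7 remain */

        /* 1:3 was the current pointer and has been removed from the feed;
         * resuming from its remembered classid lands on 1:7 */
        return find_next_upper(ids, 2, 0x10003) != 1;
}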
(unlikely(cl->un.leaf.q->q.qlen == 0)) { struct htb_class *next; - htb_deactivate(q,cl); + htb_deactivate(q, cl); /* row/level might become empty */ if ((q->row_mask[level] & (1 << prio)) == 0) - return NULL; - - next = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio, - prio,q->ptr[level]+prio,q->last_ptr_id[level]+prio); + return NULL; + + next = htb_lookup_leaf(hprio, prio); - if (cl == start) /* fix start if we just deleted it */ + if (cl == start) /* fix start if we just deleted it */ start = next; cl = next; goto next; } - - if (likely((skb = cl->un.leaf.q->dequeue(cl->un.leaf.q)) != NULL)) + + skb = cl->un.leaf.q->dequeue(cl->un.leaf.q); + if (likely(skb != NULL)) break; - if (!cl->warned) { - printk(KERN_WARNING "htb: class %X isn't work conserving ?!\n",cl->classid); - cl->warned = 1; - } - q->nwc_hit++; - htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio); - cl = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,prio,q->ptr[level]+prio, - q->last_ptr_id[level]+prio); + + qdisc_warn_nonwc("htb", cl->un.leaf.q); + htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr: + &q->hlevel[0].hprio[prio].ptr); + cl = htb_lookup_leaf(hprio, prio); } while (cl != start); if (likely(skb != NULL)) { - if ((cl->un.leaf.deficit[level] -= skb->len) < 0) { - HTB_DBG(4,2,"htb_next_cl oldptr=%p quant_add=%d\n", - level?cl->parent->un.inner.ptr[prio]:q->ptr[0][prio],cl->un.leaf.quantum); - cl->un.leaf.deficit[level] += cl->un.leaf.quantum; - htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio); + bstats_update(&cl->bstats, skb); + cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb); + if (cl->un.leaf.deficit[level] < 0) { + cl->un.leaf.deficit[level] += cl->quantum; + htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr : + &q->hlevel[0].hprio[prio].ptr); } /* this used to be after charge_class but this constelation - gives us slightly better performance */ + * gives us slightly better performance + */ if (!cl->un.leaf.q->q.qlen) - htb_deactivate (q,cl); - htb_charge_class (q,cl,level,skb->len); + htb_deactivate(q, cl); + htb_charge_class(q, cl, level, skb); } return skb; } -static void htb_delay_by(struct Qdisc *sch,long delay) -{ - struct htb_sched *q = qdisc_priv(sch); - if (delay <= 0) delay = 1; - if (unlikely(delay > 5*HZ)) { - if (net_ratelimit()) - printk(KERN_INFO "HTB delay %ld > 5sec\n", delay); - delay = 5*HZ; - } - /* why don't use jiffies here ? because expires can be in past */ - mod_timer(&q->timer, q->jiffies + delay); - sch->flags |= TCQ_F_THROTTLED; - sch->qstats.overlimits++; - HTB_DBG(3,1,"htb_deq t_delay=%ld\n",delay); -} - static struct sk_buff *htb_dequeue(struct Qdisc *sch) { - struct sk_buff *skb = NULL; + struct sk_buff *skb; struct htb_sched *q = qdisc_priv(sch); int level; - long min_delay; -#ifdef HTB_DEBUG - int evs_used = 0; -#endif - - q->jiffies = jiffies; - HTB_DBG(3,1,"htb_deq dircnt=%d qlen=%d\n",skb_queue_len(&q->direct_queue), - sch->q.qlen); + s64 next_event; + unsigned long start_at; /* try to dequeue direct packets as high prio (!) 
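Editor's note: the dequeue path above applies deficit-round-robin accounting per level. Each dequeued packet is subtracted from the leaf's deficit, and once the deficit goes negative the leaf is granted another quantum and the round-robin pointer advances. A minimal sketch of that accounting (the quantum value is arbitrary, roughly one Ethernet frame):

#include <stdio.h>

struct toy_leaf { int deficit; int quantum; };

/* deficit accounting as in htb_dequeue_tree(): a leaf keeps sending until it
 * has used up its quantum at this level, then the RR pointer moves on */
static int send_one(struct toy_leaf *leaf, int pkt_len)
{
        leaf->deficit -= pkt_len;
        if (leaf->deficit < 0) {
                leaf->deficit += leaf->quantum;
                return 1;               /* caller advances to the next leaf */
        }
        return 0;                       /* same leaf keeps the turn */
}

int main(void)
{
        struct toy_leaf leaf = { .deficit = 0, .quantum = 1514 };
        int lens[] = { 1000, 1000, 200 };

        for (int i = 0; i < 3; i++)
                printf("sent %4d bytes, move on: %d\n",
                       lens[i], send_one(&leaf, lens[i]));
        return 0;
}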
to minimize cpu work */ - if ((skb = __skb_dequeue(&q->direct_queue)) != NULL) { - sch->flags &= ~TCQ_F_THROTTLED; + skb = __skb_dequeue(&q->direct_queue); + if (skb != NULL) { +ok: + qdisc_bstats_update(sch, skb); + qdisc_unthrottled(sch); sch->q.qlen--; return skb; } - if (!sch->q.qlen) goto fin; - PSCHED_GET_TIME(q->now); + if (!sch->q.qlen) + goto fin; + q->now = ktime_to_ns(ktime_get()); + start_at = jiffies; + + next_event = q->now + 5LLU * NSEC_PER_SEC; - min_delay = LONG_MAX; - q->nwc_hit = 0; for (level = 0; level < TC_HTB_MAXDEPTH; level++) { /* common case optimization - skip event handler quickly */ int m; - long delay; - if (time_after_eq(q->jiffies, q->near_ev_cache[level])) { - delay = htb_do_events(q,level); - q->near_ev_cache[level] = q->jiffies + (delay ? delay : HZ); -#ifdef HTB_DEBUG - evs_used++; -#endif - } else - delay = q->near_ev_cache[level] - q->jiffies; - - if (delay && min_delay > delay) - min_delay = delay; + s64 event = q->near_ev_cache[level]; + + if (q->now >= event) { + event = htb_do_events(q, level, start_at); + if (!event) + event = q->now + NSEC_PER_SEC; + q->near_ev_cache[level] = event; + } + + if (next_event > event) + next_event = event; + m = ~q->row_mask[level]; while (m != (int)(-1)) { - int prio = ffz (m); + int prio = ffz(m); + m |= 1 << prio; - skb = htb_dequeue_tree(q,prio,level); - if (likely(skb != NULL)) { - sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; - goto fin; - } + skb = htb_dequeue_tree(q, prio, level); + if (likely(skb != NULL)) + goto ok; } } -#ifdef HTB_DEBUG - if (!q->nwc_hit && min_delay >= 10*HZ && net_ratelimit()) { - if (min_delay == LONG_MAX) { - printk(KERN_ERR "HTB: dequeue bug (%d,%lu,%lu), report it please !\n", - evs_used,q->jiffies,jiffies); - htb_debug_dump(q); - } else - printk(KERN_WARNING "HTB: mindelay=%ld, some class has " - "too small rate\n",min_delay); + sch->qstats.overlimits++; + if (likely(next_event > q->now)) { + if (!test_bit(__QDISC_STATE_DEACTIVATED, + &qdisc_root_sleeping(q->watchdog.qdisc)->state)) { + ktime_t time = ns_to_ktime(next_event); + qdisc_throttled(q->watchdog.qdisc); + hrtimer_start(&q->watchdog.timer, time, + HRTIMER_MODE_ABS); + } + } else { + schedule_work(&q->work); } -#endif - htb_delay_by (sch,min_delay > 5*HZ ? 
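Editor's note: with the old jiffies-based htb_delay_by() gone, htb_dequeue() now computes the earliest pending event across all levels (bounded by 5 seconds, refreshed per level from htb_do_events(), defaulting to now + 1 s when a level has nothing queued) and either arms the qdisc watchdog hrtimer for that instant or, when event processing was cut short, schedules the work item to resume at once. The sketch below models only the "pick the earliest event and decide timer vs. work" step; the cache refresh and the hrtimer/workqueue calls are elided.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000LL
#define MAXDEPTH 8

enum action { ARM_TIMER, RUN_WORK };

/* near_ev[] holds the per-level nearest-event times (0 = level unused),
 * standing in for q->near_ev_cache[]; 'now' and the events are in ns */
static enum action pick_action(int64_t now, const int64_t near_ev[MAXDEPTH],
                               int64_t *next_event)
{
        *next_event = now + 5 * NSEC_PER_SEC;   /* never sleep longer than 5 s */

        for (int level = 0; level < MAXDEPTH; level++)
                if (near_ev[level] && near_ev[level] < *next_event)
                        *next_event = near_ev[level];

        /* a pending event equal to 'now' means do_events() ran out of budget */
        return *next_event > now ? ARM_TIMER : RUN_WORK;
}

int main(void)
{
        int64_t now = NSEC_PER_SEC, next;
        int64_t near_ev[MAXDEPTH] = { [0] = 3 * NSEC_PER_SEC,
                                      [1] = 2 * NSEC_PER_SEC };

        if (pick_action(now, near_ev, &next) == ARM_TIMER)
                printf("arm one-shot timer for t=%lld ns\n", (long long)next);
        return 0;
}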
5*HZ : min_delay); fin: - HTB_DBG(3,1,"htb_deq_end %s j=%lu skb=%p\n",sch->dev->name,q->jiffies,skb); return skb; } /* try to drop from each class (by prio) until one succeed */ -static unsigned int htb_drop(struct Qdisc* sch) +static unsigned int htb_drop(struct Qdisc *sch) { struct htb_sched *q = qdisc_priv(sch); int prio; for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) { struct list_head *p; - list_for_each (p,q->drops+prio) { + list_for_each(p, q->drops + prio) { struct htb_class *cl = list_entry(p, struct htb_class, un.leaf.drop_list); unsigned int len; - if (cl->un.leaf.q->ops->drop && - (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) { + if (cl->un.leaf.q->ops->drop && + (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) { sch->q.qlen--; if (!cl->un.leaf.q->q.qlen) - htb_deactivate (q,cl); + htb_deactivate(q, cl); return len; } } @@ -1195,91 +967,92 @@ static unsigned int htb_drop(struct Qdisc* sch) /* reset all classes */ /* always caled under BH & queue lock */ -static void htb_reset(struct Qdisc* sch) +static void htb_reset(struct Qdisc *sch) { struct htb_sched *q = qdisc_priv(sch); - int i; - HTB_DBG(0,1,"htb_reset sch=%p, handle=%X\n",sch,sch->handle); + struct htb_class *cl; + unsigned int i; - for (i = 0; i < HTB_HSIZE; i++) { - struct list_head *p; - list_for_each (p,q->hash+i) { - struct htb_class *cl = list_entry(p,struct htb_class,hlist); + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->level) - memset(&cl->un.inner,0,sizeof(cl->un.inner)); + memset(&cl->un.inner, 0, sizeof(cl->un.inner)); else { - if (cl->un.leaf.q) + if (cl->un.leaf.q) qdisc_reset(cl->un.leaf.q); INIT_LIST_HEAD(&cl->un.leaf.drop_list); } cl->prio_activity = 0; cl->cmode = HTB_CAN_SEND; -#ifdef HTB_DEBUG - cl->pq_node.rb_color = -1; - memset(cl->node,255,sizeof(cl->node)); -#endif } } - sch->flags &= ~TCQ_F_THROTTLED; - del_timer(&q->timer); + qdisc_watchdog_cancel(&q->watchdog); __skb_queue_purge(&q->direct_queue); sch->q.qlen = 0; - memset(q->row,0,sizeof(q->row)); - memset(q->row_mask,0,sizeof(q->row_mask)); - memset(q->wait_pq,0,sizeof(q->wait_pq)); - memset(q->ptr,0,sizeof(q->ptr)); + memset(q->hlevel, 0, sizeof(q->hlevel)); + memset(q->row_mask, 0, sizeof(q->row_mask)); for (i = 0; i < TC_HTB_NUMPRIO; i++) - INIT_LIST_HEAD(q->drops+i); + INIT_LIST_HEAD(q->drops + i); +} + +static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = { + [TCA_HTB_PARMS] = { .len = sizeof(struct tc_htb_opt) }, + [TCA_HTB_INIT] = { .len = sizeof(struct tc_htb_glob) }, + [TCA_HTB_CTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_HTB_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 }, + [TCA_HTB_RATE64] = { .type = NLA_U64 }, + [TCA_HTB_CEIL64] = { .type = NLA_U64 }, +}; + +static void htb_work_func(struct work_struct *work) +{ + struct htb_sched *q = container_of(work, struct htb_sched, work); + struct Qdisc *sch = q->watchdog.qdisc; + + __netif_schedule(qdisc_root(sch)); } -static int htb_init(struct Qdisc *sch, struct rtattr *opt) +static int htb_init(struct Qdisc *sch, struct nlattr *opt) { struct htb_sched *q = qdisc_priv(sch); - struct rtattr *tb[TCA_HTB_INIT]; + struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_glob *gopt; + int err; int i; -#ifdef HTB_DEBUG - printk(KERN_INFO "HTB init, kernel part version %d.%d\n", - HTB_VER >> 16,HTB_VER & 0xffff); -#endif - if (!opt || rtattr_parse_nested(tb, TCA_HTB_INIT, opt) || - tb[TCA_HTB_INIT-1] == NULL || - RTA_PAYLOAD(tb[TCA_HTB_INIT-1]) < 
sizeof(*gopt)) { - printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n"); + + if (!opt) return -EINVAL; - } - gopt = RTA_DATA(tb[TCA_HTB_INIT-1]); - if (gopt->version != HTB_VER >> 16) { - printk(KERN_ERR "HTB: need tc/htb version %d (minor is %d), you have %d\n", - HTB_VER >> 16,HTB_VER & 0xffff,gopt->version); + + err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy); + if (err < 0) + return err; + + if (!tb[TCA_HTB_INIT]) + return -EINVAL; + + gopt = nla_data(tb[TCA_HTB_INIT]); + if (gopt->version != HTB_VER >> 16) return -EINVAL; - } - q->debug = gopt->debug; - HTB_DBG(0,1,"htb_init sch=%p handle=%X r2q=%d\n",sch,sch->handle,gopt->rate2quantum); - INIT_LIST_HEAD(&q->root); - for (i = 0; i < HTB_HSIZE; i++) - INIT_LIST_HEAD(q->hash+i); + err = qdisc_class_hash_init(&q->clhash); + if (err < 0) + return err; for (i = 0; i < TC_HTB_NUMPRIO; i++) - INIT_LIST_HEAD(q->drops+i); + INIT_LIST_HEAD(q->drops + i); - init_timer(&q->timer); + qdisc_watchdog_init(&q->watchdog, sch); + INIT_WORK(&q->work, htb_work_func); skb_queue_head_init(&q->direct_queue); - q->direct_qlen = sch->dev->tx_queue_len; - if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ - q->direct_qlen = 2; - q->timer.function = htb_timer; - q->timer.data = (unsigned long)sch; - -#ifdef HTB_RATECM - init_timer(&q->rttim); - q->rttim.function = htb_rate_timer; - q->rttim.data = (unsigned long)sch; - q->rttim.expires = jiffies + HZ; - add_timer(&q->rttim); -#endif + if (tb[TCA_HTB_DIRECT_QLEN]) + q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]); + else { + q->direct_qlen = qdisc_dev(sch)->tx_queue_len; + if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ + q->direct_qlen = 2; + } if ((q->rate2quantum = gopt->rate2quantum) < 1) q->rate2quantum = 1; q->defcls = gopt->defcls; @@ -1290,89 +1063,89 @@ static int htb_init(struct Qdisc *sch, struct rtattr *opt) static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) { struct htb_sched *q = qdisc_priv(sch); - unsigned char *b = skb->tail; - struct rtattr *rta; + struct nlattr *nest; struct tc_htb_glob gopt; - HTB_DBG(0,1,"htb_dump sch=%p, handle=%X\n",sch,sch->handle); - HTB_QLOCK(sch); - gopt.direct_pkts = q->direct_pkts; -#ifdef HTB_DEBUG - if (HTB_DBG_COND(0,2)) - htb_debug_dump(q); -#endif + /* Its safe to not acquire qdisc lock. As we hold RTNL, + * no change can happen on the qdisc parameters. 
+ */ + + gopt.direct_pkts = q->direct_pkts; gopt.version = HTB_VER; gopt.rate2quantum = q->rate2quantum; gopt.defcls = q->defcls; - gopt.debug = q->debug; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); - RTA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt); - rta->rta_len = skb->tail - b; - HTB_QUNLOCK(sch); - return skb->len; -rtattr_failure: - HTB_QUNLOCK(sch); - skb_trim(skb, skb->tail - skb->data); + gopt.debug = 0; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) || + nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen)) + goto nla_put_failure; + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } static int htb_dump_class(struct Qdisc *sch, unsigned long arg, - struct sk_buff *skb, struct tcmsg *tcm) + struct sk_buff *skb, struct tcmsg *tcm) { -#ifdef HTB_DEBUG - struct htb_sched *q = qdisc_priv(sch); -#endif - struct htb_class *cl = (struct htb_class*)arg; - unsigned char *b = skb->tail; - struct rtattr *rta; + struct htb_class *cl = (struct htb_class *)arg; + struct nlattr *nest; struct tc_htb_opt opt; - HTB_DBG(0,1,"htb_dump_class handle=%X clid=%X\n",sch->handle,cl->classid); - - HTB_QLOCK(sch); - tcm->tcm_parent = cl->parent ? cl->parent->classid : TC_H_ROOT; - tcm->tcm_handle = cl->classid; + /* Its safe to not acquire qdisc lock. As we hold RTNL, + * no change can happen on the class parameters. + */ + tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT; + tcm->tcm_handle = cl->common.classid; if (!cl->level && cl->un.leaf.q) tcm->tcm_info = cl->un.leaf.q->handle; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); - - memset (&opt,0,sizeof(opt)); - - opt.rate = cl->rate->rate; opt.buffer = cl->buffer; - opt.ceil = cl->ceil->rate; opt.cbuffer = cl->cbuffer; - opt.quantum = cl->un.leaf.quantum; opt.prio = cl->un.leaf.prio; - opt.level = cl->level; - RTA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt); - rta->rta_len = skb->tail - b; - HTB_QUNLOCK(sch); - return skb->len; -rtattr_failure: - HTB_QUNLOCK(sch); - skb_trim(skb, b - skb->data); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + memset(&opt, 0, sizeof(opt)); + + psched_ratecfg_getrate(&opt.rate, &cl->rate); + opt.buffer = PSCHED_NS2TICKS(cl->buffer); + psched_ratecfg_getrate(&opt.ceil, &cl->ceil); + opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer); + opt.quantum = cl->quantum; + opt.prio = cl->prio; + opt.level = cl->level; + if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) && + nla_put_u64(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps)) + goto nla_put_failure; + if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) && + nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps)) + goto nla_put_failure; + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } static int -htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, - struct gnet_dump *d) +htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { - struct htb_class *cl = (struct htb_class*)arg; - -#ifdef HTB_RATECM - cl->rate_est.bps = cl->rate_bytes/(HTB_EWMAC*HTB_HSIZE); - cl->rate_est.pps = cl->rate_packets/(HTB_EWMAC*HTB_HSIZE); -#endif + struct htb_class *cl = (struct htb_class *)arg; if (!cl->level && cl->un.leaf.q) cl->qstats.qlen = cl->un.leaf.q->q.qlen; - cl->xstats.tokens = cl->tokens; - cl->xstats.ctokens = 
cl->ctokens; + cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens); + cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens); if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || - gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || + gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, &cl->qstats) < 0) return -1; @@ -1380,132 +1153,171 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, } static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, - struct Qdisc **old) + struct Qdisc **old) { - struct htb_class *cl = (struct htb_class*)arg; + struct htb_class *cl = (struct htb_class *)arg; - if (cl && !cl->level) { - if (new == NULL && (new = qdisc_create_dflt(sch->dev, - &pfifo_qdisc_ops)) == NULL) - return -ENOBUFS; - sch_tree_lock(sch); - if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) { - if (cl->prio_activity) - htb_deactivate (qdisc_priv(sch),cl); + if (cl->level) + return -EINVAL; + if (new == NULL && + (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + cl->common.classid)) == NULL) + return -ENOBUFS; - /* TODO: is it correct ? Why CBQ doesn't do it ? */ - sch->q.qlen -= (*old)->q.qlen; - qdisc_reset(*old); - } - sch_tree_unlock(sch); - return 0; + sch_tree_lock(sch); + *old = cl->un.leaf.q; + cl->un.leaf.q = new; + if (*old != NULL) { + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); } - return -ENOENT; + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct htb_class *cl = (struct htb_class *)arg; + return !cl->level ? cl->un.leaf.q : NULL; } -static struct Qdisc * htb_leaf(struct Qdisc *sch, unsigned long arg) +static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg) { - struct htb_class *cl = (struct htb_class*)arg; - return (cl && !cl->level) ? cl->un.leaf.q : NULL; + struct htb_class *cl = (struct htb_class *)arg; + + if (cl->un.leaf.q->q.qlen == 0) + htb_deactivate(qdisc_priv(sch), cl); } static unsigned long htb_get(struct Qdisc *sch, u32 classid) { -#ifdef HTB_DEBUG - struct htb_sched *q = qdisc_priv(sch); -#endif - struct htb_class *cl = htb_find(classid,sch); - HTB_DBG(0,1,"htb_get clid=%X q=%p cl=%p ref=%d\n",classid,q,cl,cl?cl->refcnt:0); - if (cl) + struct htb_class *cl = htb_find(classid, sch); + if (cl) cl->refcnt++; return (unsigned long)cl; } -static void htb_destroy_filters(struct tcf_proto **fl) +static inline int htb_parent_last_child(struct htb_class *cl) { - struct tcf_proto *tp; + if (!cl->parent) + /* the root class */ + return 0; + if (cl->parent->children > 1) + /* not the last child */ + return 0; + return 1; +} - while ((tp = *fl) != NULL) { - *fl = tp->next; - tcf_destroy(tp); - } +static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, + struct Qdisc *new_q) +{ + struct htb_class *parent = cl->parent; + + WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity); + + if (parent->cmode != HTB_CAN_SEND) + htb_safe_rb_erase(&parent->pq_node, + &q->hlevel[parent->level].wait_pq); + + parent->level = 0; + memset(&parent->un.inner, 0, sizeof(parent->un.inner)); + INIT_LIST_HEAD(&parent->un.leaf.drop_list); + parent->un.leaf.q = new_q ? 
new_q : &noop_qdisc; + parent->tokens = parent->buffer; + parent->ctokens = parent->cbuffer; + parent->t_c = ktime_to_ns(ktime_get()); + parent->cmode = HTB_CAN_SEND; } -static void htb_destroy_class(struct Qdisc* sch,struct htb_class *cl) +static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) { - struct htb_sched *q = qdisc_priv(sch); - HTB_DBG(0,1,"htb_destrycls clid=%X ref=%d\n", cl?cl->classid:0,cl?cl->refcnt:0); if (!cl->level) { - BUG_TRAP(cl->un.leaf.q); - sch->q.qlen -= cl->un.leaf.q->q.qlen; + WARN_ON(!cl->un.leaf.q); qdisc_destroy(cl->un.leaf.q); } - qdisc_put_rtab(cl->rate); - qdisc_put_rtab(cl->ceil); - - htb_destroy_filters (&cl->filter_list); - - while (!list_empty(&cl->children)) - htb_destroy_class (sch,list_entry(cl->children.next, - struct htb_class,sibling)); - - /* note: this delete may happen twice (see htb_delete) */ - list_del(&cl->hlist); - list_del(&cl->sibling); - - if (cl->prio_activity) - htb_deactivate (q,cl); - - if (cl->cmode != HTB_CAN_SEND) - htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level); - + gen_kill_estimator(&cl->bstats, &cl->rate_est); + tcf_destroy_chain(&cl->filter_list); kfree(cl); } -/* always caled under BH & queue lock */ -static void htb_destroy(struct Qdisc* sch) +static void htb_destroy(struct Qdisc *sch) { struct htb_sched *q = qdisc_priv(sch); - HTB_DBG(0,1,"htb_destroy q=%p\n",q); + struct hlist_node *next; + struct htb_class *cl; + unsigned int i; - del_timer_sync (&q->timer); -#ifdef HTB_RATECM - del_timer_sync (&q->rttim); -#endif + cancel_work_sync(&q->work); + qdisc_watchdog_cancel(&q->watchdog); /* This line used to be after htb_destroy_class call below - and surprisingly it worked in 2.4. But it must precede it - because filter need its target class alive to be able to call - unbind_filter on it (without Oops). */ - htb_destroy_filters(&q->filter_list); - - while (!list_empty(&q->root)) - htb_destroy_class (sch,list_entry(q->root.next, - struct htb_class,sibling)); + * and surprisingly it worked in 2.4. But it must precede it + * because filter need its target class alive to be able to call + * unbind_filter on it (without Oops). + */ + tcf_destroy_chain(&q->filter_list); + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) + tcf_destroy_chain(&cl->filter_list); + } + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], + common.hnode) + htb_destroy_class(sch, cl); + } + qdisc_class_hash_destroy(&q->clhash); __skb_queue_purge(&q->direct_queue); } static int htb_delete(struct Qdisc *sch, unsigned long arg) { struct htb_sched *q = qdisc_priv(sch); - struct htb_class *cl = (struct htb_class*)arg; - HTB_DBG(0,1,"htb_delete q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0); + struct htb_class *cl = (struct htb_class *)arg; + unsigned int qlen; + struct Qdisc *new_q = NULL; + int last_child = 0; - // TODO: why don't allow to delete subtree ? references ? does - // tc subsys quarantee us that in htb_destroy it holds no class - // refs so that we can remove children safely there ? - if (!list_empty(&cl->children) || cl->filter_cnt) + /* TODO: why don't allow to delete subtree ? references ? does + * tc subsys guarantee us that in htb_destroy it holds no class + * refs so that we can remove children safely there ? 
+ */ + if (cl->children || cl->filter_cnt) return -EBUSY; - + + if (!cl->level && htb_parent_last_child(cl)) { + new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + cl->parent->common.classid); + last_child = 1; + } + sch_tree_lock(sch); - + + if (!cl->level) { + qlen = cl->un.leaf.q->q.qlen; + qdisc_reset(cl->un.leaf.q); + qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen); + } + /* delete from hash and active; remainder in destroy_class */ - list_del_init(&cl->hlist); + qdisc_class_hash_remove(&q->clhash, &cl->common); + if (cl->parent) + cl->parent->children--; + if (cl->prio_activity) - htb_deactivate (q,cl); + htb_deactivate(q, cl); - if (--cl->refcnt == 0) - htb_destroy_class(sch,cl); + if (cl->cmode != HTB_CAN_SEND) + htb_safe_rb_erase(&cl->pq_node, + &q->hlevel[cl->level].wait_pq); + + if (last_child) + htb_parent_to_leaf(q, cl, new_q); + + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). + */ sch_tree_unlock(sch); return 0; @@ -1513,141 +1325,197 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) static void htb_put(struct Qdisc *sch, unsigned long arg) { -#ifdef HTB_DEBUG - struct htb_sched *q = qdisc_priv(sch); -#endif - struct htb_class *cl = (struct htb_class*)arg; - HTB_DBG(0,1,"htb_put q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0); + struct htb_class *cl = (struct htb_class *)arg; if (--cl->refcnt == 0) - htb_destroy_class(sch,cl); + htb_destroy_class(sch, cl); } -static int htb_change_class(struct Qdisc *sch, u32 classid, - u32 parentid, struct rtattr **tca, unsigned long *arg) +static int htb_change_class(struct Qdisc *sch, u32 classid, + u32 parentid, struct nlattr **tca, + unsigned long *arg) { int err = -EINVAL; struct htb_sched *q = qdisc_priv(sch); - struct htb_class *cl = (struct htb_class*)*arg,*parent; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct qdisc_rate_table *rtab = NULL, *ctab = NULL; - struct rtattr *tb[TCA_HTB_RTAB]; + struct htb_class *cl = (struct htb_class *)*arg, *parent; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_opt *hopt; + u64 rate64, ceil64; /* extract all subattrs from opt attr */ - if (!opt || rtattr_parse_nested(tb, TCA_HTB_RTAB, opt) || - tb[TCA_HTB_PARMS-1] == NULL || - RTA_PAYLOAD(tb[TCA_HTB_PARMS-1]) < sizeof(*hopt)) + if (!opt) goto failure; - - parent = parentid == TC_H_ROOT ? NULL : htb_find (parentid,sch); - hopt = RTA_DATA(tb[TCA_HTB_PARMS-1]); - HTB_DBG(0,1,"htb_chg cl=%p(%X), clid=%X, parid=%X, opt/prio=%d, rate=%u, buff=%d, quant=%d\n", cl,cl?cl->classid:0,classid,parentid,(int)hopt->prio,hopt->rate.rate,hopt->buffer,hopt->quantum); - rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB-1]); - ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB-1]); - if (!rtab || !ctab) goto failure; + err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy); + if (err < 0) + goto failure; - if (!cl) { /* new class */ + err = -EINVAL; + if (tb[TCA_HTB_PARMS] == NULL) + goto failure; + + parent = parentid == TC_H_ROOT ? 
NULL : htb_find(parentid, sch); + + hopt = nla_data(tb[TCA_HTB_PARMS]); + if (!hopt->rate.rate || !hopt->ceil.rate) + goto failure; + + /* Keeping backward compatible with rate_table based iproute2 tc */ + if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB])); + + if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB])); + + if (!cl) { /* new class */ struct Qdisc *new_q; + int prio; + struct { + struct nlattr nla; + struct gnet_estimator opt; + } est = { + .nla = { + .nla_len = nla_attr_size(sizeof(est.opt)), + .nla_type = TCA_RATE, + }, + .opt = { + /* 4s interval, 16s averaging constant */ + .interval = 2, + .ewma_log = 2, + }, + }; + /* check for valid classid */ - if (!classid || TC_H_MAJ(classid^sch->handle) || htb_find(classid,sch)) + if (!classid || TC_H_MAJ(classid ^ sch->handle) || + htb_find(classid, sch)) goto failure; /* check maximal depth */ if (parent && parent->parent && parent->parent->level < 2) { - printk(KERN_ERR "htb: tree is too deep\n"); + pr_err("htb: tree is too deep\n"); goto failure; } err = -ENOBUFS; - if ((cl = kmalloc(sizeof(*cl), GFP_KERNEL)) == NULL) + cl = kzalloc(sizeof(*cl), GFP_KERNEL); + if (!cl) goto failure; - - memset(cl, 0, sizeof(*cl)); + + if (htb_rate_est || tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE] ? : &est.nla); + if (err) { + kfree(cl); + goto failure; + } + } + cl->refcnt = 1; - INIT_LIST_HEAD(&cl->sibling); - INIT_LIST_HEAD(&cl->hlist); - INIT_LIST_HEAD(&cl->children); + cl->children = 0; INIT_LIST_HEAD(&cl->un.leaf.drop_list); -#ifdef HTB_DEBUG - cl->magic = HTB_CMAGIC; -#endif + RB_CLEAR_NODE(&cl->pq_node); + + for (prio = 0; prio < TC_HTB_NUMPRIO; prio++) + RB_CLEAR_NODE(&cl->node[prio]); /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL) - so that can't be used inside of sch_tree_lock - -- thanks to Karlis Peisenieks */ - new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + * so that can't be used inside of sch_tree_lock + * -- thanks to Karlis Peisenieks + */ + new_q = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, classid); sch_tree_lock(sch); if (parent && !parent->level) { + unsigned int qlen = parent->un.leaf.q->q.qlen; + /* turn parent into inner node */ - sch->q.qlen -= parent->un.leaf.q->q.qlen; - qdisc_destroy (parent->un.leaf.q); - if (parent->prio_activity) - htb_deactivate (q,parent); + qdisc_reset(parent->un.leaf.q); + qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen); + qdisc_destroy(parent->un.leaf.q); + if (parent->prio_activity) + htb_deactivate(q, parent); /* remove from evt list because of level change */ if (parent->cmode != HTB_CAN_SEND) { - htb_safe_rb_erase(&parent->pq_node,q->wait_pq /*+0*/); + htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq); parent->cmode = HTB_CAN_SEND; } parent->level = (parent->parent ? parent->parent->level - : TC_HTB_MAXDEPTH) - 1; - memset (&parent->un.inner,0,sizeof(parent->un.inner)); + : TC_HTB_MAXDEPTH) - 1; + memset(&parent->un.inner, 0, sizeof(parent->un.inner)); } /* leaf (we) needs elementary qdisc */ cl->un.leaf.q = new_q ? 
new_q : &noop_qdisc; - cl->classid = classid; cl->parent = parent; + cl->common.classid = classid; + cl->parent = parent; /* set class to be in HTB_CAN_SEND state */ - cl->tokens = hopt->buffer; - cl->ctokens = hopt->cbuffer; - cl->mbuffer = 60000000; /* 1min */ - PSCHED_GET_TIME(cl->t_c); + cl->tokens = PSCHED_TICKS2NS(hopt->buffer); + cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer); + cl->mbuffer = 60ULL * NSEC_PER_SEC; /* 1min */ + cl->t_c = ktime_to_ns(ktime_get()); cl->cmode = HTB_CAN_SEND; /* attach to the hash list and parent's family */ - list_add_tail(&cl->hlist, q->hash+htb_hash(classid)); - list_add_tail(&cl->sibling, parent ? &parent->children : &q->root); -#ifdef HTB_DEBUG - { - int i; - for (i = 0; i < TC_HTB_NUMPRIO; i++) cl->node[i].rb_color = -1; - cl->pq_node.rb_color = -1; + qdisc_class_hash_insert(&q->clhash, &cl->common); + if (parent) + parent->children++; + } else { + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) + return err; } -#endif - } else sch_tree_lock(sch); + sch_tree_lock(sch); + } + + rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0; + + ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0; + + psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64); + psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64); /* it used to be a nasty bug here, we have to check that node - is really leaf before changing cl->un.leaf ! */ + * is really leaf before changing cl->un.leaf ! + */ if (!cl->level) { - cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum; - if (!hopt->quantum && cl->un.leaf.quantum < 1000) { - printk(KERN_WARNING "HTB: quantum of class %X is small. Consider r2q change.\n", cl->classid); - cl->un.leaf.quantum = 1000; + u64 quantum = cl->rate.rate_bytes_ps; + + do_div(quantum, q->rate2quantum); + cl->quantum = min_t(u64, quantum, INT_MAX); + + if (!hopt->quantum && cl->quantum < 1000) { + pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n", + cl->common.classid); + cl->quantum = 1000; } - if (!hopt->quantum && cl->un.leaf.quantum > 200000) { - printk(KERN_WARNING "HTB: quantum of class %X is big. Consider r2q change.\n", cl->classid); - cl->un.leaf.quantum = 200000; + if (!hopt->quantum && cl->quantum > 200000) { + pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n", + cl->common.classid); + cl->quantum = 200000; } if (hopt->quantum) - cl->un.leaf.quantum = hopt->quantum; - if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO) - cl->un.leaf.prio = TC_HTB_NUMPRIO - 1; + cl->quantum = hopt->quantum; + if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO) + cl->prio = TC_HTB_NUMPRIO - 1; } - cl->buffer = hopt->buffer; - cl->cbuffer = hopt->cbuffer; - if (cl->rate) qdisc_put_rtab(cl->rate); cl->rate = rtab; - if (cl->ceil) qdisc_put_rtab(cl->ceil); cl->ceil = ctab; + cl->buffer = PSCHED_TICKS2NS(hopt->buffer); + cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer); + sch_tree_unlock(sch); + qdisc_class_hash_grow(sch, &q->clhash); + *arg = (unsigned long)cl; return 0; failure: - if (rtab) qdisc_put_rtab(rtab); - if (ctab) qdisc_put_rtab(ctab); return err; } @@ -1656,55 +1524,48 @@ static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg) struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)arg; struct tcf_proto **fl = cl ? 
&cl->filter_list : &q->filter_list; - HTB_DBG(0,2,"htb_tcf q=%p clid=%X fref=%d fl=%p\n",q,cl?cl->classid:0,cl?cl->filter_cnt:q->filter_cnt,*fl); + return fl; } static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent, - u32 classid) + u32 classid) { - struct htb_sched *q = qdisc_priv(sch); - struct htb_class *cl = htb_find (classid,sch); - HTB_DBG(0,2,"htb_bind q=%p clid=%X cl=%p fref=%d\n",q,classid,cl,cl?cl->filter_cnt:q->filter_cnt); + struct htb_class *cl = htb_find(classid, sch); + /*if (cl && !cl->level) return 0; - The line above used to be there to prevent attaching filters to - leaves. But at least tc_index filter uses this just to get class - for other reasons so that we have to allow for it. - ---- - 19.6.2002 As Werner explained it is ok - bind filter is just - another way to "lock" the class - unlike "get" this lock can - be broken by class during destroy IIUC. + * The line above used to be there to prevent attaching filters to + * leaves. But at least tc_index filter uses this just to get class + * for other reasons so that we have to allow for it. + * ---- + * 19.6.2002 As Werner explained it is ok - bind filter is just + * another way to "lock" the class - unlike "get" this lock can + * be broken by class during destroy IIUC. */ - if (cl) - cl->filter_cnt++; - else - q->filter_cnt++; + if (cl) + cl->filter_cnt++; return (unsigned long)cl; } static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg) { - struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)arg; - HTB_DBG(0,2,"htb_unbind q=%p cl=%p fref=%d\n",q,cl,cl?cl->filter_cnt:q->filter_cnt); - if (cl) - cl->filter_cnt--; - else - q->filter_cnt--; + + if (cl) + cl->filter_cnt--; } static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct htb_sched *q = qdisc_priv(sch); - int i; + struct htb_class *cl; + unsigned int i; if (arg->stop) return; - for (i = 0; i < HTB_HSIZE; i++) { - struct list_head *p; - list_for_each (p,q->hash+i) { - struct htb_class *cl = list_entry(p,struct htb_class,hlist); + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; @@ -1718,9 +1579,10 @@ static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct Qdisc_class_ops htb_class_ops = { +static const struct Qdisc_class_ops htb_class_ops = { .graft = htb_graft, .leaf = htb_leaf, + .qlen_notify = htb_qlen_notify, .get = htb_get, .put = htb_put, .change = htb_change_class, @@ -1733,31 +1595,30 @@ static struct Qdisc_class_ops htb_class_ops = { .dump_stats = htb_dump_class_stats, }; -static struct Qdisc_ops htb_qdisc_ops = { - .next = NULL, +static struct Qdisc_ops htb_qdisc_ops __read_mostly = { .cl_ops = &htb_class_ops, .id = "htb", .priv_size = sizeof(struct htb_sched), .enqueue = htb_enqueue, .dequeue = htb_dequeue, - .requeue = htb_requeue, + .peek = qdisc_peek_dequeued, .drop = htb_drop, .init = htb_init, .reset = htb_reset, .destroy = htb_destroy, - .change = NULL /* htb_change */, .dump = htb_dump, .owner = THIS_MODULE, }; static int __init htb_module_init(void) { - return register_qdisc(&htb_qdisc_ops); + return register_qdisc(&htb_qdisc_ops); } -static void __exit htb_module_exit(void) +static void __exit htb_module_exit(void) { - unregister_qdisc(&htb_qdisc_ops); + unregister_qdisc(&htb_qdisc_ops); } + module_init(htb_module_init) module_exit(htb_module_exit) MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_ingress.c 
b/net/sched/sch_ingress.c index 8edc32a6ad2..62871c14e1f 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -1,4 +1,4 @@ -/* net/sched/sch_ingress.c - Ingress qdisc +/* net/sched/sch_ingress.c - Ingress qdisc * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -7,430 +7,136 @@ * Authors: Jamal Hadi Salim 1999 */ -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> +#include <linux/list.h> #include <linux/skbuff.h> -#include <linux/netdevice.h> #include <linux/rtnetlink.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv6.h> -#include <linux/netfilter.h> -#include <linux/smp.h> +#include <net/netlink.h> #include <net/pkt_sched.h> -#include <asm/byteorder.h> -#include <asm/uaccess.h> -#include <linux/kmod.h> -#include <linux/stat.h> -#include <linux/interrupt.h> -#include <linux/list.h> - - -#undef DEBUG_INGRESS - -#ifdef DEBUG_INGRESS /* control */ -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif -#if 0 /* data */ -#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define D2PRINTK(format,args...) -#endif - - -#define PRIV(sch) qdisc_priv(sch) - - -/* Thanks to Doron Oz for this hack -*/ -#ifndef CONFIG_NET_CLS_ACT -#ifdef CONFIG_NETFILTER -static int nf_registered; -#endif -#endif struct ingress_qdisc_data { - struct Qdisc *q; struct tcf_proto *filter_list; }; - /* ------------------------- Class/flow operations ------------------------- */ - -static int ingress_graft(struct Qdisc *sch,unsigned long arg, - struct Qdisc *new,struct Qdisc **old) -{ -#ifdef DEBUG_INGRESS - struct ingress_qdisc_data *p = PRIV(sch); -#endif - - DPRINTK("ingress_graft(sch %p,[qdisc %p],new %p,old %p)\n", - sch, p, new, old); - DPRINTK("\n ingress_graft: You cannot add qdiscs to classes"); - return 1; -} - - static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; } - -static unsigned long ingress_get(struct Qdisc *sch,u32 classid) +static unsigned long ingress_get(struct Qdisc *sch, u32 classid) { -#ifdef DEBUG_INGRESS - struct ingress_qdisc_data *p = PRIV(sch); -#endif - DPRINTK("ingress_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid); return TC_H_MIN(classid) + 1; } - static unsigned long ingress_bind_filter(struct Qdisc *sch, - unsigned long parent, u32 classid) + unsigned long parent, u32 classid) { return ingress_get(sch, classid); } - static void ingress_put(struct Qdisc *sch, unsigned long cl) { } - -static int ingress_change(struct Qdisc *sch, u32 classid, u32 parent, - struct rtattr **tca, unsigned long *arg) -{ -#ifdef DEBUG_INGRESS - struct ingress_qdisc_data *p = PRIV(sch); -#endif - DPRINTK("ingress_change(sch %p,[qdisc %p],classid %x,parent %x)," - "arg 0x%lx\n", sch, p, classid, parent, *arg); - DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment"); - return 0; -} - - - -static void ingress_walk(struct Qdisc *sch,struct qdisc_walker *walker) +static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) { -#ifdef DEBUG_INGRESS - struct ingress_qdisc_data *p = PRIV(sch); -#endif - DPRINTK("ingress_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); - DPRINTK("No effect. 
sch_ingress doesn't maintain classes at the moment"); } - -static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch,unsigned long cl) +static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch, unsigned long cl) { - struct ingress_qdisc_data *p = PRIV(sch); + struct ingress_qdisc_data *p = qdisc_priv(sch); return &p->filter_list; } - /* --------------------------- Qdisc operations ---------------------------- */ - -static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch) +static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct ingress_qdisc_data *p = PRIV(sch); + struct ingress_qdisc_data *p = qdisc_priv(sch); struct tcf_result res; int result; - D2PRINTK("ingress_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); result = tc_classify(skb, p->filter_list, &res); - D2PRINTK("result %d class 0x%04x\n", result, res.classid); - /* - * Unlike normal "enqueue" functions, ingress_enqueue returns a - * firewall FW_* code. - */ -#ifdef CONFIG_NET_CLS_ACT - sch->bstats.packets++; - sch->bstats.bytes += skb->len; - switch (result) { - case TC_ACT_SHOT: - result = TC_ACT_SHOT; - sch->qstats.drops++; - break; - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - result = TC_ACT_STOLEN; - break; - case TC_ACT_RECLASSIFY: - case TC_ACT_OK: - case TC_ACT_UNSPEC: - default: - skb->tc_index = TC_H_MIN(res.classid); - result = TC_ACT_OK; - break; - }; -/* backward compat */ -#else -#ifdef CONFIG_NET_CLS_POLICE + + qdisc_bstats_update(sch, skb); switch (result) { - case TC_POLICE_SHOT: - result = NF_DROP; + case TC_ACT_SHOT: + result = TC_ACT_SHOT; sch->qstats.drops++; break; - case TC_POLICE_RECLASSIFY: /* DSCP remarking here ? */ - case TC_POLICE_OK: - case TC_POLICE_UNSPEC: - default: - sch->bstats.packets++; - sch->bstats.bytes += skb->len; - result = NF_ACCEPT; + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + result = TC_ACT_STOLEN; + break; + case TC_ACT_RECLASSIFY: + case TC_ACT_OK: + skb->tc_index = TC_H_MIN(res.classid); + default: + result = TC_ACT_OK; break; - }; - -#else - D2PRINTK("Overriding result to ACCEPT\n"); - result = NF_ACCEPT; - sch->bstats.packets++; - sch->bstats.bytes += skb->len; -#endif -#endif - - return result; -} - - -static struct sk_buff *ingress_dequeue(struct Qdisc *sch) -{ -/* - struct ingress_qdisc_data *p = PRIV(sch); - D2PRINTK("ingress_dequeue(sch %p,[qdisc %p])\n",sch,PRIV(p)); -*/ - return NULL; -} - - -static int ingress_requeue(struct sk_buff *skb,struct Qdisc *sch) -{ -/* - struct ingress_qdisc_data *p = PRIV(sch); - D2PRINTK("ingress_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,PRIV(p)); -*/ - return 0; -} - -static unsigned int ingress_drop(struct Qdisc *sch) -{ -#ifdef DEBUG_INGRESS - struct ingress_qdisc_data *p = PRIV(sch); -#endif - DPRINTK("ingress_drop(sch %p,[qdisc %p])\n", sch, p); - return 0; -} - -#ifndef CONFIG_NET_CLS_ACT -#ifdef CONFIG_NETFILTER -static unsigned int -ing_hook(unsigned int hook, struct sk_buff **pskb, - const struct net_device *indev, - const struct net_device *outdev, - int (*okfn)(struct sk_buff *)) -{ - - struct Qdisc *q; - struct sk_buff *skb = *pskb; - struct net_device *dev = skb->dev; - int fwres=NF_ACCEPT; - - DPRINTK("ing_hook: skb %s dev=%s len=%u\n", - skb->sk ? "(owned)" : "(unowned)", - skb->dev ? 
(*pskb)->dev->name : "(no dev)", - skb->len); - -/* -revisit later: Use a private since lock dev->queue_lock is also -used on the egress (might slow things for an iota) -*/ - - if (dev->qdisc_ingress) { - spin_lock(&dev->queue_lock); - if ((q = dev->qdisc_ingress) != NULL) - fwres = q->enqueue(skb, q); - spin_unlock(&dev->queue_lock); - } - - return fwres; -} - -/* after ipt_filter */ -static struct nf_hook_ops ing_ops = { - .hook = ing_hook, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, - .priority = NF_IP_PRI_FILTER + 1, -}; - -static struct nf_hook_ops ing6_ops = { - .hook = ing_hook, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_IP6_PRE_ROUTING, - .priority = NF_IP6_PRI_FILTER + 1, -}; - -#endif -#endif - -static int ingress_init(struct Qdisc *sch,struct rtattr *opt) -{ - struct ingress_qdisc_data *p = PRIV(sch); - -/* Make sure either netfilter or preferably CLS_ACT is -* compiled in */ -#ifndef CONFIG_NET_CLS_ACT -#ifndef CONFIG_NETFILTER - printk("You MUST compile classifier actions into the kernel\n"); - return -EINVAL; -#else - printk("Ingress scheduler: Classifier actions prefered over netfilter\n"); -#endif -#endif - -#ifndef CONFIG_NET_CLS_ACT -#ifdef CONFIG_NETFILTER - if (!nf_registered) { - if (nf_register_hook(&ing_ops) < 0) { - printk("ingress qdisc registration error \n"); - return -EINVAL; - } - nf_registered++; - - if (nf_register_hook(&ing6_ops) < 0) { - printk("IPv6 ingress qdisc registration error, " \ - "disabling IPv6 support.\n"); - } else - nf_registered++; } -#endif -#endif - - DPRINTK("ingress_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); - p->q = &noop_qdisc; - return 0; -} - -static void ingress_reset(struct Qdisc *sch) -{ - struct ingress_qdisc_data *p = PRIV(sch); - - DPRINTK("ingress_reset(sch %p,[qdisc %p])\n", sch, p); - -/* -#if 0 -*/ -/* for future use */ - qdisc_reset(p->q); -/* -#endif -*/ + return result; } /* ------------------------------------------------------------- */ - -/* ------------------------------------------------------------- */ - static void ingress_destroy(struct Qdisc *sch) { - struct ingress_qdisc_data *p = PRIV(sch); - struct tcf_proto *tp; + struct ingress_qdisc_data *p = qdisc_priv(sch); - DPRINTK("ingress_destroy(sch %p,[qdisc %p])\n", sch, p); - while (p->filter_list) { - tp = p->filter_list; - p->filter_list = tp->next; - tcf_destroy(tp); - } -#if 0 -/* for future use */ - qdisc_destroy(p->q); -#endif + tcf_destroy_chain(&p->filter_list); } - static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) { - unsigned char *b = skb->tail; - struct rtattr *rta; + struct nlattr *nest; - rta = (struct rtattr *) b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); - rta->rta_len = skb->tail - b; - return skb->len; + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + return nla_nest_end(skb, nest); -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } -static struct Qdisc_class_ops ingress_class_ops = { - .graft = ingress_graft, +static const struct Qdisc_class_ops ingress_class_ops = { .leaf = ingress_leaf, .get = ingress_get, .put = ingress_put, - .change = ingress_change, - .delete = NULL, .walk = ingress_walk, .tcf_chain = ingress_find_tcf, .bind_tcf = ingress_bind_filter, .unbind_tcf = ingress_put, - .dump = NULL, }; -static struct Qdisc_ops ingress_qdisc_ops = { - .next = NULL, +static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .cl_ops = &ingress_class_ops, .id = "ingress", .priv_size = 
sizeof(struct ingress_qdisc_data), .enqueue = ingress_enqueue, - .dequeue = ingress_dequeue, - .requeue = ingress_requeue, - .drop = ingress_drop, - .init = ingress_init, - .reset = ingress_reset, .destroy = ingress_destroy, - .change = NULL, .dump = ingress_dump, .owner = THIS_MODULE, }; static int __init ingress_module_init(void) { - int ret = 0; - - if ((ret = register_qdisc(&ingress_qdisc_ops)) < 0) { - printk("Unable to register Ingress qdisc\n"); - return ret; - } - - return ret; + return register_qdisc(&ingress_qdisc_ops); } -static void __exit ingress_module_exit(void) + +static void __exit ingress_module_exit(void) { unregister_qdisc(&ingress_qdisc_ops); -#ifndef CONFIG_NET_CLS_ACT -#ifdef CONFIG_NETFILTER - if (nf_registered) { - nf_unregister_hook(&ing_ops); - if (nf_registered > 1) - nf_unregister_hook(&ing6_ops); - } -#endif -#endif } + module_init(ingress_module_init) module_exit(ingress_module_exit) MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c new file mode 100644 index 00000000000..a8b2864a696 --- /dev/null +++ b/net/sched/sch_mq.c @@ -0,0 +1,248 @@ +/* + * net/sched/sch_mq.c Classful multiqueue dummy scheduler + * + * Copyright (c) 2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> + +struct mq_sched { + struct Qdisc **qdiscs; +}; + +static void mq_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct mq_sched *priv = qdisc_priv(sch); + unsigned int ntx; + + if (!priv->qdiscs) + return; + for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) + qdisc_destroy(priv->qdiscs[ntx]); + kfree(priv->qdiscs); +} + +static int mq_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct net_device *dev = qdisc_dev(sch); + struct mq_sched *priv = qdisc_priv(sch); + struct netdev_queue *dev_queue; + struct Qdisc *qdisc; + unsigned int ntx; + + if (sch->parent != TC_H_ROOT) + return -EOPNOTSUPP; + + if (!netif_is_multiqueue(dev)) + return -EOPNOTSUPP; + + /* pre-allocate qdiscs, attachment can't fail */ + priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), + GFP_KERNEL); + if (priv->qdiscs == NULL) + return -ENOMEM; + + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + dev_queue = netdev_get_tx_queue(dev, ntx); + qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(ntx + 1))); + if (qdisc == NULL) + goto err; + priv->qdiscs[ntx] = qdisc; + qdisc->flags |= TCQ_F_ONETXQUEUE; + } + + sch->flags |= TCQ_F_MQROOT; + return 0; + +err: + mq_destroy(sch); + return -ENOMEM; +} + +static void mq_attach(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct mq_sched *priv = qdisc_priv(sch); + struct Qdisc *qdisc, *old; + unsigned int ntx; + + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + qdisc = priv->qdiscs[ntx]; + old = dev_graft_qdisc(qdisc->dev_queue, qdisc); + if (old) + qdisc_destroy(old); +#ifdef CONFIG_NET_SCHED + if (ntx < dev->real_num_tx_queues) + qdisc_list_add(qdisc); +#endif + + } + kfree(priv->qdiscs); + priv->qdiscs = NULL; +} + +static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct 
net_device *dev = qdisc_dev(sch); + struct Qdisc *qdisc; + unsigned int ntx; + + sch->q.qlen = 0; + memset(&sch->bstats, 0, sizeof(sch->bstats)); + memset(&sch->qstats, 0, sizeof(sch->qstats)); + + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; + spin_lock_bh(qdisc_lock(qdisc)); + sch->q.qlen += qdisc->q.qlen; + sch->bstats.bytes += qdisc->bstats.bytes; + sch->bstats.packets += qdisc->bstats.packets; + sch->qstats.qlen += qdisc->qstats.qlen; + sch->qstats.backlog += qdisc->qstats.backlog; + sch->qstats.drops += qdisc->qstats.drops; + sch->qstats.requeues += qdisc->qstats.requeues; + sch->qstats.overlimits += qdisc->qstats.overlimits; + spin_unlock_bh(qdisc_lock(qdisc)); + } + return 0; +} + +static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned long ntx = cl - 1; + + if (ntx >= dev->num_tx_queues) + return NULL; + return netdev_get_tx_queue(dev, ntx); +} + +static struct netdev_queue *mq_select_queue(struct Qdisc *sch, + struct tcmsg *tcm) +{ + unsigned int ntx = TC_H_MIN(tcm->tcm_parent); + struct netdev_queue *dev_queue = mq_queue_get(sch, ntx); + + if (!dev_queue) { + struct net_device *dev = qdisc_dev(sch); + + return netdev_get_tx_queue(dev, 0); + } + return dev_queue; +} + +static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, + struct Qdisc **old) +{ + struct netdev_queue *dev_queue = mq_queue_get(sch, cl); + struct net_device *dev = qdisc_dev(sch); + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + *old = dev_graft_qdisc(dev_queue, new); + if (new) + new->flags |= TCQ_F_ONETXQUEUE; + if (dev->flags & IFF_UP) + dev_activate(dev); + return 0; +} + +static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl) +{ + struct netdev_queue *dev_queue = mq_queue_get(sch, cl); + + return dev_queue->qdisc_sleeping; +} + +static unsigned long mq_get(struct Qdisc *sch, u32 classid) +{ + unsigned int ntx = TC_H_MIN(classid); + + if (!mq_queue_get(sch, ntx)) + return 0; + return ntx; +} + +static void mq_put(struct Qdisc *sch, unsigned long cl) +{ +} + +static int mq_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct netdev_queue *dev_queue = mq_queue_get(sch, cl); + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle |= TC_H_MIN(cl); + tcm->tcm_info = dev_queue->qdisc_sleeping->handle; + return 0; +} + +static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct netdev_queue *dev_queue = mq_queue_get(sch, cl); + + sch = dev_queue->qdisc_sleeping; + sch->qstats.qlen = sch->q.qlen; + if (gnet_stats_copy_basic(d, &sch->bstats) < 0 || + gnet_stats_copy_queue(d, &sch->qstats) < 0) + return -1; + return 0; +} + +static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned int ntx; + + if (arg->stop) + return; + + arg->count = arg->skip; + for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { + if (arg->fn(sch, ntx + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops mq_class_ops = { + .select_queue = mq_select_queue, + .graft = mq_graft, + .leaf = mq_leaf, + .get = mq_get, + .put = mq_put, + .walk = mq_walk, + .dump = mq_dump_class, + .dump_stats = mq_dump_class_stats, +}; + +struct Qdisc_ops mq_qdisc_ops __read_mostly = { + .cl_ops = &mq_class_ops, + .id = "mq", + .priv_size = sizeof(struct mq_sched), + .init = mq_init, + 
.destroy = mq_destroy, + .attach = mq_attach, + .dump = mq_dump, + .owner = THIS_MODULE, +}; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c new file mode 100644 index 00000000000..6749e2f540d --- /dev/null +++ b/net/sched/sch_mqprio.c @@ -0,0 +1,426 @@ +/* + * net/sched/sch_mqprio.c + * + * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/module.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/sch_generic.h> + +struct mqprio_sched { + struct Qdisc **qdiscs; + int hw_owned; +}; + +static void mqprio_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct mqprio_sched *priv = qdisc_priv(sch); + unsigned int ntx; + + if (priv->qdiscs) { + for (ntx = 0; + ntx < dev->num_tx_queues && priv->qdiscs[ntx]; + ntx++) + qdisc_destroy(priv->qdiscs[ntx]); + kfree(priv->qdiscs); + } + + if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc) + dev->netdev_ops->ndo_setup_tc(dev, 0); + else + netdev_set_num_tc(dev, 0); +} + +static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) +{ + int i, j; + + /* Verify num_tc is not out of max range */ + if (qopt->num_tc > TC_MAX_QUEUE) + return -EINVAL; + + /* Verify priority mapping uses valid tcs */ + for (i = 0; i < TC_BITMASK + 1; i++) { + if (qopt->prio_tc_map[i] >= qopt->num_tc) + return -EINVAL; + } + + /* net_device does not support requested operation */ + if (qopt->hw && !dev->netdev_ops->ndo_setup_tc) + return -EINVAL; + + /* if hw owned qcount and qoffset are taken from LLD so + * no reason to verify them here + */ + if (qopt->hw) + return 0; + + for (i = 0; i < qopt->num_tc; i++) { + unsigned int last = qopt->offset[i] + qopt->count[i]; + + /* Verify the queue count is in tx range being equal to the + * real_num_tx_queues indicates the last queue is in use. 
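mqprio_parse_opt() validates the per-traffic-class queue ranges (continued just below): each {offset, count} range must lie inside real_num_tx_queues and the ranges must not overlap. The same checks as a hedged standalone sketch, with plain arrays instead of struct tc_mqprio_qopt and arbitrary example values:

#include <stdio.h>

/* Return 0 if the {offset,count} ranges form a valid, non-overlapping
 * partition of [0, real_num_tx_queues); -1 otherwise.
 */
static int validate_tc_queues(const unsigned int *offset,
			      const unsigned int *count,
			      int num_tc, unsigned int real_num_tx_queues)
{
	int i, j;

	for (i = 0; i < num_tc; i++) {
		unsigned int last = offset[i] + count[i];

		if (offset[i] >= real_num_tx_queues ||
		    !count[i] || last > real_num_tx_queues)
			return -1;

		/* Ranges must not overlap; like the kernel check, this
		 * effectively assumes offsets are given in ascending order.
		 */
		for (j = i + 1; j < num_tc; j++)
			if (last > offset[j])
				return -1;
	}
	return 0;
}

int main(void)
{
	/* e.g. 3 traffic classes over 8 tx queues: 0-3, 4-5, 6-7 */
	unsigned int offset[] = { 0, 4, 6 };
	unsigned int count[]  = { 4, 2, 2 };

	printf("valid: %s\n",
	       validate_tc_queues(offset, count, 3, 8) == 0 ? "yes" : "no");
	return 0;
}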
+ */ + if (qopt->offset[i] >= dev->real_num_tx_queues || + !qopt->count[i] || + last > dev->real_num_tx_queues) + return -EINVAL; + + /* Verify that the offset and counts do not overlap */ + for (j = i + 1; j < qopt->num_tc; j++) { + if (last > qopt->offset[j]) + return -EINVAL; + } + } + + return 0; +} + +static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct net_device *dev = qdisc_dev(sch); + struct mqprio_sched *priv = qdisc_priv(sch); + struct netdev_queue *dev_queue; + struct Qdisc *qdisc; + int i, err = -EOPNOTSUPP; + struct tc_mqprio_qopt *qopt = NULL; + + BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE); + BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK); + + if (sch->parent != TC_H_ROOT) + return -EOPNOTSUPP; + + if (!netif_is_multiqueue(dev)) + return -EOPNOTSUPP; + + if (!opt || nla_len(opt) < sizeof(*qopt)) + return -EINVAL; + + qopt = nla_data(opt); + if (mqprio_parse_opt(dev, qopt)) + return -EINVAL; + + /* pre-allocate qdisc, attachment can't fail */ + priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), + GFP_KERNEL); + if (priv->qdiscs == NULL) { + err = -ENOMEM; + goto err; + } + + for (i = 0; i < dev->num_tx_queues; i++) { + dev_queue = netdev_get_tx_queue(dev, i); + qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(i + 1))); + if (qdisc == NULL) { + err = -ENOMEM; + goto err; + } + priv->qdiscs[i] = qdisc; + qdisc->flags |= TCQ_F_ONETXQUEUE; + } + + /* If the mqprio options indicate that hardware should own + * the queue mapping then run ndo_setup_tc otherwise use the + * supplied and verified mapping + */ + if (qopt->hw) { + priv->hw_owned = 1; + err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc); + if (err) + goto err; + } else { + netdev_set_num_tc(dev, qopt->num_tc); + for (i = 0; i < qopt->num_tc; i++) + netdev_set_tc_queue(dev, i, + qopt->count[i], qopt->offset[i]); + } + + /* Always use supplied priority mappings */ + for (i = 0; i < TC_BITMASK + 1; i++) + netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]); + + sch->flags |= TCQ_F_MQROOT; + return 0; + +err: + mqprio_destroy(sch); + return err; +} + +static void mqprio_attach(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct mqprio_sched *priv = qdisc_priv(sch); + struct Qdisc *qdisc, *old; + unsigned int ntx; + + /* Attach underlying qdisc */ + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + qdisc = priv->qdiscs[ntx]; + old = dev_graft_qdisc(qdisc->dev_queue, qdisc); + if (old) + qdisc_destroy(old); + if (ntx < dev->real_num_tx_queues) + qdisc_list_add(qdisc); + } + kfree(priv->qdiscs); + priv->qdiscs = NULL; +} + +static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch, + unsigned long cl) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned long ntx = cl - 1 - netdev_get_num_tc(dev); + + if (ntx >= dev->num_tx_queues) + return NULL; + return netdev_get_tx_queue(dev, ntx); +} + +static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, + struct Qdisc **old) +{ + struct net_device *dev = qdisc_dev(sch); + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + + if (!dev_queue) + return -EINVAL; + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + *old = dev_graft_qdisc(dev_queue, new); + + if (new) + new->flags |= TCQ_F_ONETXQUEUE; + + if (dev->flags & IFF_UP) + dev_activate(dev); + + return 0; +} + +static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct net_device *dev = qdisc_dev(sch); + struct mqprio_sched *priv = qdisc_priv(sch); + 
unsigned char *b = skb_tail_pointer(skb); + struct tc_mqprio_qopt opt = { 0 }; + struct Qdisc *qdisc; + unsigned int i; + + sch->q.qlen = 0; + memset(&sch->bstats, 0, sizeof(sch->bstats)); + memset(&sch->qstats, 0, sizeof(sch->qstats)); + + for (i = 0; i < dev->num_tx_queues; i++) { + qdisc = netdev_get_tx_queue(dev, i)->qdisc; + spin_lock_bh(qdisc_lock(qdisc)); + sch->q.qlen += qdisc->q.qlen; + sch->bstats.bytes += qdisc->bstats.bytes; + sch->bstats.packets += qdisc->bstats.packets; + sch->qstats.qlen += qdisc->qstats.qlen; + sch->qstats.backlog += qdisc->qstats.backlog; + sch->qstats.drops += qdisc->qstats.drops; + sch->qstats.requeues += qdisc->qstats.requeues; + sch->qstats.overlimits += qdisc->qstats.overlimits; + spin_unlock_bh(qdisc_lock(qdisc)); + } + + opt.num_tc = netdev_get_num_tc(dev); + memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); + opt.hw = priv->hw_owned; + + for (i = 0; i < netdev_get_num_tc(dev); i++) { + opt.count[i] = dev->tc_to_txq[i].count; + opt.offset[i] = dev->tc_to_txq[i].offset; + } + + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; + + return skb->len; +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl) +{ + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + + if (!dev_queue) + return NULL; + + return dev_queue->qdisc_sleeping; +} + +static unsigned long mqprio_get(struct Qdisc *sch, u32 classid) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned int ntx = TC_H_MIN(classid); + + if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev)) + return 0; + return ntx; +} + +static void mqprio_put(struct Qdisc *sch, unsigned long cl) +{ +} + +static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct net_device *dev = qdisc_dev(sch); + + if (cl <= netdev_get_num_tc(dev)) { + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_info = 0; + } else { + int i; + struct netdev_queue *dev_queue; + + dev_queue = mqprio_queue_get(sch, cl); + tcm->tcm_parent = 0; + for (i = 0; i < netdev_get_num_tc(dev); i++) { + struct netdev_tc_txq tc = dev->tc_to_txq[i]; + int q_idx = cl - netdev_get_num_tc(dev); + + if (q_idx > tc.offset && + q_idx <= tc.offset + tc.count) { + tcm->tcm_parent = + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(i + 1)); + break; + } + } + tcm->tcm_info = dev_queue->qdisc_sleeping->handle; + } + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) + __releases(d->lock) + __acquires(d->lock) +{ + struct net_device *dev = qdisc_dev(sch); + + if (cl <= netdev_get_num_tc(dev)) { + int i; + struct Qdisc *qdisc; + struct gnet_stats_queue qstats = {0}; + struct gnet_stats_basic_packed bstats = {0}; + struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1]; + + /* Drop lock here it will be reclaimed before touching + * statistics this is required because the d->lock we + * hold here is the look on dev_queue->qdisc_sleeping + * also acquired below. 
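mqprio_dump() and mqprio_dump_class() above expose the two tables that drive this qdisc: prio_tc_map (priority to traffic class) and tc_to_txq (traffic class to a queue range). Resolving a packet priority to its tx queue range is just two lookups; a hedged sketch with hypothetical example values:

#include <stdio.h>

#define TC_BITMASK 15

struct tc_txq { unsigned int count, offset; };

int main(void)
{
	/* Hypothetical setup for 3 traffic classes over 8 queues:
	 * priorities 0-1 -> tc0, 2-3 -> tc1, everything else -> tc2.
	 */
	unsigned char prio_tc_map[TC_BITMASK + 1] = {
		0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
	};
	struct tc_txq tc_to_txq[3] = {
		{ .count = 4, .offset = 0 },	/* tc0 -> queues 0-3 */
		{ .count = 2, .offset = 4 },	/* tc1 -> queues 4-5 */
		{ .count = 2, .offset = 6 },	/* tc2 -> queues 6-7 */
	};
	unsigned int prio;

	for (prio = 0; prio <= 7; prio++) {
		unsigned int tc = prio_tc_map[prio & TC_BITMASK];
		struct tc_txq *t = &tc_to_txq[tc];

		printf("prio %u -> tc %u -> queues %u..%u\n",
		       prio, tc, t->offset, t->offset + t->count - 1);
	}
	return 0;
}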
+ */ + spin_unlock_bh(d->lock); + + for (i = tc.offset; i < tc.offset + tc.count; i++) { + qdisc = netdev_get_tx_queue(dev, i)->qdisc; + spin_lock_bh(qdisc_lock(qdisc)); + bstats.bytes += qdisc->bstats.bytes; + bstats.packets += qdisc->bstats.packets; + qstats.qlen += qdisc->qstats.qlen; + qstats.backlog += qdisc->qstats.backlog; + qstats.drops += qdisc->qstats.drops; + qstats.requeues += qdisc->qstats.requeues; + qstats.overlimits += qdisc->qstats.overlimits; + spin_unlock_bh(qdisc_lock(qdisc)); + } + /* Reclaim root sleeping lock before completing stats */ + spin_lock_bh(d->lock); + if (gnet_stats_copy_basic(d, &bstats) < 0 || + gnet_stats_copy_queue(d, &qstats) < 0) + return -1; + } else { + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + + sch = dev_queue->qdisc_sleeping; + sch->qstats.qlen = sch->q.qlen; + if (gnet_stats_copy_basic(d, &sch->bstats) < 0 || + gnet_stats_copy_queue(d, &sch->qstats) < 0) + return -1; + } + return 0; +} + +static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned long ntx; + + if (arg->stop) + return; + + /* Walk hierarchy with a virtual class per tc */ + arg->count = arg->skip; + for (ntx = arg->skip; + ntx < dev->num_tx_queues + netdev_get_num_tc(dev); + ntx++) { + if (arg->fn(sch, ntx + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops mqprio_class_ops = { + .graft = mqprio_graft, + .leaf = mqprio_leaf, + .get = mqprio_get, + .put = mqprio_put, + .walk = mqprio_walk, + .dump = mqprio_dump_class, + .dump_stats = mqprio_dump_class_stats, +}; + +static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = { + .cl_ops = &mqprio_class_ops, + .id = "mqprio", + .priv_size = sizeof(struct mqprio_sched), + .init = mqprio_init, + .destroy = mqprio_destroy, + .attach = mqprio_attach, + .dump = mqprio_dump, + .owner = THIS_MODULE, +}; + +static int __init mqprio_module_init(void) +{ + return register_qdisc(&mqprio_qdisc_ops); +} + +static void __exit mqprio_module_exit(void) +{ + unregister_qdisc(&mqprio_qdisc_ops); +} + +module_init(mqprio_module_init); +module_exit(mqprio_module_exit); + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c new file mode 100644 index 00000000000..afb050a735f --- /dev/null +++ b/net/sched/sch_multiq.c @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2008, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, see <http://www.gnu.org/licenses/>. 
+ * + * Author: Alexander Duyck <alexander.h.duyck@intel.com> + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> + + +struct multiq_sched_data { + u16 bands; + u16 max_bands; + u16 curband; + struct tcf_proto *filter_list; + struct Qdisc **queues; +}; + + +static struct Qdisc * +multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + u32 band; + struct tcf_result res; + int err; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + err = tc_classify(skb, q->filter_list, &res); +#ifdef CONFIG_NET_CLS_ACT + switch (err) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NULL; + } +#endif + band = skb_get_queue_mapping(skb); + + if (band >= q->bands) + return q->queues[0]; + + return q->queues[band]; +} + +static int +multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct Qdisc *qdisc; + int ret; + + qdisc = multiq_classify(skb, sch, &ret); +#ifdef CONFIG_NET_CLS_ACT + if (qdisc == NULL) { + + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } +#endif + + ret = qdisc_enqueue(skb, qdisc); + if (ret == NET_XMIT_SUCCESS) { + sch->q.qlen++; + return NET_XMIT_SUCCESS; + } + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + return ret; +} + +static struct sk_buff *multiq_dequeue(struct Qdisc *sch) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc; + struct sk_buff *skb; + int band; + + for (band = 0; band < q->bands; band++) { + /* cycle through bands to ensure fairness */ + q->curband++; + if (q->curband >= q->bands) + q->curband = 0; + + /* Check that target subqueue is available before + * pulling an skb to avoid head-of-line blocking. + */ + if (!netif_xmit_stopped( + netdev_get_tx_queue(qdisc_dev(sch), q->curband))) { + qdisc = q->queues[q->curband]; + skb = qdisc->dequeue(qdisc); + if (skb) { + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + return skb; + } + } + } + return NULL; + +} + +static struct sk_buff *multiq_peek(struct Qdisc *sch) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned int curband = q->curband; + struct Qdisc *qdisc; + struct sk_buff *skb; + int band; + + for (band = 0; band < q->bands; band++) { + /* cycle through bands to ensure fairness */ + curband++; + if (curband >= q->bands) + curband = 0; + + /* Check that target subqueue is available before + * pulling an skb to avoid head-of-line blocking. 
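multiq_dequeue() above advances curband round-robin and skips any band whose hardware tx queue is stopped, so one stalled subqueue cannot head-of-line block the others. A minimal userspace sketch of that selection loop, with a boolean array standing in for netif_xmit_stopped():

#include <stdbool.h>
#include <stdio.h>

#define BANDS 4

/* Pick the next band to service, starting after *curband and skipping
 * stopped bands; returns -1 if every band is stopped.
 */
static int pick_band(unsigned int *curband, const bool *stopped)
{
	int tries;

	for (tries = 0; tries < BANDS; tries++) {
		*curband = (*curband + 1) % BANDS;
		if (!stopped[*curband])
			return *curband;
	}
	return -1;
}

int main(void)
{
	bool stopped[BANDS] = { false, true, false, false };
	unsigned int curband = 0;
	int i;

	/* Band 1 is stalled; the other bands keep being served round-robin. */
	for (i = 0; i < 6; i++)
		printf("dequeue from band %d\n", pick_band(&curband, stopped));
	return 0;
}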
+ */ + if (!netif_xmit_stopped( + netdev_get_tx_queue(qdisc_dev(sch), curband))) { + qdisc = q->queues[curband]; + skb = qdisc->ops->peek(qdisc); + if (skb) + return skb; + } + } + return NULL; + +} + +static unsigned int multiq_drop(struct Qdisc *sch) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + int band; + unsigned int len; + struct Qdisc *qdisc; + + for (band = q->bands - 1; band >= 0; band--) { + qdisc = q->queues[band]; + if (qdisc->ops->drop) { + len = qdisc->ops->drop(qdisc); + if (len != 0) { + sch->q.qlen--; + return len; + } + } + } + return 0; +} + + +static void +multiq_reset(struct Qdisc *sch) +{ + u16 band; + struct multiq_sched_data *q = qdisc_priv(sch); + + for (band = 0; band < q->bands; band++) + qdisc_reset(q->queues[band]); + sch->q.qlen = 0; + q->curband = 0; +} + +static void +multiq_destroy(struct Qdisc *sch) +{ + int band; + struct multiq_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + for (band = 0; band < q->bands; band++) + qdisc_destroy(q->queues[band]); + + kfree(q->queues); +} + +static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + struct tc_multiq_qopt *qopt; + int i; + + if (!netif_is_multiqueue(qdisc_dev(sch))) + return -EOPNOTSUPP; + if (nla_len(opt) < sizeof(*qopt)) + return -EINVAL; + + qopt = nla_data(opt); + + qopt->bands = qdisc_dev(sch)->real_num_tx_queues; + + sch_tree_lock(sch); + q->bands = qopt->bands; + for (i = q->bands; i < q->max_bands; i++) { + if (q->queues[i] != &noop_qdisc) { + struct Qdisc *child = q->queues[i]; + q->queues[i] = &noop_qdisc; + qdisc_tree_decrease_qlen(child, child->q.qlen); + qdisc_destroy(child); + } + } + + sch_tree_unlock(sch); + + for (i = 0; i < q->bands; i++) { + if (q->queues[i] == &noop_qdisc) { + struct Qdisc *child, *old; + child = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, + i + 1)); + if (child) { + sch_tree_lock(sch); + old = q->queues[i]; + q->queues[i] = child; + + if (old != &noop_qdisc) { + qdisc_tree_decrease_qlen(old, + old->q.qlen); + qdisc_destroy(old); + } + sch_tree_unlock(sch); + } + } + } + return 0; +} + +static int multiq_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + int i, err; + + q->queues = NULL; + + if (opt == NULL) + return -EINVAL; + + q->max_bands = qdisc_dev(sch)->num_tx_queues; + + q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL); + if (!q->queues) + return -ENOBUFS; + for (i = 0; i < q->max_bands; i++) + q->queues[i] = &noop_qdisc; + + err = multiq_tune(sch, opt); + + if (err) + kfree(q->queues); + + return err; +} + +static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb_tail_pointer(skb); + struct tc_multiq_qopt opt; + + opt.bands = q->bands; + opt.max_bands = q->max_bands; + + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->queues[band]; + q->queues[band] = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc * +multiq_leaf(struct Qdisc *sch, 
unsigned long arg) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + return q->queues[band]; +} + +static unsigned long multiq_get(struct Qdisc *sch, u32 classid) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned long band = TC_H_MIN(classid); + + if (band - 1 >= q->bands) + return 0; + return band; +} + +static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return multiq_get(sch, classid); +} + + +static void multiq_put(struct Qdisc *q, unsigned long cl) +{ +} + +static int multiq_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + + tcm->tcm_handle |= TC_H_MIN(cl); + tcm->tcm_info = q->queues[cl - 1]->handle; + return 0; +} + +static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + struct Qdisc *cl_q; + + cl_q = q->queues[cl - 1]; + cl_q->qstats.qlen = cl_q->q.qlen; + if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 || + gnet_stats_copy_queue(d, &cl_q->qstats) < 0) + return -1; + + return 0; +} + +static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + int band; + + if (arg->stop) + return; + + for (band = 0; band < q->bands; band++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, band + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_proto **multiq_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static const struct Qdisc_class_ops multiq_class_ops = { + .graft = multiq_graft, + .leaf = multiq_leaf, + .get = multiq_get, + .put = multiq_put, + .walk = multiq_walk, + .tcf_chain = multiq_find_tcf, + .bind_tcf = multiq_bind, + .unbind_tcf = multiq_put, + .dump = multiq_dump_class, + .dump_stats = multiq_dump_class_stats, +}; + +static struct Qdisc_ops multiq_qdisc_ops __read_mostly = { + .next = NULL, + .cl_ops = &multiq_class_ops, + .id = "multiq", + .priv_size = sizeof(struct multiq_sched_data), + .enqueue = multiq_enqueue, + .dequeue = multiq_dequeue, + .peek = multiq_peek, + .drop = multiq_drop, + .init = multiq_init, + .reset = multiq_reset, + .destroy = multiq_destroy, + .change = multiq_tune, + .dump = multiq_dump, + .owner = THIS_MODULE, +}; + +static int __init multiq_module_init(void) +{ + return register_qdisc(&multiq_qdisc_ops); +} + +static void __exit multiq_module_exit(void) +{ + unregister_qdisc(&multiq_qdisc_ops); +} + +module_init(multiq_module_init) +module_exit(multiq_module_exit) + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index ba528320483..111d70fddae 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -4,28 +4,32 @@ * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. + * 2 of the License. * * Many of the algorithms and ideas for this came from - * NIST Net which is not copyrighted. + * NIST Net which is not copyrighted. 
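The walk callbacks in this diff (htb_walk, mq_walk, mqprio_walk and multiq_walk above) all follow the same qdisc_walker contract: skip the first arg->skip classes, invoke arg->fn for the rest, stop when fn returns a negative value, and keep arg->count in step so an interrupted dump can be resumed. A small sketch of the same iteration pattern over a plain array:

#include <stdio.h>

struct walker {
	int stop;
	int skip;
	int count;
	int (*fn)(unsigned long classid, struct walker *w);
};

static void walk(const unsigned long *classids, int n, struct walker *w)
{
	int i;

	if (w->stop)
		return;
	for (i = 0; i < n; i++) {
		if (w->count < w->skip) {	/* already dumped last time */
			w->count++;
			continue;
		}
		if (w->fn(classids[i], w) < 0) {
			w->stop = 1;
			break;
		}
		w->count++;
	}
}

static int print_class(unsigned long classid, struct walker *w)
{
	printf("class %lu\n", classid);
	return 0;
}

int main(void)
{
	unsigned long classids[] = { 1, 2, 3, 4, 5 };
	struct walker w = { .skip = 2, .fn = print_class };

	walk(classids, 5, &w);	/* prints classes 3, 4 and 5 */
	return 0;
}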
* * Authors: Stephen Hemminger <shemminger@osdl.org> * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro> */ -#include <linux/config.h> +#include <linux/mm.h> #include <linux/module.h> -#include <linux/bitops.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/vmalloc.h> #include <linux/rtnetlink.h> +#include <linux/reciprocal_div.h> +#include <linux/rbtree.h> +#include <net/netlink.h> #include <net/pkt_sched.h> +#include <net/inet_ecn.h> -#define VERSION "1.2" +#define VERSION "1.3" /* Network Emulation Queuing algorithm. ==================================== @@ -49,77 +53,285 @@ control either since that can be handled by using token bucket or other rate control. - The simulator is limited by the Linux timer resolution - and will create packet bursts on the HZ boundary (1ms). + Correlated Loss Generator models + + Added generation of correlated loss according to the + "Gilbert-Elliot" model, a 4-state markov model. + + References: + [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG + [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general + and intuitive loss model for packet networks and its implementation + in the Netem module in the Linux kernel", available in [1] + + Authors: Stefano Salsano <stefano.salsano at uniroma2.it + Fabio Ludovici <fabio.ludovici at yahoo.it> */ struct netem_sched_data { + /* internal t(ime)fifo qdisc uses t_root and sch->limit */ + struct rb_root t_root; + + /* optional qdisc for classful handling (NULL at netem init) */ struct Qdisc *qdisc; - struct timer_list timer; - u32 latency; + struct qdisc_watchdog watchdog; + + psched_tdiff_t latency; + psched_tdiff_t jitter; + u32 loss; + u32 ecn; u32 limit; u32 counter; u32 gap; - u32 jitter; u32 duplicate; u32 reorder; u32 corrupt; + u64 rate; + s32 packet_overhead; + u32 cell_size; + struct reciprocal_value cell_size_reciprocal; + s32 cell_overhead; struct crndstate { - unsigned long last; - unsigned long rho; + u32 last; + u32 rho; } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; struct disttable { u32 size; s16 table[0]; } *delay_dist; + + enum { + CLG_RANDOM, + CLG_4_STATES, + CLG_GILB_ELL, + } loss_model; + + enum { + TX_IN_GAP_PERIOD = 1, + TX_IN_BURST_PERIOD, + LOST_IN_GAP_PERIOD, + LOST_IN_BURST_PERIOD, + } _4_state_model; + + enum { + GOOD_STATE = 1, + BAD_STATE, + } GE_state_model; + + /* Correlated Loss Generation models */ + struct clgstate { + /* state of the Markov chain */ + u8 state; + + /* 4-states and Gilbert-Elliot models */ + u32 a1; /* p13 for 4-states or p for GE */ + u32 a2; /* p31 for 4-states or r for GE */ + u32 a3; /* p32 for 4-states or h for GE */ + u32 a4; /* p14 for 4-states or 1-k for GE */ + u32 a5; /* p23 used only in 4-states */ + } clg; + }; -/* Time stamp put into socket buffer control block */ +/* Time stamp put into socket buffer control block + * Only valid when skbs are in our internal t(ime)fifo queue. + */ struct netem_skb_cb { psched_time_t time_to_send; + ktime_t tstamp_save; }; +/* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp + * to hold a rb_node structure. + * + * If struct sk_buff layout is changed, the following checks will complain. 
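netem stores its rb_node by overlaying skb->next, skb->prev and skb->tstamp, and guards that trick with the BUILD_BUG_ON() offset checks that follow, so any sk_buff layout change breaks the build instead of corrupting memory. The same compile-time technique in standalone C11, using _Static_assert and hypothetical struct names:

#include <stddef.h>
#include <stdio.h>

struct node { struct node *left, *right; };	/* what we want to store */

struct buf {					/* hypothetical host struct */
	struct buf *next;
	struct buf *prev;
	long long tstamp;
	char payload[64];
};

/* Compile-time proof that next/prev/tstamp are adjacent and big enough
 * to hold a struct node, mirroring the BUILD_BUG_ON() checks below.
 */
_Static_assert(offsetof(struct buf, next) == 0, "next must be first");
_Static_assert(offsetof(struct buf, prev) ==
	       offsetof(struct buf, next) + sizeof(((struct buf *)0)->next),
	       "prev must follow next");
_Static_assert(offsetof(struct buf, tstamp) ==
	       offsetof(struct buf, prev) + sizeof(((struct buf *)0)->prev),
	       "tstamp must follow prev");
_Static_assert(sizeof(struct node) <=
	       sizeof(((struct buf *)0)->next) +
	       sizeof(((struct buf *)0)->prev) +
	       sizeof(((struct buf *)0)->tstamp),
	       "node must fit in the overlaid fields");

static struct node *buf_node(struct buf *b)
{
	return (struct node *)&b->next;		/* reuse the three fields */
}

int main(void)
{
	struct buf b;

	buf_node(&b)->left = buf_node(&b)->right = NULL;
	printf("node overlay at %p, buf at %p\n",
	       (void *)buf_node(&b), (void *)&b);
	return 0;
}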
+ */ +static struct rb_node *netem_rb_node(struct sk_buff *skb) +{ + BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0); + BUILD_BUG_ON(offsetof(struct sk_buff, prev) != + offsetof(struct sk_buff, next) + sizeof(skb->next)); + BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) != + offsetof(struct sk_buff, prev) + sizeof(skb->prev)); + BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) + + sizeof(skb->prev) + + sizeof(skb->tstamp)); + return (struct rb_node *)&skb->next; +} + +static struct sk_buff *netem_rb_to_skb(struct rb_node *rb) +{ + return (struct sk_buff *)rb; +} + +static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) +{ + /* we assume we can use skb next/prev/tstamp as storage for rb_node */ + qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb)); + return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data; +} + /* init_crandom - initialize correlated random number generator * Use entropy source for initial seed. */ static void init_crandom(struct crndstate *state, unsigned long rho) { state->rho = rho; - state->last = net_random(); + state->last = prandom_u32(); } /* get_crandom - correlated random number generator * Next number depends on last value. * rho is scaled to avoid floating point. */ -static unsigned long get_crandom(struct crndstate *state) +static u32 get_crandom(struct crndstate *state) { u64 value, rho; unsigned long answer; - if (state->rho == 0) /* no correllation */ - return net_random(); + if (state->rho == 0) /* no correlation */ + return prandom_u32(); - value = net_random(); + value = prandom_u32(); rho = (u64)state->rho + 1; answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; state->last = answer; return answer; } +/* loss_4state - 4-state model loss generator + * Generates losses according to the 4-state Markov chain adopted in + * the GI (General and Intuitive) loss model. + */ +static bool loss_4state(struct netem_sched_data *q) +{ + struct clgstate *clg = &q->clg; + u32 rnd = prandom_u32(); + + /* + * Makes a comparison between rnd and the transition + * probabilities outgoing from the current state, then decides the + * next state and if the next packet has to be transmitted or lost. 
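Both correlated loss generators (the 4-state chain whose switch follows, and loss_gilb_ell()) are small Markov chains stepped once per packet. As a feel for the behaviour, here is a hedged userspace simulation of the simpler two-state Gilbert-Elliot variant, with probabilities as doubles instead of the kernel's scaled u32 values and arbitrary example parameters:

#include <stdio.h>
#include <stdlib.h>

/* Two-state Gilbert-Elliot loss generator, mirroring loss_gilb_ell():
 * p   = P(good -> bad), r = P(bad -> good),
 * 1-k = loss probability in the good state, 1-h = loss prob in the bad state.
 */
struct gilbert { int bad; double p, r, k, h; };

static double frand(void) { return rand() / (double)RAND_MAX; }

static int ge_lose_packet(struct gilbert *g)
{
	if (g->bad) {
		if (frand() < g->r)
			g->bad = 0;
		return frand() > g->h;	/* lost with probability 1-h */
	}
	if (frand() < g->p)
		g->bad = 1;
	return frand() > g->k;		/* lost with probability 1-k */
}

int main(void)
{
	struct gilbert g = { .bad = 0, .p = 0.01, .r = 0.30, .k = 0.999, .h = 0.70 };
	long i, lost = 0, n = 1000000;
	double pbad = g.p / (g.p + g.r);
	double expect = (1.0 - pbad) * (1.0 - g.k) + pbad * (1.0 - g.h);

	for (i = 0; i < n; i++)
		lost += ge_lose_packet(&g);

	printf("simulated loss %.4f, expected ~%.4f\n", (double)lost / n, expect);
	return 0;
}

Losses come in bursts while the chain sits in the bad state, which is exactly what a single uncorrelated drop probability cannot reproduce.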
+ * The four states correspond to: + * TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period + * LOST_IN_BURST_PERIOD => isolated losses within a gap period + * LOST_IN_GAP_PERIOD => lost packets within a burst period + * TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period + */ + switch (clg->state) { + case TX_IN_GAP_PERIOD: + if (rnd < clg->a4) { + clg->state = LOST_IN_BURST_PERIOD; + return true; + } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) { + clg->state = LOST_IN_GAP_PERIOD; + return true; + } else if (clg->a1 + clg->a4 < rnd) { + clg->state = TX_IN_GAP_PERIOD; + } + + break; + case TX_IN_BURST_PERIOD: + if (rnd < clg->a5) { + clg->state = LOST_IN_GAP_PERIOD; + return true; + } else { + clg->state = TX_IN_BURST_PERIOD; + } + + break; + case LOST_IN_GAP_PERIOD: + if (rnd < clg->a3) + clg->state = TX_IN_BURST_PERIOD; + else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) { + clg->state = TX_IN_GAP_PERIOD; + } else if (clg->a2 + clg->a3 < rnd) { + clg->state = LOST_IN_GAP_PERIOD; + return true; + } + break; + case LOST_IN_BURST_PERIOD: + clg->state = TX_IN_GAP_PERIOD; + break; + } + + return false; +} + +/* loss_gilb_ell - Gilbert-Elliot model loss generator + * Generates losses according to the Gilbert-Elliot loss model or + * its special cases (Gilbert or Simple Gilbert) + * + * Makes a comparison between random number and the transition + * probabilities outgoing from the current state, then decides the + * next state. A second random number is extracted and the comparison + * with the loss probability of the current state decides if the next + * packet will be transmitted or lost. + */ +static bool loss_gilb_ell(struct netem_sched_data *q) +{ + struct clgstate *clg = &q->clg; + + switch (clg->state) { + case GOOD_STATE: + if (prandom_u32() < clg->a1) + clg->state = BAD_STATE; + if (prandom_u32() < clg->a4) + return true; + break; + case BAD_STATE: + if (prandom_u32() < clg->a2) + clg->state = GOOD_STATE; + if (prandom_u32() > clg->a3) + return true; + } + + return false; +} + +static bool loss_event(struct netem_sched_data *q) +{ + switch (q->loss_model) { + case CLG_RANDOM: + /* Random packet drop 0 => none, ~0 => all */ + return q->loss && q->loss >= get_crandom(&q->loss_cor); + + case CLG_4_STATES: + /* 4state loss model algorithm (used also for GI model) + * Extracts a value from the markov 4 state loss generator, + * if it is 1 drops a packet and if needed writes the event in + * the kernel logs + */ + return loss_4state(q); + + case CLG_GILB_ELL: + /* Gilbert-Elliot loss model algorithm + * Extracts a value from the Gilbert-Elliot loss generator, + * if it is 1 drops a packet and if needed writes the event in + * the kernel logs + */ + return loss_gilb_ell(q); + } + + return false; /* not reached */ +} + + /* tabledist - return a pseudo-randomly distributed value with mean mu and * std deviation sigma. Uses table lookup to approximate the desired * distribution, and a uniformly-distributed pseudo-random source. 
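In the table-less default case (just below), tabledist() falls back to spreading values uniformly over +/- sigma around mu, i.e. (rnd % (2*sigma)) - sigma + mu, so the measured standard deviation of that branch is sigma/sqrt(3) rather than sigma. A quick standalone check of the default branch:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Default (table-less) branch of tabledist(): uniform in [mu-sigma, mu+sigma). */
static long tabledist_uniform(long mu, long sigma)
{
	if (sigma == 0)
		return mu;
	return (rand() % (2 * sigma)) - sigma + mu;
}

int main(void)
{
	const long mu = 100000, sigma = 10000;	/* e.g. 100ms +/- 10ms, in us */
	const long n = 1000000;
	double sum = 0.0, sumsq = 0.0;
	long i;

	for (i = 0; i < n; i++) {
		long v = tabledist_uniform(mu, sigma);

		sum += v;
		sumsq += (double)v * v;
	}
	printf("mean %.0f (expect ~%ld), stddev %.0f (expect ~%.0f)\n",
	       sum / n, mu,
	       sqrt(sumsq / n - (sum / n) * (sum / n)), sigma / sqrt(3.0));
	return 0;
}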
*/ -static long tabledist(unsigned long mu, long sigma, - struct crndstate *state, const struct disttable *dist) +static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma, + struct crndstate *state, + const struct disttable *dist) { - long t, x; - unsigned long rnd; + psched_tdiff_t x; + long t; + u32 rnd; if (sigma == 0) return mu; @@ -127,7 +339,7 @@ static long tabledist(unsigned long mu, long sigma, rnd = get_crandom(state); /* default uniform distribution */ - if (dist == NULL) + if (dist == NULL) return (rnd % (2*sigma)) - sigma + mu; t = dist->table[rnd % dist->size]; @@ -140,6 +352,62 @@ static long tabledist(unsigned long mu, long sigma, return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; } +static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q) +{ + u64 ticks; + + len += q->packet_overhead; + + if (q->cell_size) { + u32 cells = reciprocal_divide(len, q->cell_size_reciprocal); + + if (len > cells * q->cell_size) /* extra cell needed for remainder */ + cells++; + len = cells * (q->cell_size + q->cell_overhead); + } + + ticks = (u64)len * NSEC_PER_SEC; + + do_div(ticks, q->rate); + return PSCHED_NS2TICKS(ticks); +} + +static void tfifo_reset(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct rb_node *p; + + while ((p = rb_first(&q->t_root))) { + struct sk_buff *skb = netem_rb_to_skb(p); + + rb_erase(p, &q->t_root); + skb->next = NULL; + skb->prev = NULL; + kfree_skb(skb); + } +} + +static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + psched_time_t tnext = netem_skb_cb(nskb)->time_to_send; + struct rb_node **p = &q->t_root.rb_node, *parent = NULL; + + while (*p) { + struct sk_buff *skb; + + parent = *p; + skb = netem_rb_to_skb(parent); + if (tnext >= netem_skb_cb(skb)->time_to_send) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(netem_rb_node(nskb), parent, p); + rb_insert_color(netem_rb_node(nskb), &q->t_root); + sch->q.qlen++; +} + /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. @@ -149,38 +417,45 @@ static long tabledist(unsigned long mu, long sigma, static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); - struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb; + /* We don't fill cb now as skb_unshare() may invalidate it */ + struct netem_skb_cb *cb; struct sk_buff *skb2; - int ret; int count = 1; - pr_debug("netem_enqueue skb=%p\n", skb); - /* Random duplication */ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) ++count; - /* Random packet drop 0 => none, ~0 => all */ - if (q->loss && q->loss >= get_crandom(&q->loss_cor)) - --count; - + /* Drop packet? */ + if (loss_event(q)) { + if (q->ecn && INET_ECN_set_ce(skb)) + sch->qstats.drops++; /* mark packet */ + else + --count; + } if (count == 0) { sch->qstats.drops++; kfree_skb(skb); - return NET_XMIT_DROP; + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } + /* If a delay is expected, orphan the skb. (orphaning usually takes + * place at TX completion time, so _before_ the link transit delay) + */ + if (q->latency || q->jitter) + skb_orphan_partial(skb); + /* * If we need to duplicate packet, then re-insert at top of the * qdisc tree, since parent queuer expects that only one * skb will be queued. 
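packet_len_2_sched_time() above turns the configured rate into a per-packet serialization delay: add the per-packet overhead, optionally round the length up to whole link-layer cells (ATM-style framing, for instance), then divide by the byte rate. A user-space sketch of the same calculation in plain nanoseconds; the reciprocal-divide optimization and the PSCHED tick conversion are left out, and the numbers in main() are only illustrative:

#include <stdint.h>
#include <stdio.h>

/* Return the transmission time in nanoseconds for one packet.
 * rate is in bytes per second, mirroring q->rate in the code above. */
static uint64_t tx_time_ns(unsigned int len, uint64_t rate,
			   int packet_overhead, unsigned int cell_size,
			   int cell_overhead)
{
	len += packet_overhead;

	if (cell_size) {
		/* round up to whole cells, each carrying its own overhead */
		unsigned int cells = (len + cell_size - 1) / cell_size;
		len = cells * (cell_size + cell_overhead);
	}

	return (uint64_t)len * 1000000000ull / rate;
}

int main(void)
{
	/* 1500-byte packet on a 1 Mbyte/s link carried in 48+5 byte cells */
	printf("%llu ns\n",
	       (unsigned long long)tx_time_ns(1500, 1000000, 0, 48, 5));
	return 0;
}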
*/ if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { - struct Qdisc *rootq = sch->dev->qdisc; + struct Qdisc *rootq = qdisc_root(sch); u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ q->duplicate = 0; - rootq->enqueue(skb2, rootq); + qdisc_enqueue_root(skb2, rootq); q->duplicate = dupsave; } @@ -191,73 +466,99 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) * do it now in software before we mangle it. */ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { - if (!(skb = skb_unshare(skb, GFP_ATOMIC)) - || (skb->ip_summed == CHECKSUM_HW - && skb_checksum_help(skb, 0))) { - sch->qstats.drops++; - return NET_XMIT_DROP; - } + if (!(skb = skb_unshare(skb, GFP_ATOMIC)) || + (skb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_help(skb))) + return qdisc_drop(skb, sch); - skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8); + skb->data[prandom_u32() % skb_headlen(skb)] ^= + 1<<(prandom_u32() % 8); } - if (q->gap == 0 /* not doing reordering */ - || q->counter < q->gap /* inside last reordering gap */ - || q->reorder < get_crandom(&q->reorder_cor)) { + if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) + return qdisc_reshape_fail(skb, sch); + + sch->qstats.backlog += qdisc_pkt_len(skb); + + cb = netem_skb_cb(skb); + if (q->gap == 0 || /* not doing reordering */ + q->counter < q->gap - 1 || /* inside last reordering gap */ + q->reorder < get_crandom(&q->reorder_cor)) { psched_time_t now; psched_tdiff_t delay; delay = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist); - PSCHED_GET_TIME(now); - PSCHED_TADD2(now, delay, cb->time_to_send); + now = psched_get_time(); + + if (q->rate) { + struct sk_buff *last; + + if (!skb_queue_empty(&sch->q)) + last = skb_peek_tail(&sch->q); + else + last = netem_rb_to_skb(rb_last(&q->t_root)); + if (last) { + /* + * Last packet in queue is reference point (now), + * calculate this time bonus and subtract + * from delay. + */ + delay -= netem_skb_cb(last)->time_to_send - now; + delay = max_t(psched_tdiff_t, 0, delay); + now = netem_skb_cb(last)->time_to_send; + } + + delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q); + } + + cb->time_to_send = now + delay; + cb->tstamp_save = skb->tstamp; ++q->counter; - ret = q->qdisc->enqueue(skb, q->qdisc); + tfifo_enqueue(skb, sch); } else { - /* + /* * Do re-ordering by putting one out of N packets at the front * of the queue. 
*/ - PSCHED_GET_TIME(cb->time_to_send); + cb->time_to_send = psched_get_time(); q->counter = 0; - ret = q->qdisc->ops->requeue(skb, q->qdisc); - } - - if (likely(ret == NET_XMIT_SUCCESS)) { - sch->q.qlen++; - sch->bstats.bytes += skb->len; - sch->bstats.packets++; - } else - sch->qstats.drops++; - pr_debug("netem: enqueue ret %d\n", ret); - return ret; -} - -/* Requeue packets but don't change time stamp */ -static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch) -{ - struct netem_sched_data *q = qdisc_priv(sch); - int ret; - - if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) { - sch->q.qlen++; + __skb_queue_head(&sch->q, skb); sch->qstats.requeues++; } - return ret; + return NET_XMIT_SUCCESS; } -static unsigned int netem_drop(struct Qdisc* sch) +static unsigned int netem_drop(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); unsigned int len; - if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) { - sch->q.qlen--; - sch->qstats.drops++; + len = qdisc_queue_drop(sch); + + if (!len) { + struct rb_node *p = rb_first(&q->t_root); + + if (p) { + struct sk_buff *skb = netem_rb_to_skb(p); + + rb_erase(p, &q->t_root); + sch->q.qlen--; + skb->next = NULL; + skb->prev = NULL; + len = qdisc_pkt_len(skb); + sch->qstats.backlog -= len; + kfree_skb(skb); + } } + if (!len && q->qdisc && q->qdisc->ops->drop) + len = q->qdisc->ops->drop(q->qdisc); + if (len) + sch->qstats.drops++; + return len; } @@ -265,301 +566,334 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; + struct rb_node *p; - skb = q->qdisc->dequeue(q->qdisc); + if (qdisc_is_throttled(sch)) + return NULL; + +tfifo_dequeue: + skb = __skb_dequeue(&sch->q); if (skb) { - const struct netem_skb_cb *cb - = (const struct netem_skb_cb *)skb->cb; - psched_time_t now; +deliver: + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_unthrottled(sch); + qdisc_bstats_update(sch, skb); + return skb; + } + p = rb_first(&q->t_root); + if (p) { + psched_time_t time_to_send; + + skb = netem_rb_to_skb(p); /* if more time remaining? */ - PSCHED_GET_TIME(now); + time_to_send = netem_skb_cb(skb)->time_to_send; + if (time_to_send <= psched_get_time()) { + rb_erase(p, &q->t_root); - if (PSCHED_TLESS(cb->time_to_send, now)) { - pr_debug("netem_dequeue: return skb=%p\n", skb); sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; - return skb; - } else { - psched_tdiff_t delay = PSCHED_TDIFF(cb->time_to_send, now); - - if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) { - sch->qstats.drops++; - - /* After this qlen is confused */ - printk(KERN_ERR "netem: queue discpline %s could not requeue\n", - q->qdisc->ops->id); - - sch->q.qlen--; + skb->next = NULL; + skb->prev = NULL; + skb->tstamp = netem_skb_cb(skb)->tstamp_save; + +#ifdef CONFIG_NET_CLS_ACT + /* + * If it's at ingress let's pretend the delay is + * from the network (tstamp will be updated). 
+ */ + if (G_TC_FROM(skb->tc_verd) & AT_INGRESS) + skb->tstamp.tv64 = 0; +#endif + + if (q->qdisc) { + int err = qdisc_enqueue(skb, q->qdisc); + + if (unlikely(err != NET_XMIT_SUCCESS)) { + if (net_xmit_drop_count(err)) { + sch->qstats.drops++; + qdisc_tree_decrease_qlen(sch, 1); + } + } + goto tfifo_dequeue; } + goto deliver; + } - mod_timer(&q->timer, jiffies + PSCHED_US2JIFFIE(delay)); - sch->flags |= TCQ_F_THROTTLED; + if (q->qdisc) { + skb = q->qdisc->ops->dequeue(q->qdisc); + if (skb) + goto deliver; } + qdisc_watchdog_schedule(&q->watchdog, time_to_send); } + if (q->qdisc) { + skb = q->qdisc->ops->dequeue(q->qdisc); + if (skb) + goto deliver; + } return NULL; } -static void netem_watchdog(unsigned long arg) -{ - struct Qdisc *sch = (struct Qdisc *)arg; - - pr_debug("netem_watchdog qlen=%d\n", sch->q.qlen); - sch->flags &= ~TCQ_F_THROTTLED; - netif_schedule(sch->dev); -} - static void netem_reset(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); - qdisc_reset(q->qdisc); - sch->q.qlen = 0; - sch->flags &= ~TCQ_F_THROTTLED; - del_timer_sync(&q->timer); + qdisc_reset_queue(sch); + tfifo_reset(sch); + if (q->qdisc) + qdisc_reset(q->qdisc); + qdisc_watchdog_cancel(&q->watchdog); } -/* Pass size change message down to embedded FIFO */ -static int set_fifo_limit(struct Qdisc *q, int limit) +static void dist_free(struct disttable *d) { - struct rtattr *rta; - int ret = -ENOMEM; - - /* Hack to avoid sending change message to non-FIFO */ - if (strncmp(q->ops->id + 1, "fifo", 4) != 0) - return 0; - - rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); - if (rta) { - rta->rta_type = RTM_NEWQDISC; - rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt)); - ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; - - ret = q->ops->change(q, rta); - kfree(rta); - } - return ret; + kvfree(d); } /* * Distribution data is a variable size payload containing * signed 16 bit values. 
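The distribution table loaded here feeds tabledist() further up: with no table the delay is uniform in [mu - sigma, mu + sigma); with a table, a random signed 16-bit entry t scales sigma, giving roughly mu + sigma * t / NETEM_DIST_SCALE (8192 in the kernel's pkt_sched.h). A small sketch of that sampling step; the three-entry table and the xorshift source are only for illustration, since real tables typically come from iproute2's distribution files and hold thousands of entries:

#include <stdint.h>
#include <stdio.h>

#define DIST_SCALE 8192   /* mirrors NETEM_DIST_SCALE */

static uint32_t u32rand(void)                   /* stand-in for get_crandom() */
{
	static uint32_t x = 2463534242u;
	x ^= x << 13; x ^= x >> 17; x ^= x << 5;
	return x;
}

/* Approximate form of tabledist(): mean mu, spread sigma, optional table. */
static long tabledist(long mu, long sigma, const int16_t *table, int size)
{
	uint32_t rnd;

	if (sigma == 0)
		return mu;

	rnd = u32rand();
	if (!table)                     /* uniform in [mu - sigma, mu + sigma) */
		return (long)(rnd % (2 * sigma)) - sigma + mu;

	/* table entries are samples of the target distribution, scaled by DIST_SCALE */
	return mu + (sigma * (long)table[rnd % size]) / DIST_SCALE;
}

int main(void)
{
	/* toy "distribution": most mass near the mean, one heavy positive tail sample */
	static const int16_t table[] = { -4096, 0, 12288 };

	for (int i = 0; i < 5; i++)
		printf("uniform: %ld us   table: %ld us\n",
		       tabledist(100000, 20000, NULL, 0),
		       tabledist(100000, 20000, table, 3));
	return 0;
}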
*/ -static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr) +static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) { struct netem_sched_data *q = qdisc_priv(sch); - unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16); - const __s16 *data = RTA_DATA(attr); + size_t n = nla_len(attr)/sizeof(__s16); + const __s16 *data = nla_data(attr); + spinlock_t *root_lock; struct disttable *d; int i; + size_t s; - if (n > 65536) + if (n > NETEM_DIST_MAX) return -EINVAL; - d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL); + s = sizeof(struct disttable) + n * sizeof(s16); + d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN); + if (!d) + d = vmalloc(s); if (!d) return -ENOMEM; d->size = n; for (i = 0; i < n; i++) d->table[i] = data[i]; - - spin_lock_bh(&sch->dev->queue_lock); - d = xchg(&q->delay_dist, d); - spin_unlock_bh(&sch->dev->queue_lock); - kfree(d); + root_lock = qdisc_root_sleeping_lock(sch); + + spin_lock_bh(root_lock); + swap(q->delay_dist, d); + spin_unlock_bh(root_lock); + + dist_free(d); return 0; } -static int get_correlation(struct Qdisc *sch, const struct rtattr *attr) +static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); - const struct tc_netem_corr *c = RTA_DATA(attr); - - if (RTA_PAYLOAD(attr) != sizeof(*c)) - return -EINVAL; + const struct tc_netem_corr *c = nla_data(attr); init_crandom(&q->delay_cor, c->delay_corr); init_crandom(&q->loss_cor, c->loss_corr); init_crandom(&q->dup_cor, c->dup_corr); - return 0; } -static int get_reorder(struct Qdisc *sch, const struct rtattr *attr) +static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); - const struct tc_netem_reorder *r = RTA_DATA(attr); - - if (RTA_PAYLOAD(attr) != sizeof(*r)) - return -EINVAL; + const struct tc_netem_reorder *r = nla_data(attr); q->reorder = r->probability; init_crandom(&q->reorder_cor, r->correlation); - return 0; } -static int get_corrupt(struct Qdisc *sch, const struct rtattr *attr) +static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); - const struct tc_netem_corrupt *r = RTA_DATA(attr); - - if (RTA_PAYLOAD(attr) != sizeof(*r)) - return -EINVAL; + const struct tc_netem_corrupt *r = nla_data(attr); q->corrupt = r->probability; init_crandom(&q->corrupt_cor, r->correlation); - return 0; } -/* Parse netlink message to set options */ -static int netem_change(struct Qdisc *sch, struct rtattr *opt) +static void get_rate(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); - struct tc_netem_qopt *qopt; - int ret; - - if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) - return -EINVAL; + const struct tc_netem_rate *r = nla_data(attr); + + q->rate = r->rate; + q->packet_overhead = r->packet_overhead; + q->cell_size = r->cell_size; + q->cell_overhead = r->cell_overhead; + if (q->cell_size) + q->cell_size_reciprocal = reciprocal_value(q->cell_size); + else + q->cell_size_reciprocal = (struct reciprocal_value) { 0 }; +} - qopt = RTA_DATA(opt); - ret = set_fifo_limit(q->qdisc, qopt->limit); - if (ret) { - pr_debug("netem: can't set fifo limit\n"); - return ret; - } - - q->latency = qopt->latency; - q->jitter = qopt->jitter; - q->limit = qopt->limit; - q->gap = qopt->gap; - q->counter = 0; - q->loss = qopt->loss; - q->duplicate = qopt->duplicate; +static int get_loss_clg(struct netem_sched_data *q, const struct nlattr 
*attr) +{ + const struct nlattr *la; + int rem; - /* for compatiablity with earlier versions. - * if gap is set, need to assume 100% probablity - */ - q->reorder = ~0; - - /* Handle nested options after initial queue options. - * Should have put all options in nested format but too late now. - */ - if (RTA_PAYLOAD(opt) > sizeof(*qopt)) { - struct rtattr *tb[TCA_NETEM_MAX]; - if (rtattr_parse(tb, TCA_NETEM_MAX, - RTA_DATA(opt) + sizeof(*qopt), - RTA_PAYLOAD(opt) - sizeof(*qopt))) - return -EINVAL; + nla_for_each_nested(la, attr, rem) { + u16 type = nla_type(la); - if (tb[TCA_NETEM_CORR-1]) { - ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]); - if (ret) - return ret; - } + switch (type) { + case NETEM_LOSS_GI: { + const struct tc_netem_gimodel *gi = nla_data(la); + + if (nla_len(la) < sizeof(struct tc_netem_gimodel)) { + pr_info("netem: incorrect gi model size\n"); + return -EINVAL; + } - if (tb[TCA_NETEM_DELAY_DIST-1]) { - ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]); - if (ret) - return ret; + q->loss_model = CLG_4_STATES; + + q->clg.state = TX_IN_GAP_PERIOD; + q->clg.a1 = gi->p13; + q->clg.a2 = gi->p31; + q->clg.a3 = gi->p32; + q->clg.a4 = gi->p14; + q->clg.a5 = gi->p23; + break; } - if (tb[TCA_NETEM_REORDER-1]) { - ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]); - if (ret) - return ret; + case NETEM_LOSS_GE: { + const struct tc_netem_gemodel *ge = nla_data(la); + + if (nla_len(la) < sizeof(struct tc_netem_gemodel)) { + pr_info("netem: incorrect ge model size\n"); + return -EINVAL; + } + + q->loss_model = CLG_GILB_ELL; + q->clg.state = GOOD_STATE; + q->clg.a1 = ge->p; + q->clg.a2 = ge->r; + q->clg.a3 = ge->h; + q->clg.a4 = ge->k1; + break; } - if (tb[TCA_NETEM_CORRUPT-1]) { - ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT-1]); - if (ret) - return ret; + default: + pr_info("netem: unknown loss type %u\n", type); + return -EINVAL; } } return 0; } -/* - * Special case version of FIFO queue for use by netem. 
- * It queues in order based on timestamps in skb's - */ -struct fifo_sched_data { - u32 limit; +static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { + [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) }, + [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) }, + [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) }, + [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) }, + [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, + [TCA_NETEM_ECN] = { .type = NLA_U32 }, + [TCA_NETEM_RATE64] = { .type = NLA_U64 }, }; -static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) +static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, + const struct nla_policy *policy, int len) { - struct fifo_sched_data *q = qdisc_priv(sch); - struct sk_buff_head *list = &sch->q; - const struct netem_skb_cb *ncb - = (const struct netem_skb_cb *)nskb->cb; - struct sk_buff *skb; + int nested_len = nla_len(nla) - NLA_ALIGN(len); - if (likely(skb_queue_len(list) < q->limit)) { - skb_queue_reverse_walk(list, skb) { - const struct netem_skb_cb *cb - = (const struct netem_skb_cb *)skb->cb; + if (nested_len < 0) { + pr_info("netem: invalid attributes len %d\n", nested_len); + return -EINVAL; + } - if (!PSCHED_TLESS(ncb->time_to_send, cb->time_to_send)) - break; - } + if (nested_len >= nla_attr_size(0)) + return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len), + nested_len, policy); - __skb_queue_after(list, skb, nskb); + memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); + return 0; +} - sch->qstats.backlog += nskb->len; - sch->bstats.bytes += nskb->len; - sch->bstats.packets++; +/* Parse netlink message to set options */ +static int netem_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_NETEM_MAX + 1]; + struct tc_netem_qopt *qopt; + struct clgstate old_clg; + int old_loss_model = CLG_RANDOM; + int ret; - return NET_XMIT_SUCCESS; + if (opt == NULL) + return -EINVAL; + + qopt = nla_data(opt); + ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt)); + if (ret < 0) + return ret; + + /* backup q->clg and q->loss_model */ + old_clg = q->clg; + old_loss_model = q->loss_model; + + if (tb[TCA_NETEM_LOSS]) { + ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]); + if (ret) { + q->loss_model = old_loss_model; + return ret; + } + } else { + q->loss_model = CLG_RANDOM; } - return qdisc_drop(nskb, sch); -} + if (tb[TCA_NETEM_DELAY_DIST]) { + ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); + if (ret) { + /* recover clg and loss_model, in case of + * q->clg and q->loss_model were modified + * in get_loss_clg() + */ + q->clg = old_clg; + q->loss_model = old_loss_model; + return ret; + } + } -static int tfifo_init(struct Qdisc *sch, struct rtattr *opt) -{ - struct fifo_sched_data *q = qdisc_priv(sch); + sch->limit = qopt->limit; - if (opt) { - struct tc_fifo_qopt *ctl = RTA_DATA(opt); - if (RTA_PAYLOAD(opt) < sizeof(*ctl)) - return -EINVAL; + q->latency = qopt->latency; + q->jitter = qopt->jitter; + q->limit = qopt->limit; + q->gap = qopt->gap; + q->counter = 0; + q->loss = qopt->loss; + q->duplicate = qopt->duplicate; - q->limit = ctl->limit; - } else - q->limit = max_t(u32, sch->dev->tx_queue_len, 1); + /* for compatibility with earlier versions. 
+ * if gap is set, need to assume 100% probability + */ + if (q->gap) + q->reorder = ~0; - return 0; -} + if (tb[TCA_NETEM_CORR]) + get_correlation(q, tb[TCA_NETEM_CORR]); -static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb) -{ - struct fifo_sched_data *q = qdisc_priv(sch); - struct tc_fifo_qopt opt = { .limit = q->limit }; + if (tb[TCA_NETEM_REORDER]) + get_reorder(q, tb[TCA_NETEM_REORDER]); - RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); - return skb->len; + if (tb[TCA_NETEM_CORRUPT]) + get_corrupt(q, tb[TCA_NETEM_CORRUPT]); -rtattr_failure: - return -1; -} + if (tb[TCA_NETEM_RATE]) + get_rate(q, tb[TCA_NETEM_RATE]); -static struct Qdisc_ops tfifo_qdisc_ops = { - .id = "tfifo", - .priv_size = sizeof(struct fifo_sched_data), - .enqueue = tfifo_enqueue, - .dequeue = qdisc_dequeue_head, - .requeue = qdisc_requeue, - .drop = qdisc_queue_drop, - .init = tfifo_init, - .reset = qdisc_reset_queue, - .change = tfifo_init, - .dump = tfifo_dump, -}; + if (tb[TCA_NETEM_RATE64]) + q->rate = max_t(u64, q->rate, + nla_get_u64(tb[TCA_NETEM_RATE64])); -static int netem_init(struct Qdisc *sch, struct rtattr *opt) + if (tb[TCA_NETEM_ECN]) + q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]); + + return ret; +} + +static int netem_init(struct Qdisc *sch, struct nlattr *opt) { struct netem_sched_data *q = qdisc_priv(sch); int ret; @@ -567,21 +901,12 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt) if (!opt) return -EINVAL; - init_timer(&q->timer); - q->timer.function = netem_watchdog; - q->timer.data = (unsigned long) sch; - - q->qdisc = qdisc_create_dflt(sch->dev, &tfifo_qdisc_ops); - if (!q->qdisc) { - pr_debug("netem: qdisc create failed\n"); - return -ENOMEM; - } + qdisc_watchdog_init(&q->watchdog, sch); + q->loss_model = CLG_RANDOM; ret = netem_change(sch, opt); - if (ret) { - pr_debug("netem: change failed\n"); - qdisc_destroy(q->qdisc); - } + if (ret) + pr_info("netem: change failed\n"); return ret; } @@ -589,20 +914,71 @@ static void netem_destroy(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); - del_timer_sync(&q->timer); - qdisc_destroy(q->qdisc); - kfree(q->delay_dist); + qdisc_watchdog_cancel(&q->watchdog); + if (q->qdisc) + qdisc_destroy(q->qdisc); + dist_free(q->delay_dist); +} + +static int dump_loss_model(const struct netem_sched_data *q, + struct sk_buff *skb) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_NETEM_LOSS); + if (nest == NULL) + goto nla_put_failure; + + switch (q->loss_model) { + case CLG_RANDOM: + /* legacy loss model */ + nla_nest_cancel(skb, nest); + return 0; /* no data */ + + case CLG_4_STATES: { + struct tc_netem_gimodel gi = { + .p13 = q->clg.a1, + .p31 = q->clg.a2, + .p32 = q->clg.a3, + .p14 = q->clg.a4, + .p23 = q->clg.a5, + }; + + if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi)) + goto nla_put_failure; + break; + } + case CLG_GILB_ELL: { + struct tc_netem_gemodel ge = { + .p = q->clg.a1, + .r = q->clg.a2, + .h = q->clg.a3, + .k1 = q->clg.a4, + }; + + if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge)) + goto nla_put_failure; + break; + } + } + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; } static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) { const struct netem_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb->tail; - struct rtattr *rta = (struct rtattr *) b; + struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb); struct tc_netem_qopt qopt; struct tc_netem_corr cor; struct tc_netem_reorder reorder; struct tc_netem_corrupt corrupt; + struct 
tc_netem_rate rate; qopt.latency = q->latency; qopt.jitter = q->jitter; @@ -610,27 +986,48 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) qopt.loss = q->loss; qopt.gap = q->gap; qopt.duplicate = q->duplicate; - RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) + goto nla_put_failure; cor.delay_corr = q->delay_cor.rho; cor.loss_corr = q->loss_cor.rho; cor.dup_corr = q->dup_cor.rho; - RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor); + if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor)) + goto nla_put_failure; reorder.probability = q->reorder; reorder.correlation = q->reorder_cor.rho; - RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder); + if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder)) + goto nla_put_failure; corrupt.probability = q->corrupt; corrupt.correlation = q->corrupt_cor.rho; - RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); + if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt)) + goto nla_put_failure; - rta->rta_len = skb->tail - b; + if (q->rate >= (1ULL << 32)) { + if (nla_put_u64(skb, TCA_NETEM_RATE64, q->rate)) + goto nla_put_failure; + rate.rate = ~0U; + } else { + rate.rate = q->rate; + } + rate.packet_overhead = q->packet_overhead; + rate.cell_size = q->cell_size; + rate.cell_overhead = q->cell_overhead; + if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate)) + goto nla_put_failure; - return skb->len; + if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn)) + goto nla_put_failure; -rtattr_failure: - skb_trim(skb, b - skb->data); + if (dump_loss_model(q, skb) != 0) + goto nla_put_failure; + + return nla_nest_end(skb, nla); + +nla_put_failure: + nlmsg_trim(skb, nla); return -1; } @@ -639,7 +1036,7 @@ static int netem_dump_class(struct Qdisc *sch, unsigned long cl, { struct netem_sched_data *q = qdisc_priv(sch); - if (cl != 1) /* only one class */ + if (cl != 1 || !q->qdisc) /* only one class */ return -ENOENT; tcm->tcm_handle |= TC_H_MIN(1); @@ -653,13 +1050,13 @@ static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, { struct netem_sched_data *q = qdisc_priv(sch); - if (new == NULL) - new = &noop_qdisc; - sch_tree_lock(sch); - *old = xchg(&q->qdisc, new); - qdisc_reset(*old); - sch->q.qlen = 0; + *old = q->qdisc; + q->qdisc = new; + if (*old) { + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + } sch_tree_unlock(sch); return 0; @@ -680,17 +1077,6 @@ static void netem_put(struct Qdisc *sch, unsigned long arg) { } -static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - struct rtattr **tca, unsigned long *arg) -{ - return -ENOSYS; -} - -static int netem_delete(struct Qdisc *sch, unsigned long arg) -{ - return -ENOSYS; -} - static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { @@ -703,30 +1089,22 @@ static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) } } -static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl) -{ - return NULL; -} - -static struct Qdisc_class_ops netem_class_ops = { +static const struct Qdisc_class_ops netem_class_ops = { .graft = netem_graft, .leaf = netem_leaf, .get = netem_get, .put = netem_put, - .change = netem_change_class, - .delete = netem_delete, .walk = netem_walk, - .tcf_chain = netem_find_tcf, .dump = netem_dump_class, }; -static struct Qdisc_ops netem_qdisc_ops = { +static struct Qdisc_ops netem_qdisc_ops __read_mostly = { .id = "netem", .cl_ops = &netem_class_ops, .priv_size = 
sizeof(struct netem_sched_data), .enqueue = netem_enqueue, .dequeue = netem_dequeue, - .requeue = netem_requeue, + .peek = qdisc_peek_dequeued, .drop = netem_drop, .init = netem_init, .reset = netem_reset, diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c new file mode 100644 index 00000000000..fefeeb73f15 --- /dev/null +++ b/net/sched/sch_pie.c @@ -0,0 +1,566 @@ +/* Copyright (C) 2013 Cisco Systems, Inc, 2013. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Author: Vijay Subramanian <vijaynsu@cisco.com> + * Author: Mythili Prabhu <mysuryan@cisco.com> + * + * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no> + * University of Oslo, Norway. + * + * References: + * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00 + * IEEE Conference on High Performance Switching and Routing 2013 : + * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem" + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <net/pkt_sched.h> +#include <net/inet_ecn.h> + +#define QUEUE_THRESHOLD 10000 +#define DQCOUNT_INVALID -1 +#define MAX_PROB 0xffffffff +#define PIE_SCALE 8 + +/* parameters used */ +struct pie_params { + psched_time_t target; /* user specified target delay in pschedtime */ + u32 tupdate; /* timer frequency (in jiffies) */ + u32 limit; /* number of packets that can be enqueued */ + u32 alpha; /* alpha and beta are between 0 and 32 */ + u32 beta; /* and are used for shift relative to 1 */ + bool ecn; /* true if ecn is enabled */ + bool bytemode; /* to scale drop early prob based on pkt size */ +}; + +/* variables used */ +struct pie_vars { + u32 prob; /* probability but scaled by u32 limit. 
*/ + psched_time_t burst_time; + psched_time_t qdelay; + psched_time_t qdelay_old; + u64 dq_count; /* measured in bytes */ + psched_time_t dq_tstamp; /* drain rate */ + u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ + u32 qlen_old; /* in bytes */ +}; + +/* statistics gathering */ +struct pie_stats { + u32 packets_in; /* total number of packets enqueued */ + u32 dropped; /* packets dropped due to pie_action */ + u32 overlimit; /* dropped due to lack of space in queue */ + u32 maxq; /* maximum queue size */ + u32 ecn_mark; /* packets marked with ECN */ +}; + +/* private data for the Qdisc */ +struct pie_sched_data { + struct pie_params params; + struct pie_vars vars; + struct pie_stats stats; + struct timer_list adapt_timer; +}; + +static void pie_params_init(struct pie_params *params) +{ + params->alpha = 2; + params->beta = 20; + params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */ + params->limit = 1000; /* default of 1000 packets */ + params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */ + params->ecn = false; + params->bytemode = false; +} + +static void pie_vars_init(struct pie_vars *vars) +{ + vars->dq_count = DQCOUNT_INVALID; + vars->avg_dq_rate = 0; + /* default of 100 ms in pschedtime */ + vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC); +} + +static bool drop_early(struct Qdisc *sch, u32 packet_size) +{ + struct pie_sched_data *q = qdisc_priv(sch); + u32 rnd; + u32 local_prob = q->vars.prob; + u32 mtu = psched_mtu(qdisc_dev(sch)); + + /* If there is still burst allowance left skip random early drop */ + if (q->vars.burst_time > 0) + return false; + + /* If current delay is less than half of target, and + * if drop prob is low already, disable early_drop + */ + if ((q->vars.qdelay < q->params.target / 2) + && (q->vars.prob < MAX_PROB / 5)) + return false; + + /* If we have fewer than 2 mtu-sized packets, disable drop_early, + * similar to min_th in RED + */ + if (sch->qstats.backlog < 2 * mtu) + return false; + + /* If bytemode is turned on, use packet size to compute new + * probablity. Smaller packets will have lower drop prob in this case + */ + if (q->params.bytemode && packet_size <= mtu) + local_prob = (local_prob / mtu) * packet_size; + else + local_prob = q->vars.prob; + + rnd = prandom_u32(); + if (rnd < local_prob) + return true; + + return false; +} + +static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct pie_sched_data *q = qdisc_priv(sch); + bool enqueue = false; + + if (unlikely(qdisc_qlen(sch) >= sch->limit)) { + q->stats.overlimit++; + goto out; + } + + if (!drop_early(sch, skb->len)) { + enqueue = true; + } else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) && + INET_ECN_set_ce(skb)) { + /* If packet is ecn capable, mark it if drop probability + * is lower than 10%, else drop it. 
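drop_early() above is the random-drop decision: once the safeguards pass (burst allowance spent, delay or probability not too low, queue at least two MTUs deep), the current drop probability is compared against a fresh random draw, optionally scaled down for small packets in byte mode. A condensed user-space sketch of just that comparison; MAX_PROB and the scaling mirror the code above, while the random source and the demo values are stand-ins:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_PROB 0xffffffffu

static uint32_t u32rand(void)                   /* xorshift32 stand-in for prandom_u32() */
{
	static uint32_t x = 88172645u;
	x ^= x << 13; x ^= x >> 17; x ^= x << 5;
	return x;
}

/* prob is the current PIE drop probability scaled to 0..MAX_PROB */
static bool drop_early(uint32_t prob, uint32_t packet_size, uint32_t mtu,
		       bool bytemode)
{
	uint32_t local_prob = prob;

	/* byte mode: a packet half the MTU long is dropped half as often */
	if (bytemode && packet_size <= mtu)
		local_prob = (prob / mtu) * packet_size;

	return u32rand() < local_prob;
}

int main(void)
{
	uint32_t prob = MAX_PROB / 10;   /* 10% drop probability */
	int drops_full = 0, drops_small = 0, n = 100000;

	for (int i = 0; i < n; i++) {
		drops_full  += drop_early(prob, 1500, 1500, true);
		drops_small += drop_early(prob,   64, 1500, true);
	}
	printf("1500B: %d drops, 64B: %d drops (of %d)\n",
	       drops_full, drops_small, n);
	return 0;
}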
+ */ + q->stats.ecn_mark++; + enqueue = true; + } + + /* we can enqueue the packet */ + if (enqueue) { + q->stats.packets_in++; + if (qdisc_qlen(sch) > q->stats.maxq) + q->stats.maxq = qdisc_qlen(sch); + + return qdisc_enqueue_tail(skb, sch); + } + +out: + q->stats.dropped++; + return qdisc_drop(skb, sch); +} + +static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { + [TCA_PIE_TARGET] = {.type = NLA_U32}, + [TCA_PIE_LIMIT] = {.type = NLA_U32}, + [TCA_PIE_TUPDATE] = {.type = NLA_U32}, + [TCA_PIE_ALPHA] = {.type = NLA_U32}, + [TCA_PIE_BETA] = {.type = NLA_U32}, + [TCA_PIE_ECN] = {.type = NLA_U32}, + [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, +}; + +static int pie_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct pie_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_PIE_MAX + 1]; + unsigned int qlen; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + + /* convert from microseconds to pschedtime */ + if (tb[TCA_PIE_TARGET]) { + /* target is in us */ + u32 target = nla_get_u32(tb[TCA_PIE_TARGET]); + + /* convert to pschedtime */ + q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); + } + + /* tupdate is in jiffies */ + if (tb[TCA_PIE_TUPDATE]) + q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); + + if (tb[TCA_PIE_LIMIT]) { + u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]); + + q->params.limit = limit; + sch->limit = limit; + } + + if (tb[TCA_PIE_ALPHA]) + q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]); + + if (tb[TCA_PIE_BETA]) + q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]); + + if (tb[TCA_PIE_ECN]) + q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]); + + if (tb[TCA_PIE_BYTEMODE]) + q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]); + + /* Drop excess packets if new limit is lower */ + qlen = sch->q.qlen; + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = __skb_dequeue(&sch->q); + + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_drop(skb, sch); + } + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + + sch_tree_unlock(sch); + return 0; +} + +static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) +{ + + struct pie_sched_data *q = qdisc_priv(sch); + int qlen = sch->qstats.backlog; /* current queue size in bytes */ + + /* If current queue is about 10 packets or more and dq_count is unset + * we have enough packets to calculate the drain rate. Save + * current time as dq_tstamp and start measurement cycle. + */ + if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) { + q->vars.dq_tstamp = psched_get_time(); + q->vars.dq_count = 0; + } + + /* Calculate the average drain rate from this value. If queue length + * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset + * the dq_count to -1 as we don't have enough packets to calculate the + * drain rate anymore The following if block is entered only when we + * have a substantial queue built up (QUEUE_THRESHOLD bytes or more) + * and we calculate the drain rate for the threshold here. dq_count is + * in bytes, time difference in psched_time, hence rate is in + * bytes/psched_time. 
+ */ + if (q->vars.dq_count != DQCOUNT_INVALID) { + q->vars.dq_count += skb->len; + + if (q->vars.dq_count >= QUEUE_THRESHOLD) { + psched_time_t now = psched_get_time(); + u32 dtime = now - q->vars.dq_tstamp; + u32 count = q->vars.dq_count << PIE_SCALE; + + if (dtime == 0) + return; + + count = count / dtime; + + if (q->vars.avg_dq_rate == 0) + q->vars.avg_dq_rate = count; + else + q->vars.avg_dq_rate = + (q->vars.avg_dq_rate - + (q->vars.avg_dq_rate >> 3)) + (count >> 3); + + /* If the queue has receded below the threshold, we hold + * on to the last drain rate calculated, else we reset + * dq_count to 0 to re-enter the if block when the next + * packet is dequeued + */ + if (qlen < QUEUE_THRESHOLD) + q->vars.dq_count = DQCOUNT_INVALID; + else { + q->vars.dq_count = 0; + q->vars.dq_tstamp = psched_get_time(); + } + + if (q->vars.burst_time > 0) { + if (q->vars.burst_time > dtime) + q->vars.burst_time -= dtime; + else + q->vars.burst_time = 0; + } + } + } +} + +static void calculate_probability(struct Qdisc *sch) +{ + struct pie_sched_data *q = qdisc_priv(sch); + u32 qlen = sch->qstats.backlog; /* queue size in bytes */ + psched_time_t qdelay = 0; /* in pschedtime */ + psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */ + s32 delta = 0; /* determines the change in probability */ + u32 oldprob; + u32 alpha, beta; + bool update_prob = true; + + q->vars.qdelay_old = q->vars.qdelay; + + if (q->vars.avg_dq_rate > 0) + qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate; + else + qdelay = 0; + + /* If qdelay is zero and qlen is not, it means qlen is very small, less + * than dequeue_rate, so we do not update probabilty in this round + */ + if (qdelay == 0 && qlen != 0) + update_prob = false; + + /* In the algorithm, alpha and beta are between 0 and 2 with typical + * value for alpha as 0.125. In this implementation, we use values 0-32 + * passed from user space to represent this. Also, alpha and beta have + * unit of HZ and need to be scaled before they can used to update + * probability. alpha/beta are updated locally below by 1) scaling them + * appropriately 2) scaling down by 16 to come to 0-2 range. + * Please see paper for details. + * + * We scale alpha and beta differently depending on whether we are in + * light, medium or high dropping mode. 
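The avg_dq_rate update above is a 1/8-weight exponentially weighted moving average: each new drain-rate measurement contributes one eighth, the running average keeps seven eighths, and the very first measurement seeds the average directly. A small sketch showing how the estimate tracks a rate change; the values are plain numbers rather than psched-scaled units:

#include <stdint.h>
#include <stdio.h>

/* Fold a new measurement into the running average, as in the code above:
 * avg = avg - avg/8 + sample/8  (integer arithmetic, weight 1/8). */
static uint32_t ewma8(uint32_t avg, uint32_t sample)
{
	return (avg - (avg >> 3)) + (sample >> 3);
}

int main(void)
{
	uint32_t avg = 0;

	/* drain rate jumps from 1000 to 4000 "bytes per tick" after 5 samples */
	for (int i = 0; i < 10; i++) {
		uint32_t sample = (i < 5) ? 1000 : 4000;
		avg = avg ? ewma8(avg, sample) : sample;   /* first sample seeds avg */
		printf("sample %2d: avg_dq_rate = %u\n", i, avg);
	}
	return 0;
}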
+ */ + if (q->vars.prob < MAX_PROB / 100) { + alpha = + (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; + beta = + (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; + } else if (q->vars.prob < MAX_PROB / 10) { + alpha = + (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; + beta = + (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; + } else { + alpha = + (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + beta = + (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + } + + /* alpha and beta should be between 0 and 32, in multiples of 1/16 */ + delta += alpha * ((qdelay - q->params.target)); + delta += beta * ((qdelay - qdelay_old)); + + oldprob = q->vars.prob; + + /* to ensure we increase probability in steps of no more than 2% */ + if (delta > (s32) (MAX_PROB / (100 / 2)) && + q->vars.prob >= MAX_PROB / 10) + delta = (MAX_PROB / 100) * 2; + + /* Non-linear drop: + * Tune drop probability to increase quickly for high delays(>= 250ms) + * 250ms is derived through experiments and provides error protection + */ + + if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) + delta += MAX_PROB / (100 / 2); + + q->vars.prob += delta; + + if (delta > 0) { + /* prevent overflow */ + if (q->vars.prob < oldprob) { + q->vars.prob = MAX_PROB; + /* Prevent normalization error. If probability is at + * maximum value already, we normalize it here, and + * skip the check to do a non-linear drop in the next + * section. + */ + update_prob = false; + } + } else { + /* prevent underflow */ + if (q->vars.prob > oldprob) + q->vars.prob = 0; + } + + /* Non-linear drop in probability: Reduce drop probability quickly if + * delay is 0 for 2 consecutive Tupdate periods. + */ + + if ((qdelay == 0) && (qdelay_old == 0) && update_prob) + q->vars.prob = (q->vars.prob * 98) / 100; + + q->vars.qdelay = qdelay; + q->vars.qlen_old = qlen; + + /* We restart the measurement cycle if the following conditions are met + * 1. If the delay has been low for 2 consecutive Tupdate periods + * 2. Calculated drop probability is zero + * 3. We have atleast one estimate for the avg_dq_rate ie., + * is a non-zero value + */ + if ((q->vars.qdelay < q->params.target / 2) && + (q->vars.qdelay_old < q->params.target / 2) && + (q->vars.prob == 0) && + (q->vars.avg_dq_rate > 0)) + pie_vars_init(&q->vars); +} + +static void pie_timer(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc *)arg; + struct pie_sched_data *q = qdisc_priv(sch); + spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + + spin_lock(root_lock); + calculate_probability(sch); + + /* reset the timer to fire after 'tupdate'. tupdate is in jiffies. 
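At its core, calculate_probability() above is a Proportional Integral controller on queueing delay: delta = alpha * (qdelay - target) + beta * (qdelay - qdelay_old) is added to the running drop probability, and the remaining branches (the 2% step limit, the 250 ms boost, the overflow and underflow clamps, the 2% decay on idle, the restart of the measurement cycle) are guards layered on top. A stripped-down sketch of just the PI step, using floating point instead of the kernel's scaled fixed point, so alpha and beta here are the 0..2 textbook values rather than the 0..32 netlink encoding; the sample delays are illustrative:

#include <stdio.h>

/* One PI update on the drop probability p in [0, 1].
 * qdelay and target are in seconds; alpha weights the distance from the
 * target, beta weights the trend since the last update. */
static double pie_update(double p, double qdelay, double qdelay_old,
			 double target, double alpha, double beta)
{
	p += alpha * (qdelay - target) + beta * (qdelay - qdelay_old);
	if (p < 0.0)
		p = 0.0;
	if (p > 1.0)
		p = 1.0;
	return p;
}

int main(void)
{
	double p = 0.0, qdelay_old = 0.0;
	/* queue delay overshoots a 20 ms target, then comes back down */
	double samples[] = { 0.030, 0.060, 0.060, 0.060, 0.040, 0.020 };

	for (int i = 0; i < 6; i++) {
		p = pie_update(p, samples[i], qdelay_old, 0.020, 0.125, 1.25);
		qdelay_old = samples[i];
		printf("qdelay=%.0f ms -> drop prob %.4f\n", samples[i] * 1e3, p);
	}
	return 0;
}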
*/ + if (q->params.tupdate) + mod_timer(&q->adapt_timer, jiffies + q->params.tupdate); + spin_unlock(root_lock); + +} + +static int pie_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct pie_sched_data *q = qdisc_priv(sch); + + pie_params_init(&q->params); + pie_vars_init(&q->vars); + sch->limit = q->params.limit; + + setup_timer(&q->adapt_timer, pie_timer, (unsigned long)sch); + mod_timer(&q->adapt_timer, jiffies + HZ / 2); + + if (opt) { + int err = pie_change(sch, opt); + + if (err) + return err; + } + + return 0; +} + +static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct pie_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + /* convert target from pschedtime to us */ + if (nla_put_u32(skb, TCA_PIE_TARGET, + ((u32) PSCHED_TICKS2NS(q->params.target)) / + NSEC_PER_USEC) || + nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) || + nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) || + nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || + nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || + nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || + nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -1; + +} + +static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct pie_sched_data *q = qdisc_priv(sch); + struct tc_pie_xstats st = { + .prob = q->vars.prob, + .delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) / + NSEC_PER_USEC, + /* unscale and return dq_rate in bytes per sec */ + .avg_dq_rate = q->vars.avg_dq_rate * + (PSCHED_TICKS_PER_SEC) >> PIE_SCALE, + .packets_in = q->stats.packets_in, + .overlimit = q->stats.overlimit, + .maxq = q->stats.maxq, + .dropped = q->stats.dropped, + .ecn_mark = q->stats.ecn_mark, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb; + skb = __qdisc_dequeue_head(sch, &sch->q); + + if (!skb) + return NULL; + + pie_process_dequeue(sch, skb); + return skb; +} + +static void pie_reset(struct Qdisc *sch) +{ + struct pie_sched_data *q = qdisc_priv(sch); + qdisc_reset_queue(sch); + pie_vars_init(&q->vars); +} + +static void pie_destroy(struct Qdisc *sch) +{ + struct pie_sched_data *q = qdisc_priv(sch); + q->params.tupdate = 0; + del_timer_sync(&q->adapt_timer); +} + +static struct Qdisc_ops pie_qdisc_ops __read_mostly = { + .id = "pie", + .priv_size = sizeof(struct pie_sched_data), + .enqueue = pie_qdisc_enqueue, + .dequeue = pie_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .init = pie_init, + .destroy = pie_destroy, + .reset = pie_reset, + .change = pie_change, + .dump = pie_dump, + .dump_stats = pie_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init pie_module_init(void) +{ + return register_qdisc(&pie_qdisc_ops); +} + +static void __exit pie_module_exit(void) +{ + unregister_qdisc(&pie_qdisc_ops); +} + +module_init(pie_module_init); +module_exit(pie_module_exit); + +MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler"); +MODULE_AUTHOR("Vijay Subramanian"); +MODULE_AUTHOR("Mythili Prabhu"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c new file mode 100644 index 00000000000..89f8fcf73f1 --- /dev/null +++ b/net/sched/sch_plug.c @@ -0,0 +1,233 @@ +/* + * sch_plug.c Queue traffic until an explicit release command + * + * This program is 
free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * There are two ways to use this qdisc: + * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating + * sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands. + * + * 2. For network output buffering (a.k.a output commit) functionality. + * Output commit property is commonly used by applications using checkpoint + * based fault-tolerance to ensure that the checkpoint from which a system + * is being restored is consistent w.r.t outside world. + * + * Consider for e.g. Remus - a Virtual Machine checkpointing system, + * wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated + * asynchronously to the backup host, while the VM continues executing the + * next epoch speculatively. + * + * The following is a typical sequence of output buffer operations: + * 1.At epoch i, start_buffer(i) + * 2. At end of epoch i (i.e. after 50ms): + * 2.1 Stop VM and take checkpoint(i). + * 2.2 start_buffer(i+1) and Resume VM + * 3. While speculatively executing epoch(i+1), asynchronously replicate + * checkpoint(i) to backup host. + * 4. When checkpoint_ack(i) is received from backup, release_buffer(i) + * Thus, this Qdisc would receive the following sequence of commands: + * TCQ_PLUG_BUFFER (epoch i) + * .. TCQ_PLUG_BUFFER (epoch i+1) + * ....TCQ_PLUG_RELEASE_ONE (epoch i) + * ......TCQ_PLUG_BUFFER (epoch i+2) + * ........ + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/pkt_sched.h> + +/* + * State of the queue, when used for network output buffering: + * + * plug(i+1) plug(i) head + * ------------------+--------------------+----------------> + * | | + * | | + * pkts_current_epoch| pkts_last_epoch |pkts_to_release + * ----------------->|<--------+--------->|+---------------> + * v v + * + */ + +struct plug_sched_data { + /* If true, the dequeue function releases all packets + * from head to end of the queue. The queue turns into + * a pass-through queue for newly arriving packets. + */ + bool unplug_indefinite; + + /* Queue Limit in bytes */ + u32 limit; + + /* Number of packets (output) from the current speculatively + * executing epoch. + */ + u32 pkts_current_epoch; + + /* Number of packets corresponding to the recently finished + * epoch. These will be released when we receive a + * TCQ_PLUG_RELEASE_ONE command. This command is typically + * issued after committing a checkpoint at the target. + */ + u32 pkts_last_epoch; + + /* + * Number of packets from the head of the queue, that can + * be released (committed checkpoint). + */ + u32 pkts_to_release; +}; + +static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct plug_sched_data *q = qdisc_priv(sch); + + if (likely(sch->qstats.backlog + skb->len <= q->limit)) { + if (!q->unplug_indefinite) + q->pkts_current_epoch++; + return qdisc_enqueue_tail(skb, sch); + } + + return qdisc_reshape_fail(skb, sch); +} + +static struct sk_buff *plug_dequeue(struct Qdisc *sch) +{ + struct plug_sched_data *q = qdisc_priv(sch); + + if (qdisc_is_throttled(sch)) + return NULL; + + if (!q->unplug_indefinite) { + if (!q->pkts_to_release) { + /* No more packets to dequeue. Block the queue + * and wait for the next release command. 
+ */ + qdisc_throttled(sch); + return NULL; + } + q->pkts_to_release--; + } + + return qdisc_dequeue_head(sch); +} + +static int plug_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct plug_sched_data *q = qdisc_priv(sch); + + q->pkts_current_epoch = 0; + q->pkts_last_epoch = 0; + q->pkts_to_release = 0; + q->unplug_indefinite = false; + + if (opt == NULL) { + /* We will set a default limit of 100 pkts (~150kB) + * in case tx_queue_len is not available. The + * default value is completely arbitrary. + */ + u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100; + q->limit = pkt_limit * psched_mtu(qdisc_dev(sch)); + } else { + struct tc_plug_qopt *ctl = nla_data(opt); + + if (nla_len(opt) < sizeof(*ctl)) + return -EINVAL; + + q->limit = ctl->limit; + } + + qdisc_throttled(sch); + return 0; +} + +/* Receives 4 types of messages: + * TCQ_PLUG_BUFFER: Inset a plug into the queue and + * buffer any incoming packets + * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head + * to beginning of the next plug. + * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. + * Stop buffering packets until the next TCQ_PLUG_BUFFER + * command is received (just act as a pass-thru queue). + * TCQ_PLUG_LIMIT: Increase/decrease queue size + */ +static int plug_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct plug_sched_data *q = qdisc_priv(sch); + struct tc_plug_qopt *msg; + + if (opt == NULL) + return -EINVAL; + + msg = nla_data(opt); + if (nla_len(opt) < sizeof(*msg)) + return -EINVAL; + + switch (msg->action) { + case TCQ_PLUG_BUFFER: + /* Save size of the current buffer */ + q->pkts_last_epoch = q->pkts_current_epoch; + q->pkts_current_epoch = 0; + if (q->unplug_indefinite) + qdisc_throttled(sch); + q->unplug_indefinite = false; + break; + case TCQ_PLUG_RELEASE_ONE: + /* Add packets from the last complete buffer to the + * packets to be released set. + */ + q->pkts_to_release += q->pkts_last_epoch; + q->pkts_last_epoch = 0; + qdisc_unthrottled(sch); + netif_schedule_queue(sch->dev_queue); + break; + case TCQ_PLUG_RELEASE_INDEFINITE: + q->unplug_indefinite = true; + q->pkts_to_release = 0; + q->pkts_last_epoch = 0; + q->pkts_current_epoch = 0; + qdisc_unthrottled(sch); + netif_schedule_queue(sch->dev_queue); + break; + case TCQ_PLUG_LIMIT: + /* Limit is supplied in bytes */ + q->limit = msg->limit; + break; + default: + return -EINVAL; + } + + return 0; +} + +static struct Qdisc_ops plug_qdisc_ops __read_mostly = { + .id = "plug", + .priv_size = sizeof(struct plug_sched_data), + .enqueue = plug_enqueue, + .dequeue = plug_dequeue, + .peek = qdisc_peek_head, + .init = plug_init, + .change = plug_change, + .owner = THIS_MODULE, +}; + +static int __init plug_module_init(void) +{ + return register_qdisc(&plug_qdisc_ops); +} + +static void __exit plug_module_exit(void) +{ + unregister_qdisc(&plug_qdisc_ops); +} +module_init(plug_module_init) +module_exit(plug_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 1641db33a99..79359b69ad8 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -7,39 +7,22 @@ * 2 of the License, or (at your option) any later version. 
* * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>: + * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>: * Init -- EINVAL when opt undefined */ -#include <linux/config.h> #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> +#include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> +#include <net/netlink.h> #include <net/pkt_sched.h> -struct prio_sched_data -{ +struct prio_sched_data { int bands; struct tcf_proto *filter_list; u8 prio2band[TC_PRIO_MAX+1]; @@ -53,30 +36,29 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) struct prio_sched_data *q = qdisc_priv(sch); u32 band = skb->priority; struct tcf_result res; + int err; - *qerr = NET_XMIT_BYPASS; + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { + err = tc_classify(skb, q->filter_list, &res); #ifdef CONFIG_NET_CLS_ACT - switch (tc_classify(skb, q->filter_list, &res)) { + switch (err) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: - *qerr = NET_XMIT_SUCCESS; + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: return NULL; - }; - - if (!q->filter_list ) { -#else - if (!q->filter_list || tc_classify(skb, q->filter_list, &res)) { + } #endif + if (!q->filter_list || err < 0) { if (TC_H_MAJ(band)) band = 0; - return q->queues[q->prio2band[band&TC_PRIO_MAX]]; + return q->queues[q->prio2band[band & TC_PRIO_MAX]]; } band = res.classid; } band = TC_H_MIN(band) - 1; - if (band > q->bands) + if (band >= q->bands) return q->queues[q->prio2band[0]]; return q->queues[band]; @@ -92,62 +74,47 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) #ifdef CONFIG_NET_CLS_ACT if (qdisc == NULL) { - if (ret == NET_XMIT_BYPASS) + if (ret & __NET_XMIT_BYPASS) sch->qstats.drops++; kfree_skb(skb); return ret; } #endif - if ((ret = qdisc->enqueue(skb, qdisc)) == NET_XMIT_SUCCESS) { - sch->bstats.bytes += skb->len; - sch->bstats.packets++; + ret = qdisc_enqueue(skb, qdisc); + if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; return NET_XMIT_SUCCESS; } - sch->qstats.drops++; - return ret; + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + return ret; } - -static int -prio_requeue(struct sk_buff *skb, struct Qdisc* sch) +static struct sk_buff *prio_peek(struct Qdisc *sch) { - struct Qdisc *qdisc; - int ret; - - qdisc = prio_classify(skb, sch, &ret); -#ifdef CONFIG_NET_CLS_ACT - if (qdisc == NULL) { - if (ret == NET_XMIT_BYPASS) - sch->qstats.drops++; - kfree_skb(skb); - return ret; - } -#endif + struct prio_sched_data *q = qdisc_priv(sch); + int prio; - if ((ret = qdisc->ops->requeue(skb, qdisc)) == NET_XMIT_SUCCESS) { - sch->q.qlen++; - sch->qstats.requeues++; - return 0; + for (prio = 0; prio < q->bands; prio++) { + struct Qdisc *qdisc = q->queues[prio]; + struct sk_buff *skb = qdisc->ops->peek(qdisc); + if (skb) + return skb; } - sch->qstats.drops++; - return NET_XMIT_DROP; + return NULL; } - -static struct sk_buff * -prio_dequeue(struct Qdisc* sch) +static struct sk_buff *prio_dequeue(struct Qdisc *sch) { - struct 
sk_buff *skb; struct prio_sched_data *q = qdisc_priv(sch); int prio; - struct Qdisc *qdisc; for (prio = 0; prio < q->bands; prio++) { - qdisc = q->queues[prio]; - skb = qdisc->dequeue(qdisc); + struct Qdisc *qdisc = q->queues[prio]; + struct sk_buff *skb = qdisc_dequeue_peeked(qdisc); if (skb) { + qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; } @@ -156,7 +123,7 @@ prio_dequeue(struct Qdisc* sch) } -static unsigned int prio_drop(struct Qdisc* sch) +static unsigned int prio_drop(struct Qdisc *sch) { struct prio_sched_data *q = qdisc_priv(sch); int prio; @@ -165,7 +132,7 @@ static unsigned int prio_drop(struct Qdisc* sch) for (prio = q->bands-1; prio >= 0; prio--) { qdisc = q->queues[prio]; - if ((len = qdisc->ops->drop(qdisc)) != 0) { + if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) { sch->q.qlen--; return len; } @@ -175,44 +142,41 @@ static unsigned int prio_drop(struct Qdisc* sch) static void -prio_reset(struct Qdisc* sch) +prio_reset(struct Qdisc *sch) { int prio; struct prio_sched_data *q = qdisc_priv(sch); - for (prio=0; prio<q->bands; prio++) + for (prio = 0; prio < q->bands; prio++) qdisc_reset(q->queues[prio]); sch->q.qlen = 0; } static void -prio_destroy(struct Qdisc* sch) +prio_destroy(struct Qdisc *sch) { int prio; struct prio_sched_data *q = qdisc_priv(sch); - struct tcf_proto *tp; - while ((tp = q->filter_list) != NULL) { - q->filter_list = tp->next; - tcf_destroy(tp); - } - - for (prio=0; prio<q->bands; prio++) + tcf_destroy_chain(&q->filter_list); + for (prio = 0; prio < q->bands; prio++) qdisc_destroy(q->queues[prio]); } -static int prio_tune(struct Qdisc *sch, struct rtattr *opt) +static int prio_tune(struct Qdisc *sch, struct nlattr *opt) { struct prio_sched_data *q = qdisc_priv(sch); - struct tc_prio_qopt *qopt = RTA_DATA(opt); + struct tc_prio_qopt *qopt; int i; - if (opt->rta_len < RTA_LENGTH(sizeof(*qopt))) + if (nla_len(opt) < sizeof(*qopt)) return -EINVAL; + qopt = nla_data(opt); + if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2) return -EINVAL; - for (i=0; i<=TC_PRIO_MAX; i++) { + for (i = 0; i <= TC_PRIO_MAX; i++) { if (qopt->priomap[i] >= qopt->bands) return -EINVAL; } @@ -221,23 +185,33 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt) q->bands = qopt->bands; memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); - for (i=q->bands; i<TCQ_PRIO_BANDS; i++) { - struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc); - if (child != &noop_qdisc) + for (i = q->bands; i < TCQ_PRIO_BANDS; i++) { + struct Qdisc *child = q->queues[i]; + q->queues[i] = &noop_qdisc; + if (child != &noop_qdisc) { + qdisc_tree_decrease_qlen(child, child->q.qlen); qdisc_destroy(child); + } } sch_tree_unlock(sch); - for (i=0; i<q->bands; i++) { + for (i = 0; i < q->bands; i++) { if (q->queues[i] == &noop_qdisc) { - struct Qdisc *child; - child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + struct Qdisc *child, *old; + + child = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, i + 1)); if (child) { sch_tree_lock(sch); - child = xchg(&q->queues[i], child); - - if (child != &noop_qdisc) - qdisc_destroy(child); + old = q->queues[i]; + q->queues[i] = child; + + if (old != &noop_qdisc) { + qdisc_tree_decrease_qlen(old, + old->q.qlen); + qdisc_destroy(old); + } sch_tree_unlock(sch); } } @@ -245,12 +219,12 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt) return 0; } -static int prio_init(struct Qdisc *sch, struct rtattr *opt) +static int prio_init(struct Qdisc *sch, struct nlattr *opt) { struct prio_sched_data *q = 
qdisc_priv(sch); int i; - for (i=0; i<TCQ_PRIO_BANDS; i++) + for (i = 0; i < TCQ_PRIO_BANDS; i++) q->queues[i] = &noop_qdisc; if (opt == NULL) { @@ -258,7 +232,7 @@ static int prio_init(struct Qdisc *sch, struct rtattr *opt) } else { int err; - if ((err= prio_tune(sch, opt)) != 0) + if ((err = prio_tune(sch, opt)) != 0) return err; } return 0; @@ -267,16 +241,19 @@ static int prio_init(struct Qdisc *sch, struct rtattr *opt) static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct prio_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb->tail; + unsigned char *b = skb_tail_pointer(skb); struct tc_prio_qopt opt; opt.bands = q->bands; - memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1); - RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1); + + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; + return skb->len; -rtattr_failure: - skb_trim(skb, b - skb->data); +nla_put_failure: + nlmsg_trim(skb, b); return -1; } @@ -286,16 +263,13 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct prio_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; - if (band >= q->bands) - return -EINVAL; - if (new == NULL) new = &noop_qdisc; sch_tree_lock(sch); *old = q->queues[band]; q->queues[band] = new; - sch->q.qlen -= (*old)->q.qlen; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); qdisc_reset(*old); sch_tree_unlock(sch); @@ -308,9 +282,6 @@ prio_leaf(struct Qdisc *sch, unsigned long arg) struct prio_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; - if (band >= q->bands) - return NULL; - return q->queues[band]; } @@ -332,38 +303,30 @@ static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 clas static void prio_put(struct Qdisc *q, unsigned long cl) { - return; } -static int prio_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg) +static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, + struct tcmsg *tcm) { - unsigned long cl = *arg; struct prio_sched_data *q = qdisc_priv(sch); - if (cl - 1 > q->bands) - return -ENOENT; + tcm->tcm_handle |= TC_H_MIN(cl); + tcm->tcm_info = q->queues[cl-1]->handle; return 0; } -static int prio_delete(struct Qdisc *sch, unsigned long cl) +static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) { struct prio_sched_data *q = qdisc_priv(sch); - if (cl - 1 > q->bands) - return -ENOENT; - return 0; -} + struct Qdisc *cl_q; + cl_q = q->queues[cl - 1]; + cl_q->qstats.qlen = cl_q->q.qlen; + if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 || + gnet_stats_copy_queue(d, &cl_q->qstats) < 0) + return -1; -static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, - struct tcmsg *tcm) -{ - struct prio_sched_data *q = qdisc_priv(sch); - - if (cl - 1 > q->bands) - return -ENOENT; - tcm->tcm_handle |= TC_H_MIN(cl); - if (q->queues[cl-1]) - tcm->tcm_info = q->queues[cl-1]->handle; return 0; } @@ -380,7 +343,7 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) arg->count++; continue; } - if (arg->fn(sch, prio+1, arg) < 0) { + if (arg->fn(sch, prio + 1, arg) < 0) { arg->stop = 1; break; } @@ -388,7 +351,7 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto **prio_find_tcf(struct Qdisc *sch, unsigned long cl) { struct prio_sched_data *q = qdisc_priv(sch); @@ 
-397,28 +360,27 @@ static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl) return &q->filter_list; } -static struct Qdisc_class_ops prio_class_ops = { +static const struct Qdisc_class_ops prio_class_ops = { .graft = prio_graft, .leaf = prio_leaf, .get = prio_get, .put = prio_put, - .change = prio_change, - .delete = prio_delete, .walk = prio_walk, .tcf_chain = prio_find_tcf, .bind_tcf = prio_bind, .unbind_tcf = prio_put, .dump = prio_dump_class, + .dump_stats = prio_dump_class_stats, }; -static struct Qdisc_ops prio_qdisc_ops = { +static struct Qdisc_ops prio_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &prio_class_ops, .id = "prio", .priv_size = sizeof(struct prio_sched_data), .enqueue = prio_enqueue, .dequeue = prio_dequeue, - .requeue = prio_requeue, + .peek = prio_peek, .drop = prio_drop, .init = prio_init, .reset = prio_reset, @@ -433,7 +395,7 @@ static int __init prio_module_init(void) return register_qdisc(&prio_qdisc_ops); } -static void __exit prio_module_exit(void) +static void __exit prio_module_exit(void) { unregister_qdisc(&prio_qdisc_ops); } diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c new file mode 100644 index 00000000000..8056fb4e618 --- /dev/null +++ b/net/sched/sch_qfq.c @@ -0,0 +1,1582 @@ +/* + * net/sched/sch_qfq.c Quick Fair Queueing Plus Scheduler. + * + * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente. + * Copyright (c) 2012 Paolo Valente. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/bitops.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/pkt_sched.h> +#include <net/sch_generic.h> +#include <net/pkt_sched.h> +#include <net/pkt_cls.h> + + +/* Quick Fair Queueing Plus + ======================== + + Sources: + + [1] Paolo Valente, + "Reducing the Execution Time of Fair-Queueing Schedulers." + http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf + + Sources for QFQ: + + [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient + Packet Scheduling with Tight Bandwidth Distribution Guarantees." + + See also: + http://retis.sssup.it/~fabio/linux/qfq/ + */ + +/* + + QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES + classes. Each aggregate is timestamped with a virtual start time S + and a virtual finish time F, and scheduled according to its + timestamps. S and F are computed as a function of a system virtual + time function V. The classes within each aggregate are instead + scheduled with DRR. + + To speed up operations, QFQ+ divides also aggregates into a limited + number of groups. Which group a class belongs to depends on the + ratio between the maximum packet length for the class and the weight + of the class. Groups have their own S and F. In the end, QFQ+ + schedules groups, then aggregates within groups, then classes within + aggregates. See [1] and [2] for a full description. + + Virtual time computations. + + S, F and V are all computed in fixed point arithmetic with + FRAC_BITS decimal bits. + + QFQ_MAX_INDEX is the maximum index allowed for a group. We need + one bit per index. + QFQ_MAX_WSHIFT is the maximum power of two supported as a weight. 
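
   (Editor's illustrative aside, not in the original comment: with
   FRAC_BITS = 30 and QFQ_MAX_WSHIFT = 10 as defined below, class
   weights lie in [1, 1024] and the real value 1.0 is stored as
   ONE_FP = 1 << 30.  A weight w_i = 4 is kept as
   inv_w = ONE_FP / 4 = 1 << 28, so serving a 1500 byte packet
   advances the finish time by len * inv_w = 1500 << 28, i.e. by
   1500 / 4 = 375 virtual time units once the FRAC_BITS scaling is
   removed.)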
+ + The layout of the bits is as below: + + [ MTU_SHIFT ][ FRAC_BITS ] + [ MAX_INDEX ][ MIN_SLOT_SHIFT ] + ^.__grp->index = 0 + *.__grp->slot_shift + + where MIN_SLOT_SHIFT is derived by difference from the others. + + The max group index corresponds to Lmax/w_min, where + Lmax=1<<MTU_SHIFT, w_min = 1 . + From this, and knowing how many groups (MAX_INDEX) we want, + we can derive the shift corresponding to each group. + + Because we often need to compute + F = S + len/w_i and V = V + len/wsum + instead of storing w_i store the value + inv_w = (1<<FRAC_BITS)/w_i + so we can do F = S + len * inv_w * wsum. + We use W_TOT in the formulas so we can easily move between + static and adaptive weight sum. + + The per-scheduler-instance data contain all the data structures + for the scheduler: bitmaps and bucket lists. + + */ + +/* + * Maximum number of consecutive slots occupied by backlogged classes + * inside a group. + */ +#define QFQ_MAX_SLOTS 32 + +/* + * Shifts used for aggregate<->group mapping. We allow class weights that are + * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the + * group with the smallest index that can support the L_i / r_i configured + * for the classes in the aggregate. + * + * grp->index is the index of the group; and grp->slot_shift + * is the shift for the corresponding (scaled) sigma_i. + */ +#define QFQ_MAX_INDEX 24 +#define QFQ_MAX_WSHIFT 10 + +#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */ +#define QFQ_MAX_WSUM (64*QFQ_MAX_WEIGHT) + +#define FRAC_BITS 30 /* fixed point arithmetic */ +#define ONE_FP (1UL << FRAC_BITS) + +#define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */ +#define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */ + +#define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */ + +/* + * Possible group states. These values are used as indexes for the bitmaps + * array of struct qfq_queue. + */ +enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE }; + +struct qfq_group; + +struct qfq_aggregate; + +struct qfq_class { + struct Qdisc_class_common common; + + unsigned int refcnt; + unsigned int filter_cnt; + + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; + struct gnet_stats_rate_est64 rate_est; + struct Qdisc *qdisc; + struct list_head alist; /* Link for active-classes list. */ + struct qfq_aggregate *agg; /* Parent aggregate. */ + int deficit; /* DRR deficit counter. */ +}; + +struct qfq_aggregate { + struct hlist_node next; /* Link for the slot list. */ + u64 S, F; /* flow timestamps (exact) */ + + /* group we belong to. In principle we would need the index, + * which is log_2(lmax/weight), but we never reference it + * directly, only the group. + */ + struct qfq_group *grp; + + /* these are copied from the flowset. */ + u32 class_weight; /* Weight of each class in this aggregate. */ + /* Max pkt size for the classes in this aggregate, DRR quantum. */ + int lmax; + + u32 inv_w; /* ONE_FP/(sum of weights of classes in aggr.). */ + u32 budgetmax; /* Max budget for this aggregate. */ + u32 initial_budget, budget; /* Initial and current budget. */ + + int num_classes; /* Number of classes in this aggr. */ + struct list_head active; /* DRR queue of active classes. */ + + struct hlist_node nonfull_next; /* See nonfull_aggs in qfq_sched. */ +}; + +struct qfq_group { + u64 S, F; /* group timestamps (approx). */ + unsigned int slot_shift; /* Slot shift. */ + unsigned int index; /* Group index. */ + unsigned int front; /* Index of the front slot. 
*/ + unsigned long full_slots; /* non-empty slots */ + + /* Array of RR lists of active aggregates. */ + struct hlist_head slots[QFQ_MAX_SLOTS]; +}; + +struct qfq_sched { + struct tcf_proto *filter_list; + struct Qdisc_class_hash clhash; + + u64 oldV, V; /* Precise virtual times. */ + struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */ + u32 num_active_agg; /* Num. of active aggregates */ + u32 wsum; /* weight sum */ + u32 iwsum; /* inverse weight sum */ + + unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ + struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ + u32 min_slot_shift; /* Index of the group-0 bit in the bitmaps. */ + + u32 max_agg_classes; /* Max number of classes per aggr. */ + struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */ +}; + +/* + * Possible reasons why the timestamps of an aggregate are updated + * enqueue: the aggregate switches from idle to active and must scheduled + * for service + * requeue: the aggregate finishes its budget, so it stops being served and + * must be rescheduled for service + */ +enum update_reason {enqueue, requeue}; + +static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct Qdisc_class_common *clc; + + clc = qdisc_class_find(&q->clhash, classid); + if (clc == NULL) + return NULL; + return container_of(clc, struct qfq_class, common); +} + +static void qfq_purge_queue(struct qfq_class *cl) +{ + unsigned int len = cl->qdisc->q.qlen; + + qdisc_reset(cl->qdisc); + qdisc_tree_decrease_qlen(cl->qdisc, len); +} + +static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = { + [TCA_QFQ_WEIGHT] = { .type = NLA_U32 }, + [TCA_QFQ_LMAX] = { .type = NLA_U32 }, +}; + +/* + * Calculate a flow index, given its weight and maximum packet length. + * index = log_2(maxlen/weight) but we need to apply the scaling. + * This is used only once at flow creation. + */ +static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift) +{ + u64 slot_size = (u64)maxlen * inv_w; + unsigned long size_map; + int index = 0; + + size_map = slot_size >> min_slot_shift; + if (!size_map) + goto out; + + index = __fls(size_map) + 1; /* basically a log_2 */ + index -= !(slot_size - (1ULL << (index + min_slot_shift - 1))); + + if (index < 0) + index = 0; +out: + pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n", + (unsigned long) ONE_FP/inv_w, maxlen, index); + + return index; +} + +static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *); +static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *, + enum update_reason); + +static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg, + u32 lmax, u32 weight) +{ + INIT_LIST_HEAD(&agg->active); + hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); + + agg->lmax = lmax; + agg->class_weight = weight; +} + +static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q, + u32 lmax, u32 weight) +{ + struct qfq_aggregate *agg; + + hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next) + if (agg->lmax == lmax && agg->class_weight == weight) + return agg; + + return NULL; +} + + +/* Update aggregate as a function of the new number of classes. 
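 * (Editor's illustrative aside, not in the original comment: e.g. an
 * aggregate with lmax 1514 whose classes each have weight 2 gets, when
 * it grows from 2 to 3 classes, budgetmax = 3 * 1514 = 4542 and
 * inv_w = ONE_FP / (2 * 3), exactly as computed in qfq_update_agg()
 * below.)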
*/ +static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg, + int new_num_classes) +{ + u32 new_agg_weight; + + if (new_num_classes == q->max_agg_classes) + hlist_del_init(&agg->nonfull_next); + + if (agg->num_classes > new_num_classes && + new_num_classes == q->max_agg_classes - 1) /* agg no more full */ + hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); + + /* The next assignment may let + * agg->initial_budget > agg->budgetmax + * hold, we will take it into account in charge_actual_service(). + */ + agg->budgetmax = new_num_classes * agg->lmax; + new_agg_weight = agg->class_weight * new_num_classes; + agg->inv_w = ONE_FP/new_agg_weight; + + if (agg->grp == NULL) { + int i = qfq_calc_index(agg->inv_w, agg->budgetmax, + q->min_slot_shift); + agg->grp = &q->groups[i]; + } + + q->wsum += + (int) agg->class_weight * (new_num_classes - agg->num_classes); + q->iwsum = ONE_FP / q->wsum; + + agg->num_classes = new_num_classes; +} + +/* Add class to aggregate. */ +static void qfq_add_to_agg(struct qfq_sched *q, + struct qfq_aggregate *agg, + struct qfq_class *cl) +{ + cl->agg = agg; + + qfq_update_agg(q, agg, agg->num_classes+1); + if (cl->qdisc->q.qlen > 0) { /* adding an active class */ + list_add_tail(&cl->alist, &agg->active); + if (list_first_entry(&agg->active, struct qfq_class, alist) == + cl && q->in_serv_agg != agg) /* agg was inactive */ + qfq_activate_agg(q, agg, enqueue); /* schedule agg */ + } +} + +static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *); + +static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg) +{ + if (!hlist_unhashed(&agg->nonfull_next)) + hlist_del_init(&agg->nonfull_next); + q->wsum -= agg->class_weight; + if (q->wsum != 0) + q->iwsum = ONE_FP / q->wsum; + + if (q->in_serv_agg == agg) + q->in_serv_agg = qfq_choose_next_agg(q); + kfree(agg); +} + +/* Deschedule class from within its parent aggregate. */ +static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) +{ + struct qfq_aggregate *agg = cl->agg; + + + list_del(&cl->alist); /* remove from RR queue of the aggregate */ + if (list_empty(&agg->active)) /* agg is now inactive */ + qfq_deactivate_agg(q, agg); +} + +/* Remove class from its parent aggregate. */ +static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) +{ + struct qfq_aggregate *agg = cl->agg; + + cl->agg = NULL; + if (agg->num_classes == 1) { /* agg being emptied, destroy it */ + qfq_destroy_agg(q, agg); + return; + } + qfq_update_agg(q, agg, agg->num_classes-1); +} + +/* Deschedule class and remove it from its parent aggregate. 
*/ +static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) +{ + if (cl->qdisc->q.qlen > 0) /* class is active */ + qfq_deactivate_class(q, cl); + + qfq_rm_from_agg(q, cl); +} + +/* Move class to a new aggregate, matching the new class weight and/or lmax */ +static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight, + u32 lmax) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight); + + if (new_agg == NULL) { /* create new aggregate */ + new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC); + if (new_agg == NULL) + return -ENOBUFS; + qfq_init_agg(q, new_agg, lmax, weight); + } + qfq_deact_rm_from_agg(q, cl); + qfq_add_to_agg(q, new_agg, cl); + + return 0; +} + +static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct nlattr **tca, unsigned long *arg) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl = (struct qfq_class *)*arg; + bool existing = false; + struct nlattr *tb[TCA_QFQ_MAX + 1]; + struct qfq_aggregate *new_agg = NULL; + u32 weight, lmax, inv_w; + int err; + int delta_w; + + if (tca[TCA_OPTIONS] == NULL) { + pr_notice("qfq: no options\n"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy); + if (err < 0) + return err; + + if (tb[TCA_QFQ_WEIGHT]) { + weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]); + if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) { + pr_notice("qfq: invalid weight %u\n", weight); + return -EINVAL; + } + } else + weight = 1; + + if (tb[TCA_QFQ_LMAX]) { + lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); + if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { + pr_notice("qfq: invalid max length %u\n", lmax); + return -EINVAL; + } + } else + lmax = psched_mtu(qdisc_dev(sch)); + + inv_w = ONE_FP / weight; + weight = ONE_FP / inv_w; + + if (cl != NULL && + lmax == cl->agg->lmax && + weight == cl->agg->class_weight) + return 0; /* nothing to change */ + + delta_w = weight - (cl ? 
cl->agg->class_weight : 0); + + if (q->wsum + delta_w > QFQ_MAX_WSUM) { + pr_notice("qfq: total weight out of range (%d + %u)\n", + delta_w, q->wsum); + return -EINVAL; + } + + if (cl != NULL) { /* modify existing class */ + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) + return err; + } + existing = true; + goto set_change_agg; + } + + /* create and init new class */ + cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL); + if (cl == NULL) + return -ENOBUFS; + + cl->refcnt = 1; + cl->common.classid = classid; + cl->deficit = lmax; + + cl->qdisc = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, classid); + if (cl->qdisc == NULL) + cl->qdisc = &noop_qdisc; + + if (tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) + goto destroy_class; + } + + sch_tree_lock(sch); + qdisc_class_hash_insert(&q->clhash, &cl->common); + sch_tree_unlock(sch); + + qdisc_class_hash_grow(sch, &q->clhash); + +set_change_agg: + sch_tree_lock(sch); + new_agg = qfq_find_agg(q, lmax, weight); + if (new_agg == NULL) { /* create new aggregate */ + sch_tree_unlock(sch); + new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL); + if (new_agg == NULL) { + err = -ENOBUFS; + gen_kill_estimator(&cl->bstats, &cl->rate_est); + goto destroy_class; + } + sch_tree_lock(sch); + qfq_init_agg(q, new_agg, lmax, weight); + } + if (existing) + qfq_deact_rm_from_agg(q, cl); + qfq_add_to_agg(q, new_agg, cl); + sch_tree_unlock(sch); + + *arg = (unsigned long)cl; + return 0; + +destroy_class: + qdisc_destroy(cl->qdisc); + kfree(cl); + return err; +} + +static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) +{ + struct qfq_sched *q = qdisc_priv(sch); + + qfq_rm_from_agg(q, cl); + gen_kill_estimator(&cl->bstats, &cl->rate_est); + qdisc_destroy(cl->qdisc); + kfree(cl); +} + +static int qfq_delete_class(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl = (struct qfq_class *)arg; + + if (cl->filter_cnt > 0) + return -EBUSY; + + sch_tree_lock(sch); + + qfq_purge_queue(cl); + qdisc_class_hash_remove(&q->clhash, &cl->common); + + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). 
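 * (Editor's clarification, not in the original comment: the BUG_ON
 * above only asserts that the reference taken by cops->get() is still
 * held here; the actual qfq_destroy_class() runs later, from
 * qfq_put_class() below, when that last reference is released.)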
+ */ + + sch_tree_unlock(sch); + return 0; +} + +static unsigned long qfq_get_class(struct Qdisc *sch, u32 classid) +{ + struct qfq_class *cl = qfq_find_class(sch, classid); + + if (cl != NULL) + cl->refcnt++; + + return (unsigned long)cl; +} + +static void qfq_put_class(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + + if (--cl->refcnt == 0) + qfq_destroy_class(sch, cl); +} + +static struct tcf_proto **qfq_tcf_chain(struct Qdisc *sch, unsigned long cl) +{ + struct qfq_sched *q = qdisc_priv(sch); + + if (cl) + return NULL; + + return &q->filter_list; +} + +static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + struct qfq_class *cl = qfq_find_class(sch, classid); + + if (cl != NULL) + cl->filter_cnt++; + + return (unsigned long)cl; +} + +static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + + cl->filter_cnt--; +} + +static int qfq_graft_class(struct Qdisc *sch, unsigned long arg, + struct Qdisc *new, struct Qdisc **old) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + + if (new == NULL) { + new = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, cl->common.classid); + if (new == NULL) + new = &noop_qdisc; + } + + sch_tree_lock(sch); + qfq_purge_queue(cl); + *old = cl->qdisc; + cl->qdisc = new; + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + + return cl->qdisc; +} + +static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + struct nlattr *nest; + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = cl->common.classid; + tcm->tcm_info = cl->qdisc->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) || + nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax)) + goto nla_put_failure; + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + struct tc_qfq_stats xstats; + + memset(&xstats, 0, sizeof(xstats)); + cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; + + xstats.weight = cl->agg->class_weight; + xstats.lmax = cl->agg->lmax; + + if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) + return -1; + + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); +} + +static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl; + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl; + struct tcf_result res; + int result; + + if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { + pr_debug("qfq_classify: found %d\n", 
skb->priority); + cl = qfq_find_class(sch, skb->priority); + if (cl != NULL) + return cl; + } + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NULL; + } +#endif + cl = (struct qfq_class *)res.class; + if (cl == NULL) + cl = qfq_find_class(sch, res.classid); + return cl; + } + + return NULL; +} + +/* Generic comparison function, handling wraparound. */ +static inline int qfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + +/* Round a precise timestamp to its slotted value. */ +static inline u64 qfq_round_down(u64 ts, unsigned int shift) +{ + return ts & ~((1ULL << shift) - 1); +} + +/* return the pointer to the group with lowest index in the bitmap */ +static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, + unsigned long bitmap) +{ + int index = __ffs(bitmap); + return &q->groups[index]; +} +/* Calculate a mask to mimic what would be ffs_from(). */ +static inline unsigned long mask_from(unsigned long bitmap, int from) +{ + return bitmap & ~((1UL << from) - 1); +} + +/* + * The state computation relies on ER=0, IR=1, EB=2, IB=3 + * First compute eligibility comparing grp->S, q->V, + * then check if someone is blocking us and possibly add EB + */ +static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp) +{ + /* if S > V we are not eligible */ + unsigned int state = qfq_gt(grp->S, q->V); + unsigned long mask = mask_from(q->bitmaps[ER], grp->index); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (qfq_gt(grp->F, next->F)) + state |= EB; + } + + return state; +} + + +/* + * In principle + * q->bitmaps[dst] |= q->bitmaps[src] & mask; + * q->bitmaps[src] &= ~mask; + * but we should make sure that src != dst + */ +static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask, + int src, int dst) +{ + q->bitmaps[dst] |= q->bitmaps[src] & mask; + q->bitmaps[src] &= ~mask; +} + +static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F) +{ + unsigned long mask = mask_from(q->bitmaps[ER], index + 1); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (!qfq_gt(next->F, old_F)) + return; + } + + mask = (1UL << index) - 1; + qfq_move_groups(q, mask, EB, ER); + qfq_move_groups(q, mask, IB, IR); +} + +/* + * perhaps + * + old_V ^= q->V; + old_V >>= q->min_slot_shift; + if (old_V) { + ... + } + * + */ +static void qfq_make_eligible(struct qfq_sched *q) +{ + unsigned long vslot = q->V >> q->min_slot_shift; + unsigned long old_vslot = q->oldV >> q->min_slot_shift; + + if (vslot != old_vslot) { + unsigned long mask; + int last_flip_pos = fls(vslot ^ old_vslot); + + if (last_flip_pos > 31) /* higher than the number of groups */ + mask = ~0UL; /* make all groups eligible */ + else + mask = (1UL << last_flip_pos) - 1; + + qfq_move_groups(q, mask, IR, ER); + qfq_move_groups(q, mask, IB, EB); + } +} + +/* + * The index of the slot in which the input aggregate agg is to be + * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2' + * and not a '-1' because the start time of the group may be moved + * backward by one slot after the aggregate has been inserted, and + * this would cause non-empty slots to be right-shifted by one + * position. 
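 *
 * (Editor's illustrative note, not in the original comment: an
 * aggregate whose rounded start time lies k slot lengths ahead of the
 * group, i.e. roundedS - grp->S == k << grp->slot_shift, is linked
 * into bucket (grp->front + k) % QFQ_MAX_SLOTS; only when k would
 * exceed QFQ_MAX_SLOTS - 2 does qfq_slot_insert() below shift the
 * aggregate's timestamps backward to cap the index.)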
+ * + * QFQ+ fully satisfies this bound to the slot index if the parameters + * of the classes are not changed dynamically, and if QFQ+ never + * happens to postpone the service of agg unjustly, i.e., it never + * happens that the aggregate becomes backlogged and eligible, or just + * eligible, while an aggregate with a higher approximated finish time + * is being served. In particular, in this case QFQ+ guarantees that + * the timestamps of agg are low enough that the slot index is never + * higher than 2. Unfortunately, QFQ+ cannot provide the same + * guarantee if it happens to unjustly postpone the service of agg, or + * if the parameters of some class are changed. + * + * As for the first event, i.e., an out-of-order service, the + * upper bound to the slot index guaranteed by QFQ+ grows to + * 2 + + * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) * + * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1. + * + * The following function deals with this problem by backward-shifting + * the timestamps of agg, if needed, so as to guarantee that the slot + * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may + * cause the service of other aggregates to be postponed, yet the + * worst-case guarantees of these aggregates are not violated. In + * fact, in case of no out-of-order service, the timestamps of agg + * would have been even lower than they are after the backward shift, + * because QFQ+ would have guaranteed a maximum value equal to 2 for + * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose + * service is postponed because of the backward-shift would have + * however waited for the service of agg before being served. + * + * The other event that may cause the slot index to be higher than 2 + * for agg is a recent change of the parameters of some class. If the + * weight of a class is increased or the lmax (max_pkt_size) of the + * class is decreased, then a new aggregate with smaller slot size + * than the original parent aggregate of the class may happen to be + * activated. The activation of this aggregate should be properly + * delayed to when the service of the class has finished in the ideal + * system tracked by QFQ+. If the activation of the aggregate is not + * delayed to this reference time instant, then this aggregate may be + * unjustly served before other aggregates waiting for service. This + * may cause the above bound to the slot index to be violated for some + * of these unlucky aggregates. + * + * Instead of delaying the activation of the new aggregate, which is + * quite complex, the above-discussed capping of the slot index is + * used to handle also the consequences of a change of the parameters + * of a class. + */ +static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg, + u64 roundedS) +{ + u64 slot = (roundedS - grp->S) >> grp->slot_shift; + unsigned int i; /* slot index in the bucket list */ + + if (unlikely(slot > QFQ_MAX_SLOTS - 2)) { + u64 deltaS = roundedS - grp->S - + ((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift); + agg->S -= deltaS; + agg->F -= deltaS; + slot = QFQ_MAX_SLOTS - 2; + } + + i = (grp->front + slot) % QFQ_MAX_SLOTS; + + hlist_add_head(&agg->next, &grp->slots[i]); + __set_bit(slot, &grp->full_slots); +} + +/* Maybe introduce hlist_first_entry?? 
*/ +static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp) +{ + return hlist_entry(grp->slots[grp->front].first, + struct qfq_aggregate, next); +} + +/* + * remove the entry from the slot + */ +static void qfq_front_slot_remove(struct qfq_group *grp) +{ + struct qfq_aggregate *agg = qfq_slot_head(grp); + + BUG_ON(!agg); + hlist_del(&agg->next); + if (hlist_empty(&grp->slots[grp->front])) + __clear_bit(0, &grp->full_slots); +} + +/* + * Returns the first aggregate in the first non-empty bucket of the + * group. As a side effect, adjusts the bucket list so the first + * non-empty bucket is at position 0 in full_slots. + */ +static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp) +{ + unsigned int i; + + pr_debug("qfq slot_scan: grp %u full %#lx\n", + grp->index, grp->full_slots); + + if (grp->full_slots == 0) + return NULL; + + i = __ffs(grp->full_slots); /* zero based */ + if (i > 0) { + grp->front = (grp->front + i) % QFQ_MAX_SLOTS; + grp->full_slots >>= i; + } + + return qfq_slot_head(grp); +} + +/* + * adjust the bucket list. When the start time of a group decreases, + * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to + * move the objects. The mask of occupied slots must be shifted + * because we use ffs() to find the first non-empty slot. + * This covers decreases in the group's start time, but what about + * increases of the start time ? + * Here too we should make sure that i is less than 32 + */ +static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS) +{ + unsigned int i = (grp->S - roundedS) >> grp->slot_shift; + + grp->full_slots <<= i; + grp->front = (grp->front - i) % QFQ_MAX_SLOTS; +} + +static void qfq_update_eligible(struct qfq_sched *q) +{ + struct qfq_group *grp; + unsigned long ineligible; + + ineligible = q->bitmaps[IR] | q->bitmaps[IB]; + if (ineligible) { + if (!q->bitmaps[ER]) { + grp = qfq_ffs(q, ineligible); + if (qfq_gt(grp->S, q->V)) + q->V = grp->S; + } + qfq_make_eligible(q); + } +} + +/* Dequeue head packet of the head class in the DRR queue of the aggregate. */ +static void agg_dequeue(struct qfq_aggregate *agg, + struct qfq_class *cl, unsigned int len) +{ + qdisc_dequeue_peeked(cl->qdisc); + + cl->deficit -= (int) len; + + if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */ + list_del(&cl->alist); + else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) { + cl->deficit += agg->lmax; + list_move_tail(&cl->alist, &agg->active); + } +} + +static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg, + struct qfq_class **cl, + unsigned int *len) +{ + struct sk_buff *skb; + + *cl = list_first_entry(&agg->active, struct qfq_class, alist); + skb = (*cl)->qdisc->ops->peek((*cl)->qdisc); + if (skb == NULL) + WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n"); + else + *len = qdisc_pkt_len(skb); + + return skb; +} + +/* Update F according to the actual service received by the aggregate. */ +static inline void charge_actual_service(struct qfq_aggregate *agg) +{ + /* Compute the service received by the aggregate, taking into + * account that, after decreasing the number of classes in + * agg, it may happen that + * agg->initial_budget - agg->budget > agg->bugdetmax + */ + u32 service_received = min(agg->budgetmax, + agg->initial_budget - agg->budget); + + agg->F = agg->S + (u64)service_received * agg->inv_w; +} + +/* Assign a reasonable start time for a new aggregate in group i. + * Admissible values for \hat(F) are multiples of \sigma_i + * no greater than V+\sigma_i . 
Larger values mean that + * we had a wraparound so we consider the timestamp to be stale. + * + * If F is not stale and F >= V then we set S = F. + * Otherwise we should assign S = V, but this may violate + * the ordering in EB (see [2]). So, if we have groups in ER, + * set S to the F_j of the first group j which would be blocking us. + * We are guaranteed not to move S backward because + * otherwise our group i would still be blocked. + */ +static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg) +{ + unsigned long mask; + u64 limit, roundedF; + int slot_shift = agg->grp->slot_shift; + + roundedF = qfq_round_down(agg->F, slot_shift); + limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); + + if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) { + /* timestamp was stale */ + mask = mask_from(q->bitmaps[ER], agg->grp->index); + if (mask) { + struct qfq_group *next = qfq_ffs(q, mask); + if (qfq_gt(roundedF, next->F)) { + if (qfq_gt(limit, next->F)) + agg->S = next->F; + else /* preserve timestamp correctness */ + agg->S = limit; + return; + } + } + agg->S = q->V; + } else /* timestamp is not stale */ + agg->S = agg->F; +} + +/* Update the timestamps of agg before scheduling/rescheduling it for + * service. In particular, assign to agg->F its maximum possible + * value, i.e., the virtual finish time with which the aggregate + * should be labeled if it used all its budget once in service. + */ +static inline void +qfq_update_agg_ts(struct qfq_sched *q, + struct qfq_aggregate *agg, enum update_reason reason) +{ + if (reason != requeue) + qfq_update_start(q, agg); + else /* just charge agg for the service received */ + agg->S = agg->F; + + agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w; +} + +static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg); + +static struct sk_buff *qfq_dequeue(struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_aggregate *in_serv_agg = q->in_serv_agg; + struct qfq_class *cl; + struct sk_buff *skb = NULL; + /* next-packet len, 0 means no more active classes in in-service agg */ + unsigned int len = 0; + + if (in_serv_agg == NULL) + return NULL; + + if (!list_empty(&in_serv_agg->active)) + skb = qfq_peek_skb(in_serv_agg, &cl, &len); + + /* + * If there are no active classes in the in-service aggregate, + * or if the aggregate has not enough budget to serve its next + * class, then choose the next aggregate to serve. + */ + if (len == 0 || in_serv_agg->budget < len) { + charge_actual_service(in_serv_agg); + + /* recharge the budget of the aggregate */ + in_serv_agg->initial_budget = in_serv_agg->budget = + in_serv_agg->budgetmax; + + if (!list_empty(&in_serv_agg->active)) { + /* + * Still active: reschedule for + * service. Possible optimization: if no other + * aggregate is active, then there is no point + * in rescheduling this aggregate, and we can + * just keep it as the in-service one. This + * should be however a corner case, and to + * handle it, we would need to maintain an + * extra num_active_aggs field. + */ + qfq_update_agg_ts(q, in_serv_agg, requeue); + qfq_schedule_agg(q, in_serv_agg); + } else if (sch->q.qlen == 0) { /* no aggregate to serve */ + q->in_serv_agg = NULL; + return NULL; + } + + /* + * If we get here, there are other aggregates queued: + * choose the new aggregate to serve. 
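 * (Editor's illustrative note, not in the original comment: this
 * branch is reached, for instance, when an aggregate with budgetmax
 * 3000 has already transmitted 1700 bytes, so budget = 1300, and its
 * next head packet is 1500 bytes long; the aggregate is recharged and
 * rescheduled above, and qfq_choose_next_agg() below picks the
 * aggregate to serve next.)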
+ */ + in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q); + skb = qfq_peek_skb(in_serv_agg, &cl, &len); + } + if (!skb) + return NULL; + + sch->q.qlen--; + qdisc_bstats_update(sch, skb); + + agg_dequeue(in_serv_agg, cl, len); + /* If lmax is lowered, through qfq_change_class, for a class + * owning pending packets with larger size than the new value + * of lmax, then the following condition may hold. + */ + if (unlikely(in_serv_agg->budget < len)) + in_serv_agg->budget = 0; + else + in_serv_agg->budget -= len; + + q->V += (u64)len * q->iwsum; + pr_debug("qfq dequeue: len %u F %lld now %lld\n", + len, (unsigned long long) in_serv_agg->F, + (unsigned long long) q->V); + + return skb; +} + +static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q) +{ + struct qfq_group *grp; + struct qfq_aggregate *agg, *new_front_agg; + u64 old_F; + + qfq_update_eligible(q); + q->oldV = q->V; + + if (!q->bitmaps[ER]) + return NULL; + + grp = qfq_ffs(q, q->bitmaps[ER]); + old_F = grp->F; + + agg = qfq_slot_head(grp); + + /* agg starts to be served, remove it from schedule */ + qfq_front_slot_remove(grp); + + new_front_agg = qfq_slot_scan(grp); + + if (new_front_agg == NULL) /* group is now inactive, remove from ER */ + __clear_bit(grp->index, &q->bitmaps[ER]); + else { + u64 roundedS = qfq_round_down(new_front_agg->S, + grp->slot_shift); + unsigned int s; + + if (grp->S == roundedS) + return agg; + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + __clear_bit(grp->index, &q->bitmaps[ER]); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + } + + qfq_unblock_groups(q, grp->index, old_F); + + return agg; +} + +static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl; + struct qfq_aggregate *agg; + int err = 0; + + cl = qfq_classify(skb, sch, &err); + if (cl == NULL) { + if (err & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return err; + } + pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); + + if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) { + pr_debug("qfq: increasing maxpkt from %u to %u for class %u", + cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid); + err = qfq_change_agg(sch, cl, cl->agg->class_weight, + qdisc_pkt_len(skb)); + if (err) + return err; + } + + err = qdisc_enqueue(skb, cl->qdisc); + if (unlikely(err != NET_XMIT_SUCCESS)) { + pr_debug("qfq_enqueue: enqueue failed %d\n", err); + if (net_xmit_drop_count(err)) { + cl->qstats.drops++; + sch->qstats.drops++; + } + return err; + } + + bstats_update(&cl->bstats, skb); + ++sch->q.qlen; + + agg = cl->agg; + /* if the queue was not empty, then done here */ + if (cl->qdisc->q.qlen != 1) { + if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) && + list_first_entry(&agg->active, struct qfq_class, alist) + == cl && cl->deficit < qdisc_pkt_len(skb)) + list_move_tail(&cl->alist, &agg->active); + + return err; + } + + /* schedule class for service within the aggregate */ + cl->deficit = agg->lmax; + list_add_tail(&cl->alist, &agg->active); + + if (list_first_entry(&agg->active, struct qfq_class, alist) != cl || + q->in_serv_agg == agg) + return err; /* non-empty or in service, nothing else to do */ + + qfq_activate_agg(q, agg, enqueue); + + return err; +} + +/* + * Schedule aggregate according to its timestamps. 
+ */ +static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg) +{ + struct qfq_group *grp = agg->grp; + u64 roundedS; + int s; + + roundedS = qfq_round_down(agg->S, grp->slot_shift); + + /* + * Insert agg in the correct bucket. + * If agg->S >= grp->S we don't need to adjust the + * bucket list and simply go to the insertion phase. + * Otherwise grp->S is decreasing, we must make room + * in the bucket list, and also recompute the group state. + * Finally, if there were no flows in this group and nobody + * was in ER make sure to adjust V. + */ + if (grp->full_slots) { + if (!qfq_gt(grp->S, agg->S)) + goto skip_update; + + /* create a slot for this agg->S */ + qfq_slot_rotate(grp, roundedS); + /* group was surely ineligible, remove */ + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[IB]); + } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) && + q->in_serv_agg == NULL) + q->V = roundedS; + + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + + pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n", + s, q->bitmaps[s], + (unsigned long long) agg->S, + (unsigned long long) agg->F, + (unsigned long long) q->V); + +skip_update: + qfq_slot_insert(grp, agg, roundedS); +} + + +/* Update agg ts and schedule agg for service */ +static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg, + enum update_reason reason) +{ + agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */ + + qfq_update_agg_ts(q, agg, reason); + if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */ + q->in_serv_agg = agg; /* start serving this aggregate */ + /* update V: to be in service, agg must be eligible */ + q->oldV = q->V = agg->S; + } else if (agg != q->in_serv_agg) + qfq_schedule_agg(q, agg); +} + +static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, + struct qfq_aggregate *agg) +{ + unsigned int i, offset; + u64 roundedS; + + roundedS = qfq_round_down(agg->S, grp->slot_shift); + offset = (roundedS - grp->S) >> grp->slot_shift; + + i = (grp->front + offset) % QFQ_MAX_SLOTS; + + hlist_del(&agg->next); + if (hlist_empty(&grp->slots[i])) + __clear_bit(offset, &grp->full_slots); +} + +/* + * Called to forcibly deschedule an aggregate. If the aggregate is + * not in the front bucket, or if the latter has other aggregates in + * the front bucket, we can simply remove the aggregate with no other + * side effects. + * Otherwise we must propagate the event up. 
+ */ +static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg) +{ + struct qfq_group *grp = agg->grp; + unsigned long mask; + u64 roundedS; + int s; + + if (agg == q->in_serv_agg) { + charge_actual_service(agg); + q->in_serv_agg = qfq_choose_next_agg(q); + return; + } + + agg->F = agg->S; + qfq_slot_remove(q, grp, agg); + + if (!grp->full_slots) { + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[EB]); + __clear_bit(grp->index, &q->bitmaps[IB]); + + if (test_bit(grp->index, &q->bitmaps[ER]) && + !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) { + mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); + if (mask) + mask = ~((1UL << __fls(mask)) - 1); + else + mask = ~0UL; + qfq_move_groups(q, mask, EB, ER); + qfq_move_groups(q, mask, IB, IR); + } + __clear_bit(grp->index, &q->bitmaps[ER]); + } else if (hlist_empty(&grp->slots[grp->front])) { + agg = qfq_slot_scan(grp); + roundedS = qfq_round_down(agg->S, grp->slot_shift); + if (grp->S != roundedS) { + __clear_bit(grp->index, &q->bitmaps[ER]); + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[EB]); + __clear_bit(grp->index, &q->bitmaps[IB]); + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + } + } +} + +static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl = (struct qfq_class *)arg; + + if (cl->qdisc->q.qlen == 0) + qfq_deactivate_class(q, cl); +} + +static unsigned int qfq_drop_from_slot(struct qfq_sched *q, + struct hlist_head *slot) +{ + struct qfq_aggregate *agg; + struct qfq_class *cl; + unsigned int len; + + hlist_for_each_entry(agg, slot, next) { + list_for_each_entry(cl, &agg->active, alist) { + + if (!cl->qdisc->ops->drop) + continue; + + len = cl->qdisc->ops->drop(cl->qdisc); + if (len > 0) { + if (cl->qdisc->q.qlen == 0) + qfq_deactivate_class(q, cl); + + return len; + } + } + } + return 0; +} + +static unsigned int qfq_drop(struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_group *grp; + unsigned int i, j, len; + + for (i = 0; i <= QFQ_MAX_INDEX; i++) { + grp = &q->groups[i]; + for (j = 0; j < QFQ_MAX_SLOTS; j++) { + len = qfq_drop_from_slot(q, &grp->slots[j]); + if (len > 0) { + sch->q.qlen--; + return len; + } + } + + } + + return 0; +} + +static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_group *grp; + int i, j, err; + u32 max_cl_shift, maxbudg_shift, max_classes; + + err = qdisc_class_hash_init(&q->clhash); + if (err < 0) + return err; + + if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES) + max_classes = QFQ_MAX_AGG_CLASSES; + else + max_classes = qdisc_dev(sch)->tx_queue_len + 1; + /* max_cl_shift = floor(log_2(max_classes)) */ + max_cl_shift = __fls(max_classes); + q->max_agg_classes = 1<<max_cl_shift; + + /* maxbudg_shift = log2(max_len * max_classes_per_agg) */ + maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift; + q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX; + + for (i = 0; i <= QFQ_MAX_INDEX; i++) { + grp = &q->groups[i]; + grp->index = i; + grp->slot_shift = q->min_slot_shift + i; + for (j = 0; j < QFQ_MAX_SLOTS; j++) + INIT_HLIST_HEAD(&grp->slots[j]); + } + + INIT_HLIST_HEAD(&q->nonfull_aggs); + + return 0; +} + +static void qfq_reset_qdisc(struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl; + unsigned int i; + + for 
(i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { + if (cl->qdisc->q.qlen > 0) + qfq_deactivate_class(q, cl); + + qdisc_reset(cl->qdisc); + } + } + sch->q.qlen = 0; +} + +static void qfq_destroy_qdisc(struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl; + struct hlist_node *next; + unsigned int i; + + tcf_destroy_chain(&q->filter_list); + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], + common.hnode) { + qfq_destroy_class(sch, cl); + } + } + qdisc_class_hash_destroy(&q->clhash); +} + +static const struct Qdisc_class_ops qfq_class_ops = { + .change = qfq_change_class, + .delete = qfq_delete_class, + .get = qfq_get_class, + .put = qfq_put_class, + .tcf_chain = qfq_tcf_chain, + .bind_tcf = qfq_bind_tcf, + .unbind_tcf = qfq_unbind_tcf, + .graft = qfq_graft_class, + .leaf = qfq_class_leaf, + .qlen_notify = qfq_qlen_notify, + .dump = qfq_dump_class, + .dump_stats = qfq_dump_class_stats, + .walk = qfq_walk, +}; + +static struct Qdisc_ops qfq_qdisc_ops __read_mostly = { + .cl_ops = &qfq_class_ops, + .id = "qfq", + .priv_size = sizeof(struct qfq_sched), + .enqueue = qfq_enqueue, + .dequeue = qfq_dequeue, + .peek = qdisc_peek_dequeued, + .drop = qfq_drop, + .init = qfq_init_qdisc, + .reset = qfq_reset_qdisc, + .destroy = qfq_destroy_qdisc, + .owner = THIS_MODULE, +}; + +static int __init qfq_init(void) +{ + return register_qdisc(&qfq_qdisc_ops); +} + +static void __exit qfq_exit(void) +{ + unregister_qdisc(&qfq_qdisc_ops); +} + +module_init(qfq_init); +module_exit(qfq_exit); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index dccfa44c2d7..633e32defdc 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -14,11 +14,9 @@ * J Hadi Salim 980816: ECN support */ -#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> @@ -38,12 +36,14 @@ if RED works correctly. 
*/ -struct red_sched_data -{ +struct red_sched_data { u32 limit; /* HARD maximal queue length */ unsigned char flags; + struct timer_list adapt_timer; struct red_parms parms; + struct red_vars vars; struct red_stats stats; + struct Qdisc *qdisc; }; static inline int red_use_ecn(struct red_sched_data *q) @@ -56,143 +56,208 @@ static inline int red_use_harddrop(struct red_sched_data *q) return q->flags & TC_RED_HARDDROP; } -static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); - - q->parms.qavg = red_calc_qavg(&q->parms, sch->qstats.backlog); - - if (red_is_idling(&q->parms)) - red_end_of_idle_period(&q->parms); - - switch (red_action(&q->parms, q->parms.qavg)) { - case RED_DONT_MARK: - break; - - case RED_PROB_MARK: - sch->qstats.overlimits++; - if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { - q->stats.prob_drop++; - goto congestion_drop; - } - - q->stats.prob_mark++; - break; - - case RED_HARD_MARK: - sch->qstats.overlimits++; - if (red_use_harddrop(q) || !red_use_ecn(q) || - !INET_ECN_set_ce(skb)) { - q->stats.forced_drop++; - goto congestion_drop; - } - - q->stats.forced_mark++; - break; + struct Qdisc *child = q->qdisc; + int ret; + + q->vars.qavg = red_calc_qavg(&q->parms, + &q->vars, + child->qstats.backlog); + + if (red_is_idling(&q->vars)) + red_end_of_idle_period(&q->vars); + + switch (red_action(&q->parms, &q->vars, q->vars.qavg)) { + case RED_DONT_MARK: + break; + + case RED_PROB_MARK: + sch->qstats.overlimits++; + if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { + q->stats.prob_drop++; + goto congestion_drop; + } + + q->stats.prob_mark++; + break; + + case RED_HARD_MARK: + sch->qstats.overlimits++; + if (red_use_harddrop(q) || !red_use_ecn(q) || + !INET_ECN_set_ce(skb)) { + q->stats.forced_drop++; + goto congestion_drop; + } + + q->stats.forced_mark++; + break; } - if (sch->qstats.backlog + skb->len <= q->limit) - return qdisc_enqueue_tail(skb, sch); - - q->stats.pdrop++; - return qdisc_drop(skb, sch); + ret = qdisc_enqueue(skb, child); + if (likely(ret == NET_XMIT_SUCCESS)) { + sch->q.qlen++; + } else if (net_xmit_drop_count(ret)) { + q->stats.pdrop++; + sch->qstats.drops++; + } + return ret; congestion_drop: qdisc_drop(skb, sch); return NET_XMIT_CN; } -static int red_requeue(struct sk_buff *skb, struct Qdisc* sch) +static struct sk_buff *red_dequeue(struct Qdisc *sch) { + struct sk_buff *skb; struct red_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; - if (red_is_idling(&q->parms)) - red_end_of_idle_period(&q->parms); - - return qdisc_requeue(skb, sch); + skb = child->dequeue(child); + if (skb) { + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + } else { + if (!red_is_idling(&q->vars)) + red_start_of_idle_period(&q->vars); + } + return skb; } -static struct sk_buff * red_dequeue(struct Qdisc* sch) +static struct sk_buff *red_peek(struct Qdisc *sch) { - struct sk_buff *skb; struct red_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; - skb = qdisc_dequeue_head(sch); - - if (skb == NULL && !red_is_idling(&q->parms)) - red_start_of_idle_period(&q->parms); - - return skb; + return child->ops->peek(child); } -static unsigned int red_drop(struct Qdisc* sch) +static unsigned int red_drop(struct Qdisc *sch) { - struct sk_buff *skb; struct red_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + unsigned int len; - skb = qdisc_dequeue_tail(sch); - if (skb) { - unsigned int len = skb->len; + if (child->ops->drop && 
(len = child->ops->drop(child)) > 0) { q->stats.other++; - qdisc_drop(skb, sch); + sch->qstats.drops++; + sch->q.qlen--; return len; } - if (!red_is_idling(&q->parms)) - red_start_of_idle_period(&q->parms); + if (!red_is_idling(&q->vars)) + red_start_of_idle_period(&q->vars); return 0; } -static void red_reset(struct Qdisc* sch) +static void red_reset(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); - qdisc_reset_queue(sch); - red_restart(&q->parms); + qdisc_reset(q->qdisc); + sch->q.qlen = 0; + red_restart(&q->vars); } -static int red_change(struct Qdisc *sch, struct rtattr *opt) +static void red_destroy(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); - struct rtattr *tb[TCA_RED_MAX]; + + del_timer_sync(&q->adapt_timer); + qdisc_destroy(q->qdisc); +} + +static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { + [TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) }, + [TCA_RED_STAB] = { .len = RED_STAB_SIZE }, + [TCA_RED_MAX_P] = { .type = NLA_U32 }, +}; + +static int red_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct red_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_RED_MAX + 1]; struct tc_red_qopt *ctl; + struct Qdisc *child = NULL; + int err; + u32 max_P; - if (opt == NULL || rtattr_parse_nested(tb, TCA_RED_MAX, opt)) + if (opt == NULL) return -EINVAL; - if (tb[TCA_RED_PARMS-1] == NULL || - RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) || - tb[TCA_RED_STAB-1] == NULL || - RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < RED_STAB_SIZE) + err = nla_parse_nested(tb, TCA_RED_MAX, opt, red_policy); + if (err < 0) + return err; + + if (tb[TCA_RED_PARMS] == NULL || + tb[TCA_RED_STAB] == NULL) return -EINVAL; - ctl = RTA_DATA(tb[TCA_RED_PARMS-1]); + max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0; + + ctl = nla_data(tb[TCA_RED_PARMS]); + + if (ctl->limit > 0) { + child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit); + if (IS_ERR(child)) + return PTR_ERR(child); + } sch_tree_lock(sch); q->flags = ctl->flags; q->limit = ctl->limit; + if (child) { + qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_destroy(q->qdisc); + q->qdisc = child; + } + + red_set_parms(&q->parms, + ctl->qth_min, ctl->qth_max, ctl->Wlog, + ctl->Plog, ctl->Scell_log, + nla_data(tb[TCA_RED_STAB]), + max_P); + red_set_vars(&q->vars); - red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, - ctl->Plog, ctl->Scell_log, - RTA_DATA(tb[TCA_RED_STAB-1])); + del_timer(&q->adapt_timer); + if (ctl->flags & TC_RED_ADAPTATIVE) + mod_timer(&q->adapt_timer, jiffies + HZ/2); - if (skb_queue_empty(&sch->q)) - red_end_of_idle_period(&q->parms); + if (!q->qdisc->q.qlen) + red_start_of_idle_period(&q->vars); sch_tree_unlock(sch); return 0; } -static int red_init(struct Qdisc* sch, struct rtattr *opt) +static inline void red_adaptative_timer(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc *)arg; + struct red_sched_data *q = qdisc_priv(sch); + spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + + spin_lock(root_lock); + red_adaptative_algo(&q->parms, &q->vars); + mod_timer(&q->adapt_timer, jiffies + HZ/2); + spin_unlock(root_lock); +} + +static int red_init(struct Qdisc *sch, struct nlattr *opt) { + struct red_sched_data *q = qdisc_priv(sch); + + q->qdisc = &noop_qdisc; + setup_timer(&q->adapt_timer, red_adaptative_timer, (unsigned long)sch); return red_change(sch, opt); } static int red_dump(struct Qdisc *sch, struct sk_buff *skb) { struct red_sched_data *q = qdisc_priv(sch); - struct rtattr *opts = NULL; + struct nlattr *opts = 
NULL; struct tc_red_qopt opt = { .limit = q->limit, .flags = q->flags, @@ -203,12 +268,18 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) .Scell_log = q->parms.Scell_log, }; - opts = RTA_NEST(skb, TCA_OPTIONS); - RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); - return RTA_NEST_END(skb, opts); - -rtattr_failure: - return RTA_NEST_CANCEL(skb, opts); + sch->qstats.backlog = q->qdisc->qstats.backlog; + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || + nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P)) + goto nla_put_failure; + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; } static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) @@ -224,15 +295,80 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) return gnet_stats_copy_app(d, &st, sizeof(st)); } -static struct Qdisc_ops red_qdisc_ops = { +static int red_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct red_sched_data *q = qdisc_priv(sch); + + tcm->tcm_handle |= TC_H_MIN(1); + tcm->tcm_info = q->qdisc->handle; + return 0; +} + +static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct red_sched_data *q = qdisc_priv(sch); + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->qdisc; + q->qdisc = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc *red_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct red_sched_data *q = qdisc_priv(sch); + return q->qdisc; +} + +static unsigned long red_get(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void red_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static void red_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + walker->count++; + } +} + +static const struct Qdisc_class_ops red_class_ops = { + .graft = red_graft, + .leaf = red_leaf, + .get = red_get, + .put = red_put, + .walk = red_walk, + .dump = red_dump_class, +}; + +static struct Qdisc_ops red_qdisc_ops __read_mostly = { .id = "red", .priv_size = sizeof(struct red_sched_data), + .cl_ops = &red_class_ops, .enqueue = red_enqueue, .dequeue = red_dequeue, - .requeue = red_requeue, + .peek = red_peek, .drop = red_drop, .init = red_init, .reset = red_reset, + .destroy = red_destroy, .change = red_change, .dump = red_dump, .dump_stats = red_dump_stats, diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c new file mode 100644 index 00000000000..9b0f7093d97 --- /dev/null +++ b/net/sched/sch_sfb.c @@ -0,0 +1,725 @@ +/* + * net/sched/sch_sfb.c Stochastic Fair Blue + * + * Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr> + * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * W. Feng, D. Kandlur, D. Saha, K. Shin. Blue: + * A New Class of Active Queue Management Algorithms. + * U. Michigan CSE-TR-387-99, April 1999. 
+ * + * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <net/ip.h> +#include <net/pkt_sched.h> +#include <net/inet_ecn.h> +#include <net/flow_keys.h> + +/* + * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level) + * This implementation uses L = 8 and N = 16 + * This permits us to split one 32bit hash (provided per packet by rxhash or + * external classifier) into 8 subhashes of 4 bits. + */ +#define SFB_BUCKET_SHIFT 4 +#define SFB_NUMBUCKETS (1 << SFB_BUCKET_SHIFT) /* N bins per Level */ +#define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1) +#define SFB_LEVELS (32 / SFB_BUCKET_SHIFT) /* L */ + +/* SFB algo uses a virtual queue, named "bin" */ +struct sfb_bucket { + u16 qlen; /* length of virtual queue */ + u16 p_mark; /* marking probability */ +}; + +/* We use a double buffering right before hash change + * (Section 4.4 of SFB reference : moving hash functions) + */ +struct sfb_bins { + u32 perturbation; /* jhash perturbation */ + struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS]; +}; + +struct sfb_sched_data { + struct Qdisc *qdisc; + struct tcf_proto *filter_list; + unsigned long rehash_interval; + unsigned long warmup_time; /* double buffering warmup time in jiffies */ + u32 max; + u32 bin_size; /* maximum queue length per bin */ + u32 increment; /* d1 */ + u32 decrement; /* d2 */ + u32 limit; /* HARD maximal queue length */ + u32 penalty_rate; + u32 penalty_burst; + u32 tokens_avail; + unsigned long rehash_time; + unsigned long token_time; + + u8 slot; /* current active bins (0 or 1) */ + bool double_buffering; + struct sfb_bins bins[2]; + + struct { + u32 earlydrop; + u32 penaltydrop; + u32 bucketdrop; + u32 queuedrop; + u32 childdrop; /* drops in child qdisc */ + u32 marked; /* ECN mark */ + } stats; +}; + +/* + * Each queued skb might be hashed on one or two bins + * We store in skb_cb the two hash values. + * (A zero value means double buffering was not used) + */ +struct sfb_skb_cb { + u32 hashes[2]; +}; + +static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb)); + return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data; +} + +/* + * If using 'internal' SFB flow classifier, hash comes from skb rxhash + * If using external classifier, hash comes from the classid. + */ +static u32 sfb_hash(const struct sk_buff *skb, u32 slot) +{ + return sfb_skb_cb(skb)->hashes[slot]; +} + +/* Probabilities are coded as Q0.16 fixed-point values, + * with 0xFFFF representing 65535/65536 (almost 1.0) + * Addition and subtraction are saturating in [0, 65535] + */ +static u32 prob_plus(u32 p1, u32 p2) +{ + u32 res = p1 + p2; + + return min_t(u32, res, SFB_MAX_PROB); +} + +static u32 prob_minus(u32 p1, u32 p2) +{ + return p1 > p2 ? 
p1 - p2 : 0; +} + +static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) +{ + int i; + struct sfb_bucket *b = &q->bins[slot].bins[0][0]; + + for (i = 0; i < SFB_LEVELS; i++) { + u32 hash = sfbhash & SFB_BUCKET_MASK; + + sfbhash >>= SFB_BUCKET_SHIFT; + if (b[hash].qlen < 0xFFFF) + b[hash].qlen++; + b += SFB_NUMBUCKETS; /* next level */ + } +} + +static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) +{ + u32 sfbhash; + + sfbhash = sfb_hash(skb, 0); + if (sfbhash) + increment_one_qlen(sfbhash, 0, q); + + sfbhash = sfb_hash(skb, 1); + if (sfbhash) + increment_one_qlen(sfbhash, 1, q); +} + +static void decrement_one_qlen(u32 sfbhash, u32 slot, + struct sfb_sched_data *q) +{ + int i; + struct sfb_bucket *b = &q->bins[slot].bins[0][0]; + + for (i = 0; i < SFB_LEVELS; i++) { + u32 hash = sfbhash & SFB_BUCKET_MASK; + + sfbhash >>= SFB_BUCKET_SHIFT; + if (b[hash].qlen > 0) + b[hash].qlen--; + b += SFB_NUMBUCKETS; /* next level */ + } +} + +static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) +{ + u32 sfbhash; + + sfbhash = sfb_hash(skb, 0); + if (sfbhash) + decrement_one_qlen(sfbhash, 0, q); + + sfbhash = sfb_hash(skb, 1); + if (sfbhash) + decrement_one_qlen(sfbhash, 1, q); +} + +static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q) +{ + b->p_mark = prob_minus(b->p_mark, q->decrement); +} + +static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q) +{ + b->p_mark = prob_plus(b->p_mark, q->increment); +} + +static void sfb_zero_all_buckets(struct sfb_sched_data *q) +{ + memset(&q->bins, 0, sizeof(q->bins)); +} + +/* + * compute max qlen, max p_mark, and avg p_mark + */ +static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q) +{ + int i; + u32 qlen = 0, prob = 0, totalpm = 0; + const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0]; + + for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) { + if (qlen < b->qlen) + qlen = b->qlen; + totalpm += b->p_mark; + if (prob < b->p_mark) + prob = b->p_mark; + b++; + } + *prob_r = prob; + *avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS); + return qlen; +} + + +static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q) +{ + q->bins[slot].perturbation = prandom_u32(); +} + +static void sfb_swap_slot(struct sfb_sched_data *q) +{ + sfb_init_perturbation(q->slot, q); + q->slot ^= 1; + q->double_buffering = false; +} + +/* Non elastic flows are allowed to use part of the bandwidth, expressed + * in "penalty_rate" packets per second, with "penalty_burst" burst + */ +static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q) +{ + if (q->penalty_rate == 0 || q->penalty_burst == 0) + return true; + + if (q->tokens_avail < 1) { + unsigned long age = min(10UL * HZ, jiffies - q->token_time); + + q->tokens_avail = (age * q->penalty_rate) / HZ; + if (q->tokens_avail > q->penalty_burst) + q->tokens_avail = q->penalty_burst; + q->token_time = jiffies; + if (q->tokens_avail < 1) + return true; + } + + q->tokens_avail--; + return false; +} + +static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q, + int *qerr, u32 *salt) +{ + struct tcf_result res; + int result; + + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return false; + } +#endif + *salt = TC_H_MIN(res.classid); + return true; + } + return false; +} + 
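/*
 * Illustrative user-space sketch (not part of this patch): a minimal model
 * of the per-level bin lookup that sfb_enqueue() performs below.  It assumes
 * the same constants as above (SFB_LEVELS = 8, SFB_BUCKET_SHIFT = 4), and
 * DEMO_MAX_PROB mirrors the Q0.16 cap used for p_mark (0xFFFF, almost 1.0).
 * All "demo_" names are invented for the example; build with any hosted C
 * compiler.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_LEVELS		8	/* 32-bit hash / 4 bits per level */
#define DEMO_BUCKET_SHIFT	4
#define DEMO_NUMBUCKETS		(1 << DEMO_BUCKET_SHIFT)
#define DEMO_BUCKET_MASK	(DEMO_NUMBUCKETS - 1)
#define DEMO_MAX_PROB		0xFFFFu	/* Q0.16: 0xFFFF is almost 1.0 */

struct demo_bucket {
	uint16_t qlen;		/* virtual queue length */
	uint16_t p_mark;	/* marking probability, Q0.16 */
};

/* Walk one bucket per level and report the minima, as sfb_enqueue() does. */
static void demo_lookup(struct demo_bucket bins[DEMO_LEVELS][DEMO_NUMBUCKETS],
			uint32_t hash, uint32_t *min_qlen, uint32_t *min_pmark)
{
	uint32_t qlen = ~0u, pmark = DEMO_MAX_PROB;
	int level;

	for (level = 0; level < DEMO_LEVELS; level++) {
		struct demo_bucket *b = &bins[level][hash & DEMO_BUCKET_MASK];

		hash >>= DEMO_BUCKET_SHIFT;	/* move on to the next 4-bit subhash */
		if (b->qlen < qlen)
			qlen = b->qlen;
		if (b->p_mark < pmark)
			pmark = b->p_mark;
	}
	*min_qlen = qlen;
	*min_pmark = pmark;
}

int main(void)
{
	static struct demo_bucket bins[DEMO_LEVELS][DEMO_NUMBUCKETS];
	uint32_t hash = 0x12345675u;
	uint32_t qlen, pmark;
	int level;

	/* A flow only looks "big" if every one of its buckets is loaded:
	 * load the matching bucket at each level so the minimum rises.
	 */
	for (level = 0; level < DEMO_LEVELS; level++)
		bins[level][(hash >> (level * DEMO_BUCKET_SHIFT)) & DEMO_BUCKET_MASK].qlen = 7;

	demo_lookup(bins, hash, &qlen, &pmark);
	printf("min qlen = %u, min p_mark = %u/65536\n", qlen, pmark);
	return 0;
}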
+static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + + struct sfb_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + int i; + u32 p_min = ~0; + u32 minqlen = ~0; + u32 r, slot, salt, sfbhash; + int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + struct flow_keys keys; + + if (unlikely(sch->q.qlen >= q->limit)) { + sch->qstats.overlimits++; + q->stats.queuedrop++; + goto drop; + } + + if (q->rehash_interval > 0) { + unsigned long limit = q->rehash_time + q->rehash_interval; + + if (unlikely(time_after(jiffies, limit))) { + sfb_swap_slot(q); + q->rehash_time = jiffies; + } else if (unlikely(!q->double_buffering && q->warmup_time > 0 && + time_after(jiffies, limit - q->warmup_time))) { + q->double_buffering = true; + } + } + + if (q->filter_list) { + /* If using external classifiers, get result and record it. */ + if (!sfb_classify(skb, q, &ret, &salt)) + goto other_drop; + keys.src = salt; + keys.dst = 0; + keys.ports = 0; + } else { + skb_flow_dissect(skb, &keys); + } + + slot = q->slot; + + sfbhash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src, + (__force u32)keys.ports, + q->bins[slot].perturbation); + if (!sfbhash) + sfbhash = 1; + sfb_skb_cb(skb)->hashes[slot] = sfbhash; + + for (i = 0; i < SFB_LEVELS; i++) { + u32 hash = sfbhash & SFB_BUCKET_MASK; + struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; + + sfbhash >>= SFB_BUCKET_SHIFT; + if (b->qlen == 0) + decrement_prob(b, q); + else if (b->qlen >= q->bin_size) + increment_prob(b, q); + if (minqlen > b->qlen) + minqlen = b->qlen; + if (p_min > b->p_mark) + p_min = b->p_mark; + } + + slot ^= 1; + sfb_skb_cb(skb)->hashes[slot] = 0; + + if (unlikely(minqlen >= q->max)) { + sch->qstats.overlimits++; + q->stats.bucketdrop++; + goto drop; + } + + if (unlikely(p_min >= SFB_MAX_PROB)) { + /* Inelastic flow */ + if (q->double_buffering) { + sfbhash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src, + (__force u32)keys.ports, + q->bins[slot].perturbation); + if (!sfbhash) + sfbhash = 1; + sfb_skb_cb(skb)->hashes[slot] = sfbhash; + + for (i = 0; i < SFB_LEVELS; i++) { + u32 hash = sfbhash & SFB_BUCKET_MASK; + struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; + + sfbhash >>= SFB_BUCKET_SHIFT; + if (b->qlen == 0) + decrement_prob(b, q); + else if (b->qlen >= q->bin_size) + increment_prob(b, q); + } + } + if (sfb_rate_limit(skb, q)) { + sch->qstats.overlimits++; + q->stats.penaltydrop++; + goto drop; + } + goto enqueue; + } + + r = prandom_u32() & SFB_MAX_PROB; + + if (unlikely(r < p_min)) { + if (unlikely(p_min > SFB_MAX_PROB / 2)) { + /* If we're marking that many packets, then either + * this flow is unresponsive, or we're badly congested. + * In either case, we want to start dropping packets. 
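 * (Editorial note: with r drawn uniformly from [0, SFB_MAX_PROB], the test
 * below drops with probability 2 * (p_min - 1/2), i.e. the drop rate ramps
 * linearly from 0 at p_min = 1/2 up to ~1 when p_min saturates.)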
+ */ + if (r < (p_min - SFB_MAX_PROB / 2) * 2) { + q->stats.earlydrop++; + goto drop; + } + } + if (INET_ECN_set_ce(skb)) { + q->stats.marked++; + } else { + q->stats.earlydrop++; + goto drop; + } + } + +enqueue: + ret = qdisc_enqueue(skb, child); + if (likely(ret == NET_XMIT_SUCCESS)) { + sch->q.qlen++; + increment_qlen(skb, q); + } else if (net_xmit_drop_count(ret)) { + q->stats.childdrop++; + sch->qstats.drops++; + } + return ret; + +drop: + qdisc_drop(skb, sch); + return NET_XMIT_CN; +other_drop: + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; +} + +static struct sk_buff *sfb_dequeue(struct Qdisc *sch) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + struct sk_buff *skb; + + skb = child->dequeue(q->qdisc); + + if (skb) { + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + decrement_qlen(skb, q); + } + + return skb; +} + +static struct sk_buff *sfb_peek(struct Qdisc *sch) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + + return child->ops->peek(child); +} + +/* No sfb_drop -- impossible since the child doesn't return the dropped skb. */ + +static void sfb_reset(struct Qdisc *sch) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + qdisc_reset(q->qdisc); + sch->q.qlen = 0; + q->slot = 0; + q->double_buffering = false; + sfb_zero_all_buckets(q); + sfb_init_perturbation(0, q); +} + +static void sfb_destroy(struct Qdisc *sch) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + qdisc_destroy(q->qdisc); +} + +static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = { + [TCA_SFB_PARMS] = { .len = sizeof(struct tc_sfb_qopt) }, +}; + +static const struct tc_sfb_qopt sfb_default_ops = { + .rehash_interval = 600 * MSEC_PER_SEC, + .warmup_time = 60 * MSEC_PER_SEC, + .limit = 0, + .max = 25, + .bin_size = 20, + .increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */ + .decrement = (SFB_MAX_PROB + 3000) / 6000, + .penalty_rate = 10, + .penalty_burst = 20, +}; + +static int sfb_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct Qdisc *child; + struct nlattr *tb[TCA_SFB_MAX + 1]; + const struct tc_sfb_qopt *ctl = &sfb_default_ops; + u32 limit; + int err; + + if (opt) { + err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy); + if (err < 0) + return -EINVAL; + + if (tb[TCA_SFB_PARMS] == NULL) + return -EINVAL; + + ctl = nla_data(tb[TCA_SFB_PARMS]); + } + + limit = ctl->limit; + if (limit == 0) + limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1); + + child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit); + if (IS_ERR(child)) + return PTR_ERR(child); + + sch_tree_lock(sch); + + qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_destroy(q->qdisc); + q->qdisc = child; + + q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval); + q->warmup_time = msecs_to_jiffies(ctl->warmup_time); + q->rehash_time = jiffies; + q->limit = limit; + q->increment = ctl->increment; + q->decrement = ctl->decrement; + q->max = ctl->max; + q->bin_size = ctl->bin_size; + q->penalty_rate = ctl->penalty_rate; + q->penalty_burst = ctl->penalty_burst; + q->tokens_avail = ctl->penalty_burst; + q->token_time = jiffies; + + q->slot = 0; + q->double_buffering = false; + sfb_zero_all_buckets(q); + sfb_init_perturbation(0, q); + sfb_init_perturbation(1, q); + + sch_tree_unlock(sch); + + return 0; +} + +static int sfb_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + 
q->qdisc = &noop_qdisc; + return sfb_change(sch, opt); +} + +static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + struct tc_sfb_qopt opt = { + .rehash_interval = jiffies_to_msecs(q->rehash_interval), + .warmup_time = jiffies_to_msecs(q->warmup_time), + .limit = q->limit, + .max = q->max, + .bin_size = q->bin_size, + .increment = q->increment, + .decrement = q->decrement, + .penalty_rate = q->penalty_rate, + .penalty_burst = q->penalty_burst, + }; + + sch->qstats.backlog = q->qdisc->qstats.backlog; + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + if (nla_put(skb, TCA_SFB_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct tc_sfb_xstats st = { + .earlydrop = q->stats.earlydrop, + .penaltydrop = q->stats.penaltydrop, + .bucketdrop = q->stats.bucketdrop, + .queuedrop = q->stats.queuedrop, + .childdrop = q->stats.childdrop, + .marked = q->stats.marked, + }; + + st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q); + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static int sfb_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + return -ENOSYS; +} + +static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->qdisc; + q->qdisc = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + return q->qdisc; +} + +static unsigned long sfb_get(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void sfb_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct nlattr **tca, unsigned long *arg) +{ + return -ENOSYS; +} + +static int sfb_delete(struct Qdisc *sch, unsigned long cl) +{ + return -ENOSYS; +} + +static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + walker->count++; + } +} + +static struct tcf_proto **sfb_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return 0; +} + + +static const struct Qdisc_class_ops sfb_class_ops = { + .graft = sfb_graft, + .leaf = sfb_leaf, + .get = sfb_get, + .put = sfb_put, + .change = sfb_change_class, + .delete = sfb_delete, + .walk = sfb_walk, + .tcf_chain = sfb_find_tcf, + .bind_tcf = sfb_bind, + .unbind_tcf = sfb_put, + .dump = sfb_dump_class, +}; + +static struct Qdisc_ops sfb_qdisc_ops __read_mostly = { + .id = "sfb", + .priv_size = sizeof(struct sfb_sched_data), + .cl_ops = &sfb_class_ops, + .enqueue = sfb_enqueue, + .dequeue = sfb_dequeue, + .peek = sfb_peek, + .init = sfb_init, + .reset = sfb_reset, + .destroy = sfb_destroy, + .change = sfb_change, + .dump = sfb_dump, + .dump_stats = 
sfb_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init sfb_module_init(void) +{ + return register_qdisc(&sfb_qdisc_ops); +} + +static void __exit sfb_module_exit(void) +{ + unregister_qdisc(&sfb_qdisc_ops); +} + +module_init(sfb_module_init) +module_exit(sfb_module_exit) + +MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline"); +MODULE_AUTHOR("Juliusz Chroboczek"); +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 86d8da0cbd0..1af2f73906d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -9,33 +9,22 @@ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ -#include <linux/config.h> #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> #include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> #include <linux/init.h> -#include <net/ip.h> -#include <linux/ipv6.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <net/netlink.h> #include <net/pkt_sched.h> +#include <net/flow_keys.h> +#include <net/red.h> /* Stochastic Fairness Queuing algorithm. @@ -54,7 +43,7 @@ Queuing using Deficit Round Robin", Proc. SIGCOMM 95. - This is not the thing that is usually called (W)FQ nowadays. + This is not the thing that is usually called (W)FQ nowadays. It does not use any timestamp mechanism, but instead processes queues in round-robin order. @@ -64,7 +53,7 @@ DRAWBACKS: - - "Stochastic" -> It is not 100% fair. + - "Stochastic" -> It is not 100% fair. When hash collisions occur, several flows are considered as one. - "Round-robin" -> It introduces larger delays than virtual clock @@ -78,122 +67,194 @@ SFQ is superior for this purpose. IMPLEMENTATION: - This implementation limits maximal queue length to 128; - maximal mtu to 2^15-1; number of hash buckets to 1024. - The only goal of this restrictions was that all data - fit into one 4K page :-). Struct sfq_sched_data is - organized in anti-cache manner: all the data for a bucket - are scattered over different locations. This is not good, - but it allowed me to put it into 4K. + This implementation limits : + - maximal queue length per flow to 127 packets. + - max mtu to 2^18-1; + - max 65408 flows, + - number of hash buckets to 65536. It is easy to increase these values, but not in flight. */ -#define SFQ_DEPTH 128 -#define SFQ_HASH_DIVISOR 1024 +#define SFQ_MAX_DEPTH 127 /* max number of packets per flow */ +#define SFQ_DEFAULT_FLOWS 128 +#define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */ +#define SFQ_EMPTY_SLOT 0xffff +#define SFQ_DEFAULT_HASH_DIVISOR 1024 -/* This type should contain at least SFQ_DEPTH*2 values */ -typedef unsigned char sfq_index; +/* We use 16 bits to store allot, and want to handle packets up to 64K + * Scale allot by 8 (1<<3) so that no overflow occurs. 
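 * (Editorial note: e.g. a 64KB GSO packet charges SFQ_ALLOT_SIZE(65536) =
 * 8192 scaled units against a slot's allot, well within the signed 16-bit
 * field.)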
+ */ +#define SFQ_ALLOT_SHIFT 3 +#define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT) -struct sfq_head -{ +/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */ +typedef u16 sfq_index; + +/* + * We dont use pointers to save space. + * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array + * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH] + * are 'pointers' to dep[] array + */ +struct sfq_head { sfq_index next; sfq_index prev; }; -struct sfq_sched_data -{ -/* Parameters */ - int perturb_period; - unsigned quantum; /* Allotment per round: MUST BE >= MTU */ - int limit; +struct sfq_slot { + struct sk_buff *skblist_next; + struct sk_buff *skblist_prev; + sfq_index qlen; /* number of skbs in skblist */ + sfq_index next; /* next slot in sfq RR chain */ + struct sfq_head dep; /* anchor in dep[] chains */ + unsigned short hash; /* hash value (index in ht[]) */ + short allot; /* credit for this slot */ + + unsigned int backlog; + struct red_vars vars; +}; -/* Variables */ +struct sfq_sched_data { +/* frequently used fields */ + int limit; /* limit of total number of packets in this qdisc */ + unsigned int divisor; /* number of slots in hash table */ + u8 headdrop; + u8 maxdepth; /* limit of packets per flow */ + + u32 perturbation; + u8 cur_depth; /* depth of longest slot */ + u8 flags; + unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ + struct tcf_proto *filter_list; + sfq_index *ht; /* Hash table ('divisor' slots) */ + struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ + + struct red_parms *red_parms; + struct tc_sfqred_stats stats; + struct sfq_slot *tail; /* current slot in round */ + + struct sfq_head dep[SFQ_MAX_DEPTH + 1]; + /* Linked lists of slots, indexed by depth + * dep[0] : list of unused flows + * dep[1] : list of flows with 1 packet + * dep[X] : list of flows with X packets + */ + + unsigned int maxflows; /* number of flows in flows array */ + int perturb_period; + unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ struct timer_list perturb_timer; - int perturbation; - sfq_index tail; /* Index of current slot in round */ - sfq_index max_depth; /* Maximal depth */ - - sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */ - sfq_index next[SFQ_DEPTH]; /* Active slots link */ - short allot[SFQ_DEPTH]; /* Current allotment per slot */ - unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */ - struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */ - struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */ }; -static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1) +/* + * sfq_head are either in a sfq_slot or in dep[] array + */ +static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val) { - int pert = q->perturbation; + if (val < SFQ_MAX_FLOWS) + return &q->slots[val].dep; + return &q->dep[val - SFQ_MAX_FLOWS]; +} - /* Have we any rotation primitives? If not, WHY? 
*/ - h ^= (h1<<pert) ^ (h1>>(0x1F - pert)); - h ^= h>>10; - return h & 0x3FF; +/* + * In order to be able to quickly rehash our queue when timer changes + * q->perturbation, we store flow_keys in skb->cb[] + */ +struct sfq_skb_cb { + struct flow_keys keys; +}; + +static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb)); + return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data; } -static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) +static unsigned int sfq_hash(const struct sfq_sched_data *q, + const struct sk_buff *skb) { - u32 h, h2; + const struct flow_keys *keys = &sfq_skb_cb(skb)->keys; + unsigned int hash; - switch (skb->protocol) { - case __constant_htons(ETH_P_IP): - { - struct iphdr *iph = skb->nh.iph; - h = iph->daddr; - h2 = iph->saddr^iph->protocol; - if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && - (iph->protocol == IPPROTO_TCP || - iph->protocol == IPPROTO_UDP || - iph->protocol == IPPROTO_SCTP || - iph->protocol == IPPROTO_DCCP || - iph->protocol == IPPROTO_ESP)) - h2 ^= *(((u32*)iph) + iph->ihl); - break; - } - case __constant_htons(ETH_P_IPV6): - { - struct ipv6hdr *iph = skb->nh.ipv6h; - h = iph->daddr.s6_addr32[3]; - h2 = iph->saddr.s6_addr32[3]^iph->nexthdr; - if (iph->nexthdr == IPPROTO_TCP || - iph->nexthdr == IPPROTO_UDP || - iph->nexthdr == IPPROTO_SCTP || - iph->nexthdr == IPPROTO_DCCP || - iph->nexthdr == IPPROTO_ESP) - h2 ^= *(u32*)&iph[1]; - break; + hash = jhash_3words((__force u32)keys->dst, + (__force u32)keys->src ^ keys->ip_proto, + (__force u32)keys->ports, q->perturbation); + return hash & (q->divisor - 1); +} + +static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + struct tcf_result res; + int result; + + if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->divisor) + return TC_H_MIN(skb->priority); + + if (!q->filter_list) { + skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys); + return sfq_hash(q, skb) + 1; } - default: - h = (u32)(unsigned long)skb->dst^skb->protocol; - h2 = (u32)(unsigned long)skb->sk; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return 0; + } +#endif + if (TC_H_MIN(res.classid) <= q->divisor) + return TC_H_MIN(res.classid); } - return sfq_fold_hash(q, h, h2); + return 0; } +/* + * x : slot number [0 .. 
SFQ_MAX_FLOWS - 1] + */ static inline void sfq_link(struct sfq_sched_data *q, sfq_index x) { sfq_index p, n; - int d = q->qs[x].qlen + SFQ_DEPTH; + struct sfq_slot *slot = &q->slots[x]; + int qlen = slot->qlen; - p = d; - n = q->dep[d].next; - q->dep[x].next = n; - q->dep[x].prev = p; - q->dep[p].next = q->dep[n].prev = x; + p = qlen + SFQ_MAX_FLOWS; + n = q->dep[qlen].next; + + slot->dep.next = n; + slot->dep.prev = p; + + q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */ + sfq_dep_head(q, n)->prev = x; } +#define sfq_unlink(q, x, n, p) \ + do { \ + n = q->slots[x].dep.next; \ + p = q->slots[x].dep.prev; \ + sfq_dep_head(q, p)->next = n; \ + sfq_dep_head(q, n)->prev = p; \ + } while (0) + + static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x) { sfq_index p, n; + int d; - n = q->dep[x].next; - p = q->dep[x].prev; - q->dep[p].next = n; - q->dep[n].prev = p; - - if (n == p && q->max_depth == q->qs[x].qlen + 1) - q->max_depth--; + sfq_unlink(q, x, n, p); + d = q->slots[x].qlen--; + if (n == p && q->cur_depth == d) + q->cur_depth--; sfq_link(q, x); } @@ -202,169 +263,285 @@ static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x) sfq_index p, n; int d; - n = q->dep[x].next; - p = q->dep[x].prev; - q->dep[p].next = n; - q->dep[n].prev = p; - d = q->qs[x].qlen; - if (q->max_depth < d) - q->max_depth = d; + sfq_unlink(q, x, n, p); + d = ++q->slots[x].qlen; + if (q->cur_depth < d) + q->cur_depth = d; sfq_link(q, x); } +/* helper functions : might be changed when/if skb use a standard list_head */ + +/* remove one skb from tail of slot queue */ +static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot) +{ + struct sk_buff *skb = slot->skblist_prev; + + slot->skblist_prev = skb->prev; + skb->prev->next = (struct sk_buff *)slot; + skb->next = skb->prev = NULL; + return skb; +} + +/* remove one skb from head of slot queue */ +static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot) +{ + struct sk_buff *skb = slot->skblist_next; + + slot->skblist_next = skb->next; + skb->next->prev = (struct sk_buff *)slot; + skb->next = skb->prev = NULL; + return skb; +} + +static inline void slot_queue_init(struct sfq_slot *slot) +{ + memset(slot, 0, sizeof(*slot)); + slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot; +} + +/* add skb to slot queue (tail add) */ +static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb) +{ + skb->prev = slot->skblist_prev; + skb->next = (struct sk_buff *)slot; + slot->skblist_prev->next = skb; + slot->skblist_prev = skb; +} + +#define slot_queue_walk(slot, skb) \ + for (skb = slot->skblist_next; \ + skb != (struct sk_buff *)slot; \ + skb = skb->next) + static unsigned int sfq_drop(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); - sfq_index d = q->max_depth; + sfq_index x, d = q->cur_depth; struct sk_buff *skb; unsigned int len; + struct sfq_slot *slot; - /* Queue is full! Find the longest slot and - drop a packet from it */ - + /* Queue is full! Find the longest slot and drop tail packet from it */ if (d > 1) { - sfq_index x = q->dep[d+SFQ_DEPTH].next; - skb = q->qs[x].prev; - len = skb->len; - __skb_unlink(skb, &q->qs[x]); - kfree_skb(skb); + x = q->dep[d].next; + slot = &q->slots[x]; +drop: + skb = q->headdrop ? 
slot_dequeue_head(slot) : slot_dequeue_tail(slot); + len = qdisc_pkt_len(skb); + slot->backlog -= len; sfq_dec(q, x); + kfree_skb(skb); sch->q.qlen--; sch->qstats.drops++; + sch->qstats.backlog -= len; return len; } if (d == 1) { /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ - d = q->next[q->tail]; - q->next[q->tail] = q->next[d]; - q->allot[q->next[d]] += q->quantum; - skb = q->qs[d].prev; - len = skb->len; - __skb_unlink(skb, &q->qs[d]); - kfree_skb(skb); - sfq_dec(q, d); - sch->q.qlen--; - q->ht[q->hash[d]] = SFQ_DEPTH; - sch->qstats.drops++; - return len; + x = q->tail->next; + slot = &q->slots[x]; + q->tail->next = slot->next; + q->ht[slot->hash] = SFQ_EMPTY_SLOT; + goto drop; } return 0; } +/* Is ECN parameter configured */ +static int sfq_prob_mark(const struct sfq_sched_data *q) +{ + return q->flags & TC_RED_ECN; +} + +/* Should packets over max threshold just be marked */ +static int sfq_hard_mark(const struct sfq_sched_data *q) +{ + return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN; +} + +static int sfq_headdrop(const struct sfq_sched_data *q) +{ + return q->headdrop; +} + static int -sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) +sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); - unsigned hash = sfq_hash(q, skb); - sfq_index x; + unsigned int hash; + sfq_index x, qlen; + struct sfq_slot *slot; + int uninitialized_var(ret); + struct sk_buff *head; + int delta; + + hash = sfq_classify(skb, sch, &ret); + if (hash == 0) { + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } + hash--; x = q->ht[hash]; - if (x == SFQ_DEPTH) { - q->ht[hash] = x = q->dep[SFQ_DEPTH].next; - q->hash[x] = hash; + slot = &q->slots[x]; + if (x == SFQ_EMPTY_SLOT) { + x = q->dep[0].next; /* get a free slot */ + if (x >= SFQ_MAX_FLOWS) + return qdisc_drop(skb, sch); + q->ht[hash] = x; + slot = &q->slots[x]; + slot->hash = hash; + slot->backlog = 0; /* should already be 0 anyway... 
*/ + red_set_vars(&slot->vars); + goto enqueue; } - __skb_queue_tail(&q->qs[x], skb); - sfq_inc(q, x); - if (q->qs[x].qlen == 1) { /* The flow is new */ - if (q->tail == SFQ_DEPTH) { /* It is the first flow */ - q->tail = x; - q->next[x] = x; - q->allot[x] = q->quantum; - } else { - q->next[x] = q->next[q->tail]; - q->next[q->tail] = x; - q->tail = x; + if (q->red_parms) { + slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms, + &slot->vars, + slot->backlog); + switch (red_action(q->red_parms, + &slot->vars, + slot->vars.qavg)) { + case RED_DONT_MARK: + break; + + case RED_PROB_MARK: + sch->qstats.overlimits++; + if (sfq_prob_mark(q)) { + /* We know we have at least one packet in queue */ + if (sfq_headdrop(q) && + INET_ECN_set_ce(slot->skblist_next)) { + q->stats.prob_mark_head++; + break; + } + if (INET_ECN_set_ce(skb)) { + q->stats.prob_mark++; + break; + } + } + q->stats.prob_drop++; + goto congestion_drop; + + case RED_HARD_MARK: + sch->qstats.overlimits++; + if (sfq_hard_mark(q)) { + /* We know we have at least one packet in queue */ + if (sfq_headdrop(q) && + INET_ECN_set_ce(slot->skblist_next)) { + q->stats.forced_mark_head++; + break; + } + if (INET_ECN_set_ce(skb)) { + q->stats.forced_mark++; + break; + } + } + q->stats.forced_drop++; + goto congestion_drop; } } - if (++sch->q.qlen < q->limit-1) { - sch->bstats.bytes += skb->len; - sch->bstats.packets++; - return 0; - } - sfq_drop(sch); - return NET_XMIT_CN; -} + if (slot->qlen >= q->maxdepth) { +congestion_drop: + if (!sfq_headdrop(q)) + return qdisc_drop(skb, sch); -static int -sfq_requeue(struct sk_buff *skb, struct Qdisc* sch) -{ - struct sfq_sched_data *q = qdisc_priv(sch); - unsigned hash = sfq_hash(q, skb); - sfq_index x; + /* We know we have at least one packet in queue */ + head = slot_dequeue_head(slot); + delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb); + sch->qstats.backlog -= delta; + slot->backlog -= delta; + qdisc_drop(head, sch); - x = q->ht[hash]; - if (x == SFQ_DEPTH) { - q->ht[hash] = x = q->dep[SFQ_DEPTH].next; - q->hash[x] = hash; + slot_queue_add(slot, skb); + return NET_XMIT_CN; } - __skb_queue_head(&q->qs[x], skb); + +enqueue: + sch->qstats.backlog += qdisc_pkt_len(skb); + slot->backlog += qdisc_pkt_len(skb); + slot_queue_add(slot, skb); sfq_inc(q, x); - if (q->qs[x].qlen == 1) { /* The flow is new */ - if (q->tail == SFQ_DEPTH) { /* It is the first flow */ - q->tail = x; - q->next[x] = x; - q->allot[x] = q->quantum; + if (slot->qlen == 1) { /* The flow is new */ + if (q->tail == NULL) { /* It is the first flow */ + slot->next = x; } else { - q->next[x] = q->next[q->tail]; - q->next[q->tail] = x; - q->tail = x; + slot->next = q->tail->next; + q->tail->next = x; } + /* We put this flow at the end of our flow list. + * This might sound unfair for a new flow to wait after old ones, + * but we could endup servicing new flows only, and freeze old ones. + */ + q->tail = slot; + /* We could use a bigger initial quantum for new flows */ + slot->allot = q->scaled_quantum; } - if (++sch->q.qlen < q->limit - 1) { - sch->qstats.requeues++; - return 0; - } + if (++sch->q.qlen <= q->limit) + return NET_XMIT_SUCCESS; - sch->qstats.drops++; + qlen = slot->qlen; sfq_drop(sch); - return NET_XMIT_CN; + /* Return Congestion Notification only if we dropped a packet + * from this flow. 
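 * (Editorial note: if sfq_drop() trimmed a different, longer flow instead,
 * this packet was still queued, so the enqueue reports success and the
 * parent is simply told about the one lost packet below.)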
+ */ + if (qlen != slot->qlen) + return NET_XMIT_CN; + + /* As we dropped a packet, better let upper stack know this */ + qdisc_tree_decrease_qlen(sch, 1); + return NET_XMIT_SUCCESS; } - - - static struct sk_buff * -sfq_dequeue(struct Qdisc* sch) +sfq_dequeue(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; - sfq_index a, old_a; + sfq_index a, next_a; + struct sfq_slot *slot; /* No active slots */ - if (q->tail == SFQ_DEPTH) + if (q->tail == NULL) return NULL; - a = old_a = q->next[q->tail]; - - /* Grab packet */ - skb = __skb_dequeue(&q->qs[a]); +next_slot: + a = q->tail->next; + slot = &q->slots[a]; + if (slot->allot <= 0) { + q->tail = slot; + slot->allot += q->scaled_quantum; + goto next_slot; + } + skb = slot_dequeue_head(slot); sfq_dec(q, a); + qdisc_bstats_update(sch, skb); sch->q.qlen--; - + sch->qstats.backlog -= qdisc_pkt_len(skb); + slot->backlog -= qdisc_pkt_len(skb); /* Is the slot empty? */ - if (q->qs[a].qlen == 0) { - q->ht[q->hash[a]] = SFQ_DEPTH; - a = q->next[a]; - if (a == old_a) { - q->tail = SFQ_DEPTH; + if (slot->qlen == 0) { + q->ht[slot->hash] = SFQ_EMPTY_SLOT; + next_a = slot->next; + if (a == next_a) { + q->tail = NULL; /* no more active slots */ return skb; } - q->next[q->tail] = a; - q->allot[a] += q->quantum; - } else if ((q->allot[a] -= skb->len) <= 0) { - q->tail = a; - a = q->next[a]; - q->allot[a] += q->quantum; + q->tail->next = next_a; + } else { + slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb)); } return skb; } static void -sfq_reset(struct Qdisc* sch) +sfq_reset(struct Qdisc *sch) { struct sk_buff *skb; @@ -372,113 +549,375 @@ sfq_reset(struct Qdisc* sch) kfree_skb(skb); } +/* + * When q->perturbation is changed, we rehash all queued skbs + * to avoid OOO (Out Of Order) effects. + * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change + * counters. 
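 * (Editorial note: packets that cannot be re-added -- no free slot left, or
 * the target slot already at maxdepth -- are freed here, and the parent
 * qdiscs are notified via qdisc_tree_decrease_qlen().)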
+ */ +static void sfq_rehash(struct Qdisc *sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + int i; + struct sfq_slot *slot; + struct sk_buff_head list; + int dropped = 0; + + __skb_queue_head_init(&list); + + for (i = 0; i < q->maxflows; i++) { + slot = &q->slots[i]; + if (!slot->qlen) + continue; + while (slot->qlen) { + skb = slot_dequeue_head(slot); + sfq_dec(q, i); + __skb_queue_tail(&list, skb); + } + slot->backlog = 0; + red_set_vars(&slot->vars); + q->ht[slot->hash] = SFQ_EMPTY_SLOT; + } + q->tail = NULL; + + while ((skb = __skb_dequeue(&list)) != NULL) { + unsigned int hash = sfq_hash(q, skb); + sfq_index x = q->ht[hash]; + + slot = &q->slots[x]; + if (x == SFQ_EMPTY_SLOT) { + x = q->dep[0].next; /* get a free slot */ + if (x >= SFQ_MAX_FLOWS) { +drop: sch->qstats.backlog -= qdisc_pkt_len(skb); + kfree_skb(skb); + dropped++; + continue; + } + q->ht[hash] = x; + slot = &q->slots[x]; + slot->hash = hash; + } + if (slot->qlen >= q->maxdepth) + goto drop; + slot_queue_add(slot, skb); + if (q->red_parms) + slot->vars.qavg = red_calc_qavg(q->red_parms, + &slot->vars, + slot->backlog); + slot->backlog += qdisc_pkt_len(skb); + sfq_inc(q, x); + if (slot->qlen == 1) { /* The flow is new */ + if (q->tail == NULL) { /* It is the first flow */ + slot->next = x; + } else { + slot->next = q->tail->next; + q->tail->next = x; + } + q->tail = slot; + slot->allot = q->scaled_quantum; + } + } + sch->q.qlen -= dropped; + qdisc_tree_decrease_qlen(sch, dropped); +} + static void sfq_perturbation(unsigned long arg) { - struct Qdisc *sch = (struct Qdisc*)arg; + struct Qdisc *sch = (struct Qdisc *)arg; struct sfq_sched_data *q = qdisc_priv(sch); + spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); - q->perturbation = net_random()&0x1F; + spin_lock(root_lock); + q->perturbation = prandom_u32(); + if (!q->filter_list && q->tail) + sfq_rehash(sch); + spin_unlock(root_lock); - if (q->perturb_period) { - q->perturb_timer.expires = jiffies + q->perturb_period; - add_timer(&q->perturb_timer); - } + if (q->perturb_period) + mod_timer(&q->perturb_timer, jiffies + q->perturb_period); } -static int sfq_change(struct Qdisc *sch, struct rtattr *opt) +static int sfq_change(struct Qdisc *sch, struct nlattr *opt) { struct sfq_sched_data *q = qdisc_priv(sch); - struct tc_sfq_qopt *ctl = RTA_DATA(opt); + struct tc_sfq_qopt *ctl = nla_data(opt); + struct tc_sfq_qopt_v1 *ctl_v1 = NULL; + unsigned int qlen; + struct red_parms *p = NULL; - if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) + if (opt->nla_len < nla_attr_size(sizeof(*ctl))) return -EINVAL; - + if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1))) + ctl_v1 = nla_data(opt); + if (ctl->divisor && + (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) + return -EINVAL; + if (ctl_v1 && ctl_v1->qth_min) { + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + } sch_tree_lock(sch); - q->quantum = ctl->quantum ? 
: psched_mtu(sch->dev); - q->perturb_period = ctl->perturb_period*HZ; - if (ctl->limit) - q->limit = min_t(u32, ctl->limit, SFQ_DEPTH); + if (ctl->quantum) { + q->quantum = ctl->quantum; + q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); + } + q->perturb_period = ctl->perturb_period * HZ; + if (ctl->flows) + q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); + if (ctl->divisor) { + q->divisor = ctl->divisor; + q->maxflows = min_t(u32, q->maxflows, q->divisor); + } + if (ctl_v1) { + if (ctl_v1->depth) + q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); + if (p) { + swap(q->red_parms, p); + red_set_parms(q->red_parms, + ctl_v1->qth_min, ctl_v1->qth_max, + ctl_v1->Wlog, + ctl_v1->Plog, ctl_v1->Scell_log, + NULL, + ctl_v1->max_P); + } + q->flags = ctl_v1->flags; + q->headdrop = ctl_v1->headdrop; + } + if (ctl->limit) { + q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows); + q->maxflows = min_t(u32, q->maxflows, q->limit); + } - while (sch->q.qlen >= q->limit-1) + qlen = sch->q.qlen; + while (sch->q.qlen > q->limit) sfq_drop(sch); + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); del_timer(&q->perturb_timer); if (q->perturb_period) { - q->perturb_timer.expires = jiffies + q->perturb_period; - add_timer(&q->perturb_timer); + mod_timer(&q->perturb_timer, jiffies + q->perturb_period); + q->perturbation = prandom_u32(); } sch_tree_unlock(sch); + kfree(p); return 0; } -static int sfq_init(struct Qdisc *sch, struct rtattr *opt) +static void *sfq_alloc(size_t sz) +{ + void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN); + + if (!ptr) + ptr = vmalloc(sz); + return ptr; +} + +static void sfq_free(void *addr) +{ + kvfree(addr); +} + +static void sfq_destroy(struct Qdisc *sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + q->perturb_period = 0; + del_timer_sync(&q->perturb_timer); + sfq_free(q->ht); + sfq_free(q->slots); + kfree(q->red_parms); +} + +static int sfq_init(struct Qdisc *sch, struct nlattr *opt) { struct sfq_sched_data *q = qdisc_priv(sch); int i; - init_timer(&q->perturb_timer); - q->perturb_timer.data = (unsigned long)sch; q->perturb_timer.function = sfq_perturbation; + q->perturb_timer.data = (unsigned long)sch; + init_timer_deferrable(&q->perturb_timer); - for (i=0; i<SFQ_HASH_DIVISOR; i++) - q->ht[i] = SFQ_DEPTH; - for (i=0; i<SFQ_DEPTH; i++) { - skb_queue_head_init(&q->qs[i]); - q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH; - q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH; + for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) { + q->dep[i].next = i + SFQ_MAX_FLOWS; + q->dep[i].prev = i + SFQ_MAX_FLOWS; } - q->limit = SFQ_DEPTH; - q->max_depth = 0; - q->tail = SFQ_DEPTH; - if (opt == NULL) { - q->quantum = psched_mtu(sch->dev); - q->perturb_period = 0; - } else { + + q->limit = SFQ_MAX_DEPTH; + q->maxdepth = SFQ_MAX_DEPTH; + q->cur_depth = 0; + q->tail = NULL; + q->divisor = SFQ_DEFAULT_HASH_DIVISOR; + q->maxflows = SFQ_DEFAULT_FLOWS; + q->quantum = psched_mtu(qdisc_dev(sch)); + q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); + q->perturb_period = 0; + q->perturbation = prandom_u32(); + + if (opt) { int err = sfq_change(sch, opt); if (err) return err; } - for (i=0; i<SFQ_DEPTH; i++) + + q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor); + q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows); + if (!q->ht || !q->slots) { + sfq_destroy(sch); + return -ENOMEM; + } + for (i = 0; i < q->divisor; i++) + q->ht[i] = SFQ_EMPTY_SLOT; + + for (i = 0; i < q->maxflows; i++) { + slot_queue_init(&q->slots[i]); sfq_link(q, i); + } + if (q->limit >= 1) + sch->flags 
|= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } -static void sfq_destroy(struct Qdisc *sch) +static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct sfq_sched_data *q = qdisc_priv(sch); - del_timer(&q->perturb_timer); + unsigned char *b = skb_tail_pointer(skb); + struct tc_sfq_qopt_v1 opt; + struct red_parms *p = q->red_parms; + + memset(&opt, 0, sizeof(opt)); + opt.v0.quantum = q->quantum; + opt.v0.perturb_period = q->perturb_period / HZ; + opt.v0.limit = q->limit; + opt.v0.divisor = q->divisor; + opt.v0.flows = q->maxflows; + opt.depth = q->maxdepth; + opt.headdrop = q->headdrop; + + if (p) { + opt.qth_min = p->qth_min >> p->Wlog; + opt.qth_max = p->qth_max >> p->Wlog; + opt.Wlog = p->Wlog; + opt.Plog = p->Plog; + opt.Scell_log = p->Scell_log; + opt.max_P = p->max_P; + } + memcpy(&opt.stats, &q->stats, sizeof(opt.stats)); + opt.flags = q->flags; + + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; } -static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) +static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long sfq_get(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + /* we cannot bypass queue discipline anymore */ + sch->flags &= ~TCQ_F_CAN_BYPASS; + return 0; +} + +static void sfq_put(struct Qdisc *q, unsigned long cl) +{ +} + +static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl) { struct sfq_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb->tail; - struct tc_sfq_qopt opt; - opt.quantum = q->quantum; - opt.perturb_period = q->perturb_period/HZ; + if (cl) + return NULL; + return &q->filter_list; +} - opt.limit = q->limit; - opt.divisor = SFQ_HASH_DIVISOR; - opt.flows = q->limit; +static int sfq_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} - RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); +static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + sfq_index idx = q->ht[cl - 1]; + struct gnet_stats_queue qs = { 0 }; + struct tc_sfq_xstats xstats = { 0 }; - return skb->len; + if (idx != SFQ_EMPTY_SLOT) { + const struct sfq_slot *slot = &q->slots[idx]; -rtattr_failure: - skb_trim(skb, b - skb->data); - return -1; + xstats.allot = slot->allot << SFQ_ALLOT_SHIFT; + qs.qlen = slot->qlen; + qs.backlog = slot->backlog; + } + if (gnet_stats_copy_queue(d, &qs) < 0) + return -1; + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); } -static struct Qdisc_ops sfq_qdisc_ops = { - .next = NULL, - .cl_ops = NULL, +static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < q->divisor; i++) { + if (q->ht[i] == SFQ_EMPTY_SLOT || + arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops sfq_class_ops = { + .leaf = sfq_leaf, + .get = sfq_get, + .put = sfq_put, + .tcf_chain = sfq_find_tcf, + .bind_tcf = sfq_bind, + .unbind_tcf = sfq_put, + .dump = sfq_dump_class, + .dump_stats = sfq_dump_class_stats, + .walk = sfq_walk, +}; + +static struct Qdisc_ops 
sfq_qdisc_ops __read_mostly = { + .cl_ops = &sfq_class_ops, .id = "sfq", .priv_size = sizeof(struct sfq_sched_data), .enqueue = sfq_enqueue, .dequeue = sfq_dequeue, - .requeue = sfq_requeue, + .peek = qdisc_peek_dequeued, .drop = sfq_drop, .init = sfq_init, .reset = sfq_reset, @@ -492,7 +931,7 @@ static int __init sfq_module_init(void) { return register_qdisc(&sfq_qdisc_ops); } -static void __exit sfq_module_exit(void) +static void __exit sfq_module_exit(void) { unregister_qdisc(&sfq_qdisc_ops); } diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index cb9711ea8c6..18ff6343370 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -12,30 +12,14 @@ * */ -#include <linux/config.h> #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/jiffies.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> +#include <net/netlink.h> +#include <net/sch_generic.h> #include <net/pkt_sched.h> @@ -114,129 +98,177 @@ changed the limit is not effective anymore. */ -struct tbf_sched_data -{ +struct tbf_sched_data { /* Parameters */ u32 limit; /* Maximal length of backlog: bytes */ - u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ - u32 mtu; u32 max_size; - struct qdisc_rate_table *R_tab; - struct qdisc_rate_table *P_tab; + s64 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ + s64 mtu; + struct psched_ratecfg rate; + struct psched_ratecfg peak; /* Variables */ - long tokens; /* Current number of B tokens */ - long ptokens; /* Current number of P tokens */ - psched_time_t t_c; /* Time check-point */ - struct timer_list wd_timer; /* Watchdog timer */ + s64 tokens; /* Current number of B tokens */ + s64 ptokens; /* Current number of P tokens */ + s64 t_c; /* Time check-point */ struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */ + struct qdisc_watchdog watchdog; /* Watchdog timer */ }; -#define L2T(q,L) ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log]) -#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log]) -static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) +/* Time to Length, convert time in ns to length in bytes + * to determinate how many bytes can be sent in given time. 
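 * (Editorial note: len = time_in_ns * rate_bytes_ps / NSEC_PER_SEC; for ATM
 * link layers the result is further scaled by 48/53, since only 48 of every
 * 53 bytes in a cell carry payload, and the configured overhead is then
 * subtracted.)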
+ */ +static u64 psched_ns_t2l(const struct psched_ratecfg *r, + u64 time_in_ns) { - struct tbf_sched_data *q = qdisc_priv(sch); - int ret; + /* The formula is : + * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC + */ + u64 len = time_in_ns * r->rate_bytes_ps; - if (skb->len > q->max_size) { - sch->qstats.drops++; -#ifdef CONFIG_NET_CLS_POLICE - if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch)) -#endif - kfree_skb(skb); + do_div(len, NSEC_PER_SEC); - return NET_XMIT_DROP; + if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) { + do_div(len, 53); + len = len * 48; } - if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) { - sch->qstats.drops++; - return ret; - } + if (len > r->overhead) + len -= r->overhead; + else + len = 0; - sch->q.qlen++; - sch->bstats.bytes += skb->len; - sch->bstats.packets++; - return 0; + return len; } -static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch) +/* + * Return length of individual segments of a gso packet, + * including all headers (MAC, IP, TCP/UDP) + */ +static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + return hdr_len + skb_gso_transport_seglen(skb); +} + +/* GSO packet is too big, segment it so that tbf can transmit + * each segment in time + */ +static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + struct sk_buff *segs, *nskb; + netdev_features_t features = netif_skb_features(skb); + int ret, nb; + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + + if (IS_ERR_OR_NULL(segs)) + return qdisc_reshape_fail(skb, sch); + + nb = 0; + while (segs) { + nskb = segs->next; + segs->next = NULL; + qdisc_skb_cb(segs)->pkt_len = segs->len; + ret = qdisc_enqueue(segs, q->qdisc); + if (ret != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + } else { + nb++; + } + segs = nskb; + } + sch->q.qlen += nb; + if (nb > 1) + qdisc_tree_decrease_qlen(sch, 1 - nb); + consume_skb(skb); + return nb > 0 ? 
NET_XMIT_SUCCESS : NET_XMIT_DROP; +} + +static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); int ret; - if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) { - sch->q.qlen++; - sch->qstats.requeues++; + if (qdisc_pkt_len(skb) > q->max_size) { + if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size) + return tbf_segment(skb, sch); + return qdisc_reshape_fail(skb, sch); + } + ret = qdisc_enqueue(skb, q->qdisc); + if (ret != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + return ret; } - return ret; + sch->q.qlen++; + return NET_XMIT_SUCCESS; } -static unsigned int tbf_drop(struct Qdisc* sch) +static unsigned int tbf_drop(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); - unsigned int len; + unsigned int len = 0; - if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) { + if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) { sch->q.qlen--; sch->qstats.drops++; } return len; } -static void tbf_watchdog(unsigned long arg) +static bool tbf_peak_present(const struct tbf_sched_data *q) { - struct Qdisc *sch = (struct Qdisc*)arg; - - sch->flags &= ~TCQ_F_THROTTLED; - netif_schedule(sch->dev); + return q->peak.rate_bytes_ps; } -static struct sk_buff *tbf_dequeue(struct Qdisc* sch) +static struct sk_buff *tbf_dequeue(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; - skb = q->qdisc->dequeue(q->qdisc); + skb = q->qdisc->ops->peek(q->qdisc); if (skb) { - psched_time_t now; - long toks, delay; - long ptoks = 0; - unsigned int len = skb->len; - - PSCHED_GET_TIME(now); + s64 now; + s64 toks; + s64 ptoks = 0; + unsigned int len = qdisc_pkt_len(skb); - toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer); + now = ktime_to_ns(ktime_get()); + toks = min_t(s64, now - q->t_c, q->buffer); - if (q->P_tab) { + if (tbf_peak_present(q)) { ptoks = toks + q->ptokens; - if (ptoks > (long)q->mtu) + if (ptoks > q->mtu) ptoks = q->mtu; - ptoks -= L2T_P(q, len); + ptoks -= (s64) psched_l2t_ns(&q->peak, len); } toks += q->tokens; - if (toks > (long)q->buffer) + if (toks > q->buffer) toks = q->buffer; - toks -= L2T(q, len); + toks -= (s64) psched_l2t_ns(&q->rate, len); if ((toks|ptoks) >= 0) { + skb = qdisc_dequeue_peeked(q->qdisc); + if (unlikely(!skb)) + return NULL; + q->t_c = now; q->tokens = toks; q->ptokens = ptoks; sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(sch); + qdisc_bstats_update(sch, skb); return skb; } - delay = PSCHED_US2JIFFIE(max_t(long, -toks, -ptoks)); - - if (delay == 0) - delay = 1; - - mod_timer(&q->wd_timer, jiffies+delay); + qdisc_watchdog_schedule_ns(&q->watchdog, + now + max_t(long, -toks, -ptoks)); /* Maybe we have a shorter packet in the queue, which can be sent now. It sounds cool, @@ -249,135 +281,158 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch) (cf. 
CSZ, HPFQ, HFSC) */ - if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) { - /* When requeue fails skb is dropped */ - sch->q.qlen--; - sch->qstats.drops++; - } - - sch->flags |= TCQ_F_THROTTLED; sch->qstats.overlimits++; } return NULL; } -static void tbf_reset(struct Qdisc* sch) +static void tbf_reset(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); qdisc_reset(q->qdisc); sch->q.qlen = 0; - PSCHED_GET_TIME(q->t_c); + q->t_c = ktime_to_ns(ktime_get()); q->tokens = q->buffer; q->ptokens = q->mtu; - sch->flags &= ~TCQ_F_THROTTLED; - del_timer(&q->wd_timer); + qdisc_watchdog_cancel(&q->watchdog); } -static struct Qdisc *tbf_create_dflt_qdisc(struct net_device *dev, u32 limit) -{ - struct Qdisc *q = qdisc_create_dflt(dev, &bfifo_qdisc_ops); - struct rtattr *rta; - int ret; - - if (q) { - rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); - if (rta) { - rta->rta_type = RTM_NEWQDISC; - rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt)); - ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; - - ret = q->ops->change(q, rta); - kfree(rta); - - if (ret == 0) - return q; - } - qdisc_destroy(q); - } - - return NULL; -} +static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = { + [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) }, + [TCA_TBF_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_TBF_RATE64] = { .type = NLA_U64 }, + [TCA_TBF_PRATE64] = { .type = NLA_U64 }, + [TCA_TBF_BURST] = { .type = NLA_U32 }, + [TCA_TBF_PBURST] = { .type = NLA_U32 }, +}; -static int tbf_change(struct Qdisc* sch, struct rtattr *opt) +static int tbf_change(struct Qdisc *sch, struct nlattr *opt) { - int err = -EINVAL; + int err; struct tbf_sched_data *q = qdisc_priv(sch); - struct rtattr *tb[TCA_TBF_PTAB]; + struct nlattr *tb[TCA_TBF_MAX + 1]; struct tc_tbf_qopt *qopt; - struct qdisc_rate_table *rtab = NULL; - struct qdisc_rate_table *ptab = NULL; struct Qdisc *child = NULL; - int max_size,n; - - if (rtattr_parse_nested(tb, TCA_TBF_PTAB, opt) || - tb[TCA_TBF_PARMS-1] == NULL || - RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt)) + struct psched_ratecfg rate; + struct psched_ratecfg peak; + u64 max_size; + s64 buffer, mtu; + u64 rate64 = 0, prate64 = 0; + + err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy); + if (err < 0) + return err; + + err = -EINVAL; + if (tb[TCA_TBF_PARMS] == NULL) goto done; - qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]); - rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]); - if (rtab == NULL) - goto done; + qopt = nla_data(tb[TCA_TBF_PARMS]); + if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&qopt->rate, + tb[TCA_TBF_RTAB])); + + if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate, + tb[TCA_TBF_PTAB])); + + buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U); + mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U); + + if (tb[TCA_TBF_RATE64]) + rate64 = nla_get_u64(tb[TCA_TBF_RATE64]); + psched_ratecfg_precompute(&rate, &qopt->rate, rate64); + + if (tb[TCA_TBF_BURST]) { + max_size = nla_get_u32(tb[TCA_TBF_BURST]); + buffer = psched_l2t_ns(&rate, max_size); + } else { + max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U); + } if (qopt->peakrate.rate) { - if (qopt->peakrate.rate > qopt->rate.rate) - ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]); - if (ptab == NULL) + if (tb[TCA_TBF_PRATE64]) + prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]); + 
psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64); + if (peak.rate_bytes_ps <= rate.rate_bytes_ps) { + pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n", + peak.rate_bytes_ps, rate.rate_bytes_ps); + err = -EINVAL; goto done; + } + + if (tb[TCA_TBF_PBURST]) { + u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]); + max_size = min_t(u32, max_size, pburst); + mtu = psched_l2t_ns(&peak, pburst); + } else { + max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu)); + } + } else { + memset(&peak, 0, sizeof(peak)); } - for (n = 0; n < 256; n++) - if (rtab->data[n] > qopt->buffer) break; - max_size = (n << qopt->rate.cell_log)-1; - if (ptab) { - int size; + if (max_size < psched_mtu(qdisc_dev(sch))) + pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n", + max_size, qdisc_dev(sch)->name, + psched_mtu(qdisc_dev(sch))); - for (n = 0; n < 256; n++) - if (ptab->data[n] > qopt->mtu) break; - size = (n << qopt->peakrate.cell_log)-1; - if (size < max_size) max_size = size; - } - if (max_size < 0) + if (!max_size) { + err = -EINVAL; goto done; + } - if (q->qdisc == &noop_qdisc) { - if ((child = tbf_create_dflt_qdisc(sch->dev, qopt->limit)) == NULL) + if (q->qdisc != &noop_qdisc) { + err = fifo_set_limit(q->qdisc, qopt->limit); + if (err) + goto done; + } else if (qopt->limit > 0) { + child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit); + if (IS_ERR(child)) { + err = PTR_ERR(child); goto done; + } } sch_tree_lock(sch); - if (child) q->qdisc = child; + if (child) { + qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_destroy(q->qdisc); + q->qdisc = child; + } q->limit = qopt->limit; - q->mtu = qopt->mtu; + if (tb[TCA_TBF_PBURST]) + q->mtu = mtu; + else + q->mtu = PSCHED_TICKS2NS(qopt->mtu); q->max_size = max_size; - q->buffer = qopt->buffer; + if (tb[TCA_TBF_BURST]) + q->buffer = buffer; + else + q->buffer = PSCHED_TICKS2NS(qopt->buffer); q->tokens = q->buffer; q->ptokens = q->mtu; - rtab = xchg(&q->R_tab, rtab); - ptab = xchg(&q->P_tab, ptab); + + memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg)); + memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg)); + sch_tree_unlock(sch); err = 0; done: - if (rtab) - qdisc_put_rtab(rtab); - if (ptab) - qdisc_put_rtab(ptab); return err; } -static int tbf_init(struct Qdisc* sch, struct rtattr *opt) +static int tbf_init(struct Qdisc *sch, struct nlattr *opt) { struct tbf_sched_data *q = qdisc_priv(sch); if (opt == NULL) return -EINVAL; - PSCHED_GET_TIME(q->t_c); - init_timer(&q->wd_timer); - q->wd_timer.function = tbf_watchdog; - q->wd_timer.data = (unsigned long)sch; - + q->t_c = ktime_to_ns(ktime_get()); + qdisc_watchdog_init(&q->watchdog, sch); q->qdisc = &noop_qdisc; return tbf_change(sch, opt); @@ -387,41 +442,43 @@ static void tbf_destroy(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); - del_timer(&q->wd_timer); - - if (q->P_tab) - qdisc_put_rtab(q->P_tab); - if (q->R_tab) - qdisc_put_rtab(q->R_tab); - + qdisc_watchdog_cancel(&q->watchdog); qdisc_destroy(q->qdisc); } static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tbf_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb->tail; - struct rtattr *rta; + struct nlattr *nest; struct tc_tbf_qopt opt; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + sch->qstats.backlog = q->qdisc->qstats.backlog; + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; opt.limit = q->limit; - opt.rate = q->R_tab->rate; - if (q->P_tab) - opt.peakrate = 
q->P_tab->rate; + psched_ratecfg_getrate(&opt.rate, &q->rate); + if (tbf_peak_present(q)) + psched_ratecfg_getrate(&opt.peakrate, &q->peak); else memset(&opt.peakrate, 0, sizeof(opt.peakrate)); - opt.mtu = q->mtu; - opt.buffer = q->buffer; - RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt); - rta->rta_len = skb->tail - b; - - return skb->len; - -rtattr_failure: - skb_trim(skb, b - skb->data); + opt.mtu = PSCHED_NS2TICKS(q->mtu); + opt.buffer = PSCHED_NS2TICKS(q->buffer); + if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + if (q->rate.rate_bytes_ps >= (1ULL << 32) && + nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps)) + goto nla_put_failure; + if (tbf_peak_present(q) && + q->peak.rate_bytes_ps >= (1ULL << 32) && + nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps)) + goto nla_put_failure; + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } @@ -430,9 +487,6 @@ static int tbf_dump_class(struct Qdisc *sch, unsigned long cl, { struct tbf_sched_data *q = qdisc_priv(sch); - if (cl != 1) /* only one class */ - return -ENOENT; - tcm->tcm_handle |= TC_H_MIN(1); tcm->tcm_info = q->qdisc->handle; @@ -448,9 +502,10 @@ static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, new = &noop_qdisc; sch_tree_lock(sch); - *old = xchg(&q->qdisc, new); + *old = q->qdisc; + q->qdisc = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); qdisc_reset(*old); - sch->q.qlen = 0; sch_tree_unlock(sch); return 0; @@ -471,17 +526,6 @@ static void tbf_put(struct Qdisc *sch, unsigned long arg) { } -static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - struct rtattr **tca, unsigned long *arg) -{ - return -ENOSYS; -} - -static int tbf_delete(struct Qdisc *sch, unsigned long arg) -{ - return -ENOSYS; -} - static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { @@ -494,32 +538,23 @@ static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker) } } -static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl) -{ - return NULL; -} - -static struct Qdisc_class_ops tbf_class_ops = -{ +static const struct Qdisc_class_ops tbf_class_ops = { .graft = tbf_graft, .leaf = tbf_leaf, .get = tbf_get, .put = tbf_put, - .change = tbf_change_class, - .delete = tbf_delete, .walk = tbf_walk, - .tcf_chain = tbf_find_tcf, .dump = tbf_dump_class, }; -static struct Qdisc_ops tbf_qdisc_ops = { +static struct Qdisc_ops tbf_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &tbf_class_ops, .id = "tbf", .priv_size = sizeof(struct tbf_sched_data), .enqueue = tbf_enqueue, .dequeue = tbf_dequeue, - .requeue = tbf_requeue, + .peek = qdisc_peek_dequeued, .drop = tbf_drop, .init = tbf_init, .reset = tbf_reset, diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 79b8ef34c6e..47416716294 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -9,31 +9,18 @@ */ #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> +#include <linux/slab.h> #include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> -#include <linux/interrupt.h> #include <linux/if_arp.h> -#include <linux/if_ether.h> -#include <linux/inet.h> #include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> #include <linux/init.h> -#include <net/ip.h> 
-#include <net/route.h> #include <linux/skbuff.h> #include <linux/moduleparam.h> -#include <net/sock.h> +#include <net/dst.h> +#include <net/neighbour.h> #include <net/pkt_sched.h> /* @@ -66,77 +53,73 @@ which will not break load balancing, though native slave traffic will have the highest priority. */ -struct teql_master -{ +struct teql_master { struct Qdisc_ops qops; struct net_device *dev; struct Qdisc *slaves; struct list_head master_list; - struct net_device_stats stats; + unsigned long tx_bytes; + unsigned long tx_packets; + unsigned long tx_errors; + unsigned long tx_dropped; }; -struct teql_sched_data -{ +struct teql_sched_data { struct Qdisc *next; struct teql_master *m; - struct neighbour *ncache; struct sk_buff_head q; }; -#define NEXT_SLAVE(q) (((struct teql_sched_data*)qdisc_priv(q))->next) +#define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next) -#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT|IFF_BROADCAST) +#define FMASK (IFF_BROADCAST | IFF_POINTOPOINT) /* "teql*" qdisc routines */ static int -teql_enqueue(struct sk_buff *skb, struct Qdisc* sch) +teql_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct net_device *dev = sch->dev; + struct net_device *dev = qdisc_dev(sch); struct teql_sched_data *q = qdisc_priv(sch); - __skb_queue_tail(&q->q, skb); - if (q->q.qlen <= dev->tx_queue_len) { - sch->bstats.bytes += skb->len; - sch->bstats.packets++; - return 0; + if (q->q.qlen < dev->tx_queue_len) { + __skb_queue_tail(&q->q, skb); + return NET_XMIT_SUCCESS; } - __skb_unlink(skb, &q->q); - kfree_skb(skb); - sch->qstats.drops++; - return NET_XMIT_DROP; -} - -static int -teql_requeue(struct sk_buff *skb, struct Qdisc* sch) -{ - struct teql_sched_data *q = qdisc_priv(sch); - - __skb_queue_head(&q->q, skb); - sch->qstats.requeues++; - return 0; + return qdisc_drop(skb, sch); } static struct sk_buff * -teql_dequeue(struct Qdisc* sch) +teql_dequeue(struct Qdisc *sch) { struct teql_sched_data *dat = qdisc_priv(sch); + struct netdev_queue *dat_queue; struct sk_buff *skb; skb = __skb_dequeue(&dat->q); + dat_queue = netdev_get_tx_queue(dat->m->dev, 0); if (skb == NULL) { - struct net_device *m = dat->m->dev->qdisc->dev; + struct net_device *m = qdisc_dev(dat_queue->qdisc); if (m) { dat->m->slaves = sch; netif_wake_queue(m); } + } else { + qdisc_bstats_update(sch, skb); } - sch->q.qlen = dat->q.qlen + dat->m->dev->qdisc->q.qlen; + sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen; return skb; } -static __inline__ void +static struct sk_buff * +teql_peek(struct Qdisc *sch) +{ + /* teql is meant to be used as root qdisc */ + return NULL; +} + +static inline void teql_neigh_release(struct neighbour *n) { if (n) @@ -144,23 +127,23 @@ teql_neigh_release(struct neighbour *n) } static void -teql_reset(struct Qdisc* sch) +teql_reset(struct Qdisc *sch) { struct teql_sched_data *dat = qdisc_priv(sch); skb_queue_purge(&dat->q); sch->q.qlen = 0; - teql_neigh_release(xchg(&dat->ncache, NULL)); } static void -teql_destroy(struct Qdisc* sch) +teql_destroy(struct Qdisc *sch) { struct Qdisc *q, *prev; struct teql_sched_data *dat = qdisc_priv(sch); struct teql_master *master = dat->m; - if ((prev = master->slaves) != NULL) { + prev = master->slaves; + if (prev) { do { q = NEXT_SLAVE(prev); if (q == sch) { @@ -168,25 +151,30 @@ teql_destroy(struct Qdisc* sch) if (q == master->slaves) { master->slaves = NEXT_SLAVE(q); if (q == master->slaves) { + struct netdev_queue *txq; + spinlock_t *root_lock; + + txq = netdev_get_tx_queue(master->dev, 0); master->slaves = NULL; - 
spin_lock_bh(&master->dev->queue_lock); - qdisc_reset(master->dev->qdisc); - spin_unlock_bh(&master->dev->queue_lock); + + root_lock = qdisc_root_sleeping_lock(txq->qdisc); + spin_lock_bh(root_lock); + qdisc_reset(txq->qdisc); + spin_unlock_bh(root_lock); } } skb_queue_purge(&dat->q); - teql_neigh_release(xchg(&dat->ncache, NULL)); break; } - + } while ((prev = q) != master->slaves); } } -static int teql_qdisc_init(struct Qdisc *sch, struct rtattr *opt) +static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt) { - struct net_device *dev = sch->dev; - struct teql_master *m = (struct teql_master*)sch->ops; + struct net_device *dev = qdisc_dev(sch); + struct teql_master *m = (struct teql_master *)sch->ops; struct teql_sched_data *q = qdisc_priv(sch); if (dev->hard_header_len > m->dev->hard_header_len) @@ -201,10 +189,13 @@ static int teql_qdisc_init(struct Qdisc *sch, struct rtattr *opt) if (m->slaves) { if (m->dev->flags & IFF_UP) { - if ((m->dev->flags&IFF_POINTOPOINT && !(dev->flags&IFF_POINTOPOINT)) - || (m->dev->flags&IFF_BROADCAST && !(dev->flags&IFF_BROADCAST)) - || (m->dev->flags&IFF_MULTICAST && !(dev->flags&IFF_MULTICAST)) - || dev->mtu < m->dev->mtu) + if ((m->dev->flags & IFF_POINTOPOINT && + !(dev->flags & IFF_POINTOPOINT)) || + (m->dev->flags & IFF_BROADCAST && + !(dev->flags & IFF_BROADCAST)) || + (m->dev->flags & IFF_MULTICAST && + !(dev->flags & IFF_MULTICAST)) || + dev->mtu < m->dev->mtu) return -EINVAL; } else { if (!(dev->flags&IFF_POINTOPOINT)) @@ -227,58 +218,74 @@ static int teql_qdisc_init(struct Qdisc *sch, struct rtattr *opt) return 0; } -/* "teql*" netdevice routines */ static int -__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev) +__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, + struct net_device *dev, struct netdev_queue *txq, + struct dst_entry *dst) { - struct teql_sched_data *q = qdisc_priv(dev->qdisc); - struct neighbour *mn = skb->dst->neighbour; - struct neighbour *n = q->ncache; + struct neighbour *n; + int err = 0; - if (mn->tbl == NULL) - return -EINVAL; - if (n && n->tbl == mn->tbl && - memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) { - atomic_inc(&n->refcnt); - } else { - n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev); - if (IS_ERR(n)) - return PTR_ERR(n); + n = dst_neigh_lookup_skb(dst, skb); + if (!n) + return -ENOENT; + + if (dst->dev != dev) { + struct neighbour *mn; + + mn = __neigh_lookup_errno(n->tbl, n->primary_key, dev); + neigh_release(n); + if (IS_ERR(mn)) + return PTR_ERR(mn); + n = mn; } + if (neigh_event_send(n, skb_res) == 0) { int err; - read_lock(&n->lock); - err = dev->hard_header(skb, dev, ntohs(skb->protocol), n->ha, NULL, skb->len); - read_unlock(&n->lock); - if (err < 0) { - neigh_release(n); - return -EINVAL; - } - teql_neigh_release(xchg(&q->ncache, n)); - return 0; + char haddr[MAX_ADDR_LEN]; + + neigh_ha_snapshot(haddr, n, dev); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr, + NULL, skb->len); + + if (err < 0) + err = -EINVAL; + } else { + err = (skb_res == NULL) ? -EAGAIN : 1; } neigh_release(n); - return (skb_res == NULL) ? 
-EAGAIN : 1; + return err; } -static __inline__ int -teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev) +static inline int teql_resolve(struct sk_buff *skb, + struct sk_buff *skb_res, + struct net_device *dev, + struct netdev_queue *txq) { - if (dev->hard_header == NULL || - skb->dst == NULL || - skb->dst->neighbour == NULL) + struct dst_entry *dst = skb_dst(skb); + int res; + + if (txq->qdisc == &noop_qdisc) + return -ENODEV; + + if (!dev->header_ops || !dst) return 0; - return __teql_resolve(skb, skb_res, dev); + + rcu_read_lock(); + res = __teql_resolve(skb, skb_res, dev, txq, dst); + rcu_read_unlock(); + + return res; } -static int teql_master_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev) { struct teql_master *master = netdev_priv(dev); struct Qdisc *start, *q; int busy; int nores; - int len = skb->len; + int subq = skb_get_queue_mapping(skb); struct sk_buff *skb_res = NULL; start = master->slaves; @@ -287,47 +294,51 @@ restart: nores = 0; busy = 0; - if ((q = start) == NULL) + q = start; + if (!q) goto drop; do { - struct net_device *slave = q->dev; - - if (slave->qdisc_sleeping != q) + struct net_device *slave = qdisc_dev(q); + struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0); + const struct net_device_ops *slave_ops = slave->netdev_ops; + + if (slave_txq->qdisc_sleeping != q) continue; - if (netif_queue_stopped(slave) || ! netif_running(slave)) { + if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) || + !netif_running(slave)) { busy = 1; continue; } - switch (teql_resolve(skb, skb_res, slave)) { + switch (teql_resolve(skb, skb_res, slave, slave_txq)) { case 0: - if (spin_trylock(&slave->xmit_lock)) { - slave->xmit_lock_owner = smp_processor_id(); - if (!netif_queue_stopped(slave) && - slave->hard_start_xmit(skb, slave) == 0) { - slave->xmit_lock_owner = -1; - spin_unlock(&slave->xmit_lock); + if (__netif_tx_trylock(slave_txq)) { + unsigned int length = qdisc_pkt_len(skb); + + if (!netif_xmit_frozen_or_stopped(slave_txq) && + slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) { + txq_trans_update(slave_txq); + __netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); netif_wake_queue(dev); - master->stats.tx_packets++; - master->stats.tx_bytes += len; - return 0; + master->tx_packets++; + master->tx_bytes += length; + return NETDEV_TX_OK; } - slave->xmit_lock_owner = -1; - spin_unlock(&slave->xmit_lock); + __netif_tx_unlock(slave_txq); } - if (netif_queue_stopped(dev)) + if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0))) busy = 1; break; case 1: master->slaves = NEXT_SLAVE(q); - return 0; + return NETDEV_TX_OK; default: nores = 1; break; } - __skb_pull(skb, skb->nh.raw - skb->data); + __skb_pull(skb, skb_network_offset(skb)); } while ((q = NEXT_SLAVE(q)) != start); if (nores && skb_res == NULL) { @@ -337,22 +348,22 @@ restart: if (busy) { netif_stop_queue(dev); - return 1; + return NETDEV_TX_BUSY; } - master->stats.tx_errors++; + master->tx_errors++; drop: - master->stats.tx_dropped++; + master->tx_dropped++; dev_kfree_skb(skb); - return 0; + return NETDEV_TX_OK; } static int teql_master_open(struct net_device *dev) { - struct Qdisc * q; + struct Qdisc *q; struct teql_master *m = netdev_priv(dev); int mtu = 0xFFFE; - unsigned flags = IFF_NOARP|IFF_MULTICAST; + unsigned int flags = IFF_NOARP | IFF_MULTICAST; if (m->slaves == NULL) return -EUNATCH; @@ -361,7 +372,7 @@ static int teql_master_open(struct net_device *dev) q = m->slaves; do { - 
struct net_device *slave = q->dev; + struct net_device *slave = qdisc_dev(q); if (slave == NULL) return -EUNATCH; @@ -395,10 +406,16 @@ static int teql_master_close(struct net_device *dev) return 0; } -static struct net_device_stats *teql_master_stats(struct net_device *dev) +static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) { struct teql_master *m = netdev_priv(dev); - return &m->stats; + + stats->tx_packets = m->tx_packets; + stats->tx_bytes = m->tx_bytes; + stats->tx_errors = m->tx_errors; + stats->tx_dropped = m->tx_dropped; + return stats; } static int teql_master_mtu(struct net_device *dev, int new_mtu) @@ -412,15 +429,23 @@ static int teql_master_mtu(struct net_device *dev, int new_mtu) q = m->slaves; if (q) { do { - if (new_mtu > q->dev->mtu) + if (new_mtu > qdisc_dev(q)->mtu) return -EINVAL; - } while ((q=NEXT_SLAVE(q)) != m->slaves); + } while ((q = NEXT_SLAVE(q)) != m->slaves); } dev->mtu = new_mtu; return 0; } +static const struct net_device_ops teql_netdev_ops = { + .ndo_open = teql_master_open, + .ndo_stop = teql_master_close, + .ndo_start_xmit = teql_master_xmit, + .ndo_get_stats64 = teql_master_stats64, + .ndo_change_mtu = teql_master_mtu, +}; + static __init void teql_master_setup(struct net_device *dev) { struct teql_master *master = netdev_priv(dev); @@ -428,26 +453,22 @@ static __init void teql_master_setup(struct net_device *dev) master->dev = dev; ops->priv_size = sizeof(struct teql_sched_data); - + ops->enqueue = teql_enqueue; ops->dequeue = teql_dequeue; - ops->requeue = teql_requeue; + ops->peek = teql_peek; ops->init = teql_qdisc_init; ops->reset = teql_reset; ops->destroy = teql_destroy; ops->owner = THIS_MODULE; - dev->open = teql_master_open; - dev->hard_start_xmit = teql_master_xmit; - dev->stop = teql_master_close; - dev->get_stats = teql_master_stats; - dev->change_mtu = teql_master_mtu; + dev->netdev_ops = &teql_netdev_ops; dev->type = ARPHRD_VOID; dev->mtu = 1500; dev->tx_queue_len = 100; dev->flags = IFF_NOARP; dev->hard_header_len = LL_MAX_HEADER; - SET_MODULE_OWNER(dev); + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } static LIST_HEAD(master_dev_list); @@ -492,7 +513,7 @@ static int __init teql_init(void) return i ? 0 : err; } -static void __exit teql_exit(void) +static void __exit teql_exit(void) { struct teql_master *master, *nxt; |
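
For reference, the token-bucket arithmetic behind the nanosecond-based tbf_dequeue() in the sch_tbf.c hunks above can be modelled outside the kernel. The sketch below is a minimal userspace approximation under stated assumptions: the tbf_model structure and the l2t_ns() and tbf_model_send() helpers are illustrative names, not kernel API, and l2t_ns() does a plain division per packet, whereas the kernel's psched_l2t_ns() works from a multiplier/shift pair precomputed by psched_ratecfg_precompute().

#include <stdint.h>

struct tbf_model {
	int64_t buffer;     /* cap on rate tokens, in ns */
	int64_t mtu;        /* cap on peak tokens, in ns */
	int64_t tokens;     /* current rate tokens, in ns */
	int64_t ptokens;    /* current peak tokens, in ns */
	int64_t t_c;        /* time of the last update, in ns */
	uint64_t rate_bps;  /* bytes per second, must be non-zero */
	uint64_t peak_bps;  /* bytes per second, 0 means "no peak rate" */
};

/* Time needed to send len bytes at bps bytes/sec, in ns (bps != 0). */
static int64_t l2t_ns(uint64_t bps, unsigned int len)
{
	return (int64_t)((uint64_t)len * 1000000000ULL / bps);
}

/*
 * Return 0 and charge the buckets if a len-byte packet may go out at
 * time now, or the number of ns to wait before retrying, mirroring the
 * (toks | ptoks) >= 0 test and the watchdog arming in tbf_dequeue().
 */
static int64_t tbf_model_send(struct tbf_model *q, unsigned int len, int64_t now)
{
	int64_t toks = now - q->t_c;
	int64_t ptoks = 0;

	/* elapsed time earns rate tokens, clamped to the burst buffer */
	if (toks > q->buffer)
		toks = q->buffer;

	if (q->peak_bps) {
		ptoks = toks + q->ptokens;
		if (ptoks > q->mtu)
			ptoks = q->mtu;
		ptoks -= l2t_ns(q->peak_bps, len);
	}

	toks += q->tokens;
	if (toks > q->buffer)
		toks = q->buffer;
	toks -= l2t_ns(q->rate_bps, len);

	if ((toks | ptoks) >= 0) {   /* both buckets non-negative: send now */
		q->t_c = now;
		q->tokens = toks;
		q->ptokens = ptoks;
		return 0;
	}
	/* wait until the more depleted bucket refills */
	return (-toks > -ptoks) ? -toks : -ptoks;
}

As in tbf_reset() above, a fresh instance would start with tokens = buffer, ptokens = mtu and t_c set to the current time; a positive return value corresponds to the delay passed to qdisc_watchdog_schedule_ns().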

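Similarly, the slave selection in teql_master_xmit() above reduces to a round-robin walk over a circular ring of slave devices, resuming after the slave that carried the previous packet. The sketch below models only that walk; the slave and master structures and the try_xmit() callback are illustrative stand-ins, and the neighbour resolution, tx locking and busy/requeue handling of the real function are omitted.

#include <stdbool.h>
#include <stddef.h>

struct slave {
	struct slave *next;   /* circular singly-linked ring of slaves */
	/* caller-supplied transmit attempt; returns true on success */
	bool (*try_xmit)(struct slave *s, const void *pkt, size_t len);
};

struct master {
	struct slave *slaves; /* current position in the ring */
};

/* Returns true if some slave accepted the packet, false if all refused. */
static bool master_xmit(struct master *m, const void *pkt, size_t len)
{
	struct slave *start = m->slaves;
	struct slave *q = start;

	if (!q)
		return false;

	do {
		if (q->try_xmit(q, pkt, len)) {
			/* like master->slaves = NEXT_SLAVE(q) in the kernel code */
			m->slaves = q->next;
			return true;
		}
	} while ((q = q->next) != start);

	return false;
}

Advancing master->slaves to the successor of the last successful slave is what spreads consecutive packets across the equalized devices.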