aboutsummaryrefslogtreecommitdiff
path: root/net/sched
diff options
context:
space:
mode:
Diffstat (limited to 'net/sched')
-rw-r--r--net/sched/Kconfig168
-rw-r--r--net/sched/Makefile14
-rw-r--r--net/sched/act_api.c723
-rw-r--r--net/sched/act_csum.c101
-rw-r--r--net/sched/act_gact.c86
-rw-r--r--net/sched/act_ipt.c145
-rw-r--r--net/sched/act_mirred.c104
-rw-r--r--net/sched/act_nat.c59
-rw-r--r--net/sched/act_pedit.c80
-rw-r--r--net/sched/act_police.c228
-rw-r--r--net/sched/act_simple.c95
-rw-r--r--net/sched/act_skbedit.c92
-rw-r--r--net/sched/cls_api.c221
-rw-r--r--net/sched/cls_basic.c61
-rw-r--r--net/sched/cls_bpf.c382
-rw-r--r--net/sched/cls_cgroup.c129
-rw-r--r--net/sched/cls_flow.c258
-rw-r--r--net/sched/cls_fw.c135
-rw-r--r--net/sched/cls_route.c178
-rw-r--r--net/sched/cls_rsvp.h156
-rw-r--r--net/sched/cls_tcindex.c91
-rw-r--r--net/sched/cls_u32.c191
-rw-r--r--net/sched/em_canid.c240
-rw-r--r--net/sched/em_cmp.c47
-rw-r--r--net/sched/em_ipset.c136
-rw-r--r--net/sched/em_meta.c237
-rw-r--r--net/sched/em_nbyte.c3
-rw-r--r--net/sched/em_text.c3
-rw-r--r--net/sched/em_u32.c2
-rw-r--r--net/sched/ematch.c41
-rw-r--r--net/sched/sch_api.c459
-rw-r--r--net/sched/sch_atm.c46
-rw-r--r--net/sched/sch_cbq.c431
-rw-r--r--net/sched/sch_choke.c632
-rw-r--r--net/sched/sch_codel.c276
-rw-r--r--net/sched/sch_drr.c31
-rw-r--r--net/sched/sch_dsmark.c76
-rw-r--r--net/sched/sch_fifo.c60
-rw-r--r--net/sched/sch_fq.c849
-rw-r--r--net/sched/sch_fq_codel.c620
-rw-r--r--net/sched/sch_generic.c241
-rw-r--r--net/sched/sch_gred.c195
-rw-r--r--net/sched/sch_hfsc.c75
-rw-r--r--net/sched/sch_hhf.c740
-rw-r--r--net/sched/sch_htb.c608
-rw-r--r--net/sched/sch_ingress.c6
-rw-r--r--net/sched/sch_mq.c21
-rw-r--r--net/sched/sch_mqprio.c426
-rw-r--r--net/sched/sch_multiq.c23
-rw-r--r--net/sched/sch_netem.c867
-rw-r--r--net/sched/sch_pie.c566
-rw-r--r--net/sched/sch_plug.c233
-rw-r--r--net/sched/sch_prio.c42
-rw-r--r--net/sched/sch_qfq.c1582
-rw-r--r--net/sched/sch_red.c136
-rw-r--r--net/sched/sch_sfb.c725
-rw-r--r--net/sched/sch_sfq.c773
-rw-r--r--net/sched/sch_tbf.c278
-rw-r--r--net/sched/sch_teql.c144
59 files changed, 12077 insertions, 3490 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a36270a994d..a1a8e29e5fc 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -24,7 +24,7 @@ menuconfig NET_SCHED
To administer these schedulers, you'll need the user-level utilities
from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
That package also contains some documentation; for more, check out
- <http://linux-net.osdl.org/index.php/Iproute2>.
+ <http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2>.
This Quality of Service (QoS) support will enable you to use
Differentiated Services (diffserv) and Resource Reservation Protocol
@@ -126,6 +126,17 @@ config NET_SCH_RED
To compile this code as a module, choose M here: the
module will be called sch_red.
+config NET_SCH_SFB
+ tristate "Stochastic Fair Blue (SFB)"
+ ---help---
+ Say Y here if you want to use the Stochastic Fair Blue (SFB)
+ packet scheduling algorithm.
+
+ See the top of <file:net/sched/sch_sfb.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_sfb.
+
config NET_SCH_SFQ
tristate "Stochastic Fairness Queueing (SFQ)"
---help---
@@ -205,6 +216,98 @@ config NET_SCH_DRR
If unsure, say N.
+config NET_SCH_MQPRIO
+ tristate "Multi-queue priority scheduler (MQPRIO)"
+ help
+ Say Y here if you want to use the Multi-queue Priority scheduler.
+ This scheduler allows QOS to be offloaded on NICs that have support
+ for offloading QOS schedulers.
+
+ To compile this driver as a module, choose M here: the module will
+ be called sch_mqprio.
+
+ If unsure, say N.
+
+config NET_SCH_CHOKE
+ tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
+ help
+ Say Y here if you want to use the CHOKe packet scheduler (CHOose
+ and Keep for responsive flows, CHOose and Kill for unresponsive
+ flows). This is a variation of RED which trys to penalize flows
+ that monopolize the queue.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_choke.
+
+config NET_SCH_QFQ
+ tristate "Quick Fair Queueing scheduler (QFQ)"
+ help
+ Say Y here if you want to use the Quick Fair Queueing Scheduler (QFQ)
+ packet scheduling algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_qfq.
+
+ If unsure, say N.
+
+config NET_SCH_CODEL
+ tristate "Controlled Delay AQM (CODEL)"
+ help
+ Say Y here if you want to use the Controlled Delay (CODEL)
+ packet scheduling algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_codel.
+
+ If unsure, say N.
+
+config NET_SCH_FQ_CODEL
+ tristate "Fair Queue Controlled Delay AQM (FQ_CODEL)"
+ help
+ Say Y here if you want to use the FQ Controlled Delay (FQ_CODEL)
+ packet scheduling algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_fq_codel.
+
+ If unsure, say N.
+
+config NET_SCH_FQ
+ tristate "Fair Queue"
+ help
+ Say Y here if you want to use the FQ packet scheduling algorithm.
+
+ FQ does flow separation, and is able to respect pacing requirements
+ set by TCP stack into sk->sk_pacing_rate (for localy generated
+ traffic)
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_fq.
+
+ If unsure, say N.
+
+config NET_SCH_HHF
+ tristate "Heavy-Hitter Filter (HHF)"
+ help
+ Say Y here if you want to use the Heavy-Hitter Filter (HHF)
+ packet scheduling algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_hhf.
+
+config NET_SCH_PIE
+ tristate "Proportional Integral controller Enhanced (PIE) scheduler"
+ help
+ Say Y here if you want to use the Proportional Integral controller
+ Enhanced scheduler packet scheduling algorithm.
+ For more information, please see
+ http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_pie.
+
+ If unsure, say N.
+
config NET_SCH_INGRESS
tristate "Ingress Qdisc"
depends on NET_CLS_ACT
@@ -215,6 +318,32 @@ config NET_SCH_INGRESS
To compile this code as a module, choose M here: the
module will be called sch_ingress.
+config NET_SCH_PLUG
+ tristate "Plug network traffic until release (PLUG)"
+ ---help---
+
+ This queuing discipline allows userspace to plug/unplug a network
+ output queue, using the netlink interface. When it receives an
+ enqueue command it inserts a plug into the outbound queue that
+ causes following packets to enqueue until a dequeue command arrives
+ over netlink, causing the plug to be removed and resuming the normal
+ packet flow.
+
+ This module also provides a generic "network output buffering"
+ functionality (aka output commit), wherein upon arrival of a dequeue
+ command, only packets up to the first plug are released for delivery.
+ The Remus HA project uses this module to enable speculative execution
+ of virtual machines by allowing the generated network output to be rolled
+ back if needed.
+
+ For more information, please refer to http://wiki.xensource.com/xenwiki/Remus
+
+ Say Y here if you are using this kernel for Xen dom0 and
+ want to protect Xen guests with Remus.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_plug.
+
comment "Classification"
config NET_CLS
@@ -243,7 +372,8 @@ config NET_CLS_TCINDEX
config NET_CLS_ROUTE4
tristate "Routing decision (ROUTE)"
- select NET_CLS_ROUTE
+ depends on INET
+ select IP_ROUTE_CLASSID
select NET_CLS
---help---
If you say Y here, you will be able to classify packets
@@ -252,9 +382,6 @@ config NET_CLS_ROUTE4
To compile this code as a module, choose M here: the
module will be called cls_route.
-config NET_CLS_ROUTE
- bool
-
config NET_CLS_FW
tristate "Netfilter mark (FW)"
select NET_CLS
@@ -330,6 +457,7 @@ config NET_CLS_FLOW
config NET_CLS_CGROUP
tristate "Control Group Classifier"
select NET_CLS
+ select CGROUP_NET_CLASSID
depends on CGROUPS
---help---
Say Y here if you want to classify packets based on the control
@@ -338,6 +466,16 @@ config NET_CLS_CGROUP
To compile this code as a module, choose M here: the
module will be called cls_cgroup.
+config NET_CLS_BPF
+ tristate "BPF-based classifier"
+ select NET_CLS
+ ---help---
+ If you say Y here, you will be able to classify packets based on
+ programmable BPF (JIT'ed) filters as an alternative to ematches.
+
+ To compile this code as a module, choose M here: the module will
+ be called cls_bpf.
+
config NET_EMATCH
bool "Extended Matches"
select NET_CLS
@@ -416,6 +554,26 @@ config NET_EMATCH_TEXT
To compile this code as a module, choose M here: the
module will be called em_text.
+config NET_EMATCH_CANID
+ tristate "CAN Identifier"
+ depends on NET_EMATCH && (CAN=y || CAN=m)
+ ---help---
+ Say Y here if you want to be able to classify CAN frames based
+ on CAN Identifier.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_canid.
+
+config NET_EMATCH_IPSET
+ tristate "IPset"
+ depends on NET_EMATCH && IP_SET
+ ---help---
+ Say Y here if you want to be able to classify packets based on
+ ipset membership.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_ipset.
+
config NET_CLS_ACT
bool "Actions"
---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 960f5dba630..0a869a11f3e 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_NET_SCH_RED) += sch_red.o
obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
+obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o
obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
@@ -32,6 +33,16 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o
obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
+obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
+obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
+obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
+obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
+obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o
+obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o
+obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o
+obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o
+obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o
+
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
@@ -41,9 +52,12 @@ obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o
obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o
obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o
+obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o
obj-$(CONFIG_NET_EMATCH) += ematch.o
obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o
+obj-$(CONFIG_NET_EMATCH_CANID) += em_canid.o
+obj-$(CONFIG_NET_EMATCH_IPSET) += em_ipset.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 23b25f89e7e..648778aef1a 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -20,53 +20,47 @@
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/err.h>
+#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/sch_generic.h>
#include <net/act_api.h>
#include <net/netlink.h>
-static void tcf_common_free_rcu(struct rcu_head *head)
+void tcf_hash_destroy(struct tc_action *a)
{
- kfree(container_of(head, struct tcf_common, tcfc_rcu));
-}
+ struct tcf_common *p = a->priv;
+ struct tcf_hashinfo *hinfo = a->ops->hinfo;
-void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
-{
- unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
- struct tcf_common **p1p;
-
- for (p1p = &hinfo->htab[h]; *p1p; p1p = &(*p1p)->tcfc_next) {
- if (*p1p == p) {
- write_lock_bh(hinfo->lock);
- *p1p = p->tcfc_next;
- write_unlock_bh(hinfo->lock);
- gen_kill_estimator(&p->tcfc_bstats,
- &p->tcfc_rate_est);
- /*
- * gen_estimator est_timer() might access p->tcfc_lock
- * or bstats, wait a RCU grace period before freeing p
- */
- call_rcu(&p->tcfc_rcu, tcf_common_free_rcu);
- return;
- }
- }
- WARN_ON(1);
+ spin_lock_bh(&hinfo->lock);
+ hlist_del(&p->tcfc_head);
+ spin_unlock_bh(&hinfo->lock);
+ gen_kill_estimator(&p->tcfc_bstats,
+ &p->tcfc_rate_est);
+ /*
+ * gen_estimator est_timer() might access p->tcfc_lock
+ * or bstats, wait a RCU grace period before freeing p
+ */
+ kfree_rcu(p, tcfc_rcu);
}
EXPORT_SYMBOL(tcf_hash_destroy);
-int tcf_hash_release(struct tcf_common *p, int bind,
- struct tcf_hashinfo *hinfo)
+int tcf_hash_release(struct tc_action *a, int bind)
{
+ struct tcf_common *p = a->priv;
int ret = 0;
if (p) {
if (bind)
p->tcfc_bindcnt--;
+ else if (p->tcfc_bindcnt > 0)
+ return -EPERM;
p->tcfc_refcnt--;
if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) {
- tcf_hash_destroy(p, hinfo);
+ if (a->ops->cleanup)
+ a->ops->cleanup(a, bind);
+ tcf_hash_destroy(a);
ret = 1;
}
}
@@ -75,20 +69,22 @@ int tcf_hash_release(struct tcf_common *p, int bind,
EXPORT_SYMBOL(tcf_hash_release);
static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
- struct tc_action *a, struct tcf_hashinfo *hinfo)
+ struct tc_action *a)
{
+ struct tcf_hashinfo *hinfo = a->ops->hinfo;
+ struct hlist_head *head;
struct tcf_common *p;
- int err = 0, index = -1,i = 0, s_i = 0, n_i = 0;
+ int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
struct nlattr *nest;
- read_lock_bh(hinfo->lock);
+ spin_lock_bh(&hinfo->lock);
s_i = cb->args[0];
for (i = 0; i < (hinfo->hmask + 1); i++) {
- p = hinfo->htab[tcf_hash(i, hinfo->hmask)];
+ head = &hinfo->htab[tcf_hash(i, hinfo->hmask)];
- for (; p; p = p->tcfc_next) {
+ hlist_for_each_entry_rcu(p, head, tcfc_head) {
index++;
if (index < s_i)
continue;
@@ -111,7 +107,7 @@ static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
}
}
done:
- read_unlock_bh(hinfo->lock);
+ spin_unlock_bh(&hinfo->lock);
if (n_i)
cb->args[0] += n_i;
return n_i;
@@ -121,79 +117,82 @@ nla_put_failure:
goto done;
}
-static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a,
- struct tcf_hashinfo *hinfo)
+static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a)
{
- struct tcf_common *p, *s_p;
+ struct tcf_hashinfo *hinfo = a->ops->hinfo;
+ struct hlist_head *head;
+ struct hlist_node *n;
+ struct tcf_common *p;
struct nlattr *nest;
- int i= 0, n_i = 0;
+ int i = 0, n_i = 0;
+ int ret = -EINVAL;
nest = nla_nest_start(skb, a->order);
if (nest == NULL)
goto nla_put_failure;
- NLA_PUT_STRING(skb, TCA_KIND, a->ops->kind);
+ if (nla_put_string(skb, TCA_KIND, a->ops->kind))
+ goto nla_put_failure;
for (i = 0; i < (hinfo->hmask + 1); i++) {
- p = hinfo->htab[tcf_hash(i, hinfo->hmask)];
-
- while (p != NULL) {
- s_p = p->tcfc_next;
- if (ACT_P_DELETED == tcf_hash_release(p, 0, hinfo))
- module_put(a->ops->owner);
- n_i++;
- p = s_p;
+ head = &hinfo->htab[tcf_hash(i, hinfo->hmask)];
+ hlist_for_each_entry_safe(p, n, head, tcfc_head) {
+ a->priv = p;
+ ret = tcf_hash_release(a, 0);
+ if (ret == ACT_P_DELETED) {
+ module_put(a->ops->owner);
+ n_i++;
+ } else if (ret < 0)
+ goto nla_put_failure;
}
}
- NLA_PUT_U32(skb, TCA_FCNT, n_i);
+ if (nla_put_u32(skb, TCA_FCNT, n_i))
+ goto nla_put_failure;
nla_nest_end(skb, nest);
return n_i;
nla_put_failure:
nla_nest_cancel(skb, nest);
- return -EINVAL;
+ return ret;
}
-int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb,
- int type, struct tc_action *a)
+static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb,
+ int type, struct tc_action *a)
{
- struct tcf_hashinfo *hinfo = a->ops->hinfo;
-
if (type == RTM_DELACTION) {
- return tcf_del_walker(skb, a, hinfo);
+ return tcf_del_walker(skb, a);
} else if (type == RTM_GETACTION) {
- return tcf_dump_walker(skb, cb, a, hinfo);
+ return tcf_dump_walker(skb, cb, a);
} else {
WARN(1, "tcf_generic_walker: unknown action %d\n", type);
return -EINVAL;
}
}
-EXPORT_SYMBOL(tcf_generic_walker);
-struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
+static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
{
- struct tcf_common *p;
+ struct tcf_common *p = NULL;
+ struct hlist_head *head;
- read_lock_bh(hinfo->lock);
- for (p = hinfo->htab[tcf_hash(index, hinfo->hmask)]; p;
- p = p->tcfc_next) {
+ spin_lock_bh(&hinfo->lock);
+ head = &hinfo->htab[tcf_hash(index, hinfo->hmask)];
+ hlist_for_each_entry_rcu(p, head, tcfc_head)
if (p->tcfc_index == index)
break;
- }
- read_unlock_bh(hinfo->lock);
+ spin_unlock_bh(&hinfo->lock);
return p;
}
-EXPORT_SYMBOL(tcf_hash_lookup);
-u32 tcf_hash_new_index(u32 *idx_gen, struct tcf_hashinfo *hinfo)
+u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo)
{
- u32 val = *idx_gen;
+ u32 val = hinfo->index;
do {
if (++val == 0)
val = 1;
} while (tcf_hash_lookup(val, hinfo));
- return (*idx_gen = val);
+ hinfo->index = val;
+ return val;
}
EXPORT_SYMBOL(tcf_hash_new_index);
@@ -210,34 +209,46 @@ int tcf_hash_search(struct tc_action *a, u32 index)
}
EXPORT_SYMBOL(tcf_hash_search);
-struct tcf_common *tcf_hash_check(u32 index, struct tc_action *a, int bind,
- struct tcf_hashinfo *hinfo)
+int tcf_hash_check(u32 index, struct tc_action *a, int bind)
{
+ struct tcf_hashinfo *hinfo = a->ops->hinfo;
struct tcf_common *p = NULL;
if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) {
if (bind)
p->tcfc_bindcnt++;
p->tcfc_refcnt++;
a->priv = p;
+ return 1;
}
- return p;
+ return 0;
}
EXPORT_SYMBOL(tcf_hash_check);
-struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est,
- struct tc_action *a, int size, int bind,
- u32 *idx_gen, struct tcf_hashinfo *hinfo)
+void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
+{
+ struct tcf_common *pc = a->priv;
+ if (est)
+ gen_kill_estimator(&pc->tcfc_bstats,
+ &pc->tcfc_rate_est);
+ kfree_rcu(pc, tcfc_rcu);
+}
+EXPORT_SYMBOL(tcf_hash_cleanup);
+
+int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
+ int size, int bind)
{
+ struct tcf_hashinfo *hinfo = a->ops->hinfo;
struct tcf_common *p = kzalloc(size, GFP_KERNEL);
if (unlikely(!p))
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
p->tcfc_refcnt = 1;
if (bind)
p->tcfc_bindcnt = 1;
spin_lock_init(&p->tcfc_lock);
- p->tcfc_index = index ? index : tcf_hash_new_index(idx_gen, hinfo);
+ INIT_HLIST_NODE(&p->tcfc_head);
+ p->tcfc_index = index ? index : tcf_hash_new_index(hinfo);
p->tcfc_tm.install = jiffies;
p->tcfc_tm.lastuse = jiffies;
if (est) {
@@ -245,42 +256,64 @@ struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est,
&p->tcfc_lock, est);
if (err) {
kfree(p);
- return ERR_PTR(err);
+ return err;
}
}
a->priv = (void *) p;
- return p;
+ return 0;
}
EXPORT_SYMBOL(tcf_hash_create);
-void tcf_hash_insert(struct tcf_common *p, struct tcf_hashinfo *hinfo)
+void tcf_hash_insert(struct tc_action *a)
{
+ struct tcf_common *p = a->priv;
+ struct tcf_hashinfo *hinfo = a->ops->hinfo;
unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
- write_lock_bh(hinfo->lock);
- p->tcfc_next = hinfo->htab[h];
- hinfo->htab[h] = p;
- write_unlock_bh(hinfo->lock);
+ spin_lock_bh(&hinfo->lock);
+ hlist_add_head(&p->tcfc_head, &hinfo->htab[h]);
+ spin_unlock_bh(&hinfo->lock);
}
EXPORT_SYMBOL(tcf_hash_insert);
-static struct tc_action_ops *act_base = NULL;
+static LIST_HEAD(act_base);
static DEFINE_RWLOCK(act_mod_lock);
-int tcf_register_action(struct tc_action_ops *act)
+int tcf_register_action(struct tc_action_ops *act, unsigned int mask)
{
- struct tc_action_ops *a, **ap;
+ struct tc_action_ops *a;
+ int err;
+
+ /* Must supply act, dump and init */
+ if (!act->act || !act->dump || !act->init)
+ return -EINVAL;
+
+ /* Supply defaults */
+ if (!act->lookup)
+ act->lookup = tcf_hash_search;
+ if (!act->walk)
+ act->walk = tcf_generic_walker;
+
+ act->hinfo = kmalloc(sizeof(struct tcf_hashinfo), GFP_KERNEL);
+ if (!act->hinfo)
+ return -ENOMEM;
+ err = tcf_hashinfo_init(act->hinfo, mask);
+ if (err) {
+ kfree(act->hinfo);
+ return err;
+ }
write_lock(&act_mod_lock);
- for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) {
+ list_for_each_entry(a, &act_base, head) {
if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
write_unlock(&act_mod_lock);
+ tcf_hashinfo_destroy(act->hinfo);
+ kfree(act->hinfo);
return -EEXIST;
}
}
- act->next = NULL;
- *ap = act;
+ list_add_tail(&act->head, &act_base);
write_unlock(&act_mod_lock);
return 0;
}
@@ -288,17 +321,18 @@ EXPORT_SYMBOL(tcf_register_action);
int tcf_unregister_action(struct tc_action_ops *act)
{
- struct tc_action_ops *a, **ap;
+ struct tc_action_ops *a;
int err = -ENOENT;
write_lock(&act_mod_lock);
- for (ap = &act_base; (a = *ap) != NULL; ap = &a->next)
- if (a == act)
+ list_for_each_entry(a, &act_base, head) {
+ if (a == act) {
+ list_del(&act->head);
+ tcf_hashinfo_destroy(act->hinfo);
+ kfree(act->hinfo);
+ err = 0;
break;
- if (a) {
- *ap = a->next;
- a->next = NULL;
- err = 0;
+ }
}
write_unlock(&act_mod_lock);
return err;
@@ -308,72 +342,45 @@ EXPORT_SYMBOL(tcf_unregister_action);
/* lookup by name */
static struct tc_action_ops *tc_lookup_action_n(char *kind)
{
- struct tc_action_ops *a = NULL;
+ struct tc_action_ops *a, *res = NULL;
if (kind) {
read_lock(&act_mod_lock);
- for (a = act_base; a; a = a->next) {
+ list_for_each_entry(a, &act_base, head) {
if (strcmp(kind, a->kind) == 0) {
- if (!try_module_get(a->owner)) {
- read_unlock(&act_mod_lock);
- return NULL;
- }
+ if (try_module_get(a->owner))
+ res = a;
break;
}
}
read_unlock(&act_mod_lock);
}
- return a;
+ return res;
}
/* lookup by nlattr */
static struct tc_action_ops *tc_lookup_action(struct nlattr *kind)
{
- struct tc_action_ops *a = NULL;
+ struct tc_action_ops *a, *res = NULL;
if (kind) {
read_lock(&act_mod_lock);
- for (a = act_base; a; a = a->next) {
+ list_for_each_entry(a, &act_base, head) {
if (nla_strcmp(kind, a->kind) == 0) {
- if (!try_module_get(a->owner)) {
- read_unlock(&act_mod_lock);
- return NULL;
- }
+ if (try_module_get(a->owner))
+ res = a;
break;
}
}
read_unlock(&act_mod_lock);
}
- return a;
+ return res;
}
-#if 0
-/* lookup by id */
-static struct tc_action_ops *tc_lookup_action_id(u32 type)
-{
- struct tc_action_ops *a = NULL;
-
- if (type) {
- read_lock(&act_mod_lock);
- for (a = act_base; a; a = a->next) {
- if (a->type == type) {
- if (!try_module_get(a->owner)) {
- read_unlock(&act_mod_lock);
- return NULL;
- }
- break;
- }
- }
- read_unlock(&act_mod_lock);
- }
- return a;
-}
-#endif
-
-int tcf_action_exec(struct sk_buff *skb, struct tc_action *act,
+int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions,
struct tcf_result *res)
{
- struct tc_action *a;
+ const struct tc_action *a;
int ret = -1;
if (skb->tc_verd & TC_NCLS) {
@@ -381,53 +388,44 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action *act,
ret = TC_ACT_OK;
goto exec_done;
}
- while ((a = act) != NULL) {
+ list_for_each_entry(a, actions, list) {
repeat:
- if (a->ops && a->ops->act) {
- ret = a->ops->act(skb, a, res);
- if (TC_MUNGED & skb->tc_verd) {
- /* copied already, allow trampling */
- skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
- skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd);
- }
- if (ret == TC_ACT_REPEAT)
- goto repeat; /* we need a ttl - JHS */
- if (ret != TC_ACT_PIPE)
- goto exec_done;
+ ret = a->ops->act(skb, a, res);
+ if (TC_MUNGED & skb->tc_verd) {
+ /* copied already, allow trampling */
+ skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
+ skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd);
}
- act = a->next;
+ if (ret == TC_ACT_REPEAT)
+ goto repeat; /* we need a ttl - JHS */
+ if (ret != TC_ACT_PIPE)
+ goto exec_done;
}
exec_done:
return ret;
}
EXPORT_SYMBOL(tcf_action_exec);
-void tcf_action_destroy(struct tc_action *act, int bind)
+int tcf_action_destroy(struct list_head *actions, int bind)
{
- struct tc_action *a;
+ struct tc_action *a, *tmp;
+ int ret = 0;
- for (a = act; a; a = act) {
- if (a->ops && a->ops->cleanup) {
- if (a->ops->cleanup(a, bind) == ACT_P_DELETED)
- module_put(a->ops->owner);
- act = act->next;
- kfree(a);
- } else {
- /*FIXME: Remove later - catch insertion bugs*/
- WARN(1, "tcf_action_destroy: BUG? destroying NULL ops\n");
- act = act->next;
- kfree(a);
- }
+ list_for_each_entry_safe(a, tmp, actions, list) {
+ ret = tcf_hash_release(a, bind);
+ if (ret == ACT_P_DELETED)
+ module_put(a->ops->owner);
+ else if (ret < 0)
+ return ret;
+ list_del(&a->list);
+ kfree(a);
}
+ return ret;
}
int
tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
- int err = -EINVAL;
-
- if (a->ops == NULL || a->ops->dump == NULL)
- return err;
return a->ops->dump(skb, a, bind, ref);
}
@@ -438,16 +436,15 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
- if (a->ops == NULL || a->ops->dump == NULL)
- return err;
-
- NLA_PUT_STRING(skb, TCA_KIND, a->ops->kind);
+ if (nla_put_string(skb, TCA_KIND, a->ops->kind))
+ goto nla_put_failure;
if (tcf_action_copy_stats(skb, a, 0))
goto nla_put_failure;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
- if ((err = tcf_action_dump_old(skb, a, bind, ref)) > 0) {
+ err = tcf_action_dump_old(skb, a, bind, ref);
+ if (err > 0) {
nla_nest_end(skb, nest);
return err;
}
@@ -459,14 +456,13 @@ nla_put_failure:
EXPORT_SYMBOL(tcf_action_dump_1);
int
-tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref)
+tcf_action_dump(struct sk_buff *skb, struct list_head *actions, int bind, int ref)
{
struct tc_action *a;
int err = -EINVAL;
struct nlattr *nest;
- while ((a = act) != NULL) {
- act = a->next;
+ list_for_each_entry(a, actions, list) {
nest = nla_nest_start(skb, a->order);
if (nest == NULL)
goto nla_put_failure;
@@ -485,13 +481,14 @@ errout:
return err;
}
-struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est,
- char *name, int ovr, int bind)
+struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
+ struct nlattr *est, char *name, int ovr,
+ int bind)
{
struct tc_action *a;
struct tc_action_ops *a_o;
char act_name[IFNAMSIZ];
- struct nlattr *tb[TCA_ACT_MAX+1];
+ struct nlattr *tb[TCA_ACT_MAX + 1];
struct nlattr *kind;
int err;
@@ -540,21 +537,22 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est,
if (a == NULL)
goto err_mod;
+ a->ops = a_o;
+ INIT_LIST_HEAD(&a->list);
/* backward compatibility for policer */
if (name == NULL)
- err = a_o->init(tb[TCA_ACT_OPTIONS], est, a, ovr, bind);
+ err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, a, ovr, bind);
else
- err = a_o->init(nla, est, a, ovr, bind);
+ err = a_o->init(net, nla, est, a, ovr, bind);
if (err < 0)
goto err_free;
/* module count goes up only when brand new policy is created
- if it exists and is only bound to in a_o->init() then
- ACT_P_CREATED is not returned (a zero is).
- */
+ * if it exists and is only bound to in a_o->init() then
+ * ACT_P_CREATED is not returned (a zero is).
+ */
if (err != ACT_P_CREATED)
module_put(a_o->owner);
- a->ops = a_o;
return a;
@@ -566,36 +564,33 @@ err_out:
return ERR_PTR(err);
}
-struct tc_action *tcf_action_init(struct nlattr *nla, struct nlattr *est,
- char *name, int ovr, int bind)
+int tcf_action_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, char *name, int ovr,
+ int bind, struct list_head *actions)
{
- struct nlattr *tb[TCA_ACT_MAX_PRIO+1];
- struct tc_action *head = NULL, *act, *act_prev = NULL;
+ struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+ struct tc_action *act;
int err;
int i;
err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL);
if (err < 0)
- return ERR_PTR(err);
+ return err;
for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
- act = tcf_action_init_1(tb[i], est, name, ovr, bind);
- if (IS_ERR(act))
+ act = tcf_action_init_1(net, tb[i], est, name, ovr, bind);
+ if (IS_ERR(act)) {
+ err = PTR_ERR(act);
goto err;
+ }
act->order = i;
-
- if (head == NULL)
- head = act;
- else
- act_prev->next = act;
- act_prev = act;
+ list_add_tail(&act->list, actions);
}
- return head;
+ return 0;
err:
- if (head != NULL)
- tcf_action_destroy(head, bind);
- return act;
+ tcf_action_destroy(actions, bind);
+ return err;
}
int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
@@ -603,9 +598,9 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
{
int err = 0;
struct gnet_dump d;
- struct tcf_act_hdr *h = a->priv;
+ struct tcf_common *p = a->priv;
- if (h == NULL)
+ if (p == NULL)
goto errout;
/* compat_mode being true specifies a call that is supposed
@@ -614,24 +609,20 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
if (compat_mode) {
if (a->type == TCA_OLD_COMPAT)
err = gnet_stats_start_copy_compat(skb, 0,
- TCA_STATS, TCA_XSTATS, &h->tcf_lock, &d);
+ TCA_STATS, TCA_XSTATS, &p->tcfc_lock, &d);
else
return 0;
} else
err = gnet_stats_start_copy(skb, TCA_ACT_STATS,
- &h->tcf_lock, &d);
+ &p->tcfc_lock, &d);
if (err < 0)
goto errout;
- if (a->ops != NULL && a->ops->get_stats != NULL)
- if (a->ops->get_stats(skb, a) < 0)
- goto errout;
-
- if (gnet_stats_copy_basic(&d, &h->tcf_bstats) < 0 ||
- gnet_stats_copy_rate_est(&d, &h->tcf_bstats,
- &h->tcf_rate_est) < 0 ||
- gnet_stats_copy_queue(&d, &h->tcf_qstats) < 0)
+ if (gnet_stats_copy_basic(&d, &p->tcfc_bstats) < 0 ||
+ gnet_stats_copy_rate_est(&d, &p->tcfc_bstats,
+ &p->tcfc_rate_est) < 0 ||
+ gnet_stats_copy_queue(&d, &p->tcfc_qstats) < 0)
goto errout;
if (gnet_stats_finish_copy(&d) < 0)
@@ -644,7 +635,7 @@ errout:
}
static int
-tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,
+tca_get_fill(struct sk_buff *skb, struct list_head *actions, u32 portid, u32 seq,
u16 flags, int event, int bind, int ref)
{
struct tcamsg *t;
@@ -652,52 +643,66 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
- nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
-
- t = NLMSG_DATA(nlh);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*t), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ t = nlmsg_data(nlh);
t->tca_family = AF_UNSPEC;
t->tca__pad1 = 0;
t->tca__pad2 = 0;
nest = nla_nest_start(skb, TCA_ACT_TAB);
if (nest == NULL)
- goto nla_put_failure;
+ goto out_nlmsg_trim;
- if (tcf_action_dump(skb, a, bind, ref) < 0)
- goto nla_put_failure;
+ if (tcf_action_dump(skb, actions, bind, ref) < 0)
+ goto out_nlmsg_trim;
nla_nest_end(skb, nest);
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
return skb->len;
-nla_put_failure:
-nlmsg_failure:
+out_nlmsg_trim:
nlmsg_trim(skb, b);
return -1;
}
static int
-act_get_notify(struct net *net, u32 pid, struct nlmsghdr *n,
- struct tc_action *a, int event)
+act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
+ struct list_head *actions, int event)
{
struct sk_buff *skb;
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
- if (tca_get_fill(skb, a, pid, n->nlmsg_seq, 0, event, 0, 0) <= 0) {
+ if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
- return rtnl_unicast(skb, net, pid);
+ return rtnl_unicast(skb, net, portid);
+}
+
+static struct tc_action *create_a(int i)
+{
+ struct tc_action *act;
+
+ act = kzalloc(sizeof(*act), GFP_KERNEL);
+ if (act == NULL) {
+ pr_debug("create_a: failed to alloc!\n");
+ return NULL;
+ }
+ act->order = i;
+ INIT_LIST_HEAD(&act->list);
+ return act;
}
static struct tc_action *
-tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 pid)
+tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid)
{
- struct nlattr *tb[TCA_ACT_MAX+1];
+ struct nlattr *tb[TCA_ACT_MAX + 1];
struct tc_action *a;
int index;
int err;
@@ -713,16 +718,14 @@ tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 pid)
index = nla_get_u32(tb[TCA_ACT_INDEX]);
err = -ENOMEM;
- a = kzalloc(sizeof(struct tc_action), GFP_KERNEL);
+ a = create_a(0);
if (a == NULL)
goto err_out;
err = -EINVAL;
a->ops = tc_lookup_action(tb[TCA_ACT_KIND]);
- if (a->ops == NULL)
+ if (a->ops == NULL) /* could happen in batch of actions */
goto err_free;
- if (a->ops->lookup == NULL)
- goto err_mod;
err = -ENOENT;
if (a->ops->lookup(a, index) == 0)
goto err_mod;
@@ -738,31 +741,18 @@ err_out:
return ERR_PTR(err);
}
-static void cleanup_a(struct tc_action *act)
+static void cleanup_a(struct list_head *actions)
{
- struct tc_action *a;
+ struct tc_action *a, *tmp;
- for (a = act; a; a = act) {
- act = a->next;
+ list_for_each_entry_safe(a, tmp, actions, list) {
+ list_del(&a->list);
kfree(a);
}
}
-static struct tc_action *create_a(int i)
-{
- struct tc_action *act;
-
- act = kzalloc(sizeof(*act), GFP_KERNEL);
- if (act == NULL) {
- pr_debug("create_a: failed to alloc!\n");
- return NULL;
- }
- act->order = i;
- return act;
-}
-
static int tca_action_flush(struct net *net, struct nlattr *nla,
- struct nlmsghdr *n, u32 pid)
+ struct nlmsghdr *n, u32 portid)
{
struct sk_buff *skb;
unsigned char *b;
@@ -770,20 +760,14 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
struct tcamsg *t;
struct netlink_callback dcb;
struct nlattr *nest;
- struct nlattr *tb[TCA_ACT_MAX+1];
+ struct nlattr *tb[TCA_ACT_MAX + 1];
struct nlattr *kind;
- struct tc_action *a = create_a(0);
+ struct tc_action a;
int err = -ENOMEM;
- if (a == NULL) {
- pr_debug("tca_action_flush: couldnt create tc_action\n");
- return err;
- }
-
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb) {
pr_debug("tca_action_flush: failed skb alloc\n");
- kfree(a);
return err;
}
@@ -795,23 +779,27 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
err = -EINVAL;
kind = tb[TCA_ACT_KIND];
- a->ops = tc_lookup_action(kind);
- if (a->ops == NULL)
+ memset(&a, 0, sizeof(struct tc_action));
+ INIT_LIST_HEAD(&a.list);
+ a.ops = tc_lookup_action(kind);
+ if (a.ops == NULL) /*some idjot trying to flush unknown action */
goto err_out;
- nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t));
- t = NLMSG_DATA(nlh);
+ nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0);
+ if (!nlh)
+ goto out_module_put;
+ t = nlmsg_data(nlh);
t->tca_family = AF_UNSPEC;
t->tca__pad1 = 0;
t->tca__pad2 = 0;
nest = nla_nest_start(skb, TCA_ACT_TAB);
if (nest == NULL)
- goto nla_put_failure;
+ goto out_module_put;
- err = a->ops->walk(skb, &dcb, RTM_DELACTION, a);
+ err = a.ops->walk(skb, &dcb, RTM_DELACTION, &a);
if (err < 0)
- goto nla_put_failure;
+ goto out_module_put;
if (err == 0)
goto noflush_out;
@@ -819,171 +807,150 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
nlh->nlmsg_flags |= NLM_F_ROOT;
- module_put(a->ops->owner);
- kfree(a);
- err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+ module_put(a.ops->owner);
+ err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
if (err > 0)
return 0;
return err;
-nla_put_failure:
-nlmsg_failure:
- module_put(a->ops->owner);
+out_module_put:
+ module_put(a.ops->owner);
err_out:
noflush_out:
kfree_skb(skb);
- kfree(a);
return err;
}
static int
+tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
+ u32 portid)
+{
+ int ret;
+ struct sk_buff *skb;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION,
+ 0, 1) <= 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ /* now do the delete */
+ ret = tcf_action_destroy(actions, 0);
+ if (ret < 0) {
+ kfree_skb(skb);
+ return ret;
+ }
+
+ ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+ if (ret > 0)
+ return 0;
+ return ret;
+}
+
+static int
tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
- u32 pid, int event)
+ u32 portid, int event)
{
int i, ret;
- struct nlattr *tb[TCA_ACT_MAX_PRIO+1];
- struct tc_action *head = NULL, *act, *act_prev = NULL;
+ struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+ struct tc_action *act;
+ LIST_HEAD(actions);
ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL);
if (ret < 0)
return ret;
- if (event == RTM_DELACTION && n->nlmsg_flags&NLM_F_ROOT) {
+ if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) {
if (tb[1] != NULL)
- return tca_action_flush(net, tb[1], n, pid);
+ return tca_action_flush(net, tb[1], n, portid);
else
return -EINVAL;
}
for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
- act = tcf_action_get_1(tb[i], n, pid);
+ act = tcf_action_get_1(tb[i], n, portid);
if (IS_ERR(act)) {
ret = PTR_ERR(act);
goto err;
}
act->order = i;
-
- if (head == NULL)
- head = act;
- else
- act_prev->next = act;
- act_prev = act;
+ list_add_tail(&act->list, &actions);
}
if (event == RTM_GETACTION)
- ret = act_get_notify(net, pid, n, head, event);
+ ret = act_get_notify(net, portid, n, &actions, event);
else { /* delete */
- struct sk_buff *skb;
-
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb) {
- ret = -ENOBUFS;
- goto err;
- }
-
- if (tca_get_fill(skb, head, pid, n->nlmsg_seq, 0, event,
- 0, 1) <= 0) {
- kfree_skb(skb);
- ret = -EINVAL;
+ ret = tcf_del_notify(net, n, &actions, portid);
+ if (ret)
goto err;
- }
-
- /* now do the delete */
- tcf_action_destroy(head, 0);
- ret = rtnetlink_send(skb, net, pid, RTNLGRP_TC,
- n->nlmsg_flags&NLM_F_ECHO);
- if (ret > 0)
- return 0;
return ret;
}
err:
- cleanup_a(head);
+ cleanup_a(&actions);
return ret;
}
-static int tcf_add_notify(struct net *net, struct tc_action *a,
- u32 pid, u32 seq, int event, u16 flags)
+static int
+tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
+ u32 portid)
{
- struct tcamsg *t;
- struct nlmsghdr *nlh;
struct sk_buff *skb;
- struct nlattr *nest;
- unsigned char *b;
int err = 0;
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
- b = skb_tail_pointer(skb);
-
- nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
- t = NLMSG_DATA(nlh);
- t->tca_family = AF_UNSPEC;
- t->tca__pad1 = 0;
- t->tca__pad2 = 0;
-
- nest = nla_nest_start(skb, TCA_ACT_TAB);
- if (nest == NULL)
- goto nla_put_failure;
-
- if (tcf_action_dump(skb, a, 0, 0) < 0)
- goto nla_put_failure;
-
- nla_nest_end(skb, nest);
-
- nlh->nlmsg_len = skb_tail_pointer(skb) - b;
- NETLINK_CB(skb).dst_group = RTNLGRP_TC;
+ if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags,
+ RTM_NEWACTION, 0, 0) <= 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
- err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, flags&NLM_F_ECHO);
+ err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
if (err > 0)
err = 0;
return err;
-
-nla_put_failure:
-nlmsg_failure:
- kfree_skb(skb);
- return -1;
}
-
static int
tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
- u32 pid, int ovr)
+ u32 portid, int ovr)
{
int ret = 0;
- struct tc_action *act;
- struct tc_action *a;
- u32 seq = n->nlmsg_seq;
+ LIST_HEAD(actions);
- act = tcf_action_init(nla, NULL, NULL, ovr, 0);
- if (act == NULL)
- goto done;
- if (IS_ERR(act)) {
- ret = PTR_ERR(act);
+ ret = tcf_action_init(net, nla, NULL, NULL, ovr, 0, &actions);
+ if (ret)
goto done;
- }
/* dump then free all the actions after update; inserted policy
* stays intact
- * */
- ret = tcf_add_notify(net, act, pid, seq, RTM_NEWACTION, n->nlmsg_flags);
- for (a = act; a; a = act) {
- act = a->next;
- kfree(a);
- }
+ */
+ ret = tcf_add_notify(net, n, &actions, portid);
+ cleanup_a(&actions);
done:
return ret;
}
-static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_ACT_MAX + 1];
- u32 pid = skb ? NETLINK_CB(skb).pid : 0;
+ u32 portid = skb ? NETLINK_CB(skb).portid : 0;
int ret = 0, ovr = 0;
+ if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL);
if (ret < 0)
return ret;
@@ -993,30 +960,29 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
return -EINVAL;
}
- /* n->nlmsg_flags&NLM_F_CREATE
- * */
+ /* n->nlmsg_flags & NLM_F_CREATE */
switch (n->nlmsg_type) {
case RTM_NEWACTION:
/* we are going to assume all other flags
- * imply create only if it doesnt exist
+ * imply create only if it doesn't exist
* Note that CREATE | EXCL implies that
* but since we want avoid ambiguity (eg when flags
* is zero) then just set this
*/
- if (n->nlmsg_flags&NLM_F_REPLACE)
+ if (n->nlmsg_flags & NLM_F_REPLACE)
ovr = 1;
replay:
- ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, pid, ovr);
+ ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr);
if (ret == -EAGAIN)
goto replay;
break;
case RTM_DELACTION:
ret = tca_action_gd(net, tca[TCA_ACT_TAB], n,
- pid, RTM_DELACTION);
+ portid, RTM_DELACTION);
break;
case RTM_GETACTION:
ret = tca_action_gd(net, tca[TCA_ACT_TAB], n,
- pid, RTM_GETACTION);
+ portid, RTM_GETACTION);
break;
default:
BUG();
@@ -1028,7 +994,7 @@ replay:
static struct nlattr *
find_dump_kind(const struct nlmsghdr *n)
{
- struct nlattr *tb1, *tb2[TCA_ACT_MAX+1];
+ struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct nlattr *nla[TCAA_MAX + 1];
struct nlattr *kind;
@@ -1062,7 +1028,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
struct tc_action_ops *a_o;
struct tc_action a;
int ret = 0;
- struct tcamsg *t = (struct tcamsg *) NLMSG_DATA(cb->nlh);
+ struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
struct nlattr *kind = find_dump_kind(cb->nlh);
if (kind == NULL) {
@@ -1071,33 +1037,28 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
}
a_o = tc_lookup_action(kind);
- if (a_o == NULL) {
+ if (a_o == NULL)
return 0;
- }
memset(&a, 0, sizeof(struct tc_action));
a.ops = a_o;
- if (a_o->walk == NULL) {
- WARN(1, "tc_dump_action: %s !capable of dumping table\n",
- a_o->kind);
- goto nla_put_failure;
- }
-
- nlh = NLMSG_PUT(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
- cb->nlh->nlmsg_type, sizeof(*t));
- t = NLMSG_DATA(nlh);
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*t), 0);
+ if (!nlh)
+ goto out_module_put;
+ t = nlmsg_data(nlh);
t->tca_family = AF_UNSPEC;
t->tca__pad1 = 0;
t->tca__pad2 = 0;
nest = nla_nest_start(skb, TCA_ACT_TAB);
if (nest == NULL)
- goto nla_put_failure;
+ goto out_module_put;
ret = a_o->walk(skb, cb, RTM_GETACTION, &a);
if (ret < 0)
- goto nla_put_failure;
+ goto out_module_put;
if (ret > 0) {
nla_nest_end(skb, nest);
@@ -1106,13 +1067,12 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
nla_nest_cancel(skb, nest);
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
- if (NETLINK_CB(cb->skb).pid && ret)
+ if (NETLINK_CB(cb->skb).portid && ret)
nlh->nlmsg_flags |= NLM_F_MULTI;
module_put(a_o->owner);
return skb->len;
-nla_put_failure:
-nlmsg_failure:
+out_module_put:
module_put(a_o->owner);
nlmsg_trim(skb, b);
return skb->len;
@@ -1120,9 +1080,10 @@ nlmsg_failure:
static int __init tc_action_init(void)
{
- rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL);
- rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action);
+ rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action,
+ NULL);
return 0;
}
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 67dc7ce9b63..edbf40dac70 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -37,33 +37,23 @@
#include <net/tc_act/tc_csum.h>
#define CSUM_TAB_MASK 15
-static struct tcf_common *tcf_csum_ht[CSUM_TAB_MASK + 1];
-static u32 csum_idx_gen;
-static DEFINE_RWLOCK(csum_lock);
-
-static struct tcf_hashinfo csum_hash_info = {
- .htab = tcf_csum_ht,
- .hmask = CSUM_TAB_MASK,
- .lock = &csum_lock,
-};
static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
[TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
};
-static int tcf_csum_init(struct nlattr *nla, struct nlattr *est,
+static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
struct tc_action *a, int ovr, int bind)
{
struct nlattr *tb[TCA_CSUM_MAX + 1];
struct tc_csum *parm;
- struct tcf_common *pc;
struct tcf_csum *p;
int ret = 0, err;
if (nla == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_CSUM_MAX, nla,csum_policy);
+ err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy);
if (err < 0)
return err;
@@ -71,39 +61,31 @@ static int tcf_csum_init(struct nlattr *nla, struct nlattr *est,
return -EINVAL;
parm = nla_data(tb[TCA_CSUM_PARMS]);
- pc = tcf_hash_check(parm->index, a, bind, &csum_hash_info);
- if (!pc) {
- pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
- &csum_idx_gen, &csum_hash_info);
- if (IS_ERR(pc))
- return PTR_ERR(pc);
- p = to_tcf_csum(pc);
+ if (!tcf_hash_check(parm->index, a, bind)) {
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+ if (ret)
+ return ret;
ret = ACT_P_CREATED;
} else {
- p = to_tcf_csum(pc);
- if (!ovr) {
- tcf_hash_release(pc, bind, &csum_hash_info);
+ if (bind)/* dont override defaults */
+ return 0;
+ tcf_hash_release(a, bind);
+ if (!ovr)
return -EEXIST;
- }
}
+ p = to_tcf_csum(a);
spin_lock_bh(&p->tcf_lock);
p->tcf_action = parm->action;
p->update_flags = parm->update_flags;
spin_unlock_bh(&p->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(pc, &csum_hash_info);
+ tcf_hash_insert(a);
return ret;
}
-static int tcf_csum_cleanup(struct tc_action *a, int bind)
-{
- struct tcf_csum *p = a->priv;
- return tcf_hash_release(&p->common, bind, &csum_hash_info);
-}
-
/**
* tcf_csum_skb_nextlayer - Get next layer pointer
* @skb: sk_buff to use
@@ -166,15 +148,17 @@ static int tcf_csum_ipv4_igmp(struct sk_buff *skb,
return 1;
}
-static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h,
+static int tcf_csum_ipv6_icmp(struct sk_buff *skb,
unsigned int ihl, unsigned int ipl)
{
struct icmp6hdr *icmp6h;
+ const struct ipv6hdr *ip6h;
icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h));
if (icmp6h == NULL)
return 0;
+ ip6h = ipv6_hdr(skb);
icmp6h->icmp6_cksum = 0;
skb->csum = csum_partial(icmp6h, ipl - ihl, 0);
icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
@@ -186,15 +170,17 @@ static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h,
return 1;
}
-static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph,
+static int tcf_csum_ipv4_tcp(struct sk_buff *skb,
unsigned int ihl, unsigned int ipl)
{
struct tcphdr *tcph;
+ const struct iphdr *iph;
tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
if (tcph == NULL)
return 0;
+ iph = ip_hdr(skb);
tcph->check = 0;
skb->csum = csum_partial(tcph, ipl - ihl, 0);
tcph->check = tcp_v4_check(ipl - ihl,
@@ -205,15 +191,17 @@ static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph,
return 1;
}
-static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h,
+static int tcf_csum_ipv6_tcp(struct sk_buff *skb,
unsigned int ihl, unsigned int ipl)
{
struct tcphdr *tcph;
+ const struct ipv6hdr *ip6h;
tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
if (tcph == NULL)
return 0;
+ ip6h = ipv6_hdr(skb);
tcph->check = 0;
skb->csum = csum_partial(tcph, ipl - ihl, 0);
tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
@@ -225,10 +213,11 @@ static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h,
return 1;
}
-static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph,
+static int tcf_csum_ipv4_udp(struct sk_buff *skb,
unsigned int ihl, unsigned int ipl, int udplite)
{
struct udphdr *udph;
+ const struct iphdr *iph;
u16 ul;
/*
@@ -242,6 +231,7 @@ static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph,
if (udph == NULL)
return 0;
+ iph = ip_hdr(skb);
ul = ntohs(udph->len);
if (udplite || udph->check) {
@@ -276,10 +266,11 @@ ignore_obscure_skb:
return 1;
}
-static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h,
+static int tcf_csum_ipv6_udp(struct sk_buff *skb,
unsigned int ihl, unsigned int ipl, int udplite)
{
struct udphdr *udph;
+ const struct ipv6hdr *ip6h;
u16 ul;
/*
@@ -293,6 +284,7 @@ static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h,
if (udph == NULL)
return 0;
+ ip6h = ipv6_hdr(skb);
ul = ntohs(udph->len);
udph->check = 0;
@@ -328,7 +320,7 @@ ignore_obscure_skb:
static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
{
- struct iphdr *iph;
+ const struct iphdr *iph;
int ntkoff;
ntkoff = skb_network_offset(skb);
@@ -353,19 +345,19 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
break;
case IPPROTO_TCP:
if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
- if (!tcf_csum_ipv4_tcp(skb, iph, iph->ihl * 4,
+ if (!tcf_csum_ipv4_tcp(skb, iph->ihl * 4,
ntohs(iph->tot_len)))
goto fail;
break;
case IPPROTO_UDP:
if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
- if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4,
+ if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4,
ntohs(iph->tot_len), 0))
goto fail;
break;
case IPPROTO_UDPLITE:
if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
- if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4,
+ if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4,
ntohs(iph->tot_len), 1))
goto fail;
break;
@@ -377,7 +369,7 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
goto fail;
- ip_send_check(iph);
+ ip_send_check(ip_hdr(skb));
}
return 1;
@@ -397,7 +389,7 @@ static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh,
while (len > 1) {
switch (xh[off]) {
- case IPV6_TLV_PAD0:
+ case IPV6_TLV_PAD1:
optlen = 1;
break;
case IPV6_TLV_JUMBO:
@@ -456,6 +448,7 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)
ixhl = ipv6_optlen(ip6xh);
if (!pskb_may_pull(skb, hl + ixhl + ntkoff))
goto fail;
+ ip6xh = (void *)(skb_network_header(skb) + hl);
if ((nexthdr == NEXTHDR_HOP) &&
!(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl)))
goto fail;
@@ -464,25 +457,25 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)
break;
case IPPROTO_ICMPV6:
if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
- if (!tcf_csum_ipv6_icmp(skb, ip6h,
+ if (!tcf_csum_ipv6_icmp(skb,
hl, pl + sizeof(*ip6h)))
goto fail;
goto done;
case IPPROTO_TCP:
if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
- if (!tcf_csum_ipv6_tcp(skb, ip6h,
+ if (!tcf_csum_ipv6_tcp(skb,
hl, pl + sizeof(*ip6h)))
goto fail;
goto done;
case IPPROTO_UDP:
if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
- if (!tcf_csum_ipv6_udp(skb, ip6h, hl,
+ if (!tcf_csum_ipv6_udp(skb, hl,
pl + sizeof(*ip6h), 0))
goto fail;
goto done;
case IPPROTO_UDPLITE:
if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
- if (!tcf_csum_ipv6_udp(skb, ip6h, hl,
+ if (!tcf_csum_ipv6_udp(skb, hl,
pl + sizeof(*ip6h), 1))
goto fail;
goto done;
@@ -500,7 +493,7 @@ fail:
}
static int tcf_csum(struct sk_buff *skb,
- struct tc_action *a, struct tcf_result *res)
+ const struct tc_action *a, struct tcf_result *res)
{
struct tcf_csum *p = a->priv;
int action;
@@ -508,8 +501,7 @@ static int tcf_csum(struct sk_buff *skb,
spin_lock(&p->tcf_lock);
p->tcf_tm.lastuse = jiffies;
- p->tcf_bstats.bytes += qdisc_pkt_len(skb);
- p->tcf_bstats.packets++;
+ bstats_update(&p->tcf_bstats, skb);
action = p->tcf_action;
update_flags = p->update_flags;
spin_unlock(&p->tcf_lock);
@@ -551,11 +543,13 @@ static int tcf_csum_dump(struct sk_buff *skb,
};
struct tcf_t t;
- NLA_PUT(skb, TCA_CSUM_PARMS, sizeof(opt), &opt);
+ if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
- NLA_PUT(skb, TCA_CSUM_TM, sizeof(t), &t);
+ if (nla_put(skb, TCA_CSUM_TM, sizeof(t), &t))
+ goto nla_put_failure;
return skb->len;
@@ -566,16 +560,11 @@ nla_put_failure:
static struct tc_action_ops act_csum_ops = {
.kind = "csum",
- .hinfo = &csum_hash_info,
.type = TCA_ACT_CSUM,
- .capab = TCA_CAP_NONE,
.owner = THIS_MODULE,
.act = tcf_csum,
.dump = tcf_csum_dump,
- .cleanup = tcf_csum_cleanup,
- .lookup = tcf_hash_search,
.init = tcf_csum_init,
- .walk = tcf_generic_walker
};
MODULE_DESCRIPTION("Checksum updating actions");
@@ -583,7 +572,7 @@ MODULE_LICENSE("GPL");
static int __init csum_init_module(void)
{
- return tcf_register_action(&act_csum_ops);
+ return tcf_register_action(&act_csum_ops, CSUM_TAB_MASK);
}
static void __exit csum_cleanup_module(void)
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index c2ed90a4c0b..d6bcbd9f779 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -24,20 +24,11 @@
#include <net/tc_act/tc_gact.h>
#define GACT_TAB_MASK 15
-static struct tcf_common *tcf_gact_ht[GACT_TAB_MASK + 1];
-static u32 gact_idx_gen;
-static DEFINE_RWLOCK(gact_lock);
-
-static struct tcf_hashinfo gact_hash_info = {
- .htab = tcf_gact_ht,
- .hmask = GACT_TAB_MASK,
- .lock = &gact_lock,
-};
#ifdef CONFIG_GACT_PROB
static int gact_net_rand(struct tcf_gact *gact)
{
- if (!gact->tcfg_pval || net_random() % gact->tcfg_pval)
+ if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval)
return gact->tcf_action;
return gact->tcfg_paction;
}
@@ -50,7 +41,7 @@ static int gact_determ(struct tcf_gact *gact)
}
typedef int (*g_rand)(struct tcf_gact *gact);
-static g_rand gact_rand[MAX_RAND]= { NULL, gact_net_rand, gact_determ };
+static g_rand gact_rand[MAX_RAND] = { NULL, gact_net_rand, gact_determ };
#endif /* CONFIG_GACT_PROB */
static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
@@ -58,15 +49,18 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
[TCA_GACT_PROB] = { .len = sizeof(struct tc_gact_p) },
};
-static int tcf_gact_init(struct nlattr *nla, struct nlattr *est,
- struct tc_action *a, int ovr, int bind)
+static int tcf_gact_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a,
+ int ovr, int bind)
{
struct nlattr *tb[TCA_GACT_MAX + 1];
struct tc_gact *parm;
struct tcf_gact *gact;
- struct tcf_common *pc;
int ret = 0;
int err;
+#ifdef CONFIG_GACT_PROB
+ struct tc_gact_p *p_parm = NULL;
+#endif
if (nla == NULL)
return -EINVAL;
@@ -82,29 +76,33 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est,
#ifndef CONFIG_GACT_PROB
if (tb[TCA_GACT_PROB] != NULL)
return -EOPNOTSUPP;
+#else
+ if (tb[TCA_GACT_PROB]) {
+ p_parm = nla_data(tb[TCA_GACT_PROB]);
+ if (p_parm->ptype >= MAX_RAND)
+ return -EINVAL;
+ }
#endif
- pc = tcf_hash_check(parm->index, a, bind, &gact_hash_info);
- if (!pc) {
- pc = tcf_hash_create(parm->index, est, a, sizeof(*gact),
- bind, &gact_idx_gen, &gact_hash_info);
- if (IS_ERR(pc))
- return PTR_ERR(pc);
+ if (!tcf_hash_check(parm->index, a, bind)) {
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind);
+ if (ret)
+ return ret;
ret = ACT_P_CREATED;
} else {
- if (!ovr) {
- tcf_hash_release(pc, bind, &gact_hash_info);
+ if (bind)/* dont override defaults */
+ return 0;
+ tcf_hash_release(a, bind);
+ if (!ovr)
return -EEXIST;
- }
}
- gact = to_gact(pc);
+ gact = to_gact(a);
spin_lock_bh(&gact->tcf_lock);
gact->tcf_action = parm->action;
#ifdef CONFIG_GACT_PROB
- if (tb[TCA_GACT_PROB] != NULL) {
- struct tc_gact_p *p_parm = nla_data(tb[TCA_GACT_PROB]);
+ if (p_parm) {
gact->tcfg_paction = p_parm->paction;
gact->tcfg_pval = p_parm->pval;
gact->tcfg_ptype = p_parm->ptype;
@@ -112,27 +110,19 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est,
#endif
spin_unlock_bh(&gact->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(pc, &gact_hash_info);
+ tcf_hash_insert(a);
return ret;
}
-static int tcf_gact_cleanup(struct tc_action *a, int bind)
-{
- struct tcf_gact *gact = a->priv;
-
- if (gact)
- return tcf_hash_release(&gact->common, bind, &gact_hash_info);
- return 0;
-}
-
-static int tcf_gact(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
+static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_gact *gact = a->priv;
int action = TC_ACT_SHOT;
spin_lock(&gact->tcf_lock);
#ifdef CONFIG_GACT_PROB
- if (gact->tcfg_ptype && gact_rand[gact->tcfg_ptype] != NULL)
+ if (gact->tcfg_ptype)
action = gact_rand[gact->tcfg_ptype](gact);
else
action = gact->tcf_action;
@@ -161,7 +151,8 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
};
struct tcf_t t;
- NLA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt);
+ if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
#ifdef CONFIG_GACT_PROB
if (gact->tcfg_ptype) {
struct tc_gact_p p_opt = {
@@ -170,13 +161,15 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
.ptype = gact->tcfg_ptype,
};
- NLA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt);
+ if (nla_put(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt))
+ goto nla_put_failure;
}
#endif
t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(gact->tcf_tm.expires);
- NLA_PUT(skb, TCA_GACT_TM, sizeof(t), &t);
+ if (nla_put(skb, TCA_GACT_TM, sizeof(t), &t))
+ goto nla_put_failure;
return skb->len;
nla_put_failure:
@@ -186,16 +179,11 @@ nla_put_failure:
static struct tc_action_ops act_gact_ops = {
.kind = "gact",
- .hinfo = &gact_hash_info,
.type = TCA_ACT_GACT,
- .capab = TCA_CAP_NONE,
.owner = THIS_MODULE,
.act = tcf_gact,
.dump = tcf_gact_dump,
- .cleanup = tcf_gact_cleanup,
- .lookup = tcf_hash_search,
.init = tcf_gact_init,
- .walk = tcf_generic_walker
};
MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
@@ -205,11 +193,11 @@ MODULE_LICENSE("GPL");
static int __init gact_init_module(void)
{
#ifdef CONFIG_GACT_PROB
- printk(KERN_INFO "GACT probability on\n");
+ pr_info("GACT probability on\n");
#else
- printk(KERN_INFO "GACT probability NOT on\n");
+ pr_info("GACT probability NOT on\n");
#endif
- return tcf_register_action(&act_gact_ops);
+ return tcf_register_action(&act_gact_ops, GACT_TAB_MASK);
}
static void __exit gact_cleanup_module(void)
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 8daef963225..8a64a0734ae 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -1,5 +1,5 @@
/*
- * net/sched/ipt.c iptables target interface
+ * net/sched/ipt.c iptables target interface
*
*TODO: Add other tables. For now we only support the ipv4 table targets
*
@@ -8,7 +8,7 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Copyright: Jamal Hadi Salim (2002-4)
+ * Copyright: Jamal Hadi Salim (2002-13)
*/
#include <linux/types.h>
@@ -29,15 +29,6 @@
#define IPT_TAB_MASK 15
-static struct tcf_common *tcf_ipt_ht[IPT_TAB_MASK + 1];
-static u32 ipt_idx_gen;
-static DEFINE_RWLOCK(ipt_lock);
-
-static struct tcf_hashinfo ipt_hash_info = {
- .htab = tcf_ipt_ht,
- .hmask = IPT_TAB_MASK,
- .lock = &ipt_lock,
-};
static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)
{
@@ -77,22 +68,12 @@ static void ipt_destroy_target(struct xt_entry_target *t)
module_put(par.target->me);
}
-static int tcf_ipt_release(struct tcf_ipt *ipt, int bind)
+static void tcf_ipt_release(struct tc_action *a, int bind)
{
- int ret = 0;
- if (ipt) {
- if (bind)
- ipt->tcf_bindcnt--;
- ipt->tcf_refcnt--;
- if (ipt->tcf_bindcnt <= 0 && ipt->tcf_refcnt <= 0) {
- ipt_destroy_target(ipt->tcfi_t);
- kfree(ipt->tcfi_tname);
- kfree(ipt->tcfi_t);
- tcf_hash_destroy(&ipt->common, &ipt_hash_info);
- ret = ACT_P_DELETED;
- }
- }
- return ret;
+ struct tcf_ipt *ipt = to_ipt(a);
+ ipt_destroy_target(ipt->tcfi_t);
+ kfree(ipt->tcfi_tname);
+ kfree(ipt->tcfi_t);
}
static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
@@ -102,12 +83,11 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
[TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) },
};
-static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
+static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
struct tc_action *a, int ovr, int bind)
{
struct nlattr *tb[TCA_IPT_MAX + 1];
struct tcf_ipt *ipt;
- struct tcf_common *pc;
struct xt_entry_target *td, *t;
char *tname;
int ret = 0, err;
@@ -133,20 +113,20 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
if (tb[TCA_IPT_INDEX] != NULL)
index = nla_get_u32(tb[TCA_IPT_INDEX]);
- pc = tcf_hash_check(index, a, bind, &ipt_hash_info);
- if (!pc) {
- pc = tcf_hash_create(index, est, a, sizeof(*ipt), bind,
- &ipt_idx_gen, &ipt_hash_info);
- if (IS_ERR(pc))
- return PTR_ERR(pc);
+ if (!tcf_hash_check(index, a, bind) ) {
+ ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind);
+ if (ret)
+ return ret;
ret = ACT_P_CREATED;
} else {
- if (!ovr) {
- tcf_ipt_release(to_ipt(pc), bind);
+ if (bind)/* dont override defaults */
+ return 0;
+ tcf_hash_release(a, bind);
+
+ if (!ovr)
return -EEXIST;
- }
}
- ipt = to_ipt(pc);
+ ipt = to_ipt(a);
hook = nla_get_u32(tb[TCA_IPT_HOOK]);
@@ -162,7 +142,8 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
if (unlikely(!t))
goto err2;
- if ((err = ipt_init_target(t, tname, hook)) < 0)
+ err = ipt_init_target(t, tname, hook);
+ if (err < 0)
goto err3;
spin_lock_bh(&ipt->tcf_lock);
@@ -176,7 +157,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
ipt->tcfi_hook = hook;
spin_unlock_bh(&ipt->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(pc, &ipt_hash_info);
+ tcf_hash_insert(a);
return ret;
err3:
@@ -184,37 +165,30 @@ err3:
err2:
kfree(tname);
err1:
- kfree(pc);
+ if (ret == ACT_P_CREATED)
+ tcf_hash_cleanup(a, est);
return err;
}
-static int tcf_ipt_cleanup(struct tc_action *a, int bind)
-{
- struct tcf_ipt *ipt = a->priv;
- return tcf_ipt_release(ipt, bind);
-}
-
-static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
+static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
int ret = 0, result = 0;
struct tcf_ipt *ipt = a->priv;
struct xt_action_param par;
- if (skb_cloned(skb)) {
- if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
- return TC_ACT_UNSPEC;
- }
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return TC_ACT_UNSPEC;
spin_lock(&ipt->tcf_lock);
ipt->tcf_tm.lastuse = jiffies;
- ipt->tcf_bstats.bytes += qdisc_pkt_len(skb);
- ipt->tcf_bstats.packets++;
+ bstats_update(&ipt->tcf_bstats, skb);
/* yes, we have to worry about both in and out dev
- worry later - danger - this API seems to have changed
- from earlier kernels */
+ * worry later - danger - this API seems to have changed
+ * from earlier kernels
+ */
par.in = skb->dev;
par.out = NULL;
par.hooknum = ipt->tcfi_hook;
@@ -234,9 +208,8 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
result = TC_ACT_PIPE;
break;
default:
- if (net_ratelimit())
- pr_notice("tc filter: Bogus netfilter code"
- " %d assume ACCEPT\n", ret);
+ net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n",
+ ret);
result = TC_POLICE_OK;
break;
}
@@ -254,9 +227,9 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
struct tc_cnt c;
/* for simple targets kernel size == user size
- ** user name = target name
- ** for foolproof you need to not assume this
- */
+ * user name = target name
+ * for foolproof you need to not assume this
+ */
t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC);
if (unlikely(!t))
@@ -266,15 +239,17 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
c.refcnt = ipt->tcf_refcnt - ref;
strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name);
- NLA_PUT(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t);
- NLA_PUT_U32(skb, TCA_IPT_INDEX, ipt->tcf_index);
- NLA_PUT_U32(skb, TCA_IPT_HOOK, ipt->tcfi_hook);
- NLA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c);
- NLA_PUT_STRING(skb, TCA_IPT_TABLE, ipt->tcfi_tname);
+ if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) ||
+ nla_put_u32(skb, TCA_IPT_INDEX, ipt->tcf_index) ||
+ nla_put_u32(skb, TCA_IPT_HOOK, ipt->tcfi_hook) ||
+ nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) ||
+ nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname))
+ goto nla_put_failure;
tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install);
tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse);
tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires);
- NLA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm);
+ if (nla_put(skb, TCA_IPT_TM, sizeof (tm), &tm))
+ goto nla_put_failure;
kfree(t);
return skb->len;
@@ -286,29 +261,49 @@ nla_put_failure:
static struct tc_action_ops act_ipt_ops = {
.kind = "ipt",
- .hinfo = &ipt_hash_info,
.type = TCA_ACT_IPT,
- .capab = TCA_CAP_NONE,
.owner = THIS_MODULE,
.act = tcf_ipt,
.dump = tcf_ipt_dump,
- .cleanup = tcf_ipt_cleanup,
- .lookup = tcf_hash_search,
+ .cleanup = tcf_ipt_release,
+ .init = tcf_ipt_init,
+};
+
+static struct tc_action_ops act_xt_ops = {
+ .kind = "xt",
+ .type = TCA_ACT_XT,
+ .owner = THIS_MODULE,
+ .act = tcf_ipt,
+ .dump = tcf_ipt_dump,
+ .cleanup = tcf_ipt_release,
.init = tcf_ipt_init,
- .walk = tcf_generic_walker
};
-MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
+MODULE_AUTHOR("Jamal Hadi Salim(2002-13)");
MODULE_DESCRIPTION("Iptables target actions");
MODULE_LICENSE("GPL");
+MODULE_ALIAS("act_xt");
static int __init ipt_init_module(void)
{
- return tcf_register_action(&act_ipt_ops);
+ int ret1, ret2;
+
+ ret1 = tcf_register_action(&act_xt_ops, IPT_TAB_MASK);
+ if (ret1 < 0)
+ printk("Failed to load xt action\n");
+ ret2 = tcf_register_action(&act_ipt_ops, IPT_TAB_MASK);
+ if (ret2 < 0)
+ printk("Failed to load ipt action\n");
+
+ if (ret1 < 0 && ret2 < 0) {
+ return ret1;
+ } else
+ return 0;
}
static void __exit ipt_cleanup_module(void)
{
+ tcf_unregister_action(&act_xt_ops);
tcf_unregister_action(&act_ipt_ops);
}
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 0c311be9282..4f912c0e225 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -30,45 +30,27 @@
#include <linux/if_arp.h>
#define MIRRED_TAB_MASK 7
-static struct tcf_common *tcf_mirred_ht[MIRRED_TAB_MASK + 1];
-static u32 mirred_idx_gen;
-static DEFINE_RWLOCK(mirred_lock);
static LIST_HEAD(mirred_list);
-static struct tcf_hashinfo mirred_hash_info = {
- .htab = tcf_mirred_ht,
- .hmask = MIRRED_TAB_MASK,
- .lock = &mirred_lock,
-};
-
-static inline int tcf_mirred_release(struct tcf_mirred *m, int bind)
+static void tcf_mirred_release(struct tc_action *a, int bind)
{
- if (m) {
- if (bind)
- m->tcf_bindcnt--;
- m->tcf_refcnt--;
- if(!m->tcf_bindcnt && m->tcf_refcnt <= 0) {
- list_del(&m->tcfm_list);
- if (m->tcfm_dev)
- dev_put(m->tcfm_dev);
- tcf_hash_destroy(&m->common, &mirred_hash_info);
- return 1;
- }
- }
- return 0;
+ struct tcf_mirred *m = to_mirred(a);
+ list_del(&m->tcfm_list);
+ if (m->tcfm_dev)
+ dev_put(m->tcfm_dev);
}
static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
[TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) },
};
-static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est,
- struct tc_action *a, int ovr, int bind)
+static int tcf_mirred_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a, int ovr,
+ int bind)
{
struct nlattr *tb[TCA_MIRRED_MAX + 1];
struct tc_mirred *parm;
struct tcf_mirred *m;
- struct tcf_common *pc;
struct net_device *dev;
int ret, ok_push = 0;
@@ -88,7 +70,7 @@ static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est,
return -EINVAL;
}
if (parm->ifindex) {
- dev = __dev_get_by_index(&init_net, parm->ifindex);
+ dev = __dev_get_by_index(net, parm->ifindex);
if (dev == NULL)
return -ENODEV;
switch (dev->type) {
@@ -108,22 +90,20 @@ static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est,
dev = NULL;
}
- pc = tcf_hash_check(parm->index, a, bind, &mirred_hash_info);
- if (!pc) {
+ if (!tcf_hash_check(parm->index, a, bind)) {
if (dev == NULL)
return -EINVAL;
- pc = tcf_hash_create(parm->index, est, a, sizeof(*m), bind,
- &mirred_idx_gen, &mirred_hash_info);
- if (IS_ERR(pc))
- return PTR_ERR(pc);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind);
+ if (ret)
+ return ret;
ret = ACT_P_CREATED;
} else {
if (!ovr) {
- tcf_mirred_release(to_mirred(pc), bind);
+ tcf_hash_release(a, bind);
return -EEXIST;
}
}
- m = to_mirred(pc);
+ m = to_mirred(a);
spin_lock_bh(&m->tcf_lock);
m->tcf_action = parm->action;
@@ -139,22 +119,13 @@ static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est,
spin_unlock_bh(&m->tcf_lock);
if (ret == ACT_P_CREATED) {
list_add(&m->tcfm_list, &mirred_list);
- tcf_hash_insert(pc, &mirred_hash_info);
+ tcf_hash_insert(a);
}
return ret;
}
-static int tcf_mirred_cleanup(struct tc_action *a, int bind)
-{
- struct tcf_mirred *m = a->priv;
-
- if (m)
- return tcf_mirred_release(m, bind);
- return 0;
-}
-
-static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
+static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_mirred *m = a->priv;
@@ -165,8 +136,7 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
spin_lock(&m->tcf_lock);
m->tcf_tm.lastuse = jiffies;
- m->tcf_bstats.bytes += qdisc_pkt_len(skb);
- m->tcf_bstats.packets++;
+ bstats_update(&m->tcf_bstats, skb);
dev = m->tcfm_dev;
if (!dev) {
@@ -175,9 +145,8 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
}
if (!(dev->flags & IFF_UP)) {
- if (net_ratelimit())
- pr_notice("tc mirred to Houston: device %s is down\n",
- dev->name);
+ net_notice_ratelimited("tc mirred to Houston: device %s is down\n",
+ dev->name);
goto out;
}
@@ -197,19 +166,17 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
skb2->skb_iif = skb->dev->ifindex;
skb2->dev = dev;
- dev_queue_xmit(skb2);
- err = 0;
+ err = dev_queue_xmit(skb2);
out:
if (err) {
m->tcf_qstats.overlimits++;
- /* should we be asking for packet to be dropped?
- * may make sense for redirect case only
- */
- retval = TC_ACT_SHOT;
- } else {
+ if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
+ retval = TC_ACT_SHOT;
+ else
+ retval = m->tcf_action;
+ } else
retval = m->tcf_action;
- }
spin_unlock(&m->tcf_lock);
return retval;
@@ -229,11 +196,13 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i
};
struct tcf_t t;
- NLA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt);
+ if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(m->tcf_tm.expires);
- NLA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t);
+ if (nla_put(skb, TCA_MIRRED_TM, sizeof(t), &t))
+ goto nla_put_failure;
return skb->len;
nla_put_failure:
@@ -244,7 +213,7 @@ nla_put_failure:
static int mirred_device_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct tcf_mirred *m;
if (event == NETDEV_UNREGISTER)
@@ -262,19 +231,14 @@ static struct notifier_block mirred_device_notifier = {
.notifier_call = mirred_device_event,
};
-
static struct tc_action_ops act_mirred_ops = {
.kind = "mirred",
- .hinfo = &mirred_hash_info,
.type = TCA_ACT_MIRRED,
- .capab = TCA_CAP_NONE,
.owner = THIS_MODULE,
.act = tcf_mirred,
.dump = tcf_mirred_dump,
- .cleanup = tcf_mirred_cleanup,
- .lookup = tcf_hash_search,
+ .cleanup = tcf_mirred_release,
.init = tcf_mirred_init,
- .walk = tcf_generic_walker
};
MODULE_AUTHOR("Jamal Hadi Salim(2002)");
@@ -288,13 +252,13 @@ static int __init mirred_init_module(void)
return err;
pr_info("Mirror/redirect action on\n");
- return tcf_register_action(&act_mirred_ops);
+ return tcf_register_action(&act_mirred_ops, MIRRED_TAB_MASK);
}
static void __exit mirred_cleanup_module(void)
{
- unregister_netdevice_notifier(&mirred_device_notifier);
tcf_unregister_action(&act_mirred_ops);
+ unregister_netdevice_notifier(&mirred_device_notifier);
}
module_init(mirred_init_module);
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 186eb837e60..270a030d5fd 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -30,28 +30,18 @@
#define NAT_TAB_MASK 15
-static struct tcf_common *tcf_nat_ht[NAT_TAB_MASK + 1];
-static u32 nat_idx_gen;
-static DEFINE_RWLOCK(nat_lock);
-
-static struct tcf_hashinfo nat_hash_info = {
- .htab = tcf_nat_ht,
- .hmask = NAT_TAB_MASK,
- .lock = &nat_lock,
-};
static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
[TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) },
};
-static int tcf_nat_init(struct nlattr *nla, struct nlattr *est,
+static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
struct tc_action *a, int ovr, int bind)
{
struct nlattr *tb[TCA_NAT_MAX + 1];
struct tc_nat *parm;
int ret = 0, err;
struct tcf_nat *p;
- struct tcf_common *pc;
if (nla == NULL)
return -EINVAL;
@@ -64,21 +54,19 @@ static int tcf_nat_init(struct nlattr *nla, struct nlattr *est,
return -EINVAL;
parm = nla_data(tb[TCA_NAT_PARMS]);
- pc = tcf_hash_check(parm->index, a, bind, &nat_hash_info);
- if (!pc) {
- pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
- &nat_idx_gen, &nat_hash_info);
- if (IS_ERR(pc))
- return PTR_ERR(pc);
- p = to_tcf_nat(pc);
+ if (!tcf_hash_check(parm->index, a, bind)) {
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+ if (ret)
+ return ret;
ret = ACT_P_CREATED;
} else {
- p = to_tcf_nat(pc);
- if (!ovr) {
- tcf_hash_release(pc, bind, &nat_hash_info);
+ if (bind)
+ return 0;
+ tcf_hash_release(a, bind);
+ if (!ovr)
return -EEXIST;
- }
}
+ p = to_tcf_nat(a);
spin_lock_bh(&p->tcf_lock);
p->old_addr = parm->old_addr;
@@ -90,19 +78,12 @@ static int tcf_nat_init(struct nlattr *nla, struct nlattr *est,
spin_unlock_bh(&p->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(pc, &nat_hash_info);
+ tcf_hash_insert(a);
return ret;
}
-static int tcf_nat_cleanup(struct tc_action *a, int bind)
-{
- struct tcf_nat *p = a->priv;
-
- return tcf_hash_release(&p->common, bind, &nat_hash_info);
-}
-
-static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
+static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_nat *p = a->priv;
@@ -125,8 +106,7 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
egress = p->flags & TCA_NAT_FLAG_EGRESS;
action = p->tcf_action;
- p->tcf_bstats.bytes += qdisc_pkt_len(skb);
- p->tcf_bstats.packets++;
+ bstats_update(&p->tcf_bstats, skb);
spin_unlock(&p->tcf_lock);
@@ -285,11 +265,13 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
};
struct tcf_t t;
- NLA_PUT(skb, TCA_NAT_PARMS, sizeof(opt), &opt);
+ if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
- NLA_PUT(skb, TCA_NAT_TM, sizeof(t), &t);
+ if (nla_put(skb, TCA_NAT_TM, sizeof(t), &t))
+ goto nla_put_failure;
return skb->len;
@@ -300,16 +282,11 @@ nla_put_failure:
static struct tc_action_ops act_nat_ops = {
.kind = "nat",
- .hinfo = &nat_hash_info,
.type = TCA_ACT_NAT,
- .capab = TCA_CAP_NONE,
.owner = THIS_MODULE,
.act = tcf_nat,
.dump = tcf_nat_dump,
- .cleanup = tcf_nat_cleanup,
- .lookup = tcf_hash_search,
.init = tcf_nat_init,
- .walk = tcf_generic_walker
};
MODULE_DESCRIPTION("Stateless NAT actions");
@@ -317,7 +294,7 @@ MODULE_LICENSE("GPL");
static int __init nat_init_module(void)
{
- return tcf_register_action(&act_nat_ops);
+ return tcf_register_action(&act_nat_ops, NAT_TAB_MASK);
}
static void __exit nat_cleanup_module(void)
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index a0593c9640d..5f9bcb2e080 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -24,28 +24,19 @@
#include <net/tc_act/tc_pedit.h>
#define PEDIT_TAB_MASK 15
-static struct tcf_common *tcf_pedit_ht[PEDIT_TAB_MASK + 1];
-static u32 pedit_idx_gen;
-static DEFINE_RWLOCK(pedit_lock);
-
-static struct tcf_hashinfo pedit_hash_info = {
- .htab = tcf_pedit_ht,
- .hmask = PEDIT_TAB_MASK,
- .lock = &pedit_lock,
-};
static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
[TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) },
};
-static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est,
- struct tc_action *a, int ovr, int bind)
+static int tcf_pedit_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a,
+ int ovr, int bind)
{
struct nlattr *tb[TCA_PEDIT_MAX + 1];
struct tc_pedit *parm;
int ret = 0, err;
struct tcf_pedit *p;
- struct tcf_common *pc;
struct tc_pedit_key *keys = NULL;
int ksize;
@@ -63,27 +54,27 @@ static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est,
if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize)
return -EINVAL;
- pc = tcf_hash_check(parm->index, a, bind, &pedit_hash_info);
- if (!pc) {
+ if (!tcf_hash_check(parm->index, a, bind)) {
if (!parm->nkeys)
return -EINVAL;
- pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
- &pedit_idx_gen, &pedit_hash_info);
- if (IS_ERR(pc))
- return PTR_ERR(pc);
- p = to_pedit(pc);
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+ if (ret)
+ return ret;
+ p = to_pedit(a);
keys = kmalloc(ksize, GFP_KERNEL);
if (keys == NULL) {
- kfree(pc);
+ tcf_hash_cleanup(a, est);
return -ENOMEM;
}
ret = ACT_P_CREATED;
} else {
- p = to_pedit(pc);
- if (!ovr) {
- tcf_hash_release(pc, bind, &pedit_hash_info);
+ p = to_pedit(a);
+ tcf_hash_release(a, bind);
+ if (bind)
+ return 0;
+ if (!ovr)
return -EEXIST;
- }
+
if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
keys = kmalloc(ksize, GFP_KERNEL);
if (keys == NULL)
@@ -102,36 +93,26 @@ static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est,
memcpy(p->tcfp_keys, parm->keys, ksize);
spin_unlock_bh(&p->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(pc, &pedit_hash_info);
+ tcf_hash_insert(a);
return ret;
}
-static int tcf_pedit_cleanup(struct tc_action *a, int bind)
+static void tcf_pedit_cleanup(struct tc_action *a, int bind)
{
struct tcf_pedit *p = a->priv;
-
- if (p) {
- struct tc_pedit_key *keys = p->tcfp_keys;
- if (tcf_hash_release(&p->common, bind, &pedit_hash_info)) {
- kfree(keys);
- return 1;
- }
- }
- return 0;
+ struct tc_pedit_key *keys = p->tcfp_keys;
+ kfree(keys);
}
-static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,
+static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_pedit *p = a->priv;
int i, munged = 0;
unsigned int off;
- if (skb_cloned(skb)) {
- if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
- return p->tcf_action;
- }
- }
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return p->tcf_action;
off = skb_network_offset(skb);
@@ -163,7 +144,7 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,
}
if (offset > 0 && offset > skb->len) {
pr_info("tc filter pedit"
- " offset %d cant exceed pkt length %d\n",
+ " offset %d can't exceed pkt length %d\n",
offset, skb->len);
goto bad;
}
@@ -187,8 +168,7 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,
bad:
p->tcf_qstats.overlimits++;
done:
- p->tcf_bstats.bytes += qdisc_pkt_len(skb);
- p->tcf_bstats.packets++;
+ bstats_update(&p->tcf_bstats, skb);
spin_unlock(&p->tcf_lock);
return p->tcf_action;
}
@@ -218,11 +198,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
opt->refcnt = p->tcf_refcnt - ref;
opt->bindcnt = p->tcf_bindcnt - bind;
- NLA_PUT(skb, TCA_PEDIT_PARMS, s, opt);
+ if (nla_put(skb, TCA_PEDIT_PARMS, s, opt))
+ goto nla_put_failure;
t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
- NLA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t);
+ if (nla_put(skb, TCA_PEDIT_TM, sizeof(t), &t))
+ goto nla_put_failure;
kfree(opt);
return skb->len;
@@ -234,16 +216,12 @@ nla_put_failure:
static struct tc_action_ops act_pedit_ops = {
.kind = "pedit",
- .hinfo = &pedit_hash_info,
.type = TCA_ACT_PEDIT,
- .capab = TCA_CAP_NONE,
.owner = THIS_MODULE,
.act = tcf_pedit,
.dump = tcf_pedit_dump,
.cleanup = tcf_pedit_cleanup,
- .lookup = tcf_hash_search,
.init = tcf_pedit_init,
- .walk = tcf_generic_walker
};
MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
@@ -252,7 +230,7 @@ MODULE_LICENSE("GPL");
static int __init pedit_init_module(void)
{
- return tcf_register_action(&act_pedit_ops);
+ return tcf_register_action(&act_pedit_ops, PEDIT_TAB_MASK);
}
static void __exit pedit_cleanup_module(void)
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 7ebf7439b47..0566e4606a4 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -22,23 +22,28 @@
#include <net/act_api.h>
#include <net/netlink.h>
-#define L2T(p,L) qdisc_l2t((p)->tcfp_R_tab, L)
-#define L2T_P(p,L) qdisc_l2t((p)->tcfp_P_tab, L)
+struct tcf_police {
+ struct tcf_common common;
+ int tcfp_result;
+ u32 tcfp_ewma_rate;
+ s64 tcfp_burst;
+ u32 tcfp_mtu;
+ s64 tcfp_toks;
+ s64 tcfp_ptoks;
+ s64 tcfp_mtu_ptoks;
+ s64 tcfp_t_c;
+ struct psched_ratecfg rate;
+ bool rate_present;
+ struct psched_ratecfg peak;
+ bool peak_present;
+};
+#define to_police(pc) \
+ container_of(pc, struct tcf_police, common)
#define POL_TAB_MASK 15
-static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1];
-static u32 police_idx_gen;
-static DEFINE_RWLOCK(police_lock);
-
-static struct tcf_hashinfo police_hash_info = {
- .htab = tcf_police_ht,
- .hmask = POL_TAB_MASK,
- .lock = &police_lock,
-};
/* old policer structure from before tc actions */
-struct tc_police_compat
-{
+struct tc_police_compat {
u32 index;
int action;
u32 limit;
@@ -53,18 +58,20 @@ struct tc_police_compat
static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb,
int type, struct tc_action *a)
{
+ struct tcf_hashinfo *hinfo = a->ops->hinfo;
+ struct hlist_head *head;
struct tcf_common *p;
int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
struct nlattr *nest;
- read_lock_bh(&police_lock);
+ spin_lock_bh(&hinfo->lock);
s_i = cb->args[0];
for (i = 0; i < (POL_TAB_MASK + 1); i++) {
- p = tcf_police_ht[tcf_hash(i, POL_TAB_MASK)];
+ head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)];
- for (; p; p = p->tcfc_next) {
+ hlist_for_each_entry_rcu(p, head, tcfc_head) {
index++;
if (index < s_i)
continue;
@@ -87,7 +94,7 @@ static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *c
}
}
done:
- read_unlock_bh(&police_lock);
+ spin_unlock_bh(&hinfo->lock);
if (n_i)
cb->args[0] += n_i;
return n_i;
@@ -97,38 +104,6 @@ nla_put_failure:
goto done;
}
-static void tcf_police_free_rcu(struct rcu_head *head)
-{
- kfree(container_of(head, struct tcf_police, tcf_rcu));
-}
-
-static void tcf_police_destroy(struct tcf_police *p)
-{
- unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK);
- struct tcf_common **p1p;
-
- for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->tcfc_next) {
- if (*p1p == &p->common) {
- write_lock_bh(&police_lock);
- *p1p = p->tcf_next;
- write_unlock_bh(&police_lock);
- gen_kill_estimator(&p->tcf_bstats,
- &p->tcf_rate_est);
- if (p->tcfp_R_tab)
- qdisc_put_rtab(p->tcfp_R_tab);
- if (p->tcfp_P_tab)
- qdisc_put_rtab(p->tcfp_P_tab);
- /*
- * gen_estimator est_timer() might access p->tcf_lock
- * or bstats, wait a RCU grace period before freeing p
- */
- call_rcu(&p->tcf_rcu, tcf_police_free_rcu);
- return;
- }
- }
- WARN_ON(1);
-}
-
static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
[TCA_POLICE_RATE] = { .len = TC_RTAB_SIZE },
[TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE },
@@ -136,15 +111,17 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
[TCA_POLICE_RESULT] = { .type = NLA_U32 },
};
-static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est,
- struct tc_action *a, int ovr, int bind)
+static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a,
+ int ovr, int bind)
{
- unsigned h;
+ unsigned int h;
int ret = 0, err;
struct nlattr *tb[TCA_POLICE_MAX + 1];
struct tc_police *parm;
struct tcf_police *police;
struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
+ struct tcf_hashinfo *hinfo = a->ops->hinfo;
int size;
if (nla == NULL)
@@ -162,19 +139,17 @@ static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est,
parm = nla_data(tb[TCA_POLICE_TBF]);
if (parm->index) {
- struct tcf_common *pc;
-
- pc = tcf_hash_lookup(parm->index, &police_hash_info);
- if (pc != NULL) {
- a->priv = pc;
- police = to_police(pc);
+ if (tcf_hash_search(a, parm->index)) {
+ police = to_police(a->priv);
if (bind) {
police->tcf_bindcnt += 1;
police->tcf_refcnt += 1;
+ return 0;
}
if (ovr)
goto override;
- return ret;
+ /* not replacing */
+ return -EEXIST;
}
}
@@ -217,26 +192,36 @@ override:
}
/* No failure allowed after this point */
- if (R_tab != NULL) {
- qdisc_put_rtab(police->tcfp_R_tab);
- police->tcfp_R_tab = R_tab;
+ police->tcfp_mtu = parm->mtu;
+ if (police->tcfp_mtu == 0) {
+ police->tcfp_mtu = ~0;
+ if (R_tab)
+ police->tcfp_mtu = 255 << R_tab->rate.cell_log;
+ }
+ if (R_tab) {
+ police->rate_present = true;
+ psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0);
+ qdisc_put_rtab(R_tab);
+ } else {
+ police->rate_present = false;
}
- if (P_tab != NULL) {
- qdisc_put_rtab(police->tcfp_P_tab);
- police->tcfp_P_tab = P_tab;
+ if (P_tab) {
+ police->peak_present = true;
+ psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0);
+ qdisc_put_rtab(P_tab);
+ } else {
+ police->peak_present = false;
}
if (tb[TCA_POLICE_RESULT])
police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]);
- police->tcfp_toks = police->tcfp_burst = parm->burst;
- police->tcfp_mtu = parm->mtu;
- if (police->tcfp_mtu == 0) {
- police->tcfp_mtu = ~0;
- if (police->tcfp_R_tab)
- police->tcfp_mtu = 255<<police->tcfp_R_tab->rate.cell_log;
+ police->tcfp_burst = PSCHED_TICKS2NS(parm->burst);
+ police->tcfp_toks = police->tcfp_burst;
+ if (police->peak_present) {
+ police->tcfp_mtu_ptoks = (s64) psched_l2t_ns(&police->peak,
+ police->tcfp_mtu);
+ police->tcfp_ptoks = police->tcfp_mtu_ptoks;
}
- if (police->tcfp_P_tab)
- police->tcfp_ptoks = L2T_P(police, police->tcfp_mtu);
police->tcf_action = parm->action;
if (tb[TCA_POLICE_AVRATE])
@@ -246,14 +231,13 @@ override:
if (ret != ACT_P_CREATED)
return ret;
- police->tcfp_t_c = psched_get_time();
+ police->tcfp_t_c = ktime_to_ns(ktime_get());
police->tcf_index = parm->index ? parm->index :
- tcf_hash_new_index(&police_idx_gen, &police_hash_info);
+ tcf_hash_new_index(hinfo);
h = tcf_hash(police->tcf_index, POL_TAB_MASK);
- write_lock_bh(&police_lock);
- police->tcf_next = tcf_police_ht[h];
- tcf_police_ht[h] = &police->common;
- write_unlock_bh(&police_lock);
+ spin_lock_bh(&hinfo->lock);
+ hlist_add_head(&police->tcf_head, &hinfo->htab[h]);
+ spin_unlock_bh(&hinfo->lock);
a->priv = police;
return ret;
@@ -261,45 +245,24 @@ override:
failure_unlock:
spin_unlock_bh(&police->tcf_lock);
failure:
- if (P_tab)
- qdisc_put_rtab(P_tab);
- if (R_tab)
- qdisc_put_rtab(R_tab);
+ qdisc_put_rtab(P_tab);
+ qdisc_put_rtab(R_tab);
if (ret == ACT_P_CREATED)
kfree(police);
return err;
}
-static int tcf_act_police_cleanup(struct tc_action *a, int bind)
-{
- struct tcf_police *p = a->priv;
- int ret = 0;
-
- if (p != NULL) {
- if (bind)
- p->tcf_bindcnt--;
-
- p->tcf_refcnt--;
- if (p->tcf_refcnt <= 0 && !p->tcf_bindcnt) {
- tcf_police_destroy(p);
- ret = 1;
- }
- }
- return ret;
-}
-
-static int tcf_act_police(struct sk_buff *skb, struct tc_action *a,
+static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_police *police = a->priv;
- psched_time_t now;
- long toks;
- long ptoks = 0;
+ s64 now;
+ s64 toks;
+ s64 ptoks = 0;
spin_lock(&police->tcf_lock);
- police->tcf_bstats.bytes += qdisc_pkt_len(skb);
- police->tcf_bstats.packets++;
+ bstats_update(&police->tcf_bstats, skb);
if (police->tcfp_ewma_rate &&
police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
@@ -311,24 +274,25 @@ static int tcf_act_police(struct sk_buff *skb, struct tc_action *a,
}
if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
- if (police->tcfp_R_tab == NULL) {
+ if (!police->rate_present) {
spin_unlock(&police->tcf_lock);
return police->tcfp_result;
}
- now = psched_get_time();
- toks = psched_tdiff_bounded(now, police->tcfp_t_c,
- police->tcfp_burst);
- if (police->tcfp_P_tab) {
+ now = ktime_to_ns(ktime_get());
+ toks = min_t(s64, now - police->tcfp_t_c,
+ police->tcfp_burst);
+ if (police->peak_present) {
ptoks = toks + police->tcfp_ptoks;
- if (ptoks > (long)L2T_P(police, police->tcfp_mtu))
- ptoks = (long)L2T_P(police, police->tcfp_mtu);
- ptoks -= L2T_P(police, qdisc_pkt_len(skb));
+ if (ptoks > police->tcfp_mtu_ptoks)
+ ptoks = police->tcfp_mtu_ptoks;
+ ptoks -= (s64) psched_l2t_ns(&police->peak,
+ qdisc_pkt_len(skb));
}
toks += police->tcfp_toks;
- if (toks > (long)police->tcfp_burst)
+ if (toks > police->tcfp_burst)
toks = police->tcfp_burst;
- toks -= L2T(police, qdisc_pkt_len(skb));
+ toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb));
if ((toks|ptoks) >= 0) {
police->tcfp_t_c = now;
police->tcfp_toks = toks;
@@ -354,20 +318,23 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
.index = police->tcf_index,
.action = police->tcf_action,
.mtu = police->tcfp_mtu,
- .burst = police->tcfp_burst,
+ .burst = PSCHED_NS2TICKS(police->tcfp_burst),
.refcnt = police->tcf_refcnt - ref,
.bindcnt = police->tcf_bindcnt - bind,
};
- if (police->tcfp_R_tab)
- opt.rate = police->tcfp_R_tab->rate;
- if (police->tcfp_P_tab)
- opt.peakrate = police->tcfp_P_tab->rate;
- NLA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);
- if (police->tcfp_result)
- NLA_PUT_U32(skb, TCA_POLICE_RESULT, police->tcfp_result);
- if (police->tcfp_ewma_rate)
- NLA_PUT_U32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate);
+ if (police->rate_present)
+ psched_ratecfg_getrate(&opt.rate, &police->rate);
+ if (police->peak_present)
+ psched_ratecfg_getrate(&opt.peakrate, &police->peak);
+ if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
+ goto nla_put_failure;
+ if (police->tcfp_result &&
+ nla_put_u32(skb, TCA_POLICE_RESULT, police->tcfp_result))
+ goto nla_put_failure;
+ if (police->tcfp_ewma_rate &&
+ nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate))
+ goto nla_put_failure;
return skb->len;
nla_put_failure:
@@ -381,14 +348,10 @@ MODULE_LICENSE("GPL");
static struct tc_action_ops act_police_ops = {
.kind = "police",
- .hinfo = &police_hash_info,
.type = TCA_ID_POLICE,
- .capab = TCA_CAP_NONE,
.owner = THIS_MODULE,
.act = tcf_act_police,
.dump = tcf_act_police_dump,
- .cleanup = tcf_act_police_cleanup,
- .lookup = tcf_hash_search,
.init = tcf_act_police_locate,
.walk = tcf_act_police_walker
};
@@ -396,14 +359,13 @@ static struct tc_action_ops act_police_ops = {
static int __init
police_init_module(void)
{
- return tcf_register_action(&act_police_ops);
+ return tcf_register_action(&act_police_ops, POL_TAB_MASK);
}
static void __exit
police_cleanup_module(void)
{
tcf_unregister_action(&act_police_ops);
- rcu_barrier(); /* Wait for completion of call_rcu()'s (tcf_police_free_rcu) */
}
module_init(police_init_module);
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 97e84f3ee77..992c2317ce8 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -25,50 +25,31 @@
#include <net/tc_act/tc_defact.h>
#define SIMP_TAB_MASK 7
-static struct tcf_common *tcf_simp_ht[SIMP_TAB_MASK + 1];
-static u32 simp_idx_gen;
-static DEFINE_RWLOCK(simp_lock);
-
-static struct tcf_hashinfo simp_hash_info = {
- .htab = tcf_simp_ht,
- .hmask = SIMP_TAB_MASK,
- .lock = &simp_lock,
-};
#define SIMP_MAX_DATA 32
-static int tcf_simp(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
+static int tcf_simp(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_defact *d = a->priv;
spin_lock(&d->tcf_lock);
d->tcf_tm.lastuse = jiffies;
- d->tcf_bstats.bytes += qdisc_pkt_len(skb);
- d->tcf_bstats.packets++;
+ bstats_update(&d->tcf_bstats, skb);
/* print policy string followed by _ then packet count
* Example if this was the 3rd packet and the string was "hello"
* then it would look like "hello_3" (without quotes)
- **/
+ */
pr_info("simple: %s_%d\n",
(char *)d->tcfd_defdata, d->tcf_bstats.packets);
spin_unlock(&d->tcf_lock);
return d->tcf_action;
}
-static int tcf_simp_release(struct tcf_defact *d, int bind)
+static void tcf_simp_release(struct tc_action *a, int bind)
{
- int ret = 0;
- if (d) {
- if (bind)
- d->tcf_bindcnt--;
- d->tcf_refcnt--;
- if (d->tcf_bindcnt <= 0 && d->tcf_refcnt <= 0) {
- kfree(d->tcfd_defdata);
- tcf_hash_destroy(&d->common, &simp_hash_info);
- ret = 1;
- }
- }
- return ret;
+ struct tcf_defact *d = to_defact(a);
+ kfree(d->tcfd_defdata);
}
static int alloc_defdata(struct tcf_defact *d, char *defdata)
@@ -95,13 +76,13 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
[TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA },
};
-static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,
- struct tc_action *a, int ovr, int bind)
+static int tcf_simp_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a,
+ int ovr, int bind)
{
struct nlattr *tb[TCA_DEF_MAX + 1];
struct tc_defact *parm;
struct tcf_defact *d;
- struct tcf_common *pc;
char *defdata;
int ret = 0, err;
@@ -121,46 +102,38 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,
parm = nla_data(tb[TCA_DEF_PARMS]);
defdata = nla_data(tb[TCA_DEF_DATA]);
- pc = tcf_hash_check(parm->index, a, bind, &simp_hash_info);
- if (!pc) {
- pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind,
- &simp_idx_gen, &simp_hash_info);
- if (IS_ERR(pc))
- return PTR_ERR(pc);
+ if (!tcf_hash_check(parm->index, a, bind)) {
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
+ if (ret)
+ return ret;
- d = to_defact(pc);
+ d = to_defact(a);
ret = alloc_defdata(d, defdata);
if (ret < 0) {
- kfree(pc);
+ tcf_hash_cleanup(a, est);
return ret;
}
d->tcf_action = parm->action;
ret = ACT_P_CREATED;
} else {
- d = to_defact(pc);
- if (!ovr) {
- tcf_simp_release(d, bind);
+ d = to_defact(a);
+
+ if (bind)
+ return 0;
+ tcf_hash_release(a, bind);
+ if (!ovr)
return -EEXIST;
- }
+
reset_policy(d, defdata, parm);
}
if (ret == ACT_P_CREATED)
- tcf_hash_insert(pc, &simp_hash_info);
+ tcf_hash_insert(a);
return ret;
}
-static inline int tcf_simp_cleanup(struct tc_action *a, int bind)
-{
- struct tcf_defact *d = a->priv;
-
- if (d)
- return tcf_simp_release(d, bind);
- return 0;
-}
-
-static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
- int bind, int ref)
+static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_defact *d = a->priv;
@@ -172,12 +145,14 @@ static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
};
struct tcf_t t;
- NLA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt);
- NLA_PUT_STRING(skb, TCA_DEF_DATA, d->tcfd_defdata);
+ if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) ||
+ nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata))
+ goto nla_put_failure;
t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
- NLA_PUT(skb, TCA_DEF_TM, sizeof(t), &t);
+ if (nla_put(skb, TCA_DEF_TM, sizeof(t), &t))
+ goto nla_put_failure;
return skb->len;
nla_put_failure:
@@ -187,15 +162,12 @@ nla_put_failure:
static struct tc_action_ops act_simp_ops = {
.kind = "simple",
- .hinfo = &simp_hash_info,
.type = TCA_ACT_SIMP,
- .capab = TCA_CAP_NONE,
.owner = THIS_MODULE,
.act = tcf_simp,
.dump = tcf_simp_dump,
- .cleanup = tcf_simp_cleanup,
+ .cleanup = tcf_simp_release,
.init = tcf_simp_init,
- .walk = tcf_generic_walker,
};
MODULE_AUTHOR("Jamal Hadi Salim(2005)");
@@ -204,7 +176,8 @@ MODULE_LICENSE("GPL");
static int __init simp_init_module(void)
{
- int ret = tcf_register_action(&act_simp_ops);
+ int ret;
+ ret = tcf_register_action(&act_simp_ops, SIMP_TAB_MASK);
if (!ret)
pr_info("Simple TC action Loaded\n");
return ret;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 66cbf4eb885..fcfeeaf838b 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; if not, see <http://www.gnu.org/licenses/>.
*
* Author: Alexander Duyck <alexander.h.duyck@intel.com>
*/
@@ -29,25 +28,15 @@
#include <net/tc_act/tc_skbedit.h>
#define SKBEDIT_TAB_MASK 15
-static struct tcf_common *tcf_skbedit_ht[SKBEDIT_TAB_MASK + 1];
-static u32 skbedit_idx_gen;
-static DEFINE_RWLOCK(skbedit_lock);
-
-static struct tcf_hashinfo skbedit_hash_info = {
- .htab = tcf_skbedit_ht,
- .hmask = SKBEDIT_TAB_MASK,
- .lock = &skbedit_lock,
-};
-static int tcf_skbedit(struct sk_buff *skb, struct tc_action *a,
+static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_skbedit *d = a->priv;
spin_lock(&d->tcf_lock);
d->tcf_tm.lastuse = jiffies;
- d->tcf_bstats.bytes += qdisc_pkt_len(skb);
- d->tcf_bstats.packets++;
+ bstats_update(&d->tcf_bstats, skb);
if (d->flags & SKBEDIT_F_PRIORITY)
skb->priority = d->priority;
@@ -68,13 +57,13 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
[TCA_SKBEDIT_MARK] = { .len = sizeof(u32) },
};
-static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
- struct tc_action *a, int ovr, int bind)
+static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a,
+ int ovr, int bind)
{
struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
struct tc_skbedit *parm;
struct tcf_skbedit *d;
- struct tcf_common *pc;
u32 flags = 0, *priority = NULL, *mark = NULL;
u16 *queue_mapping = NULL;
int ret = 0, err;
@@ -109,21 +98,20 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
- pc = tcf_hash_check(parm->index, a, bind, &skbedit_hash_info);
- if (!pc) {
- pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind,
- &skbedit_idx_gen, &skbedit_hash_info);
- if (IS_ERR(pc))
- return PTR_ERR(pc);
+ if (!tcf_hash_check(parm->index, a, bind)) {
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
+ if (ret)
+ return ret;
- d = to_skbedit(pc);
+ d = to_skbedit(a);
ret = ACT_P_CREATED;
} else {
- d = to_skbedit(pc);
- if (!ovr) {
- tcf_hash_release(pc, bind, &skbedit_hash_info);
+ d = to_skbedit(a);
+ if (bind)
+ return 0;
+ tcf_hash_release(a, bind);
+ if (!ovr)
return -EEXIST;
- }
}
spin_lock_bh(&d->tcf_lock);
@@ -141,21 +129,12 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
spin_unlock_bh(&d->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(pc, &skbedit_hash_info);
+ tcf_hash_insert(a);
return ret;
}
-static inline int tcf_skbedit_cleanup(struct tc_action *a, int bind)
-{
- struct tcf_skbedit *d = a->priv;
-
- if (d)
- return tcf_hash_release(&d->common, bind, &skbedit_hash_info);
- return 0;
-}
-
-static inline int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
- int bind, int ref)
+static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_skbedit *d = a->priv;
@@ -167,20 +146,25 @@ static inline int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
};
struct tcf_t t;
- NLA_PUT(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt);
- if (d->flags & SKBEDIT_F_PRIORITY)
- NLA_PUT(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority),
- &d->priority);
- if (d->flags & SKBEDIT_F_QUEUE_MAPPING)
- NLA_PUT(skb, TCA_SKBEDIT_QUEUE_MAPPING,
- sizeof(d->queue_mapping), &d->queue_mapping);
- if (d->flags & SKBEDIT_F_MARK)
- NLA_PUT(skb, TCA_SKBEDIT_MARK, sizeof(d->mark),
- &d->mark);
+ if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ if ((d->flags & SKBEDIT_F_PRIORITY) &&
+ nla_put(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority),
+ &d->priority))
+ goto nla_put_failure;
+ if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) &&
+ nla_put(skb, TCA_SKBEDIT_QUEUE_MAPPING,
+ sizeof(d->queue_mapping), &d->queue_mapping))
+ goto nla_put_failure;
+ if ((d->flags & SKBEDIT_F_MARK) &&
+ nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark),
+ &d->mark))
+ goto nla_put_failure;
t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
- NLA_PUT(skb, TCA_SKBEDIT_TM, sizeof(t), &t);
+ if (nla_put(skb, TCA_SKBEDIT_TM, sizeof(t), &t))
+ goto nla_put_failure;
return skb->len;
nla_put_failure:
@@ -190,15 +174,11 @@ nla_put_failure:
static struct tc_action_ops act_skbedit_ops = {
.kind = "skbedit",
- .hinfo = &skbedit_hash_info,
.type = TCA_ACT_SKBEDIT,
- .capab = TCA_CAP_NONE,
.owner = THIS_MODULE,
.act = tcf_skbedit,
.dump = tcf_skbedit_dump,
- .cleanup = tcf_skbedit_cleanup,
.init = tcf_skbedit_init,
- .walk = tcf_generic_walker,
};
MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>");
@@ -207,7 +187,7 @@ MODULE_LICENSE("GPL");
static int __init skbedit_init_module(void)
{
- return tcf_register_action(&act_skbedit_ops);
+ return tcf_register_action(&act_skbedit_ops, SKBEDIT_TAB_MASK);
}
static void __exit skbedit_cleanup_module(void)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 5fd0c28ef79..45527e6b52d 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -22,7 +22,6 @@
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/kmod.h>
-#include <linux/netlink.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
@@ -32,46 +31,44 @@
#include <net/pkt_cls.h>
/* The list of all installed classifier types */
-
-static struct tcf_proto_ops *tcf_proto_base __read_mostly;
+static LIST_HEAD(tcf_proto_base);
/* Protects list of registered TC modules. It is pure SMP lock. */
static DEFINE_RWLOCK(cls_mod_lock);
/* Find classifier type by string name */
-static struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind)
+static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind)
{
- struct tcf_proto_ops *t = NULL;
+ const struct tcf_proto_ops *t, *res = NULL;
if (kind) {
read_lock(&cls_mod_lock);
- for (t = tcf_proto_base; t; t = t->next) {
+ list_for_each_entry(t, &tcf_proto_base, head) {
if (nla_strcmp(kind, t->kind) == 0) {
- if (!try_module_get(t->owner))
- t = NULL;
+ if (try_module_get(t->owner))
+ res = t;
break;
}
}
read_unlock(&cls_mod_lock);
}
- return t;
+ return res;
}
/* Register(unregister) new classifier type */
int register_tcf_proto_ops(struct tcf_proto_ops *ops)
{
- struct tcf_proto_ops *t, **tp;
+ struct tcf_proto_ops *t;
int rc = -EEXIST;
write_lock(&cls_mod_lock);
- for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next)
+ list_for_each_entry(t, &tcf_proto_base, head)
if (!strcmp(ops->kind, t->kind))
goto out;
- ops->next = NULL;
- *tp = ops;
+ list_add_tail(&ops->head, &tcf_proto_base);
rc = 0;
out:
write_unlock(&cls_mod_lock);
@@ -81,19 +78,17 @@ EXPORT_SYMBOL(register_tcf_proto_ops);
int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
{
- struct tcf_proto_ops *t, **tp;
+ struct tcf_proto_ops *t;
int rc = -ENOENT;
write_lock(&cls_mod_lock);
- for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next)
- if (t == ops)
+ list_for_each_entry(t, &tcf_proto_base, head) {
+ if (t == ops) {
+ list_del(&t->head);
+ rc = 0;
break;
-
- if (!t)
- goto out;
- *tp = t->next;
- rc = 0;
-out:
+ }
+ }
write_unlock(&cls_mod_lock);
return rc;
}
@@ -111,14 +106,14 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
u32 first = TC_H_MAKE(0xC0000000U, 0U);
if (tp)
- first = tp->prio-1;
+ first = tp->prio - 1;
return first;
}
/* Add/change/delete/get a filter node */
-static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
@@ -132,15 +127,23 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
struct Qdisc *q;
struct tcf_proto **back, **chain;
struct tcf_proto *tp;
- struct tcf_proto_ops *tp_ops;
+ const struct tcf_proto_ops *tp_ops;
const struct Qdisc_class_ops *cops;
unsigned long cl;
unsigned long fh;
int err;
int tp_created = 0;
+ if ((n->nlmsg_type != RTM_GETTFILTER) &&
+ !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
replay:
- t = NLMSG_DATA(n);
+ err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
+ if (err < 0)
+ return err;
+
+ t = nlmsg_data(n);
protocol = TC_H_MIN(t->tcm_info);
prio = TC_H_MAJ(t->tcm_info);
nprio = prio;
@@ -149,7 +152,8 @@ replay:
if (prio == 0) {
/* If no priority is given, user wants we allocated it. */
- if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
+ if (n->nlmsg_type != RTM_NEWTFILTER ||
+ !(n->nlmsg_flags & NLM_F_CREATE))
return -ENOENT;
prio = TC_H_MAKE(0x80000000U, 0U);
}
@@ -161,10 +165,6 @@ replay:
if (dev == NULL)
return -ENODEV;
- err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
- if (err < 0)
- return err;
-
/* Find qdisc */
if (!parent) {
q = dev->qdisc;
@@ -176,7 +176,8 @@ replay:
}
/* Is it classful? */
- if ((cops = q->ops->cl_ops) == NULL)
+ cops = q->ops->cl_ops;
+ if (!cops)
return -EINVAL;
if (cops->tcf_chain == NULL)
@@ -196,10 +197,11 @@ replay:
goto errout;
/* Check the chain for existence of proto-tcf with this priority */
- for (back = chain; (tp=*back) != NULL; back = &tp->next) {
+ for (back = chain; (tp = *back) != NULL; back = &tp->next) {
if (tp->prio >= prio) {
if (tp->prio == prio) {
- if (!nprio || (tp->protocol != protocol && protocol))
+ if (!nprio ||
+ (tp->protocol != protocol && protocol))
goto errout;
} else
tp = NULL;
@@ -216,7 +218,8 @@ replay:
goto errout;
err = -ENOENT;
- if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
+ if (n->nlmsg_type != RTM_NEWTFILTER ||
+ !(n->nlmsg_flags & NLM_F_CREATE))
goto errout;
@@ -315,7 +318,8 @@ replay:
}
}
- err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh);
+ err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
+ n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE);
if (err == 0) {
if (tp_created) {
spin_lock_bh(root_lock);
@@ -338,32 +342,35 @@ errout:
return err;
}
-static int tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp,
- unsigned long fh, u32 pid, u32 seq, u16 flags, int event)
+static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp,
+ unsigned long fh, u32 portid, u32 seq, u16 flags, int event)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
unsigned char *b = skb_tail_pointer(skb);
- nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
- tcm = NLMSG_DATA(nlh);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ tcm = nlmsg_data(nlh);
tcm->tcm_family = AF_UNSPEC;
tcm->tcm__pad1 = 0;
tcm->tcm__pad2 = 0;
tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
tcm->tcm_parent = tp->classid;
tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
- NLA_PUT_STRING(skb, TCA_KIND, tp->ops->kind);
+ if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
+ goto nla_put_failure;
tcm->tcm_handle = fh;
if (RTM_DELTFILTER != event) {
tcm->tcm_handle = 0;
- if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0)
+ if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
goto nla_put_failure;
}
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
return skb->len;
-nlmsg_failure:
+out_nlmsg_trim:
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
@@ -374,18 +381,18 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
unsigned long fh, int event)
{
struct sk_buff *skb;
- u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
- if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) {
+ if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, 0, event) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
- return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
n->nlmsg_flags & NLM_F_ECHO);
}
@@ -399,8 +406,9 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
struct tcf_walker *arg)
{
struct tcf_dump_args *a = (void *)arg;
+ struct net *net = sock_net(a->skb->sk);
- return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid,
+ return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid,
a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER);
}
@@ -413,14 +421,15 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
struct net_device *dev;
struct Qdisc *q;
struct tcf_proto *tp, **chain;
- struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
+ struct tcmsg *tcm = nlmsg_data(cb->nlh);
unsigned long cl = 0;
const struct Qdisc_class_ops *cops;
struct tcf_dump_args arg;
- if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
+ if (nlmsg_len(cb->nlh) < sizeof(*tcm))
return skb->len;
- if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
return skb->len;
if (!tcm->tcm_parent)
@@ -429,7 +438,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
if (!q)
goto out;
- if ((cops = q->ops->cl_ops) == NULL)
+ cops = q->ops->cl_ops;
+ if (!cops)
goto errout;
if (cops->tcf_chain == NULL)
goto errout;
@@ -444,8 +454,9 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
s_t = cb->args[0];
- for (tp=*chain, t=0; tp; tp = tp->next, t++) {
- if (t < s_t) continue;
+ for (tp = *chain, t = 0; tp; tp = tp->next, t++) {
+ if (t < s_t)
+ continue;
if (TC_H_MAJ(tcm->tcm_info) &&
TC_H_MAJ(tcm->tcm_info) != tp->prio)
continue;
@@ -455,7 +466,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
if (t > s_t)
memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
if (cb->args[1] == 0) {
- if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid,
+ if (tcf_fill_node(net, skb, tp, 0, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
RTM_NEWTFILTER) <= 0)
break;
@@ -468,10 +479,10 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
arg.skb = skb;
arg.cb = cb;
arg.w.stop = 0;
- arg.w.skip = cb->args[1]-1;
+ arg.w.skip = cb->args[1] - 1;
arg.w.count = 0;
tp->ops->walk(tp, &arg.w);
- cb->args[1] = arg.w.count+1;
+ cb->args[1] = arg.w.count + 1;
if (arg.w.stop)
break;
}
@@ -488,45 +499,41 @@ out:
void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
- if (exts->action) {
- tcf_action_destroy(exts->action, TCA_ACT_UNBIND);
- exts->action = NULL;
- }
+ tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND);
+ INIT_LIST_HEAD(&exts->actions);
#endif
}
EXPORT_SYMBOL(tcf_exts_destroy);
-int tcf_exts_validate(struct tcf_proto *tp, struct nlattr **tb,
- struct nlattr *rate_tlv, struct tcf_exts *exts,
- const struct tcf_ext_map *map)
+int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
+ struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr)
{
- memset(exts, 0, sizeof(*exts));
-
#ifdef CONFIG_NET_CLS_ACT
{
struct tc_action *act;
- if (map->police && tb[map->police]) {
- act = tcf_action_init_1(tb[map->police], rate_tlv,
- "police", TCA_ACT_NOREPLACE,
+ INIT_LIST_HEAD(&exts->actions);
+ if (exts->police && tb[exts->police]) {
+ act = tcf_action_init_1(net, tb[exts->police], rate_tlv,
+ "police", ovr,
TCA_ACT_BIND);
if (IS_ERR(act))
return PTR_ERR(act);
- act->type = TCA_OLD_COMPAT;
- exts->action = act;
- } else if (map->action && tb[map->action]) {
- act = tcf_action_init(tb[map->action], rate_tlv, NULL,
- TCA_ACT_NOREPLACE, TCA_ACT_BIND);
- if (IS_ERR(act))
- return PTR_ERR(act);
-
- exts->action = act;
+ act->type = exts->type = TCA_OLD_COMPAT;
+ list_add(&act->list, &exts->actions);
+ } else if (exts->action && tb[exts->action]) {
+ int err;
+ err = tcf_action_init(net, tb[exts->action], rate_tlv,
+ NULL, ovr,
+ TCA_ACT_BIND, &exts->actions);
+ if (err)
+ return err;
}
}
#else
- if ((map->action && tb[map->action]) ||
- (map->police && tb[map->police]))
+ if ((exts->action && tb[exts->action]) ||
+ (exts->police && tb[exts->police]))
return -EOPNOTSUPP;
#endif
@@ -538,43 +545,42 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
struct tcf_exts *src)
{
#ifdef CONFIG_NET_CLS_ACT
- if (src->action) {
- struct tc_action *act;
- tcf_tree_lock(tp);
- act = dst->action;
- dst->action = src->action;
- tcf_tree_unlock(tp);
- if (act)
- tcf_action_destroy(act, TCA_ACT_UNBIND);
- }
+ LIST_HEAD(tmp);
+ tcf_tree_lock(tp);
+ list_splice_init(&dst->actions, &tmp);
+ list_splice(&src->actions, &dst->actions);
+ tcf_tree_unlock(tp);
+ tcf_action_destroy(&tmp, TCA_ACT_UNBIND);
#endif
}
EXPORT_SYMBOL(tcf_exts_change);
-int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts,
- const struct tcf_ext_map *map)
+#define tcf_exts_first_act(ext) \
+ list_first_entry(&(exts)->actions, struct tc_action, list)
+
+int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
- if (map->action && exts->action) {
+ if (exts->action && !list_empty(&exts->actions)) {
/*
* again for backward compatible mode - we want
* to work with both old and new modes of entering
* tc data even if iproute2 was newer - jhs
*/
struct nlattr *nest;
-
- if (exts->action->type != TCA_OLD_COMPAT) {
- nest = nla_nest_start(skb, map->action);
+ if (exts->type != TCA_OLD_COMPAT) {
+ nest = nla_nest_start(skb, exts->action);
if (nest == NULL)
goto nla_put_failure;
- if (tcf_action_dump(skb, exts->action, 0, 0) < 0)
+ if (tcf_action_dump(skb, &exts->actions, 0, 0) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
- } else if (map->police) {
- nest = nla_nest_start(skb, map->police);
- if (nest == NULL)
+ } else if (exts->police) {
+ struct tc_action *act = tcf_exts_first_act(exts);
+ nest = nla_nest_start(skb, exts->police);
+ if (nest == NULL || !act)
goto nla_put_failure;
- if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0)
+ if (tcf_action_dump_old(skb, act, 0, 0) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
}
@@ -587,26 +593,23 @@ nla_put_failure: __attribute__ ((unused))
EXPORT_SYMBOL(tcf_exts_dump);
-int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts,
- const struct tcf_ext_map *map)
+int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
- if (exts->action)
- if (tcf_action_copy_stats(skb, exts->action, 1) < 0)
- goto nla_put_failure;
+ struct tc_action *a = tcf_exts_first_act(exts);
+ if (tcf_action_copy_stats(skb, a, 1) < 0)
+ return -1;
#endif
return 0;
-nla_put_failure: __attribute__ ((unused))
- return -1;
}
EXPORT_SYMBOL(tcf_exts_dump_stats);
static int __init tc_filter_init(void)
{
- rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL);
- rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL);
+ rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
- tc_dump_tfilter);
+ tc_dump_tfilter, NULL);
return 0;
}
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index f23d9155b1e..0ae1813e3e9 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -21,14 +21,12 @@
#include <net/act_api.h>
#include <net/pkt_cls.h>
-struct basic_head
-{
+struct basic_head {
u32 hgenerator;
struct list_head flist;
};
-struct basic_filter
-{
+struct basic_filter {
u32 handle;
struct tcf_exts exts;
struct tcf_ematch_tree ematches;
@@ -36,16 +34,11 @@ struct basic_filter
struct list_head link;
};
-static const struct tcf_ext_map basic_ext_map = {
- .action = TCA_BASIC_ACT,
- .police = TCA_BASIC_POLICE
-};
-
-static int basic_classify(struct sk_buff *skb, struct tcf_proto *tp,
+static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
int r;
- struct basic_head *head = (struct basic_head *) tp->root;
+ struct basic_head *head = tp->root;
struct basic_filter *f;
list_for_each_entry(f, &head->flist, link) {
@@ -63,7 +56,7 @@ static int basic_classify(struct sk_buff *skb, struct tcf_proto *tp,
static unsigned long basic_get(struct tcf_proto *tp, u32 handle)
{
unsigned long l = 0UL;
- struct basic_head *head = (struct basic_head *) tp->root;
+ struct basic_head *head = tp->root;
struct basic_filter *f;
if (head == NULL)
@@ -92,8 +85,7 @@ static int basic_init(struct tcf_proto *tp)
return 0;
}
-static inline void basic_delete_filter(struct tcf_proto *tp,
- struct basic_filter *f)
+static void basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f)
{
tcf_unbind_filter(tp, &f->res);
tcf_exts_destroy(tp, &f->exts);
@@ -115,7 +107,7 @@ static void basic_destroy(struct tcf_proto *tp)
static int basic_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct basic_head *head = (struct basic_head *) tp->root;
+ struct basic_head *head = tp->root;
struct basic_filter *t, *f = (struct basic_filter *) arg;
list_for_each_entry(t, &head->flist, link)
@@ -135,15 +127,17 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = {
[TCA_BASIC_EMATCHES] = { .type = NLA_NESTED },
};
-static inline int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f,
- unsigned long base, struct nlattr **tb,
- struct nlattr *est)
+static int basic_set_parms(struct net *net, struct tcf_proto *tp,
+ struct basic_filter *f, unsigned long base,
+ struct nlattr **tb,
+ struct nlattr *est, bool ovr)
{
- int err = -EINVAL;
+ int err;
struct tcf_exts e;
struct tcf_ematch_tree t;
- err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map);
+ tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE);
+ err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
if (err < 0)
return err;
@@ -165,11 +159,12 @@ errout:
return err;
}
-static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,
- struct nlattr **tca, unsigned long *arg)
+static int basic_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base, u32 handle,
+ struct nlattr **tca, unsigned long *arg, bool ovr)
{
int err;
- struct basic_head *head = (struct basic_head *) tp->root;
+ struct basic_head *head = tp->root;
struct nlattr *tb[TCA_BASIC_MAX + 1];
struct basic_filter *f = (struct basic_filter *) *arg;
@@ -184,7 +179,7 @@ static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,
if (f != NULL) {
if (handle && f->handle != handle)
return -EINVAL;
- return basic_set_parms(tp, f, base, tb, tca[TCA_RATE]);
+ return basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr);
}
err = -ENOBUFS;
@@ -192,6 +187,7 @@ static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,
if (f == NULL)
goto errout;
+ tcf_exts_init(&f->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
err = -EINVAL;
if (handle)
f->handle = handle;
@@ -203,14 +199,14 @@ static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,
} while (--i > 0 && basic_get(tp, head->hgenerator));
if (i <= 0) {
- printk(KERN_ERR "Insufficient number of handles\n");
+ pr_err("Insufficient number of handles\n");
goto errout;
}
f->handle = head->hgenerator;
}
- err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE]);
+ err = basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr);
if (err < 0)
goto errout;
@@ -229,7 +225,7 @@ errout:
static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
- struct basic_head *head = (struct basic_head *) tp->root;
+ struct basic_head *head = tp->root;
struct basic_filter *f;
list_for_each_entry(f, &head->flist, link) {
@@ -245,7 +241,7 @@ skip:
}
}
-static int basic_dump(struct tcf_proto *tp, unsigned long fh,
+static int basic_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct basic_filter *f = (struct basic_filter *) fh;
@@ -260,16 +256,17 @@ static int basic_dump(struct tcf_proto *tp, unsigned long fh,
if (nest == NULL)
goto nla_put_failure;
- if (f->res.classid)
- NLA_PUT_U32(skb, TCA_BASIC_CLASSID, f->res.classid);
+ if (f->res.classid &&
+ nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid))
+ goto nla_put_failure;
- if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 ||
+ if (tcf_exts_dump(skb, &f->exts) < 0 ||
tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
- if (tcf_exts_dump_stats(skb, &f->exts, &basic_ext_map) < 0)
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
goto nla_put_failure;
return skb->len;
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
new file mode 100644
index 00000000000..13f64df2c71
--- /dev/null
+++ b/net/sched/cls_bpf.c
@@ -0,0 +1,382 @@
+/*
+ * Berkeley Packet Filter based traffic classifier
+ *
+ * Might be used to classify traffic through flexible, user-defined and
+ * possibly JIT-ed BPF filters for traffic control as an alternative to
+ * ematches.
+ *
+ * (C) 2013 Daniel Borkmann <dborkman@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/filter.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
+MODULE_DESCRIPTION("TC BPF based classifier");
+
+struct cls_bpf_head {
+ struct list_head plist;
+ u32 hgen;
+};
+
+struct cls_bpf_prog {
+ struct sk_filter *filter;
+ struct sock_filter *bpf_ops;
+ struct tcf_exts exts;
+ struct tcf_result res;
+ struct list_head link;
+ u32 handle;
+ u16 bpf_len;
+};
+
+static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
+ [TCA_BPF_CLASSID] = { .type = NLA_U32 },
+ [TCA_BPF_OPS_LEN] = { .type = NLA_U16 },
+ [TCA_BPF_OPS] = { .type = NLA_BINARY,
+ .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
+};
+
+static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct cls_bpf_head *head = tp->root;
+ struct cls_bpf_prog *prog;
+ int ret;
+
+ list_for_each_entry(prog, &head->plist, link) {
+ int filter_res = SK_RUN_FILTER(prog->filter, skb);
+
+ if (filter_res == 0)
+ continue;
+
+ *res = prog->res;
+ if (filter_res != -1)
+ res->classid = filter_res;
+
+ ret = tcf_exts_exec(skb, &prog->exts, res);
+ if (ret < 0)
+ continue;
+
+ return ret;
+ }
+
+ return -1;
+}
+
+static int cls_bpf_init(struct tcf_proto *tp)
+{
+ struct cls_bpf_head *head;
+
+ head = kzalloc(sizeof(*head), GFP_KERNEL);
+ if (head == NULL)
+ return -ENOBUFS;
+
+ INIT_LIST_HEAD(&head->plist);
+ tp->root = head;
+
+ return 0;
+}
+
+static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
+{
+ tcf_unbind_filter(tp, &prog->res);
+ tcf_exts_destroy(tp, &prog->exts);
+
+ sk_unattached_filter_destroy(prog->filter);
+
+ kfree(prog->bpf_ops);
+ kfree(prog);
+}
+
+static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
+{
+ struct cls_bpf_head *head = tp->root;
+ struct cls_bpf_prog *prog, *todel = (struct cls_bpf_prog *) arg;
+
+ list_for_each_entry(prog, &head->plist, link) {
+ if (prog == todel) {
+ tcf_tree_lock(tp);
+ list_del(&prog->link);
+ tcf_tree_unlock(tp);
+
+ cls_bpf_delete_prog(tp, prog);
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+static void cls_bpf_destroy(struct tcf_proto *tp)
+{
+ struct cls_bpf_head *head = tp->root;
+ struct cls_bpf_prog *prog, *tmp;
+
+ list_for_each_entry_safe(prog, tmp, &head->plist, link) {
+ list_del(&prog->link);
+ cls_bpf_delete_prog(tp, prog);
+ }
+
+ kfree(head);
+}
+
+static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
+{
+ struct cls_bpf_head *head = tp->root;
+ struct cls_bpf_prog *prog;
+ unsigned long ret = 0UL;
+
+ if (head == NULL)
+ return 0UL;
+
+ list_for_each_entry(prog, &head->plist, link) {
+ if (prog->handle == handle) {
+ ret = (unsigned long) prog;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static void cls_bpf_put(struct tcf_proto *tp, unsigned long f)
+{
+}
+
+static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
+ struct cls_bpf_prog *prog,
+ unsigned long base, struct nlattr **tb,
+ struct nlattr *est, bool ovr)
+{
+ struct sock_filter *bpf_ops, *bpf_old;
+ struct tcf_exts exts;
+ struct sock_fprog_kern tmp;
+ struct sk_filter *fp, *fp_old;
+ u16 bpf_size, bpf_len;
+ u32 classid;
+ int ret;
+
+ if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID])
+ return -EINVAL;
+
+ tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE);
+ ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr);
+ if (ret < 0)
+ return ret;
+
+ classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
+ bpf_len = nla_get_u16(tb[TCA_BPF_OPS_LEN]);
+ if (bpf_len > BPF_MAXINSNS || bpf_len == 0) {
+ ret = -EINVAL;
+ goto errout;
+ }
+
+ bpf_size = bpf_len * sizeof(*bpf_ops);
+ bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
+ if (bpf_ops == NULL) {
+ ret = -ENOMEM;
+ goto errout;
+ }
+
+ memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size);
+
+ tmp.len = bpf_len;
+ tmp.filter = bpf_ops;
+
+ ret = sk_unattached_filter_create(&fp, &tmp);
+ if (ret)
+ goto errout_free;
+
+ tcf_tree_lock(tp);
+ fp_old = prog->filter;
+ bpf_old = prog->bpf_ops;
+
+ prog->bpf_len = bpf_len;
+ prog->bpf_ops = bpf_ops;
+ prog->filter = fp;
+ prog->res.classid = classid;
+ tcf_tree_unlock(tp);
+
+ tcf_bind_filter(tp, &prog->res, base);
+ tcf_exts_change(tp, &prog->exts, &exts);
+
+ if (fp_old)
+ sk_unattached_filter_destroy(fp_old);
+ if (bpf_old)
+ kfree(bpf_old);
+
+ return 0;
+
+errout_free:
+ kfree(bpf_ops);
+errout:
+ tcf_exts_destroy(tp, &exts);
+ return ret;
+}
+
+static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp,
+ struct cls_bpf_head *head)
+{
+ unsigned int i = 0x80000000;
+
+ do {
+ if (++head->hgen == 0x7FFFFFFF)
+ head->hgen = 1;
+ } while (--i > 0 && cls_bpf_get(tp, head->hgen));
+ if (i == 0)
+ pr_err("Insufficient number of handles\n");
+
+ return i;
+}
+
+static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
+ u32 handle, struct nlattr **tca,
+ unsigned long *arg, bool ovr)
+{
+ struct cls_bpf_head *head = tp->root;
+ struct cls_bpf_prog *prog = (struct cls_bpf_prog *) *arg;
+ struct nlattr *tb[TCA_BPF_MAX + 1];
+ int ret;
+
+ if (tca[TCA_OPTIONS] == NULL)
+ return -EINVAL;
+
+ ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy);
+ if (ret < 0)
+ return ret;
+
+ if (prog != NULL) {
+ if (handle && prog->handle != handle)
+ return -EINVAL;
+ return cls_bpf_modify_existing(net, tp, prog, base, tb,
+ tca[TCA_RATE], ovr);
+ }
+
+ prog = kzalloc(sizeof(*prog), GFP_KERNEL);
+ if (prog == NULL)
+ return -ENOBUFS;
+
+ tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE);
+ if (handle == 0)
+ prog->handle = cls_bpf_grab_new_handle(tp, head);
+ else
+ prog->handle = handle;
+ if (prog->handle == 0) {
+ ret = -EINVAL;
+ goto errout;
+ }
+
+ ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr);
+ if (ret < 0)
+ goto errout;
+
+ tcf_tree_lock(tp);
+ list_add(&prog->link, &head->plist);
+ tcf_tree_unlock(tp);
+
+ *arg = (unsigned long) prog;
+
+ return 0;
+errout:
+ if (*arg == 0UL && prog)
+ kfree(prog);
+
+ return ret;
+}
+
+static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
+ struct sk_buff *skb, struct tcmsg *tm)
+{
+ struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh;
+ struct nlattr *nest, *nla;
+
+ if (prog == NULL)
+ return skb->len;
+
+ tm->tcm_handle = prog->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid))
+ goto nla_put_failure;
+ if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_len))
+ goto nla_put_failure;
+
+ nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_len *
+ sizeof(struct sock_filter));
+ if (nla == NULL)
+ goto nla_put_failure;
+
+ memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla));
+
+ if (tcf_exts_dump(skb, &prog->exts) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &prog->exts) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct cls_bpf_head *head = tp->root;
+ struct cls_bpf_prog *prog;
+
+ list_for_each_entry(prog, &head->plist, link) {
+ if (arg->count < arg->skip)
+ goto skip;
+ if (arg->fn(tp, (unsigned long) prog, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+skip:
+ arg->count++;
+ }
+}
+
+static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
+ .kind = "bpf",
+ .owner = THIS_MODULE,
+ .classify = cls_bpf_classify,
+ .init = cls_bpf_init,
+ .destroy = cls_bpf_destroy,
+ .get = cls_bpf_get,
+ .put = cls_bpf_put,
+ .change = cls_bpf_change,
+ .delete = cls_bpf_delete,
+ .walk = cls_bpf_walk,
+ .dump = cls_bpf_dump,
+};
+
+static int __init cls_bpf_init_mod(void)
+{
+ return register_tcf_proto_ops(&cls_bpf_ops);
+}
+
+static void __exit cls_bpf_exit_mod(void)
+{
+ unregister_tcf_proto_ops(&cls_bpf_ops);
+}
+
+module_init(cls_bpf_init_mod);
+module_exit(cls_bpf_exit_mod);
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index d49c40fb7e0..cacf01bd04f 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -11,97 +11,20 @@
#include <linux/module.h>
#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/errno.h>
#include <linux/skbuff.h>
-#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <net/rtnetlink.h>
#include <net/pkt_cls.h>
#include <net/sock.h>
#include <net/cls_cgroup.h>
-static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
- struct cgroup *cgrp);
-static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
-static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
-
-struct cgroup_subsys net_cls_subsys = {
- .name = "net_cls",
- .create = cgrp_create,
- .destroy = cgrp_destroy,
- .populate = cgrp_populate,
-#ifdef CONFIG_NET_CLS_CGROUP
- .subsys_id = net_cls_subsys_id,
-#endif
- .module = THIS_MODULE,
-};
-
-
-static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp)
-{
- return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id),
- struct cgroup_cls_state, css);
-}
-
-static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p)
-{
- return container_of(task_subsys_state(p, net_cls_subsys_id),
- struct cgroup_cls_state, css);
-}
-
-static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
- struct cgroup *cgrp)
-{
- struct cgroup_cls_state *cs;
-
- if (!(cs = kzalloc(sizeof(*cs), GFP_KERNEL)))
- return ERR_PTR(-ENOMEM);
-
- if (cgrp->parent)
- cs->classid = cgrp_cls_state(cgrp->parent)->classid;
-
- return &cs->css;
-}
-
-static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
- kfree(cgrp_cls_state(cgrp));
-}
-
-static u64 read_classid(struct cgroup *cgrp, struct cftype *cft)
-{
- return cgrp_cls_state(cgrp)->classid;
-}
-
-static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value)
-{
- cgrp_cls_state(cgrp)->classid = (u32) value;
- return 0;
-}
-
-static struct cftype ss_files[] = {
- {
- .name = "classid",
- .read_u64 = read_classid,
- .write_u64 = write_classid,
- },
-};
-
-static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
- return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
-}
-
-struct cls_cgroup_head
-{
+struct cls_cgroup_head {
u32 handle;
struct tcf_exts exts;
struct tcf_ematch_tree ematches;
};
-static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
+static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct cls_cgroup_head *head = tp->root;
@@ -153,20 +76,16 @@ static int cls_cgroup_init(struct tcf_proto *tp)
return 0;
}
-static const struct tcf_ext_map cgroup_ext_map = {
- .action = TCA_CGROUP_ACT,
- .police = TCA_CGROUP_POLICE,
-};
-
static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = {
[TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED },
};
-static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base,
+static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
- unsigned long *arg)
+ unsigned long *arg, bool ovr)
{
- struct nlattr *tb[TCA_CGROUP_MAX+1];
+ struct nlattr *tb[TCA_CGROUP_MAX + 1];
struct cls_cgroup_head *head = tp->root;
struct tcf_ematch_tree t;
struct tcf_exts e;
@@ -183,6 +102,7 @@ static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base,
if (head == NULL)
return -ENOBUFS;
+ tcf_exts_init(&head->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
head->handle = handle;
tcf_tree_lock(tp);
@@ -198,7 +118,8 @@ static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base,
if (err < 0)
return err;
- err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &cgroup_ext_map);
+ tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
if (err < 0)
return err;
@@ -243,7 +164,7 @@ skip:
arg->count++;
}
-static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh,
+static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct cls_cgroup_head *head = tp->root;
@@ -256,13 +177,13 @@ static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh,
if (nest == NULL)
goto nla_put_failure;
- if (tcf_exts_dump(skb, &head->exts, &cgroup_ext_map) < 0 ||
+ if (tcf_exts_dump(skb, &head->exts) < 0 ||
tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
- if (tcf_exts_dump_stats(skb, &head->exts, &cgroup_ext_map) < 0)
+ if (tcf_exts_dump_stats(skb, &head->exts) < 0)
goto nla_put_failure;
return skb->len;
@@ -288,36 +209,12 @@ static struct tcf_proto_ops cls_cgroup_ops __read_mostly = {
static int __init init_cgroup_cls(void)
{
- int ret;
-
- ret = cgroup_load_subsys(&net_cls_subsys);
- if (ret)
- goto out;
-
-#ifndef CONFIG_NET_CLS_CGROUP
- /* We can't use rcu_assign_pointer because this is an int. */
- smp_wmb();
- net_cls_subsys_id = net_cls_subsys.subsys_id;
-#endif
-
- ret = register_tcf_proto_ops(&cls_cgroup_ops);
- if (ret)
- cgroup_unload_subsys(&net_cls_subsys);
-
-out:
- return ret;
+ return register_tcf_proto_ops(&cls_cgroup_ops);
}
static void __exit exit_cgroup_cls(void)
{
unregister_tcf_proto_ops(&cls_cgroup_ops);
-
-#ifndef CONFIG_NET_CLS_CGROUP
- net_cls_subsys_id = -1;
- synchronize_rcu();
-#endif
-
- cgroup_unload_subsys(&net_cls_subsys);
}
module_init(init_cgroup_cls);
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 5b271a18bc3..35be16f7c19 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -21,10 +21,13 @@
#include <linux/ipv6.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
+#include <linux/module.h>
#include <net/pkt_cls.h>
#include <net/ip.h>
#include <net/route.h>
+#include <net/flow_keys.h>
+
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
#include <net/netfilter/nf_conntrack.h>
#endif
@@ -53,11 +56,6 @@ struct flow_filter {
u32 hashrnd;
};
-static const struct tcf_ext_map flow_ext_map = {
- .action = TCA_FLOW_ACT,
- .police = TCA_FLOW_POLICE,
-};
-
static inline u32 addr_fold(void *addr)
{
unsigned long a = (unsigned long)addr;
@@ -65,132 +63,37 @@ static inline u32 addr_fold(void *addr)
return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0);
}
-static u32 flow_get_src(struct sk_buff *skb)
+static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow)
{
- switch (skb->protocol) {
- case htons(ETH_P_IP):
- if (pskb_network_may_pull(skb, sizeof(struct iphdr)))
- return ntohl(ip_hdr(skb)->saddr);
- break;
- case htons(ETH_P_IPV6):
- if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
- return ntohl(ipv6_hdr(skb)->saddr.s6_addr32[3]);
- break;
- }
-
+ if (flow->src)
+ return ntohl(flow->src);
return addr_fold(skb->sk);
}
-static u32 flow_get_dst(struct sk_buff *skb)
+static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)
{
- switch (skb->protocol) {
- case htons(ETH_P_IP):
- if (pskb_network_may_pull(skb, sizeof(struct iphdr)))
- return ntohl(ip_hdr(skb)->daddr);
- break;
- case htons(ETH_P_IPV6):
- if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
- return ntohl(ipv6_hdr(skb)->daddr.s6_addr32[3]);
- break;
- }
-
+ if (flow->dst)
+ return ntohl(flow->dst);
return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
}
-static u32 flow_get_proto(struct sk_buff *skb)
+static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow)
{
- switch (skb->protocol) {
- case htons(ETH_P_IP):
- return pskb_network_may_pull(skb, sizeof(struct iphdr)) ?
- ip_hdr(skb)->protocol : 0;
- case htons(ETH_P_IPV6):
- return pskb_network_may_pull(skb, sizeof(struct ipv6hdr)) ?
- ipv6_hdr(skb)->nexthdr : 0;
- default:
- return 0;
- }
+ return flow->ip_proto;
}
-static u32 flow_get_proto_src(struct sk_buff *skb)
+static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
{
- switch (skb->protocol) {
- case htons(ETH_P_IP): {
- struct iphdr *iph;
- int poff;
-
- if (!pskb_network_may_pull(skb, sizeof(*iph)))
- break;
- iph = ip_hdr(skb);
- if (iph->frag_off & htons(IP_MF|IP_OFFSET))
- break;
- poff = proto_ports_offset(iph->protocol);
- if (poff >= 0 &&
- pskb_network_may_pull(skb, iph->ihl * 4 + 2 + poff)) {
- iph = ip_hdr(skb);
- return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 +
- poff));
- }
- break;
- }
- case htons(ETH_P_IPV6): {
- struct ipv6hdr *iph;
- int poff;
-
- if (!pskb_network_may_pull(skb, sizeof(*iph)))
- break;
- iph = ipv6_hdr(skb);
- poff = proto_ports_offset(iph->nexthdr);
- if (poff >= 0 &&
- pskb_network_may_pull(skb, sizeof(*iph) + poff + 2)) {
- iph = ipv6_hdr(skb);
- return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) +
- poff));
- }
- break;
- }
- }
+ if (flow->ports)
+ return ntohs(flow->port16[0]);
return addr_fold(skb->sk);
}
-static u32 flow_get_proto_dst(struct sk_buff *skb)
+static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
{
- switch (skb->protocol) {
- case htons(ETH_P_IP): {
- struct iphdr *iph;
- int poff;
-
- if (!pskb_network_may_pull(skb, sizeof(*iph)))
- break;
- iph = ip_hdr(skb);
- if (iph->frag_off & htons(IP_MF|IP_OFFSET))
- break;
- poff = proto_ports_offset(iph->protocol);
- if (poff >= 0 &&
- pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
- iph = ip_hdr(skb);
- return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 +
- 2 + poff));
- }
- break;
- }
- case htons(ETH_P_IPV6): {
- struct ipv6hdr *iph;
- int poff;
-
- if (!pskb_network_may_pull(skb, sizeof(*iph)))
- break;
- iph = ipv6_hdr(skb);
- poff = proto_ports_offset(iph->nexthdr);
- if (poff >= 0 &&
- pskb_network_may_pull(skb, sizeof(*iph) + poff + 4)) {
- iph = ipv6_hdr(skb);
- return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) +
- poff + 2));
- }
- break;
- }
- }
+ if (flow->ports)
+ return ntohs(flow->port16[1]);
return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
}
@@ -223,7 +126,7 @@ static u32 flow_get_nfct(const struct sk_buff *skb)
#define CTTUPLE(skb, member) \
({ \
enum ip_conntrack_info ctinfo; \
- struct nf_conn *ct = nf_ct_get(skb, &ctinfo); \
+ const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); \
if (ct == NULL) \
goto fallback; \
ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member; \
@@ -236,7 +139,7 @@ static u32 flow_get_nfct(const struct sk_buff *skb)
})
#endif
-static u32 flow_get_nfct_src(struct sk_buff *skb)
+static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow)
{
switch (skb->protocol) {
case htons(ETH_P_IP):
@@ -245,10 +148,10 @@ static u32 flow_get_nfct_src(struct sk_buff *skb)
return ntohl(CTTUPLE(skb, src.u3.ip6[3]));
}
fallback:
- return flow_get_src(skb);
+ return flow_get_src(skb, flow);
}
-static u32 flow_get_nfct_dst(struct sk_buff *skb)
+static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow)
{
switch (skb->protocol) {
case htons(ETH_P_IP):
@@ -257,26 +160,26 @@ static u32 flow_get_nfct_dst(struct sk_buff *skb)
return ntohl(CTTUPLE(skb, dst.u3.ip6[3]));
}
fallback:
- return flow_get_dst(skb);
+ return flow_get_dst(skb, flow);
}
-static u32 flow_get_nfct_proto_src(struct sk_buff *skb)
+static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
{
return ntohs(CTTUPLE(skb, src.u.all));
fallback:
- return flow_get_proto_src(skb);
+ return flow_get_proto_src(skb, flow);
}
-static u32 flow_get_nfct_proto_dst(struct sk_buff *skb)
+static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
{
return ntohs(CTTUPLE(skb, dst.u.all));
fallback:
- return flow_get_proto_dst(skb);
+ return flow_get_proto_dst(skb, flow);
}
static u32 flow_get_rtclassid(const struct sk_buff *skb)
{
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (skb_dst(skb))
return skb_dst(skb)->tclassid;
#endif
@@ -285,15 +188,19 @@ static u32 flow_get_rtclassid(const struct sk_buff *skb)
static u32 flow_get_skuid(const struct sk_buff *skb)
{
- if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file)
- return skb->sk->sk_socket->file->f_cred->fsuid;
+ if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) {
+ kuid_t skuid = skb->sk->sk_socket->file->f_cred->fsuid;
+ return from_kuid(&init_user_ns, skuid);
+ }
return 0;
}
static u32 flow_get_skgid(const struct sk_buff *skb)
{
- if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file)
- return skb->sk->sk_socket->file->f_cred->fsgid;
+ if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) {
+ kgid_t skgid = skb->sk->sk_socket->file->f_cred->fsgid;
+ return from_kgid(&init_user_ns, skgid);
+ }
return 0;
}
@@ -308,22 +215,22 @@ static u32 flow_get_vlan_tag(const struct sk_buff *skb)
static u32 flow_get_rxhash(struct sk_buff *skb)
{
- return skb_get_rxhash(skb);
+ return skb_get_hash(skb);
}
-static u32 flow_key_get(struct sk_buff *skb, int key)
+static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow)
{
switch (key) {
case FLOW_KEY_SRC:
- return flow_get_src(skb);
+ return flow_get_src(skb, flow);
case FLOW_KEY_DST:
- return flow_get_dst(skb);
+ return flow_get_dst(skb, flow);
case FLOW_KEY_PROTO:
- return flow_get_proto(skb);
+ return flow_get_proto(skb, flow);
case FLOW_KEY_PROTO_SRC:
- return flow_get_proto_src(skb);
+ return flow_get_proto_src(skb, flow);
case FLOW_KEY_PROTO_DST:
- return flow_get_proto_dst(skb);
+ return flow_get_proto_dst(skb, flow);
case FLOW_KEY_IIF:
return flow_get_iif(skb);
case FLOW_KEY_PRIORITY:
@@ -333,13 +240,13 @@ static u32 flow_key_get(struct sk_buff *skb, int key)
case FLOW_KEY_NFCT:
return flow_get_nfct(skb);
case FLOW_KEY_NFCT_SRC:
- return flow_get_nfct_src(skb);
+ return flow_get_nfct_src(skb, flow);
case FLOW_KEY_NFCT_DST:
- return flow_get_nfct_dst(skb);
+ return flow_get_nfct_dst(skb, flow);
case FLOW_KEY_NFCT_PROTO_SRC:
- return flow_get_nfct_proto_src(skb);
+ return flow_get_nfct_proto_src(skb, flow);
case FLOW_KEY_NFCT_PROTO_DST:
- return flow_get_nfct_proto_dst(skb);
+ return flow_get_nfct_proto_dst(skb, flow);
case FLOW_KEY_RTCLASSID:
return flow_get_rtclassid(skb);
case FLOW_KEY_SKUID:
@@ -356,7 +263,17 @@ static u32 flow_key_get(struct sk_buff *skb, int key)
}
}
-static int flow_classify(struct sk_buff *skb, struct tcf_proto *tp,
+#define FLOW_KEYS_NEEDED ((1 << FLOW_KEY_SRC) | \
+ (1 << FLOW_KEY_DST) | \
+ (1 << FLOW_KEY_PROTO) | \
+ (1 << FLOW_KEY_PROTO_SRC) | \
+ (1 << FLOW_KEY_PROTO_DST) | \
+ (1 << FLOW_KEY_NFCT_SRC) | \
+ (1 << FLOW_KEY_NFCT_DST) | \
+ (1 << FLOW_KEY_NFCT_PROTO_SRC) | \
+ (1 << FLOW_KEY_NFCT_PROTO_DST))
+
+static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct flow_head *head = tp->root;
@@ -367,17 +284,20 @@ static int flow_classify(struct sk_buff *skb, struct tcf_proto *tp,
int r;
list_for_each_entry(f, &head->filters, list) {
- u32 keys[f->nkeys];
+ u32 keys[FLOW_KEY_MAX + 1];
+ struct flow_keys flow_keys;
if (!tcf_em_tree_match(skb, &f->ematches, NULL))
continue;
keymask = f->keymask;
+ if (keymask & FLOW_KEYS_NEEDED)
+ skb_flow_dissect(skb, &flow_keys);
for (n = 0; n < f->nkeys; n++) {
key = ffs(keymask) - 1;
keymask &= ~(1 << key);
- keys[n] = flow_key_get(skb, key);
+ keys[n] = flow_key_get(skb, key, &flow_keys);
}
if (f->mode == FLOW_MODE_HASH)
@@ -426,9 +346,10 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {
[TCA_FLOW_PERTURB] = { .type = NLA_U32 },
};
-static int flow_change(struct tcf_proto *tp, unsigned long base,
+static int flow_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
- unsigned long *arg)
+ unsigned long *arg, bool ovr)
{
struct flow_head *head = tp->root;
struct flow_filter *f;
@@ -465,9 +386,14 @@ static int flow_change(struct tcf_proto *tp, unsigned long base,
if (fls(keymask) - 1 > FLOW_KEY_MAX)
return -EOPNOTSUPP;
+
+ if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) &&
+ sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns)
+ return -EOPNOTSUPP;
}
- err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &flow_ext_map);
+ tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE);
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
if (err < 0)
return err;
@@ -525,6 +451,7 @@ static int flow_change(struct tcf_proto *tp, unsigned long base,
f->handle = handle;
f->mask = ~0U;
+ tcf_exts_init(&f->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
get_random_bytes(&f->hashrnd, 4);
f->perturb_timer.function = flow_perturbation;
@@ -636,7 +563,7 @@ static void flow_put(struct tcf_proto *tp, unsigned long f)
{
}
-static int flow_dump(struct tcf_proto *tp, unsigned long fh,
+static int flow_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct flow_filter *f = (struct flow_filter *)fh;
@@ -651,27 +578,34 @@ static int flow_dump(struct tcf_proto *tp, unsigned long fh,
if (nest == NULL)
goto nla_put_failure;
- NLA_PUT_U32(skb, TCA_FLOW_KEYS, f->keymask);
- NLA_PUT_U32(skb, TCA_FLOW_MODE, f->mode);
+ if (nla_put_u32(skb, TCA_FLOW_KEYS, f->keymask) ||
+ nla_put_u32(skb, TCA_FLOW_MODE, f->mode))
+ goto nla_put_failure;
if (f->mask != ~0 || f->xor != 0) {
- NLA_PUT_U32(skb, TCA_FLOW_MASK, f->mask);
- NLA_PUT_U32(skb, TCA_FLOW_XOR, f->xor);
+ if (nla_put_u32(skb, TCA_FLOW_MASK, f->mask) ||
+ nla_put_u32(skb, TCA_FLOW_XOR, f->xor))
+ goto nla_put_failure;
}
- if (f->rshift)
- NLA_PUT_U32(skb, TCA_FLOW_RSHIFT, f->rshift);
- if (f->addend)
- NLA_PUT_U32(skb, TCA_FLOW_ADDEND, f->addend);
+ if (f->rshift &&
+ nla_put_u32(skb, TCA_FLOW_RSHIFT, f->rshift))
+ goto nla_put_failure;
+ if (f->addend &&
+ nla_put_u32(skb, TCA_FLOW_ADDEND, f->addend))
+ goto nla_put_failure;
- if (f->divisor)
- NLA_PUT_U32(skb, TCA_FLOW_DIVISOR, f->divisor);
- if (f->baseclass)
- NLA_PUT_U32(skb, TCA_FLOW_BASECLASS, f->baseclass);
+ if (f->divisor &&
+ nla_put_u32(skb, TCA_FLOW_DIVISOR, f->divisor))
+ goto nla_put_failure;
+ if (f->baseclass &&
+ nla_put_u32(skb, TCA_FLOW_BASECLASS, f->baseclass))
+ goto nla_put_failure;
- if (f->perturb_period)
- NLA_PUT_U32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ);
+ if (f->perturb_period &&
+ nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ))
+ goto nla_put_failure;
- if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0)
+ if (tcf_exts_dump(skb, &f->exts) < 0)
goto nla_put_failure;
#ifdef CONFIG_NET_EMATCH
if (f->ematches.hdr.nmatches &&
@@ -680,7 +614,7 @@ static int flow_dump(struct tcf_proto *tp, unsigned long fh,
#endif
nla_nest_end(skb, nest);
- if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0)
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
goto nla_put_failure;
return skb->len;
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 93b0a7b6f9b..861b03ccfed 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -29,71 +29,45 @@
#include <net/act_api.h>
#include <net/pkt_cls.h>
-#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *))
+#define HTSIZE 256
-struct fw_head
-{
- struct fw_filter *ht[HTSIZE];
- u32 mask;
+struct fw_head {
+ u32 mask;
+ struct fw_filter *ht[HTSIZE];
};
-struct fw_filter
-{
+struct fw_filter {
struct fw_filter *next;
u32 id;
struct tcf_result res;
#ifdef CONFIG_NET_CLS_IND
- char indev[IFNAMSIZ];
+ int ifindex;
#endif /* CONFIG_NET_CLS_IND */
struct tcf_exts exts;
};
-static const struct tcf_ext_map fw_ext_map = {
- .action = TCA_FW_ACT,
- .police = TCA_FW_POLICE
-};
-
-static __inline__ int fw_hash(u32 handle)
+static u32 fw_hash(u32 handle)
{
- if (HTSIZE == 4096)
- return ((handle >> 24) & 0xFFF) ^
- ((handle >> 12) & 0xFFF) ^
- (handle & 0xFFF);
- else if (HTSIZE == 2048)
- return ((handle >> 22) & 0x7FF) ^
- ((handle >> 11) & 0x7FF) ^
- (handle & 0x7FF);
- else if (HTSIZE == 1024)
- return ((handle >> 20) & 0x3FF) ^
- ((handle >> 10) & 0x3FF) ^
- (handle & 0x3FF);
- else if (HTSIZE == 512)
- return (handle >> 27) ^
- ((handle >> 18) & 0x1FF) ^
- ((handle >> 9) & 0x1FF) ^
- (handle & 0x1FF);
- else if (HTSIZE == 256) {
- u8 *t = (u8 *) &handle;
- return t[0] ^ t[1] ^ t[2] ^ t[3];
- } else
- return handle & (HTSIZE - 1);
+ handle ^= (handle >> 16);
+ handle ^= (handle >> 8);
+ return handle % HTSIZE;
}
-static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
+static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
- struct fw_head *head = (struct fw_head*)tp->root;
+ struct fw_head *head = tp->root;
struct fw_filter *f;
int r;
u32 id = skb->mark;
if (head != NULL) {
id &= head->mask;
- for (f=head->ht[fw_hash(id)]; f; f=f->next) {
+ for (f = head->ht[fw_hash(id)]; f; f = f->next) {
if (f->id == id) {
*res = f->res;
#ifdef CONFIG_NET_CLS_IND
- if (!tcf_match_indev(skb, f->indev))
+ if (!tcf_match_indev(skb, f->ifindex))
continue;
#endif /* CONFIG_NET_CLS_IND */
r = tcf_exts_exec(skb, &f->exts, res);
@@ -105,7 +79,8 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
}
} else {
/* old method */
- if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id^tp->q->handle)))) {
+ if (id && (TC_H_MAJ(id) == 0 ||
+ !(TC_H_MAJ(id ^ tp->q->handle)))) {
res->classid = id;
res->class = 0;
return 0;
@@ -117,13 +92,13 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
static unsigned long fw_get(struct tcf_proto *tp, u32 handle)
{
- struct fw_head *head = (struct fw_head*)tp->root;
+ struct fw_head *head = tp->root;
struct fw_filter *f;
if (head == NULL)
return 0;
- for (f=head->ht[fw_hash(handle)]; f; f=f->next) {
+ for (f = head->ht[fw_hash(handle)]; f; f = f->next) {
if (f->id == handle)
return (unsigned long)f;
}
@@ -139,8 +114,7 @@ static int fw_init(struct tcf_proto *tp)
return 0;
}
-static inline void
-fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f)
+static void fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f)
{
tcf_unbind_filter(tp, &f->res);
tcf_exts_destroy(tp, &f->exts);
@@ -156,8 +130,8 @@ static void fw_destroy(struct tcf_proto *tp)
if (head == NULL)
return;
- for (h=0; h<HTSIZE; h++) {
- while ((f=head->ht[h]) != NULL) {
+ for (h = 0; h < HTSIZE; h++) {
+ while ((f = head->ht[h]) != NULL) {
head->ht[h] = f->next;
fw_delete_filter(tp, f);
}
@@ -167,14 +141,14 @@ static void fw_destroy(struct tcf_proto *tp)
static int fw_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct fw_head *head = (struct fw_head*)tp->root;
- struct fw_filter *f = (struct fw_filter*)arg;
+ struct fw_head *head = tp->root;
+ struct fw_filter *f = (struct fw_filter *)arg;
struct fw_filter **fp;
if (head == NULL || f == NULL)
goto out;
- for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) {
+ for (fp = &head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
tcf_tree_lock(tp);
*fp = f->next;
@@ -194,19 +168,19 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {
};
static int
-fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f,
- struct nlattr **tb, struct nlattr **tca, unsigned long base)
+fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f,
+ struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr)
{
- struct fw_head *head = (struct fw_head *)tp->root;
+ struct fw_head *head = tp->root;
struct tcf_exts e;
u32 mask;
int err;
- err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &fw_ext_map);
+ tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE);
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
if (err < 0)
return err;
- err = -EINVAL;
if (tb[TCA_FW_CLASSID]) {
f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]);
tcf_bind_filter(tp, &f->res, base);
@@ -214,12 +188,17 @@ fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f,
#ifdef CONFIG_NET_CLS_IND
if (tb[TCA_FW_INDEV]) {
- err = tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV]);
- if (err < 0)
+ int ret;
+ ret = tcf_change_indev(net, tb[TCA_FW_INDEV]);
+ if (ret < 0) {
+ err = ret;
goto errout;
+ }
+ f->ifindex = ret;
}
#endif /* CONFIG_NET_CLS_IND */
+ err = -EINVAL;
if (tb[TCA_FW_MASK]) {
mask = nla_get_u32(tb[TCA_FW_MASK]);
if (mask != head->mask)
@@ -235,12 +214,13 @@ errout:
return err;
}
-static int fw_change(struct tcf_proto *tp, unsigned long base,
+static int fw_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
u32 handle,
struct nlattr **tca,
- unsigned long *arg)
+ unsigned long *arg, bool ovr)
{
- struct fw_head *head = (struct fw_head*)tp->root;
+ struct fw_head *head = tp->root;
struct fw_filter *f = (struct fw_filter *) *arg;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_FW_MAX + 1];
@@ -256,7 +236,7 @@ static int fw_change(struct tcf_proto *tp, unsigned long base,
if (f != NULL) {
if (f->id != handle && handle)
return -EINVAL;
- return fw_change_attrs(tp, f, tb, tca, base);
+ return fw_change_attrs(net, tp, f, tb, tca, base, ovr);
}
if (!handle)
@@ -281,9 +261,10 @@ static int fw_change(struct tcf_proto *tp, unsigned long base,
if (f == NULL)
return -ENOBUFS;
+ tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE);
f->id = handle;
- err = fw_change_attrs(tp, f, tb, tca, base);
+ err = fw_change_attrs(net, tp, f, tb, tca, base, ovr);
if (err < 0)
goto errout;
@@ -302,7 +283,7 @@ errout:
static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
- struct fw_head *head = (struct fw_head*)tp->root;
+ struct fw_head *head = tp->root;
int h;
if (head == NULL)
@@ -328,11 +309,11 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
}
}
-static int fw_dump(struct tcf_proto *tp, unsigned long fh,
+static int fw_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct fw_head *head = (struct fw_head *)tp->root;
- struct fw_filter *f = (struct fw_filter*)fh;
+ struct fw_head *head = tp->root;
+ struct fw_filter *f = (struct fw_filter *)fh;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
@@ -348,21 +329,27 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh,
if (nest == NULL)
goto nla_put_failure;
- if (f->res.classid)
- NLA_PUT_U32(skb, TCA_FW_CLASSID, f->res.classid);
+ if (f->res.classid &&
+ nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid))
+ goto nla_put_failure;
#ifdef CONFIG_NET_CLS_IND
- if (strlen(f->indev))
- NLA_PUT_STRING(skb, TCA_FW_INDEV, f->indev);
+ if (f->ifindex) {
+ struct net_device *dev;
+ dev = __dev_get_by_index(net, f->ifindex);
+ if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name))
+ goto nla_put_failure;
+ }
#endif /* CONFIG_NET_CLS_IND */
- if (head->mask != 0xFFFFFFFF)
- NLA_PUT_U32(skb, TCA_FW_MASK, head->mask);
+ if (head->mask != 0xFFFFFFFF &&
+ nla_put_u32(skb, TCA_FW_MASK, head->mask))
+ goto nla_put_failure;
- if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0)
+ if (tcf_exts_dump(skb, &f->exts) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
- if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0)
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
goto nla_put_failure;
return skb->len;
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 694dcd85dec..dd9fc2523c7 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -23,34 +23,30 @@
#include <net/pkt_cls.h>
/*
- 1. For now we assume that route tags < 256.
- It allows to use direct table lookups, instead of hash tables.
- 2. For now we assume that "from TAG" and "fromdev DEV" statements
- are mutually exclusive.
- 3. "to TAG from ANY" has higher priority, than "to ANY from XXX"
+ * 1. For now we assume that route tags < 256.
+ * It allows to use direct table lookups, instead of hash tables.
+ * 2. For now we assume that "from TAG" and "fromdev DEV" statements
+ * are mutually exclusive.
+ * 3. "to TAG from ANY" has higher priority, than "to ANY from XXX"
*/
-struct route4_fastmap
-{
+struct route4_fastmap {
struct route4_filter *filter;
u32 id;
int iif;
};
-struct route4_head
-{
+struct route4_head {
struct route4_fastmap fastmap[16];
- struct route4_bucket *table[256+1];
+ struct route4_bucket *table[256 + 1];
};
-struct route4_bucket
-{
+struct route4_bucket {
/* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */
- struct route4_filter *ht[16+16+1];
+ struct route4_filter *ht[16 + 16 + 1];
};
-struct route4_filter
-{
+struct route4_filter {
struct route4_filter *next;
u32 id;
int iif;
@@ -61,20 +57,15 @@ struct route4_filter
struct route4_bucket *bkt;
};
-#define ROUTE4_FAILURE ((struct route4_filter*)(-1L))
+#define ROUTE4_FAILURE ((struct route4_filter *)(-1L))
-static const struct tcf_ext_map route_ext_map = {
- .police = TCA_ROUTE4_POLICE,
- .action = TCA_ROUTE4_ACT
-};
-
-static __inline__ int route4_fastmap_hash(u32 id, int iif)
+static inline int route4_fastmap_hash(u32 id, int iif)
{
- return id&0xF;
+ return id & 0xF;
}
-static inline
-void route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id)
+static void
+route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id)
{
spinlock_t *root_lock = qdisc_root_sleeping_lock(q);
@@ -83,32 +74,33 @@ void route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id)
spin_unlock_bh(root_lock);
}
-static inline void
+static void
route4_set_fastmap(struct route4_head *head, u32 id, int iif,
struct route4_filter *f)
{
int h = route4_fastmap_hash(id, iif);
+
head->fastmap[h].id = id;
head->fastmap[h].iif = iif;
head->fastmap[h].filter = f;
}
-static __inline__ int route4_hash_to(u32 id)
+static inline int route4_hash_to(u32 id)
{
- return id&0xFF;
+ return id & 0xFF;
}
-static __inline__ int route4_hash_from(u32 id)
+static inline int route4_hash_from(u32 id)
{
- return (id>>16)&0xF;
+ return (id >> 16) & 0xF;
}
-static __inline__ int route4_hash_iif(int iif)
+static inline int route4_hash_iif(int iif)
{
- return 16 + ((iif>>16)&0xF);
+ return 16 + ((iif >> 16) & 0xF);
}
-static __inline__ int route4_hash_wild(void)
+static inline int route4_hash_wild(void)
{
return 32;
}
@@ -128,24 +120,25 @@ static __inline__ int route4_hash_wild(void)
return 0; \
}
-static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp,
+static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
- struct route4_head *head = (struct route4_head*)tp->root;
+ struct route4_head *head = tp->root;
struct dst_entry *dst;
struct route4_bucket *b;
struct route4_filter *f;
u32 id, h;
int iif, dont_cache = 0;
- if ((dst = skb_dst(skb)) == NULL)
+ dst = skb_dst(skb);
+ if (!dst)
goto failure;
id = dst->tclassid;
if (head == NULL)
goto old_method;
- iif = ((struct rtable*)dst)->fl.iif;
+ iif = inet_iif(skb);
h = route4_fastmap_hash(id, iif);
if (id == head->fastmap[h].id &&
@@ -161,7 +154,8 @@ static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp,
h = route4_hash_to(id);
restart:
- if ((b = head->table[h]) != NULL) {
+ b = head->table[h];
+ if (b) {
for (f = b->ht[route4_hash_from(id)]; f; f = f->next)
if (f->id == id)
ROUTE4_APPLY_RESULT();
@@ -197,8 +191,9 @@ old_method:
static inline u32 to_hash(u32 id)
{
- u32 h = id&0xFF;
- if (id&0x8000)
+ u32 h = id & 0xFF;
+
+ if (id & 0x8000)
h += 256;
return h;
}
@@ -211,17 +206,17 @@ static inline u32 from_hash(u32 id)
if (!(id & 0x8000)) {
if (id > 255)
return 256;
- return id&0xF;
+ return id & 0xF;
}
- return 16 + (id&0xF);
+ return 16 + (id & 0xF);
}
static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
{
- struct route4_head *head = (struct route4_head*)tp->root;
+ struct route4_head *head = tp->root;
struct route4_bucket *b;
struct route4_filter *f;
- unsigned h1, h2;
+ unsigned int h1, h2;
if (!head)
return 0;
@@ -230,11 +225,12 @@ static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
if (h1 > 256)
return 0;
- h2 = from_hash(handle>>16);
+ h2 = from_hash(handle >> 16);
if (h2 > 32)
return 0;
- if ((b = head->table[h1]) != NULL) {
+ b = head->table[h1];
+ if (b) {
for (f = b->ht[h2]; f; f = f->next)
if (f->handle == handle)
return (unsigned long)f;
@@ -251,7 +247,7 @@ static int route4_init(struct tcf_proto *tp)
return 0;
}
-static inline void
+static void
route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f)
{
tcf_unbind_filter(tp, &f->res);
@@ -267,11 +263,12 @@ static void route4_destroy(struct tcf_proto *tp)
if (head == NULL)
return;
- for (h1=0; h1<=256; h1++) {
+ for (h1 = 0; h1 <= 256; h1++) {
struct route4_bucket *b;
- if ((b = head->table[h1]) != NULL) {
- for (h2=0; h2<=32; h2++) {
+ b = head->table[h1];
+ if (b) {
+ for (h2 = 0; h2 <= 32; h2++) {
struct route4_filter *f;
while ((f = b->ht[h2]) != NULL) {
@@ -287,9 +284,9 @@ static void route4_destroy(struct tcf_proto *tp)
static int route4_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct route4_head *head = (struct route4_head*)tp->root;
- struct route4_filter **fp, *f = (struct route4_filter*)arg;
- unsigned h = 0;
+ struct route4_head *head = tp->root;
+ struct route4_filter **fp, *f = (struct route4_filter *)arg;
+ unsigned int h = 0;
struct route4_bucket *b;
int i;
@@ -299,7 +296,7 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg)
h = f->handle;
b = f->bkt;
- for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) {
+ for (fp = &b->ht[from_hash(h >> 16)]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
tcf_tree_lock(tp);
*fp = f->next;
@@ -310,7 +307,7 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg)
/* Strip tree */
- for (i=0; i<=32; i++)
+ for (i = 0; i <= 32; i++)
if (b->ht[i])
return 0;
@@ -333,9 +330,11 @@ static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = {
[TCA_ROUTE4_IIF] = { .type = NLA_U32 },
};
-static int route4_set_parms(struct tcf_proto *tp, unsigned long base,
- struct route4_filter *f, u32 handle, struct route4_head *head,
- struct nlattr **tb, struct nlattr *est, int new)
+static int route4_set_parms(struct net *net, struct tcf_proto *tp,
+ unsigned long base, struct route4_filter *f,
+ u32 handle, struct route4_head *head,
+ struct nlattr **tb, struct nlattr *est, int new,
+ bool ovr)
{
int err;
u32 id = 0, to = 0, nhandle = 0x8000;
@@ -344,7 +343,8 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base,
struct route4_bucket *b;
struct tcf_exts e;
- err = tcf_exts_validate(tp, tb, est, &e, &route_ext_map);
+ tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
+ err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
if (err < 0)
return err;
@@ -380,7 +380,8 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base,
}
h1 = to_hash(nhandle);
- if ((b = head->table[h1]) == NULL) {
+ b = head->table[h1];
+ if (!b) {
err = -ENOBUFS;
b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL);
if (b == NULL)
@@ -391,6 +392,7 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base,
tcf_tree_unlock(tp);
} else {
unsigned int h2 = from_hash(nhandle >> 16);
+
err = -EEXIST;
for (fp = b->ht[h2]; fp; fp = fp->next)
if (fp->handle == f->handle)
@@ -423,10 +425,11 @@ errout:
return err;
}
-static int route4_change(struct tcf_proto *tp, unsigned long base,
+static int route4_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
u32 handle,
struct nlattr **tca,
- unsigned long *arg)
+ unsigned long *arg, bool ovr)
{
struct route4_head *head = tp->root;
struct route4_filter *f, *f1, **fp;
@@ -444,15 +447,16 @@ static int route4_change(struct tcf_proto *tp, unsigned long base,
if (err < 0)
return err;
- if ((f = (struct route4_filter*)*arg) != NULL) {
+ f = (struct route4_filter *)*arg;
+ if (f) {
if (f->handle != handle && handle)
return -EINVAL;
if (f->bkt)
old_handle = f->handle;
- err = route4_set_parms(tp, base, f, handle, head, tb,
- tca[TCA_RATE], 0);
+ err = route4_set_parms(net, tp, base, f, handle, head, tb,
+ tca[TCA_RATE], 0, ovr);
if (err < 0)
return err;
@@ -474,14 +478,15 @@ static int route4_change(struct tcf_proto *tp, unsigned long base,
if (f == NULL)
goto errout;
- err = route4_set_parms(tp, base, f, handle, head, tb,
- tca[TCA_RATE], 1);
+ tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
+ err = route4_set_parms(net, tp, base, f, handle, head, tb,
+ tca[TCA_RATE], 1, ovr);
if (err < 0)
goto errout;
reinsert:
h = from_hash(f->handle >> 16);
- for (fp = &f->bkt->ht[h]; (f1=*fp) != NULL; fp = &f1->next)
+ for (fp = &f->bkt->ht[h]; (f1 = *fp) != NULL; fp = &f1->next)
if (f->handle < f1->handle)
break;
@@ -492,7 +497,8 @@ reinsert:
if (old_handle && f->handle != old_handle) {
th = to_hash(old_handle);
h = from_hash(old_handle >> 16);
- if ((b = head->table[th]) != NULL) {
+ b = head->table[th];
+ if (b) {
for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
*fp = f->next;
@@ -515,7 +521,7 @@ errout:
static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct route4_head *head = tp->root;
- unsigned h, h1;
+ unsigned int h, h1;
if (head == NULL)
arg->stop = 1;
@@ -546,10 +552,10 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
}
}
-static int route4_dump(struct tcf_proto *tp, unsigned long fh,
+static int route4_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct route4_filter *f = (struct route4_filter*)fh;
+ struct route4_filter *f = (struct route4_filter *)fh;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
u32 id;
@@ -563,26 +569,30 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh,
if (nest == NULL)
goto nla_put_failure;
- if (!(f->handle&0x8000)) {
- id = f->id&0xFF;
- NLA_PUT_U32(skb, TCA_ROUTE4_TO, id);
+ if (!(f->handle & 0x8000)) {
+ id = f->id & 0xFF;
+ if (nla_put_u32(skb, TCA_ROUTE4_TO, id))
+ goto nla_put_failure;
}
- if (f->handle&0x80000000) {
- if ((f->handle>>16) != 0xFFFF)
- NLA_PUT_U32(skb, TCA_ROUTE4_IIF, f->iif);
+ if (f->handle & 0x80000000) {
+ if ((f->handle >> 16) != 0xFFFF &&
+ nla_put_u32(skb, TCA_ROUTE4_IIF, f->iif))
+ goto nla_put_failure;
} else {
- id = f->id>>16;
- NLA_PUT_U32(skb, TCA_ROUTE4_FROM, id);
+ id = f->id >> 16;
+ if (nla_put_u32(skb, TCA_ROUTE4_FROM, id))
+ goto nla_put_failure;
}
- if (f->res.classid)
- NLA_PUT_U32(skb, TCA_ROUTE4_CLASSID, f->res.classid);
+ if (f->res.classid &&
+ nla_put_u32(skb, TCA_ROUTE4_CLASSID, f->res.classid))
+ goto nla_put_failure;
- if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0)
+ if (tcf_exts_dump(skb, &f->exts) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
- if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0)
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
goto nla_put_failure;
return skb->len;
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 425a1790b04..1020e233a5d 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -66,28 +66,25 @@
powerful classification engine. */
-struct rsvp_head
-{
+struct rsvp_head {
u32 tmap[256/32];
u32 hgenerator;
u8 tgenerator;
struct rsvp_session *ht[256];
};
-struct rsvp_session
-{
+struct rsvp_session {
struct rsvp_session *next;
__be32 dst[RSVP_DST_LEN];
struct tc_rsvp_gpi dpi;
u8 protocol;
u8 tunnelid;
/* 16 (src,sport) hash slots, and one wildcard source slot */
- struct rsvp_filter *ht[16+1];
+ struct rsvp_filter *ht[16 + 1];
};
-struct rsvp_filter
-{
+struct rsvp_filter {
struct rsvp_filter *next;
__be32 src[RSVP_DST_LEN];
struct tc_rsvp_gpi spi;
@@ -100,28 +97,25 @@ struct rsvp_filter
struct rsvp_session *sess;
};
-static __inline__ unsigned hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
+static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
{
- unsigned h = (__force __u32)dst[RSVP_DST_LEN-1];
+ unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1];
+
h ^= h>>16;
h ^= h>>8;
return (h ^ protocol ^ tunnelid) & 0xFF;
}
-static __inline__ unsigned hash_src(__be32 *src)
+static inline unsigned int hash_src(__be32 *src)
{
- unsigned h = (__force __u32)src[RSVP_DST_LEN-1];
+ unsigned int h = (__force __u32)src[RSVP_DST_LEN-1];
+
h ^= h>>16;
h ^= h>>8;
h ^= h>>4;
return h & 0xF;
}
-static struct tcf_ext_map rsvp_ext_map = {
- .police = TCA_RSVP_POLICE,
- .action = TCA_RSVP_ACT
-};
-
#define RSVP_APPLY_RESULT() \
{ \
int r = tcf_exts_exec(skb, &f->exts, res); \
@@ -131,13 +125,13 @@ static struct tcf_ext_map rsvp_ext_map = {
return r; \
}
-static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp,
+static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
- struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht;
+ struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht;
struct rsvp_session *s;
struct rsvp_filter *f;
- unsigned h1, h2;
+ unsigned int h1, h2;
__be32 *dst, *src;
u8 protocol;
u8 tunnelid = 0;
@@ -162,13 +156,13 @@ restart:
src = &nhptr->saddr.s6_addr32[0];
dst = &nhptr->daddr.s6_addr32[0];
protocol = nhptr->nexthdr;
- xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr);
+ xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr);
#else
src = &nhptr->saddr;
dst = &nhptr->daddr;
protocol = nhptr->protocol;
- xprt = ((u8*)nhptr) + (nhptr->ihl<<2);
- if (nhptr->frag_off & htons(IP_MF|IP_OFFSET))
+ xprt = ((u8 *)nhptr) + (nhptr->ihl<<2);
+ if (ip_is_fragment(nhptr))
return -1;
#endif
@@ -176,10 +170,10 @@ restart:
h2 = hash_src(src);
for (s = sht[h1]; s; s = s->next) {
- if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
+ if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] &&
protocol == s->protocol &&
!(s->dpi.mask &
- (*(u32*)(xprt+s->dpi.offset)^s->dpi.key)) &&
+ (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) &&
#if RSVP_DST_LEN == 4
dst[0] == s->dst[0] &&
dst[1] == s->dst[1] &&
@@ -188,8 +182,8 @@ restart:
tunnelid == s->tunnelid) {
for (f = s->ht[h2]; f; f = f->next) {
- if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] &&
- !(f->spi.mask & (*(u32*)(xprt+f->spi.offset)^f->spi.key))
+ if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] &&
+ !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key))
#if RSVP_DST_LEN == 4
&&
src[0] == f->src[0] &&
@@ -205,7 +199,7 @@ matched:
return 0;
tunnelid = f->res.classid;
- nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr));
+ nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr));
goto restart;
}
}
@@ -224,11 +218,11 @@ matched:
static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle)
{
- struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht;
+ struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht;
struct rsvp_session *s;
struct rsvp_filter *f;
- unsigned h1 = handle&0xFF;
- unsigned h2 = (handle>>8)&0xFF;
+ unsigned int h1 = handle & 0xFF;
+ unsigned int h2 = (handle >> 8) & 0xFF;
if (h2 > 16)
return 0;
@@ -258,7 +252,7 @@ static int rsvp_init(struct tcf_proto *tp)
return -ENOBUFS;
}
-static inline void
+static void
rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
{
tcf_unbind_filter(tp, &f->res);
@@ -277,13 +271,13 @@ static void rsvp_destroy(struct tcf_proto *tp)
sht = data->ht;
- for (h1=0; h1<256; h1++) {
+ for (h1 = 0; h1 < 256; h1++) {
struct rsvp_session *s;
while ((s = sht[h1]) != NULL) {
sht[h1] = s->next;
- for (h2=0; h2<=16; h2++) {
+ for (h2 = 0; h2 <= 16; h2++) {
struct rsvp_filter *f;
while ((f = s->ht[h2]) != NULL) {
@@ -299,13 +293,13 @@ static void rsvp_destroy(struct tcf_proto *tp)
static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg;
- unsigned h = f->handle;
+ struct rsvp_filter **fp, *f = (struct rsvp_filter *)arg;
+ unsigned int h = f->handle;
struct rsvp_session **sp;
struct rsvp_session *s = f->sess;
int i;
- for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) {
+ for (fp = &s->ht[(h >> 8) & 0xFF]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
tcf_tree_lock(tp);
*fp = f->next;
@@ -314,12 +308,12 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
/* Strip tree */
- for (i=0; i<=16; i++)
+ for (i = 0; i <= 16; i++)
if (s->ht[i])
return 0;
/* OK, session has no flows */
- for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF];
+ for (sp = &((struct rsvp_head *)tp->root)->ht[h & 0xFF];
*sp; sp = &(*sp)->next) {
if (*sp == s) {
tcf_tree_lock(tp);
@@ -337,13 +331,14 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
return 0;
}
-static unsigned gen_handle(struct tcf_proto *tp, unsigned salt)
+static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt)
{
struct rsvp_head *data = tp->root;
int i = 0xFFFF;
while (i-- > 0) {
u32 h;
+
if ((data->hgenerator += 0x10000) == 0)
data->hgenerator = 0x10000;
h = data->hgenerator|salt;
@@ -355,10 +350,10 @@ static unsigned gen_handle(struct tcf_proto *tp, unsigned salt)
static int tunnel_bts(struct rsvp_head *data)
{
- int n = data->tgenerator>>5;
- u32 b = 1<<(data->tgenerator&0x1F);
+ int n = data->tgenerator >> 5;
+ u32 b = 1 << (data->tgenerator & 0x1F);
- if (data->tmap[n]&b)
+ if (data->tmap[n] & b)
return 0;
data->tmap[n] |= b;
return 1;
@@ -372,10 +367,10 @@ static void tunnel_recycle(struct rsvp_head *data)
memset(tmap, 0, sizeof(tmap));
- for (h1=0; h1<256; h1++) {
+ for (h1 = 0; h1 < 256; h1++) {
struct rsvp_session *s;
for (s = sht[h1]; s; s = s->next) {
- for (h2=0; h2<=16; h2++) {
+ for (h2 = 0; h2 <= 16; h2++) {
struct rsvp_filter *f;
for (f = s->ht[h2]; f; f = f->next) {
@@ -395,8 +390,8 @@ static u32 gen_tunnel(struct rsvp_head *data)
{
int i, k;
- for (k=0; k<2; k++) {
- for (i=255; i>0; i--) {
+ for (k = 0; k < 2; k++) {
+ for (i = 255; i > 0; i--) {
if (++data->tgenerator == 0)
data->tgenerator = 1;
if (tunnel_bts(data))
@@ -416,19 +411,20 @@ static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = {
[TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) },
};
-static int rsvp_change(struct tcf_proto *tp, unsigned long base,
+static int rsvp_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
u32 handle,
struct nlattr **tca,
- unsigned long *arg)
+ unsigned long *arg, bool ovr)
{
struct rsvp_head *data = tp->root;
struct rsvp_filter *f, **fp;
struct rsvp_session *s, **sp;
struct tc_rsvp_pinfo *pinfo = NULL;
- struct nlattr *opt = tca[TCA_OPTIONS-1];
+ struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_RSVP_MAX + 1];
struct tcf_exts e;
- unsigned h1, h2;
+ unsigned int h1, h2;
__be32 *dst;
int err;
@@ -439,17 +435,19 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,
if (err < 0)
return err;
- err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &rsvp_ext_map);
+ tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
if (err < 0)
return err;
- if ((f = (struct rsvp_filter*)*arg) != NULL) {
+ f = (struct rsvp_filter *)*arg;
+ if (f) {
/* Node exists: adjust only classid */
if (f->handle != handle && handle)
goto errout2;
- if (tb[TCA_RSVP_CLASSID-1]) {
- f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID-1]);
+ if (tb[TCA_RSVP_CLASSID]) {
+ f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
tcf_bind_filter(tp, &f->res, base);
}
@@ -461,7 +459,7 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,
err = -EINVAL;
if (handle)
goto errout2;
- if (tb[TCA_RSVP_DST-1] == NULL)
+ if (tb[TCA_RSVP_DST] == NULL)
goto errout2;
err = -ENOBUFS;
@@ -469,20 +467,21 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,
if (f == NULL)
goto errout2;
+ tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
h2 = 16;
- if (tb[TCA_RSVP_SRC-1]) {
- memcpy(f->src, nla_data(tb[TCA_RSVP_SRC-1]), sizeof(f->src));
+ if (tb[TCA_RSVP_SRC]) {
+ memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src));
h2 = hash_src(f->src);
}
- if (tb[TCA_RSVP_PINFO-1]) {
- pinfo = nla_data(tb[TCA_RSVP_PINFO-1]);
+ if (tb[TCA_RSVP_PINFO]) {
+ pinfo = nla_data(tb[TCA_RSVP_PINFO]);
f->spi = pinfo->spi;
f->tunnelhdr = pinfo->tunnelhdr;
}
- if (tb[TCA_RSVP_CLASSID-1])
- f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID-1]);
+ if (tb[TCA_RSVP_CLASSID])
+ f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
- dst = nla_data(tb[TCA_RSVP_DST-1]);
+ dst = nla_data(tb[TCA_RSVP_DST]);
h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0);
err = -ENOMEM;
@@ -500,7 +499,7 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,
goto errout;
}
- for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) {
+ for (sp = &data->ht[h1]; (s = *sp) != NULL; sp = &s->next) {
if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
pinfo && pinfo->protocol == s->protocol &&
memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 &&
@@ -523,7 +522,7 @@ insert:
tcf_exts_change(tp, &f->exts, &e);
for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next)
- if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask)
+ if (((*fp)->spi.mask & f->spi.mask) != f->spi.mask)
break;
f->next = *fp;
wmb();
@@ -567,7 +566,7 @@ errout2:
static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct rsvp_head *head = tp->root;
- unsigned h, h1;
+ unsigned int h, h1;
if (arg->stop)
return;
@@ -595,10 +594,10 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
}
}
-static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
+static int rsvp_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct rsvp_filter *f = (struct rsvp_filter*)fh;
+ struct rsvp_filter *f = (struct rsvp_filter *)fh;
struct rsvp_session *s;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
@@ -614,25 +613,29 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
if (nest == NULL)
goto nla_put_failure;
- NLA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst);
+ if (nla_put(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst))
+ goto nla_put_failure;
pinfo.dpi = s->dpi;
pinfo.spi = f->spi;
pinfo.protocol = s->protocol;
pinfo.tunnelid = s->tunnelid;
pinfo.tunnelhdr = f->tunnelhdr;
pinfo.pad = 0;
- NLA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);
- if (f->res.classid)
- NLA_PUT_U32(skb, TCA_RSVP_CLASSID, f->res.classid);
- if (((f->handle>>8)&0xFF) != 16)
- NLA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src);
+ if (nla_put(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo))
+ goto nla_put_failure;
+ if (f->res.classid &&
+ nla_put_u32(skb, TCA_RSVP_CLASSID, f->res.classid))
+ goto nla_put_failure;
+ if (((f->handle >> 8) & 0xFF) != 16 &&
+ nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src))
+ goto nla_put_failure;
- if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0)
+ if (tcf_exts_dump(skb, &f->exts) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
- if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0)
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
goto nla_put_failure;
return skb->len;
@@ -641,8 +644,7 @@ nla_put_failure:
return -1;
}
-static struct tcf_proto_ops RSVP_OPS = {
- .next = NULL,
+static struct tcf_proto_ops RSVP_OPS __read_mostly = {
.kind = RSVP_ID,
.classify = rsvp_classify,
.init = rsvp_init,
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 20ef330bb91..c721cd4a469 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -24,9 +24,6 @@
#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */
-#define PRIV(tp) ((struct tcindex_data *) (tp)->root)
-
-
struct tcindex_filter_result {
struct tcf_exts exts;
struct tcf_result res;
@@ -50,11 +47,6 @@ struct tcindex_data {
int fall_through; /* 0: only classify if explicit match */
};
-static const struct tcf_ext_map tcindex_ext_map = {
- .police = TCA_TCINDEX_POLICE,
- .action = TCA_TCINDEX_ACT
-};
-
static inline int
tcindex_filter_is_set(struct tcindex_filter_result *r)
{
@@ -79,10 +71,10 @@ tcindex_lookup(struct tcindex_data *p, u16 key)
}
-static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp,
+static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
- struct tcindex_data *p = PRIV(tp);
+ struct tcindex_data *p = tp->root;
struct tcindex_filter_result *f;
int key = (skb->tc_index & p->mask) >> p->shift;
@@ -107,7 +99,7 @@ static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp,
static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle)
{
- struct tcindex_data *p = PRIV(tp);
+ struct tcindex_data *p = tp->root;
struct tcindex_filter_result *r;
pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle);
@@ -145,7 +137,7 @@ static int tcindex_init(struct tcf_proto *tp)
static int
__tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock)
{
- struct tcindex_data *p = PRIV(tp);
+ struct tcindex_data *p = tp->root;
struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg;
struct tcindex_filter *f = NULL;
@@ -196,10 +188,17 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
[TCA_TCINDEX_CLASSID] = { .type = NLA_U32 },
};
+static void tcindex_filter_result_init(struct tcindex_filter_result *r)
+{
+ memset(r, 0, sizeof(*r));
+ tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
+}
+
static int
-tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,
- struct tcindex_data *p, struct tcindex_filter_result *r,
- struct nlattr **tb, struct nlattr *est)
+tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
+ u32 handle, struct tcindex_data *p,
+ struct tcindex_filter_result *r, struct nlattr **tb,
+ struct nlattr *est, bool ovr)
{
int err, balloc = 0;
struct tcindex_filter_result new_filter_result, *old_r = r;
@@ -208,17 +207,17 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,
struct tcindex_filter *f = NULL; /* make gcc behave */
struct tcf_exts e;
- err = tcf_exts_validate(tp, tb, est, &e, &tcindex_ext_map);
+ tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
+ err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
if (err < 0)
return err;
memcpy(&cp, p, sizeof(cp));
- memset(&new_filter_result, 0, sizeof(new_filter_result));
+ tcindex_filter_result_init(&new_filter_result);
+ tcindex_filter_result_init(&cr);
if (old_r)
- memcpy(&cr, r, sizeof(cr));
- else
- memset(&cr, 0, sizeof(cr));
+ cr.res = r->res;
if (tb[TCA_TCINDEX_HASH])
cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
@@ -249,7 +248,7 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,
* of the hashing index is below the threshold.
*/
if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD)
- cp.hash = (cp.mask >> cp.shift)+1;
+ cp.hash = (cp.mask >> cp.shift) + 1;
else
cp.hash = DEFAULT_HASH_SIZE;
}
@@ -270,9 +269,14 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,
err = -ENOMEM;
if (!cp.perfect && !cp.h) {
if (valid_perfect_hash(&cp)) {
+ int i;
+
cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL);
if (!cp.perfect)
goto errout;
+ for (i = 0; i < cp.hash; i++)
+ tcf_exts_init(&cp.perfect[i].exts, TCA_TCINDEX_ACT,
+ TCA_TCINDEX_POLICE);
balloc = 1;
} else {
cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL);
@@ -298,14 +302,17 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,
tcf_bind_filter(tp, &cr.res, base);
}
- tcf_exts_change(tp, &cr.exts, &e);
+ if (old_r)
+ tcf_exts_change(tp, &r->exts, &e);
+ else
+ tcf_exts_change(tp, &cr.exts, &e);
tcf_tree_lock(tp);
if (old_r && old_r != r)
- memset(old_r, 0, sizeof(*old_r));
+ tcindex_filter_result_init(old_r);
memcpy(p, &cp, sizeof(cp));
- memcpy(r, &cr, sizeof(cr));
+ r->res = cr.res;
if (r == &new_filter_result) {
struct tcindex_filter **fp;
@@ -332,12 +339,13 @@ errout:
}
static int
-tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle,
- struct nlattr **tca, unsigned long *arg)
+tcindex_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base, u32 handle,
+ struct nlattr **tca, unsigned long *arg, bool ovr)
{
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_TCINDEX_MAX + 1];
- struct tcindex_data *p = PRIV(tp);
+ struct tcindex_data *p = tp->root;
struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg;
int err;
@@ -352,13 +360,14 @@ tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle,
if (err < 0)
return err;
- return tcindex_set_parms(tp, base, handle, p, r, tb, tca[TCA_RATE]);
+ return tcindex_set_parms(net, tp, base, handle, p, r, tb,
+ tca[TCA_RATE], ovr);
}
static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
{
- struct tcindex_data *p = PRIV(tp);
+ struct tcindex_data *p = tp->root;
struct tcindex_filter *f, *next;
int i;
@@ -405,7 +414,7 @@ static int tcindex_destroy_element(struct tcf_proto *tp,
static void tcindex_destroy(struct tcf_proto *tp)
{
- struct tcindex_data *p = PRIV(tp);
+ struct tcindex_data *p = tp->root;
struct tcf_walker walker;
pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p);
@@ -420,10 +429,10 @@ static void tcindex_destroy(struct tcf_proto *tp)
}
-static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,
+static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct tcindex_data *p = PRIV(tp);
+ struct tcindex_data *p = tp->root;
struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
@@ -438,10 +447,11 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,
if (!fh) {
t->tcm_handle = ~0; /* whatever ... */
- NLA_PUT_U32(skb, TCA_TCINDEX_HASH, p->hash);
- NLA_PUT_U16(skb, TCA_TCINDEX_MASK, p->mask);
- NLA_PUT_U32(skb, TCA_TCINDEX_SHIFT, p->shift);
- NLA_PUT_U32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through);
+ if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) ||
+ nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) ||
+ nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) ||
+ nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through))
+ goto nla_put_failure;
nla_nest_end(skb, nest);
} else {
if (p->perfect) {
@@ -460,14 +470,15 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,
}
}
pr_debug("handle = %d\n", t->tcm_handle);
- if (r->res.class)
- NLA_PUT_U32(skb, TCA_TCINDEX_CLASSID, r->res.classid);
+ if (r->res.class &&
+ nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid))
+ goto nla_put_failure;
- if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0)
+ if (tcf_exts_dump(skb, &r->exts) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
- if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0)
+ if (tcf_exts_dump_stats(skb, &r->exts) < 0)
goto nla_put_failure;
}
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index b0c2a82178a..70c0be8d012 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -38,18 +38,18 @@
#include <linux/errno.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
+#include <linux/bitmap.h>
#include <net/netlink.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
-struct tc_u_knode
-{
+struct tc_u_knode {
struct tc_u_knode *next;
u32 handle;
struct tc_u_hnode *ht_up;
struct tcf_exts exts;
#ifdef CONFIG_NET_CLS_IND
- char indev[IFNAMSIZ];
+ int ifindex;
#endif
u8 fshift;
struct tcf_result res;
@@ -63,45 +63,40 @@ struct tc_u_knode
struct tc_u32_sel sel;
};
-struct tc_u_hnode
-{
+struct tc_u_hnode {
struct tc_u_hnode *next;
u32 handle;
u32 prio;
struct tc_u_common *tp_c;
int refcnt;
- unsigned divisor;
+ unsigned int divisor;
struct tc_u_knode *ht[1];
};
-struct tc_u_common
-{
+struct tc_u_common {
struct tc_u_hnode *hlist;
struct Qdisc *q;
int refcnt;
u32 hgenerator;
};
-static const struct tcf_ext_map u32_ext_map = {
- .action = TCA_U32_ACT,
- .police = TCA_U32_POLICE
-};
-
-static __inline__ unsigned u32_hash_fold(__be32 key, struct tc_u32_sel *sel, u8 fshift)
+static inline unsigned int u32_hash_fold(__be32 key,
+ const struct tc_u32_sel *sel,
+ u8 fshift)
{
- unsigned h = ntohl(key & sel->hmask)>>fshift;
+ unsigned int h = ntohl(key & sel->hmask) >> fshift;
return h;
}
-static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res)
+static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res)
{
struct {
struct tc_u_knode *knode;
unsigned int off;
} stack[TC_U32_MAXDEPTH];
- struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root;
+ struct tc_u_hnode *ht = tp->root;
unsigned int off = skb_network_offset(skb);
struct tc_u_knode *n;
int sdepth = 0;
@@ -120,7 +115,7 @@ next_knode:
struct tc_u32_key *key = n->sel.keys;
#ifdef CONFIG_CLS_U32_PERF
- n->pf->rcnt +=1;
+ n->pf->rcnt += 1;
j = 0;
#endif
@@ -133,14 +128,14 @@ next_knode:
}
#endif
- for (i = n->sel.nkeys; i>0; i--, key++) {
+ for (i = n->sel.nkeys; i > 0; i--, key++) {
int toff = off + key->off + (off2 & key->offmask);
- __be32 *data, _data;
+ __be32 *data, hdata;
if (skb_headroom(skb) + toff > INT_MAX)
goto out;
- data = skb_header_pointer(skb, toff, 4, &_data);
+ data = skb_header_pointer(skb, toff, 4, &hdata);
if (!data)
goto out;
if ((*data ^ key->val) & key->mask) {
@@ -148,23 +143,23 @@ next_knode:
goto next_knode;
}
#ifdef CONFIG_CLS_U32_PERF
- n->pf->kcnts[j] +=1;
+ n->pf->kcnts[j] += 1;
j++;
#endif
}
if (n->ht_down == NULL) {
check_terminal:
- if (n->sel.flags&TC_U32_TERMINAL) {
+ if (n->sel.flags & TC_U32_TERMINAL) {
*res = n->res;
#ifdef CONFIG_NET_CLS_IND
- if (!tcf_match_indev(skb, n->indev)) {
+ if (!tcf_match_indev(skb, n->ifindex)) {
n = n->next;
goto next_knode;
}
#endif
#ifdef CONFIG_CLS_U32_PERF
- n->pf->rhit +=1;
+ n->pf->rhit += 1;
#endif
r = tcf_exts_exec(skb, &n->exts, res);
if (r < 0) {
@@ -188,26 +183,26 @@ check_terminal:
ht = n->ht_down;
sel = 0;
if (ht->divisor) {
- __be32 *data, _data;
+ __be32 *data, hdata;
data = skb_header_pointer(skb, off + n->sel.hoff, 4,
- &_data);
+ &hdata);
if (!data)
goto out;
sel = ht->divisor & u32_hash_fold(*data, &n->sel,
n->fshift);
}
- if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT)))
+ if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT)))
goto next_ht;
- if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) {
+ if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) {
off2 = n->sel.off + 3;
if (n->sel.flags & TC_U32_VAROFFSET) {
- __be16 *data, _data;
+ __be16 *data, hdata;
data = skb_header_pointer(skb,
off + n->sel.offoff,
- 2, &_data);
+ 2, &hdata);
if (!data)
goto out;
off2 += ntohs(n->sel.offmask & *data) >>
@@ -215,7 +210,7 @@ check_terminal:
}
off2 &= ~3;
}
- if (n->sel.flags&TC_U32_EAT) {
+ if (n->sel.flags & TC_U32_EAT) {
off += off2;
off2 = 0;
}
@@ -235,12 +230,11 @@ out:
return -1;
deadloop:
- if (net_ratelimit())
- printk(KERN_WARNING "cls_u32: dead loop\n");
+ net_warn_ratelimited("cls_u32: dead loop\n");
return -1;
}
-static __inline__ struct tc_u_hnode *
+static struct tc_u_hnode *
u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
{
struct tc_u_hnode *ht;
@@ -252,10 +246,10 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
return ht;
}
-static __inline__ struct tc_u_knode *
+static struct tc_u_knode *
u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
{
- unsigned sel;
+ unsigned int sel;
struct tc_u_knode *n = NULL;
sel = TC_U32_HASH(handle);
@@ -300,7 +294,7 @@ static u32 gen_new_htid(struct tc_u_common *tp_c)
do {
if (++tp_c->hgenerator == 0x7FF)
tp_c->hgenerator = 1;
- } while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
+ } while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
}
@@ -354,7 +348,7 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n)
return 0;
}
-static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key)
+static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
{
struct tc_u_knode **kp;
struct tc_u_hnode *ht = key->ht_up;
@@ -378,9 +372,9 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key)
static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
{
struct tc_u_knode *n;
- unsigned h;
+ unsigned int h;
- for (h=0; h<=ht->divisor; h++) {
+ for (h = 0; h <= ht->divisor; h++) {
while ((n = ht->ht[h]) != NULL) {
ht->ht[h] = n->next;
@@ -446,13 +440,13 @@ static void u32_destroy(struct tcf_proto *tp)
static int u32_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct tc_u_hnode *ht = (struct tc_u_hnode*)arg;
+ struct tc_u_hnode *ht = (struct tc_u_hnode *)arg;
if (ht == NULL)
return 0;
if (TC_U32_KEY(ht->handle))
- return u32_delete_key(tp, (struct tc_u_knode*)ht);
+ return u32_delete_key(tp, (struct tc_u_knode *)ht);
if (tp->root == ht)
return -EINVAL;
@@ -467,17 +461,25 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg)
return 0;
}
+#define NR_U32_NODE (1<<12)
static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
{
struct tc_u_knode *n;
- unsigned i = 0x7FF;
+ unsigned long i;
+ unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long),
+ GFP_KERNEL);
+ if (!bitmap)
+ return handle | 0xFFF;
+
+ for (n = ht->ht[TC_U32_HASH(handle)]; n; n = n->next)
+ set_bit(TC_U32_NODE(n->handle), bitmap);
- for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next)
- if (i < TC_U32_NODE(n->handle))
- i = TC_U32_NODE(n->handle);
- i++;
+ i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800);
+ if (i >= NR_U32_NODE)
+ i = find_next_zero_bit(bitmap, NR_U32_NODE, 1);
- return handle|(i>0xFFF ? 0xFFF : i);
+ kfree(bitmap);
+ return handle | (i >= NR_U32_NODE ? 0xFFF : i);
}
static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
@@ -490,15 +492,16 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
[TCA_U32_MARK] = { .len = sizeof(struct tc_u32_mark) },
};
-static int u32_set_parms(struct tcf_proto *tp, unsigned long base,
- struct tc_u_hnode *ht,
+static int u32_set_parms(struct net *net, struct tcf_proto *tp,
+ unsigned long base, struct tc_u_hnode *ht,
struct tc_u_knode *n, struct nlattr **tb,
- struct nlattr *est)
+ struct nlattr *est, bool ovr)
{
int err;
struct tcf_exts e;
- err = tcf_exts_validate(tp, tb, est, &e, &u32_ext_map);
+ tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE);
+ err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
if (err < 0)
return err;
@@ -533,9 +536,11 @@ static int u32_set_parms(struct tcf_proto *tp, unsigned long base,
#ifdef CONFIG_NET_CLS_IND
if (tb[TCA_U32_INDEV]) {
- err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV]);
- if (err < 0)
+ int ret;
+ ret = tcf_change_indev(net, tb[TCA_U32_INDEV]);
+ if (ret < 0)
goto errout;
+ n->ifindex = ret;
}
#endif
tcf_exts_change(tp, &n->exts, &e);
@@ -546,9 +551,10 @@ errout:
return err;
}
-static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
+static int u32_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base, u32 handle,
struct nlattr **tca,
- unsigned long *arg)
+ unsigned long *arg, bool ovr)
{
struct tc_u_common *tp_c = tp->data;
struct tc_u_hnode *ht;
@@ -566,15 +572,17 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
if (err < 0)
return err;
- if ((n = (struct tc_u_knode*)*arg) != NULL) {
+ n = (struct tc_u_knode *)*arg;
+ if (n) {
if (TC_U32_KEY(n->handle) == 0)
return -EINVAL;
- return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE]);
+ return u32_set_parms(net, tp, base, n->ht_up, n, tb,
+ tca[TCA_RATE], ovr);
}
if (tb[TCA_U32_DIVISOR]) {
- unsigned divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
+ unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
if (--divisor > 0x100)
return -EINVAL;
@@ -585,7 +593,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
if (handle == 0)
return -ENOMEM;
}
- ht = kzalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL);
+ ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
if (ht == NULL)
return -ENOBUFS;
ht->tp_c = tp_c;
@@ -645,6 +653,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
n->ht_up = ht;
n->handle = handle;
n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
+ tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
#ifdef CONFIG_CLS_U32_MARK
if (tb[TCA_U32_MARK]) {
@@ -656,7 +665,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
}
#endif
- err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE]);
+ err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr);
if (err == 0) {
struct tc_u_knode **ins;
for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next)
@@ -683,7 +692,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
struct tc_u_common *tp_c = tp->data;
struct tc_u_hnode *ht;
struct tc_u_knode *n;
- unsigned h;
+ unsigned int h;
if (arg->stop)
return;
@@ -714,10 +723,10 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
}
}
-static int u32_dump(struct tcf_proto *tp, unsigned long fh,
+static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct tc_u_knode *n = (struct tc_u_knode*)fh;
+ struct tc_u_knode *n = (struct tc_u_knode *)fh;
struct nlattr *nest;
if (n == NULL)
@@ -730,45 +739,57 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh,
goto nla_put_failure;
if (TC_U32_KEY(n->handle) == 0) {
- struct tc_u_hnode *ht = (struct tc_u_hnode*)fh;
- u32 divisor = ht->divisor+1;
- NLA_PUT_U32(skb, TCA_U32_DIVISOR, divisor);
+ struct tc_u_hnode *ht = (struct tc_u_hnode *)fh;
+ u32 divisor = ht->divisor + 1;
+
+ if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor))
+ goto nla_put_failure;
} else {
- NLA_PUT(skb, TCA_U32_SEL,
- sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
- &n->sel);
+ if (nla_put(skb, TCA_U32_SEL,
+ sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
+ &n->sel))
+ goto nla_put_failure;
if (n->ht_up) {
u32 htid = n->handle & 0xFFFFF000;
- NLA_PUT_U32(skb, TCA_U32_HASH, htid);
+ if (nla_put_u32(skb, TCA_U32_HASH, htid))
+ goto nla_put_failure;
}
- if (n->res.classid)
- NLA_PUT_U32(skb, TCA_U32_CLASSID, n->res.classid);
- if (n->ht_down)
- NLA_PUT_U32(skb, TCA_U32_LINK, n->ht_down->handle);
+ if (n->res.classid &&
+ nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid))
+ goto nla_put_failure;
+ if (n->ht_down &&
+ nla_put_u32(skb, TCA_U32_LINK, n->ht_down->handle))
+ goto nla_put_failure;
#ifdef CONFIG_CLS_U32_MARK
- if (n->mark.val || n->mark.mask)
- NLA_PUT(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark);
+ if ((n->mark.val || n->mark.mask) &&
+ nla_put(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark))
+ goto nla_put_failure;
#endif
- if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0)
+ if (tcf_exts_dump(skb, &n->exts) < 0)
goto nla_put_failure;
#ifdef CONFIG_NET_CLS_IND
- if(strlen(n->indev))
- NLA_PUT_STRING(skb, TCA_U32_INDEV, n->indev);
+ if (n->ifindex) {
+ struct net_device *dev;
+ dev = __dev_get_by_index(net, n->ifindex);
+ if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name))
+ goto nla_put_failure;
+ }
#endif
#ifdef CONFIG_CLS_U32_PERF
- NLA_PUT(skb, TCA_U32_PCNT,
- sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64),
- n->pf);
+ if (nla_put(skb, TCA_U32_PCNT,
+ sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64),
+ n->pf))
+ goto nla_put_failure;
#endif
}
nla_nest_end(skb, nest);
if (TC_U32_KEY(n->handle))
- if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0)
+ if (tcf_exts_dump_stats(skb, &n->exts) < 0)
goto nla_put_failure;
return skb->len;
diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c
new file mode 100644
index 00000000000..bfd34e4c1af
--- /dev/null
+++ b/net/sched/em_canid.c
@@ -0,0 +1,240 @@
+/*
+ * em_canid.c Ematch rule to match CAN frames according to their CAN IDs
+ *
+ * This program is free software; you can distribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Idea: Oliver Hartkopp <oliver.hartkopp@volkswagen.de>
+ * Copyright: (c) 2011 Czech Technical University in Prague
+ * (c) 2011 Volkswagen Group Research
+ * Authors: Michal Sojka <sojkam1@fel.cvut.cz>
+ * Pavel Pisa <pisa@cmp.felk.cvut.cz>
+ * Rostislav Lisovy <lisovy@gmail.cz>
+ * Funded by: Volkswagen Group Research
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
+#include <linux/can.h>
+
+#define EM_CAN_RULES_MAX 500
+
+struct canid_match {
+ /* For each SFF CAN ID (11 bit) there is one record in this bitfield */
+ DECLARE_BITMAP(match_sff, (1 << CAN_SFF_ID_BITS));
+
+ int rules_count;
+ int sff_rules_count;
+ int eff_rules_count;
+
+ /*
+ * Raw rules copied from netlink message; Used for sending
+ * information to userspace (when 'tc filter show' is invoked)
+ * AND when matching EFF frames
+ */
+ struct can_filter rules_raw[];
+};
+
+/**
+ * em_canid_get_id() - Extracts Can ID out of the sk_buff structure.
+ */
+static canid_t em_canid_get_id(struct sk_buff *skb)
+{
+ /* CAN ID is stored within the data field */
+ struct can_frame *cf = (struct can_frame *)skb->data;
+
+ return cf->can_id;
+}
+
+static void em_canid_sff_match_add(struct canid_match *cm, u32 can_id,
+ u32 can_mask)
+{
+ int i;
+
+ /*
+ * Limit can_mask and can_id to SFF range to
+ * protect against write after end of array
+ */
+ can_mask &= CAN_SFF_MASK;
+ can_id &= can_mask;
+
+ /* Single frame */
+ if (can_mask == CAN_SFF_MASK) {
+ set_bit(can_id, cm->match_sff);
+ return;
+ }
+
+ /* All frames */
+ if (can_mask == 0) {
+ bitmap_fill(cm->match_sff, (1 << CAN_SFF_ID_BITS));
+ return;
+ }
+
+ /*
+ * Individual frame filter.
+ * Add record (set bit to 1) for each ID that
+ * conforms particular rule
+ */
+ for (i = 0; i < (1 << CAN_SFF_ID_BITS); i++) {
+ if ((i & can_mask) == can_id)
+ set_bit(i, cm->match_sff);
+ }
+}
+
+static inline struct canid_match *em_canid_priv(struct tcf_ematch *m)
+{
+ return (struct canid_match *)m->data;
+}
+
+static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m,
+ struct tcf_pkt_info *info)
+{
+ struct canid_match *cm = em_canid_priv(m);
+ canid_t can_id;
+ int match = 0;
+ int i;
+ const struct can_filter *lp;
+
+ can_id = em_canid_get_id(skb);
+
+ if (can_id & CAN_EFF_FLAG) {
+ for (i = 0, lp = cm->rules_raw;
+ i < cm->eff_rules_count; i++, lp++) {
+ if (!(((lp->can_id ^ can_id) & lp->can_mask))) {
+ match = 1;
+ break;
+ }
+ }
+ } else { /* SFF */
+ can_id &= CAN_SFF_MASK;
+ match = (test_bit(can_id, cm->match_sff) ? 1 : 0);
+ }
+
+ return match;
+}
+
+static int em_canid_change(struct tcf_proto *tp, void *data, int len,
+ struct tcf_ematch *m)
+{
+ struct can_filter *conf = data; /* Array with rules */
+ struct canid_match *cm;
+ struct canid_match *cm_old = (struct canid_match *)m->data;
+ int i;
+
+ if (!len)
+ return -EINVAL;
+
+ if (len % sizeof(struct can_filter))
+ return -EINVAL;
+
+ if (len > sizeof(struct can_filter) * EM_CAN_RULES_MAX)
+ return -EINVAL;
+
+ cm = kzalloc(sizeof(struct canid_match) + len, GFP_KERNEL);
+ if (!cm)
+ return -ENOMEM;
+
+ cm->rules_count = len / sizeof(struct can_filter);
+
+ /*
+ * We need two for() loops for copying rules into two contiguous
+ * areas in rules_raw to process all eff rules with a simple loop.
+ * NB: The configuration interface supports sff and eff rules.
+ * We do not support filters here that match for the same can_id
+ * provided in a SFF and EFF frame (e.g. 0x123 / 0x80000123).
+ * For this (unusual case) two filters have to be specified. The
+ * SFF/EFF separation is done with the CAN_EFF_FLAG in the can_id.
+ */
+
+ /* Fill rules_raw with EFF rules first */
+ for (i = 0; i < cm->rules_count; i++) {
+ if (conf[i].can_id & CAN_EFF_FLAG) {
+ memcpy(cm->rules_raw + cm->eff_rules_count,
+ &conf[i],
+ sizeof(struct can_filter));
+
+ cm->eff_rules_count++;
+ }
+ }
+
+ /* append SFF frame rules */
+ for (i = 0; i < cm->rules_count; i++) {
+ if (!(conf[i].can_id & CAN_EFF_FLAG)) {
+ memcpy(cm->rules_raw
+ + cm->eff_rules_count
+ + cm->sff_rules_count,
+ &conf[i], sizeof(struct can_filter));
+
+ cm->sff_rules_count++;
+
+ em_canid_sff_match_add(cm,
+ conf[i].can_id, conf[i].can_mask);
+ }
+ }
+
+ m->datalen = sizeof(struct canid_match) + len;
+ m->data = (unsigned long)cm;
+
+ if (cm_old != NULL) {
+ pr_err("canid: Configuring an existing ematch!\n");
+ kfree(cm_old);
+ }
+
+ return 0;
+}
+
+static void em_canid_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
+{
+ struct canid_match *cm = em_canid_priv(m);
+
+ kfree(cm);
+}
+
+static int em_canid_dump(struct sk_buff *skb, struct tcf_ematch *m)
+{
+ struct canid_match *cm = em_canid_priv(m);
+
+ /*
+ * When configuring this ematch 'rules_count' is set not to exceed
+ * 'rules_raw' array size
+ */
+ if (nla_put_nohdr(skb, sizeof(struct can_filter) * cm->rules_count,
+ &cm->rules_raw) < 0)
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static struct tcf_ematch_ops em_canid_ops = {
+ .kind = TCF_EM_CANID,
+ .change = em_canid_change,
+ .match = em_canid_match,
+ .destroy = em_canid_destroy,
+ .dump = em_canid_dump,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_canid_ops.link)
+};
+
+static int __init init_em_canid(void)
+{
+ return tcf_em_register(&em_canid_ops);
+}
+
+static void __exit exit_em_canid(void)
+{
+ tcf_em_unregister(&em_canid_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_canid);
+module_exit(exit_em_canid);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_CANID);
diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c
index bc450397487..1c8360a2752 100644
--- a/net/sched/em_cmp.c
+++ b/net/sched/em_cmp.c
@@ -33,40 +33,41 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
return 0;
switch (cmp->align) {
- case TCF_EM_ALIGN_U8:
- val = *ptr;
- break;
+ case TCF_EM_ALIGN_U8:
+ val = *ptr;
+ break;
- case TCF_EM_ALIGN_U16:
- val = get_unaligned_be16(ptr);
+ case TCF_EM_ALIGN_U16:
+ val = get_unaligned_be16(ptr);
- if (cmp_needs_transformation(cmp))
- val = be16_to_cpu(val);
- break;
+ if (cmp_needs_transformation(cmp))
+ val = be16_to_cpu(val);
+ break;
- case TCF_EM_ALIGN_U32:
- /* Worth checking boundries? The branching seems
- * to get worse. Visit again. */
- val = get_unaligned_be32(ptr);
+ case TCF_EM_ALIGN_U32:
+ /* Worth checking boundries? The branching seems
+ * to get worse. Visit again.
+ */
+ val = get_unaligned_be32(ptr);
- if (cmp_needs_transformation(cmp))
- val = be32_to_cpu(val);
- break;
+ if (cmp_needs_transformation(cmp))
+ val = be32_to_cpu(val);
+ break;
- default:
- return 0;
+ default:
+ return 0;
}
if (cmp->mask)
val &= cmp->mask;
switch (cmp->opnd) {
- case TCF_EM_OPND_EQ:
- return val == cmp->val;
- case TCF_EM_OPND_LT:
- return val < cmp->val;
- case TCF_EM_OPND_GT:
- return val > cmp->val;
+ case TCF_EM_OPND_EQ:
+ return val == cmp->val;
+ case TCF_EM_OPND_LT:
+ return val < cmp->val;
+ case TCF_EM_OPND_GT:
+ return val > cmp->val;
}
return 0;
diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c
new file mode 100644
index 00000000000..527aeb7a3ff
--- /dev/null
+++ b/net/sched/em_ipset.c
@@ -0,0 +1,136 @@
+/*
+ * net/sched/em_ipset.c ipset ematch
+ *
+ * Copyright (c) 2012 Florian Westphal <fw@strlen.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/xt_set.h>
+#include <linux/ipv6.h>
+#include <net/ip.h>
+#include <net/pkt_cls.h>
+
+static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len,
+ struct tcf_ematch *em)
+{
+ struct xt_set_info *set = data;
+ ip_set_id_t index;
+ struct net *net = dev_net(qdisc_dev(tp->q));
+
+ if (data_len != sizeof(*set))
+ return -EINVAL;
+
+ index = ip_set_nfnl_get_byindex(net, set->index);
+ if (index == IPSET_INVALID_ID)
+ return -ENOENT;
+
+ em->datalen = sizeof(*set);
+ em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL);
+ if (em->data)
+ return 0;
+
+ ip_set_nfnl_put(net, index);
+ return -ENOMEM;
+}
+
+static void em_ipset_destroy(struct tcf_proto *p, struct tcf_ematch *em)
+{
+ const struct xt_set_info *set = (const void *) em->data;
+ if (set) {
+ ip_set_nfnl_put(dev_net(qdisc_dev(p->q)), set->index);
+ kfree((void *) em->data);
+ }
+}
+
+static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
+ struct tcf_pkt_info *info)
+{
+ struct ip_set_adt_opt opt;
+ struct xt_action_param acpar;
+ const struct xt_set_info *set = (const void *) em->data;
+ struct net_device *dev, *indev = NULL;
+ int ret, network_offset;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ acpar.family = NFPROTO_IPV4;
+ if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
+ return 0;
+ acpar.thoff = ip_hdrlen(skb);
+ break;
+ case htons(ETH_P_IPV6):
+ acpar.family = NFPROTO_IPV6;
+ if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
+ return 0;
+ /* doesn't call ipv6_find_hdr() because ipset doesn't use thoff, yet */
+ acpar.thoff = sizeof(struct ipv6hdr);
+ break;
+ default:
+ return 0;
+ }
+
+ acpar.hooknum = 0;
+
+ opt.family = acpar.family;
+ opt.dim = set->dim;
+ opt.flags = set->flags;
+ opt.cmdflags = 0;
+ opt.ext.timeout = ~0u;
+
+ network_offset = skb_network_offset(skb);
+ skb_pull(skb, network_offset);
+
+ dev = skb->dev;
+
+ rcu_read_lock();
+
+ if (dev && skb->skb_iif)
+ indev = dev_get_by_index_rcu(dev_net(dev), skb->skb_iif);
+
+ acpar.in = indev ? indev : dev;
+ acpar.out = dev;
+
+ ret = ip_set_test(set->index, skb, &acpar, &opt);
+
+ rcu_read_unlock();
+
+ skb_push(skb, network_offset);
+ return ret;
+}
+
+static struct tcf_ematch_ops em_ipset_ops = {
+ .kind = TCF_EM_IPSET,
+ .change = em_ipset_change,
+ .destroy = em_ipset_destroy,
+ .match = em_ipset_match,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_ipset_ops.link)
+};
+
+static int __init init_em_ipset(void)
+{
+ return tcf_em_register(&em_ipset_ops);
+}
+
+static void __exit exit_em_ipset(void)
+{
+ tcf_em_unregister(&em_ipset_ops);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("TC extended match for IP sets");
+
+module_init(init_em_ipset);
+module_exit(exit_em_ipset);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_IPSET);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 34da5e29ea1..9b8c0b0e60d 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -47,7 +47,7 @@
* on the meta type. Obviously, the length of the data must also
* be provided for non-numeric types.
*
- * Additionaly, type dependant modifiers such as shift operators
+ * Additionally, type dependent modifiers such as shift operators
* or mask may be applied to extend the functionaliy. As of now,
* the variable length type supports shifting the byte string to
* the right, eating up any number of octets and thus supporting
@@ -73,21 +73,18 @@
#include <net/pkt_cls.h>
#include <net/sock.h>
-struct meta_obj
-{
+struct meta_obj {
unsigned long value;
unsigned int len;
};
-struct meta_value
-{
+struct meta_value {
struct tcf_meta_val hdr;
unsigned long val;
unsigned int len;
};
-struct meta_match
-{
+struct meta_match {
struct meta_value lvalue;
struct meta_value rvalue;
};
@@ -225,7 +222,7 @@ META_COLLECTOR(int_maclen)
META_COLLECTOR(int_rxhash)
{
- dst->value = skb_get_rxhash(skb);
+ dst->value = skb_get_hash(skb);
}
/**************************************************************************
@@ -255,7 +252,7 @@ META_COLLECTOR(int_rtclassid)
if (unlikely(skb_dst(skb) == NULL))
*err = -1;
else
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
dst->value = skb_dst(skb)->tclassid;
#else
dst->value = 0;
@@ -267,47 +264,59 @@ META_COLLECTOR(int_rtiif)
if (unlikely(skb_rtable(skb) == NULL))
*err = -1;
else
- dst->value = skb_rtable(skb)->fl.iif;
+ dst->value = inet_iif(skb);
}
/**************************************************************************
* Socket Attributes
**************************************************************************/
-#define SKIP_NONLOCAL(skb) \
- if (unlikely(skb->sk == NULL)) { \
- *err = -1; \
- return; \
- }
+#define skip_nonlocal(skb) \
+ (unlikely(skb->sk == NULL))
META_COLLECTOR(int_sk_family)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_family;
}
META_COLLECTOR(int_sk_state)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_state;
}
META_COLLECTOR(int_sk_reuse)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_reuse;
}
META_COLLECTOR(int_sk_bound_if)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
/* No error if bound_dev_if is 0, legal userspace check */
dst->value = skb->sk->sk_bound_dev_if;
}
META_COLLECTOR(var_sk_bound_if)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
if (skb->sk->sk_bound_dev_if == 0) {
dst->value = (unsigned long) "any";
@@ -325,157 +334,226 @@ META_COLLECTOR(var_sk_bound_if)
META_COLLECTOR(int_sk_refcnt)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = atomic_read(&skb->sk->sk_refcnt);
}
META_COLLECTOR(int_sk_rcvbuf)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_rcvbuf;
}
META_COLLECTOR(int_sk_shutdown)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_shutdown;
}
META_COLLECTOR(int_sk_proto)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_protocol;
}
META_COLLECTOR(int_sk_type)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_type;
}
META_COLLECTOR(int_sk_rmem_alloc)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = sk_rmem_alloc_get(skb->sk);
}
META_COLLECTOR(int_sk_wmem_alloc)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = sk_wmem_alloc_get(skb->sk);
}
META_COLLECTOR(int_sk_omem_alloc)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = atomic_read(&skb->sk->sk_omem_alloc);
}
META_COLLECTOR(int_sk_rcv_qlen)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_receive_queue.qlen;
}
META_COLLECTOR(int_sk_snd_qlen)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_write_queue.qlen;
}
META_COLLECTOR(int_sk_wmem_queued)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_wmem_queued;
}
META_COLLECTOR(int_sk_fwd_alloc)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_forward_alloc;
}
META_COLLECTOR(int_sk_sndbuf)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_sndbuf;
}
META_COLLECTOR(int_sk_alloc)
{
- SKIP_NONLOCAL(skb);
- dst->value = skb->sk->sk_allocation;
-}
-
-META_COLLECTOR(int_sk_route_caps)
-{
- SKIP_NONLOCAL(skb);
- dst->value = skb->sk->sk_route_caps;
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = (__force int) skb->sk->sk_allocation;
}
META_COLLECTOR(int_sk_hash)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_hash;
}
META_COLLECTOR(int_sk_lingertime)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_lingertime / HZ;
}
META_COLLECTOR(int_sk_err_qlen)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_error_queue.qlen;
}
META_COLLECTOR(int_sk_ack_bl)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_ack_backlog;
}
META_COLLECTOR(int_sk_max_ack_bl)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_max_ack_backlog;
}
META_COLLECTOR(int_sk_prio)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_priority;
}
META_COLLECTOR(int_sk_rcvlowat)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_rcvlowat;
}
META_COLLECTOR(int_sk_rcvtimeo)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_rcvtimeo / HZ;
}
META_COLLECTOR(int_sk_sndtimeo)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_sndtimeo / HZ;
}
META_COLLECTOR(int_sk_sendmsg_off)
{
- SKIP_NONLOCAL(skb);
- dst->value = skb->sk->sk_sndmsg_off;
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_frag.offset;
}
META_COLLECTOR(int_sk_write_pend)
{
- SKIP_NONLOCAL(skb);
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
dst->value = skb->sk->sk_write_pending;
}
@@ -483,8 +561,7 @@ META_COLLECTOR(int_sk_write_pend)
* Meta value collectors assignment table
**************************************************************************/
-struct meta_ops
-{
+struct meta_ops {
void (*get)(struct sk_buff *, struct tcf_pkt_info *,
struct meta_value *, struct meta_obj *, int *);
};
@@ -494,7 +571,7 @@ struct meta_ops
/* Meta value operations table listing all meta value collectors and
* assigns them to a type and meta id. */
-static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
+static struct meta_ops __meta_ops[TCF_META_TYPE_MAX + 1][TCF_META_ID_MAX + 1] = {
[TCF_META_TYPE_VAR] = {
[META_ID(DEV)] = META_FUNC(var_dev),
[META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if),
@@ -534,7 +611,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
[META_ID(SK_ERR_QLEN)] = META_FUNC(int_sk_err_qlen),
[META_ID(SK_FORWARD_ALLOCS)] = META_FUNC(int_sk_fwd_alloc),
[META_ID(SK_ALLOCS)] = META_FUNC(int_sk_alloc),
- [META_ID(SK_ROUTE_CAPS)] = META_FUNC(int_sk_route_caps),
[META_ID(SK_HASH)] = META_FUNC(int_sk_hash),
[META_ID(SK_LINGERTIME)] = META_FUNC(int_sk_lingertime),
[META_ID(SK_ACK_BACKLOG)] = META_FUNC(int_sk_ack_bl),
@@ -550,7 +626,7 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
}
};
-static inline struct meta_ops * meta_ops(struct meta_value *val)
+static inline struct meta_ops *meta_ops(struct meta_value *val)
{
return &__meta_ops[meta_type(val)][meta_id(val)];
}
@@ -596,8 +672,9 @@ static void meta_var_apply_extras(struct meta_value *v,
static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
{
- if (v->val && v->len)
- NLA_PUT(skb, tlv, v->len, (void *) v->val);
+ if (v->val && v->len &&
+ nla_put(skb, tlv, v->len, (void *) v->val))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -647,10 +724,12 @@ static void meta_int_apply_extras(struct meta_value *v,
static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
{
- if (v->len == sizeof(unsigned long))
- NLA_PUT(skb, tlv, sizeof(unsigned long), &v->val);
- else if (v->len == sizeof(u32)) {
- NLA_PUT_U32(skb, tlv, v->val);
+ if (v->len == sizeof(unsigned long)) {
+ if (nla_put(skb, tlv, sizeof(unsigned long), &v->val))
+ goto nla_put_failure;
+ } else if (v->len == sizeof(u32)) {
+ if (nla_put_u32(skb, tlv, v->val))
+ goto nla_put_failure;
}
return 0;
@@ -663,8 +742,7 @@ nla_put_failure:
* Type specific operations table
**************************************************************************/
-struct meta_type_ops
-{
+struct meta_type_ops {
void (*destroy)(struct meta_value *);
int (*compare)(struct meta_obj *, struct meta_obj *);
int (*change)(struct meta_value *, struct nlattr *);
@@ -672,7 +750,7 @@ struct meta_type_ops
int (*dump)(struct sk_buff *, struct meta_value *, int);
};
-static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = {
+static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = {
[TCF_META_TYPE_VAR] = {
.destroy = meta_var_destroy,
.compare = meta_var_compare,
@@ -688,7 +766,7 @@ static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = {
}
};
-static inline struct meta_type_ops * meta_type_ops(struct meta_value *v)
+static inline struct meta_type_ops *meta_type_ops(struct meta_value *v)
{
return &__meta_type_ops[meta_type(v)];
}
@@ -713,7 +791,7 @@ static int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info,
return err;
if (meta_type_ops(v)->apply_extras)
- meta_type_ops(v)->apply_extras(v, dst);
+ meta_type_ops(v)->apply_extras(v, dst);
return 0;
}
@@ -732,12 +810,12 @@ static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);
switch (meta->lvalue.hdr.op) {
- case TCF_EM_OPND_EQ:
- return !r;
- case TCF_EM_OPND_LT:
- return r < 0;
- case TCF_EM_OPND_GT:
- return r > 0;
+ case TCF_EM_OPND_EQ:
+ return !r;
+ case TCF_EM_OPND_LT:
+ return r < 0;
+ case TCF_EM_OPND_GT:
+ return r > 0;
}
return 0;
@@ -771,7 +849,7 @@ static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla)
static inline int meta_is_supported(struct meta_value *val)
{
- return (!meta_id(val) || meta_ops(val)->get);
+ return !meta_id(val) || meta_ops(val)->get;
}
static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = {
@@ -802,8 +880,10 @@ static int em_meta_change(struct tcf_proto *tp, void *data, int len,
goto errout;
meta = kzalloc(sizeof(*meta), GFP_KERNEL);
- if (meta == NULL)
+ if (meta == NULL) {
+ err = -ENOMEM;
goto errout;
+ }
memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left));
memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right));
@@ -844,7 +924,8 @@ static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em)
memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left));
memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right));
- NLA_PUT(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr);
+ if (nla_put(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr))
+ goto nla_put_failure;
ops = meta_type_ops(&meta->lvalue);
if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 ||
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
index 1a4176aee6e..a3bed07a008 100644
--- a/net/sched/em_nbyte.c
+++ b/net/sched/em_nbyte.c
@@ -18,8 +18,7 @@
#include <linux/tc_ematch/tc_em_nbyte.h>
#include <net/pkt_cls.h>
-struct nbyte_data
-{
+struct nbyte_data {
struct tcf_em_nbyte hdr;
char pattern[0];
};
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
index ea8f566e720..15d353d2e4b 100644
--- a/net/sched/em_text.c
+++ b/net/sched/em_text.c
@@ -19,8 +19,7 @@
#include <linux/tc_ematch/tc_em_text.h>
#include <net/pkt_cls.h>
-struct text_match
-{
+struct text_match {
u16 from_offset;
u16 to_offset;
u8 from_layer;
diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c
index 953f1479f7d..797bdb88c01 100644
--- a/net/sched/em_u32.c
+++ b/net/sched/em_u32.c
@@ -35,7 +35,7 @@ static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em,
if (!tcf_valid_offset(skb, ptr, sizeof(u32)))
return 0;
- return !(((*(__be32*) ptr) ^ key->val) & key->mask);
+ return !(((*(__be32 *) ptr) ^ key->val) & key->mask);
}
static struct tcf_ematch_ops em_u32_ops = {
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
index 5e37da961f8..3a633debb6d 100644
--- a/net/sched/ematch.c
+++ b/net/sched/ematch.c
@@ -93,7 +93,7 @@
static LIST_HEAD(ematch_ops);
static DEFINE_RWLOCK(ematch_mod_lock);
-static inline struct tcf_ematch_ops * tcf_em_lookup(u16 kind)
+static struct tcf_ematch_ops *tcf_em_lookup(u16 kind)
{
struct tcf_ematch_ops *e = NULL;
@@ -163,8 +163,8 @@ void tcf_em_unregister(struct tcf_ematch_ops *ops)
}
EXPORT_SYMBOL(tcf_em_unregister);
-static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree,
- int index)
+static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree,
+ int index)
{
return &tree->matches[index];
}
@@ -184,7 +184,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
if (em_hdr->kind == TCF_EM_CONTAINER) {
/* Special ematch called "container", carries an index
- * referencing an external ematch sequence. */
+ * referencing an external ematch sequence.
+ */
u32 ref;
if (data_len < sizeof(ref))
@@ -195,7 +196,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
goto errout;
/* We do not allow backward jumps to avoid loops and jumps
- * to our own position are of course illegal. */
+ * to our own position are of course illegal.
+ */
if (ref <= idx)
goto errout;
@@ -208,7 +210,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
* which automatically releases the reference again, therefore
* the module MUST not be given back under any circumstances
* here. Be aware, the destroy function assumes that the
- * module is held if the ops field is non zero. */
+ * module is held if the ops field is non zero.
+ */
em->ops = tcf_em_lookup(em_hdr->kind);
if (em->ops == NULL) {
@@ -221,7 +224,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
if (em->ops) {
/* We dropped the RTNL mutex in order to
* perform the module load. Tell the caller
- * to replay the request. */
+ * to replay the request.
+ */
module_put(em->ops->owner);
err = -EAGAIN;
}
@@ -230,7 +234,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
}
/* ematch module provides expected length of data, so we
- * can do a basic sanity check. */
+ * can do a basic sanity check.
+ */
if (em->ops->datalen && data_len < em->ops->datalen)
goto errout;
@@ -246,7 +251,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
* TCF_EM_SIMPLE may be specified stating that the
* data only consists of a u32 integer and the module
* does not expected a memory reference but rather
- * the value carried. */
+ * the value carried.
+ */
if (em_hdr->flags & TCF_EM_SIMPLE) {
if (data_len < sizeof(u32))
goto errout;
@@ -334,7 +340,8 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla,
* The array of rt attributes is parsed in the order as they are
* provided, their type must be incremental from 1 to n. Even
* if it does not serve any real purpose, a failure of sticking
- * to this policy will result in parsing failure. */
+ * to this policy will result in parsing failure.
+ */
for (idx = 0; nla_ok(rt_match, list_len); idx++) {
err = -EINVAL;
@@ -359,7 +366,8 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla,
/* Check if the number of matches provided by userspace actually
* complies with the array of matches. The number was used for
* the validation of references and a mismatch could lead to
- * undefined references during the matching process. */
+ * undefined references during the matching process.
+ */
if (idx != tree_hdr->nmatches) {
err = -EINVAL;
goto errout_abort;
@@ -433,7 +441,8 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
if (top_start == NULL)
goto nla_put_failure;
- NLA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr);
+ if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr))
+ goto nla_put_failure;
list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST);
if (list_start == NULL)
@@ -449,7 +458,8 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
.flags = em->flags
};
- NLA_PUT(skb, i+1, sizeof(em_hdr), &em_hdr);
+ if (nla_put(skb, i + 1, sizeof(em_hdr), &em_hdr))
+ goto nla_put_failure;
if (em->ops && em->ops->dump) {
if (em->ops->dump(skb, em) < 0)
@@ -478,6 +488,7 @@ static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em,
struct tcf_pkt_info *info)
{
int r = em->ops->match(skb, em, info);
+
return tcf_em_is_inverted(em) ? !r : r;
}
@@ -526,9 +537,7 @@ pop_stack:
return res;
stack_overflow:
- if (net_ratelimit())
- printk(KERN_WARNING "tc ematch: local stack overflow,"
- " increase NET_EMATCH_STACK\n");
+ net_warn_ratelimited("tc ematch: local stack overflow, increase NET_EMATCH_STACK\n");
return -1;
}
EXPORT_SYMBOL(__tcf_em_tree_match);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b22ca2d1ceb..58bed7599db 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -135,7 +135,7 @@ static DEFINE_RWLOCK(qdisc_mod_lock);
static struct Qdisc_ops *qdisc_base;
-/* Register/uregister queueing discipline */
+/* Register/unregister queueing discipline */
int register_qdisc(struct Qdisc_ops *qops)
{
@@ -187,7 +187,7 @@ int unregister_qdisc(struct Qdisc_ops *qops)
int err = -ENOENT;
write_lock(&qdisc_mod_lock);
- for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
+ for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
if (q == qops)
break;
if (q) {
@@ -200,6 +200,58 @@ int unregister_qdisc(struct Qdisc_ops *qops)
}
EXPORT_SYMBOL(unregister_qdisc);
+/* Get default qdisc if not otherwise specified */
+void qdisc_get_default(char *name, size_t len)
+{
+ read_lock(&qdisc_mod_lock);
+ strlcpy(name, default_qdisc_ops->id, len);
+ read_unlock(&qdisc_mod_lock);
+}
+
+static struct Qdisc_ops *qdisc_lookup_default(const char *name)
+{
+ struct Qdisc_ops *q = NULL;
+
+ for (q = qdisc_base; q; q = q->next) {
+ if (!strcmp(name, q->id)) {
+ if (!try_module_get(q->owner))
+ q = NULL;
+ break;
+ }
+ }
+
+ return q;
+}
+
+/* Set new default qdisc to use */
+int qdisc_set_default(const char *name)
+{
+ const struct Qdisc_ops *ops;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ write_lock(&qdisc_mod_lock);
+ ops = qdisc_lookup_default(name);
+ if (!ops) {
+ /* Not found, drop lock and try to load module */
+ write_unlock(&qdisc_mod_lock);
+ request_module("sch_%s", name);
+ write_lock(&qdisc_mod_lock);
+
+ ops = qdisc_lookup_default(name);
+ }
+
+ if (ops) {
+ /* Set new default */
+ module_put(default_qdisc_ops->owner);
+ default_qdisc_ops = ops;
+ }
+ write_unlock(&qdisc_mod_lock);
+
+ return ops ? 0 : -ENOENT;
+}
+
/* We know handle. Find qdisc among all qdisc's attached to device
(root qdisc, all its children, children of children etc.)
*/
@@ -219,11 +271,16 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
return NULL;
}
-static void qdisc_list_add(struct Qdisc *q)
+void qdisc_list_add(struct Qdisc *q)
{
- if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
- list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
+ if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
+ struct Qdisc *root = qdisc_dev(q)->qdisc;
+
+ WARN_ON_ONCE(root == &noop_qdisc);
+ list_add_tail(&q->list, &root->list);
+ }
}
+EXPORT_SYMBOL(qdisc_list_add);
void qdisc_list_del(struct Qdisc *q)
{
@@ -285,28 +342,70 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
return q;
}
+/* The linklayer setting were not transferred from iproute2, in older
+ * versions, and the rate tables lookup systems have been dropped in
+ * the kernel. To keep backward compatible with older iproute2 tc
+ * utils, we detect the linklayer setting by detecting if the rate
+ * table were modified.
+ *
+ * For linklayer ATM table entries, the rate table will be aligned to
+ * 48 bytes, thus some table entries will contain the same value. The
+ * mpu (min packet unit) is also encoded into the old rate table, thus
+ * starting from the mpu, we find low and high table entries for
+ * mapping this cell. If these entries contain the same value, when
+ * the rate tables have been modified for linklayer ATM.
+ *
+ * This is done by rounding mpu to the nearest 48 bytes cell/entry,
+ * and then roundup to the next cell, calc the table entry one below,
+ * and compare.
+ */
+static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
+{
+ int low = roundup(r->mpu, 48);
+ int high = roundup(low+1, 48);
+ int cell_low = low >> r->cell_log;
+ int cell_high = (high >> r->cell_log) - 1;
+
+ /* rtab is too inaccurate at rates > 100Mbit/s */
+ if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
+ pr_debug("TC linklayer: Giving up ATM detection\n");
+ return TC_LINKLAYER_ETHERNET;
+ }
+
+ if ((cell_high > cell_low) && (cell_high < 256)
+ && (rtab[cell_low] == rtab[cell_high])) {
+ pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
+ cell_low, cell_high, rtab[cell_high]);
+ return TC_LINKLAYER_ATM;
+ }
+ return TC_LINKLAYER_ETHERNET;
+}
+
static struct qdisc_rate_table *qdisc_rtab_list;
struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
struct qdisc_rate_table *rtab;
+ if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
+ nla_len(tab) != TC_RTAB_SIZE)
+ return NULL;
+
for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
- if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
+ if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
+ !memcmp(&rtab->data, nla_data(tab), 1024)) {
rtab->refcnt++;
return rtab;
}
}
- if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
- nla_len(tab) != TC_RTAB_SIZE)
- return NULL;
-
rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
if (rtab) {
rtab->rate = *r;
rtab->refcnt = 1;
memcpy(rtab->data, nla_data(tab), 1024);
+ if (r->linklayer == TC_LINKLAYER_UNAWARE)
+ r->linklayer = __detect_linklayer(r, rtab->data);
rtab->next = qdisc_rtab_list;
qdisc_rtab_list = rtab;
}
@@ -321,7 +420,9 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab)
if (!tab || --tab->refcnt)
return;
- for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
+ for (rtabp = &qdisc_rtab_list;
+ (rtab = *rtabp) != NULL;
+ rtabp = &rtab->next) {
if (rtab == tab) {
*rtabp = rtab->next;
kfree(rtab);
@@ -396,6 +497,11 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
return stab;
}
+static void stab_kfree_rcu(struct rcu_head *head)
+{
+ kfree(container_of(head, struct qdisc_size_table, rcu));
+}
+
void qdisc_put_stab(struct qdisc_size_table *tab)
{
if (!tab)
@@ -405,7 +511,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab)
if (--tab->refcnt == 0) {
list_del(&tab->list);
- kfree(tab);
+ call_rcu_bh(&tab->rcu, stab_kfree_rcu);
}
spin_unlock(&qdisc_stab_lock);
@@ -419,7 +525,8 @@ static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
nest = nla_nest_start(skb, TCA_STAB);
if (nest == NULL)
goto nla_put_failure;
- NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
+ if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
+ goto nla_put_failure;
nla_nest_end(skb, nest);
return skb->len;
@@ -428,7 +535,7 @@ nla_put_failure:
return -1;
}
-void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
+void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
{
int pkt_len, slot;
@@ -454,14 +561,13 @@ out:
pkt_len = 1;
qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
-EXPORT_SYMBOL(qdisc_calculate_pkt_len);
+EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
-void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
+void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
{
if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
- printk(KERN_WARNING
- "%s: %s qdisc %X: is non-work-conserving?\n",
- txt, qdisc->ops->id, qdisc->handle >> 16);
+ pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
+ txt, qdisc->ops->id, qdisc->handle >> 16);
qdisc->flags |= TCQ_F_WARN_NONWC;
}
}
@@ -472,7 +578,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
timer);
- wd->qdisc->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(wd->qdisc);
__netif_schedule(qdisc_root(wd->qdisc));
return HRTIMER_NORESTART;
@@ -486,25 +592,24 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
}
EXPORT_SYMBOL(qdisc_watchdog_init);
-void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
+void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
- ktime_t time;
-
if (test_bit(__QDISC_STATE_DEACTIVATED,
&qdisc_root_sleeping(wd->qdisc)->state))
return;
- wd->qdisc->flags |= TCQ_F_THROTTLED;
- time = ktime_set(0, 0);
- time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
- hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
+ qdisc_throttled(wd->qdisc);
+
+ hrtimer_start(&wd->timer,
+ ns_to_ktime(expires),
+ HRTIMER_MODE_ABS);
}
-EXPORT_SYMBOL(qdisc_watchdog_schedule);
+EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
hrtimer_cancel(&wd->timer);
- wd->qdisc->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
@@ -539,7 +644,7 @@ static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
struct Qdisc_class_common *cl;
- struct hlist_node *n, *next;
+ struct hlist_node *next;
struct hlist_head *nhash, *ohash;
unsigned int nsize, nmask, osize;
unsigned int i, h;
@@ -558,7 +663,7 @@ void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
sch_tree_lock(sch);
for (i = 0; i < osize; i++) {
- hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
+ hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
h = qdisc_class_hash(cl->classid, nmask);
hlist_add_head(&cl->hnode, &nhash[h]);
}
@@ -612,20 +717,24 @@ void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
-/* Allocate an unique handle from space managed by kernel */
-
+/* Allocate an unique handle from space managed by kernel
+ * Possible range is [8000-FFFF]:0000 (0x8000 values)
+ */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
- int i = 0x10000;
+ int i = 0x8000;
static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
do {
autohandle += TC_H_MAKE(0x10000U, 0);
if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
autohandle = TC_H_MAKE(0x80000000U, 0);
- } while (qdisc_lookup(dev, autohandle) && --i > 0);
+ if (!qdisc_lookup(dev, autohandle))
+ return autohandle;
+ cond_resched();
+ } while (--i > 0);
- return i>0 ? autohandle : 0;
+ return 0;
}
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
@@ -633,9 +742,11 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
const struct Qdisc_class_ops *cops;
unsigned long cl;
u32 parentid;
+ int drops;
if (n == 0)
return;
+ drops = max_t(int, n, 0);
while ((parentid = sch->parent)) {
if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
return;
@@ -652,6 +763,7 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
cops->put(sch, cl);
}
sch->q.qlen -= n;
+ sch->qstats.drops += drops;
}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
@@ -823,6 +935,8 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
goto err_out3;
}
lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
+ if (!netif_is_multiqueue(dev))
+ sch->flags |= TCQ_F_ONETXQUEUE;
}
sch->handle = handle;
@@ -834,7 +948,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
err = PTR_ERR(stab);
goto err_out4;
}
- sch->stab = stab;
+ rcu_assign_pointer(sch->stab, stab);
}
if (tca[TCA_RATE]) {
spinlock_t *root_lock;
@@ -874,7 +988,7 @@ err_out4:
* Any broken qdiscs that would require a ops->reset() here?
* The qdisc was never in action so it shouldn't be necessary.
*/
- qdisc_put_stab(sch->stab);
+ qdisc_put_stab(rtnl_dereference(sch->stab));
if (ops->destroy)
ops->destroy(sch);
goto err_out3;
@@ -882,7 +996,7 @@ err_out4:
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
- struct qdisc_size_table *stab = NULL;
+ struct qdisc_size_table *ostab, *stab = NULL;
int err = 0;
if (tca[TCA_OPTIONS]) {
@@ -899,8 +1013,9 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
return PTR_ERR(stab);
}
- qdisc_put_stab(sch->stab);
- sch->stab = stab;
+ ostab = rtnl_dereference(sch->stab);
+ rcu_assign_pointer(sch->stab, stab);
+ qdisc_put_stab(ostab);
if (tca[TCA_RATE]) {
/* NB: ignores errors from replace_estimator
@@ -915,9 +1030,8 @@ out:
return 0;
}
-struct check_loop_arg
-{
- struct qdisc_walker w;
+struct check_loop_arg {
+ struct qdisc_walker w;
struct Qdisc *p;
int depth;
};
@@ -959,33 +1073,39 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
* Delete/get qdisc.
*/
-static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
struct net *net = sock_net(skb->sk);
- struct tcmsg *tcm = NLMSG_DATA(n);
+ struct tcmsg *tcm = nlmsg_data(n);
struct nlattr *tca[TCA_MAX + 1];
struct net_device *dev;
- u32 clid = tcm->tcm_parent;
+ u32 clid;
struct Qdisc *q = NULL;
struct Qdisc *p = NULL;
int err;
- if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
- return -ENODEV;
+ if ((n->nlmsg_type != RTM_GETQDISC) &&
+ !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
if (err < 0)
return err;
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ clid = tcm->tcm_parent;
if (clid) {
if (clid != TC_H_ROOT) {
if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
- if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
+ p = qdisc_lookup(dev, TC_H_MAJ(clid));
+ if (!p)
return -ENOENT;
q = qdisc_leaf(p, clid);
- } else { /* ingress */
- if (dev_ingress_queue(dev))
- q = dev_ingress_queue(dev)->qdisc_sleeping;
+ } else if (dev_ingress_queue(dev)) {
+ q = dev_ingress_queue(dev)->qdisc_sleeping;
}
} else {
q = dev->qdisc;
@@ -996,7 +1116,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
return -EINVAL;
} else {
- if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
+ q = qdisc_lookup(dev, tcm->tcm_handle);
+ if (!q)
return -ENOENT;
}
@@ -1008,7 +1129,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
return -EINVAL;
if (q->handle == 0)
return -ENOENT;
- if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
+ err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
+ if (err != 0)
return err;
} else {
qdisc_notify(net, skb, n, clid, NULL, q);
@@ -1017,10 +1139,10 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
}
/*
- Create/change qdisc.
+ * Create/change qdisc.
*/
-static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
struct net *net = sock_net(skb->sk);
struct tcmsg *tcm;
@@ -1030,28 +1152,33 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
struct Qdisc *q, *p;
int err;
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
replay:
/* Reinit, just in case something touches this. */
- tcm = NLMSG_DATA(n);
+ err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
+ if (err < 0)
+ return err;
+
+ tcm = nlmsg_data(n);
clid = tcm->tcm_parent;
q = p = NULL;
- if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
return -ENODEV;
- err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
- if (err < 0)
- return err;
if (clid) {
if (clid != TC_H_ROOT) {
if (clid != TC_H_INGRESS) {
- if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
+ p = qdisc_lookup(dev, TC_H_MAJ(clid));
+ if (!p)
return -ENOENT;
q = qdisc_leaf(p, clid);
- } else { /* ingress */
- if (dev_ingress_queue_create(dev))
- q = dev_ingress_queue(dev)->qdisc_sleeping;
+ } else if (dev_ingress_queue_create(dev)) {
+ q = dev_ingress_queue(dev)->qdisc_sleeping;
}
} else {
q = dev->qdisc;
@@ -1063,13 +1190,14 @@ replay:
if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
if (tcm->tcm_handle) {
- if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
+ if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
return -EEXIST;
if (TC_H_MIN(tcm->tcm_handle))
return -EINVAL;
- if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
+ q = qdisc_lookup(dev, tcm->tcm_handle);
+ if (!q)
goto create_n_graft;
- if (n->nlmsg_flags&NLM_F_EXCL)
+ if (n->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
return -EINVAL;
@@ -1079,7 +1207,7 @@ replay:
atomic_inc(&q->refcnt);
goto graft;
} else {
- if (q == NULL)
+ if (!q)
goto create_n_graft;
/* This magic test requires explanation.
@@ -1101,9 +1229,9 @@ replay:
* For now we select create/graft, if
* user gave KIND, which does not match existing.
*/
- if ((n->nlmsg_flags&NLM_F_CREATE) &&
- (n->nlmsg_flags&NLM_F_REPLACE) &&
- ((n->nlmsg_flags&NLM_F_EXCL) ||
+ if ((n->nlmsg_flags & NLM_F_CREATE) &&
+ (n->nlmsg_flags & NLM_F_REPLACE) &&
+ ((n->nlmsg_flags & NLM_F_EXCL) ||
(tca[TCA_KIND] &&
nla_strcmp(tca[TCA_KIND], q->ops->id))))
goto create_n_graft;
@@ -1118,7 +1246,7 @@ replay:
/* Change qdisc parameters */
if (q == NULL)
return -ENOENT;
- if (n->nlmsg_flags&NLM_F_EXCL)
+ if (n->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
return -EINVAL;
@@ -1128,7 +1256,7 @@ replay:
return err;
create_n_graft:
- if (!(n->nlmsg_flags&NLM_F_CREATE))
+ if (!(n->nlmsg_flags & NLM_F_CREATE))
return -ENOENT;
if (clid == TC_H_INGRESS) {
if (dev_ingress_queue(dev))
@@ -1169,15 +1297,19 @@ graft:
}
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
- u32 pid, u32 seq, u16 flags, int event)
+ u32 portid, u32 seq, u16 flags, int event)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
unsigned char *b = skb_tail_pointer(skb);
struct gnet_dump d;
+ struct qdisc_size_table *stab;
- nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
- tcm = NLMSG_DATA(nlh);
+ cond_resched();
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ tcm = nlmsg_data(nlh);
tcm->tcm_family = AF_UNSPEC;
tcm->tcm__pad1 = 0;
tcm->tcm__pad2 = 0;
@@ -1185,12 +1317,14 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
tcm->tcm_parent = clid;
tcm->tcm_handle = q->handle;
tcm->tcm_info = atomic_read(&q->refcnt);
- NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
+ if (nla_put_string(skb, TCA_KIND, q->ops->id))
+ goto nla_put_failure;
if (q->ops->dump && q->ops->dump(q, skb) < 0)
goto nla_put_failure;
q->qstats.qlen = q->q.qlen;
- if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
+ stab = rtnl_dereference(q->stab);
+ if (stab && qdisc_dump_stab(skb, stab) < 0)
goto nla_put_failure;
if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
@@ -1211,7 +1345,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
return skb->len;
-nlmsg_failure:
+out_nlmsg_trim:
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
@@ -1227,23 +1361,26 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb,
struct Qdisc *old, struct Qdisc *new)
{
struct sk_buff *skb;
- u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
if (old && !tc_qdisc_dump_ignore(old)) {
- if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
+ if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
+ 0, RTM_DELQDISC) < 0)
goto err_out;
}
if (new && !tc_qdisc_dump_ignore(new)) {
- if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
+ if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
+ old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
goto err_out;
}
if (skb->len)
- return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
err_out:
kfree_skb(skb);
@@ -1265,7 +1402,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
q_idx++;
} else {
if (!tc_qdisc_dump_ignore(q) &&
- tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
+ tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
goto done;
q_idx++;
@@ -1275,8 +1412,8 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
q_idx++;
continue;
}
- if (!tc_qdisc_dump_ignore(q) &&
- tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
+ if (!tc_qdisc_dump_ignore(q) &&
+ tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
goto done;
q_idx++;
@@ -1300,9 +1437,9 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
s_idx = cb->args[0];
s_q_idx = q_idx = cb->args[1];
- rcu_read_lock();
idx = 0;
- for_each_netdev_rcu(net, dev) {
+ ASSERT_RTNL();
+ for_each_netdev(net, dev) {
struct netdev_queue *dev_queue;
if (idx < s_idx)
@@ -1325,8 +1462,6 @@ cont:
}
done:
- rcu_read_unlock();
-
cb->args[0] = idx;
cb->args[1] = q_idx;
@@ -1341,28 +1476,33 @@ done:
-static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
{
struct net *net = sock_net(skb->sk);
- struct tcmsg *tcm = NLMSG_DATA(n);
+ struct tcmsg *tcm = nlmsg_data(n);
struct nlattr *tca[TCA_MAX + 1];
struct net_device *dev;
struct Qdisc *q = NULL;
const struct Qdisc_class_ops *cops;
unsigned long cl = 0;
unsigned long new_cl;
- u32 pid = tcm->tcm_parent;
- u32 clid = tcm->tcm_handle;
- u32 qid = TC_H_MAJ(clid);
+ u32 portid;
+ u32 clid;
+ u32 qid;
int err;
- if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
- return -ENODEV;
+ if ((n->nlmsg_type != RTM_GETTCLASS) &&
+ !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
if (err < 0)
return err;
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return -ENODEV;
+
/*
parent == TC_H_UNSPEC - unspecified parent.
parent == TC_H_ROOT - class is root, which has no parent.
@@ -1378,8 +1518,12 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
/* Step 1. Determine qdisc handle X:0 */
- if (pid != TC_H_ROOT) {
- u32 qid1 = TC_H_MAJ(pid);
+ portid = tcm->tcm_parent;
+ clid = tcm->tcm_handle;
+ qid = TC_H_MAJ(clid);
+
+ if (portid != TC_H_ROOT) {
+ u32 qid1 = TC_H_MAJ(portid);
if (qid && qid1) {
/* If both majors are known, they must be identical. */
@@ -1391,19 +1535,20 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
qid = dev->qdisc->handle;
/* Now qid is genuine qdisc handle consistent
- both with parent and child.
-
- TC_H_MAJ(pid) still may be unspecified, complete it now.
+ * both with parent and child.
+ *
+ * TC_H_MAJ(portid) still may be unspecified, complete it now.
*/
- if (pid)
- pid = TC_H_MAKE(qid, pid);
+ if (portid)
+ portid = TC_H_MAKE(qid, portid);
} else {
if (qid == 0)
qid = dev->qdisc->handle;
}
/* OK. Locate qdisc */
- if ((q = qdisc_lookup(dev, qid)) == NULL)
+ q = qdisc_lookup(dev, qid);
+ if (!q)
return -ENOENT;
/* An check that it supports classes */
@@ -1413,7 +1558,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
/* Now try to get class */
if (clid == 0) {
- if (pid == TC_H_ROOT)
+ if (portid == TC_H_ROOT)
clid = qid;
} else
clid = TC_H_MAKE(qid, clid);
@@ -1423,13 +1568,14 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
if (cl == 0) {
err = -ENOENT;
- if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
+ if (n->nlmsg_type != RTM_NEWTCLASS ||
+ !(n->nlmsg_flags & NLM_F_CREATE))
goto out;
} else {
switch (n->nlmsg_type) {
case RTM_NEWTCLASS:
err = -EEXIST;
- if (n->nlmsg_flags&NLM_F_EXCL)
+ if (n->nlmsg_flags & NLM_F_EXCL)
goto out;
break;
case RTM_DELTCLASS:
@@ -1451,7 +1597,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
new_cl = cl;
err = -EOPNOTSUPP;
if (cops->change)
- err = cops->change(q, clid, pid, tca, &new_cl);
+ err = cops->change(q, clid, portid, tca, &new_cl);
if (err == 0)
tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
@@ -1465,7 +1611,7 @@ out:
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
unsigned long cl,
- u32 pid, u32 seq, u16 flags, int event)
+ u32 portid, u32 seq, u16 flags, int event)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
@@ -1473,8 +1619,11 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
struct gnet_dump d;
const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
- nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
- tcm = NLMSG_DATA(nlh);
+ cond_resched();
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ tcm = nlmsg_data(nlh);
tcm->tcm_family = AF_UNSPEC;
tcm->tcm__pad1 = 0;
tcm->tcm__pad2 = 0;
@@ -1482,7 +1631,8 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
tcm->tcm_parent = q->handle;
tcm->tcm_handle = q->handle;
tcm->tcm_info = 0;
- NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
+ if (nla_put_string(skb, TCA_KIND, q->ops->id))
+ goto nla_put_failure;
if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
goto nla_put_failure;
@@ -1499,7 +1649,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
return skb->len;
-nlmsg_failure:
+out_nlmsg_trim:
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
@@ -1510,32 +1660,32 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb,
unsigned long cl, int event)
{
struct sk_buff *skb;
- u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
- if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
+ if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
kfree_skb(skb);
return -EINVAL;
}
- return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
}
-struct qdisc_dump_args
-{
- struct qdisc_walker w;
- struct sk_buff *skb;
- struct netlink_callback *cb;
+struct qdisc_dump_args {
+ struct qdisc_walker w;
+ struct sk_buff *skb;
+ struct netlink_callback *cb;
};
static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
- return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
+ return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}
@@ -1590,15 +1740,16 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
+ struct tcmsg *tcm = nlmsg_data(cb->nlh);
struct net *net = sock_net(skb->sk);
struct netdev_queue *dev_queue;
struct net_device *dev;
int t, s_t;
- if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
+ if (nlmsg_len(cb->nlh) < sizeof(*tcm))
return 0;
- if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
+ dev = dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
return 0;
s_t = cb->args[0];
@@ -1621,19 +1772,22 @@ done:
}
/* Main classifier routine: scans classifier chain attached
- to this qdisc, (optionally) tests for protocol and asks
- specific classifiers.
+ * to this qdisc, (optionally) tests for protocol and asks
+ * specific classifiers.
*/
-int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
+int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
__be16 protocol = skb->protocol;
- int err = 0;
+ int err;
for (; tp; tp = tp->next) {
- if ((tp->protocol == protocol ||
- tp->protocol == htons(ETH_P_ALL)) &&
- (err = tp->classify(skb, tp, res)) >= 0) {
+ if (tp->protocol != protocol &&
+ tp->protocol != htons(ETH_P_ALL))
+ continue;
+ err = tp->classify(skb, tp, res);
+
+ if (err >= 0) {
#ifdef CONFIG_NET_CLS_ACT
if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
@@ -1645,16 +1799,14 @@ int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
}
EXPORT_SYMBOL(tc_classify_compat);
-int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
+int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
int err = 0;
- __be16 protocol;
#ifdef CONFIG_NET_CLS_ACT
- struct tcf_proto *otp = tp;
+ const struct tcf_proto *otp = tp;
reclassify:
#endif
- protocol = skb->protocol;
err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
@@ -1663,12 +1815,10 @@ reclassify:
tp = otp;
if (verd++ >= MAX_REC_LOOP) {
- if (net_ratelimit())
- printk(KERN_NOTICE
- "%s: packet reclassify loop"
- " rule prio %u protocol %02x\n",
- tp->q->ops->id,
- tp->prio & 0xffff, ntohs(tp->protocol));
+ net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
+ tp->q->ops->id,
+ tp->prio & 0xffff,
+ ntohs(tp->protocol));
return TC_ACT_SHOT;
}
skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
@@ -1728,7 +1878,7 @@ static int __net_init psched_net_init(struct net *net)
{
struct proc_dir_entry *e;
- e = proc_net_fops_create(net, "psched", 0, &psched_fops);
+ e = proc_create("psched", 0, net->proc_net, &psched_fops);
if (e == NULL)
return -ENOMEM;
@@ -1737,7 +1887,7 @@ static int __net_init psched_net_init(struct net *net)
static void __net_exit psched_net_exit(struct net *net)
{
- proc_net_remove(net, "psched");
+ remove_proc_entry("psched", net->proc_net);
}
#else
static int __net_init psched_net_init(struct net *net)
@@ -1761,22 +1911,23 @@ static int __init pktsched_init(void)
err = register_pernet_subsys(&psched_net_ops);
if (err) {
- printk(KERN_ERR "pktsched_init: "
+ pr_err("pktsched_init: "
"cannot initialize per netns operations\n");
return err;
}
+ register_qdisc(&pfifo_fast_ops);
register_qdisc(&pfifo_qdisc_ops);
register_qdisc(&bfifo_qdisc_ops);
register_qdisc(&pfifo_head_drop_qdisc_ops);
register_qdisc(&mq_qdisc_ops);
- rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
- rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
- rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
- rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
+ rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
+ rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
return 0;
}
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 282540778aa..8449b337f9e 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -5,6 +5,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
+#include <linux/interrupt.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
@@ -15,8 +16,6 @@
#include <net/netlink.h>
#include <net/pkt_sched.h>
-extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */
-
/*
* The ATM queuing discipline provides a framework for invoking classifiers
* (aka "filters"), which in turn select classes of this queuing discipline.
@@ -319,7 +318,7 @@ static int atm_tc_delete(struct Qdisc *sch, unsigned long arg)
* creation), and one for the reference held when calling delete.
*/
if (flow->ref < 2) {
- printk(KERN_ERR "atm_tc_delete: flow->ref == %d\n", flow->ref);
+ pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref);
return -EINVAL;
}
if (flow->ref > 2)
@@ -384,12 +383,12 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
}
flow = NULL;
- done:
- ;
+done:
+ ;
}
- if (!flow)
+ if (!flow) {
flow = &p->link;
- else {
+ } else {
if (flow->vcc)
ATM_SKB(skb)->atm_options = flow->vcc->atm_options;
/*@@@ looks good ... but it's not supposed to work :-) */
@@ -422,10 +421,6 @@ drop: __maybe_unused
}
return ret;
}
- sch->bstats.bytes += qdisc_pkt_len(skb);
- sch->bstats.packets++;
- flow->bstats.bytes += qdisc_pkt_len(skb);
- flow->bstats.packets++;
/*
* Okay, this may seem weird. We pretend we've dropped the packet if
* it goes via ATM. The reason for this is that the outer qdisc
@@ -473,6 +468,8 @@ static void sch_atm_dequeue(unsigned long data)
if (unlikely(!skb))
break;
+ qdisc_bstats_update(sch, skb);
+ bstats_update(&flow->bstats, skb);
pr_debug("atm_tc_dequeue: sending on class %p\n", flow);
/* remove any LL header somebody else has attached */
skb_pull(skb, skb_network_offset(skb));
@@ -578,8 +575,7 @@ static void atm_tc_destroy(struct Qdisc *sch)
list_for_each_entry_safe(flow, tmp, &p->flows, list) {
if (flow->ref > 1)
- printk(KERN_ERR "atm_destroy: %p->ref = %d\n", flow,
- flow->ref);
+ pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref);
atm_tc_put(sch, (unsigned long)flow);
}
tasklet_kill(&p->task);
@@ -603,27 +599,31 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
if (nest == NULL)
goto nla_put_failure;
- NLA_PUT(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr);
+ if (nla_put(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr))
+ goto nla_put_failure;
if (flow->vcc) {
struct sockaddr_atmpvc pvc;
int state;
+ memset(&pvc, 0, sizeof(pvc));
pvc.sap_family = AF_ATMPVC;
pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1;
pvc.sap_addr.vpi = flow->vcc->vpi;
pvc.sap_addr.vci = flow->vcc->vci;
- NLA_PUT(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc);
+ if (nla_put(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc))
+ goto nla_put_failure;
state = ATM_VF2VS(flow->vcc->flags);
- NLA_PUT_U32(skb, TCA_ATM_STATE, state);
+ if (nla_put_u32(skb, TCA_ATM_STATE, state))
+ goto nla_put_failure;
}
- if (flow->excess)
- NLA_PUT_U32(skb, TCA_ATM_EXCESS, flow->classid);
- else {
- NLA_PUT_U32(skb, TCA_ATM_EXCESS, 0);
+ if (flow->excess) {
+ if (nla_put_u32(skb, TCA_ATM_EXCESS, flow->classid))
+ goto nla_put_failure;
+ } else {
+ if (nla_put_u32(skb, TCA_ATM_EXCESS, 0))
+ goto nla_put_failure;
}
-
- nla_nest_end(skb, nest);
- return skb->len;
+ return nla_nest_end(skb, nest);
nla_put_failure:
nla_nest_cancel(skb, nest);
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index eb763159086..ead526467cc 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -72,8 +72,7 @@
struct cbq_sched_data;
-struct cbq_class
-{
+struct cbq_class {
struct Qdisc_class_common common;
struct cbq_class *next_alive; /* next class with backlog in this priority band */
@@ -131,7 +130,7 @@ struct cbq_class
psched_time_t penalized;
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
+ struct gnet_stats_rate_est64 rate_est;
struct tc_cbq_xstats xstats;
struct tcf_proto *filter_list;
@@ -139,19 +138,18 @@ struct cbq_class
int refcnt;
int filters;
- struct cbq_class *defaults[TC_PRIO_MAX+1];
+ struct cbq_class *defaults[TC_PRIO_MAX + 1];
};
-struct cbq_sched_data
-{
+struct cbq_sched_data {
struct Qdisc_class_hash clhash; /* Hash table of all classes */
- int nclasses[TC_CBQ_MAXPRIO+1];
- unsigned quanta[TC_CBQ_MAXPRIO+1];
+ int nclasses[TC_CBQ_MAXPRIO + 1];
+ unsigned int quanta[TC_CBQ_MAXPRIO + 1];
struct cbq_class link;
- unsigned activemask;
- struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
+ unsigned int activemask;
+ struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
with backlog */
#ifdef CONFIG_NET_CLS_ACT
@@ -162,7 +160,7 @@ struct cbq_sched_data
int tx_len;
psched_time_t now; /* Cached timestamp */
psched_time_t now_rt; /* Cached real time */
- unsigned pmask;
+ unsigned int pmask;
struct hrtimer delay_timer;
struct qdisc_watchdog watchdog; /* Watchdog timer,
@@ -175,9 +173,9 @@ struct cbq_sched_data
};
-#define L2T(cl,len) qdisc_l2t((cl)->R_tab,len)
+#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
-static __inline__ struct cbq_class *
+static inline struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
struct Qdisc_class_common *clc;
@@ -193,25 +191,27 @@ cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
- struct cbq_class *cl, *new;
+ struct cbq_class *cl;
- for (cl = this->tparent; cl; cl = cl->tparent)
- if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
- return new;
+ for (cl = this->tparent; cl; cl = cl->tparent) {
+ struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
+ if (new != NULL && new != this)
+ return new;
+ }
return NULL;
}
#endif
/* Classify packet. The procedure is pretty complicated, but
- it allows us to combine link sharing and priority scheduling
- transparently.
-
- Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
- so that it resolves to split nodes. Then packets are classified
- by logical priority, or a more specific classifier may be attached
- to the split node.
+ * it allows us to combine link sharing and priority scheduling
+ * transparently.
+ *
+ * Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
+ * so that it resolves to split nodes. Then packets are classified
+ * by logical priority, or a more specific classifier may be attached
+ * to the split node.
*/
static struct cbq_class *
@@ -227,7 +227,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
/*
* Step 1. If skb->priority points to one of our classes, use it.
*/
- if (TC_H_MAJ(prio^sch->handle) == 0 &&
+ if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
(cl = cbq_class_lookup(q, prio)) != NULL)
return cl;
@@ -243,16 +243,18 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
(result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
goto fallback;
- if ((cl = (void*)res.class) == NULL) {
+ cl = (void *)res.class;
+ if (!cl) {
if (TC_H_MAJ(res.classid))
cl = cbq_class_lookup(q, res.classid);
- else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
+ else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
cl = defmap[TC_PRIO_BESTEFFORT];
- if (cl == NULL || cl->level >= head->level)
+ if (cl == NULL)
goto fallback;
}
-
+ if (cl->level >= head->level)
+ goto fallback;
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
case TC_ACT_QUEUED:
@@ -282,7 +284,7 @@ fallback:
* Step 4. No success...
*/
if (TC_H_MAJ(prio) == 0 &&
- !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
+ !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
!(cl = head->defaults[TC_PRIO_BESTEFFORT]))
return head;
@@ -290,12 +292,12 @@ fallback:
}
/*
- A packet has just been enqueued on the empty class.
- cbq_activate_class adds it to the tail of active class list
- of its priority band.
+ * A packet has just been enqueued on the empty class.
+ * cbq_activate_class adds it to the tail of active class list
+ * of its priority band.
*/
-static __inline__ void cbq_activate_class(struct cbq_class *cl)
+static inline void cbq_activate_class(struct cbq_class *cl)
{
struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
int prio = cl->cpriority;
@@ -314,9 +316,9 @@ static __inline__ void cbq_activate_class(struct cbq_class *cl)
}
/*
- Unlink class from active chain.
- Note that this same procedure is done directly in cbq_dequeue*
- during round-robin procedure.
+ * Unlink class from active chain.
+ * Note that this same procedure is done directly in cbq_dequeue*
+ * during round-robin procedure.
*/
static void cbq_deactivate_class(struct cbq_class *this)
@@ -350,7 +352,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
int toplevel = q->toplevel;
- if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
+ if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
psched_time_t now;
psched_tdiff_t incr;
@@ -363,7 +365,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
q->toplevel = cl->level;
return;
}
- } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
+ } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
}
}
@@ -390,8 +392,6 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, cl->q);
if (ret == NET_XMIT_SUCCESS) {
sch->q.qlen++;
- sch->bstats.packets++;
- sch->bstats.bytes += qdisc_pkt_len(skb);
cbq_mark_toplevel(q, cl);
if (!cl->next_alive)
cbq_activate_class(cl);
@@ -419,11 +419,11 @@ static void cbq_ovl_classic(struct cbq_class *cl)
delay += cl->offtime;
/*
- Class goes to sleep, so that it will have no
- chance to work avgidle. Let's forgive it 8)
-
- BTW cbq-2.0 has a crap in this
- place, apparently they forgot to shift it by cl->ewma_log.
+ * Class goes to sleep, so that it will have no
+ * chance to work avgidle. Let's forgive it 8)
+ *
+ * BTW cbq-2.0 has a crap in this
+ * place, apparently they forgot to shift it by cl->ewma_log.
*/
if (cl->avgidle < 0)
delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
@@ -440,8 +440,8 @@ static void cbq_ovl_classic(struct cbq_class *cl)
q->wd_expires = delay;
/* Dirty work! We must schedule wakeups based on
- real available rate, rather than leaf rate,
- which may be tiny (even zero).
+ * real available rate, rather than leaf rate,
+ * which may be tiny (even zero).
*/
if (q->toplevel == TC_CBQ_MAXLEVEL) {
struct cbq_class *b;
@@ -461,7 +461,7 @@ static void cbq_ovl_classic(struct cbq_class *cl)
}
/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
- they go overlimit
+ * they go overlimit
*/
static void cbq_ovl_rclassic(struct cbq_class *cl)
@@ -509,8 +509,7 @@ static void cbq_ovl_delay(struct cbq_class *cl)
cl->cpriority = TC_CBQ_MAXPRIO;
q->pmask |= (1<<TC_CBQ_MAXPRIO);
- expires = ktime_set(0, 0);
- expires = ktime_add_ns(expires, PSCHED_TICKS2NS(sched));
+ expires = ns_to_ktime(PSCHED_TICKS2NS(sched));
if (hrtimer_try_to_cancel(&q->delay_timer) &&
ktime_to_ns(ktime_sub(
hrtimer_get_expires(&q->delay_timer),
@@ -596,7 +595,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
struct Qdisc *sch = q->watchdog.qdisc;
psched_time_t now;
psched_tdiff_t delay = 0;
- unsigned pmask;
+ unsigned int pmask;
now = psched_get_time();
@@ -625,7 +624,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
}
- sch->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(sch);
__netif_schedule(qdisc_root(sch));
return HRTIMER_NORESTART;
}
@@ -650,8 +649,6 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
ret = qdisc_enqueue(skb, cl->q);
if (ret == NET_XMIT_SUCCESS) {
sch->q.qlen++;
- sch->bstats.packets++;
- sch->bstats.bytes += qdisc_pkt_len(skb);
if (!cl->next_alive)
cbq_activate_class(cl);
return 0;
@@ -667,15 +664,15 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
#endif
/*
- It is mission critical procedure.
-
- We "regenerate" toplevel cutoff, if transmitting class
- has backlog and it is not regulated. It is not part of
- original CBQ description, but looks more reasonable.
- Probably, it is wrong. This question needs further investigation.
-*/
+ * It is mission critical procedure.
+ *
+ * We "regenerate" toplevel cutoff, if transmitting class
+ * has backlog and it is not regulated. It is not part of
+ * original CBQ description, but looks more reasonable.
+ * Probably, it is wrong. This question needs further investigation.
+ */
-static __inline__ void
+static inline void
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
struct cbq_class *borrowed)
{
@@ -686,7 +683,7 @@ cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
q->toplevel = borrowed->level;
return;
}
- } while ((borrowed=borrowed->borrow) != NULL);
+ } while ((borrowed = borrowed->borrow) != NULL);
}
#if 0
/* It is not necessary now. Uncommenting it
@@ -714,10 +711,10 @@ cbq_update(struct cbq_sched_data *q)
cl->bstats.bytes += len;
/*
- (now - last) is total time between packet right edges.
- (last_pktlen/rate) is "virtual" busy time, so that
-
- idle = (now - last) - last_pktlen/rate
+ * (now - last) is total time between packet right edges.
+ * (last_pktlen/rate) is "virtual" busy time, so that
+ *
+ * idle = (now - last) - last_pktlen/rate
*/
idle = q->now - cl->last;
@@ -727,9 +724,9 @@ cbq_update(struct cbq_sched_data *q)
idle -= L2T(cl, len);
/* true_avgidle := (1-W)*true_avgidle + W*idle,
- where W=2^{-ewma_log}. But cl->avgidle is scaled:
- cl->avgidle == true_avgidle/W,
- hence:
+ * where W=2^{-ewma_log}. But cl->avgidle is scaled:
+ * cl->avgidle == true_avgidle/W,
+ * hence:
*/
avgidle += idle - (avgidle>>cl->ewma_log);
}
@@ -743,22 +740,22 @@ cbq_update(struct cbq_sched_data *q)
cl->avgidle = avgidle;
/* Calculate expected time, when this class
- will be allowed to send.
- It will occur, when:
- (1-W)*true_avgidle + W*delay = 0, i.e.
- idle = (1/W - 1)*(-true_avgidle)
- or
- idle = (1 - W)*(-cl->avgidle);
+ * will be allowed to send.
+ * It will occur, when:
+ * (1-W)*true_avgidle + W*delay = 0, i.e.
+ * idle = (1/W - 1)*(-true_avgidle)
+ * or
+ * idle = (1 - W)*(-cl->avgidle);
*/
idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
/*
- That is not all.
- To maintain the rate allocated to the class,
- we add to undertime virtual clock,
- necessary to complete transmitted packet.
- (len/phys_bandwidth has been already passed
- to the moment of cbq_update)
+ * That is not all.
+ * To maintain the rate allocated to the class,
+ * we add to undertime virtual clock,
+ * necessary to complete transmitted packet.
+ * (len/phys_bandwidth has been already passed
+ * to the moment of cbq_update)
*/
idle -= L2T(&q->link, len);
@@ -780,7 +777,7 @@ cbq_update(struct cbq_sched_data *q)
cbq_update_toplevel(q, this, q->tx_borrowed);
}
-static __inline__ struct cbq_class *
+static inline struct cbq_class *
cbq_under_limit(struct cbq_class *cl)
{
struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
@@ -796,16 +793,17 @@ cbq_under_limit(struct cbq_class *cl)
do {
/* It is very suspicious place. Now overlimit
- action is generated for not bounded classes
- only if link is completely congested.
- Though it is in agree with ancestor-only paradigm,
- it looks very stupid. Particularly,
- it means that this chunk of code will either
- never be called or result in strong amplification
- of burstiness. Dangerous, silly, and, however,
- no another solution exists.
+ * action is generated for not bounded classes
+ * only if link is completely congested.
+ * Though it is in agree with ancestor-only paradigm,
+ * it looks very stupid. Particularly,
+ * it means that this chunk of code will either
+ * never be called or result in strong amplification
+ * of burstiness. Dangerous, silly, and, however,
+ * no another solution exists.
*/
- if ((cl = cl->borrow) == NULL) {
+ cl = cl->borrow;
+ if (!cl) {
this_cl->qstats.overlimits++;
this_cl->overlimit(this_cl);
return NULL;
@@ -818,7 +816,7 @@ cbq_under_limit(struct cbq_class *cl)
return cl;
}
-static __inline__ struct sk_buff *
+static inline struct sk_buff *
cbq_dequeue_prio(struct Qdisc *sch, int prio)
{
struct cbq_sched_data *q = qdisc_priv(sch);
@@ -842,7 +840,7 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
if (cl->deficit <= 0) {
/* Class exhausted its allotment per
- this round. Switch to the next one.
+ * this round. Switch to the next one.
*/
deficit = 1;
cl->deficit += cl->quantum;
@@ -852,8 +850,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
skb = cl->q->dequeue(cl->q);
/* Class did not give us any skb :-(
- It could occur even if cl->q->q.qlen != 0
- f.e. if cl->q == "tbf"
+ * It could occur even if cl->q->q.qlen != 0
+ * f.e. if cl->q == "tbf"
*/
if (skb == NULL)
goto skip_class;
@@ -882,7 +880,7 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
skip_class:
if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
/* Class is empty or penalized.
- Unlink it from active chain.
+ * Unlink it from active chain.
*/
cl_prev->next_alive = cl->next_alive;
cl->next_alive = NULL;
@@ -921,14 +919,14 @@ next_class:
return NULL;
}
-static __inline__ struct sk_buff *
+static inline struct sk_buff *
cbq_dequeue_1(struct Qdisc *sch)
{
struct cbq_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
- unsigned activemask;
+ unsigned int activemask;
- activemask = q->activemask&0xFF;
+ activemask = q->activemask & 0xFF;
while (activemask) {
int prio = ffz(~activemask);
activemask &= ~(1<<prio);
@@ -953,19 +951,22 @@ cbq_dequeue(struct Qdisc *sch)
if (q->tx_class) {
psched_tdiff_t incr2;
/* Time integrator. We calculate EOS time
- by adding expected packet transmission time.
- If real time is greater, we warp artificial clock,
- so that:
-
- cbq_time = max(real_time, work);
+ * by adding expected packet transmission time.
+ * If real time is greater, we warp artificial clock,
+ * so that:
+ *
+ * cbq_time = max(real_time, work);
*/
incr2 = L2T(&q->link, q->tx_len);
q->now += incr2;
cbq_update(q);
if ((incr -= incr2) < 0)
incr = 0;
+ q->now += incr;
+ } else {
+ if (now > q->now)
+ q->now = now;
}
- q->now += incr;
q->now_rt = now;
for (;;) {
@@ -973,28 +974,29 @@ cbq_dequeue(struct Qdisc *sch)
skb = cbq_dequeue_1(sch);
if (skb) {
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
- sch->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(sch);
return skb;
}
/* All the classes are overlimit.
-
- It is possible, if:
-
- 1. Scheduler is empty.
- 2. Toplevel cutoff inhibited borrowing.
- 3. Root class is overlimit.
-
- Reset 2d and 3d conditions and retry.
-
- Note, that NS and cbq-2.0 are buggy, peeking
- an arbitrary class is appropriate for ancestor-only
- sharing, but not for toplevel algorithm.
-
- Our version is better, but slower, because it requires
- two passes, but it is unavoidable with top-level sharing.
- */
+ *
+ * It is possible, if:
+ *
+ * 1. Scheduler is empty.
+ * 2. Toplevel cutoff inhibited borrowing.
+ * 3. Root class is overlimit.
+ *
+ * Reset 2d and 3d conditions and retry.
+ *
+ * Note, that NS and cbq-2.0 are buggy, peeking
+ * an arbitrary class is appropriate for ancestor-only
+ * sharing, but not for toplevel algorithm.
+ *
+ * Our version is better, but slower, because it requires
+ * two passes, but it is unavoidable with top-level sharing.
+ */
if (q->toplevel == TC_CBQ_MAXLEVEL &&
q->link.undertime == PSCHED_PASTPERFECT)
@@ -1005,7 +1007,8 @@ cbq_dequeue(struct Qdisc *sch)
}
/* No packets in scheduler or nobody wants to give them to us :-(
- Sigh... start watchdog timer in the last case. */
+ * Sigh... start watchdog timer in the last case.
+ */
if (sch->q.qlen) {
sch->qstats.overlimits++;
@@ -1027,36 +1030,38 @@ static void cbq_adjust_levels(struct cbq_class *this)
int level = 0;
struct cbq_class *cl;
- if ((cl = this->children) != NULL) {
+ cl = this->children;
+ if (cl) {
do {
if (cl->level > level)
level = cl->level;
} while ((cl = cl->sibling) != this->children);
}
- this->level = level+1;
+ this->level = level + 1;
} while ((this = this->tparent) != NULL);
}
static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
{
struct cbq_class *cl;
- struct hlist_node *n;
unsigned int h;
if (q->quanta[prio] == 0)
return;
for (h = 0; h < q->clhash.hashsize; h++) {
- hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
+ hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
/* BUGGGG... Beware! This expression suffer of
- arithmetic overflows!
+ * arithmetic overflows!
*/
if (cl->priority == prio) {
cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
q->quanta[prio];
}
- if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
- printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum);
+ if (cl->quantum <= 0 ||
+ cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
+ pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
+ cl->common.classid, cl->quantum);
cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
}
}
@@ -1067,31 +1072,30 @@ static void cbq_sync_defmap(struct cbq_class *cl)
{
struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
struct cbq_class *split = cl->split;
- unsigned h;
+ unsigned int h;
int i;
if (split == NULL)
return;
- for (i=0; i<=TC_PRIO_MAX; i++) {
- if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
+ for (i = 0; i <= TC_PRIO_MAX; i++) {
+ if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
split->defaults[i] = NULL;
}
- for (i=0; i<=TC_PRIO_MAX; i++) {
+ for (i = 0; i <= TC_PRIO_MAX; i++) {
int level = split->level;
if (split->defaults[i])
continue;
for (h = 0; h < q->clhash.hashsize; h++) {
- struct hlist_node *n;
struct cbq_class *c;
- hlist_for_each_entry(c, n, &q->clhash.hash[h],
+ hlist_for_each_entry(c, &q->clhash.hash[h],
common.hnode) {
if (c->split == split && c->level < level &&
- c->defmap&(1<<i)) {
+ c->defmap & (1<<i)) {
split->defaults[i] = c;
level = c->level;
}
@@ -1105,7 +1109,8 @@ static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 ma
struct cbq_class *split = NULL;
if (splitid == 0) {
- if ((split = cl->split) == NULL)
+ split = cl->split;
+ if (!split)
return;
splitid = split->common.classid;
}
@@ -1123,9 +1128,9 @@ static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 ma
cl->defmap = 0;
cbq_sync_defmap(cl);
cl->split = split;
- cl->defmap = def&mask;
+ cl->defmap = def & mask;
} else
- cl->defmap = (cl->defmap&~mask)|(def&mask);
+ cl->defmap = (cl->defmap & ~mask) | (def & mask);
cbq_sync_defmap(cl);
}
@@ -1138,7 +1143,7 @@ static void cbq_unlink_class(struct cbq_class *this)
qdisc_class_hash_remove(&q->clhash, &this->common);
if (this->tparent) {
- clp=&this->sibling;
+ clp = &this->sibling;
cl = *clp;
do {
if (cl == this) {
@@ -1177,7 +1182,7 @@ static void cbq_link_class(struct cbq_class *this)
}
}
-static unsigned int cbq_drop(struct Qdisc* sch)
+static unsigned int cbq_drop(struct Qdisc *sch)
{
struct cbq_sched_data *q = qdisc_priv(sch);
struct cbq_class *cl, *cl_head;
@@ -1185,7 +1190,8 @@ static unsigned int cbq_drop(struct Qdisc* sch)
unsigned int len;
for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
- if ((cl_head = q->active[prio]) == NULL)
+ cl_head = q->active[prio];
+ if (!cl_head)
continue;
cl = cl_head;
@@ -1202,13 +1208,12 @@ static unsigned int cbq_drop(struct Qdisc* sch)
}
static void
-cbq_reset(struct Qdisc* sch)
+cbq_reset(struct Qdisc *sch)
{
struct cbq_sched_data *q = qdisc_priv(sch);
struct cbq_class *cl;
- struct hlist_node *n;
int prio;
- unsigned h;
+ unsigned int h;
q->activemask = 0;
q->pmask = 0;
@@ -1224,7 +1229,7 @@ cbq_reset(struct Qdisc* sch)
q->active[prio] = NULL;
for (h = 0; h < q->clhash.hashsize; h++) {
- hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
+ hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
qdisc_reset(cl->q);
cl->next_alive = NULL;
@@ -1240,21 +1245,21 @@ cbq_reset(struct Qdisc* sch)
static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
{
- if (lss->change&TCF_CBQ_LSS_FLAGS) {
- cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
- cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
+ if (lss->change & TCF_CBQ_LSS_FLAGS) {
+ cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
+ cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
}
- if (lss->change&TCF_CBQ_LSS_EWMA)
+ if (lss->change & TCF_CBQ_LSS_EWMA)
cl->ewma_log = lss->ewma_log;
- if (lss->change&TCF_CBQ_LSS_AVPKT)
+ if (lss->change & TCF_CBQ_LSS_AVPKT)
cl->avpkt = lss->avpkt;
- if (lss->change&TCF_CBQ_LSS_MINIDLE)
+ if (lss->change & TCF_CBQ_LSS_MINIDLE)
cl->minidle = -(long)lss->minidle;
- if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
+ if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
cl->maxidle = lss->maxidle;
cl->avgidle = lss->maxidle;
}
- if (lss->change&TCF_CBQ_LSS_OFFTIME)
+ if (lss->change & TCF_CBQ_LSS_OFFTIME)
cl->offtime = lss->offtime;
return 0;
}
@@ -1282,10 +1287,10 @@ static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
if (wrr->weight)
cl->weight = wrr->weight;
if (wrr->priority) {
- cl->priority = wrr->priority-1;
+ cl->priority = wrr->priority - 1;
cl->cpriority = cl->priority;
if (cl->priority >= cl->priority2)
- cl->priority2 = TC_CBQ_MAXPRIO-1;
+ cl->priority2 = TC_CBQ_MAXPRIO - 1;
}
cbq_addprio(q, cl);
@@ -1302,10 +1307,10 @@ static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
cl->overlimit = cbq_ovl_delay;
break;
case TC_CBQ_OVL_LOWPRIO:
- if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
- ovl->priority2-1 <= cl->priority)
+ if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO ||
+ ovl->priority2 - 1 <= cl->priority)
return -EINVAL;
- cl->priority2 = ovl->priority2-1;
+ cl->priority2 = ovl->priority2 - 1;
cl->overlimit = cbq_ovl_lowprio;
break;
case TC_CBQ_OVL_DROP:
@@ -1384,9 +1389,9 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
if (!q->link.q)
q->link.q = &noop_qdisc;
- q->link.priority = TC_CBQ_MAXPRIO-1;
- q->link.priority2 = TC_CBQ_MAXPRIO-1;
- q->link.cpriority = TC_CBQ_MAXPRIO-1;
+ q->link.priority = TC_CBQ_MAXPRIO - 1;
+ q->link.priority2 = TC_CBQ_MAXPRIO - 1;
+ q->link.cpriority = TC_CBQ_MAXPRIO - 1;
q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
q->link.overlimit = cbq_ovl_classic;
q->link.allot = psched_mtu(qdisc_dev(sch));
@@ -1417,11 +1422,12 @@ put_rtab:
return err;
}
-static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
- NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
+ if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
+ goto nla_put_failure;
return skb->len;
nla_put_failure:
@@ -1429,7 +1435,7 @@ nla_put_failure:
return -1;
}
-static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
struct tc_cbq_lssopt opt;
@@ -1446,7 +1452,8 @@ static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
opt.minidle = (u32)(-cl->minidle);
opt.offtime = cl->offtime;
opt.change = ~0;
- NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
+ if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
+ goto nla_put_failure;
return skb->len;
nla_put_failure:
@@ -1454,17 +1461,19 @@ nla_put_failure:
return -1;
}
-static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
struct tc_cbq_wrropt opt;
+ memset(&opt, 0, sizeof(opt));
opt.flags = 0;
opt.allot = cl->allot;
- opt.priority = cl->priority+1;
- opt.cpriority = cl->cpriority+1;
+ opt.priority = cl->priority + 1;
+ opt.cpriority = cl->cpriority + 1;
opt.weight = cl->weight;
- NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
+ if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
+ goto nla_put_failure;
return skb->len;
nla_put_failure:
@@ -1472,16 +1481,17 @@ nla_put_failure:
return -1;
}
-static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
struct tc_cbq_ovl opt;
opt.strategy = cl->ovl_strategy;
- opt.priority2 = cl->priority2+1;
+ opt.priority2 = cl->priority2 + 1;
opt.pad = 0;
opt.penalty = cl->penalty;
- NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
+ if (nla_put(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt))
+ goto nla_put_failure;
return skb->len;
nla_put_failure:
@@ -1489,7 +1499,7 @@ nla_put_failure:
return -1;
}
-static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
struct tc_cbq_fopt opt;
@@ -1498,7 +1508,8 @@ static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
opt.split = cl->split ? cl->split->common.classid : 0;
opt.defmap = cl->defmap;
opt.defchange = ~0;
- NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
+ if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
+ goto nla_put_failure;
}
return skb->len;
@@ -1508,7 +1519,7 @@ nla_put_failure:
}
#ifdef CONFIG_NET_CLS_ACT
-static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
struct tc_cbq_police opt;
@@ -1517,7 +1528,8 @@ static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
opt.police = cl->police;
opt.__res1 = 0;
opt.__res2 = 0;
- NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
+ if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt))
+ goto nla_put_failure;
}
return skb->len;
@@ -1551,8 +1563,7 @@ static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
goto nla_put_failure;
if (cbq_dump_attr(skb, &q->link) < 0)
goto nla_put_failure;
- nla_nest_end(skb, nest);
- return skb->len;
+ return nla_nest_end(skb, nest);
nla_put_failure:
nla_nest_cancel(skb, nest);
@@ -1572,7 +1583,7 @@ static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
struct sk_buff *skb, struct tcmsg *tcm)
{
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
struct nlattr *nest;
if (cl->tparent)
@@ -1587,8 +1598,7 @@ cbq_dump_class(struct Qdisc *sch, unsigned long arg,
goto nla_put_failure;
if (cbq_dump_attr(skb, cl) < 0)
goto nla_put_failure;
- nla_nest_end(skb, nest);
- return skb->len;
+ return nla_nest_end(skb, nest);
nla_put_failure:
nla_nest_cancel(skb, nest);
@@ -1600,7 +1610,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
struct gnet_dump *d)
{
struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
cl->qstats.qlen = cl->q->q.qlen;
cl->xstats.avgidle = cl->avgidle;
@@ -1620,7 +1630,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old)
{
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
if (new == NULL) {
new = qdisc_create_dflt(sch->dev_queue,
@@ -1643,10 +1653,9 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
return 0;
}
-static struct Qdisc *
-cbq_leaf(struct Qdisc *sch, unsigned long arg)
+static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
{
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
return cl->q;
}
@@ -1685,13 +1694,12 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
kfree(cl);
}
-static void
-cbq_destroy(struct Qdisc* sch)
+static void cbq_destroy(struct Qdisc *sch)
{
struct cbq_sched_data *q = qdisc_priv(sch);
- struct hlist_node *n, *next;
+ struct hlist_node *next;
struct cbq_class *cl;
- unsigned h;
+ unsigned int h;
#ifdef CONFIG_NET_CLS_ACT
q->rx_class = NULL;
@@ -1702,11 +1710,11 @@ cbq_destroy(struct Qdisc* sch)
* be bound to classes which have been destroyed already. --TGR '04
*/
for (h = 0; h < q->clhash.hashsize; h++) {
- hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode)
+ hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)
tcf_destroy_chain(&cl->filter_list);
}
for (h = 0; h < q->clhash.hashsize; h++) {
- hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h],
+ hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
common.hnode)
cbq_destroy_class(sch, cl);
}
@@ -1715,7 +1723,7 @@ cbq_destroy(struct Qdisc* sch)
static void cbq_put(struct Qdisc *sch, unsigned long arg)
{
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
if (--cl->refcnt == 0) {
#ifdef CONFIG_NET_CLS_ACT
@@ -1738,7 +1746,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
{
int err;
struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = (struct cbq_class*)*arg;
+ struct cbq_class *cl = (struct cbq_class *)*arg;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_CBQ_MAX + 1];
struct cbq_class *parent;
@@ -1773,8 +1781,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
qdisc_root_sleeping_lock(sch),
tca[TCA_RATE]);
if (err) {
- if (rtab)
- qdisc_put_rtab(rtab);
+ qdisc_put_rtab(rtab);
return err;
}
}
@@ -1830,13 +1837,14 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
if (classid) {
err = -EINVAL;
- if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
+ if (TC_H_MAJ(classid ^ sch->handle) ||
+ cbq_class_lookup(q, classid))
goto failure;
} else {
int i;
- classid = TC_H_MAKE(sch->handle,0x8000);
+ classid = TC_H_MAKE(sch->handle, 0x8000);
- for (i=0; i<0x8000; i++) {
+ for (i = 0; i < 0x8000; i++) {
if (++q->hgenerator >= 0x8000)
q->hgenerator = 1;
if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
@@ -1893,11 +1901,11 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
cl->minidle = -0x7FFFFFFF;
cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
- if (cl->ewma_log==0)
+ if (cl->ewma_log == 0)
cl->ewma_log = q->link.ewma_log;
- if (cl->maxidle==0)
+ if (cl->maxidle == 0)
cl->maxidle = q->link.maxidle;
- if (cl->avpkt==0)
+ if (cl->avpkt == 0)
cl->avpkt = q->link.avpkt;
cl->overlimit = cbq_ovl_classic;
if (tb[TCA_CBQ_OVL_STRATEGY])
@@ -1923,7 +1931,7 @@ failure:
static int cbq_delete(struct Qdisc *sch, unsigned long arg)
{
struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
unsigned int qlen;
if (cl->filters || cl->children || cl == &q->link)
@@ -1981,7 +1989,7 @@ static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *p = (struct cbq_class*)parent;
+ struct cbq_class *p = (struct cbq_class *)parent;
struct cbq_class *cl = cbq_class_lookup(q, classid);
if (cl) {
@@ -1995,7 +2003,7 @@ static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
cl->filters--;
}
@@ -2004,14 +2012,13 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct cbq_sched_data *q = qdisc_priv(sch);
struct cbq_class *cl;
- struct hlist_node *n;
- unsigned h;
+ unsigned int h;
if (arg->stop)
return;
for (h = 0; h < q->clhash.hashsize; h++) {
- hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
+ hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
if (arg->count < arg->skip) {
arg->count++;
continue;
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
new file mode 100644
index 00000000000..ed30e436128
--- /dev/null
+++ b/net/sched/sch_choke.c
@@ -0,0 +1,632 @@
+/*
+ * net/sched/sch_choke.c CHOKE scheduler
+ *
+ * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+#include <net/flow_keys.h>
+
+/*
+ CHOKe stateless AQM for fair bandwidth allocation
+ =================================================
+
+ CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
+ unresponsive flows) is a variant of RED that penalizes misbehaving flows but
+ maintains no flow state. The difference from RED is an additional step
+ during the enqueuing process. If average queue size is over the
+ low threshold (qmin), a packet is chosen at random from the queue.
+ If both the new and chosen packet are from the same flow, both
+ are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
+ needs to access packets in queue randomly. It has a minimal class
+ interface to allow overriding the builtin flow classifier with
+ filters.
+
+ Source:
+ R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
+ Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
+ IEEE INFOCOM, 2000.
+
+ A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
+ Characteristics", IEEE/ACM Transactions on Networking, 2004
+
+ */
+
+/* Upper bound on size of sk_buff table (packets) */
+#define CHOKE_MAX_QUEUE (128*1024 - 1)
+
+struct choke_sched_data {
+/* Parameters */
+ u32 limit;
+ unsigned char flags;
+
+ struct red_parms parms;
+
+/* Variables */
+ struct red_vars vars;
+ struct tcf_proto *filter_list;
+ struct {
+ u32 prob_drop; /* Early probability drops */
+ u32 prob_mark; /* Early probability marks */
+ u32 forced_drop; /* Forced drops, qavg > max_thresh */
+ u32 forced_mark; /* Forced marks, qavg > max_thresh */
+ u32 pdrop; /* Drops due to queue limits */
+ u32 other; /* Drops due to drop() calls */
+ u32 matched; /* Drops to flow match */
+ } stats;
+
+ unsigned int head;
+ unsigned int tail;
+
+ unsigned int tab_mask; /* size - 1 */
+
+ struct sk_buff **tab;
+};
+
+/* number of elements in queue including holes */
+static unsigned int choke_len(const struct choke_sched_data *q)
+{
+ return (q->tail - q->head) & q->tab_mask;
+}
+
+/* Is ECN parameter configured */
+static int use_ecn(const struct choke_sched_data *q)
+{
+ return q->flags & TC_RED_ECN;
+}
+
+/* Should packets over max just be dropped (versus marked) */
+static int use_harddrop(const struct choke_sched_data *q)
+{
+ return q->flags & TC_RED_HARDDROP;
+}
+
+/* Move head pointer forward to skip over holes */
+static void choke_zap_head_holes(struct choke_sched_data *q)
+{
+ do {
+ q->head = (q->head + 1) & q->tab_mask;
+ if (q->head == q->tail)
+ break;
+ } while (q->tab[q->head] == NULL);
+}
+
+/* Move tail pointer backwards to reuse holes */
+static void choke_zap_tail_holes(struct choke_sched_data *q)
+{
+ do {
+ q->tail = (q->tail - 1) & q->tab_mask;
+ if (q->head == q->tail)
+ break;
+ } while (q->tab[q->tail] == NULL);
+}
+
+/* Drop packet from queue array by creating a "hole" */
+static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = q->tab[idx];
+
+ q->tab[idx] = NULL;
+
+ if (idx == q->head)
+ choke_zap_head_holes(q);
+ if (idx == q->tail)
+ choke_zap_tail_holes(q);
+
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ qdisc_drop(skb, sch);
+ qdisc_tree_decrease_qlen(sch, 1);
+ --sch->q.qlen;
+}
+
+struct choke_skb_cb {
+ u16 classid;
+ u8 keys_valid;
+ struct flow_keys keys;
+};
+
+static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
+ return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
+{
+ choke_skb_cb(skb)->classid = classid;
+}
+
+static u16 choke_get_classid(const struct sk_buff *skb)
+{
+ return choke_skb_cb(skb)->classid;
+}
+
+/*
+ * Compare flow of two packets
+ * Returns true only if source and destination address and port match.
+ * false for special cases
+ */
+static bool choke_match_flow(struct sk_buff *skb1,
+ struct sk_buff *skb2)
+{
+ if (skb1->protocol != skb2->protocol)
+ return false;
+
+ if (!choke_skb_cb(skb1)->keys_valid) {
+ choke_skb_cb(skb1)->keys_valid = 1;
+ skb_flow_dissect(skb1, &choke_skb_cb(skb1)->keys);
+ }
+
+ if (!choke_skb_cb(skb2)->keys_valid) {
+ choke_skb_cb(skb2)->keys_valid = 1;
+ skb_flow_dissect(skb2, &choke_skb_cb(skb2)->keys);
+ }
+
+ return !memcmp(&choke_skb_cb(skb1)->keys,
+ &choke_skb_cb(skb2)->keys,
+ sizeof(struct flow_keys));
+}
+
+/*
+ * Classify flow using either:
+ * 1. pre-existing classification result in skb
+ * 2. fast internal classification
+ * 3. use TC filter based classification
+ */
+static bool choke_classify(struct sk_buff *skb,
+ struct Qdisc *sch, int *qerr)
+
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct tcf_result res;
+ int result;
+
+ result = tc_classify(skb, q->filter_list, &res);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return false;
+ }
+#endif
+ choke_set_classid(skb, TC_H_MIN(res.classid));
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Select a packet at random from queue
+ * HACK: since queue can have holes from previous deletion; retry several
+ * times to find a random skb but then just give up and return the head
+ * Will return NULL if queue is empty (q->head == q->tail)
+ */
+static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
+ unsigned int *pidx)
+{
+ struct sk_buff *skb;
+ int retrys = 3;
+
+ do {
+ *pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
+ skb = q->tab[*pidx];
+ if (skb)
+ return skb;
+ } while (--retrys > 0);
+
+ return q->tab[*pidx = q->head];
+}
+
+/*
+ * Compare new packet with random packet in queue
+ * returns true if matched and sets *pidx
+ */
+static bool choke_match_random(const struct choke_sched_data *q,
+ struct sk_buff *nskb,
+ unsigned int *pidx)
+{
+ struct sk_buff *oskb;
+
+ if (q->head == q->tail)
+ return false;
+
+ oskb = choke_peek_random(q, pidx);
+ if (q->filter_list)
+ return choke_get_classid(nskb) == choke_get_classid(oskb);
+
+ return choke_match_flow(oskb, nskb);
+}
+
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ const struct red_parms *p = &q->parms;
+ int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+
+ if (q->filter_list) {
+ /* If using external classifiers, get result and record it. */
+ if (!choke_classify(skb, sch, &ret))
+ goto other_drop; /* Packet was eaten by filter */
+ }
+
+ choke_skb_cb(skb)->keys_valid = 0;
+ /* Compute average queue usage (see RED) */
+ q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
+ if (red_is_idling(&q->vars))
+ red_end_of_idle_period(&q->vars);
+
+ /* Is queue small? */
+ if (q->vars.qavg <= p->qth_min)
+ q->vars.qcount = -1;
+ else {
+ unsigned int idx;
+
+ /* Draw a packet at random from queue and compare flow */
+ if (choke_match_random(q, skb, &idx)) {
+ q->stats.matched++;
+ choke_drop_by_idx(sch, idx);
+ goto congestion_drop;
+ }
+
+ /* Queue is large, always mark/drop */
+ if (q->vars.qavg > p->qth_max) {
+ q->vars.qcount = -1;
+
+ sch->qstats.overlimits++;
+ if (use_harddrop(q) || !use_ecn(q) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.forced_mark++;
+ } else if (++q->vars.qcount) {
+ if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
+ q->vars.qcount = 0;
+ q->vars.qR = red_random(p);
+
+ sch->qstats.overlimits++;
+ if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.prob_mark++;
+ }
+ } else
+ q->vars.qR = red_random(p);
+ }
+
+ /* Admit new packet */
+ if (sch->q.qlen < q->limit) {
+ q->tab[q->tail] = skb;
+ q->tail = (q->tail + 1) & q->tab_mask;
+ ++sch->q.qlen;
+ sch->qstats.backlog += qdisc_pkt_len(skb);
+ return NET_XMIT_SUCCESS;
+ }
+
+ q->stats.pdrop++;
+ return qdisc_drop(skb, sch);
+
+congestion_drop:
+ qdisc_drop(skb, sch);
+ return NET_XMIT_CN;
+
+other_drop:
+ if (ret & __NET_XMIT_BYPASS)
+ sch->qstats.drops++;
+ kfree_skb(skb);
+ return ret;
+}
+
+static struct sk_buff *choke_dequeue(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ if (q->head == q->tail) {
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
+ return NULL;
+ }
+
+ skb = q->tab[q->head];
+ q->tab[q->head] = NULL;
+ choke_zap_head_holes(q);
+ --sch->q.qlen;
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ qdisc_bstats_update(sch, skb);
+
+ return skb;
+}
+
+static unsigned int choke_drop(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ unsigned int len;
+
+ len = qdisc_queue_drop(sch);
+ if (len > 0)
+ q->stats.other++;
+ else {
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
+ }
+
+ return len;
+}
+
+static void choke_reset(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ red_restart(&q->vars);
+}
+
+static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
+ [TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) },
+ [TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE },
+ [TCA_CHOKE_MAX_P] = { .type = NLA_U32 },
+};
+
+
+static void choke_free(void *addr)
+{
+ kvfree(addr);
+}
+
+static int choke_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_CHOKE_MAX + 1];
+ const struct tc_red_qopt *ctl;
+ int err;
+ struct sk_buff **old = NULL;
+ unsigned int mask;
+ u32 max_P;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_CHOKE_PARMS] == NULL ||
+ tb[TCA_CHOKE_STAB] == NULL)
+ return -EINVAL;
+
+ max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
+
+ ctl = nla_data(tb[TCA_CHOKE_PARMS]);
+
+ if (ctl->limit > CHOKE_MAX_QUEUE)
+ return -EINVAL;
+
+ mask = roundup_pow_of_two(ctl->limit + 1) - 1;
+ if (mask != q->tab_mask) {
+ struct sk_buff **ntab;
+
+ ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!ntab)
+ ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
+ if (!ntab)
+ return -ENOMEM;
+
+ sch_tree_lock(sch);
+ old = q->tab;
+ if (old) {
+ unsigned int oqlen = sch->q.qlen, tail = 0;
+
+ while (q->head != q->tail) {
+ struct sk_buff *skb = q->tab[q->head];
+
+ q->head = (q->head + 1) & q->tab_mask;
+ if (!skb)
+ continue;
+ if (tail < mask) {
+ ntab[tail++] = skb;
+ continue;
+ }
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ --sch->q.qlen;
+ qdisc_drop(skb, sch);
+ }
+ qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
+ q->head = 0;
+ q->tail = tail;
+ }
+
+ q->tab_mask = mask;
+ q->tab = ntab;
+ } else
+ sch_tree_lock(sch);
+
+ q->flags = ctl->flags;
+ q->limit = ctl->limit;
+
+ red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+ ctl->Plog, ctl->Scell_log,
+ nla_data(tb[TCA_CHOKE_STAB]),
+ max_P);
+ red_set_vars(&q->vars);
+
+ if (q->head == q->tail)
+ red_end_of_idle_period(&q->vars);
+
+ sch_tree_unlock(sch);
+ choke_free(old);
+ return 0;
+}
+
+static int choke_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ return choke_change(sch, opt);
+}
+
+static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts = NULL;
+ struct tc_red_qopt opt = {
+ .limit = q->limit,
+ .flags = q->flags,
+ .qth_min = q->parms.qth_min >> q->parms.Wlog,
+ .qth_max = q->parms.qth_max >> q->parms.Wlog,
+ .Wlog = q->parms.Wlog,
+ .Plog = q->parms.Plog,
+ .Scell_log = q->parms.Scell_log,
+ };
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
+ nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
+ goto nla_put_failure;
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct tc_choke_xstats st = {
+ .early = q->stats.prob_drop + q->stats.forced_drop,
+ .marked = q->stats.prob_mark + q->stats.forced_mark,
+ .pdrop = q->stats.pdrop,
+ .other = q->stats.other,
+ .matched = q->stats.matched,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void choke_destroy(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&q->filter_list);
+ choke_free(q->tab);
+}
+
+static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long choke_get(struct Qdisc *sch, u32 classid)
+{
+ return 0;
+}
+
+static void choke_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return 0;
+}
+
+static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return &q->filter_list;
+}
+
+static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ if (!arg->stop) {
+ if (arg->fn(sch, 1, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops choke_class_ops = {
+ .leaf = choke_leaf,
+ .get = choke_get,
+ .put = choke_put,
+ .tcf_chain = choke_find_tcf,
+ .bind_tcf = choke_bind,
+ .unbind_tcf = choke_put,
+ .dump = choke_dump_class,
+ .walk = choke_walk,
+};
+
+static struct sk_buff *choke_peek_head(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ return (q->head != q->tail) ? q->tab[q->head] : NULL;
+}
+
+static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
+ .id = "choke",
+ .priv_size = sizeof(struct choke_sched_data),
+
+ .enqueue = choke_enqueue,
+ .dequeue = choke_dequeue,
+ .peek = choke_peek_head,
+ .drop = choke_drop,
+ .init = choke_init,
+ .destroy = choke_destroy,
+ .reset = choke_reset,
+ .change = choke_change,
+ .dump = choke_dump,
+ .dump_stats = choke_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init choke_module_init(void)
+{
+ return register_qdisc(&choke_qdisc_ops);
+}
+
+static void __exit choke_module_exit(void)
+{
+ unregister_qdisc(&choke_qdisc_ops);
+}
+
+module_init(choke_module_init)
+module_exit(choke_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
new file mode 100644
index 00000000000..2f9ab17db85
--- /dev/null
+++ b/net/sched/sch_codel.c
@@ -0,0 +1,276 @@
+/*
+ * Codel - The Controlled-Delay Active Queue Management algorithm
+ *
+ * Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com>
+ * Copyright (C) 2011-2012 Van Jacobson <van@pollere.net>
+ *
+ * Implemented on linux by :
+ * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
+ * Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The names of the authors may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/prefetch.h>
+#include <net/pkt_sched.h>
+#include <net/codel.h>
+
+
+#define DEFAULT_CODEL_LIMIT 1000
+
+struct codel_sched_data {
+ struct codel_params params;
+ struct codel_vars vars;
+ struct codel_stats stats;
+ u32 drop_overlimit;
+};
+
+/* This is the specific function called from codel_dequeue()
+ * to dequeue a packet from queue. Note: backlog is handled in
+ * codel, we dont need to reduce it here.
+ */
+static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
+{
+ struct sk_buff *skb = __skb_dequeue(&sch->q);
+
+ prefetch(&skb->end); /* we'll need skb_shinfo() */
+ return skb;
+}
+
+static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue);
+
+ /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0,
+ * or HTB crashes. Defer it for next round.
+ */
+ if (q->stats.drop_count && sch->q.qlen) {
+ qdisc_tree_decrease_qlen(sch, q->stats.drop_count);
+ q->stats.drop_count = 0;
+ }
+ if (skb)
+ qdisc_bstats_update(sch, skb);
+ return skb;
+}
+
+static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct codel_sched_data *q;
+
+ if (likely(qdisc_qlen(sch) < sch->limit)) {
+ codel_set_enqueue_time(skb);
+ return qdisc_enqueue_tail(skb, sch);
+ }
+ q = qdisc_priv(sch);
+ q->drop_overlimit++;
+ return qdisc_drop(skb, sch);
+}
+
+static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
+ [TCA_CODEL_TARGET] = { .type = NLA_U32 },
+ [TCA_CODEL_LIMIT] = { .type = NLA_U32 },
+ [TCA_CODEL_INTERVAL] = { .type = NLA_U32 },
+ [TCA_CODEL_ECN] = { .type = NLA_U32 },
+};
+
+static int codel_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_CODEL_MAX + 1];
+ unsigned int qlen;
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy);
+ if (err < 0)
+ return err;
+
+ sch_tree_lock(sch);
+
+ if (tb[TCA_CODEL_TARGET]) {
+ u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]);
+
+ q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT;
+ }
+
+ if (tb[TCA_CODEL_INTERVAL]) {
+ u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]);
+
+ q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT;
+ }
+
+ if (tb[TCA_CODEL_LIMIT])
+ sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]);
+
+ if (tb[TCA_CODEL_ECN])
+ q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]);
+
+ qlen = sch->q.qlen;
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = __skb_dequeue(&sch->q);
+
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ qdisc_drop(skb, sch);
+ }
+ qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static int codel_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+
+ sch->limit = DEFAULT_CODEL_LIMIT;
+
+ codel_params_init(&q->params);
+ codel_vars_init(&q->vars);
+ codel_stats_init(&q->stats);
+
+ if (opt) {
+ int err = codel_change(sch, opt);
+
+ if (err)
+ return err;
+ }
+
+ if (sch->limit >= 1)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+ return 0;
+}
+
+static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CODEL_TARGET,
+ codel_time_to_us(q->params.target)) ||
+ nla_put_u32(skb, TCA_CODEL_LIMIT,
+ sch->limit) ||
+ nla_put_u32(skb, TCA_CODEL_INTERVAL,
+ codel_time_to_us(q->params.interval)) ||
+ nla_put_u32(skb, TCA_CODEL_ECN,
+ q->params.ecn))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -1;
+}
+
+static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ const struct codel_sched_data *q = qdisc_priv(sch);
+ struct tc_codel_xstats st = {
+ .maxpacket = q->stats.maxpacket,
+ .count = q->vars.count,
+ .lastcount = q->vars.lastcount,
+ .drop_overlimit = q->drop_overlimit,
+ .ldelay = codel_time_to_us(q->vars.ldelay),
+ .dropping = q->vars.dropping,
+ .ecn_mark = q->stats.ecn_mark,
+ };
+
+ if (q->vars.dropping) {
+ codel_tdiff_t delta = q->vars.drop_next - codel_get_time();
+
+ if (delta >= 0)
+ st.drop_next = codel_time_to_us(delta);
+ else
+ st.drop_next = -codel_time_to_us(-delta);
+ }
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void codel_reset(struct Qdisc *sch)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+
+ qdisc_reset_queue(sch);
+ codel_vars_init(&q->vars);
+}
+
+static struct Qdisc_ops codel_qdisc_ops __read_mostly = {
+ .id = "codel",
+ .priv_size = sizeof(struct codel_sched_data),
+
+ .enqueue = codel_qdisc_enqueue,
+ .dequeue = codel_qdisc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = codel_init,
+ .reset = codel_reset,
+ .change = codel_change,
+ .dump = codel_dump,
+ .dump_stats = codel_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init codel_module_init(void)
+{
+ return register_qdisc(&codel_qdisc_ops);
+}
+
+static void __exit codel_module_exit(void)
+{
+ unregister_qdisc(&codel_qdisc_ops);
+}
+
+module_init(codel_module_init)
+module_exit(codel_module_exit)
+
+MODULE_DESCRIPTION("Controlled Delay queue discipline");
+MODULE_AUTHOR("Dave Taht");
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index aa8b5313f8c..7bbbfe11219 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -25,7 +25,7 @@ struct drr_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
+ struct gnet_stats_rate_est64 rate_est;
struct list_head alist;
struct Qdisc *qdisc;
@@ -260,7 +260,8 @@ static int drr_dump_class(struct Qdisc *sch, unsigned long arg,
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
- NLA_PUT_U32(skb, TCA_DRR_QUANTUM, cl->quantum);
+ if (nla_put_u32(skb, TCA_DRR_QUANTUM, cl->quantum))
+ goto nla_put_failure;
return nla_nest_end(skb, nest);
nla_put_failure:
@@ -292,14 +293,13 @@ static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
- struct hlist_node *n;
unsigned int i;
if (arg->stop)
return;
for (i = 0; i < q->clhash.hashsize; i++) {
- hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
if (arg->count < arg->skip) {
arg->count++;
continue;
@@ -351,8 +351,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
- unsigned int len;
- int err;
+ int err = 0;
cl = drr_classify(skb, sch, &err);
if (cl == NULL) {
@@ -362,7 +361,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return err;
}
- len = qdisc_pkt_len(skb);
err = qdisc_enqueue(skb, cl->qdisc);
if (unlikely(err != NET_XMIT_SUCCESS)) {
if (net_xmit_drop_count(err)) {
@@ -377,11 +375,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
cl->deficit = cl->quantum;
}
- cl->bstats.packets++;
- cl->bstats.bytes += len;
- sch->bstats.packets++;
- sch->bstats.bytes += len;
-
sch->q.qlen++;
return err;
}
@@ -398,8 +391,10 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)
while (1) {
cl = list_first_entry(&q->active, struct drr_class, alist);
skb = cl->qdisc->ops->peek(cl->qdisc);
- if (skb == NULL)
+ if (skb == NULL) {
+ qdisc_warn_nonwc(__func__, cl->qdisc);
goto out;
+ }
len = qdisc_pkt_len(skb);
if (len <= cl->deficit) {
@@ -407,6 +402,9 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)
skb = qdisc_dequeue_peeked(cl->qdisc);
if (cl->qdisc->q.qlen == 0)
list_del(&cl->alist);
+
+ bstats_update(&cl->bstats, skb);
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
}
@@ -454,11 +452,10 @@ static void drr_reset_qdisc(struct Qdisc *sch)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
- struct hlist_node *n;
unsigned int i;
for (i = 0; i < q->clhash.hashsize; i++) {
- hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
if (cl->qdisc->q.qlen)
list_del(&cl->alist);
qdisc_reset(cl->qdisc);
@@ -471,13 +468,13 @@ static void drr_destroy_qdisc(struct Qdisc *sch)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
- struct hlist_node *n, *next;
+ struct hlist_node *next;
unsigned int i;
tcf_destroy_chain(&q->filter_list);
for (i = 0; i < q->clhash.hashsize; i++) {
- hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
+ hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
common.hnode)
drr_destroy_class(sch, cl);
}
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 1d295d62bb5..49d6ef338b5 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -47,7 +47,7 @@ struct dsmark_qdisc_data {
static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
{
- return (index <= p->indices && index > 0);
+ return index <= p->indices && index > 0;
}
/* ------------------------- Class/flow operations ------------------------- */
@@ -57,8 +57,8 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
- pr_debug("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",
- sch, p, new, old);
+ pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n",
+ __func__, sch, p, new, old);
if (new == NULL) {
new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
@@ -85,8 +85,8 @@ static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
static unsigned long dsmark_get(struct Qdisc *sch, u32 classid)
{
- pr_debug("dsmark_get(sch %p,[qdisc %p],classid %x)\n",
- sch, qdisc_priv(sch), classid);
+ pr_debug("%s(sch %p,[qdisc %p],classid %x)\n",
+ __func__, sch, qdisc_priv(sch), classid);
return TC_H_MIN(classid) + 1;
}
@@ -118,8 +118,8 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
int err = -EINVAL;
u8 mask = 0;
- pr_debug("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x),"
- "arg 0x%lx\n", sch, p, classid, parent, *arg);
+ pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n",
+ __func__, sch, p, classid, parent, *arg);
if (!dsmark_valid_index(p, *arg)) {
err = -ENOENT;
@@ -137,10 +137,10 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
mask = nla_get_u8(tb[TCA_DSMARK_MASK]);
if (tb[TCA_DSMARK_VALUE])
- p->value[*arg-1] = nla_get_u8(tb[TCA_DSMARK_VALUE]);
+ p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]);
if (tb[TCA_DSMARK_MASK])
- p->mask[*arg-1] = mask;
+ p->mask[*arg - 1] = mask;
err = 0;
@@ -155,8 +155,8 @@ static int dsmark_delete(struct Qdisc *sch, unsigned long arg)
if (!dsmark_valid_index(p, arg))
return -EINVAL;
- p->mask[arg-1] = 0xff;
- p->value[arg-1] = 0;
+ p->mask[arg - 1] = 0xff;
+ p->value[arg - 1] = 0;
return 0;
}
@@ -166,7 +166,8 @@ static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)
struct dsmark_qdisc_data *p = qdisc_priv(sch);
int i;
- pr_debug("dsmark_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
+ pr_debug("%s(sch %p,[qdisc %p],walker %p)\n",
+ __func__, sch, p, walker);
if (walker->stop)
return;
@@ -175,7 +176,7 @@ static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)
if (p->mask[i] == 0xff && !p->value[i])
goto ignore;
if (walker->count >= walker->skip) {
- if (walker->fn(sch, i+1, walker) < 0) {
+ if (walker->fn(sch, i + 1, walker) < 0) {
walker->stop = 1;
break;
}
@@ -199,7 +200,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
struct dsmark_qdisc_data *p = qdisc_priv(sch);
int err;
- pr_debug("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
+ pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p);
if (p->set_tc_index) {
switch (skb->protocol) {
@@ -260,15 +261,12 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return err;
}
- sch->bstats.bytes += qdisc_pkt_len(skb);
- sch->bstats.packets++;
sch->q.qlen++;
return NET_XMIT_SUCCESS;
drop:
- kfree_skb(skb);
- sch->qstats.drops++;
+ qdisc_drop(skb, sch);
return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
}
@@ -278,12 +276,13 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
struct sk_buff *skb;
u32 index;
- pr_debug("dsmark_dequeue(sch %p,[qdisc %p])\n", sch, p);
+ pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
skb = p->q->ops->dequeue(p->q);
if (skb == NULL)
return NULL;
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
index = skb->tc_index & (p->indices - 1);
@@ -305,9 +304,8 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
* and don't need yet another qdisc as a bypass.
*/
if (p->mask[index] != 0xff || p->value[index])
- printk(KERN_WARNING
- "dsmark_dequeue: unsupported protocol %d\n",
- ntohs(skb->protocol));
+ pr_warn("%s: unsupported protocol %d\n",
+ __func__, ntohs(skb->protocol));
break;
}
@@ -318,7 +316,7 @@ static struct sk_buff *dsmark_peek(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
- pr_debug("dsmark_peek(sch %p,[qdisc %p])\n", sch, p);
+ pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
return p->q->ops->peek(p->q);
}
@@ -328,7 +326,7 @@ static unsigned int dsmark_drop(struct Qdisc *sch)
struct dsmark_qdisc_data *p = qdisc_priv(sch);
unsigned int len;
- pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, p);
+ pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
if (p->q->ops->drop == NULL)
return 0;
@@ -349,7 +347,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
u16 indices;
u8 *mask;
- pr_debug("dsmark_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
+ pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt);
if (!opt)
goto errout;
@@ -387,7 +385,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
if (p->q == NULL)
p->q = &noop_qdisc;
- pr_debug("dsmark_init: qdisc %p\n", p->q);
+ pr_debug("%s: qdisc %p\n", __func__, p->q);
err = 0;
errout:
@@ -398,7 +396,7 @@ static void dsmark_reset(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
- pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, p);
+ pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
qdisc_reset(p->q);
sch->q.qlen = 0;
}
@@ -407,7 +405,7 @@ static void dsmark_destroy(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
- pr_debug("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p);
+ pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
tcf_destroy_chain(&p->filter_list);
qdisc_destroy(p->q);
@@ -420,19 +418,20 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
struct dsmark_qdisc_data *p = qdisc_priv(sch);
struct nlattr *opts = NULL;
- pr_debug("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n", sch, p, cl);
+ pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl);
if (!dsmark_valid_index(p, cl))
return -EINVAL;
- tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl-1);
+ tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1);
tcm->tcm_info = p->q->handle;
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
- NLA_PUT_U8(skb, TCA_DSMARK_MASK, p->mask[cl-1]);
- NLA_PUT_U8(skb, TCA_DSMARK_VALUE, p->value[cl-1]);
+ if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]) ||
+ nla_put_u8(skb, TCA_DSMARK_VALUE, p->value[cl - 1]))
+ goto nla_put_failure;
return nla_nest_end(skb, opts);
@@ -449,13 +448,16 @@ static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
- NLA_PUT_U16(skb, TCA_DSMARK_INDICES, p->indices);
+ if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices))
+ goto nla_put_failure;
- if (p->default_index != NO_DEFAULT_INDEX)
- NLA_PUT_U16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index);
+ if (p->default_index != NO_DEFAULT_INDEX &&
+ nla_put_u16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index))
+ goto nla_put_failure;
- if (p->set_tc_index)
- NLA_PUT_FLAG(skb, TCA_DSMARK_SET_TC_INDEX);
+ if (p->set_tc_index &&
+ nla_put_flag(skb, TCA_DSMARK_SET_TC_INDEX))
+ goto nla_put_failure;
return nla_nest_end(skb, opts);
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 4dfecb0cba3..e15a9eb2908 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -19,46 +19,30 @@
/* 1 band FIFO pseudo-"scheduler" */
-struct fifo_sched_data
+static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
- u32 limit;
-};
-
-static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
-{
- struct fifo_sched_data *q = qdisc_priv(sch);
-
- if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= q->limit))
+ if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit))
return qdisc_enqueue_tail(skb, sch);
return qdisc_reshape_fail(skb, sch);
}
-static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
- struct fifo_sched_data *q = qdisc_priv(sch);
-
- if (likely(skb_queue_len(&sch->q) < q->limit))
+ if (likely(skb_queue_len(&sch->q) < sch->limit))
return qdisc_enqueue_tail(skb, sch);
return qdisc_reshape_fail(skb, sch);
}
-static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
- struct sk_buff *skb_head;
- struct fifo_sched_data *q = qdisc_priv(sch);
-
- if (likely(skb_queue_len(&sch->q) < q->limit))
+ if (likely(skb_queue_len(&sch->q) < sch->limit))
return qdisc_enqueue_tail(skb, sch);
/* queue full, remove one skb to fulfill the limit */
- skb_head = qdisc_dequeue_head(sch);
- sch->bstats.bytes -= qdisc_pkt_len(skb_head);
- sch->bstats.packets--;
+ __qdisc_queue_drop_head(sch, &sch->q);
sch->qstats.drops++;
- kfree_skb(skb_head);
-
qdisc_enqueue_tail(skb, sch);
return NET_XMIT_CN;
@@ -66,33 +50,43 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc* sch)
static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
{
- struct fifo_sched_data *q = qdisc_priv(sch);
+ bool bypass;
+ bool is_bfifo = sch->ops == &bfifo_qdisc_ops;
if (opt == NULL) {
u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1;
- if (sch->ops == &bfifo_qdisc_ops)
+ if (is_bfifo)
limit *= psched_mtu(qdisc_dev(sch));
- q->limit = limit;
+ sch->limit = limit;
} else {
struct tc_fifo_qopt *ctl = nla_data(opt);
if (nla_len(opt) < sizeof(*ctl))
return -EINVAL;
- q->limit = ctl->limit;
+ sch->limit = ctl->limit;
}
+ if (is_bfifo)
+ bypass = sch->limit >= psched_mtu(qdisc_dev(sch));
+ else
+ bypass = sch->limit >= 1;
+
+ if (bypass)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
- struct fifo_sched_data *q = qdisc_priv(sch);
- struct tc_fifo_qopt opt = { .limit = q->limit };
+ struct tc_fifo_qopt opt = { .limit = sch->limit };
- NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
return skb->len;
nla_put_failure:
@@ -101,7 +95,7 @@ nla_put_failure:
struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
.id = "pfifo",
- .priv_size = sizeof(struct fifo_sched_data),
+ .priv_size = 0,
.enqueue = pfifo_enqueue,
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
@@ -116,7 +110,7 @@ EXPORT_SYMBOL(pfifo_qdisc_ops);
struct Qdisc_ops bfifo_qdisc_ops __read_mostly = {
.id = "bfifo",
- .priv_size = sizeof(struct fifo_sched_data),
+ .priv_size = 0,
.enqueue = bfifo_enqueue,
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
@@ -131,7 +125,7 @@ EXPORT_SYMBOL(bfifo_qdisc_ops);
struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = {
.id = "pfifo_head_drop",
- .priv_size = sizeof(struct fifo_sched_data),
+ .priv_size = 0,
.enqueue = pfifo_tail_enqueue,
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
new file mode 100644
index 00000000000..ba32c2b005d
--- /dev/null
+++ b/net/sched/sch_fq.c
@@ -0,0 +1,849 @@
+/*
+ * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
+ *
+ * Copyright (C) 2013 Eric Dumazet <edumazet@google.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Meant to be mostly used for localy generated traffic :
+ * Fast classification depends on skb->sk being set before reaching us.
+ * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
+ * All packets belonging to a socket are considered as a 'flow'.
+ *
+ * Flows are dynamically allocated and stored in a hash table of RB trees
+ * They are also part of one Round Robin 'queues' (new or old flows)
+ *
+ * Burst avoidance (aka pacing) capability :
+ *
+ * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
+ * bunch of packets, and this packet scheduler adds delay between
+ * packets to respect rate limitation.
+ *
+ * enqueue() :
+ * - lookup one RB tree (out of 1024 or more) to find the flow.
+ * If non existent flow, create it, add it to the tree.
+ * Add skb to the per flow list of skb (fifo).
+ * - Use a special fifo for high prio packets
+ *
+ * dequeue() : serves flows in Round Robin
+ * Note : When a flow becomes empty, we do not immediately remove it from
+ * rb trees, for performance reasons (its expected to send additional packets,
+ * or SLAB cache will reuse socket for another flow)
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/hash.h>
+#include <linux/prefetch.h>
+#include <linux/vmalloc.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+/*
+ * Per flow structure, dynamically allocated
+ */
+struct fq_flow {
+ struct sk_buff *head; /* list of skbs for this flow : first skb */
+ union {
+ struct sk_buff *tail; /* last skb in the list */
+ unsigned long age; /* jiffies when flow was emptied, for gc */
+ };
+ struct rb_node fq_node; /* anchor in fq_root[] trees */
+ struct sock *sk;
+ int qlen; /* number of packets in flow queue */
+ int credit;
+ u32 socket_hash; /* sk_hash */
+ struct fq_flow *next; /* next pointer in RR lists, or &detached */
+
+ struct rb_node rate_node; /* anchor in q->delayed tree */
+ u64 time_next_packet;
+};
+
+struct fq_flow_head {
+ struct fq_flow *first;
+ struct fq_flow *last;
+};
+
+struct fq_sched_data {
+ struct fq_flow_head new_flows;
+
+ struct fq_flow_head old_flows;
+
+ struct rb_root delayed; /* for rate limited flows */
+ u64 time_next_delayed_flow;
+
+ struct fq_flow internal; /* for non classified or high prio packets */
+ u32 quantum;
+ u32 initial_quantum;
+ u32 flow_refill_delay;
+ u32 flow_max_rate; /* optional max rate per flow */
+ u32 flow_plimit; /* max packets per flow */
+ struct rb_root *fq_root;
+ u8 rate_enable;
+ u8 fq_trees_log;
+
+ u32 flows;
+ u32 inactive_flows;
+ u32 throttled_flows;
+
+ u64 stat_gc_flows;
+ u64 stat_internal_packets;
+ u64 stat_tcp_retrans;
+ u64 stat_throttled;
+ u64 stat_flows_plimit;
+ u64 stat_pkts_too_long;
+ u64 stat_allocation_errors;
+ struct qdisc_watchdog watchdog;
+};
+
+/* special value to mark a detached flow (not on old/new list) */
+static struct fq_flow detached, throttled;
+
+static void fq_flow_set_detached(struct fq_flow *f)
+{
+ f->next = &detached;
+ f->age = jiffies;
+}
+
+static bool fq_flow_is_detached(const struct fq_flow *f)
+{
+ return f->next == &detached;
+}
+
+static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
+{
+ struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
+
+ while (*p) {
+ struct fq_flow *aux;
+
+ parent = *p;
+ aux = container_of(parent, struct fq_flow, rate_node);
+ if (f->time_next_packet >= aux->time_next_packet)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&f->rate_node, parent, p);
+ rb_insert_color(&f->rate_node, &q->delayed);
+ q->throttled_flows++;
+ q->stat_throttled++;
+
+ f->next = &throttled;
+ if (q->time_next_delayed_flow > f->time_next_packet)
+ q->time_next_delayed_flow = f->time_next_packet;
+}
+
+
+static struct kmem_cache *fq_flow_cachep __read_mostly;
+
+static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
+{
+ if (head->first)
+ head->last->next = flow;
+ else
+ head->first = flow;
+ head->last = flow;
+ flow->next = NULL;
+}
+
+/* limit number of collected flows per round */
+#define FQ_GC_MAX 8
+#define FQ_GC_AGE (3*HZ)
+
+static bool fq_gc_candidate(const struct fq_flow *f)
+{
+ return fq_flow_is_detached(f) &&
+ time_after(jiffies, f->age + FQ_GC_AGE);
+}
+
+static void fq_gc(struct fq_sched_data *q,
+ struct rb_root *root,
+ struct sock *sk)
+{
+ struct fq_flow *f, *tofree[FQ_GC_MAX];
+ struct rb_node **p, *parent;
+ int fcnt = 0;
+
+ p = &root->rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+
+ f = container_of(parent, struct fq_flow, fq_node);
+ if (f->sk == sk)
+ break;
+
+ if (fq_gc_candidate(f)) {
+ tofree[fcnt++] = f;
+ if (fcnt == FQ_GC_MAX)
+ break;
+ }
+
+ if (f->sk > sk)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+
+ q->flows -= fcnt;
+ q->inactive_flows -= fcnt;
+ q->stat_gc_flows += fcnt;
+ while (fcnt) {
+ struct fq_flow *f = tofree[--fcnt];
+
+ rb_erase(&f->fq_node, root);
+ kmem_cache_free(fq_flow_cachep, f);
+ }
+}
+
+static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
+{
+ struct rb_node **p, *parent;
+ struct sock *sk = skb->sk;
+ struct rb_root *root;
+ struct fq_flow *f;
+
+ /* warning: no starvation prevention... */
+ if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
+ return &q->internal;
+
+ if (unlikely(!sk)) {
+ /* By forcing low order bit to 1, we make sure to not
+ * collide with a local flow (socket pointers are word aligned)
+ */
+ sk = (struct sock *)(skb_get_hash(skb) | 1L);
+ }
+
+ root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
+
+ if (q->flows >= (2U << q->fq_trees_log) &&
+ q->inactive_flows > q->flows/2)
+ fq_gc(q, root, sk);
+
+ p = &root->rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+
+ f = container_of(parent, struct fq_flow, fq_node);
+ if (f->sk == sk) {
+ /* socket might have been reallocated, so check
+ * if its sk_hash is the same.
+ * It not, we need to refill credit with
+ * initial quantum
+ */
+ if (unlikely(skb->sk &&
+ f->socket_hash != sk->sk_hash)) {
+ f->credit = q->initial_quantum;
+ f->socket_hash = sk->sk_hash;
+ f->time_next_packet = 0ULL;
+ }
+ return f;
+ }
+ if (f->sk > sk)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+
+ f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(!f)) {
+ q->stat_allocation_errors++;
+ return &q->internal;
+ }
+ fq_flow_set_detached(f);
+ f->sk = sk;
+ if (skb->sk)
+ f->socket_hash = sk->sk_hash;
+ f->credit = q->initial_quantum;
+
+ rb_link_node(&f->fq_node, parent, p);
+ rb_insert_color(&f->fq_node, root);
+
+ q->flows++;
+ q->inactive_flows++;
+ return f;
+}
+
+
+/* remove one skb from head of flow queue */
+static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
+{
+ struct sk_buff *skb = flow->head;
+
+ if (skb) {
+ flow->head = skb->next;
+ skb->next = NULL;
+ flow->qlen--;
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ sch->q.qlen--;
+ }
+ return skb;
+}
+
+/* We might add in the future detection of retransmits
+ * For the time being, just return false
+ */
+static bool skb_is_retransmit(struct sk_buff *skb)
+{
+ return false;
+}
+
+/* add skb to flow queue
+ * flow queue is a linked list, kind of FIFO, except for TCP retransmits
+ * We special case tcp retransmits to be transmitted before other packets.
+ * We rely on fact that TCP retransmits are unlikely, so we do not waste
+ * a separate queue or a pointer.
+ * head-> [retrans pkt 1]
+ * [retrans pkt 2]
+ * [ normal pkt 1]
+ * [ normal pkt 2]
+ * [ normal pkt 3]
+ * tail-> [ normal pkt 4]
+ */
+static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
+{
+ struct sk_buff *prev, *head = flow->head;
+
+ skb->next = NULL;
+ if (!head) {
+ flow->head = skb;
+ flow->tail = skb;
+ return;
+ }
+ if (likely(!skb_is_retransmit(skb))) {
+ flow->tail->next = skb;
+ flow->tail = skb;
+ return;
+ }
+
+ /* This skb is a tcp retransmit,
+ * find the last retrans packet in the queue
+ */
+ prev = NULL;
+ while (skb_is_retransmit(head)) {
+ prev = head;
+ head = head->next;
+ if (!head)
+ break;
+ }
+ if (!prev) { /* no rtx packet in queue, become the new head */
+ skb->next = flow->head;
+ flow->head = skb;
+ } else {
+ if (prev == flow->tail)
+ flow->tail = skb;
+ else
+ skb->next = prev->next;
+ prev->next = skb;
+ }
+}
+
+static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct fq_flow *f;
+
+ if (unlikely(sch->q.qlen >= sch->limit))
+ return qdisc_drop(skb, sch);
+
+ f = fq_classify(skb, q);
+ if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
+ q->stat_flows_plimit++;
+ return qdisc_drop(skb, sch);
+ }
+
+ f->qlen++;
+ if (skb_is_retransmit(skb))
+ q->stat_tcp_retrans++;
+ sch->qstats.backlog += qdisc_pkt_len(skb);
+ if (fq_flow_is_detached(f)) {
+ fq_flow_add_tail(&q->new_flows, f);
+ if (time_after(jiffies, f->age + q->flow_refill_delay))
+ f->credit = max_t(u32, f->credit, q->quantum);
+ q->inactive_flows--;
+ qdisc_unthrottled(sch);
+ }
+
+ /* Note: this overwrites f->age */
+ flow_queue_add(f, skb);
+
+ if (unlikely(f == &q->internal)) {
+ q->stat_internal_packets++;
+ qdisc_unthrottled(sch);
+ }
+ sch->q.qlen++;
+
+ return NET_XMIT_SUCCESS;
+}
+
+static void fq_check_throttled(struct fq_sched_data *q, u64 now)
+{
+ struct rb_node *p;
+
+ if (q->time_next_delayed_flow > now)
+ return;
+
+ q->time_next_delayed_flow = ~0ULL;
+ while ((p = rb_first(&q->delayed)) != NULL) {
+ struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
+
+ if (f->time_next_packet > now) {
+ q->time_next_delayed_flow = f->time_next_packet;
+ break;
+ }
+ rb_erase(p, &q->delayed);
+ q->throttled_flows--;
+ fq_flow_add_tail(&q->old_flows, f);
+ }
+}
+
+static struct sk_buff *fq_dequeue(struct Qdisc *sch)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ u64 now = ktime_to_ns(ktime_get());
+ struct fq_flow_head *head;
+ struct sk_buff *skb;
+ struct fq_flow *f;
+ u32 rate;
+
+ skb = fq_dequeue_head(sch, &q->internal);
+ if (skb)
+ goto out;
+ fq_check_throttled(q, now);
+begin:
+ head = &q->new_flows;
+ if (!head->first) {
+ head = &q->old_flows;
+ if (!head->first) {
+ if (q->time_next_delayed_flow != ~0ULL)
+ qdisc_watchdog_schedule_ns(&q->watchdog,
+ q->time_next_delayed_flow);
+ return NULL;
+ }
+ }
+ f = head->first;
+
+ if (f->credit <= 0) {
+ f->credit += q->quantum;
+ head->first = f->next;
+ fq_flow_add_tail(&q->old_flows, f);
+ goto begin;
+ }
+
+ if (unlikely(f->head && now < f->time_next_packet)) {
+ head->first = f->next;
+ fq_flow_set_throttled(q, f);
+ goto begin;
+ }
+
+ skb = fq_dequeue_head(sch, f);
+ if (!skb) {
+ head->first = f->next;
+ /* force a pass through old_flows to prevent starvation */
+ if ((head == &q->new_flows) && q->old_flows.first) {
+ fq_flow_add_tail(&q->old_flows, f);
+ } else {
+ fq_flow_set_detached(f);
+ q->inactive_flows++;
+ }
+ goto begin;
+ }
+ prefetch(&skb->end);
+ f->time_next_packet = now;
+ f->credit -= qdisc_pkt_len(skb);
+
+ if (f->credit > 0 || !q->rate_enable)
+ goto out;
+
+ rate = q->flow_max_rate;
+ if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT)
+ rate = min(skb->sk->sk_pacing_rate, rate);
+
+ if (rate != ~0U) {
+ u32 plen = max(qdisc_pkt_len(skb), q->quantum);
+ u64 len = (u64)plen * NSEC_PER_SEC;
+
+ if (likely(rate))
+ do_div(len, rate);
+ /* Since socket rate can change later,
+ * clamp the delay to 125 ms.
+ * TODO: maybe segment the too big skb, as in commit
+ * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
+ */
+ if (unlikely(len > 125 * NSEC_PER_MSEC)) {
+ len = 125 * NSEC_PER_MSEC;
+ q->stat_pkts_too_long++;
+ }
+
+ f->time_next_packet = now + len;
+ }
+out:
+ qdisc_bstats_update(sch, skb);
+ qdisc_unthrottled(sch);
+ return skb;
+}
+
+static void fq_reset(struct Qdisc *sch)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct rb_root *root;
+ struct sk_buff *skb;
+ struct rb_node *p;
+ struct fq_flow *f;
+ unsigned int idx;
+
+ while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL)
+ kfree_skb(skb);
+
+ if (!q->fq_root)
+ return;
+
+ for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
+ root = &q->fq_root[idx];
+ while ((p = rb_first(root)) != NULL) {
+ f = container_of(p, struct fq_flow, fq_node);
+ rb_erase(p, root);
+
+ while ((skb = fq_dequeue_head(sch, f)) != NULL)
+ kfree_skb(skb);
+
+ kmem_cache_free(fq_flow_cachep, f);
+ }
+ }
+ q->new_flows.first = NULL;
+ q->old_flows.first = NULL;
+ q->delayed = RB_ROOT;
+ q->flows = 0;
+ q->inactive_flows = 0;
+ q->throttled_flows = 0;
+}
+
+static void fq_rehash(struct fq_sched_data *q,
+ struct rb_root *old_array, u32 old_log,
+ struct rb_root *new_array, u32 new_log)
+{
+ struct rb_node *op, **np, *parent;
+ struct rb_root *oroot, *nroot;
+ struct fq_flow *of, *nf;
+ int fcnt = 0;
+ u32 idx;
+
+ for (idx = 0; idx < (1U << old_log); idx++) {
+ oroot = &old_array[idx];
+ while ((op = rb_first(oroot)) != NULL) {
+ rb_erase(op, oroot);
+ of = container_of(op, struct fq_flow, fq_node);
+ if (fq_gc_candidate(of)) {
+ fcnt++;
+ kmem_cache_free(fq_flow_cachep, of);
+ continue;
+ }
+ nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];
+
+ np = &nroot->rb_node;
+ parent = NULL;
+ while (*np) {
+ parent = *np;
+
+ nf = container_of(parent, struct fq_flow, fq_node);
+ BUG_ON(nf->sk == of->sk);
+
+ if (nf->sk > of->sk)
+ np = &parent->rb_right;
+ else
+ np = &parent->rb_left;
+ }
+
+ rb_link_node(&of->fq_node, parent, np);
+ rb_insert_color(&of->fq_node, nroot);
+ }
+ }
+ q->flows -= fcnt;
+ q->inactive_flows -= fcnt;
+ q->stat_gc_flows += fcnt;
+}
+
+static void *fq_alloc_node(size_t sz, int node)
+{
+ void *ptr;
+
+ ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node);
+ if (!ptr)
+ ptr = vmalloc_node(sz, node);
+ return ptr;
+}
+
+static void fq_free(void *addr)
+{
+ kvfree(addr);
+}
+
+static int fq_resize(struct Qdisc *sch, u32 log)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct rb_root *array;
+ void *old_fq_root;
+ u32 idx;
+
+ if (q->fq_root && log == q->fq_trees_log)
+ return 0;
+
+ /* If XPS was setup, we can allocate memory on right NUMA node */
+ array = fq_alloc_node(sizeof(struct rb_root) << log,
+ netdev_queue_numa_node_read(sch->dev_queue));
+ if (!array)
+ return -ENOMEM;
+
+ for (idx = 0; idx < (1U << log); idx++)
+ array[idx] = RB_ROOT;
+
+ sch_tree_lock(sch);
+
+ old_fq_root = q->fq_root;
+ if (old_fq_root)
+ fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);
+
+ q->fq_root = array;
+ q->fq_trees_log = log;
+
+ sch_tree_unlock(sch);
+
+ fq_free(old_fq_root);
+
+ return 0;
+}
+
+static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
+ [TCA_FQ_PLIMIT] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
+ [TCA_FQ_QUANTUM] = { .type = NLA_U32 },
+ [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 },
+ [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
+ [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
+};
+
+static int fq_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_FQ_MAX + 1];
+ int err, drop_count = 0;
+ u32 fq_log;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy);
+ if (err < 0)
+ return err;
+
+ sch_tree_lock(sch);
+
+ fq_log = q->fq_trees_log;
+
+ if (tb[TCA_FQ_BUCKETS_LOG]) {
+ u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
+
+ if (nval >= 1 && nval <= ilog2(256*1024))
+ fq_log = nval;
+ else
+ err = -EINVAL;
+ }
+ if (tb[TCA_FQ_PLIMIT])
+ sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
+
+ if (tb[TCA_FQ_FLOW_PLIMIT])
+ q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
+
+ if (tb[TCA_FQ_QUANTUM])
+ q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
+
+ if (tb[TCA_FQ_INITIAL_QUANTUM])
+ q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
+
+ if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
+ pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
+ nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
+
+ if (tb[TCA_FQ_FLOW_MAX_RATE])
+ q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+
+ if (tb[TCA_FQ_RATE_ENABLE]) {
+ u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
+
+ if (enable <= 1)
+ q->rate_enable = enable;
+ else
+ err = -EINVAL;
+ }
+
+ if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
+ u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ;
+
+ q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
+ }
+
+ if (!err) {
+ sch_tree_unlock(sch);
+ err = fq_resize(sch, fq_log);
+ sch_tree_lock(sch);
+ }
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = fq_dequeue(sch);
+
+ if (!skb)
+ break;
+ kfree_skb(skb);
+ drop_count++;
+ }
+ qdisc_tree_decrease_qlen(sch, drop_count);
+
+ sch_tree_unlock(sch);
+ return err;
+}
+
+static void fq_destroy(struct Qdisc *sch)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+
+ fq_reset(sch);
+ fq_free(q->fq_root);
+ qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static int fq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ int err;
+
+ sch->limit = 10000;
+ q->flow_plimit = 100;
+ q->quantum = 2 * psched_mtu(qdisc_dev(sch));
+ q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
+ q->flow_refill_delay = msecs_to_jiffies(40);
+ q->flow_max_rate = ~0U;
+ q->rate_enable = 1;
+ q->new_flows.first = NULL;
+ q->old_flows.first = NULL;
+ q->delayed = RB_ROOT;
+ q->fq_root = NULL;
+ q->fq_trees_log = ilog2(1024);
+ qdisc_watchdog_init(&q->watchdog, sch);
+
+ if (opt)
+ err = fq_change(sch, opt);
+ else
+ err = fq_resize(sch, q->fq_trees_log);
+
+ return err;
+}
+
+static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
+
+ if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
+ nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
+ nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
+ nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
+ jiffies_to_usecs(q->flow_refill_delay)) ||
+ nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ return -1;
+}
+
+static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ u64 now = ktime_to_ns(ktime_get());
+ struct tc_fq_qd_stats st = {
+ .gc_flows = q->stat_gc_flows,
+ .highprio_packets = q->stat_internal_packets,
+ .tcp_retrans = q->stat_tcp_retrans,
+ .throttled = q->stat_throttled,
+ .flows_plimit = q->stat_flows_plimit,
+ .pkts_too_long = q->stat_pkts_too_long,
+ .allocation_errors = q->stat_allocation_errors,
+ .flows = q->flows,
+ .inactive_flows = q->inactive_flows,
+ .throttled_flows = q->throttled_flows,
+ .time_next_delayed_flow = q->time_next_delayed_flow - now,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
+ .id = "fq",
+ .priv_size = sizeof(struct fq_sched_data),
+
+ .enqueue = fq_enqueue,
+ .dequeue = fq_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = fq_init,
+ .reset = fq_reset,
+ .destroy = fq_destroy,
+ .change = fq_change,
+ .dump = fq_dump,
+ .dump_stats = fq_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init fq_module_init(void)
+{
+ int ret;
+
+ fq_flow_cachep = kmem_cache_create("fq_flow_cache",
+ sizeof(struct fq_flow),
+ 0, 0, NULL);
+ if (!fq_flow_cachep)
+ return -ENOMEM;
+
+ ret = register_qdisc(&fq_qdisc_ops);
+ if (ret)
+ kmem_cache_destroy(fq_flow_cachep);
+ return ret;
+}
+
+static void __exit fq_module_exit(void)
+{
+ unregister_qdisc(&fq_qdisc_ops);
+ kmem_cache_destroy(fq_flow_cachep);
+}
+
+module_init(fq_module_init)
+module_exit(fq_module_exit)
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
new file mode 100644
index 00000000000..063b726bf1f
--- /dev/null
+++ b/net/sched/sch_fq_codel.c
@@ -0,0 +1,620 @@
+/*
+ * Fair Queue CoDel discipline
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/flow_keys.h>
+#include <net/codel.h>
+
+/* Fair Queue CoDel.
+ *
+ * Principles :
+ * Packets are classified (internal classifier or external) on flows.
+ * This is a Stochastic model (as we use a hash, several flows
+ * might be hashed on same slot)
+ * Each flow has a CoDel managed queue.
+ * Flows are linked onto two (Round Robin) lists,
+ * so that new flows have priority on old ones.
+ *
+ * For a given flow, packets are not reordered (CoDel uses a FIFO)
+ * head drops only.
+ * ECN capability is on by default.
+ * Low memory footprint (64 bytes per flow)
+ */
+
+struct fq_codel_flow {
+ struct sk_buff *head;
+ struct sk_buff *tail;
+ struct list_head flowchain;
+ int deficit;
+ u32 dropped; /* number of drops (or ECN marks) on this flow */
+ struct codel_vars cvars;
+}; /* please try to keep this structure <= 64 bytes */
+
+struct fq_codel_sched_data {
+ struct tcf_proto *filter_list; /* optional external classifier */
+ struct fq_codel_flow *flows; /* Flows table [flows_cnt] */
+ u32 *backlogs; /* backlog table [flows_cnt] */
+ u32 flows_cnt; /* number of flows */
+ u32 perturbation; /* hash perturbation */
+ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
+ struct codel_params cparams;
+ struct codel_stats cstats;
+ u32 drop_overlimit;
+ u32 new_flow_count;
+
+ struct list_head new_flows; /* list of new flows */
+ struct list_head old_flows; /* list of old flows */
+};
+
+static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q,
+ const struct sk_buff *skb)
+{
+ struct flow_keys keys;
+ unsigned int hash;
+
+ skb_flow_dissect(skb, &keys);
+ hash = jhash_3words((__force u32)keys.dst,
+ (__force u32)keys.src ^ keys.ip_proto,
+ (__force u32)keys.ports, q->perturbation);
+ return ((u64)hash * q->flows_cnt) >> 32;
+}
+
+static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct tcf_result res;
+ int result;
+
+ if (TC_H_MAJ(skb->priority) == sch->handle &&
+ TC_H_MIN(skb->priority) > 0 &&
+ TC_H_MIN(skb->priority) <= q->flows_cnt)
+ return TC_H_MIN(skb->priority);
+
+ if (!q->filter_list)
+ return fq_codel_hash(q, skb) + 1;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ result = tc_classify(skb, q->filter_list, &res);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return 0;
+ }
+#endif
+ if (TC_H_MIN(res.classid) <= q->flows_cnt)
+ return TC_H_MIN(res.classid);
+ }
+ return 0;
+}
+
+/* helper functions : might be changed when/if skb use a standard list_head */
+
+/* remove one skb from head of slot queue */
+static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow)
+{
+ struct sk_buff *skb = flow->head;
+
+ flow->head = skb->next;
+ skb->next = NULL;
+ return skb;
+}
+
+/* add skb to flow queue (tail add) */
+static inline void flow_queue_add(struct fq_codel_flow *flow,
+ struct sk_buff *skb)
+{
+ if (flow->head == NULL)
+ flow->head = skb;
+ else
+ flow->tail->next = skb;
+ flow->tail = skb;
+ skb->next = NULL;
+}
+
+static unsigned int fq_codel_drop(struct Qdisc *sch)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ unsigned int maxbacklog = 0, idx = 0, i, len;
+ struct fq_codel_flow *flow;
+
+ /* Queue is full! Find the fat flow and drop packet from it.
+ * This might sound expensive, but with 1024 flows, we scan
+ * 4KB of memory, and we dont need to handle a complex tree
+ * in fast path (packet queue/enqueue) with many cache misses.
+ */
+ for (i = 0; i < q->flows_cnt; i++) {
+ if (q->backlogs[i] > maxbacklog) {
+ maxbacklog = q->backlogs[i];
+ idx = i;
+ }
+ }
+ flow = &q->flows[idx];
+ skb = dequeue_head(flow);
+ len = qdisc_pkt_len(skb);
+ q->backlogs[idx] -= len;
+ kfree_skb(skb);
+ sch->q.qlen--;
+ sch->qstats.drops++;
+ sch->qstats.backlog -= len;
+ flow->dropped++;
+ return idx;
+}
+
+static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ unsigned int idx;
+ struct fq_codel_flow *flow;
+ int uninitialized_var(ret);
+
+ idx = fq_codel_classify(skb, sch, &ret);
+ if (idx == 0) {
+ if (ret & __NET_XMIT_BYPASS)
+ sch->qstats.drops++;
+ kfree_skb(skb);
+ return ret;
+ }
+ idx--;
+
+ codel_set_enqueue_time(skb);
+ flow = &q->flows[idx];
+ flow_queue_add(flow, skb);
+ q->backlogs[idx] += qdisc_pkt_len(skb);
+ sch->qstats.backlog += qdisc_pkt_len(skb);
+
+ if (list_empty(&flow->flowchain)) {
+ list_add_tail(&flow->flowchain, &q->new_flows);
+ q->new_flow_count++;
+ flow->deficit = q->quantum;
+ flow->dropped = 0;
+ }
+ if (++sch->q.qlen <= sch->limit)
+ return NET_XMIT_SUCCESS;
+
+ q->drop_overlimit++;
+ /* Return Congestion Notification only if we dropped a packet
+ * from this flow.
+ */
+ if (fq_codel_drop(sch) == idx)
+ return NET_XMIT_CN;
+
+ /* As we dropped a packet, better let upper stack know this */
+ qdisc_tree_decrease_qlen(sch, 1);
+ return NET_XMIT_SUCCESS;
+}
+
+/* This is the specific function called from codel_dequeue()
+ * to dequeue a packet from queue. Note: backlog is handled in
+ * codel, we dont need to reduce it here.
+ */
+static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct fq_codel_flow *flow;
+ struct sk_buff *skb = NULL;
+
+ flow = container_of(vars, struct fq_codel_flow, cvars);
+ if (flow->head) {
+ skb = dequeue_head(flow);
+ q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb);
+ sch->q.qlen--;
+ }
+ return skb;
+}
+
+static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ struct fq_codel_flow *flow;
+ struct list_head *head;
+ u32 prev_drop_count, prev_ecn_mark;
+
+begin:
+ head = &q->new_flows;
+ if (list_empty(head)) {
+ head = &q->old_flows;
+ if (list_empty(head))
+ return NULL;
+ }
+ flow = list_first_entry(head, struct fq_codel_flow, flowchain);
+
+ if (flow->deficit <= 0) {
+ flow->deficit += q->quantum;
+ list_move_tail(&flow->flowchain, &q->old_flows);
+ goto begin;
+ }
+
+ prev_drop_count = q->cstats.drop_count;
+ prev_ecn_mark = q->cstats.ecn_mark;
+
+ skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats,
+ dequeue);
+
+ flow->dropped += q->cstats.drop_count - prev_drop_count;
+ flow->dropped += q->cstats.ecn_mark - prev_ecn_mark;
+
+ if (!skb) {
+ /* force a pass through old_flows to prevent starvation */
+ if ((head == &q->new_flows) && !list_empty(&q->old_flows))
+ list_move_tail(&flow->flowchain, &q->old_flows);
+ else
+ list_del_init(&flow->flowchain);
+ goto begin;
+ }
+ qdisc_bstats_update(sch, skb);
+ flow->deficit -= qdisc_pkt_len(skb);
+ /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0,
+ * or HTB crashes. Defer it for next round.
+ */
+ if (q->cstats.drop_count && sch->q.qlen) {
+ qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
+ q->cstats.drop_count = 0;
+ }
+ return skb;
+}
+
+static void fq_codel_reset(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+
+ while ((skb = fq_codel_dequeue(sch)) != NULL)
+ kfree_skb(skb);
+}
+
+static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
+ [TCA_FQ_CODEL_TARGET] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_LIMIT] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_INTERVAL] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 },
+};
+
+static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_FQ_CODEL_MAX + 1];
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy);
+ if (err < 0)
+ return err;
+ if (tb[TCA_FQ_CODEL_FLOWS]) {
+ if (q->flows)
+ return -EINVAL;
+ q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]);
+ if (!q->flows_cnt ||
+ q->flows_cnt > 65536)
+ return -EINVAL;
+ }
+ sch_tree_lock(sch);
+
+ if (tb[TCA_FQ_CODEL_TARGET]) {
+ u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]);
+
+ q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT;
+ }
+
+ if (tb[TCA_FQ_CODEL_INTERVAL]) {
+ u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]);
+
+ q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT;
+ }
+
+ if (tb[TCA_FQ_CODEL_LIMIT])
+ sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]);
+
+ if (tb[TCA_FQ_CODEL_ECN])
+ q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]);
+
+ if (tb[TCA_FQ_CODEL_QUANTUM])
+ q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));
+
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = fq_codel_dequeue(sch);
+
+ kfree_skb(skb);
+ q->cstats.drop_count++;
+ }
+ qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
+ q->cstats.drop_count = 0;
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static void *fq_codel_zalloc(size_t sz)
+{
+ void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
+
+ if (!ptr)
+ ptr = vzalloc(sz);
+ return ptr;
+}
+
+static void fq_codel_free(void *addr)
+{
+ kvfree(addr);
+}
+
+static void fq_codel_destroy(struct Qdisc *sch)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&q->filter_list);
+ fq_codel_free(q->backlogs);
+ fq_codel_free(q->flows);
+}
+
+static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ int i;
+
+ sch->limit = 10*1024;
+ q->flows_cnt = 1024;
+ q->quantum = psched_mtu(qdisc_dev(sch));
+ q->perturbation = prandom_u32();
+ INIT_LIST_HEAD(&q->new_flows);
+ INIT_LIST_HEAD(&q->old_flows);
+ codel_params_init(&q->cparams);
+ codel_stats_init(&q->cstats);
+ q->cparams.ecn = true;
+
+ if (opt) {
+ int err = fq_codel_change(sch, opt);
+ if (err)
+ return err;
+ }
+
+ if (!q->flows) {
+ q->flows = fq_codel_zalloc(q->flows_cnt *
+ sizeof(struct fq_codel_flow));
+ if (!q->flows)
+ return -ENOMEM;
+ q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32));
+ if (!q->backlogs) {
+ fq_codel_free(q->flows);
+ return -ENOMEM;
+ }
+ for (i = 0; i < q->flows_cnt; i++) {
+ struct fq_codel_flow *flow = q->flows + i;
+
+ INIT_LIST_HEAD(&flow->flowchain);
+ codel_vars_init(&flow->cvars);
+ }
+ }
+ if (sch->limit >= 1)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
+ return 0;
+}
+
+static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET,
+ codel_time_to_us(q->cparams.target)) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_LIMIT,
+ sch->limit) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL,
+ codel_time_to_us(q->cparams.interval)) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_ECN,
+ q->cparams.ecn) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM,
+ q->quantum) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_FLOWS,
+ q->flows_cnt))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ return -1;
+}
+
+static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct tc_fq_codel_xstats st = {
+ .type = TCA_FQ_CODEL_XSTATS_QDISC,
+ };
+ struct list_head *pos;
+
+ st.qdisc_stats.maxpacket = q->cstats.maxpacket;
+ st.qdisc_stats.drop_overlimit = q->drop_overlimit;
+ st.qdisc_stats.ecn_mark = q->cstats.ecn_mark;
+ st.qdisc_stats.new_flow_count = q->new_flow_count;
+
+ list_for_each(pos, &q->new_flows)
+ st.qdisc_stats.new_flows_len++;
+
+ list_for_each(pos, &q->old_flows)
+ st.qdisc_stats.old_flows_len++;
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid)
+{
+ return 0;
+}
+
+static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ /* we cannot bypass queue discipline anymore */
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
+ return 0;
+}
+
+static void fq_codel_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static struct tcf_proto **fq_codel_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return &q->filter_list;
+}
+
+static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ u32 idx = cl - 1;
+ struct gnet_stats_queue qs = { 0 };
+ struct tc_fq_codel_xstats xstats;
+
+ if (idx < q->flows_cnt) {
+ const struct fq_codel_flow *flow = &q->flows[idx];
+ const struct sk_buff *skb = flow->head;
+
+ memset(&xstats, 0, sizeof(xstats));
+ xstats.type = TCA_FQ_CODEL_XSTATS_CLASS;
+ xstats.class_stats.deficit = flow->deficit;
+ xstats.class_stats.ldelay =
+ codel_time_to_us(flow->cvars.ldelay);
+ xstats.class_stats.count = flow->cvars.count;
+ xstats.class_stats.lastcount = flow->cvars.lastcount;
+ xstats.class_stats.dropping = flow->cvars.dropping;
+ if (flow->cvars.dropping) {
+ codel_tdiff_t delta = flow->cvars.drop_next -
+ codel_get_time();
+
+ xstats.class_stats.drop_next = (delta >= 0) ?
+ codel_time_to_us(delta) :
+ -codel_time_to_us(-delta);
+ }
+ while (skb) {
+ qs.qlen++;
+ skb = skb->next;
+ }
+ qs.backlog = q->backlogs[idx];
+ qs.drops = flow->dropped;
+ }
+ if (gnet_stats_copy_queue(d, &qs) < 0)
+ return -1;
+ if (idx < q->flows_cnt)
+ return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+ return 0;
+}
+
+static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->flows_cnt; i++) {
+ if (list_empty(&q->flows[i].flowchain) ||
+ arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, i + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops fq_codel_class_ops = {
+ .leaf = fq_codel_leaf,
+ .get = fq_codel_get,
+ .put = fq_codel_put,
+ .tcf_chain = fq_codel_find_tcf,
+ .bind_tcf = fq_codel_bind,
+ .unbind_tcf = fq_codel_put,
+ .dump = fq_codel_dump_class,
+ .dump_stats = fq_codel_dump_class_stats,
+ .walk = fq_codel_walk,
+};
+
+static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
+ .cl_ops = &fq_codel_class_ops,
+ .id = "fq_codel",
+ .priv_size = sizeof(struct fq_codel_sched_data),
+ .enqueue = fq_codel_enqueue,
+ .dequeue = fq_codel_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .drop = fq_codel_drop,
+ .init = fq_codel_init,
+ .reset = fq_codel_reset,
+ .destroy = fq_codel_destroy,
+ .change = fq_codel_change,
+ .dump = fq_codel_dump,
+ .dump_stats = fq_codel_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init fq_codel_module_init(void)
+{
+ return register_qdisc(&fq_codel_qdisc_ops);
+}
+
+static void __exit fq_codel_module_exit(void)
+{
+ unregister_qdisc(&fq_codel_qdisc_ops);
+}
+
+module_init(fq_codel_module_init)
+module_exit(fq_codel_module_exit)
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 0918834ee4a..e1543b03e39 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -25,9 +25,15 @@
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/slab.h>
+#include <linux/if_vlan.h>
+#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
+/* Qdisc to use by default */
+const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
+EXPORT_SYMBOL(default_qdisc_ops);
+
/* Main transmission queue. */
/* Modifications to data participating in scheduling must be protected with
@@ -53,20 +59,19 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
{
struct sk_buff *skb = q->gso_skb;
+ const struct netdev_queue *txq = q->dev_queue;
if (unlikely(skb)) {
- struct net_device *dev = qdisc_dev(q);
- struct netdev_queue *txq;
-
/* check the reason of requeuing without tx lock first */
- txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
- if (!netif_tx_queue_frozen_or_stopped(txq)) {
+ txq = netdev_get_tx_queue(txq->dev, skb_get_queue_mapping(skb));
+ if (!netif_xmit_frozen_or_stopped(txq)) {
q->gso_skb = NULL;
q->q.qlen--;
} else
skb = NULL;
} else {
- skb = q->dequeue(q);
+ if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq))
+ skb = q->dequeue(q);
}
return skb;
@@ -86,9 +91,8 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb,
* deadloop is detected. Return OK to try the next skb.
*/
kfree_skb(skb);
- if (net_ratelimit())
- printk(KERN_WARNING "Dead loop on netdevice %s, "
- "fix it urgently!\n", dev_queue->dev->name);
+ net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n",
+ dev_queue->dev->name);
ret = qdisc_qlen(q);
} else {
/*
@@ -121,7 +125,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
spin_unlock(root_lock);
HARD_TX_LOCK(dev, txq, smp_processor_id());
- if (!netif_tx_queue_frozen_or_stopped(txq))
+ if (!netif_xmit_frozen_or_stopped(txq))
ret = dev_hard_start_xmit(skb, dev, txq);
HARD_TX_UNLOCK(dev, txq);
@@ -136,14 +140,14 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
ret = handle_dev_cpu_collision(skb, txq, q);
} else {
/* Driver returned NETDEV_TX_BUSY - requeue skb */
- if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
- printk(KERN_WARNING "BUG %s code %d qlen %d\n",
- dev->name, ret, q->q.qlen);
+ if (unlikely(ret != NETDEV_TX_BUSY))
+ net_warn_ratelimited("BUG %s code %d qlen %d\n",
+ dev->name, ret, q->q.qlen);
ret = dev_requeue_skb(skb, q);
}
- if (ret && netif_tx_queue_frozen_or_stopped(txq))
+ if (ret && netif_xmit_frozen_or_stopped(txq))
ret = 0;
return ret;
@@ -189,15 +193,15 @@ static inline int qdisc_restart(struct Qdisc *q)
void __qdisc_run(struct Qdisc *q)
{
- unsigned long start_time = jiffies;
+ int quota = weight_p;
while (qdisc_restart(q)) {
/*
- * Postpone processing if
- * 1. another process needs the CPU;
- * 2. we've been doing it for too long.
+ * Ordered by possible occurrence: Postpone processing if
+ * 1. we've exceeded packet quota
+ * 2. another process needs the CPU;
*/
- if (need_resched() || jiffies != start_time) {
+ if (--quota <= 0 || need_resched()) {
__netif_schedule(q);
break;
}
@@ -208,15 +212,19 @@ void __qdisc_run(struct Qdisc *q)
unsigned long dev_trans_start(struct net_device *dev)
{
- unsigned long val, res = dev->trans_start;
+ unsigned long val, res;
unsigned int i;
+ if (is_vlan_dev(dev))
+ dev = vlan_dev_real_dev(dev);
+ res = dev->trans_start;
for (i = 0; i < dev->num_tx_queues; i++) {
val = netdev_get_tx_queue(dev, i)->trans_start;
if (val && time_after(val, res))
res = val;
}
dev->trans_start = res;
+
return res;
}
EXPORT_SYMBOL(dev_trans_start);
@@ -242,18 +250,18 @@ static void dev_watchdog(unsigned long arg)
* old device drivers set dev->trans_start
*/
trans_start = txq->trans_start ? : dev->trans_start;
- if (netif_tx_queue_stopped(txq) &&
+ if (netif_xmit_stopped(txq) &&
time_after(jiffies, (trans_start +
dev->watchdog_timeo))) {
some_queue_timedout = 1;
+ txq->trans_timeout++;
break;
}
}
if (some_queue_timedout) {
- char drivername[64];
WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
- dev->name, netdev_drivername(dev, drivername, 64), i);
+ dev->name, netdev_drivername(dev), i);
dev->netdev_ops->ndo_tx_timeout(dev);
}
if (!mod_timer(&dev->watchdog_timer,
@@ -302,6 +310,7 @@ void netif_carrier_on(struct net_device *dev)
if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
if (dev->reg_state == NETREG_UNINITIALIZED)
return;
+ atomic_inc(&dev->carrier_changes);
linkwatch_fire_event(dev);
if (netif_running(dev))
__netdev_watchdog_up(dev);
@@ -320,41 +329,24 @@ void netif_carrier_off(struct net_device *dev)
if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
if (dev->reg_state == NETREG_UNINITIALIZED)
return;
+ atomic_inc(&dev->carrier_changes);
linkwatch_fire_event(dev);
}
}
EXPORT_SYMBOL(netif_carrier_off);
-/**
- * netif_notify_peers - notify network peers about existence of @dev
- * @dev: network device
- *
- * Generate traffic such that interested network peers are aware of
- * @dev, such as by generating a gratuitous ARP. This may be used when
- * a device wants to inform the rest of the network about some sort of
- * reconfiguration such as a failover event or virtual machine
- * migration.
- */
-void netif_notify_peers(struct net_device *dev)
-{
- rtnl_lock();
- call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
- rtnl_unlock();
-}
-EXPORT_SYMBOL(netif_notify_peers);
-
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
under all circumstances. It is difficult to invent anything faster or
cheaper.
*/
-static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
+static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
kfree_skb(skb);
return NET_XMIT_CN;
}
-static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
+static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
return NULL;
}
@@ -412,8 +404,9 @@ static struct Qdisc noqueue_qdisc = {
};
-static const u8 prio2band[TC_PRIO_MAX+1] =
- { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
+static const u8 prio2band[TC_PRIO_MAX + 1] = {
+ 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
+};
/* 3-band FIFO queue: old style, but should be a bit faster than
generic prio+fifo combination.
@@ -445,7 +438,7 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
return priv->q + band;
}
-static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
+static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
int band = prio2band[skb->priority & TC_PRIO_MAX];
@@ -460,7 +453,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
return qdisc_drop(skb, qdisc);
}
-static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
+static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
int band = bitmap2band[priv->bitmap];
@@ -479,7 +472,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
return NULL;
}
-static struct sk_buff *pfifo_fast_peek(struct Qdisc* qdisc)
+static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
int band = bitmap2band[priv->bitmap];
@@ -493,7 +486,7 @@ static struct sk_buff *pfifo_fast_peek(struct Qdisc* qdisc)
return NULL;
}
-static void pfifo_fast_reset(struct Qdisc* qdisc)
+static void pfifo_fast_reset(struct Qdisc *qdisc)
{
int prio;
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
@@ -510,8 +503,9 @@ static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
- memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
- NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+ memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
return skb->len;
nla_put_failure:
@@ -526,6 +520,8 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
skb_queue_head_init(band2list(priv, prio));
+ /* Can by-pass the queue discipline */
+ qdisc->flags |= TCQ_F_CAN_BYPASS;
return 0;
}
@@ -541,17 +537,16 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
.owner = THIS_MODULE,
};
+static struct lock_class_key qdisc_tx_busylock;
+
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
- struct Qdisc_ops *ops)
+ const struct Qdisc_ops *ops)
{
void *p;
struct Qdisc *sch;
- unsigned int size;
+ unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
int err = -ENOBUFS;
-
- /* ensure that the Qdisc and the private data are 64-byte aligned */
- size = QDISC_ALIGN(sizeof(*sch));
- size += ops->priv_size + (QDISC_ALIGNTO - 1);
+ struct net_device *dev = dev_queue->dev;
p = kzalloc_node(size, GFP_KERNEL,
netdev_queue_numa_node_read(dev_queue));
@@ -559,16 +554,28 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
if (!p)
goto errout;
sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
- sch->padded = (char *) sch - (char *) p;
-
+ /* if we got non aligned memory, ask more and do alignment ourself */
+ if (sch != p) {
+ kfree(p);
+ p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
+ netdev_queue_numa_node_read(dev_queue));
+ if (!p)
+ goto errout;
+ sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
+ sch->padded = (char *) sch - (char *) p;
+ }
INIT_LIST_HEAD(&sch->list);
skb_queue_head_init(&sch->q);
+
spin_lock_init(&sch->busylock);
+ lockdep_set_class(&sch->busylock,
+ dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
+
sch->ops = ops;
sch->enqueue = ops->enqueue;
sch->dequeue = ops->dequeue;
sch->dev_queue = dev_queue;
- dev_hold(qdisc_dev(sch));
+ dev_hold(dev);
atomic_set(&sch->refcnt, 1);
return sch;
@@ -577,10 +584,14 @@ errout:
}
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
- struct Qdisc_ops *ops, unsigned int parentid)
+ const struct Qdisc_ops *ops,
+ unsigned int parentid)
{
struct Qdisc *sch;
+ if (!try_module_get(ops->owner))
+ goto errout;
+
sch = qdisc_alloc(dev_queue, ops);
if (IS_ERR(sch))
goto errout;
@@ -630,7 +641,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
#ifdef CONFIG_NET_SCHED
qdisc_list_del(qdisc);
- qdisc_put_stab(qdisc->stab);
+ qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
if (ops->reset)
@@ -674,25 +685,23 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
return oqdisc;
}
+EXPORT_SYMBOL(dev_graft_qdisc);
static void attach_one_default_qdisc(struct net_device *dev,
struct netdev_queue *dev_queue,
void *_unused)
{
- struct Qdisc *qdisc;
+ struct Qdisc *qdisc = &noqueue_qdisc;
if (dev->tx_queue_len) {
qdisc = qdisc_create_dflt(dev_queue,
- &pfifo_fast_ops, TC_H_ROOT);
+ default_qdisc_ops, TC_H_ROOT);
if (!qdisc) {
- printk(KERN_INFO "%s: activation failed\n", dev->name);
+ netdev_info(dev, "activation failed\n");
return;
}
-
- /* Can by-pass the queue discipline for default qdisc */
- qdisc->flags |= TCQ_F_CAN_BYPASS;
- } else {
- qdisc = &noqueue_qdisc;
+ if (!netif_is_multiqueue(dev))
+ qdisc->flags |= TCQ_F_ONETXQUEUE;
}
dev_queue->qdisc_sleeping = qdisc;
}
@@ -711,8 +720,8 @@ static void attach_default_qdiscs(struct net_device *dev)
} else {
qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
if (qdisc) {
- qdisc->ops->attach(qdisc);
dev->qdisc = qdisc;
+ qdisc->ops->attach(qdisc);
}
}
}
@@ -739,9 +748,8 @@ void dev_activate(struct net_device *dev)
int need_watchdog;
/* No queueing discipline is attached to device;
- create default one i.e. pfifo_fast for devices,
- which need queueing and noqueue_qdisc for
- virtual interfaces
+ * create default one for devices, which need queueing
+ * and noqueue_qdisc for virtual interfaces
*/
if (dev->qdisc == &noop_qdisc)
@@ -761,6 +769,7 @@ void dev_activate(struct net_device *dev)
dev_watchdog_up(dev);
}
}
+EXPORT_SYMBOL(dev_activate);
static void dev_deactivate_queue(struct net_device *dev,
struct netdev_queue *dev_queue,
@@ -810,21 +819,51 @@ static bool some_qdisc_is_busy(struct net_device *dev)
return false;
}
-void dev_deactivate(struct net_device *dev)
+/**
+ * dev_deactivate_many - deactivate transmissions on several devices
+ * @head: list of devices to deactivate
+ *
+ * This function returns only when all outstanding transmissions
+ * have completed, unless all devices are in dismantle phase.
+ */
+void dev_deactivate_many(struct list_head *head)
{
- netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc);
- if (dev_ingress_queue(dev))
- dev_deactivate_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
+ struct net_device *dev;
+ bool sync_needed = false;
- dev_watchdog_down(dev);
+ list_for_each_entry(dev, head, close_list) {
+ netdev_for_each_tx_queue(dev, dev_deactivate_queue,
+ &noop_qdisc);
+ if (dev_ingress_queue(dev))
+ dev_deactivate_queue(dev, dev_ingress_queue(dev),
+ &noop_qdisc);
- /* Wait for outstanding qdisc-less dev_queue_xmit calls. */
- synchronize_rcu();
+ dev_watchdog_down(dev);
+ sync_needed |= !dev->dismantle;
+ }
+
+ /* Wait for outstanding qdisc-less dev_queue_xmit calls.
+ * This is avoided if all devices are in dismantle phase :
+ * Caller will call synchronize_net() for us
+ */
+ if (sync_needed)
+ synchronize_net();
/* Wait for outstanding qdisc_run calls. */
- while (some_qdisc_is_busy(dev))
- yield();
+ list_for_each_entry(dev, head, close_list)
+ while (some_qdisc_is_busy(dev))
+ yield();
+}
+
+void dev_deactivate(struct net_device *dev)
+{
+ LIST_HEAD(single);
+
+ list_add(&dev->close_list, &single);
+ dev_deactivate_many(&single);
+ list_del(&single);
}
+EXPORT_SYMBOL(dev_deactivate);
static void dev_init_scheduler_queue(struct net_device *dev,
struct netdev_queue *dev_queue,
@@ -871,3 +910,39 @@ void dev_shutdown(struct net_device *dev)
WARN_ON(timer_pending(&dev->watchdog_timer));
}
+
+void psched_ratecfg_precompute(struct psched_ratecfg *r,
+ const struct tc_ratespec *conf,
+ u64 rate64)
+{
+ memset(r, 0, sizeof(*r));
+ r->overhead = conf->overhead;
+ r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
+ r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
+ r->mult = 1;
+ /*
+ * The deal here is to replace a divide by a reciprocal one
+ * in fast path (a reciprocal divide is a multiply and a shift)
+ *
+ * Normal formula would be :
+ * time_in_ns = (NSEC_PER_SEC * len) / rate_bps
+ *
+ * We compute mult/shift to use instead :
+ * time_in_ns = (len * mult) >> shift;
+ *
+ * We try to get the highest possible mult value for accuracy,
+ * but have to make sure no overflows will ever happen.
+ */
+ if (r->rate_bytes_ps > 0) {
+ u64 factor = NSEC_PER_SEC;
+
+ for (;;) {
+ r->mult = div64_u64(factor, r->rate_bytes_ps);
+ if (r->mult & (1U << 31) || factor & (1ULL << 63))
+ break;
+ factor <<= 1;
+ r->shift++;
+ }
+ }
+}
+EXPORT_SYMBOL(psched_ratecfg_precompute);
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 51dcc2aa5c9..12cbc09157f 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -32,16 +32,16 @@
struct gred_sched_data;
struct gred_sched;
-struct gred_sched_data
-{
+struct gred_sched_data {
u32 limit; /* HARD maximal queue length */
- u32 DP; /* the drop pramaters */
+ u32 DP; /* the drop parameters */
u32 bytesin; /* bytes seen on virtualQ so far*/
u32 packetsin; /* packets seen on virtualQ so far*/
u32 backlog; /* bytes on the virtualQ */
u8 prio; /* the prio of this vq */
struct red_parms parms;
+ struct red_vars vars;
struct red_stats stats;
};
@@ -50,14 +50,13 @@ enum {
GRED_RIO_MODE,
};
-struct gred_sched
-{
+struct gred_sched {
struct gred_sched_data *tab[MAX_DPs];
unsigned long flags;
u32 red_flags;
u32 DPs;
u32 def;
- struct red_parms wred_set;
+ struct red_vars wred_set;
};
static inline int gred_wred_mode(struct gred_sched *table)
@@ -103,9 +102,8 @@ static inline int gred_wred_mode_check(struct Qdisc *sch)
if (q == NULL)
continue;
- for (n = 0; n < table->DPs; n++)
- if (table->tab[n] && table->tab[n] != q &&
- table->tab[n]->prio == q->prio)
+ for (n = i + 1; n < table->DPs; n++)
+ if (table->tab[n] && table->tab[n]->prio == q->prio)
return 1;
}
@@ -127,17 +125,18 @@ static inline u16 tc_index_to_dp(struct sk_buff *skb)
return skb->tc_index & GRED_VQ_MASK;
}
-static inline void gred_load_wred_set(struct gred_sched *table,
+static inline void gred_load_wred_set(const struct gred_sched *table,
struct gred_sched_data *q)
{
- q->parms.qavg = table->wred_set.qavg;
- q->parms.qidlestart = table->wred_set.qidlestart;
+ q->vars.qavg = table->wred_set.qavg;
+ q->vars.qidlestart = table->wred_set.qidlestart;
}
static inline void gred_store_wred_set(struct gred_sched *table,
struct gred_sched_data *q)
{
- table->wred_set.qavg = q->parms.qavg;
+ table->wred_set.qavg = q->vars.qavg;
+ table->wred_set.qidlestart = q->vars.qidlestart;
}
static inline int gred_use_ecn(struct gred_sched *t)
@@ -150,17 +149,18 @@ static inline int gred_use_harddrop(struct gred_sched *t)
return t->red_flags & TC_RED_HARDDROP;
}
-static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
- struct gred_sched_data *q=NULL;
- struct gred_sched *t= qdisc_priv(sch);
+ struct gred_sched_data *q = NULL;
+ struct gred_sched *t = qdisc_priv(sch);
unsigned long qavg = 0;
u16 dp = tc_index_to_dp(skb);
- if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
+ if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
dp = t->def;
- if ((q = t->tab[dp]) == NULL) {
+ q = t->tab[dp];
+ if (!q) {
/* Pass through packets not assigned to a DP
* if no default DP has been configured. This
* allows for DP flows to be left untouched.
@@ -171,19 +171,19 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
goto drop;
}
- /* fix tc_index? --could be controvesial but needed for
+ /* fix tc_index? --could be controversial but needed for
requeueing */
skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp;
}
- /* sum up all the qaves of prios <= to ours to get the new qave */
+ /* sum up all the qaves of prios < ours to get the new qave */
if (!gred_wred_mode(t) && gred_rio_mode(t)) {
int i;
for (i = 0; i < t->DPs; i++) {
if (t->tab[i] && t->tab[i]->prio < q->prio &&
- !red_is_idling(&t->tab[i]->parms))
- qavg +=t->tab[i]->parms.qavg;
+ !red_is_idling(&t->tab[i]->vars))
+ qavg += t->tab[i]->vars.qavg;
}
}
@@ -194,37 +194,39 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
if (gred_wred_mode(t))
gred_load_wred_set(t, q);
- q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch));
+ q->vars.qavg = red_calc_qavg(&q->parms,
+ &q->vars,
+ gred_backlog(t, q, sch));
- if (red_is_idling(&q->parms))
- red_end_of_idle_period(&q->parms);
+ if (red_is_idling(&q->vars))
+ red_end_of_idle_period(&q->vars);
if (gred_wred_mode(t))
gred_store_wred_set(t, q);
- switch (red_action(&q->parms, q->parms.qavg + qavg)) {
- case RED_DONT_MARK:
- break;
+ switch (red_action(&q->parms, &q->vars, q->vars.qavg + qavg)) {
+ case RED_DONT_MARK:
+ break;
- case RED_PROB_MARK:
- sch->qstats.overlimits++;
- if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
- q->stats.prob_drop++;
- goto congestion_drop;
- }
+ case RED_PROB_MARK:
+ sch->qstats.overlimits++;
+ if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
- q->stats.prob_mark++;
- break;
+ q->stats.prob_mark++;
+ break;
- case RED_HARD_MARK:
- sch->qstats.overlimits++;
- if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
- !INET_ECN_set_ce(skb)) {
- q->stats.forced_drop++;
- goto congestion_drop;
- }
- q->stats.forced_mark++;
- break;
+ case RED_HARD_MARK:
+ sch->qstats.overlimits++;
+ if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+ q->stats.forced_mark++;
+ break;
}
if (q->backlog + qdisc_pkt_len(skb) <= q->limit) {
@@ -241,7 +243,7 @@ congestion_drop:
return NET_XMIT_CN;
}
-static struct sk_buff *gred_dequeue(struct Qdisc* sch)
+static struct sk_buff *gred_dequeue(struct Qdisc *sch)
{
struct sk_buff *skb;
struct gred_sched *t = qdisc_priv(sch);
@@ -253,27 +255,27 @@ static struct sk_buff *gred_dequeue(struct Qdisc* sch)
u16 dp = tc_index_to_dp(skb);
if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
- if (net_ratelimit())
- printk(KERN_WARNING "GRED: Unable to relocate "
- "VQ 0x%x after dequeue, screwing up "
- "backlog.\n", tc_index_to_dp(skb));
+ net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x after dequeue, screwing up backlog\n",
+ tc_index_to_dp(skb));
} else {
q->backlog -= qdisc_pkt_len(skb);
- if (!q->backlog && !gred_wred_mode(t))
- red_start_of_idle_period(&q->parms);
+ if (gred_wred_mode(t)) {
+ if (!sch->qstats.backlog)
+ red_start_of_idle_period(&t->wred_set);
+ } else {
+ if (!q->backlog)
+ red_start_of_idle_period(&q->vars);
+ }
}
return skb;
}
- if (gred_wred_mode(t) && !red_is_idling(&t->wred_set))
- red_start_of_idle_period(&t->wred_set);
-
return NULL;
}
-static unsigned int gred_drop(struct Qdisc* sch)
+static unsigned int gred_drop(struct Qdisc *sch)
{
struct sk_buff *skb;
struct gred_sched *t = qdisc_priv(sch);
@@ -285,30 +287,29 @@ static unsigned int gred_drop(struct Qdisc* sch)
u16 dp = tc_index_to_dp(skb);
if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
- if (net_ratelimit())
- printk(KERN_WARNING "GRED: Unable to relocate "
- "VQ 0x%x while dropping, screwing up "
- "backlog.\n", tc_index_to_dp(skb));
+ net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x while dropping, screwing up backlog\n",
+ tc_index_to_dp(skb));
} else {
q->backlog -= len;
q->stats.other++;
- if (!q->backlog && !gred_wred_mode(t))
- red_start_of_idle_period(&q->parms);
+ if (gred_wred_mode(t)) {
+ if (!sch->qstats.backlog)
+ red_start_of_idle_period(&t->wred_set);
+ } else {
+ if (!q->backlog)
+ red_start_of_idle_period(&q->vars);
+ }
}
qdisc_drop(skb, sch);
return len;
}
- if (gred_wred_mode(t) && !red_is_idling(&t->wred_set))
- red_start_of_idle_period(&t->wred_set);
-
return 0;
-
}
-static void gred_reset(struct Qdisc* sch)
+static void gred_reset(struct Qdisc *sch)
{
int i;
struct gred_sched *t = qdisc_priv(sch);
@@ -321,7 +322,7 @@ static void gred_reset(struct Qdisc* sch)
if (!q)
continue;
- red_restart(&q->parms);
+ red_restart(&q->vars);
q->backlog = 0;
}
}
@@ -369,8 +370,8 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
for (i = table->DPs; i < MAX_DPs; i++) {
if (table->tab[i]) {
- printk(KERN_WARNING "GRED: Warning: Destroying "
- "shadowed VQ 0x%x\n", i);
+ pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n",
+ i);
gred_destroy_vq(table->tab[i]);
table->tab[i] = NULL;
}
@@ -380,29 +381,31 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
}
static inline int gred_change_vq(struct Qdisc *sch, int dp,
- struct tc_gred_qopt *ctl, int prio, u8 *stab)
+ struct tc_gred_qopt *ctl, int prio,
+ u8 *stab, u32 max_P,
+ struct gred_sched_data **prealloc)
{
struct gred_sched *table = qdisc_priv(sch);
- struct gred_sched_data *q;
+ struct gred_sched_data *q = table->tab[dp];
- if (table->tab[dp] == NULL) {
- table->tab[dp] = kzalloc(sizeof(*q), GFP_KERNEL);
- if (table->tab[dp] == NULL)
+ if (!q) {
+ table->tab[dp] = q = *prealloc;
+ *prealloc = NULL;
+ if (!q)
return -ENOMEM;
}
- q = table->tab[dp];
q->DP = dp;
q->prio = prio;
q->limit = ctl->limit;
if (q->backlog == 0)
- red_end_of_idle_period(&q->parms);
+ red_end_of_idle_period(&q->vars);
red_set_parms(&q->parms,
ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog,
- ctl->Scell_log, stab);
-
+ ctl->Scell_log, stab, max_P);
+ red_set_vars(&q->vars);
return 0;
}
@@ -410,6 +413,7 @@ static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = {
[TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) },
[TCA_GRED_STAB] = { .len = 256 },
[TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) },
+ [TCA_GRED_MAX_P] = { .type = NLA_U32 },
};
static int gred_change(struct Qdisc *sch, struct nlattr *opt)
@@ -419,6 +423,8 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt)
struct nlattr *tb[TCA_GRED_MAX + 1];
int err, prio = GRED_DEF_PRIO;
u8 *stab;
+ u32 max_P;
+ struct gred_sched_data *prealloc;
if (opt == NULL)
return -EINVAL;
@@ -434,6 +440,8 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt)
tb[TCA_GRED_STAB] == NULL)
return -EINVAL;
+ max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0;
+
err = -EINVAL;
ctl = nla_data(tb[TCA_GRED_PARMS]);
stab = nla_data(tb[TCA_GRED_STAB]);
@@ -456,9 +464,10 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt)
prio = ctl->prio;
}
+ prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
sch_tree_lock(sch);
- err = gred_change_vq(sch, ctl->DP, ctl, prio, stab);
+ err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc);
if (err < 0)
goto errout_locked;
@@ -472,6 +481,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt)
errout_locked:
sch_tree_unlock(sch);
+ kfree(prealloc);
errout:
return err;
}
@@ -499,6 +509,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
struct gred_sched *table = qdisc_priv(sch);
struct nlattr *parms, *opts = NULL;
int i;
+ u32 max_p[MAX_DPs];
struct tc_gred_sopt sopt = {
.DPs = table->DPs,
.def_DP = table->def,
@@ -509,7 +520,17 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
- NLA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt);
+ if (nla_put(skb, TCA_GRED_DPS, sizeof(sopt), &sopt))
+ goto nla_put_failure;
+
+ for (i = 0; i < MAX_DPs; i++) {
+ struct gred_sched_data *q = table->tab[i];
+
+ max_p[i] = q ? q->parms.max_P : 0;
+ }
+ if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p))
+ goto nla_put_failure;
+
parms = nla_nest_start(skb, TCA_GRED_PARMS);
if (parms == NULL)
goto nla_put_failure;
@@ -517,6 +538,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
for (i = 0; i < MAX_DPs; i++) {
struct gred_sched_data *q = table->tab[i];
struct tc_gred_qopt opt;
+ unsigned long qavg;
memset(&opt, 0, sizeof(opt));
@@ -545,13 +567,12 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
opt.packets = q->packetsin;
opt.bytesin = q->bytesin;
- if (gred_wred_mode(table)) {
- q->parms.qidlestart =
- table->tab[table->def]->parms.qidlestart;
- q->parms.qavg = table->tab[table->def]->parms.qavg;
- }
+ if (gred_wred_mode(table))
+ gred_load_wred_set(table, q);
- opt.qave = red_calc_qavg(&q->parms, q->parms.qavg);
+ qavg = red_calc_qavg(&q->parms, &q->vars,
+ q->vars.qavg >> q->parms.Wlog);
+ opt.qave = qavg >> q->parms.Wlog;
append_opt:
if (nla_append(skb, sizeof(opt), &opt) < 0)
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 069c62b7bb3..ec8aeaac1dd 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -81,8 +81,7 @@
* that are expensive on 32-bit architectures.
*/
-struct internal_sc
-{
+struct internal_sc {
u64 sm1; /* scaled slope of the 1st segment */
u64 ism1; /* scaled inverse-slope of the 1st segment */
u64 dx; /* the x-projection of the 1st segment */
@@ -92,8 +91,7 @@ struct internal_sc
};
/* runtime service curve */
-struct runtime_sc
-{
+struct runtime_sc {
u64 x; /* current starting position on x-axis */
u64 y; /* current starting position on y-axis */
u64 sm1; /* scaled slope of the 1st segment */
@@ -104,21 +102,19 @@ struct runtime_sc
u64 ism2; /* scaled inverse-slope of the 2nd segment */
};
-enum hfsc_class_flags
-{
+enum hfsc_class_flags {
HFSC_RSC = 0x1,
HFSC_FSC = 0x2,
HFSC_USC = 0x4
};
-struct hfsc_class
-{
+struct hfsc_class {
struct Qdisc_class_common cl_common;
unsigned int refcnt; /* usage count */
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
+ struct gnet_stats_rate_est64 rate_est;
unsigned int level; /* class level in hierarchy */
struct tcf_proto *filter_list; /* filter list */
unsigned int filter_cnt; /* filter count */
@@ -140,8 +136,8 @@ struct hfsc_class
u64 cl_cumul; /* cumulative work in bytes done by
real-time criteria */
- u64 cl_d; /* deadline*/
- u64 cl_e; /* eligible time */
+ u64 cl_d; /* deadline*/
+ u64 cl_e; /* eligible time */
u64 cl_vt; /* virtual time */
u64 cl_f; /* time when this class will fit for
link-sharing, max(myf, cfmin) */
@@ -176,8 +172,7 @@ struct hfsc_class
unsigned long cl_nactive; /* number of active children */
};
-struct hfsc_sched
-{
+struct hfsc_sched {
u16 defcls; /* default class id */
struct hfsc_class root; /* root class */
struct Qdisc_class_hash clhash; /* class hash */
@@ -693,7 +688,7 @@ init_vf(struct hfsc_class *cl, unsigned int len)
if (go_active) {
n = rb_last(&cl->cl_parent->vt_tree);
if (n != NULL) {
- max_cl = rb_entry(n, struct hfsc_class,vt_node);
+ max_cl = rb_entry(n, struct hfsc_class, vt_node);
/*
* set vt to the average of the min and max
* classes. if the parent's period didn't
@@ -1177,8 +1172,10 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
return NULL;
}
#endif
- if ((cl = (struct hfsc_class *)res.class) == NULL) {
- if ((cl = hfsc_find_class(res.classid, sch)) == NULL)
+ cl = (struct hfsc_class *)res.class;
+ if (!cl) {
+ cl = hfsc_find_class(res.classid, sch);
+ if (!cl)
break; /* filter selected invalid classid */
if (cl->level >= head->level)
break; /* filter may only point downwards */
@@ -1308,7 +1305,8 @@ hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc)
tsc.m1 = sm2m(sc->sm1);
tsc.d = dx2d(sc->dx);
tsc.m2 = sm2m(sc->sm2);
- NLA_PUT(skb, attr, sizeof(tsc), &tsc);
+ if (nla_put(skb, attr, sizeof(tsc), &tsc))
+ goto nla_put_failure;
return skb->len;
@@ -1316,7 +1314,7 @@ hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc)
return -1;
}
-static inline int
+static int
hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl)
{
if ((cl->cl_flags & HFSC_RSC) &&
@@ -1355,8 +1353,7 @@ hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb,
goto nla_put_failure;
if (hfsc_dump_curves(skb, cl) < 0)
goto nla_put_failure;
- nla_nest_end(skb, nest);
- return skb->len;
+ return nla_nest_end(skb, nest);
nla_put_failure:
nla_nest_cancel(skb, nest);
@@ -1371,6 +1368,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
struct tc_hfsc_stats xstats;
cl->qstats.qlen = cl->qdisc->q.qlen;
+ cl->qstats.backlog = cl->qdisc->qstats.backlog;
xstats.level = cl->level;
xstats.period = cl->cl_vtperiod;
xstats.work = cl->cl_total;
@@ -1390,7 +1388,6 @@ static void
hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct hfsc_sched *q = qdisc_priv(sch);
- struct hlist_node *n;
struct hfsc_class *cl;
unsigned int i;
@@ -1398,7 +1395,7 @@ hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg)
return;
for (i = 0; i < q->clhash.hashsize; i++) {
- hlist_for_each_entry(cl, n, &q->clhash.hash[i],
+ hlist_for_each_entry(cl, &q->clhash.hash[i],
cl_common.hnode) {
if (arg->count < arg->skip) {
arg->count++;
@@ -1420,7 +1417,8 @@ hfsc_schedule_watchdog(struct Qdisc *sch)
struct hfsc_class *cl;
u64 next_time = 0;
- if ((cl = eltree_get_minel(q)) != NULL)
+ cl = eltree_get_minel(q);
+ if (cl)
next_time = cl->cl_e;
if (q->root.cl_cfmin != 0) {
if (next_time == 0 || next_time > q->root.cl_cfmin)
@@ -1523,11 +1521,10 @@ hfsc_reset_qdisc(struct Qdisc *sch)
{
struct hfsc_sched *q = qdisc_priv(sch);
struct hfsc_class *cl;
- struct hlist_node *n;
unsigned int i;
for (i = 0; i < q->clhash.hashsize; i++) {
- hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode)
+ hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)
hfsc_reset_class(cl);
}
q->eligible = RB_ROOT;
@@ -1540,16 +1537,16 @@ static void
hfsc_destroy_qdisc(struct Qdisc *sch)
{
struct hfsc_sched *q = qdisc_priv(sch);
- struct hlist_node *n, *next;
+ struct hlist_node *next;
struct hfsc_class *cl;
unsigned int i;
for (i = 0; i < q->clhash.hashsize; i++) {
- hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode)
+ hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)
tcf_destroy_chain(&cl->filter_list);
}
for (i = 0; i < q->clhash.hashsize; i++) {
- hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
+ hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
cl_common.hnode)
hfsc_destroy_class(sch, cl);
}
@@ -1563,9 +1560,18 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
struct hfsc_sched *q = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
struct tc_hfsc_qopt qopt;
+ struct hfsc_class *cl;
+ unsigned int i;
+
+ sch->qstats.backlog = 0;
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)
+ sch->qstats.backlog += cl->qdisc->qstats.backlog;
+ }
qopt.defcls = q->defcls;
- NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
+ if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
+ goto nla_put_failure;
return skb->len;
nla_put_failure:
@@ -1599,10 +1605,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (cl->qdisc->q.qlen == 1)
set_active(cl, qdisc_pkt_len(skb));
- cl->bstats.packets++;
- cl->bstats.bytes += qdisc_pkt_len(skb);
- sch->bstats.packets++;
- sch->bstats.bytes += qdisc_pkt_len(skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
@@ -1628,7 +1630,8 @@ hfsc_dequeue(struct Qdisc *sch)
* find the class with the minimum deadline among
* the eligible classes.
*/
- if ((cl = eltree_get_mindl(q, cur_time)) != NULL) {
+ cl = eltree_get_mindl(q, cur_time);
+ if (cl) {
realtime = 1;
} else {
/*
@@ -1649,6 +1652,7 @@ hfsc_dequeue(struct Qdisc *sch)
return NULL;
}
+ bstats_update(&cl->bstats, skb);
update_vf(cl, qdisc_pkt_len(skb), cur_time);
if (realtime)
cl->cl_cumul += qdisc_pkt_len(skb);
@@ -1667,7 +1671,8 @@ hfsc_dequeue(struct Qdisc *sch)
set_passive(cl);
}
- sch->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(sch);
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
new file mode 100644
index 00000000000..d85b6812a7d
--- /dev/null
+++ b/net/sched/sch_hhf.c
@@ -0,0 +1,740 @@
+/* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF)
+ *
+ * Copyright (C) 2013 Terry Lam <vtlam@google.com>
+ * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com>
+ */
+
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <net/flow_keys.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+
+/* Heavy-Hitter Filter (HHF)
+ *
+ * Principles :
+ * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter
+ * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified
+ * as heavy-hitter, it is immediately switched to the heavy-hitter bucket.
+ * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler,
+ * in which the heavy-hitter bucket is served with less weight.
+ * In other words, non-heavy-hitters (e.g., short bursts of critical traffic)
+ * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have
+ * higher share of bandwidth.
+ *
+ * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the
+ * following paper:
+ * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and
+ * Accounting", in ACM SIGCOMM, 2002.
+ *
+ * Conceptually, a multi-stage filter comprises k independent hash functions
+ * and k counter arrays. Packets are indexed into k counter arrays by k hash
+ * functions, respectively. The counters are then increased by the packet sizes.
+ * Therefore,
+ * - For a heavy-hitter flow: *all* of its k array counters must be large.
+ * - For a non-heavy-hitter flow: some of its k array counters can be large
+ * due to hash collision with other small flows; however, with high
+ * probability, not *all* k counters are large.
+ *
+ * By the design of the multi-stage filter algorithm, the false negative rate
+ * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is
+ * susceptible to false positives (non-heavy-hitters mistakenly classified as
+ * heavy-hitters).
+ * Therefore, we also implement the following optimizations to reduce false
+ * positives by avoiding unnecessary increment of the counter values:
+ * - Optimization O1: once a heavy-hitter is identified, its bytes are not
+ * accounted in the array counters. This technique is called "shielding"
+ * in Section 3.3.1 of [EV02].
+ * - Optimization O2: conservative update of counters
+ * (Section 3.3.2 of [EV02]),
+ * New counter value = max {old counter value,
+ * smallest counter value + packet bytes}
+ *
+ * Finally, we refresh the counters periodically since otherwise the counter
+ * values will keep accumulating.
+ *
+ * Once a flow is classified as heavy-hitter, we also save its per-flow state
+ * in an exact-matching flow table so that its subsequent packets can be
+ * dispatched to the heavy-hitter bucket accordingly.
+ *
+ *
+ * At a high level, this qdisc works as follows:
+ * Given a packet p:
+ * - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching
+ * heavy-hitter flow table, denoted table T, then send p to the heavy-hitter
+ * bucket.
+ * - Otherwise, forward p to the multi-stage filter, denoted filter F
+ * + If F decides that p belongs to a non-heavy-hitter flow, then send p
+ * to the non-heavy-hitter bucket.
+ * + Otherwise, if F decides that p belongs to a new heavy-hitter flow,
+ * then set up a new flow entry for the flow-id of p in the table T and
+ * send p to the heavy-hitter bucket.
+ *
+ * In this implementation:
+ * - T is a fixed-size hash-table with 1024 entries. Hash collision is
+ * resolved by linked-list chaining.
+ * - F has four counter arrays, each array containing 1024 32-bit counters.
+ * That means 4 * 1024 * 32 bits = 16KB of memory.
+ * - Since each array in F contains 1024 counters, 10 bits are sufficient to
+ * index into each array.
+ * Hence, instead of having four hash functions, we chop the 32-bit
+ * skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is
+ * computed as XOR sum of those three chunks.
+ * - We need to clear the counter arrays periodically; however, directly
+ * memsetting 16KB of memory can lead to cache eviction and unwanted delay.
+ * So by representing each counter by a valid bit, we only need to reset
+ * 4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory.
+ * - The Deficit Round Robin engine is taken from fq_codel implementation
+ * (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to
+ * fq_codel_flow in fq_codel implementation.
+ *
+ */
+
+/* Non-configurable parameters */
+#define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */
+#define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */
+#define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */
+#define HHF_BIT_MASK_LEN 10 /* masking 10 bits */
+#define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */
+
+#define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */
+enum wdrr_bucket_idx {
+ WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */
+ WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */
+};
+
+#define hhf_time_before(a, b) \
+ (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0))
+
+/* Heavy-hitter per-flow state */
+struct hh_flow_state {
+ u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */
+ u32 hit_timestamp; /* last time heavy-hitter was seen */
+ struct list_head flowchain; /* chaining under hash collision */
+};
+
+/* Weighted Deficit Round Robin (WDRR) scheduler */
+struct wdrr_bucket {
+ struct sk_buff *head;
+ struct sk_buff *tail;
+ struct list_head bucketchain;
+ int deficit;
+};
+
+struct hhf_sched_data {
+ struct wdrr_bucket buckets[WDRR_BUCKET_CNT];
+ u32 perturbation; /* hash perturbation */
+ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
+ u32 drop_overlimit; /* number of times max qdisc packet
+ * limit was hit
+ */
+ struct list_head *hh_flows; /* table T (currently active HHs) */
+ u32 hh_flows_limit; /* max active HH allocs */
+ u32 hh_flows_overlimit; /* num of disallowed HH allocs */
+ u32 hh_flows_total_cnt; /* total admitted HHs */
+ u32 hh_flows_current_cnt; /* total current HHs */
+ u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */
+ u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays
+ * was reset
+ */
+ unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits
+ * of hhf_arrays
+ */
+ /* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */
+ struct list_head new_buckets; /* list of new buckets */
+ struct list_head old_buckets; /* list of old buckets */
+
+ /* Configurable HHF parameters */
+ u32 hhf_reset_timeout; /* interval to reset counter
+ * arrays in filter F
+ * (default 40ms)
+ */
+ u32 hhf_admit_bytes; /* counter thresh to classify as
+ * HH (default 128KB).
+ * With these default values,
+ * 128KB / 40ms = 25 Mbps
+ * i.e., we expect to capture HHs
+ * sending > 25 Mbps.
+ */
+ u32 hhf_evict_timeout; /* aging threshold to evict idle
+ * HHs out of table T. This should
+ * be large enough to avoid
+ * reordering during HH eviction.
+ * (default 1s)
+ */
+ u32 hhf_non_hh_weight; /* WDRR weight for non-HHs
+ * (default 2,
+ * i.e., non-HH : HH = 2 : 1)
+ */
+};
+
+static u32 hhf_time_stamp(void)
+{
+ return jiffies;
+}
+
+static unsigned int skb_hash(const struct hhf_sched_data *q,
+ const struct sk_buff *skb)
+{
+ struct flow_keys keys;
+ unsigned int hash;
+
+ if (skb->sk && skb->sk->sk_hash)
+ return skb->sk->sk_hash;
+
+ skb_flow_dissect(skb, &keys);
+ hash = jhash_3words((__force u32)keys.dst,
+ (__force u32)keys.src ^ keys.ip_proto,
+ (__force u32)keys.ports, q->perturbation);
+ return hash;
+}
+
+/* Looks up a heavy-hitter flow in a chaining list of table T. */
+static struct hh_flow_state *seek_list(const u32 hash,
+ struct list_head *head,
+ struct hhf_sched_data *q)
+{
+ struct hh_flow_state *flow, *next;
+ u32 now = hhf_time_stamp();
+
+ if (list_empty(head))
+ return NULL;
+
+ list_for_each_entry_safe(flow, next, head, flowchain) {
+ u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;
+
+ if (hhf_time_before(prev, now)) {
+ /* Delete expired heavy-hitters, but preserve one entry
+ * to avoid kzalloc() when next time this slot is hit.
+ */
+ if (list_is_last(&flow->flowchain, head))
+ return NULL;
+ list_del(&flow->flowchain);
+ kfree(flow);
+ q->hh_flows_current_cnt--;
+ } else if (flow->hash_id == hash) {
+ return flow;
+ }
+ }
+ return NULL;
+}
+
+/* Returns a flow state entry for a new heavy-hitter. Either reuses an expired
+ * entry or dynamically alloc a new entry.
+ */
+static struct hh_flow_state *alloc_new_hh(struct list_head *head,
+ struct hhf_sched_data *q)
+{
+ struct hh_flow_state *flow;
+ u32 now = hhf_time_stamp();
+
+ if (!list_empty(head)) {
+ /* Find an expired heavy-hitter flow entry. */
+ list_for_each_entry(flow, head, flowchain) {
+ u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;
+
+ if (hhf_time_before(prev, now))
+ return flow;
+ }
+ }
+
+ if (q->hh_flows_current_cnt >= q->hh_flows_limit) {
+ q->hh_flows_overlimit++;
+ return NULL;
+ }
+ /* Create new entry. */
+ flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC);
+ if (!flow)
+ return NULL;
+
+ q->hh_flows_current_cnt++;
+ INIT_LIST_HEAD(&flow->flowchain);
+ list_add_tail(&flow->flowchain, head);
+
+ return flow;
+}
+
+/* Assigns packets to WDRR buckets. Implements a multi-stage filter to
+ * classify heavy-hitters.
+ */
+static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ u32 tmp_hash, hash;
+ u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos;
+ struct hh_flow_state *flow;
+ u32 pkt_len, min_hhf_val;
+ int i;
+ u32 prev;
+ u32 now = hhf_time_stamp();
+
+ /* Reset the HHF counter arrays if this is the right time. */
+ prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout;
+ if (hhf_time_before(prev, now)) {
+ for (i = 0; i < HHF_ARRAYS_CNT; i++)
+ bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN);
+ q->hhf_arrays_reset_timestamp = now;
+ }
+
+ /* Get hashed flow-id of the skb. */
+ hash = skb_hash(q, skb);
+
+ /* Check if this packet belongs to an already established HH flow. */
+ flow_pos = hash & HHF_BIT_MASK;
+ flow = seek_list(hash, &q->hh_flows[flow_pos], q);
+ if (flow) { /* found its HH flow */
+ flow->hit_timestamp = now;
+ return WDRR_BUCKET_FOR_HH;
+ }
+
+ /* Now pass the packet through the multi-stage filter. */
+ tmp_hash = hash;
+ xorsum = 0;
+ for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) {
+ /* Split the skb_hash into three 10-bit chunks. */
+ filter_pos[i] = tmp_hash & HHF_BIT_MASK;
+ xorsum ^= filter_pos[i];
+ tmp_hash >>= HHF_BIT_MASK_LEN;
+ }
+ /* The last chunk is computed as XOR sum of other chunks. */
+ filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash;
+
+ pkt_len = qdisc_pkt_len(skb);
+ min_hhf_val = ~0U;
+ for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+ u32 val;
+
+ if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) {
+ q->hhf_arrays[i][filter_pos[i]] = 0;
+ __set_bit(filter_pos[i], q->hhf_valid_bits[i]);
+ }
+
+ val = q->hhf_arrays[i][filter_pos[i]] + pkt_len;
+ if (min_hhf_val > val)
+ min_hhf_val = val;
+ }
+
+ /* Found a new HH iff all counter values > HH admit threshold. */
+ if (min_hhf_val > q->hhf_admit_bytes) {
+ /* Just captured a new heavy-hitter. */
+ flow = alloc_new_hh(&q->hh_flows[flow_pos], q);
+ if (!flow) /* memory alloc problem */
+ return WDRR_BUCKET_FOR_NON_HH;
+ flow->hash_id = hash;
+ flow->hit_timestamp = now;
+ q->hh_flows_total_cnt++;
+
+ /* By returning without updating counters in q->hhf_arrays,
+ * we implicitly implement "shielding" (see Optimization O1).
+ */
+ return WDRR_BUCKET_FOR_HH;
+ }
+
+ /* Conservative update of HHF arrays (see Optimization O2). */
+ for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+ if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val)
+ q->hhf_arrays[i][filter_pos[i]] = min_hhf_val;
+ }
+ return WDRR_BUCKET_FOR_NON_HH;
+}
+
+/* Removes one skb from head of bucket. */
+static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket)
+{
+ struct sk_buff *skb = bucket->head;
+
+ bucket->head = skb->next;
+ skb->next = NULL;
+ return skb;
+}
+
+/* Tail-adds skb to bucket. */
+static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb)
+{
+ if (bucket->head == NULL)
+ bucket->head = skb;
+ else
+ bucket->tail->next = skb;
+ bucket->tail = skb;
+ skb->next = NULL;
+}
+
+static unsigned int hhf_drop(struct Qdisc *sch)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ struct wdrr_bucket *bucket;
+
+ /* Always try to drop from heavy-hitters first. */
+ bucket = &q->buckets[WDRR_BUCKET_FOR_HH];
+ if (!bucket->head)
+ bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH];
+
+ if (bucket->head) {
+ struct sk_buff *skb = dequeue_head(bucket);
+
+ sch->q.qlen--;
+ sch->qstats.drops++;
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ kfree_skb(skb);
+ }
+
+ /* Return id of the bucket from which the packet was dropped. */
+ return bucket - q->buckets;
+}
+
+static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ enum wdrr_bucket_idx idx;
+ struct wdrr_bucket *bucket;
+
+ idx = hhf_classify(skb, sch);
+
+ bucket = &q->buckets[idx];
+ bucket_add(bucket, skb);
+ sch->qstats.backlog += qdisc_pkt_len(skb);
+
+ if (list_empty(&bucket->bucketchain)) {
+ unsigned int weight;
+
+ /* The logic of new_buckets vs. old_buckets is the same as
+ * new_flows vs. old_flows in the implementation of fq_codel,
+ * i.e., short bursts of non-HHs should have strict priority.
+ */
+ if (idx == WDRR_BUCKET_FOR_HH) {
+ /* Always move heavy-hitters to old bucket. */
+ weight = 1;
+ list_add_tail(&bucket->bucketchain, &q->old_buckets);
+ } else {
+ weight = q->hhf_non_hh_weight;
+ list_add_tail(&bucket->bucketchain, &q->new_buckets);
+ }
+ bucket->deficit = weight * q->quantum;
+ }
+ if (++sch->q.qlen <= sch->limit)
+ return NET_XMIT_SUCCESS;
+
+ q->drop_overlimit++;
+ /* Return Congestion Notification only if we dropped a packet from this
+ * bucket.
+ */
+ if (hhf_drop(sch) == idx)
+ return NET_XMIT_CN;
+
+ /* As we dropped a packet, better let upper stack know this. */
+ qdisc_tree_decrease_qlen(sch, 1);
+ return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *hhf_dequeue(struct Qdisc *sch)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = NULL;
+ struct wdrr_bucket *bucket;
+ struct list_head *head;
+
+begin:
+ head = &q->new_buckets;
+ if (list_empty(head)) {
+ head = &q->old_buckets;
+ if (list_empty(head))
+ return NULL;
+ }
+ bucket = list_first_entry(head, struct wdrr_bucket, bucketchain);
+
+ if (bucket->deficit <= 0) {
+ int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ?
+ 1 : q->hhf_non_hh_weight;
+
+ bucket->deficit += weight * q->quantum;
+ list_move_tail(&bucket->bucketchain, &q->old_buckets);
+ goto begin;
+ }
+
+ if (bucket->head) {
+ skb = dequeue_head(bucket);
+ sch->q.qlen--;
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ }
+
+ if (!skb) {
+ /* Force a pass through old_buckets to prevent starvation. */
+ if ((head == &q->new_buckets) && !list_empty(&q->old_buckets))
+ list_move_tail(&bucket->bucketchain, &q->old_buckets);
+ else
+ list_del_init(&bucket->bucketchain);
+ goto begin;
+ }
+ qdisc_bstats_update(sch, skb);
+ bucket->deficit -= qdisc_pkt_len(skb);
+
+ return skb;
+}
+
+static void hhf_reset(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+
+ while ((skb = hhf_dequeue(sch)) != NULL)
+ kfree_skb(skb);
+}
+
+static void *hhf_zalloc(size_t sz)
+{
+ void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
+
+ if (!ptr)
+ ptr = vzalloc(sz);
+
+ return ptr;
+}
+
+static void hhf_free(void *addr)
+{
+ kvfree(addr);
+}
+
+static void hhf_destroy(struct Qdisc *sch)
+{
+ int i;
+ struct hhf_sched_data *q = qdisc_priv(sch);
+
+ for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+ hhf_free(q->hhf_arrays[i]);
+ hhf_free(q->hhf_valid_bits[i]);
+ }
+
+ for (i = 0; i < HH_FLOWS_CNT; i++) {
+ struct hh_flow_state *flow, *next;
+ struct list_head *head = &q->hh_flows[i];
+
+ if (list_empty(head))
+ continue;
+ list_for_each_entry_safe(flow, next, head, flowchain) {
+ list_del(&flow->flowchain);
+ kfree(flow);
+ }
+ }
+ hhf_free(q->hh_flows);
+}
+
+static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = {
+ [TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 },
+ [TCA_HHF_QUANTUM] = { .type = NLA_U32 },
+ [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 },
+ [TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 },
+ [TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 },
+ [TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 },
+ [TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 },
+};
+
+static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_HHF_MAX + 1];
+ unsigned int qlen;
+ int err;
+ u64 non_hh_quantum;
+ u32 new_quantum = q->quantum;
+ u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_HHF_QUANTUM])
+ new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]);
+
+ if (tb[TCA_HHF_NON_HH_WEIGHT])
+ new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]);
+
+ non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight;
+ if (non_hh_quantum > INT_MAX)
+ return -EINVAL;
+
+ sch_tree_lock(sch);
+
+ if (tb[TCA_HHF_BACKLOG_LIMIT])
+ sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]);
+
+ q->quantum = new_quantum;
+ q->hhf_non_hh_weight = new_hhf_non_hh_weight;
+
+ if (tb[TCA_HHF_HH_FLOWS_LIMIT])
+ q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]);
+
+ if (tb[TCA_HHF_RESET_TIMEOUT]) {
+ u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]);
+
+ q->hhf_reset_timeout = usecs_to_jiffies(us);
+ }
+
+ if (tb[TCA_HHF_ADMIT_BYTES])
+ q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]);
+
+ if (tb[TCA_HHF_EVICT_TIMEOUT]) {
+ u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]);
+
+ q->hhf_evict_timeout = usecs_to_jiffies(us);
+ }
+
+ qlen = sch->q.qlen;
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = hhf_dequeue(sch);
+
+ kfree_skb(skb);
+ }
+ qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ int i;
+
+ sch->limit = 1000;
+ q->quantum = psched_mtu(qdisc_dev(sch));
+ q->perturbation = prandom_u32();
+ INIT_LIST_HEAD(&q->new_buckets);
+ INIT_LIST_HEAD(&q->old_buckets);
+
+ /* Configurable HHF parameters */
+ q->hhf_reset_timeout = HZ / 25; /* 40 ms */
+ q->hhf_admit_bytes = 131072; /* 128 KB */
+ q->hhf_evict_timeout = HZ; /* 1 sec */
+ q->hhf_non_hh_weight = 2;
+
+ if (opt) {
+ int err = hhf_change(sch, opt);
+
+ if (err)
+ return err;
+ }
+
+ if (!q->hh_flows) {
+ /* Initialize heavy-hitter flow table. */
+ q->hh_flows = hhf_zalloc(HH_FLOWS_CNT *
+ sizeof(struct list_head));
+ if (!q->hh_flows)
+ return -ENOMEM;
+ for (i = 0; i < HH_FLOWS_CNT; i++)
+ INIT_LIST_HEAD(&q->hh_flows[i]);
+
+ /* Cap max active HHs at twice len of hh_flows table. */
+ q->hh_flows_limit = 2 * HH_FLOWS_CNT;
+ q->hh_flows_overlimit = 0;
+ q->hh_flows_total_cnt = 0;
+ q->hh_flows_current_cnt = 0;
+
+ /* Initialize heavy-hitter filter arrays. */
+ for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+ q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN *
+ sizeof(u32));
+ if (!q->hhf_arrays[i]) {
+ hhf_destroy(sch);
+ return -ENOMEM;
+ }
+ }
+ q->hhf_arrays_reset_timestamp = hhf_time_stamp();
+
+ /* Initialize valid bits of heavy-hitter filter arrays. */
+ for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+ q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN /
+ BITS_PER_BYTE);
+ if (!q->hhf_valid_bits[i]) {
+ hhf_destroy(sch);
+ return -ENOMEM;
+ }
+ }
+
+ /* Initialize Weighted DRR buckets. */
+ for (i = 0; i < WDRR_BUCKET_CNT; i++) {
+ struct wdrr_bucket *bucket = q->buckets + i;
+
+ INIT_LIST_HEAD(&bucket->bucketchain);
+ }
+ }
+
+ return 0;
+}
+
+static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) ||
+ nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) ||
+ nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) ||
+ nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT,
+ jiffies_to_usecs(q->hhf_reset_timeout)) ||
+ nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) ||
+ nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT,
+ jiffies_to_usecs(q->hhf_evict_timeout)) ||
+ nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ return -1;
+}
+
+static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ struct tc_hhf_xstats st = {
+ .drop_overlimit = q->drop_overlimit,
+ .hh_overlimit = q->hh_flows_overlimit,
+ .hh_tot_count = q->hh_flows_total_cnt,
+ .hh_cur_count = q->hh_flows_current_cnt,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct Qdisc_ops hhf_qdisc_ops __read_mostly = {
+ .id = "hhf",
+ .priv_size = sizeof(struct hhf_sched_data),
+
+ .enqueue = hhf_enqueue,
+ .dequeue = hhf_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .drop = hhf_drop,
+ .init = hhf_init,
+ .reset = hhf_reset,
+ .destroy = hhf_destroy,
+ .change = hhf_change,
+ .dump = hhf_dump,
+ .dump_stats = hhf_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init hhf_module_init(void)
+{
+ return register_qdisc(&hhf_qdisc_ops);
+}
+
+static void __exit hhf_module_exit(void)
+{
+ unregister_qdisc(&hhf_qdisc_ops);
+}
+
+module_init(hhf_module_init)
+module_exit(hhf_module_exit)
+MODULE_AUTHOR("Terry Lam");
+MODULE_AUTHOR("Nandita Dukkipati");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 01b519d6c52..9f949abcace 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -38,6 +38,7 @@
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <net/netlink.h>
+#include <net/sch_generic.h>
#include <net/pkt_sched.h>
/* HTB algorithm.
@@ -64,6 +65,10 @@ static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis f
module_param (htb_hysteresis, int, 0640);
MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
+static int htb_rate_est = 0; /* htb classes have a default rate estimator */
+module_param(htb_rate_est, int, 0640);
+MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
+
/* used internaly to keep status of single class */
enum htb_cmode {
HTB_CANT_SEND, /* class can't send and can't borrow */
@@ -71,94 +76,105 @@ enum htb_cmode {
HTB_CAN_SEND /* class can send */
};
-/* interior & leaf nodes; props specific to leaves are marked L: */
+struct htb_prio {
+ union {
+ struct rb_root row;
+ struct rb_root feed;
+ };
+ struct rb_node *ptr;
+ /* When class changes from state 1->2 and disconnects from
+ * parent's feed then we lost ptr value and start from the
+ * first child again. Here we store classid of the
+ * last valid ptr (used when ptr is NULL).
+ */
+ u32 last_ptr_id;
+};
+
+/* interior & leaf nodes; props specific to leaves are marked L:
+ * To reduce false sharing, place mostly read fields at beginning,
+ * and mostly written ones at the end.
+ */
struct htb_class {
struct Qdisc_class_common common;
- /* general class parameters */
- struct gnet_stats_basic_packed bstats;
- struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
- struct tc_htb_xstats xstats; /* our special stats */
- int refcnt; /* usage count of this class */
+ struct psched_ratecfg rate;
+ struct psched_ratecfg ceil;
+ s64 buffer, cbuffer;/* token bucket depth/rate */
+ s64 mbuffer; /* max wait time */
+ u32 prio; /* these two are used only by leaves... */
+ int quantum; /* but stored for parent-to-leaf return */
+
+ struct tcf_proto *filter_list; /* class attached filters */
+ int filter_cnt;
+ int refcnt; /* usage count of this class */
- /* topology */
- int level; /* our level (see above) */
- unsigned int children;
- struct htb_class *parent; /* parent class */
+ int level; /* our level (see above) */
+ unsigned int children;
+ struct htb_class *parent; /* parent class */
- int prio; /* these two are used only by leaves... */
- int quantum; /* but stored for parent-to-leaf return */
+ struct gnet_stats_rate_est64 rate_est;
+
+ /*
+ * Written often fields
+ */
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+ struct tc_htb_xstats xstats; /* our special stats */
+
+ /* token bucket parameters */
+ s64 tokens, ctokens;/* current number of tokens */
+ s64 t_c; /* checkpoint time */
union {
struct htb_class_leaf {
- struct Qdisc *q;
- int deficit[TC_HTB_MAXDEPTH];
struct list_head drop_list;
+ int deficit[TC_HTB_MAXDEPTH];
+ struct Qdisc *q;
} leaf;
struct htb_class_inner {
- struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */
- struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */
- /* When class changes from state 1->2 and disconnects from
- parent's feed then we lost ptr value and start from the
- first child again. Here we store classid of the
- last valid ptr (used when ptr is NULL). */
- u32 last_ptr_id[TC_HTB_NUMPRIO];
+ struct htb_prio clprio[TC_HTB_NUMPRIO];
} inner;
} un;
- struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */
- struct rb_node pq_node; /* node for event queue */
- psched_time_t pq_key;
-
- int prio_activity; /* for which prios are we active */
- enum htb_cmode cmode; /* current mode of the class */
+ s64 pq_key;
- /* class attached filters */
- struct tcf_proto *filter_list;
- int filter_cnt;
+ int prio_activity; /* for which prios are we active */
+ enum htb_cmode cmode; /* current mode of the class */
+ struct rb_node pq_node; /* node for event queue */
+ struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */
+};
- /* token bucket parameters */
- struct qdisc_rate_table *rate; /* rate table of the class itself */
- struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */
- long buffer, cbuffer; /* token bucket depth/rate */
- psched_tdiff_t mbuffer; /* max wait time */
- long tokens, ctokens; /* current number of tokens */
- psched_time_t t_c; /* checkpoint time */
+struct htb_level {
+ struct rb_root wait_pq;
+ struct htb_prio hprio[TC_HTB_NUMPRIO];
};
struct htb_sched {
struct Qdisc_class_hash clhash;
- struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
+ int defcls; /* class where unclassified flows go to */
+ int rate2quantum; /* quant = rate / rate2quantum */
- /* self list - roots of self generating tree */
- struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
- int row_mask[TC_HTB_MAXDEPTH];
- struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
- u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
-
- /* self wait list - roots of wait PQs per row */
- struct rb_root wait_pq[TC_HTB_MAXDEPTH];
+ /* filters for qdisc itself */
+ struct tcf_proto *filter_list;
- /* time of nearest event per level (row) */
- psched_time_t near_ev_cache[TC_HTB_MAXDEPTH];
+#define HTB_WARN_TOOMANYEVENTS 0x1
+ unsigned int warned; /* only one warning */
+ int direct_qlen;
+ struct work_struct work;
- int defcls; /* class where unclassified flows go to */
+ /* non shaped skbs; let them go directly thru */
+ struct sk_buff_head direct_queue;
+ long direct_pkts;
- /* filters for qdisc itself */
- struct tcf_proto *filter_list;
+ struct qdisc_watchdog watchdog;
- int rate2quantum; /* quant = rate / rate2quantum */
- psched_time_t now; /* cached dequeue time */
- struct qdisc_watchdog watchdog;
+ s64 now; /* cached dequeue time */
+ struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
- /* non shaped skbs; let them go directly thru */
- struct sk_buff_head direct_queue;
- int direct_qlen; /* max qlen of above */
+ /* time of nearest event per level (row) */
+ s64 near_ev_cache[TC_HTB_MAXDEPTH];
- long direct_pkts;
+ int row_mask[TC_HTB_MAXDEPTH];
-#define HTB_WARN_TOOMANYEVENTS 0x1
- unsigned int warned; /* only one warning */
- struct work_struct work;
+ struct htb_level hlevel[TC_HTB_MAXDEPTH];
};
/* find class in global hash table using given handle */
@@ -182,10 +198,10 @@ static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
* filters in qdisc and in inner nodes (if higher filter points to the inner
* node). If we end up with classid MAJOR:0 we enqueue the skb into special
* internal fifo (direct). These packets then go directly thru. If we still
- * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessfull
+ * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful
* then finish and return direct queue.
*/
-#define HTB_DIRECT (struct htb_class*)-1
+#define HTB_DIRECT ((struct htb_class *)-1L)
static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
int *qerr)
@@ -197,15 +213,22 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
int result;
/* allow to select class by setting skb->priority to valid classid;
- note that nfmark can be used too by attaching filter fw with no
- rules in it */
+ * note that nfmark can be used too by attaching filter fw with no
+ * rules in it
+ */
if (skb->priority == sch->handle)
return HTB_DIRECT; /* X:0 (direct flow) selected */
- if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0)
- return cl;
+ cl = htb_find(skb->priority, sch);
+ if (cl) {
+ if (cl->level == 0)
+ return cl;
+ /* Start with inner filter chain if a non-leaf class is selected */
+ tcf = cl->filter_list;
+ } else {
+ tcf = q->filter_list;
+ }
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- tcf = q->filter_list;
while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -216,10 +239,12 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
return NULL;
}
#endif
- if ((cl = (void *)res.class) == NULL) {
+ cl = (void *)res.class;
+ if (!cl) {
if (res.classid == sch->handle)
return HTB_DIRECT; /* X:0 (direct flow) */
- if ((cl = htb_find(res.classid, sch)) == NULL)
+ cl = htb_find(res.classid, sch);
+ if (!cl)
break; /* filter selected invalid classid */
}
if (!cl->level)
@@ -268,9 +293,9 @@ static void htb_add_to_id_tree(struct rb_root *root,
* already in the queue.
*/
static void htb_add_to_wait_tree(struct htb_sched *q,
- struct htb_class *cl, long delay)
+ struct htb_class *cl, s64 delay)
{
- struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
+ struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
cl->pq_key = q->now + delay;
if (cl->pq_key == q->now)
@@ -290,7 +315,7 @@ static void htb_add_to_wait_tree(struct htb_sched *q,
p = &parent->rb_left;
}
rb_link_node(&cl->pq_node, parent, p);
- rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
+ rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
}
/**
@@ -317,7 +342,7 @@ static inline void htb_add_class_to_row(struct htb_sched *q,
while (mask) {
int prio = ffz(~mask);
mask &= ~(1 << prio);
- htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
+ htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
}
}
@@ -343,16 +368,18 @@ static inline void htb_remove_class_from_row(struct htb_sched *q,
struct htb_class *cl, int mask)
{
int m = 0;
+ struct htb_level *hlevel = &q->hlevel[cl->level];
while (mask) {
int prio = ffz(~mask);
+ struct htb_prio *hprio = &hlevel->hprio[prio];
mask &= ~(1 << prio);
- if (q->ptr[cl->level][prio] == cl->node + prio)
- htb_next_rb_node(q->ptr[cl->level] + prio);
+ if (hprio->ptr == cl->node + prio)
+ htb_next_rb_node(&hprio->ptr);
- htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
- if (!q->row[cl->level][prio].rb_node)
+ htb_safe_rb_erase(cl->node + prio, &hprio->row);
+ if (!hprio->row.rb_node)
m |= 1 << prio;
}
q->row_mask[cl->level] &= ~m;
@@ -376,12 +403,13 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
int prio = ffz(~m);
m &= ~(1 << prio);
- if (p->un.inner.feed[prio].rb_node)
+ if (p->un.inner.clprio[prio].feed.rb_node)
/* parent already has its feed in use so that
- reset bit in mask as parent is already ok */
+ * reset bit in mask as parent is already ok
+ */
mask &= ~(1 << prio);
- htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
+ htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
}
p->prio_activity |= mask;
cl = p;
@@ -411,17 +439,19 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
int prio = ffz(~m);
m &= ~(1 << prio);
- if (p->un.inner.ptr[prio] == cl->node + prio) {
+ if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
/* we are removing child which is pointed to from
- parent feed - forget the pointer but remember
- classid */
- p->un.inner.last_ptr_id[prio] = cl->common.classid;
- p->un.inner.ptr[prio] = NULL;
+ * parent feed - forget the pointer but remember
+ * classid
+ */
+ p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
+ p->un.inner.clprio[prio].ptr = NULL;
}
- htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);
+ htb_safe_rb_erase(cl->node + prio,
+ &p->un.inner.clprio[prio].feed);
- if (!p->un.inner.feed[prio].rb_node)
+ if (!p->un.inner.clprio[prio].feed.rb_node)
mask |= 1 << prio;
}
@@ -434,14 +464,14 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
htb_remove_class_from_row(q, cl, mask);
}
-static inline long htb_lowater(const struct htb_class *cl)
+static inline s64 htb_lowater(const struct htb_class *cl)
{
if (htb_hysteresis)
return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
else
return 0;
}
-static inline long htb_hiwater(const struct htb_class *cl)
+static inline s64 htb_hiwater(const struct htb_class *cl)
{
if (htb_hysteresis)
return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
@@ -462,9 +492,9 @@ static inline long htb_hiwater(const struct htb_class *cl)
* mode transitions per time unit. The speed gain is about 1/6.
*/
static inline enum htb_cmode
-htb_class_mode(struct htb_class *cl, long *diff)
+htb_class_mode(struct htb_class *cl, s64 *diff)
{
- long toks;
+ s64 toks;
if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
*diff = -toks;
@@ -488,7 +518,7 @@ htb_class_mode(struct htb_class *cl, long *diff)
* to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
*/
static void
-htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
+htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
{
enum htb_cmode new_mode = htb_class_mode(cl, diff);
@@ -551,9 +581,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
__skb_queue_tail(&q->direct_queue, skb);
q->direct_pkts++;
} else {
- kfree_skb(skb);
- sch->qstats.drops++;
- return NET_XMIT_DROP;
+ return qdisc_drop(skb, sch);
}
#ifdef CONFIG_NET_CLS_ACT
} else if (!cl) {
@@ -569,38 +597,33 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
return ret;
} else {
- cl->bstats.packets +=
- skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
- cl->bstats.bytes += qdisc_pkt_len(skb);
htb_activate(q, cl);
}
sch->q.qlen++;
- sch->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
- sch->bstats.bytes += qdisc_pkt_len(skb);
return NET_XMIT_SUCCESS;
}
-static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, long diff)
+static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
{
- long toks = diff + cl->tokens;
+ s64 toks = diff + cl->tokens;
if (toks > cl->buffer)
toks = cl->buffer;
- toks -= (long) qdisc_l2t(cl->rate, bytes);
+ toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
if (toks <= -cl->mbuffer)
toks = 1 - cl->mbuffer;
cl->tokens = toks;
}
-static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, long diff)
+static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
{
- long toks = diff + cl->ctokens;
+ s64 toks = diff + cl->ctokens;
if (toks > cl->cbuffer)
toks = cl->cbuffer;
- toks -= (long) qdisc_l2t(cl->ceil, bytes);
+ toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
if (toks <= -cl->mbuffer)
toks = 1 - cl->mbuffer;
@@ -623,10 +646,10 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
{
int bytes = qdisc_pkt_len(skb);
enum htb_cmode old_mode;
- long diff;
+ s64 diff;
while (cl) {
- diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
+ diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
if (cl->level >= level) {
if (cl->level == level)
cl->xstats.lends++;
@@ -643,17 +666,15 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
htb_change_class_mode(q, cl, &diff);
if (old_mode != cl->cmode) {
if (old_mode != HTB_CAN_SEND)
- htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
+ htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
if (cl->cmode != HTB_CAN_SEND)
htb_add_to_wait_tree(q, cl, diff);
}
- /* update byte stats except for leaves which are already updated */
- if (cl->level) {
- cl->bstats.bytes += bytes;
- cl->bstats.packets += skb_is_gso(skb)?
- skb_shinfo(skb)->gso_segs:1;
- }
+ /* update basic stats except for leaves which are already updated */
+ if (cl->level)
+ bstats_update(&cl->bstats, skb);
+
cl = cl->parent;
}
}
@@ -665,17 +686,20 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
* next pending event (0 for no event in pq, q->now for too many events).
* Note: Applied are events whose have cl->pq_key <= q->now.
*/
-static psched_time_t htb_do_events(struct htb_sched *q, int level,
- unsigned long start)
+static s64 htb_do_events(struct htb_sched *q, const int level,
+ unsigned long start)
{
/* don't run for longer than 2 jiffies; 2 is used instead of
- 1 to simplify things when jiffy is going to be incremented
- too soon */
+ * 1 to simplify things when jiffy is going to be incremented
+ * too soon
+ */
unsigned long stop_at = start + 2;
+ struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
+
while (time_before(jiffies, stop_at)) {
struct htb_class *cl;
- long diff;
- struct rb_node *p = rb_first(&q->wait_pq[level]);
+ s64 diff;
+ struct rb_node *p = rb_first(wait_pq);
if (!p)
return 0;
@@ -684,8 +708,8 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
if (cl->pq_key > q->now)
return cl->pq_key;
- htb_safe_rb_erase(p, q->wait_pq + level);
- diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
+ htb_safe_rb_erase(p, wait_pq);
+ diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
htb_change_class_mode(q, cl, &diff);
if (cl->cmode != HTB_CAN_SEND)
htb_add_to_wait_tree(q, cl, diff);
@@ -693,7 +717,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
/* too much load - let's continue after a break for scheduling */
if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
- printk(KERN_WARNING "htb: too many events!\n");
+ pr_warn("htb: too many events!\n");
q->warned |= HTB_WARN_TOOMANYEVENTS;
}
@@ -701,7 +725,8 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
}
/* Returns class->node+prio from id-tree where classe's id is >= id. NULL
- is no such one exists. */
+ * is no such one exists.
+ */
static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
u32 id)
{
@@ -727,8 +752,7 @@ static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
*
* Find leaf where current feed pointers points to.
*/
-static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
- struct rb_node **pptr, u32 * pid)
+static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
{
int i;
struct {
@@ -737,20 +761,22 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
u32 *pid;
} stk[TC_HTB_MAXDEPTH], *sp = stk;
- BUG_ON(!tree->rb_node);
- sp->root = tree->rb_node;
- sp->pptr = pptr;
- sp->pid = pid;
+ BUG_ON(!hprio->row.rb_node);
+ sp->root = hprio->row.rb_node;
+ sp->pptr = &hprio->ptr;
+ sp->pid = &hprio->last_ptr_id;
for (i = 0; i < 65535; i++) {
if (!*sp->pptr && *sp->pid) {
/* ptr was invalidated but id is valid - try to recover
- the original or next ptr */
+ * the original or next ptr
+ */
*sp->pptr =
htb_id_find_next_upper(prio, sp->root, *sp->pid);
}
*sp->pid = 0; /* ptr is valid now so that remove this hint as it
- can become out of date quickly */
+ * can become out of date quickly
+ */
if (!*sp->pptr) { /* we are at right end; rewind & go up */
*sp->pptr = sp->root;
while ((*sp->pptr)->rb_left)
@@ -765,12 +791,15 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
}
} else {
struct htb_class *cl;
+ struct htb_prio *clp;
+
cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
if (!cl->level)
return cl;
- (++sp)->root = cl->un.inner.feed[prio].rb_node;
- sp->pptr = cl->un.inner.ptr + prio;
- sp->pid = cl->un.inner.last_ptr_id + prio;
+ clp = &cl->un.inner.clprio[prio];
+ (++sp)->root = clp->feed.rb_node;
+ sp->pptr = &clp->ptr;
+ sp->pid = &clp->last_ptr_id;
}
}
WARN_ON(1);
@@ -778,16 +807,18 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
}
/* dequeues packet at given priority and level; call only if
- you are sure that there is active class at prio/level */
-static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
- int level)
+ * you are sure that there is active class at prio/level
+ */
+static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
+ const int level)
{
struct sk_buff *skb = NULL;
struct htb_class *cl, *start;
+ struct htb_level *hlevel = &q->hlevel[level];
+ struct htb_prio *hprio = &hlevel->hprio[prio];
+
/* look initial class up in the row */
- start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
- q->ptr[level] + prio,
- q->last_ptr_id[level] + prio);
+ start = cl = htb_lookup_leaf(hprio, prio);
do {
next:
@@ -795,9 +826,10 @@ next:
return NULL;
/* class can be empty - it is unlikely but can be true if leaf
- qdisc drops packets in enqueue routine or if someone used
- graft operation on the leaf since last dequeue;
- simply deactivate and skip such class */
+ * qdisc drops packets in enqueue routine or if someone used
+ * graft operation on the leaf since last dequeue;
+ * simply deactivate and skip such class
+ */
if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
struct htb_class *next;
htb_deactivate(q, cl);
@@ -806,9 +838,7 @@ next:
if ((q->row_mask[level] & (1 << prio)) == 0)
return NULL;
- next = htb_lookup_leaf(q->row[level] + prio,
- prio, q->ptr[level] + prio,
- q->last_ptr_id[level] + prio);
+ next = htb_lookup_leaf(hprio, prio);
if (cl == start) /* fix start if we just deleted it */
start = next;
@@ -821,23 +851,23 @@ next:
break;
qdisc_warn_nonwc("htb", cl->un.leaf.q);
- htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
- ptr[0]) + prio);
- cl = htb_lookup_leaf(q->row[level] + prio, prio,
- q->ptr[level] + prio,
- q->last_ptr_id[level] + prio);
+ htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
+ &q->hlevel[0].hprio[prio].ptr);
+ cl = htb_lookup_leaf(hprio, prio);
} while (cl != start);
if (likely(skb != NULL)) {
+ bstats_update(&cl->bstats, skb);
cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
if (cl->un.leaf.deficit[level] < 0) {
cl->un.leaf.deficit[level] += cl->quantum;
- htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
- ptr[0]) + prio);
+ htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
+ &q->hlevel[0].hprio[prio].ptr);
}
/* this used to be after charge_class but this constelation
- gives us slightly better performance */
+ * gives us slightly better performance
+ */
if (!cl->un.leaf.q->q.qlen)
htb_deactivate(q, cl);
htb_charge_class(q, cl, level, skb);
@@ -847,39 +877,40 @@ next:
static struct sk_buff *htb_dequeue(struct Qdisc *sch)
{
- struct sk_buff *skb = NULL;
+ struct sk_buff *skb;
struct htb_sched *q = qdisc_priv(sch);
int level;
- psched_time_t next_event;
+ s64 next_event;
unsigned long start_at;
/* try to dequeue direct packets as high prio (!) to minimize cpu work */
skb = __skb_dequeue(&q->direct_queue);
if (skb != NULL) {
- sch->flags &= ~TCQ_F_THROTTLED;
+ok:
+ qdisc_bstats_update(sch, skb);
+ qdisc_unthrottled(sch);
sch->q.qlen--;
return skb;
}
if (!sch->q.qlen)
goto fin;
- q->now = psched_get_time();
+ q->now = ktime_to_ns(ktime_get());
start_at = jiffies;
- next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
+ next_event = q->now + 5LLU * NSEC_PER_SEC;
for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
/* common case optimization - skip event handler quickly */
int m;
- psched_time_t event;
+ s64 event = q->near_ev_cache[level];
- if (q->now >= q->near_ev_cache[level]) {
+ if (q->now >= event) {
event = htb_do_events(q, level, start_at);
if (!event)
- event = q->now + PSCHED_TICKS_PER_SEC;
+ event = q->now + NSEC_PER_SEC;
q->near_ev_cache[level] = event;
- } else
- event = q->near_ev_cache[level];
+ }
if (next_event > event)
next_event = event;
@@ -887,20 +918,25 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
m = ~q->row_mask[level];
while (m != (int)(-1)) {
int prio = ffz(m);
+
m |= 1 << prio;
skb = htb_dequeue_tree(q, prio, level);
- if (likely(skb != NULL)) {
- sch->q.qlen--;
- sch->flags &= ~TCQ_F_THROTTLED;
- goto fin;
- }
+ if (likely(skb != NULL))
+ goto ok;
}
}
sch->qstats.overlimits++;
- if (likely(next_event > q->now))
- qdisc_watchdog_schedule(&q->watchdog, next_event);
- else
+ if (likely(next_event > q->now)) {
+ if (!test_bit(__QDISC_STATE_DEACTIVATED,
+ &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
+ ktime_t time = ns_to_ktime(next_event);
+ qdisc_throttled(q->watchdog.qdisc);
+ hrtimer_start(&q->watchdog.timer, time,
+ HRTIMER_MODE_ABS);
+ }
+ } else {
schedule_work(&q->work);
+ }
fin:
return skb;
}
@@ -935,11 +971,10 @@ static void htb_reset(struct Qdisc *sch)
{
struct htb_sched *q = qdisc_priv(sch);
struct htb_class *cl;
- struct hlist_node *n;
unsigned int i;
for (i = 0; i < q->clhash.hashsize; i++) {
- hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
if (cl->level)
memset(&cl->un.inner, 0, sizeof(cl->un.inner));
else {
@@ -955,10 +990,8 @@ static void htb_reset(struct Qdisc *sch)
qdisc_watchdog_cancel(&q->watchdog);
__skb_queue_purge(&q->direct_queue);
sch->q.qlen = 0;
- memset(q->row, 0, sizeof(q->row));
+ memset(q->hlevel, 0, sizeof(q->hlevel));
memset(q->row_mask, 0, sizeof(q->row_mask));
- memset(q->wait_pq, 0, sizeof(q->wait_pq));
- memset(q->ptr, 0, sizeof(q->ptr));
for (i = 0; i < TC_HTB_NUMPRIO; i++)
INIT_LIST_HEAD(q->drops + i);
}
@@ -968,6 +1001,9 @@ static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
[TCA_HTB_INIT] = { .len = sizeof(struct tc_htb_glob) },
[TCA_HTB_CTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
[TCA_HTB_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+ [TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
+ [TCA_HTB_RATE64] = { .type = NLA_U64 },
+ [TCA_HTB_CEIL64] = { .type = NLA_U64 },
};
static void htb_work_func(struct work_struct *work)
@@ -981,7 +1017,7 @@ static void htb_work_func(struct work_struct *work)
static int htb_init(struct Qdisc *sch, struct nlattr *opt)
{
struct htb_sched *q = qdisc_priv(sch);
- struct nlattr *tb[TCA_HTB_INIT + 1];
+ struct nlattr *tb[TCA_HTB_MAX + 1];
struct tc_htb_glob *gopt;
int err;
int i;
@@ -989,21 +1025,16 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
if (!opt)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy);
+ err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
if (err < 0)
return err;
- if (tb[TCA_HTB_INIT] == NULL) {
- printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
+ if (!tb[TCA_HTB_INIT])
return -EINVAL;
- }
+
gopt = nla_data(tb[TCA_HTB_INIT]);
- if (gopt->version != HTB_VER >> 16) {
- printk(KERN_ERR
- "HTB: need tc/htb version %d (minor is %d), you have %d\n",
- HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
+ if (gopt->version != HTB_VER >> 16)
return -EINVAL;
- }
err = qdisc_class_hash_init(&q->clhash);
if (err < 0)
@@ -1015,10 +1046,13 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
INIT_WORK(&q->work, htb_work_func);
skb_queue_head_init(&q->direct_queue);
- q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
- if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */
- q->direct_qlen = 2;
-
+ if (tb[TCA_HTB_DIRECT_QLEN])
+ q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
+ else {
+ q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
+ if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */
+ q->direct_qlen = 2;
+ }
if ((q->rate2quantum = gopt->rate2quantum) < 1)
q->rate2quantum = 1;
q->defcls = gopt->defcls;
@@ -1028,12 +1062,13 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
{
- spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
struct htb_sched *q = qdisc_priv(sch);
struct nlattr *nest;
struct tc_htb_glob gopt;
- spin_lock_bh(root_lock);
+ /* Its safe to not acquire qdisc lock. As we hold RTNL,
+ * no change can happen on the qdisc parameters.
+ */
gopt.direct_pkts = q->direct_pkts;
gopt.version = HTB_VER;
@@ -1044,14 +1079,13 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
- NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
- nla_nest_end(skb, nest);
+ if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
+ nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
+ goto nla_put_failure;
- spin_unlock_bh(root_lock);
- return skb->len;
+ return nla_nest_end(skb, nest);
nla_put_failure:
- spin_unlock_bh(root_lock);
nla_nest_cancel(skb, nest);
return -1;
}
@@ -1060,11 +1094,12 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct htb_class *cl = (struct htb_class *)arg;
- spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
struct nlattr *nest;
struct tc_htb_opt opt;
- spin_lock_bh(root_lock);
+ /* Its safe to not acquire qdisc lock. As we hold RTNL,
+ * no change can happen on the class parameters.
+ */
tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
tcm->tcm_handle = cl->common.classid;
if (!cl->level && cl->un.leaf.q)
@@ -1076,21 +1111,25 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
memset(&opt, 0, sizeof(opt));
- opt.rate = cl->rate->rate;
- opt.buffer = cl->buffer;
- opt.ceil = cl->ceil->rate;
- opt.cbuffer = cl->cbuffer;
+ psched_ratecfg_getrate(&opt.rate, &cl->rate);
+ opt.buffer = PSCHED_NS2TICKS(cl->buffer);
+ psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
+ opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
opt.quantum = cl->quantum;
opt.prio = cl->prio;
opt.level = cl->level;
- NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);
+ if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps))
+ goto nla_put_failure;
+ if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps))
+ goto nla_put_failure;
- nla_nest_end(skb, nest);
- spin_unlock_bh(root_lock);
- return skb->len;
+ return nla_nest_end(skb, nest);
nla_put_failure:
- spin_unlock_bh(root_lock);
nla_nest_cancel(skb, nest);
return -1;
}
@@ -1102,8 +1141,8 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
if (!cl->level && cl->un.leaf.q)
cl->qstats.qlen = cl->un.leaf.q->q.qlen;
- cl->xstats.tokens = cl->tokens;
- cl->xstats.ctokens = cl->ctokens;
+ cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens);
+ cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens);
if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
@@ -1177,7 +1216,8 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
if (parent->cmode != HTB_CAN_SEND)
- htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level);
+ htb_safe_rb_erase(&parent->pq_node,
+ &q->hlevel[parent->level].wait_pq);
parent->level = 0;
memset(&parent->un.inner, 0, sizeof(parent->un.inner));
@@ -1185,7 +1225,7 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
parent->tokens = parent->buffer;
parent->ctokens = parent->cbuffer;
- parent->t_c = psched_get_time();
+ parent->t_c = ktime_to_ns(ktime_get());
parent->cmode = HTB_CAN_SEND;
}
@@ -1196,9 +1236,6 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
qdisc_destroy(cl->un.leaf.q);
}
gen_kill_estimator(&cl->bstats, &cl->rate_est);
- qdisc_put_rtab(cl->rate);
- qdisc_put_rtab(cl->ceil);
-
tcf_destroy_chain(&cl->filter_list);
kfree(cl);
}
@@ -1206,24 +1243,25 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
static void htb_destroy(struct Qdisc *sch)
{
struct htb_sched *q = qdisc_priv(sch);
- struct hlist_node *n, *next;
+ struct hlist_node *next;
struct htb_class *cl;
unsigned int i;
cancel_work_sync(&q->work);
qdisc_watchdog_cancel(&q->watchdog);
/* This line used to be after htb_destroy_class call below
- and surprisingly it worked in 2.4. But it must precede it
- because filter need its target class alive to be able to call
- unbind_filter on it (without Oops). */
+ * and surprisingly it worked in 2.4. But it must precede it
+ * because filter need its target class alive to be able to call
+ * unbind_filter on it (without Oops).
+ */
tcf_destroy_chain(&q->filter_list);
for (i = 0; i < q->clhash.hashsize; i++) {
- hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode)
tcf_destroy_chain(&cl->filter_list);
}
for (i = 0; i < q->clhash.hashsize; i++) {
- hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
+ hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
common.hnode)
htb_destroy_class(sch, cl);
}
@@ -1239,9 +1277,10 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
struct Qdisc *new_q = NULL;
int last_child = 0;
- // TODO: why don't allow to delete subtree ? references ? does
- // tc subsys quarantee us that in htb_destroy it holds no class
- // refs so that we can remove children safely there ?
+ /* TODO: why don't allow to delete subtree ? references ? does
+ * tc subsys guarantee us that in htb_destroy it holds no class
+ * refs so that we can remove children safely there ?
+ */
if (cl->children || cl->filter_cnt)
return -EBUSY;
@@ -1268,7 +1307,8 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
htb_deactivate(q, cl);
if (cl->cmode != HTB_CAN_SEND)
- htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
+ htb_safe_rb_erase(&cl->pq_node,
+ &q->hlevel[cl->level].wait_pq);
if (last_child)
htb_parent_to_leaf(q, cl, new_q);
@@ -1299,9 +1339,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
struct htb_sched *q = qdisc_priv(sch);
struct htb_class *cl = (struct htb_class *)*arg, *parent;
struct nlattr *opt = tca[TCA_OPTIONS];
- struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
- struct nlattr *tb[__TCA_HTB_MAX];
+ struct nlattr *tb[TCA_HTB_MAX + 1];
struct tc_htb_opt *hopt;
+ u64 rate64, ceil64;
/* extract all subattrs from opt attr */
if (!opt)
@@ -1318,12 +1358,16 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
hopt = nla_data(tb[TCA_HTB_PARMS]);
-
- rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
- ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
- if (!rtab || !ctab)
+ if (!hopt->rate.rate || !hopt->ceil.rate)
goto failure;
+ /* Keeping backward compatible with rate_table based iproute2 tc */
+ if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
+ qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]));
+
+ if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
+ qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]));
+
if (!cl) { /* new class */
struct Qdisc *new_q;
int prio;
@@ -1349,19 +1393,22 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
/* check maximal depth */
if (parent && parent->parent && parent->parent->level < 2) {
- printk(KERN_ERR "htb: tree is too deep\n");
+ pr_err("htb: tree is too deep\n");
goto failure;
}
err = -ENOBUFS;
- if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
+ cl = kzalloc(sizeof(*cl), GFP_KERNEL);
+ if (!cl)
goto failure;
- err = gen_new_estimator(&cl->bstats, &cl->rate_est,
- qdisc_root_sleeping_lock(sch),
- tca[TCA_RATE] ? : &est.nla);
- if (err) {
- kfree(cl);
- goto failure;
+ if (htb_rate_est || tca[TCA_RATE]) {
+ err = gen_new_estimator(&cl->bstats, &cl->rate_est,
+ qdisc_root_sleeping_lock(sch),
+ tca[TCA_RATE] ? : &est.nla);
+ if (err) {
+ kfree(cl);
+ goto failure;
+ }
}
cl->refcnt = 1;
@@ -1373,8 +1420,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
RB_CLEAR_NODE(&cl->node[prio]);
/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
- so that can't be used inside of sch_tree_lock
- -- thanks to Karlis Peisenieks */
+ * so that can't be used inside of sch_tree_lock
+ * -- thanks to Karlis Peisenieks
+ */
new_q = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops, classid);
sch_tree_lock(sch);
@@ -1390,7 +1438,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
/* remove from evt list because of level change */
if (parent->cmode != HTB_CAN_SEND) {
- htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
+ htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
parent->cmode = HTB_CAN_SEND;
}
parent->level = (parent->parent ? parent->parent->level
@@ -1404,10 +1452,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
cl->parent = parent;
/* set class to be in HTB_CAN_SEND state */
- cl->tokens = hopt->buffer;
- cl->ctokens = hopt->cbuffer;
- cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC; /* 1min */
- cl->t_c = psched_get_time();
+ cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
+ cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
+ cl->mbuffer = 60ULL * NSEC_PER_SEC; /* 1min */
+ cl->t_c = ktime_to_ns(ktime_get());
cl->cmode = HTB_CAN_SEND;
/* attach to the hash list and parent's family */
@@ -1425,20 +1473,30 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
sch_tree_lock(sch);
}
+ rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
+
+ ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
+
+ psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
+ psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
+
/* it used to be a nasty bug here, we have to check that node
- is really leaf before changing cl->un.leaf ! */
+ * is really leaf before changing cl->un.leaf !
+ */
if (!cl->level) {
- cl->quantum = rtab->rate.rate / q->rate2quantum;
+ u64 quantum = cl->rate.rate_bytes_ps;
+
+ do_div(quantum, q->rate2quantum);
+ cl->quantum = min_t(u64, quantum, INT_MAX);
+
if (!hopt->quantum && cl->quantum < 1000) {
- printk(KERN_WARNING
- "HTB: quantum of class %X is small. Consider r2q change.\n",
- cl->common.classid);
+ pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n",
+ cl->common.classid);
cl->quantum = 1000;
}
if (!hopt->quantum && cl->quantum > 200000) {
- printk(KERN_WARNING
- "HTB: quantum of class %X is big. Consider r2q change.\n",
- cl->common.classid);
+ pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n",
+ cl->common.classid);
cl->quantum = 200000;
}
if (hopt->quantum)
@@ -1447,14 +1505,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
cl->prio = TC_HTB_NUMPRIO - 1;
}
- cl->buffer = hopt->buffer;
- cl->cbuffer = hopt->cbuffer;
- if (cl->rate)
- qdisc_put_rtab(cl->rate);
- cl->rate = rtab;
- if (cl->ceil)
- qdisc_put_rtab(cl->ceil);
- cl->ceil = ctab;
+ cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
+ cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
+
sch_tree_unlock(sch);
qdisc_class_hash_grow(sch, &q->clhash);
@@ -1463,10 +1516,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
return 0;
failure:
- if (rtab)
- qdisc_put_rtab(rtab);
- if (ctab)
- qdisc_put_rtab(ctab);
return err;
}
@@ -1485,13 +1534,13 @@ static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
struct htb_class *cl = htb_find(classid, sch);
/*if (cl && !cl->level) return 0;
- The line above used to be there to prevent attaching filters to
- leaves. But at least tc_index filter uses this just to get class
- for other reasons so that we have to allow for it.
- ----
- 19.6.2002 As Werner explained it is ok - bind filter is just
- another way to "lock" the class - unlike "get" this lock can
- be broken by class during destroy IIUC.
+ * The line above used to be there to prevent attaching filters to
+ * leaves. But at least tc_index filter uses this just to get class
+ * for other reasons so that we have to allow for it.
+ * ----
+ * 19.6.2002 As Werner explained it is ok - bind filter is just
+ * another way to "lock" the class - unlike "get" this lock can
+ * be broken by class during destroy IIUC.
*/
if (cl)
cl->filter_cnt++;
@@ -1510,14 +1559,13 @@ static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct htb_sched *q = qdisc_priv(sch);
struct htb_class *cl;
- struct hlist_node *n;
unsigned int i;
if (arg->stop)
return;
for (i = 0; i < q->clhash.hashsize; i++) {
- hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
if (arg->count < arg->skip) {
arg->count++;
continue;
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index f10e34a6844..62871c14e1f 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -63,8 +63,7 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
result = tc_classify(skb, p->filter_list, &res);
- sch->bstats.packets++;
- sch->bstats.bytes += qdisc_pkt_len(skb);
+ qdisc_bstats_update(sch, skb);
switch (result) {
case TC_ACT_SHOT:
result = TC_ACT_SHOT;
@@ -101,8 +100,7 @@ static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
- nla_nest_end(skb, nest);
- return skb->len;
+ return nla_nest_end(skb, nest);
nla_put_failure:
nla_nest_cancel(skb, nest);
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index ecc302f4d2a..a8b2864a696 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -11,6 +11,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
+#include <linux/export.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
@@ -56,13 +57,13 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
dev_queue = netdev_get_tx_queue(dev, ntx);
- qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
+ qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
TC_H_MAKE(TC_H_MAJ(sch->handle),
TC_H_MIN(ntx + 1)));
if (qdisc == NULL)
goto err;
- qdisc->flags |= TCQ_F_CAN_BYPASS;
priv->qdiscs[ntx] = qdisc;
+ qdisc->flags |= TCQ_F_ONETXQUEUE;
}
sch->flags |= TCQ_F_MQROOT;
@@ -77,14 +78,19 @@ static void mq_attach(struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct mq_sched *priv = qdisc_priv(sch);
- struct Qdisc *qdisc;
+ struct Qdisc *qdisc, *old;
unsigned int ntx;
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
qdisc = priv->qdiscs[ntx];
- qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc);
- if (qdisc)
- qdisc_destroy(qdisc);
+ old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
+ if (old)
+ qdisc_destroy(old);
+#ifdef CONFIG_NET_SCHED
+ if (ntx < dev->real_num_tx_queues)
+ qdisc_list_add(qdisc);
+#endif
+
}
kfree(priv->qdiscs);
priv->qdiscs = NULL;
@@ -150,7 +156,8 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
dev_deactivate(dev);
*old = dev_graft_qdisc(dev_queue, new);
-
+ if (new)
+ new->flags |= TCQ_F_ONETXQUEUE;
if (dev->flags & IFF_UP)
dev_activate(dev);
return 0;
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
new file mode 100644
index 00000000000..6749e2f540d
--- /dev/null
+++ b/net/sched/sch_mqprio.c
@@ -0,0 +1,426 @@
+/*
+ * net/sched/sch_mqprio.c
+ *
+ * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+
+struct mqprio_sched {
+ struct Qdisc **qdiscs;
+ int hw_owned;
+};
+
+static void mqprio_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ unsigned int ntx;
+
+ if (priv->qdiscs) {
+ for (ntx = 0;
+ ntx < dev->num_tx_queues && priv->qdiscs[ntx];
+ ntx++)
+ qdisc_destroy(priv->qdiscs[ntx]);
+ kfree(priv->qdiscs);
+ }
+
+ if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
+ dev->netdev_ops->ndo_setup_tc(dev, 0);
+ else
+ netdev_set_num_tc(dev, 0);
+}
+
+static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
+{
+ int i, j;
+
+ /* Verify num_tc is not out of max range */
+ if (qopt->num_tc > TC_MAX_QUEUE)
+ return -EINVAL;
+
+ /* Verify priority mapping uses valid tcs */
+ for (i = 0; i < TC_BITMASK + 1; i++) {
+ if (qopt->prio_tc_map[i] >= qopt->num_tc)
+ return -EINVAL;
+ }
+
+ /* net_device does not support requested operation */
+ if (qopt->hw && !dev->netdev_ops->ndo_setup_tc)
+ return -EINVAL;
+
+ /* if hw owned qcount and qoffset are taken from LLD so
+ * no reason to verify them here
+ */
+ if (qopt->hw)
+ return 0;
+
+ for (i = 0; i < qopt->num_tc; i++) {
+ unsigned int last = qopt->offset[i] + qopt->count[i];
+
+ /* Verify the queue count is in tx range being equal to the
+ * real_num_tx_queues indicates the last queue is in use.
+ */
+ if (qopt->offset[i] >= dev->real_num_tx_queues ||
+ !qopt->count[i] ||
+ last > dev->real_num_tx_queues)
+ return -EINVAL;
+
+ /* Verify that the offset and counts do not overlap */
+ for (j = i + 1; j < qopt->num_tc; j++) {
+ if (last > qopt->offset[j])
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct netdev_queue *dev_queue;
+ struct Qdisc *qdisc;
+ int i, err = -EOPNOTSUPP;
+ struct tc_mqprio_qopt *qopt = NULL;
+
+ BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
+ BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
+
+ if (sch->parent != TC_H_ROOT)
+ return -EOPNOTSUPP;
+
+ if (!netif_is_multiqueue(dev))
+ return -EOPNOTSUPP;
+
+ if (!opt || nla_len(opt) < sizeof(*qopt))
+ return -EINVAL;
+
+ qopt = nla_data(opt);
+ if (mqprio_parse_opt(dev, qopt))
+ return -EINVAL;
+
+ /* pre-allocate qdisc, attachment can't fail */
+ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
+ GFP_KERNEL);
+ if (priv->qdiscs == NULL) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ dev_queue = netdev_get_tx_queue(dev, i);
+ qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
+ TC_H_MAKE(TC_H_MAJ(sch->handle),
+ TC_H_MIN(i + 1)));
+ if (qdisc == NULL) {
+ err = -ENOMEM;
+ goto err;
+ }
+ priv->qdiscs[i] = qdisc;
+ qdisc->flags |= TCQ_F_ONETXQUEUE;
+ }
+
+ /* If the mqprio options indicate that hardware should own
+ * the queue mapping then run ndo_setup_tc otherwise use the
+ * supplied and verified mapping
+ */
+ if (qopt->hw) {
+ priv->hw_owned = 1;
+ err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc);
+ if (err)
+ goto err;
+ } else {
+ netdev_set_num_tc(dev, qopt->num_tc);
+ for (i = 0; i < qopt->num_tc; i++)
+ netdev_set_tc_queue(dev, i,
+ qopt->count[i], qopt->offset[i]);
+ }
+
+ /* Always use supplied priority mappings */
+ for (i = 0; i < TC_BITMASK + 1; i++)
+ netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]);
+
+ sch->flags |= TCQ_F_MQROOT;
+ return 0;
+
+err:
+ mqprio_destroy(sch);
+ return err;
+}
+
+static void mqprio_attach(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct Qdisc *qdisc, *old;
+ unsigned int ntx;
+
+ /* Attach underlying qdisc */
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+ qdisc = priv->qdiscs[ntx];
+ old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
+ if (old)
+ qdisc_destroy(old);
+ if (ntx < dev->real_num_tx_queues)
+ qdisc_list_add(qdisc);
+ }
+ kfree(priv->qdiscs);
+ priv->qdiscs = NULL;
+}
+
+static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned long ntx = cl - 1 - netdev_get_num_tc(dev);
+
+ if (ntx >= dev->num_tx_queues)
+ return NULL;
+ return netdev_get_tx_queue(dev, ntx);
+}
+
+static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+ if (!dev_queue)
+ return -EINVAL;
+
+ if (dev->flags & IFF_UP)
+ dev_deactivate(dev);
+
+ *old = dev_graft_qdisc(dev_queue, new);
+
+ if (new)
+ new->flags |= TCQ_F_ONETXQUEUE;
+
+ if (dev->flags & IFF_UP)
+ dev_activate(dev);
+
+ return 0;
+}
+
+static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_mqprio_qopt opt = { 0 };
+ struct Qdisc *qdisc;
+ unsigned int i;
+
+ sch->q.qlen = 0;
+ memset(&sch->bstats, 0, sizeof(sch->bstats));
+ memset(&sch->qstats, 0, sizeof(sch->qstats));
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+ spin_lock_bh(qdisc_lock(qdisc));
+ sch->q.qlen += qdisc->q.qlen;
+ sch->bstats.bytes += qdisc->bstats.bytes;
+ sch->bstats.packets += qdisc->bstats.packets;
+ sch->qstats.qlen += qdisc->qstats.qlen;
+ sch->qstats.backlog += qdisc->qstats.backlog;
+ sch->qstats.drops += qdisc->qstats.drops;
+ sch->qstats.requeues += qdisc->qstats.requeues;
+ sch->qstats.overlimits += qdisc->qstats.overlimits;
+ spin_unlock_bh(qdisc_lock(qdisc));
+ }
+
+ opt.num_tc = netdev_get_num_tc(dev);
+ memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
+ opt.hw = priv->hw_owned;
+
+ for (i = 0; i < netdev_get_num_tc(dev); i++) {
+ opt.count[i] = dev->tc_to_txq[i].count;
+ opt.offset[i] = dev->tc_to_txq[i].offset;
+ }
+
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ return skb->len;
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl)
+{
+ struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+ if (!dev_queue)
+ return NULL;
+
+ return dev_queue->qdisc_sleeping;
+}
+
+static unsigned long mqprio_get(struct Qdisc *sch, u32 classid)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned int ntx = TC_H_MIN(classid);
+
+ if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev))
+ return 0;
+ return ntx;
+}
+
+static void mqprio_put(struct Qdisc *sch, unsigned long cl)
+{
+}
+
+static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct net_device *dev = qdisc_dev(sch);
+
+ if (cl <= netdev_get_num_tc(dev)) {
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_info = 0;
+ } else {
+ int i;
+ struct netdev_queue *dev_queue;
+
+ dev_queue = mqprio_queue_get(sch, cl);
+ tcm->tcm_parent = 0;
+ for (i = 0; i < netdev_get_num_tc(dev); i++) {
+ struct netdev_tc_txq tc = dev->tc_to_txq[i];
+ int q_idx = cl - netdev_get_num_tc(dev);
+
+ if (q_idx > tc.offset &&
+ q_idx <= tc.offset + tc.count) {
+ tcm->tcm_parent =
+ TC_H_MAKE(TC_H_MAJ(sch->handle),
+ TC_H_MIN(i + 1));
+ break;
+ }
+ }
+ tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+ }
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+ __releases(d->lock)
+ __acquires(d->lock)
+{
+ struct net_device *dev = qdisc_dev(sch);
+
+ if (cl <= netdev_get_num_tc(dev)) {
+ int i;
+ struct Qdisc *qdisc;
+ struct gnet_stats_queue qstats = {0};
+ struct gnet_stats_basic_packed bstats = {0};
+ struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1];
+
+ /* Drop lock here it will be reclaimed before touching
+ * statistics this is required because the d->lock we
+ * hold here is the look on dev_queue->qdisc_sleeping
+ * also acquired below.
+ */
+ spin_unlock_bh(d->lock);
+
+ for (i = tc.offset; i < tc.offset + tc.count; i++) {
+ qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+ spin_lock_bh(qdisc_lock(qdisc));
+ bstats.bytes += qdisc->bstats.bytes;
+ bstats.packets += qdisc->bstats.packets;
+ qstats.qlen += qdisc->qstats.qlen;
+ qstats.backlog += qdisc->qstats.backlog;
+ qstats.drops += qdisc->qstats.drops;
+ qstats.requeues += qdisc->qstats.requeues;
+ qstats.overlimits += qdisc->qstats.overlimits;
+ spin_unlock_bh(qdisc_lock(qdisc));
+ }
+ /* Reclaim root sleeping lock before completing stats */
+ spin_lock_bh(d->lock);
+ if (gnet_stats_copy_basic(d, &bstats) < 0 ||
+ gnet_stats_copy_queue(d, &qstats) < 0)
+ return -1;
+ } else {
+ struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+ sch = dev_queue->qdisc_sleeping;
+ sch->qstats.qlen = sch->q.qlen;
+ if (gnet_stats_copy_basic(d, &sch->bstats) < 0 ||
+ gnet_stats_copy_queue(d, &sch->qstats) < 0)
+ return -1;
+ }
+ return 0;
+}
+
+static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned long ntx;
+
+ if (arg->stop)
+ return;
+
+ /* Walk hierarchy with a virtual class per tc */
+ arg->count = arg->skip;
+ for (ntx = arg->skip;
+ ntx < dev->num_tx_queues + netdev_get_num_tc(dev);
+ ntx++) {
+ if (arg->fn(sch, ntx + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops mqprio_class_ops = {
+ .graft = mqprio_graft,
+ .leaf = mqprio_leaf,
+ .get = mqprio_get,
+ .put = mqprio_put,
+ .walk = mqprio_walk,
+ .dump = mqprio_dump_class,
+ .dump_stats = mqprio_dump_class_stats,
+};
+
+static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
+ .cl_ops = &mqprio_class_ops,
+ .id = "mqprio",
+ .priv_size = sizeof(struct mqprio_sched),
+ .init = mqprio_init,
+ .destroy = mqprio_destroy,
+ .attach = mqprio_attach,
+ .dump = mqprio_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init mqprio_module_init(void)
+{
+ return register_qdisc(&mqprio_qdisc_ops);
+}
+
+static void __exit mqprio_module_exit(void)
+{
+ unregister_qdisc(&mqprio_qdisc_ops);
+}
+
+module_init(mqprio_module_init);
+module_exit(mqprio_module_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 32690deab5d..afb050a735f 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -11,8 +11,7 @@
* more details.
*
* You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * this program; if not, see <http://www.gnu.org/licenses/>.
*
* Author: Alexander Duyck <alexander.h.duyck@intel.com>
*/
@@ -83,8 +82,6 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, qdisc);
if (ret == NET_XMIT_SUCCESS) {
- sch->bstats.bytes += qdisc_pkt_len(skb);
- sch->bstats.packets++;
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -109,10 +106,12 @@ static struct sk_buff *multiq_dequeue(struct Qdisc *sch)
/* Check that target subqueue is available before
* pulling an skb to avoid head-of-line blocking.
*/
- if (!__netif_subqueue_stopped(qdisc_dev(sch), q->curband)) {
+ if (!netif_xmit_stopped(
+ netdev_get_tx_queue(qdisc_dev(sch), q->curband))) {
qdisc = q->queues[q->curband];
skb = qdisc->dequeue(qdisc);
if (skb) {
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
}
@@ -139,7 +138,8 @@ static struct sk_buff *multiq_peek(struct Qdisc *sch)
/* Check that target subqueue is available before
* pulling an skb to avoid head-of-line blocking.
*/
- if (!__netif_subqueue_stopped(qdisc_dev(sch), curband)) {
+ if (!netif_xmit_stopped(
+ netdev_get_tx_queue(qdisc_dev(sch), curband))) {
qdisc = q->queues[curband];
skb = qdisc->ops->peek(qdisc);
if (skb)
@@ -157,7 +157,7 @@ static unsigned int multiq_drop(struct Qdisc *sch)
unsigned int len;
struct Qdisc *qdisc;
- for (band = q->bands-1; band >= 0; band--) {
+ for (band = q->bands - 1; band >= 0; band--) {
qdisc = q->queues[band];
if (qdisc->ops->drop) {
len = qdisc->ops->drop(qdisc);
@@ -266,7 +266,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt)
for (i = 0; i < q->max_bands; i++)
q->queues[i] = &noop_qdisc;
- err = multiq_tune(sch,opt);
+ err = multiq_tune(sch, opt);
if (err)
kfree(q->queues);
@@ -283,7 +283,8 @@ static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
opt.bands = q->bands;
opt.max_bands = q->max_bands;
- NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
return skb->len;
@@ -347,7 +348,7 @@ static int multiq_dump_class(struct Qdisc *sch, unsigned long cl,
struct multiq_sched_data *q = qdisc_priv(sch);
tcm->tcm_handle |= TC_H_MIN(cl);
- tcm->tcm_info = q->queues[cl-1]->handle;
+ tcm->tcm_info = q->queues[cl - 1]->handle;
return 0;
}
@@ -379,7 +380,7 @@ static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
arg->count++;
continue;
}
- if (arg->fn(sch, band+1, arg) < 0) {
+ if (arg->fn(sch, band + 1, arg) < 0) {
arg->stop = 1;
break;
}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index e5593c083a7..111d70fddae 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -13,18 +13,23 @@
* Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
*/
+#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
+#include <linux/reciprocal_div.h>
+#include <linux/rbtree.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
-#define VERSION "1.2"
+#define VERSION "1.3"
/* Network Emulation Queuing algorithm.
====================================
@@ -47,22 +52,47 @@
layering other disciplines. It does not need to do bandwidth
control either since that can be handled by using token
bucket or other rate control.
+
+ Correlated Loss Generator models
+
+ Added generation of correlated loss according to the
+ "Gilbert-Elliot" model, a 4-state markov model.
+
+ References:
+ [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
+ [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
+ and intuitive loss model for packet networks and its implementation
+ in the Netem module in the Linux kernel", available in [1]
+
+ Authors: Stefano Salsano <stefano.salsano at uniroma2.it
+ Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
struct netem_sched_data {
+ /* internal t(ime)fifo qdisc uses t_root and sch->limit */
+ struct rb_root t_root;
+
+ /* optional qdisc for classful handling (NULL at netem init) */
struct Qdisc *qdisc;
+
struct qdisc_watchdog watchdog;
psched_tdiff_t latency;
psched_tdiff_t jitter;
u32 loss;
+ u32 ecn;
u32 limit;
u32 counter;
u32 gap;
u32 duplicate;
u32 reorder;
u32 corrupt;
+ u64 rate;
+ s32 packet_overhead;
+ u32 cell_size;
+ struct reciprocal_value cell_size_reciprocal;
+ s32 cell_overhead;
struct crndstate {
u32 last;
@@ -73,17 +103,75 @@ struct netem_sched_data {
u32 size;
s16 table[0];
} *delay_dist;
+
+ enum {
+ CLG_RANDOM,
+ CLG_4_STATES,
+ CLG_GILB_ELL,
+ } loss_model;
+
+ enum {
+ TX_IN_GAP_PERIOD = 1,
+ TX_IN_BURST_PERIOD,
+ LOST_IN_GAP_PERIOD,
+ LOST_IN_BURST_PERIOD,
+ } _4_state_model;
+
+ enum {
+ GOOD_STATE = 1,
+ BAD_STATE,
+ } GE_state_model;
+
+ /* Correlated Loss Generation models */
+ struct clgstate {
+ /* state of the Markov chain */
+ u8 state;
+
+ /* 4-states and Gilbert-Elliot models */
+ u32 a1; /* p13 for 4-states or p for GE */
+ u32 a2; /* p31 for 4-states or r for GE */
+ u32 a3; /* p32 for 4-states or h for GE */
+ u32 a4; /* p14 for 4-states or 1-k for GE */
+ u32 a5; /* p23 used only in 4-states */
+ } clg;
+
};
-/* Time stamp put into socket buffer control block */
+/* Time stamp put into socket buffer control block
+ * Only valid when skbs are in our internal t(ime)fifo queue.
+ */
struct netem_skb_cb {
psched_time_t time_to_send;
+ ktime_t tstamp_save;
};
+/* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp
+ * to hold a rb_node structure.
+ *
+ * If struct sk_buff layout is changed, the following checks will complain.
+ */
+static struct rb_node *netem_rb_node(struct sk_buff *skb)
+{
+ BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0);
+ BUILD_BUG_ON(offsetof(struct sk_buff, prev) !=
+ offsetof(struct sk_buff, next) + sizeof(skb->next));
+ BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) !=
+ offsetof(struct sk_buff, prev) + sizeof(skb->prev));
+ BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) +
+ sizeof(skb->prev) +
+ sizeof(skb->tstamp));
+ return (struct rb_node *)&skb->next;
+}
+
+static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
+{
+ return (struct sk_buff *)rb;
+}
+
static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
- BUILD_BUG_ON(sizeof(skb->cb) <
- sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb));
+ /* we assume we can use skb next/prev/tstamp as storage for rb_node */
+ qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}
@@ -93,7 +181,7 @@ static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
static void init_crandom(struct crndstate *state, unsigned long rho)
{
state->rho = rho;
- state->last = net_random();
+ state->last = prandom_u32();
}
/* get_crandom - correlated random number generator
@@ -106,15 +194,133 @@ static u32 get_crandom(struct crndstate *state)
unsigned long answer;
if (state->rho == 0) /* no correlation */
- return net_random();
+ return prandom_u32();
- value = net_random();
+ value = prandom_u32();
rho = (u64)state->rho + 1;
answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
state->last = answer;
return answer;
}
+/* loss_4state - 4-state model loss generator
+ * Generates losses according to the 4-state Markov chain adopted in
+ * the GI (General and Intuitive) loss model.
+ */
+static bool loss_4state(struct netem_sched_data *q)
+{
+ struct clgstate *clg = &q->clg;
+ u32 rnd = prandom_u32();
+
+ /*
+ * Makes a comparison between rnd and the transition
+ * probabilities outgoing from the current state, then decides the
+ * next state and if the next packet has to be transmitted or lost.
+ * The four states correspond to:
+ * TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
+ * LOST_IN_BURST_PERIOD => isolated losses within a gap period
+ * LOST_IN_GAP_PERIOD => lost packets within a burst period
+ * TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period
+ */
+ switch (clg->state) {
+ case TX_IN_GAP_PERIOD:
+ if (rnd < clg->a4) {
+ clg->state = LOST_IN_BURST_PERIOD;
+ return true;
+ } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
+ clg->state = LOST_IN_GAP_PERIOD;
+ return true;
+ } else if (clg->a1 + clg->a4 < rnd) {
+ clg->state = TX_IN_GAP_PERIOD;
+ }
+
+ break;
+ case TX_IN_BURST_PERIOD:
+ if (rnd < clg->a5) {
+ clg->state = LOST_IN_GAP_PERIOD;
+ return true;
+ } else {
+ clg->state = TX_IN_BURST_PERIOD;
+ }
+
+ break;
+ case LOST_IN_GAP_PERIOD:
+ if (rnd < clg->a3)
+ clg->state = TX_IN_BURST_PERIOD;
+ else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
+ clg->state = TX_IN_GAP_PERIOD;
+ } else if (clg->a2 + clg->a3 < rnd) {
+ clg->state = LOST_IN_GAP_PERIOD;
+ return true;
+ }
+ break;
+ case LOST_IN_BURST_PERIOD:
+ clg->state = TX_IN_GAP_PERIOD;
+ break;
+ }
+
+ return false;
+}
+
+/* loss_gilb_ell - Gilbert-Elliot model loss generator
+ * Generates losses according to the Gilbert-Elliot loss model or
+ * its special cases (Gilbert or Simple Gilbert)
+ *
+ * Makes a comparison between random number and the transition
+ * probabilities outgoing from the current state, then decides the
+ * next state. A second random number is extracted and the comparison
+ * with the loss probability of the current state decides if the next
+ * packet will be transmitted or lost.
+ */
+static bool loss_gilb_ell(struct netem_sched_data *q)
+{
+ struct clgstate *clg = &q->clg;
+
+ switch (clg->state) {
+ case GOOD_STATE:
+ if (prandom_u32() < clg->a1)
+ clg->state = BAD_STATE;
+ if (prandom_u32() < clg->a4)
+ return true;
+ break;
+ case BAD_STATE:
+ if (prandom_u32() < clg->a2)
+ clg->state = GOOD_STATE;
+ if (prandom_u32() > clg->a3)
+ return true;
+ }
+
+ return false;
+}
+
+static bool loss_event(struct netem_sched_data *q)
+{
+ switch (q->loss_model) {
+ case CLG_RANDOM:
+ /* Random packet drop 0 => none, ~0 => all */
+ return q->loss && q->loss >= get_crandom(&q->loss_cor);
+
+ case CLG_4_STATES:
+ /* 4state loss model algorithm (used also for GI model)
+ * Extracts a value from the markov 4 state loss generator,
+ * if it is 1 drops a packet and if needed writes the event in
+ * the kernel logs
+ */
+ return loss_4state(q);
+
+ case CLG_GILB_ELL:
+ /* Gilbert-Elliot loss model algorithm
+ * Extracts a value from the Gilbert-Elliot loss generator,
+ * if it is 1 drops a packet and if needed writes the event in
+ * the kernel logs
+ */
+ return loss_gilb_ell(q);
+ }
+
+ return false; /* not reached */
+}
+
+
/* tabledist - return a pseudo-randomly distributed value with mean mu and
* std deviation sigma. Uses table lookup to approximate the desired
* distribution, and a uniformly-distributed pseudo-random source.
@@ -146,6 +352,62 @@ static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
+static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
+{
+ u64 ticks;
+
+ len += q->packet_overhead;
+
+ if (q->cell_size) {
+ u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
+
+ if (len > cells * q->cell_size) /* extra cell needed for remainder */
+ cells++;
+ len = cells * (q->cell_size + q->cell_overhead);
+ }
+
+ ticks = (u64)len * NSEC_PER_SEC;
+
+ do_div(ticks, q->rate);
+ return PSCHED_NS2TICKS(ticks);
+}
+
+static void tfifo_reset(struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ struct rb_node *p;
+
+ while ((p = rb_first(&q->t_root))) {
+ struct sk_buff *skb = netem_rb_to_skb(p);
+
+ rb_erase(p, &q->t_root);
+ skb->next = NULL;
+ skb->prev = NULL;
+ kfree_skb(skb);
+ }
+}
+
+static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
+ struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
+
+ while (*p) {
+ struct sk_buff *skb;
+
+ parent = *p;
+ skb = netem_rb_to_skb(parent);
+ if (tnext >= netem_skb_cb(skb)->time_to_send)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(netem_rb_node(nskb), parent, p);
+ rb_insert_color(netem_rb_node(nskb), &q->t_root);
+ sch->q.qlen++;
+}
+
/*
* Insert one skb into qdisc.
* Note: parent depends on return value to account for queue length.
@@ -158,26 +420,30 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
/* We don't fill cb now as skb_unshare() may invalidate it */
struct netem_skb_cb *cb;
struct sk_buff *skb2;
- int ret;
int count = 1;
- pr_debug("netem_enqueue skb=%p\n", skb);
-
/* Random duplication */
if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
++count;
- /* Random packet drop 0 => none, ~0 => all */
- if (q->loss && q->loss >= get_crandom(&q->loss_cor))
- --count;
-
+ /* Drop packet? */
+ if (loss_event(q)) {
+ if (q->ecn && INET_ECN_set_ce(skb))
+ sch->qstats.drops++; /* mark packet */
+ else
+ --count;
+ }
if (count == 0) {
sch->qstats.drops++;
kfree_skb(skb);
return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
}
- skb_orphan(skb);
+ /* If a delay is expected, orphan the skb. (orphaning usually takes
+ * place at TX completion time, so _before_ the link transit delay)
+ */
+ if (q->latency || q->jitter)
+ skb_orphan_partial(skb);
/*
* If we need to duplicate packet, then re-insert at top of the
@@ -202,17 +468,21 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
(skb->ip_summed == CHECKSUM_PARTIAL &&
- skb_checksum_help(skb))) {
- sch->qstats.drops++;
- return NET_XMIT_DROP;
- }
+ skb_checksum_help(skb)))
+ return qdisc_drop(skb, sch);
- skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
+ skb->data[prandom_u32() % skb_headlen(skb)] ^=
+ 1<<(prandom_u32() % 8);
}
+ if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
+ return qdisc_reshape_fail(skb, sch);
+
+ sch->qstats.backlog += qdisc_pkt_len(skb);
+
cb = netem_skb_cb(skb);
- if (q->gap == 0 || /* not doing reordering */
- q->counter < q->gap || /* inside last reordering gap */
+ if (q->gap == 0 || /* not doing reordering */
+ q->counter < q->gap - 1 || /* inside last reordering gap */
q->reorder < get_crandom(&q->reorder_cor)) {
psched_time_t now;
psched_tdiff_t delay;
@@ -221,9 +491,32 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
&q->delay_cor, q->delay_dist);
now = psched_get_time();
+
+ if (q->rate) {
+ struct sk_buff *last;
+
+ if (!skb_queue_empty(&sch->q))
+ last = skb_peek_tail(&sch->q);
+ else
+ last = netem_rb_to_skb(rb_last(&q->t_root));
+ if (last) {
+ /*
+ * Last packet in queue is reference point (now),
+ * calculate this time bonus and subtract
+ * from delay.
+ */
+ delay -= netem_skb_cb(last)->time_to_send - now;
+ delay = max_t(psched_tdiff_t, 0, delay);
+ now = netem_skb_cb(last)->time_to_send;
+ }
+
+ delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q);
+ }
+
cb->time_to_send = now + delay;
+ cb->tstamp_save = skb->tstamp;
++q->counter;
- ret = qdisc_enqueue(skb, q->qdisc);
+ tfifo_enqueue(skb, sch);
} else {
/*
* Do re-ordering by putting one out of N packets at the front
@@ -232,33 +525,40 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
cb->time_to_send = psched_get_time();
q->counter = 0;
- __skb_queue_head(&q->qdisc->q, skb);
- q->qdisc->qstats.backlog += qdisc_pkt_len(skb);
- q->qdisc->qstats.requeues++;
- ret = NET_XMIT_SUCCESS;
- }
-
- if (likely(ret == NET_XMIT_SUCCESS)) {
- sch->q.qlen++;
- sch->bstats.bytes += qdisc_pkt_len(skb);
- sch->bstats.packets++;
- } else if (net_xmit_drop_count(ret)) {
- sch->qstats.drops++;
+ __skb_queue_head(&sch->q, skb);
+ sch->qstats.requeues++;
}
- pr_debug("netem: enqueue ret %d\n", ret);
- return ret;
+ return NET_XMIT_SUCCESS;
}
-static unsigned int netem_drop(struct Qdisc* sch)
+static unsigned int netem_drop(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
- unsigned int len = 0;
+ unsigned int len;
- if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
- sch->q.qlen--;
- sch->qstats.drops++;
+ len = qdisc_queue_drop(sch);
+
+ if (!len) {
+ struct rb_node *p = rb_first(&q->t_root);
+
+ if (p) {
+ struct sk_buff *skb = netem_rb_to_skb(p);
+
+ rb_erase(p, &q->t_root);
+ sch->q.qlen--;
+ skb->next = NULL;
+ skb->prev = NULL;
+ len = qdisc_pkt_len(skb);
+ sch->qstats.backlog -= len;
+ kfree_skb(skb);
+ }
}
+ if (!len && q->qdisc && q->qdisc->ops->drop)
+ len = q->qdisc->ops->drop(q->qdisc);
+ if (len)
+ sch->qstats.drops++;
+
return len;
}
@@ -266,20 +566,35 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
+ struct rb_node *p;
- if (sch->flags & TCQ_F_THROTTLED)
+ if (qdisc_is_throttled(sch))
return NULL;
- skb = q->qdisc->ops->peek(q->qdisc);
+tfifo_dequeue:
+ skb = __skb_dequeue(&sch->q);
if (skb) {
- const struct netem_skb_cb *cb = netem_skb_cb(skb);
- psched_time_t now = psched_get_time();
+deliver:
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ qdisc_unthrottled(sch);
+ qdisc_bstats_update(sch, skb);
+ return skb;
+ }
+ p = rb_first(&q->t_root);
+ if (p) {
+ psched_time_t time_to_send;
+
+ skb = netem_rb_to_skb(p);
/* if more time remaining? */
- if (cb->time_to_send <= now) {
- skb = qdisc_dequeue_peeked(q->qdisc);
- if (unlikely(!skb))
- return NULL;
+ time_to_send = netem_skb_cb(skb)->time_to_send;
+ if (time_to_send <= psched_get_time()) {
+ rb_erase(p, &q->t_root);
+
+ sch->q.qlen--;
+ skb->next = NULL;
+ skb->prev = NULL;
+ skb->tstamp = netem_skb_cb(skb)->tstamp_save;
#ifdef CONFIG_NET_CLS_ACT
/*
@@ -289,14 +604,34 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
skb->tstamp.tv64 = 0;
#endif
- pr_debug("netem_dequeue: return skb=%p\n", skb);
- sch->q.qlen--;
- return skb;
+
+ if (q->qdisc) {
+ int err = qdisc_enqueue(skb, q->qdisc);
+
+ if (unlikely(err != NET_XMIT_SUCCESS)) {
+ if (net_xmit_drop_count(err)) {
+ sch->qstats.drops++;
+ qdisc_tree_decrease_qlen(sch, 1);
+ }
+ }
+ goto tfifo_dequeue;
+ }
+ goto deliver;
}
- qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
+ if (q->qdisc) {
+ skb = q->qdisc->ops->dequeue(q->qdisc);
+ if (skb)
+ goto deliver;
+ }
+ qdisc_watchdog_schedule(&q->watchdog, time_to_send);
}
+ if (q->qdisc) {
+ skb = q->qdisc->ops->dequeue(q->qdisc);
+ if (skb)
+ goto deliver;
+ }
return NULL;
}
@@ -304,11 +639,18 @@ static void netem_reset(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
- qdisc_reset(q->qdisc);
- sch->q.qlen = 0;
+ qdisc_reset_queue(sch);
+ tfifo_reset(sch);
+ if (q->qdisc)
+ qdisc_reset(q->qdisc);
qdisc_watchdog_cancel(&q->watchdog);
}
+static void dist_free(struct disttable *d)
+{
+ kvfree(d);
+}
+
/*
* Distribution data is a variable size payload containing
* signed 16 bit values.
@@ -316,16 +658,20 @@ static void netem_reset(struct Qdisc *sch)
static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
{
struct netem_sched_data *q = qdisc_priv(sch);
- unsigned long n = nla_len(attr)/sizeof(__s16);
+ size_t n = nla_len(attr)/sizeof(__s16);
const __s16 *data = nla_data(attr);
spinlock_t *root_lock;
struct disttable *d;
int i;
+ size_t s;
- if (n > 65536)
+ if (n > NETEM_DIST_MAX)
return -EINVAL;
- d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
+ s = sizeof(struct disttable) + n * sizeof(s16);
+ d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
+ if (!d)
+ d = vmalloc(s);
if (!d)
return -ENOMEM;
@@ -336,15 +682,15 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
root_lock = qdisc_root_sleeping_lock(sch);
spin_lock_bh(root_lock);
- kfree(q->delay_dist);
- q->delay_dist = d;
+ swap(q->delay_dist, d);
spin_unlock_bh(root_lock);
+
+ dist_free(d);
return 0;
}
-static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
+static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
{
- struct netem_sched_data *q = qdisc_priv(sch);
const struct tc_netem_corr *c = nla_data(attr);
init_crandom(&q->delay_cor, c->delay_corr);
@@ -352,28 +698,98 @@ static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
init_crandom(&q->dup_cor, c->dup_corr);
}
-static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
+static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
{
- struct netem_sched_data *q = qdisc_priv(sch);
const struct tc_netem_reorder *r = nla_data(attr);
q->reorder = r->probability;
init_crandom(&q->reorder_cor, r->correlation);
}
-static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
+static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
{
- struct netem_sched_data *q = qdisc_priv(sch);
const struct tc_netem_corrupt *r = nla_data(attr);
q->corrupt = r->probability;
init_crandom(&q->corrupt_cor, r->correlation);
}
+static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
+{
+ const struct tc_netem_rate *r = nla_data(attr);
+
+ q->rate = r->rate;
+ q->packet_overhead = r->packet_overhead;
+ q->cell_size = r->cell_size;
+ q->cell_overhead = r->cell_overhead;
+ if (q->cell_size)
+ q->cell_size_reciprocal = reciprocal_value(q->cell_size);
+ else
+ q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
+}
+
+static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
+{
+ const struct nlattr *la;
+ int rem;
+
+ nla_for_each_nested(la, attr, rem) {
+ u16 type = nla_type(la);
+
+ switch (type) {
+ case NETEM_LOSS_GI: {
+ const struct tc_netem_gimodel *gi = nla_data(la);
+
+ if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
+ pr_info("netem: incorrect gi model size\n");
+ return -EINVAL;
+ }
+
+ q->loss_model = CLG_4_STATES;
+
+ q->clg.state = TX_IN_GAP_PERIOD;
+ q->clg.a1 = gi->p13;
+ q->clg.a2 = gi->p31;
+ q->clg.a3 = gi->p32;
+ q->clg.a4 = gi->p14;
+ q->clg.a5 = gi->p23;
+ break;
+ }
+
+ case NETEM_LOSS_GE: {
+ const struct tc_netem_gemodel *ge = nla_data(la);
+
+ if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
+ pr_info("netem: incorrect ge model size\n");
+ return -EINVAL;
+ }
+
+ q->loss_model = CLG_GILB_ELL;
+ q->clg.state = GOOD_STATE;
+ q->clg.a1 = ge->p;
+ q->clg.a2 = ge->r;
+ q->clg.a3 = ge->h;
+ q->clg.a4 = ge->k1;
+ break;
+ }
+
+ default:
+ pr_info("netem: unknown loss type %u\n", type);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
[TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
[TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
[TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
+ [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) },
+ [TCA_NETEM_LOSS] = { .type = NLA_NESTED },
+ [TCA_NETEM_ECN] = { .type = NLA_U32 },
+ [TCA_NETEM_RATE64] = { .type = NLA_U64 },
};
static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
@@ -381,11 +797,15 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
{
int nested_len = nla_len(nla) - NLA_ALIGN(len);
- if (nested_len < 0)
+ if (nested_len < 0) {
+ pr_info("netem: invalid attributes len %d\n", nested_len);
return -EINVAL;
+ }
+
if (nested_len >= nla_attr_size(0))
return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
nested_len, policy);
+
memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
return 0;
}
@@ -396,6 +816,8 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
struct netem_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_NETEM_MAX + 1];
struct tc_netem_qopt *qopt;
+ struct clgstate old_clg;
+ int old_loss_model = CLG_RANDOM;
int ret;
if (opt == NULL)
@@ -406,12 +828,35 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
if (ret < 0)
return ret;
- ret = fifo_set_limit(q->qdisc, qopt->limit);
- if (ret) {
- pr_debug("netem: can't set fifo limit\n");
- return ret;
+ /* backup q->clg and q->loss_model */
+ old_clg = q->clg;
+ old_loss_model = q->loss_model;
+
+ if (tb[TCA_NETEM_LOSS]) {
+ ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
+ if (ret) {
+ q->loss_model = old_loss_model;
+ return ret;
+ }
+ } else {
+ q->loss_model = CLG_RANDOM;
}
+ if (tb[TCA_NETEM_DELAY_DIST]) {
+ ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
+ if (ret) {
+ /* recover clg and loss_model, in case of
+ * q->clg and q->loss_model were modified
+ * in get_loss_clg()
+ */
+ q->clg = old_clg;
+ q->loss_model = old_loss_model;
+ return ret;
+ }
+ }
+
+ sch->limit = qopt->limit;
+
q->latency = qopt->latency;
q->jitter = qopt->jitter;
q->limit = qopt->limit;
@@ -427,107 +872,27 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
q->reorder = ~0;
if (tb[TCA_NETEM_CORR])
- get_correlation(sch, tb[TCA_NETEM_CORR]);
-
- if (tb[TCA_NETEM_DELAY_DIST]) {
- ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
- if (ret)
- return ret;
- }
+ get_correlation(q, tb[TCA_NETEM_CORR]);
if (tb[TCA_NETEM_REORDER])
- get_reorder(sch, tb[TCA_NETEM_REORDER]);
+ get_reorder(q, tb[TCA_NETEM_REORDER]);
if (tb[TCA_NETEM_CORRUPT])
- get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
+ get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
- return 0;
-}
+ if (tb[TCA_NETEM_RATE])
+ get_rate(q, tb[TCA_NETEM_RATE]);
-/*
- * Special case version of FIFO queue for use by netem.
- * It queues in order based on timestamps in skb's
- */
-struct fifo_sched_data {
- u32 limit;
- psched_time_t oldest;
-};
-
-static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
-{
- struct fifo_sched_data *q = qdisc_priv(sch);
- struct sk_buff_head *list = &sch->q;
- psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
- struct sk_buff *skb;
-
- if (likely(skb_queue_len(list) < q->limit)) {
- /* Optimize for add at tail */
- if (likely(skb_queue_empty(list) || tnext >= q->oldest)) {
- q->oldest = tnext;
- return qdisc_enqueue_tail(nskb, sch);
- }
-
- skb_queue_reverse_walk(list, skb) {
- const struct netem_skb_cb *cb = netem_skb_cb(skb);
-
- if (tnext >= cb->time_to_send)
- break;
- }
-
- __skb_queue_after(list, skb, nskb);
-
- sch->qstats.backlog += qdisc_pkt_len(nskb);
- sch->bstats.bytes += qdisc_pkt_len(nskb);
- sch->bstats.packets++;
-
- return NET_XMIT_SUCCESS;
- }
-
- return qdisc_reshape_fail(nskb, sch);
-}
-
-static int tfifo_init(struct Qdisc *sch, struct nlattr *opt)
-{
- struct fifo_sched_data *q = qdisc_priv(sch);
-
- if (opt) {
- struct tc_fifo_qopt *ctl = nla_data(opt);
- if (nla_len(opt) < sizeof(*ctl))
- return -EINVAL;
-
- q->limit = ctl->limit;
- } else
- q->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);
-
- q->oldest = PSCHED_PASTPERFECT;
- return 0;
-}
-
-static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
-{
- struct fifo_sched_data *q = qdisc_priv(sch);
- struct tc_fifo_qopt opt = { .limit = q->limit };
+ if (tb[TCA_NETEM_RATE64])
+ q->rate = max_t(u64, q->rate,
+ nla_get_u64(tb[TCA_NETEM_RATE64]));
- NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
- return skb->len;
+ if (tb[TCA_NETEM_ECN])
+ q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
-nla_put_failure:
- return -1;
+ return ret;
}
-static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = {
- .id = "tfifo",
- .priv_size = sizeof(struct fifo_sched_data),
- .enqueue = tfifo_enqueue,
- .dequeue = qdisc_dequeue_head,
- .peek = qdisc_peek_head,
- .drop = qdisc_queue_drop,
- .init = tfifo_init,
- .reset = qdisc_reset_queue,
- .change = tfifo_init,
- .dump = tfifo_dump,
-};
-
static int netem_init(struct Qdisc *sch, struct nlattr *opt)
{
struct netem_sched_data *q = qdisc_priv(sch);
@@ -538,18 +903,10 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt)
qdisc_watchdog_init(&q->watchdog, sch);
- q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops,
- TC_H_MAKE(sch->handle, 1));
- if (!q->qdisc) {
- pr_debug("netem: qdisc create failed\n");
- return -ENOMEM;
- }
-
+ q->loss_model = CLG_RANDOM;
ret = netem_change(sch, opt);
- if (ret) {
- pr_debug("netem: change failed\n");
- qdisc_destroy(q->qdisc);
- }
+ if (ret)
+ pr_info("netem: change failed\n");
return ret;
}
@@ -558,19 +915,70 @@ static void netem_destroy(struct Qdisc *sch)
struct netem_sched_data *q = qdisc_priv(sch);
qdisc_watchdog_cancel(&q->watchdog);
- qdisc_destroy(q->qdisc);
- kfree(q->delay_dist);
+ if (q->qdisc)
+ qdisc_destroy(q->qdisc);
+ dist_free(q->delay_dist);
+}
+
+static int dump_loss_model(const struct netem_sched_data *q,
+ struct sk_buff *skb)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_NETEM_LOSS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ switch (q->loss_model) {
+ case CLG_RANDOM:
+ /* legacy loss model */
+ nla_nest_cancel(skb, nest);
+ return 0; /* no data */
+
+ case CLG_4_STATES: {
+ struct tc_netem_gimodel gi = {
+ .p13 = q->clg.a1,
+ .p31 = q->clg.a2,
+ .p32 = q->clg.a3,
+ .p14 = q->clg.a4,
+ .p23 = q->clg.a5,
+ };
+
+ if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
+ goto nla_put_failure;
+ break;
+ }
+ case CLG_GILB_ELL: {
+ struct tc_netem_gemodel ge = {
+ .p = q->clg.a1,
+ .r = q->clg.a2,
+ .h = q->clg.a3,
+ .k1 = q->clg.a4,
+ };
+
+ if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
+ goto nla_put_failure;
+ break;
+ }
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
}
static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
const struct netem_sched_data *q = qdisc_priv(sch);
- unsigned char *b = skb_tail_pointer(skb);
- struct nlattr *nla = (struct nlattr *) b;
+ struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
struct tc_netem_qopt qopt;
struct tc_netem_corr cor;
struct tc_netem_reorder reorder;
struct tc_netem_corrupt corrupt;
+ struct tc_netem_rate rate;
qopt.latency = q->latency;
qopt.jitter = q->jitter;
@@ -578,32 +986,121 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
qopt.loss = q->loss;
qopt.gap = q->gap;
qopt.duplicate = q->duplicate;
- NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
+ if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
+ goto nla_put_failure;
cor.delay_corr = q->delay_cor.rho;
cor.loss_corr = q->loss_cor.rho;
cor.dup_corr = q->dup_cor.rho;
- NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
+ if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
+ goto nla_put_failure;
reorder.probability = q->reorder;
reorder.correlation = q->reorder_cor.rho;
- NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
+ if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
+ goto nla_put_failure;
corrupt.probability = q->corrupt;
corrupt.correlation = q->corrupt_cor.rho;
- NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
+ if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
+ goto nla_put_failure;
- nla->nla_len = skb_tail_pointer(skb) - b;
+ if (q->rate >= (1ULL << 32)) {
+ if (nla_put_u64(skb, TCA_NETEM_RATE64, q->rate))
+ goto nla_put_failure;
+ rate.rate = ~0U;
+ } else {
+ rate.rate = q->rate;
+ }
+ rate.packet_overhead = q->packet_overhead;
+ rate.cell_size = q->cell_size;
+ rate.cell_overhead = q->cell_overhead;
+ if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
+ goto nla_put_failure;
+
+ if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
+ goto nla_put_failure;
+
+ if (dump_loss_model(q, skb) != 0)
+ goto nla_put_failure;
- return skb->len;
+ return nla_nest_end(skb, nla);
nla_put_failure:
- nlmsg_trim(skb, b);
+ nlmsg_trim(skb, nla);
return -1;
}
+static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+
+ if (cl != 1 || !q->qdisc) /* only one class */
+ return -ENOENT;
+
+ tcm->tcm_handle |= TC_H_MIN(1);
+ tcm->tcm_info = q->qdisc->handle;
+
+ return 0;
+}
+
+static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+
+ sch_tree_lock(sch);
+ *old = q->qdisc;
+ q->qdisc = new;
+ if (*old) {
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ }
+ sch_tree_unlock(sch);
+
+ return 0;
+}
+
+static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ return q->qdisc;
+}
+
+static unsigned long netem_get(struct Qdisc *sch, u32 classid)
+{
+ return 1;
+}
+
+static void netem_put(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ if (!walker->stop) {
+ if (walker->count >= walker->skip)
+ if (walker->fn(sch, 1, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ walker->count++;
+ }
+}
+
+static const struct Qdisc_class_ops netem_class_ops = {
+ .graft = netem_graft,
+ .leaf = netem_leaf,
+ .get = netem_get,
+ .put = netem_put,
+ .walk = netem_walk,
+ .dump = netem_dump_class,
+};
+
static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
.id = "netem",
+ .cl_ops = &netem_class_ops,
.priv_size = sizeof(struct netem_sched_data),
.enqueue = netem_enqueue,
.dequeue = netem_dequeue,
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
new file mode 100644
index 00000000000..fefeeb73f15
--- /dev/null
+++ b/net/sched/sch_pie.c
@@ -0,0 +1,566 @@
+/* Copyright (C) 2013 Cisco Systems, Inc, 2013.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Author: Vijay Subramanian <vijaynsu@cisco.com>
+ * Author: Mythili Prabhu <mysuryan@cisco.com>
+ *
+ * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no>
+ * University of Oslo, Norway.
+ *
+ * References:
+ * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00
+ * IEEE Conference on High Performance Switching and Routing 2013 :
+ * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem"
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+
+#define QUEUE_THRESHOLD 10000
+#define DQCOUNT_INVALID -1
+#define MAX_PROB 0xffffffff
+#define PIE_SCALE 8
+
+/* parameters used */
+struct pie_params {
+ psched_time_t target; /* user specified target delay in pschedtime */
+ u32 tupdate; /* timer frequency (in jiffies) */
+ u32 limit; /* number of packets that can be enqueued */
+ u32 alpha; /* alpha and beta are between 0 and 32 */
+ u32 beta; /* and are used for shift relative to 1 */
+ bool ecn; /* true if ecn is enabled */
+ bool bytemode; /* to scale drop early prob based on pkt size */
+};
+
+/* variables used */
+struct pie_vars {
+ u32 prob; /* probability but scaled by u32 limit. */
+ psched_time_t burst_time;
+ psched_time_t qdelay;
+ psched_time_t qdelay_old;
+ u64 dq_count; /* measured in bytes */
+ psched_time_t dq_tstamp; /* drain rate */
+ u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */
+ u32 qlen_old; /* in bytes */
+};
+
+/* statistics gathering */
+struct pie_stats {
+ u32 packets_in; /* total number of packets enqueued */
+ u32 dropped; /* packets dropped due to pie_action */
+ u32 overlimit; /* dropped due to lack of space in queue */
+ u32 maxq; /* maximum queue size */
+ u32 ecn_mark; /* packets marked with ECN */
+};
+
+/* private data for the Qdisc */
+struct pie_sched_data {
+ struct pie_params params;
+ struct pie_vars vars;
+ struct pie_stats stats;
+ struct timer_list adapt_timer;
+};
+
+static void pie_params_init(struct pie_params *params)
+{
+ params->alpha = 2;
+ params->beta = 20;
+ params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */
+ params->limit = 1000; /* default of 1000 packets */
+ params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */
+ params->ecn = false;
+ params->bytemode = false;
+}
+
+static void pie_vars_init(struct pie_vars *vars)
+{
+ vars->dq_count = DQCOUNT_INVALID;
+ vars->avg_dq_rate = 0;
+ /* default of 100 ms in pschedtime */
+ vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC);
+}
+
+static bool drop_early(struct Qdisc *sch, u32 packet_size)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ u32 rnd;
+ u32 local_prob = q->vars.prob;
+ u32 mtu = psched_mtu(qdisc_dev(sch));
+
+ /* If there is still burst allowance left skip random early drop */
+ if (q->vars.burst_time > 0)
+ return false;
+
+ /* If current delay is less than half of target, and
+ * if drop prob is low already, disable early_drop
+ */
+ if ((q->vars.qdelay < q->params.target / 2)
+ && (q->vars.prob < MAX_PROB / 5))
+ return false;
+
+ /* If we have fewer than 2 mtu-sized packets, disable drop_early,
+ * similar to min_th in RED
+ */
+ if (sch->qstats.backlog < 2 * mtu)
+ return false;
+
+ /* If bytemode is turned on, use packet size to compute new
+ * probablity. Smaller packets will have lower drop prob in this case
+ */
+ if (q->params.bytemode && packet_size <= mtu)
+ local_prob = (local_prob / mtu) * packet_size;
+ else
+ local_prob = q->vars.prob;
+
+ rnd = prandom_u32();
+ if (rnd < local_prob)
+ return true;
+
+ return false;
+}
+
+static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ bool enqueue = false;
+
+ if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
+ q->stats.overlimit++;
+ goto out;
+ }
+
+ if (!drop_early(sch, skb->len)) {
+ enqueue = true;
+ } else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) &&
+ INET_ECN_set_ce(skb)) {
+ /* If packet is ecn capable, mark it if drop probability
+ * is lower than 10%, else drop it.
+ */
+ q->stats.ecn_mark++;
+ enqueue = true;
+ }
+
+ /* we can enqueue the packet */
+ if (enqueue) {
+ q->stats.packets_in++;
+ if (qdisc_qlen(sch) > q->stats.maxq)
+ q->stats.maxq = qdisc_qlen(sch);
+
+ return qdisc_enqueue_tail(skb, sch);
+ }
+
+out:
+ q->stats.dropped++;
+ return qdisc_drop(skb, sch);
+}
+
+static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
+ [TCA_PIE_TARGET] = {.type = NLA_U32},
+ [TCA_PIE_LIMIT] = {.type = NLA_U32},
+ [TCA_PIE_TUPDATE] = {.type = NLA_U32},
+ [TCA_PIE_ALPHA] = {.type = NLA_U32},
+ [TCA_PIE_BETA] = {.type = NLA_U32},
+ [TCA_PIE_ECN] = {.type = NLA_U32},
+ [TCA_PIE_BYTEMODE] = {.type = NLA_U32},
+};
+
+static int pie_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_PIE_MAX + 1];
+ unsigned int qlen;
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy);
+ if (err < 0)
+ return err;
+
+ sch_tree_lock(sch);
+
+ /* convert from microseconds to pschedtime */
+ if (tb[TCA_PIE_TARGET]) {
+ /* target is in us */
+ u32 target = nla_get_u32(tb[TCA_PIE_TARGET]);
+
+ /* convert to pschedtime */
+ q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
+ }
+
+ /* tupdate is in jiffies */
+ if (tb[TCA_PIE_TUPDATE])
+ q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]));
+
+ if (tb[TCA_PIE_LIMIT]) {
+ u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]);
+
+ q->params.limit = limit;
+ sch->limit = limit;
+ }
+
+ if (tb[TCA_PIE_ALPHA])
+ q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]);
+
+ if (tb[TCA_PIE_BETA])
+ q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]);
+
+ if (tb[TCA_PIE_ECN])
+ q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]);
+
+ if (tb[TCA_PIE_BYTEMODE])
+ q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]);
+
+ /* Drop excess packets if new limit is lower */
+ qlen = sch->q.qlen;
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = __skb_dequeue(&sch->q);
+
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ qdisc_drop(skb, sch);
+ }
+ qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
+{
+
+ struct pie_sched_data *q = qdisc_priv(sch);
+ int qlen = sch->qstats.backlog; /* current queue size in bytes */
+
+ /* If current queue is about 10 packets or more and dq_count is unset
+ * we have enough packets to calculate the drain rate. Save
+ * current time as dq_tstamp and start measurement cycle.
+ */
+ if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) {
+ q->vars.dq_tstamp = psched_get_time();
+ q->vars.dq_count = 0;
+ }
+
+ /* Calculate the average drain rate from this value. If queue length
+ * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset
+ * the dq_count to -1 as we don't have enough packets to calculate the
+ * drain rate anymore The following if block is entered only when we
+ * have a substantial queue built up (QUEUE_THRESHOLD bytes or more)
+ * and we calculate the drain rate for the threshold here. dq_count is
+ * in bytes, time difference in psched_time, hence rate is in
+ * bytes/psched_time.
+ */
+ if (q->vars.dq_count != DQCOUNT_INVALID) {
+ q->vars.dq_count += skb->len;
+
+ if (q->vars.dq_count >= QUEUE_THRESHOLD) {
+ psched_time_t now = psched_get_time();
+ u32 dtime = now - q->vars.dq_tstamp;
+ u32 count = q->vars.dq_count << PIE_SCALE;
+
+ if (dtime == 0)
+ return;
+
+ count = count / dtime;
+
+ if (q->vars.avg_dq_rate == 0)
+ q->vars.avg_dq_rate = count;
+ else
+ q->vars.avg_dq_rate =
+ (q->vars.avg_dq_rate -
+ (q->vars.avg_dq_rate >> 3)) + (count >> 3);
+
+ /* If the queue has receded below the threshold, we hold
+ * on to the last drain rate calculated, else we reset
+ * dq_count to 0 to re-enter the if block when the next
+ * packet is dequeued
+ */
+ if (qlen < QUEUE_THRESHOLD)
+ q->vars.dq_count = DQCOUNT_INVALID;
+ else {
+ q->vars.dq_count = 0;
+ q->vars.dq_tstamp = psched_get_time();
+ }
+
+ if (q->vars.burst_time > 0) {
+ if (q->vars.burst_time > dtime)
+ q->vars.burst_time -= dtime;
+ else
+ q->vars.burst_time = 0;
+ }
+ }
+ }
+}
+
+static void calculate_probability(struct Qdisc *sch)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ u32 qlen = sch->qstats.backlog; /* queue size in bytes */
+ psched_time_t qdelay = 0; /* in pschedtime */
+ psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */
+ s32 delta = 0; /* determines the change in probability */
+ u32 oldprob;
+ u32 alpha, beta;
+ bool update_prob = true;
+
+ q->vars.qdelay_old = q->vars.qdelay;
+
+ if (q->vars.avg_dq_rate > 0)
+ qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate;
+ else
+ qdelay = 0;
+
+ /* If qdelay is zero and qlen is not, it means qlen is very small, less
+ * than dequeue_rate, so we do not update probabilty in this round
+ */
+ if (qdelay == 0 && qlen != 0)
+ update_prob = false;
+
+ /* In the algorithm, alpha and beta are between 0 and 2 with typical
+ * value for alpha as 0.125. In this implementation, we use values 0-32
+ * passed from user space to represent this. Also, alpha and beta have
+ * unit of HZ and need to be scaled before they can used to update
+ * probability. alpha/beta are updated locally below by 1) scaling them
+ * appropriately 2) scaling down by 16 to come to 0-2 range.
+ * Please see paper for details.
+ *
+ * We scale alpha and beta differently depending on whether we are in
+ * light, medium or high dropping mode.
+ */
+ if (q->vars.prob < MAX_PROB / 100) {
+ alpha =
+ (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
+ beta =
+ (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
+ } else if (q->vars.prob < MAX_PROB / 10) {
+ alpha =
+ (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
+ beta =
+ (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
+ } else {
+ alpha =
+ (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+ beta =
+ (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+ }
+
+ /* alpha and beta should be between 0 and 32, in multiples of 1/16 */
+ delta += alpha * ((qdelay - q->params.target));
+ delta += beta * ((qdelay - qdelay_old));
+
+ oldprob = q->vars.prob;
+
+ /* to ensure we increase probability in steps of no more than 2% */
+ if (delta > (s32) (MAX_PROB / (100 / 2)) &&
+ q->vars.prob >= MAX_PROB / 10)
+ delta = (MAX_PROB / 100) * 2;
+
+ /* Non-linear drop:
+ * Tune drop probability to increase quickly for high delays(>= 250ms)
+ * 250ms is derived through experiments and provides error protection
+ */
+
+ if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC)))
+ delta += MAX_PROB / (100 / 2);
+
+ q->vars.prob += delta;
+
+ if (delta > 0) {
+ /* prevent overflow */
+ if (q->vars.prob < oldprob) {
+ q->vars.prob = MAX_PROB;
+ /* Prevent normalization error. If probability is at
+ * maximum value already, we normalize it here, and
+ * skip the check to do a non-linear drop in the next
+ * section.
+ */
+ update_prob = false;
+ }
+ } else {
+ /* prevent underflow */
+ if (q->vars.prob > oldprob)
+ q->vars.prob = 0;
+ }
+
+ /* Non-linear drop in probability: Reduce drop probability quickly if
+ * delay is 0 for 2 consecutive Tupdate periods.
+ */
+
+ if ((qdelay == 0) && (qdelay_old == 0) && update_prob)
+ q->vars.prob = (q->vars.prob * 98) / 100;
+
+ q->vars.qdelay = qdelay;
+ q->vars.qlen_old = qlen;
+
+ /* We restart the measurement cycle if the following conditions are met
+ * 1. If the delay has been low for 2 consecutive Tupdate periods
+ * 2. Calculated drop probability is zero
+ * 3. We have atleast one estimate for the avg_dq_rate ie.,
+ * is a non-zero value
+ */
+ if ((q->vars.qdelay < q->params.target / 2) &&
+ (q->vars.qdelay_old < q->params.target / 2) &&
+ (q->vars.prob == 0) &&
+ (q->vars.avg_dq_rate > 0))
+ pie_vars_init(&q->vars);
+}
+
+static void pie_timer(unsigned long arg)
+{
+ struct Qdisc *sch = (struct Qdisc *)arg;
+ struct pie_sched_data *q = qdisc_priv(sch);
+ spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+
+ spin_lock(root_lock);
+ calculate_probability(sch);
+
+ /* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */
+ if (q->params.tupdate)
+ mod_timer(&q->adapt_timer, jiffies + q->params.tupdate);
+ spin_unlock(root_lock);
+
+}
+
+static int pie_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+
+ pie_params_init(&q->params);
+ pie_vars_init(&q->vars);
+ sch->limit = q->params.limit;
+
+ setup_timer(&q->adapt_timer, pie_timer, (unsigned long)sch);
+ mod_timer(&q->adapt_timer, jiffies + HZ / 2);
+
+ if (opt) {
+ int err = pie_change(sch, opt);
+
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int pie_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ /* convert target from pschedtime to us */
+ if (nla_put_u32(skb, TCA_PIE_TARGET,
+ ((u32) PSCHED_TICKS2NS(q->params.target)) /
+ NSEC_PER_USEC) ||
+ nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) ||
+ nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) ||
+ nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) ||
+ nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) ||
+ nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) ||
+ nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -1;
+
+}
+
+static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ struct tc_pie_xstats st = {
+ .prob = q->vars.prob,
+ .delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) /
+ NSEC_PER_USEC,
+ /* unscale and return dq_rate in bytes per sec */
+ .avg_dq_rate = q->vars.avg_dq_rate *
+ (PSCHED_TICKS_PER_SEC) >> PIE_SCALE,
+ .packets_in = q->stats.packets_in,
+ .overlimit = q->stats.overlimit,
+ .maxq = q->stats.maxq,
+ .dropped = q->stats.dropped,
+ .ecn_mark = q->stats.ecn_mark,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ skb = __qdisc_dequeue_head(sch, &sch->q);
+
+ if (!skb)
+ return NULL;
+
+ pie_process_dequeue(sch, skb);
+ return skb;
+}
+
+static void pie_reset(struct Qdisc *sch)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ qdisc_reset_queue(sch);
+ pie_vars_init(&q->vars);
+}
+
+static void pie_destroy(struct Qdisc *sch)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ q->params.tupdate = 0;
+ del_timer_sync(&q->adapt_timer);
+}
+
+static struct Qdisc_ops pie_qdisc_ops __read_mostly = {
+ .id = "pie",
+ .priv_size = sizeof(struct pie_sched_data),
+ .enqueue = pie_qdisc_enqueue,
+ .dequeue = pie_qdisc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = pie_init,
+ .destroy = pie_destroy,
+ .reset = pie_reset,
+ .change = pie_change,
+ .dump = pie_dump,
+ .dump_stats = pie_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init pie_module_init(void)
+{
+ return register_qdisc(&pie_qdisc_ops);
+}
+
+static void __exit pie_module_exit(void)
+{
+ unregister_qdisc(&pie_qdisc_ops);
+}
+
+module_init(pie_module_init);
+module_exit(pie_module_exit);
+
+MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler");
+MODULE_AUTHOR("Vijay Subramanian");
+MODULE_AUTHOR("Mythili Prabhu");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
new file mode 100644
index 00000000000..89f8fcf73f1
--- /dev/null
+++ b/net/sched/sch_plug.c
@@ -0,0 +1,233 @@
+/*
+ * sch_plug.c Queue traffic until an explicit release command
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * There are two ways to use this qdisc:
+ * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating
+ * sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands.
+ *
+ * 2. For network output buffering (a.k.a output commit) functionality.
+ * Output commit property is commonly used by applications using checkpoint
+ * based fault-tolerance to ensure that the checkpoint from which a system
+ * is being restored is consistent w.r.t outside world.
+ *
+ * Consider for e.g. Remus - a Virtual Machine checkpointing system,
+ * wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated
+ * asynchronously to the backup host, while the VM continues executing the
+ * next epoch speculatively.
+ *
+ * The following is a typical sequence of output buffer operations:
+ * 1.At epoch i, start_buffer(i)
+ * 2. At end of epoch i (i.e. after 50ms):
+ * 2.1 Stop VM and take checkpoint(i).
+ * 2.2 start_buffer(i+1) and Resume VM
+ * 3. While speculatively executing epoch(i+1), asynchronously replicate
+ * checkpoint(i) to backup host.
+ * 4. When checkpoint_ack(i) is received from backup, release_buffer(i)
+ * Thus, this Qdisc would receive the following sequence of commands:
+ * TCQ_PLUG_BUFFER (epoch i)
+ * .. TCQ_PLUG_BUFFER (epoch i+1)
+ * ....TCQ_PLUG_RELEASE_ONE (epoch i)
+ * ......TCQ_PLUG_BUFFER (epoch i+2)
+ * ........
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+/*
+ * State of the queue, when used for network output buffering:
+ *
+ * plug(i+1) plug(i) head
+ * ------------------+--------------------+---------------->
+ * | |
+ * | |
+ * pkts_current_epoch| pkts_last_epoch |pkts_to_release
+ * ----------------->|<--------+--------->|+--------------->
+ * v v
+ *
+ */
+
+struct plug_sched_data {
+ /* If true, the dequeue function releases all packets
+ * from head to end of the queue. The queue turns into
+ * a pass-through queue for newly arriving packets.
+ */
+ bool unplug_indefinite;
+
+ /* Queue Limit in bytes */
+ u32 limit;
+
+ /* Number of packets (output) from the current speculatively
+ * executing epoch.
+ */
+ u32 pkts_current_epoch;
+
+ /* Number of packets corresponding to the recently finished
+ * epoch. These will be released when we receive a
+ * TCQ_PLUG_RELEASE_ONE command. This command is typically
+ * issued after committing a checkpoint at the target.
+ */
+ u32 pkts_last_epoch;
+
+ /*
+ * Number of packets from the head of the queue, that can
+ * be released (committed checkpoint).
+ */
+ u32 pkts_to_release;
+};
+
+static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct plug_sched_data *q = qdisc_priv(sch);
+
+ if (likely(sch->qstats.backlog + skb->len <= q->limit)) {
+ if (!q->unplug_indefinite)
+ q->pkts_current_epoch++;
+ return qdisc_enqueue_tail(skb, sch);
+ }
+
+ return qdisc_reshape_fail(skb, sch);
+}
+
+static struct sk_buff *plug_dequeue(struct Qdisc *sch)
+{
+ struct plug_sched_data *q = qdisc_priv(sch);
+
+ if (qdisc_is_throttled(sch))
+ return NULL;
+
+ if (!q->unplug_indefinite) {
+ if (!q->pkts_to_release) {
+ /* No more packets to dequeue. Block the queue
+ * and wait for the next release command.
+ */
+ qdisc_throttled(sch);
+ return NULL;
+ }
+ q->pkts_to_release--;
+ }
+
+ return qdisc_dequeue_head(sch);
+}
+
+static int plug_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct plug_sched_data *q = qdisc_priv(sch);
+
+ q->pkts_current_epoch = 0;
+ q->pkts_last_epoch = 0;
+ q->pkts_to_release = 0;
+ q->unplug_indefinite = false;
+
+ if (opt == NULL) {
+ /* We will set a default limit of 100 pkts (~150kB)
+ * in case tx_queue_len is not available. The
+ * default value is completely arbitrary.
+ */
+ u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100;
+ q->limit = pkt_limit * psched_mtu(qdisc_dev(sch));
+ } else {
+ struct tc_plug_qopt *ctl = nla_data(opt);
+
+ if (nla_len(opt) < sizeof(*ctl))
+ return -EINVAL;
+
+ q->limit = ctl->limit;
+ }
+
+ qdisc_throttled(sch);
+ return 0;
+}
+
+/* Receives 4 types of messages:
+ * TCQ_PLUG_BUFFER: Inset a plug into the queue and
+ * buffer any incoming packets
+ * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head
+ * to beginning of the next plug.
+ * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue.
+ * Stop buffering packets until the next TCQ_PLUG_BUFFER
+ * command is received (just act as a pass-thru queue).
+ * TCQ_PLUG_LIMIT: Increase/decrease queue size
+ */
+static int plug_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct plug_sched_data *q = qdisc_priv(sch);
+ struct tc_plug_qopt *msg;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ msg = nla_data(opt);
+ if (nla_len(opt) < sizeof(*msg))
+ return -EINVAL;
+
+ switch (msg->action) {
+ case TCQ_PLUG_BUFFER:
+ /* Save size of the current buffer */
+ q->pkts_last_epoch = q->pkts_current_epoch;
+ q->pkts_current_epoch = 0;
+ if (q->unplug_indefinite)
+ qdisc_throttled(sch);
+ q->unplug_indefinite = false;
+ break;
+ case TCQ_PLUG_RELEASE_ONE:
+ /* Add packets from the last complete buffer to the
+ * packets to be released set.
+ */
+ q->pkts_to_release += q->pkts_last_epoch;
+ q->pkts_last_epoch = 0;
+ qdisc_unthrottled(sch);
+ netif_schedule_queue(sch->dev_queue);
+ break;
+ case TCQ_PLUG_RELEASE_INDEFINITE:
+ q->unplug_indefinite = true;
+ q->pkts_to_release = 0;
+ q->pkts_last_epoch = 0;
+ q->pkts_current_epoch = 0;
+ qdisc_unthrottled(sch);
+ netif_schedule_queue(sch->dev_queue);
+ break;
+ case TCQ_PLUG_LIMIT:
+ /* Limit is supplied in bytes */
+ q->limit = msg->limit;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct Qdisc_ops plug_qdisc_ops __read_mostly = {
+ .id = "plug",
+ .priv_size = sizeof(struct plug_sched_data),
+ .enqueue = plug_enqueue,
+ .dequeue = plug_dequeue,
+ .peek = qdisc_peek_head,
+ .init = plug_init,
+ .change = plug_change,
+ .owner = THIS_MODULE,
+};
+
+static int __init plug_module_init(void)
+{
+ return register_qdisc(&plug_qdisc_ops);
+}
+
+static void __exit plug_module_exit(void)
+{
+ unregister_qdisc(&plug_qdisc_ops);
+}
+module_init(plug_module_init)
+module_exit(plug_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index b1c95bce33c..79359b69ad8 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -22,8 +22,7 @@
#include <net/pkt_sched.h>
-struct prio_sched_data
-{
+struct prio_sched_data {
int bands;
struct tcf_proto *filter_list;
u8 prio2band[TC_PRIO_MAX+1];
@@ -54,7 +53,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
if (!q->filter_list || err < 0) {
if (TC_H_MAJ(band))
band = 0;
- return q->queues[q->prio2band[band&TC_PRIO_MAX]];
+ return q->queues[q->prio2band[band & TC_PRIO_MAX]];
}
band = res.classid;
}
@@ -84,8 +83,6 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, qdisc);
if (ret == NET_XMIT_SUCCESS) {
- sch->bstats.bytes += qdisc_pkt_len(skb);
- sch->bstats.packets++;
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -108,15 +105,16 @@ static struct sk_buff *prio_peek(struct Qdisc *sch)
return NULL;
}
-static struct sk_buff *prio_dequeue(struct Qdisc* sch)
+static struct sk_buff *prio_dequeue(struct Qdisc *sch)
{
struct prio_sched_data *q = qdisc_priv(sch);
int prio;
for (prio = 0; prio < q->bands; prio++) {
struct Qdisc *qdisc = q->queues[prio];
- struct sk_buff *skb = qdisc->dequeue(qdisc);
+ struct sk_buff *skb = qdisc_dequeue_peeked(qdisc);
if (skb) {
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
}
@@ -125,7 +123,7 @@ static struct sk_buff *prio_dequeue(struct Qdisc* sch)
}
-static unsigned int prio_drop(struct Qdisc* sch)
+static unsigned int prio_drop(struct Qdisc *sch)
{
struct prio_sched_data *q = qdisc_priv(sch);
int prio;
@@ -144,24 +142,24 @@ static unsigned int prio_drop(struct Qdisc* sch)
static void
-prio_reset(struct Qdisc* sch)
+prio_reset(struct Qdisc *sch)
{
int prio;
struct prio_sched_data *q = qdisc_priv(sch);
- for (prio=0; prio<q->bands; prio++)
+ for (prio = 0; prio < q->bands; prio++)
qdisc_reset(q->queues[prio]);
sch->q.qlen = 0;
}
static void
-prio_destroy(struct Qdisc* sch)
+prio_destroy(struct Qdisc *sch)
{
int prio;
struct prio_sched_data *q = qdisc_priv(sch);
tcf_destroy_chain(&q->filter_list);
- for (prio=0; prio<q->bands; prio++)
+ for (prio = 0; prio < q->bands; prio++)
qdisc_destroy(q->queues[prio]);
}
@@ -178,7 +176,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
return -EINVAL;
- for (i=0; i<=TC_PRIO_MAX; i++) {
+ for (i = 0; i <= TC_PRIO_MAX; i++) {
if (qopt->priomap[i] >= qopt->bands)
return -EINVAL;
}
@@ -187,7 +185,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
q->bands = qopt->bands;
memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
- for (i=q->bands; i<TCQ_PRIO_BANDS; i++) {
+ for (i = q->bands; i < TCQ_PRIO_BANDS; i++) {
struct Qdisc *child = q->queues[i];
q->queues[i] = &noop_qdisc;
if (child != &noop_qdisc) {
@@ -197,9 +195,10 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
}
sch_tree_unlock(sch);
- for (i=0; i<q->bands; i++) {
+ for (i = 0; i < q->bands; i++) {
if (q->queues[i] == &noop_qdisc) {
struct Qdisc *child, *old;
+
child = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops,
TC_H_MAKE(sch->handle, i + 1));
@@ -225,7 +224,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt)
struct prio_sched_data *q = qdisc_priv(sch);
int i;
- for (i=0; i<TCQ_PRIO_BANDS; i++)
+ for (i = 0; i < TCQ_PRIO_BANDS; i++)
q->queues[i] = &noop_qdisc;
if (opt == NULL) {
@@ -233,7 +232,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt)
} else {
int err;
- if ((err= prio_tune(sch, opt)) != 0)
+ if ((err = prio_tune(sch, opt)) != 0)
return err;
}
return 0;
@@ -246,9 +245,10 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
struct tc_prio_qopt opt;
opt.bands = q->bands;
- memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
+ memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1);
- NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
return skb->len;
@@ -343,7 +343,7 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
arg->count++;
continue;
}
- if (arg->fn(sch, prio+1, arg) < 0) {
+ if (arg->fn(sch, prio + 1, arg) < 0) {
arg->stop = 1;
break;
}
@@ -351,7 +351,7 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
}
}
-static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl)
+static struct tcf_proto **prio_find_tcf(struct Qdisc *sch, unsigned long cl)
{
struct prio_sched_data *q = qdisc_priv(sch);
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
new file mode 100644
index 00000000000..8056fb4e618
--- /dev/null
+++ b/net/sched/sch_qfq.c
@@ -0,0 +1,1582 @@
+/*
+ * net/sched/sch_qfq.c Quick Fair Queueing Plus Scheduler.
+ *
+ * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
+ * Copyright (c) 2012 Paolo Valente.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/pkt_sched.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+
+/* Quick Fair Queueing Plus
+ ========================
+
+ Sources:
+
+ [1] Paolo Valente,
+ "Reducing the Execution Time of Fair-Queueing Schedulers."
+ http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf
+
+ Sources for QFQ:
+
+ [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
+ Packet Scheduling with Tight Bandwidth Distribution Guarantees."
+
+ See also:
+ http://retis.sssup.it/~fabio/linux/qfq/
+ */
+
+/*
+
+ QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
+ classes. Each aggregate is timestamped with a virtual start time S
+ and a virtual finish time F, and scheduled according to its
+ timestamps. S and F are computed as a function of a system virtual
+ time function V. The classes within each aggregate are instead
+ scheduled with DRR.
+
+ To speed up operations, QFQ+ divides also aggregates into a limited
+ number of groups. Which group a class belongs to depends on the
+ ratio between the maximum packet length for the class and the weight
+ of the class. Groups have their own S and F. In the end, QFQ+
+ schedules groups, then aggregates within groups, then classes within
+ aggregates. See [1] and [2] for a full description.
+
+ Virtual time computations.
+
+ S, F and V are all computed in fixed point arithmetic with
+ FRAC_BITS decimal bits.
+
+ QFQ_MAX_INDEX is the maximum index allowed for a group. We need
+ one bit per index.
+ QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
+
+ The layout of the bits is as below:
+
+ [ MTU_SHIFT ][ FRAC_BITS ]
+ [ MAX_INDEX ][ MIN_SLOT_SHIFT ]
+ ^.__grp->index = 0
+ *.__grp->slot_shift
+
+ where MIN_SLOT_SHIFT is derived by difference from the others.
+
+ The max group index corresponds to Lmax/w_min, where
+ Lmax=1<<MTU_SHIFT, w_min = 1 .
+ From this, and knowing how many groups (MAX_INDEX) we want,
+ we can derive the shift corresponding to each group.
+
+ Because we often need to compute
+ F = S + len/w_i and V = V + len/wsum
+ instead of storing w_i store the value
+ inv_w = (1<<FRAC_BITS)/w_i
+ so we can do F = S + len * inv_w * wsum.
+ We use W_TOT in the formulas so we can easily move between
+ static and adaptive weight sum.
+
+ The per-scheduler-instance data contain all the data structures
+ for the scheduler: bitmaps and bucket lists.
+
+ */
+
+/*
+ * Maximum number of consecutive slots occupied by backlogged classes
+ * inside a group.
+ */
+#define QFQ_MAX_SLOTS 32
+
+/*
+ * Shifts used for aggregate<->group mapping. We allow class weights that are
+ * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the
+ * group with the smallest index that can support the L_i / r_i configured
+ * for the classes in the aggregate.
+ *
+ * grp->index is the index of the group; and grp->slot_shift
+ * is the shift for the corresponding (scaled) sigma_i.
+ */
+#define QFQ_MAX_INDEX 24
+#define QFQ_MAX_WSHIFT 10
+
+#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */
+#define QFQ_MAX_WSUM (64*QFQ_MAX_WEIGHT)
+
+#define FRAC_BITS 30 /* fixed point arithmetic */
+#define ONE_FP (1UL << FRAC_BITS)
+
+#define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */
+#define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */
+
+#define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */
+
+/*
+ * Possible group states. These values are used as indexes for the bitmaps
+ * array of struct qfq_queue.
+ */
+enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
+
+struct qfq_group;
+
+struct qfq_aggregate;
+
+struct qfq_class {
+ struct Qdisc_class_common common;
+
+ unsigned int refcnt;
+ unsigned int filter_cnt;
+
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+ struct gnet_stats_rate_est64 rate_est;
+ struct Qdisc *qdisc;
+ struct list_head alist; /* Link for active-classes list. */
+ struct qfq_aggregate *agg; /* Parent aggregate. */
+ int deficit; /* DRR deficit counter. */
+};
+
+struct qfq_aggregate {
+ struct hlist_node next; /* Link for the slot list. */
+ u64 S, F; /* flow timestamps (exact) */
+
+ /* group we belong to. In principle we would need the index,
+ * which is log_2(lmax/weight), but we never reference it
+ * directly, only the group.
+ */
+ struct qfq_group *grp;
+
+ /* these are copied from the flowset. */
+ u32 class_weight; /* Weight of each class in this aggregate. */
+ /* Max pkt size for the classes in this aggregate, DRR quantum. */
+ int lmax;
+
+ u32 inv_w; /* ONE_FP/(sum of weights of classes in aggr.). */
+ u32 budgetmax; /* Max budget for this aggregate. */
+ u32 initial_budget, budget; /* Initial and current budget. */
+
+ int num_classes; /* Number of classes in this aggr. */
+ struct list_head active; /* DRR queue of active classes. */
+
+ struct hlist_node nonfull_next; /* See nonfull_aggs in qfq_sched. */
+};
+
+struct qfq_group {
+ u64 S, F; /* group timestamps (approx). */
+ unsigned int slot_shift; /* Slot shift. */
+ unsigned int index; /* Group index. */
+ unsigned int front; /* Index of the front slot. */
+ unsigned long full_slots; /* non-empty slots */
+
+ /* Array of RR lists of active aggregates. */
+ struct hlist_head slots[QFQ_MAX_SLOTS];
+};
+
+struct qfq_sched {
+ struct tcf_proto *filter_list;
+ struct Qdisc_class_hash clhash;
+
+ u64 oldV, V; /* Precise virtual times. */
+ struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */
+ u32 num_active_agg; /* Num. of active aggregates */
+ u32 wsum; /* weight sum */
+ u32 iwsum; /* inverse weight sum */
+
+ unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */
+ struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
+ u32 min_slot_shift; /* Index of the group-0 bit in the bitmaps. */
+
+ u32 max_agg_classes; /* Max number of classes per aggr. */
+ struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */
+};
+
+/*
+ * Possible reasons why the timestamps of an aggregate are updated
+ * enqueue: the aggregate switches from idle to active and must scheduled
+ * for service
+ * requeue: the aggregate finishes its budget, so it stops being served and
+ * must be rescheduled for service
+ */
+enum update_reason {enqueue, requeue};
+
+static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct Qdisc_class_common *clc;
+
+ clc = qdisc_class_find(&q->clhash, classid);
+ if (clc == NULL)
+ return NULL;
+ return container_of(clc, struct qfq_class, common);
+}
+
+static void qfq_purge_queue(struct qfq_class *cl)
+{
+ unsigned int len = cl->qdisc->q.qlen;
+
+ qdisc_reset(cl->qdisc);
+ qdisc_tree_decrease_qlen(cl->qdisc, len);
+}
+
+static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
+ [TCA_QFQ_WEIGHT] = { .type = NLA_U32 },
+ [TCA_QFQ_LMAX] = { .type = NLA_U32 },
+};
+
+/*
+ * Calculate a flow index, given its weight and maximum packet length.
+ * index = log_2(maxlen/weight) but we need to apply the scaling.
+ * This is used only once at flow creation.
+ */
+static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
+{
+ u64 slot_size = (u64)maxlen * inv_w;
+ unsigned long size_map;
+ int index = 0;
+
+ size_map = slot_size >> min_slot_shift;
+ if (!size_map)
+ goto out;
+
+ index = __fls(size_map) + 1; /* basically a log_2 */
+ index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));
+
+ if (index < 0)
+ index = 0;
+out:
+ pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n",
+ (unsigned long) ONE_FP/inv_w, maxlen, index);
+
+ return index;
+}
+
+static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *);
+static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *,
+ enum update_reason);
+
+static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
+ u32 lmax, u32 weight)
+{
+ INIT_LIST_HEAD(&agg->active);
+ hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
+
+ agg->lmax = lmax;
+ agg->class_weight = weight;
+}
+
+static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q,
+ u32 lmax, u32 weight)
+{
+ struct qfq_aggregate *agg;
+
+ hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next)
+ if (agg->lmax == lmax && agg->class_weight == weight)
+ return agg;
+
+ return NULL;
+}
+
+
+/* Update aggregate as a function of the new number of classes. */
+static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
+ int new_num_classes)
+{
+ u32 new_agg_weight;
+
+ if (new_num_classes == q->max_agg_classes)
+ hlist_del_init(&agg->nonfull_next);
+
+ if (agg->num_classes > new_num_classes &&
+ new_num_classes == q->max_agg_classes - 1) /* agg no more full */
+ hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
+
+ /* The next assignment may let
+ * agg->initial_budget > agg->budgetmax
+ * hold, we will take it into account in charge_actual_service().
+ */
+ agg->budgetmax = new_num_classes * agg->lmax;
+ new_agg_weight = agg->class_weight * new_num_classes;
+ agg->inv_w = ONE_FP/new_agg_weight;
+
+ if (agg->grp == NULL) {
+ int i = qfq_calc_index(agg->inv_w, agg->budgetmax,
+ q->min_slot_shift);
+ agg->grp = &q->groups[i];
+ }
+
+ q->wsum +=
+ (int) agg->class_weight * (new_num_classes - agg->num_classes);
+ q->iwsum = ONE_FP / q->wsum;
+
+ agg->num_classes = new_num_classes;
+}
+
+/* Add class to aggregate. */
+static void qfq_add_to_agg(struct qfq_sched *q,
+ struct qfq_aggregate *agg,
+ struct qfq_class *cl)
+{
+ cl->agg = agg;
+
+ qfq_update_agg(q, agg, agg->num_classes+1);
+ if (cl->qdisc->q.qlen > 0) { /* adding an active class */
+ list_add_tail(&cl->alist, &agg->active);
+ if (list_first_entry(&agg->active, struct qfq_class, alist) ==
+ cl && q->in_serv_agg != agg) /* agg was inactive */
+ qfq_activate_agg(q, agg, enqueue); /* schedule agg */
+ }
+}
+
+static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *);
+
+static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+ if (!hlist_unhashed(&agg->nonfull_next))
+ hlist_del_init(&agg->nonfull_next);
+ q->wsum -= agg->class_weight;
+ if (q->wsum != 0)
+ q->iwsum = ONE_FP / q->wsum;
+
+ if (q->in_serv_agg == agg)
+ q->in_serv_agg = qfq_choose_next_agg(q);
+ kfree(agg);
+}
+
+/* Deschedule class from within its parent aggregate. */
+static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
+{
+ struct qfq_aggregate *agg = cl->agg;
+
+
+ list_del(&cl->alist); /* remove from RR queue of the aggregate */
+ if (list_empty(&agg->active)) /* agg is now inactive */
+ qfq_deactivate_agg(q, agg);
+}
+
+/* Remove class from its parent aggregate. */
+static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
+{
+ struct qfq_aggregate *agg = cl->agg;
+
+ cl->agg = NULL;
+ if (agg->num_classes == 1) { /* agg being emptied, destroy it */
+ qfq_destroy_agg(q, agg);
+ return;
+ }
+ qfq_update_agg(q, agg, agg->num_classes-1);
+}
+
+/* Deschedule class and remove it from its parent aggregate. */
+static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
+{
+ if (cl->qdisc->q.qlen > 0) /* class is active */
+ qfq_deactivate_class(q, cl);
+
+ qfq_rm_from_agg(q, cl);
+}
+
+/* Move class to a new aggregate, matching the new class weight and/or lmax */
+static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight,
+ u32 lmax)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight);
+
+ if (new_agg == NULL) { /* create new aggregate */
+ new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC);
+ if (new_agg == NULL)
+ return -ENOBUFS;
+ qfq_init_agg(q, new_agg, lmax, weight);
+ }
+ qfq_deact_rm_from_agg(q, cl);
+ qfq_add_to_agg(q, new_agg, cl);
+
+ return 0;
+}
+
+static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+ struct nlattr **tca, unsigned long *arg)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl = (struct qfq_class *)*arg;
+ bool existing = false;
+ struct nlattr *tb[TCA_QFQ_MAX + 1];
+ struct qfq_aggregate *new_agg = NULL;
+ u32 weight, lmax, inv_w;
+ int err;
+ int delta_w;
+
+ if (tca[TCA_OPTIONS] == NULL) {
+ pr_notice("qfq: no options\n");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_QFQ_WEIGHT]) {
+ weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]);
+ if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) {
+ pr_notice("qfq: invalid weight %u\n", weight);
+ return -EINVAL;
+ }
+ } else
+ weight = 1;
+
+ if (tb[TCA_QFQ_LMAX]) {
+ lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
+ if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) {
+ pr_notice("qfq: invalid max length %u\n", lmax);
+ return -EINVAL;
+ }
+ } else
+ lmax = psched_mtu(qdisc_dev(sch));
+
+ inv_w = ONE_FP / weight;
+ weight = ONE_FP / inv_w;
+
+ if (cl != NULL &&
+ lmax == cl->agg->lmax &&
+ weight == cl->agg->class_weight)
+ return 0; /* nothing to change */
+
+ delta_w = weight - (cl ? cl->agg->class_weight : 0);
+
+ if (q->wsum + delta_w > QFQ_MAX_WSUM) {
+ pr_notice("qfq: total weight out of range (%d + %u)\n",
+ delta_w, q->wsum);
+ return -EINVAL;
+ }
+
+ if (cl != NULL) { /* modify existing class */
+ if (tca[TCA_RATE]) {
+ err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
+ qdisc_root_sleeping_lock(sch),
+ tca[TCA_RATE]);
+ if (err)
+ return err;
+ }
+ existing = true;
+ goto set_change_agg;
+ }
+
+ /* create and init new class */
+ cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL);
+ if (cl == NULL)
+ return -ENOBUFS;
+
+ cl->refcnt = 1;
+ cl->common.classid = classid;
+ cl->deficit = lmax;
+
+ cl->qdisc = qdisc_create_dflt(sch->dev_queue,
+ &pfifo_qdisc_ops, classid);
+ if (cl->qdisc == NULL)
+ cl->qdisc = &noop_qdisc;
+
+ if (tca[TCA_RATE]) {
+ err = gen_new_estimator(&cl->bstats, &cl->rate_est,
+ qdisc_root_sleeping_lock(sch),
+ tca[TCA_RATE]);
+ if (err)
+ goto destroy_class;
+ }
+
+ sch_tree_lock(sch);
+ qdisc_class_hash_insert(&q->clhash, &cl->common);
+ sch_tree_unlock(sch);
+
+ qdisc_class_hash_grow(sch, &q->clhash);
+
+set_change_agg:
+ sch_tree_lock(sch);
+ new_agg = qfq_find_agg(q, lmax, weight);
+ if (new_agg == NULL) { /* create new aggregate */
+ sch_tree_unlock(sch);
+ new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
+ if (new_agg == NULL) {
+ err = -ENOBUFS;
+ gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ goto destroy_class;
+ }
+ sch_tree_lock(sch);
+ qfq_init_agg(q, new_agg, lmax, weight);
+ }
+ if (existing)
+ qfq_deact_rm_from_agg(q, cl);
+ qfq_add_to_agg(q, new_agg, cl);
+ sch_tree_unlock(sch);
+
+ *arg = (unsigned long)cl;
+ return 0;
+
+destroy_class:
+ qdisc_destroy(cl->qdisc);
+ kfree(cl);
+ return err;
+}
+
+static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+
+ qfq_rm_from_agg(q, cl);
+ gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ qdisc_destroy(cl->qdisc);
+ kfree(cl);
+}
+
+static int qfq_delete_class(struct Qdisc *sch, unsigned long arg)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl = (struct qfq_class *)arg;
+
+ if (cl->filter_cnt > 0)
+ return -EBUSY;
+
+ sch_tree_lock(sch);
+
+ qfq_purge_queue(cl);
+ qdisc_class_hash_remove(&q->clhash, &cl->common);
+
+ BUG_ON(--cl->refcnt == 0);
+ /*
+ * This shouldn't happen: we "hold" one cops->get() when called
+ * from tc_ctl_tclass; the destroy method is done from cops->put().
+ */
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static unsigned long qfq_get_class(struct Qdisc *sch, u32 classid)
+{
+ struct qfq_class *cl = qfq_find_class(sch, classid);
+
+ if (cl != NULL)
+ cl->refcnt++;
+
+ return (unsigned long)cl;
+}
+
+static void qfq_put_class(struct Qdisc *sch, unsigned long arg)
+{
+ struct qfq_class *cl = (struct qfq_class *)arg;
+
+ if (--cl->refcnt == 0)
+ qfq_destroy_class(sch, cl);
+}
+
+static struct tcf_proto **qfq_tcf_chain(struct Qdisc *sch, unsigned long cl)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+
+ return &q->filter_list;
+}
+
+static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ struct qfq_class *cl = qfq_find_class(sch, classid);
+
+ if (cl != NULL)
+ cl->filter_cnt++;
+
+ return (unsigned long)cl;
+}
+
+static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
+{
+ struct qfq_class *cl = (struct qfq_class *)arg;
+
+ cl->filter_cnt--;
+}
+
+static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
+ struct Qdisc *new, struct Qdisc **old)
+{
+ struct qfq_class *cl = (struct qfq_class *)arg;
+
+ if (new == NULL) {
+ new = qdisc_create_dflt(sch->dev_queue,
+ &pfifo_qdisc_ops, cl->common.classid);
+ if (new == NULL)
+ new = &noop_qdisc;
+ }
+
+ sch_tree_lock(sch);
+ qfq_purge_queue(cl);
+ *old = cl->qdisc;
+ cl->qdisc = new;
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct qfq_class *cl = (struct qfq_class *)arg;
+
+ return cl->qdisc;
+}
+
+static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct qfq_class *cl = (struct qfq_class *)arg;
+ struct nlattr *nest;
+
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_handle = cl->common.classid;
+ tcm->tcm_info = cl->qdisc->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) ||
+ nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax))
+ goto nla_put_failure;
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+ struct gnet_dump *d)
+{
+ struct qfq_class *cl = (struct qfq_class *)arg;
+ struct tc_qfq_stats xstats;
+
+ memset(&xstats, 0, sizeof(xstats));
+ cl->qdisc->qstats.qlen = cl->qdisc->q.qlen;
+
+ xstats.weight = cl->agg->class_weight;
+ xstats.lmax = cl->agg->lmax;
+
+ if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0)
+ return -1;
+
+ return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl;
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+}
+
+static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl;
+ struct tcf_result res;
+ int result;
+
+ if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
+ pr_debug("qfq_classify: found %d\n", skb->priority);
+ cl = qfq_find_class(sch, skb->priority);
+ if (cl != NULL)
+ return cl;
+ }
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ result = tc_classify(skb, q->filter_list, &res);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_QUEUED:
+ case TC_ACT_STOLEN:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+#endif
+ cl = (struct qfq_class *)res.class;
+ if (cl == NULL)
+ cl = qfq_find_class(sch, res.classid);
+ return cl;
+ }
+
+ return NULL;
+}
+
+/* Generic comparison function, handling wraparound. */
+static inline int qfq_gt(u64 a, u64 b)
+{
+ return (s64)(a - b) > 0;
+}
+
+/* Round a precise timestamp to its slotted value. */
+static inline u64 qfq_round_down(u64 ts, unsigned int shift)
+{
+ return ts & ~((1ULL << shift) - 1);
+}
+
+/* return the pointer to the group with lowest index in the bitmap */
+static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
+ unsigned long bitmap)
+{
+ int index = __ffs(bitmap);
+ return &q->groups[index];
+}
+/* Calculate a mask to mimic what would be ffs_from(). */
+static inline unsigned long mask_from(unsigned long bitmap, int from)
+{
+ return bitmap & ~((1UL << from) - 1);
+}
+
+/*
+ * The state computation relies on ER=0, IR=1, EB=2, IB=3
+ * First compute eligibility comparing grp->S, q->V,
+ * then check if someone is blocking us and possibly add EB
+ */
+static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp)
+{
+ /* if S > V we are not eligible */
+ unsigned int state = qfq_gt(grp->S, q->V);
+ unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
+ struct qfq_group *next;
+
+ if (mask) {
+ next = qfq_ffs(q, mask);
+ if (qfq_gt(grp->F, next->F))
+ state |= EB;
+ }
+
+ return state;
+}
+
+
+/*
+ * In principle
+ * q->bitmaps[dst] |= q->bitmaps[src] & mask;
+ * q->bitmaps[src] &= ~mask;
+ * but we should make sure that src != dst
+ */
+static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask,
+ int src, int dst)
+{
+ q->bitmaps[dst] |= q->bitmaps[src] & mask;
+ q->bitmaps[src] &= ~mask;
+}
+
+static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
+{
+ unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
+ struct qfq_group *next;
+
+ if (mask) {
+ next = qfq_ffs(q, mask);
+ if (!qfq_gt(next->F, old_F))
+ return;
+ }
+
+ mask = (1UL << index) - 1;
+ qfq_move_groups(q, mask, EB, ER);
+ qfq_move_groups(q, mask, IB, IR);
+}
+
+/*
+ * perhaps
+ *
+ old_V ^= q->V;
+ old_V >>= q->min_slot_shift;
+ if (old_V) {
+ ...
+ }
+ *
+ */
+static void qfq_make_eligible(struct qfq_sched *q)
+{
+ unsigned long vslot = q->V >> q->min_slot_shift;
+ unsigned long old_vslot = q->oldV >> q->min_slot_shift;
+
+ if (vslot != old_vslot) {
+ unsigned long mask;
+ int last_flip_pos = fls(vslot ^ old_vslot);
+
+ if (last_flip_pos > 31) /* higher than the number of groups */
+ mask = ~0UL; /* make all groups eligible */
+ else
+ mask = (1UL << last_flip_pos) - 1;
+
+ qfq_move_groups(q, mask, IR, ER);
+ qfq_move_groups(q, mask, IB, EB);
+ }
+}
+
+/*
+ * The index of the slot in which the input aggregate agg is to be
+ * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2'
+ * and not a '-1' because the start time of the group may be moved
+ * backward by one slot after the aggregate has been inserted, and
+ * this would cause non-empty slots to be right-shifted by one
+ * position.
+ *
+ * QFQ+ fully satisfies this bound to the slot index if the parameters
+ * of the classes are not changed dynamically, and if QFQ+ never
+ * happens to postpone the service of agg unjustly, i.e., it never
+ * happens that the aggregate becomes backlogged and eligible, or just
+ * eligible, while an aggregate with a higher approximated finish time
+ * is being served. In particular, in this case QFQ+ guarantees that
+ * the timestamps of agg are low enough that the slot index is never
+ * higher than 2. Unfortunately, QFQ+ cannot provide the same
+ * guarantee if it happens to unjustly postpone the service of agg, or
+ * if the parameters of some class are changed.
+ *
+ * As for the first event, i.e., an out-of-order service, the
+ * upper bound to the slot index guaranteed by QFQ+ grows to
+ * 2 +
+ * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
+ * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1.
+ *
+ * The following function deals with this problem by backward-shifting
+ * the timestamps of agg, if needed, so as to guarantee that the slot
+ * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may
+ * cause the service of other aggregates to be postponed, yet the
+ * worst-case guarantees of these aggregates are not violated. In
+ * fact, in case of no out-of-order service, the timestamps of agg
+ * would have been even lower than they are after the backward shift,
+ * because QFQ+ would have guaranteed a maximum value equal to 2 for
+ * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose
+ * service is postponed because of the backward-shift would have
+ * however waited for the service of agg before being served.
+ *
+ * The other event that may cause the slot index to be higher than 2
+ * for agg is a recent change of the parameters of some class. If the
+ * weight of a class is increased or the lmax (max_pkt_size) of the
+ * class is decreased, then a new aggregate with smaller slot size
+ * than the original parent aggregate of the class may happen to be
+ * activated. The activation of this aggregate should be properly
+ * delayed to when the service of the class has finished in the ideal
+ * system tracked by QFQ+. If the activation of the aggregate is not
+ * delayed to this reference time instant, then this aggregate may be
+ * unjustly served before other aggregates waiting for service. This
+ * may cause the above bound to the slot index to be violated for some
+ * of these unlucky aggregates.
+ *
+ * Instead of delaying the activation of the new aggregate, which is
+ * quite complex, the above-discussed capping of the slot index is
+ * used to handle also the consequences of a change of the parameters
+ * of a class.
+ */
+static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
+ u64 roundedS)
+{
+ u64 slot = (roundedS - grp->S) >> grp->slot_shift;
+ unsigned int i; /* slot index in the bucket list */
+
+ if (unlikely(slot > QFQ_MAX_SLOTS - 2)) {
+ u64 deltaS = roundedS - grp->S -
+ ((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift);
+ agg->S -= deltaS;
+ agg->F -= deltaS;
+ slot = QFQ_MAX_SLOTS - 2;
+ }
+
+ i = (grp->front + slot) % QFQ_MAX_SLOTS;
+
+ hlist_add_head(&agg->next, &grp->slots[i]);
+ __set_bit(slot, &grp->full_slots);
+}
+
+/* Maybe introduce hlist_first_entry?? */
+static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp)
+{
+ return hlist_entry(grp->slots[grp->front].first,
+ struct qfq_aggregate, next);
+}
+
+/*
+ * remove the entry from the slot
+ */
+static void qfq_front_slot_remove(struct qfq_group *grp)
+{
+ struct qfq_aggregate *agg = qfq_slot_head(grp);
+
+ BUG_ON(!agg);
+ hlist_del(&agg->next);
+ if (hlist_empty(&grp->slots[grp->front]))
+ __clear_bit(0, &grp->full_slots);
+}
+
+/*
+ * Returns the first aggregate in the first non-empty bucket of the
+ * group. As a side effect, adjusts the bucket list so the first
+ * non-empty bucket is at position 0 in full_slots.
+ */
+static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp)
+{
+ unsigned int i;
+
+ pr_debug("qfq slot_scan: grp %u full %#lx\n",
+ grp->index, grp->full_slots);
+
+ if (grp->full_slots == 0)
+ return NULL;
+
+ i = __ffs(grp->full_slots); /* zero based */
+ if (i > 0) {
+ grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
+ grp->full_slots >>= i;
+ }
+
+ return qfq_slot_head(grp);
+}
+
+/*
+ * adjust the bucket list. When the start time of a group decreases,
+ * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
+ * move the objects. The mask of occupied slots must be shifted
+ * because we use ffs() to find the first non-empty slot.
+ * This covers decreases in the group's start time, but what about
+ * increases of the start time ?
+ * Here too we should make sure that i is less than 32
+ */
+static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
+{
+ unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
+
+ grp->full_slots <<= i;
+ grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
+}
+
+static void qfq_update_eligible(struct qfq_sched *q)
+{
+ struct qfq_group *grp;
+ unsigned long ineligible;
+
+ ineligible = q->bitmaps[IR] | q->bitmaps[IB];
+ if (ineligible) {
+ if (!q->bitmaps[ER]) {
+ grp = qfq_ffs(q, ineligible);
+ if (qfq_gt(grp->S, q->V))
+ q->V = grp->S;
+ }
+ qfq_make_eligible(q);
+ }
+}
+
+/* Dequeue head packet of the head class in the DRR queue of the aggregate. */
+static void agg_dequeue(struct qfq_aggregate *agg,
+ struct qfq_class *cl, unsigned int len)
+{
+ qdisc_dequeue_peeked(cl->qdisc);
+
+ cl->deficit -= (int) len;
+
+ if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
+ list_del(&cl->alist);
+ else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) {
+ cl->deficit += agg->lmax;
+ list_move_tail(&cl->alist, &agg->active);
+ }
+}
+
+static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg,
+ struct qfq_class **cl,
+ unsigned int *len)
+{
+ struct sk_buff *skb;
+
+ *cl = list_first_entry(&agg->active, struct qfq_class, alist);
+ skb = (*cl)->qdisc->ops->peek((*cl)->qdisc);
+ if (skb == NULL)
+ WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n");
+ else
+ *len = qdisc_pkt_len(skb);
+
+ return skb;
+}
+
+/* Update F according to the actual service received by the aggregate. */
+static inline void charge_actual_service(struct qfq_aggregate *agg)
+{
+ /* Compute the service received by the aggregate, taking into
+ * account that, after decreasing the number of classes in
+ * agg, it may happen that
+ * agg->initial_budget - agg->budget > agg->bugdetmax
+ */
+ u32 service_received = min(agg->budgetmax,
+ agg->initial_budget - agg->budget);
+
+ agg->F = agg->S + (u64)service_received * agg->inv_w;
+}
+
+/* Assign a reasonable start time for a new aggregate in group i.
+ * Admissible values for \hat(F) are multiples of \sigma_i
+ * no greater than V+\sigma_i . Larger values mean that
+ * we had a wraparound so we consider the timestamp to be stale.
+ *
+ * If F is not stale and F >= V then we set S = F.
+ * Otherwise we should assign S = V, but this may violate
+ * the ordering in EB (see [2]). So, if we have groups in ER,
+ * set S to the F_j of the first group j which would be blocking us.
+ * We are guaranteed not to move S backward because
+ * otherwise our group i would still be blocked.
+ */
+static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+ unsigned long mask;
+ u64 limit, roundedF;
+ int slot_shift = agg->grp->slot_shift;
+
+ roundedF = qfq_round_down(agg->F, slot_shift);
+ limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
+
+ if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) {
+ /* timestamp was stale */
+ mask = mask_from(q->bitmaps[ER], agg->grp->index);
+ if (mask) {
+ struct qfq_group *next = qfq_ffs(q, mask);
+ if (qfq_gt(roundedF, next->F)) {
+ if (qfq_gt(limit, next->F))
+ agg->S = next->F;
+ else /* preserve timestamp correctness */
+ agg->S = limit;
+ return;
+ }
+ }
+ agg->S = q->V;
+ } else /* timestamp is not stale */
+ agg->S = agg->F;
+}
+
+/* Update the timestamps of agg before scheduling/rescheduling it for
+ * service. In particular, assign to agg->F its maximum possible
+ * value, i.e., the virtual finish time with which the aggregate
+ * should be labeled if it used all its budget once in service.
+ */
+static inline void
+qfq_update_agg_ts(struct qfq_sched *q,
+ struct qfq_aggregate *agg, enum update_reason reason)
+{
+ if (reason != requeue)
+ qfq_update_start(q, agg);
+ else /* just charge agg for the service received */
+ agg->S = agg->F;
+
+ agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
+}
+
+static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg);
+
+static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_aggregate *in_serv_agg = q->in_serv_agg;
+ struct qfq_class *cl;
+ struct sk_buff *skb = NULL;
+ /* next-packet len, 0 means no more active classes in in-service agg */
+ unsigned int len = 0;
+
+ if (in_serv_agg == NULL)
+ return NULL;
+
+ if (!list_empty(&in_serv_agg->active))
+ skb = qfq_peek_skb(in_serv_agg, &cl, &len);
+
+ /*
+ * If there are no active classes in the in-service aggregate,
+ * or if the aggregate has not enough budget to serve its next
+ * class, then choose the next aggregate to serve.
+ */
+ if (len == 0 || in_serv_agg->budget < len) {
+ charge_actual_service(in_serv_agg);
+
+ /* recharge the budget of the aggregate */
+ in_serv_agg->initial_budget = in_serv_agg->budget =
+ in_serv_agg->budgetmax;
+
+ if (!list_empty(&in_serv_agg->active)) {
+ /*
+ * Still active: reschedule for
+ * service. Possible optimization: if no other
+ * aggregate is active, then there is no point
+ * in rescheduling this aggregate, and we can
+ * just keep it as the in-service one. This
+ * should be however a corner case, and to
+ * handle it, we would need to maintain an
+ * extra num_active_aggs field.
+ */
+ qfq_update_agg_ts(q, in_serv_agg, requeue);
+ qfq_schedule_agg(q, in_serv_agg);
+ } else if (sch->q.qlen == 0) { /* no aggregate to serve */
+ q->in_serv_agg = NULL;
+ return NULL;
+ }
+
+ /*
+ * If we get here, there are other aggregates queued:
+ * choose the new aggregate to serve.
+ */
+ in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q);
+ skb = qfq_peek_skb(in_serv_agg, &cl, &len);
+ }
+ if (!skb)
+ return NULL;
+
+ sch->q.qlen--;
+ qdisc_bstats_update(sch, skb);
+
+ agg_dequeue(in_serv_agg, cl, len);
+ /* If lmax is lowered, through qfq_change_class, for a class
+ * owning pending packets with larger size than the new value
+ * of lmax, then the following condition may hold.
+ */
+ if (unlikely(in_serv_agg->budget < len))
+ in_serv_agg->budget = 0;
+ else
+ in_serv_agg->budget -= len;
+
+ q->V += (u64)len * q->iwsum;
+ pr_debug("qfq dequeue: len %u F %lld now %lld\n",
+ len, (unsigned long long) in_serv_agg->F,
+ (unsigned long long) q->V);
+
+ return skb;
+}
+
+static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
+{
+ struct qfq_group *grp;
+ struct qfq_aggregate *agg, *new_front_agg;
+ u64 old_F;
+
+ qfq_update_eligible(q);
+ q->oldV = q->V;
+
+ if (!q->bitmaps[ER])
+ return NULL;
+
+ grp = qfq_ffs(q, q->bitmaps[ER]);
+ old_F = grp->F;
+
+ agg = qfq_slot_head(grp);
+
+ /* agg starts to be served, remove it from schedule */
+ qfq_front_slot_remove(grp);
+
+ new_front_agg = qfq_slot_scan(grp);
+
+ if (new_front_agg == NULL) /* group is now inactive, remove from ER */
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ else {
+ u64 roundedS = qfq_round_down(new_front_agg->S,
+ grp->slot_shift);
+ unsigned int s;
+
+ if (grp->S == roundedS)
+ return agg;
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift);
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ }
+
+ qfq_unblock_groups(q, grp->index, old_F);
+
+ return agg;
+}
+
+static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl;
+ struct qfq_aggregate *agg;
+ int err = 0;
+
+ cl = qfq_classify(skb, sch, &err);
+ if (cl == NULL) {
+ if (err & __NET_XMIT_BYPASS)
+ sch->qstats.drops++;
+ kfree_skb(skb);
+ return err;
+ }
+ pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid);
+
+ if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) {
+ pr_debug("qfq: increasing maxpkt from %u to %u for class %u",
+ cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid);
+ err = qfq_change_agg(sch, cl, cl->agg->class_weight,
+ qdisc_pkt_len(skb));
+ if (err)
+ return err;
+ }
+
+ err = qdisc_enqueue(skb, cl->qdisc);
+ if (unlikely(err != NET_XMIT_SUCCESS)) {
+ pr_debug("qfq_enqueue: enqueue failed %d\n", err);
+ if (net_xmit_drop_count(err)) {
+ cl->qstats.drops++;
+ sch->qstats.drops++;
+ }
+ return err;
+ }
+
+ bstats_update(&cl->bstats, skb);
+ ++sch->q.qlen;
+
+ agg = cl->agg;
+ /* if the queue was not empty, then done here */
+ if (cl->qdisc->q.qlen != 1) {
+ if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
+ list_first_entry(&agg->active, struct qfq_class, alist)
+ == cl && cl->deficit < qdisc_pkt_len(skb))
+ list_move_tail(&cl->alist, &agg->active);
+
+ return err;
+ }
+
+ /* schedule class for service within the aggregate */
+ cl->deficit = agg->lmax;
+ list_add_tail(&cl->alist, &agg->active);
+
+ if (list_first_entry(&agg->active, struct qfq_class, alist) != cl ||
+ q->in_serv_agg == agg)
+ return err; /* non-empty or in service, nothing else to do */
+
+ qfq_activate_agg(q, agg, enqueue);
+
+ return err;
+}
+
+/*
+ * Schedule aggregate according to its timestamps.
+ */
+static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+ struct qfq_group *grp = agg->grp;
+ u64 roundedS;
+ int s;
+
+ roundedS = qfq_round_down(agg->S, grp->slot_shift);
+
+ /*
+ * Insert agg in the correct bucket.
+ * If agg->S >= grp->S we don't need to adjust the
+ * bucket list and simply go to the insertion phase.
+ * Otherwise grp->S is decreasing, we must make room
+ * in the bucket list, and also recompute the group state.
+ * Finally, if there were no flows in this group and nobody
+ * was in ER make sure to adjust V.
+ */
+ if (grp->full_slots) {
+ if (!qfq_gt(grp->S, agg->S))
+ goto skip_update;
+
+ /* create a slot for this agg->S */
+ qfq_slot_rotate(grp, roundedS);
+ /* group was surely ineligible, remove */
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+ } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) &&
+ q->in_serv_agg == NULL)
+ q->V = roundedS;
+
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift);
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+
+ pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
+ s, q->bitmaps[s],
+ (unsigned long long) agg->S,
+ (unsigned long long) agg->F,
+ (unsigned long long) q->V);
+
+skip_update:
+ qfq_slot_insert(grp, agg, roundedS);
+}
+
+
+/* Update agg ts and schedule agg for service */
+static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
+ enum update_reason reason)
+{
+ agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */
+
+ qfq_update_agg_ts(q, agg, reason);
+ if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */
+ q->in_serv_agg = agg; /* start serving this aggregate */
+ /* update V: to be in service, agg must be eligible */
+ q->oldV = q->V = agg->S;
+ } else if (agg != q->in_serv_agg)
+ qfq_schedule_agg(q, agg);
+}
+
+static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
+ struct qfq_aggregate *agg)
+{
+ unsigned int i, offset;
+ u64 roundedS;
+
+ roundedS = qfq_round_down(agg->S, grp->slot_shift);
+ offset = (roundedS - grp->S) >> grp->slot_shift;
+
+ i = (grp->front + offset) % QFQ_MAX_SLOTS;
+
+ hlist_del(&agg->next);
+ if (hlist_empty(&grp->slots[i]))
+ __clear_bit(offset, &grp->full_slots);
+}
+
+/*
+ * Called to forcibly deschedule an aggregate. If the aggregate is
+ * not in the front bucket, or if the latter has other aggregates in
+ * the front bucket, we can simply remove the aggregate with no other
+ * side effects.
+ * Otherwise we must propagate the event up.
+ */
+static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+ struct qfq_group *grp = agg->grp;
+ unsigned long mask;
+ u64 roundedS;
+ int s;
+
+ if (agg == q->in_serv_agg) {
+ charge_actual_service(agg);
+ q->in_serv_agg = qfq_choose_next_agg(q);
+ return;
+ }
+
+ agg->F = agg->S;
+ qfq_slot_remove(q, grp, agg);
+
+ if (!grp->full_slots) {
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[EB]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+
+ if (test_bit(grp->index, &q->bitmaps[ER]) &&
+ !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
+ mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
+ if (mask)
+ mask = ~((1UL << __fls(mask)) - 1);
+ else
+ mask = ~0UL;
+ qfq_move_groups(q, mask, EB, ER);
+ qfq_move_groups(q, mask, IB, IR);
+ }
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ } else if (hlist_empty(&grp->slots[grp->front])) {
+ agg = qfq_slot_scan(grp);
+ roundedS = qfq_round_down(agg->S, grp->slot_shift);
+ if (grp->S != roundedS) {
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[EB]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift);
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ }
+ }
+}
+
+static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl = (struct qfq_class *)arg;
+
+ if (cl->qdisc->q.qlen == 0)
+ qfq_deactivate_class(q, cl);
+}
+
+static unsigned int qfq_drop_from_slot(struct qfq_sched *q,
+ struct hlist_head *slot)
+{
+ struct qfq_aggregate *agg;
+ struct qfq_class *cl;
+ unsigned int len;
+
+ hlist_for_each_entry(agg, slot, next) {
+ list_for_each_entry(cl, &agg->active, alist) {
+
+ if (!cl->qdisc->ops->drop)
+ continue;
+
+ len = cl->qdisc->ops->drop(cl->qdisc);
+ if (len > 0) {
+ if (cl->qdisc->q.qlen == 0)
+ qfq_deactivate_class(q, cl);
+
+ return len;
+ }
+ }
+ }
+ return 0;
+}
+
+static unsigned int qfq_drop(struct Qdisc *sch)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_group *grp;
+ unsigned int i, j, len;
+
+ for (i = 0; i <= QFQ_MAX_INDEX; i++) {
+ grp = &q->groups[i];
+ for (j = 0; j < QFQ_MAX_SLOTS; j++) {
+ len = qfq_drop_from_slot(q, &grp->slots[j]);
+ if (len > 0) {
+ sch->q.qlen--;
+ return len;
+ }
+ }
+
+ }
+
+ return 0;
+}
+
+static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_group *grp;
+ int i, j, err;
+ u32 max_cl_shift, maxbudg_shift, max_classes;
+
+ err = qdisc_class_hash_init(&q->clhash);
+ if (err < 0)
+ return err;
+
+ if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES)
+ max_classes = QFQ_MAX_AGG_CLASSES;
+ else
+ max_classes = qdisc_dev(sch)->tx_queue_len + 1;
+ /* max_cl_shift = floor(log_2(max_classes)) */
+ max_cl_shift = __fls(max_classes);
+ q->max_agg_classes = 1<<max_cl_shift;
+
+ /* maxbudg_shift = log2(max_len * max_classes_per_agg) */
+ maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift;
+ q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX;
+
+ for (i = 0; i <= QFQ_MAX_INDEX; i++) {
+ grp = &q->groups[i];
+ grp->index = i;
+ grp->slot_shift = q->min_slot_shift + i;
+ for (j = 0; j < QFQ_MAX_SLOTS; j++)
+ INIT_HLIST_HEAD(&grp->slots[j]);
+ }
+
+ INIT_HLIST_HEAD(&q->nonfull_aggs);
+
+ return 0;
+}
+
+static void qfq_reset_qdisc(struct Qdisc *sch)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl;
+ unsigned int i;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+ if (cl->qdisc->q.qlen > 0)
+ qfq_deactivate_class(q, cl);
+
+ qdisc_reset(cl->qdisc);
+ }
+ }
+ sch->q.qlen = 0;
+}
+
+static void qfq_destroy_qdisc(struct Qdisc *sch)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl;
+ struct hlist_node *next;
+ unsigned int i;
+
+ tcf_destroy_chain(&q->filter_list);
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
+ common.hnode) {
+ qfq_destroy_class(sch, cl);
+ }
+ }
+ qdisc_class_hash_destroy(&q->clhash);
+}
+
+static const struct Qdisc_class_ops qfq_class_ops = {
+ .change = qfq_change_class,
+ .delete = qfq_delete_class,
+ .get = qfq_get_class,
+ .put = qfq_put_class,
+ .tcf_chain = qfq_tcf_chain,
+ .bind_tcf = qfq_bind_tcf,
+ .unbind_tcf = qfq_unbind_tcf,
+ .graft = qfq_graft_class,
+ .leaf = qfq_class_leaf,
+ .qlen_notify = qfq_qlen_notify,
+ .dump = qfq_dump_class,
+ .dump_stats = qfq_dump_class_stats,
+ .walk = qfq_walk,
+};
+
+static struct Qdisc_ops qfq_qdisc_ops __read_mostly = {
+ .cl_ops = &qfq_class_ops,
+ .id = "qfq",
+ .priv_size = sizeof(struct qfq_sched),
+ .enqueue = qfq_enqueue,
+ .dequeue = qfq_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .drop = qfq_drop,
+ .init = qfq_init_qdisc,
+ .reset = qfq_reset_qdisc,
+ .destroy = qfq_destroy_qdisc,
+ .owner = THIS_MODULE,
+};
+
+static int __init qfq_init(void)
+{
+ return register_qdisc(&qfq_qdisc_ops);
+}
+
+static void __exit qfq_exit(void)
+{
+ unregister_qdisc(&qfq_qdisc_ops);
+}
+
+module_init(qfq_init);
+module_exit(qfq_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 8d42bb3ba54..633e32defdc 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -36,11 +36,12 @@
if RED works correctly.
*/
-struct red_sched_data
-{
+struct red_sched_data {
u32 limit; /* HARD maximal queue length */
unsigned char flags;
+ struct timer_list adapt_timer;
struct red_parms parms;
+ struct red_vars vars;
struct red_stats stats;
struct Qdisc *qdisc;
};
@@ -55,47 +56,47 @@ static inline int red_use_harddrop(struct red_sched_data *q)
return q->flags & TC_RED_HARDDROP;
}
-static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
int ret;
- q->parms.qavg = red_calc_qavg(&q->parms, child->qstats.backlog);
-
- if (red_is_idling(&q->parms))
- red_end_of_idle_period(&q->parms);
-
- switch (red_action(&q->parms, q->parms.qavg)) {
- case RED_DONT_MARK:
- break;
-
- case RED_PROB_MARK:
- sch->qstats.overlimits++;
- if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
- q->stats.prob_drop++;
- goto congestion_drop;
- }
-
- q->stats.prob_mark++;
- break;
-
- case RED_HARD_MARK:
- sch->qstats.overlimits++;
- if (red_use_harddrop(q) || !red_use_ecn(q) ||
- !INET_ECN_set_ce(skb)) {
- q->stats.forced_drop++;
- goto congestion_drop;
- }
-
- q->stats.forced_mark++;
- break;
+ q->vars.qavg = red_calc_qavg(&q->parms,
+ &q->vars,
+ child->qstats.backlog);
+
+ if (red_is_idling(&q->vars))
+ red_end_of_idle_period(&q->vars);
+
+ switch (red_action(&q->parms, &q->vars, q->vars.qavg)) {
+ case RED_DONT_MARK:
+ break;
+
+ case RED_PROB_MARK:
+ sch->qstats.overlimits++;
+ if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.prob_mark++;
+ break;
+
+ case RED_HARD_MARK:
+ sch->qstats.overlimits++;
+ if (red_use_harddrop(q) || !red_use_ecn(q) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.forced_mark++;
+ break;
}
ret = qdisc_enqueue(skb, child);
if (likely(ret == NET_XMIT_SUCCESS)) {
- sch->bstats.bytes += qdisc_pkt_len(skb);
- sch->bstats.packets++;
sch->q.qlen++;
} else if (net_xmit_drop_count(ret)) {
q->stats.pdrop++;
@@ -108,22 +109,24 @@ congestion_drop:
return NET_XMIT_CN;
}
-static struct sk_buff * red_dequeue(struct Qdisc* sch)
+static struct sk_buff *red_dequeue(struct Qdisc *sch)
{
struct sk_buff *skb;
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
skb = child->dequeue(child);
- if (skb)
+ if (skb) {
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
- else if (!red_is_idling(&q->parms))
- red_start_of_idle_period(&q->parms);
-
+ } else {
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
+ }
return skb;
}
-static struct sk_buff * red_peek(struct Qdisc* sch)
+static struct sk_buff *red_peek(struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
@@ -131,7 +134,7 @@ static struct sk_buff * red_peek(struct Qdisc* sch)
return child->ops->peek(child);
}
-static unsigned int red_drop(struct Qdisc* sch)
+static unsigned int red_drop(struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
@@ -144,30 +147,33 @@ static unsigned int red_drop(struct Qdisc* sch)
return len;
}
- if (!red_is_idling(&q->parms))
- red_start_of_idle_period(&q->parms);
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
return 0;
}
-static void red_reset(struct Qdisc* sch)
+static void red_reset(struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
qdisc_reset(q->qdisc);
sch->q.qlen = 0;
- red_restart(&q->parms);
+ red_restart(&q->vars);
}
static void red_destroy(struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
+
+ del_timer_sync(&q->adapt_timer);
qdisc_destroy(q->qdisc);
}
static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
[TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) },
[TCA_RED_STAB] = { .len = RED_STAB_SIZE },
+ [TCA_RED_MAX_P] = { .type = NLA_U32 },
};
static int red_change(struct Qdisc *sch, struct nlattr *opt)
@@ -177,6 +183,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)
struct tc_red_qopt *ctl;
struct Qdisc *child = NULL;
int err;
+ u32 max_P;
if (opt == NULL)
return -EINVAL;
@@ -189,6 +196,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)
tb[TCA_RED_STAB] == NULL)
return -EINVAL;
+ max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0;
+
ctl = nla_data(tb[TCA_RED_PARMS]);
if (ctl->limit > 0) {
@@ -206,22 +215,42 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)
q->qdisc = child;
}
- red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
- ctl->Plog, ctl->Scell_log,
- nla_data(tb[TCA_RED_STAB]));
+ red_set_parms(&q->parms,
+ ctl->qth_min, ctl->qth_max, ctl->Wlog,
+ ctl->Plog, ctl->Scell_log,
+ nla_data(tb[TCA_RED_STAB]),
+ max_P);
+ red_set_vars(&q->vars);
- if (skb_queue_empty(&sch->q))
- red_end_of_idle_period(&q->parms);
+ del_timer(&q->adapt_timer);
+ if (ctl->flags & TC_RED_ADAPTATIVE)
+ mod_timer(&q->adapt_timer, jiffies + HZ/2);
+
+ if (!q->qdisc->q.qlen)
+ red_start_of_idle_period(&q->vars);
sch_tree_unlock(sch);
return 0;
}
-static int red_init(struct Qdisc* sch, struct nlattr *opt)
+static inline void red_adaptative_timer(unsigned long arg)
+{
+ struct Qdisc *sch = (struct Qdisc *)arg;
+ struct red_sched_data *q = qdisc_priv(sch);
+ spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+
+ spin_lock(root_lock);
+ red_adaptative_algo(&q->parms, &q->vars);
+ mod_timer(&q->adapt_timer, jiffies + HZ/2);
+ spin_unlock(root_lock);
+}
+
+static int red_init(struct Qdisc *sch, struct nlattr *opt)
{
struct red_sched_data *q = qdisc_priv(sch);
q->qdisc = &noop_qdisc;
+ setup_timer(&q->adapt_timer, red_adaptative_timer, (unsigned long)sch);
return red_change(sch, opt);
}
@@ -239,10 +268,13 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
.Scell_log = q->parms.Scell_log,
};
+ sch->qstats.backlog = q->qdisc->qstats.backlog;
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
- NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
+ if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) ||
+ nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P))
+ goto nla_put_failure;
return nla_nest_end(skb, opts);
nla_put_failure:
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
new file mode 100644
index 00000000000..9b0f7093d97
--- /dev/null
+++ b/net/sched/sch_sfb.c
@@ -0,0 +1,725 @@
+/*
+ * net/sched/sch_sfb.c Stochastic Fair Blue
+ *
+ * Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * W. Feng, D. Kandlur, D. Saha, K. Shin. Blue:
+ * A New Class of Active Queue Management Algorithms.
+ * U. Michigan CSE-TR-387-99, April 1999.
+ *
+ * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <net/ip.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/flow_keys.h>
+
+/*
+ * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level)
+ * This implementation uses L = 8 and N = 16
+ * This permits us to split one 32bit hash (provided per packet by rxhash or
+ * external classifier) into 8 subhashes of 4 bits.
+ */
+#define SFB_BUCKET_SHIFT 4
+#define SFB_NUMBUCKETS (1 << SFB_BUCKET_SHIFT) /* N bins per Level */
+#define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1)
+#define SFB_LEVELS (32 / SFB_BUCKET_SHIFT) /* L */
+
+/* SFB algo uses a virtual queue, named "bin" */
+struct sfb_bucket {
+ u16 qlen; /* length of virtual queue */
+ u16 p_mark; /* marking probability */
+};
+
+/* We use a double buffering right before hash change
+ * (Section 4.4 of SFB reference : moving hash functions)
+ */
+struct sfb_bins {
+ u32 perturbation; /* jhash perturbation */
+ struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS];
+};
+
+struct sfb_sched_data {
+ struct Qdisc *qdisc;
+ struct tcf_proto *filter_list;
+ unsigned long rehash_interval;
+ unsigned long warmup_time; /* double buffering warmup time in jiffies */
+ u32 max;
+ u32 bin_size; /* maximum queue length per bin */
+ u32 increment; /* d1 */
+ u32 decrement; /* d2 */
+ u32 limit; /* HARD maximal queue length */
+ u32 penalty_rate;
+ u32 penalty_burst;
+ u32 tokens_avail;
+ unsigned long rehash_time;
+ unsigned long token_time;
+
+ u8 slot; /* current active bins (0 or 1) */
+ bool double_buffering;
+ struct sfb_bins bins[2];
+
+ struct {
+ u32 earlydrop;
+ u32 penaltydrop;
+ u32 bucketdrop;
+ u32 queuedrop;
+ u32 childdrop; /* drops in child qdisc */
+ u32 marked; /* ECN mark */
+ } stats;
+};
+
+/*
+ * Each queued skb might be hashed on one or two bins
+ * We store in skb_cb the two hash values.
+ * (A zero value means double buffering was not used)
+ */
+struct sfb_skb_cb {
+ u32 hashes[2];
+};
+
+static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb));
+ return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+/*
+ * If using 'internal' SFB flow classifier, hash comes from skb rxhash
+ * If using external classifier, hash comes from the classid.
+ */
+static u32 sfb_hash(const struct sk_buff *skb, u32 slot)
+{
+ return sfb_skb_cb(skb)->hashes[slot];
+}
+
+/* Probabilities are coded as Q0.16 fixed-point values,
+ * with 0xFFFF representing 65535/65536 (almost 1.0)
+ * Addition and subtraction are saturating in [0, 65535]
+ */
+static u32 prob_plus(u32 p1, u32 p2)
+{
+ u32 res = p1 + p2;
+
+ return min_t(u32, res, SFB_MAX_PROB);
+}
+
+static u32 prob_minus(u32 p1, u32 p2)
+{
+ return p1 > p2 ? p1 - p2 : 0;
+}
+
+static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q)
+{
+ int i;
+ struct sfb_bucket *b = &q->bins[slot].bins[0][0];
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b[hash].qlen < 0xFFFF)
+ b[hash].qlen++;
+ b += SFB_NUMBUCKETS; /* next level */
+ }
+}
+
+static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
+{
+ u32 sfbhash;
+
+ sfbhash = sfb_hash(skb, 0);
+ if (sfbhash)
+ increment_one_qlen(sfbhash, 0, q);
+
+ sfbhash = sfb_hash(skb, 1);
+ if (sfbhash)
+ increment_one_qlen(sfbhash, 1, q);
+}
+
+static void decrement_one_qlen(u32 sfbhash, u32 slot,
+ struct sfb_sched_data *q)
+{
+ int i;
+ struct sfb_bucket *b = &q->bins[slot].bins[0][0];
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b[hash].qlen > 0)
+ b[hash].qlen--;
+ b += SFB_NUMBUCKETS; /* next level */
+ }
+}
+
+static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
+{
+ u32 sfbhash;
+
+ sfbhash = sfb_hash(skb, 0);
+ if (sfbhash)
+ decrement_one_qlen(sfbhash, 0, q);
+
+ sfbhash = sfb_hash(skb, 1);
+ if (sfbhash)
+ decrement_one_qlen(sfbhash, 1, q);
+}
+
+static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
+{
+ b->p_mark = prob_minus(b->p_mark, q->decrement);
+}
+
+static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
+{
+ b->p_mark = prob_plus(b->p_mark, q->increment);
+}
+
+static void sfb_zero_all_buckets(struct sfb_sched_data *q)
+{
+ memset(&q->bins, 0, sizeof(q->bins));
+}
+
+/*
+ * compute max qlen, max p_mark, and avg p_mark
+ */
+static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q)
+{
+ int i;
+ u32 qlen = 0, prob = 0, totalpm = 0;
+ const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0];
+
+ for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) {
+ if (qlen < b->qlen)
+ qlen = b->qlen;
+ totalpm += b->p_mark;
+ if (prob < b->p_mark)
+ prob = b->p_mark;
+ b++;
+ }
+ *prob_r = prob;
+ *avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS);
+ return qlen;
+}
+
+
+static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)
+{
+ q->bins[slot].perturbation = prandom_u32();
+}
+
+static void sfb_swap_slot(struct sfb_sched_data *q)
+{
+ sfb_init_perturbation(q->slot, q);
+ q->slot ^= 1;
+ q->double_buffering = false;
+}
+
+/* Non elastic flows are allowed to use part of the bandwidth, expressed
+ * in "penalty_rate" packets per second, with "penalty_burst" burst
+ */
+static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q)
+{
+ if (q->penalty_rate == 0 || q->penalty_burst == 0)
+ return true;
+
+ if (q->tokens_avail < 1) {
+ unsigned long age = min(10UL * HZ, jiffies - q->token_time);
+
+ q->tokens_avail = (age * q->penalty_rate) / HZ;
+ if (q->tokens_avail > q->penalty_burst)
+ q->tokens_avail = q->penalty_burst;
+ q->token_time = jiffies;
+ if (q->tokens_avail < 1)
+ return true;
+ }
+
+ q->tokens_avail--;
+ return false;
+}
+
+static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q,
+ int *qerr, u32 *salt)
+{
+ struct tcf_result res;
+ int result;
+
+ result = tc_classify(skb, q->filter_list, &res);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return false;
+ }
+#endif
+ *salt = TC_H_MIN(res.classid);
+ return true;
+ }
+ return false;
+}
+
+static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+ int i;
+ u32 p_min = ~0;
+ u32 minqlen = ~0;
+ u32 r, slot, salt, sfbhash;
+ int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ struct flow_keys keys;
+
+ if (unlikely(sch->q.qlen >= q->limit)) {
+ sch->qstats.overlimits++;
+ q->stats.queuedrop++;
+ goto drop;
+ }
+
+ if (q->rehash_interval > 0) {
+ unsigned long limit = q->rehash_time + q->rehash_interval;
+
+ if (unlikely(time_after(jiffies, limit))) {
+ sfb_swap_slot(q);
+ q->rehash_time = jiffies;
+ } else if (unlikely(!q->double_buffering && q->warmup_time > 0 &&
+ time_after(jiffies, limit - q->warmup_time))) {
+ q->double_buffering = true;
+ }
+ }
+
+ if (q->filter_list) {
+ /* If using external classifiers, get result and record it. */
+ if (!sfb_classify(skb, q, &ret, &salt))
+ goto other_drop;
+ keys.src = salt;
+ keys.dst = 0;
+ keys.ports = 0;
+ } else {
+ skb_flow_dissect(skb, &keys);
+ }
+
+ slot = q->slot;
+
+ sfbhash = jhash_3words((__force u32)keys.dst,
+ (__force u32)keys.src,
+ (__force u32)keys.ports,
+ q->bins[slot].perturbation);
+ if (!sfbhash)
+ sfbhash = 1;
+ sfb_skb_cb(skb)->hashes[slot] = sfbhash;
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+ struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b->qlen == 0)
+ decrement_prob(b, q);
+ else if (b->qlen >= q->bin_size)
+ increment_prob(b, q);
+ if (minqlen > b->qlen)
+ minqlen = b->qlen;
+ if (p_min > b->p_mark)
+ p_min = b->p_mark;
+ }
+
+ slot ^= 1;
+ sfb_skb_cb(skb)->hashes[slot] = 0;
+
+ if (unlikely(minqlen >= q->max)) {
+ sch->qstats.overlimits++;
+ q->stats.bucketdrop++;
+ goto drop;
+ }
+
+ if (unlikely(p_min >= SFB_MAX_PROB)) {
+ /* Inelastic flow */
+ if (q->double_buffering) {
+ sfbhash = jhash_3words((__force u32)keys.dst,
+ (__force u32)keys.src,
+ (__force u32)keys.ports,
+ q->bins[slot].perturbation);
+ if (!sfbhash)
+ sfbhash = 1;
+ sfb_skb_cb(skb)->hashes[slot] = sfbhash;
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+ struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b->qlen == 0)
+ decrement_prob(b, q);
+ else if (b->qlen >= q->bin_size)
+ increment_prob(b, q);
+ }
+ }
+ if (sfb_rate_limit(skb, q)) {
+ sch->qstats.overlimits++;
+ q->stats.penaltydrop++;
+ goto drop;
+ }
+ goto enqueue;
+ }
+
+ r = prandom_u32() & SFB_MAX_PROB;
+
+ if (unlikely(r < p_min)) {
+ if (unlikely(p_min > SFB_MAX_PROB / 2)) {
+ /* If we're marking that many packets, then either
+ * this flow is unresponsive, or we're badly congested.
+ * In either case, we want to start dropping packets.
+ */
+ if (r < (p_min - SFB_MAX_PROB / 2) * 2) {
+ q->stats.earlydrop++;
+ goto drop;
+ }
+ }
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.marked++;
+ } else {
+ q->stats.earlydrop++;
+ goto drop;
+ }
+ }
+
+enqueue:
+ ret = qdisc_enqueue(skb, child);
+ if (likely(ret == NET_XMIT_SUCCESS)) {
+ sch->q.qlen++;
+ increment_qlen(skb, q);
+ } else if (net_xmit_drop_count(ret)) {
+ q->stats.childdrop++;
+ sch->qstats.drops++;
+ }
+ return ret;
+
+drop:
+ qdisc_drop(skb, sch);
+ return NET_XMIT_CN;
+other_drop:
+ if (ret & __NET_XMIT_BYPASS)
+ sch->qstats.drops++;
+ kfree_skb(skb);
+ return ret;
+}
+
+static struct sk_buff *sfb_dequeue(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+ struct sk_buff *skb;
+
+ skb = child->dequeue(q->qdisc);
+
+ if (skb) {
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+ decrement_qlen(skb, q);
+ }
+
+ return skb;
+}
+
+static struct sk_buff *sfb_peek(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+
+ return child->ops->peek(child);
+}
+
+/* No sfb_drop -- impossible since the child doesn't return the dropped skb. */
+
+static void sfb_reset(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ qdisc_reset(q->qdisc);
+ sch->q.qlen = 0;
+ q->slot = 0;
+ q->double_buffering = false;
+ sfb_zero_all_buckets(q);
+ sfb_init_perturbation(0, q);
+}
+
+static void sfb_destroy(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&q->filter_list);
+ qdisc_destroy(q->qdisc);
+}
+
+static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = {
+ [TCA_SFB_PARMS] = { .len = sizeof(struct tc_sfb_qopt) },
+};
+
+static const struct tc_sfb_qopt sfb_default_ops = {
+ .rehash_interval = 600 * MSEC_PER_SEC,
+ .warmup_time = 60 * MSEC_PER_SEC,
+ .limit = 0,
+ .max = 25,
+ .bin_size = 20,
+ .increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */
+ .decrement = (SFB_MAX_PROB + 3000) / 6000,
+ .penalty_rate = 10,
+ .penalty_burst = 20,
+};
+
+static int sfb_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child;
+ struct nlattr *tb[TCA_SFB_MAX + 1];
+ const struct tc_sfb_qopt *ctl = &sfb_default_ops;
+ u32 limit;
+ int err;
+
+ if (opt) {
+ err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy);
+ if (err < 0)
+ return -EINVAL;
+
+ if (tb[TCA_SFB_PARMS] == NULL)
+ return -EINVAL;
+
+ ctl = nla_data(tb[TCA_SFB_PARMS]);
+ }
+
+ limit = ctl->limit;
+ if (limit == 0)
+ limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);
+
+ child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit);
+ if (IS_ERR(child))
+ return PTR_ERR(child);
+
+ sch_tree_lock(sch);
+
+ qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+ qdisc_destroy(q->qdisc);
+ q->qdisc = child;
+
+ q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
+ q->warmup_time = msecs_to_jiffies(ctl->warmup_time);
+ q->rehash_time = jiffies;
+ q->limit = limit;
+ q->increment = ctl->increment;
+ q->decrement = ctl->decrement;
+ q->max = ctl->max;
+ q->bin_size = ctl->bin_size;
+ q->penalty_rate = ctl->penalty_rate;
+ q->penalty_burst = ctl->penalty_burst;
+ q->tokens_avail = ctl->penalty_burst;
+ q->token_time = jiffies;
+
+ q->slot = 0;
+ q->double_buffering = false;
+ sfb_zero_all_buckets(q);
+ sfb_init_perturbation(0, q);
+ sfb_init_perturbation(1, q);
+
+ sch_tree_unlock(sch);
+
+ return 0;
+}
+
+static int sfb_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ q->qdisc = &noop_qdisc;
+ return sfb_change(sch, opt);
+}
+
+static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+ struct tc_sfb_qopt opt = {
+ .rehash_interval = jiffies_to_msecs(q->rehash_interval),
+ .warmup_time = jiffies_to_msecs(q->warmup_time),
+ .limit = q->limit,
+ .max = q->max,
+ .bin_size = q->bin_size,
+ .increment = q->increment,
+ .decrement = q->decrement,
+ .penalty_rate = q->penalty_rate,
+ .penalty_burst = q->penalty_burst,
+ };
+
+ sch->qstats.backlog = q->qdisc->qstats.backlog;
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+ if (nla_put(skb, TCA_SFB_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct tc_sfb_xstats st = {
+ .earlydrop = q->stats.earlydrop,
+ .penaltydrop = q->stats.penaltydrop,
+ .bucketdrop = q->stats.bucketdrop,
+ .queuedrop = q->stats.queuedrop,
+ .childdrop = q->stats.childdrop,
+ .marked = q->stats.marked,
+ };
+
+ st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q);
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static int sfb_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ return -ENOSYS;
+}
+
+static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ if (new == NULL)
+ new = &noop_qdisc;
+
+ sch_tree_lock(sch);
+ *old = q->qdisc;
+ q->qdisc = new;
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ return q->qdisc;
+}
+
+static unsigned long sfb_get(struct Qdisc *sch, u32 classid)
+{
+ return 1;
+}
+
+static void sfb_put(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+ struct nlattr **tca, unsigned long *arg)
+{
+ return -ENOSYS;
+}
+
+static int sfb_delete(struct Qdisc *sch, unsigned long cl)
+{
+ return -ENOSYS;
+}
+
+static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ if (!walker->stop) {
+ if (walker->count >= walker->skip)
+ if (walker->fn(sch, 1, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ walker->count++;
+ }
+}
+
+static struct tcf_proto **sfb_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return &q->filter_list;
+}
+
+static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return 0;
+}
+
+
+static const struct Qdisc_class_ops sfb_class_ops = {
+ .graft = sfb_graft,
+ .leaf = sfb_leaf,
+ .get = sfb_get,
+ .put = sfb_put,
+ .change = sfb_change_class,
+ .delete = sfb_delete,
+ .walk = sfb_walk,
+ .tcf_chain = sfb_find_tcf,
+ .bind_tcf = sfb_bind,
+ .unbind_tcf = sfb_put,
+ .dump = sfb_dump_class,
+};
+
+static struct Qdisc_ops sfb_qdisc_ops __read_mostly = {
+ .id = "sfb",
+ .priv_size = sizeof(struct sfb_sched_data),
+ .cl_ops = &sfb_class_ops,
+ .enqueue = sfb_enqueue,
+ .dequeue = sfb_dequeue,
+ .peek = sfb_peek,
+ .init = sfb_init,
+ .reset = sfb_reset,
+ .destroy = sfb_destroy,
+ .change = sfb_change,
+ .dump = sfb_dump,
+ .dump_stats = sfb_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init sfb_module_init(void)
+{
+ return register_qdisc(&sfb_qdisc_ops);
+}
+
+static void __exit sfb_module_exit(void)
+{
+ unregister_qdisc(&sfb_qdisc_ops);
+}
+
+module_init(sfb_module_init)
+module_exit(sfb_module_exit)
+
+MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline");
+MODULE_AUTHOR("Juliusz Chroboczek");
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 3cf478d012d..1af2f73906d 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -17,13 +17,14 @@
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
-#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
-#include <net/ip.h>
+#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/flow_keys.h>
+#include <net/red.h>
/* Stochastic Fairness Queuing algorithm.
@@ -66,105 +67,119 @@
SFQ is superior for this purpose.
IMPLEMENTATION:
- This implementation limits maximal queue length to 128;
- maximal mtu to 2^15-1; number of hash buckets to 1024.
- The only goal of this restrictions was that all data
- fit into one 4K page :-). Struct sfq_sched_data is
- organized in anti-cache manner: all the data for a bucket
- are scattered over different locations. This is not good,
- but it allowed me to put it into 4K.
+ This implementation limits :
+ - maximal queue length per flow to 127 packets.
+ - max mtu to 2^18-1;
+ - max 65408 flows,
+ - number of hash buckets to 65536.
It is easy to increase these values, but not in flight. */
-#define SFQ_DEPTH 128
-#define SFQ_HASH_DIVISOR 1024
+#define SFQ_MAX_DEPTH 127 /* max number of packets per flow */
+#define SFQ_DEFAULT_FLOWS 128
+#define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
+#define SFQ_EMPTY_SLOT 0xffff
+#define SFQ_DEFAULT_HASH_DIVISOR 1024
-/* This type should contain at least SFQ_DEPTH*2 values */
-typedef unsigned char sfq_index;
+/* We use 16 bits to store allot, and want to handle packets up to 64K
+ * Scale allot by 8 (1<<3) so that no overflow occurs.
+ */
+#define SFQ_ALLOT_SHIFT 3
+#define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
-struct sfq_head
-{
+/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
+typedef u16 sfq_index;
+
+/*
+ * We dont use pointers to save space.
+ * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
+ * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
+ * are 'pointers' to dep[] array
+ */
+struct sfq_head {
sfq_index next;
sfq_index prev;
};
-struct sfq_sched_data
-{
-/* Parameters */
- int perturb_period;
- unsigned quantum; /* Allotment per round: MUST BE >= MTU */
- int limit;
+struct sfq_slot {
+ struct sk_buff *skblist_next;
+ struct sk_buff *skblist_prev;
+ sfq_index qlen; /* number of skbs in skblist */
+ sfq_index next; /* next slot in sfq RR chain */
+ struct sfq_head dep; /* anchor in dep[] chains */
+ unsigned short hash; /* hash value (index in ht[]) */
+ short allot; /* credit for this slot */
+
+ unsigned int backlog;
+ struct red_vars vars;
+};
-/* Variables */
+struct sfq_sched_data {
+/* frequently used fields */
+ int limit; /* limit of total number of packets in this qdisc */
+ unsigned int divisor; /* number of slots in hash table */
+ u8 headdrop;
+ u8 maxdepth; /* limit of packets per flow */
+
+ u32 perturbation;
+ u8 cur_depth; /* depth of longest slot */
+ u8 flags;
+ unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
struct tcf_proto *filter_list;
+ sfq_index *ht; /* Hash table ('divisor' slots) */
+ struct sfq_slot *slots; /* Flows table ('maxflows' entries) */
+
+ struct red_parms *red_parms;
+ struct tc_sfqred_stats stats;
+ struct sfq_slot *tail; /* current slot in round */
+
+ struct sfq_head dep[SFQ_MAX_DEPTH + 1];
+ /* Linked lists of slots, indexed by depth
+ * dep[0] : list of unused flows
+ * dep[1] : list of flows with 1 packet
+ * dep[X] : list of flows with X packets
+ */
+
+ unsigned int maxflows; /* number of flows in flows array */
+ int perturb_period;
+ unsigned int quantum; /* Allotment per round: MUST BE >= MTU */
struct timer_list perturb_timer;
- u32 perturbation;
- sfq_index tail; /* Index of current slot in round */
- sfq_index max_depth; /* Maximal depth */
-
- sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */
- sfq_index next[SFQ_DEPTH]; /* Active slots link */
- short allot[SFQ_DEPTH]; /* Current allotment per slot */
- unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */
- struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */
- struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */
};
-static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
+/*
+ * sfq_head are either in a sfq_slot or in dep[] array
+ */
+static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
- return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
+ if (val < SFQ_MAX_FLOWS)
+ return &q->slots[val].dep;
+ return &q->dep[val - SFQ_MAX_FLOWS];
}
-static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
-{
- u32 h, h2;
+/*
+ * In order to be able to quickly rehash our queue when timer changes
+ * q->perturbation, we store flow_keys in skb->cb[]
+ */
+struct sfq_skb_cb {
+ struct flow_keys keys;
+};
- switch (skb->protocol) {
- case htons(ETH_P_IP):
- {
- const struct iphdr *iph;
- int poff;
+static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
+ return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
+}
- if (!pskb_network_may_pull(skb, sizeof(*iph)))
- goto err;
- iph = ip_hdr(skb);
- h = (__force u32)iph->daddr;
- h2 = (__force u32)iph->saddr ^ iph->protocol;
- if (iph->frag_off & htons(IP_MF|IP_OFFSET))
- break;
- poff = proto_ports_offset(iph->protocol);
- if (poff >= 0 &&
- pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
- iph = ip_hdr(skb);
- h2 ^= *(u32*)((void *)iph + iph->ihl * 4 + poff);
- }
- break;
- }
- case htons(ETH_P_IPV6):
- {
- struct ipv6hdr *iph;
- int poff;
-
- if (!pskb_network_may_pull(skb, sizeof(*iph)))
- goto err;
- iph = ipv6_hdr(skb);
- h = (__force u32)iph->daddr.s6_addr32[3];
- h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
- poff = proto_ports_offset(iph->nexthdr);
- if (poff >= 0 &&
- pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) {
- iph = ipv6_hdr(skb);
- h2 ^= *(u32*)((void *)iph + sizeof(*iph) + poff);
- }
- break;
- }
- default:
-err:
- h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
- h2 = (unsigned long)skb->sk;
- }
+static unsigned int sfq_hash(const struct sfq_sched_data *q,
+ const struct sk_buff *skb)
+{
+ const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
+ unsigned int hash;
- return sfq_fold_hash(q, h, h2);
+ hash = jhash_3words((__force u32)keys->dst,
+ (__force u32)keys->src ^ keys->ip_proto,
+ (__force u32)keys->ports, q->perturbation);
+ return hash & (q->divisor - 1);
}
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
@@ -176,11 +191,13 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
if (TC_H_MAJ(skb->priority) == sch->handle &&
TC_H_MIN(skb->priority) > 0 &&
- TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
+ TC_H_MIN(skb->priority) <= q->divisor)
return TC_H_MIN(skb->priority);
- if (!q->filter_list)
+ if (!q->filter_list) {
+ skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
return sfq_hash(q, skb) + 1;
+ }
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
result = tc_classify(skb, q->filter_list, &res);
@@ -194,36 +211,50 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
return 0;
}
#endif
- if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
+ if (TC_H_MIN(res.classid) <= q->divisor)
return TC_H_MIN(res.classid);
}
return 0;
}
+/*
+ * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
+ */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
sfq_index p, n;
- int d = q->qs[x].qlen + SFQ_DEPTH;
+ struct sfq_slot *slot = &q->slots[x];
+ int qlen = slot->qlen;
+
+ p = qlen + SFQ_MAX_FLOWS;
+ n = q->dep[qlen].next;
- p = d;
- n = q->dep[d].next;
- q->dep[x].next = n;
- q->dep[x].prev = p;
- q->dep[p].next = q->dep[n].prev = x;
+ slot->dep.next = n;
+ slot->dep.prev = p;
+
+ q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */
+ sfq_dep_head(q, n)->prev = x;
}
+#define sfq_unlink(q, x, n, p) \
+ do { \
+ n = q->slots[x].dep.next; \
+ p = q->slots[x].dep.prev; \
+ sfq_dep_head(q, p)->next = n; \
+ sfq_dep_head(q, n)->prev = p; \
+ } while (0)
+
+
static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
sfq_index p, n;
+ int d;
- n = q->dep[x].next;
- p = q->dep[x].prev;
- q->dep[p].next = n;
- q->dep[n].prev = p;
-
- if (n == p && q->max_depth == q->qs[x].qlen + 1)
- q->max_depth--;
+ sfq_unlink(q, x, n, p);
+ d = q->slots[x].qlen--;
+ if (n == p && q->cur_depth == d)
+ q->cur_depth--;
sfq_link(q, x);
}
@@ -232,34 +263,76 @@ static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
sfq_index p, n;
int d;
- n = q->dep[x].next;
- p = q->dep[x].prev;
- q->dep[p].next = n;
- q->dep[n].prev = p;
- d = q->qs[x].qlen;
- if (q->max_depth < d)
- q->max_depth = d;
+ sfq_unlink(q, x, n, p);
+ d = ++q->slots[x].qlen;
+ if (q->cur_depth < d)
+ q->cur_depth = d;
sfq_link(q, x);
}
+/* helper functions : might be changed when/if skb use a standard list_head */
+
+/* remove one skb from tail of slot queue */
+static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
+{
+ struct sk_buff *skb = slot->skblist_prev;
+
+ slot->skblist_prev = skb->prev;
+ skb->prev->next = (struct sk_buff *)slot;
+ skb->next = skb->prev = NULL;
+ return skb;
+}
+
+/* remove one skb from head of slot queue */
+static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
+{
+ struct sk_buff *skb = slot->skblist_next;
+
+ slot->skblist_next = skb->next;
+ skb->next->prev = (struct sk_buff *)slot;
+ skb->next = skb->prev = NULL;
+ return skb;
+}
+
+static inline void slot_queue_init(struct sfq_slot *slot)
+{
+ memset(slot, 0, sizeof(*slot));
+ slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
+}
+
+/* add skb to slot queue (tail add) */
+static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
+{
+ skb->prev = slot->skblist_prev;
+ skb->next = (struct sk_buff *)slot;
+ slot->skblist_prev->next = skb;
+ slot->skblist_prev = skb;
+}
+
+#define slot_queue_walk(slot, skb) \
+ for (skb = slot->skblist_next; \
+ skb != (struct sk_buff *)slot; \
+ skb = skb->next)
+
static unsigned int sfq_drop(struct Qdisc *sch)
{
struct sfq_sched_data *q = qdisc_priv(sch);
- sfq_index d = q->max_depth;
+ sfq_index x, d = q->cur_depth;
struct sk_buff *skb;
unsigned int len;
+ struct sfq_slot *slot;
- /* Queue is full! Find the longest slot and
- drop a packet from it */
-
+ /* Queue is full! Find the longest slot and drop tail packet from it */
if (d > 1) {
- sfq_index x = q->dep[d + SFQ_DEPTH].next;
- skb = q->qs[x].prev;
+ x = q->dep[d].next;
+ slot = &q->slots[x];
+drop:
+ skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
len = qdisc_pkt_len(skb);
- __skb_unlink(skb, &q->qs[x]);
- kfree_skb(skb);
+ slot->backlog -= len;
sfq_dec(q, x);
+ kfree_skb(skb);
sch->q.qlen--;
sch->qstats.drops++;
sch->qstats.backlog -= len;
@@ -268,31 +341,43 @@ static unsigned int sfq_drop(struct Qdisc *sch)
if (d == 1) {
/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
- d = q->next[q->tail];
- q->next[q->tail] = q->next[d];
- q->allot[q->next[d]] += q->quantum;
- skb = q->qs[d].prev;
- len = qdisc_pkt_len(skb);
- __skb_unlink(skb, &q->qs[d]);
- kfree_skb(skb);
- sfq_dec(q, d);
- sch->q.qlen--;
- q->ht[q->hash[d]] = SFQ_DEPTH;
- sch->qstats.drops++;
- sch->qstats.backlog -= len;
- return len;
+ x = q->tail->next;
+ slot = &q->slots[x];
+ q->tail->next = slot->next;
+ q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+ goto drop;
}
return 0;
}
+/* Is ECN parameter configured */
+static int sfq_prob_mark(const struct sfq_sched_data *q)
+{
+ return q->flags & TC_RED_ECN;
+}
+
+/* Should packets over max threshold just be marked */
+static int sfq_hard_mark(const struct sfq_sched_data *q)
+{
+ return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
+}
+
+static int sfq_headdrop(const struct sfq_sched_data *q)
+{
+ return q->headdrop;
+}
+
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct sfq_sched_data *q = qdisc_priv(sch);
unsigned int hash;
- sfq_index x;
+ sfq_index x, qlen;
+ struct sfq_slot *slot;
int uninitialized_var(ret);
+ struct sk_buff *head;
+ int delta;
hash = sfq_classify(skb, sch, &ret);
if (hash == 0) {
@@ -304,54 +389,114 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
hash--;
x = q->ht[hash];
- if (x == SFQ_DEPTH) {
- q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
- q->hash[x] = hash;
+ slot = &q->slots[x];
+ if (x == SFQ_EMPTY_SLOT) {
+ x = q->dep[0].next; /* get a free slot */
+ if (x >= SFQ_MAX_FLOWS)
+ return qdisc_drop(skb, sch);
+ q->ht[hash] = x;
+ slot = &q->slots[x];
+ slot->hash = hash;
+ slot->backlog = 0; /* should already be 0 anyway... */
+ red_set_vars(&slot->vars);
+ goto enqueue;
}
+ if (q->red_parms) {
+ slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
+ &slot->vars,
+ slot->backlog);
+ switch (red_action(q->red_parms,
+ &slot->vars,
+ slot->vars.qavg)) {
+ case RED_DONT_MARK:
+ break;
- /* If selected queue has length q->limit, this means that
- * all another queues are empty and that we do simple tail drop,
- * i.e. drop _this_ packet.
- */
- if (q->qs[x].qlen >= q->limit)
- return qdisc_drop(skb, sch);
+ case RED_PROB_MARK:
+ sch->qstats.overlimits++;
+ if (sfq_prob_mark(q)) {
+ /* We know we have at least one packet in queue */
+ if (sfq_headdrop(q) &&
+ INET_ECN_set_ce(slot->skblist_next)) {
+ q->stats.prob_mark_head++;
+ break;
+ }
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.prob_mark++;
+ break;
+ }
+ }
+ q->stats.prob_drop++;
+ goto congestion_drop;
+
+ case RED_HARD_MARK:
+ sch->qstats.overlimits++;
+ if (sfq_hard_mark(q)) {
+ /* We know we have at least one packet in queue */
+ if (sfq_headdrop(q) &&
+ INET_ECN_set_ce(slot->skblist_next)) {
+ q->stats.forced_mark_head++;
+ break;
+ }
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.forced_mark++;
+ break;
+ }
+ }
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+ }
+ if (slot->qlen >= q->maxdepth) {
+congestion_drop:
+ if (!sfq_headdrop(q))
+ return qdisc_drop(skb, sch);
+
+ /* We know we have at least one packet in queue */
+ head = slot_dequeue_head(slot);
+ delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
+ sch->qstats.backlog -= delta;
+ slot->backlog -= delta;
+ qdisc_drop(head, sch);
+
+ slot_queue_add(slot, skb);
+ return NET_XMIT_CN;
+ }
+
+enqueue:
sch->qstats.backlog += qdisc_pkt_len(skb);
- __skb_queue_tail(&q->qs[x], skb);
+ slot->backlog += qdisc_pkt_len(skb);
+ slot_queue_add(slot, skb);
sfq_inc(q, x);
- if (q->qs[x].qlen == 1) { /* The flow is new */
- if (q->tail == SFQ_DEPTH) { /* It is the first flow */
- q->tail = x;
- q->next[x] = x;
- q->allot[x] = q->quantum;
+ if (slot->qlen == 1) { /* The flow is new */
+ if (q->tail == NULL) { /* It is the first flow */
+ slot->next = x;
} else {
- q->next[x] = q->next[q->tail];
- q->next[q->tail] = x;
- q->tail = x;
+ slot->next = q->tail->next;
+ q->tail->next = x;
}
+ /* We put this flow at the end of our flow list.
+ * This might sound unfair for a new flow to wait after old ones,
+ * but we could endup servicing new flows only, and freeze old ones.
+ */
+ q->tail = slot;
+ /* We could use a bigger initial quantum for new flows */
+ slot->allot = q->scaled_quantum;
}
- if (++sch->q.qlen <= q->limit) {
- sch->bstats.bytes += qdisc_pkt_len(skb);
- sch->bstats.packets++;
+ if (++sch->q.qlen <= q->limit)
return NET_XMIT_SUCCESS;
- }
+ qlen = slot->qlen;
sfq_drop(sch);
- return NET_XMIT_CN;
-}
-
-static struct sk_buff *
-sfq_peek(struct Qdisc *sch)
-{
- struct sfq_sched_data *q = qdisc_priv(sch);
- sfq_index a;
-
- /* No active slots */
- if (q->tail == SFQ_DEPTH)
- return NULL;
+ /* Return Congestion Notification only if we dropped a packet
+ * from this flow.
+ */
+ if (qlen != slot->qlen)
+ return NET_XMIT_CN;
- a = q->next[q->tail];
- return skb_peek(&q->qs[a]);
+ /* As we dropped a packet, better let upper stack know this */
+ qdisc_tree_decrease_qlen(sch, 1);
+ return NET_XMIT_SUCCESS;
}
static struct sk_buff *
@@ -359,34 +504,38 @@ sfq_dequeue(struct Qdisc *sch)
{
struct sfq_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
- sfq_index a, old_a;
+ sfq_index a, next_a;
+ struct sfq_slot *slot;
/* No active slots */
- if (q->tail == SFQ_DEPTH)
+ if (q->tail == NULL)
return NULL;
- a = old_a = q->next[q->tail];
-
- /* Grab packet */
- skb = __skb_dequeue(&q->qs[a]);
+next_slot:
+ a = q->tail->next;
+ slot = &q->slots[a];
+ if (slot->allot <= 0) {
+ q->tail = slot;
+ slot->allot += q->scaled_quantum;
+ goto next_slot;
+ }
+ skb = slot_dequeue_head(slot);
sfq_dec(q, a);
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
sch->qstats.backlog -= qdisc_pkt_len(skb);
-
+ slot->backlog -= qdisc_pkt_len(skb);
/* Is the slot empty? */
- if (q->qs[a].qlen == 0) {
- q->ht[q->hash[a]] = SFQ_DEPTH;
- a = q->next[a];
- if (a == old_a) {
- q->tail = SFQ_DEPTH;
+ if (slot->qlen == 0) {
+ q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+ next_a = slot->next;
+ if (a == next_a) {
+ q->tail = NULL; /* no more active slots */
return skb;
}
- q->next[q->tail] = a;
- q->allot[a] += q->quantum;
- } else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
- q->tail = a;
- a = q->next[a];
- q->allot[a] += q->quantum;
+ q->tail->next = next_a;
+ } else {
+ slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
}
return skb;
}
@@ -400,12 +549,90 @@ sfq_reset(struct Qdisc *sch)
kfree_skb(skb);
}
+/*
+ * When q->perturbation is changed, we rehash all queued skbs
+ * to avoid OOO (Out Of Order) effects.
+ * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
+ * counters.
+ */
+static void sfq_rehash(struct Qdisc *sch)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ int i;
+ struct sfq_slot *slot;
+ struct sk_buff_head list;
+ int dropped = 0;
+
+ __skb_queue_head_init(&list);
+
+ for (i = 0; i < q->maxflows; i++) {
+ slot = &q->slots[i];
+ if (!slot->qlen)
+ continue;
+ while (slot->qlen) {
+ skb = slot_dequeue_head(slot);
+ sfq_dec(q, i);
+ __skb_queue_tail(&list, skb);
+ }
+ slot->backlog = 0;
+ red_set_vars(&slot->vars);
+ q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+ }
+ q->tail = NULL;
+
+ while ((skb = __skb_dequeue(&list)) != NULL) {
+ unsigned int hash = sfq_hash(q, skb);
+ sfq_index x = q->ht[hash];
+
+ slot = &q->slots[x];
+ if (x == SFQ_EMPTY_SLOT) {
+ x = q->dep[0].next; /* get a free slot */
+ if (x >= SFQ_MAX_FLOWS) {
+drop: sch->qstats.backlog -= qdisc_pkt_len(skb);
+ kfree_skb(skb);
+ dropped++;
+ continue;
+ }
+ q->ht[hash] = x;
+ slot = &q->slots[x];
+ slot->hash = hash;
+ }
+ if (slot->qlen >= q->maxdepth)
+ goto drop;
+ slot_queue_add(slot, skb);
+ if (q->red_parms)
+ slot->vars.qavg = red_calc_qavg(q->red_parms,
+ &slot->vars,
+ slot->backlog);
+ slot->backlog += qdisc_pkt_len(skb);
+ sfq_inc(q, x);
+ if (slot->qlen == 1) { /* The flow is new */
+ if (q->tail == NULL) { /* It is the first flow */
+ slot->next = x;
+ } else {
+ slot->next = q->tail->next;
+ q->tail->next = x;
+ }
+ q->tail = slot;
+ slot->allot = q->scaled_quantum;
+ }
+ }
+ sch->q.qlen -= dropped;
+ qdisc_tree_decrease_qlen(sch, dropped);
+}
+
static void sfq_perturbation(unsigned long arg)
{
struct Qdisc *sch = (struct Qdisc *)arg;
struct sfq_sched_data *q = qdisc_priv(sch);
+ spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
- q->perturbation = net_random();
+ spin_lock(root_lock);
+ q->perturbation = prandom_u32();
+ if (!q->filter_list && q->tail)
+ sfq_rehash(sch);
+ spin_unlock(root_lock);
if (q->perturb_period)
mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
@@ -415,16 +642,53 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
struct sfq_sched_data *q = qdisc_priv(sch);
struct tc_sfq_qopt *ctl = nla_data(opt);
+ struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
unsigned int qlen;
+ struct red_parms *p = NULL;
if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
return -EINVAL;
-
+ if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
+ ctl_v1 = nla_data(opt);
+ if (ctl->divisor &&
+ (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
+ return -EINVAL;
+ if (ctl_v1 && ctl_v1->qth_min) {
+ p = kmalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ }
sch_tree_lock(sch);
- q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
+ if (ctl->quantum) {
+ q->quantum = ctl->quantum;
+ q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
+ }
q->perturb_period = ctl->perturb_period * HZ;
- if (ctl->limit)
- q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);
+ if (ctl->flows)
+ q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
+ if (ctl->divisor) {
+ q->divisor = ctl->divisor;
+ q->maxflows = min_t(u32, q->maxflows, q->divisor);
+ }
+ if (ctl_v1) {
+ if (ctl_v1->depth)
+ q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
+ if (p) {
+ swap(q->red_parms, p);
+ red_set_parms(q->red_parms,
+ ctl_v1->qth_min, ctl_v1->qth_max,
+ ctl_v1->Wlog,
+ ctl_v1->Plog, ctl_v1->Scell_log,
+ NULL,
+ ctl_v1->max_P);
+ }
+ q->flags = ctl_v1->flags;
+ q->headdrop = ctl_v1->headdrop;
+ }
+ if (ctl->limit) {
+ q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
+ q->maxflows = min_t(u32, q->maxflows, q->limit);
+ }
qlen = sch->q.qlen;
while (sch->q.qlen > q->limit)
@@ -434,12 +698,39 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
del_timer(&q->perturb_timer);
if (q->perturb_period) {
mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
- q->perturbation = net_random();
+ q->perturbation = prandom_u32();
}
sch_tree_unlock(sch);
+ kfree(p);
return 0;
}
+static void *sfq_alloc(size_t sz)
+{
+ void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
+
+ if (!ptr)
+ ptr = vmalloc(sz);
+ return ptr;
+}
+
+static void sfq_free(void *addr)
+{
+ kvfree(addr);
+}
+
+static void sfq_destroy(struct Qdisc *sch)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&q->filter_list);
+ q->perturb_period = 0;
+ del_timer_sync(&q->perturb_timer);
+ sfq_free(q->ht);
+ sfq_free(q->slots);
+ kfree(q->red_parms);
+}
+
static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
struct sfq_sched_data *q = qdisc_priv(sch);
@@ -449,56 +740,77 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
q->perturb_timer.data = (unsigned long)sch;
init_timer_deferrable(&q->perturb_timer);
- for (i = 0; i < SFQ_HASH_DIVISOR; i++)
- q->ht[i] = SFQ_DEPTH;
-
- for (i = 0; i < SFQ_DEPTH; i++) {
- skb_queue_head_init(&q->qs[i]);
- q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
- q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
+ for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
+ q->dep[i].next = i + SFQ_MAX_FLOWS;
+ q->dep[i].prev = i + SFQ_MAX_FLOWS;
}
- q->limit = SFQ_DEPTH - 1;
- q->max_depth = 0;
- q->tail = SFQ_DEPTH;
- if (opt == NULL) {
- q->quantum = psched_mtu(qdisc_dev(sch));
- q->perturb_period = 0;
- q->perturbation = net_random();
- } else {
+ q->limit = SFQ_MAX_DEPTH;
+ q->maxdepth = SFQ_MAX_DEPTH;
+ q->cur_depth = 0;
+ q->tail = NULL;
+ q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
+ q->maxflows = SFQ_DEFAULT_FLOWS;
+ q->quantum = psched_mtu(qdisc_dev(sch));
+ q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
+ q->perturb_period = 0;
+ q->perturbation = prandom_u32();
+
+ if (opt) {
int err = sfq_change(sch, opt);
if (err)
return err;
}
- for (i = 0; i < SFQ_DEPTH; i++)
+ q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
+ q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
+ if (!q->ht || !q->slots) {
+ sfq_destroy(sch);
+ return -ENOMEM;
+ }
+ for (i = 0; i < q->divisor; i++)
+ q->ht[i] = SFQ_EMPTY_SLOT;
+
+ for (i = 0; i < q->maxflows; i++) {
+ slot_queue_init(&q->slots[i]);
sfq_link(q, i);
+ }
+ if (q->limit >= 1)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
-static void sfq_destroy(struct Qdisc *sch)
-{
- struct sfq_sched_data *q = qdisc_priv(sch);
-
- tcf_destroy_chain(&q->filter_list);
- q->perturb_period = 0;
- del_timer_sync(&q->perturb_timer);
-}
-
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct sfq_sched_data *q = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
- struct tc_sfq_qopt opt;
-
- opt.quantum = q->quantum;
- opt.perturb_period = q->perturb_period / HZ;
-
- opt.limit = q->limit;
- opt.divisor = SFQ_HASH_DIVISOR;
- opt.flows = q->limit;
+ struct tc_sfq_qopt_v1 opt;
+ struct red_parms *p = q->red_parms;
+
+ memset(&opt, 0, sizeof(opt));
+ opt.v0.quantum = q->quantum;
+ opt.v0.perturb_period = q->perturb_period / HZ;
+ opt.v0.limit = q->limit;
+ opt.v0.divisor = q->divisor;
+ opt.v0.flows = q->maxflows;
+ opt.depth = q->maxdepth;
+ opt.headdrop = q->headdrop;
+
+ if (p) {
+ opt.qth_min = p->qth_min >> p->Wlog;
+ opt.qth_max = p->qth_max >> p->Wlog;
+ opt.Wlog = p->Wlog;
+ opt.Plog = p->Plog;
+ opt.Scell_log = p->Scell_log;
+ opt.max_P = p->max_P;
+ }
+ memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
+ opt.flags = q->flags;
- NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
return skb->len;
@@ -520,6 +832,8 @@ static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
+ /* we cannot bypass queue discipline anymore */
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
@@ -547,10 +861,17 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct gnet_dump *d)
{
struct sfq_sched_data *q = qdisc_priv(sch);
- sfq_index idx = q->ht[cl-1];
- struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
- struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };
+ sfq_index idx = q->ht[cl - 1];
+ struct gnet_stats_queue qs = { 0 };
+ struct tc_sfq_xstats xstats = { 0 };
+ if (idx != SFQ_EMPTY_SLOT) {
+ const struct sfq_slot *slot = &q->slots[idx];
+
+ xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
+ qs.qlen = slot->qlen;
+ qs.backlog = slot->backlog;
+ }
if (gnet_stats_copy_queue(d, &qs) < 0)
return -1;
return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
@@ -564,8 +885,8 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
if (arg->stop)
return;
- for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
- if (q->ht[i] == SFQ_DEPTH ||
+ for (i = 0; i < q->divisor; i++) {
+ if (q->ht[i] == SFQ_EMPTY_SLOT ||
arg->count < arg->skip) {
arg->count++;
continue;
@@ -596,7 +917,7 @@ static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct sfq_sched_data),
.enqueue = sfq_enqueue,
.dequeue = sfq_dequeue,
- .peek = sfq_peek,
+ .peek = qdisc_peek_dequeued,
.drop = sfq_drop,
.init = sfq_init,
.reset = sfq_reset,
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 641a30d6463..18ff6343370 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -19,6 +19,7 @@
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
+#include <net/sch_generic.h>
#include <net/pkt_sched.h>
@@ -97,35 +98,106 @@
changed the limit is not effective anymore.
*/
-struct tbf_sched_data
-{
+struct tbf_sched_data {
/* Parameters */
u32 limit; /* Maximal length of backlog: bytes */
- u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */
- u32 mtu;
u32 max_size;
- struct qdisc_rate_table *R_tab;
- struct qdisc_rate_table *P_tab;
+ s64 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */
+ s64 mtu;
+ struct psched_ratecfg rate;
+ struct psched_ratecfg peak;
/* Variables */
- long tokens; /* Current number of B tokens */
- long ptokens; /* Current number of P tokens */
- psched_time_t t_c; /* Time check-point */
+ s64 tokens; /* Current number of B tokens */
+ s64 ptokens; /* Current number of P tokens */
+ s64 t_c; /* Time check-point */
struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */
struct qdisc_watchdog watchdog; /* Watchdog timer */
};
-#define L2T(q,L) qdisc_l2t((q)->R_tab,L)
-#define L2T_P(q,L) qdisc_l2t((q)->P_tab,L)
-static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+/* Time to Length, convert time in ns to length in bytes
+ * to determinate how many bytes can be sent in given time.
+ */
+static u64 psched_ns_t2l(const struct psched_ratecfg *r,
+ u64 time_in_ns)
+{
+ /* The formula is :
+ * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
+ */
+ u64 len = time_in_ns * r->rate_bytes_ps;
+
+ do_div(len, NSEC_PER_SEC);
+
+ if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
+ do_div(len, 53);
+ len = len * 48;
+ }
+
+ if (len > r->overhead)
+ len -= r->overhead;
+ else
+ len = 0;
+
+ return len;
+}
+
+/*
+ * Return length of individual segments of a gso packet,
+ * including all headers (MAC, IP, TCP/UDP)
+ */
+static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
+{
+ unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+ return hdr_len + skb_gso_transport_seglen(skb);
+}
+
+/* GSO packet is too big, segment it so that tbf can transmit
+ * each segment in time
+ */
+static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
- int ret;
+ struct sk_buff *segs, *nskb;
+ netdev_features_t features = netif_skb_features(skb);
+ int ret, nb;
+
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
- if (qdisc_pkt_len(skb) > q->max_size)
+ if (IS_ERR_OR_NULL(segs))
return qdisc_reshape_fail(skb, sch);
+ nb = 0;
+ while (segs) {
+ nskb = segs->next;
+ segs->next = NULL;
+ qdisc_skb_cb(segs)->pkt_len = segs->len;
+ ret = qdisc_enqueue(segs, q->qdisc);
+ if (ret != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(ret))
+ sch->qstats.drops++;
+ } else {
+ nb++;
+ }
+ segs = nskb;
+ }
+ sch->q.qlen += nb;
+ if (nb > 1)
+ qdisc_tree_decrease_qlen(sch, 1 - nb);
+ consume_skb(skb);
+ return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
+}
+
+static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ int ret;
+
+ if (qdisc_pkt_len(skb) > q->max_size) {
+ if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size)
+ return tbf_segment(skb, sch);
+ return qdisc_reshape_fail(skb, sch);
+ }
ret = qdisc_enqueue(skb, q->qdisc);
if (ret != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret))
@@ -134,12 +206,10 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
}
sch->q.qlen++;
- sch->bstats.bytes += qdisc_pkt_len(skb);
- sch->bstats.packets++;
return NET_XMIT_SUCCESS;
}
-static unsigned int tbf_drop(struct Qdisc* sch)
+static unsigned int tbf_drop(struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
unsigned int len = 0;
@@ -151,7 +221,12 @@ static unsigned int tbf_drop(struct Qdisc* sch)
return len;
}
-static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
+static bool tbf_peak_present(const struct tbf_sched_data *q)
+{
+ return q->peak.rate_bytes_ps;
+}
+
+static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
@@ -159,24 +234,24 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
skb = q->qdisc->ops->peek(q->qdisc);
if (skb) {
- psched_time_t now;
- long toks;
- long ptoks = 0;
+ s64 now;
+ s64 toks;
+ s64 ptoks = 0;
unsigned int len = qdisc_pkt_len(skb);
- now = psched_get_time();
- toks = psched_tdiff_bounded(now, q->t_c, q->buffer);
+ now = ktime_to_ns(ktime_get());
+ toks = min_t(s64, now - q->t_c, q->buffer);
- if (q->P_tab) {
+ if (tbf_peak_present(q)) {
ptoks = toks + q->ptokens;
- if (ptoks > (long)q->mtu)
+ if (ptoks > q->mtu)
ptoks = q->mtu;
- ptoks -= L2T_P(q, len);
+ ptoks -= (s64) psched_l2t_ns(&q->peak, len);
}
toks += q->tokens;
- if (toks > (long)q->buffer)
+ if (toks > q->buffer)
toks = q->buffer;
- toks -= L2T(q, len);
+ toks -= (s64) psched_l2t_ns(&q->rate, len);
if ((toks|ptoks) >= 0) {
skb = qdisc_dequeue_peeked(q->qdisc);
@@ -187,12 +262,13 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
q->tokens = toks;
q->ptokens = ptoks;
sch->q.qlen--;
- sch->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(sch);
+ qdisc_bstats_update(sch, skb);
return skb;
}
- qdisc_watchdog_schedule(&q->watchdog,
- now + max_t(long, -toks, -ptoks));
+ qdisc_watchdog_schedule_ns(&q->watchdog,
+ now + max_t(long, -toks, -ptoks));
/* Maybe we have a shorter packet in the queue,
which can be sent now. It sounds cool,
@@ -210,13 +286,13 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
return NULL;
}
-static void tbf_reset(struct Qdisc* sch)
+static void tbf_reset(struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
qdisc_reset(q->qdisc);
sch->q.qlen = 0;
- q->t_c = psched_get_time();
+ q->t_c = ktime_to_ns(ktime_get());
q->tokens = q->buffer;
q->ptokens = q->mtu;
qdisc_watchdog_cancel(&q->watchdog);
@@ -226,20 +302,26 @@ static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
[TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) },
[TCA_TBF_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
[TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+ [TCA_TBF_RATE64] = { .type = NLA_U64 },
+ [TCA_TBF_PRATE64] = { .type = NLA_U64 },
+ [TCA_TBF_BURST] = { .type = NLA_U32 },
+ [TCA_TBF_PBURST] = { .type = NLA_U32 },
};
-static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
+static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
{
int err;
struct tbf_sched_data *q = qdisc_priv(sch);
- struct nlattr *tb[TCA_TBF_PTAB + 1];
+ struct nlattr *tb[TCA_TBF_MAX + 1];
struct tc_tbf_qopt *qopt;
- struct qdisc_rate_table *rtab = NULL;
- struct qdisc_rate_table *ptab = NULL;
struct Qdisc *child = NULL;
- int max_size,n;
+ struct psched_ratecfg rate;
+ struct psched_ratecfg peak;
+ u64 max_size;
+ s64 buffer, mtu;
+ u64 rate64 = 0, prate64 = 0;
- err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy);
+ err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
if (err < 0)
return err;
@@ -248,30 +330,59 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
goto done;
qopt = nla_data(tb[TCA_TBF_PARMS]);
- rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
- if (rtab == NULL)
- goto done;
+ if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
+ qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
+ tb[TCA_TBF_RTAB]));
+
+ if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
+ qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
+ tb[TCA_TBF_PTAB]));
+
+ buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
+ mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);
+
+ if (tb[TCA_TBF_RATE64])
+ rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
+ psched_ratecfg_precompute(&rate, &qopt->rate, rate64);
+
+ if (tb[TCA_TBF_BURST]) {
+ max_size = nla_get_u32(tb[TCA_TBF_BURST]);
+ buffer = psched_l2t_ns(&rate, max_size);
+ } else {
+ max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
+ }
if (qopt->peakrate.rate) {
- if (qopt->peakrate.rate > qopt->rate.rate)
- ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
- if (ptab == NULL)
+ if (tb[TCA_TBF_PRATE64])
+ prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
+ psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
+ if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
+ pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
+ peak.rate_bytes_ps, rate.rate_bytes_ps);
+ err = -EINVAL;
goto done;
+ }
+
+ if (tb[TCA_TBF_PBURST]) {
+ u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
+ max_size = min_t(u32, max_size, pburst);
+ mtu = psched_l2t_ns(&peak, pburst);
+ } else {
+ max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
+ }
+ } else {
+ memset(&peak, 0, sizeof(peak));
}
- for (n = 0; n < 256; n++)
- if (rtab->data[n] > qopt->buffer) break;
- max_size = (n << qopt->rate.cell_log)-1;
- if (ptab) {
- int size;
+ if (max_size < psched_mtu(qdisc_dev(sch)))
+ pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
+ max_size, qdisc_dev(sch)->name,
+ psched_mtu(qdisc_dev(sch)));
- for (n = 0; n < 256; n++)
- if (ptab->data[n] > qopt->mtu) break;
- size = (n << qopt->peakrate.cell_log)-1;
- if (size < max_size) max_size = size;
- }
- if (max_size < 0)
+ if (!max_size) {
+ err = -EINVAL;
goto done;
+ }
if (q->qdisc != &noop_qdisc) {
err = fifo_set_limit(q->qdisc, qopt->limit);
@@ -292,33 +403,35 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
q->qdisc = child;
}
q->limit = qopt->limit;
- q->mtu = qopt->mtu;
+ if (tb[TCA_TBF_PBURST])
+ q->mtu = mtu;
+ else
+ q->mtu = PSCHED_TICKS2NS(qopt->mtu);
q->max_size = max_size;
- q->buffer = qopt->buffer;
+ if (tb[TCA_TBF_BURST])
+ q->buffer = buffer;
+ else
+ q->buffer = PSCHED_TICKS2NS(qopt->buffer);
q->tokens = q->buffer;
q->ptokens = q->mtu;
- swap(q->R_tab, rtab);
- swap(q->P_tab, ptab);
+ memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
+ memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
sch_tree_unlock(sch);
err = 0;
done:
- if (rtab)
- qdisc_put_rtab(rtab);
- if (ptab)
- qdisc_put_rtab(ptab);
return err;
}
-static int tbf_init(struct Qdisc* sch, struct nlattr *opt)
+static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
{
struct tbf_sched_data *q = qdisc_priv(sch);
if (opt == NULL)
return -EINVAL;
- q->t_c = psched_get_time();
+ q->t_c = ktime_to_ns(ktime_get());
qdisc_watchdog_init(&q->watchdog, sch);
q->qdisc = &noop_qdisc;
@@ -330,12 +443,6 @@ static void tbf_destroy(struct Qdisc *sch)
struct tbf_sched_data *q = qdisc_priv(sch);
qdisc_watchdog_cancel(&q->watchdog);
-
- if (q->P_tab)
- qdisc_put_rtab(q->P_tab);
- if (q->R_tab)
- qdisc_put_rtab(q->R_tab);
-
qdisc_destroy(q->qdisc);
}
@@ -345,22 +452,30 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
struct nlattr *nest;
struct tc_tbf_qopt opt;
+ sch->qstats.backlog = q->qdisc->qstats.backlog;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
opt.limit = q->limit;
- opt.rate = q->R_tab->rate;
- if (q->P_tab)
- opt.peakrate = q->P_tab->rate;
+ psched_ratecfg_getrate(&opt.rate, &q->rate);
+ if (tbf_peak_present(q))
+ psched_ratecfg_getrate(&opt.peakrate, &q->peak);
else
memset(&opt.peakrate, 0, sizeof(opt.peakrate));
- opt.mtu = q->mtu;
- opt.buffer = q->buffer;
- NLA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
+ opt.mtu = PSCHED_NS2TICKS(q->mtu);
+ opt.buffer = PSCHED_NS2TICKS(q->buffer);
+ if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
+ nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps))
+ goto nla_put_failure;
+ if (tbf_peak_present(q) &&
+ q->peak.rate_bytes_ps >= (1ULL << 32) &&
+ nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps))
+ goto nla_put_failure;
- nla_nest_end(skb, nest);
- return skb->len;
+ return nla_nest_end(skb, nest);
nla_put_failure:
nla_nest_cancel(skb, nest);
@@ -423,8 +538,7 @@ static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
}
}
-static const struct Qdisc_class_ops tbf_class_ops =
-{
+static const struct Qdisc_class_ops tbf_class_ops = {
.graft = tbf_graft,
.leaf = tbf_leaf,
.get = tbf_get,
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 106479a7c94..47416716294 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -53,48 +53,45 @@
which will not break load balancing, though native slave
traffic will have the highest priority. */
-struct teql_master
-{
+struct teql_master {
struct Qdisc_ops qops;
struct net_device *dev;
struct Qdisc *slaves;
struct list_head master_list;
+ unsigned long tx_bytes;
+ unsigned long tx_packets;
+ unsigned long tx_errors;
+ unsigned long tx_dropped;
};
-struct teql_sched_data
-{
+struct teql_sched_data {
struct Qdisc *next;
struct teql_master *m;
- struct neighbour *ncache;
struct sk_buff_head q;
};
-#define NEXT_SLAVE(q) (((struct teql_sched_data*)qdisc_priv(q))->next)
+#define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next)
-#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT)
+#define FMASK (IFF_BROADCAST | IFF_POINTOPOINT)
/* "teql*" qdisc routines */
static int
-teql_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct teql_sched_data *q = qdisc_priv(sch);
if (q->q.qlen < dev->tx_queue_len) {
__skb_queue_tail(&q->q, skb);
- sch->bstats.bytes += qdisc_pkt_len(skb);
- sch->bstats.packets++;
return NET_XMIT_SUCCESS;
}
- kfree_skb(skb);
- sch->qstats.drops++;
- return NET_XMIT_DROP;
+ return qdisc_drop(skb, sch);
}
static struct sk_buff *
-teql_dequeue(struct Qdisc* sch)
+teql_dequeue(struct Qdisc *sch)
{
struct teql_sched_data *dat = qdisc_priv(sch);
struct netdev_queue *dat_queue;
@@ -108,19 +105,21 @@ teql_dequeue(struct Qdisc* sch)
dat->m->slaves = sch;
netif_wake_queue(m);
}
+ } else {
+ qdisc_bstats_update(sch, skb);
}
sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen;
return skb;
}
static struct sk_buff *
-teql_peek(struct Qdisc* sch)
+teql_peek(struct Qdisc *sch)
{
/* teql is meant to be used as root qdisc */
return NULL;
}
-static __inline__ void
+static inline void
teql_neigh_release(struct neighbour *n)
{
if (n)
@@ -128,23 +127,23 @@ teql_neigh_release(struct neighbour *n)
}
static void
-teql_reset(struct Qdisc* sch)
+teql_reset(struct Qdisc *sch)
{
struct teql_sched_data *dat = qdisc_priv(sch);
skb_queue_purge(&dat->q);
sch->q.qlen = 0;
- teql_neigh_release(xchg(&dat->ncache, NULL));
}
static void
-teql_destroy(struct Qdisc* sch)
+teql_destroy(struct Qdisc *sch)
{
struct Qdisc *q, *prev;
struct teql_sched_data *dat = qdisc_priv(sch);
struct teql_master *master = dat->m;
- if ((prev = master->slaves) != NULL) {
+ prev = master->slaves;
+ if (prev) {
do {
q = NEXT_SLAVE(prev);
if (q == sch) {
@@ -165,7 +164,6 @@ teql_destroy(struct Qdisc* sch)
}
}
skb_queue_purge(&dat->q);
- teql_neigh_release(xchg(&dat->ncache, NULL));
break;
}
@@ -176,7 +174,7 @@ teql_destroy(struct Qdisc* sch)
static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
{
struct net_device *dev = qdisc_dev(sch);
- struct teql_master *m = (struct teql_master*)sch->ops;
+ struct teql_master *m = (struct teql_master *)sch->ops;
struct teql_sched_data *q = qdisc_priv(sch);
if (dev->hard_header_len > m->dev->hard_header_len)
@@ -222,23 +220,27 @@ static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
static int
-__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev)
+__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res,
+ struct net_device *dev, struct netdev_queue *txq,
+ struct dst_entry *dst)
{
- struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, 0);
- struct teql_sched_data *q = qdisc_priv(dev_queue->qdisc);
- struct neighbour *mn = skb_dst(skb)->neighbour;
- struct neighbour *n = q->ncache;
+ struct neighbour *n;
+ int err = 0;
- if (mn->tbl == NULL)
- return -EINVAL;
- if (n && n->tbl == mn->tbl &&
- memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) {
- atomic_inc(&n->refcnt);
- } else {
- n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev);
- if (IS_ERR(n))
- return PTR_ERR(n);
+ n = dst_neigh_lookup_skb(dst, skb);
+ if (!n)
+ return -ENOENT;
+
+ if (dst->dev != dev) {
+ struct neighbour *mn;
+
+ mn = __neigh_lookup_errno(n->tbl, n->primary_key, dev);
+ neigh_release(n);
+ if (IS_ERR(mn))
+ return PTR_ERR(mn);
+ n = mn;
}
+
if (neigh_event_send(n, skb_res) == 0) {
int err;
char haddr[MAX_ADDR_LEN];
@@ -247,35 +249,39 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *
err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr,
NULL, skb->len);
- if (err < 0) {
- neigh_release(n);
- return -EINVAL;
- }
- teql_neigh_release(xchg(&q->ncache, n));
- return 0;
+ if (err < 0)
+ err = -EINVAL;
+ } else {
+ err = (skb_res == NULL) ? -EAGAIN : 1;
}
neigh_release(n);
- return (skb_res == NULL) ? -EAGAIN : 1;
+ return err;
}
static inline int teql_resolve(struct sk_buff *skb,
- struct sk_buff *skb_res, struct net_device *dev)
+ struct sk_buff *skb_res,
+ struct net_device *dev,
+ struct netdev_queue *txq)
{
- struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
+ struct dst_entry *dst = skb_dst(skb);
+ int res;
+
if (txq->qdisc == &noop_qdisc)
return -ENODEV;
- if (dev->header_ops == NULL ||
- skb_dst(skb) == NULL ||
- skb_dst(skb)->neighbour == NULL)
+ if (!dev->header_ops || !dst)
return 0;
- return __teql_resolve(skb, skb_res, dev);
+
+ rcu_read_lock();
+ res = __teql_resolve(skb, skb_res, dev, txq, dst);
+ rcu_read_unlock();
+
+ return res;
}
static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct teql_master *master = netdev_priv(dev);
- struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
struct Qdisc *start, *q;
int busy;
int nores;
@@ -288,7 +294,8 @@ restart:
nores = 0;
busy = 0;
- if ((q = start) == NULL)
+ q = start;
+ if (!q)
goto drop;
do {
@@ -298,30 +305,30 @@ restart:
if (slave_txq->qdisc_sleeping != q)
continue;
- if (__netif_subqueue_stopped(slave, subq) ||
+ if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) ||
!netif_running(slave)) {
busy = 1;
continue;
}
- switch (teql_resolve(skb, skb_res, slave)) {
+ switch (teql_resolve(skb, skb_res, slave, slave_txq)) {
case 0:
if (__netif_tx_trylock(slave_txq)) {
unsigned int length = qdisc_pkt_len(skb);
- if (!netif_tx_queue_frozen_or_stopped(slave_txq) &&
+ if (!netif_xmit_frozen_or_stopped(slave_txq) &&
slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) {
txq_trans_update(slave_txq);
__netif_tx_unlock(slave_txq);
master->slaves = NEXT_SLAVE(q);
netif_wake_queue(dev);
- txq->tx_packets++;
- txq->tx_bytes += length;
+ master->tx_packets++;
+ master->tx_bytes += length;
return NETDEV_TX_OK;
}
__netif_tx_unlock(slave_txq);
}
- if (netif_queue_stopped(dev))
+ if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)))
busy = 1;
break;
case 1:
@@ -343,20 +350,20 @@ restart:
netif_stop_queue(dev);
return NETDEV_TX_BUSY;
}
- dev->stats.tx_errors++;
+ master->tx_errors++;
drop:
- txq->tx_dropped++;
+ master->tx_dropped++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
static int teql_master_open(struct net_device *dev)
{
- struct Qdisc * q;
+ struct Qdisc *q;
struct teql_master *m = netdev_priv(dev);
int mtu = 0xFFFE;
- unsigned flags = IFF_NOARP|IFF_MULTICAST;
+ unsigned int flags = IFF_NOARP | IFF_MULTICAST;
if (m->slaves == NULL)
return -EUNATCH;
@@ -399,6 +406,18 @@ static int teql_master_close(struct net_device *dev)
return 0;
}
+static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *stats)
+{
+ struct teql_master *m = netdev_priv(dev);
+
+ stats->tx_packets = m->tx_packets;
+ stats->tx_bytes = m->tx_bytes;
+ stats->tx_errors = m->tx_errors;
+ stats->tx_dropped = m->tx_dropped;
+ return stats;
+}
+
static int teql_master_mtu(struct net_device *dev, int new_mtu)
{
struct teql_master *m = netdev_priv(dev);
@@ -412,7 +431,7 @@ static int teql_master_mtu(struct net_device *dev, int new_mtu)
do {
if (new_mtu > qdisc_dev(q)->mtu)
return -EINVAL;
- } while ((q=NEXT_SLAVE(q)) != m->slaves);
+ } while ((q = NEXT_SLAVE(q)) != m->slaves);
}
dev->mtu = new_mtu;
@@ -423,6 +442,7 @@ static const struct net_device_ops teql_netdev_ops = {
.ndo_open = teql_master_open,
.ndo_stop = teql_master_close,
.ndo_start_xmit = teql_master_xmit,
+ .ndo_get_stats64 = teql_master_stats64,
.ndo_change_mtu = teql_master_mtu,
};