diff options
Diffstat (limited to 'net/sched')
55 files changed, 7295 insertions, 2420 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2590e91b328..a1a8e29e5fc 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -250,6 +250,64 @@ config NET_SCH_QFQ If unsure, say N. +config NET_SCH_CODEL + tristate "Controlled Delay AQM (CODEL)" + help + Say Y here if you want to use the Controlled Delay (CODEL) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_codel. + + If unsure, say N. + +config NET_SCH_FQ_CODEL + tristate "Fair Queue Controlled Delay AQM (FQ_CODEL)" + help + Say Y here if you want to use the FQ Controlled Delay (FQ_CODEL) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_fq_codel. + + If unsure, say N. + +config NET_SCH_FQ + tristate "Fair Queue" + help + Say Y here if you want to use the FQ packet scheduling algorithm. + + FQ does flow separation, and is able to respect pacing requirements + set by TCP stack into sk->sk_pacing_rate (for localy generated + traffic) + + To compile this driver as a module, choose M here: the module + will be called sch_fq. + + If unsure, say N. + +config NET_SCH_HHF + tristate "Heavy-Hitter Filter (HHF)" + help + Say Y here if you want to use the Heavy-Hitter Filter (HHF) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_hhf. + +config NET_SCH_PIE + tristate "Proportional Integral controller Enhanced (PIE) scheduler" + help + Say Y here if you want to use the Proportional Integral controller + Enhanced scheduler packet scheduling algorithm. + For more information, please see + http://tools.ietf.org/html/draft-pan-tsvwg-pie-00 + + To compile this driver as a module, choose M here: the module + will be called sch_pie. + + If unsure, say N. + config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT @@ -260,6 +318,32 @@ config NET_SCH_INGRESS To compile this code as a module, choose M here: the module will be called sch_ingress. +config NET_SCH_PLUG + tristate "Plug network traffic until release (PLUG)" + ---help--- + + This queuing discipline allows userspace to plug/unplug a network + output queue, using the netlink interface. When it receives an + enqueue command it inserts a plug into the outbound queue that + causes following packets to enqueue until a dequeue command arrives + over netlink, causing the plug to be removed and resuming the normal + packet flow. + + This module also provides a generic "network output buffering" + functionality (aka output commit), wherein upon arrival of a dequeue + command, only packets up to the first plug are released for delivery. + The Remus HA project uses this module to enable speculative execution + of virtual machines by allowing the generated network output to be rolled + back if needed. + + For more information, please refer to http://wiki.xensource.com/xenwiki/Remus + + Say Y here if you are using this kernel for Xen dom0 and + want to protect Xen guests with Remus. + + To compile this code as a module, choose M here: the + module will be called sch_plug. + comment "Classification" config NET_CLS @@ -373,6 +457,7 @@ config NET_CLS_FLOW config NET_CLS_CGROUP tristate "Control Group Classifier" select NET_CLS + select CGROUP_NET_CLASSID depends on CGROUPS ---help--- Say Y here if you want to classify packets based on the control @@ -381,6 +466,16 @@ config NET_CLS_CGROUP To compile this code as a module, choose M here: the module will be called cls_cgroup. +config NET_CLS_BPF + tristate "BPF-based classifier" + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets based on + programmable BPF (JIT'ed) filters as an alternative to ematches. + + To compile this code as a module, choose M here: the module will + be called cls_bpf. + config NET_EMATCH bool "Extended Matches" select NET_CLS @@ -459,6 +554,26 @@ config NET_EMATCH_TEXT To compile this code as a module, choose M here: the module will be called em_text. +config NET_EMATCH_CANID + tristate "CAN Identifier" + depends on NET_EMATCH && (CAN=y || CAN=m) + ---help--- + Say Y here if you want to be able to classify CAN frames based + on CAN Identifier. + + To compile this code as a module, choose M here: the + module will be called em_canid. + +config NET_EMATCH_IPSET + tristate "IPset" + depends on NET_EMATCH && IP_SET + ---help--- + Say Y here if you want to be able to classify packets based on + ipset membership. + + To compile this code as a module, choose M here: the + module will be called em_ipset. + config NET_CLS_ACT bool "Actions" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index dc5889c0a15..0a869a11f3e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -33,9 +33,15 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o +obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o +obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o +obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o +obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o +obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o +obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o @@ -46,9 +52,12 @@ obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o +obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o obj-$(CONFIG_NET_EMATCH) += ematch.o obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o obj-$(CONFIG_NET_EMATCH_META) += em_meta.o obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o +obj-$(CONFIG_NET_EMATCH_CANID) += em_canid.o +obj-$(CONFIG_NET_EMATCH_IPSET) += em_ipset.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 93fdf131bd7..648778aef1a 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -27,42 +27,40 @@ #include <net/act_api.h> #include <net/netlink.h> -void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo) +void tcf_hash_destroy(struct tc_action *a) { - unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); - struct tcf_common **p1p; - - for (p1p = &hinfo->htab[h]; *p1p; p1p = &(*p1p)->tcfc_next) { - if (*p1p == p) { - write_lock_bh(hinfo->lock); - *p1p = p->tcfc_next; - write_unlock_bh(hinfo->lock); - gen_kill_estimator(&p->tcfc_bstats, - &p->tcfc_rate_est); - /* - * gen_estimator est_timer() might access p->tcfc_lock - * or bstats, wait a RCU grace period before freeing p - */ - kfree_rcu(p, tcfc_rcu); - return; - } - } - WARN_ON(1); + struct tcf_common *p = a->priv; + struct tcf_hashinfo *hinfo = a->ops->hinfo; + + spin_lock_bh(&hinfo->lock); + hlist_del(&p->tcfc_head); + spin_unlock_bh(&hinfo->lock); + gen_kill_estimator(&p->tcfc_bstats, + &p->tcfc_rate_est); + /* + * gen_estimator est_timer() might access p->tcfc_lock + * or bstats, wait a RCU grace period before freeing p + */ + kfree_rcu(p, tcfc_rcu); } EXPORT_SYMBOL(tcf_hash_destroy); -int tcf_hash_release(struct tcf_common *p, int bind, - struct tcf_hashinfo *hinfo) +int tcf_hash_release(struct tc_action *a, int bind) { + struct tcf_common *p = a->priv; int ret = 0; if (p) { if (bind) p->tcfc_bindcnt--; + else if (p->tcfc_bindcnt > 0) + return -EPERM; p->tcfc_refcnt--; if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) { - tcf_hash_destroy(p, hinfo); + if (a->ops->cleanup) + a->ops->cleanup(a, bind); + tcf_hash_destroy(a); ret = 1; } } @@ -71,20 +69,22 @@ int tcf_hash_release(struct tcf_common *p, int bind, EXPORT_SYMBOL(tcf_hash_release); static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, - struct tc_action *a, struct tcf_hashinfo *hinfo) + struct tc_action *a) { + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct hlist_head *head; struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; struct nlattr *nest; - read_lock_bh(hinfo->lock); + spin_lock_bh(&hinfo->lock); s_i = cb->args[0]; for (i = 0; i < (hinfo->hmask + 1); i++) { - p = hinfo->htab[tcf_hash(i, hinfo->hmask)]; + head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; - for (; p; p = p->tcfc_next) { + hlist_for_each_entry_rcu(p, head, tcfc_head) { index++; if (index < s_i) continue; @@ -107,7 +107,7 @@ static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, } } done: - read_unlock_bh(hinfo->lock); + spin_unlock_bh(&hinfo->lock); if (n_i) cb->args[0] += n_i; return n_i; @@ -117,79 +117,82 @@ nla_put_failure: goto done; } -static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a, - struct tcf_hashinfo *hinfo) +static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a) { - struct tcf_common *p, *s_p; + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct hlist_head *head; + struct hlist_node *n; + struct tcf_common *p; struct nlattr *nest; int i = 0, n_i = 0; + int ret = -EINVAL; nest = nla_nest_start(skb, a->order); if (nest == NULL) goto nla_put_failure; - NLA_PUT_STRING(skb, TCA_KIND, a->ops->kind); + if (nla_put_string(skb, TCA_KIND, a->ops->kind)) + goto nla_put_failure; for (i = 0; i < (hinfo->hmask + 1); i++) { - p = hinfo->htab[tcf_hash(i, hinfo->hmask)]; - - while (p != NULL) { - s_p = p->tcfc_next; - if (ACT_P_DELETED == tcf_hash_release(p, 0, hinfo)) + head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; + hlist_for_each_entry_safe(p, n, head, tcfc_head) { + a->priv = p; + ret = tcf_hash_release(a, 0); + if (ret == ACT_P_DELETED) { module_put(a->ops->owner); - n_i++; - p = s_p; + n_i++; + } else if (ret < 0) + goto nla_put_failure; } } - NLA_PUT_U32(skb, TCA_FCNT, n_i); + if (nla_put_u32(skb, TCA_FCNT, n_i)) + goto nla_put_failure; nla_nest_end(skb, nest); return n_i; nla_put_failure: nla_nest_cancel(skb, nest); - return -EINVAL; + return ret; } -int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, - int type, struct tc_action *a) +static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, + int type, struct tc_action *a) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; - if (type == RTM_DELACTION) { - return tcf_del_walker(skb, a, hinfo); + return tcf_del_walker(skb, a); } else if (type == RTM_GETACTION) { - return tcf_dump_walker(skb, cb, a, hinfo); + return tcf_dump_walker(skb, cb, a); } else { WARN(1, "tcf_generic_walker: unknown action %d\n", type); return -EINVAL; } } -EXPORT_SYMBOL(tcf_generic_walker); -struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) +static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) { - struct tcf_common *p; + struct tcf_common *p = NULL; + struct hlist_head *head; - read_lock_bh(hinfo->lock); - for (p = hinfo->htab[tcf_hash(index, hinfo->hmask)]; p; - p = p->tcfc_next) { + spin_lock_bh(&hinfo->lock); + head = &hinfo->htab[tcf_hash(index, hinfo->hmask)]; + hlist_for_each_entry_rcu(p, head, tcfc_head) if (p->tcfc_index == index) break; - } - read_unlock_bh(hinfo->lock); + spin_unlock_bh(&hinfo->lock); return p; } -EXPORT_SYMBOL(tcf_hash_lookup); -u32 tcf_hash_new_index(u32 *idx_gen, struct tcf_hashinfo *hinfo) +u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo) { - u32 val = *idx_gen; + u32 val = hinfo->index; do { if (++val == 0) val = 1; } while (tcf_hash_lookup(val, hinfo)); - return (*idx_gen = val); + hinfo->index = val; + return val; } EXPORT_SYMBOL(tcf_hash_new_index); @@ -206,34 +209,46 @@ int tcf_hash_search(struct tc_action *a, u32 index) } EXPORT_SYMBOL(tcf_hash_search); -struct tcf_common *tcf_hash_check(u32 index, struct tc_action *a, int bind, - struct tcf_hashinfo *hinfo) +int tcf_hash_check(u32 index, struct tc_action *a, int bind) { + struct tcf_hashinfo *hinfo = a->ops->hinfo; struct tcf_common *p = NULL; if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) { if (bind) p->tcfc_bindcnt++; p->tcfc_refcnt++; a->priv = p; + return 1; } - return p; + return 0; } EXPORT_SYMBOL(tcf_hash_check); -struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est, - struct tc_action *a, int size, int bind, - u32 *idx_gen, struct tcf_hashinfo *hinfo) +void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) { + struct tcf_common *pc = a->priv; + if (est) + gen_kill_estimator(&pc->tcfc_bstats, + &pc->tcfc_rate_est); + kfree_rcu(pc, tcfc_rcu); +} +EXPORT_SYMBOL(tcf_hash_cleanup); + +int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, + int size, int bind) +{ + struct tcf_hashinfo *hinfo = a->ops->hinfo; struct tcf_common *p = kzalloc(size, GFP_KERNEL); if (unlikely(!p)) - return ERR_PTR(-ENOMEM); + return -ENOMEM; p->tcfc_refcnt = 1; if (bind) p->tcfc_bindcnt = 1; spin_lock_init(&p->tcfc_lock); - p->tcfc_index = index ? index : tcf_hash_new_index(idx_gen, hinfo); + INIT_HLIST_NODE(&p->tcfc_head); + p->tcfc_index = index ? index : tcf_hash_new_index(hinfo); p->tcfc_tm.install = jiffies; p->tcfc_tm.lastuse = jiffies; if (est) { @@ -241,42 +256,64 @@ struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est, &p->tcfc_lock, est); if (err) { kfree(p); - return ERR_PTR(err); + return err; } } a->priv = (void *) p; - return p; + return 0; } EXPORT_SYMBOL(tcf_hash_create); -void tcf_hash_insert(struct tcf_common *p, struct tcf_hashinfo *hinfo) +void tcf_hash_insert(struct tc_action *a) { + struct tcf_common *p = a->priv; + struct tcf_hashinfo *hinfo = a->ops->hinfo; unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); - write_lock_bh(hinfo->lock); - p->tcfc_next = hinfo->htab[h]; - hinfo->htab[h] = p; - write_unlock_bh(hinfo->lock); + spin_lock_bh(&hinfo->lock); + hlist_add_head(&p->tcfc_head, &hinfo->htab[h]); + spin_unlock_bh(&hinfo->lock); } EXPORT_SYMBOL(tcf_hash_insert); -static struct tc_action_ops *act_base = NULL; +static LIST_HEAD(act_base); static DEFINE_RWLOCK(act_mod_lock); -int tcf_register_action(struct tc_action_ops *act) +int tcf_register_action(struct tc_action_ops *act, unsigned int mask) { - struct tc_action_ops *a, **ap; + struct tc_action_ops *a; + int err; + + /* Must supply act, dump and init */ + if (!act->act || !act->dump || !act->init) + return -EINVAL; + + /* Supply defaults */ + if (!act->lookup) + act->lookup = tcf_hash_search; + if (!act->walk) + act->walk = tcf_generic_walker; + + act->hinfo = kmalloc(sizeof(struct tcf_hashinfo), GFP_KERNEL); + if (!act->hinfo) + return -ENOMEM; + err = tcf_hashinfo_init(act->hinfo, mask); + if (err) { + kfree(act->hinfo); + return err; + } write_lock(&act_mod_lock); - for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) { + list_for_each_entry(a, &act_base, head) { if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { write_unlock(&act_mod_lock); + tcf_hashinfo_destroy(act->hinfo); + kfree(act->hinfo); return -EEXIST; } } - act->next = NULL; - *ap = act; + list_add_tail(&act->head, &act_base); write_unlock(&act_mod_lock); return 0; } @@ -284,17 +321,18 @@ EXPORT_SYMBOL(tcf_register_action); int tcf_unregister_action(struct tc_action_ops *act) { - struct tc_action_ops *a, **ap; + struct tc_action_ops *a; int err = -ENOENT; write_lock(&act_mod_lock); - for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) - if (a == act) + list_for_each_entry(a, &act_base, head) { + if (a == act) { + list_del(&act->head); + tcf_hashinfo_destroy(act->hinfo); + kfree(act->hinfo); + err = 0; break; - if (a) { - *ap = a->next; - a->next = NULL; - err = 0; + } } write_unlock(&act_mod_lock); return err; @@ -304,69 +342,42 @@ EXPORT_SYMBOL(tcf_unregister_action); /* lookup by name */ static struct tc_action_ops *tc_lookup_action_n(char *kind) { - struct tc_action_ops *a = NULL; + struct tc_action_ops *a, *res = NULL; if (kind) { read_lock(&act_mod_lock); - for (a = act_base; a; a = a->next) { + list_for_each_entry(a, &act_base, head) { if (strcmp(kind, a->kind) == 0) { - if (!try_module_get(a->owner)) { - read_unlock(&act_mod_lock); - return NULL; - } + if (try_module_get(a->owner)) + res = a; break; } } read_unlock(&act_mod_lock); } - return a; + return res; } /* lookup by nlattr */ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind) { - struct tc_action_ops *a = NULL; + struct tc_action_ops *a, *res = NULL; if (kind) { read_lock(&act_mod_lock); - for (a = act_base; a; a = a->next) { + list_for_each_entry(a, &act_base, head) { if (nla_strcmp(kind, a->kind) == 0) { - if (!try_module_get(a->owner)) { - read_unlock(&act_mod_lock); - return NULL; - } + if (try_module_get(a->owner)) + res = a; break; } } read_unlock(&act_mod_lock); } - return a; + return res; } -#if 0 -/* lookup by id */ -static struct tc_action_ops *tc_lookup_action_id(u32 type) -{ - struct tc_action_ops *a = NULL; - - if (type) { - read_lock(&act_mod_lock); - for (a = act_base; a; a = a->next) { - if (a->type == type) { - if (!try_module_get(a->owner)) { - read_unlock(&act_mod_lock); - return NULL; - } - break; - } - } - read_unlock(&act_mod_lock); - } - return a; -} -#endif - -int tcf_action_exec(struct sk_buff *skb, const struct tc_action *act, +int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions, struct tcf_result *res) { const struct tc_action *a; @@ -377,53 +388,44 @@ int tcf_action_exec(struct sk_buff *skb, const struct tc_action *act, ret = TC_ACT_OK; goto exec_done; } - while ((a = act) != NULL) { + list_for_each_entry(a, actions, list) { repeat: - if (a->ops && a->ops->act) { - ret = a->ops->act(skb, a, res); - if (TC_MUNGED & skb->tc_verd) { - /* copied already, allow trampling */ - skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); - skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); - } - if (ret == TC_ACT_REPEAT) - goto repeat; /* we need a ttl - JHS */ - if (ret != TC_ACT_PIPE) - goto exec_done; + ret = a->ops->act(skb, a, res); + if (TC_MUNGED & skb->tc_verd) { + /* copied already, allow trampling */ + skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); + skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); } - act = a->next; + if (ret == TC_ACT_REPEAT) + goto repeat; /* we need a ttl - JHS */ + if (ret != TC_ACT_PIPE) + goto exec_done; } exec_done: return ret; } EXPORT_SYMBOL(tcf_action_exec); -void tcf_action_destroy(struct tc_action *act, int bind) +int tcf_action_destroy(struct list_head *actions, int bind) { - struct tc_action *a; + struct tc_action *a, *tmp; + int ret = 0; - for (a = act; a; a = act) { - if (a->ops && a->ops->cleanup) { - if (a->ops->cleanup(a, bind) == ACT_P_DELETED) - module_put(a->ops->owner); - act = act->next; - kfree(a); - } else { - /*FIXME: Remove later - catch insertion bugs*/ - WARN(1, "tcf_action_destroy: BUG? destroying NULL ops\n"); - act = act->next; - kfree(a); - } + list_for_each_entry_safe(a, tmp, actions, list) { + ret = tcf_hash_release(a, bind); + if (ret == ACT_P_DELETED) + module_put(a->ops->owner); + else if (ret < 0) + return ret; + list_del(&a->list); + kfree(a); } + return ret; } int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - int err = -EINVAL; - - if (a->ops == NULL || a->ops->dump == NULL) - return err; return a->ops->dump(skb, a, bind, ref); } @@ -434,10 +436,8 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; - if (a->ops == NULL || a->ops->dump == NULL) - return err; - - NLA_PUT_STRING(skb, TCA_KIND, a->ops->kind); + if (nla_put_string(skb, TCA_KIND, a->ops->kind)) + goto nla_put_failure; if (tcf_action_copy_stats(skb, a, 0)) goto nla_put_failure; nest = nla_nest_start(skb, TCA_OPTIONS); @@ -456,14 +456,13 @@ nla_put_failure: EXPORT_SYMBOL(tcf_action_dump_1); int -tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) +tcf_action_dump(struct sk_buff *skb, struct list_head *actions, int bind, int ref) { struct tc_action *a; int err = -EINVAL; struct nlattr *nest; - while ((a = act) != NULL) { - act = a->next; + list_for_each_entry(a, actions, list) { nest = nla_nest_start(skb, a->order); if (nest == NULL) goto nla_put_failure; @@ -482,8 +481,9 @@ errout: return err; } -struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind) +struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, + struct nlattr *est, char *name, int ovr, + int bind) { struct tc_action *a; struct tc_action_ops *a_o; @@ -537,11 +537,13 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est, if (a == NULL) goto err_mod; + a->ops = a_o; + INIT_LIST_HEAD(&a->list); /* backward compatibility for policer */ if (name == NULL) - err = a_o->init(tb[TCA_ACT_OPTIONS], est, a, ovr, bind); + err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, a, ovr, bind); else - err = a_o->init(nla, est, a, ovr, bind); + err = a_o->init(net, nla, est, a, ovr, bind); if (err < 0) goto err_free; @@ -551,7 +553,6 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est, */ if (err != ACT_P_CREATED) module_put(a_o->owner); - a->ops = a_o; return a; @@ -563,36 +564,33 @@ err_out: return ERR_PTR(err); } -struct tc_action *tcf_action_init(struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind) +int tcf_action_init(struct net *net, struct nlattr *nla, + struct nlattr *est, char *name, int ovr, + int bind, struct list_head *actions) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; - struct tc_action *head = NULL, *act, *act_prev = NULL; + struct tc_action *act; int err; int i; err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); if (err < 0) - return ERR_PTR(err); + return err; for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_init_1(tb[i], est, name, ovr, bind); - if (IS_ERR(act)) + act = tcf_action_init_1(net, tb[i], est, name, ovr, bind); + if (IS_ERR(act)) { + err = PTR_ERR(act); goto err; + } act->order = i; - - if (head == NULL) - head = act; - else - act_prev->next = act; - act_prev = act; + list_add_tail(&act->list, actions); } - return head; + return 0; err: - if (head != NULL) - tcf_action_destroy(head, bind); - return act; + tcf_action_destroy(actions, bind); + return err; } int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, @@ -600,9 +598,9 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, { int err = 0; struct gnet_dump d; - struct tcf_act_hdr *h = a->priv; + struct tcf_common *p = a->priv; - if (h == NULL) + if (p == NULL) goto errout; /* compat_mode being true specifies a call that is supposed @@ -611,24 +609,20 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (compat_mode) { if (a->type == TCA_OLD_COMPAT) err = gnet_stats_start_copy_compat(skb, 0, - TCA_STATS, TCA_XSTATS, &h->tcf_lock, &d); + TCA_STATS, TCA_XSTATS, &p->tcfc_lock, &d); else return 0; } else err = gnet_stats_start_copy(skb, TCA_ACT_STATS, - &h->tcf_lock, &d); + &p->tcfc_lock, &d); if (err < 0) goto errout; - if (a->ops != NULL && a->ops->get_stats != NULL) - if (a->ops->get_stats(skb, a) < 0) - goto errout; - - if (gnet_stats_copy_basic(&d, &h->tcf_bstats) < 0 || - gnet_stats_copy_rate_est(&d, &h->tcf_bstats, - &h->tcf_rate_est) < 0 || - gnet_stats_copy_queue(&d, &h->tcf_qstats) < 0) + if (gnet_stats_copy_basic(&d, &p->tcfc_bstats) < 0 || + gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, + &p->tcfc_rate_est) < 0 || + gnet_stats_copy_queue(&d, &p->tcfc_qstats) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) @@ -641,7 +635,7 @@ errout: } static int -tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, +tca_get_fill(struct sk_buff *skb, struct list_head *actions, u32 portid, u32 seq, u16 flags, int event, int bind, int ref) { struct tcamsg *t; @@ -649,50 +643,64 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; - nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); - - t = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*t), flags); + if (!nlh) + goto out_nlmsg_trim; + t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; nest = nla_nest_start(skb, TCA_ACT_TAB); if (nest == NULL) - goto nla_put_failure; + goto out_nlmsg_trim; - if (tcf_action_dump(skb, a, bind, ref) < 0) - goto nla_put_failure; + if (tcf_action_dump(skb, actions, bind, ref) < 0) + goto out_nlmsg_trim; nla_nest_end(skb, nest); nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; -nla_put_failure: -nlmsg_failure: +out_nlmsg_trim: nlmsg_trim(skb, b); return -1; } static int -act_get_notify(struct net *net, u32 pid, struct nlmsghdr *n, - struct tc_action *a, int event) +act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, + struct list_head *actions, int event) { struct sk_buff *skb; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (tca_get_fill(skb, a, pid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { + if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { kfree_skb(skb); return -EINVAL; } - return rtnl_unicast(skb, net, pid); + return rtnl_unicast(skb, net, portid); +} + +static struct tc_action *create_a(int i) +{ + struct tc_action *act; + + act = kzalloc(sizeof(*act), GFP_KERNEL); + if (act == NULL) { + pr_debug("create_a: failed to alloc!\n"); + return NULL; + } + act->order = i; + INIT_LIST_HEAD(&act->list); + return act; } static struct tc_action * -tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 pid) +tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid) { struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action *a; @@ -710,16 +718,14 @@ tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 pid) index = nla_get_u32(tb[TCA_ACT_INDEX]); err = -ENOMEM; - a = kzalloc(sizeof(struct tc_action), GFP_KERNEL); + a = create_a(0); if (a == NULL) goto err_out; err = -EINVAL; a->ops = tc_lookup_action(tb[TCA_ACT_KIND]); - if (a->ops == NULL) + if (a->ops == NULL) /* could happen in batch of actions */ goto err_free; - if (a->ops->lookup == NULL) - goto err_mod; err = -ENOENT; if (a->ops->lookup(a, index) == 0) goto err_mod; @@ -735,31 +741,18 @@ err_out: return ERR_PTR(err); } -static void cleanup_a(struct tc_action *act) +static void cleanup_a(struct list_head *actions) { - struct tc_action *a; + struct tc_action *a, *tmp; - for (a = act; a; a = act) { - act = a->next; + list_for_each_entry_safe(a, tmp, actions, list) { + list_del(&a->list); kfree(a); } } -static struct tc_action *create_a(int i) -{ - struct tc_action *act; - - act = kzalloc(sizeof(*act), GFP_KERNEL); - if (act == NULL) { - pr_debug("create_a: failed to alloc!\n"); - return NULL; - } - act->order = i; - return act; -} - static int tca_action_flush(struct net *net, struct nlattr *nla, - struct nlmsghdr *n, u32 pid) + struct nlmsghdr *n, u32 portid) { struct sk_buff *skb; unsigned char *b; @@ -769,18 +762,12 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, struct nlattr *nest; struct nlattr *tb[TCA_ACT_MAX + 1]; struct nlattr *kind; - struct tc_action *a = create_a(0); + struct tc_action a; int err = -ENOMEM; - if (a == NULL) { - pr_debug("tca_action_flush: couldnt create tc_action\n"); - return err; - } - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { pr_debug("tca_action_flush: failed skb alloc\n"); - kfree(a); return err; } @@ -792,23 +779,27 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, err = -EINVAL; kind = tb[TCA_ACT_KIND]; - a->ops = tc_lookup_action(kind); - if (a->ops == NULL) + memset(&a, 0, sizeof(struct tc_action)); + INIT_LIST_HEAD(&a.list); + a.ops = tc_lookup_action(kind); + if (a.ops == NULL) /*some idjot trying to flush unknown action */ goto err_out; - nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t)); - t = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0); + if (!nlh) + goto out_module_put; + t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; nest = nla_nest_start(skb, TCA_ACT_TAB); if (nest == NULL) - goto nla_put_failure; + goto out_module_put; - err = a->ops->walk(skb, &dcb, RTM_DELACTION, a); + err = a.ops->walk(skb, &dcb, RTM_DELACTION, &a); if (err < 0) - goto nla_put_failure; + goto out_module_put; if (err == 0) goto noflush_out; @@ -816,32 +807,61 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, nlh->nlmsg_len = skb_tail_pointer(skb) - b; nlh->nlmsg_flags |= NLM_F_ROOT; - module_put(a->ops->owner); - kfree(a); - err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, + module_put(a.ops->owner); + err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (err > 0) return 0; return err; -nla_put_failure: -nlmsg_failure: - module_put(a->ops->owner); +out_module_put: + module_put(a.ops->owner); err_out: noflush_out: kfree_skb(skb); - kfree(a); return err; } static int +tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, + u32 portid) +{ + int ret; + struct sk_buff *skb; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, + 0, 1) <= 0) { + kfree_skb(skb); + return -EINVAL; + } + + /* now do the delete */ + ret = tcf_action_destroy(actions, 0); + if (ret < 0) { + kfree_skb(skb); + return ret; + } + + ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); + if (ret > 0) + return 0; + return ret; +} + +static int tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, - u32 pid, int event) + u32 portid, int event) { int i, ret; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; - struct tc_action *head = NULL, *act, *act_prev = NULL; + struct tc_action *act; + LIST_HEAD(actions); ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); if (ret < 0) @@ -849,139 +869,88 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) { if (tb[1] != NULL) - return tca_action_flush(net, tb[1], n, pid); + return tca_action_flush(net, tb[1], n, portid); else return -EINVAL; } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_get_1(tb[i], n, pid); + act = tcf_action_get_1(tb[i], n, portid); if (IS_ERR(act)) { ret = PTR_ERR(act); goto err; } act->order = i; - - if (head == NULL) - head = act; - else - act_prev->next = act; - act_prev = act; + list_add_tail(&act->list, &actions); } if (event == RTM_GETACTION) - ret = act_get_notify(net, pid, n, head, event); + ret = act_get_notify(net, portid, n, &actions, event); else { /* delete */ - struct sk_buff *skb; - - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) { - ret = -ENOBUFS; - goto err; - } - - if (tca_get_fill(skb, head, pid, n->nlmsg_seq, 0, event, - 0, 1) <= 0) { - kfree_skb(skb); - ret = -EINVAL; + ret = tcf_del_notify(net, n, &actions, portid); + if (ret) goto err; - } - - /* now do the delete */ - tcf_action_destroy(head, 0); - ret = rtnetlink_send(skb, net, pid, RTNLGRP_TC, - n->nlmsg_flags & NLM_F_ECHO); - if (ret > 0) - return 0; return ret; } err: - cleanup_a(head); + cleanup_a(&actions); return ret; } -static int tcf_add_notify(struct net *net, struct tc_action *a, - u32 pid, u32 seq, int event, u16 flags) +static int +tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, + u32 portid) { - struct tcamsg *t; - struct nlmsghdr *nlh; struct sk_buff *skb; - struct nlattr *nest; - unsigned char *b; int err = 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - b = skb_tail_pointer(skb); - - nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); - t = NLMSG_DATA(nlh); - t->tca_family = AF_UNSPEC; - t->tca__pad1 = 0; - t->tca__pad2 = 0; - - nest = nla_nest_start(skb, TCA_ACT_TAB); - if (nest == NULL) - goto nla_put_failure; - - if (tcf_action_dump(skb, a, 0, 0) < 0) - goto nla_put_failure; - - nla_nest_end(skb, nest); - - nlh->nlmsg_len = skb_tail_pointer(skb) - b; - NETLINK_CB(skb).dst_group = RTNLGRP_TC; + if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, + RTM_NEWACTION, 0, 0) <= 0) { + kfree_skb(skb); + return -EINVAL; + } - err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, flags & NLM_F_ECHO); + err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); if (err > 0) err = 0; return err; - -nla_put_failure: -nlmsg_failure: - kfree_skb(skb); - return -1; } - static int tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, - u32 pid, int ovr) + u32 portid, int ovr) { int ret = 0; - struct tc_action *act; - struct tc_action *a; - u32 seq = n->nlmsg_seq; + LIST_HEAD(actions); - act = tcf_action_init(nla, NULL, NULL, ovr, 0); - if (act == NULL) - goto done; - if (IS_ERR(act)) { - ret = PTR_ERR(act); + ret = tcf_action_init(net, nla, NULL, NULL, ovr, 0, &actions); + if (ret) goto done; - } /* dump then free all the actions after update; inserted policy * stays intact */ - ret = tcf_add_notify(net, act, pid, seq, RTM_NEWACTION, n->nlmsg_flags); - for (a = act; a; a = act) { - act = a->next; - kfree(a); - } + ret = tcf_add_notify(net, n, &actions, portid); + cleanup_a(&actions); done: return ret; } -static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_ACT_MAX + 1]; - u32 pid = skb ? NETLINK_CB(skb).pid : 0; + u32 portid = skb ? NETLINK_CB(skb).portid : 0; int ret = 0, ovr = 0; + if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL); if (ret < 0) return ret; @@ -1003,17 +972,17 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) if (n->nlmsg_flags & NLM_F_REPLACE) ovr = 1; replay: - ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, pid, ovr); + ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr); if (ret == -EAGAIN) goto replay; break; case RTM_DELACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, - pid, RTM_DELACTION); + portid, RTM_DELACTION); break; case RTM_GETACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, - pid, RTM_GETACTION); + portid, RTM_GETACTION); break; default: BUG(); @@ -1059,7 +1028,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) struct tc_action_ops *a_o; struct tc_action a; int ret = 0; - struct tcamsg *t = (struct tcamsg *) NLMSG_DATA(cb->nlh); + struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh); struct nlattr *kind = find_dump_kind(cb->nlh); if (kind == NULL) { @@ -1074,26 +1043,22 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) memset(&a, 0, sizeof(struct tc_action)); a.ops = a_o; - if (a_o->walk == NULL) { - WARN(1, "tc_dump_action: %s !capable of dumping table\n", - a_o->kind); - goto nla_put_failure; - } - - nlh = NLMSG_PUT(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, - cb->nlh->nlmsg_type, sizeof(*t)); - t = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*t), 0); + if (!nlh) + goto out_module_put; + t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; nest = nla_nest_start(skb, TCA_ACT_TAB); if (nest == NULL) - goto nla_put_failure; + goto out_module_put; ret = a_o->walk(skb, cb, RTM_GETACTION, &a); if (ret < 0) - goto nla_put_failure; + goto out_module_put; if (ret > 0) { nla_nest_end(skb, nest); @@ -1102,13 +1067,12 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) nla_nest_cancel(skb, nest); nlh->nlmsg_len = skb_tail_pointer(skb) - b; - if (NETLINK_CB(cb->skb).pid && ret) + if (NETLINK_CB(cb->skb).portid && ret) nlh->nlmsg_flags |= NLM_F_MULTI; module_put(a_o->owner); return skb->len; -nla_put_failure: -nlmsg_failure: +out_module_put: module_put(a_o->owner); nlmsg_trim(skb, b); return skb->len; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 453a73431ac..edbf40dac70 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -37,26 +37,16 @@ #include <net/tc_act/tc_csum.h> #define CSUM_TAB_MASK 15 -static struct tcf_common *tcf_csum_ht[CSUM_TAB_MASK + 1]; -static u32 csum_idx_gen; -static DEFINE_RWLOCK(csum_lock); - -static struct tcf_hashinfo csum_hash_info = { - .htab = tcf_csum_ht, - .hmask = CSUM_TAB_MASK, - .lock = &csum_lock, -}; static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, }; -static int tcf_csum_init(struct nlattr *nla, struct nlattr *est, +static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { struct nlattr *tb[TCA_CSUM_MAX + 1]; struct tc_csum *parm; - struct tcf_common *pc; struct tcf_csum *p; int ret = 0, err; @@ -71,39 +61,31 @@ static int tcf_csum_init(struct nlattr *nla, struct nlattr *est, return -EINVAL; parm = nla_data(tb[TCA_CSUM_PARMS]); - pc = tcf_hash_check(parm->index, a, bind, &csum_hash_info); - if (!pc) { - pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, - &csum_idx_gen, &csum_hash_info); - if (IS_ERR(pc)) - return PTR_ERR(pc); - p = to_tcf_csum(pc); + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + if (ret) + return ret; ret = ACT_P_CREATED; } else { - p = to_tcf_csum(pc); - if (!ovr) { - tcf_hash_release(pc, bind, &csum_hash_info); + if (bind)/* dont override defaults */ + return 0; + tcf_hash_release(a, bind); + if (!ovr) return -EEXIST; - } } + p = to_tcf_csum(a); spin_lock_bh(&p->tcf_lock); p->tcf_action = parm->action; p->update_flags = parm->update_flags; spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(pc, &csum_hash_info); + tcf_hash_insert(a); return ret; } -static int tcf_csum_cleanup(struct tc_action *a, int bind) -{ - struct tcf_csum *p = a->priv; - return tcf_hash_release(&p->common, bind, &csum_hash_info); -} - /** * tcf_csum_skb_nextlayer - Get next layer pointer * @skb: sk_buff to use @@ -166,15 +148,17 @@ static int tcf_csum_ipv4_igmp(struct sk_buff *skb, return 1; } -static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h, +static int tcf_csum_ipv6_icmp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl) { struct icmp6hdr *icmp6h; + const struct ipv6hdr *ip6h; icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h)); if (icmp6h == NULL) return 0; + ip6h = ipv6_hdr(skb); icmp6h->icmp6_cksum = 0; skb->csum = csum_partial(icmp6h, ipl - ihl, 0); icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, @@ -186,15 +170,17 @@ static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h, return 1; } -static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph, +static int tcf_csum_ipv4_tcp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl) { struct tcphdr *tcph; + const struct iphdr *iph; tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); if (tcph == NULL) return 0; + iph = ip_hdr(skb); tcph->check = 0; skb->csum = csum_partial(tcph, ipl - ihl, 0); tcph->check = tcp_v4_check(ipl - ihl, @@ -205,15 +191,17 @@ static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph, return 1; } -static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h, +static int tcf_csum_ipv6_tcp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl) { struct tcphdr *tcph; + const struct ipv6hdr *ip6h; tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); if (tcph == NULL) return 0; + ip6h = ipv6_hdr(skb); tcph->check = 0; skb->csum = csum_partial(tcph, ipl - ihl, 0); tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, @@ -225,10 +213,11 @@ static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h, return 1; } -static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph, +static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl, int udplite) { struct udphdr *udph; + const struct iphdr *iph; u16 ul; /* @@ -242,6 +231,7 @@ static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph, if (udph == NULL) return 0; + iph = ip_hdr(skb); ul = ntohs(udph->len); if (udplite || udph->check) { @@ -276,10 +266,11 @@ ignore_obscure_skb: return 1; } -static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h, +static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl, int udplite) { struct udphdr *udph; + const struct ipv6hdr *ip6h; u16 ul; /* @@ -293,6 +284,7 @@ static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h, if (udph == NULL) return 0; + ip6h = ipv6_hdr(skb); ul = ntohs(udph->len); udph->check = 0; @@ -328,7 +320,7 @@ ignore_obscure_skb: static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) { - struct iphdr *iph; + const struct iphdr *iph; int ntkoff; ntkoff = skb_network_offset(skb); @@ -353,19 +345,19 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) break; case IPPROTO_TCP: if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) - if (!tcf_csum_ipv4_tcp(skb, iph, iph->ihl * 4, + if (!tcf_csum_ipv4_tcp(skb, iph->ihl * 4, ntohs(iph->tot_len))) goto fail; break; case IPPROTO_UDP: if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) - if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4, + if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4, ntohs(iph->tot_len), 0)) goto fail; break; case IPPROTO_UDPLITE: if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) - if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4, + if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4, ntohs(iph->tot_len), 1)) goto fail; break; @@ -377,7 +369,7 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) goto fail; - ip_send_check(iph); + ip_send_check(ip_hdr(skb)); } return 1; @@ -397,7 +389,7 @@ static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, while (len > 1) { switch (xh[off]) { - case IPV6_TLV_PAD0: + case IPV6_TLV_PAD1: optlen = 1; break; case IPV6_TLV_JUMBO: @@ -456,6 +448,7 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags) ixhl = ipv6_optlen(ip6xh); if (!pskb_may_pull(skb, hl + ixhl + ntkoff)) goto fail; + ip6xh = (void *)(skb_network_header(skb) + hl); if ((nexthdr == NEXTHDR_HOP) && !(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl))) goto fail; @@ -464,25 +457,25 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags) break; case IPPROTO_ICMPV6: if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) - if (!tcf_csum_ipv6_icmp(skb, ip6h, + if (!tcf_csum_ipv6_icmp(skb, hl, pl + sizeof(*ip6h))) goto fail; goto done; case IPPROTO_TCP: if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) - if (!tcf_csum_ipv6_tcp(skb, ip6h, + if (!tcf_csum_ipv6_tcp(skb, hl, pl + sizeof(*ip6h))) goto fail; goto done; case IPPROTO_UDP: if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) - if (!tcf_csum_ipv6_udp(skb, ip6h, hl, + if (!tcf_csum_ipv6_udp(skb, hl, pl + sizeof(*ip6h), 0)) goto fail; goto done; case IPPROTO_UDPLITE: if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) - if (!tcf_csum_ipv6_udp(skb, ip6h, hl, + if (!tcf_csum_ipv6_udp(skb, hl, pl + sizeof(*ip6h), 1)) goto fail; goto done; @@ -550,11 +543,13 @@ static int tcf_csum_dump(struct sk_buff *skb, }; struct tcf_t t; - NLA_PUT(skb, TCA_CSUM_PARMS, sizeof(opt), &opt); + if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); - NLA_PUT(skb, TCA_CSUM_TM, sizeof(t), &t); + if (nla_put(skb, TCA_CSUM_TM, sizeof(t), &t)) + goto nla_put_failure; return skb->len; @@ -565,16 +560,11 @@ nla_put_failure: static struct tc_action_ops act_csum_ops = { .kind = "csum", - .hinfo = &csum_hash_info, .type = TCA_ACT_CSUM, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_csum, .dump = tcf_csum_dump, - .cleanup = tcf_csum_cleanup, - .lookup = tcf_hash_search, .init = tcf_csum_init, - .walk = tcf_generic_walker }; MODULE_DESCRIPTION("Checksum updating actions"); @@ -582,7 +572,7 @@ MODULE_LICENSE("GPL"); static int __init csum_init_module(void) { - return tcf_register_action(&act_csum_ops); + return tcf_register_action(&act_csum_ops, CSUM_TAB_MASK); } static void __exit csum_cleanup_module(void) diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index b77f5a06a65..d6bcbd9f779 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -24,20 +24,11 @@ #include <net/tc_act/tc_gact.h> #define GACT_TAB_MASK 15 -static struct tcf_common *tcf_gact_ht[GACT_TAB_MASK + 1]; -static u32 gact_idx_gen; -static DEFINE_RWLOCK(gact_lock); - -static struct tcf_hashinfo gact_hash_info = { - .htab = tcf_gact_ht, - .hmask = GACT_TAB_MASK, - .lock = &gact_lock, -}; #ifdef CONFIG_GACT_PROB static int gact_net_rand(struct tcf_gact *gact) { - if (!gact->tcfg_pval || net_random() % gact->tcfg_pval) + if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval) return gact->tcf_action; return gact->tcfg_paction; } @@ -58,15 +49,18 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { [TCA_GACT_PROB] = { .len = sizeof(struct tc_gact_p) }, }; -static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_gact_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { struct nlattr *tb[TCA_GACT_MAX + 1]; struct tc_gact *parm; struct tcf_gact *gact; - struct tcf_common *pc; int ret = 0; int err; +#ifdef CONFIG_GACT_PROB + struct tc_gact_p *p_parm = NULL; +#endif if (nla == NULL) return -EINVAL; @@ -82,29 +76,33 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, #ifndef CONFIG_GACT_PROB if (tb[TCA_GACT_PROB] != NULL) return -EOPNOTSUPP; +#else + if (tb[TCA_GACT_PROB]) { + p_parm = nla_data(tb[TCA_GACT_PROB]); + if (p_parm->ptype >= MAX_RAND) + return -EINVAL; + } #endif - pc = tcf_hash_check(parm->index, a, bind, &gact_hash_info); - if (!pc) { - pc = tcf_hash_create(parm->index, est, a, sizeof(*gact), - bind, &gact_idx_gen, &gact_hash_info); - if (IS_ERR(pc)) - return PTR_ERR(pc); + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind); + if (ret) + return ret; ret = ACT_P_CREATED; } else { - if (!ovr) { - tcf_hash_release(pc, bind, &gact_hash_info); + if (bind)/* dont override defaults */ + return 0; + tcf_hash_release(a, bind); + if (!ovr) return -EEXIST; - } } - gact = to_gact(pc); + gact = to_gact(a); spin_lock_bh(&gact->tcf_lock); gact->tcf_action = parm->action; #ifdef CONFIG_GACT_PROB - if (tb[TCA_GACT_PROB] != NULL) { - struct tc_gact_p *p_parm = nla_data(tb[TCA_GACT_PROB]); + if (p_parm) { gact->tcfg_paction = p_parm->paction; gact->tcfg_pval = p_parm->pval; gact->tcfg_ptype = p_parm->ptype; @@ -112,19 +110,10 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, #endif spin_unlock_bh(&gact->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(pc, &gact_hash_info); + tcf_hash_insert(a); return ret; } -static int tcf_gact_cleanup(struct tc_action *a, int bind) -{ - struct tcf_gact *gact = a->priv; - - if (gact) - return tcf_hash_release(&gact->common, bind, &gact_hash_info); - return 0; -} - static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -133,7 +122,7 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, spin_lock(&gact->tcf_lock); #ifdef CONFIG_GACT_PROB - if (gact->tcfg_ptype && gact_rand[gact->tcfg_ptype] != NULL) + if (gact->tcfg_ptype) action = gact_rand[gact->tcfg_ptype](gact); else action = gact->tcf_action; @@ -162,7 +151,8 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int }; struct tcf_t t; - NLA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt); + if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; #ifdef CONFIG_GACT_PROB if (gact->tcfg_ptype) { struct tc_gact_p p_opt = { @@ -171,13 +161,15 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int .ptype = gact->tcfg_ptype, }; - NLA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt); + if (nla_put(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt)) + goto nla_put_failure; } #endif t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(gact->tcf_tm.expires); - NLA_PUT(skb, TCA_GACT_TM, sizeof(t), &t); + if (nla_put(skb, TCA_GACT_TM, sizeof(t), &t)) + goto nla_put_failure; return skb->len; nla_put_failure: @@ -187,16 +179,11 @@ nla_put_failure: static struct tc_action_ops act_gact_ops = { .kind = "gact", - .hinfo = &gact_hash_info, .type = TCA_ACT_GACT, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_gact, .dump = tcf_gact_dump, - .cleanup = tcf_gact_cleanup, - .lookup = tcf_hash_search, .init = tcf_gact_init, - .walk = tcf_generic_walker }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); @@ -210,7 +197,7 @@ static int __init gact_init_module(void) #else pr_info("GACT probability NOT on\n"); #endif - return tcf_register_action(&act_gact_ops); + return tcf_register_action(&act_gact_ops, GACT_TAB_MASK); } static void __exit gact_cleanup_module(void) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 60f8f616e8f..8a64a0734ae 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -1,5 +1,5 @@ /* - * net/sched/ipt.c iptables target interface + * net/sched/ipt.c iptables target interface * *TODO: Add other tables. For now we only support the ipv4 table targets * @@ -8,7 +8,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Copyright: Jamal Hadi Salim (2002-4) + * Copyright: Jamal Hadi Salim (2002-13) */ #include <linux/types.h> @@ -29,15 +29,6 @@ #define IPT_TAB_MASK 15 -static struct tcf_common *tcf_ipt_ht[IPT_TAB_MASK + 1]; -static u32 ipt_idx_gen; -static DEFINE_RWLOCK(ipt_lock); - -static struct tcf_hashinfo ipt_hash_info = { - .htab = tcf_ipt_ht, - .hmask = IPT_TAB_MASK, - .lock = &ipt_lock, -}; static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook) { @@ -77,22 +68,12 @@ static void ipt_destroy_target(struct xt_entry_target *t) module_put(par.target->me); } -static int tcf_ipt_release(struct tcf_ipt *ipt, int bind) +static void tcf_ipt_release(struct tc_action *a, int bind) { - int ret = 0; - if (ipt) { - if (bind) - ipt->tcf_bindcnt--; - ipt->tcf_refcnt--; - if (ipt->tcf_bindcnt <= 0 && ipt->tcf_refcnt <= 0) { - ipt_destroy_target(ipt->tcfi_t); - kfree(ipt->tcfi_tname); - kfree(ipt->tcfi_t); - tcf_hash_destroy(&ipt->common, &ipt_hash_info); - ret = ACT_P_DELETED; - } - } - return ret; + struct tcf_ipt *ipt = to_ipt(a); + ipt_destroy_target(ipt->tcfi_t); + kfree(ipt->tcfi_tname); + kfree(ipt->tcfi_t); } static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { @@ -102,12 +83,11 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) }, }; -static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est, +static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; - struct tcf_common *pc; struct xt_entry_target *td, *t; char *tname; int ret = 0, err; @@ -133,20 +113,20 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est, if (tb[TCA_IPT_INDEX] != NULL) index = nla_get_u32(tb[TCA_IPT_INDEX]); - pc = tcf_hash_check(index, a, bind, &ipt_hash_info); - if (!pc) { - pc = tcf_hash_create(index, est, a, sizeof(*ipt), bind, - &ipt_idx_gen, &ipt_hash_info); - if (IS_ERR(pc)) - return PTR_ERR(pc); + if (!tcf_hash_check(index, a, bind) ) { + ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind); + if (ret) + return ret; ret = ACT_P_CREATED; } else { - if (!ovr) { - tcf_ipt_release(to_ipt(pc), bind); + if (bind)/* dont override defaults */ + return 0; + tcf_hash_release(a, bind); + + if (!ovr) return -EEXIST; - } } - ipt = to_ipt(pc); + ipt = to_ipt(a); hook = nla_get_u32(tb[TCA_IPT_HOOK]); @@ -177,7 +157,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est, ipt->tcfi_hook = hook; spin_unlock_bh(&ipt->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(pc, &ipt_hash_info); + tcf_hash_insert(a); return ret; err3: @@ -185,16 +165,11 @@ err3: err2: kfree(tname); err1: - kfree(pc); + if (ret == ACT_P_CREATED) + tcf_hash_cleanup(a, est); return err; } -static int tcf_ipt_cleanup(struct tc_action *a, int bind) -{ - struct tcf_ipt *ipt = a->priv; - return tcf_ipt_release(ipt, bind); -} - static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -202,10 +177,8 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, struct tcf_ipt *ipt = a->priv; struct xt_action_param par; - if (skb_cloned(skb)) { - if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - return TC_ACT_UNSPEC; - } + if (skb_unclone(skb, GFP_ATOMIC)) + return TC_ACT_UNSPEC; spin_lock(&ipt->tcf_lock); @@ -235,9 +208,8 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, result = TC_ACT_PIPE; break; default: - if (net_ratelimit()) - pr_notice("tc filter: Bogus netfilter code" - " %d assume ACCEPT\n", ret); + net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n", + ret); result = TC_POLICE_OK; break; } @@ -267,15 +239,17 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int c.refcnt = ipt->tcf_refcnt - ref; strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name); - NLA_PUT(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t); - NLA_PUT_U32(skb, TCA_IPT_INDEX, ipt->tcf_index); - NLA_PUT_U32(skb, TCA_IPT_HOOK, ipt->tcfi_hook); - NLA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c); - NLA_PUT_STRING(skb, TCA_IPT_TABLE, ipt->tcfi_tname); + if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) || + nla_put_u32(skb, TCA_IPT_INDEX, ipt->tcf_index) || + nla_put_u32(skb, TCA_IPT_HOOK, ipt->tcfi_hook) || + nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) || + nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname)) + goto nla_put_failure; tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install); tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse); tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires); - NLA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm); + if (nla_put(skb, TCA_IPT_TM, sizeof (tm), &tm)) + goto nla_put_failure; kfree(t); return skb->len; @@ -287,29 +261,49 @@ nla_put_failure: static struct tc_action_ops act_ipt_ops = { .kind = "ipt", - .hinfo = &ipt_hash_info, .type = TCA_ACT_IPT, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_ipt, .dump = tcf_ipt_dump, - .cleanup = tcf_ipt_cleanup, - .lookup = tcf_hash_search, + .cleanup = tcf_ipt_release, + .init = tcf_ipt_init, +}; + +static struct tc_action_ops act_xt_ops = { + .kind = "xt", + .type = TCA_ACT_XT, + .owner = THIS_MODULE, + .act = tcf_ipt, + .dump = tcf_ipt_dump, + .cleanup = tcf_ipt_release, .init = tcf_ipt_init, - .walk = tcf_generic_walker }; -MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); +MODULE_AUTHOR("Jamal Hadi Salim(2002-13)"); MODULE_DESCRIPTION("Iptables target actions"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("act_xt"); static int __init ipt_init_module(void) { - return tcf_register_action(&act_ipt_ops); + int ret1, ret2; + + ret1 = tcf_register_action(&act_xt_ops, IPT_TAB_MASK); + if (ret1 < 0) + printk("Failed to load xt action\n"); + ret2 = tcf_register_action(&act_ipt_ops, IPT_TAB_MASK); + if (ret2 < 0) + printk("Failed to load ipt action\n"); + + if (ret1 < 0 && ret2 < 0) { + return ret1; + } else + return 0; } static void __exit ipt_cleanup_module(void) { + tcf_unregister_action(&act_xt_ops); tcf_unregister_action(&act_ipt_ops); } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index e051398fdf6..4f912c0e225 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -30,45 +30,27 @@ #include <linux/if_arp.h> #define MIRRED_TAB_MASK 7 -static struct tcf_common *tcf_mirred_ht[MIRRED_TAB_MASK + 1]; -static u32 mirred_idx_gen; -static DEFINE_RWLOCK(mirred_lock); static LIST_HEAD(mirred_list); -static struct tcf_hashinfo mirred_hash_info = { - .htab = tcf_mirred_ht, - .hmask = MIRRED_TAB_MASK, - .lock = &mirred_lock, -}; - -static int tcf_mirred_release(struct tcf_mirred *m, int bind) +static void tcf_mirred_release(struct tc_action *a, int bind) { - if (m) { - if (bind) - m->tcf_bindcnt--; - m->tcf_refcnt--; - if (!m->tcf_bindcnt && m->tcf_refcnt <= 0) { - list_del(&m->tcfm_list); - if (m->tcfm_dev) - dev_put(m->tcfm_dev); - tcf_hash_destroy(&m->common, &mirred_hash_info); - return 1; - } - } - return 0; + struct tcf_mirred *m = to_mirred(a); + list_del(&m->tcfm_list); + if (m->tcfm_dev) + dev_put(m->tcfm_dev); } static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, }; -static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_mirred_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, int ovr, + int bind) { struct nlattr *tb[TCA_MIRRED_MAX + 1]; struct tc_mirred *parm; struct tcf_mirred *m; - struct tcf_common *pc; struct net_device *dev; int ret, ok_push = 0; @@ -88,7 +70,7 @@ static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est, return -EINVAL; } if (parm->ifindex) { - dev = __dev_get_by_index(&init_net, parm->ifindex); + dev = __dev_get_by_index(net, parm->ifindex); if (dev == NULL) return -ENODEV; switch (dev->type) { @@ -108,22 +90,20 @@ static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est, dev = NULL; } - pc = tcf_hash_check(parm->index, a, bind, &mirred_hash_info); - if (!pc) { + if (!tcf_hash_check(parm->index, a, bind)) { if (dev == NULL) return -EINVAL; - pc = tcf_hash_create(parm->index, est, a, sizeof(*m), bind, - &mirred_idx_gen, &mirred_hash_info); - if (IS_ERR(pc)) - return PTR_ERR(pc); + ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind); + if (ret) + return ret; ret = ACT_P_CREATED; } else { if (!ovr) { - tcf_mirred_release(to_mirred(pc), bind); + tcf_hash_release(a, bind); return -EEXIST; } } - m = to_mirred(pc); + m = to_mirred(a); spin_lock_bh(&m->tcf_lock); m->tcf_action = parm->action; @@ -139,21 +119,12 @@ static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est, spin_unlock_bh(&m->tcf_lock); if (ret == ACT_P_CREATED) { list_add(&m->tcfm_list, &mirred_list); - tcf_hash_insert(pc, &mirred_hash_info); + tcf_hash_insert(a); } return ret; } -static int tcf_mirred_cleanup(struct tc_action *a, int bind) -{ - struct tcf_mirred *m = a->priv; - - if (m) - return tcf_mirred_release(m, bind); - return 0; -} - static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -174,9 +145,8 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, } if (!(dev->flags & IFF_UP)) { - if (net_ratelimit()) - pr_notice("tc mirred to Houston: device %s is down\n", - dev->name); + net_notice_ratelimited("tc mirred to Houston: device %s is down\n", + dev->name); goto out; } @@ -201,13 +171,12 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, out: if (err) { m->tcf_qstats.overlimits++; - /* should we be asking for packet to be dropped? - * may make sense for redirect case only - */ - retval = TC_ACT_SHOT; - } else { + if (m->tcfm_eaction != TCA_EGRESS_MIRROR) + retval = TC_ACT_SHOT; + else + retval = m->tcf_action; + } else retval = m->tcf_action; - } spin_unlock(&m->tcf_lock); return retval; @@ -227,11 +196,13 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i }; struct tcf_t t; - NLA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt); + if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(m->tcf_tm.expires); - NLA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t); + if (nla_put(skb, TCA_MIRRED_TM, sizeof(t), &t)) + goto nla_put_failure; return skb->len; nla_put_failure: @@ -242,7 +213,7 @@ nla_put_failure: static int mirred_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tcf_mirred *m; if (event == NETDEV_UNREGISTER) @@ -260,19 +231,14 @@ static struct notifier_block mirred_device_notifier = { .notifier_call = mirred_device_event, }; - static struct tc_action_ops act_mirred_ops = { .kind = "mirred", - .hinfo = &mirred_hash_info, .type = TCA_ACT_MIRRED, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_mirred, .dump = tcf_mirred_dump, - .cleanup = tcf_mirred_cleanup, - .lookup = tcf_hash_search, + .cleanup = tcf_mirred_release, .init = tcf_mirred_init, - .walk = tcf_generic_walker }; MODULE_AUTHOR("Jamal Hadi Salim(2002)"); @@ -286,13 +252,13 @@ static int __init mirred_init_module(void) return err; pr_info("Mirror/redirect action on\n"); - return tcf_register_action(&act_mirred_ops); + return tcf_register_action(&act_mirred_ops, MIRRED_TAB_MASK); } static void __exit mirred_cleanup_module(void) { - unregister_netdevice_notifier(&mirred_device_notifier); tcf_unregister_action(&act_mirred_ops); + unregister_netdevice_notifier(&mirred_device_notifier); } module_init(mirred_init_module); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 001d1b35486..270a030d5fd 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -30,28 +30,18 @@ #define NAT_TAB_MASK 15 -static struct tcf_common *tcf_nat_ht[NAT_TAB_MASK + 1]; -static u32 nat_idx_gen; -static DEFINE_RWLOCK(nat_lock); - -static struct tcf_hashinfo nat_hash_info = { - .htab = tcf_nat_ht, - .hmask = NAT_TAB_MASK, - .lock = &nat_lock, -}; static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) }, }; -static int tcf_nat_init(struct nlattr *nla, struct nlattr *est, +static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { struct nlattr *tb[TCA_NAT_MAX + 1]; struct tc_nat *parm; int ret = 0, err; struct tcf_nat *p; - struct tcf_common *pc; if (nla == NULL) return -EINVAL; @@ -64,21 +54,19 @@ static int tcf_nat_init(struct nlattr *nla, struct nlattr *est, return -EINVAL; parm = nla_data(tb[TCA_NAT_PARMS]); - pc = tcf_hash_check(parm->index, a, bind, &nat_hash_info); - if (!pc) { - pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, - &nat_idx_gen, &nat_hash_info); - if (IS_ERR(pc)) - return PTR_ERR(pc); - p = to_tcf_nat(pc); + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + if (ret) + return ret; ret = ACT_P_CREATED; } else { - p = to_tcf_nat(pc); - if (!ovr) { - tcf_hash_release(pc, bind, &nat_hash_info); + if (bind) + return 0; + tcf_hash_release(a, bind); + if (!ovr) return -EEXIST; - } } + p = to_tcf_nat(a); spin_lock_bh(&p->tcf_lock); p->old_addr = parm->old_addr; @@ -90,18 +78,11 @@ static int tcf_nat_init(struct nlattr *nla, struct nlattr *est, spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(pc, &nat_hash_info); + tcf_hash_insert(a); return ret; } -static int tcf_nat_cleanup(struct tc_action *a, int bind) -{ - struct tcf_nat *p = a->priv; - - return tcf_hash_release(&p->common, bind, &nat_hash_info); -} - static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -284,11 +265,13 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - NLA_PUT(skb, TCA_NAT_PARMS, sizeof(opt), &opt); + if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); - NLA_PUT(skb, TCA_NAT_TM, sizeof(t), &t); + if (nla_put(skb, TCA_NAT_TM, sizeof(t), &t)) + goto nla_put_failure; return skb->len; @@ -299,16 +282,11 @@ nla_put_failure: static struct tc_action_ops act_nat_ops = { .kind = "nat", - .hinfo = &nat_hash_info, .type = TCA_ACT_NAT, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_nat, .dump = tcf_nat_dump, - .cleanup = tcf_nat_cleanup, - .lookup = tcf_hash_search, .init = tcf_nat_init, - .walk = tcf_generic_walker }; MODULE_DESCRIPTION("Stateless NAT actions"); @@ -316,7 +294,7 @@ MODULE_LICENSE("GPL"); static int __init nat_init_module(void) { - return tcf_register_action(&act_nat_ops); + return tcf_register_action(&act_nat_ops, NAT_TAB_MASK); } static void __exit nat_cleanup_module(void) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 10d3aed8656..5f9bcb2e080 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -24,28 +24,19 @@ #include <net/tc_act/tc_pedit.h> #define PEDIT_TAB_MASK 15 -static struct tcf_common *tcf_pedit_ht[PEDIT_TAB_MASK + 1]; -static u32 pedit_idx_gen; -static DEFINE_RWLOCK(pedit_lock); - -static struct tcf_hashinfo pedit_hash_info = { - .htab = tcf_pedit_ht, - .hmask = PEDIT_TAB_MASK, - .lock = &pedit_lock, -}; static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, }; -static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_pedit_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; int ret = 0, err; struct tcf_pedit *p; - struct tcf_common *pc; struct tc_pedit_key *keys = NULL; int ksize; @@ -63,27 +54,27 @@ static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est, if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize) return -EINVAL; - pc = tcf_hash_check(parm->index, a, bind, &pedit_hash_info); - if (!pc) { + if (!tcf_hash_check(parm->index, a, bind)) { if (!parm->nkeys) return -EINVAL; - pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, - &pedit_idx_gen, &pedit_hash_info); - if (IS_ERR(pc)) - return PTR_ERR(pc); - p = to_pedit(pc); + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + if (ret) + return ret; + p = to_pedit(a); keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) { - kfree(pc); + tcf_hash_cleanup(a, est); return -ENOMEM; } ret = ACT_P_CREATED; } else { - p = to_pedit(pc); - if (!ovr) { - tcf_hash_release(pc, bind, &pedit_hash_info); + p = to_pedit(a); + tcf_hash_release(a, bind); + if (bind) + return 0; + if (!ovr) return -EEXIST; - } + if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) @@ -102,22 +93,15 @@ static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est, memcpy(p->tcfp_keys, parm->keys, ksize); spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(pc, &pedit_hash_info); + tcf_hash_insert(a); return ret; } -static int tcf_pedit_cleanup(struct tc_action *a, int bind) +static void tcf_pedit_cleanup(struct tc_action *a, int bind) { struct tcf_pedit *p = a->priv; - - if (p) { - struct tc_pedit_key *keys = p->tcfp_keys; - if (tcf_hash_release(&p->common, bind, &pedit_hash_info)) { - kfree(keys); - return 1; - } - } - return 0; + struct tc_pedit_key *keys = p->tcfp_keys; + kfree(keys); } static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, @@ -127,8 +111,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, int i, munged = 0; unsigned int off; - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) return p->tcf_action; off = skb_network_offset(skb); @@ -215,11 +198,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, opt->refcnt = p->tcf_refcnt - ref; opt->bindcnt = p->tcf_bindcnt - bind; - NLA_PUT(skb, TCA_PEDIT_PARMS, s, opt); + if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) + goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); - NLA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t); + if (nla_put(skb, TCA_PEDIT_TM, sizeof(t), &t)) + goto nla_put_failure; kfree(opt); return skb->len; @@ -231,16 +216,12 @@ nla_put_failure: static struct tc_action_ops act_pedit_ops = { .kind = "pedit", - .hinfo = &pedit_hash_info, .type = TCA_ACT_PEDIT, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_pedit, .dump = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup, - .lookup = tcf_hash_search, .init = tcf_pedit_init, - .walk = tcf_generic_walker }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); @@ -249,7 +230,7 @@ MODULE_LICENSE("GPL"); static int __init pedit_init_module(void) { - return tcf_register_action(&act_pedit_ops); + return tcf_register_action(&act_pedit_ops, PEDIT_TAB_MASK); } static void __exit pedit_cleanup_module(void) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 6fb3f5af0f8..0566e4606a4 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -22,19 +22,25 @@ #include <net/act_api.h> #include <net/netlink.h> -#define L2T(p, L) qdisc_l2t((p)->tcfp_R_tab, L) -#define L2T_P(p, L) qdisc_l2t((p)->tcfp_P_tab, L) +struct tcf_police { + struct tcf_common common; + int tcfp_result; + u32 tcfp_ewma_rate; + s64 tcfp_burst; + u32 tcfp_mtu; + s64 tcfp_toks; + s64 tcfp_ptoks; + s64 tcfp_mtu_ptoks; + s64 tcfp_t_c; + struct psched_ratecfg rate; + bool rate_present; + struct psched_ratecfg peak; + bool peak_present; +}; +#define to_police(pc) \ + container_of(pc, struct tcf_police, common) #define POL_TAB_MASK 15 -static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1]; -static u32 police_idx_gen; -static DEFINE_RWLOCK(police_lock); - -static struct tcf_hashinfo police_hash_info = { - .htab = tcf_police_ht, - .hmask = POL_TAB_MASK, - .lock = &police_lock, -}; /* old policer structure from before tc actions */ struct tc_police_compat { @@ -52,18 +58,20 @@ struct tc_police_compat { static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb, int type, struct tc_action *a) { + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct hlist_head *head; struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; struct nlattr *nest; - read_lock_bh(&police_lock); + spin_lock_bh(&hinfo->lock); s_i = cb->args[0]; for (i = 0; i < (POL_TAB_MASK + 1); i++) { - p = tcf_police_ht[tcf_hash(i, POL_TAB_MASK)]; + head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)]; - for (; p; p = p->tcfc_next) { + hlist_for_each_entry_rcu(p, head, tcfc_head) { index++; if (index < s_i) continue; @@ -86,7 +94,7 @@ static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *c } } done: - read_unlock_bh(&police_lock); + spin_unlock_bh(&hinfo->lock); if (n_i) cb->args[0] += n_i; return n_i; @@ -96,33 +104,6 @@ nla_put_failure: goto done; } -static void tcf_police_destroy(struct tcf_police *p) -{ - unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK); - struct tcf_common **p1p; - - for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->tcfc_next) { - if (*p1p == &p->common) { - write_lock_bh(&police_lock); - *p1p = p->tcf_next; - write_unlock_bh(&police_lock); - gen_kill_estimator(&p->tcf_bstats, - &p->tcf_rate_est); - if (p->tcfp_R_tab) - qdisc_put_rtab(p->tcfp_R_tab); - if (p->tcfp_P_tab) - qdisc_put_rtab(p->tcfp_P_tab); - /* - * gen_estimator est_timer() might access p->tcf_lock - * or bstats, wait a RCU grace period before freeing p - */ - kfree_rcu(p, tcf_rcu); - return; - } - } - WARN_ON(1); -} - static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { [TCA_POLICE_RATE] = { .len = TC_RTAB_SIZE }, [TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE }, @@ -130,8 +111,9 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { [TCA_POLICE_RESULT] = { .type = NLA_U32 }, }; -static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_act_police_locate(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { unsigned int h; int ret = 0, err; @@ -139,6 +121,7 @@ static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est, struct tc_police *parm; struct tcf_police *police; struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; + struct tcf_hashinfo *hinfo = a->ops->hinfo; int size; if (nla == NULL) @@ -156,19 +139,17 @@ static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est, parm = nla_data(tb[TCA_POLICE_TBF]); if (parm->index) { - struct tcf_common *pc; - - pc = tcf_hash_lookup(parm->index, &police_hash_info); - if (pc != NULL) { - a->priv = pc; - police = to_police(pc); + if (tcf_hash_search(a, parm->index)) { + police = to_police(a->priv); if (bind) { police->tcf_bindcnt += 1; police->tcf_refcnt += 1; + return 0; } if (ovr) goto override; - return ret; + /* not replacing */ + return -EEXIST; } } @@ -211,26 +192,36 @@ override: } /* No failure allowed after this point */ - if (R_tab != NULL) { - qdisc_put_rtab(police->tcfp_R_tab); - police->tcfp_R_tab = R_tab; + police->tcfp_mtu = parm->mtu; + if (police->tcfp_mtu == 0) { + police->tcfp_mtu = ~0; + if (R_tab) + police->tcfp_mtu = 255 << R_tab->rate.cell_log; } - if (P_tab != NULL) { - qdisc_put_rtab(police->tcfp_P_tab); - police->tcfp_P_tab = P_tab; + if (R_tab) { + police->rate_present = true; + psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0); + qdisc_put_rtab(R_tab); + } else { + police->rate_present = false; + } + if (P_tab) { + police->peak_present = true; + psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0); + qdisc_put_rtab(P_tab); + } else { + police->peak_present = false; } if (tb[TCA_POLICE_RESULT]) police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); - police->tcfp_toks = police->tcfp_burst = parm->burst; - police->tcfp_mtu = parm->mtu; - if (police->tcfp_mtu == 0) { - police->tcfp_mtu = ~0; - if (police->tcfp_R_tab) - police->tcfp_mtu = 255<<police->tcfp_R_tab->rate.cell_log; + police->tcfp_burst = PSCHED_TICKS2NS(parm->burst); + police->tcfp_toks = police->tcfp_burst; + if (police->peak_present) { + police->tcfp_mtu_ptoks = (s64) psched_l2t_ns(&police->peak, + police->tcfp_mtu); + police->tcfp_ptoks = police->tcfp_mtu_ptoks; } - if (police->tcfp_P_tab) - police->tcfp_ptoks = L2T_P(police, police->tcfp_mtu); police->tcf_action = parm->action; if (tb[TCA_POLICE_AVRATE]) @@ -240,14 +231,13 @@ override: if (ret != ACT_P_CREATED) return ret; - police->tcfp_t_c = psched_get_time(); + police->tcfp_t_c = ktime_to_ns(ktime_get()); police->tcf_index = parm->index ? parm->index : - tcf_hash_new_index(&police_idx_gen, &police_hash_info); + tcf_hash_new_index(hinfo); h = tcf_hash(police->tcf_index, POL_TAB_MASK); - write_lock_bh(&police_lock); - police->tcf_next = tcf_police_ht[h]; - tcf_police_ht[h] = &police->common; - write_unlock_bh(&police_lock); + spin_lock_bh(&hinfo->lock); + hlist_add_head(&police->tcf_head, &hinfo->htab[h]); + spin_unlock_bh(&hinfo->lock); a->priv = police; return ret; @@ -255,40 +245,20 @@ override: failure_unlock: spin_unlock_bh(&police->tcf_lock); failure: - if (P_tab) - qdisc_put_rtab(P_tab); - if (R_tab) - qdisc_put_rtab(R_tab); + qdisc_put_rtab(P_tab); + qdisc_put_rtab(R_tab); if (ret == ACT_P_CREATED) kfree(police); return err; } -static int tcf_act_police_cleanup(struct tc_action *a, int bind) -{ - struct tcf_police *p = a->priv; - int ret = 0; - - if (p != NULL) { - if (bind) - p->tcf_bindcnt--; - - p->tcf_refcnt--; - if (p->tcf_refcnt <= 0 && !p->tcf_bindcnt) { - tcf_police_destroy(p); - ret = 1; - } - } - return ret; -} - static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_police *police = a->priv; - psched_time_t now; - long toks; - long ptoks = 0; + s64 now; + s64 toks; + s64 ptoks = 0; spin_lock(&police->tcf_lock); @@ -304,24 +274,25 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, } if (qdisc_pkt_len(skb) <= police->tcfp_mtu) { - if (police->tcfp_R_tab == NULL) { + if (!police->rate_present) { spin_unlock(&police->tcf_lock); return police->tcfp_result; } - now = psched_get_time(); - toks = psched_tdiff_bounded(now, police->tcfp_t_c, - police->tcfp_burst); - if (police->tcfp_P_tab) { + now = ktime_to_ns(ktime_get()); + toks = min_t(s64, now - police->tcfp_t_c, + police->tcfp_burst); + if (police->peak_present) { ptoks = toks + police->tcfp_ptoks; - if (ptoks > (long)L2T_P(police, police->tcfp_mtu)) - ptoks = (long)L2T_P(police, police->tcfp_mtu); - ptoks -= L2T_P(police, qdisc_pkt_len(skb)); + if (ptoks > police->tcfp_mtu_ptoks) + ptoks = police->tcfp_mtu_ptoks; + ptoks -= (s64) psched_l2t_ns(&police->peak, + qdisc_pkt_len(skb)); } toks += police->tcfp_toks; - if (toks > (long)police->tcfp_burst) + if (toks > police->tcfp_burst) toks = police->tcfp_burst; - toks -= L2T(police, qdisc_pkt_len(skb)); + toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb)); if ((toks|ptoks) >= 0) { police->tcfp_t_c = now; police->tcfp_toks = toks; @@ -347,20 +318,23 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) .index = police->tcf_index, .action = police->tcf_action, .mtu = police->tcfp_mtu, - .burst = police->tcfp_burst, + .burst = PSCHED_NS2TICKS(police->tcfp_burst), .refcnt = police->tcf_refcnt - ref, .bindcnt = police->tcf_bindcnt - bind, }; - if (police->tcfp_R_tab) - opt.rate = police->tcfp_R_tab->rate; - if (police->tcfp_P_tab) - opt.peakrate = police->tcfp_P_tab->rate; - NLA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); - if (police->tcfp_result) - NLA_PUT_U32(skb, TCA_POLICE_RESULT, police->tcfp_result); - if (police->tcfp_ewma_rate) - NLA_PUT_U32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate); + if (police->rate_present) + psched_ratecfg_getrate(&opt.rate, &police->rate); + if (police->peak_present) + psched_ratecfg_getrate(&opt.peakrate, &police->peak); + if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) + goto nla_put_failure; + if (police->tcfp_result && + nla_put_u32(skb, TCA_POLICE_RESULT, police->tcfp_result)) + goto nla_put_failure; + if (police->tcfp_ewma_rate && + nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate)) + goto nla_put_failure; return skb->len; nla_put_failure: @@ -374,14 +348,10 @@ MODULE_LICENSE("GPL"); static struct tc_action_ops act_police_ops = { .kind = "police", - .hinfo = &police_hash_info, .type = TCA_ID_POLICE, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_act_police, .dump = tcf_act_police_dump, - .cleanup = tcf_act_police_cleanup, - .lookup = tcf_hash_search, .init = tcf_act_police_locate, .walk = tcf_act_police_walker }; @@ -389,7 +359,7 @@ static struct tc_action_ops act_police_ops = { static int __init police_init_module(void) { - return tcf_register_action(&act_police_ops); + return tcf_register_action(&act_police_ops, POL_TAB_MASK); } static void __exit diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 73e0a3ab4d5..992c2317ce8 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -25,15 +25,6 @@ #include <net/tc_act/tc_defact.h> #define SIMP_TAB_MASK 7 -static struct tcf_common *tcf_simp_ht[SIMP_TAB_MASK + 1]; -static u32 simp_idx_gen; -static DEFINE_RWLOCK(simp_lock); - -static struct tcf_hashinfo simp_hash_info = { - .htab = tcf_simp_ht, - .hmask = SIMP_TAB_MASK, - .lock = &simp_lock, -}; #define SIMP_MAX_DATA 32 static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, @@ -55,20 +46,10 @@ static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, return d->tcf_action; } -static int tcf_simp_release(struct tcf_defact *d, int bind) +static void tcf_simp_release(struct tc_action *a, int bind) { - int ret = 0; - if (d) { - if (bind) - d->tcf_bindcnt--; - d->tcf_refcnt--; - if (d->tcf_bindcnt <= 0 && d->tcf_refcnt <= 0) { - kfree(d->tcfd_defdata); - tcf_hash_destroy(&d->common, &simp_hash_info); - ret = 1; - } - } - return ret; + struct tcf_defact *d = to_defact(a); + kfree(d->tcfd_defdata); } static int alloc_defdata(struct tcf_defact *d, char *defdata) @@ -95,13 +76,13 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { [TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA }, }; -static int tcf_simp_init(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_simp_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { struct nlattr *tb[TCA_DEF_MAX + 1]; struct tc_defact *parm; struct tcf_defact *d; - struct tcf_common *pc; char *defdata; int ret = 0, err; @@ -121,44 +102,36 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est, parm = nla_data(tb[TCA_DEF_PARMS]); defdata = nla_data(tb[TCA_DEF_DATA]); - pc = tcf_hash_check(parm->index, a, bind, &simp_hash_info); - if (!pc) { - pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind, - &simp_idx_gen, &simp_hash_info); - if (IS_ERR(pc)) - return PTR_ERR(pc); + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); + if (ret) + return ret; - d = to_defact(pc); + d = to_defact(a); ret = alloc_defdata(d, defdata); if (ret < 0) { - kfree(pc); + tcf_hash_cleanup(a, est); return ret; } d->tcf_action = parm->action; ret = ACT_P_CREATED; } else { - d = to_defact(pc); - if (!ovr) { - tcf_simp_release(d, bind); + d = to_defact(a); + + if (bind) + return 0; + tcf_hash_release(a, bind); + if (!ovr) return -EEXIST; - } + reset_policy(d, defdata, parm); } if (ret == ACT_P_CREATED) - tcf_hash_insert(pc, &simp_hash_info); + tcf_hash_insert(a); return ret; } -static int tcf_simp_cleanup(struct tc_action *a, int bind) -{ - struct tcf_defact *d = a->priv; - - if (d) - return tcf_simp_release(d, bind); - return 0; -} - static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -172,12 +145,14 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - NLA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt); - NLA_PUT_STRING(skb, TCA_DEF_DATA, d->tcfd_defdata); + if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) || + nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata)) + goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(d->tcf_tm.expires); - NLA_PUT(skb, TCA_DEF_TM, sizeof(t), &t); + if (nla_put(skb, TCA_DEF_TM, sizeof(t), &t)) + goto nla_put_failure; return skb->len; nla_put_failure: @@ -187,15 +162,12 @@ nla_put_failure: static struct tc_action_ops act_simp_ops = { .kind = "simple", - .hinfo = &simp_hash_info, .type = TCA_ACT_SIMP, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_simp, .dump = tcf_simp_dump, - .cleanup = tcf_simp_cleanup, + .cleanup = tcf_simp_release, .init = tcf_simp_init, - .walk = tcf_generic_walker, }; MODULE_AUTHOR("Jamal Hadi Salim(2005)"); @@ -204,7 +176,8 @@ MODULE_LICENSE("GPL"); static int __init simp_init_module(void) { - int ret = tcf_register_action(&act_simp_ops); + int ret; + ret = tcf_register_action(&act_simp_ops, SIMP_TAB_MASK); if (!ret) pr_info("Simple TC action Loaded\n"); return ret; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 35dbbe91027..fcfeeaf838b 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -11,8 +11,7 @@ * more details. * * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. + * this program; if not, see <http://www.gnu.org/licenses/>. * * Author: Alexander Duyck <alexander.h.duyck@intel.com> */ @@ -29,15 +28,6 @@ #include <net/tc_act/tc_skbedit.h> #define SKBEDIT_TAB_MASK 15 -static struct tcf_common *tcf_skbedit_ht[SKBEDIT_TAB_MASK + 1]; -static u32 skbedit_idx_gen; -static DEFINE_RWLOCK(skbedit_lock); - -static struct tcf_hashinfo skbedit_hash_info = { - .htab = tcf_skbedit_ht, - .hmask = SKBEDIT_TAB_MASK, - .lock = &skbedit_lock, -}; static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) @@ -67,13 +57,13 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, }; -static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_skbedit_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tc_skbedit *parm; struct tcf_skbedit *d; - struct tcf_common *pc; u32 flags = 0, *priority = NULL, *mark = NULL; u16 *queue_mapping = NULL; int ret = 0, err; @@ -108,21 +98,20 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est, parm = nla_data(tb[TCA_SKBEDIT_PARMS]); - pc = tcf_hash_check(parm->index, a, bind, &skbedit_hash_info); - if (!pc) { - pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind, - &skbedit_idx_gen, &skbedit_hash_info); - if (IS_ERR(pc)) - return PTR_ERR(pc); + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); + if (ret) + return ret; - d = to_skbedit(pc); + d = to_skbedit(a); ret = ACT_P_CREATED; } else { - d = to_skbedit(pc); - if (!ovr) { - tcf_hash_release(pc, bind, &skbedit_hash_info); + d = to_skbedit(a); + if (bind) + return 0; + tcf_hash_release(a, bind); + if (!ovr) return -EEXIST; - } } spin_lock_bh(&d->tcf_lock); @@ -140,19 +129,10 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est, spin_unlock_bh(&d->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(pc, &skbedit_hash_info); + tcf_hash_insert(a); return ret; } -static int tcf_skbedit_cleanup(struct tc_action *a, int bind) -{ - struct tcf_skbedit *d = a->priv; - - if (d) - return tcf_hash_release(&d->common, bind, &skbedit_hash_info); - return 0; -} - static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -166,20 +146,25 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - NLA_PUT(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt); - if (d->flags & SKBEDIT_F_PRIORITY) - NLA_PUT(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority), - &d->priority); - if (d->flags & SKBEDIT_F_QUEUE_MAPPING) - NLA_PUT(skb, TCA_SKBEDIT_QUEUE_MAPPING, - sizeof(d->queue_mapping), &d->queue_mapping); - if (d->flags & SKBEDIT_F_MARK) - NLA_PUT(skb, TCA_SKBEDIT_MARK, sizeof(d->mark), - &d->mark); + if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + if ((d->flags & SKBEDIT_F_PRIORITY) && + nla_put(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority), + &d->priority)) + goto nla_put_failure; + if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) && + nla_put(skb, TCA_SKBEDIT_QUEUE_MAPPING, + sizeof(d->queue_mapping), &d->queue_mapping)) + goto nla_put_failure; + if ((d->flags & SKBEDIT_F_MARK) && + nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark), + &d->mark)) + goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(d->tcf_tm.expires); - NLA_PUT(skb, TCA_SKBEDIT_TM, sizeof(t), &t); + if (nla_put(skb, TCA_SKBEDIT_TM, sizeof(t), &t)) + goto nla_put_failure; return skb->len; nla_put_failure: @@ -189,15 +174,11 @@ nla_put_failure: static struct tc_action_ops act_skbedit_ops = { .kind = "skbedit", - .hinfo = &skbedit_hash_info, .type = TCA_ACT_SKBEDIT, - .capab = TCA_CAP_NONE, .owner = THIS_MODULE, .act = tcf_skbedit, .dump = tcf_skbedit_dump, - .cleanup = tcf_skbedit_cleanup, .init = tcf_skbedit_init, - .walk = tcf_generic_walker, }; MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>"); @@ -206,7 +187,7 @@ MODULE_LICENSE("GPL"); static int __init skbedit_init_module(void) { - return tcf_register_action(&act_skbedit_ops); + return tcf_register_action(&act_skbedit_ops, SKBEDIT_TAB_MASK); } static void __exit skbedit_cleanup_module(void) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index a69d44f1dac..45527e6b52d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -22,7 +22,6 @@ #include <linux/skbuff.h> #include <linux/init.h> #include <linux/kmod.h> -#include <linux/netlink.h> #include <linux/err.h> #include <linux/slab.h> #include <net/net_namespace.h> @@ -32,8 +31,7 @@ #include <net/pkt_cls.h> /* The list of all installed classifier types */ - -static struct tcf_proto_ops *tcf_proto_base __read_mostly; +static LIST_HEAD(tcf_proto_base); /* Protects list of registered TC modules. It is pure SMP lock. */ static DEFINE_RWLOCK(cls_mod_lock); @@ -42,36 +40,35 @@ static DEFINE_RWLOCK(cls_mod_lock); static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind) { - const struct tcf_proto_ops *t = NULL; + const struct tcf_proto_ops *t, *res = NULL; if (kind) { read_lock(&cls_mod_lock); - for (t = tcf_proto_base; t; t = t->next) { + list_for_each_entry(t, &tcf_proto_base, head) { if (nla_strcmp(kind, t->kind) == 0) { - if (!try_module_get(t->owner)) - t = NULL; + if (try_module_get(t->owner)) + res = t; break; } } read_unlock(&cls_mod_lock); } - return t; + return res; } /* Register(unregister) new classifier type */ int register_tcf_proto_ops(struct tcf_proto_ops *ops) { - struct tcf_proto_ops *t, **tp; + struct tcf_proto_ops *t; int rc = -EEXIST; write_lock(&cls_mod_lock); - for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next) + list_for_each_entry(t, &tcf_proto_base, head) if (!strcmp(ops->kind, t->kind)) goto out; - ops->next = NULL; - *tp = ops; + list_add_tail(&ops->head, &tcf_proto_base); rc = 0; out: write_unlock(&cls_mod_lock); @@ -81,19 +78,17 @@ EXPORT_SYMBOL(register_tcf_proto_ops); int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) { - struct tcf_proto_ops *t, **tp; + struct tcf_proto_ops *t; int rc = -ENOENT; write_lock(&cls_mod_lock); - for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next) - if (t == ops) + list_for_each_entry(t, &tcf_proto_base, head) { + if (t == ops) { + list_del(&t->head); + rc = 0; break; - - if (!t) - goto out; - *tp = t->next; - rc = 0; -out: + } + } write_unlock(&cls_mod_lock); return rc; } @@ -118,7 +113,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) /* Add/change/delete/get a filter node */ -static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; @@ -139,8 +134,16 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) int err; int tp_created = 0; + if ((n->nlmsg_type != RTM_GETTFILTER) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + replay: - t = NLMSG_DATA(n); + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL); + if (err < 0) + return err; + + t = nlmsg_data(n); protocol = TC_H_MIN(t->tcm_info); prio = TC_H_MAJ(t->tcm_info); nprio = prio; @@ -162,10 +165,6 @@ replay: if (dev == NULL) return -ENODEV; - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL); - if (err < 0) - return err; - /* Find qdisc */ if (!parent) { q = dev->qdisc; @@ -319,7 +318,8 @@ replay: } } - err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh); + err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, + n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE); if (err == 0) { if (tp_created) { spin_lock_bh(root_lock); @@ -342,32 +342,35 @@ errout: return err; } -static int tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, - unsigned long fh, u32 pid, u32 seq, u16 flags, int event) +static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, + unsigned long fh, u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); - nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); - tcm = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); + if (!nlh) + goto out_nlmsg_trim; + tcm = nlmsg_data(nlh); tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex; tcm->tcm_parent = tp->classid; tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); - NLA_PUT_STRING(skb, TCA_KIND, tp->ops->kind); + if (nla_put_string(skb, TCA_KIND, tp->ops->kind)) + goto nla_put_failure; tcm->tcm_handle = fh; if (RTM_DELTFILTER != event) { tcm->tcm_handle = 0; - if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0) + if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0) goto nla_put_failure; } nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; -nlmsg_failure: +out_nlmsg_trim: nla_put_failure: nlmsg_trim(skb, b); return -1; @@ -378,18 +381,18 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, unsigned long fh, int event) { struct sk_buff *skb; - u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) { + if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, 0, event) <= 0) { kfree_skb(skb); return -EINVAL; } - return rtnetlink_send(skb, net, pid, RTNLGRP_TC, + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); } @@ -403,8 +406,9 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct tcf_walker *arg) { struct tcf_dump_args *a = (void *)arg; + struct net *net = sock_net(a->skb->sk); - return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid, + return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); } @@ -417,12 +421,12 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *dev; struct Qdisc *q; struct tcf_proto *tp, **chain; - struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh); + struct tcmsg *tcm = nlmsg_data(cb->nlh); unsigned long cl = 0; const struct Qdisc_class_ops *cops; struct tcf_dump_args arg; - if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) @@ -462,7 +466,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if (t > s_t) memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); if (cb->args[1] == 0) { - if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid, + if (tcf_fill_node(net, skb, tp, 0, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) break; @@ -495,45 +499,41 @@ out: void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - if (exts->action) { - tcf_action_destroy(exts->action, TCA_ACT_UNBIND); - exts->action = NULL; - } + tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND); + INIT_LIST_HEAD(&exts->actions); #endif } EXPORT_SYMBOL(tcf_exts_destroy); -int tcf_exts_validate(struct tcf_proto *tp, struct nlattr **tb, - struct nlattr *rate_tlv, struct tcf_exts *exts, - const struct tcf_ext_map *map) +int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, + struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr) { - memset(exts, 0, sizeof(*exts)); - #ifdef CONFIG_NET_CLS_ACT { struct tc_action *act; - if (map->police && tb[map->police]) { - act = tcf_action_init_1(tb[map->police], rate_tlv, - "police", TCA_ACT_NOREPLACE, + INIT_LIST_HEAD(&exts->actions); + if (exts->police && tb[exts->police]) { + act = tcf_action_init_1(net, tb[exts->police], rate_tlv, + "police", ovr, TCA_ACT_BIND); if (IS_ERR(act)) return PTR_ERR(act); - act->type = TCA_OLD_COMPAT; - exts->action = act; - } else if (map->action && tb[map->action]) { - act = tcf_action_init(tb[map->action], rate_tlv, NULL, - TCA_ACT_NOREPLACE, TCA_ACT_BIND); - if (IS_ERR(act)) - return PTR_ERR(act); - - exts->action = act; + act->type = exts->type = TCA_OLD_COMPAT; + list_add(&act->list, &exts->actions); + } else if (exts->action && tb[exts->action]) { + int err; + err = tcf_action_init(net, tb[exts->action], rate_tlv, + NULL, ovr, + TCA_ACT_BIND, &exts->actions); + if (err) + return err; } } #else - if ((map->action && tb[map->action]) || - (map->police && tb[map->police])) + if ((exts->action && tb[exts->action]) || + (exts->police && tb[exts->police])) return -EOPNOTSUPP; #endif @@ -545,43 +545,42 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, struct tcf_exts *src) { #ifdef CONFIG_NET_CLS_ACT - if (src->action) { - struct tc_action *act; - tcf_tree_lock(tp); - act = dst->action; - dst->action = src->action; - tcf_tree_unlock(tp); - if (act) - tcf_action_destroy(act, TCA_ACT_UNBIND); - } + LIST_HEAD(tmp); + tcf_tree_lock(tp); + list_splice_init(&dst->actions, &tmp); + list_splice(&src->actions, &dst->actions); + tcf_tree_unlock(tp); + tcf_action_destroy(&tmp, TCA_ACT_UNBIND); #endif } EXPORT_SYMBOL(tcf_exts_change); -int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, - const struct tcf_ext_map *map) +#define tcf_exts_first_act(ext) \ + list_first_entry(&(exts)->actions, struct tc_action, list) + +int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - if (map->action && exts->action) { + if (exts->action && !list_empty(&exts->actions)) { /* * again for backward compatible mode - we want * to work with both old and new modes of entering * tc data even if iproute2 was newer - jhs */ struct nlattr *nest; - - if (exts->action->type != TCA_OLD_COMPAT) { - nest = nla_nest_start(skb, map->action); + if (exts->type != TCA_OLD_COMPAT) { + nest = nla_nest_start(skb, exts->action); if (nest == NULL) goto nla_put_failure; - if (tcf_action_dump(skb, exts->action, 0, 0) < 0) + if (tcf_action_dump(skb, &exts->actions, 0, 0) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - } else if (map->police) { - nest = nla_nest_start(skb, map->police); - if (nest == NULL) + } else if (exts->police) { + struct tc_action *act = tcf_exts_first_act(exts); + nest = nla_nest_start(skb, exts->police); + if (nest == NULL || !act) goto nla_put_failure; - if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0) + if (tcf_action_dump_old(skb, act, 0, 0) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } @@ -594,17 +593,14 @@ nla_put_failure: __attribute__ ((unused)) EXPORT_SYMBOL(tcf_exts_dump); -int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts, - const struct tcf_ext_map *map) +int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - if (exts->action) - if (tcf_action_copy_stats(skb, exts->action, 1) < 0) - goto nla_put_failure; + struct tc_action *a = tcf_exts_first_act(exts); + if (tcf_action_copy_stats(skb, a, 1) < 0) + return -1; #endif return 0; -nla_put_failure: __attribute__ ((unused)) - return -1; } EXPORT_SYMBOL(tcf_exts_dump_stats); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index ea1f70b5a5f..0ae1813e3e9 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -34,16 +34,11 @@ struct basic_filter { struct list_head link; }; -static const struct tcf_ext_map basic_ext_map = { - .action = TCA_BASIC_ACT, - .police = TCA_BASIC_POLICE -}; - static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { int r; - struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct basic_filter *f; list_for_each_entry(f, &head->flist, link) { @@ -61,7 +56,7 @@ static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, static unsigned long basic_get(struct tcf_proto *tp, u32 handle) { unsigned long l = 0UL; - struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct basic_filter *f; if (head == NULL) @@ -112,7 +107,7 @@ static void basic_destroy(struct tcf_proto *tp) static int basic_delete(struct tcf_proto *tp, unsigned long arg) { - struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct basic_filter *t, *f = (struct basic_filter *) arg; list_for_each_entry(t, &head->flist, link) @@ -132,15 +127,17 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = { [TCA_BASIC_EMATCHES] = { .type = NLA_NESTED }, }; -static int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f, - unsigned long base, struct nlattr **tb, - struct nlattr *est) +static int basic_set_parms(struct net *net, struct tcf_proto *tp, + struct basic_filter *f, unsigned long base, + struct nlattr **tb, + struct nlattr *est, bool ovr) { - int err = -EINVAL; + int err; struct tcf_exts e; struct tcf_ematch_tree t; - err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map); + tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); if (err < 0) return err; @@ -162,11 +159,12 @@ errout: return err; } -static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, unsigned long *arg) +static int basic_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, + struct nlattr **tca, unsigned long *arg, bool ovr) { int err; - struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct nlattr *tb[TCA_BASIC_MAX + 1]; struct basic_filter *f = (struct basic_filter *) *arg; @@ -181,7 +179,7 @@ static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle, if (f != NULL) { if (handle && f->handle != handle) return -EINVAL; - return basic_set_parms(tp, f, base, tb, tca[TCA_RATE]); + return basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); } err = -ENOBUFS; @@ -189,6 +187,7 @@ static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle, if (f == NULL) goto errout; + tcf_exts_init(&f->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); err = -EINVAL; if (handle) f->handle = handle; @@ -207,7 +206,7 @@ static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle, f->handle = head->hgenerator; } - err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE]); + err = basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); if (err < 0) goto errout; @@ -226,7 +225,7 @@ errout: static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct basic_head *head = (struct basic_head *) tp->root; + struct basic_head *head = tp->root; struct basic_filter *f; list_for_each_entry(f, &head->flist, link) { @@ -242,7 +241,7 @@ skip: } } -static int basic_dump(struct tcf_proto *tp, unsigned long fh, +static int basic_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct basic_filter *f = (struct basic_filter *) fh; @@ -257,16 +256,17 @@ static int basic_dump(struct tcf_proto *tp, unsigned long fh, if (nest == NULL) goto nla_put_failure; - if (f->res.classid) - NLA_PUT_U32(skb, TCA_BASIC_CLASSID, f->res.classid); + if (f->res.classid && + nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid)) + goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 || + if (tcf_exts_dump(skb, &f->exts) < 0 || tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts, &basic_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c new file mode 100644 index 00000000000..13f64df2c71 --- /dev/null +++ b/net/sched/cls_bpf.c @@ -0,0 +1,382 @@ +/* + * Berkeley Packet Filter based traffic classifier + * + * Might be used to classify traffic through flexible, user-defined and + * possibly JIT-ed BPF filters for traffic control as an alternative to + * ematches. + * + * (C) 2013 Daniel Borkmann <dborkman@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/filter.h> +#include <net/rtnetlink.h> +#include <net/pkt_cls.h> +#include <net/sock.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); +MODULE_DESCRIPTION("TC BPF based classifier"); + +struct cls_bpf_head { + struct list_head plist; + u32 hgen; +}; + +struct cls_bpf_prog { + struct sk_filter *filter; + struct sock_filter *bpf_ops; + struct tcf_exts exts; + struct tcf_result res; + struct list_head link; + u32 handle; + u16 bpf_len; +}; + +static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { + [TCA_BPF_CLASSID] = { .type = NLA_U32 }, + [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, + [TCA_BPF_OPS] = { .type = NLA_BINARY, + .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, +}; + +static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog; + int ret; + + list_for_each_entry(prog, &head->plist, link) { + int filter_res = SK_RUN_FILTER(prog->filter, skb); + + if (filter_res == 0) + continue; + + *res = prog->res; + if (filter_res != -1) + res->classid = filter_res; + + ret = tcf_exts_exec(skb, &prog->exts, res); + if (ret < 0) + continue; + + return ret; + } + + return -1; +} + +static int cls_bpf_init(struct tcf_proto *tp) +{ + struct cls_bpf_head *head; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + + INIT_LIST_HEAD(&head->plist); + tp->root = head; + + return 0; +} + +static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) +{ + tcf_unbind_filter(tp, &prog->res); + tcf_exts_destroy(tp, &prog->exts); + + sk_unattached_filter_destroy(prog->filter); + + kfree(prog->bpf_ops); + kfree(prog); +} + +static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog, *todel = (struct cls_bpf_prog *) arg; + + list_for_each_entry(prog, &head->plist, link) { + if (prog == todel) { + tcf_tree_lock(tp); + list_del(&prog->link); + tcf_tree_unlock(tp); + + cls_bpf_delete_prog(tp, prog); + return 0; + } + } + + return -ENOENT; +} + +static void cls_bpf_destroy(struct tcf_proto *tp) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog, *tmp; + + list_for_each_entry_safe(prog, tmp, &head->plist, link) { + list_del(&prog->link); + cls_bpf_delete_prog(tp, prog); + } + + kfree(head); +} + +static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog; + unsigned long ret = 0UL; + + if (head == NULL) + return 0UL; + + list_for_each_entry(prog, &head->plist, link) { + if (prog->handle == handle) { + ret = (unsigned long) prog; + break; + } + } + + return ret; +} + +static void cls_bpf_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, + struct cls_bpf_prog *prog, + unsigned long base, struct nlattr **tb, + struct nlattr *est, bool ovr) +{ + struct sock_filter *bpf_ops, *bpf_old; + struct tcf_exts exts; + struct sock_fprog_kern tmp; + struct sk_filter *fp, *fp_old; + u16 bpf_size, bpf_len; + u32 classid; + int ret; + + if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID]) + return -EINVAL; + + tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); + ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); + if (ret < 0) + return ret; + + classid = nla_get_u32(tb[TCA_BPF_CLASSID]); + bpf_len = nla_get_u16(tb[TCA_BPF_OPS_LEN]); + if (bpf_len > BPF_MAXINSNS || bpf_len == 0) { + ret = -EINVAL; + goto errout; + } + + bpf_size = bpf_len * sizeof(*bpf_ops); + bpf_ops = kzalloc(bpf_size, GFP_KERNEL); + if (bpf_ops == NULL) { + ret = -ENOMEM; + goto errout; + } + + memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size); + + tmp.len = bpf_len; + tmp.filter = bpf_ops; + + ret = sk_unattached_filter_create(&fp, &tmp); + if (ret) + goto errout_free; + + tcf_tree_lock(tp); + fp_old = prog->filter; + bpf_old = prog->bpf_ops; + + prog->bpf_len = bpf_len; + prog->bpf_ops = bpf_ops; + prog->filter = fp; + prog->res.classid = classid; + tcf_tree_unlock(tp); + + tcf_bind_filter(tp, &prog->res, base); + tcf_exts_change(tp, &prog->exts, &exts); + + if (fp_old) + sk_unattached_filter_destroy(fp_old); + if (bpf_old) + kfree(bpf_old); + + return 0; + +errout_free: + kfree(bpf_ops); +errout: + tcf_exts_destroy(tp, &exts); + return ret; +} + +static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, + struct cls_bpf_head *head) +{ + unsigned int i = 0x80000000; + + do { + if (++head->hgen == 0x7FFFFFFF) + head->hgen = 1; + } while (--i > 0 && cls_bpf_get(tp, head->hgen)); + if (i == 0) + pr_err("Insufficient number of handles\n"); + + return i; +} + +static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, + u32 handle, struct nlattr **tca, + unsigned long *arg, bool ovr) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog = (struct cls_bpf_prog *) *arg; + struct nlattr *tb[TCA_BPF_MAX + 1]; + int ret; + + if (tca[TCA_OPTIONS] == NULL) + return -EINVAL; + + ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy); + if (ret < 0) + return ret; + + if (prog != NULL) { + if (handle && prog->handle != handle) + return -EINVAL; + return cls_bpf_modify_existing(net, tp, prog, base, tb, + tca[TCA_RATE], ovr); + } + + prog = kzalloc(sizeof(*prog), GFP_KERNEL); + if (prog == NULL) + return -ENOBUFS; + + tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE); + if (handle == 0) + prog->handle = cls_bpf_grab_new_handle(tp, head); + else + prog->handle = handle; + if (prog->handle == 0) { + ret = -EINVAL; + goto errout; + } + + ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr); + if (ret < 0) + goto errout; + + tcf_tree_lock(tp); + list_add(&prog->link, &head->plist); + tcf_tree_unlock(tp); + + *arg = (unsigned long) prog; + + return 0; +errout: + if (*arg == 0UL && prog) + kfree(prog); + + return ret; +} + +static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *tm) +{ + struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh; + struct nlattr *nest, *nla; + + if (prog == NULL) + return skb->len; + + tm->tcm_handle = prog->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) + goto nla_put_failure; + if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_len)) + goto nla_put_failure; + + nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_len * + sizeof(struct sock_filter)); + if (nla == NULL) + goto nla_put_failure; + + memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); + + if (tcf_exts_dump(skb, &prog->exts) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &prog->exts) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog; + + list_for_each_entry(prog, &head->plist, link) { + if (arg->count < arg->skip) + goto skip; + if (arg->fn(tp, (unsigned long) prog, arg) < 0) { + arg->stop = 1; + break; + } +skip: + arg->count++; + } +} + +static struct tcf_proto_ops cls_bpf_ops __read_mostly = { + .kind = "bpf", + .owner = THIS_MODULE, + .classify = cls_bpf_classify, + .init = cls_bpf_init, + .destroy = cls_bpf_destroy, + .get = cls_bpf_get, + .put = cls_bpf_put, + .change = cls_bpf_change, + .delete = cls_bpf_delete, + .walk = cls_bpf_walk, + .dump = cls_bpf_dump, +}; + +static int __init cls_bpf_init_mod(void) +{ + return register_tcf_proto_ops(&cls_bpf_ops); +} + +static void __exit cls_bpf_exit_mod(void) +{ + unregister_tcf_proto_ops(&cls_bpf_ops); +} + +module_init(cls_bpf_init_mod); +module_exit(cls_bpf_exit_mod); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index f84fdc3a7f2..cacf01bd04f 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -11,90 +11,13 @@ #include <linux/module.h> #include <linux/slab.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/errno.h> #include <linux/skbuff.h> -#include <linux/cgroup.h> #include <linux/rcupdate.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/sock.h> #include <net/cls_cgroup.h> -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, - struct cgroup *cgrp); -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); -static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); - -struct cgroup_subsys net_cls_subsys = { - .name = "net_cls", - .create = cgrp_create, - .destroy = cgrp_destroy, - .populate = cgrp_populate, -#ifdef CONFIG_NET_CLS_CGROUP - .subsys_id = net_cls_subsys_id, -#endif - .module = THIS_MODULE, -}; - - -static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id), - struct cgroup_cls_state, css); -} - -static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) -{ - return container_of(task_subsys_state(p, net_cls_subsys_id), - struct cgroup_cls_state, css); -} - -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, - struct cgroup *cgrp) -{ - struct cgroup_cls_state *cs; - - cs = kzalloc(sizeof(*cs), GFP_KERNEL); - if (!cs) - return ERR_PTR(-ENOMEM); - - if (cgrp->parent) - cs->classid = cgrp_cls_state(cgrp->parent)->classid; - - return &cs->css; -} - -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - kfree(cgrp_cls_state(cgrp)); -} - -static u64 read_classid(struct cgroup *cgrp, struct cftype *cft) -{ - return cgrp_cls_state(cgrp)->classid; -} - -static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value) -{ - cgrp_cls_state(cgrp)->classid = (u32) value; - return 0; -} - -static struct cftype ss_files[] = { - { - .name = "classid", - .read_u64 = read_classid, - .write_u64 = write_classid, - }, -}; - -static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files)); -} - struct cls_cgroup_head { u32 handle; struct tcf_exts exts; @@ -153,18 +76,14 @@ static int cls_cgroup_init(struct tcf_proto *tp) return 0; } -static const struct tcf_ext_map cgroup_ext_map = { - .action = TCA_CGROUP_ACT, - .police = TCA_CGROUP_POLICE, -}; - static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = { [TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED }, }; -static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base, +static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg) + unsigned long *arg, bool ovr) { struct nlattr *tb[TCA_CGROUP_MAX + 1]; struct cls_cgroup_head *head = tp->root; @@ -183,6 +102,7 @@ static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base, if (head == NULL) return -ENOBUFS; + tcf_exts_init(&head->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); head->handle = handle; tcf_tree_lock(tp); @@ -198,7 +118,8 @@ static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base, if (err < 0) return err; - err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &cgroup_ext_map); + tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); if (err < 0) return err; @@ -243,7 +164,7 @@ skip: arg->count++; } -static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh, +static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct cls_cgroup_head *head = tp->root; @@ -256,13 +177,13 @@ static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh, if (nest == NULL) goto nla_put_failure; - if (tcf_exts_dump(skb, &head->exts, &cgroup_ext_map) < 0 || + if (tcf_exts_dump(skb, &head->exts) < 0 || tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &head->exts, &cgroup_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &head->exts) < 0) goto nla_put_failure; return skb->len; @@ -288,36 +209,12 @@ static struct tcf_proto_ops cls_cgroup_ops __read_mostly = { static int __init init_cgroup_cls(void) { - int ret; - - ret = cgroup_load_subsys(&net_cls_subsys); - if (ret) - goto out; - -#ifndef CONFIG_NET_CLS_CGROUP - /* We can't use rcu_assign_pointer because this is an int. */ - smp_wmb(); - net_cls_subsys_id = net_cls_subsys.subsys_id; -#endif - - ret = register_tcf_proto_ops(&cls_cgroup_ops); - if (ret) - cgroup_unload_subsys(&net_cls_subsys); - -out: - return ret; + return register_tcf_proto_ops(&cls_cgroup_ops); } static void __exit exit_cgroup_cls(void) { unregister_tcf_proto_ops(&cls_cgroup_ops); - -#ifndef CONFIG_NET_CLS_CGROUP - net_cls_subsys_id = -1; - synchronize_rcu(); -#endif - - cgroup_unload_subsys(&net_cls_subsys); } module_init(init_cgroup_cls); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 1d8bd0dbcd1..35be16f7c19 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -56,11 +56,6 @@ struct flow_filter { u32 hashrnd; }; -static const struct tcf_ext_map flow_ext_map = { - .action = TCA_FLOW_ACT, - .police = TCA_FLOW_POLICE, -}; - static inline u32 addr_fold(void *addr) { unsigned long a = (unsigned long)addr; @@ -193,15 +188,19 @@ static u32 flow_get_rtclassid(const struct sk_buff *skb) static u32 flow_get_skuid(const struct sk_buff *skb) { - if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) - return skb->sk->sk_socket->file->f_cred->fsuid; + if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { + kuid_t skuid = skb->sk->sk_socket->file->f_cred->fsuid; + return from_kuid(&init_user_ns, skuid); + } return 0; } static u32 flow_get_skgid(const struct sk_buff *skb) { - if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) - return skb->sk->sk_socket->file->f_cred->fsgid; + if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { + kgid_t skgid = skb->sk->sk_socket->file->f_cred->fsgid; + return from_kgid(&init_user_ns, skgid); + } return 0; } @@ -216,7 +215,7 @@ static u32 flow_get_vlan_tag(const struct sk_buff *skb) static u32 flow_get_rxhash(struct sk_buff *skb) { - return skb_get_rxhash(skb); + return skb_get_hash(skb); } static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow) @@ -347,9 +346,10 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_PERTURB] = { .type = NLA_U32 }, }; -static int flow_change(struct tcf_proto *tp, unsigned long base, +static int flow_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg) + unsigned long *arg, bool ovr) { struct flow_head *head = tp->root; struct flow_filter *f; @@ -386,9 +386,14 @@ static int flow_change(struct tcf_proto *tp, unsigned long base, if (fls(keymask) - 1 > FLOW_KEY_MAX) return -EOPNOTSUPP; + + if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) && + sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns) + return -EOPNOTSUPP; } - err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &flow_ext_map); + tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); if (err < 0) return err; @@ -446,6 +451,7 @@ static int flow_change(struct tcf_proto *tp, unsigned long base, f->handle = handle; f->mask = ~0U; + tcf_exts_init(&f->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); get_random_bytes(&f->hashrnd, 4); f->perturb_timer.function = flow_perturbation; @@ -557,7 +563,7 @@ static void flow_put(struct tcf_proto *tp, unsigned long f) { } -static int flow_dump(struct tcf_proto *tp, unsigned long fh, +static int flow_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct flow_filter *f = (struct flow_filter *)fh; @@ -572,27 +578,34 @@ static int flow_dump(struct tcf_proto *tp, unsigned long fh, if (nest == NULL) goto nla_put_failure; - NLA_PUT_U32(skb, TCA_FLOW_KEYS, f->keymask); - NLA_PUT_U32(skb, TCA_FLOW_MODE, f->mode); + if (nla_put_u32(skb, TCA_FLOW_KEYS, f->keymask) || + nla_put_u32(skb, TCA_FLOW_MODE, f->mode)) + goto nla_put_failure; if (f->mask != ~0 || f->xor != 0) { - NLA_PUT_U32(skb, TCA_FLOW_MASK, f->mask); - NLA_PUT_U32(skb, TCA_FLOW_XOR, f->xor); + if (nla_put_u32(skb, TCA_FLOW_MASK, f->mask) || + nla_put_u32(skb, TCA_FLOW_XOR, f->xor)) + goto nla_put_failure; } - if (f->rshift) - NLA_PUT_U32(skb, TCA_FLOW_RSHIFT, f->rshift); - if (f->addend) - NLA_PUT_U32(skb, TCA_FLOW_ADDEND, f->addend); + if (f->rshift && + nla_put_u32(skb, TCA_FLOW_RSHIFT, f->rshift)) + goto nla_put_failure; + if (f->addend && + nla_put_u32(skb, TCA_FLOW_ADDEND, f->addend)) + goto nla_put_failure; - if (f->divisor) - NLA_PUT_U32(skb, TCA_FLOW_DIVISOR, f->divisor); - if (f->baseclass) - NLA_PUT_U32(skb, TCA_FLOW_BASECLASS, f->baseclass); + if (f->divisor && + nla_put_u32(skb, TCA_FLOW_DIVISOR, f->divisor)) + goto nla_put_failure; + if (f->baseclass && + nla_put_u32(skb, TCA_FLOW_BASECLASS, f->baseclass)) + goto nla_put_failure; - if (f->perturb_period) - NLA_PUT_U32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ); + if (f->perturb_period && + nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ)) + goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0) + if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; #ifdef CONFIG_NET_EMATCH if (f->ematches.hdr.nmatches && @@ -601,7 +614,7 @@ static int flow_dump(struct tcf_proto *tp, unsigned long fh, #endif nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 389af152ec4..861b03ccfed 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -29,11 +29,11 @@ #include <net/act_api.h> #include <net/pkt_cls.h> -#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *)) +#define HTSIZE 256 struct fw_head { - struct fw_filter *ht[HTSIZE]; - u32 mask; + u32 mask; + struct fw_filter *ht[HTSIZE]; }; struct fw_filter { @@ -41,46 +41,22 @@ struct fw_filter { u32 id; struct tcf_result res; #ifdef CONFIG_NET_CLS_IND - char indev[IFNAMSIZ]; + int ifindex; #endif /* CONFIG_NET_CLS_IND */ struct tcf_exts exts; }; -static const struct tcf_ext_map fw_ext_map = { - .action = TCA_FW_ACT, - .police = TCA_FW_POLICE -}; - -static inline int fw_hash(u32 handle) +static u32 fw_hash(u32 handle) { - if (HTSIZE == 4096) - return ((handle >> 24) & 0xFFF) ^ - ((handle >> 12) & 0xFFF) ^ - (handle & 0xFFF); - else if (HTSIZE == 2048) - return ((handle >> 22) & 0x7FF) ^ - ((handle >> 11) & 0x7FF) ^ - (handle & 0x7FF); - else if (HTSIZE == 1024) - return ((handle >> 20) & 0x3FF) ^ - ((handle >> 10) & 0x3FF) ^ - (handle & 0x3FF); - else if (HTSIZE == 512) - return (handle >> 27) ^ - ((handle >> 18) & 0x1FF) ^ - ((handle >> 9) & 0x1FF) ^ - (handle & 0x1FF); - else if (HTSIZE == 256) { - u8 *t = (u8 *) &handle; - return t[0] ^ t[1] ^ t[2] ^ t[3]; - } else - return handle & (HTSIZE - 1); + handle ^= (handle >> 16); + handle ^= (handle >> 8); + return handle % HTSIZE; } static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct fw_head *head = (struct fw_head *)tp->root; + struct fw_head *head = tp->root; struct fw_filter *f; int r; u32 id = skb->mark; @@ -91,7 +67,7 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (f->id == id) { *res = f->res; #ifdef CONFIG_NET_CLS_IND - if (!tcf_match_indev(skb, f->indev)) + if (!tcf_match_indev(skb, f->ifindex)) continue; #endif /* CONFIG_NET_CLS_IND */ r = tcf_exts_exec(skb, &f->exts, res); @@ -116,7 +92,7 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, static unsigned long fw_get(struct tcf_proto *tp, u32 handle) { - struct fw_head *head = (struct fw_head *)tp->root; + struct fw_head *head = tp->root; struct fw_filter *f; if (head == NULL) @@ -165,7 +141,7 @@ static void fw_destroy(struct tcf_proto *tp) static int fw_delete(struct tcf_proto *tp, unsigned long arg) { - struct fw_head *head = (struct fw_head *)tp->root; + struct fw_head *head = tp->root; struct fw_filter *f = (struct fw_filter *)arg; struct fw_filter **fp; @@ -192,19 +168,19 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { }; static int -fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f, - struct nlattr **tb, struct nlattr **tca, unsigned long base) +fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, + struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr) { - struct fw_head *head = (struct fw_head *)tp->root; + struct fw_head *head = tp->root; struct tcf_exts e; u32 mask; int err; - err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &fw_ext_map); + tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); if (err < 0) return err; - err = -EINVAL; if (tb[TCA_FW_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]); tcf_bind_filter(tp, &f->res, base); @@ -212,12 +188,17 @@ fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f, #ifdef CONFIG_NET_CLS_IND if (tb[TCA_FW_INDEV]) { - err = tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV]); - if (err < 0) + int ret; + ret = tcf_change_indev(net, tb[TCA_FW_INDEV]); + if (ret < 0) { + err = ret; goto errout; + } + f->ifindex = ret; } #endif /* CONFIG_NET_CLS_IND */ + err = -EINVAL; if (tb[TCA_FW_MASK]) { mask = nla_get_u32(tb[TCA_FW_MASK]); if (mask != head->mask) @@ -233,12 +214,13 @@ errout: return err; } -static int fw_change(struct tcf_proto *tp, unsigned long base, +static int fw_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg) + unsigned long *arg, bool ovr) { - struct fw_head *head = (struct fw_head *)tp->root; + struct fw_head *head = tp->root; struct fw_filter *f = (struct fw_filter *) *arg; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_FW_MAX + 1]; @@ -254,7 +236,7 @@ static int fw_change(struct tcf_proto *tp, unsigned long base, if (f != NULL) { if (f->id != handle && handle) return -EINVAL; - return fw_change_attrs(tp, f, tb, tca, base); + return fw_change_attrs(net, tp, f, tb, tca, base, ovr); } if (!handle) @@ -279,9 +261,10 @@ static int fw_change(struct tcf_proto *tp, unsigned long base, if (f == NULL) return -ENOBUFS; + tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE); f->id = handle; - err = fw_change_attrs(tp, f, tb, tca, base); + err = fw_change_attrs(net, tp, f, tb, tca, base, ovr); if (err < 0) goto errout; @@ -300,7 +283,7 @@ errout: static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct fw_head *head = (struct fw_head *)tp->root; + struct fw_head *head = tp->root; int h; if (head == NULL) @@ -326,10 +309,10 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int fw_dump(struct tcf_proto *tp, unsigned long fh, +static int fw_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct fw_head *head = (struct fw_head *)tp->root; + struct fw_head *head = tp->root; struct fw_filter *f = (struct fw_filter *)fh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; @@ -346,21 +329,27 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh, if (nest == NULL) goto nla_put_failure; - if (f->res.classid) - NLA_PUT_U32(skb, TCA_FW_CLASSID, f->res.classid); + if (f->res.classid && + nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid)) + goto nla_put_failure; #ifdef CONFIG_NET_CLS_IND - if (strlen(f->indev)) - NLA_PUT_STRING(skb, TCA_FW_INDEV, f->indev); + if (f->ifindex) { + struct net_device *dev; + dev = __dev_get_by_index(net, f->ifindex); + if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name)) + goto nla_put_failure; + } #endif /* CONFIG_NET_CLS_IND */ - if (head->mask != 0xFFFFFFFF) - NLA_PUT_U32(skb, TCA_FW_MASK, head->mask); + if (head->mask != 0xFFFFFFFF && + nla_put_u32(skb, TCA_FW_MASK, head->mask)) + goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0) + if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 13ab66e9df5..dd9fc2523c7 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -59,11 +59,6 @@ struct route4_filter { #define ROUTE4_FAILURE ((struct route4_filter *)(-1L)) -static const struct tcf_ext_map route_ext_map = { - .police = TCA_ROUTE4_POLICE, - .action = TCA_ROUTE4_ACT -}; - static inline int route4_fastmap_hash(u32 id, int iif) { return id & 0xF; @@ -128,7 +123,7 @@ static inline int route4_hash_wild(void) static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct route4_head *head = (struct route4_head *)tp->root; + struct route4_head *head = tp->root; struct dst_entry *dst; struct route4_bucket *b; struct route4_filter *f; @@ -143,7 +138,7 @@ static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (head == NULL) goto old_method; - iif = ((struct rtable *)dst)->rt_iif; + iif = inet_iif(skb); h = route4_fastmap_hash(id, iif); if (id == head->fastmap[h].id && @@ -218,7 +213,7 @@ static inline u32 from_hash(u32 id) static unsigned long route4_get(struct tcf_proto *tp, u32 handle) { - struct route4_head *head = (struct route4_head *)tp->root; + struct route4_head *head = tp->root; struct route4_bucket *b; struct route4_filter *f; unsigned int h1, h2; @@ -289,7 +284,7 @@ static void route4_destroy(struct tcf_proto *tp) static int route4_delete(struct tcf_proto *tp, unsigned long arg) { - struct route4_head *head = (struct route4_head *)tp->root; + struct route4_head *head = tp->root; struct route4_filter **fp, *f = (struct route4_filter *)arg; unsigned int h = 0; struct route4_bucket *b; @@ -335,9 +330,11 @@ static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = { [TCA_ROUTE4_IIF] = { .type = NLA_U32 }, }; -static int route4_set_parms(struct tcf_proto *tp, unsigned long base, - struct route4_filter *f, u32 handle, struct route4_head *head, - struct nlattr **tb, struct nlattr *est, int new) +static int route4_set_parms(struct net *net, struct tcf_proto *tp, + unsigned long base, struct route4_filter *f, + u32 handle, struct route4_head *head, + struct nlattr **tb, struct nlattr *est, int new, + bool ovr) { int err; u32 id = 0, to = 0, nhandle = 0x8000; @@ -346,7 +343,8 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base, struct route4_bucket *b; struct tcf_exts e; - err = tcf_exts_validate(tp, tb, est, &e, &route_ext_map); + tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); if (err < 0) return err; @@ -427,10 +425,11 @@ errout: return err; } -static int route4_change(struct tcf_proto *tp, unsigned long base, +static int route4_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg) + unsigned long *arg, bool ovr) { struct route4_head *head = tp->root; struct route4_filter *f, *f1, **fp; @@ -456,8 +455,8 @@ static int route4_change(struct tcf_proto *tp, unsigned long base, if (f->bkt) old_handle = f->handle; - err = route4_set_parms(tp, base, f, handle, head, tb, - tca[TCA_RATE], 0); + err = route4_set_parms(net, tp, base, f, handle, head, tb, + tca[TCA_RATE], 0, ovr); if (err < 0) return err; @@ -479,8 +478,9 @@ static int route4_change(struct tcf_proto *tp, unsigned long base, if (f == NULL) goto errout; - err = route4_set_parms(tp, base, f, handle, head, tb, - tca[TCA_RATE], 1); + tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); + err = route4_set_parms(net, tp, base, f, handle, head, tb, + tca[TCA_RATE], 1, ovr); if (err < 0) goto errout; @@ -552,7 +552,7 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int route4_dump(struct tcf_proto *tp, unsigned long fh, +static int route4_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct route4_filter *f = (struct route4_filter *)fh; @@ -571,24 +571,28 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh, if (!(f->handle & 0x8000)) { id = f->id & 0xFF; - NLA_PUT_U32(skb, TCA_ROUTE4_TO, id); + if (nla_put_u32(skb, TCA_ROUTE4_TO, id)) + goto nla_put_failure; } if (f->handle & 0x80000000) { - if ((f->handle >> 16) != 0xFFFF) - NLA_PUT_U32(skb, TCA_ROUTE4_IIF, f->iif); + if ((f->handle >> 16) != 0xFFFF && + nla_put_u32(skb, TCA_ROUTE4_IIF, f->iif)) + goto nla_put_failure; } else { id = f->id >> 16; - NLA_PUT_U32(skb, TCA_ROUTE4_FROM, id); + if (nla_put_u32(skb, TCA_ROUTE4_FROM, id)) + goto nla_put_failure; } - if (f->res.classid) - NLA_PUT_U32(skb, TCA_ROUTE4_CLASSID, f->res.classid); + if (f->res.classid && + nla_put_u32(skb, TCA_ROUTE4_CLASSID, f->res.classid)) + goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0) + if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index b01427924f8..1020e233a5d 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -116,11 +116,6 @@ static inline unsigned int hash_src(__be32 *src) return h & 0xF; } -static struct tcf_ext_map rsvp_ext_map = { - .police = TCA_RSVP_POLICE, - .action = TCA_RSVP_ACT -}; - #define RSVP_APPLY_RESULT() \ { \ int r = tcf_exts_exec(skb, &f->exts, res); \ @@ -416,10 +411,11 @@ static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = { [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) }, }; -static int rsvp_change(struct tcf_proto *tp, unsigned long base, +static int rsvp_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg) + unsigned long *arg, bool ovr) { struct rsvp_head *data = tp->root; struct rsvp_filter *f, **fp; @@ -439,7 +435,8 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base, if (err < 0) return err; - err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &rsvp_ext_map); + tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); if (err < 0) return err; @@ -470,6 +467,7 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base, if (f == NULL) goto errout2; + tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); h2 = 16; if (tb[TCA_RSVP_SRC]) { memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src)); @@ -596,7 +594,7 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, +static int rsvp_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct rsvp_filter *f = (struct rsvp_filter *)fh; @@ -615,25 +613,29 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, if (nest == NULL) goto nla_put_failure; - NLA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst); + if (nla_put(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst)) + goto nla_put_failure; pinfo.dpi = s->dpi; pinfo.spi = f->spi; pinfo.protocol = s->protocol; pinfo.tunnelid = s->tunnelid; pinfo.tunnelhdr = f->tunnelhdr; pinfo.pad = 0; - NLA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); - if (f->res.classid) - NLA_PUT_U32(skb, TCA_RSVP_CLASSID, f->res.classid); - if (((f->handle >> 8) & 0xFF) != 16) - NLA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src); + if (nla_put(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo)) + goto nla_put_failure; + if (f->res.classid && + nla_put_u32(skb, TCA_RSVP_CLASSID, f->res.classid)) + goto nla_put_failure; + if (((f->handle >> 8) & 0xFF) != 16 && + nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src)) + goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0) + if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index dbe199234c6..c721cd4a469 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -24,9 +24,6 @@ #define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ -#define PRIV(tp) ((struct tcindex_data *) (tp)->root) - - struct tcindex_filter_result { struct tcf_exts exts; struct tcf_result res; @@ -50,11 +47,6 @@ struct tcindex_data { int fall_through; /* 0: only classify if explicit match */ }; -static const struct tcf_ext_map tcindex_ext_map = { - .police = TCA_TCINDEX_POLICE, - .action = TCA_TCINDEX_ACT -}; - static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) { @@ -82,7 +74,7 @@ tcindex_lookup(struct tcindex_data *p, u16 key) static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *f; int key = (skb->tc_index & p->mask) >> p->shift; @@ -107,7 +99,7 @@ static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *r; pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); @@ -145,7 +137,7 @@ static int tcindex_init(struct tcf_proto *tp) static int __tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; struct tcindex_filter *f = NULL; @@ -196,10 +188,17 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 }, }; +static void tcindex_filter_result_init(struct tcindex_filter_result *r) +{ + memset(r, 0, sizeof(*r)); + tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); +} + static int -tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, - struct tcindex_data *p, struct tcindex_filter_result *r, - struct nlattr **tb, struct nlattr *est) +tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, + u32 handle, struct tcindex_data *p, + struct tcindex_filter_result *r, struct nlattr **tb, + struct nlattr *est, bool ovr) { int err, balloc = 0; struct tcindex_filter_result new_filter_result, *old_r = r; @@ -208,17 +207,17 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_filter *f = NULL; /* make gcc behave */ struct tcf_exts e; - err = tcf_exts_validate(tp, tb, est, &e, &tcindex_ext_map); + tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); if (err < 0) return err; memcpy(&cp, p, sizeof(cp)); - memset(&new_filter_result, 0, sizeof(new_filter_result)); + tcindex_filter_result_init(&new_filter_result); + tcindex_filter_result_init(&cr); if (old_r) - memcpy(&cr, r, sizeof(cr)); - else - memset(&cr, 0, sizeof(cr)); + cr.res = r->res; if (tb[TCA_TCINDEX_HASH]) cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); @@ -270,9 +269,14 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, err = -ENOMEM; if (!cp.perfect && !cp.h) { if (valid_perfect_hash(&cp)) { + int i; + cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL); if (!cp.perfect) goto errout; + for (i = 0; i < cp.hash; i++) + tcf_exts_init(&cp.perfect[i].exts, TCA_TCINDEX_ACT, + TCA_TCINDEX_POLICE); balloc = 1; } else { cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL); @@ -298,14 +302,17 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, tcf_bind_filter(tp, &cr.res, base); } - tcf_exts_change(tp, &cr.exts, &e); + if (old_r) + tcf_exts_change(tp, &r->exts, &e); + else + tcf_exts_change(tp, &cr.exts, &e); tcf_tree_lock(tp); if (old_r && old_r != r) - memset(old_r, 0, sizeof(*old_r)); + tcindex_filter_result_init(old_r); memcpy(p, &cp, sizeof(cp)); - memcpy(r, &cr, sizeof(cr)); + r->res = cr.res; if (r == &new_filter_result) { struct tcindex_filter **fp; @@ -332,12 +339,13 @@ errout: } static int -tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, unsigned long *arg) +tcindex_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, + struct nlattr **tca, unsigned long *arg, bool ovr) { struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_TCINDEX_MAX + 1]; - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg; int err; @@ -352,13 +360,14 @@ tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle, if (err < 0) return err; - return tcindex_set_parms(tp, base, handle, p, r, tb, tca[TCA_RATE]); + return tcindex_set_parms(net, tp, base, handle, p, r, tb, + tca[TCA_RATE], ovr); } static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter *f, *next; int i; @@ -405,7 +414,7 @@ static int tcindex_destroy_element(struct tcf_proto *tp, static void tcindex_destroy(struct tcf_proto *tp) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcf_walker walker; pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); @@ -420,10 +429,10 @@ static void tcindex_destroy(struct tcf_proto *tp) } -static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, +static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct tcindex_data *p = PRIV(tp); + struct tcindex_data *p = tp->root; struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; @@ -438,10 +447,11 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, if (!fh) { t->tcm_handle = ~0; /* whatever ... */ - NLA_PUT_U32(skb, TCA_TCINDEX_HASH, p->hash); - NLA_PUT_U16(skb, TCA_TCINDEX_MASK, p->mask); - NLA_PUT_U32(skb, TCA_TCINDEX_SHIFT, p->shift); - NLA_PUT_U32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through); + if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) || + nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) || + nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) || + nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through)) + goto nla_put_failure; nla_nest_end(skb, nest); } else { if (p->perfect) { @@ -460,14 +470,15 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, } } pr_debug("handle = %d\n", t->tcm_handle); - if (r->res.class) - NLA_PUT_U32(skb, TCA_TCINDEX_CLASSID, r->res.classid); + if (r->res.class && + nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid)) + goto nla_put_failure; - if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0) + if (tcf_exts_dump(skb, &r->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &r->exts) < 0) goto nla_put_failure; } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 939b627b479..70c0be8d012 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -38,6 +38,7 @@ #include <linux/errno.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> +#include <linux/bitmap.h> #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> @@ -48,7 +49,7 @@ struct tc_u_knode { struct tc_u_hnode *ht_up; struct tcf_exts exts; #ifdef CONFIG_NET_CLS_IND - char indev[IFNAMSIZ]; + int ifindex; #endif u8 fshift; struct tcf_result res; @@ -79,11 +80,6 @@ struct tc_u_common { u32 hgenerator; }; -static const struct tcf_ext_map u32_ext_map = { - .action = TCA_U32_ACT, - .police = TCA_U32_POLICE -}; - static inline unsigned int u32_hash_fold(__be32 key, const struct tc_u32_sel *sel, u8 fshift) @@ -100,7 +96,7 @@ static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct unsigned int off; } stack[TC_U32_MAXDEPTH]; - struct tc_u_hnode *ht = (struct tc_u_hnode *)tp->root; + struct tc_u_hnode *ht = tp->root; unsigned int off = skb_network_offset(skb); struct tc_u_knode *n; int sdepth = 0; @@ -157,7 +153,7 @@ check_terminal: *res = n->res; #ifdef CONFIG_NET_CLS_IND - if (!tcf_match_indev(skb, n->indev)) { + if (!tcf_match_indev(skb, n->ifindex)) { n = n->next; goto next_knode; } @@ -234,8 +230,7 @@ out: return -1; deadloop: - if (net_ratelimit()) - pr_warning("cls_u32: dead loop\n"); + net_warn_ratelimited("cls_u32: dead loop\n"); return -1; } @@ -353,7 +348,7 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) return 0; } -static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) +static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) { struct tc_u_knode **kp; struct tc_u_hnode *ht = key->ht_up; @@ -466,17 +461,25 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg) return 0; } +#define NR_U32_NODE (1<<12) static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) { struct tc_u_knode *n; - unsigned int i = 0x7FF; + unsigned long i; + unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long), + GFP_KERNEL); + if (!bitmap) + return handle | 0xFFF; for (n = ht->ht[TC_U32_HASH(handle)]; n; n = n->next) - if (i < TC_U32_NODE(n->handle)) - i = TC_U32_NODE(n->handle); - i++; + set_bit(TC_U32_NODE(n->handle), bitmap); + + i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800); + if (i >= NR_U32_NODE) + i = find_next_zero_bit(bitmap, NR_U32_NODE, 1); - return handle | (i > 0xFFF ? 0xFFF : i); + kfree(bitmap); + return handle | (i >= NR_U32_NODE ? 0xFFF : i); } static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { @@ -489,15 +492,16 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { [TCA_U32_MARK] = { .len = sizeof(struct tc_u32_mark) }, }; -static int u32_set_parms(struct tcf_proto *tp, unsigned long base, - struct tc_u_hnode *ht, +static int u32_set_parms(struct net *net, struct tcf_proto *tp, + unsigned long base, struct tc_u_hnode *ht, struct tc_u_knode *n, struct nlattr **tb, - struct nlattr *est) + struct nlattr *est, bool ovr) { int err; struct tcf_exts e; - err = tcf_exts_validate(tp, tb, est, &e, &u32_ext_map); + tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); if (err < 0) return err; @@ -532,9 +536,11 @@ static int u32_set_parms(struct tcf_proto *tp, unsigned long base, #ifdef CONFIG_NET_CLS_IND if (tb[TCA_U32_INDEV]) { - err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV]); - if (err < 0) + int ret; + ret = tcf_change_indev(net, tb[TCA_U32_INDEV]); + if (ret < 0) goto errout; + n->ifindex = ret; } #endif tcf_exts_change(tp, &n->exts, &e); @@ -545,9 +551,10 @@ errout: return err; } -static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, +static int u32_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg) + unsigned long *arg, bool ovr) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; @@ -570,7 +577,8 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, if (TC_U32_KEY(n->handle) == 0) return -EINVAL; - return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE]); + return u32_set_parms(net, tp, base, n->ht_up, n, tb, + tca[TCA_RATE], ovr); } if (tb[TCA_U32_DIVISOR]) { @@ -645,6 +653,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, n->ht_up = ht; n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; + tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); #ifdef CONFIG_CLS_U32_MARK if (tb[TCA_U32_MARK]) { @@ -656,7 +665,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, } #endif - err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE]); + err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr); if (err == 0) { struct tc_u_knode **ins; for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) @@ -714,7 +723,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int u32_dump(struct tcf_proto *tp, unsigned long fh, +static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct tc_u_knode *n = (struct tc_u_knode *)fh; @@ -733,43 +742,54 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh, struct tc_u_hnode *ht = (struct tc_u_hnode *)fh; u32 divisor = ht->divisor + 1; - NLA_PUT_U32(skb, TCA_U32_DIVISOR, divisor); + if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor)) + goto nla_put_failure; } else { - NLA_PUT(skb, TCA_U32_SEL, - sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), - &n->sel); + if (nla_put(skb, TCA_U32_SEL, + sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), + &n->sel)) + goto nla_put_failure; if (n->ht_up) { u32 htid = n->handle & 0xFFFFF000; - NLA_PUT_U32(skb, TCA_U32_HASH, htid); + if (nla_put_u32(skb, TCA_U32_HASH, htid)) + goto nla_put_failure; } - if (n->res.classid) - NLA_PUT_U32(skb, TCA_U32_CLASSID, n->res.classid); - if (n->ht_down) - NLA_PUT_U32(skb, TCA_U32_LINK, n->ht_down->handle); + if (n->res.classid && + nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid)) + goto nla_put_failure; + if (n->ht_down && + nla_put_u32(skb, TCA_U32_LINK, n->ht_down->handle)) + goto nla_put_failure; #ifdef CONFIG_CLS_U32_MARK - if (n->mark.val || n->mark.mask) - NLA_PUT(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark); + if ((n->mark.val || n->mark.mask) && + nla_put(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark)) + goto nla_put_failure; #endif - if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0) + if (tcf_exts_dump(skb, &n->exts) < 0) goto nla_put_failure; #ifdef CONFIG_NET_CLS_IND - if (strlen(n->indev)) - NLA_PUT_STRING(skb, TCA_U32_INDEV, n->indev); + if (n->ifindex) { + struct net_device *dev; + dev = __dev_get_by_index(net, n->ifindex); + if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name)) + goto nla_put_failure; + } #endif #ifdef CONFIG_CLS_U32_PERF - NLA_PUT(skb, TCA_U32_PCNT, - sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), - n->pf); + if (nla_put(skb, TCA_U32_PCNT, + sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), + n->pf)) + goto nla_put_failure; #endif } nla_nest_end(skb, nest); if (TC_U32_KEY(n->handle)) - if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0) + if (tcf_exts_dump_stats(skb, &n->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c new file mode 100644 index 00000000000..bfd34e4c1af --- /dev/null +++ b/net/sched/em_canid.c @@ -0,0 +1,240 @@ +/* + * em_canid.c Ematch rule to match CAN frames according to their CAN IDs + * + * This program is free software; you can distribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Idea: Oliver Hartkopp <oliver.hartkopp@volkswagen.de> + * Copyright: (c) 2011 Czech Technical University in Prague + * (c) 2011 Volkswagen Group Research + * Authors: Michal Sojka <sojkam1@fel.cvut.cz> + * Pavel Pisa <pisa@cmp.felk.cvut.cz> + * Rostislav Lisovy <lisovy@gmail.cz> + * Funded by: Volkswagen Group Research + */ + +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <net/pkt_cls.h> +#include <linux/can.h> + +#define EM_CAN_RULES_MAX 500 + +struct canid_match { + /* For each SFF CAN ID (11 bit) there is one record in this bitfield */ + DECLARE_BITMAP(match_sff, (1 << CAN_SFF_ID_BITS)); + + int rules_count; + int sff_rules_count; + int eff_rules_count; + + /* + * Raw rules copied from netlink message; Used for sending + * information to userspace (when 'tc filter show' is invoked) + * AND when matching EFF frames + */ + struct can_filter rules_raw[]; +}; + +/** + * em_canid_get_id() - Extracts Can ID out of the sk_buff structure. + */ +static canid_t em_canid_get_id(struct sk_buff *skb) +{ + /* CAN ID is stored within the data field */ + struct can_frame *cf = (struct can_frame *)skb->data; + + return cf->can_id; +} + +static void em_canid_sff_match_add(struct canid_match *cm, u32 can_id, + u32 can_mask) +{ + int i; + + /* + * Limit can_mask and can_id to SFF range to + * protect against write after end of array + */ + can_mask &= CAN_SFF_MASK; + can_id &= can_mask; + + /* Single frame */ + if (can_mask == CAN_SFF_MASK) { + set_bit(can_id, cm->match_sff); + return; + } + + /* All frames */ + if (can_mask == 0) { + bitmap_fill(cm->match_sff, (1 << CAN_SFF_ID_BITS)); + return; + } + + /* + * Individual frame filter. + * Add record (set bit to 1) for each ID that + * conforms particular rule + */ + for (i = 0; i < (1 << CAN_SFF_ID_BITS); i++) { + if ((i & can_mask) == can_id) + set_bit(i, cm->match_sff); + } +} + +static inline struct canid_match *em_canid_priv(struct tcf_ematch *m) +{ + return (struct canid_match *)m->data; +} + +static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m, + struct tcf_pkt_info *info) +{ + struct canid_match *cm = em_canid_priv(m); + canid_t can_id; + int match = 0; + int i; + const struct can_filter *lp; + + can_id = em_canid_get_id(skb); + + if (can_id & CAN_EFF_FLAG) { + for (i = 0, lp = cm->rules_raw; + i < cm->eff_rules_count; i++, lp++) { + if (!(((lp->can_id ^ can_id) & lp->can_mask))) { + match = 1; + break; + } + } + } else { /* SFF */ + can_id &= CAN_SFF_MASK; + match = (test_bit(can_id, cm->match_sff) ? 1 : 0); + } + + return match; +} + +static int em_canid_change(struct tcf_proto *tp, void *data, int len, + struct tcf_ematch *m) +{ + struct can_filter *conf = data; /* Array with rules */ + struct canid_match *cm; + struct canid_match *cm_old = (struct canid_match *)m->data; + int i; + + if (!len) + return -EINVAL; + + if (len % sizeof(struct can_filter)) + return -EINVAL; + + if (len > sizeof(struct can_filter) * EM_CAN_RULES_MAX) + return -EINVAL; + + cm = kzalloc(sizeof(struct canid_match) + len, GFP_KERNEL); + if (!cm) + return -ENOMEM; + + cm->rules_count = len / sizeof(struct can_filter); + + /* + * We need two for() loops for copying rules into two contiguous + * areas in rules_raw to process all eff rules with a simple loop. + * NB: The configuration interface supports sff and eff rules. + * We do not support filters here that match for the same can_id + * provided in a SFF and EFF frame (e.g. 0x123 / 0x80000123). + * For this (unusual case) two filters have to be specified. The + * SFF/EFF separation is done with the CAN_EFF_FLAG in the can_id. + */ + + /* Fill rules_raw with EFF rules first */ + for (i = 0; i < cm->rules_count; i++) { + if (conf[i].can_id & CAN_EFF_FLAG) { + memcpy(cm->rules_raw + cm->eff_rules_count, + &conf[i], + sizeof(struct can_filter)); + + cm->eff_rules_count++; + } + } + + /* append SFF frame rules */ + for (i = 0; i < cm->rules_count; i++) { + if (!(conf[i].can_id & CAN_EFF_FLAG)) { + memcpy(cm->rules_raw + + cm->eff_rules_count + + cm->sff_rules_count, + &conf[i], sizeof(struct can_filter)); + + cm->sff_rules_count++; + + em_canid_sff_match_add(cm, + conf[i].can_id, conf[i].can_mask); + } + } + + m->datalen = sizeof(struct canid_match) + len; + m->data = (unsigned long)cm; + + if (cm_old != NULL) { + pr_err("canid: Configuring an existing ematch!\n"); + kfree(cm_old); + } + + return 0; +} + +static void em_canid_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +{ + struct canid_match *cm = em_canid_priv(m); + + kfree(cm); +} + +static int em_canid_dump(struct sk_buff *skb, struct tcf_ematch *m) +{ + struct canid_match *cm = em_canid_priv(m); + + /* + * When configuring this ematch 'rules_count' is set not to exceed + * 'rules_raw' array size + */ + if (nla_put_nohdr(skb, sizeof(struct can_filter) * cm->rules_count, + &cm->rules_raw) < 0) + return -EMSGSIZE; + + return 0; +} + +static struct tcf_ematch_ops em_canid_ops = { + .kind = TCF_EM_CANID, + .change = em_canid_change, + .match = em_canid_match, + .destroy = em_canid_destroy, + .dump = em_canid_dump, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_canid_ops.link) +}; + +static int __init init_em_canid(void) +{ + return tcf_em_register(&em_canid_ops); +} + +static void __exit exit_em_canid(void) +{ + tcf_em_unregister(&em_canid_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_canid); +module_exit(exit_em_canid); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_CANID); diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c new file mode 100644 index 00000000000..527aeb7a3ff --- /dev/null +++ b/net/sched/em_ipset.c @@ -0,0 +1,136 @@ +/* + * net/sched/em_ipset.c ipset ematch + * + * Copyright (c) 2012 Florian Westphal <fw@strlen.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include <linux/gfp.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/netfilter/xt_set.h> +#include <linux/ipv6.h> +#include <net/ip.h> +#include <net/pkt_cls.h> + +static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len, + struct tcf_ematch *em) +{ + struct xt_set_info *set = data; + ip_set_id_t index; + struct net *net = dev_net(qdisc_dev(tp->q)); + + if (data_len != sizeof(*set)) + return -EINVAL; + + index = ip_set_nfnl_get_byindex(net, set->index); + if (index == IPSET_INVALID_ID) + return -ENOENT; + + em->datalen = sizeof(*set); + em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL); + if (em->data) + return 0; + + ip_set_nfnl_put(net, index); + return -ENOMEM; +} + +static void em_ipset_destroy(struct tcf_proto *p, struct tcf_ematch *em) +{ + const struct xt_set_info *set = (const void *) em->data; + if (set) { + ip_set_nfnl_put(dev_net(qdisc_dev(p->q)), set->index); + kfree((void *) em->data); + } +} + +static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + struct ip_set_adt_opt opt; + struct xt_action_param acpar; + const struct xt_set_info *set = (const void *) em->data; + struct net_device *dev, *indev = NULL; + int ret, network_offset; + + switch (skb->protocol) { + case htons(ETH_P_IP): + acpar.family = NFPROTO_IPV4; + if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) + return 0; + acpar.thoff = ip_hdrlen(skb); + break; + case htons(ETH_P_IPV6): + acpar.family = NFPROTO_IPV6; + if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) + return 0; + /* doesn't call ipv6_find_hdr() because ipset doesn't use thoff, yet */ + acpar.thoff = sizeof(struct ipv6hdr); + break; + default: + return 0; + } + + acpar.hooknum = 0; + + opt.family = acpar.family; + opt.dim = set->dim; + opt.flags = set->flags; + opt.cmdflags = 0; + opt.ext.timeout = ~0u; + + network_offset = skb_network_offset(skb); + skb_pull(skb, network_offset); + + dev = skb->dev; + + rcu_read_lock(); + + if (dev && skb->skb_iif) + indev = dev_get_by_index_rcu(dev_net(dev), skb->skb_iif); + + acpar.in = indev ? indev : dev; + acpar.out = dev; + + ret = ip_set_test(set->index, skb, &acpar, &opt); + + rcu_read_unlock(); + + skb_push(skb, network_offset); + return ret; +} + +static struct tcf_ematch_ops em_ipset_ops = { + .kind = TCF_EM_IPSET, + .change = em_ipset_change, + .destroy = em_ipset_destroy, + .match = em_ipset_match, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_ipset_ops.link) +}; + +static int __init init_em_ipset(void) +{ + return tcf_em_register(&em_ipset_ops); +} + +static void __exit exit_em_ipset(void) +{ + tcf_em_unregister(&em_ipset_ops); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_DESCRIPTION("TC extended match for IP sets"); + +module_init(init_em_ipset); +module_exit(exit_em_ipset); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_IPSET); diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 1363bf14e61..9b8c0b0e60d 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -222,7 +222,7 @@ META_COLLECTOR(int_maclen) META_COLLECTOR(int_rxhash) { - dst->value = skb_get_rxhash(skb); + dst->value = skb_get_hash(skb); } /************************************************************************** @@ -264,47 +264,59 @@ META_COLLECTOR(int_rtiif) if (unlikely(skb_rtable(skb) == NULL)) *err = -1; else - dst->value = skb_rtable(skb)->rt_iif; + dst->value = inet_iif(skb); } /************************************************************************** * Socket Attributes **************************************************************************/ -#define SKIP_NONLOCAL(skb) \ - if (unlikely(skb->sk == NULL)) { \ - *err = -1; \ - return; \ - } +#define skip_nonlocal(skb) \ + (unlikely(skb->sk == NULL)) META_COLLECTOR(int_sk_family) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_family; } META_COLLECTOR(int_sk_state) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_state; } META_COLLECTOR(int_sk_reuse) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_reuse; } META_COLLECTOR(int_sk_bound_if) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } /* No error if bound_dev_if is 0, legal userspace check */ dst->value = skb->sk->sk_bound_dev_if; } META_COLLECTOR(var_sk_bound_if) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } if (skb->sk->sk_bound_dev_if == 0) { dst->value = (unsigned long) "any"; @@ -322,151 +334,226 @@ META_COLLECTOR(var_sk_bound_if) META_COLLECTOR(int_sk_refcnt) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = atomic_read(&skb->sk->sk_refcnt); } META_COLLECTOR(int_sk_rcvbuf) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_rcvbuf; } META_COLLECTOR(int_sk_shutdown) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_shutdown; } META_COLLECTOR(int_sk_proto) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_protocol; } META_COLLECTOR(int_sk_type) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_type; } META_COLLECTOR(int_sk_rmem_alloc) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = sk_rmem_alloc_get(skb->sk); } META_COLLECTOR(int_sk_wmem_alloc) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = sk_wmem_alloc_get(skb->sk); } META_COLLECTOR(int_sk_omem_alloc) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = atomic_read(&skb->sk->sk_omem_alloc); } META_COLLECTOR(int_sk_rcv_qlen) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_receive_queue.qlen; } META_COLLECTOR(int_sk_snd_qlen) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_write_queue.qlen; } META_COLLECTOR(int_sk_wmem_queued) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_wmem_queued; } META_COLLECTOR(int_sk_fwd_alloc) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_forward_alloc; } META_COLLECTOR(int_sk_sndbuf) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_sndbuf; } META_COLLECTOR(int_sk_alloc) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = (__force int) skb->sk->sk_allocation; } META_COLLECTOR(int_sk_hash) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_hash; } META_COLLECTOR(int_sk_lingertime) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_lingertime / HZ; } META_COLLECTOR(int_sk_err_qlen) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_error_queue.qlen; } META_COLLECTOR(int_sk_ack_bl) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_ack_backlog; } META_COLLECTOR(int_sk_max_ack_bl) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_max_ack_backlog; } META_COLLECTOR(int_sk_prio) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_priority; } META_COLLECTOR(int_sk_rcvlowat) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_rcvlowat; } META_COLLECTOR(int_sk_rcvtimeo) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_rcvtimeo / HZ; } META_COLLECTOR(int_sk_sndtimeo) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_sndtimeo / HZ; } META_COLLECTOR(int_sk_sendmsg_off) { - SKIP_NONLOCAL(skb); - dst->value = skb->sk->sk_sndmsg_off; + if (skip_nonlocal(skb)) { + *err = -1; + return; + } + dst->value = skb->sk->sk_frag.offset; } META_COLLECTOR(int_sk_write_pend) { - SKIP_NONLOCAL(skb); + if (skip_nonlocal(skb)) { + *err = -1; + return; + } dst->value = skb->sk->sk_write_pending; } @@ -585,8 +672,9 @@ static void meta_var_apply_extras(struct meta_value *v, static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv) { - if (v->val && v->len) - NLA_PUT(skb, tlv, v->len, (void *) v->val); + if (v->val && v->len && + nla_put(skb, tlv, v->len, (void *) v->val)) + goto nla_put_failure; return 0; nla_put_failure: @@ -636,10 +724,13 @@ static void meta_int_apply_extras(struct meta_value *v, static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv) { - if (v->len == sizeof(unsigned long)) - NLA_PUT(skb, tlv, sizeof(unsigned long), &v->val); - else if (v->len == sizeof(u32)) - NLA_PUT_U32(skb, tlv, v->val); + if (v->len == sizeof(unsigned long)) { + if (nla_put(skb, tlv, sizeof(unsigned long), &v->val)) + goto nla_put_failure; + } else if (v->len == sizeof(u32)) { + if (nla_put_u32(skb, tlv, v->val)) + goto nla_put_failure; + } return 0; @@ -789,8 +880,10 @@ static int em_meta_change(struct tcf_proto *tp, void *data, int len, goto errout; meta = kzalloc(sizeof(*meta), GFP_KERNEL); - if (meta == NULL) + if (meta == NULL) { + err = -ENOMEM; goto errout; + } memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left)); memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right)); @@ -831,7 +924,8 @@ static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em) memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left)); memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right)); - NLA_PUT(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr); + if (nla_put(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr)) + goto nla_put_failure; ops = meta_type_ops(&meta->lvalue); if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 || diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 88d93eb9250..3a633debb6d 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -441,7 +441,8 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) if (top_start == NULL) goto nla_put_failure; - NLA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr); + if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr)) + goto nla_put_failure; list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST); if (list_start == NULL) @@ -457,7 +458,8 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) .flags = em->flags }; - NLA_PUT(skb, i + 1, sizeof(em_hdr), &em_hdr); + if (nla_put(skb, i + 1, sizeof(em_hdr), &em_hdr)) + goto nla_put_failure; if (em->ops && em->ops->dump) { if (em->ops->dump(skb, em) < 0) @@ -535,9 +537,7 @@ pop_stack: return res; stack_overflow: - if (net_ratelimit()) - pr_warning("tc ematch: local stack overflow," - " increase NET_EMATCH_STACK\n"); + net_warn_ratelimited("tc ematch: local stack overflow, increase NET_EMATCH_STACK\n"); return -1; } EXPORT_SYMBOL(__tcf_em_tree_match); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 3d8981fde30..58bed7599db 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -135,7 +135,7 @@ static DEFINE_RWLOCK(qdisc_mod_lock); static struct Qdisc_ops *qdisc_base; -/* Register/uregister queueing discipline */ +/* Register/unregister queueing discipline */ int register_qdisc(struct Qdisc_ops *qops) { @@ -200,6 +200,58 @@ int unregister_qdisc(struct Qdisc_ops *qops) } EXPORT_SYMBOL(unregister_qdisc); +/* Get default qdisc if not otherwise specified */ +void qdisc_get_default(char *name, size_t len) +{ + read_lock(&qdisc_mod_lock); + strlcpy(name, default_qdisc_ops->id, len); + read_unlock(&qdisc_mod_lock); +} + +static struct Qdisc_ops *qdisc_lookup_default(const char *name) +{ + struct Qdisc_ops *q = NULL; + + for (q = qdisc_base; q; q = q->next) { + if (!strcmp(name, q->id)) { + if (!try_module_get(q->owner)) + q = NULL; + break; + } + } + + return q; +} + +/* Set new default qdisc to use */ +int qdisc_set_default(const char *name) +{ + const struct Qdisc_ops *ops; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + write_lock(&qdisc_mod_lock); + ops = qdisc_lookup_default(name); + if (!ops) { + /* Not found, drop lock and try to load module */ + write_unlock(&qdisc_mod_lock); + request_module("sch_%s", name); + write_lock(&qdisc_mod_lock); + + ops = qdisc_lookup_default(name); + } + + if (ops) { + /* Set new default */ + module_put(default_qdisc_ops->owner); + default_qdisc_ops = ops; + } + write_unlock(&qdisc_mod_lock); + + return ops ? 0 : -ENOENT; +} + /* We know handle. Find qdisc among all qdisc's attached to device (root qdisc, all its children, children of children etc.) */ @@ -219,11 +271,16 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) return NULL; } -static void qdisc_list_add(struct Qdisc *q) +void qdisc_list_add(struct Qdisc *q) { - if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) - list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list); + if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { + struct Qdisc *root = qdisc_dev(q)->qdisc; + + WARN_ON_ONCE(root == &noop_qdisc); + list_add_tail(&q->list, &root->list); + } } +EXPORT_SYMBOL(qdisc_list_add); void qdisc_list_del(struct Qdisc *q) { @@ -285,28 +342,70 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) return q; } +/* The linklayer setting were not transferred from iproute2, in older + * versions, and the rate tables lookup systems have been dropped in + * the kernel. To keep backward compatible with older iproute2 tc + * utils, we detect the linklayer setting by detecting if the rate + * table were modified. + * + * For linklayer ATM table entries, the rate table will be aligned to + * 48 bytes, thus some table entries will contain the same value. The + * mpu (min packet unit) is also encoded into the old rate table, thus + * starting from the mpu, we find low and high table entries for + * mapping this cell. If these entries contain the same value, when + * the rate tables have been modified for linklayer ATM. + * + * This is done by rounding mpu to the nearest 48 bytes cell/entry, + * and then roundup to the next cell, calc the table entry one below, + * and compare. + */ +static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) +{ + int low = roundup(r->mpu, 48); + int high = roundup(low+1, 48); + int cell_low = low >> r->cell_log; + int cell_high = (high >> r->cell_log) - 1; + + /* rtab is too inaccurate at rates > 100Mbit/s */ + if ((r->rate > (100000000/8)) || (rtab[0] == 0)) { + pr_debug("TC linklayer: Giving up ATM detection\n"); + return TC_LINKLAYER_ETHERNET; + } + + if ((cell_high > cell_low) && (cell_high < 256) + && (rtab[cell_low] == rtab[cell_high])) { + pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n", + cell_low, cell_high, rtab[cell_high]); + return TC_LINKLAYER_ATM; + } + return TC_LINKLAYER_ETHERNET; +} + static struct qdisc_rate_table *qdisc_rtab_list; struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab) { struct qdisc_rate_table *rtab; + if (tab == NULL || r->rate == 0 || r->cell_log == 0 || + nla_len(tab) != TC_RTAB_SIZE) + return NULL; + for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { - if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) { + if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) && + !memcmp(&rtab->data, nla_data(tab), 1024)) { rtab->refcnt++; return rtab; } } - if (tab == NULL || r->rate == 0 || r->cell_log == 0 || - nla_len(tab) != TC_RTAB_SIZE) - return NULL; - rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); if (rtab) { rtab->rate = *r; rtab->refcnt = 1; memcpy(rtab->data, nla_data(tab), 1024); + if (r->linklayer == TC_LINKLAYER_UNAWARE) + r->linklayer = __detect_linklayer(r, rtab->data); rtab->next = qdisc_rtab_list; qdisc_rtab_list = rtab; } @@ -426,7 +525,8 @@ static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab) nest = nla_nest_start(skb, TCA_STAB); if (nest == NULL) goto nla_put_failure; - NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts); + if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts)) + goto nla_put_failure; nla_nest_end(skb, nest); return skb->len; @@ -463,7 +563,7 @@ out: } EXPORT_SYMBOL(__qdisc_calculate_pkt_len); -void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc) +void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) { if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", @@ -492,20 +592,19 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_watchdog_init); -void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) +void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) { - ktime_t time; - if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(wd->qdisc)->state)) return; qdisc_throttled(wd->qdisc); - time = ktime_set(0, 0); - time = ktime_add_ns(time, PSCHED_TICKS2NS(expires)); - hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS); + + hrtimer_start(&wd->timer, + ns_to_ktime(expires), + HRTIMER_MODE_ABS); } -EXPORT_SYMBOL(qdisc_watchdog_schedule); +EXPORT_SYMBOL(qdisc_watchdog_schedule_ns); void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) { @@ -545,7 +644,7 @@ static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n) void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash) { struct Qdisc_class_common *cl; - struct hlist_node *n, *next; + struct hlist_node *next; struct hlist_head *nhash, *ohash; unsigned int nsize, nmask, osize; unsigned int i, h; @@ -564,7 +663,7 @@ void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash) sch_tree_lock(sch); for (i = 0; i < osize; i++) { - hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) { + hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) { h = qdisc_class_hash(cl->classid, nmask); hlist_add_head(&cl->hnode, &nhash[h]); } @@ -643,9 +742,11 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) const struct Qdisc_class_ops *cops; unsigned long cl; u32 parentid; + int drops; if (n == 0) return; + drops = max_t(int, n, 0); while ((parentid = sch->parent)) { if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS)) return; @@ -662,6 +763,7 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) cops->put(sch, cl); } sch->q.qlen -= n; + sch->qstats.drops += drops; } } EXPORT_SYMBOL(qdisc_tree_decrease_qlen); @@ -833,6 +935,8 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, goto err_out3; } lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock); + if (!netif_is_multiqueue(dev)) + sch->flags |= TCQ_F_ONETXQUEUE; } sch->handle = handle; @@ -969,25 +1073,30 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) * Delete/get qdisc. */ -static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n) { struct net *net = sock_net(skb->sk); - struct tcmsg *tcm = NLMSG_DATA(n); + struct tcmsg *tcm = nlmsg_data(n); struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; - u32 clid = tcm->tcm_parent; + u32 clid; struct Qdisc *q = NULL; struct Qdisc *p = NULL; int err; - dev = __dev_get_by_index(net, tcm->tcm_ifindex); - if (!dev) - return -ENODEV; + if ((n->nlmsg_type != RTM_GETQDISC) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); if (err < 0) return err; + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return -ENODEV; + + clid = tcm->tcm_parent; if (clid) { if (clid != TC_H_ROOT) { if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { @@ -1033,7 +1142,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) * Create/change qdisc. */ -static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n) { struct net *net = sock_net(skb->sk); struct tcmsg *tcm; @@ -1043,9 +1152,16 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) struct Qdisc *q, *p; int err; + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + replay: /* Reinit, just in case something touches this. */ - tcm = NLMSG_DATA(n); + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); + if (err < 0) + return err; + + tcm = nlmsg_data(n); clid = tcm->tcm_parent; q = p = NULL; @@ -1053,9 +1169,6 @@ replay: if (!dev) return -ENODEV; - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); - if (err < 0) - return err; if (clid) { if (clid != TC_H_ROOT) { @@ -1184,7 +1297,7 @@ graft: } static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, - u32 pid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1192,8 +1305,11 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, struct gnet_dump d; struct qdisc_size_table *stab; - nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); - tcm = NLMSG_DATA(nlh); + cond_resched(); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); + if (!nlh) + goto out_nlmsg_trim; + tcm = nlmsg_data(nlh); tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; @@ -1201,7 +1317,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, tcm->tcm_parent = clid; tcm->tcm_handle = q->handle; tcm->tcm_info = atomic_read(&q->refcnt); - NLA_PUT_STRING(skb, TCA_KIND, q->ops->id); + if (nla_put_string(skb, TCA_KIND, q->ops->id)) + goto nla_put_failure; if (q->ops->dump && q->ops->dump(q, skb) < 0) goto nla_put_failure; q->qstats.qlen = q->q.qlen; @@ -1228,7 +1345,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; -nlmsg_failure: +out_nlmsg_trim: nla_put_failure: nlmsg_trim(skb, b); return -1; @@ -1244,25 +1361,25 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb, struct Qdisc *old, struct Qdisc *new) { struct sk_buff *skb; - u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (old && !tc_qdisc_dump_ignore(old)) { - if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, + if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) goto err_out; } if (new && !tc_qdisc_dump_ignore(new)) { - if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, + if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) goto err_out; } if (skb->len) - return rtnetlink_send(skb, net, pid, RTNLGRP_TC, + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); err_out: @@ -1285,7 +1402,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, q_idx++; } else { if (!tc_qdisc_dump_ignore(q) && - tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, + tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) goto done; q_idx++; @@ -1296,7 +1413,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, continue; } if (!tc_qdisc_dump_ignore(q) && - tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, + tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) goto done; q_idx++; @@ -1320,9 +1437,9 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) s_idx = cb->args[0]; s_q_idx = q_idx = cb->args[1]; - rcu_read_lock(); idx = 0; - for_each_netdev_rcu(net, dev) { + ASSERT_RTNL(); + for_each_netdev(net, dev) { struct netdev_queue *dev_queue; if (idx < s_idx) @@ -1345,8 +1462,6 @@ cont: } done: - rcu_read_unlock(); - cb->args[0] = idx; cb->args[1] = q_idx; @@ -1361,29 +1476,33 @@ done: -static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n) { struct net *net = sock_net(skb->sk); - struct tcmsg *tcm = NLMSG_DATA(n); + struct tcmsg *tcm = nlmsg_data(n); struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; struct Qdisc *q = NULL; const struct Qdisc_class_ops *cops; unsigned long cl = 0; unsigned long new_cl; - u32 pid = tcm->tcm_parent; - u32 clid = tcm->tcm_handle; - u32 qid = TC_H_MAJ(clid); + u32 portid; + u32 clid; + u32 qid; int err; - dev = __dev_get_by_index(net, tcm->tcm_ifindex); - if (!dev) - return -ENODEV; + if ((n->nlmsg_type != RTM_GETTCLASS) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); if (err < 0) return err; + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return -ENODEV; + /* parent == TC_H_UNSPEC - unspecified parent. parent == TC_H_ROOT - class is root, which has no parent. @@ -1399,8 +1518,12 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) /* Step 1. Determine qdisc handle X:0 */ - if (pid != TC_H_ROOT) { - u32 qid1 = TC_H_MAJ(pid); + portid = tcm->tcm_parent; + clid = tcm->tcm_handle; + qid = TC_H_MAJ(clid); + + if (portid != TC_H_ROOT) { + u32 qid1 = TC_H_MAJ(portid); if (qid && qid1) { /* If both majors are known, they must be identical. */ @@ -1414,10 +1537,10 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) /* Now qid is genuine qdisc handle consistent * both with parent and child. * - * TC_H_MAJ(pid) still may be unspecified, complete it now. + * TC_H_MAJ(portid) still may be unspecified, complete it now. */ - if (pid) - pid = TC_H_MAKE(qid, pid); + if (portid) + portid = TC_H_MAKE(qid, portid); } else { if (qid == 0) qid = dev->qdisc->handle; @@ -1435,7 +1558,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) /* Now try to get class */ if (clid == 0) { - if (pid == TC_H_ROOT) + if (portid == TC_H_ROOT) clid = qid; } else clid = TC_H_MAKE(qid, clid); @@ -1474,7 +1597,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) new_cl = cl; err = -EOPNOTSUPP; if (cops->change) - err = cops->change(q, clid, pid, tca, &new_cl); + err = cops->change(q, clid, portid, tca, &new_cl); if (err == 0) tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS); @@ -1488,7 +1611,7 @@ out: static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, unsigned long cl, - u32 pid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1496,8 +1619,11 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, struct gnet_dump d; const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; - nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); - tcm = NLMSG_DATA(nlh); + cond_resched(); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); + if (!nlh) + goto out_nlmsg_trim; + tcm = nlmsg_data(nlh); tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; @@ -1505,7 +1631,8 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, tcm->tcm_parent = q->handle; tcm->tcm_handle = q->handle; tcm->tcm_info = 0; - NLA_PUT_STRING(skb, TCA_KIND, q->ops->id); + if (nla_put_string(skb, TCA_KIND, q->ops->id)) + goto nla_put_failure; if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0) goto nla_put_failure; @@ -1522,7 +1649,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; -nlmsg_failure: +out_nlmsg_trim: nla_put_failure: nlmsg_trim(skb, b); return -1; @@ -1533,18 +1660,18 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, unsigned long cl, int event) { struct sk_buff *skb; - u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) { + if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) { kfree_skb(skb); return -EINVAL; } - return rtnetlink_send(skb, net, pid, RTNLGRP_TC, + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); } @@ -1558,7 +1685,7 @@ static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walk { struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; - return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid, + return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); } @@ -1613,13 +1740,13 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) { - struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh); + struct tcmsg *tcm = nlmsg_data(cb->nlh); struct net *net = sock_net(skb->sk); struct netdev_queue *dev_queue; struct net_device *dev; int t, s_t; - if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return 0; dev = dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) @@ -1688,12 +1815,10 @@ reclassify: tp = otp; if (verd++ >= MAX_REC_LOOP) { - if (net_ratelimit()) - pr_notice("%s: packet reclassify loop" - " rule prio %u protocol %02x\n", - tp->q->ops->id, - tp->prio & 0xffff, - ntohs(tp->protocol)); + net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n", + tp->q->ops->id, + tp->prio & 0xffff, + ntohs(tp->protocol)); return TC_ACT_SHOT; } skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd); @@ -1753,7 +1878,7 @@ static int __net_init psched_net_init(struct net *net) { struct proc_dir_entry *e; - e = proc_net_fops_create(net, "psched", 0, &psched_fops); + e = proc_create("psched", 0, net->proc_net, &psched_fops); if (e == NULL) return -ENOMEM; @@ -1762,7 +1887,7 @@ static int __net_init psched_net_init(struct net *net) static void __net_exit psched_net_exit(struct net *net) { - proc_net_remove(net, "psched"); + remove_proc_entry("psched", net->proc_net); } #else static int __net_init psched_net_init(struct net *net) @@ -1791,6 +1916,7 @@ static int __init pktsched_init(void) return err; } + register_qdisc(&pfifo_fast_ops); register_qdisc(&pfifo_qdisc_ops); register_qdisc(&bfifo_qdisc_ops); register_qdisc(&pfifo_head_drop_qdisc_ops); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index e25e49061a0..8449b337f9e 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -16,8 +16,6 @@ #include <net/netlink.h> #include <net/pkt_sched.h> -extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */ - /* * The ATM queuing discipline provides a framework for invoking classifiers * (aka "filters"), which in turn select classes of this queuing discipline. @@ -423,8 +421,6 @@ drop: __maybe_unused } return ret; } - qdisc_bstats_update(sch, skb); - bstats_update(&flow->bstats, skb); /* * Okay, this may seem weird. We pretend we've dropped the packet if * it goes via ATM. The reason for this is that the outer qdisc @@ -472,6 +468,8 @@ static void sch_atm_dequeue(unsigned long data) if (unlikely(!skb)) break; + qdisc_bstats_update(sch, skb); + bstats_update(&flow->bstats, skb); pr_debug("atm_tc_dequeue: sending on class %p\n", flow); /* remove any LL header somebody else has attached */ skb_pull(skb, skb_network_offset(skb)); @@ -601,26 +599,31 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, if (nest == NULL) goto nla_put_failure; - NLA_PUT(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr); + if (nla_put(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr)) + goto nla_put_failure; if (flow->vcc) { struct sockaddr_atmpvc pvc; int state; + memset(&pvc, 0, sizeof(pvc)); pvc.sap_family = AF_ATMPVC; pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1; pvc.sap_addr.vpi = flow->vcc->vpi; pvc.sap_addr.vci = flow->vcc->vci; - NLA_PUT(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc); + if (nla_put(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc)) + goto nla_put_failure; state = ATM_VF2VS(flow->vcc->flags); - NLA_PUT_U32(skb, TCA_ATM_STATE, state); + if (nla_put_u32(skb, TCA_ATM_STATE, state)) + goto nla_put_failure; } - if (flow->excess) - NLA_PUT_U32(skb, TCA_ATM_EXCESS, flow->classid); - else - NLA_PUT_U32(skb, TCA_ATM_EXCESS, 0); - - nla_nest_end(skb, nest); - return skb->len; + if (flow->excess) { + if (nla_put_u32(skb, TCA_ATM_EXCESS, flow->classid)) + goto nla_put_failure; + } else { + if (nla_put_u32(skb, TCA_ATM_EXCESS, 0)) + goto nla_put_failure; + } + return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 24d94c097b3..ead526467cc 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -130,7 +130,7 @@ struct cbq_class { psched_time_t penalized; struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct tc_cbq_xstats xstats; struct tcf_proto *filter_list; @@ -250,10 +250,11 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL) cl = defmap[TC_PRIO_BESTEFFORT]; - if (cl == NULL || cl->level >= head->level) + if (cl == NULL) goto fallback; } - + if (cl->level >= head->level) + goto fallback; #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: @@ -508,8 +509,7 @@ static void cbq_ovl_delay(struct cbq_class *cl) cl->cpriority = TC_CBQ_MAXPRIO; q->pmask |= (1<<TC_CBQ_MAXPRIO); - expires = ktime_set(0, 0); - expires = ktime_add_ns(expires, PSCHED_TICKS2NS(sched)); + expires = ns_to_ktime(PSCHED_TICKS2NS(sched)); if (hrtimer_try_to_cancel(&q->delay_timer) && ktime_to_ns(ktime_sub( hrtimer_get_expires(&q->delay_timer), @@ -962,8 +962,11 @@ cbq_dequeue(struct Qdisc *sch) cbq_update(q); if ((incr -= incr2) < 0) incr = 0; + q->now += incr; + } else { + if (now > q->now) + q->now = now; } - q->now += incr; q->now_rt = now; for (;;) { @@ -1041,14 +1044,13 @@ static void cbq_adjust_levels(struct cbq_class *this) static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) { struct cbq_class *cl; - struct hlist_node *n; unsigned int h; if (q->quanta[prio] == 0) return; for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { /* BUGGGG... Beware! This expression suffer of * arithmetic overflows! */ @@ -1056,9 +1058,10 @@ static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ q->quanta[prio]; } - if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) { - pr_warning("CBQ: class %08x has bad quantum==%ld, repaired.\n", - cl->common.classid, cl->quantum); + if (cl->quantum <= 0 || + cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) { + pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n", + cl->common.classid, cl->quantum); cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1; } } @@ -1087,10 +1090,9 @@ static void cbq_sync_defmap(struct cbq_class *cl) continue; for (h = 0; h < q->clhash.hashsize; h++) { - struct hlist_node *n; struct cbq_class *c; - hlist_for_each_entry(c, n, &q->clhash.hash[h], + hlist_for_each_entry(c, &q->clhash.hash[h], common.hnode) { if (c->split == split && c->level < level && c->defmap & (1<<i)) { @@ -1210,7 +1212,6 @@ cbq_reset(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl; - struct hlist_node *n; int prio; unsigned int h; @@ -1228,7 +1229,7 @@ cbq_reset(struct Qdisc *sch) q->active[prio] = NULL; for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { qdisc_reset(cl->q); cl->next_alive = NULL; @@ -1425,7 +1426,8 @@ static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) { unsigned char *b = skb_tail_pointer(skb); - NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); + if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate)) + goto nla_put_failure; return skb->len; nla_put_failure: @@ -1450,7 +1452,8 @@ static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) opt.minidle = (u32)(-cl->minidle); opt.offtime = cl->offtime; opt.change = ~0; - NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; nla_put_failure: @@ -1463,12 +1466,14 @@ static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_wrropt opt; + memset(&opt, 0, sizeof(opt)); opt.flags = 0; opt.allot = cl->allot; opt.priority = cl->priority + 1; opt.cpriority = cl->cpriority + 1; opt.weight = cl->weight; - NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; nla_put_failure: @@ -1485,7 +1490,8 @@ static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) opt.priority2 = cl->priority2 + 1; opt.pad = 0; opt.penalty = cl->penalty; - NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; nla_put_failure: @@ -1502,7 +1508,8 @@ static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) opt.split = cl->split ? cl->split->common.classid : 0; opt.defmap = cl->defmap; opt.defchange = ~0; - NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt)) + goto nla_put_failure; } return skb->len; @@ -1521,7 +1528,8 @@ static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) opt.police = cl->police; opt.__res1 = 0; opt.__res2 = 0; - NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt)) + goto nla_put_failure; } return skb->len; @@ -1555,8 +1563,7 @@ static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; if (cbq_dump_attr(skb, &q->link) < 0) goto nla_put_failure; - nla_nest_end(skb, nest); - return skb->len; + return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); @@ -1591,8 +1598,7 @@ cbq_dump_class(struct Qdisc *sch, unsigned long arg, goto nla_put_failure; if (cbq_dump_attr(skb, cl) < 0) goto nla_put_failure; - nla_nest_end(skb, nest); - return skb->len; + return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); @@ -1691,7 +1697,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) static void cbq_destroy(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); - struct hlist_node *n, *next; + struct hlist_node *next; struct cbq_class *cl; unsigned int h; @@ -1704,11 +1710,11 @@ static void cbq_destroy(struct Qdisc *sch) * be bound to classes which have been destroyed already. --TGR '04 */ for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) tcf_destroy_chain(&cl->filter_list); } for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h], + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h], common.hnode) cbq_destroy_class(sch, cl); } @@ -1775,8 +1781,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) { - if (rtab) - qdisc_put_rtab(rtab); + qdisc_put_rtab(rtab); return err; } } @@ -2007,14 +2012,13 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl; - struct hlist_node *n; unsigned int h; if (arg->stop) return; for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 7e267d7b9c7..ed30e436128 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -14,7 +14,6 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> -#include <linux/reciprocal_div.h> #include <linux/vmalloc.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> @@ -77,12 +76,6 @@ struct choke_sched_data { struct sk_buff **tab; }; -/* deliver a random number between 0 and N - 1 */ -static u32 random_N(unsigned int N) -{ - return reciprocal_divide(random32(), N); -} - /* number of elements in queue including holes */ static unsigned int choke_len(const struct choke_sched_data *q) { @@ -233,7 +226,7 @@ static struct sk_buff *choke_peek_random(const struct choke_sched_data *q, int retrys = 3; do { - *pidx = (q->head + random_N(choke_len(q))) & q->tab_mask; + *pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask; skb = q->tab[*pidx]; if (skb) return skb; @@ -332,15 +325,13 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) } q->stats.pdrop++; - sch->qstats.drops++; - kfree_skb(skb); - return NET_XMIT_DROP; + return qdisc_drop(skb, sch); - congestion_drop: +congestion_drop: qdisc_drop(skb, sch); return NET_XMIT_CN; - other_drop: +other_drop: if (ret & __NET_XMIT_BYPASS) sch->qstats.drops++; kfree_skb(skb); @@ -400,12 +391,7 @@ static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = { static void choke_free(void *addr) { - if (addr) { - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); - } + kvfree(addr); } static int choke_change(struct Qdisc *sch, struct nlattr *opt) @@ -440,7 +426,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) if (mask != q->tab_mask) { struct sk_buff **ntab; - ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL); + ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), + GFP_KERNEL | __GFP_NOWARN); if (!ntab) ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *)); if (!ntab) @@ -515,8 +502,9 @@ static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) if (opts == NULL) goto nla_put_failure; - NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt); - NLA_PUT_U32(skb, TCA_CHOKE_MAX_P, q->parms.max_P); + if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) || + nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P)) + goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c new file mode 100644 index 00000000000..2f9ab17db85 --- /dev/null +++ b/net/sched/sch_codel.c @@ -0,0 +1,276 @@ +/* + * Codel - The Controlled-Delay Active Queue Management algorithm + * + * Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com> + * Copyright (C) 2011-2012 Van Jacobson <van@pollere.net> + * + * Implemented on linux by : + * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net> + * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/prefetch.h> +#include <net/pkt_sched.h> +#include <net/codel.h> + + +#define DEFAULT_CODEL_LIMIT 1000 + +struct codel_sched_data { + struct codel_params params; + struct codel_vars vars; + struct codel_stats stats; + u32 drop_overlimit; +}; + +/* This is the specific function called from codel_dequeue() + * to dequeue a packet from queue. Note: backlog is handled in + * codel, we dont need to reduce it here. + */ +static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) +{ + struct sk_buff *skb = __skb_dequeue(&sch->q); + + prefetch(&skb->end); /* we'll need skb_shinfo() */ + return skb; +} + +static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue); + + /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, + * or HTB crashes. Defer it for next round. + */ + if (q->stats.drop_count && sch->q.qlen) { + qdisc_tree_decrease_qlen(sch, q->stats.drop_count); + q->stats.drop_count = 0; + } + if (skb) + qdisc_bstats_update(sch, skb); + return skb; +} + +static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct codel_sched_data *q; + + if (likely(qdisc_qlen(sch) < sch->limit)) { + codel_set_enqueue_time(skb); + return qdisc_enqueue_tail(skb, sch); + } + q = qdisc_priv(sch); + q->drop_overlimit++; + return qdisc_drop(skb, sch); +} + +static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { + [TCA_CODEL_TARGET] = { .type = NLA_U32 }, + [TCA_CODEL_LIMIT] = { .type = NLA_U32 }, + [TCA_CODEL_INTERVAL] = { .type = NLA_U32 }, + [TCA_CODEL_ECN] = { .type = NLA_U32 }, +}; + +static int codel_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_CODEL_MAX + 1]; + unsigned int qlen; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + + if (tb[TCA_CODEL_TARGET]) { + u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]); + + q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_CODEL_INTERVAL]) { + u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]); + + q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_CODEL_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]); + + if (tb[TCA_CODEL_ECN]) + q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]); + + qlen = sch->q.qlen; + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = __skb_dequeue(&sch->q); + + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_drop(skb, sch); + } + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + + sch_tree_unlock(sch); + return 0; +} + +static int codel_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct codel_sched_data *q = qdisc_priv(sch); + + sch->limit = DEFAULT_CODEL_LIMIT; + + codel_params_init(&q->params); + codel_vars_init(&q->vars); + codel_stats_init(&q->stats); + + if (opt) { + int err = codel_change(sch, opt); + + if (err) + return err; + } + + if (sch->limit >= 1) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; + + return 0; +} + +static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CODEL_TARGET, + codel_time_to_us(q->params.target)) || + nla_put_u32(skb, TCA_CODEL_LIMIT, + sch->limit) || + nla_put_u32(skb, TCA_CODEL_INTERVAL, + codel_time_to_us(q->params.interval)) || + nla_put_u32(skb, TCA_CODEL_ECN, + q->params.ecn)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -1; +} + +static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + const struct codel_sched_data *q = qdisc_priv(sch); + struct tc_codel_xstats st = { + .maxpacket = q->stats.maxpacket, + .count = q->vars.count, + .lastcount = q->vars.lastcount, + .drop_overlimit = q->drop_overlimit, + .ldelay = codel_time_to_us(q->vars.ldelay), + .dropping = q->vars.dropping, + .ecn_mark = q->stats.ecn_mark, + }; + + if (q->vars.dropping) { + codel_tdiff_t delta = q->vars.drop_next - codel_get_time(); + + if (delta >= 0) + st.drop_next = codel_time_to_us(delta); + else + st.drop_next = -codel_time_to_us(-delta); + } + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static void codel_reset(struct Qdisc *sch) +{ + struct codel_sched_data *q = qdisc_priv(sch); + + qdisc_reset_queue(sch); + codel_vars_init(&q->vars); +} + +static struct Qdisc_ops codel_qdisc_ops __read_mostly = { + .id = "codel", + .priv_size = sizeof(struct codel_sched_data), + + .enqueue = codel_qdisc_enqueue, + .dequeue = codel_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .init = codel_init, + .reset = codel_reset, + .change = codel_change, + .dump = codel_dump, + .dump_stats = codel_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init codel_module_init(void) +{ + return register_qdisc(&codel_qdisc_ops); +} + +static void __exit codel_module_exit(void) +{ + unregister_qdisc(&codel_qdisc_ops); +} + +module_init(codel_module_init) +module_exit(codel_module_exit) + +MODULE_DESCRIPTION("Controlled Delay queue discipline"); +MODULE_AUTHOR("Dave Taht"); +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 6b7fe4a84f1..7bbbfe11219 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -25,7 +25,7 @@ struct drr_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct list_head alist; struct Qdisc *qdisc; @@ -260,7 +260,8 @@ static int drr_dump_class(struct Qdisc *sch, unsigned long arg, nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; - NLA_PUT_U32(skb, TCA_DRR_QUANTUM, cl->quantum); + if (nla_put_u32(skb, TCA_DRR_QUANTUM, cl->quantum)) + goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: @@ -292,14 +293,13 @@ static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; - struct hlist_node *n; unsigned int i; if (arg->stop) return; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; @@ -351,7 +351,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; - int err; + int err = 0; cl = drr_classify(skb, sch, &err); if (cl == NULL) { @@ -375,8 +375,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) cl->deficit = cl->quantum; } - bstats_update(&cl->bstats, skb); - sch->q.qlen++; return err; } @@ -393,8 +391,10 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch) while (1) { cl = list_first_entry(&q->active, struct drr_class, alist); skb = cl->qdisc->ops->peek(cl->qdisc); - if (skb == NULL) + if (skb == NULL) { + qdisc_warn_nonwc(__func__, cl->qdisc); goto out; + } len = qdisc_pkt_len(skb); if (len <= cl->deficit) { @@ -402,6 +402,8 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch) skb = qdisc_dequeue_peeked(cl->qdisc); if (cl->qdisc->q.qlen == 0) list_del(&cl->alist); + + bstats_update(&cl->bstats, skb); qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; @@ -450,11 +452,10 @@ static void drr_reset_qdisc(struct Qdisc *sch) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; - struct hlist_node *n; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->qdisc->q.qlen) list_del(&cl->alist); qdisc_reset(cl->qdisc); @@ -467,13 +468,13 @@ static void drr_destroy_qdisc(struct Qdisc *sch) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; - struct hlist_node *n, *next; + struct hlist_node *next; unsigned int i; tcf_destroy_chain(&q->filter_list); for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) drr_destroy_class(sch, cl); } diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 2c790204d04..49d6ef338b5 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -47,7 +47,7 @@ struct dsmark_qdisc_data { static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index) { - return (index <= p->indices && index > 0); + return index <= p->indices && index > 0; } /* ------------------------- Class/flow operations ------------------------- */ @@ -57,8 +57,8 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg, { struct dsmark_qdisc_data *p = qdisc_priv(sch); - pr_debug("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n", - sch, p, new, old); + pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n", + __func__, sch, p, new, old); if (new == NULL) { new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, @@ -85,8 +85,8 @@ static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg) static unsigned long dsmark_get(struct Qdisc *sch, u32 classid) { - pr_debug("dsmark_get(sch %p,[qdisc %p],classid %x)\n", - sch, qdisc_priv(sch), classid); + pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", + __func__, sch, qdisc_priv(sch), classid); return TC_H_MIN(classid) + 1; } @@ -118,8 +118,8 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, int err = -EINVAL; u8 mask = 0; - pr_debug("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x)," - "arg 0x%lx\n", sch, p, classid, parent, *arg); + pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n", + __func__, sch, p, classid, parent, *arg); if (!dsmark_valid_index(p, *arg)) { err = -ENOENT; @@ -166,7 +166,8 @@ static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker) struct dsmark_qdisc_data *p = qdisc_priv(sch); int i; - pr_debug("dsmark_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); + pr_debug("%s(sch %p,[qdisc %p],walker %p)\n", + __func__, sch, p, walker); if (walker->stop) return; @@ -199,7 +200,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) struct dsmark_qdisc_data *p = qdisc_priv(sch); int err; - pr_debug("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); + pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p); if (p->set_tc_index) { switch (skb->protocol) { @@ -265,8 +266,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; drop: - kfree_skb(skb); - sch->qstats.drops++; + qdisc_drop(skb, sch); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } @@ -276,7 +276,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) struct sk_buff *skb; u32 index; - pr_debug("dsmark_dequeue(sch %p,[qdisc %p])\n", sch, p); + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); skb = p->q->ops->dequeue(p->q); if (skb == NULL) @@ -304,8 +304,8 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) * and don't need yet another qdisc as a bypass. */ if (p->mask[index] != 0xff || p->value[index]) - pr_warning("dsmark_dequeue: unsupported protocol %d\n", - ntohs(skb->protocol)); + pr_warn("%s: unsupported protocol %d\n", + __func__, ntohs(skb->protocol)); break; } @@ -316,7 +316,7 @@ static struct sk_buff *dsmark_peek(struct Qdisc *sch) { struct dsmark_qdisc_data *p = qdisc_priv(sch); - pr_debug("dsmark_peek(sch %p,[qdisc %p])\n", sch, p); + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); return p->q->ops->peek(p->q); } @@ -326,7 +326,7 @@ static unsigned int dsmark_drop(struct Qdisc *sch) struct dsmark_qdisc_data *p = qdisc_priv(sch); unsigned int len; - pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); if (p->q->ops->drop == NULL) return 0; @@ -347,7 +347,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) u16 indices; u8 *mask; - pr_debug("dsmark_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); + pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt); if (!opt) goto errout; @@ -385,7 +385,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) if (p->q == NULL) p->q = &noop_qdisc; - pr_debug("dsmark_init: qdisc %p\n", p->q); + pr_debug("%s: qdisc %p\n", __func__, p->q); err = 0; errout: @@ -396,7 +396,7 @@ static void dsmark_reset(struct Qdisc *sch) { struct dsmark_qdisc_data *p = qdisc_priv(sch); - pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); qdisc_reset(p->q); sch->q.qlen = 0; } @@ -405,7 +405,7 @@ static void dsmark_destroy(struct Qdisc *sch) { struct dsmark_qdisc_data *p = qdisc_priv(sch); - pr_debug("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p); + pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); tcf_destroy_chain(&p->filter_list); qdisc_destroy(p->q); @@ -418,7 +418,7 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, struct dsmark_qdisc_data *p = qdisc_priv(sch); struct nlattr *opts = NULL; - pr_debug("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n", sch, p, cl); + pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl); if (!dsmark_valid_index(p, cl)) return -EINVAL; @@ -429,8 +429,9 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; - NLA_PUT_U8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]); - NLA_PUT_U8(skb, TCA_DSMARK_VALUE, p->value[cl - 1]); + if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]) || + nla_put_u8(skb, TCA_DSMARK_VALUE, p->value[cl - 1])) + goto nla_put_failure; return nla_nest_end(skb, opts); @@ -447,13 +448,16 @@ static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; - NLA_PUT_U16(skb, TCA_DSMARK_INDICES, p->indices); + if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices)) + goto nla_put_failure; - if (p->default_index != NO_DEFAULT_INDEX) - NLA_PUT_U16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index); + if (p->default_index != NO_DEFAULT_INDEX && + nla_put_u16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index)) + goto nla_put_failure; - if (p->set_tc_index) - NLA_PUT_FLAG(skb, TCA_DSMARK_SET_TC_INDEX); + if (p->set_tc_index && + nla_put_flag(skb, TCA_DSMARK_SET_TC_INDEX)) + goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 66effe2da8e..e15a9eb2908 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -85,7 +85,8 @@ static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tc_fifo_qopt opt = { .limit = sch->limit }; - NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; nla_put_failure: diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c new file mode 100644 index 00000000000..ba32c2b005d --- /dev/null +++ b/net/sched/sch_fq.c @@ -0,0 +1,849 @@ +/* + * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing) + * + * Copyright (C) 2013 Eric Dumazet <edumazet@google.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Meant to be mostly used for localy generated traffic : + * Fast classification depends on skb->sk being set before reaching us. + * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash. + * All packets belonging to a socket are considered as a 'flow'. + * + * Flows are dynamically allocated and stored in a hash table of RB trees + * They are also part of one Round Robin 'queues' (new or old flows) + * + * Burst avoidance (aka pacing) capability : + * + * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a + * bunch of packets, and this packet scheduler adds delay between + * packets to respect rate limitation. + * + * enqueue() : + * - lookup one RB tree (out of 1024 or more) to find the flow. + * If non existent flow, create it, add it to the tree. + * Add skb to the per flow list of skb (fifo). + * - Use a special fifo for high prio packets + * + * dequeue() : serves flows in Round Robin + * Note : When a flow becomes empty, we do not immediately remove it from + * rb trees, for performance reasons (its expected to send additional packets, + * or SLAB cache will reuse socket for another flow) + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/hash.h> +#include <linux/prefetch.h> +#include <linux/vmalloc.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/sock.h> +#include <net/tcp_states.h> + +/* + * Per flow structure, dynamically allocated + */ +struct fq_flow { + struct sk_buff *head; /* list of skbs for this flow : first skb */ + union { + struct sk_buff *tail; /* last skb in the list */ + unsigned long age; /* jiffies when flow was emptied, for gc */ + }; + struct rb_node fq_node; /* anchor in fq_root[] trees */ + struct sock *sk; + int qlen; /* number of packets in flow queue */ + int credit; + u32 socket_hash; /* sk_hash */ + struct fq_flow *next; /* next pointer in RR lists, or &detached */ + + struct rb_node rate_node; /* anchor in q->delayed tree */ + u64 time_next_packet; +}; + +struct fq_flow_head { + struct fq_flow *first; + struct fq_flow *last; +}; + +struct fq_sched_data { + struct fq_flow_head new_flows; + + struct fq_flow_head old_flows; + + struct rb_root delayed; /* for rate limited flows */ + u64 time_next_delayed_flow; + + struct fq_flow internal; /* for non classified or high prio packets */ + u32 quantum; + u32 initial_quantum; + u32 flow_refill_delay; + u32 flow_max_rate; /* optional max rate per flow */ + u32 flow_plimit; /* max packets per flow */ + struct rb_root *fq_root; + u8 rate_enable; + u8 fq_trees_log; + + u32 flows; + u32 inactive_flows; + u32 throttled_flows; + + u64 stat_gc_flows; + u64 stat_internal_packets; + u64 stat_tcp_retrans; + u64 stat_throttled; + u64 stat_flows_plimit; + u64 stat_pkts_too_long; + u64 stat_allocation_errors; + struct qdisc_watchdog watchdog; +}; + +/* special value to mark a detached flow (not on old/new list) */ +static struct fq_flow detached, throttled; + +static void fq_flow_set_detached(struct fq_flow *f) +{ + f->next = &detached; + f->age = jiffies; +} + +static bool fq_flow_is_detached(const struct fq_flow *f) +{ + return f->next == &detached; +} + +static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) +{ + struct rb_node **p = &q->delayed.rb_node, *parent = NULL; + + while (*p) { + struct fq_flow *aux; + + parent = *p; + aux = container_of(parent, struct fq_flow, rate_node); + if (f->time_next_packet >= aux->time_next_packet) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&f->rate_node, parent, p); + rb_insert_color(&f->rate_node, &q->delayed); + q->throttled_flows++; + q->stat_throttled++; + + f->next = &throttled; + if (q->time_next_delayed_flow > f->time_next_packet) + q->time_next_delayed_flow = f->time_next_packet; +} + + +static struct kmem_cache *fq_flow_cachep __read_mostly; + +static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow) +{ + if (head->first) + head->last->next = flow; + else + head->first = flow; + head->last = flow; + flow->next = NULL; +} + +/* limit number of collected flows per round */ +#define FQ_GC_MAX 8 +#define FQ_GC_AGE (3*HZ) + +static bool fq_gc_candidate(const struct fq_flow *f) +{ + return fq_flow_is_detached(f) && + time_after(jiffies, f->age + FQ_GC_AGE); +} + +static void fq_gc(struct fq_sched_data *q, + struct rb_root *root, + struct sock *sk) +{ + struct fq_flow *f, *tofree[FQ_GC_MAX]; + struct rb_node **p, *parent; + int fcnt = 0; + + p = &root->rb_node; + parent = NULL; + while (*p) { + parent = *p; + + f = container_of(parent, struct fq_flow, fq_node); + if (f->sk == sk) + break; + + if (fq_gc_candidate(f)) { + tofree[fcnt++] = f; + if (fcnt == FQ_GC_MAX) + break; + } + + if (f->sk > sk) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + + q->flows -= fcnt; + q->inactive_flows -= fcnt; + q->stat_gc_flows += fcnt; + while (fcnt) { + struct fq_flow *f = tofree[--fcnt]; + + rb_erase(&f->fq_node, root); + kmem_cache_free(fq_flow_cachep, f); + } +} + +static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) +{ + struct rb_node **p, *parent; + struct sock *sk = skb->sk; + struct rb_root *root; + struct fq_flow *f; + + /* warning: no starvation prevention... */ + if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) + return &q->internal; + + if (unlikely(!sk)) { + /* By forcing low order bit to 1, we make sure to not + * collide with a local flow (socket pointers are word aligned) + */ + sk = (struct sock *)(skb_get_hash(skb) | 1L); + } + + root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)]; + + if (q->flows >= (2U << q->fq_trees_log) && + q->inactive_flows > q->flows/2) + fq_gc(q, root, sk); + + p = &root->rb_node; + parent = NULL; + while (*p) { + parent = *p; + + f = container_of(parent, struct fq_flow, fq_node); + if (f->sk == sk) { + /* socket might have been reallocated, so check + * if its sk_hash is the same. + * It not, we need to refill credit with + * initial quantum + */ + if (unlikely(skb->sk && + f->socket_hash != sk->sk_hash)) { + f->credit = q->initial_quantum; + f->socket_hash = sk->sk_hash; + f->time_next_packet = 0ULL; + } + return f; + } + if (f->sk > sk) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + + f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!f)) { + q->stat_allocation_errors++; + return &q->internal; + } + fq_flow_set_detached(f); + f->sk = sk; + if (skb->sk) + f->socket_hash = sk->sk_hash; + f->credit = q->initial_quantum; + + rb_link_node(&f->fq_node, parent, p); + rb_insert_color(&f->fq_node, root); + + q->flows++; + q->inactive_flows++; + return f; +} + + +/* remove one skb from head of flow queue */ +static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) +{ + struct sk_buff *skb = flow->head; + + if (skb) { + flow->head = skb->next; + skb->next = NULL; + flow->qlen--; + sch->qstats.backlog -= qdisc_pkt_len(skb); + sch->q.qlen--; + } + return skb; +} + +/* We might add in the future detection of retransmits + * For the time being, just return false + */ +static bool skb_is_retransmit(struct sk_buff *skb) +{ + return false; +} + +/* add skb to flow queue + * flow queue is a linked list, kind of FIFO, except for TCP retransmits + * We special case tcp retransmits to be transmitted before other packets. + * We rely on fact that TCP retransmits are unlikely, so we do not waste + * a separate queue or a pointer. + * head-> [retrans pkt 1] + * [retrans pkt 2] + * [ normal pkt 1] + * [ normal pkt 2] + * [ normal pkt 3] + * tail-> [ normal pkt 4] + */ +static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) +{ + struct sk_buff *prev, *head = flow->head; + + skb->next = NULL; + if (!head) { + flow->head = skb; + flow->tail = skb; + return; + } + if (likely(!skb_is_retransmit(skb))) { + flow->tail->next = skb; + flow->tail = skb; + return; + } + + /* This skb is a tcp retransmit, + * find the last retrans packet in the queue + */ + prev = NULL; + while (skb_is_retransmit(head)) { + prev = head; + head = head->next; + if (!head) + break; + } + if (!prev) { /* no rtx packet in queue, become the new head */ + skb->next = flow->head; + flow->head = skb; + } else { + if (prev == flow->tail) + flow->tail = skb; + else + skb->next = prev->next; + prev->next = skb; + } +} + +static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct fq_flow *f; + + if (unlikely(sch->q.qlen >= sch->limit)) + return qdisc_drop(skb, sch); + + f = fq_classify(skb, q); + if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { + q->stat_flows_plimit++; + return qdisc_drop(skb, sch); + } + + f->qlen++; + if (skb_is_retransmit(skb)) + q->stat_tcp_retrans++; + sch->qstats.backlog += qdisc_pkt_len(skb); + if (fq_flow_is_detached(f)) { + fq_flow_add_tail(&q->new_flows, f); + if (time_after(jiffies, f->age + q->flow_refill_delay)) + f->credit = max_t(u32, f->credit, q->quantum); + q->inactive_flows--; + qdisc_unthrottled(sch); + } + + /* Note: this overwrites f->age */ + flow_queue_add(f, skb); + + if (unlikely(f == &q->internal)) { + q->stat_internal_packets++; + qdisc_unthrottled(sch); + } + sch->q.qlen++; + + return NET_XMIT_SUCCESS; +} + +static void fq_check_throttled(struct fq_sched_data *q, u64 now) +{ + struct rb_node *p; + + if (q->time_next_delayed_flow > now) + return; + + q->time_next_delayed_flow = ~0ULL; + while ((p = rb_first(&q->delayed)) != NULL) { + struct fq_flow *f = container_of(p, struct fq_flow, rate_node); + + if (f->time_next_packet > now) { + q->time_next_delayed_flow = f->time_next_packet; + break; + } + rb_erase(p, &q->delayed); + q->throttled_flows--; + fq_flow_add_tail(&q->old_flows, f); + } +} + +static struct sk_buff *fq_dequeue(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + u64 now = ktime_to_ns(ktime_get()); + struct fq_flow_head *head; + struct sk_buff *skb; + struct fq_flow *f; + u32 rate; + + skb = fq_dequeue_head(sch, &q->internal); + if (skb) + goto out; + fq_check_throttled(q, now); +begin: + head = &q->new_flows; + if (!head->first) { + head = &q->old_flows; + if (!head->first) { + if (q->time_next_delayed_flow != ~0ULL) + qdisc_watchdog_schedule_ns(&q->watchdog, + q->time_next_delayed_flow); + return NULL; + } + } + f = head->first; + + if (f->credit <= 0) { + f->credit += q->quantum; + head->first = f->next; + fq_flow_add_tail(&q->old_flows, f); + goto begin; + } + + if (unlikely(f->head && now < f->time_next_packet)) { + head->first = f->next; + fq_flow_set_throttled(q, f); + goto begin; + } + + skb = fq_dequeue_head(sch, f); + if (!skb) { + head->first = f->next; + /* force a pass through old_flows to prevent starvation */ + if ((head == &q->new_flows) && q->old_flows.first) { + fq_flow_add_tail(&q->old_flows, f); + } else { + fq_flow_set_detached(f); + q->inactive_flows++; + } + goto begin; + } + prefetch(&skb->end); + f->time_next_packet = now; + f->credit -= qdisc_pkt_len(skb); + + if (f->credit > 0 || !q->rate_enable) + goto out; + + rate = q->flow_max_rate; + if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) + rate = min(skb->sk->sk_pacing_rate, rate); + + if (rate != ~0U) { + u32 plen = max(qdisc_pkt_len(skb), q->quantum); + u64 len = (u64)plen * NSEC_PER_SEC; + + if (likely(rate)) + do_div(len, rate); + /* Since socket rate can change later, + * clamp the delay to 125 ms. + * TODO: maybe segment the too big skb, as in commit + * e43ac79a4bc ("sch_tbf: segment too big GSO packets") + */ + if (unlikely(len > 125 * NSEC_PER_MSEC)) { + len = 125 * NSEC_PER_MSEC; + q->stat_pkts_too_long++; + } + + f->time_next_packet = now + len; + } +out: + qdisc_bstats_update(sch, skb); + qdisc_unthrottled(sch); + return skb; +} + +static void fq_reset(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct rb_root *root; + struct sk_buff *skb; + struct rb_node *p; + struct fq_flow *f; + unsigned int idx; + + while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL) + kfree_skb(skb); + + if (!q->fq_root) + return; + + for (idx = 0; idx < (1U << q->fq_trees_log); idx++) { + root = &q->fq_root[idx]; + while ((p = rb_first(root)) != NULL) { + f = container_of(p, struct fq_flow, fq_node); + rb_erase(p, root); + + while ((skb = fq_dequeue_head(sch, f)) != NULL) + kfree_skb(skb); + + kmem_cache_free(fq_flow_cachep, f); + } + } + q->new_flows.first = NULL; + q->old_flows.first = NULL; + q->delayed = RB_ROOT; + q->flows = 0; + q->inactive_flows = 0; + q->throttled_flows = 0; +} + +static void fq_rehash(struct fq_sched_data *q, + struct rb_root *old_array, u32 old_log, + struct rb_root *new_array, u32 new_log) +{ + struct rb_node *op, **np, *parent; + struct rb_root *oroot, *nroot; + struct fq_flow *of, *nf; + int fcnt = 0; + u32 idx; + + for (idx = 0; idx < (1U << old_log); idx++) { + oroot = &old_array[idx]; + while ((op = rb_first(oroot)) != NULL) { + rb_erase(op, oroot); + of = container_of(op, struct fq_flow, fq_node); + if (fq_gc_candidate(of)) { + fcnt++; + kmem_cache_free(fq_flow_cachep, of); + continue; + } + nroot = &new_array[hash_32((u32)(long)of->sk, new_log)]; + + np = &nroot->rb_node; + parent = NULL; + while (*np) { + parent = *np; + + nf = container_of(parent, struct fq_flow, fq_node); + BUG_ON(nf->sk == of->sk); + + if (nf->sk > of->sk) + np = &parent->rb_right; + else + np = &parent->rb_left; + } + + rb_link_node(&of->fq_node, parent, np); + rb_insert_color(&of->fq_node, nroot); + } + } + q->flows -= fcnt; + q->inactive_flows -= fcnt; + q->stat_gc_flows += fcnt; +} + +static void *fq_alloc_node(size_t sz, int node) +{ + void *ptr; + + ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node); + if (!ptr) + ptr = vmalloc_node(sz, node); + return ptr; +} + +static void fq_free(void *addr) +{ + kvfree(addr); +} + +static int fq_resize(struct Qdisc *sch, u32 log) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct rb_root *array; + void *old_fq_root; + u32 idx; + + if (q->fq_root && log == q->fq_trees_log) + return 0; + + /* If XPS was setup, we can allocate memory on right NUMA node */ + array = fq_alloc_node(sizeof(struct rb_root) << log, + netdev_queue_numa_node_read(sch->dev_queue)); + if (!array) + return -ENOMEM; + + for (idx = 0; idx < (1U << log); idx++) + array[idx] = RB_ROOT; + + sch_tree_lock(sch); + + old_fq_root = q->fq_root; + if (old_fq_root) + fq_rehash(q, old_fq_root, q->fq_trees_log, array, log); + + q->fq_root = array; + q->fq_trees_log = log; + + sch_tree_unlock(sch); + + fq_free(old_fq_root); + + return 0; +} + +static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { + [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, + [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, + [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, +}; + +static int fq_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_FQ_MAX + 1]; + int err, drop_count = 0; + u32 fq_log; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + + fq_log = q->fq_trees_log; + + if (tb[TCA_FQ_BUCKETS_LOG]) { + u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]); + + if (nval >= 1 && nval <= ilog2(256*1024)) + fq_log = nval; + else + err = -EINVAL; + } + if (tb[TCA_FQ_PLIMIT]) + sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]); + + if (tb[TCA_FQ_FLOW_PLIMIT]) + q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]); + + if (tb[TCA_FQ_QUANTUM]) + q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); + + if (tb[TCA_FQ_INITIAL_QUANTUM]) + q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); + + if (tb[TCA_FQ_FLOW_DEFAULT_RATE]) + pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", + nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE])); + + if (tb[TCA_FQ_FLOW_MAX_RATE]) + q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); + + if (tb[TCA_FQ_RATE_ENABLE]) { + u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); + + if (enable <= 1) + q->rate_enable = enable; + else + err = -EINVAL; + } + + if (tb[TCA_FQ_FLOW_REFILL_DELAY]) { + u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ; + + q->flow_refill_delay = usecs_to_jiffies(usecs_delay); + } + + if (!err) { + sch_tree_unlock(sch); + err = fq_resize(sch, fq_log); + sch_tree_lock(sch); + } + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = fq_dequeue(sch); + + if (!skb) + break; + kfree_skb(skb); + drop_count++; + } + qdisc_tree_decrease_qlen(sch, drop_count); + + sch_tree_unlock(sch); + return err; +} + +static void fq_destroy(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + + fq_reset(sch); + fq_free(q->fq_root); + qdisc_watchdog_cancel(&q->watchdog); +} + +static int fq_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_sched_data *q = qdisc_priv(sch); + int err; + + sch->limit = 10000; + q->flow_plimit = 100; + q->quantum = 2 * psched_mtu(qdisc_dev(sch)); + q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); + q->flow_refill_delay = msecs_to_jiffies(40); + q->flow_max_rate = ~0U; + q->rate_enable = 1; + q->new_flows.first = NULL; + q->old_flows.first = NULL; + q->delayed = RB_ROOT; + q->fq_root = NULL; + q->fq_trees_log = ilog2(1024); + qdisc_watchdog_init(&q->watchdog, sch); + + if (opt) + err = fq_change(sch, opt); + else + err = fq_resize(sch, q->fq_trees_log); + + return err; +} + +static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ + + if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || + nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || + nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || + nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || + nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) || + nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, + jiffies_to_usecs(q->flow_refill_delay)) || + nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + return -1; +} + +static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct fq_sched_data *q = qdisc_priv(sch); + u64 now = ktime_to_ns(ktime_get()); + struct tc_fq_qd_stats st = { + .gc_flows = q->stat_gc_flows, + .highprio_packets = q->stat_internal_packets, + .tcp_retrans = q->stat_tcp_retrans, + .throttled = q->stat_throttled, + .flows_plimit = q->stat_flows_plimit, + .pkts_too_long = q->stat_pkts_too_long, + .allocation_errors = q->stat_allocation_errors, + .flows = q->flows, + .inactive_flows = q->inactive_flows, + .throttled_flows = q->throttled_flows, + .time_next_delayed_flow = q->time_next_delayed_flow - now, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc_ops fq_qdisc_ops __read_mostly = { + .id = "fq", + .priv_size = sizeof(struct fq_sched_data), + + .enqueue = fq_enqueue, + .dequeue = fq_dequeue, + .peek = qdisc_peek_dequeued, + .init = fq_init, + .reset = fq_reset, + .destroy = fq_destroy, + .change = fq_change, + .dump = fq_dump, + .dump_stats = fq_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init fq_module_init(void) +{ + int ret; + + fq_flow_cachep = kmem_cache_create("fq_flow_cache", + sizeof(struct fq_flow), + 0, 0, NULL); + if (!fq_flow_cachep) + return -ENOMEM; + + ret = register_qdisc(&fq_qdisc_ops); + if (ret) + kmem_cache_destroy(fq_flow_cachep); + return ret; +} + +static void __exit fq_module_exit(void) +{ + unregister_qdisc(&fq_qdisc_ops); + kmem_cache_destroy(fq_flow_cachep); +} + +module_init(fq_module_init) +module_exit(fq_module_exit) +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c new file mode 100644 index 00000000000..063b726bf1f --- /dev/null +++ b/net/sched/sch_fq_codel.c @@ -0,0 +1,620 @@ +/* + * Fair Queue CoDel discipline + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/flow_keys.h> +#include <net/codel.h> + +/* Fair Queue CoDel. + * + * Principles : + * Packets are classified (internal classifier or external) on flows. + * This is a Stochastic model (as we use a hash, several flows + * might be hashed on same slot) + * Each flow has a CoDel managed queue. + * Flows are linked onto two (Round Robin) lists, + * so that new flows have priority on old ones. + * + * For a given flow, packets are not reordered (CoDel uses a FIFO) + * head drops only. + * ECN capability is on by default. + * Low memory footprint (64 bytes per flow) + */ + +struct fq_codel_flow { + struct sk_buff *head; + struct sk_buff *tail; + struct list_head flowchain; + int deficit; + u32 dropped; /* number of drops (or ECN marks) on this flow */ + struct codel_vars cvars; +}; /* please try to keep this structure <= 64 bytes */ + +struct fq_codel_sched_data { + struct tcf_proto *filter_list; /* optional external classifier */ + struct fq_codel_flow *flows; /* Flows table [flows_cnt] */ + u32 *backlogs; /* backlog table [flows_cnt] */ + u32 flows_cnt; /* number of flows */ + u32 perturbation; /* hash perturbation */ + u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ + struct codel_params cparams; + struct codel_stats cstats; + u32 drop_overlimit; + u32 new_flow_count; + + struct list_head new_flows; /* list of new flows */ + struct list_head old_flows; /* list of old flows */ +}; + +static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, + const struct sk_buff *skb) +{ + struct flow_keys keys; + unsigned int hash; + + skb_flow_dissect(skb, &keys); + hash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src ^ keys.ip_proto, + (__force u32)keys.ports, q->perturbation); + return ((u64)hash * q->flows_cnt) >> 32; +} + +static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct tcf_result res; + int result; + + if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->flows_cnt) + return TC_H_MIN(skb->priority); + + if (!q->filter_list) + return fq_codel_hash(q, skb) + 1; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return 0; + } +#endif + if (TC_H_MIN(res.classid) <= q->flows_cnt) + return TC_H_MIN(res.classid); + } + return 0; +} + +/* helper functions : might be changed when/if skb use a standard list_head */ + +/* remove one skb from head of slot queue */ +static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow) +{ + struct sk_buff *skb = flow->head; + + flow->head = skb->next; + skb->next = NULL; + return skb; +} + +/* add skb to flow queue (tail add) */ +static inline void flow_queue_add(struct fq_codel_flow *flow, + struct sk_buff *skb) +{ + if (flow->head == NULL) + flow->head = skb; + else + flow->tail->next = skb; + flow->tail = skb; + skb->next = NULL; +} + +static unsigned int fq_codel_drop(struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + unsigned int maxbacklog = 0, idx = 0, i, len; + struct fq_codel_flow *flow; + + /* Queue is full! Find the fat flow and drop packet from it. + * This might sound expensive, but with 1024 flows, we scan + * 4KB of memory, and we dont need to handle a complex tree + * in fast path (packet queue/enqueue) with many cache misses. + */ + for (i = 0; i < q->flows_cnt; i++) { + if (q->backlogs[i] > maxbacklog) { + maxbacklog = q->backlogs[i]; + idx = i; + } + } + flow = &q->flows[idx]; + skb = dequeue_head(flow); + len = qdisc_pkt_len(skb); + q->backlogs[idx] -= len; + kfree_skb(skb); + sch->q.qlen--; + sch->qstats.drops++; + sch->qstats.backlog -= len; + flow->dropped++; + return idx; +} + +static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + unsigned int idx; + struct fq_codel_flow *flow; + int uninitialized_var(ret); + + idx = fq_codel_classify(skb, sch, &ret); + if (idx == 0) { + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } + idx--; + + codel_set_enqueue_time(skb); + flow = &q->flows[idx]; + flow_queue_add(flow, skb); + q->backlogs[idx] += qdisc_pkt_len(skb); + sch->qstats.backlog += qdisc_pkt_len(skb); + + if (list_empty(&flow->flowchain)) { + list_add_tail(&flow->flowchain, &q->new_flows); + q->new_flow_count++; + flow->deficit = q->quantum; + flow->dropped = 0; + } + if (++sch->q.qlen <= sch->limit) + return NET_XMIT_SUCCESS; + + q->drop_overlimit++; + /* Return Congestion Notification only if we dropped a packet + * from this flow. + */ + if (fq_codel_drop(sch) == idx) + return NET_XMIT_CN; + + /* As we dropped a packet, better let upper stack know this */ + qdisc_tree_decrease_qlen(sch, 1); + return NET_XMIT_SUCCESS; +} + +/* This is the specific function called from codel_dequeue() + * to dequeue a packet from queue. Note: backlog is handled in + * codel, we dont need to reduce it here. + */ +static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct fq_codel_flow *flow; + struct sk_buff *skb = NULL; + + flow = container_of(vars, struct fq_codel_flow, cvars); + if (flow->head) { + skb = dequeue_head(flow); + q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb); + sch->q.qlen--; + } + return skb; +} + +static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + struct fq_codel_flow *flow; + struct list_head *head; + u32 prev_drop_count, prev_ecn_mark; + +begin: + head = &q->new_flows; + if (list_empty(head)) { + head = &q->old_flows; + if (list_empty(head)) + return NULL; + } + flow = list_first_entry(head, struct fq_codel_flow, flowchain); + + if (flow->deficit <= 0) { + flow->deficit += q->quantum; + list_move_tail(&flow->flowchain, &q->old_flows); + goto begin; + } + + prev_drop_count = q->cstats.drop_count; + prev_ecn_mark = q->cstats.ecn_mark; + + skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats, + dequeue); + + flow->dropped += q->cstats.drop_count - prev_drop_count; + flow->dropped += q->cstats.ecn_mark - prev_ecn_mark; + + if (!skb) { + /* force a pass through old_flows to prevent starvation */ + if ((head == &q->new_flows) && !list_empty(&q->old_flows)) + list_move_tail(&flow->flowchain, &q->old_flows); + else + list_del_init(&flow->flowchain); + goto begin; + } + qdisc_bstats_update(sch, skb); + flow->deficit -= qdisc_pkt_len(skb); + /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, + * or HTB crashes. Defer it for next round. + */ + if (q->cstats.drop_count && sch->q.qlen) { + qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); + q->cstats.drop_count = 0; + } + return skb; +} + +static void fq_codel_reset(struct Qdisc *sch) +{ + struct sk_buff *skb; + + while ((skb = fq_codel_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { + [TCA_FQ_CODEL_TARGET] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_LIMIT] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_INTERVAL] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 }, +}; + +static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy); + if (err < 0) + return err; + if (tb[TCA_FQ_CODEL_FLOWS]) { + if (q->flows) + return -EINVAL; + q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]); + if (!q->flows_cnt || + q->flows_cnt > 65536) + return -EINVAL; + } + sch_tree_lock(sch); + + if (tb[TCA_FQ_CODEL_TARGET]) { + u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]); + + q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_FQ_CODEL_INTERVAL]) { + u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); + + q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_FQ_CODEL_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]); + + if (tb[TCA_FQ_CODEL_ECN]) + q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]); + + if (tb[TCA_FQ_CODEL_QUANTUM]) + q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); + + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = fq_codel_dequeue(sch); + + kfree_skb(skb); + q->cstats.drop_count++; + } + qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); + q->cstats.drop_count = 0; + + sch_tree_unlock(sch); + return 0; +} + +static void *fq_codel_zalloc(size_t sz) +{ + void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); + + if (!ptr) + ptr = vzalloc(sz); + return ptr; +} + +static void fq_codel_free(void *addr) +{ + kvfree(addr); +} + +static void fq_codel_destroy(struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + fq_codel_free(q->backlogs); + fq_codel_free(q->flows); +} + +static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + int i; + + sch->limit = 10*1024; + q->flows_cnt = 1024; + q->quantum = psched_mtu(qdisc_dev(sch)); + q->perturbation = prandom_u32(); + INIT_LIST_HEAD(&q->new_flows); + INIT_LIST_HEAD(&q->old_flows); + codel_params_init(&q->cparams); + codel_stats_init(&q->cstats); + q->cparams.ecn = true; + + if (opt) { + int err = fq_codel_change(sch, opt); + if (err) + return err; + } + + if (!q->flows) { + q->flows = fq_codel_zalloc(q->flows_cnt * + sizeof(struct fq_codel_flow)); + if (!q->flows) + return -ENOMEM; + q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32)); + if (!q->backlogs) { + fq_codel_free(q->flows); + return -ENOMEM; + } + for (i = 0; i < q->flows_cnt; i++) { + struct fq_codel_flow *flow = q->flows + i; + + INIT_LIST_HEAD(&flow->flowchain); + codel_vars_init(&flow->cvars); + } + } + if (sch->limit >= 1) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; + return 0; +} + +static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET, + codel_time_to_us(q->cparams.target)) || + nla_put_u32(skb, TCA_FQ_CODEL_LIMIT, + sch->limit) || + nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL, + codel_time_to_us(q->cparams.interval)) || + nla_put_u32(skb, TCA_FQ_CODEL_ECN, + q->cparams.ecn) || + nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM, + q->quantum) || + nla_put_u32(skb, TCA_FQ_CODEL_FLOWS, + q->flows_cnt)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + return -1; +} + +static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct tc_fq_codel_xstats st = { + .type = TCA_FQ_CODEL_XSTATS_QDISC, + }; + struct list_head *pos; + + st.qdisc_stats.maxpacket = q->cstats.maxpacket; + st.qdisc_stats.drop_overlimit = q->drop_overlimit; + st.qdisc_stats.ecn_mark = q->cstats.ecn_mark; + st.qdisc_stats.new_flow_count = q->new_flow_count; + + list_for_each(pos, &q->new_flows) + st.qdisc_stats.new_flows_len++; + + list_for_each(pos, &q->old_flows) + st.qdisc_stats.old_flows_len++; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + /* we cannot bypass queue discipline anymore */ + sch->flags &= ~TCQ_F_CAN_BYPASS; + return 0; +} + +static void fq_codel_put(struct Qdisc *q, unsigned long cl) +{ +} + +static struct tcf_proto **fq_codel_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + u32 idx = cl - 1; + struct gnet_stats_queue qs = { 0 }; + struct tc_fq_codel_xstats xstats; + + if (idx < q->flows_cnt) { + const struct fq_codel_flow *flow = &q->flows[idx]; + const struct sk_buff *skb = flow->head; + + memset(&xstats, 0, sizeof(xstats)); + xstats.type = TCA_FQ_CODEL_XSTATS_CLASS; + xstats.class_stats.deficit = flow->deficit; + xstats.class_stats.ldelay = + codel_time_to_us(flow->cvars.ldelay); + xstats.class_stats.count = flow->cvars.count; + xstats.class_stats.lastcount = flow->cvars.lastcount; + xstats.class_stats.dropping = flow->cvars.dropping; + if (flow->cvars.dropping) { + codel_tdiff_t delta = flow->cvars.drop_next - + codel_get_time(); + + xstats.class_stats.drop_next = (delta >= 0) ? + codel_time_to_us(delta) : + -codel_time_to_us(-delta); + } + while (skb) { + qs.qlen++; + skb = skb->next; + } + qs.backlog = q->backlogs[idx]; + qs.drops = flow->dropped; + } + if (gnet_stats_copy_queue(d, &qs) < 0) + return -1; + if (idx < q->flows_cnt) + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); + return 0; +} + +static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < q->flows_cnt; i++) { + if (list_empty(&q->flows[i].flowchain) || + arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops fq_codel_class_ops = { + .leaf = fq_codel_leaf, + .get = fq_codel_get, + .put = fq_codel_put, + .tcf_chain = fq_codel_find_tcf, + .bind_tcf = fq_codel_bind, + .unbind_tcf = fq_codel_put, + .dump = fq_codel_dump_class, + .dump_stats = fq_codel_dump_class_stats, + .walk = fq_codel_walk, +}; + +static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { + .cl_ops = &fq_codel_class_ops, + .id = "fq_codel", + .priv_size = sizeof(struct fq_codel_sched_data), + .enqueue = fq_codel_enqueue, + .dequeue = fq_codel_dequeue, + .peek = qdisc_peek_dequeued, + .drop = fq_codel_drop, + .init = fq_codel_init, + .reset = fq_codel_reset, + .destroy = fq_codel_destroy, + .change = fq_codel_change, + .dump = fq_codel_dump, + .dump_stats = fq_codel_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init fq_codel_module_init(void) +{ + return register_qdisc(&fq_codel_qdisc_ops); +} + +static void __exit fq_codel_module_exit(void) +{ + unregister_qdisc(&fq_codel_qdisc_ops); +} + +module_init(fq_codel_module_init) +module_exit(fq_codel_module_exit) +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 67fc573e013..e1543b03e39 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -25,9 +25,15 @@ #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/slab.h> +#include <linux/if_vlan.h> +#include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/dst.h> +/* Qdisc to use by default */ +const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops; +EXPORT_SYMBOL(default_qdisc_ops); + /* Main transmission queue. */ /* Modifications to data participating in scheduling must be protected with @@ -53,20 +59,19 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) static inline struct sk_buff *dequeue_skb(struct Qdisc *q) { struct sk_buff *skb = q->gso_skb; + const struct netdev_queue *txq = q->dev_queue; if (unlikely(skb)) { - struct net_device *dev = qdisc_dev(q); - struct netdev_queue *txq; - /* check the reason of requeuing without tx lock first */ - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + txq = netdev_get_tx_queue(txq->dev, skb_get_queue_mapping(skb)); if (!netif_xmit_frozen_or_stopped(txq)) { q->gso_skb = NULL; q->q.qlen--; } else skb = NULL; } else { - skb = q->dequeue(q); + if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) + skb = q->dequeue(q); } return skb; @@ -86,9 +91,8 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, * deadloop is detected. Return OK to try the next skb. */ kfree_skb(skb); - if (net_ratelimit()) - pr_warning("Dead loop on netdevice %s, fix it urgently!\n", - dev_queue->dev->name); + net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n", + dev_queue->dev->name); ret = qdisc_qlen(q); } else { /* @@ -136,9 +140,9 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, ret = handle_dev_cpu_collision(skb, txq, q); } else { /* Driver returned NETDEV_TX_BUSY - requeue skb */ - if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit())) - pr_warning("BUG %s code %d qlen %d\n", - dev->name, ret, q->q.qlen); + if (unlikely(ret != NETDEV_TX_BUSY)) + net_warn_ratelimited("BUG %s code %d qlen %d\n", + dev->name, ret, q->q.qlen); ret = dev_requeue_skb(skb, q); } @@ -208,15 +212,19 @@ void __qdisc_run(struct Qdisc *q) unsigned long dev_trans_start(struct net_device *dev) { - unsigned long val, res = dev->trans_start; + unsigned long val, res; unsigned int i; + if (is_vlan_dev(dev)) + dev = vlan_dev_real_dev(dev); + res = dev->trans_start; for (i = 0; i < dev->num_tx_queues; i++) { val = netdev_get_tx_queue(dev, i)->trans_start; if (val && time_after(val, res)) res = val; } dev->trans_start = res; + return res; } EXPORT_SYMBOL(dev_trans_start); @@ -302,6 +310,7 @@ void netif_carrier_on(struct net_device *dev) if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; + atomic_inc(&dev->carrier_changes); linkwatch_fire_event(dev); if (netif_running(dev)) __netdev_watchdog_up(dev); @@ -320,41 +329,24 @@ void netif_carrier_off(struct net_device *dev) if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; + atomic_inc(&dev->carrier_changes); linkwatch_fire_event(dev); } } EXPORT_SYMBOL(netif_carrier_off); -/** - * netif_notify_peers - notify network peers about existence of @dev - * @dev: network device - * - * Generate traffic such that interested network peers are aware of - * @dev, such as by generating a gratuitous ARP. This may be used when - * a device wants to inform the rest of the network about some sort of - * reconfiguration such as a failover event or virtual machine - * migration. - */ -void netif_notify_peers(struct net_device *dev) -{ - rtnl_lock(); - call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); - rtnl_unlock(); -} -EXPORT_SYMBOL(netif_notify_peers); - /* "NOOP" scheduler: the best scheduler, recommended for all interfaces under all circumstances. It is difficult to invent anything faster or cheaper. */ -static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc) +static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc) { kfree_skb(skb); return NET_XMIT_CN; } -static struct sk_buff *noop_dequeue(struct Qdisc * qdisc) +static struct sk_buff *noop_dequeue(struct Qdisc *qdisc) { return NULL; } @@ -512,7 +504,8 @@ static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1); - NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; nla_put_failure: @@ -543,15 +536,17 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { .dump = pfifo_fast_dump, .owner = THIS_MODULE, }; -EXPORT_SYMBOL(pfifo_fast_ops); + +static struct lock_class_key qdisc_tx_busylock; struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, - struct Qdisc_ops *ops) + const struct Qdisc_ops *ops) { void *p; struct Qdisc *sch; unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size; int err = -ENOBUFS; + struct net_device *dev = dev_queue->dev; p = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue)); @@ -571,12 +566,16 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, } INIT_LIST_HEAD(&sch->list); skb_queue_head_init(&sch->q); + spin_lock_init(&sch->busylock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + sch->ops = ops; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; - dev_hold(qdisc_dev(sch)); + dev_hold(dev); atomic_set(&sch->refcnt, 1); return sch; @@ -585,10 +584,14 @@ errout: } struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, - struct Qdisc_ops *ops, unsigned int parentid) + const struct Qdisc_ops *ops, + unsigned int parentid) { struct Qdisc *sch; + if (!try_module_get(ops->owner)) + goto errout; + sch = qdisc_alloc(dev_queue, ops); if (IS_ERR(sch)) goto errout; @@ -692,11 +695,13 @@ static void attach_one_default_qdisc(struct net_device *dev, if (dev->tx_queue_len) { qdisc = qdisc_create_dflt(dev_queue, - &pfifo_fast_ops, TC_H_ROOT); + default_qdisc_ops, TC_H_ROOT); if (!qdisc) { netdev_info(dev, "activation failed\n"); return; } + if (!netif_is_multiqueue(dev)) + qdisc->flags |= TCQ_F_ONETXQUEUE; } dev_queue->qdisc_sleeping = qdisc; } @@ -715,8 +720,8 @@ static void attach_default_qdiscs(struct net_device *dev) } else { qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT); if (qdisc) { - qdisc->ops->attach(qdisc); dev->qdisc = qdisc; + qdisc->ops->attach(qdisc); } } } @@ -743,9 +748,8 @@ void dev_activate(struct net_device *dev) int need_watchdog; /* No queueing discipline is attached to device; - create default one i.e. pfifo_fast for devices, - which need queueing and noqueue_qdisc for - virtual interfaces + * create default one for devices, which need queueing + * and noqueue_qdisc for virtual interfaces */ if (dev->qdisc == &noop_qdisc) @@ -827,7 +831,7 @@ void dev_deactivate_many(struct list_head *head) struct net_device *dev; bool sync_needed = false; - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc); if (dev_ingress_queue(dev)) @@ -846,7 +850,7 @@ void dev_deactivate_many(struct list_head *head) synchronize_net(); /* Wait for outstanding qdisc_run calls. */ - list_for_each_entry(dev, head, unreg_list) + list_for_each_entry(dev, head, close_list) while (some_qdisc_is_busy(dev)) yield(); } @@ -855,7 +859,7 @@ void dev_deactivate(struct net_device *dev) { LIST_HEAD(single); - list_add(&dev->unreg_list, &single); + list_add(&dev->close_list, &single); dev_deactivate_many(&single); list_del(&single); } @@ -906,3 +910,39 @@ void dev_shutdown(struct net_device *dev) WARN_ON(timer_pending(&dev->watchdog_timer)); } + +void psched_ratecfg_precompute(struct psched_ratecfg *r, + const struct tc_ratespec *conf, + u64 rate64) +{ + memset(r, 0, sizeof(*r)); + r->overhead = conf->overhead; + r->rate_bytes_ps = max_t(u64, conf->rate, rate64); + r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); + r->mult = 1; + /* + * The deal here is to replace a divide by a reciprocal one + * in fast path (a reciprocal divide is a multiply and a shift) + * + * Normal formula would be : + * time_in_ns = (NSEC_PER_SEC * len) / rate_bps + * + * We compute mult/shift to use instead : + * time_in_ns = (len * mult) >> shift; + * + * We try to get the highest possible mult value for accuracy, + * but have to make sure no overflows will ever happen. + */ + if (r->rate_bytes_ps > 0) { + u64 factor = NSEC_PER_SEC; + + for (;;) { + r->mult = div64_u64(factor, r->rate_bytes_ps); + if (r->mult & (1U << 31) || factor & (1ULL << 63)) + break; + factor <<= 1; + r->shift++; + } + } +} +EXPORT_SYMBOL(psched_ratecfg_precompute); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 0b15236be7b..12cbc09157f 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -102,9 +102,8 @@ static inline int gred_wred_mode_check(struct Qdisc *sch) if (q == NULL) continue; - for (n = 0; n < table->DPs; n++) - if (table->tab[n] && table->tab[n] != q && - table->tab[n]->prio == q->prio) + for (n = i + 1; n < table->DPs; n++) + if (table->tab[n] && table->tab[n]->prio == q->prio) return 1; } @@ -137,6 +136,7 @@ static inline void gred_store_wred_set(struct gred_sched *table, struct gred_sched_data *q) { table->wred_set.qavg = q->vars.qavg; + table->wred_set.qidlestart = q->vars.qidlestart; } static inline int gred_use_ecn(struct gred_sched *t) @@ -176,7 +176,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp; } - /* sum up all the qaves of prios <= to ours to get the new qave */ + /* sum up all the qaves of prios < ours to get the new qave */ if (!gred_wred_mode(t) && gred_rio_mode(t)) { int i; @@ -255,23 +255,23 @@ static struct sk_buff *gred_dequeue(struct Qdisc *sch) u16 dp = tc_index_to_dp(skb); if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { - if (net_ratelimit()) - pr_warning("GRED: Unable to relocate VQ 0x%x " - "after dequeue, screwing up " - "backlog.\n", tc_index_to_dp(skb)); + net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x after dequeue, screwing up backlog\n", + tc_index_to_dp(skb)); } else { q->backlog -= qdisc_pkt_len(skb); - if (!q->backlog && !gred_wred_mode(t)) - red_start_of_idle_period(&q->vars); + if (gred_wred_mode(t)) { + if (!sch->qstats.backlog) + red_start_of_idle_period(&t->wred_set); + } else { + if (!q->backlog) + red_start_of_idle_period(&q->vars); + } } return skb; } - if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) - red_start_of_idle_period(&t->wred_set); - return NULL; } @@ -287,27 +287,26 @@ static unsigned int gred_drop(struct Qdisc *sch) u16 dp = tc_index_to_dp(skb); if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { - if (net_ratelimit()) - pr_warning("GRED: Unable to relocate VQ 0x%x " - "while dropping, screwing up " - "backlog.\n", tc_index_to_dp(skb)); + net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x while dropping, screwing up backlog\n", + tc_index_to_dp(skb)); } else { q->backlog -= len; q->stats.other++; - if (!q->backlog && !gred_wred_mode(t)) - red_start_of_idle_period(&q->vars); + if (gred_wred_mode(t)) { + if (!sch->qstats.backlog) + red_start_of_idle_period(&t->wred_set); + } else { + if (!q->backlog) + red_start_of_idle_period(&q->vars); + } } qdisc_drop(skb, sch); return len; } - if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) - red_start_of_idle_period(&t->wred_set); - return 0; - } static void gred_reset(struct Qdisc *sch) @@ -371,8 +370,8 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) for (i = table->DPs; i < MAX_DPs; i++) { if (table->tab[i]) { - pr_warning("GRED: Warning: Destroying " - "shadowed VQ 0x%x\n", i); + pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n", + i); gred_destroy_vq(table->tab[i]); table->tab[i] = NULL; } @@ -521,14 +520,16 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; - NLA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt); + if (nla_put(skb, TCA_GRED_DPS, sizeof(sopt), &sopt)) + goto nla_put_failure; for (i = 0; i < MAX_DPs; i++) { struct gred_sched_data *q = table->tab[i]; max_p[i] = q ? q->parms.max_P : 0; } - NLA_PUT(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p); + if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p)) + goto nla_put_failure; parms = nla_nest_start(skb, TCA_GRED_PARMS); if (parms == NULL) @@ -537,6 +538,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) for (i = 0; i < MAX_DPs; i++) { struct gred_sched_data *q = table->tab[i]; struct tc_gred_qopt opt; + unsigned long qavg; memset(&opt, 0, sizeof(opt)); @@ -565,13 +567,12 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) opt.packets = q->packetsin; opt.bytesin = q->bytesin; - if (gred_wred_mode(table)) { - q->vars.qidlestart = - table->tab[table->def]->vars.qidlestart; - q->vars.qavg = table->tab[table->def]->vars.qavg; - } + if (gred_wred_mode(table)) + gred_load_wred_set(table, q); - opt.qave = red_calc_qavg(&q->parms, &q->vars, q->vars.qavg); + qavg = red_calc_qavg(&q->parms, &q->vars, + q->vars.qavg >> q->parms.Wlog); + opt.qave = qavg >> q->parms.Wlog; append_opt: if (nla_append(skb, sizeof(opt), &opt) < 0) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 9bdca2e011e..ec8aeaac1dd 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -114,7 +114,7 @@ struct hfsc_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; unsigned int level; /* class level in hierarchy */ struct tcf_proto *filter_list; /* filter list */ unsigned int filter_cnt; /* filter count */ @@ -1305,7 +1305,8 @@ hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc) tsc.m1 = sm2m(sc->sm1); tsc.d = dx2d(sc->dx); tsc.m2 = sm2m(sc->sm2); - NLA_PUT(skb, attr, sizeof(tsc), &tsc); + if (nla_put(skb, attr, sizeof(tsc), &tsc)) + goto nla_put_failure; return skb->len; @@ -1352,8 +1353,7 @@ hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, goto nla_put_failure; if (hfsc_dump_curves(skb, cl) < 0) goto nla_put_failure; - nla_nest_end(skb, nest); - return skb->len; + return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); @@ -1388,7 +1388,6 @@ static void hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct hfsc_sched *q = qdisc_priv(sch); - struct hlist_node *n; struct hfsc_class *cl; unsigned int i; @@ -1396,7 +1395,7 @@ hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) return; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) { if (arg->count < arg->skip) { arg->count++; @@ -1522,11 +1521,10 @@ hfsc_reset_qdisc(struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl; - struct hlist_node *n; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) hfsc_reset_class(cl); } q->eligible = RB_ROOT; @@ -1539,16 +1537,16 @@ static void hfsc_destroy_qdisc(struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); - struct hlist_node *n, *next; + struct hlist_node *next; struct hfsc_class *cl; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) tcf_destroy_chain(&cl->filter_list); } for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], cl_common.hnode) hfsc_destroy_class(sch, cl); } @@ -1563,17 +1561,17 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) unsigned char *b = skb_tail_pointer(skb); struct tc_hfsc_qopt qopt; struct hfsc_class *cl; - struct hlist_node *n; unsigned int i; sch->qstats.backlog = 0; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) sch->qstats.backlog += cl->qdisc->qstats.backlog; } qopt.defcls = q->defcls; - NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) + goto nla_put_failure; return skb->len; nla_put_failure: @@ -1607,7 +1605,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (cl->qdisc->q.qlen == 1) set_active(cl, qdisc_pkt_len(skb)); - bstats_update(&cl->bstats, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; @@ -1655,6 +1652,7 @@ hfsc_dequeue(struct Qdisc *sch) return NULL; } + bstats_update(&cl->bstats, skb); update_vf(cl, qdisc_pkt_len(skb), cur_time); if (realtime) cl->cl_cumul += qdisc_pkt_len(skb); diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c new file mode 100644 index 00000000000..d85b6812a7d --- /dev/null +++ b/net/sched/sch_hhf.c @@ -0,0 +1,740 @@ +/* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF) + * + * Copyright (C) 2013 Terry Lam <vtlam@google.com> + * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com> + */ + +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <net/flow_keys.h> +#include <net/pkt_sched.h> +#include <net/sock.h> + +/* Heavy-Hitter Filter (HHF) + * + * Principles : + * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter + * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified + * as heavy-hitter, it is immediately switched to the heavy-hitter bucket. + * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler, + * in which the heavy-hitter bucket is served with less weight. + * In other words, non-heavy-hitters (e.g., short bursts of critical traffic) + * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have + * higher share of bandwidth. + * + * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the + * following paper: + * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and + * Accounting", in ACM SIGCOMM, 2002. + * + * Conceptually, a multi-stage filter comprises k independent hash functions + * and k counter arrays. Packets are indexed into k counter arrays by k hash + * functions, respectively. The counters are then increased by the packet sizes. + * Therefore, + * - For a heavy-hitter flow: *all* of its k array counters must be large. + * - For a non-heavy-hitter flow: some of its k array counters can be large + * due to hash collision with other small flows; however, with high + * probability, not *all* k counters are large. + * + * By the design of the multi-stage filter algorithm, the false negative rate + * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is + * susceptible to false positives (non-heavy-hitters mistakenly classified as + * heavy-hitters). + * Therefore, we also implement the following optimizations to reduce false + * positives by avoiding unnecessary increment of the counter values: + * - Optimization O1: once a heavy-hitter is identified, its bytes are not + * accounted in the array counters. This technique is called "shielding" + * in Section 3.3.1 of [EV02]. + * - Optimization O2: conservative update of counters + * (Section 3.3.2 of [EV02]), + * New counter value = max {old counter value, + * smallest counter value + packet bytes} + * + * Finally, we refresh the counters periodically since otherwise the counter + * values will keep accumulating. + * + * Once a flow is classified as heavy-hitter, we also save its per-flow state + * in an exact-matching flow table so that its subsequent packets can be + * dispatched to the heavy-hitter bucket accordingly. + * + * + * At a high level, this qdisc works as follows: + * Given a packet p: + * - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching + * heavy-hitter flow table, denoted table T, then send p to the heavy-hitter + * bucket. + * - Otherwise, forward p to the multi-stage filter, denoted filter F + * + If F decides that p belongs to a non-heavy-hitter flow, then send p + * to the non-heavy-hitter bucket. + * + Otherwise, if F decides that p belongs to a new heavy-hitter flow, + * then set up a new flow entry for the flow-id of p in the table T and + * send p to the heavy-hitter bucket. + * + * In this implementation: + * - T is a fixed-size hash-table with 1024 entries. Hash collision is + * resolved by linked-list chaining. + * - F has four counter arrays, each array containing 1024 32-bit counters. + * That means 4 * 1024 * 32 bits = 16KB of memory. + * - Since each array in F contains 1024 counters, 10 bits are sufficient to + * index into each array. + * Hence, instead of having four hash functions, we chop the 32-bit + * skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is + * computed as XOR sum of those three chunks. + * - We need to clear the counter arrays periodically; however, directly + * memsetting 16KB of memory can lead to cache eviction and unwanted delay. + * So by representing each counter by a valid bit, we only need to reset + * 4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory. + * - The Deficit Round Robin engine is taken from fq_codel implementation + * (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to + * fq_codel_flow in fq_codel implementation. + * + */ + +/* Non-configurable parameters */ +#define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */ +#define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */ +#define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */ +#define HHF_BIT_MASK_LEN 10 /* masking 10 bits */ +#define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */ + +#define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */ +enum wdrr_bucket_idx { + WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */ + WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */ +}; + +#define hhf_time_before(a, b) \ + (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0)) + +/* Heavy-hitter per-flow state */ +struct hh_flow_state { + u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */ + u32 hit_timestamp; /* last time heavy-hitter was seen */ + struct list_head flowchain; /* chaining under hash collision */ +}; + +/* Weighted Deficit Round Robin (WDRR) scheduler */ +struct wdrr_bucket { + struct sk_buff *head; + struct sk_buff *tail; + struct list_head bucketchain; + int deficit; +}; + +struct hhf_sched_data { + struct wdrr_bucket buckets[WDRR_BUCKET_CNT]; + u32 perturbation; /* hash perturbation */ + u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ + u32 drop_overlimit; /* number of times max qdisc packet + * limit was hit + */ + struct list_head *hh_flows; /* table T (currently active HHs) */ + u32 hh_flows_limit; /* max active HH allocs */ + u32 hh_flows_overlimit; /* num of disallowed HH allocs */ + u32 hh_flows_total_cnt; /* total admitted HHs */ + u32 hh_flows_current_cnt; /* total current HHs */ + u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */ + u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays + * was reset + */ + unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits + * of hhf_arrays + */ + /* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */ + struct list_head new_buckets; /* list of new buckets */ + struct list_head old_buckets; /* list of old buckets */ + + /* Configurable HHF parameters */ + u32 hhf_reset_timeout; /* interval to reset counter + * arrays in filter F + * (default 40ms) + */ + u32 hhf_admit_bytes; /* counter thresh to classify as + * HH (default 128KB). + * With these default values, + * 128KB / 40ms = 25 Mbps + * i.e., we expect to capture HHs + * sending > 25 Mbps. + */ + u32 hhf_evict_timeout; /* aging threshold to evict idle + * HHs out of table T. This should + * be large enough to avoid + * reordering during HH eviction. + * (default 1s) + */ + u32 hhf_non_hh_weight; /* WDRR weight for non-HHs + * (default 2, + * i.e., non-HH : HH = 2 : 1) + */ +}; + +static u32 hhf_time_stamp(void) +{ + return jiffies; +} + +static unsigned int skb_hash(const struct hhf_sched_data *q, + const struct sk_buff *skb) +{ + struct flow_keys keys; + unsigned int hash; + + if (skb->sk && skb->sk->sk_hash) + return skb->sk->sk_hash; + + skb_flow_dissect(skb, &keys); + hash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src ^ keys.ip_proto, + (__force u32)keys.ports, q->perturbation); + return hash; +} + +/* Looks up a heavy-hitter flow in a chaining list of table T. */ +static struct hh_flow_state *seek_list(const u32 hash, + struct list_head *head, + struct hhf_sched_data *q) +{ + struct hh_flow_state *flow, *next; + u32 now = hhf_time_stamp(); + + if (list_empty(head)) + return NULL; + + list_for_each_entry_safe(flow, next, head, flowchain) { + u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; + + if (hhf_time_before(prev, now)) { + /* Delete expired heavy-hitters, but preserve one entry + * to avoid kzalloc() when next time this slot is hit. + */ + if (list_is_last(&flow->flowchain, head)) + return NULL; + list_del(&flow->flowchain); + kfree(flow); + q->hh_flows_current_cnt--; + } else if (flow->hash_id == hash) { + return flow; + } + } + return NULL; +} + +/* Returns a flow state entry for a new heavy-hitter. Either reuses an expired + * entry or dynamically alloc a new entry. + */ +static struct hh_flow_state *alloc_new_hh(struct list_head *head, + struct hhf_sched_data *q) +{ + struct hh_flow_state *flow; + u32 now = hhf_time_stamp(); + + if (!list_empty(head)) { + /* Find an expired heavy-hitter flow entry. */ + list_for_each_entry(flow, head, flowchain) { + u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; + + if (hhf_time_before(prev, now)) + return flow; + } + } + + if (q->hh_flows_current_cnt >= q->hh_flows_limit) { + q->hh_flows_overlimit++; + return NULL; + } + /* Create new entry. */ + flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC); + if (!flow) + return NULL; + + q->hh_flows_current_cnt++; + INIT_LIST_HEAD(&flow->flowchain); + list_add_tail(&flow->flowchain, head); + + return flow; +} + +/* Assigns packets to WDRR buckets. Implements a multi-stage filter to + * classify heavy-hitters. + */ +static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + u32 tmp_hash, hash; + u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos; + struct hh_flow_state *flow; + u32 pkt_len, min_hhf_val; + int i; + u32 prev; + u32 now = hhf_time_stamp(); + + /* Reset the HHF counter arrays if this is the right time. */ + prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout; + if (hhf_time_before(prev, now)) { + for (i = 0; i < HHF_ARRAYS_CNT; i++) + bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN); + q->hhf_arrays_reset_timestamp = now; + } + + /* Get hashed flow-id of the skb. */ + hash = skb_hash(q, skb); + + /* Check if this packet belongs to an already established HH flow. */ + flow_pos = hash & HHF_BIT_MASK; + flow = seek_list(hash, &q->hh_flows[flow_pos], q); + if (flow) { /* found its HH flow */ + flow->hit_timestamp = now; + return WDRR_BUCKET_FOR_HH; + } + + /* Now pass the packet through the multi-stage filter. */ + tmp_hash = hash; + xorsum = 0; + for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) { + /* Split the skb_hash into three 10-bit chunks. */ + filter_pos[i] = tmp_hash & HHF_BIT_MASK; + xorsum ^= filter_pos[i]; + tmp_hash >>= HHF_BIT_MASK_LEN; + } + /* The last chunk is computed as XOR sum of other chunks. */ + filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash; + + pkt_len = qdisc_pkt_len(skb); + min_hhf_val = ~0U; + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + u32 val; + + if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) { + q->hhf_arrays[i][filter_pos[i]] = 0; + __set_bit(filter_pos[i], q->hhf_valid_bits[i]); + } + + val = q->hhf_arrays[i][filter_pos[i]] + pkt_len; + if (min_hhf_val > val) + min_hhf_val = val; + } + + /* Found a new HH iff all counter values > HH admit threshold. */ + if (min_hhf_val > q->hhf_admit_bytes) { + /* Just captured a new heavy-hitter. */ + flow = alloc_new_hh(&q->hh_flows[flow_pos], q); + if (!flow) /* memory alloc problem */ + return WDRR_BUCKET_FOR_NON_HH; + flow->hash_id = hash; + flow->hit_timestamp = now; + q->hh_flows_total_cnt++; + + /* By returning without updating counters in q->hhf_arrays, + * we implicitly implement "shielding" (see Optimization O1). + */ + return WDRR_BUCKET_FOR_HH; + } + + /* Conservative update of HHF arrays (see Optimization O2). */ + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val) + q->hhf_arrays[i][filter_pos[i]] = min_hhf_val; + } + return WDRR_BUCKET_FOR_NON_HH; +} + +/* Removes one skb from head of bucket. */ +static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket) +{ + struct sk_buff *skb = bucket->head; + + bucket->head = skb->next; + skb->next = NULL; + return skb; +} + +/* Tail-adds skb to bucket. */ +static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) +{ + if (bucket->head == NULL) + bucket->head = skb; + else + bucket->tail->next = skb; + bucket->tail = skb; + skb->next = NULL; +} + +static unsigned int hhf_drop(struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct wdrr_bucket *bucket; + + /* Always try to drop from heavy-hitters first. */ + bucket = &q->buckets[WDRR_BUCKET_FOR_HH]; + if (!bucket->head) + bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH]; + + if (bucket->head) { + struct sk_buff *skb = dequeue_head(bucket); + + sch->q.qlen--; + sch->qstats.drops++; + sch->qstats.backlog -= qdisc_pkt_len(skb); + kfree_skb(skb); + } + + /* Return id of the bucket from which the packet was dropped. */ + return bucket - q->buckets; +} + +static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + enum wdrr_bucket_idx idx; + struct wdrr_bucket *bucket; + + idx = hhf_classify(skb, sch); + + bucket = &q->buckets[idx]; + bucket_add(bucket, skb); + sch->qstats.backlog += qdisc_pkt_len(skb); + + if (list_empty(&bucket->bucketchain)) { + unsigned int weight; + + /* The logic of new_buckets vs. old_buckets is the same as + * new_flows vs. old_flows in the implementation of fq_codel, + * i.e., short bursts of non-HHs should have strict priority. + */ + if (idx == WDRR_BUCKET_FOR_HH) { + /* Always move heavy-hitters to old bucket. */ + weight = 1; + list_add_tail(&bucket->bucketchain, &q->old_buckets); + } else { + weight = q->hhf_non_hh_weight; + list_add_tail(&bucket->bucketchain, &q->new_buckets); + } + bucket->deficit = weight * q->quantum; + } + if (++sch->q.qlen <= sch->limit) + return NET_XMIT_SUCCESS; + + q->drop_overlimit++; + /* Return Congestion Notification only if we dropped a packet from this + * bucket. + */ + if (hhf_drop(sch) == idx) + return NET_XMIT_CN; + + /* As we dropped a packet, better let upper stack know this. */ + qdisc_tree_decrease_qlen(sch, 1); + return NET_XMIT_SUCCESS; +} + +static struct sk_buff *hhf_dequeue(struct Qdisc *sch) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = NULL; + struct wdrr_bucket *bucket; + struct list_head *head; + +begin: + head = &q->new_buckets; + if (list_empty(head)) { + head = &q->old_buckets; + if (list_empty(head)) + return NULL; + } + bucket = list_first_entry(head, struct wdrr_bucket, bucketchain); + + if (bucket->deficit <= 0) { + int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ? + 1 : q->hhf_non_hh_weight; + + bucket->deficit += weight * q->quantum; + list_move_tail(&bucket->bucketchain, &q->old_buckets); + goto begin; + } + + if (bucket->head) { + skb = dequeue_head(bucket); + sch->q.qlen--; + sch->qstats.backlog -= qdisc_pkt_len(skb); + } + + if (!skb) { + /* Force a pass through old_buckets to prevent starvation. */ + if ((head == &q->new_buckets) && !list_empty(&q->old_buckets)) + list_move_tail(&bucket->bucketchain, &q->old_buckets); + else + list_del_init(&bucket->bucketchain); + goto begin; + } + qdisc_bstats_update(sch, skb); + bucket->deficit -= qdisc_pkt_len(skb); + + return skb; +} + +static void hhf_reset(struct Qdisc *sch) +{ + struct sk_buff *skb; + + while ((skb = hhf_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static void *hhf_zalloc(size_t sz) +{ + void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); + + if (!ptr) + ptr = vzalloc(sz); + + return ptr; +} + +static void hhf_free(void *addr) +{ + kvfree(addr); +} + +static void hhf_destroy(struct Qdisc *sch) +{ + int i; + struct hhf_sched_data *q = qdisc_priv(sch); + + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + hhf_free(q->hhf_arrays[i]); + hhf_free(q->hhf_valid_bits[i]); + } + + for (i = 0; i < HH_FLOWS_CNT; i++) { + struct hh_flow_state *flow, *next; + struct list_head *head = &q->hh_flows[i]; + + if (list_empty(head)) + continue; + list_for_each_entry_safe(flow, next, head, flowchain) { + list_del(&flow->flowchain); + kfree(flow); + } + } + hhf_free(q->hh_flows); +} + +static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { + [TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 }, + [TCA_HHF_QUANTUM] = { .type = NLA_U32 }, + [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 }, + [TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 }, + [TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 }, + [TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 }, + [TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 }, +}; + +static int hhf_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_HHF_MAX + 1]; + unsigned int qlen; + int err; + u64 non_hh_quantum; + u32 new_quantum = q->quantum; + u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy); + if (err < 0) + return err; + + if (tb[TCA_HHF_QUANTUM]) + new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]); + + if (tb[TCA_HHF_NON_HH_WEIGHT]) + new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]); + + non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight; + if (non_hh_quantum > INT_MAX) + return -EINVAL; + + sch_tree_lock(sch); + + if (tb[TCA_HHF_BACKLOG_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]); + + q->quantum = new_quantum; + q->hhf_non_hh_weight = new_hhf_non_hh_weight; + + if (tb[TCA_HHF_HH_FLOWS_LIMIT]) + q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]); + + if (tb[TCA_HHF_RESET_TIMEOUT]) { + u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]); + + q->hhf_reset_timeout = usecs_to_jiffies(us); + } + + if (tb[TCA_HHF_ADMIT_BYTES]) + q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]); + + if (tb[TCA_HHF_EVICT_TIMEOUT]) { + u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]); + + q->hhf_evict_timeout = usecs_to_jiffies(us); + } + + qlen = sch->q.qlen; + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = hhf_dequeue(sch); + + kfree_skb(skb); + } + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + + sch_tree_unlock(sch); + return 0; +} + +static int hhf_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + int i; + + sch->limit = 1000; + q->quantum = psched_mtu(qdisc_dev(sch)); + q->perturbation = prandom_u32(); + INIT_LIST_HEAD(&q->new_buckets); + INIT_LIST_HEAD(&q->old_buckets); + + /* Configurable HHF parameters */ + q->hhf_reset_timeout = HZ / 25; /* 40 ms */ + q->hhf_admit_bytes = 131072; /* 128 KB */ + q->hhf_evict_timeout = HZ; /* 1 sec */ + q->hhf_non_hh_weight = 2; + + if (opt) { + int err = hhf_change(sch, opt); + + if (err) + return err; + } + + if (!q->hh_flows) { + /* Initialize heavy-hitter flow table. */ + q->hh_flows = hhf_zalloc(HH_FLOWS_CNT * + sizeof(struct list_head)); + if (!q->hh_flows) + return -ENOMEM; + for (i = 0; i < HH_FLOWS_CNT; i++) + INIT_LIST_HEAD(&q->hh_flows[i]); + + /* Cap max active HHs at twice len of hh_flows table. */ + q->hh_flows_limit = 2 * HH_FLOWS_CNT; + q->hh_flows_overlimit = 0; + q->hh_flows_total_cnt = 0; + q->hh_flows_current_cnt = 0; + + /* Initialize heavy-hitter filter arrays. */ + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN * + sizeof(u32)); + if (!q->hhf_arrays[i]) { + hhf_destroy(sch); + return -ENOMEM; + } + } + q->hhf_arrays_reset_timestamp = hhf_time_stamp(); + + /* Initialize valid bits of heavy-hitter filter arrays. */ + for (i = 0; i < HHF_ARRAYS_CNT; i++) { + q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN / + BITS_PER_BYTE); + if (!q->hhf_valid_bits[i]) { + hhf_destroy(sch); + return -ENOMEM; + } + } + + /* Initialize Weighted DRR buckets. */ + for (i = 0; i < WDRR_BUCKET_CNT; i++) { + struct wdrr_bucket *bucket = q->buckets + i; + + INIT_LIST_HEAD(&bucket->bucketchain); + } + } + + return 0; +} + +static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) || + nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) || + nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT, + jiffies_to_usecs(q->hhf_reset_timeout)) || + nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) || + nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT, + jiffies_to_usecs(q->hhf_evict_timeout)) || + nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + return -1; +} + +static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct hhf_sched_data *q = qdisc_priv(sch); + struct tc_hhf_xstats st = { + .drop_overlimit = q->drop_overlimit, + .hh_overlimit = q->hh_flows_overlimit, + .hh_tot_count = q->hh_flows_total_cnt, + .hh_cur_count = q->hh_flows_current_cnt, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { + .id = "hhf", + .priv_size = sizeof(struct hhf_sched_data), + + .enqueue = hhf_enqueue, + .dequeue = hhf_dequeue, + .peek = qdisc_peek_dequeued, + .drop = hhf_drop, + .init = hhf_init, + .reset = hhf_reset, + .destroy = hhf_destroy, + .change = hhf_change, + .dump = hhf_dump, + .dump_stats = hhf_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init hhf_module_init(void) +{ + return register_qdisc(&hhf_qdisc_ops); +} + +static void __exit hhf_module_exit(void) +{ + unregister_qdisc(&hhf_qdisc_ops); +} + +module_init(hhf_module_init) +module_exit(hhf_module_exit) +MODULE_AUTHOR("Terry Lam"); +MODULE_AUTHOR("Nandita Dukkipati"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 29b942ce9e8..9f949abcace 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -38,6 +38,7 @@ #include <linux/workqueue.h> #include <linux/slab.h> #include <net/netlink.h> +#include <net/sch_generic.h> #include <net/pkt_sched.h> /* HTB algorithm. @@ -64,6 +65,10 @@ static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis f module_param (htb_hysteresis, int, 0640); MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate"); +static int htb_rate_est = 0; /* htb classes have a default rate estimator */ +module_param(htb_rate_est, int, 0640); +MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes"); + /* used internaly to keep status of single class */ enum htb_cmode { HTB_CANT_SEND, /* class can't send and can't borrow */ @@ -71,95 +76,105 @@ enum htb_cmode { HTB_CAN_SEND /* class can send */ }; -/* interior & leaf nodes; props specific to leaves are marked L: */ +struct htb_prio { + union { + struct rb_root row; + struct rb_root feed; + }; + struct rb_node *ptr; + /* When class changes from state 1->2 and disconnects from + * parent's feed then we lost ptr value and start from the + * first child again. Here we store classid of the + * last valid ptr (used when ptr is NULL). + */ + u32 last_ptr_id; +}; + +/* interior & leaf nodes; props specific to leaves are marked L: + * To reduce false sharing, place mostly read fields at beginning, + * and mostly written ones at the end. + */ struct htb_class { struct Qdisc_class_common common; - /* general class parameters */ - struct gnet_stats_basic_packed bstats; - struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; - struct tc_htb_xstats xstats; /* our special stats */ - int refcnt; /* usage count of this class */ + struct psched_ratecfg rate; + struct psched_ratecfg ceil; + s64 buffer, cbuffer;/* token bucket depth/rate */ + s64 mbuffer; /* max wait time */ + u32 prio; /* these two are used only by leaves... */ + int quantum; /* but stored for parent-to-leaf return */ - /* topology */ - int level; /* our level (see above) */ - unsigned int children; - struct htb_class *parent; /* parent class */ + struct tcf_proto *filter_list; /* class attached filters */ + int filter_cnt; + int refcnt; /* usage count of this class */ - int prio; /* these two are used only by leaves... */ - int quantum; /* but stored for parent-to-leaf return */ + int level; /* our level (see above) */ + unsigned int children; + struct htb_class *parent; /* parent class */ + + struct gnet_stats_rate_est64 rate_est; + + /* + * Written often fields + */ + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; + struct tc_htb_xstats xstats; /* our special stats */ + + /* token bucket parameters */ + s64 tokens, ctokens;/* current number of tokens */ + s64 t_c; /* checkpoint time */ union { struct htb_class_leaf { - struct Qdisc *q; - int deficit[TC_HTB_MAXDEPTH]; struct list_head drop_list; + int deficit[TC_HTB_MAXDEPTH]; + struct Qdisc *q; } leaf; struct htb_class_inner { - struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */ - struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */ - /* When class changes from state 1->2 and disconnects from - * parent's feed then we lost ptr value and start from the - * first child again. Here we store classid of the - * last valid ptr (used when ptr is NULL). - */ - u32 last_ptr_id[TC_HTB_NUMPRIO]; + struct htb_prio clprio[TC_HTB_NUMPRIO]; } inner; } un; - struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ - struct rb_node pq_node; /* node for event queue */ - psched_time_t pq_key; + s64 pq_key; - int prio_activity; /* for which prios are we active */ - enum htb_cmode cmode; /* current mode of the class */ - - /* class attached filters */ - struct tcf_proto *filter_list; - int filter_cnt; + int prio_activity; /* for which prios are we active */ + enum htb_cmode cmode; /* current mode of the class */ + struct rb_node pq_node; /* node for event queue */ + struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ +}; - /* token bucket parameters */ - struct qdisc_rate_table *rate; /* rate table of the class itself */ - struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */ - long buffer, cbuffer; /* token bucket depth/rate */ - psched_tdiff_t mbuffer; /* max wait time */ - long tokens, ctokens; /* current number of tokens */ - psched_time_t t_c; /* checkpoint time */ +struct htb_level { + struct rb_root wait_pq; + struct htb_prio hprio[TC_HTB_NUMPRIO]; }; struct htb_sched { struct Qdisc_class_hash clhash; - struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */ - - /* self list - roots of self generating tree */ - struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; - int row_mask[TC_HTB_MAXDEPTH]; - struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; - u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; + int defcls; /* class where unclassified flows go to */ + int rate2quantum; /* quant = rate / rate2quantum */ - /* self wait list - roots of wait PQs per row */ - struct rb_root wait_pq[TC_HTB_MAXDEPTH]; + /* filters for qdisc itself */ + struct tcf_proto *filter_list; - /* time of nearest event per level (row) */ - psched_time_t near_ev_cache[TC_HTB_MAXDEPTH]; +#define HTB_WARN_TOOMANYEVENTS 0x1 + unsigned int warned; /* only one warning */ + int direct_qlen; + struct work_struct work; - int defcls; /* class where unclassified flows go to */ + /* non shaped skbs; let them go directly thru */ + struct sk_buff_head direct_queue; + long direct_pkts; - /* filters for qdisc itself */ - struct tcf_proto *filter_list; + struct qdisc_watchdog watchdog; - int rate2quantum; /* quant = rate / rate2quantum */ - psched_time_t now; /* cached dequeue time */ - struct qdisc_watchdog watchdog; + s64 now; /* cached dequeue time */ + struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */ - /* non shaped skbs; let them go directly thru */ - struct sk_buff_head direct_queue; - int direct_qlen; /* max qlen of above */ + /* time of nearest event per level (row) */ + s64 near_ev_cache[TC_HTB_MAXDEPTH]; - long direct_pkts; + int row_mask[TC_HTB_MAXDEPTH]; -#define HTB_WARN_TOOMANYEVENTS 0x1 - unsigned int warned; /* only one warning */ - struct work_struct work; + struct htb_level hlevel[TC_HTB_MAXDEPTH]; }; /* find class in global hash table using given handle */ @@ -204,11 +219,16 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, if (skb->priority == sch->handle) return HTB_DIRECT; /* X:0 (direct flow) selected */ cl = htb_find(skb->priority, sch); - if (cl && cl->level == 0) - return cl; + if (cl) { + if (cl->level == 0) + return cl; + /* Start with inner filter chain if a non-leaf class is selected */ + tcf = cl->filter_list; + } else { + tcf = q->filter_list; + } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - tcf = q->filter_list; while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -273,9 +293,9 @@ static void htb_add_to_id_tree(struct rb_root *root, * already in the queue. */ static void htb_add_to_wait_tree(struct htb_sched *q, - struct htb_class *cl, long delay) + struct htb_class *cl, s64 delay) { - struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL; + struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL; cl->pq_key = q->now + delay; if (cl->pq_key == q->now) @@ -295,7 +315,7 @@ static void htb_add_to_wait_tree(struct htb_sched *q, p = &parent->rb_left; } rb_link_node(&cl->pq_node, parent, p); - rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]); + rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq); } /** @@ -322,7 +342,7 @@ static inline void htb_add_class_to_row(struct htb_sched *q, while (mask) { int prio = ffz(~mask); mask &= ~(1 << prio); - htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio); + htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio); } } @@ -348,16 +368,18 @@ static inline void htb_remove_class_from_row(struct htb_sched *q, struct htb_class *cl, int mask) { int m = 0; + struct htb_level *hlevel = &q->hlevel[cl->level]; while (mask) { int prio = ffz(~mask); + struct htb_prio *hprio = &hlevel->hprio[prio]; mask &= ~(1 << prio); - if (q->ptr[cl->level][prio] == cl->node + prio) - htb_next_rb_node(q->ptr[cl->level] + prio); + if (hprio->ptr == cl->node + prio) + htb_next_rb_node(&hprio->ptr); - htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio); - if (!q->row[cl->level][prio].rb_node) + htb_safe_rb_erase(cl->node + prio, &hprio->row); + if (!hprio->row.rb_node) m |= 1 << prio; } q->row_mask[cl->level] &= ~m; @@ -381,13 +403,13 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) int prio = ffz(~m); m &= ~(1 << prio); - if (p->un.inner.feed[prio].rb_node) + if (p->un.inner.clprio[prio].feed.rb_node) /* parent already has its feed in use so that * reset bit in mask as parent is already ok */ mask &= ~(1 << prio); - htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio); + htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio); } p->prio_activity |= mask; cl = p; @@ -417,18 +439,19 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) int prio = ffz(~m); m &= ~(1 << prio); - if (p->un.inner.ptr[prio] == cl->node + prio) { + if (p->un.inner.clprio[prio].ptr == cl->node + prio) { /* we are removing child which is pointed to from * parent feed - forget the pointer but remember * classid */ - p->un.inner.last_ptr_id[prio] = cl->common.classid; - p->un.inner.ptr[prio] = NULL; + p->un.inner.clprio[prio].last_ptr_id = cl->common.classid; + p->un.inner.clprio[prio].ptr = NULL; } - htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio); + htb_safe_rb_erase(cl->node + prio, + &p->un.inner.clprio[prio].feed); - if (!p->un.inner.feed[prio].rb_node) + if (!p->un.inner.clprio[prio].feed.rb_node) mask |= 1 << prio; } @@ -441,14 +464,14 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) htb_remove_class_from_row(q, cl, mask); } -static inline long htb_lowater(const struct htb_class *cl) +static inline s64 htb_lowater(const struct htb_class *cl) { if (htb_hysteresis) return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0; else return 0; } -static inline long htb_hiwater(const struct htb_class *cl) +static inline s64 htb_hiwater(const struct htb_class *cl) { if (htb_hysteresis) return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0; @@ -469,9 +492,9 @@ static inline long htb_hiwater(const struct htb_class *cl) * mode transitions per time unit. The speed gain is about 1/6. */ static inline enum htb_cmode -htb_class_mode(struct htb_class *cl, long *diff) +htb_class_mode(struct htb_class *cl, s64 *diff) { - long toks; + s64 toks; if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) { *diff = -toks; @@ -495,7 +518,7 @@ htb_class_mode(struct htb_class *cl, long *diff) * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree). */ static void -htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff) +htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) { enum htb_cmode new_mode = htb_class_mode(cl, diff); @@ -558,9 +581,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) __skb_queue_tail(&q->direct_queue, skb); q->direct_pkts++; } else { - kfree_skb(skb); - sch->qstats.drops++; - return NET_XMIT_DROP; + return qdisc_drop(skb, sch); } #ifdef CONFIG_NET_CLS_ACT } else if (!cl) { @@ -576,7 +597,6 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) } return ret; } else { - bstats_update(&cl->bstats, skb); htb_activate(q, cl); } @@ -584,26 +604,26 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } -static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, long diff) +static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff) { - long toks = diff + cl->tokens; + s64 toks = diff + cl->tokens; if (toks > cl->buffer) toks = cl->buffer; - toks -= (long) qdisc_l2t(cl->rate, bytes); + toks -= (s64) psched_l2t_ns(&cl->rate, bytes); if (toks <= -cl->mbuffer) toks = 1 - cl->mbuffer; cl->tokens = toks; } -static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, long diff) +static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff) { - long toks = diff + cl->ctokens; + s64 toks = diff + cl->ctokens; if (toks > cl->cbuffer) toks = cl->cbuffer; - toks -= (long) qdisc_l2t(cl->ceil, bytes); + toks -= (s64) psched_l2t_ns(&cl->ceil, bytes); if (toks <= -cl->mbuffer) toks = 1 - cl->mbuffer; @@ -626,10 +646,10 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, { int bytes = qdisc_pkt_len(skb); enum htb_cmode old_mode; - long diff; + s64 diff; while (cl) { - diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer); + diff = min_t(s64, q->now - cl->t_c, cl->mbuffer); if (cl->level >= level) { if (cl->level == level) cl->xstats.lends++; @@ -646,7 +666,7 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, htb_change_class_mode(q, cl, &diff); if (old_mode != cl->cmode) { if (old_mode != HTB_CAN_SEND) - htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level); + htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq); if (cl->cmode != HTB_CAN_SEND) htb_add_to_wait_tree(q, cl, diff); } @@ -666,18 +686,20 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, * next pending event (0 for no event in pq, q->now for too many events). * Note: Applied are events whose have cl->pq_key <= q->now. */ -static psched_time_t htb_do_events(struct htb_sched *q, int level, - unsigned long start) +static s64 htb_do_events(struct htb_sched *q, const int level, + unsigned long start) { /* don't run for longer than 2 jiffies; 2 is used instead of * 1 to simplify things when jiffy is going to be incremented * too soon */ unsigned long stop_at = start + 2; + struct rb_root *wait_pq = &q->hlevel[level].wait_pq; + while (time_before(jiffies, stop_at)) { struct htb_class *cl; - long diff; - struct rb_node *p = rb_first(&q->wait_pq[level]); + s64 diff; + struct rb_node *p = rb_first(wait_pq); if (!p) return 0; @@ -686,8 +708,8 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level, if (cl->pq_key > q->now) return cl->pq_key; - htb_safe_rb_erase(p, q->wait_pq + level); - diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer); + htb_safe_rb_erase(p, wait_pq); + diff = min_t(s64, q->now - cl->t_c, cl->mbuffer); htb_change_class_mode(q, cl, &diff); if (cl->cmode != HTB_CAN_SEND) htb_add_to_wait_tree(q, cl, diff); @@ -695,7 +717,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level, /* too much load - let's continue after a break for scheduling */ if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) { - pr_warning("htb: too many events!\n"); + pr_warn("htb: too many events!\n"); q->warned |= HTB_WARN_TOOMANYEVENTS; } @@ -730,8 +752,7 @@ static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n, * * Find leaf where current feed pointers points to. */ -static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio, - struct rb_node **pptr, u32 * pid) +static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio) { int i; struct { @@ -740,10 +761,10 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio, u32 *pid; } stk[TC_HTB_MAXDEPTH], *sp = stk; - BUG_ON(!tree->rb_node); - sp->root = tree->rb_node; - sp->pptr = pptr; - sp->pid = pid; + BUG_ON(!hprio->row.rb_node); + sp->root = hprio->row.rb_node; + sp->pptr = &hprio->ptr; + sp->pid = &hprio->last_ptr_id; for (i = 0; i < 65535; i++) { if (!*sp->pptr && *sp->pid) { @@ -770,12 +791,15 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio, } } else { struct htb_class *cl; + struct htb_prio *clp; + cl = rb_entry(*sp->pptr, struct htb_class, node[prio]); if (!cl->level) return cl; - (++sp)->root = cl->un.inner.feed[prio].rb_node; - sp->pptr = cl->un.inner.ptr + prio; - sp->pid = cl->un.inner.last_ptr_id + prio; + clp = &cl->un.inner.clprio[prio]; + (++sp)->root = clp->feed.rb_node; + sp->pptr = &clp->ptr; + sp->pid = &clp->last_ptr_id; } } WARN_ON(1); @@ -785,15 +809,16 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio, /* dequeues packet at given priority and level; call only if * you are sure that there is active class at prio/level */ -static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio, - int level) +static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio, + const int level) { struct sk_buff *skb = NULL; struct htb_class *cl, *start; + struct htb_level *hlevel = &q->hlevel[level]; + struct htb_prio *hprio = &hlevel->hprio[prio]; + /* look initial class up in the row */ - start = cl = htb_lookup_leaf(q->row[level] + prio, prio, - q->ptr[level] + prio, - q->last_ptr_id[level] + prio); + start = cl = htb_lookup_leaf(hprio, prio); do { next: @@ -813,9 +838,7 @@ next: if ((q->row_mask[level] & (1 << prio)) == 0) return NULL; - next = htb_lookup_leaf(q->row[level] + prio, - prio, q->ptr[level] + prio, - q->last_ptr_id[level] + prio); + next = htb_lookup_leaf(hprio, prio); if (cl == start) /* fix start if we just deleted it */ start = next; @@ -828,20 +851,19 @@ next: break; qdisc_warn_nonwc("htb", cl->un.leaf.q); - htb_next_rb_node((level ? cl->parent->un.inner.ptr : q-> - ptr[0]) + prio); - cl = htb_lookup_leaf(q->row[level] + prio, prio, - q->ptr[level] + prio, - q->last_ptr_id[level] + prio); + htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr: + &q->hlevel[0].hprio[prio].ptr); + cl = htb_lookup_leaf(hprio, prio); } while (cl != start); if (likely(skb != NULL)) { + bstats_update(&cl->bstats, skb); cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb); if (cl->un.leaf.deficit[level] < 0) { cl->un.leaf.deficit[level] += cl->quantum; - htb_next_rb_node((level ? cl->parent->un.inner.ptr : q-> - ptr[0]) + prio); + htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr : + &q->hlevel[0].hprio[prio].ptr); } /* this used to be after charge_class but this constelation * gives us slightly better performance @@ -858,7 +880,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) struct sk_buff *skb; struct htb_sched *q = qdisc_priv(sch); int level; - psched_time_t next_event; + s64 next_event; unsigned long start_at; /* try to dequeue direct packets as high prio (!) to minimize cpu work */ @@ -873,23 +895,22 @@ ok: if (!sch->q.qlen) goto fin; - q->now = psched_get_time(); + q->now = ktime_to_ns(ktime_get()); start_at = jiffies; - next_event = q->now + 5 * PSCHED_TICKS_PER_SEC; + next_event = q->now + 5LLU * NSEC_PER_SEC; for (level = 0; level < TC_HTB_MAXDEPTH; level++) { /* common case optimization - skip event handler quickly */ int m; - psched_time_t event; + s64 event = q->near_ev_cache[level]; - if (q->now >= q->near_ev_cache[level]) { + if (q->now >= event) { event = htb_do_events(q, level, start_at); if (!event) - event = q->now + PSCHED_TICKS_PER_SEC; + event = q->now + NSEC_PER_SEC; q->near_ev_cache[level] = event; - } else - event = q->near_ev_cache[level]; + } if (next_event > event) next_event = event; @@ -905,10 +926,17 @@ ok: } } sch->qstats.overlimits++; - if (likely(next_event > q->now)) - qdisc_watchdog_schedule(&q->watchdog, next_event); - else + if (likely(next_event > q->now)) { + if (!test_bit(__QDISC_STATE_DEACTIVATED, + &qdisc_root_sleeping(q->watchdog.qdisc)->state)) { + ktime_t time = ns_to_ktime(next_event); + qdisc_throttled(q->watchdog.qdisc); + hrtimer_start(&q->watchdog.timer, time, + HRTIMER_MODE_ABS); + } + } else { schedule_work(&q->work); + } fin: return skb; } @@ -943,11 +971,10 @@ static void htb_reset(struct Qdisc *sch) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl; - struct hlist_node *n; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->level) memset(&cl->un.inner, 0, sizeof(cl->un.inner)); else { @@ -963,10 +990,8 @@ static void htb_reset(struct Qdisc *sch) qdisc_watchdog_cancel(&q->watchdog); __skb_queue_purge(&q->direct_queue); sch->q.qlen = 0; - memset(q->row, 0, sizeof(q->row)); + memset(q->hlevel, 0, sizeof(q->hlevel)); memset(q->row_mask, 0, sizeof(q->row_mask)); - memset(q->wait_pq, 0, sizeof(q->wait_pq)); - memset(q->ptr, 0, sizeof(q->ptr)); for (i = 0; i < TC_HTB_NUMPRIO; i++) INIT_LIST_HEAD(q->drops + i); } @@ -976,6 +1001,9 @@ static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = { [TCA_HTB_INIT] = { .len = sizeof(struct tc_htb_glob) }, [TCA_HTB_CTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, [TCA_HTB_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 }, + [TCA_HTB_RATE64] = { .type = NLA_U64 }, + [TCA_HTB_CEIL64] = { .type = NLA_U64 }, }; static void htb_work_func(struct work_struct *work) @@ -989,7 +1017,7 @@ static void htb_work_func(struct work_struct *work) static int htb_init(struct Qdisc *sch, struct nlattr *opt) { struct htb_sched *q = qdisc_priv(sch); - struct nlattr *tb[TCA_HTB_INIT + 1]; + struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_glob *gopt; int err; int i; @@ -997,20 +1025,16 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; - err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy); + err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy); if (err < 0) return err; - if (tb[TCA_HTB_INIT] == NULL) { - pr_err("HTB: hey probably you have bad tc tool ?\n"); + if (!tb[TCA_HTB_INIT]) return -EINVAL; - } + gopt = nla_data(tb[TCA_HTB_INIT]); - if (gopt->version != HTB_VER >> 16) { - pr_err("HTB: need tc/htb version %d (minor is %d), you have %d\n", - HTB_VER >> 16, HTB_VER & 0xffff, gopt->version); + if (gopt->version != HTB_VER >> 16) return -EINVAL; - } err = qdisc_class_hash_init(&q->clhash); if (err < 0) @@ -1022,10 +1046,13 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) INIT_WORK(&q->work, htb_work_func); skb_queue_head_init(&q->direct_queue); - q->direct_qlen = qdisc_dev(sch)->tx_queue_len; - if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ - q->direct_qlen = 2; - + if (tb[TCA_HTB_DIRECT_QLEN]) + q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]); + else { + q->direct_qlen = qdisc_dev(sch)->tx_queue_len; + if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ + q->direct_qlen = 2; + } if ((q->rate2quantum = gopt->rate2quantum) < 1) q->rate2quantum = 1; q->defcls = gopt->defcls; @@ -1035,12 +1062,13 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) { - spinlock_t *root_lock = qdisc_root_sleeping_lock(sch); struct htb_sched *q = qdisc_priv(sch); struct nlattr *nest; struct tc_htb_glob gopt; - spin_lock_bh(root_lock); + /* Its safe to not acquire qdisc lock. As we hold RTNL, + * no change can happen on the qdisc parameters. + */ gopt.direct_pkts = q->direct_pkts; gopt.version = HTB_VER; @@ -1051,14 +1079,13 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; - NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt); - nla_nest_end(skb, nest); + if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) || + nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen)) + goto nla_put_failure; - spin_unlock_bh(root_lock); - return skb->len; + return nla_nest_end(skb, nest); nla_put_failure: - spin_unlock_bh(root_lock); nla_nest_cancel(skb, nest); return -1; } @@ -1067,11 +1094,12 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { struct htb_class *cl = (struct htb_class *)arg; - spinlock_t *root_lock = qdisc_root_sleeping_lock(sch); struct nlattr *nest; struct tc_htb_opt opt; - spin_lock_bh(root_lock); + /* Its safe to not acquire qdisc lock. As we hold RTNL, + * no change can happen on the class parameters. + */ tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT; tcm->tcm_handle = cl->common.classid; if (!cl->level && cl->un.leaf.q) @@ -1083,21 +1111,25 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, memset(&opt, 0, sizeof(opt)); - opt.rate = cl->rate->rate; - opt.buffer = cl->buffer; - opt.ceil = cl->ceil->rate; - opt.cbuffer = cl->cbuffer; + psched_ratecfg_getrate(&opt.rate, &cl->rate); + opt.buffer = PSCHED_NS2TICKS(cl->buffer); + psched_ratecfg_getrate(&opt.ceil, &cl->ceil); + opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer); opt.quantum = cl->quantum; opt.prio = cl->prio; opt.level = cl->level; - NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt); + if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) && + nla_put_u64(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps)) + goto nla_put_failure; + if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) && + nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps)) + goto nla_put_failure; - nla_nest_end(skb, nest); - spin_unlock_bh(root_lock); - return skb->len; + return nla_nest_end(skb, nest); nla_put_failure: - spin_unlock_bh(root_lock); nla_nest_cancel(skb, nest); return -1; } @@ -1109,8 +1141,8 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) if (!cl->level && cl->un.leaf.q) cl->qstats.qlen = cl->un.leaf.q->q.qlen; - cl->xstats.tokens = cl->tokens; - cl->xstats.ctokens = cl->ctokens; + cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens); + cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens); if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || @@ -1184,7 +1216,8 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity); if (parent->cmode != HTB_CAN_SEND) - htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level); + htb_safe_rb_erase(&parent->pq_node, + &q->hlevel[parent->level].wait_pq); parent->level = 0; memset(&parent->un.inner, 0, sizeof(parent->un.inner)); @@ -1192,7 +1225,7 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, parent->un.leaf.q = new_q ? new_q : &noop_qdisc; parent->tokens = parent->buffer; parent->ctokens = parent->cbuffer; - parent->t_c = psched_get_time(); + parent->t_c = ktime_to_ns(ktime_get()); parent->cmode = HTB_CAN_SEND; } @@ -1203,9 +1236,6 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) qdisc_destroy(cl->un.leaf.q); } gen_kill_estimator(&cl->bstats, &cl->rate_est); - qdisc_put_rtab(cl->rate); - qdisc_put_rtab(cl->ceil); - tcf_destroy_chain(&cl->filter_list); kfree(cl); } @@ -1213,7 +1243,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) static void htb_destroy(struct Qdisc *sch) { struct htb_sched *q = qdisc_priv(sch); - struct hlist_node *n, *next; + struct hlist_node *next; struct htb_class *cl; unsigned int i; @@ -1227,11 +1257,11 @@ static void htb_destroy(struct Qdisc *sch) tcf_destroy_chain(&q->filter_list); for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) tcf_destroy_chain(&cl->filter_list); } for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) htb_destroy_class(sch, cl); } @@ -1247,9 +1277,10 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) struct Qdisc *new_q = NULL; int last_child = 0; - // TODO: why don't allow to delete subtree ? references ? does - // tc subsys quarantee us that in htb_destroy it holds no class - // refs so that we can remove children safely there ? + /* TODO: why don't allow to delete subtree ? references ? does + * tc subsys guarantee us that in htb_destroy it holds no class + * refs so that we can remove children safely there ? + */ if (cl->children || cl->filter_cnt) return -EBUSY; @@ -1276,7 +1307,8 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) htb_deactivate(q, cl); if (cl->cmode != HTB_CAN_SEND) - htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level); + htb_safe_rb_erase(&cl->pq_node, + &q->hlevel[cl->level].wait_pq); if (last_child) htb_parent_to_leaf(q, cl, new_q); @@ -1307,9 +1339,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)*arg, *parent; struct nlattr *opt = tca[TCA_OPTIONS]; - struct qdisc_rate_table *rtab = NULL, *ctab = NULL; - struct nlattr *tb[__TCA_HTB_MAX]; + struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_opt *hopt; + u64 rate64, ceil64; /* extract all subattrs from opt attr */ if (!opt) @@ -1326,12 +1358,16 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch); hopt = nla_data(tb[TCA_HTB_PARMS]); - - rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]); - ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]); - if (!rtab || !ctab) + if (!hopt->rate.rate || !hopt->ceil.rate) goto failure; + /* Keeping backward compatible with rate_table based iproute2 tc */ + if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB])); + + if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB])); + if (!cl) { /* new class */ struct Qdisc *new_q; int prio; @@ -1365,12 +1401,14 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!cl) goto failure; - err = gen_new_estimator(&cl->bstats, &cl->rate_est, - qdisc_root_sleeping_lock(sch), - tca[TCA_RATE] ? : &est.nla); - if (err) { - kfree(cl); - goto failure; + if (htb_rate_est || tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE] ? : &est.nla); + if (err) { + kfree(cl); + goto failure; + } } cl->refcnt = 1; @@ -1400,7 +1438,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* remove from evt list because of level change */ if (parent->cmode != HTB_CAN_SEND) { - htb_safe_rb_erase(&parent->pq_node, q->wait_pq); + htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq); parent->cmode = HTB_CAN_SEND; } parent->level = (parent->parent ? parent->parent->level @@ -1414,10 +1452,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->parent = parent; /* set class to be in HTB_CAN_SEND state */ - cl->tokens = hopt->buffer; - cl->ctokens = hopt->cbuffer; - cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC; /* 1min */ - cl->t_c = psched_get_time(); + cl->tokens = PSCHED_TICKS2NS(hopt->buffer); + cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer); + cl->mbuffer = 60ULL * NSEC_PER_SEC; /* 1min */ + cl->t_c = ktime_to_ns(ktime_get()); cl->cmode = HTB_CAN_SEND; /* attach to the hash list and parent's family */ @@ -1435,21 +1473,30 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, sch_tree_lock(sch); } + rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0; + + ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0; + + psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64); + psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64); + /* it used to be a nasty bug here, we have to check that node * is really leaf before changing cl->un.leaf ! */ if (!cl->level) { - cl->quantum = rtab->rate.rate / q->rate2quantum; + u64 quantum = cl->rate.rate_bytes_ps; + + do_div(quantum, q->rate2quantum); + cl->quantum = min_t(u64, quantum, INT_MAX); + if (!hopt->quantum && cl->quantum < 1000) { - pr_warning( - "HTB: quantum of class %X is small. Consider r2q change.\n", - cl->common.classid); + pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n", + cl->common.classid); cl->quantum = 1000; } if (!hopt->quantum && cl->quantum > 200000) { - pr_warning( - "HTB: quantum of class %X is big. Consider r2q change.\n", - cl->common.classid); + pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n", + cl->common.classid); cl->quantum = 200000; } if (hopt->quantum) @@ -1458,14 +1505,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->prio = TC_HTB_NUMPRIO - 1; } - cl->buffer = hopt->buffer; - cl->cbuffer = hopt->cbuffer; - if (cl->rate) - qdisc_put_rtab(cl->rate); - cl->rate = rtab; - if (cl->ceil) - qdisc_put_rtab(cl->ceil); - cl->ceil = ctab; + cl->buffer = PSCHED_TICKS2NS(hopt->buffer); + cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer); + sch_tree_unlock(sch); qdisc_class_hash_grow(sch, &q->clhash); @@ -1474,10 +1516,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, return 0; failure: - if (rtab) - qdisc_put_rtab(rtab); - if (ctab) - qdisc_put_rtab(ctab); return err; } @@ -1521,14 +1559,13 @@ static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl; - struct hlist_node *n; unsigned int i; if (arg->stop) return; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index bce1665239b..62871c14e1f 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -100,8 +100,7 @@ static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; - nla_nest_end(skb, nest); - return skb->len; + return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 0a4b2f9a009..a8b2864a696 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -57,12 +57,13 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { dev_queue = netdev_get_tx_queue(dev, ntx); - qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops, + qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(ntx + 1))); if (qdisc == NULL) goto err; priv->qdiscs[ntx] = qdisc; + qdisc->flags |= TCQ_F_ONETXQUEUE; } sch->flags |= TCQ_F_MQROOT; @@ -77,14 +78,19 @@ static void mq_attach(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); - struct Qdisc *qdisc; + struct Qdisc *qdisc, *old; unsigned int ntx; for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { qdisc = priv->qdiscs[ntx]; - qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc); - if (qdisc) - qdisc_destroy(qdisc); + old = dev_graft_qdisc(qdisc->dev_queue, qdisc); + if (old) + qdisc_destroy(old); +#ifdef CONFIG_NET_SCHED + if (ntx < dev->real_num_tx_queues) + qdisc_list_add(qdisc); +#endif + } kfree(priv->qdiscs); priv->qdiscs = NULL; @@ -150,7 +156,8 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, dev_deactivate(dev); *old = dev_graft_qdisc(dev_queue, new); - + if (new) + new->flags |= TCQ_F_ONETXQUEUE; if (dev->flags & IFF_UP) dev_activate(dev); return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 28de4309233..6749e2f540d 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -124,7 +124,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) for (i = 0; i < dev->num_tx_queues; i++) { dev_queue = netdev_get_tx_queue(dev, i); - qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops, + qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(i + 1))); if (qdisc == NULL) { @@ -132,6 +132,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) goto err; } priv->qdiscs[i] = qdisc; + qdisc->flags |= TCQ_F_ONETXQUEUE; } /* If the mqprio options indicate that hardware should own @@ -166,15 +167,17 @@ static void mqprio_attach(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); - struct Qdisc *qdisc; + struct Qdisc *qdisc, *old; unsigned int ntx; /* Attach underlying qdisc */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { qdisc = priv->qdiscs[ntx]; - qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc); - if (qdisc) - qdisc_destroy(qdisc); + old = dev_graft_qdisc(qdisc->dev_queue, qdisc); + if (old) + qdisc_destroy(old); + if (ntx < dev->real_num_tx_queues) + qdisc_list_add(qdisc); } kfree(priv->qdiscs); priv->qdiscs = NULL; @@ -205,6 +208,9 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, *old = dev_graft_qdisc(dev_queue, new); + if (new) + new->flags |= TCQ_F_ONETXQUEUE; + if (dev->flags & IFF_UP) dev_activate(dev); @@ -247,7 +253,8 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) opt.offset[i] = dev->tc_to_txq[i].offset; } - NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; nla_put_failure: diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 49131d7a744..afb050a735f 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -11,8 +11,7 @@ * more details. * * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. + * this program; if not, see <http://www.gnu.org/licenses/>. * * Author: Alexander Duyck <alexander.h.duyck@intel.com> */ @@ -284,7 +283,8 @@ static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb) opt.bands = q->bands; opt.max_bands = q->max_bands; - NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index e83d61ca78c..111d70fddae 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -23,9 +23,11 @@ #include <linux/vmalloc.h> #include <linux/rtnetlink.h> #include <linux/reciprocal_div.h> +#include <linux/rbtree.h> #include <net/netlink.h> #include <net/pkt_sched.h> +#include <net/inet_ecn.h> #define VERSION "1.3" @@ -67,7 +69,8 @@ */ struct netem_sched_data { - /* internal t(ime)fifo qdisc uses sch->q and sch->limit */ + /* internal t(ime)fifo qdisc uses t_root and sch->limit */ + struct rb_root t_root; /* optional qdisc for classful handling (NULL at netem init) */ struct Qdisc *qdisc; @@ -78,16 +81,17 @@ struct netem_sched_data { psched_tdiff_t jitter; u32 loss; + u32 ecn; u32 limit; u32 counter; u32 gap; u32 duplicate; u32 reorder; u32 corrupt; - u32 rate; + u64 rate; s32 packet_overhead; u32 cell_size; - u32 cell_size_reciprocal; + struct reciprocal_value cell_size_reciprocal; s32 cell_overhead; struct crndstate { @@ -106,6 +110,18 @@ struct netem_sched_data { CLG_GILB_ELL, } loss_model; + enum { + TX_IN_GAP_PERIOD = 1, + TX_IN_BURST_PERIOD, + LOST_IN_GAP_PERIOD, + LOST_IN_BURST_PERIOD, + } _4_state_model; + + enum { + GOOD_STATE = 1, + BAD_STATE, + } GE_state_model; + /* Correlated Loss Generation models */ struct clgstate { /* state of the Markov chain */ @@ -126,10 +142,35 @@ struct netem_sched_data { */ struct netem_skb_cb { psched_time_t time_to_send; + ktime_t tstamp_save; }; +/* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp + * to hold a rb_node structure. + * + * If struct sk_buff layout is changed, the following checks will complain. + */ +static struct rb_node *netem_rb_node(struct sk_buff *skb) +{ + BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0); + BUILD_BUG_ON(offsetof(struct sk_buff, prev) != + offsetof(struct sk_buff, next) + sizeof(skb->next)); + BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) != + offsetof(struct sk_buff, prev) + sizeof(skb->prev)); + BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) + + sizeof(skb->prev) + + sizeof(skb->tstamp)); + return (struct rb_node *)&skb->next; +} + +static struct sk_buff *netem_rb_to_skb(struct rb_node *rb) +{ + return (struct sk_buff *)rb; +} + static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { + /* we assume we can use skb next/prev/tstamp as storage for rb_node */ qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb)); return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data; } @@ -140,7 +181,7 @@ static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) static void init_crandom(struct crndstate *state, unsigned long rho) { state->rho = rho; - state->last = net_random(); + state->last = prandom_u32(); } /* get_crandom - correlated random number generator @@ -153,9 +194,9 @@ static u32 get_crandom(struct crndstate *state) unsigned long answer; if (state->rho == 0) /* no correlation */ - return net_random(); + return prandom_u32(); - value = net_random(); + value = prandom_u32(); rho = (u64)state->rho + 1; answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; state->last = answer; @@ -169,51 +210,52 @@ static u32 get_crandom(struct crndstate *state) static bool loss_4state(struct netem_sched_data *q) { struct clgstate *clg = &q->clg; - u32 rnd = net_random(); + u32 rnd = prandom_u32(); /* * Makes a comparison between rnd and the transition * probabilities outgoing from the current state, then decides the * next state and if the next packet has to be transmitted or lost. * The four states correspond to: - * 1 => successfully transmitted packets within a gap period - * 4 => isolated losses within a gap period - * 3 => lost packets within a burst period - * 2 => successfully transmitted packets within a burst period + * TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period + * LOST_IN_BURST_PERIOD => isolated losses within a gap period + * LOST_IN_GAP_PERIOD => lost packets within a burst period + * TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period */ switch (clg->state) { - case 1: + case TX_IN_GAP_PERIOD: if (rnd < clg->a4) { - clg->state = 4; + clg->state = LOST_IN_BURST_PERIOD; return true; - } else if (clg->a4 < rnd && rnd < clg->a1) { - clg->state = 3; + } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) { + clg->state = LOST_IN_GAP_PERIOD; return true; - } else if (clg->a1 < rnd) - clg->state = 1; + } else if (clg->a1 + clg->a4 < rnd) { + clg->state = TX_IN_GAP_PERIOD; + } break; - case 2: + case TX_IN_BURST_PERIOD: if (rnd < clg->a5) { - clg->state = 3; + clg->state = LOST_IN_GAP_PERIOD; return true; - } else - clg->state = 2; + } else { + clg->state = TX_IN_BURST_PERIOD; + } break; - case 3: + case LOST_IN_GAP_PERIOD: if (rnd < clg->a3) - clg->state = 2; + clg->state = TX_IN_BURST_PERIOD; else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) { - clg->state = 1; - return true; + clg->state = TX_IN_GAP_PERIOD; } else if (clg->a2 + clg->a3 < rnd) { - clg->state = 3; + clg->state = LOST_IN_GAP_PERIOD; return true; } break; - case 4: - clg->state = 1; + case LOST_IN_BURST_PERIOD: + clg->state = TX_IN_GAP_PERIOD; break; } @@ -235,15 +277,16 @@ static bool loss_gilb_ell(struct netem_sched_data *q) struct clgstate *clg = &q->clg; switch (clg->state) { - case 1: - if (net_random() < clg->a1) - clg->state = 2; - if (net_random() < clg->a4) + case GOOD_STATE: + if (prandom_u32() < clg->a1) + clg->state = BAD_STATE; + if (prandom_u32() < clg->a4) return true; - case 2: - if (net_random() < clg->a2) - clg->state = 1; - if (clg->a3 > net_random()) + break; + case BAD_STATE: + if (prandom_u32() < clg->a2) + clg->state = GOOD_STATE; + if (prandom_u32() > clg->a3) return true; } @@ -329,29 +372,40 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche return PSCHED_NS2TICKS(ticks); } -static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) +static void tfifo_reset(struct Qdisc *sch) { - struct sk_buff_head *list = &sch->q; - psched_time_t tnext = netem_skb_cb(nskb)->time_to_send; - struct sk_buff *skb; + struct netem_sched_data *q = qdisc_priv(sch); + struct rb_node *p; - if (likely(skb_queue_len(list) < sch->limit)) { - skb = skb_peek_tail(list); - /* Optimize for add at tail */ - if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send)) - return qdisc_enqueue_tail(nskb, sch); + while ((p = rb_first(&q->t_root))) { + struct sk_buff *skb = netem_rb_to_skb(p); - skb_queue_reverse_walk(list, skb) { - if (tnext >= netem_skb_cb(skb)->time_to_send) - break; - } - - __skb_queue_after(list, skb, nskb); - sch->qstats.backlog += qdisc_pkt_len(nskb); - return NET_XMIT_SUCCESS; + rb_erase(p, &q->t_root); + skb->next = NULL; + skb->prev = NULL; + kfree_skb(skb); } +} + +static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + psched_time_t tnext = netem_skb_cb(nskb)->time_to_send; + struct rb_node **p = &q->t_root.rb_node, *parent = NULL; + + while (*p) { + struct sk_buff *skb; - return qdisc_reshape_fail(nskb, sch); + parent = *p; + skb = netem_rb_to_skb(parent); + if (tnext >= netem_skb_cb(skb)->time_to_send) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(netem_rb_node(nskb), parent, p); + rb_insert_color(netem_rb_node(nskb), &q->t_root); + sch->q.qlen++; } /* @@ -366,7 +420,6 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* We don't fill cb now as skb_unshare() may invalidate it */ struct netem_skb_cb *cb; struct sk_buff *skb2; - int ret; int count = 1; /* Random duplication */ @@ -374,16 +427,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) ++count; /* Drop packet? */ - if (loss_event(q)) - --count; - + if (loss_event(q)) { + if (q->ecn && INET_ECN_set_ce(skb)) + sch->qstats.drops++; /* mark packet */ + else + --count; + } if (count == 0) { sch->qstats.drops++; kfree_skb(skb); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } - skb_orphan(skb); + /* If a delay is expected, orphan the skb. (orphaning usually takes + * place at TX completion time, so _before_ the link transit delay) + */ + if (q->latency || q->jitter) + skb_orphan_partial(skb); /* * If we need to duplicate packet, then re-insert at top of the @@ -408,14 +468,18 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { if (!(skb = skb_unshare(skb, GFP_ATOMIC)) || (skb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(skb))) { - sch->qstats.drops++; - return NET_XMIT_DROP; - } + skb_checksum_help(skb))) + return qdisc_drop(skb, sch); - skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8); + skb->data[prandom_u32() % skb_headlen(skb)] ^= + 1<<(prandom_u32() % 8); } + if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) + return qdisc_reshape_fail(skb, sch); + + sch->qstats.backlog += qdisc_pkt_len(skb); + cb = netem_skb_cb(skb); if (q->gap == 0 || /* not doing reordering */ q->counter < q->gap - 1 || /* inside last reordering gap */ @@ -429,25 +493,30 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) now = psched_get_time(); if (q->rate) { - struct sk_buff_head *list = &sch->q; - - delay += packet_len_2_sched_time(skb->len, q); + struct sk_buff *last; - if (!skb_queue_empty(list)) { + if (!skb_queue_empty(&sch->q)) + last = skb_peek_tail(&sch->q); + else + last = netem_rb_to_skb(rb_last(&q->t_root)); + if (last) { /* - * Last packet in queue is reference point (now). - * First packet in queue is already in flight, - * calculate this time bonus and substract + * Last packet in queue is reference point (now), + * calculate this time bonus and subtract * from delay. */ - delay -= now - netem_skb_cb(skb_peek(list))->time_to_send; - now = netem_skb_cb(skb_peek_tail(list))->time_to_send; + delay -= netem_skb_cb(last)->time_to_send - now; + delay = max_t(psched_tdiff_t, 0, delay); + now = netem_skb_cb(last)->time_to_send; } + + delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q); } cb->time_to_send = now + delay; + cb->tstamp_save = skb->tstamp; ++q->counter; - ret = tfifo_enqueue(skb, sch); + tfifo_enqueue(skb, sch); } else { /* * Do re-ordering by putting one out of N packets at the front @@ -457,16 +526,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->counter = 0; __skb_queue_head(&sch->q, skb); - sch->qstats.backlog += qdisc_pkt_len(skb); sch->qstats.requeues++; - ret = NET_XMIT_SUCCESS; - } - - if (ret != NET_XMIT_SUCCESS) { - if (net_xmit_drop_count(ret)) { - sch->qstats.drops++; - return ret; - } } return NET_XMIT_SUCCESS; @@ -478,6 +538,22 @@ static unsigned int netem_drop(struct Qdisc *sch) unsigned int len; len = qdisc_queue_drop(sch); + + if (!len) { + struct rb_node *p = rb_first(&q->t_root); + + if (p) { + struct sk_buff *skb = netem_rb_to_skb(p); + + rb_erase(p, &q->t_root); + sch->q.qlen--; + skb->next = NULL; + skb->prev = NULL; + len = qdisc_pkt_len(skb); + sch->qstats.backlog -= len; + kfree_skb(skb); + } + } if (!len && q->qdisc && q->qdisc->ops->drop) len = q->qdisc->ops->drop(q->qdisc); if (len) @@ -490,20 +566,35 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; + struct rb_node *p; if (qdisc_is_throttled(sch)) return NULL; tfifo_dequeue: - skb = qdisc_peek_head(sch); + skb = __skb_dequeue(&sch->q); if (skb) { - const struct netem_skb_cb *cb = netem_skb_cb(skb); +deliver: + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_unthrottled(sch); + qdisc_bstats_update(sch, skb); + return skb; + } + p = rb_first(&q->t_root); + if (p) { + psched_time_t time_to_send; + + skb = netem_rb_to_skb(p); /* if more time remaining? */ - if (cb->time_to_send <= psched_get_time()) { - skb = qdisc_dequeue_tail(sch); - if (unlikely(!skb)) - goto qdisc_dequeue; + time_to_send = netem_skb_cb(skb)->time_to_send; + if (time_to_send <= psched_get_time()) { + rb_erase(p, &q->t_root); + + sch->q.qlen--; + skb->next = NULL; + skb->prev = NULL; + skb->tstamp = netem_skb_cb(skb)->tstamp_save; #ifdef CONFIG_NET_CLS_ACT /* @@ -525,10 +616,7 @@ tfifo_dequeue: } goto tfifo_dequeue; } -deliver: - qdisc_unthrottled(sch); - qdisc_bstats_update(sch, skb); - return skb; + goto deliver; } if (q->qdisc) { @@ -536,10 +624,9 @@ deliver: if (skb) goto deliver; } - qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send); + qdisc_watchdog_schedule(&q->watchdog, time_to_send); } -qdisc_dequeue: if (q->qdisc) { skb = q->qdisc->ops->dequeue(q->qdisc); if (skb) @@ -553,6 +640,7 @@ static void netem_reset(struct Qdisc *sch) struct netem_sched_data *q = qdisc_priv(sch); qdisc_reset_queue(sch); + tfifo_reset(sch); if (q->qdisc) qdisc_reset(q->qdisc); qdisc_watchdog_cancel(&q->watchdog); @@ -560,12 +648,7 @@ static void netem_reset(struct Qdisc *sch) static void dist_free(struct disttable *d) { - if (d) { - if (is_vmalloc_addr(d)) - vfree(d); - else - kfree(d); - } + kvfree(d); } /* @@ -606,9 +689,8 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) return 0; } -static void get_correlation(struct Qdisc *sch, const struct nlattr *attr) +static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); const struct tc_netem_corr *c = nla_data(attr); init_crandom(&q->delay_cor, c->delay_corr); @@ -616,47 +698,45 @@ static void get_correlation(struct Qdisc *sch, const struct nlattr *attr) init_crandom(&q->dup_cor, c->dup_corr); } -static void get_reorder(struct Qdisc *sch, const struct nlattr *attr) +static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); const struct tc_netem_reorder *r = nla_data(attr); q->reorder = r->probability; init_crandom(&q->reorder_cor, r->correlation); } -static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr) +static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); const struct tc_netem_corrupt *r = nla_data(attr); q->corrupt = r->probability; init_crandom(&q->corrupt_cor, r->correlation); } -static void get_rate(struct Qdisc *sch, const struct nlattr *attr) +static void get_rate(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); const struct tc_netem_rate *r = nla_data(attr); q->rate = r->rate; q->packet_overhead = r->packet_overhead; q->cell_size = r->cell_size; + q->cell_overhead = r->cell_overhead; if (q->cell_size) q->cell_size_reciprocal = reciprocal_value(q->cell_size); - q->cell_overhead = r->cell_overhead; + else + q->cell_size_reciprocal = (struct reciprocal_value) { 0 }; } -static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr) +static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); const struct nlattr *la; int rem; nla_for_each_nested(la, attr, rem) { u16 type = nla_type(la); - switch(type) { + switch (type) { case NETEM_LOSS_GI: { const struct tc_netem_gimodel *gi = nla_data(la); @@ -667,7 +747,7 @@ static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr) q->loss_model = CLG_4_STATES; - q->clg.state = 1; + q->clg.state = TX_IN_GAP_PERIOD; q->clg.a1 = gi->p13; q->clg.a2 = gi->p31; q->clg.a3 = gi->p32; @@ -685,7 +765,7 @@ static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr) } q->loss_model = CLG_GILB_ELL; - q->clg.state = 1; + q->clg.state = GOOD_STATE; q->clg.a1 = ge->p; q->clg.a2 = ge->r; q->clg.a3 = ge->h; @@ -708,6 +788,8 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) }, [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) }, [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, + [TCA_NETEM_ECN] = { .type = NLA_U32 }, + [TCA_NETEM_RATE64] = { .type = NLA_U64 }, }; static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, @@ -734,6 +816,8 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) struct netem_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_NETEM_MAX + 1]; struct tc_netem_qopt *qopt; + struct clgstate old_clg; + int old_loss_model = CLG_RANDOM; int ret; if (opt == NULL) @@ -744,6 +828,33 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) if (ret < 0) return ret; + /* backup q->clg and q->loss_model */ + old_clg = q->clg; + old_loss_model = q->loss_model; + + if (tb[TCA_NETEM_LOSS]) { + ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]); + if (ret) { + q->loss_model = old_loss_model; + return ret; + } + } else { + q->loss_model = CLG_RANDOM; + } + + if (tb[TCA_NETEM_DELAY_DIST]) { + ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); + if (ret) { + /* recover clg and loss_model, in case of + * q->clg and q->loss_model were modified + * in get_loss_clg() + */ + q->clg = old_clg; + q->loss_model = old_loss_model; + return ret; + } + } + sch->limit = qopt->limit; q->latency = qopt->latency; @@ -761,26 +872,23 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) q->reorder = ~0; if (tb[TCA_NETEM_CORR]) - get_correlation(sch, tb[TCA_NETEM_CORR]); - - if (tb[TCA_NETEM_DELAY_DIST]) { - ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); - if (ret) - return ret; - } + get_correlation(q, tb[TCA_NETEM_CORR]); if (tb[TCA_NETEM_REORDER]) - get_reorder(sch, tb[TCA_NETEM_REORDER]); + get_reorder(q, tb[TCA_NETEM_REORDER]); if (tb[TCA_NETEM_CORRUPT]) - get_corrupt(sch, tb[TCA_NETEM_CORRUPT]); + get_corrupt(q, tb[TCA_NETEM_CORRUPT]); if (tb[TCA_NETEM_RATE]) - get_rate(sch, tb[TCA_NETEM_RATE]); + get_rate(q, tb[TCA_NETEM_RATE]); - q->loss_model = CLG_RANDOM; - if (tb[TCA_NETEM_LOSS]) - ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]); + if (tb[TCA_NETEM_RATE64]) + q->rate = max_t(u64, q->rate, + nla_get_u64(tb[TCA_NETEM_RATE64])); + + if (tb[TCA_NETEM_ECN]) + q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]); return ret; } @@ -836,7 +944,8 @@ static int dump_loss_model(const struct netem_sched_data *q, .p23 = q->clg.a5, }; - NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi); + if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi)) + goto nla_put_failure; break; } case CLG_GILB_ELL: { @@ -847,7 +956,8 @@ static int dump_loss_model(const struct netem_sched_data *q, .k1 = q->clg.a4, }; - NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge); + if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge)) + goto nla_put_failure; break; } } @@ -876,26 +986,40 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) qopt.loss = q->loss; qopt.gap = q->gap; qopt.duplicate = q->duplicate; - NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) + goto nla_put_failure; cor.delay_corr = q->delay_cor.rho; cor.loss_corr = q->loss_cor.rho; cor.dup_corr = q->dup_cor.rho; - NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor); + if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor)) + goto nla_put_failure; reorder.probability = q->reorder; reorder.correlation = q->reorder_cor.rho; - NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder); + if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder)) + goto nla_put_failure; corrupt.probability = q->corrupt; corrupt.correlation = q->corrupt_cor.rho; - NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); + if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt)) + goto nla_put_failure; - rate.rate = q->rate; + if (q->rate >= (1ULL << 32)) { + if (nla_put_u64(skb, TCA_NETEM_RATE64, q->rate)) + goto nla_put_failure; + rate.rate = ~0U; + } else { + rate.rate = q->rate; + } rate.packet_overhead = q->packet_overhead; rate.cell_size = q->cell_size; rate.cell_overhead = q->cell_overhead; - NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate); + if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate)) + goto nla_put_failure; + + if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn)) + goto nla_put_failure; if (dump_loss_model(q, skb) != 0) goto nla_put_failure; diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c new file mode 100644 index 00000000000..fefeeb73f15 --- /dev/null +++ b/net/sched/sch_pie.c @@ -0,0 +1,566 @@ +/* Copyright (C) 2013 Cisco Systems, Inc, 2013. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Author: Vijay Subramanian <vijaynsu@cisco.com> + * Author: Mythili Prabhu <mysuryan@cisco.com> + * + * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no> + * University of Oslo, Norway. + * + * References: + * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00 + * IEEE Conference on High Performance Switching and Routing 2013 : + * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem" + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <net/pkt_sched.h> +#include <net/inet_ecn.h> + +#define QUEUE_THRESHOLD 10000 +#define DQCOUNT_INVALID -1 +#define MAX_PROB 0xffffffff +#define PIE_SCALE 8 + +/* parameters used */ +struct pie_params { + psched_time_t target; /* user specified target delay in pschedtime */ + u32 tupdate; /* timer frequency (in jiffies) */ + u32 limit; /* number of packets that can be enqueued */ + u32 alpha; /* alpha and beta are between 0 and 32 */ + u32 beta; /* and are used for shift relative to 1 */ + bool ecn; /* true if ecn is enabled */ + bool bytemode; /* to scale drop early prob based on pkt size */ +}; + +/* variables used */ +struct pie_vars { + u32 prob; /* probability but scaled by u32 limit. */ + psched_time_t burst_time; + psched_time_t qdelay; + psched_time_t qdelay_old; + u64 dq_count; /* measured in bytes */ + psched_time_t dq_tstamp; /* drain rate */ + u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ + u32 qlen_old; /* in bytes */ +}; + +/* statistics gathering */ +struct pie_stats { + u32 packets_in; /* total number of packets enqueued */ + u32 dropped; /* packets dropped due to pie_action */ + u32 overlimit; /* dropped due to lack of space in queue */ + u32 maxq; /* maximum queue size */ + u32 ecn_mark; /* packets marked with ECN */ +}; + +/* private data for the Qdisc */ +struct pie_sched_data { + struct pie_params params; + struct pie_vars vars; + struct pie_stats stats; + struct timer_list adapt_timer; +}; + +static void pie_params_init(struct pie_params *params) +{ + params->alpha = 2; + params->beta = 20; + params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */ + params->limit = 1000; /* default of 1000 packets */ + params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */ + params->ecn = false; + params->bytemode = false; +} + +static void pie_vars_init(struct pie_vars *vars) +{ + vars->dq_count = DQCOUNT_INVALID; + vars->avg_dq_rate = 0; + /* default of 100 ms in pschedtime */ + vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC); +} + +static bool drop_early(struct Qdisc *sch, u32 packet_size) +{ + struct pie_sched_data *q = qdisc_priv(sch); + u32 rnd; + u32 local_prob = q->vars.prob; + u32 mtu = psched_mtu(qdisc_dev(sch)); + + /* If there is still burst allowance left skip random early drop */ + if (q->vars.burst_time > 0) + return false; + + /* If current delay is less than half of target, and + * if drop prob is low already, disable early_drop + */ + if ((q->vars.qdelay < q->params.target / 2) + && (q->vars.prob < MAX_PROB / 5)) + return false; + + /* If we have fewer than 2 mtu-sized packets, disable drop_early, + * similar to min_th in RED + */ + if (sch->qstats.backlog < 2 * mtu) + return false; + + /* If bytemode is turned on, use packet size to compute new + * probablity. Smaller packets will have lower drop prob in this case + */ + if (q->params.bytemode && packet_size <= mtu) + local_prob = (local_prob / mtu) * packet_size; + else + local_prob = q->vars.prob; + + rnd = prandom_u32(); + if (rnd < local_prob) + return true; + + return false; +} + +static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct pie_sched_data *q = qdisc_priv(sch); + bool enqueue = false; + + if (unlikely(qdisc_qlen(sch) >= sch->limit)) { + q->stats.overlimit++; + goto out; + } + + if (!drop_early(sch, skb->len)) { + enqueue = true; + } else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) && + INET_ECN_set_ce(skb)) { + /* If packet is ecn capable, mark it if drop probability + * is lower than 10%, else drop it. + */ + q->stats.ecn_mark++; + enqueue = true; + } + + /* we can enqueue the packet */ + if (enqueue) { + q->stats.packets_in++; + if (qdisc_qlen(sch) > q->stats.maxq) + q->stats.maxq = qdisc_qlen(sch); + + return qdisc_enqueue_tail(skb, sch); + } + +out: + q->stats.dropped++; + return qdisc_drop(skb, sch); +} + +static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { + [TCA_PIE_TARGET] = {.type = NLA_U32}, + [TCA_PIE_LIMIT] = {.type = NLA_U32}, + [TCA_PIE_TUPDATE] = {.type = NLA_U32}, + [TCA_PIE_ALPHA] = {.type = NLA_U32}, + [TCA_PIE_BETA] = {.type = NLA_U32}, + [TCA_PIE_ECN] = {.type = NLA_U32}, + [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, +}; + +static int pie_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct pie_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_PIE_MAX + 1]; + unsigned int qlen; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + + /* convert from microseconds to pschedtime */ + if (tb[TCA_PIE_TARGET]) { + /* target is in us */ + u32 target = nla_get_u32(tb[TCA_PIE_TARGET]); + + /* convert to pschedtime */ + q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); + } + + /* tupdate is in jiffies */ + if (tb[TCA_PIE_TUPDATE]) + q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); + + if (tb[TCA_PIE_LIMIT]) { + u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]); + + q->params.limit = limit; + sch->limit = limit; + } + + if (tb[TCA_PIE_ALPHA]) + q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]); + + if (tb[TCA_PIE_BETA]) + q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]); + + if (tb[TCA_PIE_ECN]) + q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]); + + if (tb[TCA_PIE_BYTEMODE]) + q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]); + + /* Drop excess packets if new limit is lower */ + qlen = sch->q.qlen; + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = __skb_dequeue(&sch->q); + + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_drop(skb, sch); + } + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + + sch_tree_unlock(sch); + return 0; +} + +static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) +{ + + struct pie_sched_data *q = qdisc_priv(sch); + int qlen = sch->qstats.backlog; /* current queue size in bytes */ + + /* If current queue is about 10 packets or more and dq_count is unset + * we have enough packets to calculate the drain rate. Save + * current time as dq_tstamp and start measurement cycle. + */ + if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) { + q->vars.dq_tstamp = psched_get_time(); + q->vars.dq_count = 0; + } + + /* Calculate the average drain rate from this value. If queue length + * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset + * the dq_count to -1 as we don't have enough packets to calculate the + * drain rate anymore The following if block is entered only when we + * have a substantial queue built up (QUEUE_THRESHOLD bytes or more) + * and we calculate the drain rate for the threshold here. dq_count is + * in bytes, time difference in psched_time, hence rate is in + * bytes/psched_time. + */ + if (q->vars.dq_count != DQCOUNT_INVALID) { + q->vars.dq_count += skb->len; + + if (q->vars.dq_count >= QUEUE_THRESHOLD) { + psched_time_t now = psched_get_time(); + u32 dtime = now - q->vars.dq_tstamp; + u32 count = q->vars.dq_count << PIE_SCALE; + + if (dtime == 0) + return; + + count = count / dtime; + + if (q->vars.avg_dq_rate == 0) + q->vars.avg_dq_rate = count; + else + q->vars.avg_dq_rate = + (q->vars.avg_dq_rate - + (q->vars.avg_dq_rate >> 3)) + (count >> 3); + + /* If the queue has receded below the threshold, we hold + * on to the last drain rate calculated, else we reset + * dq_count to 0 to re-enter the if block when the next + * packet is dequeued + */ + if (qlen < QUEUE_THRESHOLD) + q->vars.dq_count = DQCOUNT_INVALID; + else { + q->vars.dq_count = 0; + q->vars.dq_tstamp = psched_get_time(); + } + + if (q->vars.burst_time > 0) { + if (q->vars.burst_time > dtime) + q->vars.burst_time -= dtime; + else + q->vars.burst_time = 0; + } + } + } +} + +static void calculate_probability(struct Qdisc *sch) +{ + struct pie_sched_data *q = qdisc_priv(sch); + u32 qlen = sch->qstats.backlog; /* queue size in bytes */ + psched_time_t qdelay = 0; /* in pschedtime */ + psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */ + s32 delta = 0; /* determines the change in probability */ + u32 oldprob; + u32 alpha, beta; + bool update_prob = true; + + q->vars.qdelay_old = q->vars.qdelay; + + if (q->vars.avg_dq_rate > 0) + qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate; + else + qdelay = 0; + + /* If qdelay is zero and qlen is not, it means qlen is very small, less + * than dequeue_rate, so we do not update probabilty in this round + */ + if (qdelay == 0 && qlen != 0) + update_prob = false; + + /* In the algorithm, alpha and beta are between 0 and 2 with typical + * value for alpha as 0.125. In this implementation, we use values 0-32 + * passed from user space to represent this. Also, alpha and beta have + * unit of HZ and need to be scaled before they can used to update + * probability. alpha/beta are updated locally below by 1) scaling them + * appropriately 2) scaling down by 16 to come to 0-2 range. + * Please see paper for details. + * + * We scale alpha and beta differently depending on whether we are in + * light, medium or high dropping mode. + */ + if (q->vars.prob < MAX_PROB / 100) { + alpha = + (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; + beta = + (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; + } else if (q->vars.prob < MAX_PROB / 10) { + alpha = + (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; + beta = + (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; + } else { + alpha = + (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + beta = + (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + } + + /* alpha and beta should be between 0 and 32, in multiples of 1/16 */ + delta += alpha * ((qdelay - q->params.target)); + delta += beta * ((qdelay - qdelay_old)); + + oldprob = q->vars.prob; + + /* to ensure we increase probability in steps of no more than 2% */ + if (delta > (s32) (MAX_PROB / (100 / 2)) && + q->vars.prob >= MAX_PROB / 10) + delta = (MAX_PROB / 100) * 2; + + /* Non-linear drop: + * Tune drop probability to increase quickly for high delays(>= 250ms) + * 250ms is derived through experiments and provides error protection + */ + + if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) + delta += MAX_PROB / (100 / 2); + + q->vars.prob += delta; + + if (delta > 0) { + /* prevent overflow */ + if (q->vars.prob < oldprob) { + q->vars.prob = MAX_PROB; + /* Prevent normalization error. If probability is at + * maximum value already, we normalize it here, and + * skip the check to do a non-linear drop in the next + * section. + */ + update_prob = false; + } + } else { + /* prevent underflow */ + if (q->vars.prob > oldprob) + q->vars.prob = 0; + } + + /* Non-linear drop in probability: Reduce drop probability quickly if + * delay is 0 for 2 consecutive Tupdate periods. + */ + + if ((qdelay == 0) && (qdelay_old == 0) && update_prob) + q->vars.prob = (q->vars.prob * 98) / 100; + + q->vars.qdelay = qdelay; + q->vars.qlen_old = qlen; + + /* We restart the measurement cycle if the following conditions are met + * 1. If the delay has been low for 2 consecutive Tupdate periods + * 2. Calculated drop probability is zero + * 3. We have atleast one estimate for the avg_dq_rate ie., + * is a non-zero value + */ + if ((q->vars.qdelay < q->params.target / 2) && + (q->vars.qdelay_old < q->params.target / 2) && + (q->vars.prob == 0) && + (q->vars.avg_dq_rate > 0)) + pie_vars_init(&q->vars); +} + +static void pie_timer(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc *)arg; + struct pie_sched_data *q = qdisc_priv(sch); + spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + + spin_lock(root_lock); + calculate_probability(sch); + + /* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */ + if (q->params.tupdate) + mod_timer(&q->adapt_timer, jiffies + q->params.tupdate); + spin_unlock(root_lock); + +} + +static int pie_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct pie_sched_data *q = qdisc_priv(sch); + + pie_params_init(&q->params); + pie_vars_init(&q->vars); + sch->limit = q->params.limit; + + setup_timer(&q->adapt_timer, pie_timer, (unsigned long)sch); + mod_timer(&q->adapt_timer, jiffies + HZ / 2); + + if (opt) { + int err = pie_change(sch, opt); + + if (err) + return err; + } + + return 0; +} + +static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct pie_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + /* convert target from pschedtime to us */ + if (nla_put_u32(skb, TCA_PIE_TARGET, + ((u32) PSCHED_TICKS2NS(q->params.target)) / + NSEC_PER_USEC) || + nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) || + nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) || + nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || + nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || + nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || + nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -1; + +} + +static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct pie_sched_data *q = qdisc_priv(sch); + struct tc_pie_xstats st = { + .prob = q->vars.prob, + .delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) / + NSEC_PER_USEC, + /* unscale and return dq_rate in bytes per sec */ + .avg_dq_rate = q->vars.avg_dq_rate * + (PSCHED_TICKS_PER_SEC) >> PIE_SCALE, + .packets_in = q->stats.packets_in, + .overlimit = q->stats.overlimit, + .maxq = q->stats.maxq, + .dropped = q->stats.dropped, + .ecn_mark = q->stats.ecn_mark, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb; + skb = __qdisc_dequeue_head(sch, &sch->q); + + if (!skb) + return NULL; + + pie_process_dequeue(sch, skb); + return skb; +} + +static void pie_reset(struct Qdisc *sch) +{ + struct pie_sched_data *q = qdisc_priv(sch); + qdisc_reset_queue(sch); + pie_vars_init(&q->vars); +} + +static void pie_destroy(struct Qdisc *sch) +{ + struct pie_sched_data *q = qdisc_priv(sch); + q->params.tupdate = 0; + del_timer_sync(&q->adapt_timer); +} + +static struct Qdisc_ops pie_qdisc_ops __read_mostly = { + .id = "pie", + .priv_size = sizeof(struct pie_sched_data), + .enqueue = pie_qdisc_enqueue, + .dequeue = pie_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .init = pie_init, + .destroy = pie_destroy, + .reset = pie_reset, + .change = pie_change, + .dump = pie_dump, + .dump_stats = pie_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init pie_module_init(void) +{ + return register_qdisc(&pie_qdisc_ops); +} + +static void __exit pie_module_exit(void) +{ + unregister_qdisc(&pie_qdisc_ops); +} + +module_init(pie_module_init); +module_exit(pie_module_exit); + +MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler"); +MODULE_AUTHOR("Vijay Subramanian"); +MODULE_AUTHOR("Mythili Prabhu"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c new file mode 100644 index 00000000000..89f8fcf73f1 --- /dev/null +++ b/net/sched/sch_plug.c @@ -0,0 +1,233 @@ +/* + * sch_plug.c Queue traffic until an explicit release command + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * There are two ways to use this qdisc: + * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating + * sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands. + * + * 2. For network output buffering (a.k.a output commit) functionality. + * Output commit property is commonly used by applications using checkpoint + * based fault-tolerance to ensure that the checkpoint from which a system + * is being restored is consistent w.r.t outside world. + * + * Consider for e.g. Remus - a Virtual Machine checkpointing system, + * wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated + * asynchronously to the backup host, while the VM continues executing the + * next epoch speculatively. + * + * The following is a typical sequence of output buffer operations: + * 1.At epoch i, start_buffer(i) + * 2. At end of epoch i (i.e. after 50ms): + * 2.1 Stop VM and take checkpoint(i). + * 2.2 start_buffer(i+1) and Resume VM + * 3. While speculatively executing epoch(i+1), asynchronously replicate + * checkpoint(i) to backup host. + * 4. When checkpoint_ack(i) is received from backup, release_buffer(i) + * Thus, this Qdisc would receive the following sequence of commands: + * TCQ_PLUG_BUFFER (epoch i) + * .. TCQ_PLUG_BUFFER (epoch i+1) + * ....TCQ_PLUG_RELEASE_ONE (epoch i) + * ......TCQ_PLUG_BUFFER (epoch i+2) + * ........ + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/pkt_sched.h> + +/* + * State of the queue, when used for network output buffering: + * + * plug(i+1) plug(i) head + * ------------------+--------------------+----------------> + * | | + * | | + * pkts_current_epoch| pkts_last_epoch |pkts_to_release + * ----------------->|<--------+--------->|+---------------> + * v v + * + */ + +struct plug_sched_data { + /* If true, the dequeue function releases all packets + * from head to end of the queue. The queue turns into + * a pass-through queue for newly arriving packets. + */ + bool unplug_indefinite; + + /* Queue Limit in bytes */ + u32 limit; + + /* Number of packets (output) from the current speculatively + * executing epoch. + */ + u32 pkts_current_epoch; + + /* Number of packets corresponding to the recently finished + * epoch. These will be released when we receive a + * TCQ_PLUG_RELEASE_ONE command. This command is typically + * issued after committing a checkpoint at the target. + */ + u32 pkts_last_epoch; + + /* + * Number of packets from the head of the queue, that can + * be released (committed checkpoint). + */ + u32 pkts_to_release; +}; + +static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct plug_sched_data *q = qdisc_priv(sch); + + if (likely(sch->qstats.backlog + skb->len <= q->limit)) { + if (!q->unplug_indefinite) + q->pkts_current_epoch++; + return qdisc_enqueue_tail(skb, sch); + } + + return qdisc_reshape_fail(skb, sch); +} + +static struct sk_buff *plug_dequeue(struct Qdisc *sch) +{ + struct plug_sched_data *q = qdisc_priv(sch); + + if (qdisc_is_throttled(sch)) + return NULL; + + if (!q->unplug_indefinite) { + if (!q->pkts_to_release) { + /* No more packets to dequeue. Block the queue + * and wait for the next release command. + */ + qdisc_throttled(sch); + return NULL; + } + q->pkts_to_release--; + } + + return qdisc_dequeue_head(sch); +} + +static int plug_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct plug_sched_data *q = qdisc_priv(sch); + + q->pkts_current_epoch = 0; + q->pkts_last_epoch = 0; + q->pkts_to_release = 0; + q->unplug_indefinite = false; + + if (opt == NULL) { + /* We will set a default limit of 100 pkts (~150kB) + * in case tx_queue_len is not available. The + * default value is completely arbitrary. + */ + u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100; + q->limit = pkt_limit * psched_mtu(qdisc_dev(sch)); + } else { + struct tc_plug_qopt *ctl = nla_data(opt); + + if (nla_len(opt) < sizeof(*ctl)) + return -EINVAL; + + q->limit = ctl->limit; + } + + qdisc_throttled(sch); + return 0; +} + +/* Receives 4 types of messages: + * TCQ_PLUG_BUFFER: Inset a plug into the queue and + * buffer any incoming packets + * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head + * to beginning of the next plug. + * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. + * Stop buffering packets until the next TCQ_PLUG_BUFFER + * command is received (just act as a pass-thru queue). + * TCQ_PLUG_LIMIT: Increase/decrease queue size + */ +static int plug_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct plug_sched_data *q = qdisc_priv(sch); + struct tc_plug_qopt *msg; + + if (opt == NULL) + return -EINVAL; + + msg = nla_data(opt); + if (nla_len(opt) < sizeof(*msg)) + return -EINVAL; + + switch (msg->action) { + case TCQ_PLUG_BUFFER: + /* Save size of the current buffer */ + q->pkts_last_epoch = q->pkts_current_epoch; + q->pkts_current_epoch = 0; + if (q->unplug_indefinite) + qdisc_throttled(sch); + q->unplug_indefinite = false; + break; + case TCQ_PLUG_RELEASE_ONE: + /* Add packets from the last complete buffer to the + * packets to be released set. + */ + q->pkts_to_release += q->pkts_last_epoch; + q->pkts_last_epoch = 0; + qdisc_unthrottled(sch); + netif_schedule_queue(sch->dev_queue); + break; + case TCQ_PLUG_RELEASE_INDEFINITE: + q->unplug_indefinite = true; + q->pkts_to_release = 0; + q->pkts_last_epoch = 0; + q->pkts_current_epoch = 0; + qdisc_unthrottled(sch); + netif_schedule_queue(sch->dev_queue); + break; + case TCQ_PLUG_LIMIT: + /* Limit is supplied in bytes */ + q->limit = msg->limit; + break; + default: + return -EINVAL; + } + + return 0; +} + +static struct Qdisc_ops plug_qdisc_ops __read_mostly = { + .id = "plug", + .priv_size = sizeof(struct plug_sched_data), + .enqueue = plug_enqueue, + .dequeue = plug_dequeue, + .peek = qdisc_peek_head, + .init = plug_init, + .change = plug_change, + .owner = THIS_MODULE, +}; + +static int __init plug_module_init(void) +{ + return register_qdisc(&plug_qdisc_ops); +} + +static void __exit plug_module_exit(void) +{ + unregister_qdisc(&plug_qdisc_ops); +} +module_init(plug_module_init) +module_exit(plug_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index b5d56a22b1d..79359b69ad8 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -247,7 +247,8 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) opt.bands = q->bands; memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1); - NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index e68cb440756..8056fb4e618 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1,7 +1,8 @@ /* - * net/sched/sch_qfq.c Quick Fair Queueing Scheduler. + * net/sched/sch_qfq.c Quick Fair Queueing Plus Scheduler. * * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente. + * Copyright (c) 2012 Paolo Valente. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -19,12 +20,18 @@ #include <net/pkt_cls.h> -/* Quick Fair Queueing - =================== +/* Quick Fair Queueing Plus + ======================== Sources: - Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient + [1] Paolo Valente, + "Reducing the Execution Time of Fair-Queueing Schedulers." + http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf + + Sources for QFQ: + + [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient Packet Scheduling with Tight Bandwidth Distribution Guarantees." See also: @@ -33,6 +40,20 @@ /* + QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES + classes. Each aggregate is timestamped with a virtual start time S + and a virtual finish time F, and scheduled according to its + timestamps. S and F are computed as a function of a system virtual + time function V. The classes within each aggregate are instead + scheduled with DRR. + + To speed up operations, QFQ+ divides also aggregates into a limited + number of groups. Which group a class belongs to depends on the + ratio between the maximum packet length for the class and the weight + of the class. Groups have their own S and F. In the end, QFQ+ + schedules groups, then aggregates within groups, then classes within + aggregates. See [1] and [2] for a full description. + Virtual time computations. S, F and V are all computed in fixed point arithmetic with @@ -76,26 +97,27 @@ #define QFQ_MAX_SLOTS 32 /* - * Shifts used for class<->group mapping. We allow class weights that are - * in the range [1, 2^MAX_WSHIFT], and we try to map each class i to the + * Shifts used for aggregate<->group mapping. We allow class weights that are + * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the * group with the smallest index that can support the L_i / r_i configured - * for the class. + * for the classes in the aggregate. * * grp->index is the index of the group; and grp->slot_shift * is the shift for the corresponding (scaled) sigma_i. */ -#define QFQ_MAX_INDEX 19 -#define QFQ_MAX_WSHIFT 16 +#define QFQ_MAX_INDEX 24 +#define QFQ_MAX_WSHIFT 10 -#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) -#define QFQ_MAX_WSUM (2*QFQ_MAX_WEIGHT) +#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */ +#define QFQ_MAX_WSUM (64*QFQ_MAX_WEIGHT) #define FRAC_BITS 30 /* fixed point arithmetic */ #define ONE_FP (1UL << FRAC_BITS) -#define IWSUM (ONE_FP/QFQ_MAX_WSUM) -#define QFQ_MTU_SHIFT 11 -#define QFQ_MIN_SLOT_SHIFT (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX) +#define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */ +#define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */ + +#define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */ /* * Possible group states. These values are used as indexes for the bitmaps @@ -105,6 +127,8 @@ enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE }; struct qfq_group; +struct qfq_aggregate; + struct qfq_class { struct Qdisc_class_common common; @@ -113,9 +137,14 @@ struct qfq_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct Qdisc *qdisc; + struct list_head alist; /* Link for active-classes list. */ + struct qfq_aggregate *agg; /* Parent aggregate. */ + int deficit; /* DRR deficit counter. */ +}; +struct qfq_aggregate { struct hlist_node next; /* Link for the slot list. */ u64 S, F; /* flow timestamps (exact) */ @@ -126,8 +155,18 @@ struct qfq_class { struct qfq_group *grp; /* these are copied from the flowset. */ - u32 inv_w; /* ONE_FP/weight */ - u32 lmax; /* Max packet size for this flow. */ + u32 class_weight; /* Weight of each class in this aggregate. */ + /* Max pkt size for the classes in this aggregate, DRR quantum. */ + int lmax; + + u32 inv_w; /* ONE_FP/(sum of weights of classes in aggr.). */ + u32 budgetmax; /* Max budget for this aggregate. */ + u32 initial_budget, budget; /* Initial and current budget. */ + + int num_classes; /* Number of classes in this aggr. */ + struct list_head active; /* DRR queue of active classes. */ + + struct hlist_node nonfull_next; /* See nonfull_aggs in qfq_sched. */ }; struct qfq_group { @@ -137,7 +176,7 @@ struct qfq_group { unsigned int front; /* Index of the front slot. */ unsigned long full_slots; /* non-empty slots */ - /* Array of RR lists of active classes. */ + /* Array of RR lists of active aggregates. */ struct hlist_head slots[QFQ_MAX_SLOTS]; }; @@ -145,13 +184,29 @@ struct qfq_sched { struct tcf_proto *filter_list; struct Qdisc_class_hash clhash; - u64 V; /* Precise virtual time. */ - u32 wsum; /* weight sum */ + u64 oldV, V; /* Precise virtual times. */ + struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */ + u32 num_active_agg; /* Num. of active aggregates */ + u32 wsum; /* weight sum */ + u32 iwsum; /* inverse weight sum */ unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ + u32 min_slot_shift; /* Index of the group-0 bit in the bitmaps. */ + + u32 max_agg_classes; /* Max number of classes per aggr. */ + struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */ }; +/* + * Possible reasons why the timestamps of an aggregate are updated + * enqueue: the aggregate switches from idle to active and must scheduled + * for service + * requeue: the aggregate finishes its budget, so it stops being served and + * must be rescheduled for service + */ +enum update_reason {enqueue, requeue}; + static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) { struct qfq_sched *q = qdisc_priv(sch); @@ -181,18 +236,18 @@ static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = { * index = log_2(maxlen/weight) but we need to apply the scaling. * This is used only once at flow creation. */ -static int qfq_calc_index(u32 inv_w, unsigned int maxlen) +static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift) { u64 slot_size = (u64)maxlen * inv_w; unsigned long size_map; int index = 0; - size_map = slot_size >> QFQ_MIN_SLOT_SHIFT; + size_map = slot_size >> min_slot_shift; if (!size_map) goto out; index = __fls(size_map) + 1; /* basically a log_2 */ - index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); + index -= !(slot_size - (1ULL << (index + min_slot_shift - 1))); if (index < 0) index = 0; @@ -203,14 +258,160 @@ out: return index; } +static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *); +static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *, + enum update_reason); + +static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg, + u32 lmax, u32 weight) +{ + INIT_LIST_HEAD(&agg->active); + hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); + + agg->lmax = lmax; + agg->class_weight = weight; +} + +static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q, + u32 lmax, u32 weight) +{ + struct qfq_aggregate *agg; + + hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next) + if (agg->lmax == lmax && agg->class_weight == weight) + return agg; + + return NULL; +} + + +/* Update aggregate as a function of the new number of classes. */ +static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg, + int new_num_classes) +{ + u32 new_agg_weight; + + if (new_num_classes == q->max_agg_classes) + hlist_del_init(&agg->nonfull_next); + + if (agg->num_classes > new_num_classes && + new_num_classes == q->max_agg_classes - 1) /* agg no more full */ + hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); + + /* The next assignment may let + * agg->initial_budget > agg->budgetmax + * hold, we will take it into account in charge_actual_service(). + */ + agg->budgetmax = new_num_classes * agg->lmax; + new_agg_weight = agg->class_weight * new_num_classes; + agg->inv_w = ONE_FP/new_agg_weight; + + if (agg->grp == NULL) { + int i = qfq_calc_index(agg->inv_w, agg->budgetmax, + q->min_slot_shift); + agg->grp = &q->groups[i]; + } + + q->wsum += + (int) agg->class_weight * (new_num_classes - agg->num_classes); + q->iwsum = ONE_FP / q->wsum; + + agg->num_classes = new_num_classes; +} + +/* Add class to aggregate. */ +static void qfq_add_to_agg(struct qfq_sched *q, + struct qfq_aggregate *agg, + struct qfq_class *cl) +{ + cl->agg = agg; + + qfq_update_agg(q, agg, agg->num_classes+1); + if (cl->qdisc->q.qlen > 0) { /* adding an active class */ + list_add_tail(&cl->alist, &agg->active); + if (list_first_entry(&agg->active, struct qfq_class, alist) == + cl && q->in_serv_agg != agg) /* agg was inactive */ + qfq_activate_agg(q, agg, enqueue); /* schedule agg */ + } +} + +static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *); + +static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg) +{ + if (!hlist_unhashed(&agg->nonfull_next)) + hlist_del_init(&agg->nonfull_next); + q->wsum -= agg->class_weight; + if (q->wsum != 0) + q->iwsum = ONE_FP / q->wsum; + + if (q->in_serv_agg == agg) + q->in_serv_agg = qfq_choose_next_agg(q); + kfree(agg); +} + +/* Deschedule class from within its parent aggregate. */ +static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) +{ + struct qfq_aggregate *agg = cl->agg; + + + list_del(&cl->alist); /* remove from RR queue of the aggregate */ + if (list_empty(&agg->active)) /* agg is now inactive */ + qfq_deactivate_agg(q, agg); +} + +/* Remove class from its parent aggregate. */ +static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) +{ + struct qfq_aggregate *agg = cl->agg; + + cl->agg = NULL; + if (agg->num_classes == 1) { /* agg being emptied, destroy it */ + qfq_destroy_agg(q, agg); + return; + } + qfq_update_agg(q, agg, agg->num_classes-1); +} + +/* Deschedule class and remove it from its parent aggregate. */ +static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) +{ + if (cl->qdisc->q.qlen > 0) /* class is active */ + qfq_deactivate_class(q, cl); + + qfq_rm_from_agg(q, cl); +} + +/* Move class to a new aggregate, matching the new class weight and/or lmax */ +static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight, + u32 lmax) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight); + + if (new_agg == NULL) { /* create new aggregate */ + new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC); + if (new_agg == NULL) + return -ENOBUFS; + qfq_init_agg(q, new_agg, lmax, weight); + } + qfq_deact_rm_from_agg(q, cl); + qfq_add_to_agg(q, new_agg, cl); + + return 0; +} + static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)*arg; + bool existing = false; struct nlattr *tb[TCA_QFQ_MAX + 1]; + struct qfq_aggregate *new_agg = NULL; u32 weight, lmax, inv_w; - int i, err; + int err; int delta_w; if (tca[TCA_OPTIONS] == NULL) { @@ -231,25 +432,32 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } else weight = 1; - inv_w = ONE_FP / weight; - weight = ONE_FP / inv_w; - delta_w = weight - (cl ? ONE_FP / cl->inv_w : 0); - if (q->wsum + delta_w > QFQ_MAX_WSUM) { - pr_notice("qfq: total weight out of range (%u + %u)\n", - delta_w, q->wsum); - return -EINVAL; - } - if (tb[TCA_QFQ_LMAX]) { lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); - if (!lmax || lmax > (1UL << QFQ_MTU_SHIFT)) { + if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { pr_notice("qfq: invalid max length %u\n", lmax); return -EINVAL; } } else - lmax = 1UL << QFQ_MTU_SHIFT; + lmax = psched_mtu(qdisc_dev(sch)); + + inv_w = ONE_FP / weight; + weight = ONE_FP / inv_w; + + if (cl != NULL && + lmax == cl->agg->lmax && + weight == cl->agg->class_weight) + return 0; /* nothing to change */ + + delta_w = weight - (cl ? cl->agg->class_weight : 0); + + if (q->wsum + delta_w > QFQ_MAX_WSUM) { + pr_notice("qfq: total weight out of range (%d + %u)\n", + delta_w, q->wsum); + return -EINVAL; + } - if (cl != NULL) { + if (cl != NULL) { /* modify existing class */ if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, &cl->rate_est, qdisc_root_sleeping_lock(sch), @@ -257,27 +465,18 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (err) return err; } - - if (inv_w != cl->inv_w) { - sch_tree_lock(sch); - q->wsum += delta_w; - cl->inv_w = inv_w; - sch_tree_unlock(sch); - } - return 0; + existing = true; + goto set_change_agg; } + /* create and init new class */ cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL); if (cl == NULL) return -ENOBUFS; cl->refcnt = 1; cl->common.classid = classid; - cl->lmax = lmax; - cl->inv_w = inv_w; - i = qfq_calc_index(cl->inv_w, cl->lmax); - - cl->grp = &q->groups[i]; + cl->deficit = lmax; cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); @@ -288,13 +487,9 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, err = gen_new_estimator(&cl->bstats, &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); - if (err) { - qdisc_destroy(cl->qdisc); - kfree(cl); - return err; - } + if (err) + goto destroy_class; } - q->wsum += weight; sch_tree_lock(sch); qdisc_class_hash_insert(&q->clhash, &cl->common); @@ -302,19 +497,39 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, qdisc_class_hash_grow(sch, &q->clhash); +set_change_agg: + sch_tree_lock(sch); + new_agg = qfq_find_agg(q, lmax, weight); + if (new_agg == NULL) { /* create new aggregate */ + sch_tree_unlock(sch); + new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL); + if (new_agg == NULL) { + err = -ENOBUFS; + gen_kill_estimator(&cl->bstats, &cl->rate_est); + goto destroy_class; + } + sch_tree_lock(sch); + qfq_init_agg(q, new_agg, lmax, weight); + } + if (existing) + qfq_deact_rm_from_agg(q, cl); + qfq_add_to_agg(q, new_agg, cl); + sch_tree_unlock(sch); + *arg = (unsigned long)cl; return 0; + +destroy_class: + qdisc_destroy(cl->qdisc); + kfree(cl); + return err; } static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) { struct qfq_sched *q = qdisc_priv(sch); - if (cl->inv_w) { - q->wsum -= ONE_FP / cl->inv_w; - cl->inv_w = 0; - } - + qfq_rm_from_agg(q, cl); gen_kill_estimator(&cl->bstats, &cl->rate_est); qdisc_destroy(cl->qdisc); kfree(cl); @@ -429,8 +644,9 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; - NLA_PUT_U32(skb, TCA_QFQ_WEIGHT, ONE_FP/cl->inv_w); - NLA_PUT_U32(skb, TCA_QFQ_LMAX, cl->lmax); + if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) || + nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax)) + goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: @@ -447,8 +663,8 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, memset(&xstats, 0, sizeof(xstats)); cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; - xstats.weight = ONE_FP/cl->inv_w; - xstats.lmax = cl->lmax; + xstats.weight = cl->agg->class_weight; + xstats.lmax = cl->agg->lmax; if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || @@ -462,14 +678,13 @@ static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; - struct hlist_node *n; unsigned int i; if (arg->stop) return; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; @@ -599,45 +814,111 @@ static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F) * perhaps * old_V ^= q->V; - old_V >>= QFQ_MIN_SLOT_SHIFT; + old_V >>= q->min_slot_shift; if (old_V) { ... } * */ -static void qfq_make_eligible(struct qfq_sched *q, u64 old_V) +static void qfq_make_eligible(struct qfq_sched *q) { - unsigned long vslot = q->V >> QFQ_MIN_SLOT_SHIFT; - unsigned long old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; + unsigned long vslot = q->V >> q->min_slot_shift; + unsigned long old_vslot = q->oldV >> q->min_slot_shift; if (vslot != old_vslot) { - unsigned long mask = (1UL << fls(vslot ^ old_vslot)) - 1; + unsigned long mask; + int last_flip_pos = fls(vslot ^ old_vslot); + + if (last_flip_pos > 31) /* higher than the number of groups */ + mask = ~0UL; /* make all groups eligible */ + else + mask = (1UL << last_flip_pos) - 1; + qfq_move_groups(q, mask, IR, ER); qfq_move_groups(q, mask, IB, EB); } } - /* - * XXX we should make sure that slot becomes less than 32. - * This is guaranteed by the input values. - * roundedS is always cl->S rounded on grp->slot_shift bits. + * The index of the slot in which the input aggregate agg is to be + * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2' + * and not a '-1' because the start time of the group may be moved + * backward by one slot after the aggregate has been inserted, and + * this would cause non-empty slots to be right-shifted by one + * position. + * + * QFQ+ fully satisfies this bound to the slot index if the parameters + * of the classes are not changed dynamically, and if QFQ+ never + * happens to postpone the service of agg unjustly, i.e., it never + * happens that the aggregate becomes backlogged and eligible, or just + * eligible, while an aggregate with a higher approximated finish time + * is being served. In particular, in this case QFQ+ guarantees that + * the timestamps of agg are low enough that the slot index is never + * higher than 2. Unfortunately, QFQ+ cannot provide the same + * guarantee if it happens to unjustly postpone the service of agg, or + * if the parameters of some class are changed. + * + * As for the first event, i.e., an out-of-order service, the + * upper bound to the slot index guaranteed by QFQ+ grows to + * 2 + + * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) * + * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1. + * + * The following function deals with this problem by backward-shifting + * the timestamps of agg, if needed, so as to guarantee that the slot + * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may + * cause the service of other aggregates to be postponed, yet the + * worst-case guarantees of these aggregates are not violated. In + * fact, in case of no out-of-order service, the timestamps of agg + * would have been even lower than they are after the backward shift, + * because QFQ+ would have guaranteed a maximum value equal to 2 for + * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose + * service is postponed because of the backward-shift would have + * however waited for the service of agg before being served. + * + * The other event that may cause the slot index to be higher than 2 + * for agg is a recent change of the parameters of some class. If the + * weight of a class is increased or the lmax (max_pkt_size) of the + * class is decreased, then a new aggregate with smaller slot size + * than the original parent aggregate of the class may happen to be + * activated. The activation of this aggregate should be properly + * delayed to when the service of the class has finished in the ideal + * system tracked by QFQ+. If the activation of the aggregate is not + * delayed to this reference time instant, then this aggregate may be + * unjustly served before other aggregates waiting for service. This + * may cause the above bound to the slot index to be violated for some + * of these unlucky aggregates. + * + * Instead of delaying the activation of the new aggregate, which is + * quite complex, the above-discussed capping of the slot index is + * used to handle also the consequences of a change of the parameters + * of a class. */ -static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, +static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg, u64 roundedS) { u64 slot = (roundedS - grp->S) >> grp->slot_shift; - unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS; + unsigned int i; /* slot index in the bucket list */ + + if (unlikely(slot > QFQ_MAX_SLOTS - 2)) { + u64 deltaS = roundedS - grp->S - + ((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift); + agg->S -= deltaS; + agg->F -= deltaS; + slot = QFQ_MAX_SLOTS - 2; + } - hlist_add_head(&cl->next, &grp->slots[i]); + i = (grp->front + slot) % QFQ_MAX_SLOTS; + + hlist_add_head(&agg->next, &grp->slots[i]); __set_bit(slot, &grp->full_slots); } /* Maybe introduce hlist_first_entry?? */ -static struct qfq_class *qfq_slot_head(struct qfq_group *grp) +static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp) { return hlist_entry(grp->slots[grp->front].first, - struct qfq_class, next); + struct qfq_aggregate, next); } /* @@ -645,20 +926,20 @@ static struct qfq_class *qfq_slot_head(struct qfq_group *grp) */ static void qfq_front_slot_remove(struct qfq_group *grp) { - struct qfq_class *cl = qfq_slot_head(grp); + struct qfq_aggregate *agg = qfq_slot_head(grp); - BUG_ON(!cl); - hlist_del(&cl->next); + BUG_ON(!agg); + hlist_del(&agg->next); if (hlist_empty(&grp->slots[grp->front])) __clear_bit(0, &grp->full_slots); } /* - * Returns the first full queue in a group. As a side effect, - * adjust the bucket list so the first non-empty bucket is at - * position 0 in full_slots. + * Returns the first aggregate in the first non-empty bucket of the + * group. As a side effect, adjusts the bucket list so the first + * non-empty bucket is at position 0 in full_slots. */ -static struct qfq_class *qfq_slot_scan(struct qfq_group *grp) +static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp) { unsigned int i; @@ -694,7 +975,7 @@ static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS) grp->front = (grp->front - i) % QFQ_MAX_SLOTS; } -static void qfq_update_eligible(struct qfq_sched *q, u64 old_V) +static void qfq_update_eligible(struct qfq_sched *q) { struct qfq_group *grp; unsigned long ineligible; @@ -706,147 +987,239 @@ static void qfq_update_eligible(struct qfq_sched *q, u64 old_V) if (qfq_gt(grp->S, q->V)) q->V = grp->S; } - qfq_make_eligible(q, old_V); + qfq_make_eligible(q); } } -/* What is length of next packet in queue (0 if queue is empty) */ -static unsigned int qdisc_peek_len(struct Qdisc *sch) -{ - struct sk_buff *skb; - - skb = sch->ops->peek(sch); - return skb ? qdisc_pkt_len(skb) : 0; -} - -/* - * Updates the class, returns true if also the group needs to be updated. - */ -static bool qfq_update_class(struct qfq_group *grp, struct qfq_class *cl) +/* Dequeue head packet of the head class in the DRR queue of the aggregate. */ +static void agg_dequeue(struct qfq_aggregate *agg, + struct qfq_class *cl, unsigned int len) { - unsigned int len = qdisc_peek_len(cl->qdisc); + qdisc_dequeue_peeked(cl->qdisc); - cl->S = cl->F; - if (!len) - qfq_front_slot_remove(grp); /* queue is empty */ - else { - u64 roundedS; - - cl->F = cl->S + (u64)len * cl->inv_w; - roundedS = qfq_round_down(cl->S, grp->slot_shift); - if (roundedS == grp->S) - return false; + cl->deficit -= (int) len; - qfq_front_slot_remove(grp); - qfq_slot_insert(grp, cl, roundedS); + if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */ + list_del(&cl->alist); + else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) { + cl->deficit += agg->lmax; + list_move_tail(&cl->alist, &agg->active); } - - return true; } -static struct sk_buff *qfq_dequeue(struct Qdisc *sch) +static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg, + struct qfq_class **cl, + unsigned int *len) { - struct qfq_sched *q = qdisc_priv(sch); - struct qfq_group *grp; - struct qfq_class *cl; struct sk_buff *skb; - unsigned int len; - u64 old_V; - if (!q->bitmaps[ER]) - return NULL; - - grp = qfq_ffs(q, q->bitmaps[ER]); - - cl = qfq_slot_head(grp); - skb = qdisc_dequeue_peeked(cl->qdisc); - if (!skb) { + *cl = list_first_entry(&agg->active, struct qfq_class, alist); + skb = (*cl)->qdisc->ops->peek((*cl)->qdisc); + if (skb == NULL) WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n"); - return NULL; - } + else + *len = qdisc_pkt_len(skb); - sch->q.qlen--; - qdisc_bstats_update(sch, skb); - - old_V = q->V; - len = qdisc_pkt_len(skb); - q->V += (u64)len * IWSUM; - pr_debug("qfq dequeue: len %u F %lld now %lld\n", - len, (unsigned long long) cl->F, (unsigned long long) q->V); - - if (qfq_update_class(grp, cl)) { - u64 old_F = grp->F; - - cl = qfq_slot_scan(grp); - if (!cl) - __clear_bit(grp->index, &q->bitmaps[ER]); - else { - u64 roundedS = qfq_round_down(cl->S, grp->slot_shift); - unsigned int s; - - if (grp->S == roundedS) - goto skip_unblock; - grp->S = roundedS; - grp->F = roundedS + (2ULL << grp->slot_shift); - __clear_bit(grp->index, &q->bitmaps[ER]); - s = qfq_calc_state(q, grp); - __set_bit(grp->index, &q->bitmaps[s]); - } - - qfq_unblock_groups(q, grp->index, old_F); - } + return skb; +} -skip_unblock: - qfq_update_eligible(q, old_V); +/* Update F according to the actual service received by the aggregate. */ +static inline void charge_actual_service(struct qfq_aggregate *agg) +{ + /* Compute the service received by the aggregate, taking into + * account that, after decreasing the number of classes in + * agg, it may happen that + * agg->initial_budget - agg->budget > agg->bugdetmax + */ + u32 service_received = min(agg->budgetmax, + agg->initial_budget - agg->budget); - return skb; + agg->F = agg->S + (u64)service_received * agg->inv_w; } -/* - * Assign a reasonable start time for a new flow k in group i. +/* Assign a reasonable start time for a new aggregate in group i. * Admissible values for \hat(F) are multiples of \sigma_i * no greater than V+\sigma_i . Larger values mean that * we had a wraparound so we consider the timestamp to be stale. * * If F is not stale and F >= V then we set S = F. * Otherwise we should assign S = V, but this may violate - * the ordering in ER. So, if we have groups in ER, set S to - * the F_j of the first group j which would be blocking us. + * the ordering in EB (see [2]). So, if we have groups in ER, + * set S to the F_j of the first group j which would be blocking us. * We are guaranteed not to move S backward because * otherwise our group i would still be blocked. */ -static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) +static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg) { unsigned long mask; u64 limit, roundedF; - int slot_shift = cl->grp->slot_shift; + int slot_shift = agg->grp->slot_shift; - roundedF = qfq_round_down(cl->F, slot_shift); + roundedF = qfq_round_down(agg->F, slot_shift); limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); - if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { + if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) { /* timestamp was stale */ - mask = mask_from(q->bitmaps[ER], cl->grp->index); + mask = mask_from(q->bitmaps[ER], agg->grp->index); if (mask) { struct qfq_group *next = qfq_ffs(q, mask); if (qfq_gt(roundedF, next->F)) { - cl->S = next->F; + if (qfq_gt(limit, next->F)) + agg->S = next->F; + else /* preserve timestamp correctness */ + agg->S = limit; return; } } - cl->S = q->V; + agg->S = q->V; } else /* timestamp is not stale */ - cl->S = cl->F; + agg->S = agg->F; } -static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +/* Update the timestamps of agg before scheduling/rescheduling it for + * service. In particular, assign to agg->F its maximum possible + * value, i.e., the virtual finish time with which the aggregate + * should be labeled if it used all its budget once in service. + */ +static inline void +qfq_update_agg_ts(struct qfq_sched *q, + struct qfq_aggregate *agg, enum update_reason reason) +{ + if (reason != requeue) + qfq_update_start(q, agg); + else /* just charge agg for the service received */ + agg->S = agg->F; + + agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w; +} + +static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg); + +static struct sk_buff *qfq_dequeue(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); + struct qfq_aggregate *in_serv_agg = q->in_serv_agg; + struct qfq_class *cl; + struct sk_buff *skb = NULL; + /* next-packet len, 0 means no more active classes in in-service agg */ + unsigned int len = 0; + + if (in_serv_agg == NULL) + return NULL; + + if (!list_empty(&in_serv_agg->active)) + skb = qfq_peek_skb(in_serv_agg, &cl, &len); + + /* + * If there are no active classes in the in-service aggregate, + * or if the aggregate has not enough budget to serve its next + * class, then choose the next aggregate to serve. + */ + if (len == 0 || in_serv_agg->budget < len) { + charge_actual_service(in_serv_agg); + + /* recharge the budget of the aggregate */ + in_serv_agg->initial_budget = in_serv_agg->budget = + in_serv_agg->budgetmax; + + if (!list_empty(&in_serv_agg->active)) { + /* + * Still active: reschedule for + * service. Possible optimization: if no other + * aggregate is active, then there is no point + * in rescheduling this aggregate, and we can + * just keep it as the in-service one. This + * should be however a corner case, and to + * handle it, we would need to maintain an + * extra num_active_aggs field. + */ + qfq_update_agg_ts(q, in_serv_agg, requeue); + qfq_schedule_agg(q, in_serv_agg); + } else if (sch->q.qlen == 0) { /* no aggregate to serve */ + q->in_serv_agg = NULL; + return NULL; + } + + /* + * If we get here, there are other aggregates queued: + * choose the new aggregate to serve. + */ + in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q); + skb = qfq_peek_skb(in_serv_agg, &cl, &len); + } + if (!skb) + return NULL; + + sch->q.qlen--; + qdisc_bstats_update(sch, skb); + + agg_dequeue(in_serv_agg, cl, len); + /* If lmax is lowered, through qfq_change_class, for a class + * owning pending packets with larger size than the new value + * of lmax, then the following condition may hold. + */ + if (unlikely(in_serv_agg->budget < len)) + in_serv_agg->budget = 0; + else + in_serv_agg->budget -= len; + + q->V += (u64)len * q->iwsum; + pr_debug("qfq dequeue: len %u F %lld now %lld\n", + len, (unsigned long long) in_serv_agg->F, + (unsigned long long) q->V); + + return skb; +} + +static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q) +{ struct qfq_group *grp; + struct qfq_aggregate *agg, *new_front_agg; + u64 old_F; + + qfq_update_eligible(q); + q->oldV = q->V; + + if (!q->bitmaps[ER]) + return NULL; + + grp = qfq_ffs(q, q->bitmaps[ER]); + old_F = grp->F; + + agg = qfq_slot_head(grp); + + /* agg starts to be served, remove it from schedule */ + qfq_front_slot_remove(grp); + + new_front_agg = qfq_slot_scan(grp); + + if (new_front_agg == NULL) /* group is now inactive, remove from ER */ + __clear_bit(grp->index, &q->bitmaps[ER]); + else { + u64 roundedS = qfq_round_down(new_front_agg->S, + grp->slot_shift); + unsigned int s; + + if (grp->S == roundedS) + return agg; + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + __clear_bit(grp->index, &q->bitmaps[ER]); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + } + + qfq_unblock_groups(q, grp->index, old_F); + + return agg; +} + +static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; - int err; - u64 roundedS; - int s; + struct qfq_aggregate *agg; + int err = 0; cl = qfq_classify(skb, sch, &err); if (cl == NULL) { @@ -857,6 +1230,15 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) } pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); + if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) { + pr_debug("qfq: increasing maxpkt from %u to %u for class %u", + cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid); + err = qfq_change_agg(sch, cl, cl->agg->class_weight, + qdisc_pkt_len(skb)); + if (err) + return err; + } + err = qdisc_enqueue(skb, cl->qdisc); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); @@ -870,21 +1252,44 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) bstats_update(&cl->bstats, skb); ++sch->q.qlen; - /* If the new skb is not the head of queue, then done here. */ - if (cl->qdisc->q.qlen != 1) + agg = cl->agg; + /* if the queue was not empty, then done here */ + if (cl->qdisc->q.qlen != 1) { + if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) && + list_first_entry(&agg->active, struct qfq_class, alist) + == cl && cl->deficit < qdisc_pkt_len(skb)) + list_move_tail(&cl->alist, &agg->active); + return err; + } + + /* schedule class for service within the aggregate */ + cl->deficit = agg->lmax; + list_add_tail(&cl->alist, &agg->active); + + if (list_first_entry(&agg->active, struct qfq_class, alist) != cl || + q->in_serv_agg == agg) + return err; /* non-empty or in service, nothing else to do */ + + qfq_activate_agg(q, agg, enqueue); + + return err; +} - /* If reach this point, queue q was idle */ - grp = cl->grp; - qfq_update_start(q, cl); +/* + * Schedule aggregate according to its timestamps. + */ +static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg) +{ + struct qfq_group *grp = agg->grp; + u64 roundedS; + int s; - /* compute new finish time and rounded start. */ - cl->F = cl->S + (u64)qdisc_pkt_len(skb) * cl->inv_w; - roundedS = qfq_round_down(cl->S, grp->slot_shift); + roundedS = qfq_round_down(agg->S, grp->slot_shift); /* - * insert cl in the correct bucket. - * If cl->S >= grp->S we don't need to adjust the + * Insert agg in the correct bucket. + * If agg->S >= grp->S we don't need to adjust the * bucket list and simply go to the insertion phase. * Otherwise grp->S is decreasing, we must make room * in the bucket list, and also recompute the group state. @@ -892,15 +1297,16 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) * was in ER make sure to adjust V. */ if (grp->full_slots) { - if (!qfq_gt(grp->S, cl->S)) + if (!qfq_gt(grp->S, agg->S)) goto skip_update; - /* create a slot for this cl->S */ + /* create a slot for this agg->S */ qfq_slot_rotate(grp, roundedS); /* group was surely ineligible, remove */ __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[IB]); - } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V)) + } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) && + q->in_serv_agg == NULL) q->V = roundedS; grp->S = roundedS; @@ -910,48 +1316,68 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n", s, q->bitmaps[s], - (unsigned long long) cl->S, - (unsigned long long) cl->F, + (unsigned long long) agg->S, + (unsigned long long) agg->F, (unsigned long long) q->V); skip_update: - qfq_slot_insert(grp, cl, roundedS); - - return err; + qfq_slot_insert(grp, agg, roundedS); } +/* Update agg ts and schedule agg for service */ +static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg, + enum update_reason reason) +{ + agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */ + + qfq_update_agg_ts(q, agg, reason); + if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */ + q->in_serv_agg = agg; /* start serving this aggregate */ + /* update V: to be in service, agg must be eligible */ + q->oldV = q->V = agg->S; + } else if (agg != q->in_serv_agg) + qfq_schedule_agg(q, agg); +} + static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, - struct qfq_class *cl) + struct qfq_aggregate *agg) { unsigned int i, offset; u64 roundedS; - roundedS = qfq_round_down(cl->S, grp->slot_shift); + roundedS = qfq_round_down(agg->S, grp->slot_shift); offset = (roundedS - grp->S) >> grp->slot_shift; + i = (grp->front + offset) % QFQ_MAX_SLOTS; - hlist_del(&cl->next); + hlist_del(&agg->next); if (hlist_empty(&grp->slots[i])) __clear_bit(offset, &grp->full_slots); } /* - * called to forcibly destroy a queue. - * If the queue is not in the front bucket, or if it has - * other queues in the front bucket, we can simply remove - * the queue with no other side effects. + * Called to forcibly deschedule an aggregate. If the aggregate is + * not in the front bucket, or if the latter has other aggregates in + * the front bucket, we can simply remove the aggregate with no other + * side effects. * Otherwise we must propagate the event up. */ -static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) +static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { - struct qfq_group *grp = cl->grp; + struct qfq_group *grp = agg->grp; unsigned long mask; u64 roundedS; int s; - cl->F = cl->S; - qfq_slot_remove(q, grp, cl); + if (agg == q->in_serv_agg) { + charge_actual_service(agg); + q->in_serv_agg = qfq_choose_next_agg(q); + return; + } + + agg->F = agg->S; + qfq_slot_remove(q, grp, agg); if (!grp->full_slots) { __clear_bit(grp->index, &q->bitmaps[IR]); @@ -970,8 +1396,8 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) } __clear_bit(grp->index, &q->bitmaps[ER]); } else if (hlist_empty(&grp->slots[grp->front])) { - cl = qfq_slot_scan(grp); - roundedS = qfq_round_down(cl->S, grp->slot_shift); + agg = qfq_slot_scan(grp); + roundedS = qfq_round_down(agg->S, grp->slot_shift); if (grp->S != roundedS) { __clear_bit(grp->index, &q->bitmaps[ER]); __clear_bit(grp->index, &q->bitmaps[IR]); @@ -983,8 +1409,6 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) __set_bit(grp->index, &q->bitmaps[s]); } } - - qfq_update_eligible(q, q->V); } static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) @@ -996,6 +1420,31 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) qfq_deactivate_class(q, cl); } +static unsigned int qfq_drop_from_slot(struct qfq_sched *q, + struct hlist_head *slot) +{ + struct qfq_aggregate *agg; + struct qfq_class *cl; + unsigned int len; + + hlist_for_each_entry(agg, slot, next) { + list_for_each_entry(cl, &agg->active, alist) { + + if (!cl->qdisc->ops->drop) + continue; + + len = cl->qdisc->ops->drop(cl->qdisc); + if (len > 0) { + if (cl->qdisc->q.qlen == 0) + qfq_deactivate_class(q, cl); + + return len; + } + } + } + return 0; +} + static unsigned int qfq_drop(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); @@ -1005,24 +1454,13 @@ static unsigned int qfq_drop(struct Qdisc *sch) for (i = 0; i <= QFQ_MAX_INDEX; i++) { grp = &q->groups[i]; for (j = 0; j < QFQ_MAX_SLOTS; j++) { - struct qfq_class *cl; - struct hlist_node *n; - - hlist_for_each_entry(cl, n, &grp->slots[j], next) { - - if (!cl->qdisc->ops->drop) - continue; - - len = cl->qdisc->ops->drop(cl->qdisc); - if (len > 0) { - sch->q.qlen--; - if (!cl->qdisc->q.qlen) - qfq_deactivate_class(q, cl); - - return len; - } + len = qfq_drop_from_slot(q, &grp->slots[j]); + if (len > 0) { + sch->q.qlen--; + return len; } } + } return 0; @@ -1033,44 +1471,50 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) struct qfq_sched *q = qdisc_priv(sch); struct qfq_group *grp; int i, j, err; + u32 max_cl_shift, maxbudg_shift, max_classes; err = qdisc_class_hash_init(&q->clhash); if (err < 0) return err; + if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES) + max_classes = QFQ_MAX_AGG_CLASSES; + else + max_classes = qdisc_dev(sch)->tx_queue_len + 1; + /* max_cl_shift = floor(log_2(max_classes)) */ + max_cl_shift = __fls(max_classes); + q->max_agg_classes = 1<<max_cl_shift; + + /* maxbudg_shift = log2(max_len * max_classes_per_agg) */ + maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift; + q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX; + for (i = 0; i <= QFQ_MAX_INDEX; i++) { grp = &q->groups[i]; grp->index = i; - grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS - - (QFQ_MAX_INDEX - i); + grp->slot_shift = q->min_slot_shift + i; for (j = 0; j < QFQ_MAX_SLOTS; j++) INIT_HLIST_HEAD(&grp->slots[j]); } + INIT_HLIST_HEAD(&q->nonfull_aggs); + return 0; } static void qfq_reset_qdisc(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); - struct qfq_group *grp; struct qfq_class *cl; - struct hlist_node *n, *tmp; - unsigned int i, j; + unsigned int i; - for (i = 0; i <= QFQ_MAX_INDEX; i++) { - grp = &q->groups[i]; - for (j = 0; j < QFQ_MAX_SLOTS; j++) { - hlist_for_each_entry_safe(cl, n, tmp, - &grp->slots[j], next) { + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { + if (cl->qdisc->q.qlen > 0) qfq_deactivate_class(q, cl); - } - } - } - for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) qdisc_reset(cl->qdisc); + } } sch->q.qlen = 0; } @@ -1079,13 +1523,13 @@ static void qfq_destroy_qdisc(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; - struct hlist_node *n, *next; + struct hlist_node *next; unsigned int i; tcf_destroy_chain(&q->filter_list); for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) { qfq_destroy_class(sch, cl); } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index a5cc3012cf4..633e32defdc 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -272,8 +272,9 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; - NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); - NLA_PUT_U32(skb, TCA_RED_MAX_P, q->parms.max_P); + if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || + nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P)) + goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index d7eea99333e..9b0f7093d97 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -220,7 +220,7 @@ static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_da static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q) { - q->bins[slot].perturbation = net_random(); + q->bins[slot].perturbation = prandom_u32(); } static void sfb_swap_slot(struct sfb_sched_data *q) @@ -381,7 +381,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) goto enqueue; } - r = net_random() & SFB_MAX_PROB; + r = prandom_u32() & SFB_MAX_PROB; if (unlikely(r < p_min)) { if (unlikely(p_min > SFB_MAX_PROB / 2)) { @@ -570,7 +570,10 @@ static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb) sch->qstats.backlog = q->qdisc->qstats.backlog; opts = nla_nest_start(skb, TCA_OPTIONS); - NLA_PUT(skb, TCA_SFB_PARMS, sizeof(opt), &opt); + if (opts == NULL) + goto nla_put_failure; + if (nla_put(skb, TCA_SFB_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 60d47180f04..1af2f73906d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -237,10 +237,12 @@ static inline void sfq_link(struct sfq_sched_data *q, sfq_index x) } #define sfq_unlink(q, x, n, p) \ - n = q->slots[x].dep.next; \ - p = q->slots[x].dep.prev; \ - sfq_dep_head(q, p)->next = n; \ - sfq_dep_head(q, n)->prev = p + do { \ + n = q->slots[x].dep.next; \ + p = q->slots[x].dep.prev; \ + sfq_dep_head(q, p)->next = n; \ + sfq_dep_head(q, n)->prev = p; \ + } while (0) static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x) @@ -469,11 +471,15 @@ enqueue: if (slot->qlen == 1) { /* The flow is new */ if (q->tail == NULL) { /* It is the first flow */ slot->next = x; - q->tail = slot; } else { slot->next = q->tail->next; q->tail->next = x; } + /* We put this flow at the end of our flow list. + * This might sound unfair for a new flow to wait after old ones, + * but we could endup servicing new flows only, and freeze old ones. + */ + q->tail = slot; /* We could use a bigger initial quantum for new flows */ slot->allot = q->scaled_quantum; } @@ -623,7 +629,7 @@ static void sfq_perturbation(unsigned long arg) spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); - q->perturbation = net_random(); + q->perturbation = prandom_u32(); if (!q->filter_list && q->tail) sfq_rehash(sch); spin_unlock(root_lock); @@ -692,7 +698,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) del_timer(&q->perturb_timer); if (q->perturb_period) { mod_timer(&q->perturb_timer, jiffies + q->perturb_period); - q->perturbation = net_random(); + q->perturbation = prandom_u32(); } sch_tree_unlock(sch); kfree(p); @@ -710,12 +716,7 @@ static void *sfq_alloc(size_t sz) static void sfq_free(void *addr) { - if (addr) { - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); - } + kvfree(addr); } static void sfq_destroy(struct Qdisc *sch) @@ -753,7 +754,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) q->quantum = psched_mtu(qdisc_dev(sch)); q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); q->perturb_period = 0; - q->perturbation = net_random(); + q->perturbation = prandom_u32(); if (opt) { int err = sfq_change(sch, opt); @@ -808,7 +809,8 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) memcpy(&opt.stats, &q->stats, sizeof(opt.stats)); opt.flags = q->flags; - NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index b8e156319d7..18ff6343370 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -19,6 +19,7 @@ #include <linux/errno.h> #include <linux/skbuff.h> #include <net/netlink.h> +#include <net/sch_generic.h> #include <net/pkt_sched.h> @@ -100,31 +101,103 @@ struct tbf_sched_data { /* Parameters */ u32 limit; /* Maximal length of backlog: bytes */ - u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ - u32 mtu; u32 max_size; - struct qdisc_rate_table *R_tab; - struct qdisc_rate_table *P_tab; + s64 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ + s64 mtu; + struct psched_ratecfg rate; + struct psched_ratecfg peak; /* Variables */ - long tokens; /* Current number of B tokens */ - long ptokens; /* Current number of P tokens */ - psched_time_t t_c; /* Time check-point */ + s64 tokens; /* Current number of B tokens */ + s64 ptokens; /* Current number of P tokens */ + s64 t_c; /* Time check-point */ struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */ struct qdisc_watchdog watchdog; /* Watchdog timer */ }; -#define L2T(q, L) qdisc_l2t((q)->R_tab, L) -#define L2T_P(q, L) qdisc_l2t((q)->P_tab, L) + +/* Time to Length, convert time in ns to length in bytes + * to determinate how many bytes can be sent in given time. + */ +static u64 psched_ns_t2l(const struct psched_ratecfg *r, + u64 time_in_ns) +{ + /* The formula is : + * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC + */ + u64 len = time_in_ns * r->rate_bytes_ps; + + do_div(len, NSEC_PER_SEC); + + if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) { + do_div(len, 53); + len = len * 48; + } + + if (len > r->overhead) + len -= r->overhead; + else + len = 0; + + return len; +} + +/* + * Return length of individual segments of a gso packet, + * including all headers (MAC, IP, TCP/UDP) + */ +static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + return hdr_len + skb_gso_transport_seglen(skb); +} + +/* GSO packet is too big, segment it so that tbf can transmit + * each segment in time + */ +static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + struct sk_buff *segs, *nskb; + netdev_features_t features = netif_skb_features(skb); + int ret, nb; + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + + if (IS_ERR_OR_NULL(segs)) + return qdisc_reshape_fail(skb, sch); + + nb = 0; + while (segs) { + nskb = segs->next; + segs->next = NULL; + qdisc_skb_cb(segs)->pkt_len = segs->len; + ret = qdisc_enqueue(segs, q->qdisc); + if (ret != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + } else { + nb++; + } + segs = nskb; + } + sch->q.qlen += nb; + if (nb > 1) + qdisc_tree_decrease_qlen(sch, 1 - nb); + consume_skb(skb); + return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; +} static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); int ret; - if (qdisc_pkt_len(skb) > q->max_size) + if (qdisc_pkt_len(skb) > q->max_size) { + if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size) + return tbf_segment(skb, sch); return qdisc_reshape_fail(skb, sch); - + } ret = qdisc_enqueue(skb, q->qdisc); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) @@ -148,6 +221,11 @@ static unsigned int tbf_drop(struct Qdisc *sch) return len; } +static bool tbf_peak_present(const struct tbf_sched_data *q) +{ + return q->peak.rate_bytes_ps; +} + static struct sk_buff *tbf_dequeue(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); @@ -156,24 +234,24 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) skb = q->qdisc->ops->peek(q->qdisc); if (skb) { - psched_time_t now; - long toks; - long ptoks = 0; + s64 now; + s64 toks; + s64 ptoks = 0; unsigned int len = qdisc_pkt_len(skb); - now = psched_get_time(); - toks = psched_tdiff_bounded(now, q->t_c, q->buffer); + now = ktime_to_ns(ktime_get()); + toks = min_t(s64, now - q->t_c, q->buffer); - if (q->P_tab) { + if (tbf_peak_present(q)) { ptoks = toks + q->ptokens; - if (ptoks > (long)q->mtu) + if (ptoks > q->mtu) ptoks = q->mtu; - ptoks -= L2T_P(q, len); + ptoks -= (s64) psched_l2t_ns(&q->peak, len); } toks += q->tokens; - if (toks > (long)q->buffer) + if (toks > q->buffer) toks = q->buffer; - toks -= L2T(q, len); + toks -= (s64) psched_l2t_ns(&q->rate, len); if ((toks|ptoks) >= 0) { skb = qdisc_dequeue_peeked(q->qdisc); @@ -189,8 +267,8 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) return skb; } - qdisc_watchdog_schedule(&q->watchdog, - now + max_t(long, -toks, -ptoks)); + qdisc_watchdog_schedule_ns(&q->watchdog, + now + max_t(long, -toks, -ptoks)); /* Maybe we have a shorter packet in the queue, which can be sent now. It sounds cool, @@ -214,7 +292,7 @@ static void tbf_reset(struct Qdisc *sch) qdisc_reset(q->qdisc); sch->q.qlen = 0; - q->t_c = psched_get_time(); + q->t_c = ktime_to_ns(ktime_get()); q->tokens = q->buffer; q->ptokens = q->mtu; qdisc_watchdog_cancel(&q->watchdog); @@ -224,20 +302,26 @@ static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = { [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) }, [TCA_TBF_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, [TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_TBF_RATE64] = { .type = NLA_U64 }, + [TCA_TBF_PRATE64] = { .type = NLA_U64 }, + [TCA_TBF_BURST] = { .type = NLA_U32 }, + [TCA_TBF_PBURST] = { .type = NLA_U32 }, }; static int tbf_change(struct Qdisc *sch, struct nlattr *opt) { int err; struct tbf_sched_data *q = qdisc_priv(sch); - struct nlattr *tb[TCA_TBF_PTAB + 1]; + struct nlattr *tb[TCA_TBF_MAX + 1]; struct tc_tbf_qopt *qopt; - struct qdisc_rate_table *rtab = NULL; - struct qdisc_rate_table *ptab = NULL; struct Qdisc *child = NULL; - int max_size, n; + struct psched_ratecfg rate; + struct psched_ratecfg peak; + u64 max_size; + s64 buffer, mtu; + u64 rate64 = 0, prate64 = 0; - err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy); + err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy); if (err < 0) return err; @@ -246,33 +330,59 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) goto done; qopt = nla_data(tb[TCA_TBF_PARMS]); - rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]); - if (rtab == NULL) - goto done; + if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&qopt->rate, + tb[TCA_TBF_RTAB])); + + if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE) + qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate, + tb[TCA_TBF_PTAB])); + + buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U); + mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U); + + if (tb[TCA_TBF_RATE64]) + rate64 = nla_get_u64(tb[TCA_TBF_RATE64]); + psched_ratecfg_precompute(&rate, &qopt->rate, rate64); + + if (tb[TCA_TBF_BURST]) { + max_size = nla_get_u32(tb[TCA_TBF_BURST]); + buffer = psched_l2t_ns(&rate, max_size); + } else { + max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U); + } if (qopt->peakrate.rate) { - if (qopt->peakrate.rate > qopt->rate.rate) - ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]); - if (ptab == NULL) + if (tb[TCA_TBF_PRATE64]) + prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]); + psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64); + if (peak.rate_bytes_ps <= rate.rate_bytes_ps) { + pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n", + peak.rate_bytes_ps, rate.rate_bytes_ps); + err = -EINVAL; goto done; - } + } - for (n = 0; n < 256; n++) - if (rtab->data[n] > qopt->buffer) - break; - max_size = (n << qopt->rate.cell_log) - 1; - if (ptab) { - int size; - - for (n = 0; n < 256; n++) - if (ptab->data[n] > qopt->mtu) - break; - size = (n << qopt->peakrate.cell_log) - 1; - if (size < max_size) - max_size = size; + if (tb[TCA_TBF_PBURST]) { + u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]); + max_size = min_t(u32, max_size, pburst); + mtu = psched_l2t_ns(&peak, pburst); + } else { + max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu)); + } + } else { + memset(&peak, 0, sizeof(peak)); } - if (max_size < 0) + + if (max_size < psched_mtu(qdisc_dev(sch))) + pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n", + max_size, qdisc_dev(sch)->name, + psched_mtu(qdisc_dev(sch))); + + if (!max_size) { + err = -EINVAL; goto done; + } if (q->qdisc != &noop_qdisc) { err = fifo_set_limit(q->qdisc, qopt->limit); @@ -293,22 +403,24 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) q->qdisc = child; } q->limit = qopt->limit; - q->mtu = qopt->mtu; + if (tb[TCA_TBF_PBURST]) + q->mtu = mtu; + else + q->mtu = PSCHED_TICKS2NS(qopt->mtu); q->max_size = max_size; - q->buffer = qopt->buffer; + if (tb[TCA_TBF_BURST]) + q->buffer = buffer; + else + q->buffer = PSCHED_TICKS2NS(qopt->buffer); q->tokens = q->buffer; q->ptokens = q->mtu; - swap(q->R_tab, rtab); - swap(q->P_tab, ptab); + memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg)); + memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg)); sch_tree_unlock(sch); err = 0; done: - if (rtab) - qdisc_put_rtab(rtab); - if (ptab) - qdisc_put_rtab(ptab); return err; } @@ -319,7 +431,7 @@ static int tbf_init(struct Qdisc *sch, struct nlattr *opt) if (opt == NULL) return -EINVAL; - q->t_c = psched_get_time(); + q->t_c = ktime_to_ns(ktime_get()); qdisc_watchdog_init(&q->watchdog, sch); q->qdisc = &noop_qdisc; @@ -331,12 +443,6 @@ static void tbf_destroy(struct Qdisc *sch) struct tbf_sched_data *q = qdisc_priv(sch); qdisc_watchdog_cancel(&q->watchdog); - - if (q->P_tab) - qdisc_put_rtab(q->P_tab); - if (q->R_tab) - qdisc_put_rtab(q->R_tab); - qdisc_destroy(q->qdisc); } @@ -352,17 +458,24 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; opt.limit = q->limit; - opt.rate = q->R_tab->rate; - if (q->P_tab) - opt.peakrate = q->P_tab->rate; + psched_ratecfg_getrate(&opt.rate, &q->rate); + if (tbf_peak_present(q)) + psched_ratecfg_getrate(&opt.peakrate, &q->peak); else memset(&opt.peakrate, 0, sizeof(opt.peakrate)); - opt.mtu = q->mtu; - opt.buffer = q->buffer; - NLA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt); + opt.mtu = PSCHED_NS2TICKS(q->mtu); + opt.buffer = PSCHED_NS2TICKS(q->buffer); + if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + if (q->rate.rate_bytes_ps >= (1ULL << 32) && + nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps)) + goto nla_put_failure; + if (tbf_peak_present(q) && + q->peak.rate_bytes_ps >= (1ULL << 32) && + nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps)) + goto nla_put_failure; - nla_nest_end(skb, nest); - return skb->len; + return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 45326599fda..47416716294 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -67,7 +67,6 @@ struct teql_master { struct teql_sched_data { struct Qdisc *next; struct teql_master *m; - struct neighbour *ncache; struct sk_buff_head q; }; @@ -88,9 +87,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } - kfree_skb(skb); - sch->qstats.drops++; - return NET_XMIT_DROP; + return qdisc_drop(skb, sch); } static struct sk_buff * @@ -136,7 +133,6 @@ teql_reset(struct Qdisc *sch) skb_queue_purge(&dat->q); sch->q.qlen = 0; - teql_neigh_release(xchg(&dat->ncache, NULL)); } static void @@ -168,7 +164,6 @@ teql_destroy(struct Qdisc *sch) } } skb_queue_purge(&dat->q); - teql_neigh_release(xchg(&dat->ncache, NULL)); break; } @@ -227,21 +222,25 @@ static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt) static int __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev, struct netdev_queue *txq, - struct neighbour *mn) + struct dst_entry *dst) { - struct teql_sched_data *q = qdisc_priv(txq->qdisc); - struct neighbour *n = q->ncache; + struct neighbour *n; + int err = 0; - if (mn->tbl == NULL) - return -EINVAL; - if (n && n->tbl == mn->tbl && - memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) { - atomic_inc(&n->refcnt); - } else { - n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev); - if (IS_ERR(n)) - return PTR_ERR(n); + n = dst_neigh_lookup_skb(dst, skb); + if (!n) + return -ENOENT; + + if (dst->dev != dev) { + struct neighbour *mn; + + mn = __neigh_lookup_errno(n->tbl, n->primary_key, dev); + neigh_release(n); + if (IS_ERR(mn)) + return PTR_ERR(mn); + n = mn; } + if (neigh_event_send(n, skb_res) == 0) { int err; char haddr[MAX_ADDR_LEN]; @@ -250,15 +249,13 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr, NULL, skb->len); - if (err < 0) { - neigh_release(n); - return -EINVAL; - } - teql_neigh_release(xchg(&q->ncache, n)); - return 0; + if (err < 0) + err = -EINVAL; + } else { + err = (skb_res == NULL) ? -EAGAIN : 1; } neigh_release(n); - return (skb_res == NULL) ? -EAGAIN : 1; + return err; } static inline int teql_resolve(struct sk_buff *skb, @@ -267,7 +264,6 @@ static inline int teql_resolve(struct sk_buff *skb, struct netdev_queue *txq) { struct dst_entry *dst = skb_dst(skb); - struct neighbour *mn; int res; if (txq->qdisc == &noop_qdisc) @@ -277,8 +273,7 @@ static inline int teql_resolve(struct sk_buff *skb, return 0; rcu_read_lock(); - mn = dst_get_neighbour_noref(dst); - res = mn ? __teql_resolve(skb, skb_res, dev, txq, mn) : 0; + res = __teql_resolve(skb, skb_res, dev, txq, dst); rcu_read_unlock(); return res; |
