diff options
author | Arjan van de Ven <arjan@linux.intel.com> | 2008-10-17 09:20:26 -0700 |
---|---|---|
committer | Arjan van de Ven <arjan@linux.intel.com> | 2008-10-17 09:20:26 -0700 |
commit | 651dab4264e4ba0e563f5ff56f748127246e9065 (patch) | |
tree | 016630974bdcb00fe529b673f96d389e0fd6dc94 /net/sched | |
parent | 40b8606253552109815786e5d4b0de98782d31f5 (diff) | |
parent | 2e532d68a2b3e2aa6b19731501222069735c741c (diff) |
Merge commit 'linus/master' into merge-linus
Conflicts:
arch/x86/kvm/i8254.c
Diffstat (limited to 'net/sched')
-rw-r--r-- | net/sched/Kconfig | 20 | ||||
-rw-r--r-- | net/sched/Makefile | 2 | ||||
-rw-r--r-- | net/sched/act_ipt.c | 46 | ||||
-rw-r--r-- | net/sched/act_skbedit.c | 203 | ||||
-rw-r--r-- | net/sched/cls_flow.c | 28 | ||||
-rw-r--r-- | net/sched/em_cmp.c | 9 | ||||
-rw-r--r-- | net/sched/sch_dsmark.c | 8 | ||||
-rw-r--r-- | net/sched/sch_generic.c | 32 | ||||
-rw-r--r-- | net/sched/sch_multiq.c | 477 | ||||
-rw-r--r-- | net/sched/sch_netem.c | 18 | ||||
-rw-r--r-- | net/sched/sch_prio.c | 6 | ||||
-rw-r--r-- | net/sched/sch_sfq.c | 4 |
12 files changed, 789 insertions, 64 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 9437b27ff84..6767e54155d 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -106,6 +106,15 @@ config NET_SCH_PRIO To compile this code as a module, choose M here: the module will be called sch_prio. +config NET_SCH_MULTIQ + tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)" + ---help--- + Say Y here if you want to use an n-band queue packet scheduler + to support devices that have multiple hardware transmit queues. + + To compile this code as a module, choose M here: the + module will be called sch_multiq. + config NET_SCH_RED tristate "Random Early Detection (RED)" ---help--- @@ -476,6 +485,17 @@ config NET_ACT_SIMP To compile this code as a module, choose M here: the module will be called simple. +config NET_ACT_SKBEDIT + tristate "SKB Editing" + depends on NET_CLS_ACT + ---help--- + Say Y here to change skb priority or queue_mapping settings. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called skbedit. + config NET_CLS_IND bool "Incoming device classification" depends on NET_CLS_U32 || NET_CLS_FW diff --git a/net/sched/Makefile b/net/sched/Makefile index 1d2b0f7df84..e60c9925b26 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o obj-$(CONFIG_NET_ACT_NAT) += act_nat.o obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o +obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o @@ -26,6 +27,7 @@ obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o +obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index d1263b3c96c..0453d79ebf5 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -40,6 +40,7 @@ static struct tcf_hashinfo ipt_hash_info = { static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook) { + struct xt_tgchk_param par; struct xt_target *target; int ret = 0; @@ -49,29 +50,30 @@ static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int return -ENOENT; t->u.kernel.target = target; - - ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t), - table, hook, 0, 0); - if (ret) { + par.table = table; + par.entryinfo = NULL; + par.target = target; + par.targinfo = t->data; + par.hook_mask = hook; + par.family = NFPROTO_IPV4; + + ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); + if (ret < 0) { module_put(t->u.kernel.target->me); return ret; } - if (t->u.kernel.target->checkentry - && !t->u.kernel.target->checkentry(table, NULL, - t->u.kernel.target, t->data, - hook)) { - module_put(t->u.kernel.target->me); - ret = -EINVAL; - } - - return ret; + return 0; } static void ipt_destroy_target(struct ipt_entry_target *t) { - if (t->u.kernel.target->destroy) - t->u.kernel.target->destroy(t->u.kernel.target, t->data); - module_put(t->u.kernel.target->me); + struct xt_tgdtor_param par = { + .target = t->u.kernel.target, + .targinfo = t->data, + }; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); } static int tcf_ipt_release(struct tcf_ipt *ipt, int bind) @@ -196,6 +198,7 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a, { int ret = 0, result = 0; struct tcf_ipt *ipt = a->priv; + struct xt_target_param par; if (skb_cloned(skb)) { if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) @@ -211,10 +214,13 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a, /* yes, we have to worry about both in and out dev worry later - danger - this API seems to have changed from earlier kernels */ - ret = ipt->tcfi_t->u.kernel.target->target(skb, skb->dev, NULL, - ipt->tcfi_hook, - ipt->tcfi_t->u.kernel.target, - ipt->tcfi_t->data); + par.in = skb->dev; + par.out = NULL; + par.hooknum = ipt->tcfi_hook; + par.target = ipt->tcfi_t->u.kernel.target; + par.targinfo = ipt->tcfi_t->data; + ret = par.target->target(skb, &par); + switch (ret) { case NF_ACCEPT: result = TC_ACT_OK; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c new file mode 100644 index 00000000000..fe9777e77f3 --- /dev/null +++ b/net/sched/act_skbedit.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2008, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Author: Alexander Duyck <alexander.h.duyck@intel.com> + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> + +#include <linux/tc_act/tc_skbedit.h> +#include <net/tc_act/tc_skbedit.h> + +#define SKBEDIT_TAB_MASK 15 +static struct tcf_common *tcf_skbedit_ht[SKBEDIT_TAB_MASK + 1]; +static u32 skbedit_idx_gen; +static DEFINE_RWLOCK(skbedit_lock); + +static struct tcf_hashinfo skbedit_hash_info = { + .htab = tcf_skbedit_ht, + .hmask = SKBEDIT_TAB_MASK, + .lock = &skbedit_lock, +}; + +static int tcf_skbedit(struct sk_buff *skb, struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_skbedit *d = a->priv; + + spin_lock(&d->tcf_lock); + d->tcf_tm.lastuse = jiffies; + d->tcf_bstats.bytes += qdisc_pkt_len(skb); + d->tcf_bstats.packets++; + + if (d->flags & SKBEDIT_F_PRIORITY) + skb->priority = d->priority; + if (d->flags & SKBEDIT_F_QUEUE_MAPPING && + skb->dev->real_num_tx_queues > d->queue_mapping) + skb_set_queue_mapping(skb, d->queue_mapping); + + spin_unlock(&d->tcf_lock); + return d->tcf_action; +} + +static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { + [TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) }, + [TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) }, + [TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) }, +}; + +static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; + struct tc_skbedit *parm; + struct tcf_skbedit *d; + struct tcf_common *pc; + u32 flags = 0, *priority = NULL; + u16 *queue_mapping = NULL; + int ret = 0, err; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy); + if (err < 0) + return err; + + if (tb[TCA_SKBEDIT_PARMS] == NULL) + return -EINVAL; + + if (tb[TCA_SKBEDIT_PRIORITY] != NULL) { + flags |= SKBEDIT_F_PRIORITY; + priority = nla_data(tb[TCA_SKBEDIT_PRIORITY]); + } + + if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) { + flags |= SKBEDIT_F_QUEUE_MAPPING; + queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]); + } + if (!flags) + return -EINVAL; + + parm = nla_data(tb[TCA_SKBEDIT_PARMS]); + + pc = tcf_hash_check(parm->index, a, bind, &skbedit_hash_info); + if (!pc) { + pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind, + &skbedit_idx_gen, &skbedit_hash_info); + if (unlikely(!pc)) + return -ENOMEM; + + d = to_skbedit(pc); + ret = ACT_P_CREATED; + } else { + d = to_skbedit(pc); + if (!ovr) { + tcf_hash_release(pc, bind, &skbedit_hash_info); + return -EEXIST; + } + } + + spin_lock_bh(&d->tcf_lock); + + d->flags = flags; + if (flags & SKBEDIT_F_PRIORITY) + d->priority = *priority; + if (flags & SKBEDIT_F_QUEUE_MAPPING) + d->queue_mapping = *queue_mapping; + d->tcf_action = parm->action; + + spin_unlock_bh(&d->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(pc, &skbedit_hash_info); + return ret; +} + +static inline int tcf_skbedit_cleanup(struct tc_action *a, int bind) +{ + struct tcf_skbedit *d = a->priv; + + if (d) + return tcf_hash_release(&d->common, bind, &skbedit_hash_info); + return 0; +} + +static inline int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_skbedit *d = a->priv; + struct tc_skbedit opt; + struct tcf_t t; + + opt.index = d->tcf_index; + opt.refcnt = d->tcf_refcnt - ref; + opt.bindcnt = d->tcf_bindcnt - bind; + opt.action = d->tcf_action; + NLA_PUT(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt); + if (d->flags & SKBEDIT_F_PRIORITY) + NLA_PUT(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority), + &d->priority); + if (d->flags & SKBEDIT_F_QUEUE_MAPPING) + NLA_PUT(skb, TCA_SKBEDIT_QUEUE_MAPPING, + sizeof(d->queue_mapping), &d->queue_mapping); + t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(d->tcf_tm.expires); + NLA_PUT(skb, TCA_SKBEDIT_TM, sizeof(t), &t); + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_skbedit_ops = { + .kind = "skbedit", + .hinfo = &skbedit_hash_info, + .type = TCA_ACT_SKBEDIT, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_skbedit, + .dump = tcf_skbedit_dump, + .cleanup = tcf_skbedit_cleanup, + .init = tcf_skbedit_init, + .walk = tcf_generic_walker, +}; + +MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>"); +MODULE_DESCRIPTION("SKB Editing"); +MODULE_LICENSE("GPL"); + +static int __init skbedit_init_module(void) +{ + return tcf_register_action(&act_skbedit_ops); +} + +static void __exit skbedit_cleanup_module(void) +{ + tcf_unregister_action(&act_skbedit_ops); +} + +module_init(skbedit_init_module); +module_exit(skbedit_cleanup_module); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 8f63a1a9401..0ebaff637e3 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -67,9 +67,9 @@ static inline u32 addr_fold(void *addr) static u32 flow_get_src(const struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): return ntohl(ip_hdr(skb)->saddr); - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): return ntohl(ipv6_hdr(skb)->saddr.s6_addr32[3]); default: return addr_fold(skb->sk); @@ -79,9 +79,9 @@ static u32 flow_get_src(const struct sk_buff *skb) static u32 flow_get_dst(const struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): return ntohl(ip_hdr(skb)->daddr); - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): return ntohl(ipv6_hdr(skb)->daddr.s6_addr32[3]); default: return addr_fold(skb->dst) ^ (__force u16)skb->protocol; @@ -91,9 +91,9 @@ static u32 flow_get_dst(const struct sk_buff *skb) static u32 flow_get_proto(const struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): return ip_hdr(skb)->protocol; - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): return ipv6_hdr(skb)->nexthdr; default: return 0; @@ -120,7 +120,7 @@ static u32 flow_get_proto_src(const struct sk_buff *skb) u32 res = 0; switch (skb->protocol) { - case __constant_htons(ETH_P_IP): { + case htons(ETH_P_IP): { struct iphdr *iph = ip_hdr(skb); if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && @@ -128,7 +128,7 @@ static u32 flow_get_proto_src(const struct sk_buff *skb) res = ntohs(*(__be16 *)((void *)iph + iph->ihl * 4)); break; } - case __constant_htons(ETH_P_IPV6): { + case htons(ETH_P_IPV6): { struct ipv6hdr *iph = ipv6_hdr(skb); if (has_ports(iph->nexthdr)) @@ -147,7 +147,7 @@ static u32 flow_get_proto_dst(const struct sk_buff *skb) u32 res = 0; switch (skb->protocol) { - case __constant_htons(ETH_P_IP): { + case htons(ETH_P_IP): { struct iphdr *iph = ip_hdr(skb); if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && @@ -155,7 +155,7 @@ static u32 flow_get_proto_dst(const struct sk_buff *skb) res = ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + 2)); break; } - case __constant_htons(ETH_P_IPV6): { + case htons(ETH_P_IPV6): { struct ipv6hdr *iph = ipv6_hdr(skb); if (has_ports(iph->nexthdr)) @@ -213,9 +213,9 @@ static u32 flow_get_nfct(const struct sk_buff *skb) static u32 flow_get_nfct_src(const struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, src.u3.ip)); - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): return ntohl(CTTUPLE(skb, src.u3.ip6[3])); } fallback: @@ -225,9 +225,9 @@ fallback: static u32 flow_get_nfct_dst(const struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, dst.u3.ip)); - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): return ntohl(CTTUPLE(skb, dst.u3.ip6[3])); } fallback: diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c index cc49c932641..bc450397487 100644 --- a/net/sched/em_cmp.c +++ b/net/sched/em_cmp.c @@ -14,6 +14,7 @@ #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/tc_ematch/tc_em_cmp.h> +#include <asm/unaligned.h> #include <net/pkt_cls.h> static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp) @@ -37,8 +38,7 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em, break; case TCF_EM_ALIGN_U16: - val = *ptr << 8; - val |= *(ptr+1); + val = get_unaligned_be16(ptr); if (cmp_needs_transformation(cmp)) val = be16_to_cpu(val); @@ -47,10 +47,7 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em, case TCF_EM_ALIGN_U32: /* Worth checking boundries? The branching seems * to get worse. Visit again. */ - val = *ptr << 24; - val |= *(ptr+1) << 16; - val |= *(ptr+2) << 8; - val |= *(ptr+3); + val = get_unaligned_be32(ptr); if (cmp_needs_transformation(cmp)) val = be32_to_cpu(val); diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index edd1298f85f..ba43aab3a85 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -202,7 +202,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (p->set_tc_index) { switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): if (skb_cow_head(skb, sizeof(struct iphdr))) goto drop; @@ -210,7 +210,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) & ~INET_ECN_MASK; break; - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): if (skb_cow_head(skb, sizeof(struct ipv6hdr))) goto drop; @@ -289,11 +289,11 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) pr_debug("index %d->%d\n", skb->tc_index, index); switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): ipv4_change_dsfield(ip_hdr(skb), p->mask[index], p->value[index]); break; - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index], p->value[index]); break; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 9634091ee2f..7b5572d6beb 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -44,23 +44,30 @@ static inline int qdisc_qlen(struct Qdisc *q) static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { - if (unlikely(skb->next)) - q->gso_skb = skb; - else - q->ops->requeue(skb, q); - + q->gso_skb = skb; + q->qstats.requeues++; __netif_schedule(q); + return 0; } static inline struct sk_buff *dequeue_skb(struct Qdisc *q) { - struct sk_buff *skb; + struct sk_buff *skb = q->gso_skb; - if ((skb = q->gso_skb)) - q->gso_skb = NULL; - else + if (unlikely(skb)) { + struct net_device *dev = qdisc_dev(q); + struct netdev_queue *txq; + + /* check the reason of requeuing without tx lock first */ + txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq)) + q->gso_skb = NULL; + else + skb = NULL; + } else { skb = q->dequeue(q); + } return skb; } @@ -215,10 +222,9 @@ static void dev_watchdog(unsigned long arg) time_after(jiffies, (dev->trans_start + dev->watchdog_timeo))) { char drivername[64]; - printk(KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit timed out\n", + WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit timed out\n", dev->name, netdev_drivername(dev, drivername, 64)); dev->tx_timeout(dev); - WARN_ON_ONCE(1); } if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + @@ -328,6 +334,7 @@ struct Qdisc noop_qdisc = { .flags = TCQ_F_BUILTIN, .ops = &noop_qdisc_ops, .list = LIST_HEAD_INIT(noop_qdisc.list), + .requeue.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, }; @@ -353,6 +360,7 @@ static struct Qdisc noqueue_qdisc = { .flags = TCQ_F_BUILTIN, .ops = &noqueue_qdisc_ops, .list = LIST_HEAD_INIT(noqueue_qdisc.list), + .requeue.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock), .q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock), .dev_queue = &noqueue_netdev_queue, }; @@ -473,6 +481,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch->padded = (char *) sch - (char *) p; INIT_LIST_HEAD(&sch->list); + skb_queue_head_init(&sch->requeue); skb_queue_head_init(&sch->q); sch->ops = ops; sch->enqueue = ops->enqueue; @@ -541,6 +550,7 @@ void qdisc_destroy(struct Qdisc *qdisc) dev_put(qdisc_dev(qdisc)); kfree_skb(qdisc->gso_skb); + __skb_queue_purge(&qdisc->requeue); kfree((char *) qdisc - qdisc->padded); } diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c new file mode 100644 index 00000000000..915f3149dde --- /dev/null +++ b/net/sched/sch_multiq.c @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2008, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Author: Alexander Duyck <alexander.h.duyck@intel.com> + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> + + +struct multiq_sched_data { + u16 bands; + u16 max_bands; + u16 curband; + struct tcf_proto *filter_list; + struct Qdisc **queues; +}; + + +static struct Qdisc * +multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + u32 band; + struct tcf_result res; + int err; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + err = tc_classify(skb, q->filter_list, &res); +#ifdef CONFIG_NET_CLS_ACT + switch (err) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NULL; + } +#endif + band = skb_get_queue_mapping(skb); + + if (band >= q->bands) + return q->queues[0]; + + return q->queues[band]; +} + +static int +multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct Qdisc *qdisc; + int ret; + + qdisc = multiq_classify(skb, sch, &ret); +#ifdef CONFIG_NET_CLS_ACT + if (qdisc == NULL) { + + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } +#endif + + ret = qdisc_enqueue(skb, qdisc); + if (ret == NET_XMIT_SUCCESS) { + sch->bstats.bytes += qdisc_pkt_len(skb); + sch->bstats.packets++; + sch->q.qlen++; + return NET_XMIT_SUCCESS; + } + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + return ret; +} + + +static int +multiq_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct Qdisc *qdisc; + struct multiq_sched_data *q = qdisc_priv(sch); + int ret; + + qdisc = multiq_classify(skb, sch, &ret); +#ifdef CONFIG_NET_CLS_ACT + if (qdisc == NULL) { + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } +#endif + + ret = qdisc->ops->requeue(skb, qdisc); + if (ret == NET_XMIT_SUCCESS) { + sch->q.qlen++; + sch->qstats.requeues++; + if (q->curband) + q->curband--; + else + q->curband = q->bands - 1; + return NET_XMIT_SUCCESS; + } + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + return ret; +} + + +static struct sk_buff *multiq_dequeue(struct Qdisc *sch) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc; + struct sk_buff *skb; + int band; + + for (band = 0; band < q->bands; band++) { + /* cycle through bands to ensure fairness */ + q->curband++; + if (q->curband >= q->bands) + q->curband = 0; + + /* Check that target subqueue is available before + * pulling an skb to avoid excessive requeues + */ + if (!__netif_subqueue_stopped(qdisc_dev(sch), q->curband)) { + qdisc = q->queues[q->curband]; + skb = qdisc->dequeue(qdisc); + if (skb) { + sch->q.qlen--; + return skb; + } + } + } + return NULL; + +} + +static unsigned int multiq_drop(struct Qdisc *sch) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + int band; + unsigned int len; + struct Qdisc *qdisc; + + for (band = q->bands-1; band >= 0; band--) { + qdisc = q->queues[band]; + if (qdisc->ops->drop) { + len = qdisc->ops->drop(qdisc); + if (len != 0) { + sch->q.qlen--; + return len; + } + } + } + return 0; +} + + +static void +multiq_reset(struct Qdisc *sch) +{ + u16 band; + struct multiq_sched_data *q = qdisc_priv(sch); + + for (band = 0; band < q->bands; band++) + qdisc_reset(q->queues[band]); + sch->q.qlen = 0; + q->curband = 0; +} + +static void +multiq_destroy(struct Qdisc *sch) +{ + int band; + struct multiq_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + for (band = 0; band < q->bands; band++) + qdisc_destroy(q->queues[band]); + + kfree(q->queues); +} + +static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + struct tc_multiq_qopt *qopt; + int i; + + if (!netif_is_multiqueue(qdisc_dev(sch))) + return -EINVAL; + if (nla_len(opt) < sizeof(*qopt)) + return -EINVAL; + + qopt = nla_data(opt); + + qopt->bands = qdisc_dev(sch)->real_num_tx_queues; + + sch_tree_lock(sch); + q->bands = qopt->bands; + for (i = q->bands; i < q->max_bands; i++) { + if (q->queues[i] != &noop_qdisc) { + struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc); + qdisc_tree_decrease_qlen(child, child->q.qlen); + qdisc_destroy(child); + } + } + + sch_tree_unlock(sch); + + for (i = 0; i < q->bands; i++) { + if (q->queues[i] == &noop_qdisc) { + struct Qdisc *child; + child = qdisc_create_dflt(qdisc_dev(sch), + sch->dev_queue, + &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, + i + 1)); + if (child) { + sch_tree_lock(sch); + child = xchg(&q->queues[i], child); + + if (child != &noop_qdisc) { + qdisc_tree_decrease_qlen(child, + child->q.qlen); + qdisc_destroy(child); + } + sch_tree_unlock(sch); + } + } + } + return 0; +} + +static int multiq_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + int i, err; + + q->queues = NULL; + + if (opt == NULL) + return -EINVAL; + + q->max_bands = qdisc_dev(sch)->num_tx_queues; + + q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL); + if (!q->queues) + return -ENOBUFS; + for (i = 0; i < q->max_bands; i++) + q->queues[i] = &noop_qdisc; + + err = multiq_tune(sch,opt); + + if (err) + kfree(q->queues); + + return err; +} + +static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb_tail_pointer(skb); + struct tc_multiq_qopt opt; + + opt.bands = q->bands; + opt.max_bands = q->max_bands; + + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + if (band >= q->bands) + return -EINVAL; + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->queues[band]; + q->queues[band] = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc * +multiq_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + if (band >= q->bands) + return NULL; + + return q->queues[band]; +} + +static unsigned long multiq_get(struct Qdisc *sch, u32 classid) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned long band = TC_H_MIN(classid); + + if (band - 1 >= q->bands) + return 0; + return band; +} + +static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return multiq_get(sch, classid); +} + + +static void multiq_put(struct Qdisc *q, unsigned long cl) +{ + return; +} + +static int multiq_change(struct Qdisc *sch, u32 handle, u32 parent, + struct nlattr **tca, unsigned long *arg) +{ + unsigned long cl = *arg; + struct multiq_sched_data *q = qdisc_priv(sch); + + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} + +static int multiq_delete(struct Qdisc *sch, unsigned long cl) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} + + +static int multiq_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + + if (cl - 1 > q->bands) + return -ENOENT; + tcm->tcm_handle |= TC_H_MIN(cl); + if (q->queues[cl-1]) + tcm->tcm_info = q->queues[cl-1]->handle; + return 0; +} + +static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + struct Qdisc *cl_q; + + cl_q = q->queues[cl - 1]; + if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 || + gnet_stats_copy_queue(d, &cl_q->qstats) < 0) + return -1; + + return 0; +} + +static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + int band; + + if (arg->stop) + return; + + for (band = 0; band < q->bands; band++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, band+1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_proto **multiq_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static const struct Qdisc_class_ops multiq_class_ops = { + .graft = multiq_graft, + .leaf = multiq_leaf, + .get = multiq_get, + .put = multiq_put, + .change = multiq_change, + .delete = multiq_delete, + .walk = multiq_walk, + .tcf_chain = multiq_find_tcf, + .bind_tcf = multiq_bind, + .unbind_tcf = multiq_put, + .dump = multiq_dump_class, + .dump_stats = multiq_dump_class_stats, +}; + < |