diff options
Diffstat (limited to 'net/sched/sch_fq.c')
| -rw-r--r-- | net/sched/sch_fq.c | 110 |
1 files changed, 71 insertions, 39 deletions
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index a2fef8b10b9..ba32c2b005d 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -47,6 +47,7 @@ #include <linux/rbtree.h> #include <linux/hash.h> #include <linux/prefetch.h> +#include <linux/vmalloc.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/sock.h> @@ -88,7 +89,7 @@ struct fq_sched_data { struct fq_flow internal; /* for non classified or high prio packets */ u32 quantum; u32 initial_quantum; - u32 flow_default_rate;/* rate per flow : bytes per second */ + u32 flow_refill_delay; u32 flow_max_rate; /* optional max rate per flow */ u32 flow_plimit; /* max packets per flow */ struct rb_root *fq_root; @@ -115,6 +116,7 @@ static struct fq_flow detached, throttled; static void fq_flow_set_detached(struct fq_flow *f) { f->next = &detached; + f->age = jiffies; } static bool fq_flow_is_detached(const struct fq_flow *f) @@ -209,28 +211,22 @@ static void fq_gc(struct fq_sched_data *q, } } -static const u8 prio2band[TC_PRIO_MAX + 1] = { - 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 -}; - static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) { struct rb_node **p, *parent; struct sock *sk = skb->sk; struct rb_root *root; struct fq_flow *f; - int band; /* warning: no starvation prevention... */ - band = prio2band[skb->priority & TC_PRIO_MAX]; - if (unlikely(band == 0)) + if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) return &q->internal; if (unlikely(!sk)) { /* By forcing low order bit to 1, we make sure to not * collide with a local flow (socket pointers are word aligned) */ - sk = (struct sock *)(skb_get_rxhash(skb) | 1L); + sk = (struct sock *)(skb_get_hash(skb) | 1L); } root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)]; @@ -255,6 +251,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) f->socket_hash != sk->sk_hash)) { f->credit = q->initial_quantum; f->socket_hash = sk->sk_hash; + f->time_next_packet = 0ULL; } return f; } @@ -372,17 +369,20 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) } f->qlen++; - flow_queue_add(f, skb); if (skb_is_retransmit(skb)) q->stat_tcp_retrans++; sch->qstats.backlog += qdisc_pkt_len(skb); if (fq_flow_is_detached(f)) { fq_flow_add_tail(&q->new_flows, f); - if (q->quantum > f->credit) - f->credit = q->quantum; + if (time_after(jiffies, f->age + q->flow_refill_delay)) + f->credit = max_t(u32, f->credit, q->quantum); q->inactive_flows--; qdisc_unthrottled(sch); } + + /* Note: this overwrites f->age */ + flow_queue_add(f, skb); + if (unlikely(f == &q->internal)) { q->stat_internal_packets++; qdisc_unthrottled(sch); @@ -460,7 +460,6 @@ begin: fq_flow_add_tail(&q->old_flows, f); } else { fq_flow_set_detached(f); - f->age = jiffies; q->inactive_flows++; } goto begin; @@ -472,20 +471,16 @@ begin: if (f->credit > 0 || !q->rate_enable) goto out; - if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) { - rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate; + rate = q->flow_max_rate; + if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) + rate = min(skb->sk->sk_pacing_rate, rate); - rate = min(rate, q->flow_max_rate); - } else { - rate = q->flow_max_rate; - if (rate == ~0U) - goto out; - } - if (rate) { + if (rate != ~0U) { u32 plen = max(qdisc_pkt_len(skb), q->quantum); u64 len = (u64)plen * NSEC_PER_SEC; - do_div(len, rate); + if (likely(rate)) + do_div(len, rate); /* Since socket rate can change later, * clamp the delay to 125 ms. * TODO: maybe segment the too big skb, as in commit @@ -584,28 +579,53 @@ static void fq_rehash(struct fq_sched_data *q, q->stat_gc_flows += fcnt; } -static int fq_resize(struct fq_sched_data *q, u32 log) +static void *fq_alloc_node(size_t sz, int node) { + void *ptr; + + ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node); + if (!ptr) + ptr = vmalloc_node(sz, node); + return ptr; +} + +static void fq_free(void *addr) +{ + kvfree(addr); +} + +static int fq_resize(struct Qdisc *sch, u32 log) +{ + struct fq_sched_data *q = qdisc_priv(sch); struct rb_root *array; + void *old_fq_root; u32 idx; if (q->fq_root && log == q->fq_trees_log) return 0; - array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL); + /* If XPS was setup, we can allocate memory on right NUMA node */ + array = fq_alloc_node(sizeof(struct rb_root) << log, + netdev_queue_numa_node_read(sch->dev_queue)); if (!array) return -ENOMEM; for (idx = 0; idx < (1U << log); idx++) array[idx] = RB_ROOT; - if (q->fq_root) { - fq_rehash(q, q->fq_root, q->fq_trees_log, array, log); - kfree(q->fq_root); - } + sch_tree_lock(sch); + + old_fq_root = q->fq_root; + if (old_fq_root) + fq_rehash(q, old_fq_root, q->fq_trees_log, array, log); + q->fq_root = array; q->fq_trees_log = log; + sch_tree_unlock(sch); + + fq_free(old_fq_root); + return 0; } @@ -618,6 +638,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 }, [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt) @@ -656,10 +677,11 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); if (tb[TCA_FQ_INITIAL_QUANTUM]) - q->quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); + q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); if (tb[TCA_FQ_FLOW_DEFAULT_RATE]) - q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]); + pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", + nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE])); if (tb[TCA_FQ_FLOW_MAX_RATE]) q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); @@ -673,9 +695,17 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) err = -EINVAL; } - if (!err) - err = fq_resize(q, fq_log); + if (tb[TCA_FQ_FLOW_REFILL_DELAY]) { + u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ; + + q->flow_refill_delay = usecs_to_jiffies(usecs_delay); + } + if (!err) { + sch_tree_unlock(sch); + err = fq_resize(sch, fq_log); + sch_tree_lock(sch); + } while (sch->q.qlen > sch->limit) { struct sk_buff *skb = fq_dequeue(sch); @@ -695,7 +725,7 @@ static void fq_destroy(struct Qdisc *sch) struct fq_sched_data *q = qdisc_priv(sch); fq_reset(sch); - kfree(q->fq_root); + fq_free(q->fq_root); qdisc_watchdog_cancel(&q->watchdog); } @@ -708,7 +738,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt) q->flow_plimit = 100; q->quantum = 2 * psched_mtu(qdisc_dev(sch)); q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); - q->flow_default_rate = 0; + q->flow_refill_delay = msecs_to_jiffies(40); q->flow_max_rate = ~0U; q->rate_enable = 1; q->new_flows.first = NULL; @@ -721,7 +751,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt) if (opt) err = fq_change(sch, opt); else - err = fq_resize(q, q->fq_trees_log); + err = fq_resize(sch, q->fq_trees_log); return err; } @@ -735,18 +765,20 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) if (opts == NULL) goto nla_put_failure; + /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ + if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || - nla_put_u32(skb, TCA_FQ_FLOW_DEFAULT_RATE, q->flow_default_rate) || nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) || + nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, + jiffies_to_usecs(q->flow_refill_delay)) || nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) goto nla_put_failure; - nla_nest_end(skb, opts); - return skb->len; + return nla_nest_end(skb, opts); nla_put_failure: return -1; |
