Diffstat (limited to 'net/sched/sch_fq.c')
-rw-r--r-- | net/sched/sch_fq.c | 188
1 file changed, 122 insertions, 66 deletions
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 32ad015ee8c..ba32c2b005d 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -47,6 +47,7 @@
 #include <linux/rbtree.h>
 #include <linux/hash.h>
 #include <linux/prefetch.h>
+#include <linux/vmalloc.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/sock.h>
@@ -88,7 +89,7 @@ struct fq_sched_data {
 	struct fq_flow	internal;	/* for non classified or high prio packets */
 	u32		quantum;
 	u32		initial_quantum;
-	u32		flow_default_rate;/* rate per flow : bytes per second */
+	u32		flow_refill_delay;
 	u32		flow_max_rate;	/* optional max rate per flow */
 	u32		flow_plimit;	/* max packets per flow */
 	struct rb_root	*fq_root;
@@ -115,6 +116,7 @@ static struct fq_flow detached, throttled;
 static void fq_flow_set_detached(struct fq_flow *f)
 {
 	f->next = &detached;
+	f->age = jiffies;
 }
 
 static bool fq_flow_is_detached(const struct fq_flow *f)
@@ -209,28 +211,22 @@ static void fq_gc(struct fq_sched_data *q,
 	}
 }
 
-static const u8 prio2band[TC_PRIO_MAX + 1] = {
-	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
-};
-
 static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 {
 	struct rb_node **p, *parent;
 	struct sock *sk = skb->sk;
 	struct rb_root *root;
 	struct fq_flow *f;
-	int band;
 
 	/* warning: no starvation prevention... */
-	band = prio2band[skb->priority & TC_PRIO_MAX];
-	if (unlikely(band == 0))
+	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
 		return &q->internal;
 
 	if (unlikely(!sk)) {
 		/* By forcing low order bit to 1, we make sure to not
 		 * collide with a local flow (socket pointers are word aligned)
 		 */
-		sk = (struct sock *)(skb_get_rxhash(skb) | 1L);
+		sk = (struct sock *)(skb_get_hash(skb) | 1L);
 	}
 
 	root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
@@ -255,6 +251,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 				     f->socket_hash != sk->sk_hash)) {
 				f->credit = q->initial_quantum;
 				f->socket_hash = sk->sk_hash;
+				f->time_next_packet = 0ULL;
 			}
 			return f;
 		}
@@ -285,7 +282,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 
 /* remove one skb from head of flow queue */
-static struct sk_buff *fq_dequeue_head(struct fq_flow *flow)
+static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
 {
 	struct sk_buff *skb = flow->head;
@@ -293,6 +290,8 @@ static struct sk_buff *fq_dequeue_head(struct fq_flow *flow)
 		flow->head = skb->next;
 		skb->next = NULL;
 		flow->qlen--;
+		sch->qstats.backlog -= qdisc_pkt_len(skb);
+		sch->q.qlen--;
 	}
 	return skb;
 }
@@ -370,17 +369,20 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	}
 
 	f->qlen++;
-	flow_queue_add(f, skb);
 	if (skb_is_retransmit(skb))
 		q->stat_tcp_retrans++;
 	sch->qstats.backlog += qdisc_pkt_len(skb);
 	if (fq_flow_is_detached(f)) {
 		fq_flow_add_tail(&q->new_flows, f);
-		if (q->quantum > f->credit)
-			f->credit = q->quantum;
+		if (time_after(jiffies, f->age + q->flow_refill_delay))
+			f->credit = max_t(u32, f->credit, q->quantum);
 		q->inactive_flows--;
 		qdisc_unthrottled(sch);
 	}
+
+	/* Note: this overwrites f->age */
+	flow_queue_add(f, skb);
+
 	if (unlikely(f == &q->internal)) {
 		q->stat_internal_packets++;
 		qdisc_unthrottled(sch);
@@ -418,8 +420,9 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 	struct fq_flow_head *head;
 	struct sk_buff *skb;
 	struct fq_flow *f;
+	u32 rate;
 
-	skb = fq_dequeue_head(&q->internal);
+	skb = fq_dequeue_head(sch, &q->internal);
 	if (skb)
 		goto out;
 	fq_check_throttled(q, now);
@@ -449,7 +452,7 @@ begin:
 		goto begin;
 	}
 
-	skb = fq_dequeue_head(f);
+	skb = fq_dequeue_head(sch, f);
 	if (!skb) {
 		head->first = f->next;
 		/* force a pass through old_flows to prevent starvation */
@@ -457,7 +460,6 @@ begin:
 			fq_flow_add_tail(&q->old_flows, f);
 		} else {
 			fq_flow_set_detached(f);
-			f->age = jiffies;
 			q->inactive_flows++;
 		}
 		goto begin;
@@ -466,43 +468,70 @@ begin:
 	f->time_next_packet = now;
 	f->credit -= qdisc_pkt_len(skb);
 
-	if (f->credit <= 0 &&
-	    q->rate_enable &&
-	    skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) {
-		u32 rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate;
+	if (f->credit > 0 || !q->rate_enable)
+		goto out;
+
+	rate = q->flow_max_rate;
+	if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT)
+		rate = min(skb->sk->sk_pacing_rate, rate);
 
-		rate = min(rate, q->flow_max_rate);
-		if (rate) {
-			u64 len = (u64)qdisc_pkt_len(skb) * NSEC_PER_SEC;
+	if (rate != ~0U) {
+		u32 plen = max(qdisc_pkt_len(skb), q->quantum);
+		u64 len = (u64)plen * NSEC_PER_SEC;
 
+		if (likely(rate))
 			do_div(len, rate);
-			/* Since socket rate can change later,
-			 * clamp the delay to 125 ms.
-			 * TODO: maybe segment the too big skb, as in commit
-			 * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
-			 */
-			if (unlikely(len > 125 * NSEC_PER_MSEC)) {
-				len = 125 * NSEC_PER_MSEC;
-				q->stat_pkts_too_long++;
-			}
-
-			f->time_next_packet = now + len;
+		/* Since socket rate can change later,
+		 * clamp the delay to 125 ms.
+		 * TODO: maybe segment the too big skb, as in commit
+		 * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
+		 */
+		if (unlikely(len > 125 * NSEC_PER_MSEC)) {
+			len = 125 * NSEC_PER_MSEC;
+			q->stat_pkts_too_long++;
 		}
+
+		f->time_next_packet = now + len;
 	}
 out:
-	sch->qstats.backlog -= qdisc_pkt_len(skb);
 	qdisc_bstats_update(sch, skb);
-	sch->q.qlen--;
 	qdisc_unthrottled(sch);
 	return skb;
 }
 
 static void fq_reset(struct Qdisc *sch)
 {
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct rb_root *root;
 	struct sk_buff *skb;
+	struct rb_node *p;
+	struct fq_flow *f;
+	unsigned int idx;
 
-	while ((skb = fq_dequeue(sch)) != NULL)
+	while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL)
 		kfree_skb(skb);
+
+	if (!q->fq_root)
+		return;
+
+	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
+		root = &q->fq_root[idx];
+		while ((p = rb_first(root)) != NULL) {
+			f = container_of(p, struct fq_flow, fq_node);
+			rb_erase(p, root);
+
+			while ((skb = fq_dequeue_head(sch, f)) != NULL)
+				kfree_skb(skb);
+
+			kmem_cache_free(fq_flow_cachep, f);
+		}
+	}
+	q->new_flows.first	= NULL;
+	q->old_flows.first	= NULL;
+	q->delayed		= RB_ROOT;
+	q->flows		= 0;
+	q->inactive_flows	= 0;
+	q->throttled_flows	= 0;
 }
 
 static void fq_rehash(struct fq_sched_data *q,
@@ -550,28 +579,53 @@ static void fq_rehash(struct fq_sched_data *q,
 	q->stat_gc_flows += fcnt;
 }
 
-static int fq_resize(struct fq_sched_data *q, u32 log)
+static void *fq_alloc_node(size_t sz, int node)
+{
+	void *ptr;
+
+	ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node);
+	if (!ptr)
+		ptr = vmalloc_node(sz, node);
+	return ptr;
+}
+
+static void fq_free(void *addr)
+{
+	kvfree(addr);
+}
+
+static int fq_resize(struct Qdisc *sch, u32 log)
 {
+	struct fq_sched_data *q = qdisc_priv(sch);
 	struct rb_root *array;
+	void *old_fq_root;
 	u32 idx;
 
 	if (q->fq_root && log == q->fq_trees_log)
 		return 0;
 
-	array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL);
+	/* If XPS was setup, we can allocate memory on right NUMA node */
+	array = fq_alloc_node(sizeof(struct rb_root) << log,
+			      netdev_queue_numa_node_read(sch->dev_queue));
 	if (!array)
 		return -ENOMEM;
 
 	for (idx = 0; idx < (1U << log); idx++)
 		array[idx] = RB_ROOT;
 
-	if (q->fq_root) {
-		fq_rehash(q, q->fq_root, q->fq_trees_log, array, log);
-		kfree(q->fq_root);
-	}
+	sch_tree_lock(sch);
+
+	old_fq_root = q->fq_root;
+	if (old_fq_root)
+		fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);
+
 	q->fq_root = array;
 	q->fq_trees_log = log;
 
+	sch_tree_unlock(sch);
+
+	fq_free(old_fq_root);
+
 	return 0;
 }
@@ -584,6 +638,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
 	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },
 	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
 	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
+	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
 };
 
 static int fq_change(struct Qdisc *sch, struct nlattr *opt)
@@ -622,10 +677,11 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 		q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
 
 	if (tb[TCA_FQ_INITIAL_QUANTUM])
-		q->quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
+		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
 
 	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
-		q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]);
+		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
+				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
 
 	if (tb[TCA_FQ_FLOW_MAX_RATE])
 		q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
@@ -639,12 +695,22 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 			err = -EINVAL;
 	}
 
-	if (!err)
-		err = fq_resize(q, fq_log);
+	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
+		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ;
+
+		q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
+	}
+	if (!err) {
+		sch_tree_unlock(sch);
+		err = fq_resize(sch, fq_log);
+		sch_tree_lock(sch);
+	}
 
 	while (sch->q.qlen > sch->limit) {
 		struct sk_buff *skb = fq_dequeue(sch);
 
+		if (!skb)
+			break;
 		kfree_skb(skb);
 		drop_count++;
 	}
@@ -657,21 +723,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 static void fq_destroy(struct Qdisc *sch)
 {
 	struct fq_sched_data *q = qdisc_priv(sch);
-	struct rb_root *root;
-	struct rb_node *p;
-	unsigned int idx;
 
-	if (q->fq_root) {
-		for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
-			root = &q->fq_root[idx];
-			while ((p = rb_first(root)) != NULL) {
-				rb_erase(p, root);
-				kmem_cache_free(fq_flow_cachep,
-						container_of(p, struct fq_flow, fq_node));
-			}
-		}
-		kfree(q->fq_root);
-	}
+	fq_reset(sch);
+	fq_free(q->fq_root);
 	qdisc_watchdog_cancel(&q->watchdog);
 }
 
@@ -684,7 +738,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->flow_plimit		= 100;
 	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
 	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
-	q->flow_default_rate	= 0;
+	q->flow_refill_delay	= msecs_to_jiffies(40);
 	q->flow_max_rate	= ~0U;
 	q->rate_enable		= 1;
 	q->new_flows.first	= NULL;
@@ -697,7 +751,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
 	if (opt)
 		err = fq_change(sch, opt);
 	else
-		err = fq_resize(q, q->fq_trees_log);
+		err = fq_resize(sch, q->fq_trees_log);
 
 	return err;
 }
@@ -711,18 +765,20 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (opts == NULL)
 		goto nla_put_failure;
 
+	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
+
 	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
 	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
 	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
 	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
 	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
-	    nla_put_u32(skb, TCA_FQ_FLOW_DEFAULT_RATE, q->flow_default_rate) ||
 	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
+	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
			jiffies_to_usecs(q->flow_refill_delay)) ||
 	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
 		goto nla_put_failure;
 
-	nla_nest_end(skb, opts);
-	return skb->len;
+	return nla_nest_end(skb, opts);
 
 nla_put_failure:
 	return -1;
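
The hunks above change two timing rules. On dequeue, the pacing delay is now derived from max(packet length, quantum) and from min(sk_pacing_rate, flow_max_rate), still clamped to 125 ms; on enqueue to a detached flow, the credit is only topped up to one quantum if more than flow_refill_delay jiffies have passed since f->age was last updated. Below is a minimal userspace sketch of the dequeue-side arithmetic only, handy for sanity-checking the numbers; the function name, standalone constants and sample values are illustrative and are not part of the kernel sources.

/*
 * Sketch of the pacing-delay computation done in fq_dequeue() above:
 * delay = max(pkt_len, quantum) * NSEC_PER_SEC / rate, clamped to 125 ms.
 * Userspace illustration only, not kernel code.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC  1000000000ULL
#define NSEC_PER_MSEC 1000000ULL

static uint64_t pacing_delay_ns(uint32_t pkt_len, uint32_t quantum,
				uint32_t rate_bytes_per_sec)
{
	uint32_t plen = pkt_len > quantum ? pkt_len : quantum;
	uint64_t len = (uint64_t)plen * NSEC_PER_SEC;

	if (rate_bytes_per_sec)
		len /= rate_bytes_per_sec;

	/* socket rate can change later: clamp the delay to 125 ms */
	if (len > 125 * NSEC_PER_MSEC)
		len = 125 * NSEC_PER_MSEC;

	return len;
}

int main(void)
{
	/* 1514-byte frame, 3028-byte quantum, 1 MB/s pacing rate */
	printf("delay = %llu ns\n",
	       (unsigned long long)pacing_delay_ns(1514, 3028, 1000000));
	return 0;
}

With these inputs the frame is held for roughly 3 ms: taking max() against the quantum spaces small frames as if each carried at least one quantum of bytes instead of releasing them nearly back to back.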
