Diffstat (limited to 'net/sched')
46 files changed, 2959 insertions, 1440 deletions
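Most of the act_* changes below rework the tc_action hash infrastructure: each action module's static hash table, idx_gen counter and rwlock are replaced by a per-ops tcf_hashinfo (hlist heads plus a spinlock) allocated in tcf_register_action(), actions are chained with a list_head instead of an open-coded ->next pointer, and the common create/bind/release logic moves into act_api.c (tcf_hash_check/create/release/insert/cleanup). The other files add the sch_hhf and sch_pie qdiscs and the cls_bpf classifier. The sketch below shows the create-or-bind flow that the converted modules (act_csum, act_gact, act_nat, ...) now share against the new signatures; the "foo" names and the param field are hypothetical illustrations, not part of the patch.

/* Minimal sketch of an action's locate step under the reworked API.
 * Assumes a private struct that embeds struct tcf_common as its first
 * member, as the in-tree actions do; nlattr parsing is omitted.
 */
#include <net/act_api.h>

struct tcf_foo {
	struct tcf_common common;	/* must be the first member */
	u32 param;			/* hypothetical module-specific state */
};

static int foo_locate(u32 index, struct nlattr *est, struct tc_action *a,
		      int ovr, int bind)
{
	int ret = 0;

	if (!tcf_hash_check(index, a, bind)) {
		/* no more per-module idx_gen/hashinfo arguments */
		ret = tcf_hash_create(index, est, a, sizeof(struct tcf_foo), bind);
		if (ret)
			return ret;
		ret = ACT_P_CREATED;
	} else {
		if (bind)	/* binding to an existing action: keep defaults */
			return 0;
		tcf_hash_release(a, bind);
		if (!ovr)
			return -EEXIST;
	}

	/* module-specific parameter updates go here, under the action's lock */

	if (ret == ACT_P_CREATED)
		tcf_hash_insert(a);
	return ret;
}

Registration changes accordingly: a module's init now calls tcf_register_action(&act_foo_ops, FOO_TAB_MASK), which allocates and initializes the per-ops tcf_hashinfo, supplies default .lookup and .walk callbacks, and rejects ops that do not provide .act, .dump and .init; the per-module cleanup callbacks shrink to freeing module-specific state, with the generic release path in act_api.c driving a->ops->cleanup.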
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index c03a32a0418..a1a8e29e5fc 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -286,6 +286,28 @@ config NET_SCH_FQ  	  If unsure, say N. +config NET_SCH_HHF +	tristate "Heavy-Hitter Filter (HHF)" +	help +	  Say Y here if you want to use the Heavy-Hitter Filter (HHF) +	  packet scheduling algorithm. + +	  To compile this driver as a module, choose M here: the module +	  will be called sch_hhf. + +config NET_SCH_PIE +	tristate "Proportional Integral controller Enhanced (PIE) scheduler" +	help +	  Say Y here if you want to use the Proportional Integral controller +	  Enhanced scheduler packet scheduling algorithm. +	  For more information, please see +	  http://tools.ietf.org/html/draft-pan-tsvwg-pie-00 + +	  To compile this driver as a module, choose M here: the module +	  will be called sch_pie. + +	  If unsure, say N. +  config NET_SCH_INGRESS  	tristate "Ingress Qdisc"  	depends on NET_CLS_ACT @@ -435,6 +457,7 @@ config NET_CLS_FLOW  config NET_CLS_CGROUP  	tristate "Control Group Classifier"  	select NET_CLS +	select CGROUP_NET_CLASSID  	depends on CGROUPS  	---help---  	  Say Y here if you want to classify packets based on the control @@ -443,6 +466,16 @@ config NET_CLS_CGROUP  	  To compile this code as a module, choose M here: the  	  module will be called cls_cgroup. +config NET_CLS_BPF +	tristate "BPF-based classifier" +	select NET_CLS +	---help--- +	  If you say Y here, you will be able to classify packets based on +	  programmable BPF (JIT'ed) filters as an alternative to ematches. + +	  To compile this code as a module, choose M here: the module will +	  be called cls_bpf. +  config NET_EMATCH  	bool "Extended Matches"  	select NET_CLS diff --git a/net/sched/Makefile b/net/sched/Makefile index e5f9abe9a5d..0a869a11f3e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -40,6 +40,8 @@ obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o  obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o  obj-$(CONFIG_NET_SCH_FQ_CODEL)	+= sch_fq_codel.o  obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o +obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o +obj-$(CONFIG_NET_SCH_PIE)	+= sch_pie.o  obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o  obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o @@ -50,6 +52,7 @@ obj-$(CONFIG_NET_CLS_RSVP6)	+= cls_rsvp6.o  obj-$(CONFIG_NET_CLS_BASIC)	+= cls_basic.o  obj-$(CONFIG_NET_CLS_FLOW)	+= cls_flow.o  obj-$(CONFIG_NET_CLS_CGROUP)	+= cls_cgroup.o +obj-$(CONFIG_NET_CLS_BPF)	+= cls_bpf.o  obj-$(CONFIG_NET_EMATCH)	+= ematch.o  obj-$(CONFIG_NET_EMATCH_CMP)	+= em_cmp.o  obj-$(CONFIG_NET_EMATCH_NBYTE)	+= em_nbyte.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index fd7072827a4..648778aef1a 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -27,42 +27,40 @@  #include <net/act_api.h>  #include <net/netlink.h> -void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo) +void tcf_hash_destroy(struct tc_action *a)  { -	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); -	struct tcf_common **p1p; - -	for (p1p = &hinfo->htab[h]; *p1p; p1p = &(*p1p)->tcfc_next) { -		if (*p1p == p) { -			write_lock_bh(hinfo->lock); -			*p1p = p->tcfc_next; -			write_unlock_bh(hinfo->lock); -			gen_kill_estimator(&p->tcfc_bstats, -					   &p->tcfc_rate_est); -			/* -			 * gen_estimator est_timer() might access p->tcfc_lock -			 * or bstats, wait a RCU grace period before freeing p -			 */ -			kfree_rcu(p, tcfc_rcu); -			return; -		} -	} -	WARN_ON(1); +	struct tcf_common *p = a->priv; +	struct tcf_hashinfo *hinfo = a->ops->hinfo; + +	
spin_lock_bh(&hinfo->lock); +	hlist_del(&p->tcfc_head); +	spin_unlock_bh(&hinfo->lock); +	gen_kill_estimator(&p->tcfc_bstats, +			   &p->tcfc_rate_est); +	/* +	 * gen_estimator est_timer() might access p->tcfc_lock +	 * or bstats, wait a RCU grace period before freeing p +	 */ +	kfree_rcu(p, tcfc_rcu);  }  EXPORT_SYMBOL(tcf_hash_destroy); -int tcf_hash_release(struct tcf_common *p, int bind, -		     struct tcf_hashinfo *hinfo) +int tcf_hash_release(struct tc_action *a, int bind)  { +	struct tcf_common *p = a->priv;  	int ret = 0;  	if (p) {  		if (bind)  			p->tcfc_bindcnt--; +		else if (p->tcfc_bindcnt > 0) +			return -EPERM;  		p->tcfc_refcnt--;  		if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) { -			tcf_hash_destroy(p, hinfo); +			if (a->ops->cleanup) +				a->ops->cleanup(a, bind); +			tcf_hash_destroy(a);  			ret = 1;  		}  	} @@ -71,20 +69,22 @@ int tcf_hash_release(struct tcf_common *p, int bind,  EXPORT_SYMBOL(tcf_hash_release);  static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, -			   struct tc_action *a, struct tcf_hashinfo *hinfo) +			   struct tc_action *a)  { +	struct tcf_hashinfo *hinfo = a->ops->hinfo; +	struct hlist_head *head;  	struct tcf_common *p;  	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;  	struct nlattr *nest; -	read_lock_bh(hinfo->lock); +	spin_lock_bh(&hinfo->lock);  	s_i = cb->args[0];  	for (i = 0; i < (hinfo->hmask + 1); i++) { -		p = hinfo->htab[tcf_hash(i, hinfo->hmask)]; +		head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; -		for (; p; p = p->tcfc_next) { +		hlist_for_each_entry_rcu(p, head, tcfc_head) {  			index++;  			if (index < s_i)  				continue; @@ -107,7 +107,7 @@ static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,  		}  	}  done: -	read_unlock_bh(hinfo->lock); +	spin_unlock_bh(&hinfo->lock);  	if (n_i)  		cb->args[0] += n_i;  	return n_i; @@ -117,12 +117,15 @@ nla_put_failure:  	goto done;  } -static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a, -			  struct tcf_hashinfo *hinfo) +static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a)  { -	struct tcf_common *p, *s_p; +	struct tcf_hashinfo *hinfo = a->ops->hinfo; +	struct hlist_head *head; +	struct hlist_node *n; +	struct tcf_common *p;  	struct nlattr *nest;  	int i = 0, n_i = 0; +	int ret = -EINVAL;  	nest = nla_nest_start(skb, a->order);  	if (nest == NULL) @@ -130,14 +133,15 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a,  	if (nla_put_string(skb, TCA_KIND, a->ops->kind))  		goto nla_put_failure;  	for (i = 0; i < (hinfo->hmask + 1); i++) { -		p = hinfo->htab[tcf_hash(i, hinfo->hmask)]; - -		while (p != NULL) { -			s_p = p->tcfc_next; -			if (ACT_P_DELETED == tcf_hash_release(p, 0, hinfo)) +		head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; +		hlist_for_each_entry_safe(p, n, head, tcfc_head) { +			a->priv = p; +			ret = tcf_hash_release(a, 0); +			if (ret == ACT_P_DELETED) {  				module_put(a->ops->owner); -			n_i++; -			p = s_p; +				n_i++; +			} else if (ret < 0) +				goto nla_put_failure;  		}  	}  	if (nla_put_u32(skb, TCA_FCNT, n_i)) @@ -147,51 +151,48 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a,  	return n_i;  nla_put_failure:  	nla_nest_cancel(skb, nest); -	return -EINVAL; +	return ret;  } -int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, -		       int type, struct tc_action *a) +static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, +			      int type, struct tc_action *a)  { -	struct tcf_hashinfo *hinfo = 
a->ops->hinfo; -  	if (type == RTM_DELACTION) { -		return tcf_del_walker(skb, a, hinfo); +		return tcf_del_walker(skb, a);  	} else if (type == RTM_GETACTION) { -		return tcf_dump_walker(skb, cb, a, hinfo); +		return tcf_dump_walker(skb, cb, a);  	} else {  		WARN(1, "tcf_generic_walker: unknown action %d\n", type);  		return -EINVAL;  	}  } -EXPORT_SYMBOL(tcf_generic_walker); -struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) +static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)  { -	struct tcf_common *p; +	struct tcf_common *p = NULL; +	struct hlist_head *head; -	read_lock_bh(hinfo->lock); -	for (p = hinfo->htab[tcf_hash(index, hinfo->hmask)]; p; -	     p = p->tcfc_next) { +	spin_lock_bh(&hinfo->lock); +	head = &hinfo->htab[tcf_hash(index, hinfo->hmask)]; +	hlist_for_each_entry_rcu(p, head, tcfc_head)  		if (p->tcfc_index == index)  			break; -	} -	read_unlock_bh(hinfo->lock); +	spin_unlock_bh(&hinfo->lock);  	return p;  } -EXPORT_SYMBOL(tcf_hash_lookup); -u32 tcf_hash_new_index(u32 *idx_gen, struct tcf_hashinfo *hinfo) +u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo)  { -	u32 val = *idx_gen; +	u32 val = hinfo->index;  	do {  		if (++val == 0)  			val = 1;  	} while (tcf_hash_lookup(val, hinfo)); -	return (*idx_gen = val); +	hinfo->index = val; +	return val;  }  EXPORT_SYMBOL(tcf_hash_new_index); @@ -208,34 +209,46 @@ int tcf_hash_search(struct tc_action *a, u32 index)  }  EXPORT_SYMBOL(tcf_hash_search); -struct tcf_common *tcf_hash_check(u32 index, struct tc_action *a, int bind, -				  struct tcf_hashinfo *hinfo) +int tcf_hash_check(u32 index, struct tc_action *a, int bind)  { +	struct tcf_hashinfo *hinfo = a->ops->hinfo;  	struct tcf_common *p = NULL;  	if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) {  		if (bind)  			p->tcfc_bindcnt++;  		p->tcfc_refcnt++;  		a->priv = p; +		return 1;  	} -	return p; +	return 0;  }  EXPORT_SYMBOL(tcf_hash_check); -struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est, -				   struct tc_action *a, int size, int bind, -				   u32 *idx_gen, struct tcf_hashinfo *hinfo) +void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)  { +	struct tcf_common *pc = a->priv; +	if (est) +		gen_kill_estimator(&pc->tcfc_bstats, +				   &pc->tcfc_rate_est); +	kfree_rcu(pc, tcfc_rcu); +} +EXPORT_SYMBOL(tcf_hash_cleanup); + +int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, +		    int size, int bind) +{ +	struct tcf_hashinfo *hinfo = a->ops->hinfo;  	struct tcf_common *p = kzalloc(size, GFP_KERNEL);  	if (unlikely(!p)) -		return ERR_PTR(-ENOMEM); +		return -ENOMEM;  	p->tcfc_refcnt = 1;  	if (bind)  		p->tcfc_bindcnt = 1;  	spin_lock_init(&p->tcfc_lock); -	p->tcfc_index = index ? index : tcf_hash_new_index(idx_gen, hinfo); +	INIT_HLIST_NODE(&p->tcfc_head); +	p->tcfc_index = index ? 
index : tcf_hash_new_index(hinfo);  	p->tcfc_tm.install = jiffies;  	p->tcfc_tm.lastuse = jiffies;  	if (est) { @@ -243,42 +256,64 @@ struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est,  					    &p->tcfc_lock, est);  		if (err) {  			kfree(p); -			return ERR_PTR(err); +			return err;  		}  	}  	a->priv = (void *) p; -	return p; +	return 0;  }  EXPORT_SYMBOL(tcf_hash_create); -void tcf_hash_insert(struct tcf_common *p, struct tcf_hashinfo *hinfo) +void tcf_hash_insert(struct tc_action *a)  { +	struct tcf_common *p = a->priv; +	struct tcf_hashinfo *hinfo = a->ops->hinfo;  	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); -	write_lock_bh(hinfo->lock); -	p->tcfc_next = hinfo->htab[h]; -	hinfo->htab[h] = p; -	write_unlock_bh(hinfo->lock); +	spin_lock_bh(&hinfo->lock); +	hlist_add_head(&p->tcfc_head, &hinfo->htab[h]); +	spin_unlock_bh(&hinfo->lock);  }  EXPORT_SYMBOL(tcf_hash_insert); -static struct tc_action_ops *act_base = NULL; +static LIST_HEAD(act_base);  static DEFINE_RWLOCK(act_mod_lock); -int tcf_register_action(struct tc_action_ops *act) +int tcf_register_action(struct tc_action_ops *act, unsigned int mask)  { -	struct tc_action_ops *a, **ap; +	struct tc_action_ops *a; +	int err; + +	/* Must supply act, dump and init */ +	if (!act->act || !act->dump || !act->init) +		return -EINVAL; + +	/* Supply defaults */ +	if (!act->lookup) +		act->lookup = tcf_hash_search; +	if (!act->walk) +		act->walk = tcf_generic_walker; + +	act->hinfo = kmalloc(sizeof(struct tcf_hashinfo), GFP_KERNEL); +	if (!act->hinfo) +		return -ENOMEM; +	err = tcf_hashinfo_init(act->hinfo, mask); +	if (err) { +		kfree(act->hinfo); +		return err; +	}  	write_lock(&act_mod_lock); -	for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) { +	list_for_each_entry(a, &act_base, head) {  		if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {  			write_unlock(&act_mod_lock); +			tcf_hashinfo_destroy(act->hinfo); +			kfree(act->hinfo);  			return -EEXIST;  		}  	} -	act->next = NULL; -	*ap = act; +	list_add_tail(&act->head, &act_base);  	write_unlock(&act_mod_lock);  	return 0;  } @@ -286,17 +321,18 @@ EXPORT_SYMBOL(tcf_register_action);  int tcf_unregister_action(struct tc_action_ops *act)  { -	struct tc_action_ops *a, **ap; +	struct tc_action_ops *a;  	int err = -ENOENT;  	write_lock(&act_mod_lock); -	for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) -		if (a == act) +	list_for_each_entry(a, &act_base, head) { +		if (a == act) { +			list_del(&act->head); +			tcf_hashinfo_destroy(act->hinfo); +			kfree(act->hinfo); +			err = 0;  			break; -	if (a) { -		*ap = a->next; -		a->next = NULL; -		err = 0; +		}  	}  	write_unlock(&act_mod_lock);  	return err; @@ -306,69 +342,42 @@ EXPORT_SYMBOL(tcf_unregister_action);  /* lookup by name */  static struct tc_action_ops *tc_lookup_action_n(char *kind)  { -	struct tc_action_ops *a = NULL; +	struct tc_action_ops *a, *res = NULL;  	if (kind) {  		read_lock(&act_mod_lock); -		for (a = act_base; a; a = a->next) { +		list_for_each_entry(a, &act_base, head) {  			if (strcmp(kind, a->kind) == 0) { -				if (!try_module_get(a->owner)) { -					read_unlock(&act_mod_lock); -					return NULL; -				} +				if (try_module_get(a->owner)) +					res = a;  				break;  			}  		}  		read_unlock(&act_mod_lock);  	} -	return a; +	return res;  }  /* lookup by nlattr */  static struct tc_action_ops *tc_lookup_action(struct nlattr *kind)  { -	struct tc_action_ops *a = NULL; +	struct tc_action_ops *a, *res = NULL;  	if (kind) {  		read_lock(&act_mod_lock); -		for (a = act_base; a; a 
= a->next) { +		list_for_each_entry(a, &act_base, head) {  			if (nla_strcmp(kind, a->kind) == 0) { -				if (!try_module_get(a->owner)) { -					read_unlock(&act_mod_lock); -					return NULL; -				} +				if (try_module_get(a->owner)) +					res = a;  				break;  			}  		}  		read_unlock(&act_mod_lock);  	} -	return a; +	return res;  } -#if 0 -/* lookup by id */ -static struct tc_action_ops *tc_lookup_action_id(u32 type) -{ -	struct tc_action_ops *a = NULL; - -	if (type) { -		read_lock(&act_mod_lock); -		for (a = act_base; a; a = a->next) { -			if (a->type == type) { -				if (!try_module_get(a->owner)) { -					read_unlock(&act_mod_lock); -					return NULL; -				} -				break; -			} -		} -		read_unlock(&act_mod_lock); -	} -	return a; -} -#endif - -int tcf_action_exec(struct sk_buff *skb, const struct tc_action *act, +int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions,  		    struct tcf_result *res)  {  	const struct tc_action *a; @@ -379,53 +388,44 @@ int tcf_action_exec(struct sk_buff *skb, const struct tc_action *act,  		ret = TC_ACT_OK;  		goto exec_done;  	} -	while ((a = act) != NULL) { +	list_for_each_entry(a, actions, list) {  repeat: -		if (a->ops && a->ops->act) { -			ret = a->ops->act(skb, a, res); -			if (TC_MUNGED & skb->tc_verd) { -				/* copied already, allow trampling */ -				skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); -				skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); -			} -			if (ret == TC_ACT_REPEAT) -				goto repeat;	/* we need a ttl - JHS */ -			if (ret != TC_ACT_PIPE) -				goto exec_done; +		ret = a->ops->act(skb, a, res); +		if (TC_MUNGED & skb->tc_verd) { +			/* copied already, allow trampling */ +			skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); +			skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd);  		} -		act = a->next; +		if (ret == TC_ACT_REPEAT) +			goto repeat;	/* we need a ttl - JHS */ +		if (ret != TC_ACT_PIPE) +			goto exec_done;  	}  exec_done:  	return ret;  }  EXPORT_SYMBOL(tcf_action_exec); -void tcf_action_destroy(struct tc_action *act, int bind) +int tcf_action_destroy(struct list_head *actions, int bind)  { -	struct tc_action *a; +	struct tc_action *a, *tmp; +	int ret = 0; -	for (a = act; a; a = act) { -		if (a->ops && a->ops->cleanup) { -			if (a->ops->cleanup(a, bind) == ACT_P_DELETED) -				module_put(a->ops->owner); -			act = act->next; -			kfree(a); -		} else { -			/*FIXME: Remove later - catch insertion bugs*/ -			WARN(1, "tcf_action_destroy: BUG? 
destroying NULL ops\n"); -			act = act->next; -			kfree(a); -		} +	list_for_each_entry_safe(a, tmp, actions, list) { +		ret = tcf_hash_release(a, bind); +		if (ret == ACT_P_DELETED) +			module_put(a->ops->owner); +		else if (ret < 0) +			return ret; +		list_del(&a->list); +		kfree(a);  	} +	return ret;  }  int  tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)  { -	int err = -EINVAL; - -	if (a->ops == NULL || a->ops->dump == NULL) -		return err;  	return a->ops->dump(skb, a, bind, ref);  } @@ -436,9 +436,6 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)  	unsigned char *b = skb_tail_pointer(skb);  	struct nlattr *nest; -	if (a->ops == NULL || a->ops->dump == NULL) -		return err; -  	if (nla_put_string(skb, TCA_KIND, a->ops->kind))  		goto nla_put_failure;  	if (tcf_action_copy_stats(skb, a, 0)) @@ -459,14 +456,13 @@ nla_put_failure:  EXPORT_SYMBOL(tcf_action_dump_1);  int -tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) +tcf_action_dump(struct sk_buff *skb, struct list_head *actions, int bind, int ref)  {  	struct tc_action *a;  	int err = -EINVAL;  	struct nlattr *nest; -	while ((a = act) != NULL) { -		act = a->next; +	list_for_each_entry(a, actions, list) {  		nest = nla_nest_start(skb, a->order);  		if (nest == NULL)  			goto nla_put_failure; @@ -541,6 +537,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,  	if (a == NULL)  		goto err_mod; +	a->ops = a_o; +	INIT_LIST_HEAD(&a->list);  	/* backward compatibility for policer */  	if (name == NULL)  		err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, a, ovr, bind); @@ -555,7 +553,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,  	 */  	if (err != ACT_P_CREATED)  		module_put(a_o->owner); -	a->ops = a_o;  	return a; @@ -567,37 +564,33 @@ err_out:  	return ERR_PTR(err);  } -struct tc_action *tcf_action_init(struct net *net, struct nlattr *nla, +int tcf_action_init(struct net *net, struct nlattr *nla,  				  struct nlattr *est, char *name, int ovr, -				  int bind) +				  int bind, struct list_head *actions)  {  	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; -	struct tc_action *head = NULL, *act, *act_prev = NULL; +	struct tc_action *act;  	int err;  	int i;  	err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL);  	if (err < 0) -		return ERR_PTR(err); +		return err;  	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {  		act = tcf_action_init_1(net, tb[i], est, name, ovr, bind); -		if (IS_ERR(act)) +		if (IS_ERR(act)) { +			err = PTR_ERR(act);  			goto err; +		}  		act->order = i; - -		if (head == NULL) -			head = act; -		else -			act_prev->next = act; -		act_prev = act; +		list_add_tail(&act->list, actions);  	} -	return head; +	return 0;  err: -	if (head != NULL) -		tcf_action_destroy(head, bind); -	return act; +	tcf_action_destroy(actions, bind); +	return err;  }  int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, @@ -605,9 +598,9 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,  {  	int err = 0;  	struct gnet_dump d; -	struct tcf_act_hdr *h = a->priv; +	struct tcf_common *p = a->priv; -	if (h == NULL) +	if (p == NULL)  		goto errout;  	/* compat_mode being true specifies a call that is supposed @@ -616,24 +609,20 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,  	if (compat_mode) {  		if (a->type == TCA_OLD_COMPAT)  			err = gnet_stats_start_copy_compat(skb, 0, -				TCA_STATS, TCA_XSTATS, &h->tcf_lock, &d); +				TCA_STATS, TCA_XSTATS, 
&p->tcfc_lock, &d);  		else  			return 0;  	} else  		err = gnet_stats_start_copy(skb, TCA_ACT_STATS, -					    &h->tcf_lock, &d); +					    &p->tcfc_lock, &d);  	if (err < 0)  		goto errout; -	if (a->ops != NULL && a->ops->get_stats != NULL) -		if (a->ops->get_stats(skb, a) < 0) -			goto errout; - -	if (gnet_stats_copy_basic(&d, &h->tcf_bstats) < 0 || -	    gnet_stats_copy_rate_est(&d, &h->tcf_bstats, -				     &h->tcf_rate_est) < 0 || -	    gnet_stats_copy_queue(&d, &h->tcf_qstats) < 0) +	if (gnet_stats_copy_basic(&d, &p->tcfc_bstats) < 0 || +	    gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, +				     &p->tcfc_rate_est) < 0 || +	    gnet_stats_copy_queue(&d, &p->tcfc_qstats) < 0)  		goto errout;  	if (gnet_stats_finish_copy(&d) < 0) @@ -646,7 +635,7 @@ errout:  }  static int -tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 portid, u32 seq, +tca_get_fill(struct sk_buff *skb, struct list_head *actions, u32 portid, u32 seq,  	     u16 flags, int event, int bind, int ref)  {  	struct tcamsg *t; @@ -666,7 +655,7 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 portid, u32 seq,  	if (nest == NULL)  		goto out_nlmsg_trim; -	if (tcf_action_dump(skb, a, bind, ref) < 0) +	if (tcf_action_dump(skb, actions, bind, ref) < 0)  		goto out_nlmsg_trim;  	nla_nest_end(skb, nest); @@ -681,14 +670,14 @@ out_nlmsg_trim:  static int  act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, -	       struct tc_action *a, int event) +	       struct list_head *actions, int event)  {  	struct sk_buff *skb;  	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);  	if (!skb)  		return -ENOBUFS; -	if (tca_get_fill(skb, a, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { +	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) {  		kfree_skb(skb);  		return -EINVAL;  	} @@ -696,6 +685,20 @@ act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,  	return rtnl_unicast(skb, net, portid);  } +static struct tc_action *create_a(int i) +{ +	struct tc_action *act; + +	act = kzalloc(sizeof(*act), GFP_KERNEL); +	if (act == NULL) { +		pr_debug("create_a: failed to alloc!\n"); +		return NULL; +	} +	act->order = i; +	INIT_LIST_HEAD(&act->list); +	return act; +} +  static struct tc_action *  tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid)  { @@ -715,16 +718,14 @@ tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid)  	index = nla_get_u32(tb[TCA_ACT_INDEX]);  	err = -ENOMEM; -	a = kzalloc(sizeof(struct tc_action), GFP_KERNEL); +	a = create_a(0);  	if (a == NULL)  		goto err_out;  	err = -EINVAL;  	a->ops = tc_lookup_action(tb[TCA_ACT_KIND]); -	if (a->ops == NULL) +	if (a->ops == NULL) /* could happen in batch of actions */  		goto err_free; -	if (a->ops->lookup == NULL) -		goto err_mod;  	err = -ENOENT;  	if (a->ops->lookup(a, index) == 0)  		goto err_mod; @@ -740,29 +741,16 @@ err_out:  	return ERR_PTR(err);  } -static void cleanup_a(struct tc_action *act) +static void cleanup_a(struct list_head *actions)  { -	struct tc_action *a; +	struct tc_action *a, *tmp; -	for (a = act; a; a = act) { -		act = a->next; +	list_for_each_entry_safe(a, tmp, actions, list) { +		list_del(&a->list);  		kfree(a);  	}  } -static struct tc_action *create_a(int i) -{ -	struct tc_action *act; - -	act = kzalloc(sizeof(*act), GFP_KERNEL); -	if (act == NULL) { -		pr_debug("create_a: failed to alloc!\n"); -		return NULL; -	} -	act->order = i; -	return act; -} -  static int tca_action_flush(struct net *net, struct nlattr *nla,  			    struct nlmsghdr *n, u32 portid)  { @@ 
-774,18 +762,12 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,  	struct nlattr *nest;  	struct nlattr *tb[TCA_ACT_MAX + 1];  	struct nlattr *kind; -	struct tc_action *a = create_a(0); +	struct tc_action a;  	int err = -ENOMEM; -	if (a == NULL) { -		pr_debug("tca_action_flush: couldnt create tc_action\n"); -		return err; -	} -  	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);  	if (!skb) {  		pr_debug("tca_action_flush: failed skb alloc\n"); -		kfree(a);  		return err;  	} @@ -797,8 +779,10 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,  	err = -EINVAL;  	kind = tb[TCA_ACT_KIND]; -	a->ops = tc_lookup_action(kind); -	if (a->ops == NULL) +	memset(&a, 0, sizeof(struct tc_action)); +	INIT_LIST_HEAD(&a.list); +	a.ops = tc_lookup_action(kind); +	if (a.ops == NULL) /*some idjot trying to flush unknown action */  		goto err_out;  	nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0); @@ -813,7 +797,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,  	if (nest == NULL)  		goto out_module_put; -	err = a->ops->walk(skb, &dcb, RTM_DELACTION, a); +	err = a.ops->walk(skb, &dcb, RTM_DELACTION, &a);  	if (err < 0)  		goto out_module_put;  	if (err == 0) @@ -823,8 +807,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,  	nlh->nlmsg_len = skb_tail_pointer(skb) - b;  	nlh->nlmsg_flags |= NLM_F_ROOT; -	module_put(a->ops->owner); -	kfree(a); +	module_put(a.ops->owner);  	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,  			     n->nlmsg_flags & NLM_F_ECHO);  	if (err > 0) @@ -833,21 +816,52 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,  	return err;  out_module_put: -	module_put(a->ops->owner); +	module_put(a.ops->owner);  err_out:  noflush_out:  	kfree_skb(skb); -	kfree(a);  	return err;  }  static int +tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, +	       u32 portid) +{ +	int ret; +	struct sk_buff *skb; + +	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); +	if (!skb) +		return -ENOBUFS; + +	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, +			 0, 1) <= 0) { +		kfree_skb(skb); +		return -EINVAL; +	} + +	/* now do the delete */ +	ret = tcf_action_destroy(actions, 0); +	if (ret < 0) { +		kfree_skb(skb); +		return ret; +	} + +	ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC, +			     n->nlmsg_flags & NLM_F_ECHO); +	if (ret > 0) +		return 0; +	return ret; +} + +static int  tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,  	      u32 portid, int event)  {  	int i, ret;  	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; -	struct tc_action *head = NULL, *act, *act_prev = NULL; +	struct tc_action *act; +	LIST_HEAD(actions);  	ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL);  	if (ret < 0) @@ -867,117 +881,62 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,  			goto err;  		}  		act->order = i; - -		if (head == NULL) -			head = act; -		else -			act_prev->next = act; -		act_prev = act; +		list_add_tail(&act->list, &actions);  	}  	if (event == RTM_GETACTION) -		ret = act_get_notify(net, portid, n, head, event); +		ret = act_get_notify(net, portid, n, &actions, event);  	else { /* delete */ -		struct sk_buff *skb; - -		skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); -		if (!skb) { -			ret = -ENOBUFS; -			goto err; -		} - -		if (tca_get_fill(skb, head, portid, n->nlmsg_seq, 0, event, -				 0, 1) <= 0) { -			kfree_skb(skb); -			ret = -EINVAL; +		ret = tcf_del_notify(net, n, &actions, portid); +		if (ret)  			
goto err; -		} - -		/* now do the delete */ -		tcf_action_destroy(head, 0); -		ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC, -				     n->nlmsg_flags & NLM_F_ECHO); -		if (ret > 0) -			return 0;  		return ret;  	}  err: -	cleanup_a(head); +	cleanup_a(&actions);  	return ret;  } -static int tcf_add_notify(struct net *net, struct tc_action *a, -			  u32 portid, u32 seq, int event, u16 flags) +static int +tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, +	       u32 portid)  { -	struct tcamsg *t; -	struct nlmsghdr *nlh;  	struct sk_buff *skb; -	struct nlattr *nest; -	unsigned char *b;  	int err = 0;  	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);  	if (!skb)  		return -ENOBUFS; -	b = skb_tail_pointer(skb); - -	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*t), flags); -	if (!nlh) -		goto out_kfree_skb; -	t = nlmsg_data(nlh); -	t->tca_family = AF_UNSPEC; -	t->tca__pad1 = 0; -	t->tca__pad2 = 0; - -	nest = nla_nest_start(skb, TCA_ACT_TAB); -	if (nest == NULL) -		goto out_kfree_skb; - -	if (tcf_action_dump(skb, a, 0, 0) < 0) -		goto out_kfree_skb; - -	nla_nest_end(skb, nest); - -	nlh->nlmsg_len = skb_tail_pointer(skb) - b; -	NETLINK_CB(skb).dst_group = RTNLGRP_TC; +	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, +			 RTM_NEWACTION, 0, 0) <= 0) { +		kfree_skb(skb); +		return -EINVAL; +	} -	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); +	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, +			     n->nlmsg_flags & NLM_F_ECHO);  	if (err > 0)  		err = 0;  	return err; - -out_kfree_skb: -	kfree_skb(skb); -	return -1;  } -  static int  tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n,  	       u32 portid, int ovr)  {  	int ret = 0; -	struct tc_action *act; -	struct tc_action *a; -	u32 seq = n->nlmsg_seq; +	LIST_HEAD(actions); -	act = tcf_action_init(net, nla, NULL, NULL, ovr, 0); -	if (act == NULL) -		goto done; -	if (IS_ERR(act)) { -		ret = PTR_ERR(act); +	ret = tcf_action_init(net, nla, NULL, NULL, ovr, 0, &actions); +	if (ret)  		goto done; -	}  	/* dump then free all the actions after update; inserted policy  	 * stays intact  	 */ -	ret = tcf_add_notify(net, act, portid, seq, RTM_NEWACTION, n->nlmsg_flags); -	for (a = act; a; a = act) { -		act = a->next; -		kfree(a); -	} +	ret = tcf_add_notify(net, n, &actions, portid); +	cleanup_a(&actions);  done:  	return ret;  } @@ -989,7 +948,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n)  	u32 portid = skb ? 
NETLINK_CB(skb).portid : 0;  	int ret = 0, ovr = 0; -	if ((n->nlmsg_type != RTM_GETACTION) && !capable(CAP_NET_ADMIN)) +	if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN))  		return -EPERM;  	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL); @@ -1084,12 +1043,6 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)  	memset(&a, 0, sizeof(struct tc_action));  	a.ops = a_o; -	if (a_o->walk == NULL) { -		WARN(1, "tc_dump_action: %s !capable of dumping table\n", -		     a_o->kind); -		goto out_module_put; -	} -  	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,  			cb->nlh->nlmsg_type, sizeof(*t), 0);  	if (!nlh) diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 3a4c0caa1f7..edbf40dac70 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -37,15 +37,6 @@  #include <net/tc_act/tc_csum.h>  #define CSUM_TAB_MASK 15 -static struct tcf_common *tcf_csum_ht[CSUM_TAB_MASK + 1]; -static u32 csum_idx_gen; -static DEFINE_RWLOCK(csum_lock); - -static struct tcf_hashinfo csum_hash_info = { -	.htab	= tcf_csum_ht, -	.hmask	= CSUM_TAB_MASK, -	.lock	= &csum_lock, -};  static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {  	[TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, @@ -56,7 +47,6 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,  {  	struct nlattr *tb[TCA_CSUM_MAX + 1];  	struct tc_csum *parm; -	struct tcf_common *pc;  	struct tcf_csum *p;  	int ret = 0, err; @@ -71,39 +61,31 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,  		return -EINVAL;  	parm = nla_data(tb[TCA_CSUM_PARMS]); -	pc = tcf_hash_check(parm->index, a, bind, &csum_hash_info); -	if (!pc) { -		pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, -				     &csum_idx_gen, &csum_hash_info); -		if (IS_ERR(pc)) -			return PTR_ERR(pc); -		p = to_tcf_csum(pc); +	if (!tcf_hash_check(parm->index, a, bind)) { +		ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); +		if (ret) +			return ret;  		ret = ACT_P_CREATED;  	} else { -		p = to_tcf_csum(pc); -		if (!ovr) { -			tcf_hash_release(pc, bind, &csum_hash_info); +		if (bind)/* dont override defaults */ +			return 0; +		tcf_hash_release(a, bind); +		if (!ovr)  			return -EEXIST; -		}  	} +	p = to_tcf_csum(a);  	spin_lock_bh(&p->tcf_lock);  	p->tcf_action = parm->action;  	p->update_flags = parm->update_flags;  	spin_unlock_bh(&p->tcf_lock);  	if (ret == ACT_P_CREATED) -		tcf_hash_insert(pc, &csum_hash_info); +		tcf_hash_insert(a);  	return ret;  } -static int tcf_csum_cleanup(struct tc_action *a, int bind) -{ -	struct tcf_csum *p = a->priv; -	return tcf_hash_release(&p->common, bind, &csum_hash_info); -} -  /**   * tcf_csum_skb_nextlayer - Get next layer pointer   * @skb: sk_buff to use @@ -578,16 +560,11 @@ nla_put_failure:  static struct tc_action_ops act_csum_ops = {  	.kind		= "csum", -	.hinfo		= &csum_hash_info,  	.type		= TCA_ACT_CSUM, -	.capab		= TCA_CAP_NONE,  	.owner		= THIS_MODULE,  	.act		= tcf_csum,  	.dump		= tcf_csum_dump, -	.cleanup	= tcf_csum_cleanup, -	.lookup		= tcf_hash_search,  	.init		= tcf_csum_init, -	.walk		= tcf_generic_walker  };  MODULE_DESCRIPTION("Checksum updating actions"); @@ -595,7 +572,7 @@ MODULE_LICENSE("GPL");  static int __init csum_init_module(void)  { -	return tcf_register_action(&act_csum_ops); +	return tcf_register_action(&act_csum_ops, CSUM_TAB_MASK);  }  static void __exit csum_cleanup_module(void) diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c 
index fd2b3cff5fa..d6bcbd9f779 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -24,20 +24,11 @@  #include <net/tc_act/tc_gact.h>  #define GACT_TAB_MASK	15 -static struct tcf_common *tcf_gact_ht[GACT_TAB_MASK + 1]; -static u32 gact_idx_gen; -static DEFINE_RWLOCK(gact_lock); - -static struct tcf_hashinfo gact_hash_info = { -	.htab	=	tcf_gact_ht, -	.hmask	=	GACT_TAB_MASK, -	.lock	=	&gact_lock, -};  #ifdef CONFIG_GACT_PROB  static int gact_net_rand(struct tcf_gact *gact)  { -	if (!gact->tcfg_pval || net_random() % gact->tcfg_pval) +	if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval)  		return gact->tcf_action;  	return gact->tcfg_paction;  } @@ -65,7 +56,6 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,  	struct nlattr *tb[TCA_GACT_MAX + 1];  	struct tc_gact *parm;  	struct tcf_gact *gact; -	struct tcf_common *pc;  	int ret = 0;  	int err;  #ifdef CONFIG_GACT_PROB @@ -94,21 +84,20 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,  	}  #endif -	pc = tcf_hash_check(parm->index, a, bind, &gact_hash_info); -	if (!pc) { -		pc = tcf_hash_create(parm->index, est, a, sizeof(*gact), -				     bind, &gact_idx_gen, &gact_hash_info); -		if (IS_ERR(pc)) -			return PTR_ERR(pc); +	if (!tcf_hash_check(parm->index, a, bind)) { +		ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind); +		if (ret) +			return ret;  		ret = ACT_P_CREATED;  	} else { -		if (!ovr) { -			tcf_hash_release(pc, bind, &gact_hash_info); +		if (bind)/* dont override defaults */ +			return 0; +		tcf_hash_release(a, bind); +		if (!ovr)  			return -EEXIST; -		}  	} -	gact = to_gact(pc); +	gact = to_gact(a);  	spin_lock_bh(&gact->tcf_lock);  	gact->tcf_action = parm->action; @@ -121,19 +110,10 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,  #endif  	spin_unlock_bh(&gact->tcf_lock);  	if (ret == ACT_P_CREATED) -		tcf_hash_insert(pc, &gact_hash_info); +		tcf_hash_insert(a);  	return ret;  } -static int tcf_gact_cleanup(struct tc_action *a, int bind) -{ -	struct tcf_gact *gact = a->priv; - -	if (gact) -		return tcf_hash_release(&gact->common, bind, &gact_hash_info); -	return 0; -} -  static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,  		    struct tcf_result *res)  { @@ -199,16 +179,11 @@ nla_put_failure:  static struct tc_action_ops act_gact_ops = {  	.kind		=	"gact", -	.hinfo		=	&gact_hash_info,  	.type		=	TCA_ACT_GACT, -	.capab		=	TCA_CAP_NONE,  	.owner		=	THIS_MODULE,  	.act		=	tcf_gact,  	.dump		=	tcf_gact_dump, -	.cleanup	=	tcf_gact_cleanup, -	.lookup		=	tcf_hash_search,  	.init		=	tcf_gact_init, -	.walk		=	tcf_generic_walker  };  MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); @@ -222,7 +197,7 @@ static int __init gact_init_module(void)  #else  	pr_info("GACT probability NOT on\n");  #endif -	return tcf_register_action(&act_gact_ops); +	return tcf_register_action(&act_gact_ops, GACT_TAB_MASK);  }  static void __exit gact_cleanup_module(void) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 60d88b6b956..8a64a0734ae 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -29,15 +29,6 @@  #define IPT_TAB_MASK     15 -static struct tcf_common *tcf_ipt_ht[IPT_TAB_MASK + 1]; -static u32 ipt_idx_gen; -static DEFINE_RWLOCK(ipt_lock); - -static struct tcf_hashinfo ipt_hash_info = { -	.htab	=	tcf_ipt_ht, -	.hmask	=	IPT_TAB_MASK, -	.lock	=	&ipt_lock, -};  static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)  { @@ -77,22 +68,12 @@ static void ipt_destroy_target(struct xt_entry_target *t)  	
module_put(par.target->me);  } -static int tcf_ipt_release(struct tcf_ipt *ipt, int bind) +static void tcf_ipt_release(struct tc_action *a, int bind)  { -	int ret = 0; -	if (ipt) { -		if (bind) -			ipt->tcf_bindcnt--; -		ipt->tcf_refcnt--; -		if (ipt->tcf_bindcnt <= 0 && ipt->tcf_refcnt <= 0) { -			ipt_destroy_target(ipt->tcfi_t); -			kfree(ipt->tcfi_tname); -			kfree(ipt->tcfi_t); -			tcf_hash_destroy(&ipt->common, &ipt_hash_info); -			ret = ACT_P_DELETED; -		} -	} -	return ret; +	struct tcf_ipt *ipt = to_ipt(a); +	ipt_destroy_target(ipt->tcfi_t); +	kfree(ipt->tcfi_tname); +	kfree(ipt->tcfi_t);  }  static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { @@ -107,7 +88,6 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,  {  	struct nlattr *tb[TCA_IPT_MAX + 1];  	struct tcf_ipt *ipt; -	struct tcf_common *pc;  	struct xt_entry_target *td, *t;  	char *tname;  	int ret = 0, err; @@ -133,20 +113,20 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,  	if (tb[TCA_IPT_INDEX] != NULL)  		index = nla_get_u32(tb[TCA_IPT_INDEX]); -	pc = tcf_hash_check(index, a, bind, &ipt_hash_info); -	if (!pc) { -		pc = tcf_hash_create(index, est, a, sizeof(*ipt), bind, -				     &ipt_idx_gen, &ipt_hash_info); -		if (IS_ERR(pc)) -			return PTR_ERR(pc); +	if (!tcf_hash_check(index, a, bind) ) { +		ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind); +		if (ret) +			return ret;  		ret = ACT_P_CREATED;  	} else { -		if (!ovr) { -			tcf_ipt_release(to_ipt(pc), bind); +		if (bind)/* dont override defaults */ +			return 0; +		tcf_hash_release(a, bind); + +		if (!ovr)  			return -EEXIST; -		}  	} -	ipt = to_ipt(pc); +	ipt = to_ipt(a);  	hook = nla_get_u32(tb[TCA_IPT_HOOK]); @@ -177,7 +157,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,  	ipt->tcfi_hook  = hook;  	spin_unlock_bh(&ipt->tcf_lock);  	if (ret == ACT_P_CREATED) -		tcf_hash_insert(pc, &ipt_hash_info); +		tcf_hash_insert(a);  	return ret;  err3: @@ -185,21 +165,11 @@ err3:  err2:  	kfree(tname);  err1: -	if (ret == ACT_P_CREATED) { -		if (est) -			gen_kill_estimator(&pc->tcfc_bstats, -					   &pc->tcfc_rate_est); -		kfree_rcu(pc, tcfc_rcu); -	} +	if (ret == ACT_P_CREATED) +		tcf_hash_cleanup(a, est);  	return err;  } -static int tcf_ipt_cleanup(struct tc_action *a, int bind) -{ -	struct tcf_ipt *ipt = a->priv; -	return tcf_ipt_release(ipt, bind); -} -  static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,  		   struct tcf_result *res)  { @@ -291,30 +261,22 @@ nla_put_failure:  static struct tc_action_ops act_ipt_ops = {  	.kind		=	"ipt", -	.hinfo		=	&ipt_hash_info,  	.type		=	TCA_ACT_IPT, -	.capab		=	TCA_CAP_NONE,  	.owner		=	THIS_MODULE,  	.act		=	tcf_ipt,  	.dump		=	tcf_ipt_dump, -	.cleanup	=	tcf_ipt_cleanup, -	.lookup		=	tcf_hash_search, +	.cleanup	=	tcf_ipt_release,  	.init		=	tcf_ipt_init, -	.walk		=	tcf_generic_walker  };  static struct tc_action_ops act_xt_ops = {  	.kind		=	"xt", -	.hinfo		=	&ipt_hash_info, -	.type		=	TCA_ACT_IPT, -	.capab		=	TCA_CAP_NONE, +	.type		=	TCA_ACT_XT,  	.owner		=	THIS_MODULE,  	.act		=	tcf_ipt,  	.dump		=	tcf_ipt_dump, -	.cleanup	=	tcf_ipt_cleanup, -	.lookup		=	tcf_hash_search, +	.cleanup	=	tcf_ipt_release,  	.init		=	tcf_ipt_init, -	.walk		=	tcf_generic_walker  };  MODULE_AUTHOR("Jamal Hadi Salim(2002-13)"); @@ -325,16 +287,17 @@ MODULE_ALIAS("act_xt");  static int __init ipt_init_module(void)  {  	int ret1, ret2; -	ret1 = tcf_register_action(&act_xt_ops); + +	ret1 = tcf_register_action(&act_xt_ops, 
IPT_TAB_MASK);  	if (ret1 < 0)  		printk("Failed to load xt action\n"); -	ret2 = tcf_register_action(&act_ipt_ops); +	ret2 = tcf_register_action(&act_ipt_ops, IPT_TAB_MASK);  	if (ret2 < 0)  		printk("Failed to load ipt action\n"); -	if (ret1 < 0 && ret2 < 0) +	if (ret1 < 0 && ret2 < 0) {  		return ret1; -	else +	} else  		return 0;  } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 977c10e0631..4f912c0e225 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -30,32 +30,14 @@  #include <linux/if_arp.h>  #define MIRRED_TAB_MASK     7 -static struct tcf_common *tcf_mirred_ht[MIRRED_TAB_MASK + 1]; -static u32 mirred_idx_gen; -static DEFINE_RWLOCK(mirred_lock);  static LIST_HEAD(mirred_list); -static struct tcf_hashinfo mirred_hash_info = { -	.htab	=	tcf_mirred_ht, -	.hmask	=	MIRRED_TAB_MASK, -	.lock	=	&mirred_lock, -}; - -static int tcf_mirred_release(struct tcf_mirred *m, int bind) +static void tcf_mirred_release(struct tc_action *a, int bind)  { -	if (m) { -		if (bind) -			m->tcf_bindcnt--; -		m->tcf_refcnt--; -		if (!m->tcf_bindcnt && m->tcf_refcnt <= 0) { -			list_del(&m->tcfm_list); -			if (m->tcfm_dev) -				dev_put(m->tcfm_dev); -			tcf_hash_destroy(&m->common, &mirred_hash_info); -			return 1; -		} -	} -	return 0; +	struct tcf_mirred *m = to_mirred(a); +	list_del(&m->tcfm_list); +	if (m->tcfm_dev) +		dev_put(m->tcfm_dev);  }  static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { @@ -69,7 +51,6 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,  	struct nlattr *tb[TCA_MIRRED_MAX + 1];  	struct tc_mirred *parm;  	struct tcf_mirred *m; -	struct tcf_common *pc;  	struct net_device *dev;  	int ret, ok_push = 0; @@ -109,22 +90,20 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,  		dev = NULL;  	} -	pc = tcf_hash_check(parm->index, a, bind, &mirred_hash_info); -	if (!pc) { +	if (!tcf_hash_check(parm->index, a, bind)) {  		if (dev == NULL)  			return -EINVAL; -		pc = tcf_hash_create(parm->index, est, a, sizeof(*m), bind, -				     &mirred_idx_gen, &mirred_hash_info); -		if (IS_ERR(pc)) -			return PTR_ERR(pc); +		ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind); +		if (ret) +			return ret;  		ret = ACT_P_CREATED;  	} else {  		if (!ovr) { -			tcf_mirred_release(to_mirred(pc), bind); +			tcf_hash_release(a, bind);  			return -EEXIST;  		}  	} -	m = to_mirred(pc); +	m = to_mirred(a);  	spin_lock_bh(&m->tcf_lock);  	m->tcf_action = parm->action; @@ -140,21 +119,12 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,  	spin_unlock_bh(&m->tcf_lock);  	if (ret == ACT_P_CREATED) {  		list_add(&m->tcfm_list, &mirred_list); -		tcf_hash_insert(pc, &mirred_hash_info); +		tcf_hash_insert(a);  	}  	return ret;  } -static int tcf_mirred_cleanup(struct tc_action *a, int bind) -{ -	struct tcf_mirred *m = a->priv; - -	if (m) -		return tcf_mirred_release(m, bind); -	return 0; -} -  static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,  		      struct tcf_result *res)  { @@ -261,19 +231,14 @@ static struct notifier_block mirred_device_notifier = {  	.notifier_call = mirred_device_event,  }; -  static struct tc_action_ops act_mirred_ops = {  	.kind		=	"mirred", -	.hinfo		=	&mirred_hash_info,  	.type		=	TCA_ACT_MIRRED, -	.capab		=	TCA_CAP_NONE,  	.owner		=	THIS_MODULE,  	.act		=	tcf_mirred,  	.dump		=	tcf_mirred_dump, -	.cleanup	=	tcf_mirred_cleanup, -	.lookup		=	tcf_hash_search, +	.cleanup	=	tcf_mirred_release,  	.init		=	tcf_mirred_init, -	.walk		=	tcf_generic_walker  };  
MODULE_AUTHOR("Jamal Hadi Salim(2002)"); @@ -287,13 +252,13 @@ static int __init mirred_init_module(void)  		return err;  	pr_info("Mirror/redirect action on\n"); -	return tcf_register_action(&act_mirred_ops); +	return tcf_register_action(&act_mirred_ops, MIRRED_TAB_MASK);  }  static void __exit mirred_cleanup_module(void)  { -	unregister_netdevice_notifier(&mirred_device_notifier);  	tcf_unregister_action(&act_mirred_ops); +	unregister_netdevice_notifier(&mirred_device_notifier);  }  module_init(mirred_init_module); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 876f0ef2969..270a030d5fd 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -30,15 +30,6 @@  #define NAT_TAB_MASK	15 -static struct tcf_common *tcf_nat_ht[NAT_TAB_MASK + 1]; -static u32 nat_idx_gen; -static DEFINE_RWLOCK(nat_lock); - -static struct tcf_hashinfo nat_hash_info = { -	.htab	=	tcf_nat_ht, -	.hmask	=	NAT_TAB_MASK, -	.lock	=	&nat_lock, -};  static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {  	[TCA_NAT_PARMS]	= { .len = sizeof(struct tc_nat) }, @@ -51,7 +42,6 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,  	struct tc_nat *parm;  	int ret = 0, err;  	struct tcf_nat *p; -	struct tcf_common *pc;  	if (nla == NULL)  		return -EINVAL; @@ -64,21 +54,19 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,  		return -EINVAL;  	parm = nla_data(tb[TCA_NAT_PARMS]); -	pc = tcf_hash_check(parm->index, a, bind, &nat_hash_info); -	if (!pc) { -		pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, -				     &nat_idx_gen, &nat_hash_info); -		if (IS_ERR(pc)) -			return PTR_ERR(pc); -		p = to_tcf_nat(pc); +	if (!tcf_hash_check(parm->index, a, bind)) { +		ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); +		if (ret) +			return ret;  		ret = ACT_P_CREATED;  	} else { -		p = to_tcf_nat(pc); -		if (!ovr) { -			tcf_hash_release(pc, bind, &nat_hash_info); +		if (bind) +			return 0; +		tcf_hash_release(a, bind); +		if (!ovr)  			return -EEXIST; -		}  	} +	p = to_tcf_nat(a);  	spin_lock_bh(&p->tcf_lock);  	p->old_addr = parm->old_addr; @@ -90,18 +78,11 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,  	spin_unlock_bh(&p->tcf_lock);  	if (ret == ACT_P_CREATED) -		tcf_hash_insert(pc, &nat_hash_info); +		tcf_hash_insert(a);  	return ret;  } -static int tcf_nat_cleanup(struct tc_action *a, int bind) -{ -	struct tcf_nat *p = a->priv; - -	return tcf_hash_release(&p->common, bind, &nat_hash_info); -} -  static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,  		   struct tcf_result *res)  { @@ -301,16 +282,11 @@ nla_put_failure:  static struct tc_action_ops act_nat_ops = {  	.kind		=	"nat", -	.hinfo		=	&nat_hash_info,  	.type		=	TCA_ACT_NAT, -	.capab		=	TCA_CAP_NONE,  	.owner		=	THIS_MODULE,  	.act		=	tcf_nat,  	.dump		=	tcf_nat_dump, -	.cleanup	=	tcf_nat_cleanup, -	.lookup		=	tcf_hash_search,  	.init		=	tcf_nat_init, -	.walk		=	tcf_generic_walker  };  MODULE_DESCRIPTION("Stateless NAT actions"); @@ -318,7 +294,7 @@ MODULE_LICENSE("GPL");  static int __init nat_init_module(void)  { -	return tcf_register_action(&act_nat_ops); +	return tcf_register_action(&act_nat_ops, NAT_TAB_MASK);  }  static void __exit nat_cleanup_module(void) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 7ed78c9e505..5f9bcb2e080 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -24,15 +24,6 @@  #include <net/tc_act/tc_pedit.h>  #define PEDIT_TAB_MASK	15 -static struct tcf_common 
*tcf_pedit_ht[PEDIT_TAB_MASK + 1]; -static u32 pedit_idx_gen; -static DEFINE_RWLOCK(pedit_lock); - -static struct tcf_hashinfo pedit_hash_info = { -	.htab	=	tcf_pedit_ht, -	.hmask	=	PEDIT_TAB_MASK, -	.lock	=	&pedit_lock, -};  static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {  	[TCA_PEDIT_PARMS]	= { .len = sizeof(struct tc_pedit) }, @@ -46,7 +37,6 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,  	struct tc_pedit *parm;  	int ret = 0, err;  	struct tcf_pedit *p; -	struct tcf_common *pc;  	struct tc_pedit_key *keys = NULL;  	int ksize; @@ -64,30 +54,27 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,  	if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize)  		return -EINVAL; -	pc = tcf_hash_check(parm->index, a, bind, &pedit_hash_info); -	if (!pc) { +	if (!tcf_hash_check(parm->index, a, bind)) {  		if (!parm->nkeys)  			return -EINVAL; -		pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, -				     &pedit_idx_gen, &pedit_hash_info); -		if (IS_ERR(pc)) -			return PTR_ERR(pc); -		p = to_pedit(pc); +		ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); +		if (ret) +			return ret; +		p = to_pedit(a);  		keys = kmalloc(ksize, GFP_KERNEL);  		if (keys == NULL) { -			if (est) -				gen_kill_estimator(&pc->tcfc_bstats, -						   &pc->tcfc_rate_est); -			kfree_rcu(pc, tcfc_rcu); +			tcf_hash_cleanup(a, est);  			return -ENOMEM;  		}  		ret = ACT_P_CREATED;  	} else { -		p = to_pedit(pc); -		if (!ovr) { -			tcf_hash_release(pc, bind, &pedit_hash_info); +		p = to_pedit(a); +		tcf_hash_release(a, bind); +		if (bind) +			return 0; +		if (!ovr)  			return -EEXIST; -		} +  		if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {  			keys = kmalloc(ksize, GFP_KERNEL);  			if (keys == NULL) @@ -106,22 +93,15 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,  	memcpy(p->tcfp_keys, parm->keys, ksize);  	spin_unlock_bh(&p->tcf_lock);  	if (ret == ACT_P_CREATED) -		tcf_hash_insert(pc, &pedit_hash_info); +		tcf_hash_insert(a);  	return ret;  } -static int tcf_pedit_cleanup(struct tc_action *a, int bind) +static void tcf_pedit_cleanup(struct tc_action *a, int bind)  {  	struct tcf_pedit *p = a->priv; - -	if (p) { -		struct tc_pedit_key *keys = p->tcfp_keys; -		if (tcf_hash_release(&p->common, bind, &pedit_hash_info)) { -			kfree(keys); -			return 1; -		} -	} -	return 0; +	struct tc_pedit_key *keys = p->tcfp_keys; +	kfree(keys);  }  static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, @@ -236,16 +216,12 @@ nla_put_failure:  static struct tc_action_ops act_pedit_ops = {  	.kind		=	"pedit", -	.hinfo		=	&pedit_hash_info,  	.type		=	TCA_ACT_PEDIT, -	.capab		=	TCA_CAP_NONE,  	.owner		=	THIS_MODULE,  	.act		=	tcf_pedit,  	.dump		=	tcf_pedit_dump,  	.cleanup	=	tcf_pedit_cleanup, -	.lookup		=	tcf_hash_search,  	.init		=	tcf_pedit_init, -	.walk		=	tcf_generic_walker  };  MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); @@ -254,7 +230,7 @@ MODULE_LICENSE("GPL");  static int __init pedit_init_module(void)  { -	return tcf_register_action(&act_pedit_ops); +	return tcf_register_action(&act_pedit_ops, PEDIT_TAB_MASK);  }  static void __exit pedit_cleanup_module(void) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 189e3c5b3d0..0566e4606a4 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -41,15 +41,6 @@ struct tcf_police {  	container_of(pc, struct tcf_police, common)  #define POL_TAB_MASK     15 -static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1]; -static u32 police_idx_gen; -static 
DEFINE_RWLOCK(police_lock); - -static struct tcf_hashinfo police_hash_info = { -	.htab	=	tcf_police_ht, -	.hmask	=	POL_TAB_MASK, -	.lock	=	&police_lock, -};  /* old policer structure from before tc actions */  struct tc_police_compat { @@ -67,18 +58,20 @@ struct tc_police_compat {  static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb,  			      int type, struct tc_action *a)  { +	struct tcf_hashinfo *hinfo = a->ops->hinfo; +	struct hlist_head *head;  	struct tcf_common *p;  	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;  	struct nlattr *nest; -	read_lock_bh(&police_lock); +	spin_lock_bh(&hinfo->lock);  	s_i = cb->args[0];  	for (i = 0; i < (POL_TAB_MASK + 1); i++) { -		p = tcf_police_ht[tcf_hash(i, POL_TAB_MASK)]; +		head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)]; -		for (; p; p = p->tcfc_next) { +		hlist_for_each_entry_rcu(p, head, tcfc_head) {  			index++;  			if (index < s_i)  				continue; @@ -101,7 +94,7 @@ static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *c  		}  	}  done: -	read_unlock_bh(&police_lock); +	spin_unlock_bh(&hinfo->lock);  	if (n_i)  		cb->args[0] += n_i;  	return n_i; @@ -111,29 +104,6 @@ nla_put_failure:  	goto done;  } -static void tcf_police_destroy(struct tcf_police *p) -{ -	unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK); -	struct tcf_common **p1p; - -	for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->tcfc_next) { -		if (*p1p == &p->common) { -			write_lock_bh(&police_lock); -			*p1p = p->tcf_next; -			write_unlock_bh(&police_lock); -			gen_kill_estimator(&p->tcf_bstats, -					   &p->tcf_rate_est); -			/* -			 * gen_estimator est_timer() might access p->tcf_lock -			 * or bstats, wait a RCU grace period before freeing p -			 */ -			kfree_rcu(p, tcf_rcu); -			return; -		} -	} -	WARN_ON(1); -} -  static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {  	[TCA_POLICE_RATE]	= { .len = TC_RTAB_SIZE },  	[TCA_POLICE_PEAKRATE]	= { .len = TC_RTAB_SIZE }, @@ -151,6 +121,7 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,  	struct tc_police *parm;  	struct tcf_police *police;  	struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; +	struct tcf_hashinfo *hinfo = a->ops->hinfo;  	int size;  	if (nla == NULL) @@ -168,19 +139,17 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,  	parm = nla_data(tb[TCA_POLICE_TBF]);  	if (parm->index) { -		struct tcf_common *pc; - -		pc = tcf_hash_lookup(parm->index, &police_hash_info); -		if (pc != NULL) { -			a->priv = pc; -			police = to_police(pc); +		if (tcf_hash_search(a, parm->index)) { +			police = to_police(a->priv);  			if (bind) {  				police->tcf_bindcnt += 1;  				police->tcf_refcnt += 1; +				return 0;  			}  			if (ovr)  				goto override; -			return ret; +			/* not replacing */ +			return -EEXIST;  		}  	} @@ -231,14 +200,14 @@ override:  	}  	if (R_tab) {  		police->rate_present = true; -		psched_ratecfg_precompute(&police->rate, &R_tab->rate); +		psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0);  		qdisc_put_rtab(R_tab);  	} else {  		police->rate_present = false;  	}  	if (P_tab) {  		police->peak_present = true; -		psched_ratecfg_precompute(&police->peak, &P_tab->rate); +		psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0);  		qdisc_put_rtab(P_tab);  	} else {  		police->peak_present = false; @@ -264,12 +233,11 @@ override:  	police->tcfp_t_c = ktime_to_ns(ktime_get());  	police->tcf_index = parm->index ? 
parm->index : -		tcf_hash_new_index(&police_idx_gen, &police_hash_info); +		tcf_hash_new_index(hinfo);  	h = tcf_hash(police->tcf_index, POL_TAB_MASK); -	write_lock_bh(&police_lock); -	police->tcf_next = tcf_police_ht[h]; -	tcf_police_ht[h] = &police->common; -	write_unlock_bh(&police_lock); +	spin_lock_bh(&hinfo->lock); +	hlist_add_head(&police->tcf_head, &hinfo->htab[h]); +	spin_unlock_bh(&hinfo->lock);  	a->priv = police;  	return ret; @@ -277,33 +245,13 @@ override:  failure_unlock:  	spin_unlock_bh(&police->tcf_lock);  failure: -	if (P_tab) -		qdisc_put_rtab(P_tab); -	if (R_tab) -		qdisc_put_rtab(R_tab); +	qdisc_put_rtab(P_tab); +	qdisc_put_rtab(R_tab);  	if (ret == ACT_P_CREATED)  		kfree(police);  	return err;  } -static int tcf_act_police_cleanup(struct tc_action *a, int bind) -{ -	struct tcf_police *p = a->priv; -	int ret = 0; - -	if (p != NULL) { -		if (bind) -			p->tcf_bindcnt--; - -		p->tcf_refcnt--; -		if (p->tcf_refcnt <= 0 && !p->tcf_bindcnt) { -			tcf_police_destroy(p); -			ret = 1; -		} -	} -	return ret; -} -  static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,  			  struct tcf_result *res)  { @@ -400,14 +348,10 @@ MODULE_LICENSE("GPL");  static struct tc_action_ops act_police_ops = {  	.kind		=	"police", -	.hinfo		=	&police_hash_info,  	.type		=	TCA_ID_POLICE, -	.capab		=	TCA_CAP_NONE,  	.owner		=	THIS_MODULE,  	.act		=	tcf_act_police,  	.dump		=	tcf_act_police_dump, -	.cleanup	=	tcf_act_police_cleanup, -	.lookup		=	tcf_hash_search,  	.init		=	tcf_act_police_locate,  	.walk		=	tcf_act_police_walker  }; @@ -415,7 +359,7 @@ static struct tc_action_ops act_police_ops = {  static int __init  police_init_module(void)  { -	return tcf_register_action(&act_police_ops); +	return tcf_register_action(&act_police_ops, POL_TAB_MASK);  }  static void __exit diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 7725eb4ab75..992c2317ce8 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -25,15 +25,6 @@  #include <net/tc_act/tc_defact.h>  #define SIMP_TAB_MASK     7 -static struct tcf_common *tcf_simp_ht[SIMP_TAB_MASK + 1]; -static u32 simp_idx_gen; -static DEFINE_RWLOCK(simp_lock); - -static struct tcf_hashinfo simp_hash_info = { -	.htab	=	tcf_simp_ht, -	.hmask	=	SIMP_TAB_MASK, -	.lock	=	&simp_lock, -};  #define SIMP_MAX_DATA	32  static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, @@ -55,20 +46,10 @@ static int tcf_simp(struct sk_buff *skb, const struct tc_action *a,  	return d->tcf_action;  } -static int tcf_simp_release(struct tcf_defact *d, int bind) +static void tcf_simp_release(struct tc_action *a, int bind)  { -	int ret = 0; -	if (d) { -		if (bind) -			d->tcf_bindcnt--; -		d->tcf_refcnt--; -		if (d->tcf_bindcnt <= 0 && d->tcf_refcnt <= 0) { -			kfree(d->tcfd_defdata); -			tcf_hash_destroy(&d->common, &simp_hash_info); -			ret = 1; -		} -	} -	return ret; +	struct tcf_defact *d = to_defact(a); +	kfree(d->tcfd_defdata);  }  static int alloc_defdata(struct tcf_defact *d, char *defdata) @@ -102,7 +83,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,  	struct nlattr *tb[TCA_DEF_MAX + 1];  	struct tc_defact *parm;  	struct tcf_defact *d; -	struct tcf_common *pc;  	char *defdata;  	int ret = 0, err; @@ -122,47 +102,36 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,  	parm = nla_data(tb[TCA_DEF_PARMS]);  	defdata = nla_data(tb[TCA_DEF_DATA]); -	pc = tcf_hash_check(parm->index, a, bind, &simp_hash_info); -	if (!pc) { -		pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind, -				     
&simp_idx_gen, &simp_hash_info); -		if (IS_ERR(pc)) -			return PTR_ERR(pc); +	if (!tcf_hash_check(parm->index, a, bind)) { +		ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); +		if (ret) +			return ret; -		d = to_defact(pc); +		d = to_defact(a);  		ret = alloc_defdata(d, defdata);  		if (ret < 0) { -			if (est) -				gen_kill_estimator(&pc->tcfc_bstats, -						   &pc->tcfc_rate_est); -			kfree_rcu(pc, tcfc_rcu); +			tcf_hash_cleanup(a, est);  			return ret;  		}  		d->tcf_action = parm->action;  		ret = ACT_P_CREATED;  	} else { -		d = to_defact(pc); -		if (!ovr) { -			tcf_simp_release(d, bind); +		d = to_defact(a); + +		if (bind) +			return 0; +		tcf_hash_release(a, bind); +		if (!ovr)  			return -EEXIST; -		} +  		reset_policy(d, defdata, parm);  	}  	if (ret == ACT_P_CREATED) -		tcf_hash_insert(pc, &simp_hash_info); +		tcf_hash_insert(a);  	return ret;  } -static int tcf_simp_cleanup(struct tc_action *a, int bind) -{ -	struct tcf_defact *d = a->priv; - -	if (d) -		return tcf_simp_release(d, bind); -	return 0; -} -  static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,  			 int bind, int ref)  { @@ -193,15 +162,12 @@ nla_put_failure:  static struct tc_action_ops act_simp_ops = {  	.kind		=	"simple", -	.hinfo		=	&simp_hash_info,  	.type		=	TCA_ACT_SIMP, -	.capab		=	TCA_CAP_NONE,  	.owner		=	THIS_MODULE,  	.act		=	tcf_simp,  	.dump		=	tcf_simp_dump, -	.cleanup	=	tcf_simp_cleanup, +	.cleanup	=	tcf_simp_release,  	.init		=	tcf_simp_init, -	.walk		=	tcf_generic_walker,  };  MODULE_AUTHOR("Jamal Hadi Salim(2005)"); @@ -210,7 +176,8 @@ MODULE_LICENSE("GPL");  static int __init simp_init_module(void)  { -	int ret = tcf_register_action(&act_simp_ops); +	int ret; +	ret = tcf_register_action(&act_simp_ops, SIMP_TAB_MASK);  	if (!ret)  		pr_info("Simple TC action Loaded\n");  	return ret; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index cb4221171f9..fcfeeaf838b 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -11,8 +11,7 @@   * more details.   *   * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. + * this program; if not, see <http://www.gnu.org/licenses/>.   
*   * Author: Alexander Duyck <alexander.h.duyck@intel.com>   */ @@ -29,15 +28,6 @@  #include <net/tc_act/tc_skbedit.h>  #define SKBEDIT_TAB_MASK     15 -static struct tcf_common *tcf_skbedit_ht[SKBEDIT_TAB_MASK + 1]; -static u32 skbedit_idx_gen; -static DEFINE_RWLOCK(skbedit_lock); - -static struct tcf_hashinfo skbedit_hash_info = { -	.htab	=	tcf_skbedit_ht, -	.hmask	=	SKBEDIT_TAB_MASK, -	.lock	=	&skbedit_lock, -};  static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,  		       struct tcf_result *res) @@ -74,7 +64,6 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,  	struct nlattr *tb[TCA_SKBEDIT_MAX + 1];  	struct tc_skbedit *parm;  	struct tcf_skbedit *d; -	struct tcf_common *pc;  	u32 flags = 0, *priority = NULL, *mark = NULL;  	u16 *queue_mapping = NULL;  	int ret = 0, err; @@ -109,21 +98,20 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,  	parm = nla_data(tb[TCA_SKBEDIT_PARMS]); -	pc = tcf_hash_check(parm->index, a, bind, &skbedit_hash_info); -	if (!pc) { -		pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind, -				     &skbedit_idx_gen, &skbedit_hash_info); -		if (IS_ERR(pc)) -			return PTR_ERR(pc); +	if (!tcf_hash_check(parm->index, a, bind)) { +		ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); +		if (ret) +			return ret; -		d = to_skbedit(pc); +		d = to_skbedit(a);  		ret = ACT_P_CREATED;  	} else { -		d = to_skbedit(pc); -		if (!ovr) { -			tcf_hash_release(pc, bind, &skbedit_hash_info); +		d = to_skbedit(a); +		if (bind) +			return 0; +		tcf_hash_release(a, bind); +		if (!ovr)  			return -EEXIST; -		}  	}  	spin_lock_bh(&d->tcf_lock); @@ -141,19 +129,10 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,  	spin_unlock_bh(&d->tcf_lock);  	if (ret == ACT_P_CREATED) -		tcf_hash_insert(pc, &skbedit_hash_info); +		tcf_hash_insert(a);  	return ret;  } -static int tcf_skbedit_cleanup(struct tc_action *a, int bind) -{ -	struct tcf_skbedit *d = a->priv; - -	if (d) -		return tcf_hash_release(&d->common, bind, &skbedit_hash_info); -	return 0; -} -  static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,  			    int bind, int ref)  { @@ -195,15 +174,11 @@ nla_put_failure:  static struct tc_action_ops act_skbedit_ops = {  	.kind		=	"skbedit", -	.hinfo		=	&skbedit_hash_info,  	.type		=	TCA_ACT_SKBEDIT, -	.capab		=	TCA_CAP_NONE,  	.owner		=	THIS_MODULE,  	.act		=	tcf_skbedit,  	.dump		=	tcf_skbedit_dump, -	.cleanup	=	tcf_skbedit_cleanup,  	.init		=	tcf_skbedit_init, -	.walk		=	tcf_generic_walker,  };  MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>"); @@ -212,7 +187,7 @@ MODULE_LICENSE("GPL");  static int __init skbedit_init_module(void)  { -	return tcf_register_action(&act_skbedit_ops); +	return tcf_register_action(&act_skbedit_ops, SKBEDIT_TAB_MASK);  }  static void __exit skbedit_cleanup_module(void) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8e118af9097..45527e6b52d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -31,8 +31,7 @@  #include <net/pkt_cls.h>  /* The list of all installed classifier types */ - -static struct tcf_proto_ops *tcf_proto_base __read_mostly; +static LIST_HEAD(tcf_proto_base);  /* Protects list of registered TC modules. It is pure SMP lock. 
*/  static DEFINE_RWLOCK(cls_mod_lock); @@ -41,36 +40,35 @@ static DEFINE_RWLOCK(cls_mod_lock);  static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind)  { -	const struct tcf_proto_ops *t = NULL; +	const struct tcf_proto_ops *t, *res = NULL;  	if (kind) {  		read_lock(&cls_mod_lock); -		for (t = tcf_proto_base; t; t = t->next) { +		list_for_each_entry(t, &tcf_proto_base, head) {  			if (nla_strcmp(kind, t->kind) == 0) { -				if (!try_module_get(t->owner)) -					t = NULL; +				if (try_module_get(t->owner)) +					res = t;  				break;  			}  		}  		read_unlock(&cls_mod_lock);  	} -	return t; +	return res;  }  /* Register(unregister) new classifier type */  int register_tcf_proto_ops(struct tcf_proto_ops *ops)  { -	struct tcf_proto_ops *t, **tp; +	struct tcf_proto_ops *t;  	int rc = -EEXIST;  	write_lock(&cls_mod_lock); -	for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next) +	list_for_each_entry(t, &tcf_proto_base, head)  		if (!strcmp(ops->kind, t->kind))  			goto out; -	ops->next = NULL; -	*tp = ops; +	list_add_tail(&ops->head, &tcf_proto_base);  	rc = 0;  out:  	write_unlock(&cls_mod_lock); @@ -80,19 +78,17 @@ EXPORT_SYMBOL(register_tcf_proto_ops);  int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)  { -	struct tcf_proto_ops *t, **tp; +	struct tcf_proto_ops *t;  	int rc = -ENOENT;  	write_lock(&cls_mod_lock); -	for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next) -		if (t == ops) +	list_for_each_entry(t, &tcf_proto_base, head) { +		if (t == ops) { +			list_del(&t->head); +			rc = 0;  			break; - -	if (!t) -		goto out; -	*tp = t->next; -	rc = 0; -out: +		} +	}  	write_unlock(&cls_mod_lock);  	return rc;  } @@ -138,7 +134,8 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)  	int err;  	int tp_created = 0; -	if ((n->nlmsg_type != RTM_GETTFILTER) && !capable(CAP_NET_ADMIN)) +	if ((n->nlmsg_type != RTM_GETTFILTER) && +	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))  		return -EPERM;  replay: @@ -321,7 +318,8 @@ replay:  		}  	} -	err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh); +	err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, +			      n->nlmsg_flags & NLM_F_CREATE ? 
TCA_ACT_NOREPLACE : TCA_ACT_REPLACE);  	if (err == 0) {  		if (tp_created) {  			spin_lock_bh(root_lock); @@ -344,7 +342,7 @@ errout:  	return err;  } -static int tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, +static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp,  			 unsigned long fh, u32 portid, u32 seq, u16 flags, int event)  {  	struct tcmsg *tcm; @@ -366,7 +364,7 @@ static int tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp,  	tcm->tcm_handle = fh;  	if (RTM_DELTFILTER != event) {  		tcm->tcm_handle = 0; -		if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0) +		if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)  			goto nla_put_failure;  	}  	nlh->nlmsg_len = skb_tail_pointer(skb) - b; @@ -389,7 +387,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,  	if (!skb)  		return -ENOBUFS; -	if (tcf_fill_node(skb, tp, fh, portid, n->nlmsg_seq, 0, event) <= 0) { +	if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, 0, event) <= 0) {  		kfree_skb(skb);  		return -EINVAL;  	} @@ -408,8 +406,9 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,  			 struct tcf_walker *arg)  {  	struct tcf_dump_args *a = (void *)arg; +	struct net *net = sock_net(a->skb->sk); -	return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).portid, +	return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid,  			     a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER);  } @@ -467,7 +466,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)  		if (t > s_t)  			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));  		if (cb->args[1] == 0) { -			if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).portid, +			if (tcf_fill_node(net, skb, tp, 0, NETLINK_CB(cb->skb).portid,  					  cb->nlh->nlmsg_seq, NLM_F_MULTI,  					  RTM_NEWTFILTER) <= 0)  				break; @@ -500,46 +499,41 @@ out:  void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts)  {  #ifdef CONFIG_NET_CLS_ACT -	if (exts->action) { -		tcf_action_destroy(exts->action, TCA_ACT_UNBIND); -		exts->action = NULL; -	} +	tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND); +	INIT_LIST_HEAD(&exts->actions);  #endif  }  EXPORT_SYMBOL(tcf_exts_destroy);  int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, -		  struct nlattr *rate_tlv, struct tcf_exts *exts, -		  const struct tcf_ext_map *map) +		  struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr)  { -	memset(exts, 0, sizeof(*exts)); -  #ifdef CONFIG_NET_CLS_ACT  	{  		struct tc_action *act; -		if (map->police && tb[map->police]) { -			act = tcf_action_init_1(net, tb[map->police], rate_tlv, -						"police", TCA_ACT_NOREPLACE, +		INIT_LIST_HEAD(&exts->actions); +		if (exts->police && tb[exts->police]) { +			act = tcf_action_init_1(net, tb[exts->police], rate_tlv, +						"police", ovr,  						TCA_ACT_BIND);  			if (IS_ERR(act))  				return PTR_ERR(act); -			act->type = TCA_OLD_COMPAT; -			exts->action = act; -		} else if (map->action && tb[map->action]) { -			act = tcf_action_init(net, tb[map->action], rate_tlv, -					      NULL, TCA_ACT_NOREPLACE, -					      TCA_ACT_BIND); -			if (IS_ERR(act)) -				return PTR_ERR(act); - -			exts->action = act; +			act->type = exts->type = TCA_OLD_COMPAT; +			list_add(&act->list, &exts->actions); +		} else if (exts->action && tb[exts->action]) { +			int err; +			err = tcf_action_init(net, tb[exts->action], rate_tlv, +					      NULL, ovr, +					      TCA_ACT_BIND, &exts->actions); +			if (err) +				
return err;  		}  	}  #else -	if ((map->action && tb[map->action]) || -	    (map->police && tb[map->police])) +	if ((exts->action && tb[exts->action]) || +	    (exts->police && tb[exts->police]))  		return -EOPNOTSUPP;  #endif @@ -551,43 +545,42 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,  		     struct tcf_exts *src)  {  #ifdef CONFIG_NET_CLS_ACT -	if (src->action) { -		struct tc_action *act; -		tcf_tree_lock(tp); -		act = dst->action; -		dst->action = src->action; -		tcf_tree_unlock(tp); -		if (act) -			tcf_action_destroy(act, TCA_ACT_UNBIND); -	} +	LIST_HEAD(tmp); +	tcf_tree_lock(tp); +	list_splice_init(&dst->actions, &tmp); +	list_splice(&src->actions, &dst->actions); +	tcf_tree_unlock(tp); +	tcf_action_destroy(&tmp, TCA_ACT_UNBIND);  #endif  }  EXPORT_SYMBOL(tcf_exts_change); -int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, -		  const struct tcf_ext_map *map) +#define tcf_exts_first_act(ext) \ +		list_first_entry(&(exts)->actions, struct tc_action, list) + +int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)  {  #ifdef CONFIG_NET_CLS_ACT -	if (map->action && exts->action) { +	if (exts->action && !list_empty(&exts->actions)) {  		/*  		 * again for backward compatible mode - we want  		 * to work with both old and new modes of entering  		 * tc data even if iproute2  was newer - jhs  		 */  		struct nlattr *nest; - -		if (exts->action->type != TCA_OLD_COMPAT) { -			nest = nla_nest_start(skb, map->action); +		if (exts->type != TCA_OLD_COMPAT) { +			nest = nla_nest_start(skb, exts->action);  			if (nest == NULL)  				goto nla_put_failure; -			if (tcf_action_dump(skb, exts->action, 0, 0) < 0) +			if (tcf_action_dump(skb, &exts->actions, 0, 0) < 0)  				goto nla_put_failure;  			nla_nest_end(skb, nest); -		} else if (map->police) { -			nest = nla_nest_start(skb, map->police); -			if (nest == NULL) +		} else if (exts->police) { +			struct tc_action *act = tcf_exts_first_act(exts); +			nest = nla_nest_start(skb, exts->police); +			if (nest == NULL || !act)  				goto nla_put_failure; -			if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0) +			if (tcf_action_dump_old(skb, act, 0, 0) < 0)  				goto nla_put_failure;  			nla_nest_end(skb, nest);  		} @@ -600,17 +593,14 @@ nla_put_failure: __attribute__ ((unused))  EXPORT_SYMBOL(tcf_exts_dump); -int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts, -			const struct tcf_ext_map *map) +int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)  {  #ifdef CONFIG_NET_CLS_ACT -	if (exts->action) -		if (tcf_action_copy_stats(skb, exts->action, 1) < 0) -			goto nla_put_failure; +	struct tc_action *a = tcf_exts_first_act(exts); +	if (tcf_action_copy_stats(skb, a, 1) < 0) +		return -1;  #endif  	return 0; -nla_put_failure: __attribute__ ((unused)) -	return -1;  }  EXPORT_SYMBOL(tcf_exts_dump_stats); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index d76a35d0dc8..0ae1813e3e9 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -34,16 +34,11 @@ struct basic_filter {  	struct list_head	link;  }; -static const struct tcf_ext_map basic_ext_map = { -	.action = TCA_BASIC_ACT, -	.police = TCA_BASIC_POLICE -}; -  static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,  			  struct tcf_result *res)  {  	int r; -	struct basic_head *head = (struct basic_head *) tp->root; +	struct basic_head *head = tp->root;  	struct basic_filter *f;  	list_for_each_entry(f, &head->flist, link) { @@ -61,7 +56,7 @@ static int basic_classify(struct sk_buff *skb, const 
struct tcf_proto *tp,  static unsigned long basic_get(struct tcf_proto *tp, u32 handle)  {  	unsigned long l = 0UL; -	struct basic_head *head = (struct basic_head *) tp->root; +	struct basic_head *head = tp->root;  	struct basic_filter *f;  	if (head == NULL) @@ -112,7 +107,7 @@ static void basic_destroy(struct tcf_proto *tp)  static int basic_delete(struct tcf_proto *tp, unsigned long arg)  { -	struct basic_head *head = (struct basic_head *) tp->root; +	struct basic_head *head = tp->root;  	struct basic_filter *t, *f = (struct basic_filter *) arg;  	list_for_each_entry(t, &head->flist, link) @@ -135,13 +130,14 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = {  static int basic_set_parms(struct net *net, struct tcf_proto *tp,  			   struct basic_filter *f, unsigned long base,  			   struct nlattr **tb, -			   struct nlattr *est) +			   struct nlattr *est, bool ovr)  { -	int err = -EINVAL; +	int err;  	struct tcf_exts e;  	struct tcf_ematch_tree t; -	err = tcf_exts_validate(net, tp, tb, est, &e, &basic_ext_map); +	tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE); +	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);  	if (err < 0)  		return err; @@ -165,10 +161,10 @@ errout:  static int basic_change(struct net *net, struct sk_buff *in_skb,  			struct tcf_proto *tp, unsigned long base, u32 handle, -			struct nlattr **tca, unsigned long *arg) +			struct nlattr **tca, unsigned long *arg, bool ovr)  {  	int err; -	struct basic_head *head = (struct basic_head *) tp->root; +	struct basic_head *head = tp->root;  	struct nlattr *tb[TCA_BASIC_MAX + 1];  	struct basic_filter *f = (struct basic_filter *) *arg; @@ -183,7 +179,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,  	if (f != NULL) {  		if (handle && f->handle != handle)  			return -EINVAL; -		return basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE]); +		return basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr);  	}  	err = -ENOBUFS; @@ -191,6 +187,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,  	if (f == NULL)  		goto errout; +	tcf_exts_init(&f->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);  	err = -EINVAL;  	if (handle)  		f->handle = handle; @@ -209,7 +206,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,  		f->handle = head->hgenerator;  	} -	err = basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE]); +	err = basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr);  	if (err < 0)  		goto errout; @@ -228,7 +225,7 @@ errout:  static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg)  { -	struct basic_head *head = (struct basic_head *) tp->root; +	struct basic_head *head = tp->root;  	struct basic_filter *f;  	list_for_each_entry(f, &head->flist, link) { @@ -244,7 +241,7 @@ skip:  	}  } -static int basic_dump(struct tcf_proto *tp, unsigned long fh, +static int basic_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,  		      struct sk_buff *skb, struct tcmsg *t)  {  	struct basic_filter *f = (struct basic_filter *) fh; @@ -263,13 +260,13 @@ static int basic_dump(struct tcf_proto *tp, unsigned long fh,  	    nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid))  		goto nla_put_failure; -	if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 || +	if (tcf_exts_dump(skb, &f->exts) < 0 ||  	    tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)  		goto nla_put_failure;  	nla_nest_end(skb, nest); -	if (tcf_exts_dump_stats(skb, &f->exts, &basic_ext_map) < 0) +	if (tcf_exts_dump_stats(skb, &f->exts) < 0)  		goto nla_put_failure;  	
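/*
 * Editor's note, not part of the patch above: throughout these classifier
 * changes the per-classifier "struct tcf_ext_map" constants disappear and the
 * action/police attribute ids are instead stored in the tcf_exts block itself
 * via tcf_exts_init(), so tcf_exts_validate()/tcf_exts_dump() no longer need a
 * map argument.  The miniature below (mini_exts and the numeric ids are
 * illustrative placeholders, not kernel definitions) sketches the shape of
 * that design change.
 */
#include <stdio.h>

struct mini_exts {
	int action;	/* netlink attribute id carrying actions  */
	int police;	/* netlink attribute id carrying a policer */
};

static void mini_exts_init(struct mini_exts *e, int action, int police)
{
	e->action = action;
	e->police = police;
}

int main(void)
{
	struct mini_exts e;

	mini_exts_init(&e, 3, 4);	/* placeholder ids for ACT/POLICE */
	printf("action attr %d, police attr %d\n", e.action, e.police);
	return 0;
}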
return skb->len; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c new file mode 100644 index 00000000000..13f64df2c71 --- /dev/null +++ b/net/sched/cls_bpf.c @@ -0,0 +1,382 @@ +/* + * Berkeley Packet Filter based traffic classifier + * + * Might be used to classify traffic through flexible, user-defined and + * possibly JIT-ed BPF filters for traffic control as an alternative to + * ematches. + * + * (C) 2013 Daniel Borkmann <dborkman@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/filter.h> +#include <net/rtnetlink.h> +#include <net/pkt_cls.h> +#include <net/sock.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); +MODULE_DESCRIPTION("TC BPF based classifier"); + +struct cls_bpf_head { +	struct list_head plist; +	u32 hgen; +}; + +struct cls_bpf_prog { +	struct sk_filter *filter; +	struct sock_filter *bpf_ops; +	struct tcf_exts exts; +	struct tcf_result res; +	struct list_head link; +	u32 handle; +	u16 bpf_len; +}; + +static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { +	[TCA_BPF_CLASSID]	= { .type = NLA_U32 }, +	[TCA_BPF_OPS_LEN]	= { .type = NLA_U16 }, +	[TCA_BPF_OPS]		= { .type = NLA_BINARY, +				    .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, +}; + +static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, +			    struct tcf_result *res) +{ +	struct cls_bpf_head *head = tp->root; +	struct cls_bpf_prog *prog; +	int ret; + +	list_for_each_entry(prog, &head->plist, link) { +		int filter_res = SK_RUN_FILTER(prog->filter, skb); + +		if (filter_res == 0) +			continue; + +		*res = prog->res; +		if (filter_res != -1) +			res->classid = filter_res; + +		ret = tcf_exts_exec(skb, &prog->exts, res); +		if (ret < 0) +			continue; + +		return ret; +	} + +	return -1; +} + +static int cls_bpf_init(struct tcf_proto *tp) +{ +	struct cls_bpf_head *head; + +	head = kzalloc(sizeof(*head), GFP_KERNEL); +	if (head == NULL) +		return -ENOBUFS; + +	INIT_LIST_HEAD(&head->plist); +	tp->root = head; + +	return 0; +} + +static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) +{ +	tcf_unbind_filter(tp, &prog->res); +	tcf_exts_destroy(tp, &prog->exts); + +	sk_unattached_filter_destroy(prog->filter); + +	kfree(prog->bpf_ops); +	kfree(prog); +} + +static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) +{ +	struct cls_bpf_head *head = tp->root; +	struct cls_bpf_prog *prog, *todel = (struct cls_bpf_prog *) arg; + +	list_for_each_entry(prog, &head->plist, link) { +		if (prog == todel) { +			tcf_tree_lock(tp); +			list_del(&prog->link); +			tcf_tree_unlock(tp); + +			cls_bpf_delete_prog(tp, prog); +			return 0; +		} +	} + +	return -ENOENT; +} + +static void cls_bpf_destroy(struct tcf_proto *tp) +{ +	struct cls_bpf_head *head = tp->root; +	struct cls_bpf_prog *prog, *tmp; + +	list_for_each_entry_safe(prog, tmp, &head->plist, link) { +		list_del(&prog->link); +		cls_bpf_delete_prog(tp, prog); +	} + +	kfree(head); +} + +static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) +{ +	struct cls_bpf_head *head = tp->root; +	struct cls_bpf_prog *prog; +	unsigned long ret = 0UL; + +	if (head == NULL) +		return 0UL; + +	list_for_each_entry(prog, &head->plist, link) { +		if (prog->handle == handle) { +			ret = (unsigned long) prog; +			break; +		} +	} + 
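/*
 * Editor's note, not part of the patch above: a minimal user-space sketch of
 * the return-value convention that cls_bpf_classify() implements, assuming
 * only what the code shows — a BPF program returning 0 means "no match, try
 * the next program", -1 means "match, keep the classid configured via
 * TCA_BPF_CLASSID", and any other value overrides the classid directly.
 * pick_classid() is a hypothetical helper invented for illustration (0 doubles
 * as "no match" here purely for brevity).
 */
#include <assert.h>
#include <stdint.h>

static uint32_t pick_classid(int filter_res, uint32_t configured_classid)
{
	if (filter_res == 0)
		return 0;			/* no match, caller tries the next program */
	if (filter_res == -1)
		return configured_classid;	/* match, keep TCA_BPF_CLASSID */
	return (uint32_t)filter_res;		/* match, BPF return value overrides */
}

int main(void)
{
	assert(pick_classid(0, 0x10) == 0);
	assert(pick_classid(-1, 0x10) == 0x10);
	assert(pick_classid(0x20, 0x10) == 0x20);
	return 0;
}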
+	return ret; +} + +static void cls_bpf_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, +				   struct cls_bpf_prog *prog, +				   unsigned long base, struct nlattr **tb, +				   struct nlattr *est, bool ovr) +{ +	struct sock_filter *bpf_ops, *bpf_old; +	struct tcf_exts exts; +	struct sock_fprog_kern tmp; +	struct sk_filter *fp, *fp_old; +	u16 bpf_size, bpf_len; +	u32 classid; +	int ret; + +	if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID]) +		return -EINVAL; + +	tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); +	ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); +	if (ret < 0) +		return ret; + +	classid = nla_get_u32(tb[TCA_BPF_CLASSID]); +	bpf_len = nla_get_u16(tb[TCA_BPF_OPS_LEN]); +	if (bpf_len > BPF_MAXINSNS || bpf_len == 0) { +		ret = -EINVAL; +		goto errout; +	} + +	bpf_size = bpf_len * sizeof(*bpf_ops); +	bpf_ops = kzalloc(bpf_size, GFP_KERNEL); +	if (bpf_ops == NULL) { +		ret = -ENOMEM; +		goto errout; +	} + +	memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size); + +	tmp.len = bpf_len; +	tmp.filter = bpf_ops; + +	ret = sk_unattached_filter_create(&fp, &tmp); +	if (ret) +		goto errout_free; + +	tcf_tree_lock(tp); +	fp_old = prog->filter; +	bpf_old = prog->bpf_ops; + +	prog->bpf_len = bpf_len; +	prog->bpf_ops = bpf_ops; +	prog->filter = fp; +	prog->res.classid = classid; +	tcf_tree_unlock(tp); + +	tcf_bind_filter(tp, &prog->res, base); +	tcf_exts_change(tp, &prog->exts, &exts); + +	if (fp_old) +		sk_unattached_filter_destroy(fp_old); +	if (bpf_old) +		kfree(bpf_old); + +	return 0; + +errout_free: +	kfree(bpf_ops); +errout: +	tcf_exts_destroy(tp, &exts); +	return ret; +} + +static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, +				   struct cls_bpf_head *head) +{ +	unsigned int i = 0x80000000; + +	do { +		if (++head->hgen == 0x7FFFFFFF) +			head->hgen = 1; +	} while (--i > 0 && cls_bpf_get(tp, head->hgen)); +	if (i == 0) +		pr_err("Insufficient number of handles\n"); + +	return i; +} + +static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, +			  struct tcf_proto *tp, unsigned long base, +			  u32 handle, struct nlattr **tca, +			  unsigned long *arg, bool ovr) +{ +	struct cls_bpf_head *head = tp->root; +	struct cls_bpf_prog *prog = (struct cls_bpf_prog *) *arg; +	struct nlattr *tb[TCA_BPF_MAX + 1]; +	int ret; + +	if (tca[TCA_OPTIONS] == NULL) +		return -EINVAL; + +	ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy); +	if (ret < 0) +		return ret; + +	if (prog != NULL) { +		if (handle && prog->handle != handle) +			return -EINVAL; +		return cls_bpf_modify_existing(net, tp, prog, base, tb, +					       tca[TCA_RATE], ovr); +	} + +	prog = kzalloc(sizeof(*prog), GFP_KERNEL); +	if (prog == NULL) +		return -ENOBUFS; + +	tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE); +	if (handle == 0) +		prog->handle = cls_bpf_grab_new_handle(tp, head); +	else +		prog->handle = handle; +	if (prog->handle == 0) { +		ret = -EINVAL; +		goto errout; +	} + +	ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr); +	if (ret < 0) +		goto errout; + +	tcf_tree_lock(tp); +	list_add(&prog->link, &head->plist); +	tcf_tree_unlock(tp); + +	*arg = (unsigned long) prog; + +	return 0; +errout: +	if (*arg == 0UL && prog) +		kfree(prog); + +	return ret; +} + +static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, +			struct sk_buff *skb, struct tcmsg *tm) +{ +	struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh; +	struct nlattr 
*nest, *nla; + +	if (prog == NULL) +		return skb->len; + +	tm->tcm_handle = prog->handle; + +	nest = nla_nest_start(skb, TCA_OPTIONS); +	if (nest == NULL) +		goto nla_put_failure; + +	if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) +		goto nla_put_failure; +	if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_len)) +		goto nla_put_failure; + +	nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_len * +			  sizeof(struct sock_filter)); +	if (nla == NULL) +		goto nla_put_failure; + +	memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); + +	if (tcf_exts_dump(skb, &prog->exts) < 0) +		goto nla_put_failure; + +	nla_nest_end(skb, nest); + +	if (tcf_exts_dump_stats(skb, &prog->exts) < 0) +		goto nla_put_failure; + +	return skb->len; + +nla_put_failure: +	nla_nest_cancel(skb, nest); +	return -1; +} + +static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ +	struct cls_bpf_head *head = tp->root; +	struct cls_bpf_prog *prog; + +	list_for_each_entry(prog, &head->plist, link) { +		if (arg->count < arg->skip) +			goto skip; +		if (arg->fn(tp, (unsigned long) prog, arg) < 0) { +			arg->stop = 1; +			break; +		} +skip: +		arg->count++; +	} +} + +static struct tcf_proto_ops cls_bpf_ops __read_mostly = { +	.kind		=	"bpf", +	.owner		=	THIS_MODULE, +	.classify	=	cls_bpf_classify, +	.init		=	cls_bpf_init, +	.destroy	=	cls_bpf_destroy, +	.get		=	cls_bpf_get, +	.put		=	cls_bpf_put, +	.change		=	cls_bpf_change, +	.delete		=	cls_bpf_delete, +	.walk		=	cls_bpf_walk, +	.dump		=	cls_bpf_dump, +}; + +static int __init cls_bpf_init_mod(void) +{ +	return register_tcf_proto_ops(&cls_bpf_ops); +} + +static void __exit cls_bpf_exit_mod(void) +{ +	unregister_tcf_proto_ops(&cls_bpf_ops); +} + +module_init(cls_bpf_init_mod); +module_exit(cls_bpf_exit_mod); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 867b4a3e398..cacf01bd04f 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -11,109 +11,13 @@  #include <linux/module.h>  #include <linux/slab.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/errno.h>  #include <linux/skbuff.h> -#include <linux/cgroup.h>  #include <linux/rcupdate.h> -#include <linux/fdtable.h>  #include <net/rtnetlink.h>  #include <net/pkt_cls.h>  #include <net/sock.h>  #include <net/cls_cgroup.h> -static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css) -{ -	return css ? 
container_of(css, struct cgroup_cls_state, css) : NULL; -} - -static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) -{ -	return css_cls_state(task_css(p, net_cls_subsys_id)); -} - -static struct cgroup_subsys_state * -cgrp_css_alloc(struct cgroup_subsys_state *parent_css) -{ -	struct cgroup_cls_state *cs; - -	cs = kzalloc(sizeof(*cs), GFP_KERNEL); -	if (!cs) -		return ERR_PTR(-ENOMEM); -	return &cs->css; -} - -static int cgrp_css_online(struct cgroup_subsys_state *css) -{ -	struct cgroup_cls_state *cs = css_cls_state(css); -	struct cgroup_cls_state *parent = css_cls_state(css_parent(css)); - -	if (parent) -		cs->classid = parent->classid; -	return 0; -} - -static void cgrp_css_free(struct cgroup_subsys_state *css) -{ -	kfree(css_cls_state(css)); -} - -static int update_classid(const void *v, struct file *file, unsigned n) -{ -	int err; -	struct socket *sock = sock_from_file(file, &err); -	if (sock) -		sock->sk->sk_classid = (u32)(unsigned long)v; -	return 0; -} - -static void cgrp_attach(struct cgroup_subsys_state *css, -			struct cgroup_taskset *tset) -{ -	struct task_struct *p; -	void *v; - -	cgroup_taskset_for_each(p, css, tset) { -		task_lock(p); -		v = (void *)(unsigned long)task_cls_classid(p); -		iterate_fd(p->files, 0, update_classid, v); -		task_unlock(p); -	} -} - -static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) -{ -	return css_cls_state(css)->classid; -} - -static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, -			 u64 value) -{ -	css_cls_state(css)->classid = (u32) value; -	return 0; -} - -static struct cftype ss_files[] = { -	{ -		.name = "classid", -		.read_u64 = read_classid, -		.write_u64 = write_classid, -	}, -	{ }	/* terminate */ -}; - -struct cgroup_subsys net_cls_subsys = { -	.name		= "net_cls", -	.css_alloc	= cgrp_css_alloc, -	.css_online	= cgrp_css_online, -	.css_free	= cgrp_css_free, -	.attach		= cgrp_attach, -	.subsys_id	= net_cls_subsys_id, -	.base_cftypes	= ss_files, -	.module		= THIS_MODULE, -}; -  struct cls_cgroup_head {  	u32			handle;  	struct tcf_exts		exts; @@ -172,11 +76,6 @@ static int cls_cgroup_init(struct tcf_proto *tp)  	return 0;  } -static const struct tcf_ext_map cgroup_ext_map = { -	.action = TCA_CGROUP_ACT, -	.police = TCA_CGROUP_POLICE, -}; -  static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = {  	[TCA_CGROUP_EMATCHES]	= { .type = NLA_NESTED },  }; @@ -184,7 +83,7 @@ static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = {  static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,  			     struct tcf_proto *tp, unsigned long base,  			     u32 handle, struct nlattr **tca, -			     unsigned long *arg) +			     unsigned long *arg, bool ovr)  {  	struct nlattr *tb[TCA_CGROUP_MAX + 1];  	struct cls_cgroup_head *head = tp->root; @@ -203,6 +102,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,  		if (head == NULL)  			return -ENOBUFS; +		tcf_exts_init(&head->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);  		head->handle = handle;  		tcf_tree_lock(tp); @@ -218,8 +118,8 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,  	if (err < 0)  		return err; -	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, -				&cgroup_ext_map); +	tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); +	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);  	if (err < 0)  		return err; @@ -264,7 +164,7 @@ skip:  	arg->count++;  } -static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh, +static int 
cls_cgroup_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,  			   struct sk_buff *skb, struct tcmsg *t)  {  	struct cls_cgroup_head *head = tp->root; @@ -277,13 +177,13 @@ static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh,  	if (nest == NULL)  		goto nla_put_failure; -	if (tcf_exts_dump(skb, &head->exts, &cgroup_ext_map) < 0 || +	if (tcf_exts_dump(skb, &head->exts) < 0 ||  	    tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0)  		goto nla_put_failure;  	nla_nest_end(skb, nest); -	if (tcf_exts_dump_stats(skb, &head->exts, &cgroup_ext_map) < 0) +	if (tcf_exts_dump_stats(skb, &head->exts) < 0)  		goto nla_put_failure;  	return skb->len; @@ -309,25 +209,12 @@ static struct tcf_proto_ops cls_cgroup_ops __read_mostly = {  static int __init init_cgroup_cls(void)  { -	int ret; - -	ret = cgroup_load_subsys(&net_cls_subsys); -	if (ret) -		goto out; - -	ret = register_tcf_proto_ops(&cls_cgroup_ops); -	if (ret) -		cgroup_unload_subsys(&net_cls_subsys); - -out: -	return ret; +	return register_tcf_proto_ops(&cls_cgroup_ops);  }  static void __exit exit_cgroup_cls(void)  {  	unregister_tcf_proto_ops(&cls_cgroup_ops); - -	cgroup_unload_subsys(&net_cls_subsys);  }  module_init(init_cgroup_cls); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 7881e2fccbc..35be16f7c19 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -56,11 +56,6 @@ struct flow_filter {  	u32			hashrnd;  }; -static const struct tcf_ext_map flow_ext_map = { -	.action	= TCA_FLOW_ACT, -	.police	= TCA_FLOW_POLICE, -}; -  static inline u32 addr_fold(void *addr)  {  	unsigned long a = (unsigned long)addr; @@ -220,7 +215,7 @@ static u32 flow_get_vlan_tag(const struct sk_buff *skb)  static u32 flow_get_rxhash(struct sk_buff *skb)  { -	return skb_get_rxhash(skb); +	return skb_get_hash(skb);  }  static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow) @@ -354,7 +349,7 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {  static int flow_change(struct net *net, struct sk_buff *in_skb,  		       struct tcf_proto *tp, unsigned long base,  		       u32 handle, struct nlattr **tca, -		       unsigned long *arg) +		       unsigned long *arg, bool ovr)  {  	struct flow_head *head = tp->root;  	struct flow_filter *f; @@ -397,7 +392,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,  			return -EOPNOTSUPP;  	} -	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, &flow_ext_map); +	tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE); +	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);  	if (err < 0)  		return err; @@ -455,6 +451,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,  		f->handle = handle;  		f->mask	  = ~0U; +		tcf_exts_init(&f->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);  		get_random_bytes(&f->hashrnd, 4);  		f->perturb_timer.function = flow_perturbation; @@ -566,7 +563,7 @@ static void flow_put(struct tcf_proto *tp, unsigned long f)  {  } -static int flow_dump(struct tcf_proto *tp, unsigned long fh, +static int flow_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,  		     struct sk_buff *skb, struct tcmsg *t)  {  	struct flow_filter *f = (struct flow_filter *)fh; @@ -608,7 +605,7 @@ static int flow_dump(struct tcf_proto *tp, unsigned long fh,  	    nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ))  		goto nla_put_failure; -	if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0) +	if (tcf_exts_dump(skb, &f->exts) < 0)  		goto nla_put_failure;  #ifdef CONFIG_NET_EMATCH  	if 
(f->ematches.hdr.nmatches && @@ -617,7 +614,7 @@ static int flow_dump(struct tcf_proto *tp, unsigned long fh,  #endif  	nla_nest_end(skb, nest); -	if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0) +	if (tcf_exts_dump_stats(skb, &f->exts) < 0)  		goto nla_put_failure;  	return skb->len; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 9b97172db84..861b03ccfed 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -29,11 +29,11 @@  #include <net/act_api.h>  #include <net/pkt_cls.h> -#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *)) +#define HTSIZE 256  struct fw_head { -	struct fw_filter *ht[HTSIZE]; -	u32 mask; +	u32			mask; +	struct fw_filter	*ht[HTSIZE];  };  struct fw_filter { @@ -41,46 +41,22 @@ struct fw_filter {  	u32			id;  	struct tcf_result	res;  #ifdef CONFIG_NET_CLS_IND -	char			indev[IFNAMSIZ]; +	int			ifindex;  #endif /* CONFIG_NET_CLS_IND */  	struct tcf_exts		exts;  }; -static const struct tcf_ext_map fw_ext_map = { -	.action = TCA_FW_ACT, -	.police = TCA_FW_POLICE -}; - -static inline int fw_hash(u32 handle) +static u32 fw_hash(u32 handle)  { -	if (HTSIZE == 4096) -		return ((handle >> 24) & 0xFFF) ^ -		       ((handle >> 12) & 0xFFF) ^ -		       (handle & 0xFFF); -	else if (HTSIZE == 2048) -		return ((handle >> 22) & 0x7FF) ^ -		       ((handle >> 11) & 0x7FF) ^ -		       (handle & 0x7FF); -	else if (HTSIZE == 1024) -		return ((handle >> 20) & 0x3FF) ^ -		       ((handle >> 10) & 0x3FF) ^ -		       (handle & 0x3FF); -	else if (HTSIZE == 512) -		return (handle >> 27) ^ -		       ((handle >> 18) & 0x1FF) ^ -		       ((handle >> 9) & 0x1FF) ^ -		       (handle & 0x1FF); -	else if (HTSIZE == 256) { -		u8 *t = (u8 *) &handle; -		return t[0] ^ t[1] ^ t[2] ^ t[3]; -	} else -		return handle & (HTSIZE - 1); +	handle ^= (handle >> 16); +	handle ^= (handle >> 8); +	return handle % HTSIZE;  }  static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,  			  struct tcf_result *res)  { -	struct fw_head *head = (struct fw_head *)tp->root; +	struct fw_head *head = tp->root;  	struct fw_filter *f;  	int r;  	u32 id = skb->mark; @@ -91,7 +67,7 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,  			if (f->id == id) {  				*res = f->res;  #ifdef CONFIG_NET_CLS_IND -				if (!tcf_match_indev(skb, f->indev)) +				if (!tcf_match_indev(skb, f->ifindex))  					continue;  #endif /* CONFIG_NET_CLS_IND */  				r = tcf_exts_exec(skb, &f->exts, res); @@ -116,7 +92,7 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,  static unsigned long fw_get(struct tcf_proto *tp, u32 handle)  { -	struct fw_head *head = (struct fw_head *)tp->root; +	struct fw_head *head = tp->root;  	struct fw_filter *f;  	if (head == NULL) @@ -165,7 +141,7 @@ static void fw_destroy(struct tcf_proto *tp)  static int fw_delete(struct tcf_proto *tp, unsigned long arg)  { -	struct fw_head *head = (struct fw_head *)tp->root; +	struct fw_head *head = tp->root;  	struct fw_filter *f = (struct fw_filter *)arg;  	struct fw_filter **fp; @@ -193,14 +169,15 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {  static int  fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, -	struct nlattr **tb, struct nlattr **tca, unsigned long base) +	struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr)  { -	struct fw_head *head = (struct fw_head *)tp->root; +	struct fw_head *head = tp->root;  	struct tcf_exts e;  	u32 mask;  	int err; -	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, &fw_ext_map); +	
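/*
 * Editor's note, not part of the patch above: the new fw_hash() replaces the
 * old HTSIZE-dependent chain of shifts with a single shift/xor fold.  For the
 * chosen HTSIZE of 256 the fold is equivalent to xoring the four bytes of the
 * mark together, as the old code did for that table size; the stand-alone
 * check below (fw_hash_bytes() is a hypothetical reference written only for
 * this comparison) exercises that equivalence.
 */
#include <assert.h>
#include <stdint.h>

#define HTSIZE 256

static uint32_t fw_hash(uint32_t handle)
{
	handle ^= (handle >> 16);
	handle ^= (handle >> 8);
	return handle % HTSIZE;
}

/* old behaviour for HTSIZE == 256: xor of the four bytes of the handle */
static uint32_t fw_hash_bytes(uint32_t handle)
{
	return ((handle >> 24) ^ (handle >> 16) ^ (handle >> 8) ^ handle) & 0xFF;
}

int main(void)
{
	uint32_t h;

	for (h = 0; h < 0x200000; h++)
		assert(fw_hash(h) == fw_hash_bytes(h));
	assert(fw_hash(0xdeadbeef) == fw_hash_bytes(0xdeadbeef));
	return 0;
}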
tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE); +	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);  	if (err < 0)  		return err; @@ -211,9 +188,13 @@ fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f,  #ifdef CONFIG_NET_CLS_IND  	if (tb[TCA_FW_INDEV]) { -		err = tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV]); -		if (err < 0) +		int ret; +		ret = tcf_change_indev(net, tb[TCA_FW_INDEV]); +		if (ret < 0) { +			err = ret;  			goto errout; +		} +		f->ifindex = ret;  	}  #endif /* CONFIG_NET_CLS_IND */ @@ -237,9 +218,9 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,  		     struct tcf_proto *tp, unsigned long base,  		     u32 handle,  		     struct nlattr **tca, -		     unsigned long *arg) +		     unsigned long *arg, bool ovr)  { -	struct fw_head *head = (struct fw_head *)tp->root; +	struct fw_head *head = tp->root;  	struct fw_filter *f = (struct fw_filter *) *arg;  	struct nlattr *opt = tca[TCA_OPTIONS];  	struct nlattr *tb[TCA_FW_MAX + 1]; @@ -255,7 +236,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,  	if (f != NULL) {  		if (f->id != handle && handle)  			return -EINVAL; -		return fw_change_attrs(net, tp, f, tb, tca, base); +		return fw_change_attrs(net, tp, f, tb, tca, base, ovr);  	}  	if (!handle) @@ -280,9 +261,10 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,  	if (f == NULL)  		return -ENOBUFS; +	tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE);  	f->id = handle; -	err = fw_change_attrs(net, tp, f, tb, tca, base); +	err = fw_change_attrs(net, tp, f, tb, tca, base, ovr);  	if (err < 0)  		goto errout; @@ -301,7 +283,7 @@ errout:  static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)  { -	struct fw_head *head = (struct fw_head *)tp->root; +	struct fw_head *head = tp->root;  	int h;  	if (head == NULL) @@ -327,10 +309,10 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)  	}  } -static int fw_dump(struct tcf_proto *tp, unsigned long fh, +static int fw_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,  		   struct sk_buff *skb, struct tcmsg *t)  { -	struct fw_head *head = (struct fw_head *)tp->root; +	struct fw_head *head = tp->root;  	struct fw_filter *f = (struct fw_filter *)fh;  	unsigned char *b = skb_tail_pointer(skb);  	struct nlattr *nest; @@ -351,20 +333,23 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh,  	    nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid))  		goto nla_put_failure;  #ifdef CONFIG_NET_CLS_IND -	if (strlen(f->indev) && -	    nla_put_string(skb, TCA_FW_INDEV, f->indev)) -		goto nla_put_failure; +	if (f->ifindex) { +		struct net_device *dev; +		dev = __dev_get_by_index(net, f->ifindex); +		if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name)) +			goto nla_put_failure; +	}  #endif /* CONFIG_NET_CLS_IND */  	if (head->mask != 0xFFFFFFFF &&  	    nla_put_u32(skb, TCA_FW_MASK, head->mask))  		goto nla_put_failure; -	if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0) +	if (tcf_exts_dump(skb, &f->exts) < 0)  		goto nla_put_failure;  	nla_nest_end(skb, nest); -	if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0) +	if (tcf_exts_dump_stats(skb, &f->exts) < 0)  		goto nla_put_failure;  	return skb->len; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 37da567d833..dd9fc2523c7 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -59,11 +59,6 @@ struct route4_filter {  #define ROUTE4_FAILURE ((struct route4_filter *)(-1L)) -static const struct tcf_ext_map route_ext_map = { -	.police = 
TCA_ROUTE4_POLICE, -	.action = TCA_ROUTE4_ACT -}; -  static inline int route4_fastmap_hash(u32 id, int iif)  {  	return id & 0xF; @@ -128,7 +123,7 @@ static inline int route4_hash_wild(void)  static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp,  			   struct tcf_result *res)  { -	struct route4_head *head = (struct route4_head *)tp->root; +	struct route4_head *head = tp->root;  	struct dst_entry *dst;  	struct route4_bucket *b;  	struct route4_filter *f; @@ -218,7 +213,7 @@ static inline u32 from_hash(u32 id)  static unsigned long route4_get(struct tcf_proto *tp, u32 handle)  { -	struct route4_head *head = (struct route4_head *)tp->root; +	struct route4_head *head = tp->root;  	struct route4_bucket *b;  	struct route4_filter *f;  	unsigned int h1, h2; @@ -289,7 +284,7 @@ static void route4_destroy(struct tcf_proto *tp)  static int route4_delete(struct tcf_proto *tp, unsigned long arg)  { -	struct route4_head *head = (struct route4_head *)tp->root; +	struct route4_head *head = tp->root;  	struct route4_filter **fp, *f = (struct route4_filter *)arg;  	unsigned int h = 0;  	struct route4_bucket *b; @@ -338,7 +333,8 @@ static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = {  static int route4_set_parms(struct net *net, struct tcf_proto *tp,  			    unsigned long base, struct route4_filter *f,  			    u32 handle, struct route4_head *head, -			    struct nlattr **tb, struct nlattr *est, int new) +			    struct nlattr **tb, struct nlattr *est, int new, +			    bool ovr)  {  	int err;  	u32 id = 0, to = 0, nhandle = 0x8000; @@ -347,7 +343,8 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp,  	struct route4_bucket *b;  	struct tcf_exts e; -	err = tcf_exts_validate(net, tp, tb, est, &e, &route_ext_map); +	tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); +	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);  	if (err < 0)  		return err; @@ -432,7 +429,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,  		       struct tcf_proto *tp, unsigned long base,  		       u32 handle,  		       struct nlattr **tca, -		       unsigned long *arg) +		       unsigned long *arg, bool ovr)  {  	struct route4_head *head = tp->root;  	struct route4_filter *f, *f1, **fp; @@ -459,7 +456,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,  			old_handle = f->handle;  		err = route4_set_parms(net, tp, base, f, handle, head, tb, -			tca[TCA_RATE], 0); +			tca[TCA_RATE], 0, ovr);  		if (err < 0)  			return err; @@ -481,8 +478,9 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,  	if (f == NULL)  		goto errout; +	tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);  	err = route4_set_parms(net, tp, base, f, handle, head, tb, -		tca[TCA_RATE], 1); +		tca[TCA_RATE], 1, ovr);  	if (err < 0)  		goto errout; @@ -554,7 +552,7 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)  	}  } -static int route4_dump(struct tcf_proto *tp, unsigned long fh, +static int route4_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,  		       struct sk_buff *skb, struct tcmsg *t)  {  	struct route4_filter *f = (struct route4_filter *)fh; @@ -589,12 +587,12 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh,  	    nla_put_u32(skb, TCA_ROUTE4_CLASSID, f->res.classid))  		goto nla_put_failure; -	if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0) +	if (tcf_exts_dump(skb, &f->exts) < 0)  		goto nla_put_failure;  	nla_nest_end(skb, nest); -	if (tcf_exts_dump_stats(skb, &f->exts, 
&route_ext_map) < 0) +	if (tcf_exts_dump_stats(skb, &f->exts) < 0)  		goto nla_put_failure;  	return skb->len; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 252d8b05872..1020e233a5d 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -116,11 +116,6 @@ static inline unsigned int hash_src(__be32 *src)  	return h & 0xF;  } -static struct tcf_ext_map rsvp_ext_map = { -	.police = TCA_RSVP_POLICE, -	.action = TCA_RSVP_ACT -}; -  #define RSVP_APPLY_RESULT()				\  {							\  	int r = tcf_exts_exec(skb, &f->exts, res);	\ @@ -420,7 +415,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb,  		       struct tcf_proto *tp, unsigned long base,  		       u32 handle,  		       struct nlattr **tca, -		       unsigned long *arg) +		       unsigned long *arg, bool ovr)  {  	struct rsvp_head *data = tp->root;  	struct rsvp_filter *f, **fp; @@ -440,7 +435,8 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb,  	if (err < 0)  		return err; -	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, &rsvp_ext_map); +	tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE); +	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);  	if (err < 0)  		return err; @@ -471,6 +467,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb,  	if (f == NULL)  		goto errout2; +	tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);  	h2 = 16;  	if (tb[TCA_RSVP_SRC]) {  		memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src)); @@ -597,7 +594,7 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)  	}  } -static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, +static int rsvp_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,  		     struct sk_buff *skb, struct tcmsg *t)  {  	struct rsvp_filter *f = (struct rsvp_filter *)fh; @@ -633,12 +630,12 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,  	    nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src))  		goto nla_put_failure; -	if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0) +	if (tcf_exts_dump(skb, &f->exts) < 0)  		goto nla_put_failure;  	nla_nest_end(skb, nest); -	if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0) +	if (tcf_exts_dump_stats(skb, &f->exts) < 0)  		goto nla_put_failure;  	return skb->len; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index b86535a4016..c721cd4a469 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -24,9 +24,6 @@  #define DEFAULT_HASH_SIZE	64	/* optimized for diffserv */ -#define	PRIV(tp)	((struct tcindex_data *) (tp)->root) - -  struct tcindex_filter_result {  	struct tcf_exts		exts;  	struct tcf_result	res; @@ -50,11 +47,6 @@ struct tcindex_data {  	int fall_through;	/* 0: only classify if explicit match */  }; -static const struct tcf_ext_map tcindex_ext_map = { -	.police = TCA_TCINDEX_POLICE, -	.action = TCA_TCINDEX_ACT -}; -  static inline int  tcindex_filter_is_set(struct tcindex_filter_result *r)  { @@ -82,7 +74,7 @@ tcindex_lookup(struct tcindex_data *p, u16 key)  static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp,  			    struct tcf_result *res)  { -	struct tcindex_data *p = PRIV(tp); +	struct tcindex_data *p = tp->root;  	struct tcindex_filter_result *f;  	int key = (skb->tc_index & p->mask) >> p->shift; @@ -107,7 +99,7 @@ static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp,  static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle)  { -	struct tcindex_data *p = PRIV(tp); +	struct tcindex_data *p = tp->root;  	
struct tcindex_filter_result *r;  	pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); @@ -145,7 +137,7 @@ static int tcindex_init(struct tcf_proto *tp)  static int  __tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock)  { -	struct tcindex_data *p = PRIV(tp); +	struct tcindex_data *p = tp->root;  	struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg;  	struct tcindex_filter *f = NULL; @@ -196,11 +188,17 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {  	[TCA_TCINDEX_CLASSID]		= { .type = NLA_U32 },  }; +static void tcindex_filter_result_init(struct tcindex_filter_result *r) +{ +	memset(r, 0, sizeof(*r)); +	tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); +} +  static int  tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,  		  u32 handle, struct tcindex_data *p,  		  struct tcindex_filter_result *r, struct nlattr **tb, -		 struct nlattr *est) +		  struct nlattr *est, bool ovr)  {  	int err, balloc = 0;  	struct tcindex_filter_result new_filter_result, *old_r = r; @@ -209,17 +207,17 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,  	struct tcindex_filter *f = NULL; /* make gcc behave */  	struct tcf_exts e; -	err = tcf_exts_validate(net, tp, tb, est, &e, &tcindex_ext_map); +	tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); +	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);  	if (err < 0)  		return err;  	memcpy(&cp, p, sizeof(cp)); -	memset(&new_filter_result, 0, sizeof(new_filter_result)); +	tcindex_filter_result_init(&new_filter_result); +	tcindex_filter_result_init(&cr);  	if (old_r) -		memcpy(&cr, r, sizeof(cr)); -	else -		memset(&cr, 0, sizeof(cr)); +		cr.res = r->res;  	if (tb[TCA_TCINDEX_HASH])  		cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); @@ -271,9 +269,14 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,  	err = -ENOMEM;  	if (!cp.perfect && !cp.h) {  		if (valid_perfect_hash(&cp)) { +			int i; +  			cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL);  			if (!cp.perfect)  				goto errout; +			for (i = 0; i < cp.hash; i++) +				tcf_exts_init(&cp.perfect[i].exts, TCA_TCINDEX_ACT, +					      TCA_TCINDEX_POLICE);  			balloc = 1;  		} else {  			cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL); @@ -299,14 +302,17 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,  		tcf_bind_filter(tp, &cr.res, base);  	} -	tcf_exts_change(tp, &cr.exts, &e); +	if (old_r) +		tcf_exts_change(tp, &r->exts, &e); +	else +		tcf_exts_change(tp, &cr.exts, &e);  	tcf_tree_lock(tp);  	if (old_r && old_r != r) -		memset(old_r, 0, sizeof(*old_r)); +		tcindex_filter_result_init(old_r);  	memcpy(p, &cp, sizeof(cp)); -	memcpy(r, &cr, sizeof(cr)); +	r->res = cr.res;  	if (r == &new_filter_result) {  		struct tcindex_filter **fp; @@ -335,11 +341,11 @@ errout:  static int  tcindex_change(struct net *net, struct sk_buff *in_skb,  	       struct tcf_proto *tp, unsigned long base, u32 handle, -	       struct nlattr **tca, unsigned long *arg) +	       struct nlattr **tca, unsigned long *arg, bool ovr)  {  	struct nlattr *opt = tca[TCA_OPTIONS];  	struct nlattr *tb[TCA_TCINDEX_MAX + 1]; -	struct tcindex_data *p = PRIV(tp); +	struct tcindex_data *p = tp->root;  	struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg;  	int err; @@ -355,13 +361,13 @@ tcindex_change(struct net *net, struct sk_buff *in_skb,  		return err;  	return tcindex_set_parms(net, tp, base, handle, p, r, tb, -				 tca[TCA_RATE]); +				 
tca[TCA_RATE], ovr);  }  static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)  { -	struct tcindex_data *p = PRIV(tp); +	struct tcindex_data *p = tp->root;  	struct tcindex_filter *f, *next;  	int i; @@ -408,7 +414,7 @@ static int tcindex_destroy_element(struct tcf_proto *tp,  static void tcindex_destroy(struct tcf_proto *tp)  { -	struct tcindex_data *p = PRIV(tp); +	struct tcindex_data *p = tp->root;  	struct tcf_walker walker;  	pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); @@ -423,10 +429,10 @@ static void tcindex_destroy(struct tcf_proto *tp)  } -static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, +static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,      struct sk_buff *skb, struct tcmsg *t)  { -	struct tcindex_data *p = PRIV(tp); +	struct tcindex_data *p = tp->root;  	struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh;  	unsigned char *b = skb_tail_pointer(skb);  	struct nlattr *nest; @@ -468,11 +474,11 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,  		    nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid))  			goto nla_put_failure; -		if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0) +		if (tcf_exts_dump(skb, &r->exts) < 0)  			goto nla_put_failure;  		nla_nest_end(skb, nest); -		if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0) +		if (tcf_exts_dump_stats(skb, &r->exts) < 0)  			goto nla_put_failure;  	} diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index eb07a1e536e..70c0be8d012 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -38,6 +38,7 @@  #include <linux/errno.h>  #include <linux/rtnetlink.h>  #include <linux/skbuff.h> +#include <linux/bitmap.h>  #include <net/netlink.h>  #include <net/act_api.h>  #include <net/pkt_cls.h> @@ -48,7 +49,7 @@ struct tc_u_knode {  	struct tc_u_hnode	*ht_up;  	struct tcf_exts		exts;  #ifdef CONFIG_NET_CLS_IND -	char                     indev[IFNAMSIZ]; +	int			ifindex;  #endif  	u8			fshift;  	struct tcf_result	res; @@ -79,11 +80,6 @@ struct tc_u_common {  	u32			hgenerator;  }; -static const struct tcf_ext_map u32_ext_map = { -	.action = TCA_U32_ACT, -	.police = TCA_U32_POLICE -}; -  static inline unsigned int u32_hash_fold(__be32 key,  					 const struct tc_u32_sel *sel,  					 u8 fshift) @@ -100,7 +96,7 @@ static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct  		unsigned int	  off;  	} stack[TC_U32_MAXDEPTH]; -	struct tc_u_hnode *ht = (struct tc_u_hnode *)tp->root; +	struct tc_u_hnode *ht = tp->root;  	unsigned int off = skb_network_offset(skb);  	struct tc_u_knode *n;  	int sdepth = 0; @@ -157,7 +153,7 @@ check_terminal:  				*res = n->res;  #ifdef CONFIG_NET_CLS_IND -				if (!tcf_match_indev(skb, n->indev)) { +				if (!tcf_match_indev(skb, n->ifindex)) {  					n = n->next;  					goto next_knode;  				} @@ -352,7 +348,7 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n)  	return 0;  } -static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) +static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)  {  	struct tc_u_knode **kp;  	struct tc_u_hnode *ht = key->ht_up; @@ -465,17 +461,25 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg)  	return 0;  } +#define NR_U32_NODE (1<<12)  static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)  {  	struct tc_u_knode *n; -	unsigned int i = 0x7FF; +	unsigned long i; +	unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long), +					GFP_KERNEL); +	
if (!bitmap) +		return handle | 0xFFF;  	for (n = ht->ht[TC_U32_HASH(handle)]; n; n = n->next) -		if (i < TC_U32_NODE(n->handle)) -			i = TC_U32_NODE(n->handle); -	i++; +		set_bit(TC_U32_NODE(n->handle), bitmap); + +	i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800); +	if (i >= NR_U32_NODE) +		i = find_next_zero_bit(bitmap, NR_U32_NODE, 1); -	return handle | (i > 0xFFF ? 0xFFF : i); +	kfree(bitmap); +	return handle | (i >= NR_U32_NODE ? 0xFFF : i);  }  static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { @@ -491,12 +495,13 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {  static int u32_set_parms(struct net *net, struct tcf_proto *tp,  			 unsigned long base, struct tc_u_hnode *ht,  			 struct tc_u_knode *n, struct nlattr **tb, -			 struct nlattr *est) +			 struct nlattr *est, bool ovr)  {  	int err;  	struct tcf_exts e; -	err = tcf_exts_validate(net, tp, tb, est, &e, &u32_ext_map); +	tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE); +	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);  	if (err < 0)  		return err; @@ -531,9 +536,11 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,  #ifdef CONFIG_NET_CLS_IND  	if (tb[TCA_U32_INDEV]) { -		err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV]); -		if (err < 0) +		int ret; +		ret = tcf_change_indev(net, tb[TCA_U32_INDEV]); +		if (ret < 0)  			goto errout; +		n->ifindex = ret;  	}  #endif  	tcf_exts_change(tp, &n->exts, &e); @@ -547,7 +554,7 @@ errout:  static int u32_change(struct net *net, struct sk_buff *in_skb,  		      struct tcf_proto *tp, unsigned long base, u32 handle,  		      struct nlattr **tca, -		      unsigned long *arg) +		      unsigned long *arg, bool ovr)  {  	struct tc_u_common *tp_c = tp->data;  	struct tc_u_hnode *ht; @@ -571,7 +578,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,  			return -EINVAL;  		return u32_set_parms(net, tp, base, n->ht_up, n, tb, -				     tca[TCA_RATE]); +				     tca[TCA_RATE], ovr);  	}  	if (tb[TCA_U32_DIVISOR]) { @@ -646,6 +653,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,  	n->ht_up = ht;  	n->handle = handle;  	n->fshift = s->hmask ? 
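/*
 * Editor's note, not part of the patch above: gen_new_kid() now records the
 * node ids already used in the bucket in a bitmap and picks a free one with
 * find_next_zero_bit(), searching from 0x800 first and wrapping to 1 before
 * giving up with 0xFFF.  The user-space sketch below (pick_node_id() and the
 * used[] array are hypothetical stand-ins for the kernel bitmap helpers)
 * mirrors that search order.
 */
#include <stdio.h>

#define NR_U32_NODE (1 << 12)

static unsigned char used[NR_U32_NODE];	/* used[i] != 0: node id i is taken */

static unsigned int pick_node_id(void)
{
	unsigned int i;

	for (i = 0x800; i < NR_U32_NODE; i++)	/* preferred range first */
		if (!used[i])
			return i;
	for (i = 1; i < NR_U32_NODE; i++)	/* then wrap to the low ids */
		if (!used[i])
			return i;
	return 0xFFF;				/* bucket is full */
}

int main(void)
{
	used[0x800] = 1;
	printf("first free node id: 0x%x\n", pick_node_id());	/* prints 0x801 */
	return 0;
}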
ffs(ntohl(s->hmask)) - 1 : 0; +	tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);  #ifdef CONFIG_CLS_U32_MARK  	if (tb[TCA_U32_MARK]) { @@ -657,7 +665,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,  	}  #endif -	err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE]); +	err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr);  	if (err == 0) {  		struct tc_u_knode **ins;  		for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) @@ -715,7 +723,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)  	}  } -static int u32_dump(struct tcf_proto *tp, unsigned long fh, +static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,  		     struct sk_buff *skb, struct tcmsg *t)  {  	struct tc_u_knode *n = (struct tc_u_knode *)fh; @@ -759,13 +767,16 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh,  			goto nla_put_failure;  #endif -		if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0) +		if (tcf_exts_dump(skb, &n->exts) < 0)  			goto nla_put_failure;  #ifdef CONFIG_NET_CLS_IND -		if (strlen(n->indev) && -		    nla_put_string(skb, TCA_U32_INDEV, n->indev)) -			goto nla_put_failure; +		if (n->ifindex) { +			struct net_device *dev; +			dev = __dev_get_by_index(net, n->ifindex); +			if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name)) +				goto nla_put_failure; +		}  #endif  #ifdef CONFIG_CLS_U32_PERF  		if (nla_put(skb, TCA_U32_PCNT, @@ -778,7 +789,7 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh,  	nla_nest_end(skb, nest);  	if (TC_U32_KEY(n->handle)) -		if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0) +		if (tcf_exts_dump_stats(skb, &n->exts) < 0)  			goto nla_put_failure;  	return skb->len; diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c index 938b7cbf562..527aeb7a3ff 100644 --- a/net/sched/em_ipset.c +++ b/net/sched/em_ipset.c @@ -24,11 +24,12 @@ static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len,  {  	struct xt_set_info *set = data;  	ip_set_id_t index; +	struct net *net = dev_net(qdisc_dev(tp->q));  	if (data_len != sizeof(*set))  		return -EINVAL; -	index = ip_set_nfnl_get_byindex(set->index); +	index = ip_set_nfnl_get_byindex(net, set->index);  	if (index == IPSET_INVALID_ID)  		return -ENOENT; @@ -37,7 +38,7 @@ static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len,  	if (em->data)  		return 0; -	ip_set_nfnl_put(index); +	ip_set_nfnl_put(net, index);  	return -ENOMEM;  } @@ -45,7 +46,7 @@ static void em_ipset_destroy(struct tcf_proto *p, struct tcf_ematch *em)  {  	const struct xt_set_info *set = (const void *) em->data;  	if (set) { -		ip_set_nfnl_put(set->index); +		ip_set_nfnl_put(dev_net(qdisc_dev(p->q)), set->index);  		kfree((void *) em->data);  	}  } diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 7c3de6ffa51..9b8c0b0e60d 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -222,7 +222,7 @@ META_COLLECTOR(int_maclen)  META_COLLECTOR(int_rxhash)  { -	dst->value = skb_get_rxhash(skb); +	dst->value = skb_get_hash(skb);  }  /************************************************************************** @@ -271,40 +271,52 @@ META_COLLECTOR(int_rtiif)   * Socket Attributes   **************************************************************************/ -#define SKIP_NONLOCAL(skb)			\ -	if (unlikely(skb->sk == NULL)) {	\ -		*err = -1;			\ -		return;				\ -	} +#define skip_nonlocal(skb) \ +	(unlikely(skb->sk == NULL))  META_COLLECTOR(int_sk_family)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		
*err = -1; +		return; +	}  	dst->value = skb->sk->sk_family;  }  META_COLLECTOR(int_sk_state)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_state;  }  META_COLLECTOR(int_sk_reuse)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_reuse;  }  META_COLLECTOR(int_sk_bound_if)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	/* No error if bound_dev_if is 0, legal userspace check */  	dst->value = skb->sk->sk_bound_dev_if;  }  META_COLLECTOR(var_sk_bound_if)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	if (skb->sk->sk_bound_dev_if == 0) {  		dst->value = (unsigned long) "any"; @@ -322,151 +334,226 @@ META_COLLECTOR(var_sk_bound_if)  META_COLLECTOR(int_sk_refcnt)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = atomic_read(&skb->sk->sk_refcnt);  }  META_COLLECTOR(int_sk_rcvbuf)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_rcvbuf;  }  META_COLLECTOR(int_sk_shutdown)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_shutdown;  }  META_COLLECTOR(int_sk_proto)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_protocol;  }  META_COLLECTOR(int_sk_type)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_type;  }  META_COLLECTOR(int_sk_rmem_alloc)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = sk_rmem_alloc_get(skb->sk);  }  META_COLLECTOR(int_sk_wmem_alloc)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = sk_wmem_alloc_get(skb->sk);  }  META_COLLECTOR(int_sk_omem_alloc)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = atomic_read(&skb->sk->sk_omem_alloc);  }  META_COLLECTOR(int_sk_rcv_qlen)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_receive_queue.qlen;  }  META_COLLECTOR(int_sk_snd_qlen)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_write_queue.qlen;  }  META_COLLECTOR(int_sk_wmem_queued)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_wmem_queued;  }  META_COLLECTOR(int_sk_fwd_alloc)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_forward_alloc;  }  META_COLLECTOR(int_sk_sndbuf)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_sndbuf;  }  META_COLLECTOR(int_sk_alloc)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = (__force int) skb->sk->sk_allocation;  }  META_COLLECTOR(int_sk_hash)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_hash;  }  META_COLLECTOR(int_sk_lingertime)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_lingertime / HZ;  }  META_COLLECTOR(int_sk_err_qlen)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_error_queue.qlen;  }  
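
The conversion running through these collectors replaces the SKIP_NONLOCAL() macro, which hid an early return inside its body, with an explicit check at every call site; checkers such as checkpatch warn about macros that contain flow-control statements. A minimal sketch of the two forms, using a hypothetical collector (int_sk_example and its payload are illustrative only, not part of the patch):

	/* old: the return is buried inside the macro */
	#define SKIP_NONLOCAL(skb)			\
		if (unlikely(skb->sk == NULL)) {	\
			*err = -1;			\
			return;				\
		}

	/* new: the macro is a plain predicate, control flow stays visible */
	#define skip_nonlocal(skb) \
		(unlikely(skb->sk == NULL))

	META_COLLECTOR(int_sk_example)
	{
		if (skip_nonlocal(skb)) {
			*err = -1;
			return;
		}
		dst->value = 0;	/* illustrative payload */
	}
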
META_COLLECTOR(int_sk_ack_bl)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_ack_backlog;  }  META_COLLECTOR(int_sk_max_ack_bl)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_max_ack_backlog;  }  META_COLLECTOR(int_sk_prio)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_priority;  }  META_COLLECTOR(int_sk_rcvlowat)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_rcvlowat;  }  META_COLLECTOR(int_sk_rcvtimeo)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_rcvtimeo / HZ;  }  META_COLLECTOR(int_sk_sndtimeo)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_sndtimeo / HZ;  }  META_COLLECTOR(int_sk_sendmsg_off)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_frag.offset;  }  META_COLLECTOR(int_sk_write_pend)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_write_pending;  } @@ -793,8 +880,10 @@ static int em_meta_change(struct tcf_proto *tp, void *data, int len,  		goto errout;  	meta = kzalloc(sizeof(*meta), GFP_KERNEL); -	if (meta == NULL) +	if (meta == NULL) { +		err = -ENOMEM;  		goto errout; +	}  	memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left));  	memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right)); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 2adda7fa2d3..58bed7599db 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -135,7 +135,7 @@ static DEFINE_RWLOCK(qdisc_mod_lock);  static struct Qdisc_ops *qdisc_base; -/* Register/uregister queueing discipline */ +/* Register/unregister queueing discipline */  int register_qdisc(struct Qdisc_ops *qops)  { @@ -271,11 +271,16 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)  	return NULL;  } -static void qdisc_list_add(struct Qdisc *q) +void qdisc_list_add(struct Qdisc *q)  { -	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) -		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list); +	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { +		struct Qdisc *root = qdisc_dev(q)->qdisc; + +		WARN_ON_ONCE(root == &noop_qdisc); +		list_add_tail(&q->list, &root->list); +	}  } +EXPORT_SYMBOL(qdisc_list_add);  void qdisc_list_del(struct Qdisc *q)  { @@ -558,7 +563,7 @@ out:  }  EXPORT_SYMBOL(__qdisc_calculate_pkt_len); -void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc) +void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)  {  	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {  		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", @@ -737,9 +742,11 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)  	const struct Qdisc_class_ops *cops;  	unsigned long cl;  	u32 parentid; +	int drops;  	if (n == 0)  		return; +	drops = max_t(int, n, 0);  	while ((parentid = sch->parent)) {  		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))  			return; @@ -756,6 +763,7 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)  			cops->put(sch, cl);  		}  		sch->q.qlen -= n; +		sch->qstats.drops += drops;  	}  }  EXPORT_SYMBOL(qdisc_tree_decrease_qlen); @@ -1076,7 +1084,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)  	struct Qdisc *p = NULL;  	int err; -	
if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN)) +	if ((n->nlmsg_type != RTM_GETQDISC) && +	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); @@ -1143,7 +1152,7 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)  	struct Qdisc *q, *p;  	int err; -	if (!capable(CAP_NET_ADMIN)) +	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))  		return -EPERM;  replay: @@ -1296,6 +1305,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,  	struct gnet_dump d;  	struct qdisc_size_table *stab; +	cond_resched();  	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);  	if (!nlh)  		goto out_nlmsg_trim; @@ -1427,9 +1437,9 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)  	s_idx = cb->args[0];  	s_q_idx = q_idx = cb->args[1]; -	rcu_read_lock();  	idx = 0; -	for_each_netdev_rcu(net, dev) { +	ASSERT_RTNL(); +	for_each_netdev(net, dev) {  		struct netdev_queue *dev_queue;  		if (idx < s_idx) @@ -1452,8 +1462,6 @@ cont:  	}  done: -	rcu_read_unlock(); -  	cb->args[0] = idx;  	cb->args[1] = q_idx; @@ -1483,7 +1491,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)  	u32 qid;  	int err; -	if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN)) +	if ((n->nlmsg_type != RTM_GETTCLASS) && +	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); @@ -1610,6 +1619,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,  	struct gnet_dump d;  	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; +	cond_resched();  	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);  	if (!nlh)  		goto out_nlmsg_trim; diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 1f9c31411f1..8449b337f9e 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -623,8 +623,7 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,  		if (nla_put_u32(skb, TCA_ATM_EXCESS, 0))  			goto nla_put_failure;  	} -	nla_nest_end(skb, nest); -	return skb->len; +	return nla_nest_end(skb, nest);  nla_put_failure:  	nla_nest_cancel(skb, nest); diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 7a42c81a19e..ead526467cc 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1058,9 +1058,10 @@ static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)  				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/  					q->quanta[prio];  			} -			if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) { -				pr_warning("CBQ: class %08x has bad quantum==%ld, repaired.\n", -					   cl->common.classid, cl->quantum); +			if (cl->quantum <= 0 || +			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) { +				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n", +					cl->common.classid, cl->quantum);  				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;  			}  		} @@ -1562,8 +1563,7 @@ static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)  		goto nla_put_failure;  	if (cbq_dump_attr(skb, &q->link) < 0)  		goto nla_put_failure; -	nla_nest_end(skb, nest); -	return skb->len; +	return nla_nest_end(skb, nest);  nla_put_failure:  	nla_nest_cancel(skb, nest); @@ -1598,8 +1598,7 @@ cbq_dump_class(struct Qdisc *sch, unsigned long arg,  		goto nla_put_failure;  	if (cbq_dump_attr(skb, cl) < 0)  		goto nla_put_failure; -	nla_nest_end(skb, nest); -	return skb->len; +	return nla_nest_end(skb, nest);  
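
The "return nla_nest_end(skb, nest);" form above recurs throughout this series (sch_atm, sch_cbq, sch_hfsc, sch_fq, sch_fq_codel, sch_htb, sch_hhf). It is a straight simplification: nla_nest_end() finalizes the nested attribute's length and returns skb->len, which is exactly the value the old code returned on the following line. A minimal sketch of the equivalence:

	/* before */
	nla_nest_end(skb, nest);	/* sets nest->nla_len, returns skb->len */
	return skb->len;

	/* after: one statement, same return value */
	return nla_nest_end(skb, nest);
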
nla_put_failure:  	nla_nest_cancel(skb, nest); @@ -1782,8 +1781,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t  						    qdisc_root_sleeping_lock(sch),  						    tca[TCA_RATE]);  			if (err) { -				if (rtab) -					qdisc_put_rtab(rtab); +				qdisc_put_rtab(rtab);  				return err;  			}  		} diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index ddd73cb2d7b..ed30e436128 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -14,7 +14,6 @@  #include <linux/types.h>  #include <linux/kernel.h>  #include <linux/skbuff.h> -#include <linux/reciprocal_div.h>  #include <linux/vmalloc.h>  #include <net/pkt_sched.h>  #include <net/inet_ecn.h> @@ -77,12 +76,6 @@ struct choke_sched_data {  	struct sk_buff **tab;  }; -/* deliver a random number between 0 and N - 1 */ -static u32 random_N(unsigned int N) -{ -	return reciprocal_divide(prandom_u32(), N); -} -  /* number of elements in queue including holes */  static unsigned int choke_len(const struct choke_sched_data *q)  { @@ -233,7 +226,7 @@ static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,  	int retrys = 3;  	do { -		*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask; +		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;  		skb = q->tab[*pidx];  		if (skb)  			return skb; @@ -398,12 +391,7 @@ static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {  static void choke_free(void *addr)  { -	if (addr) { -		if (is_vmalloc_addr(addr)) -			vfree(addr); -		else -			kfree(addr); -	} +	kvfree(addr);  }  static int choke_change(struct Qdisc *sch, struct nlattr *opt) diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 8302717ea30..7bbbfe11219 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -391,8 +391,10 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)  	while (1) {  		cl = list_first_entry(&q->active, struct drr_class, alist);  		skb = cl->qdisc->ops->peek(cl->qdisc); -		if (skb == NULL) +		if (skb == NULL) { +			qdisc_warn_nonwc(__func__, cl->qdisc);  			goto out; +		}  		len = qdisc_pkt_len(skb);  		if (len <= cl->deficit) { diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 3886365cc20..49d6ef338b5 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -47,7 +47,7 @@ struct dsmark_qdisc_data {  static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)  { -	return (index <= p->indices && index > 0); +	return index <= p->indices && index > 0;  }  /* ------------------------- Class/flow operations ------------------------- */ @@ -57,8 +57,8 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg,  {  	struct dsmark_qdisc_data *p = qdisc_priv(sch); -	pr_debug("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n", -		sch, p, new, old); +	pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n", +		 __func__, sch, p, new, old);  	if (new == NULL) {  		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, @@ -85,8 +85,8 @@ static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)  static unsigned long dsmark_get(struct Qdisc *sch, u32 classid)  { -	pr_debug("dsmark_get(sch %p,[qdisc %p],classid %x)\n", -		sch, qdisc_priv(sch), classid); +	pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", +		 __func__, sch, qdisc_priv(sch), classid);  	return TC_H_MIN(classid) + 1;  } @@ -118,8 +118,8 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,  	int err = -EINVAL;  	u8 mask = 0; -	pr_debug("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x)," -		"arg 
0x%lx\n", sch, p, classid, parent, *arg); +	pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n", +		 __func__, sch, p, classid, parent, *arg);  	if (!dsmark_valid_index(p, *arg)) {  		err = -ENOENT; @@ -166,7 +166,8 @@ static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)  	struct dsmark_qdisc_data *p = qdisc_priv(sch);  	int i; -	pr_debug("dsmark_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); +	pr_debug("%s(sch %p,[qdisc %p],walker %p)\n", +		 __func__, sch, p, walker);  	if (walker->stop)  		return; @@ -199,7 +200,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)  	struct dsmark_qdisc_data *p = qdisc_priv(sch);  	int err; -	pr_debug("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); +	pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p);  	if (p->set_tc_index) {  		switch (skb->protocol) { @@ -275,7 +276,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)  	struct sk_buff *skb;  	u32 index; -	pr_debug("dsmark_dequeue(sch %p,[qdisc %p])\n", sch, p); +	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);  	skb = p->q->ops->dequeue(p->q);  	if (skb == NULL) @@ -303,8 +304,8 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)  		 * and don't need yet another qdisc as a bypass.  		 */  		if (p->mask[index] != 0xff || p->value[index]) -			pr_warning("dsmark_dequeue: unsupported protocol %d\n", -				   ntohs(skb->protocol)); +			pr_warn("%s: unsupported protocol %d\n", +				__func__, ntohs(skb->protocol));  		break;  	} @@ -315,7 +316,7 @@ static struct sk_buff *dsmark_peek(struct Qdisc *sch)  {  	struct dsmark_qdisc_data *p = qdisc_priv(sch); -	pr_debug("dsmark_peek(sch %p,[qdisc %p])\n", sch, p); +	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);  	return p->q->ops->peek(p->q);  } @@ -325,7 +326,7 @@ static unsigned int dsmark_drop(struct Qdisc *sch)  	struct dsmark_qdisc_data *p = qdisc_priv(sch);  	unsigned int len; -	pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); +	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);  	if (p->q->ops->drop == NULL)  		return 0; @@ -346,7 +347,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)  	u16 indices;  	u8 *mask; -	pr_debug("dsmark_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); +	pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt);  	if (!opt)  		goto errout; @@ -384,7 +385,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)  	if (p->q == NULL)  		p->q = &noop_qdisc; -	pr_debug("dsmark_init: qdisc %p\n", p->q); +	pr_debug("%s: qdisc %p\n", __func__, p->q);  	err = 0;  errout: @@ -395,7 +396,7 @@ static void dsmark_reset(struct Qdisc *sch)  {  	struct dsmark_qdisc_data *p = qdisc_priv(sch); -	pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); +	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);  	qdisc_reset(p->q);  	sch->q.qlen = 0;  } @@ -404,7 +405,7 @@ static void dsmark_destroy(struct Qdisc *sch)  {  	struct dsmark_qdisc_data *p = qdisc_priv(sch); -	pr_debug("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p); +	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);  	tcf_destroy_chain(&p->filter_list);  	qdisc_destroy(p->q); @@ -417,7 +418,7 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,  	struct dsmark_qdisc_data *p = qdisc_priv(sch);  	struct nlattr *opts = NULL; -	pr_debug("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n", sch, p, cl); +	pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl);  	if (!dsmark_valid_index(p, cl))  		return 
-EINVAL; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index a9dfdda9ed1..ba32c2b005d 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -47,6 +47,7 @@  #include <linux/rbtree.h>  #include <linux/hash.h>  #include <linux/prefetch.h> +#include <linux/vmalloc.h>  #include <net/netlink.h>  #include <net/pkt_sched.h>  #include <net/sock.h> @@ -88,7 +89,7 @@ struct fq_sched_data {  	struct fq_flow	internal;	/* for non classified or high prio packets */  	u32		quantum;  	u32		initial_quantum; -	u32		flow_default_rate;/* rate per flow : bytes per second */ +	u32		flow_refill_delay;  	u32		flow_max_rate;	/* optional max rate per flow */  	u32		flow_plimit;	/* max packets per flow */  	struct rb_root	*fq_root; @@ -115,6 +116,7 @@ static struct fq_flow detached, throttled;  static void fq_flow_set_detached(struct fq_flow *f)  {  	f->next = &detached; +	f->age = jiffies;  }  static bool fq_flow_is_detached(const struct fq_flow *f) @@ -209,28 +211,22 @@ static void fq_gc(struct fq_sched_data *q,  	}  } -static const u8 prio2band[TC_PRIO_MAX + 1] = { -	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 -}; -  static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)  {  	struct rb_node **p, *parent;  	struct sock *sk = skb->sk;  	struct rb_root *root;  	struct fq_flow *f; -	int band;  	/* warning: no starvation prevention... */ -	band = prio2band[skb->priority & TC_PRIO_MAX]; -	if (unlikely(band == 0)) +	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))  		return &q->internal;  	if (unlikely(!sk)) {  		/* By forcing low order bit to 1, we make sure to not  		 * collide with a local flow (socket pointers are word aligned)  		 */ -		sk = (struct sock *)(skb_get_rxhash(skb) | 1L); +		sk = (struct sock *)(skb_get_hash(skb) | 1L);  	}  	root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)]; @@ -255,6 +251,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)  				     f->socket_hash != sk->sk_hash)) {  				f->credit = q->initial_quantum;  				f->socket_hash = sk->sk_hash; +				f->time_next_packet = 0ULL;  			}  			return f;  		} @@ -372,17 +369,20 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)  	}  	f->qlen++; -	flow_queue_add(f, skb);  	if (skb_is_retransmit(skb))  		q->stat_tcp_retrans++;  	sch->qstats.backlog += qdisc_pkt_len(skb);  	if (fq_flow_is_detached(f)) {  		fq_flow_add_tail(&q->new_flows, f); -		if (q->quantum > f->credit) -			f->credit = q->quantum; +		if (time_after(jiffies, f->age + q->flow_refill_delay)) +			f->credit = max_t(u32, f->credit, q->quantum);  		q->inactive_flows--;  		qdisc_unthrottled(sch);  	} + +	/* Note: this overwrites f->age */ +	flow_queue_add(f, skb); +  	if (unlikely(f == &q->internal)) {  		q->stat_internal_packets++;  		qdisc_unthrottled(sch); @@ -460,7 +460,6 @@ begin:  			fq_flow_add_tail(&q->old_flows, f);  		} else {  			fq_flow_set_detached(f); -			f->age = jiffies;  			q->inactive_flows++;  		}  		goto begin; @@ -580,28 +579,53 @@ static void fq_rehash(struct fq_sched_data *q,  	q->stat_gc_flows += fcnt;  } -static int fq_resize(struct fq_sched_data *q, u32 log) +static void *fq_alloc_node(size_t sz, int node) +{ +	void *ptr; + +	ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node); +	if (!ptr) +		ptr = vmalloc_node(sz, node); +	return ptr; +} + +static void fq_free(void *addr) +{ +	kvfree(addr); +} + +static int fq_resize(struct Qdisc *sch, u32 log)  { +	struct fq_sched_data *q = qdisc_priv(sch);  	struct rb_root *array; +	void 
*old_fq_root;  	u32 idx;  	if (q->fq_root && log == q->fq_trees_log)  		return 0; -	array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL); +	/* If XPS was setup, we can allocate memory on right NUMA node */ +	array = fq_alloc_node(sizeof(struct rb_root) << log, +			      netdev_queue_numa_node_read(sch->dev_queue));  	if (!array)  		return -ENOMEM;  	for (idx = 0; idx < (1U << log); idx++)  		array[idx] = RB_ROOT; -	if (q->fq_root) { -		fq_rehash(q, q->fq_root, q->fq_trees_log, array, log); -		kfree(q->fq_root); -	} +	sch_tree_lock(sch); + +	old_fq_root = q->fq_root; +	if (old_fq_root) +		fq_rehash(q, old_fq_root, q->fq_trees_log, array, log); +  	q->fq_root = array;  	q->fq_trees_log = log; +	sch_tree_unlock(sch); + +	fq_free(old_fq_root); +  	return 0;  } @@ -614,6 +638,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {  	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },  	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },  	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 }, +	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },  };  static int fq_change(struct Qdisc *sch, struct nlattr *opt) @@ -655,7 +680,8 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)  		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);  	if (tb[TCA_FQ_FLOW_DEFAULT_RATE]) -		q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]); +		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", +				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));  	if (tb[TCA_FQ_FLOW_MAX_RATE])  		q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); @@ -669,9 +695,17 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)  			err = -EINVAL;  	} -	if (!err) -		err = fq_resize(q, fq_log); +	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) { +		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ; + +		q->flow_refill_delay = usecs_to_jiffies(usecs_delay); +	} +	if (!err) { +		sch_tree_unlock(sch); +		err = fq_resize(sch, fq_log); +		sch_tree_lock(sch); +	}  	while (sch->q.qlen > sch->limit) {  		struct sk_buff *skb = fq_dequeue(sch); @@ -691,7 +725,7 @@ static void fq_destroy(struct Qdisc *sch)  	struct fq_sched_data *q = qdisc_priv(sch);  	fq_reset(sch); -	kfree(q->fq_root); +	fq_free(q->fq_root);  	qdisc_watchdog_cancel(&q->watchdog);  } @@ -704,7 +738,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)  	q->flow_plimit		= 100;  	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));  	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch)); -	q->flow_default_rate	= 0; +	q->flow_refill_delay	= msecs_to_jiffies(40);  	q->flow_max_rate	= ~0U;  	q->rate_enable		= 1;  	q->new_flows.first	= NULL; @@ -717,7 +751,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)  	if (opt)  		err = fq_change(sch, opt);  	else -		err = fq_resize(q, q->fq_trees_log); +		err = fq_resize(sch, q->fq_trees_log);  	return err;  } @@ -731,20 +765,20 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)  	if (opts == NULL)  		goto nla_put_failure; -	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore, -	 * do not bother giving its value -	 */ +	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ +  	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||  	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||  	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||  	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||  	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||  	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) || +	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, +			
jiffies_to_usecs(q->flow_refill_delay)) ||  	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))  		goto nla_put_failure; -	nla_nest_end(skb, opts); -	return skb->len; +	return nla_nest_end(skb, opts);  nla_put_failure:  	return -1; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 55786283a3d..063b726bf1f 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -365,12 +365,7 @@ static void *fq_codel_zalloc(size_t sz)  static void fq_codel_free(void *addr)  { -	if (addr) { -		if (is_vmalloc_addr(addr)) -			vfree(addr); -		else -			kfree(addr); -	} +	kvfree(addr);  }  static void fq_codel_destroy(struct Qdisc *sch) @@ -390,7 +385,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)  	sch->limit = 10*1024;  	q->flows_cnt = 1024;  	q->quantum = psched_mtu(qdisc_dev(sch)); -	q->perturbation = net_random(); +	q->perturbation = prandom_u32();  	INIT_LIST_HEAD(&q->new_flows);  	INIT_LIST_HEAD(&q->old_flows);  	codel_params_init(&q->cparams); @@ -450,8 +445,7 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)  			q->flows_cnt))  		goto nla_put_failure; -	nla_nest_end(skb, opts); -	return skb->len; +	return nla_nest_end(skb, opts);  nla_put_failure:  	return -1; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index a74e278654a..e1543b03e39 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -310,6 +310,7 @@ void netif_carrier_on(struct net_device *dev)  	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {  		if (dev->reg_state == NETREG_UNINITIALIZED)  			return; +		atomic_inc(&dev->carrier_changes);  		linkwatch_fire_event(dev);  		if (netif_running(dev))  			__netdev_watchdog_up(dev); @@ -328,6 +329,7 @@ void netif_carrier_off(struct net_device *dev)  	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {  		if (dev->reg_state == NETREG_UNINITIALIZED)  			return; +		atomic_inc(&dev->carrier_changes);  		linkwatch_fire_event(dev);  	}  } @@ -338,13 +340,13 @@ EXPORT_SYMBOL(netif_carrier_off);     cheaper.   */ -static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc) +static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)  {  	kfree_skb(skb);  	return NET_XMIT_CN;  } -static struct sk_buff *noop_dequeue(struct Qdisc * qdisc) +static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)  {  	return NULL;  } @@ -718,8 +720,8 @@ static void attach_default_qdiscs(struct net_device *dev)  	} else {  		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);  		if (qdisc) { -			qdisc->ops->attach(qdisc);  			dev->qdisc = qdisc; +			qdisc->ops->attach(qdisc);  		}  	}  } @@ -829,7 +831,7 @@ void dev_deactivate_many(struct list_head *head)  	struct net_device *dev;  	bool sync_needed = false; -	list_for_each_entry(dev, head, unreg_list) { +	list_for_each_entry(dev, head, close_list) {  		netdev_for_each_tx_queue(dev, dev_deactivate_queue,  					 &noop_qdisc);  		if (dev_ingress_queue(dev)) @@ -848,7 +850,7 @@ void dev_deactivate_many(struct list_head *head)  		synchronize_net();  	/* Wait for outstanding qdisc_run calls. 
*/ -	list_for_each_entry(dev, head, unreg_list) +	list_for_each_entry(dev, head, close_list)  		while (some_qdisc_is_busy(dev))  			yield();  } @@ -857,7 +859,7 @@ void dev_deactivate(struct net_device *dev)  {  	LIST_HEAD(single); -	list_add(&dev->unreg_list, &single); +	list_add(&dev->close_list, &single);  	dev_deactivate_many(&single);  	list_del(&single);  } @@ -910,11 +912,12 @@ void dev_shutdown(struct net_device *dev)  }  void psched_ratecfg_precompute(struct psched_ratecfg *r, -			       const struct tc_ratespec *conf) +			       const struct tc_ratespec *conf, +			       u64 rate64)  {  	memset(r, 0, sizeof(*r));  	r->overhead = conf->overhead; -	r->rate_bytes_ps = conf->rate; +	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);  	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);  	r->mult = 1;  	/* diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index d42234c0f13..12cbc09157f 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -370,8 +370,8 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)  	for (i = table->DPs; i < MAX_DPs; i++) {  		if (table->tab[i]) { -			pr_warning("GRED: Warning: Destroying " -				   "shadowed VQ 0x%x\n", i); +			pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n", +				i);  			gred_destroy_vq(table->tab[i]);  			table->tab[i] = NULL;  		} diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index c4075610502..ec8aeaac1dd 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1353,8 +1353,7 @@ hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb,  		goto nla_put_failure;  	if (hfsc_dump_curves(skb, cl) < 0)  		goto nla_put_failure; -	nla_nest_end(skb, nest); -	return skb->len; +	return nla_nest_end(skb, nest);   nla_put_failure:  	nla_nest_cancel(skb, nest); diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c new file mode 100644 index 00000000000..d85b6812a7d --- /dev/null +++ b/net/sched/sch_hhf.c @@ -0,0 +1,740 @@ +/* net/sched/sch_hhf.c		Heavy-Hitter Filter (HHF) + * + * Copyright (C) 2013 Terry Lam <vtlam@google.com> + * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com> + */ + +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <net/flow_keys.h> +#include <net/pkt_sched.h> +#include <net/sock.h> + +/*	Heavy-Hitter Filter (HHF) + * + * Principles : + * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter + * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified + * as heavy-hitter, it is immediately switched to the heavy-hitter bucket. + * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler, + * in which the heavy-hitter bucket is served with less weight. + * In other words, non-heavy-hitters (e.g., short bursts of critical traffic) + * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have + * higher share of bandwidth. + * + * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the + * following paper: + * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and + * Accounting", in ACM SIGCOMM, 2002. + * + * Conceptually, a multi-stage filter comprises k independent hash functions + * and k counter arrays. Packets are indexed into k counter arrays by k hash + * functions, respectively. The counters are then increased by the packet sizes. + * Therefore, + *    - For a heavy-hitter flow: *all* of its k array counters must be large. 
+ *    - For a non-heavy-hitter flow: some of its k array counters can be large + *      due to hash collision with other small flows; however, with high + *      probability, not *all* k counters are large. + * + * By the design of the multi-stage filter algorithm, the false negative rate + * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is + * susceptible to false positives (non-heavy-hitters mistakenly classified as + * heavy-hitters). + * Therefore, we also implement the following optimizations to reduce false + * positives by avoiding unnecessary increment of the counter values: + *    - Optimization O1: once a heavy-hitter is identified, its bytes are not + *        accounted in the array counters. This technique is called "shielding" + *        in Section 3.3.1 of [EV02]. + *    - Optimization O2: conservative update of counters + *                       (Section 3.3.2 of [EV02]), + *        New counter value = max {old counter value, + *                                 smallest counter value + packet bytes} + * + * Finally, we refresh the counters periodically since otherwise the counter + * values will keep accumulating. + * + * Once a flow is classified as heavy-hitter, we also save its per-flow state + * in an exact-matching flow table so that its subsequent packets can be + * dispatched to the heavy-hitter bucket accordingly. + * + * + * At a high level, this qdisc works as follows: + * Given a packet p: + *   - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching + *     heavy-hitter flow table, denoted table T, then send p to the heavy-hitter + *     bucket. + *   - Otherwise, forward p to the multi-stage filter, denoted filter F + *        + If F decides that p belongs to a non-heavy-hitter flow, then send p + *          to the non-heavy-hitter bucket. + *        + Otherwise, if F decides that p belongs to a new heavy-hitter flow, + *          then set up a new flow entry for the flow-id of p in the table T and + *          send p to the heavy-hitter bucket. + * + * In this implementation: + *   - T is a fixed-size hash-table with 1024 entries. Hash collision is + *     resolved by linked-list chaining. + *   - F has four counter arrays, each array containing 1024 32-bit counters. + *     That means 4 * 1024 * 32 bits = 16KB of memory. + *   - Since each array in F contains 1024 counters, 10 bits are sufficient to + *     index into each array. + *     Hence, instead of having four hash functions, we chop the 32-bit + *     skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is + *     computed as XOR sum of those three chunks. + *   - We need to clear the counter arrays periodically; however, directly + *     memsetting 16KB of memory can lead to cache eviction and unwanted delay. + *     So by representing each counter by a valid bit, we only need to reset + *     4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory. + *   - The Deficit Round Robin engine is taken from fq_codel implementation + *     (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to + *     fq_codel_flow in fq_codel implementation. 
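+ *   - A worked example of the indexing: a 32-bit skb-hash h indexes the four
+ *     arrays at positions (h & 0x3FF), ((h >> 10) & 0x3FF), ((h >> 20) & 0x3FF)
+ *     and, for the last array, the XOR of those three chunks folded with the
+ *     remaining top bits (h >> 30).
+ *   - A worked example of the conservative update (Optimization O2): with
+ *     counters {5000, 7000, 6000, 9000} and a 1500-byte packet, the smallest
+ *     candidate value is 5000 + 1500 = 6500, so only counters below 6500 are
+ *     raised and the arrays become {6500, 7000, 6500, 9000}.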
+ * + */ + +/* Non-configurable parameters */ +#define HH_FLOWS_CNT	 1024  /* number of entries in exact-matching table T */ +#define HHF_ARRAYS_CNT	 4     /* number of arrays in multi-stage filter F */ +#define HHF_ARRAYS_LEN	 1024  /* number of counters in each array of F */ +#define HHF_BIT_MASK_LEN 10    /* masking 10 bits */ +#define HHF_BIT_MASK	 0x3FF /* bitmask of 10 bits */ + +#define WDRR_BUCKET_CNT  2     /* two buckets for Weighted DRR */ +enum wdrr_bucket_idx { +	WDRR_BUCKET_FOR_HH	= 0, /* bucket id for heavy-hitters */ +	WDRR_BUCKET_FOR_NON_HH	= 1  /* bucket id for non-heavy-hitters */ +}; + +#define hhf_time_before(a, b)	\ +	(typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0)) + +/* Heavy-hitter per-flow state */ +struct hh_flow_state { +	u32		 hash_id;	/* hash of flow-id (e.g. TCP 5-tuple) */ +	u32		 hit_timestamp;	/* last time heavy-hitter was seen */ +	struct list_head flowchain;	/* chaining under hash collision */ +}; + +/* Weighted Deficit Round Robin (WDRR) scheduler */ +struct wdrr_bucket { +	struct sk_buff	  *head; +	struct sk_buff	  *tail; +	struct list_head  bucketchain; +	int		  deficit; +}; + +struct hhf_sched_data { +	struct wdrr_bucket buckets[WDRR_BUCKET_CNT]; +	u32		   perturbation;   /* hash perturbation */ +	u32		   quantum;        /* psched_mtu(qdisc_dev(sch)); */ +	u32		   drop_overlimit; /* number of times max qdisc packet +					    * limit was hit +					    */ +	struct list_head   *hh_flows;       /* table T (currently active HHs) */ +	u32		   hh_flows_limit;            /* max active HH allocs */ +	u32		   hh_flows_overlimit; /* num of disallowed HH allocs */ +	u32		   hh_flows_total_cnt;          /* total admitted HHs */ +	u32		   hh_flows_current_cnt;        /* total current HHs  */ +	u32		   *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */ +	u32		   hhf_arrays_reset_timestamp;  /* last time hhf_arrays +							 * was reset +							 */ +	unsigned long	   *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits +							     * of hhf_arrays +							     */ +	/* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */ +	struct list_head   new_buckets; /* list of new buckets */ +	struct list_head   old_buckets; /* list of old buckets */ + +	/* Configurable HHF parameters */ +	u32		   hhf_reset_timeout; /* interval to reset counter +					       * arrays in filter F +					       * (default 40ms) +					       */ +	u32		   hhf_admit_bytes;   /* counter thresh to classify as +					       * HH (default 128KB). +					       * With these default values, +					       * 128KB / 40ms = 25 Mbps +					       * i.e., we expect to capture HHs +					       * sending > 25 Mbps. +					       */ +	u32		   hhf_evict_timeout; /* aging threshold to evict idle +					       * HHs out of table T. This should +					       * be large enough to avoid +					       * reordering during HH eviction. 
+					       * (default 1s) +					       */ +	u32		   hhf_non_hh_weight; /* WDRR weight for non-HHs +					       * (default 2, +					       *  i.e., non-HH : HH = 2 : 1) +					       */ +}; + +static u32 hhf_time_stamp(void) +{ +	return jiffies; +} + +static unsigned int skb_hash(const struct hhf_sched_data *q, +			     const struct sk_buff *skb) +{ +	struct flow_keys keys; +	unsigned int hash; + +	if (skb->sk && skb->sk->sk_hash) +		return skb->sk->sk_hash; + +	skb_flow_dissect(skb, &keys); +	hash = jhash_3words((__force u32)keys.dst, +			    (__force u32)keys.src ^ keys.ip_proto, +			    (__force u32)keys.ports, q->perturbation); +	return hash; +} + +/* Looks up a heavy-hitter flow in a chaining list of table T. */ +static struct hh_flow_state *seek_list(const u32 hash, +				       struct list_head *head, +				       struct hhf_sched_data *q) +{ +	struct hh_flow_state *flow, *next; +	u32 now = hhf_time_stamp(); + +	if (list_empty(head)) +		return NULL; + +	list_for_each_entry_safe(flow, next, head, flowchain) { +		u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; + +		if (hhf_time_before(prev, now)) { +			/* Delete expired heavy-hitters, but preserve one entry +			 * to avoid kzalloc() when next time this slot is hit. +			 */ +			if (list_is_last(&flow->flowchain, head)) +				return NULL; +			list_del(&flow->flowchain); +			kfree(flow); +			q->hh_flows_current_cnt--; +		} else if (flow->hash_id == hash) { +			return flow; +		} +	} +	return NULL; +} + +/* Returns a flow state entry for a new heavy-hitter.  Either reuses an expired + * entry or dynamically alloc a new entry. + */ +static struct hh_flow_state *alloc_new_hh(struct list_head *head, +					  struct hhf_sched_data *q) +{ +	struct hh_flow_state *flow; +	u32 now = hhf_time_stamp(); + +	if (!list_empty(head)) { +		/* Find an expired heavy-hitter flow entry. */ +		list_for_each_entry(flow, head, flowchain) { +			u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; + +			if (hhf_time_before(prev, now)) +				return flow; +		} +	} + +	if (q->hh_flows_current_cnt >= q->hh_flows_limit) { +		q->hh_flows_overlimit++; +		return NULL; +	} +	/* Create new entry. */ +	flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC); +	if (!flow) +		return NULL; + +	q->hh_flows_current_cnt++; +	INIT_LIST_HEAD(&flow->flowchain); +	list_add_tail(&flow->flowchain, head); + +	return flow; +} + +/* Assigns packets to WDRR buckets.  Implements a multi-stage filter to + * classify heavy-hitters. + */ +static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) +{ +	struct hhf_sched_data *q = qdisc_priv(sch); +	u32 tmp_hash, hash; +	u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos; +	struct hh_flow_state *flow; +	u32 pkt_len, min_hhf_val; +	int i; +	u32 prev; +	u32 now = hhf_time_stamp(); + +	/* Reset the HHF counter arrays if this is the right time. */ +	prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout; +	if (hhf_time_before(prev, now)) { +		for (i = 0; i < HHF_ARRAYS_CNT; i++) +			bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN); +		q->hhf_arrays_reset_timestamp = now; +	} + +	/* Get hashed flow-id of the skb. */ +	hash = skb_hash(q, skb); + +	/* Check if this packet belongs to an already established HH flow. */ +	flow_pos = hash & HHF_BIT_MASK; +	flow = seek_list(hash, &q->hh_flows[flow_pos], q); +	if (flow) { /* found its HH flow */ +		flow->hit_timestamp = now; +		return WDRR_BUCKET_FOR_HH; +	} + +	/* Now pass the packet through the multi-stage filter. 
*/ +	tmp_hash = hash; +	xorsum = 0; +	for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) { +		/* Split the skb_hash into three 10-bit chunks. */ +		filter_pos[i] = tmp_hash & HHF_BIT_MASK; +		xorsum ^= filter_pos[i]; +		tmp_hash >>= HHF_BIT_MASK_LEN; +	} +	/* The last chunk is computed as XOR sum of other chunks. */ +	filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash; + +	pkt_len = qdisc_pkt_len(skb); +	min_hhf_val = ~0U; +	for (i = 0; i < HHF_ARRAYS_CNT; i++) { +		u32 val; + +		if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) { +			q->hhf_arrays[i][filter_pos[i]] = 0; +			__set_bit(filter_pos[i], q->hhf_valid_bits[i]); +		} + +		val = q->hhf_arrays[i][filter_pos[i]] + pkt_len; +		if (min_hhf_val > val) +			min_hhf_val = val; +	} + +	/* Found a new HH iff all counter values > HH admit threshold. */ +	if (min_hhf_val > q->hhf_admit_bytes) { +		/* Just captured a new heavy-hitter. */ +		flow = alloc_new_hh(&q->hh_flows[flow_pos], q); +		if (!flow) /* memory alloc problem */ +			return WDRR_BUCKET_FOR_NON_HH; +		flow->hash_id = hash; +		flow->hit_timestamp = now; +		q->hh_flows_total_cnt++; + +		/* By returning without updating counters in q->hhf_arrays, +		 * we implicitly implement "shielding" (see Optimization O1). +		 */ +		return WDRR_BUCKET_FOR_HH; +	} + +	/* Conservative update of HHF arrays (see Optimization O2). */ +	for (i = 0; i < HHF_ARRAYS_CNT; i++) { +		if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val) +			q->hhf_arrays[i][filter_pos[i]] = min_hhf_val; +	} +	return WDRR_BUCKET_FOR_NON_HH; +} + +/* Removes one skb from head of bucket. */ +static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket) +{ +	struct sk_buff *skb = bucket->head; + +	bucket->head = skb->next; +	skb->next = NULL; +	return skb; +} + +/* Tail-adds skb to bucket. */ +static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) +{ +	if (bucket->head == NULL) +		bucket->head = skb; +	else +		bucket->tail->next = skb; +	bucket->tail = skb; +	skb->next = NULL; +} + +static unsigned int hhf_drop(struct Qdisc *sch) +{ +	struct hhf_sched_data *q = qdisc_priv(sch); +	struct wdrr_bucket *bucket; + +	/* Always try to drop from heavy-hitters first. */ +	bucket = &q->buckets[WDRR_BUCKET_FOR_HH]; +	if (!bucket->head) +		bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH]; + +	if (bucket->head) { +		struct sk_buff *skb = dequeue_head(bucket); + +		sch->q.qlen--; +		sch->qstats.drops++; +		sch->qstats.backlog -= qdisc_pkt_len(skb); +		kfree_skb(skb); +	} + +	/* Return id of the bucket from which the packet was dropped. */ +	return bucket - q->buckets; +} + +static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ +	struct hhf_sched_data *q = qdisc_priv(sch); +	enum wdrr_bucket_idx idx; +	struct wdrr_bucket *bucket; + +	idx = hhf_classify(skb, sch); + +	bucket = &q->buckets[idx]; +	bucket_add(bucket, skb); +	sch->qstats.backlog += qdisc_pkt_len(skb); + +	if (list_empty(&bucket->bucketchain)) { +		unsigned int weight; + +		/* The logic of new_buckets vs. old_buckets is the same as +		 * new_flows vs. old_flows in the implementation of fq_codel, +		 * i.e., short bursts of non-HHs should have strict priority. +		 */ +		if (idx == WDRR_BUCKET_FOR_HH) { +			/* Always move heavy-hitters to old bucket. 
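+			 * A bucket on old_buckets is only served once
+			 * new_buckets has drained, so a freshly activated
+			 * non-heavy-hitter bucket is always dequeued first.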
*/ +			weight = 1; +			list_add_tail(&bucket->bucketchain, &q->old_buckets); +		} else { +			weight = q->hhf_non_hh_weight; +			list_add_tail(&bucket->bucketchain, &q->new_buckets); +		} +		bucket->deficit = weight * q->quantum; +	} +	if (++sch->q.qlen <= sch->limit) +		return NET_XMIT_SUCCESS; + +	q->drop_overlimit++; +	/* Return Congestion Notification only if we dropped a packet from this +	 * bucket. +	 */ +	if (hhf_drop(sch) == idx) +		return NET_XMIT_CN; + +	/* As we dropped a packet, better let upper stack know this. */ +	qdisc_tree_decrease_qlen(sch, 1); +	return NET_XMIT_SUCCESS; +} + +static struct sk_buff *hhf_dequeue(struct Qdisc *sch) +{ +	struct hhf_sched_data *q = qdisc_priv(sch); +	struct sk_buff *skb = NULL; +	struct wdrr_bucket *bucket; +	struct list_head *head; + +begin: +	head = &q->new_buckets; +	if (list_empty(head)) { +		head = &q->old_buckets; +		if (list_empty(head)) +			return NULL; +	} +	bucket = list_first_entry(head, struct wdrr_bucket, bucketchain); + +	if (bucket->deficit <= 0) { +		int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ? +			      1 : q->hhf_non_hh_weight; + +		bucket->deficit += weight * q->quantum; +		list_move_tail(&bucket->bucketchain, &q->old_buckets); +		goto begin; +	} + +	if (bucket->head) { +		skb = dequeue_head(bucket); +		sch->q.qlen--; +		sch->qstats.backlog -= qdisc_pkt_len(skb); +	} + +	if (!skb) { +		/* Force a pass through old_buckets to prevent starvation. */ +		if ((head == &q->new_buckets) && !list_empty(&q->old_buckets)) +			list_move_tail(&bucket->bucketchain, &q->old_buckets); +		else +			list_del_init(&bucket->bucketchain); +		goto begin; +	} +	qdisc_bstats_update(sch, skb); +	bucket->deficit -= qdisc_pkt_len(skb); + +	return skb; +} + +static void hhf_reset(struct Qdisc *sch) +{ +	struct sk_buff *skb; + +	while ((skb = hhf_dequeue(sch)) != NULL) +		kfree_skb(skb); +} + +static void *hhf_zalloc(size_t sz) +{ +	void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); + +	if (!ptr) +		ptr = vzalloc(sz); + +	return ptr; +} + +static void hhf_free(void *addr) +{ +	kvfree(addr); +} + +static void hhf_destroy(struct Qdisc *sch) +{ +	int i; +	struct hhf_sched_data *q = qdisc_priv(sch); + +	for (i = 0; i < HHF_ARRAYS_CNT; i++) { +		hhf_free(q->hhf_arrays[i]); +		hhf_free(q->hhf_valid_bits[i]); +	} + +	for (i = 0; i < HH_FLOWS_CNT; i++) { +		struct hh_flow_state *flow, *next; +		struct list_head *head = &q->hh_flows[i]; + +		if (list_empty(head)) +			continue; +		list_for_each_entry_safe(flow, next, head, flowchain) { +			list_del(&flow->flowchain); +			kfree(flow); +		} +	} +	hhf_free(q->hh_flows); +} + +static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { +	[TCA_HHF_BACKLOG_LIMIT]	 = { .type = NLA_U32 }, +	[TCA_HHF_QUANTUM]	 = { .type = NLA_U32 }, +	[TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 }, +	[TCA_HHF_RESET_TIMEOUT]	 = { .type = NLA_U32 }, +	[TCA_HHF_ADMIT_BYTES]	 = { .type = NLA_U32 }, +	[TCA_HHF_EVICT_TIMEOUT]	 = { .type = NLA_U32 }, +	[TCA_HHF_NON_HH_WEIGHT]	 = { .type = NLA_U32 }, +}; + +static int hhf_change(struct Qdisc *sch, struct nlattr *opt) +{ +	struct hhf_sched_data *q = qdisc_priv(sch); +	struct nlattr *tb[TCA_HHF_MAX + 1]; +	unsigned int qlen; +	int err; +	u64 non_hh_quantum; +	u32 new_quantum = q->quantum; +	u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight; + +	if (!opt) +		return -EINVAL; + +	err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy); +	if (err < 0) +		return err; + +	if (tb[TCA_HHF_QUANTUM]) +		new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]); + +	if (tb[TCA_HHF_NON_HH_WEIGHT]) +		
new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]); + +	non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight; +	if (non_hh_quantum > INT_MAX) +		return -EINVAL; + +	sch_tree_lock(sch); + +	if (tb[TCA_HHF_BACKLOG_LIMIT]) +		sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]); + +	q->quantum = new_quantum; +	q->hhf_non_hh_weight = new_hhf_non_hh_weight; + +	if (tb[TCA_HHF_HH_FLOWS_LIMIT]) +		q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]); + +	if (tb[TCA_HHF_RESET_TIMEOUT]) { +		u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]); + +		q->hhf_reset_timeout = usecs_to_jiffies(us); +	} + +	if (tb[TCA_HHF_ADMIT_BYTES]) +		q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]); + +	if (tb[TCA_HHF_EVICT_TIMEOUT]) { +		u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]); + +		q->hhf_evict_timeout = usecs_to_jiffies(us); +	} + +	qlen = sch->q.qlen; +	while (sch->q.qlen > sch->limit) { +		struct sk_buff *skb = hhf_dequeue(sch); + +		kfree_skb(skb); +	} +	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + +	sch_tree_unlock(sch); +	return 0; +} + +static int hhf_init(struct Qdisc *sch, struct nlattr *opt) +{ +	struct hhf_sched_data *q = qdisc_priv(sch); +	int i; + +	sch->limit = 1000; +	q->quantum = psched_mtu(qdisc_dev(sch)); +	q->perturbation = prandom_u32(); +	INIT_LIST_HEAD(&q->new_buckets); +	INIT_LIST_HEAD(&q->old_buckets); + +	/* Configurable HHF parameters */ +	q->hhf_reset_timeout = HZ / 25; /* 40  ms */ +	q->hhf_admit_bytes = 131072;    /* 128 KB */ +	q->hhf_evict_timeout = HZ;      /* 1  sec */ +	q->hhf_non_hh_weight = 2; + +	if (opt) { +		int err = hhf_change(sch, opt); + +		if (err) +			return err; +	} + +	if (!q->hh_flows) { +		/* Initialize heavy-hitter flow table. */ +		q->hh_flows = hhf_zalloc(HH_FLOWS_CNT * +					 sizeof(struct list_head)); +		if (!q->hh_flows) +			return -ENOMEM; +		for (i = 0; i < HH_FLOWS_CNT; i++) +			INIT_LIST_HEAD(&q->hh_flows[i]); + +		/* Cap max active HHs at twice len of hh_flows table. */ +		q->hh_flows_limit = 2 * HH_FLOWS_CNT; +		q->hh_flows_overlimit = 0; +		q->hh_flows_total_cnt = 0; +		q->hh_flows_current_cnt = 0; + +		/* Initialize heavy-hitter filter arrays. */ +		for (i = 0; i < HHF_ARRAYS_CNT; i++) { +			q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN * +						      sizeof(u32)); +			if (!q->hhf_arrays[i]) { +				hhf_destroy(sch); +				return -ENOMEM; +			} +		} +		q->hhf_arrays_reset_timestamp = hhf_time_stamp(); + +		/* Initialize valid bits of heavy-hitter filter arrays. */ +		for (i = 0; i < HHF_ARRAYS_CNT; i++) { +			q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN / +							  BITS_PER_BYTE); +			if (!q->hhf_valid_bits[i]) { +				hhf_destroy(sch); +				return -ENOMEM; +			} +		} + +		/* Initialize Weighted DRR buckets. 
*/ +		for (i = 0; i < WDRR_BUCKET_CNT; i++) { +			struct wdrr_bucket *bucket = q->buckets + i; + +			INIT_LIST_HEAD(&bucket->bucketchain); +		} +	} + +	return 0; +} + +static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) +{ +	struct hhf_sched_data *q = qdisc_priv(sch); +	struct nlattr *opts; + +	opts = nla_nest_start(skb, TCA_OPTIONS); +	if (opts == NULL) +		goto nla_put_failure; + +	if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) || +	    nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) || +	    nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) || +	    nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT, +			jiffies_to_usecs(q->hhf_reset_timeout)) || +	    nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) || +	    nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT, +			jiffies_to_usecs(q->hhf_evict_timeout)) || +	    nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight)) +		goto nla_put_failure; + +	return nla_nest_end(skb, opts); + +nla_put_failure: +	return -1; +} + +static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ +	struct hhf_sched_data *q = qdisc_priv(sch); +	struct tc_hhf_xstats st = { +		.drop_overlimit = q->drop_overlimit, +		.hh_overlimit	= q->hh_flows_overlimit, +		.hh_tot_count	= q->hh_flows_total_cnt, +		.hh_cur_count	= q->hh_flows_current_cnt, +	}; + +	return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { +	.id		=	"hhf", +	.priv_size	=	sizeof(struct hhf_sched_data), + +	.enqueue	=	hhf_enqueue, +	.dequeue	=	hhf_dequeue, +	.peek		=	qdisc_peek_dequeued, +	.drop		=	hhf_drop, +	.init		=	hhf_init, +	.reset		=	hhf_reset, +	.destroy	=	hhf_destroy, +	.change		=	hhf_change, +	.dump		=	hhf_dump, +	.dump_stats	=	hhf_dump_stats, +	.owner		=	THIS_MODULE, +}; + +static int __init hhf_module_init(void) +{ +	return register_qdisc(&hhf_qdisc_ops); +} + +static void __exit hhf_module_exit(void) +{ +	unregister_qdisc(&hhf_qdisc_ops); +} + +module_init(hhf_module_init) +module_exit(hhf_module_exit) +MODULE_AUTHOR("Terry Lam"); +MODULE_AUTHOR("Nandita Dukkipati"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 863846cc551..9f949abcace 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -219,11 +219,16 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,  	if (skb->priority == sch->handle)  		return HTB_DIRECT;	/* X:0 (direct flow) selected */  	cl = htb_find(skb->priority, sch); -	if (cl && cl->level == 0) -		return cl; +	if (cl) { +		if (cl->level == 0) +			return cl; +		/* Start with inner filter chain if a non-leaf class is selected */ +		tcf = cl->filter_list; +	} else { +		tcf = q->filter_list; +	}  	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; -	tcf = q->filter_list;  	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {  #ifdef CONFIG_NET_CLS_ACT  		switch (result) { @@ -712,7 +717,7 @@ static s64 htb_do_events(struct htb_sched *q, const int level,  	/* too much load - let's continue after a break for scheduling */  	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) { -		pr_warning("htb: too many events!\n"); +		pr_warn("htb: too many events!\n");  		q->warned |= HTB_WARN_TOOMANYEVENTS;  	} @@ -997,6 +1002,8 @@ static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {  	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },  	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },  	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 }, +	[TCA_HTB_RATE64] = { .type = NLA_U64 }, +	[TCA_HTB_CEIL64] = { .type = NLA_U64 
},  };  static void htb_work_func(struct work_struct *work) @@ -1055,12 +1062,13 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)  static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)  { -	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);  	struct htb_sched *q = qdisc_priv(sch);  	struct nlattr *nest;  	struct tc_htb_glob gopt; -	spin_lock_bh(root_lock); +	/* Its safe to not acquire qdisc lock. As we hold RTNL, +	 * no change can happen on the qdisc parameters. +	 */  	gopt.direct_pkts = q->direct_pkts;  	gopt.version = HTB_VER; @@ -1074,13 +1082,10 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)  	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||  	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))  		goto nla_put_failure; -	nla_nest_end(skb, nest); -	spin_unlock_bh(root_lock); -	return skb->len; +	return nla_nest_end(skb, nest);  nla_put_failure: -	spin_unlock_bh(root_lock);  	nla_nest_cancel(skb, nest);  	return -1;  } @@ -1089,11 +1094,12 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,  			  struct sk_buff *skb, struct tcmsg *tcm)  {  	struct htb_class *cl = (struct htb_class *)arg; -	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);  	struct nlattr *nest;  	struct tc_htb_opt opt; -	spin_lock_bh(root_lock); +	/* Its safe to not acquire qdisc lock. As we hold RTNL, +	 * no change can happen on the class parameters. +	 */  	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;  	tcm->tcm_handle = cl->common.classid;  	if (!cl->level && cl->un.leaf.q) @@ -1114,13 +1120,16 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,  	opt.level = cl->level;  	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))  		goto nla_put_failure; +	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) && +	    nla_put_u64(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps)) +		goto nla_put_failure; +	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) && +	    nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps)) +		goto nla_put_failure; -	nla_nest_end(skb, nest); -	spin_unlock_bh(root_lock); -	return skb->len; +	return nla_nest_end(skb, nest);  nla_put_failure: -	spin_unlock_bh(root_lock);  	nla_nest_cancel(skb, nest);  	return -1;  } @@ -1268,9 +1277,10 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)  	struct Qdisc *new_q = NULL;  	int last_child = 0; -	// TODO: why don't allow to delete subtree ? references ? does -	// tc subsys quarantee us that in htb_destroy it holds no class -	// refs so that we can remove children safely there ? +	/* TODO: why don't allow to delete subtree ? references ? does +	 * tc subsys guarantee us that in htb_destroy it holds no class +	 * refs so that we can remove children safely there ? 
+	 */  	if (cl->children || cl->filter_cnt)  		return -EBUSY; @@ -1329,9 +1339,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,  	struct htb_sched *q = qdisc_priv(sch);  	struct htb_class *cl = (struct htb_class *)*arg, *parent;  	struct nlattr *opt = tca[TCA_OPTIONS]; -	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;  	struct nlattr *tb[TCA_HTB_MAX + 1];  	struct tc_htb_opt *hopt; +	u64 rate64, ceil64;  	/* extract all subattrs from opt attr */  	if (!opt) @@ -1352,16 +1362,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,  		goto failure;  	/* Keeping backward compatible with rate_table based iproute2 tc */ -	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) { -		rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]); -		if (rtab) -			qdisc_put_rtab(rtab); -	} -	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE) { -		ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]); -		if (ctab) -			qdisc_put_rtab(ctab); -	} +	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) +		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB])); + +	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE) +		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]));  	if (!cl) {		/* new class */  		struct Qdisc *new_q; @@ -1468,21 +1473,30 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,  		sch_tree_lock(sch);  	} +	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0; + +	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0; + +	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64); +	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64); +  	/* it used to be a nasty bug here, we have to check that node  	 * is really leaf before changing cl->un.leaf !  	 */  	if (!cl->level) { -		cl->quantum = hopt->rate.rate / q->rate2quantum; +		u64 quantum = cl->rate.rate_bytes_ps; + +		do_div(quantum, q->rate2quantum); +		cl->quantum = min_t(u64, quantum, INT_MAX); +  		if (!hopt->quantum && cl->quantum < 1000) { -			pr_warning( -			       "HTB: quantum of class %X is small. Consider r2q change.\n", -			       cl->common.classid); +			pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n", +				cl->common.classid);  			cl->quantum = 1000;  		}  		if (!hopt->quantum && cl->quantum > 200000) { -			pr_warning( -			       "HTB: quantum of class %X is big. Consider r2q change.\n", -			       cl->common.classid); +			pr_warn("HTB: quantum of class %X is big. 
Consider r2q change.\n", +				cl->common.classid);  			cl->quantum = 200000;  		}  		if (hopt->quantum) @@ -1491,9 +1505,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,  			cl->prio = TC_HTB_NUMPRIO - 1;  	} -	psched_ratecfg_precompute(&cl->rate, &hopt->rate); -	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil); -  	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);  	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer); diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index bce1665239b..62871c14e1f 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -100,8 +100,7 @@ static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)  	nest = nla_nest_start(skb, TCA_OPTIONS);  	if (nest == NULL)  		goto nla_put_failure; -	nla_nest_end(skb, nest); -	return skb->len; +	return nla_nest_end(skb, nest);  nla_put_failure:  	nla_nest_cancel(skb, nest); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 2e56185736d..a8b2864a696 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -78,14 +78,19 @@ static void mq_attach(struct Qdisc *sch)  {  	struct net_device *dev = qdisc_dev(sch);  	struct mq_sched *priv = qdisc_priv(sch); -	struct Qdisc *qdisc; +	struct Qdisc *qdisc, *old;  	unsigned int ntx;  	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {  		qdisc = priv->qdiscs[ntx]; -		qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc); -		if (qdisc) -			qdisc_destroy(qdisc); +		old = dev_graft_qdisc(qdisc->dev_queue, qdisc); +		if (old) +			qdisc_destroy(old); +#ifdef CONFIG_NET_SCHED +		if (ntx < dev->real_num_tx_queues) +			qdisc_list_add(qdisc); +#endif +  	}  	kfree(priv->qdiscs);  	priv->qdiscs = NULL; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index d44c868cb53..6749e2f540d 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -167,15 +167,17 @@ static void mqprio_attach(struct Qdisc *sch)  {  	struct net_device *dev = qdisc_dev(sch);  	struct mqprio_sched *priv = qdisc_priv(sch); -	struct Qdisc *qdisc; +	struct Qdisc *qdisc, *old;  	unsigned int ntx;  	/* Attach underlying qdisc */  	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {  		qdisc = priv->qdiscs[ntx]; -		qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc); -		if (qdisc) -			qdisc_destroy(qdisc); +		old = dev_graft_qdisc(qdisc->dev_queue, qdisc); +		if (old) +			qdisc_destroy(old); +		if (ntx < dev->real_num_tx_queues) +			qdisc_list_add(qdisc);  	}  	kfree(priv->qdiscs);  	priv->qdiscs = NULL; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 2a2b096d9a6..afb050a735f 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -11,8 +11,7 @@   * more details.   *   * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. + * this program; if not, see <http://www.gnu.org/licenses/>.   
*   * Author: Alexander Duyck <alexander.h.duyck@intel.com>   */ diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index a6d788d4521..111d70fddae 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -88,10 +88,10 @@ struct netem_sched_data {  	u32 duplicate;  	u32 reorder;  	u32 corrupt; -	u32 rate; +	u64 rate;  	s32 packet_overhead;  	u32 cell_size; -	u32 cell_size_reciprocal; +	struct reciprocal_value cell_size_reciprocal;  	s32 cell_overhead;  	struct crndstate { @@ -110,6 +110,18 @@ struct netem_sched_data {  		CLG_GILB_ELL,  	} loss_model; +	enum { +		TX_IN_GAP_PERIOD = 1, +		TX_IN_BURST_PERIOD, +		LOST_IN_GAP_PERIOD, +		LOST_IN_BURST_PERIOD, +	} _4_state_model; + +	enum { +		GOOD_STATE = 1, +		BAD_STATE, +	} GE_state_model; +  	/* Correlated Loss Generation models */  	struct clgstate {  		/* state of the Markov chain */ @@ -169,7 +181,7 @@ static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)  static void init_crandom(struct crndstate *state, unsigned long rho)  {  	state->rho = rho; -	state->last = net_random(); +	state->last = prandom_u32();  }  /* get_crandom - correlated random number generator @@ -182,9 +194,9 @@ static u32 get_crandom(struct crndstate *state)  	unsigned long answer;  	if (state->rho == 0)	/* no correlation */ -		return net_random(); +		return prandom_u32(); -	value = net_random(); +	value = prandom_u32();  	rho = (u64)state->rho + 1;  	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;  	state->last = answer; @@ -198,51 +210,52 @@ static u32 get_crandom(struct crndstate *state)  static bool loss_4state(struct netem_sched_data *q)  {  	struct clgstate *clg = &q->clg; -	u32 rnd = net_random(); +	u32 rnd = prandom_u32();  	/*  	 * Makes a comparison between rnd and the transition  	 * probabilities outgoing from the current state, then decides the  	 * next state and if the next packet has to be transmitted or lost.  	 
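	 * The thresholds a1-a5 are the transition probabilities taken from
	 * struct tc_netem_gimodel in get_loss_clg(); user space is expected to
	 * supply them already scaled to the full u32 range, so they can be
	 * compared against prandom_u32() directly.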
* The four states correspond to: -	 *   1 => successfully transmitted packets within a gap period -	 *   4 => isolated losses within a gap period -	 *   3 => lost packets within a burst period -	 *   2 => successfully transmitted packets within a burst period +	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period +	 *   LOST_IN_BURST_PERIOD => isolated losses within a gap period +	 *   LOST_IN_GAP_PERIOD => lost packets within a burst period +	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period  	 */  	switch (clg->state) { -	case 1: +	case TX_IN_GAP_PERIOD:  		if (rnd < clg->a4) { -			clg->state = 4; +			clg->state = LOST_IN_BURST_PERIOD;  			return true; -		} else if (clg->a4 < rnd && rnd < clg->a1) { -			clg->state = 3; +		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) { +			clg->state = LOST_IN_GAP_PERIOD;  			return true; -		} else if (clg->a1 < rnd) -			clg->state = 1; +		} else if (clg->a1 + clg->a4 < rnd) { +			clg->state = TX_IN_GAP_PERIOD; +		}  		break; -	case 2: +	case TX_IN_BURST_PERIOD:  		if (rnd < clg->a5) { -			clg->state = 3; +			clg->state = LOST_IN_GAP_PERIOD;  			return true; -		} else -			clg->state = 2; +		} else { +			clg->state = TX_IN_BURST_PERIOD; +		}  		break; -	case 3: +	case LOST_IN_GAP_PERIOD:  		if (rnd < clg->a3) -			clg->state = 2; +			clg->state = TX_IN_BURST_PERIOD;  		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) { -			clg->state = 1; -			return true; +			clg->state = TX_IN_GAP_PERIOD;  		} else if (clg->a2 + clg->a3 < rnd) { -			clg->state = 3; +			clg->state = LOST_IN_GAP_PERIOD;  			return true;  		}  		break; -	case 4: -		clg->state = 1; +	case LOST_IN_BURST_PERIOD: +		clg->state = TX_IN_GAP_PERIOD;  		break;  	} @@ -264,15 +277,16 @@ static bool loss_gilb_ell(struct netem_sched_data *q)  	struct clgstate *clg = &q->clg;  	switch (clg->state) { -	case 1: -		if (net_random() < clg->a1) -			clg->state = 2; -		if (net_random() < clg->a4) +	case GOOD_STATE: +		if (prandom_u32() < clg->a1) +			clg->state = BAD_STATE; +		if (prandom_u32() < clg->a4)  			return true; -	case 2: -		if (net_random() < clg->a2) -			clg->state = 1; -		if (clg->a3 > net_random()) +		break; +	case BAD_STATE: +		if (prandom_u32() < clg->a2) +			clg->state = GOOD_STATE; +		if (prandom_u32() > clg->a3)  			return true;  	} @@ -358,6 +372,21 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche  	return PSCHED_NS2TICKS(ticks);  } +static void tfifo_reset(struct Qdisc *sch) +{ +	struct netem_sched_data *q = qdisc_priv(sch); +	struct rb_node *p; + +	while ((p = rb_first(&q->t_root))) { +		struct sk_buff *skb = netem_rb_to_skb(p); + +		rb_erase(p, &q->t_root); +		skb->next = NULL; +		skb->prev = NULL; +		kfree_skb(skb); +	} +} +  static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)  {  	struct netem_sched_data *q = qdisc_priv(sch); @@ -442,7 +471,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)  		     skb_checksum_help(skb)))  			return qdisc_drop(skb, sch); -		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8); +		skb->data[prandom_u32() % skb_headlen(skb)] ^= +			1<<(prandom_u32() % 8);  	}  	if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) @@ -480,7 +510,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)  				now = netem_skb_cb(last)->time_to_send;  			} -			delay += packet_len_2_sched_time(skb->len, q); +			delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q);  		}  		cb->time_to_send = now + delay; @@ -520,6 
+550,7 @@ static unsigned int netem_drop(struct Qdisc *sch)  			skb->next = NULL;  			skb->prev = NULL;  			len = qdisc_pkt_len(skb); +			sch->qstats.backlog -= len;  			kfree_skb(skb);  		}  	} @@ -609,6 +640,7 @@ static void netem_reset(struct Qdisc *sch)  	struct netem_sched_data *q = qdisc_priv(sch);  	qdisc_reset_queue(sch); +	tfifo_reset(sch);  	if (q->qdisc)  		qdisc_reset(q->qdisc);  	qdisc_watchdog_cancel(&q->watchdog); @@ -616,12 +648,7 @@ static void netem_reset(struct Qdisc *sch)  static void dist_free(struct disttable *d)  { -	if (d) { -		if (is_vmalloc_addr(d)) -			vfree(d); -		else -			kfree(d); -	} +	kvfree(d);  }  /* @@ -662,9 +689,8 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)  	return 0;  } -static void get_correlation(struct Qdisc *sch, const struct nlattr *attr) +static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)  { -	struct netem_sched_data *q = qdisc_priv(sch);  	const struct tc_netem_corr *c = nla_data(attr);  	init_crandom(&q->delay_cor, c->delay_corr); @@ -672,47 +698,45 @@ static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)  	init_crandom(&q->dup_cor, c->dup_corr);  } -static void get_reorder(struct Qdisc *sch, const struct nlattr *attr) +static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)  { -	struct netem_sched_data *q = qdisc_priv(sch);  	const struct tc_netem_reorder *r = nla_data(attr);  	q->reorder = r->probability;  	init_crandom(&q->reorder_cor, r->correlation);  } -static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr) +static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)  { -	struct netem_sched_data *q = qdisc_priv(sch);  	const struct tc_netem_corrupt *r = nla_data(attr);  	q->corrupt = r->probability;  	init_crandom(&q->corrupt_cor, r->correlation);  } -static void get_rate(struct Qdisc *sch, const struct nlattr *attr) +static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)  { -	struct netem_sched_data *q = qdisc_priv(sch);  	const struct tc_netem_rate *r = nla_data(attr);  	q->rate = r->rate;  	q->packet_overhead = r->packet_overhead;  	q->cell_size = r->cell_size; +	q->cell_overhead = r->cell_overhead;  	if (q->cell_size)  		q->cell_size_reciprocal = reciprocal_value(q->cell_size); -	q->cell_overhead = r->cell_overhead; +	else +		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };  } -static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr) +static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)  { -	struct netem_sched_data *q = qdisc_priv(sch);  	const struct nlattr *la;  	int rem;  	nla_for_each_nested(la, attr, rem) {  		u16 type = nla_type(la); -		switch(type) { +		switch (type) {  		case NETEM_LOSS_GI: {  			const struct tc_netem_gimodel *gi = nla_data(la); @@ -723,7 +747,7 @@ static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)  			q->loss_model = CLG_4_STATES; -			q->clg.state = 1; +			q->clg.state = TX_IN_GAP_PERIOD;  			q->clg.a1 = gi->p13;  			q->clg.a2 = gi->p31;  			q->clg.a3 = gi->p32; @@ -741,7 +765,7 @@ static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)  			}  			q->loss_model = CLG_GILB_ELL; -			q->clg.state = 1; +			q->clg.state = GOOD_STATE;  			q->clg.a1 = ge->p;  			q->clg.a2 = ge->r;  			q->clg.a3 = ge->h; @@ -765,6 +789,7 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {  	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },  	[TCA_NETEM_LOSS]	= { .type 
= NLA_NESTED },  	[TCA_NETEM_ECN]		= { .type = NLA_U32 }, +	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },  };  static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, @@ -791,6 +816,8 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)  	struct netem_sched_data *q = qdisc_priv(sch);  	struct nlattr *tb[TCA_NETEM_MAX + 1];  	struct tc_netem_qopt *qopt; +	struct clgstate old_clg; +	int old_loss_model = CLG_RANDOM;  	int ret;  	if (opt == NULL) @@ -801,6 +828,33 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)  	if (ret < 0)  		return ret; +	/* backup q->clg and q->loss_model */ +	old_clg = q->clg; +	old_loss_model = q->loss_model; + +	if (tb[TCA_NETEM_LOSS]) { +		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]); +		if (ret) { +			q->loss_model = old_loss_model; +			return ret; +		} +	} else { +		q->loss_model = CLG_RANDOM; +	} + +	if (tb[TCA_NETEM_DELAY_DIST]) { +		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); +		if (ret) { +			/* recover clg and loss_model, in case of +			 * q->clg and q->loss_model were modified +			 * in get_loss_clg() +			 */ +			q->clg = old_clg; +			q->loss_model = old_loss_model; +			return ret; +		} +	} +  	sch->limit = qopt->limit;  	q->latency = qopt->latency; @@ -818,30 +872,24 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)  		q->reorder = ~0;  	if (tb[TCA_NETEM_CORR]) -		get_correlation(sch, tb[TCA_NETEM_CORR]); - -	if (tb[TCA_NETEM_DELAY_DIST]) { -		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); -		if (ret) -			return ret; -	} +		get_correlation(q, tb[TCA_NETEM_CORR]);  	if (tb[TCA_NETEM_REORDER]) -		get_reorder(sch, tb[TCA_NETEM_REORDER]); +		get_reorder(q, tb[TCA_NETEM_REORDER]);  	if (tb[TCA_NETEM_CORRUPT]) -		get_corrupt(sch, tb[TCA_NETEM_CORRUPT]); +		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);  	if (tb[TCA_NETEM_RATE]) -		get_rate(sch, tb[TCA_NETEM_RATE]); +		get_rate(q, tb[TCA_NETEM_RATE]); + +	if (tb[TCA_NETEM_RATE64]) +		q->rate = max_t(u64, q->rate, +				nla_get_u64(tb[TCA_NETEM_RATE64]));  	if (tb[TCA_NETEM_ECN])  		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]); -	q->loss_model = CLG_RANDOM; -	if (tb[TCA_NETEM_LOSS]) -		ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]); -  	return ret;  } @@ -957,7 +1005,13 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)  	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))  		goto nla_put_failure; -	rate.rate = q->rate; +	if (q->rate >= (1ULL << 32)) { +		if (nla_put_u64(skb, TCA_NETEM_RATE64, q->rate)) +			goto nla_put_failure; +		rate.rate = ~0U; +	} else { +		rate.rate = q->rate; +	}  	rate.packet_overhead = q->packet_overhead;  	rate.cell_size = q->cell_size;  	rate.cell_overhead = q->cell_overhead; diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c new file mode 100644 index 00000000000..fefeeb73f15 --- /dev/null +++ b/net/sched/sch_pie.c @@ -0,0 +1,566 @@ +/* Copyright (C) 2013 Cisco Systems, Inc, 2013. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
+ * + * Author: Vijay Subramanian <vijaynsu@cisco.com> + * Author: Mythili Prabhu <mysuryan@cisco.com> + * + * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no> + * University of Oslo, Norway. + * + * References: + * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00 + * IEEE  Conference on High Performance Switching and Routing 2013 : + * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem" + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <net/pkt_sched.h> +#include <net/inet_ecn.h> + +#define QUEUE_THRESHOLD 10000 +#define DQCOUNT_INVALID -1 +#define MAX_PROB  0xffffffff +#define PIE_SCALE 8 + +/* parameters used */ +struct pie_params { +	psched_time_t target;	/* user specified target delay in pschedtime */ +	u32 tupdate;		/* timer frequency (in jiffies) */ +	u32 limit;		/* number of packets that can be enqueued */ +	u32 alpha;		/* alpha and beta are between 0 and 32 */ +	u32 beta;		/* and are used for shift relative to 1 */ +	bool ecn;		/* true if ecn is enabled */ +	bool bytemode;		/* to scale drop early prob based on pkt size */ +}; + +/* variables used */ +struct pie_vars { +	u32 prob;		/* probability but scaled by u32 limit. */ +	psched_time_t burst_time; +	psched_time_t qdelay; +	psched_time_t qdelay_old; +	u64 dq_count;		/* measured in bytes */ +	psched_time_t dq_tstamp;	/* drain rate */ +	u32 avg_dq_rate;	/* bytes per pschedtime tick,scaled */ +	u32 qlen_old;		/* in bytes */ +}; + +/* statistics gathering */ +struct pie_stats { +	u32 packets_in;		/* total number of packets enqueued */ +	u32 dropped;		/* packets dropped due to pie_action */ +	u32 overlimit;		/* dropped due to lack of space in queue */ +	u32 maxq;		/* maximum queue size */ +	u32 ecn_mark;		/* packets marked with ECN */ +}; + +/* private data for the Qdisc */ +struct pie_sched_data { +	struct pie_params params; +	struct pie_vars vars; +	struct pie_stats stats; +	struct timer_list adapt_timer; +}; + +static void pie_params_init(struct pie_params *params) +{ +	params->alpha = 2; +	params->beta = 20; +	params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC);	/* 30 ms */ +	params->limit = 1000;	/* default of 1000 packets */ +	params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC);	/* 20 ms */ +	params->ecn = false; +	params->bytemode = false; +} + +static void pie_vars_init(struct pie_vars *vars) +{ +	vars->dq_count = DQCOUNT_INVALID; +	vars->avg_dq_rate = 0; +	/* default of 100 ms in pschedtime */ +	vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC); +} + +static bool drop_early(struct Qdisc *sch, u32 packet_size) +{ +	struct pie_sched_data *q = qdisc_priv(sch); +	u32 rnd; +	u32 local_prob = q->vars.prob; +	u32 mtu = psched_mtu(qdisc_dev(sch)); + +	/* If there is still burst allowance left skip random early drop */ +	if (q->vars.burst_time > 0) +		return false; + +	/* If current delay is less than half of target, and +	 * if drop prob is low already, disable early_drop +	 */ +	if ((q->vars.qdelay < q->params.target / 2) +	    && (q->vars.prob < MAX_PROB / 5)) +		return false; + +	/* If we have fewer than 2 mtu-sized packets, disable drop_early, +	 * similar to min_th in RED +	 */ +	if (sch->qstats.backlog < 2 * mtu) +		return false; + +	/* If bytemode is turned on, use packet size to compute new +	 * probablity. 
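+	 * The scaling is local_prob = (prob / mtu) * packet_size; for
+	 * example, a 500-byte packet on a 1500-byte MTU is dropped with
+	 * roughly one third of the computed probability (illustrative
+	 * numbers).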
Smaller packets will have lower drop prob in this case +	 */ +	if (q->params.bytemode && packet_size <= mtu) +		local_prob = (local_prob / mtu) * packet_size; +	else +		local_prob = q->vars.prob; + +	rnd = prandom_u32(); +	if (rnd < local_prob) +		return true; + +	return false; +} + +static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ +	struct pie_sched_data *q = qdisc_priv(sch); +	bool enqueue = false; + +	if (unlikely(qdisc_qlen(sch) >= sch->limit)) { +		q->stats.overlimit++; +		goto out; +	} + +	if (!drop_early(sch, skb->len)) { +		enqueue = true; +	} else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) && +		   INET_ECN_set_ce(skb)) { +		/* If packet is ecn capable, mark it if drop probability +		 * is lower than 10%, else drop it. +		 */ +		q->stats.ecn_mark++; +		enqueue = true; +	} + +	/* we can enqueue the packet */ +	if (enqueue) { +		q->stats.packets_in++; +		if (qdisc_qlen(sch) > q->stats.maxq) +			q->stats.maxq = qdisc_qlen(sch); + +		return qdisc_enqueue_tail(skb, sch); +	} + +out: +	q->stats.dropped++; +	return qdisc_drop(skb, sch); +} + +static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { +	[TCA_PIE_TARGET] = {.type = NLA_U32}, +	[TCA_PIE_LIMIT] = {.type = NLA_U32}, +	[TCA_PIE_TUPDATE] = {.type = NLA_U32}, +	[TCA_PIE_ALPHA] = {.type = NLA_U32}, +	[TCA_PIE_BETA] = {.type = NLA_U32}, +	[TCA_PIE_ECN] = {.type = NLA_U32}, +	[TCA_PIE_BYTEMODE] = {.type = NLA_U32}, +}; + +static int pie_change(struct Qdisc *sch, struct nlattr *opt) +{ +	struct pie_sched_data *q = qdisc_priv(sch); +	struct nlattr *tb[TCA_PIE_MAX + 1]; +	unsigned int qlen; +	int err; + +	if (!opt) +		return -EINVAL; + +	err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy); +	if (err < 0) +		return err; + +	sch_tree_lock(sch); + +	/* convert from microseconds to pschedtime */ +	if (tb[TCA_PIE_TARGET]) { +		/* target is in us */ +		u32 target = nla_get_u32(tb[TCA_PIE_TARGET]); + +		/* convert to pschedtime */ +		q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); +	} + +	/* tupdate is in jiffies */ +	if (tb[TCA_PIE_TUPDATE]) +		q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); + +	if (tb[TCA_PIE_LIMIT]) { +		u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]); + +		q->params.limit = limit; +		sch->limit = limit; +	} + +	if (tb[TCA_PIE_ALPHA]) +		q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]); + +	if (tb[TCA_PIE_BETA]) +		q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]); + +	if (tb[TCA_PIE_ECN]) +		q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]); + +	if (tb[TCA_PIE_BYTEMODE]) +		q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]); + +	/* Drop excess packets if new limit is lower */ +	qlen = sch->q.qlen; +	while (sch->q.qlen > sch->limit) { +		struct sk_buff *skb = __skb_dequeue(&sch->q); + +		sch->qstats.backlog -= qdisc_pkt_len(skb); +		qdisc_drop(skb, sch); +	} +	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + +	sch_tree_unlock(sch); +	return 0; +} + +static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) +{ + +	struct pie_sched_data *q = qdisc_priv(sch); +	int qlen = sch->qstats.backlog;	/* current queue size in bytes */ + +	/* If current queue is about 10 packets or more and dq_count is unset +	 * we have enough packets to calculate the drain rate. Save +	 * current time as dq_tstamp and start measurement cycle. +	 */ +	if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) { +		q->vars.dq_tstamp = psched_get_time(); +		q->vars.dq_count = 0; +	} + +	/* Calculate the average drain rate from this value.  
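+	 * The estimate is an EWMA with a weight of 1/8: once QUEUE_THRESHOLD
+	 * bytes have been drained, avg_dq_rate is updated as
+	 * avg_dq_rate - (avg_dq_rate >> 3) + (count >> 3), where count is the
+	 * drained byte count scaled by PIE_SCALE and divided by the elapsed
+	 * psched time.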
If queue length +	 * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset +	 * the dq_count to -1 as we don't have enough packets to calculate the +	 * drain rate anymore The following if block is entered only when we +	 * have a substantial queue built up (QUEUE_THRESHOLD bytes or more) +	 * and we calculate the drain rate for the threshold here.  dq_count is +	 * in bytes, time difference in psched_time, hence rate is in +	 * bytes/psched_time. +	 */ +	if (q->vars.dq_count != DQCOUNT_INVALID) { +		q->vars.dq_count += skb->len; + +		if (q->vars.dq_count >= QUEUE_THRESHOLD) { +			psched_time_t now = psched_get_time(); +			u32 dtime = now - q->vars.dq_tstamp; +			u32 count = q->vars.dq_count << PIE_SCALE; + +			if (dtime == 0) +				return; + +			count = count / dtime; + +			if (q->vars.avg_dq_rate == 0) +				q->vars.avg_dq_rate = count; +			else +				q->vars.avg_dq_rate = +				    (q->vars.avg_dq_rate - +				     (q->vars.avg_dq_rate >> 3)) + (count >> 3); + +			/* If the queue has receded below the threshold, we hold +			 * on to the last drain rate calculated, else we reset +			 * dq_count to 0 to re-enter the if block when the next +			 * packet is dequeued +			 */ +			if (qlen < QUEUE_THRESHOLD) +				q->vars.dq_count = DQCOUNT_INVALID; +			else { +				q->vars.dq_count = 0; +				q->vars.dq_tstamp = psched_get_time(); +			} + +			if (q->vars.burst_time > 0) { +				if (q->vars.burst_time > dtime) +					q->vars.burst_time -= dtime; +				else +					q->vars.burst_time = 0; +			} +		} +	} +} + +static void calculate_probability(struct Qdisc *sch) +{ +	struct pie_sched_data *q = qdisc_priv(sch); +	u32 qlen = sch->qstats.backlog;	/* queue size in bytes */ +	psched_time_t qdelay = 0;	/* in pschedtime */ +	psched_time_t qdelay_old = q->vars.qdelay;	/* in pschedtime */ +	s32 delta = 0;		/* determines the change in probability */ +	u32 oldprob; +	u32 alpha, beta; +	bool update_prob = true; + +	q->vars.qdelay_old = q->vars.qdelay; + +	if (q->vars.avg_dq_rate > 0) +		qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate; +	else +		qdelay = 0; + +	/* If qdelay is zero and qlen is not, it means qlen is very small, less +	 * than dequeue_rate, so we do not update probabilty in this round +	 */ +	if (qdelay == 0 && qlen != 0) +		update_prob = false; + +	/* In the algorithm, alpha and beta are between 0 and 2 with typical +	 * value for alpha as 0.125. In this implementation, we use values 0-32 +	 * passed from user space to represent this. Also, alpha and beta have +	 * unit of HZ and need to be scaled before they can used to update +	 * probability. alpha/beta are updated locally below by 1) scaling them +	 * appropriately 2) scaling down by 16 to come to 0-2 range. +	 * Please see paper for details. +	 * +	 * We scale alpha and beta differently depending on whether we are in +	 * light, medium or high dropping mode. 
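+	 * For example, in the light dropping mode below (prob < MAX_PROB/100)
+	 * the configured value is multiplied by MAX_PROB / PSCHED_TICKS_PER_SEC
+	 * and shifted right by 7, versus a shift of 5 in the medium and 4 in
+	 * the high dropping mode (illustrative reading of the code below).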
+	 */ +	if (q->vars.prob < MAX_PROB / 100) { +		alpha = +		    (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; +		beta = +		    (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; +	} else if (q->vars.prob < MAX_PROB / 10) { +		alpha = +		    (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; +		beta = +		    (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; +	} else { +		alpha = +		    (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; +		beta = +		    (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; +	} + +	/* alpha and beta should be between 0 and 32, in multiples of 1/16 */ +	delta += alpha * ((qdelay - q->params.target)); +	delta += beta * ((qdelay - qdelay_old)); + +	oldprob = q->vars.prob; + +	/* to ensure we increase probability in steps of no more than 2% */ +	if (delta > (s32) (MAX_PROB / (100 / 2)) && +	    q->vars.prob >= MAX_PROB / 10) +		delta = (MAX_PROB / 100) * 2; + +	/* Non-linear drop: +	 * Tune drop probability to increase quickly for high delays(>= 250ms) +	 * 250ms is derived through experiments and provides error protection +	 */ + +	if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) +		delta += MAX_PROB / (100 / 2); + +	q->vars.prob += delta; + +	if (delta > 0) { +		/* prevent overflow */ +		if (q->vars.prob < oldprob) { +			q->vars.prob = MAX_PROB; +			/* Prevent normalization error. If probability is at +			 * maximum value already, we normalize it here, and +			 * skip the check to do a non-linear drop in the next +			 * section. +			 */ +			update_prob = false; +		} +	} else { +		/* prevent underflow */ +		if (q->vars.prob > oldprob) +			q->vars.prob = 0; +	} + +	/* Non-linear drop in probability: Reduce drop probability quickly if +	 * delay is 0 for 2 consecutive Tupdate periods. +	 */ + +	if ((qdelay == 0) && (qdelay_old == 0) && update_prob) +		q->vars.prob = (q->vars.prob * 98) / 100; + +	q->vars.qdelay = qdelay; +	q->vars.qlen_old = qlen; + +	/* We restart the measurement cycle if the following conditions are met +	 * 1. If the delay has been low for 2 consecutive Tupdate periods +	 * 2. Calculated drop probability is zero +	 * 3. We have atleast one estimate for the avg_dq_rate ie., +	 *    is a non-zero value +	 */ +	if ((q->vars.qdelay < q->params.target / 2) && +	    (q->vars.qdelay_old < q->params.target / 2) && +	    (q->vars.prob == 0) && +	    (q->vars.avg_dq_rate > 0)) +		pie_vars_init(&q->vars); +} + +static void pie_timer(unsigned long arg) +{ +	struct Qdisc *sch = (struct Qdisc *)arg; +	struct pie_sched_data *q = qdisc_priv(sch); +	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + +	spin_lock(root_lock); +	calculate_probability(sch); + +	/* reset the timer to fire after 'tupdate'. tupdate is in jiffies. 
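+	 * A tupdate of zero leaves the timer stopped; pie_destroy() relies on
+	 * this by clearing tupdate before calling del_timer_sync().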
*/ +	if (q->params.tupdate) +		mod_timer(&q->adapt_timer, jiffies + q->params.tupdate); +	spin_unlock(root_lock); + +} + +static int pie_init(struct Qdisc *sch, struct nlattr *opt) +{ +	struct pie_sched_data *q = qdisc_priv(sch); + +	pie_params_init(&q->params); +	pie_vars_init(&q->vars); +	sch->limit = q->params.limit; + +	setup_timer(&q->adapt_timer, pie_timer, (unsigned long)sch); +	mod_timer(&q->adapt_timer, jiffies + HZ / 2); + +	if (opt) { +		int err = pie_change(sch, opt); + +		if (err) +			return err; +	} + +	return 0; +} + +static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) +{ +	struct pie_sched_data *q = qdisc_priv(sch); +	struct nlattr *opts; + +	opts = nla_nest_start(skb, TCA_OPTIONS); +	if (opts == NULL) +		goto nla_put_failure; + +	/* convert target from pschedtime to us */ +	if (nla_put_u32(skb, TCA_PIE_TARGET, +			((u32) PSCHED_TICKS2NS(q->params.target)) / +			NSEC_PER_USEC) || +	    nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) || +	    nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) || +	    nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || +	    nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || +	    nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || +	    nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode)) +		goto nla_put_failure; + +	return nla_nest_end(skb, opts); + +nla_put_failure: +	nla_nest_cancel(skb, opts); +	return -1; + +} + +static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ +	struct pie_sched_data *q = qdisc_priv(sch); +	struct tc_pie_xstats st = { +		.prob		= q->vars.prob, +		.delay		= ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) / +				   NSEC_PER_USEC, +		/* unscale and return dq_rate in bytes per sec */ +		.avg_dq_rate	= q->vars.avg_dq_rate * +				  (PSCHED_TICKS_PER_SEC) >> PIE_SCALE, +		.packets_in	= q->stats.packets_in, +		.overlimit	= q->stats.overlimit, +		.maxq		= q->stats.maxq, +		.dropped	= q->stats.dropped, +		.ecn_mark	= q->stats.ecn_mark, +	}; + +	return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) +{ +	struct sk_buff *skb; +	skb = __qdisc_dequeue_head(sch, &sch->q); + +	if (!skb) +		return NULL; + +	pie_process_dequeue(sch, skb); +	return skb; +} + +static void pie_reset(struct Qdisc *sch) +{ +	struct pie_sched_data *q = qdisc_priv(sch); +	qdisc_reset_queue(sch); +	pie_vars_init(&q->vars); +} + +static void pie_destroy(struct Qdisc *sch) +{ +	struct pie_sched_data *q = qdisc_priv(sch); +	q->params.tupdate = 0; +	del_timer_sync(&q->adapt_timer); +} + +static struct Qdisc_ops pie_qdisc_ops __read_mostly = { +	.id = "pie", +	.priv_size	= sizeof(struct pie_sched_data), +	.enqueue	= pie_qdisc_enqueue, +	.dequeue	= pie_qdisc_dequeue, +	.peek		= qdisc_peek_dequeued, +	.init		= pie_init, +	.destroy	= pie_destroy, +	.reset		= pie_reset, +	.change		= pie_change, +	.dump		= pie_dump, +	.dump_stats	= pie_dump_stats, +	.owner		= THIS_MODULE, +}; + +static int __init pie_module_init(void) +{ +	return register_qdisc(&pie_qdisc_ops); +} + +static void __exit pie_module_exit(void) +{ +	unregister_qdisc(&pie_qdisc_ops); +} + +module_init(pie_module_init); +module_exit(pie_module_exit); + +MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler"); +MODULE_AUTHOR("Vijay Subramanian"); +MODULE_AUTHOR("Mythili Prabhu"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 30ea4674cab..9b0f7093d97 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -220,7 +220,7 @@ static u32 
sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_da  static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)  { -	q->bins[slot].perturbation = net_random(); +	q->bins[slot].perturbation = prandom_u32();  }  static void sfb_swap_slot(struct sfb_sched_data *q) @@ -381,7 +381,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)  		goto enqueue;  	} -	r = net_random() & SFB_MAX_PROB; +	r = prandom_u32() & SFB_MAX_PROB;  	if (unlikely(r < p_min)) {  		if (unlikely(p_min > SFB_MAX_PROB / 2)) { diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index d3a1bc26dbf..1af2f73906d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -237,10 +237,12 @@ static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)  }  #define sfq_unlink(q, x, n, p)			\ -	n = q->slots[x].dep.next;		\ -	p = q->slots[x].dep.prev;		\ -	sfq_dep_head(q, p)->next = n;		\ -	sfq_dep_head(q, n)->prev = p +	do {					\ +		n = q->slots[x].dep.next;	\ +		p = q->slots[x].dep.prev;	\ +		sfq_dep_head(q, p)->next = n;	\ +		sfq_dep_head(q, n)->prev = p;	\ +	} while (0)  static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x) @@ -627,7 +629,7 @@ static void sfq_perturbation(unsigned long arg)  	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));  	spin_lock(root_lock); -	q->perturbation = net_random(); +	q->perturbation = prandom_u32();  	if (!q->filter_list && q->tail)  		sfq_rehash(sch);  	spin_unlock(root_lock); @@ -696,7 +698,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)  	del_timer(&q->perturb_timer);  	if (q->perturb_period) {  		mod_timer(&q->perturb_timer, jiffies + q->perturb_period); -		q->perturbation = net_random(); +		q->perturbation = prandom_u32();  	}  	sch_tree_unlock(sch);  	kfree(p); @@ -714,12 +716,7 @@ static void *sfq_alloc(size_t sz)  static void sfq_free(void *addr)  { -	if (addr) { -		if (is_vmalloc_addr(addr)) -			vfree(addr); -		else -			kfree(addr); -	} +	kvfree(addr);  }  static void sfq_destroy(struct Qdisc *sch) @@ -757,7 +754,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)  	q->quantum = psched_mtu(qdisc_dev(sch));  	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);  	q->perturb_period = 0; -	q->perturbation = net_random(); +	q->perturbation = prandom_u32();  	if (opt) {  		int err = sfq_change(sch, opt); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 1aaf1b6e51a..18ff6343370 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -101,12 +101,11 @@  struct tbf_sched_data {  /* Parameters */  	u32		limit;		/* Maximal length of backlog: bytes */ +	u32		max_size;  	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */  	s64		mtu; -	u32		max_size;  	struct psched_ratecfg rate;  	struct psched_ratecfg peak; -	bool peak_present;  /* Variables */  	s64	tokens;			/* Current number of B tokens */ @@ -117,6 +116,42 @@ struct tbf_sched_data {  }; +/* Time to Length, convert time in ns to length in bytes + * to determinate how many bytes can be sent in given time. 
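+ * For example, at rate_bytes_ps = 125000000 (1 Gbit/s) a time of
+ * 1000000 ns (1 ms) corresponds to 125000 bytes, before the ATM cell
+ * and overhead adjustments below (illustrative numbers).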
+ */ +static u64 psched_ns_t2l(const struct psched_ratecfg *r, +			 u64 time_in_ns) +{ +	/* The formula is : +	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC +	 */ +	u64 len = time_in_ns * r->rate_bytes_ps; + +	do_div(len, NSEC_PER_SEC); + +	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) { +		do_div(len, 53); +		len = len * 48; +	} + +	if (len > r->overhead) +		len -= r->overhead; +	else +		len = 0; + +	return len; +} + +/* + * Return length of individual segments of a gso packet, + * including all headers (MAC, IP, TCP/UDP) + */ +static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) +{ +	unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); +	return hdr_len + skb_gso_transport_seglen(skb); +} +  /* GSO packet is too big, segment it so that tbf can transmit   * each segment in time   */ @@ -136,12 +171,8 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)  	while (segs) {  		nskb = segs->next;  		segs->next = NULL; -		if (likely(segs->len <= q->max_size)) { -			qdisc_skb_cb(segs)->pkt_len = segs->len; -			ret = qdisc_enqueue(segs, q->qdisc); -		} else { -			ret = qdisc_reshape_fail(skb, sch); -		} +		qdisc_skb_cb(segs)->pkt_len = segs->len; +		ret = qdisc_enqueue(segs, q->qdisc);  		if (ret != NET_XMIT_SUCCESS) {  			if (net_xmit_drop_count(ret))  				sch->qstats.drops++; @@ -163,7 +194,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)  	int ret;  	if (qdisc_pkt_len(skb) > q->max_size) { -		if (skb_is_gso(skb)) +		if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size)  			return tbf_segment(skb, sch);  		return qdisc_reshape_fail(skb, sch);  	} @@ -190,6 +221,11 @@ static unsigned int tbf_drop(struct Qdisc *sch)  	return len;  } +static bool tbf_peak_present(const struct tbf_sched_data *q) +{ +	return q->peak.rate_bytes_ps; +} +  static struct sk_buff *tbf_dequeue(struct Qdisc *sch)  {  	struct tbf_sched_data *q = qdisc_priv(sch); @@ -206,7 +242,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch)  		now = ktime_to_ns(ktime_get());  		toks = min_t(s64, now - q->t_c, q->buffer); -		if (q->peak_present) { +		if (tbf_peak_present(q)) {  			ptoks = toks + q->ptokens;  			if (ptoks > q->mtu)  				ptoks = q->mtu; @@ -266,20 +302,26 @@ static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {  	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },  	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },  	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, +	[TCA_TBF_RATE64]	= { .type = NLA_U64 }, +	[TCA_TBF_PRATE64]	= { .type = NLA_U64 }, +	[TCA_TBF_BURST] = { .type = NLA_U32 }, +	[TCA_TBF_PBURST] = { .type = NLA_U32 },  };  static int tbf_change(struct Qdisc *sch, struct nlattr *opt)  {  	int err;  	struct tbf_sched_data *q = qdisc_priv(sch); -	struct nlattr *tb[TCA_TBF_PTAB + 1]; +	struct nlattr *tb[TCA_TBF_MAX + 1];  	struct tc_tbf_qopt *qopt; -	struct qdisc_rate_table *rtab = NULL; -	struct qdisc_rate_table *ptab = NULL;  	struct Qdisc *child = NULL; -	int max_size, n; +	struct psched_ratecfg rate; +	struct psched_ratecfg peak; +	u64 max_size; +	s64 buffer, mtu; +	u64 rate64 = 0, prate64 = 0; -	err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy); +	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);  	if (err < 0)  		return err; @@ -288,33 +330,59 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)  		goto done;  	qopt = nla_data(tb[TCA_TBF_PARMS]); -	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]); -	if (rtab == NULL) -		goto done; +	if 
(qopt->rate.linklayer == TC_LINKLAYER_UNAWARE) +		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate, +					      tb[TCA_TBF_RTAB])); + +	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE) +			qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate, +						      tb[TCA_TBF_PTAB])); + +	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U); +	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U); + +	if (tb[TCA_TBF_RATE64]) +		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]); +	psched_ratecfg_precompute(&rate, &qopt->rate, rate64); + +	if (tb[TCA_TBF_BURST]) { +		max_size = nla_get_u32(tb[TCA_TBF_BURST]); +		buffer = psched_l2t_ns(&rate, max_size); +	} else { +		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U); +	}  	if (qopt->peakrate.rate) { -		if (qopt->peakrate.rate > qopt->rate.rate) -			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]); -		if (ptab == NULL) +		if (tb[TCA_TBF_PRATE64]) +			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]); +		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64); +		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) { +			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n", +					peak.rate_bytes_ps, rate.rate_bytes_ps); +			err = -EINVAL;  			goto done; -	} +		} -	for (n = 0; n < 256; n++) -		if (rtab->data[n] > qopt->buffer) -			break; -	max_size = (n << qopt->rate.cell_log) - 1; -	if (ptab) { -		int size; - -		for (n = 0; n < 256; n++) -			if (ptab->data[n] > qopt->mtu) -				break; -		size = (n << qopt->peakrate.cell_log) - 1; -		if (size < max_size) -			max_size = size; +		if (tb[TCA_TBF_PBURST]) { +			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]); +			max_size = min_t(u32, max_size, pburst); +			mtu = psched_l2t_ns(&peak, pburst); +		} else { +			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu)); +		} +	} else { +		memset(&peak, 0, sizeof(peak));  	} -	if (max_size < 0) + +	if (max_size < psched_mtu(qdisc_dev(sch))) +		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n", +				    max_size, qdisc_dev(sch)->name, +				    psched_mtu(qdisc_dev(sch))); + +	if (!max_size) { +		err = -EINVAL;  		goto done; +	}  	if (q->qdisc != &noop_qdisc) {  		err = fifo_set_limit(q->qdisc, qopt->limit); @@ -335,27 +403,24 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)  		q->qdisc = child;  	}  	q->limit = qopt->limit; -	q->mtu = PSCHED_TICKS2NS(qopt->mtu); +	if (tb[TCA_TBF_PBURST]) +		q->mtu = mtu; +	else +		q->mtu = PSCHED_TICKS2NS(qopt->mtu);  	q->max_size = max_size; -	q->buffer = PSCHED_TICKS2NS(qopt->buffer); +	if (tb[TCA_TBF_BURST]) +		q->buffer = buffer; +	else +		q->buffer = PSCHED_TICKS2NS(qopt->buffer);  	q->tokens = q->buffer;  	q->ptokens = q->mtu; -	psched_ratecfg_precompute(&q->rate, &rtab->rate); -	if (ptab) { -		psched_ratecfg_precompute(&q->peak, &ptab->rate); -		q->peak_present = true; -	} else { -		q->peak_present = false; -	} +	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg)); +	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));  	sch_tree_unlock(sch);  	err = 0;  done: -	if (rtab) -		qdisc_put_rtab(rtab); -	if (ptab) -		qdisc_put_rtab(ptab);  	return err;  } @@ -394,7 +459,7 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)  	opt.limit = q->limit;  	psched_ratecfg_getrate(&opt.rate, &q->rate); -	if (q->peak_present) +	if (tbf_peak_present(q))  		psched_ratecfg_getrate(&opt.peakrate, &q->peak);  	else  		memset(&opt.peakrate, 0, sizeof(opt.peakrate)); @@ -402,9 +467,15 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)  	opt.buffer = 
PSCHED_NS2TICKS(q->buffer);  	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))  		goto nla_put_failure; +	if (q->rate.rate_bytes_ps >= (1ULL << 32) && +	    nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps)) +		goto nla_put_failure; +	if (tbf_peak_present(q) && +	    q->peak.rate_bytes_ps >= (1ULL << 32) && +	    nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps)) +		goto nla_put_failure; -	nla_nest_end(skb, nest); -	return skb->len; +	return nla_nest_end(skb, nest);  nla_put_failure:  	nla_nest_cancel(skb, nest);  | 
